2 * Stateless TCP Tunnel (STT) vport.
4 * Copyright (c) 2015 Nicira, Inc.
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
12 #include <asm/unaligned.h>
14 #include <linux/delay.h>
15 #include <linux/flex_array.h>
17 #include <linux/if_vlan.h>
19 #include <linux/ipv6.h>
20 #include <linux/jhash.h>
21 #include <linux/list.h>
22 #include <linux/log2.h>
23 #include <linux/module.h>
24 #include <linux/netfilter.h>
25 #include <linux/percpu.h>
26 #include <linux/skbuff.h>
27 #include <linux/tcp.h>
28 #include <linux/workqueue.h>
31 #include <net/inet_ecn.h>
33 #include <net/net_namespace.h>
34 #include <net/netns/generic.h>
45 #define STT_CSUM_VERIFIED BIT(0)
46 #define STT_CSUM_PARTIAL BIT(1)
47 #define STT_PROTO_IPV4 BIT(2)
48 #define STT_PROTO_TCP BIT(3)
49 #define STT_PROTO_TYPES (STT_PROTO_IPV4 | STT_PROTO_TCP)
51 #define SUPPORTED_GSO_TYPES (SKB_GSO_TCPV4 | SKB_GSO_UDP | SKB_GSO_DODGY | \
54 /* The length and offset of a fragment are encoded in the sequence number.
55 * STT_SEQ_LEN_SHIFT is the left shift needed to store the length.
56 * STT_SEQ_OFFSET_MASK is the mask to extract the offset.
58 #define STT_SEQ_LEN_SHIFT 16
59 #define STT_SEQ_OFFSET_MASK (BIT(STT_SEQ_LEN_SHIFT) - 1)
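/* With STT_SEQ_LEN_SHIFT = 16, the 32-bit TCP sequence field doubles as a
 * (length, offset) pair: the total STT frame length lives in the upper
 * 16 bits and the byte offset of this fragment in the lower 16.  On receive:
 *
 *	tot_len = seq >> STT_SEQ_LEN_SHIFT;	(e.g. 9000)
 *	offset  = seq & STT_SEQ_OFFSET_MASK;	(e.g. 2800)
 *
 * __push_stt_header() stores (data_len << STT_SEQ_LEN_SHIFT) on transmit
 * and reassemble() decodes the pair as above.
 */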
61 /* The maximum amount of memory used to store packets waiting to be reassembled
62 * on a given CPU. Once this threshold is exceeded we will begin freeing the
63 * least recently used fragments.
65 #define REASM_HI_THRESH (4 * 1024 * 1024)
66 /* The target for the high memory evictor. Once we have exceeded
67 * REASM_HI_THRESH, we will continue freeing fragments until we hit REASM_LO_THRESH.
70 #define REASM_LO_THRESH (3 * 1024 * 1024)
71 /* The length of time a given packet has to be reassembled from the time the
72 * first fragment arrives. Once this limit is exceeded it becomes available for cleanup.
75 #define FRAG_EXP_TIME (30 * HZ)
76 /* Number of hash entries. Each entry has only a single slot to hold a packet
77 * so if there are collisions, we will drop packets. This is allocated
78 * per-cpu and each entry consists of struct pkt_frag.
80 #define FRAG_HASH_SHIFT 8
81 #define FRAG_HASH_ENTRIES BIT(FRAG_HASH_SHIFT)
82 #define FRAG_HASH_SEGS ((sizeof(u32) * 8) / FRAG_HASH_SHIFT)
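/* A 32-bit packet hash is consumed FRAG_HASH_SHIFT bits at a time, so
 * lookup_frag() probes up to FRAG_HASH_SEGS (32 / 8 = 4) different buckets
 * for a match before it settles on a victim entry to reuse.
 */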
84 #define CLEAN_PERCPU_INTERVAL (30 * HZ)
95 unsigned long timestamp;
96 struct list_head lru_node;
101 struct flex_array *frag_hash;
102 struct list_head frag_lru;
103 unsigned int frag_mem_used;
105 /* Protects the frag hash table and LRU list. */
110 struct sk_buff *last_skb;
111 unsigned int mem_used;
120 /* Only valid for the first skb in the chain. */
121 struct first_frag first;
124 #define FRAG_CB(skb) ((struct frag_skb_cb *)(skb)->cb)
126 /* per-network namespace private data for this module */
128 struct list_head sock_list;
131 static int stt_net_id;
133 static struct stt_percpu __percpu *stt_percpu_data __read_mostly;
134 static u32 frag_hash_seed __read_mostly;
136 /* Protects the sock list and refcounts. */
137 static DEFINE_MUTEX(stt_mutex);
139 static int n_tunnels;
140 static DEFINE_PER_CPU(u32, pkt_seq_counter);
142 static void clean_percpu(struct work_struct *work);
143 static DECLARE_DELAYED_WORK(clean_percpu_wq, clean_percpu);
145 static struct stt_sock *stt_find_sock(struct net *net, __be16 port)
147 struct stt_net *sn = net_generic(net, stt_net_id);
148 struct stt_sock *stt_sock;
150 list_for_each_entry_rcu(stt_sock, &sn->sock_list, list) {
151 if (inet_sk(stt_sock->sock->sk)->inet_sport == port)
157 static __be32 ack_seq(void)
162 pkt_seq = this_cpu_read(pkt_seq_counter);
163 ack = pkt_seq << ilog2(NR_CPUS) | smp_processor_id();
164 this_cpu_inc(pkt_seq_counter);
166 return (__force __be32)ack;
168 #error "Support for greater than 64k CPUs not implemented"
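/* ack_seq() above produces a (mostly) unique per-packet identifier by
 * packing a per-CPU counter above the CPU number.  It is carried in the
 * TCP ACK field of the encapsulation header and used by the receiver as
 * the reassembly key (see key.pkt_seq in reassemble()).
 */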
172 static int clear_gso(struct sk_buff *skb)
174 struct skb_shared_info *shinfo = skb_shinfo(skb);
177 if (shinfo->gso_type == 0 && shinfo->gso_size == 0 &&
178 shinfo->gso_segs == 0)
181 err = skb_unclone(skb, GFP_ATOMIC);
185 shinfo = skb_shinfo(skb);
186 shinfo->gso_type = 0;
187 shinfo->gso_size = 0;
188 shinfo->gso_segs = 0;
192 static struct sk_buff *normalize_frag_list(struct sk_buff *head,
193 struct sk_buff **skbp)
195 struct sk_buff *skb = *skbp;
196 struct sk_buff *last;
199 struct sk_buff *frags;
201 if (skb_shared(skb)) {
202 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
205 return ERR_PTR(-ENOMEM);
207 nskb->next = skb->next;
214 head->len -= skb->len;
215 head->data_len -= skb->len;
216 head->truesize -= skb->truesize;
219 frags = skb_shinfo(skb)->frag_list;
223 err = skb_unclone(skb, GFP_ATOMIC);
227 last = normalize_frag_list(skb, &frags);
231 skb_shinfo(skb)->frag_list = NULL;
232 last->next = skb->next;
239 } while ((skb = skb->next));
244 /* Takes a linked list of skbs, which potentially contain a frag_list
245 * (whose members in turn potentially contain frag_lists, etc.) and
246 * converts them into a single linear linked list.
248 static int straighten_frag_list(struct sk_buff **skbp)
250 struct sk_buff *err_skb;
252 err_skb = normalize_frag_list(NULL, skbp);
254 return PTR_ERR(err_skb);
259 static void copy_skb_metadata(struct sk_buff *to, struct sk_buff *from)
261 to->protocol = from->protocol;
262 to->tstamp = from->tstamp;
263 to->priority = from->priority;
264 to->mark = from->mark;
265 to->vlan_tci = from->vlan_tci;
266 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
267 to->vlan_proto = from->vlan_proto;
269 skb_copy_secmark(to, from);
272 static void update_headers(struct sk_buff *skb, bool head,
273 unsigned int l4_offset, unsigned int hdr_len,
274 bool ipv4, u32 tcp_seq)
276 u16 old_len, new_len;
282 struct iphdr *iph = (struct iphdr *)(skb->data + ETH_HLEN);
284 old_len = ntohs(iph->tot_len);
285 new_len = skb->len - ETH_HLEN;
286 iph->tot_len = htons(new_len);
290 struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + ETH_HLEN);
292 old_len = ntohs(ip6h->payload_len);
293 new_len = skb->len - ETH_HLEN - sizeof(struct ipv6hdr);
294 ip6h->payload_len = htons(new_len);
297 tcph = (struct tcphdr *)(skb->data + l4_offset);
299 tcph->seq = htonl(tcp_seq);
308 delta = htonl(~old_len + new_len);
309 tcph->check = ~csum_fold((__force __wsum)((__force u32)tcph->check +
310 (__force u32)delta));
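/* The TCP checksum is patched incrementally for the new length rather than
 * recomputed over the whole segment: the one's-complement delta between
 * old_len and new_len computed above is folded into the existing checksum.
 */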
312 gso_size = skb_shinfo(skb)->gso_size;
313 if (gso_size && skb->len - hdr_len <= gso_size)
314 BUG_ON(clear_gso(skb));
317 static bool can_segment(struct sk_buff *head, bool ipv4, bool tcp, bool csum_partial)
319 /* If no offloading is in use then we don't have enough information
320 * to process the headers.
325 /* Handling UDP packets requires IP fragmentation, which means that
326 * the L4 checksum can no longer be calculated by hardware (since the
327 * fragments are in different packets). If we have to compute the
328 * checksum anyway, it's faster just to linearize, and large UDP packets
329 * are pretty uncommon, so it's not worth dealing with for now.
335 struct iphdr *iph = (struct iphdr *)(head->data + ETH_HLEN);
337 /* It's difficult to get the IP IDs exactly right here due to
338 * varying segment sizes and potentially multiple layers of
339 * segmentation. IP ID isn't important when DF is set and DF
340 * is generally set for TCP packets, so just linearize if it's not.
343 if (!(iph->frag_off & htons(IP_DF)))
346 struct ipv6hdr *ip6h = (struct ipv6hdr *)(head->data + ETH_HLEN);
348 /* Jumbograms require more processing to update and we'll
349 * probably never see them, so just linearize.
351 if (ip6h->payload_len == 0)
360 static int copy_headers(struct sk_buff *head, struct sk_buff *frag,
365 if (skb_cloned(frag) || skb_headroom(frag) < hdr_len) {
366 int extra_head = hdr_len - skb_headroom(frag);
368 extra_head = extra_head > 0 ? extra_head : 0;
369 if (unlikely(pskb_expand_head(frag, extra_head, 0,
374 memcpy(__skb_push(frag, hdr_len), head->data, hdr_len);
376 csum_start = head->csum_start - skb_headroom(head);
377 frag->csum_start = skb_headroom(frag) + csum_start;
378 frag->csum_offset = head->csum_offset;
379 frag->ip_summed = head->ip_summed;
381 skb_shinfo(frag)->gso_size = skb_shinfo(head)->gso_size;
382 skb_shinfo(frag)->gso_type = skb_shinfo(head)->gso_type;
383 skb_shinfo(frag)->gso_segs = 0;
385 copy_skb_metadata(frag, head);
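/* skb_list_segment() turns the frag_list "super-packet" in 'head' back into
 * a chain of self-contained segments: each fragment gets a copy of head's
 * Ethernet/IP/TCP headers via copy_headers(), and update_headers() then
 * fixes the IP total length and TCP sequence number of every segment so the
 * receiver sees ordinary GSO-style packets.
 */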
389 static int skb_list_segment(struct sk_buff *head, bool ipv4, int l4_offset)
398 if (unlikely(!pskb_may_pull(head, l4_offset + sizeof(*tcph))))
401 tcph = (struct tcphdr *)(head->data + l4_offset);
402 tcp_len = tcph->doff * 4;
403 hdr_len = l4_offset + tcp_len;
405 if (unlikely((tcp_len < sizeof(struct tcphdr)) ||
406 (head->len < hdr_len)))
409 if (unlikely(!pskb_may_pull(head, hdr_len)))
412 tcph = (struct tcphdr *)(head->data + l4_offset);
413 /* Update header of each segment. */
414 seq = ntohl(tcph->seq);
415 seg_len = skb_pagelen(head) - hdr_len;
417 skb = skb_shinfo(head)->frag_list;
418 skb_shinfo(head)->frag_list = NULL;
420 for (; skb; skb = skb->next) {
423 head->len -= skb->len;
424 head->data_len -= skb->len;
425 head->truesize -= skb->truesize;
429 err = copy_headers(head, skb, hdr_len);
432 update_headers(skb, false, l4_offset, hdr_len, ipv4, seq);
434 update_headers(head, true, l4_offset, hdr_len, ipv4, 0);
438 static int coalesce_skb(struct sk_buff **headp)
440 struct sk_buff *frag, *head, *prev;
443 err = straighten_frag_list(headp);
448 /* Coalesce frag list. */
450 for (frag = head->next; frag; frag = frag->next) {
454 if (unlikely(skb_unclone(prev, GFP_ATOMIC)))
457 if (!skb_try_coalesce(prev, frag, &headstolen, &delta)) {
462 prev->next = frag->next;
465 frag->truesize -= delta;
466 kfree_skb_partial(frag, headstolen);
473 for (frag = head->next; frag; frag = frag->next) {
474 head->len += frag->len;
475 head->data_len += frag->len;
476 head->truesize += frag->truesize;
479 skb_shinfo(head)->frag_list = head->next;
484 static int __try_to_segment(struct sk_buff *skb, bool csum_partial,
485 bool ipv4, bool tcp, int l4_offset)
487 if (can_segment(skb, ipv4, tcp, csum_partial))
488 return skb_list_segment(skb, ipv4, l4_offset);
490 return skb_linearize(skb);
493 static int try_to_segment(struct sk_buff *skb)
495 struct stthdr *stth = stt_hdr(skb);
496 bool csum_partial = !!(stth->flags & STT_CSUM_PARTIAL);
497 bool ipv4 = !!(stth->flags & STT_PROTO_IPV4);
498 bool tcp = !!(stth->flags & STT_PROTO_TCP);
499 int l4_offset = stth->l4_offset;
501 return __try_to_segment(skb, csum_partial, ipv4, tcp, l4_offset);
504 static int segment_skb(struct sk_buff **headp, bool csum_partial,
505 bool ipv4, bool tcp, int l4_offset)
509 err = coalesce_skb(headp);
513 if (skb_shinfo(*headp)->frag_list)
514 return __try_to_segment(*headp, csum_partial,
515 ipv4, tcp, l4_offset);
519 static int __push_stt_header(struct sk_buff *skb, __be64 tun_id,
520 __be16 s_port, __be16 d_port,
521 __be32 saddr, __be32 dst,
522 __be16 l3_proto, u8 l4_proto,
525 int data_len = skb->len + sizeof(struct stthdr) + STT_ETH_PAD;
526 unsigned short encap_mss;
530 skb_push(skb, STT_HEADER_LEN);
531 skb_reset_transport_header(skb);
533 memset(tcph, 0, STT_HEADER_LEN);
536 if (skb->ip_summed == CHECKSUM_PARTIAL) {
537 stth->flags |= STT_CSUM_PARTIAL;
539 stth->l4_offset = skb->csum_start -
543 if (l3_proto == htons(ETH_P_IP))
544 stth->flags |= STT_PROTO_IPV4;
546 if (l4_proto == IPPROTO_TCP)
547 stth->flags |= STT_PROTO_TCP;
549 stth->mss = htons(skb_shinfo(skb)->gso_size);
550 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
551 stth->flags |= STT_CSUM_VERIFIED;
554 stth->vlan_tci = htons(skb->vlan_tci);
556 put_unaligned(tun_id, &stth->key);
558 tcph->source = s_port;
560 tcph->doff = sizeof(struct tcphdr) / 4;
563 tcph->window = htons(USHRT_MAX);
564 tcph->seq = htonl(data_len << STT_SEQ_LEN_SHIFT);
565 tcph->ack_seq = ack_seq();
566 tcph->check = ~tcp_v4_check(skb->len, saddr, dst, 0);
568 skb->csum_start = skb_transport_header(skb) - skb->head;
569 skb->csum_offset = offsetof(struct tcphdr, check);
570 skb->ip_summed = CHECKSUM_PARTIAL;
572 encap_mss = dst_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
573 if (data_len > encap_mss) {
574 if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
577 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
578 skb_shinfo(skb)->gso_size = encap_mss;
579 skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(data_len, encap_mss);
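/* When the encapsulated frame exceeds one outer segment, the skb is marked
 * SKB_GSO_TCPV4 so TSO (in hardware or software) splits it.  Because TSO
 * advances the TCP sequence number by the payload carried in each segment,
 * every on-the-wire fragment effectively ends up with the
 * (total length << 16 | offset) encoding that reassemble() expects.
 */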
581 if (unlikely(clear_gso(skb)))
587 static struct sk_buff *push_stt_header(struct sk_buff *head, __be64 tun_id,
588 __be16 s_port, __be16 d_port,
589 __be32 saddr, __be32 dst,
590 __be16 l3_proto, u8 l4_proto,
595 if (skb_shinfo(head)->frag_list) {
596 bool ipv4 = (l3_proto == htons(ETH_P_IP));
597 bool tcp = (l4_proto == IPPROTO_TCP);
598 bool csum_partial = (head->ip_summed == CHECKSUM_PARTIAL);
599 int l4_offset = skb_transport_offset(head);
601 /* Need to call skb_orphan() to report the correct truesize.
602 * Calling skb_orphan() in this layer is odd, but an SKB with
603 * a frag_list should not be associated with any socket, so
604 * skb_orphan() should be a no-op. */
606 if (unlikely(segment_skb(&head, csum_partial,
607 ipv4, tcp, l4_offset)))
611 for (skb = head; skb; skb = skb->next) {
612 if (__push_stt_header(skb, tun_id, s_port, d_port, saddr, dst,
613 l3_proto, l4_proto, dst_mtu))
619 kfree_skb_list(head);
623 static int stt_can_offload(struct sk_buff *skb, __be16 l3_proto, u8 l4_proto)
625 if (skb_is_gso(skb) && skb->ip_summed != CHECKSUM_PARTIAL) {
630 if (l4_proto == IPPROTO_TCP)
631 csum_offset = offsetof(struct tcphdr, check);
632 else if (l4_proto == IPPROTO_UDP)
633 csum_offset = offsetof(struct udphdr, check);
637 len = skb->len - skb_transport_offset(skb);
638 csum = (__sum16 *)(skb_transport_header(skb) + csum_offset);
640 if (unlikely(!pskb_may_pull(skb, skb_transport_offset(skb) +
641 csum_offset + sizeof(*csum))))
644 if (l3_proto == htons(ETH_P_IP)) {
645 struct iphdr *iph = ip_hdr(skb);
647 *csum = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
649 } else if (l3_proto == htons(ETH_P_IPV6)) {
650 struct ipv6hdr *ip6h = ipv6_hdr(skb);
652 *csum = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
657 skb->csum_start = skb_transport_header(skb) - skb->head;
658 skb->csum_offset = csum_offset;
659 skb->ip_summed = CHECKSUM_PARTIAL;
662 if (skb->ip_summed == CHECKSUM_PARTIAL) {
663 /* Assume receiver can only offload TCP/UDP over IPv4/6,
664 * and require 802.1Q VLANs to be accelerated.
666 if (l3_proto != htons(ETH_P_IP) &&
667 l3_proto != htons(ETH_P_IPV6))
670 if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP)
673 /* L4 offset must fit in a 1-byte field. */
674 if (skb->csum_start - skb_headroom(skb) > 255)
677 if (skb_shinfo(skb)->gso_type & ~SUPPORTED_GSO_TYPES)
680 /* Total size of encapsulated packet must fit in 16 bits. */
681 if (skb->len + STT_HEADER_LEN + sizeof(struct iphdr) > 65535)
684 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
685 if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q))
691 static bool need_linearize(const struct sk_buff *skb)
693 struct skb_shared_info *shinfo = skb_shinfo(skb);
696 if (unlikely(shinfo->frag_list))
699 /* Generally speaking we should linearize if there are paged frags.
700 * However, if all of the refcounts are 1 we know nobody else can
701 * change them from underneath us and we can skip the linearization.
703 for (i = 0; i < shinfo->nr_frags; i++)
704 if (unlikely(page_count(skb_frag_page(&shinfo->frags[i])) > 1))
710 static struct sk_buff *handle_offloads(struct sk_buff *skb, int min_headroom)
714 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
715 if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q)) {
717 min_headroom += VLAN_HLEN;
718 if (skb_headroom(skb) < min_headroom) {
719 int head_delta = SKB_DATA_ALIGN(min_headroom -
720 skb_headroom(skb) + 16);
722 err = pskb_expand_head(skb, max_t(int, head_delta, 0),
728 skb = __vlan_hwaccel_push_inside(skb);
736 if (skb_is_gso(skb)) {
737 struct sk_buff *nskb;
738 char cb[sizeof(skb->cb)];
740 memcpy(cb, skb->cb, sizeof(cb));
742 nskb = __skb_gso_segment(skb, 0, false);
751 memcpy(nskb->cb, cb, sizeof(cb));
754 } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
755 /* Pages aren't locked and could change at any time.
756 * If this happens after we compute the checksum, the
757 * checksum will be wrong. We linearize now to avoid this.
760 if (unlikely(need_linearize(skb))) {
761 err = __skb_linearize(skb);
766 err = skb_checksum_help(skb);
770 skb->ip_summed = CHECKSUM_NONE;
778 static int skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src,
779 __be32 dst, __u8 tos, __u8 ttl, __be16 df)
784 struct sk_buff *next = skb->next;
789 skb_clear_ovs_gso_cb(skb);
791 len += iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP,
792 tos, ttl, df, false);
799 static u8 parse_ipv6_l4_proto(struct sk_buff *skb)
801 unsigned int nh_ofs = skb_network_offset(skb);
807 if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct ipv6hdr))))
811 nexthdr = nh->nexthdr;
812 payload_ofs = (u8 *)(nh + 1) - skb->data;
814 payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off);
815 if (unlikely(payload_ofs < 0))
821 static u8 skb_get_l4_proto(struct sk_buff *skb, __be16 l3_proto)
823 if (l3_proto == htons(ETH_P_IP)) {
824 unsigned int nh_ofs = skb_network_offset(skb);
826 if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct iphdr))))
829 return ip_hdr(skb)->protocol;
830 } else if (l3_proto == htons(ETH_P_IPV6)) {
831 return parse_ipv6_l4_proto(skb);
836 int rpl_stt_xmit_skb(struct sk_buff *skb, struct rtable *rt,
837 __be32 src, __be32 dst, __u8 tos,
838 __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
841 struct ethhdr *eh = eth_hdr(skb);
842 int ret = 0, min_headroom;
843 __be16 inner_l3_proto;
846 inner_l3_proto = eh->h_proto;
847 inner_l4_proto = skb_get_l4_proto(skb, inner_l3_proto);
849 min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
850 + STT_HEADER_LEN + sizeof(struct iphdr);
852 if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
853 int head_delta = SKB_DATA_ALIGN(min_headroom -
857 ret = pskb_expand_head(skb, max_t(int, head_delta, 0),
863 ret = stt_can_offload(skb, inner_l3_proto, inner_l4_proto);
867 skb = handle_offloads(skb, min_headroom);
877 struct sk_buff *next_skb = skb->next;
884 /* Push STT and TCP header. */
885 skb = push_stt_header(skb, tun_id, src_port, dst_port, src,
886 dst, inner_l3_proto, inner_l4_proto,
888 if (unlikely(!skb)) {
893 /* Push IP header. */
894 ret += skb_list_xmit(rt, skb, src, dst, tos, ttl, df);
907 EXPORT_SYMBOL_GPL(rpl_stt_xmit_skb);
909 static void free_frag(struct stt_percpu *stt_percpu,
910 struct pkt_frag *frag)
912 stt_percpu->frag_mem_used -= FRAG_CB(frag->skbs)->first.mem_used;
913 kfree_skb_list(frag->skbs);
914 list_del(&frag->lru_node);
918 static void evict_frags(struct stt_percpu *stt_percpu)
920 while (!list_empty(&stt_percpu->frag_lru) &&
921 stt_percpu->frag_mem_used > REASM_LO_THRESH) {
922 struct pkt_frag *frag;
924 frag = list_first_entry(&stt_percpu->frag_lru,
927 free_frag(stt_percpu, frag);
931 static bool pkt_key_match(struct net *net,
932 const struct pkt_frag *a, const struct pkt_key *b)
934 return a->key.saddr == b->saddr && a->key.daddr == b->daddr &&
935 a->key.pkt_seq == b->pkt_seq && a->key.mark == b->mark &&
936 net_eq(dev_net(a->skbs->dev), net);
939 static u32 pkt_key_hash(const struct net *net, const struct pkt_key *key)
941 u32 initval = frag_hash_seed ^ (u32)(unsigned long)net ^ key->mark;
943 return jhash_3words((__force u32)key->saddr, (__force u32)key->daddr,
944 (__force u32)key->pkt_seq, initval);
947 static struct pkt_frag *lookup_frag(struct net *net,
948 struct stt_percpu *stt_percpu,
949 const struct pkt_key *key, u32 hash)
951 struct pkt_frag *frag, *victim_frag = NULL;
954 for (i = 0; i < FRAG_HASH_SEGS; i++) {
955 frag = flex_array_get(stt_percpu->frag_hash,
956 hash & (FRAG_HASH_ENTRIES - 1));
959 time_before(jiffies, frag->timestamp + FRAG_EXP_TIME) &&
960 pkt_key_match(net, frag, key))
964 (victim_frag->skbs &&
966 time_before(frag->timestamp, victim_frag->timestamp))))
969 hash >>= FRAG_HASH_SHIFT;
972 if (victim_frag->skbs)
973 free_frag(stt_percpu, victim_frag);
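/* reassemble() rebuilds the original STT frame from TCP segments.  The fast
 * path appends an in-order, non-overlapping fragment to the tail (last_skb)
 * of the per-flow chain; out-of-order fragments are inserted by offset, and
 * overlapping fragments are rejected.  Once rcvd_len matches the advertised
 * tot_len, the chain is handed back to the caller as a single frame.
 */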
978 static struct sk_buff *reassemble(struct sk_buff *skb)
980 struct iphdr *iph = ip_hdr(skb);
981 struct tcphdr *tcph = tcp_hdr(skb);
982 u32 seq = ntohl(tcph->seq);
983 struct stt_percpu *stt_percpu;
984 struct sk_buff *last_skb;
985 struct pkt_frag *frag;
990 tot_len = seq >> STT_SEQ_LEN_SHIFT;
991 FRAG_CB(skb)->offset = seq & STT_SEQ_OFFSET_MASK;
993 if (unlikely(skb->len == 0))
996 if (unlikely(FRAG_CB(skb)->offset + skb->len > tot_len))
999 if (tot_len == skb->len)
1002 key.saddr = iph->saddr;
1003 key.daddr = iph->daddr;
1004 key.pkt_seq = tcph->ack_seq;
1005 key.mark = skb->mark;
1006 hash = pkt_key_hash(dev_net(skb->dev), &key);
1008 stt_percpu = per_cpu_ptr(stt_percpu_data, smp_processor_id());
1010 spin_lock(&stt_percpu->lock);
1012 if (unlikely(stt_percpu->frag_mem_used + skb->truesize > REASM_HI_THRESH))
1013 evict_frags(stt_percpu);
1015 frag = lookup_frag(dev_net(skb->dev), stt_percpu, &key, hash);
1019 frag->timestamp = jiffies;
1020 FRAG_CB(skb)->first.last_skb = skb;
1021 FRAG_CB(skb)->first.mem_used = skb->truesize;
1022 FRAG_CB(skb)->first.tot_len = tot_len;
1023 FRAG_CB(skb)->first.rcvd_len = skb->len;
1024 FRAG_CB(skb)->first.set_ecn_ce = false;
1025 list_add_tail(&frag->lru_node, &stt_percpu->frag_lru);
1026 stt_percpu->frag_mem_used += skb->truesize;
1032 /* Optimize for the common case where fragments are received in-order
1033 * and not overlapping.
1035 last_skb = FRAG_CB(frag->skbs)->first.last_skb;
1036 if (likely(FRAG_CB(last_skb)->offset + last_skb->len ==
1037 FRAG_CB(skb)->offset)) {
1038 last_skb->next = skb;
1039 FRAG_CB(frag->skbs)->first.last_skb = skb;
1041 struct sk_buff *prev = NULL, *next;
1043 for (next = frag->skbs; next; next = next->next) {
1044 if (FRAG_CB(next)->offset >= FRAG_CB(skb)->offset)
1049 /* Overlapping fragments aren't allowed. We shouldn't start
1050 * before the end of the previous fragment.
1053 FRAG_CB(prev)->offset + prev->len > FRAG_CB(skb)->offset)
1056 /* We also shouldn't end after the beginning of the next fragment.
1060 FRAG_CB(skb)->offset + skb->len > FRAG_CB(next)->offset)
1066 FRAG_CB(skb)->first = FRAG_CB(frag->skbs)->first;
1073 FRAG_CB(frag->skbs)->first.last_skb = skb;
1076 FRAG_CB(frag->skbs)->first.set_ecn_ce |= INET_ECN_is_ce(iph->tos);
1077 FRAG_CB(frag->skbs)->first.rcvd_len += skb->len;
1078 FRAG_CB(frag->skbs)->first.mem_used += skb->truesize;
1079 stt_percpu->frag_mem_used += skb->truesize;
1081 if (FRAG_CB(frag->skbs)->first.tot_len ==
1082 FRAG_CB(frag->skbs)->first.rcvd_len) {
1083 struct sk_buff *frag_head = frag->skbs;
1085 frag_head->tstamp = skb->tstamp;
1086 if (FRAG_CB(frag_head)->first.set_ecn_ce)
1087 INET_ECN_set_ce(frag_head);
1089 list_del(&frag->lru_node);
1090 stt_percpu->frag_mem_used -= FRAG_CB(frag_head)->first.mem_used;
1094 list_move_tail(&frag->lru_node, &stt_percpu->frag_lru);
1104 spin_unlock(&stt_percpu->lock);
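/* validate_checksum() verifies the outer TCP checksum in software only when
 * the NIC has not already done so: CHECKSUM_UNNECESSARY is trusted as-is,
 * CHECKSUM_COMPLETE is folded against the pseudo-header, and anything else
 * falls back to a full __tcp_checksum_complete() pass.
 */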
1113 static bool validate_checksum(struct sk_buff *skb)
1115 struct iphdr *iph = ip_hdr(skb);
1117 if (skb_csum_unnecessary(skb))
1120 if (skb->ip_summed == CHECKSUM_COMPLETE &&
1121 !tcp_v4_check(skb->len, iph->saddr, iph->daddr, skb->csum))
1124 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len,
1127 return __tcp_checksum_complete(skb) == 0;
1130 static bool set_offloads(struct sk_buff *skb)
1132 struct stthdr *stth = stt_hdr(skb);
1133 unsigned short gso_type;
1140 __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
1141 ntohs(stth->vlan_tci));
1143 if (!(stth->flags & STT_CSUM_PARTIAL)) {
1144 if (stth->flags & STT_CSUM_VERIFIED)
1145 skb->ip_summed = CHECKSUM_UNNECESSARY;
1147 skb->ip_summed = CHECKSUM_NONE;
1149 return clear_gso(skb) == 0;
1152 proto_type = stth->flags & STT_PROTO_TYPES;
1154 switch (proto_type) {
1155 case (STT_PROTO_IPV4 | STT_PROTO_TCP):
1157 csum_offset = offsetof(struct tcphdr, check);
1158 gso_type = SKB_GSO_TCPV4;
1159 l3_header_size = sizeof(struct iphdr);
1160 l4_header_size = sizeof(struct tcphdr);
1161 skb->protocol = htons(ETH_P_IP);
1165 csum_offset = offsetof(struct tcphdr, check);
1166 gso_type = SKB_GSO_TCPV6;
1167 l3_header_size = sizeof(struct ipv6hdr);
1168 l4_header_size = sizeof(struct tcphdr);
1169 skb->protocol = htons(ETH_P_IPV6);
1171 case STT_PROTO_IPV4:
1173 csum_offset = offsetof(struct udphdr, check);
1174 gso_type = SKB_GSO_UDP;
1175 l3_header_size = sizeof(struct iphdr);
1176 l4_header_size = sizeof(struct udphdr);
1177 skb->protocol = htons(ETH_P_IP);
1181 csum_offset = offsetof(struct udphdr, check);
1182 gso_type = SKB_GSO_UDP;
1183 l3_header_size = sizeof(struct ipv6hdr);
1184 l4_header_size = sizeof(struct udphdr);
1185 skb->protocol = htons(ETH_P_IPV6);
1188 if (unlikely(stth->l4_offset < ETH_HLEN + l3_header_size))
1191 if (unlikely(!pskb_may_pull(skb, stth->l4_offset + l4_header_size)))
1194 stth = stt_hdr(skb);
1196 skb->csum_start = skb_headroom(skb) + stth->l4_offset;
1197 skb->csum_offset = csum_offset;
1198 skb->ip_summed = CHECKSUM_PARTIAL;
1201 if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
1204 skb_shinfo(skb)->gso_type = gso_type | SKB_GSO_DODGY;
1205 skb_shinfo(skb)->gso_size = ntohs(stth->mss);
1206 skb_shinfo(skb)->gso_segs = 0;
1208 if (unlikely(clear_gso(skb)))
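/* Receive path: stt_rcv() checks the outer TCP checksum, reassembles
 * multi-segment frames, coalesces the resulting chain, strips the STT
 * header, restores the inner packet's checksum/GSO state via set_offloads(),
 * and only then hands the packet to the vport's rcv callback.  Anything that
 * fails along the way is freed as a bad packet.
 */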
1214 static void stt_rcv(struct stt_sock *stt_sock, struct sk_buff *skb)
1218 if (unlikely(!validate_checksum(skb)))
1221 skb = reassemble(skb);
1225 if (skb->next && coalesce_skb(&skb))
1228 err = iptunnel_pull_header(skb,
1229 sizeof(struct stthdr) + STT_ETH_PAD,
1234 if (unlikely(stt_hdr(skb)->version != 0))
1237 if (unlikely(!set_offloads(skb)))
1240 if (skb_shinfo(skb)->frag_list && try_to_segment(skb))
1243 stt_sock->rcv(stt_sock, skb);
1246 /* Consume bad packet */
1247 kfree_skb_list(skb);
1250 static void tcp_sock_release(struct socket *sock)
1252 kernel_sock_shutdown(sock, SHUT_RDWR);
1253 sk_release_kernel(sock->sk);
1256 static int tcp_sock_create4(struct net *net, __be16 port,
1257 struct socket **sockp)
1259 struct sockaddr_in tcp_addr;
1260 struct socket *sock = NULL;
1263 err = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
1267 sk_change_net(sock->sk, net);
1269 memset(&tcp_addr, 0, sizeof(tcp_addr));
1270 tcp_addr.sin_family = AF_INET;
1271 tcp_addr.sin_addr.s_addr = htonl(INADDR_ANY);
1272 tcp_addr.sin_port = port;
1273 err = kernel_bind(sock, (struct sockaddr *)&tcp_addr,
1283 tcp_sock_release(sock);
1288 static void schedule_clean_percpu(void)
1290 schedule_delayed_work(&clean_percpu_wq, CLEAN_PERCPU_INTERVAL);
1293 static void clean_percpu(struct work_struct *work)
1297 for_each_possible_cpu(i) {
1298 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1301 for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
1302 struct pkt_frag *frag;
1304 frag = flex_array_get(stt_percpu->frag_hash, j);
1306 time_before(jiffies, frag->timestamp + FRAG_EXP_TIME))
1309 spin_lock_bh(&stt_percpu->lock);
1312 time_after(jiffies, frag->timestamp + FRAG_EXP_TIME))
1313 free_frag(stt_percpu, frag);
1315 spin_unlock_bh(&stt_percpu->lock);
1318 schedule_clean_percpu();
1321 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,0)
1322 #define FIRST_PARAM const struct nf_hook_ops *ops,
1324 #define FIRST_PARAM unsigned int hooknum,
1327 static unsigned int nf_ip_hook(FIRST_PARAM
1328 struct sk_buff *skb,
1329 const struct net_device *in,
1330 const struct net_device *out,
1331 int (*okfn)(struct sk_buff *))
1333 struct stt_sock *stt_sock;
1336 if (ip_hdr(skb)->protocol != IPPROTO_TCP)
1339 ip_hdr_len = ip_hdrlen(skb);
1340 if (unlikely(!pskb_may_pull(skb, ip_hdr_len + sizeof(struct tcphdr))))
1343 skb_set_transport_header(skb, ip_hdr_len);
1345 stt_sock = stt_find_sock(dev_net(skb->dev), tcp_hdr(skb)->dest);
1349 __skb_pull(skb, ip_hdr_len + sizeof(struct tcphdr));
1350 stt_rcv(stt_sock, skb);
1354 static struct nf_hook_ops nf_hook_ops __read_mostly = {
1356 .owner = THIS_MODULE,
1358 .hooknum = NF_INET_LOCAL_IN,
1359 .priority = INT_MAX,
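/* STT packets are intercepted with a netfilter LOCAL_IN hook rather than
 * through the TCP stack: nf_ip_hook() matches the destination port of each
 * incoming TCP segment against the registered STT sockets and steals
 * matching skbs into stt_rcv().  The kernel TCP socket created by
 * tcp_sock_create4() is presumably only there to claim the port so the
 * local stack does not interfere with tunnel traffic.
 */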
1362 static int stt_start(void)
1371 get_random_bytes(&frag_hash_seed, sizeof(u32));
1373 stt_percpu_data = alloc_percpu(struct stt_percpu);
1374 if (!stt_percpu_data) {
1379 for_each_possible_cpu(i) {
1380 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1381 struct flex_array *frag_hash;
1383 spin_lock_init(&stt_percpu->lock);
1384 INIT_LIST_HEAD(&stt_percpu->frag_lru);
1385 get_random_bytes(&per_cpu(pkt_seq_counter, i), sizeof(u32));
1387 frag_hash = flex_array_alloc(sizeof(struct pkt_frag),
1389 GFP_KERNEL | __GFP_ZERO);
1394 stt_percpu->frag_hash = frag_hash;
1396 err = flex_array_prealloc(stt_percpu->frag_hash, 0,
1398 GFP_KERNEL | __GFP_ZERO);
1402 err = nf_register_hook(&nf_hook_ops);
1406 schedule_clean_percpu();
1411 for_each_possible_cpu(i) {
1412 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1414 if (stt_percpu->frag_hash)
1415 flex_array_free(stt_percpu->frag_hash);
1418 free_percpu(stt_percpu_data);
1424 static void stt_cleanup(void)
1432 cancel_delayed_work_sync(&clean_percpu_wq);
1433 nf_unregister_hook(&nf_hook_ops);
1435 for_each_possible_cpu(i) {
1436 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1439 for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
1440 struct pkt_frag *frag;
1442 frag = flex_array_get(stt_percpu->frag_hash, j);
1443 kfree_skb_list(frag->skbs);
1446 flex_array_free(stt_percpu->frag_hash);
1449 free_percpu(stt_percpu_data);
1452 static struct stt_sock *stt_socket_create(struct net *net, __be16 port,
1453 stt_rcv_t *rcv, void *data)
1455 struct stt_net *sn = net_generic(net, stt_net_id);
1456 struct stt_sock *stt_sock;
1457 struct socket *sock;
1460 stt_sock = kzalloc(sizeof(*stt_sock), GFP_KERNEL);
1462 return ERR_PTR(-ENOMEM);
1464 err = tcp_sock_create4(net, port, &sock);
1467 return ERR_PTR(err);
1470 stt_sock->sock = sock;
1471 stt_sock->rcv = rcv;
1472 stt_sock->rcv_data = data;
1474 list_add_rcu(&stt_sock->list, &sn->sock_list);
1479 static void __stt_sock_release(struct stt_sock *stt_sock)
1481 list_del_rcu(&stt_sock->list);
1482 tcp_sock_release(stt_sock->sock);
1483 kfree_rcu(stt_sock, rcu);
1486 struct stt_sock *rpl_stt_sock_add(struct net *net, __be16 port,
1487 stt_rcv_t *rcv, void *data)
1489 struct stt_sock *stt_sock;
1494 return ERR_PTR(err);
1496 mutex_lock(&stt_mutex);
1498 stt_sock = stt_find_sock(net, port);
1501 stt_sock = ERR_PTR(-EBUSY);
1503 stt_sock = stt_socket_create(net, port, rcv, data);
1505 mutex_unlock(&stt_mutex);
1507 if (IS_ERR(stt_sock))
1512 EXPORT_SYMBOL_GPL(rpl_stt_sock_add);
1514 void rpl_stt_sock_release(struct stt_sock *stt_sock)
1516 mutex_lock(&stt_mutex);
1518 __stt_sock_release(stt_sock);
1521 mutex_unlock(&stt_mutex);
1523 EXPORT_SYMBOL_GPL(rpl_stt_sock_release);
1525 static int stt_init_net(struct net *net)
1527 struct stt_net *sn = net_generic(net, stt_net_id);
1529 INIT_LIST_HEAD(&sn->sock_list);
1533 static struct pernet_operations stt_net_ops = {
1534 .init = stt_init_net,
1536 .size = sizeof(struct stt_net),
1539 int ovs_stt_init_module(void)
1541 return register_pernet_subsys(&stt_net_ops);
1543 EXPORT_SYMBOL_GPL(ovs_stt_init_module);
1545 void ovs_stt_cleanup_module(void)
1547 unregister_pernet_subsys(&stt_net_ops);
1549 EXPORT_SYMBOL_GPL(ovs_stt_cleanup_module);