2 * Stateless TCP Tunnel (STT) vport.
4 * Copyright (c) 2015 Nicira, Inc.
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13 #include <asm/unaligned.h>
15 #include <linux/delay.h>
16 #include <linux/flex_array.h>
18 #include <linux/if_vlan.h>
20 #include <linux/ipv6.h>
21 #include <linux/jhash.h>
22 #include <linux/list.h>
23 #include <linux/log2.h>
24 #include <linux/module.h>
25 #include <linux/net.h>
26 #include <linux/netfilter.h>
27 #include <linux/percpu.h>
28 #include <linux/skbuff.h>
29 #include <linux/tcp.h>
30 #include <linux/workqueue.h>
32 #include <net/dst_metadata.h>
34 #include <net/inet_ecn.h>
36 #include <net/ip_tunnels.h>
37 #include <net/ip6_checksum.h>
38 #include <net/net_namespace.h>
39 #include <net/netns/generic.h>
48 #define STT_NETDEV_VER "0.1"
49 #define STT_DST_PORT 7471
54 /* @list: Per-net list of STT ports.
55 * @rcv: Callback invoked when an STT packet is received. STT reassembly can
56 * generate multiple packets; in that case the first packet carries the tunnel
57 * outer header and the rest are inner packet segments with no STT header.
58 * @rcv_data: user data.
59 * @sock: Fake TCP socket for the STT port.
62 struct net_device *dev;
64 struct list_head next;
65 struct list_head up_next;
70 #define STT_CSUM_VERIFIED BIT(0)
71 #define STT_CSUM_PARTIAL BIT(1)
72 #define STT_PROTO_IPV4 BIT(2)
73 #define STT_PROTO_TCP BIT(3)
74 #define STT_PROTO_TYPES (STT_PROTO_IPV4 | STT_PROTO_TCP)
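/* Illustrative mapping (derived from set_offloads() below, not part of the
 * original source): when STT_CSUM_PARTIAL is set, the two protocol bits
 * together describe the inner packet:
 *	STT_PROTO_IPV4 | STT_PROTO_TCP  ->  TCP over IPv4
 *	STT_PROTO_TCP                   ->  TCP over IPv6
 *	STT_PROTO_IPV4                  ->  UDP over IPv4
 *	(neither bit set)               ->  UDP over IPv6
 */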
76 #define SUPPORTED_GSO_TYPES (SKB_GSO_TCPV4 | SKB_GSO_UDP | SKB_GSO_DODGY | \
79 /* The length and offset of a fragment are encoded in the sequence number.
80 * STT_SEQ_LEN_SHIFT is the left shift needed to store the length.
81 * STT_SEQ_OFFSET_MASK is the mask to extract the offset.
83 #define STT_SEQ_LEN_SHIFT 16
84 #define STT_SEQ_OFFSET_MASK (BIT(STT_SEQ_LEN_SHIFT) - 1)
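/* Illustrative sketch (not part of the original source): the sender packs the
 * total packet length into the upper 16 bits of the fake TCP sequence number
 * and the fragment's byte offset into the lower 16 bits, so a 3000-byte packet
 * whose fragment starts at offset 1448 goes out with
 * seq = (3000 << STT_SEQ_LEN_SHIFT) | 1448. Hypothetical helpers showing the
 * round trip that the receive path performs in reassemble():
 *
 *	static inline u32 stt_encode_seq(u16 tot_len, u16 offset)
 *	{
 *		return ((u32)tot_len << STT_SEQ_LEN_SHIFT) | offset;
 *	}
 *
 *	static inline u16 stt_seq_total_len(u32 seq)
 *	{
 *		return seq >> STT_SEQ_LEN_SHIFT;
 *	}
 *
 *	static inline u16 stt_seq_offset(u32 seq)
 *	{
 *		return seq & STT_SEQ_OFFSET_MASK;
 *	}
 */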
86 /* The maximum amount of memory used to store packets waiting to be reassembled
87 * on a given CPU. Once this threshold is exceeded we will begin freeing the
88 * least recently used fragments.
90 #define REASM_HI_THRESH (4 * 1024 * 1024)
91 /* The target for the high memory evictor. Once we have exceeded
92 * REASM_HI_THRESH, we will continue freeing fragments until we hit
95 #define REASM_LO_THRESH (3 * 1024 * 1024)
96 /* The length of time a given packet has to be reassembled from the time the
97 * first fragment arrives. Once this limit is exceeded it becomes available
100 #define FRAG_EXP_TIME (30 * HZ)
101 /* Number of hash entries. Each entry has only a single slot to hold a packet
102 * so if there are collisions, we will drop packets. This is allocated
103 * per-cpu and each entry consists of struct pkt_frag.
105 #define FRAG_HASH_SHIFT 8
106 #define FRAG_HASH_ENTRIES BIT(FRAG_HASH_SHIFT)
107 #define FRAG_HASH_SEGS ((sizeof(u32) * 8) / FRAG_HASH_SHIFT)
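/* Illustrative sketch (assumption, mirroring lookup_frag() below): a 32-bit
 * hash is consumed FRAG_HASH_SHIFT bits at a time, giving FRAG_HASH_SEGS (4)
 * independent bucket probes; if none matches, the least desirable probed
 * entry becomes the eviction victim:
 *
 *	u32 h = hash;
 *	int i;
 *
 *	for (i = 0; i < FRAG_HASH_SEGS; i++) {
 *		u32 bucket = h & (FRAG_HASH_ENTRIES - 1);
 *		// probe the flex_array slot at 'bucket' here
 *		h >>= FRAG_HASH_SHIFT;
 *	}
 */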
109 #define CLEAN_PERCPU_INTERVAL (30 * HZ)
119 struct sk_buff *skbs;
120 unsigned long timestamp;
121 struct list_head lru_node;
126 struct flex_array *frag_hash;
127 struct list_head frag_lru;
128 unsigned int frag_mem_used;
130 /* Protect frags table. */
135 struct sk_buff *last_skb;
136 unsigned int mem_used;
145 /* Only valid for the first skb in the chain. */
146 struct first_frag first;
149 #define FRAG_CB(skb) ((struct frag_skb_cb *)(skb)->cb)
151 /* per-network namespace private data for this module */
153 struct list_head stt_list;
154 struct list_head stt_up_list; /* Devices which are in IFF_UP state. */
156 #ifdef HAVE_NF_REGISTER_NET_HOOK
157 bool nf_hook_reg_done;
161 static int stt_net_id;
163 static struct stt_percpu __percpu *stt_percpu_data __read_mostly;
164 static u32 frag_hash_seed __read_mostly;
166 /* Protects sock-hash and refcounts. */
167 static DEFINE_MUTEX(stt_mutex);
169 static int n_tunnels;
170 static DEFINE_PER_CPU(u32, pkt_seq_counter);
172 static void clean_percpu(struct work_struct *work);
173 static DECLARE_DELAYED_WORK(clean_percpu_wq, clean_percpu);
175 static struct stt_dev *stt_find_up_dev(struct net *net, __be16 port)
177 struct stt_net *sn = net_generic(net, stt_net_id);
178 struct stt_dev *stt_dev;
180 list_for_each_entry_rcu(stt_dev, &sn->stt_up_list, up_next) {
181 if (stt_dev->dst_port == port)
187 static __be32 ack_seq(void)
192 pkt_seq = this_cpu_read(pkt_seq_counter);
193 ack = pkt_seq << ilog2(NR_CPUS) | smp_processor_id();
194 this_cpu_inc(pkt_seq_counter);
196 return (__force __be32)ack;
198 #error "Support for greater than 64k CPUs not implemented"
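/* Worked example (illustrative, not from the original source): with
 * NR_CPUS = 64, ilog2(NR_CPUS) = 6, so a per-CPU counter value of 5 on CPU 3
 * yields ack = (5 << 6) | 3 = 323. The low bits carry the CPU id and the high
 * bits carry the per-CPU packet counter, giving a system-wide unique packet
 * identifier without any cross-CPU synchronization.
 */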
202 static int clear_gso(struct sk_buff *skb)
204 struct skb_shared_info *shinfo = skb_shinfo(skb);
207 if (shinfo->gso_type == 0 && shinfo->gso_size == 0 &&
208 shinfo->gso_segs == 0)
211 err = skb_unclone(skb, GFP_ATOMIC);
215 shinfo = skb_shinfo(skb);
216 shinfo->gso_type = 0;
217 shinfo->gso_size = 0;
218 shinfo->gso_segs = 0;
222 static struct sk_buff *normalize_frag_list(struct sk_buff *head,
223 struct sk_buff **skbp)
225 struct sk_buff *skb = *skbp;
226 struct sk_buff *last;
229 struct sk_buff *frags;
231 if (skb_shared(skb)) {
232 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
235 return ERR_PTR(-ENOMEM);
237 nskb->next = skb->next;
244 head->len -= skb->len;
245 head->data_len -= skb->len;
246 head->truesize -= skb->truesize;
249 frags = skb_shinfo(skb)->frag_list;
253 err = skb_unclone(skb, GFP_ATOMIC);
257 last = normalize_frag_list(skb, &frags);
261 skb_shinfo(skb)->frag_list = NULL;
262 last->next = skb->next;
269 } while ((skb = skb->next));
274 /* Takes a linked list of skbs, which may contain frag_lists (whose
275 * members may in turn contain frag_lists, and so on), and converts
276 * them into a single linear linked list.
278 static int straighten_frag_list(struct sk_buff **skbp)
280 struct sk_buff *err_skb;
282 err_skb = normalize_frag_list(NULL, skbp);
284 return PTR_ERR(err_skb);
289 static void copy_skb_metadata(struct sk_buff *to, struct sk_buff *from)
291 to->protocol = from->protocol;
292 to->tstamp = from->tstamp;
293 to->priority = from->priority;
294 to->mark = from->mark;
295 to->vlan_tci = from->vlan_tci;
296 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
297 to->vlan_proto = from->vlan_proto;
299 skb_copy_secmark(to, from);
302 static void update_headers(struct sk_buff *skb, bool head,
303 unsigned int l4_offset, unsigned int hdr_len,
304 bool ipv4, u32 tcp_seq)
306 u16 old_len, new_len;
312 struct iphdr *iph = (struct iphdr *)(skb->data + ETH_HLEN);
314 old_len = ntohs(iph->tot_len);
315 new_len = skb->len - ETH_HLEN;
316 iph->tot_len = htons(new_len);
320 struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + ETH_HLEN);
322 old_len = ntohs(ip6h->payload_len);
323 new_len = skb->len - ETH_HLEN - sizeof(struct ipv6hdr);
324 ip6h->payload_len = htons(new_len);
327 tcph = (struct tcphdr *)(skb->data + l4_offset);
329 tcph->seq = htonl(tcp_seq);
338 delta = htonl(~old_len + new_len);
339 tcph->check = ~csum_fold((__force __wsum)((__force u32)tcph->check +
340 (__force u32)delta));
342 gso_size = skb_shinfo(skb)->gso_size;
343 if (gso_size && skb->len - hdr_len <= gso_size)
344 BUG_ON(clear_gso(skb));
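/* Worked example (illustrative): update_headers() patches the length field and
 * then folds the change into the existing TCP checksum rather than
 * recomputing it over the payload. With old_len = 1500 and new_len = 400,
 * delta = htonl(~old_len + new_len) and the new value is
 * ~csum_fold(check + delta); only the bytes that actually changed feed the
 * checksum update.
 */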
347 static bool can_segment(struct sk_buff *head, bool ipv4, bool tcp, bool csum_partial)
349 /* If no offloading is in use then we don't have enough information
350 * to process the headers.
355 /* Handling UDP packets requires IP fragmentation, which means that
356 * the L4 checksum can no longer be calculated by hardware (since the
357 * fragments end up in different packets). If we have to compute the
358 * checksum, it's faster just to linearize, and large UDP packets are
359 * pretty uncommon anyway, so it's not worth dealing with for now.
365 struct iphdr *iph = (struct iphdr *)(head->data + ETH_HLEN);
367 /* It's difficult to get the IP IDs exactly right here due to
368 * varying segment sizes and potentially multiple layers of
369 * segmentation. IP ID isn't important when DF is set and DF
370 * is generally set for TCP packets, so just linearize if it's
373 if (!(iph->frag_off & htons(IP_DF)))
376 struct ipv6hdr *ip6h = (struct ipv6hdr *)(head->data + ETH_HLEN);
378 /* Jumbograms require more processing to update and we'll
379 * probably never see them, so just linearize.
381 if (ip6h->payload_len == 0)
390 static int copy_headers(struct sk_buff *head, struct sk_buff *frag,
395 if (skb_cloned(frag) || skb_headroom(frag) < hdr_len) {
396 int extra_head = hdr_len - skb_headroom(frag);
398 extra_head = extra_head > 0 ? extra_head : 0;
399 if (unlikely(pskb_expand_head(frag, extra_head, 0,
404 memcpy(__skb_push(frag, hdr_len), head->data, hdr_len);
406 csum_start = head->csum_start - skb_headroom(head);
407 frag->csum_start = skb_headroom(frag) + csum_start;
408 frag->csum_offset = head->csum_offset;
409 frag->ip_summed = head->ip_summed;
411 skb_shinfo(frag)->gso_size = skb_shinfo(head)->gso_size;
412 skb_shinfo(frag)->gso_type = skb_shinfo(head)->gso_type;
413 skb_shinfo(frag)->gso_segs = 0;
415 copy_skb_metadata(frag, head);
419 static int skb_list_segment(struct sk_buff *head, bool ipv4, int l4_offset)
428 if (unlikely(!pskb_may_pull(head, l4_offset + sizeof(*tcph))))
431 tcph = (struct tcphdr *)(head->data + l4_offset);
432 tcp_len = tcph->doff * 4;
433 hdr_len = l4_offset + tcp_len;
435 if (unlikely((tcp_len < sizeof(struct tcphdr)) ||
436 (head->len < hdr_len)))
439 if (unlikely(!pskb_may_pull(head, hdr_len)))
442 tcph = (struct tcphdr *)(head->data + l4_offset);
443 /* Update header of each segment. */
444 seq = ntohl(tcph->seq);
445 seg_len = skb_pagelen(head) - hdr_len;
447 skb = skb_shinfo(head)->frag_list;
448 skb_shinfo(head)->frag_list = NULL;
450 for (; skb; skb = skb->next) {
453 head->len -= skb->len;
454 head->data_len -= skb->len;
455 head->truesize -= skb->truesize;
459 err = copy_headers(head, skb, hdr_len);
462 update_headers(skb, false, l4_offset, hdr_len, ipv4, seq);
464 update_headers(head, true, l4_offset, hdr_len, ipv4, 0);
468 static int coalesce_skb(struct sk_buff **headp)
470 struct sk_buff *frag, *head, *prev;
473 err = straighten_frag_list(headp);
478 /* Coalesce frag list. */
480 for (frag = head->next; frag; frag = frag->next) {
484 if (unlikely(skb_unclone(prev, GFP_ATOMIC)))
487 if (!skb_try_coalesce(prev, frag, &headstolen, &delta)) {
492 prev->next = frag->next;
495 frag->truesize -= delta;
496 kfree_skb_partial(frag, headstolen);
503 for (frag = head->next; frag; frag = frag->next) {
504 head->len += frag->len;
505 head->data_len += frag->len;
506 head->truesize += frag->truesize;
509 skb_shinfo(head)->frag_list = head->next;
514 static int __try_to_segment(struct sk_buff *skb, bool csum_partial,
515 bool ipv4, bool tcp, int l4_offset)
517 if (can_segment(skb, ipv4, tcp, csum_partial))
518 return skb_list_segment(skb, ipv4, l4_offset);
520 return skb_linearize(skb);
523 static int try_to_segment(struct sk_buff *skb)
525 struct stthdr *stth = stt_hdr(skb);
526 bool csum_partial = !!(stth->flags & STT_CSUM_PARTIAL);
527 bool ipv4 = !!(stth->flags & STT_PROTO_IPV4);
528 bool tcp = !!(stth->flags & STT_PROTO_TCP);
529 int l4_offset = stth->l4_offset;
531 return __try_to_segment(skb, csum_partial, ipv4, tcp, l4_offset);
534 static int segment_skb(struct sk_buff **headp, bool csum_partial,
535 bool ipv4, bool tcp, int l4_offset)
539 err = coalesce_skb(headp);
543 if (skb_shinfo(*headp)->frag_list)
544 return __try_to_segment(*headp, csum_partial,
545 ipv4, tcp, l4_offset);
549 static int __push_stt_header(struct sk_buff *skb, __be64 tun_id,
550 __be16 s_port, __be16 d_port,
551 __be32 saddr, __be32 dst,
552 __be16 l3_proto, u8 l4_proto,
555 int data_len = skb->len + sizeof(struct stthdr) + STT_ETH_PAD;
556 unsigned short encap_mss;
560 skb_push(skb, STT_HEADER_LEN);
561 skb_reset_transport_header(skb);
563 memset(tcph, 0, STT_HEADER_LEN);
566 if (skb->ip_summed == CHECKSUM_PARTIAL) {
567 stth->flags |= STT_CSUM_PARTIAL;
569 stth->l4_offset = skb->csum_start -
573 if (l3_proto == htons(ETH_P_IP))
574 stth->flags |= STT_PROTO_IPV4;
576 if (l4_proto == IPPROTO_TCP)
577 stth->flags |= STT_PROTO_TCP;
579 stth->mss = htons(skb_shinfo(skb)->gso_size);
580 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
581 stth->flags |= STT_CSUM_VERIFIED;
584 stth->vlan_tci = htons(skb->vlan_tci);
586 put_unaligned(tun_id, &stth->key);
588 tcph->source = s_port;
590 tcph->doff = sizeof(struct tcphdr) / 4;
593 tcph->window = htons(USHRT_MAX);
594 tcph->seq = htonl(data_len << STT_SEQ_LEN_SHIFT);
595 tcph->ack_seq = ack_seq();
596 tcph->check = ~tcp_v4_check(skb->len, saddr, dst, 0);
598 skb->csum_start = skb_transport_header(skb) - skb->head;
599 skb->csum_offset = offsetof(struct tcphdr, check);
600 skb->ip_summed = CHECKSUM_PARTIAL;
602 encap_mss = dst_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
603 if (data_len > encap_mss) {
604 if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
607 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
608 skb_shinfo(skb)->gso_size = encap_mss;
609 skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(data_len, encap_mss);
611 if (unlikely(clear_gso(skb)))
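/* Worked example (illustrative): with a 1500-byte path MTU,
 * encap_mss = 1500 - sizeof(struct iphdr) - sizeof(struct tcphdr) = 1460.
 * An STT payload of data_len = 4014 bytes is therefore marked as
 * DIV_ROUND_UP(4014, 1460) = 3 GSO segments of the fake TCP stream, while a
 * payload that fits within one segment has its GSO state cleared instead.
 */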
617 static struct sk_buff *push_stt_header(struct sk_buff *head, __be64 tun_id,
618 __be16 s_port, __be16 d_port,
619 __be32 saddr, __be32 dst,
620 __be16 l3_proto, u8 l4_proto,
625 if (skb_shinfo(head)->frag_list) {
626 bool ipv4 = (l3_proto == htons(ETH_P_IP));
627 bool tcp = (l4_proto == IPPROTO_TCP);
628 bool csum_partial = (head->ip_summed == CHECKSUM_PARTIAL);
629 int l4_offset = skb_transport_offset(head);
631 /* Need to call skb_orphan() to report the correct truesize.
632 * Calling skb_orphan() at this layer is odd, but an SKB with a
633 * frag-list should not be associated with any socket, so
634 * skb_orphan() should be a no-op. */
636 if (unlikely(segment_skb(&head, csum_partial,
637 ipv4, tcp, l4_offset)))
641 for (skb = head; skb; skb = skb->next) {
642 if (__push_stt_header(skb, tun_id, s_port, d_port, saddr, dst,
643 l3_proto, l4_proto, dst_mtu))
649 kfree_skb_list(head);
653 static int stt_can_offload(struct sk_buff *skb, __be16 l3_proto, u8 l4_proto)
655 if (skb_is_gso(skb) && skb->ip_summed != CHECKSUM_PARTIAL) {
660 if (l4_proto == IPPROTO_TCP)
661 csum_offset = offsetof(struct tcphdr, check);
662 else if (l4_proto == IPPROTO_UDP)
663 csum_offset = offsetof(struct udphdr, check);
667 len = skb->len - skb_transport_offset(skb);
668 csum = (__sum16 *)(skb_transport_header(skb) + csum_offset);
670 if (unlikely(!pskb_may_pull(skb, skb_transport_offset(skb) +
671 csum_offset + sizeof(*csum))))
674 if (l3_proto == htons(ETH_P_IP)) {
675 struct iphdr *iph = ip_hdr(skb);
677 *csum = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
679 } else if (l3_proto == htons(ETH_P_IPV6)) {
680 struct ipv6hdr *ip6h = ipv6_hdr(skb);
682 *csum = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
687 skb->csum_start = skb_transport_header(skb) - skb->head;
688 skb->csum_offset = csum_offset;
689 skb->ip_summed = CHECKSUM_PARTIAL;
692 if (skb->ip_summed == CHECKSUM_PARTIAL) {
693 /* Assume receiver can only offload TCP/UDP over IPv4/6,
694 * and require 802.1Q VLANs to be accelerated.
696 if (l3_proto != htons(ETH_P_IP) &&
697 l3_proto != htons(ETH_P_IPV6))
700 if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP)
703 /* L4 offset must fit in a 1-byte field. */
704 if (skb->csum_start - skb_headroom(skb) > 255)
707 if (skb_shinfo(skb)->gso_type & ~SUPPORTED_GSO_TYPES)
710 /* Total size of encapsulated packet must fit in 16 bits. */
711 if (skb->len + STT_HEADER_LEN + sizeof(struct iphdr) > 65535)
714 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
715 if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q))
721 static bool need_linearize(const struct sk_buff *skb)
723 struct skb_shared_info *shinfo = skb_shinfo(skb);
726 if (unlikely(shinfo->frag_list))
729 /* Generally speaking we should linearize if there are paged frags.
730 * However, if all of the refcounts are 1 we know nobody else can
731 * change them from underneath us and we can skip the linearization.
733 for (i = 0; i < shinfo->nr_frags; i++)
734 if (unlikely(page_count(skb_frag_page(&shinfo->frags[i])) > 1))
740 static struct sk_buff *handle_offloads(struct sk_buff *skb, int min_headroom)
744 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
745 if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q)) {
747 min_headroom += VLAN_HLEN;
748 if (skb_headroom(skb) < min_headroom) {
749 int head_delta = SKB_DATA_ALIGN(min_headroom -
750 skb_headroom(skb) + 16);
752 err = pskb_expand_head(skb, max_t(int, head_delta, 0),
758 skb = __vlan_hwaccel_push_inside(skb);
766 if (skb_is_gso(skb)) {
767 struct sk_buff *nskb;
768 char cb[sizeof(skb->cb)];
770 memcpy(cb, skb->cb, sizeof(cb));
772 nskb = __skb_gso_segment(skb, 0, false);
781 memcpy(nskb->cb, cb, sizeof(cb));
784 } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
785 /* Pages aren't locked and could change at any time.
786 * If this happens after we compute the checksum, the
787 * checksum will be wrong. We linearize now to avoid
790 if (unlikely(need_linearize(skb))) {
791 err = __skb_linearize(skb);
796 err = skb_checksum_help(skb);
800 skb->ip_summed = CHECKSUM_NONE;
808 static int skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src,
809 __be32 dst, __u8 tos, __u8 ttl, __be16 df)
814 struct sk_buff *next = skb->next;
820 len += iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP,
821 tos, ttl, df, false);
828 static u8 parse_ipv6_l4_proto(struct sk_buff *skb)
830 unsigned int nh_ofs = skb_network_offset(skb);
836 if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct ipv6hdr))))
840 nexthdr = nh->nexthdr;
841 payload_ofs = (u8 *)(nh + 1) - skb->data;
843 payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off);
844 if (unlikely(payload_ofs < 0))
850 static u8 skb_get_l4_proto(struct sk_buff *skb, __be16 l3_proto)
852 if (l3_proto == htons(ETH_P_IP)) {
853 unsigned int nh_ofs = skb_network_offset(skb);
855 if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct iphdr))))
858 return ip_hdr(skb)->protocol;
859 } else if (l3_proto == htons(ETH_P_IPV6)) {
860 return parse_ipv6_l4_proto(skb);
865 static int stt_xmit_skb(struct sk_buff *skb, struct rtable *rt,
866 __be32 src, __be32 dst, __u8 tos,
867 __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
870 struct ethhdr *eh = eth_hdr(skb);
871 int ret = 0, min_headroom;
872 __be16 inner_l3_proto;
875 inner_l3_proto = eh->h_proto;
876 inner_l4_proto = skb_get_l4_proto(skb, inner_l3_proto);
878 min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
879 + STT_HEADER_LEN + sizeof(struct iphdr);
881 if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
882 int head_delta = SKB_DATA_ALIGN(min_headroom -
886 ret = pskb_expand_head(skb, max_t(int, head_delta, 0),
892 ret = stt_can_offload(skb, inner_l3_proto, inner_l4_proto);
896 skb = handle_offloads(skb, min_headroom);
906 struct sk_buff *next_skb = skb->next;
913 /* Push STT and TCP header. */
914 skb = push_stt_header(skb, tun_id, src_port, dst_port, src,
915 dst, inner_l3_proto, inner_l4_proto,
917 if (unlikely(!skb)) {
922 /* Push IP header. */
923 ret += skb_list_xmit(rt, skb, src, dst, tos, ttl, df);
937 netdev_tx_t ovs_stt_xmit(struct sk_buff *skb)
939 struct net_device *dev = skb->dev;
940 struct stt_dev *stt_dev = netdev_priv(dev);
941 struct net *net = stt_dev->net;
942 __be16 dport = stt_dev->dst_port;
943 struct ip_tunnel_key *tun_key;
944 struct ip_tunnel_info *tun_info;
951 tun_info = skb_tunnel_info(skb);
952 if (unlikely(!tun_info)) {
957 tun_key = &tun_info->key;
960 memset(&fl, 0, sizeof(fl));
961 fl.daddr = tun_key->u.ipv4.dst;
962 fl.saddr = tun_key->u.ipv4.src;
963 fl.flowi4_tos = RT_TOS(tun_key->tos);
964 fl.flowi4_mark = skb->mark;
965 fl.flowi4_proto = IPPROTO_TCP;
966 rt = ip_route_output_key(net, &fl);
972 df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
973 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
976 err = stt_xmit_skb(skb, rt, fl.saddr, tun_key->u.ipv4.dst,
977 tun_key->tos, tun_key->ttl,
978 df, sport, dport, tun_key->tun_id);
979 iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *)dev->tstats);
983 dev->stats.tx_errors++;
986 EXPORT_SYMBOL(ovs_stt_xmit);
988 static void free_frag(struct stt_percpu *stt_percpu,
989 struct pkt_frag *frag)
991 stt_percpu->frag_mem_used -= FRAG_CB(frag->skbs)->first.mem_used;
992 kfree_skb_list(frag->skbs);
993 list_del(&frag->lru_node);
997 static void evict_frags(struct stt_percpu *stt_percpu)
999 while (!list_empty(&stt_percpu->frag_lru) &&
1000 stt_percpu->frag_mem_used > REASM_LO_THRESH) {
1001 struct pkt_frag *frag;
1003 frag = list_first_entry(&stt_percpu->frag_lru,
1006 free_frag(stt_percpu, frag);
1010 static bool pkt_key_match(struct net *net,
1011 const struct pkt_frag *a, const struct pkt_key *b)
1013 return a->key.saddr == b->saddr && a->key.daddr == b->daddr &&
1014 a->key.pkt_seq == b->pkt_seq && a->key.mark == b->mark &&
1015 net_eq(dev_net(a->skbs->dev), net);
1018 static u32 pkt_key_hash(const struct net *net, const struct pkt_key *key)
1020 u32 initval = frag_hash_seed ^ (u32)(unsigned long)net ^ key->mark;
1022 return jhash_3words((__force u32)key->saddr, (__force u32)key->daddr,
1023 (__force u32)key->pkt_seq, initval);
1026 static struct pkt_frag *lookup_frag(struct net *net,
1027 struct stt_percpu *stt_percpu,
1028 const struct pkt_key *key, u32 hash)
1030 struct pkt_frag *frag, *victim_frag = NULL;
1033 for (i = 0; i < FRAG_HASH_SEGS; i++) {
1034 frag = flex_array_get(stt_percpu->frag_hash,
1035 hash & (FRAG_HASH_ENTRIES - 1));
1038 time_before(jiffies, frag->timestamp + FRAG_EXP_TIME) &&
1039 pkt_key_match(net, frag, key))
1043 (victim_frag->skbs &&
1045 time_before(frag->timestamp, victim_frag->timestamp))))
1048 hash >>= FRAG_HASH_SHIFT;
1051 if (victim_frag->skbs)
1052 free_frag(stt_percpu, victim_frag);
1057 static struct sk_buff *reassemble(struct sk_buff *skb)
1059 struct iphdr *iph = ip_hdr(skb);
1060 struct tcphdr *tcph = tcp_hdr(skb);
1061 u32 seq = ntohl(tcph->seq);
1062 struct stt_percpu *stt_percpu;
1063 struct sk_buff *last_skb;
1064 struct pkt_frag *frag;
1069 tot_len = seq >> STT_SEQ_LEN_SHIFT;
1070 FRAG_CB(skb)->offset = seq & STT_SEQ_OFFSET_MASK;
1072 if (unlikely(skb->len == 0))
1075 if (unlikely(FRAG_CB(skb)->offset + skb->len > tot_len))
1078 if (tot_len == skb->len)
1081 key.saddr = iph->saddr;
1082 key.daddr = iph->daddr;
1083 key.pkt_seq = tcph->ack_seq;
1084 key.mark = skb->mark;
1085 hash = pkt_key_hash(dev_net(skb->dev), &key);
1087 stt_percpu = per_cpu_ptr(stt_percpu_data, smp_processor_id());
1089 spin_lock(&stt_percpu->lock);
1091 if (unlikely(stt_percpu->frag_mem_used + skb->truesize > REASM_HI_THRESH))
1092 evict_frags(stt_percpu);
1094 frag = lookup_frag(dev_net(skb->dev), stt_percpu, &key, hash);
1098 frag->timestamp = jiffies;
1099 FRAG_CB(skb)->first.last_skb = skb;
1100 FRAG_CB(skb)->first.mem_used = skb->truesize;
1101 FRAG_CB(skb)->first.tot_len = tot_len;
1102 FRAG_CB(skb)->first.rcvd_len = skb->len;
1103 FRAG_CB(skb)->first.set_ecn_ce = false;
1104 list_add_tail(&frag->lru_node, &stt_percpu->frag_lru);
1105 stt_percpu->frag_mem_used += skb->truesize;
1111 /* Optimize for the common case where fragments are received in-order
1112 * and not overlapping.
1114 last_skb = FRAG_CB(frag->skbs)->first.last_skb;
1115 if (likely(FRAG_CB(last_skb)->offset + last_skb->len ==
1116 FRAG_CB(skb)->offset)) {
1117 last_skb->next = skb;
1118 FRAG_CB(frag->skbs)->first.last_skb = skb;
1120 struct sk_buff *prev = NULL, *next;
1122 for (next = frag->skbs; next; next = next->next) {
1123 if (FRAG_CB(next)->offset >= FRAG_CB(skb)->offset)
1128 /* Overlapping fragments aren't allowed. We shouldn't start
1129 * before the end of the previous fragment.
1132 FRAG_CB(prev)->offset + prev->len > FRAG_CB(skb)->offset)
1135 /* We also shouldn't end after the beginning of the next
1139 FRAG_CB(skb)->offset + skb->len > FRAG_CB(next)->offset)
1145 FRAG_CB(skb)->first = FRAG_CB(frag->skbs)->first;
1152 FRAG_CB(frag->skbs)->first.last_skb = skb;
1155 FRAG_CB(frag->skbs)->first.set_ecn_ce |= INET_ECN_is_ce(iph->tos);
1156 FRAG_CB(frag->skbs)->first.rcvd_len += skb->len;
1157 FRAG_CB(frag->skbs)->first.mem_used += skb->truesize;
1158 stt_percpu->frag_mem_used += skb->truesize;
1160 if (FRAG_CB(frag->skbs)->first.tot_len ==
1161 FRAG_CB(frag->skbs)->first.rcvd_len) {
1162 struct sk_buff *frag_head = frag->skbs;
1164 frag_head->tstamp = skb->tstamp;
1165 if (FRAG_CB(frag_head)->first.set_ecn_ce)
1166 INET_ECN_set_ce(frag_head);
1168 list_del(&frag->lru_node);
1169 stt_percpu->frag_mem_used -= FRAG_CB(frag_head)->first.mem_used;
1173 list_move_tail(&frag->lru_node, &stt_percpu->frag_lru);
1183 spin_unlock(&stt_percpu->lock);
1192 static bool validate_checksum(struct sk_buff *skb)
1194 struct iphdr *iph = ip_hdr(skb);
1196 if (skb_csum_unnecessary(skb))
1199 if (skb->ip_summed == CHECKSUM_COMPLETE &&
1200 !tcp_v4_check(skb->len, iph->saddr, iph->daddr, skb->csum))
1203 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len,
1206 return __tcp_checksum_complete(skb) == 0;
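/* Illustrative note (assumption, mirroring the code above): when the NIC did
 * not already verify the checksum, skb->csum is seeded with the TCP
 * pseudo-header sum and __tcp_checksum_complete() folds in the TCP header and
 * payload, e.g.:
 *
 *	skb->csum = csum_tcpudp_nofold(saddr, daddr, len, IPPROTO_TCP, 0);
 *	valid = __tcp_checksum_complete(skb) == 0;
 */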
1209 static bool set_offloads(struct sk_buff *skb)
1211 struct stthdr *stth = stt_hdr(skb);
1212 unsigned short gso_type;
1219 __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
1220 ntohs(stth->vlan_tci));
1222 if (!(stth->flags & STT_CSUM_PARTIAL)) {
1223 if (stth->flags & STT_CSUM_VERIFIED)
1224 skb->ip_summed = CHECKSUM_UNNECESSARY;
1226 skb->ip_summed = CHECKSUM_NONE;
1228 return clear_gso(skb) == 0;
1231 proto_type = stth->flags & STT_PROTO_TYPES;
1233 switch (proto_type) {
1234 case (STT_PROTO_IPV4 | STT_PROTO_TCP):
1236 csum_offset = offsetof(struct tcphdr, check);
1237 gso_type = SKB_GSO_TCPV4;
1238 l3_header_size = sizeof(struct iphdr);
1239 l4_header_size = sizeof(struct tcphdr);
1240 skb->protocol = htons(ETH_P_IP);
1244 csum_offset = offsetof(struct tcphdr, check);
1245 gso_type = SKB_GSO_TCPV6;
1246 l3_header_size = sizeof(struct ipv6hdr);
1247 l4_header_size = sizeof(struct tcphdr);
1248 skb->protocol = htons(ETH_P_IPV6);
1250 case STT_PROTO_IPV4:
1252 csum_offset = offsetof(struct udphdr, check);
1253 gso_type = SKB_GSO_UDP;
1254 l3_header_size = sizeof(struct iphdr);
1255 l4_header_size = sizeof(struct udphdr);
1256 skb->protocol = htons(ETH_P_IP);
1260 csum_offset = offsetof(struct udphdr, check);
1261 gso_type = SKB_GSO_UDP;
1262 l3_header_size = sizeof(struct ipv6hdr);
1263 l4_header_size = sizeof(struct udphdr);
1264 skb->protocol = htons(ETH_P_IPV6);
1267 if (unlikely(stth->l4_offset < ETH_HLEN + l3_header_size))
1270 if (unlikely(!pskb_may_pull(skb, stth->l4_offset + l4_header_size)))
1273 stth = stt_hdr(skb);
1275 skb->csum_start = skb_headroom(skb) + stth->l4_offset;
1276 skb->csum_offset = csum_offset;
1277 skb->ip_summed = CHECKSUM_PARTIAL;
1280 if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
1283 skb_shinfo(skb)->gso_type = gso_type | SKB_GSO_DODGY;
1284 skb_shinfo(skb)->gso_size = ntohs(stth->mss);
1285 skb_shinfo(skb)->gso_segs = 0;
1287 if (unlikely(clear_gso(skb)))
1294 static void rcv_list(struct net_device *dev, struct sk_buff *skb,
1295 struct metadata_dst *tun_dst)
1297 struct sk_buff *next;
1303 ovs_dst_hold((struct dst_entry *)tun_dst);
1304 ovs_skb_dst_set(next, (struct dst_entry *)tun_dst);
1306 ovs_ip_tunnel_rcv(dev, skb, tun_dst);
1307 } while ((skb = next));
1310 #ifndef HAVE_METADATA_DST
1311 static int __stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
1313 struct metadata_dst tun_dst;
1315 ovs_ip_tun_rx_dst(&tun_dst.u.tun_info, skb, TUNNEL_KEY | TUNNEL_CSUM,
1316 get_unaligned(&stt_hdr(skb)->key), 0);
1317 tun_dst.u.tun_info.key.tp_src = tcp_hdr(skb)->source;
1318 tun_dst.u.tun_info.key.tp_dst = tcp_hdr(skb)->dest;
1320 rcv_list(stt_dev->dev, skb, &tun_dst);
1324 static int __stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
1326 struct metadata_dst *tun_dst;
1330 flags = TUNNEL_KEY | TUNNEL_CSUM;
1331 tun_id = get_unaligned(&stt_hdr(skb)->key);
1332 tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
1335 tun_dst->u.tun_info.key.tp_src = tcp_hdr(skb)->source;
1336 tun_dst->u.tun_info.key.tp_dst = tcp_hdr(skb)->dest;
1338 rcv_list(stt_dev->dev, skb, tun_dst);
1343 static void stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
1347 if (unlikely(!validate_checksum(skb)))
1350 __skb_pull(skb, sizeof(struct tcphdr));
1351 skb = reassemble(skb);
1355 if (skb->next && coalesce_skb(&skb))
1358 err = iptunnel_pull_header(skb,
1359 sizeof(struct stthdr) + STT_ETH_PAD,
1364 if (unlikely(stt_hdr(skb)->version != 0))
1367 if (unlikely(!set_offloads(skb)))
1370 if (skb_shinfo(skb)->frag_list && try_to_segment(skb))
1373 err = __stt_rcv(stt_dev, skb);
1378 /* Consume bad packet */
1379 kfree_skb_list(skb);
1380 stt_dev->dev->stats.rx_errors++;
1383 static void tcp_sock_release(struct socket *sock)
1385 kernel_sock_shutdown(sock, SHUT_RDWR);
1389 static int tcp_sock_create4(struct net *net, __be16 port,
1390 struct socket **sockp)
1392 struct sockaddr_in tcp_addr;
1393 struct socket *sock = NULL;
1396 err = sock_create_kern(net, AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
1400 memset(&tcp_addr, 0, sizeof(tcp_addr));
1401 tcp_addr.sin_family = AF_INET;
1402 tcp_addr.sin_addr.s_addr = htonl(INADDR_ANY);
1403 tcp_addr.sin_port = port;
1404 err = kernel_bind(sock, (struct sockaddr *)&tcp_addr,
1414 tcp_sock_release(sock);
1419 static void schedule_clean_percpu(void)
1421 schedule_delayed_work(&clean_percpu_wq, CLEAN_PERCPU_INTERVAL);
1424 static void clean_percpu(struct work_struct *work)
1428 for_each_possible_cpu(i) {
1429 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1432 for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
1433 struct pkt_frag *frag;
1435 frag = flex_array_get(stt_percpu->frag_hash, j);
1437 time_before(jiffies, frag->timestamp + FRAG_EXP_TIME))
1440 spin_lock_bh(&stt_percpu->lock);
1443 time_after(jiffies, frag->timestamp + FRAG_EXP_TIME))
1444 free_frag(stt_percpu, frag);
1446 spin_unlock_bh(&stt_percpu->lock);
1449 schedule_clean_percpu();
1452 #ifdef HAVE_NF_HOOKFN_ARG_OPS
1453 #define FIRST_PARAM const struct nf_hook_ops *ops
1455 #define FIRST_PARAM unsigned int hooknum
1458 #ifdef HAVE_NF_HOOK_STATE
1459 #if RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,0)
1460 /* RHEL nfhook hacks. */
1461 #ifndef __GENKSYMS__
1462 #define LAST_PARAM const struct net_device *in, const struct net_device *out, \
1463 const struct nf_hook_state *state
1465 #define LAST_PARAM const struct net_device *in, const struct net_device *out, \
1466 int (*okfn)(struct sk_buff *)
1469 #define LAST_PARAM const struct nf_hook_state *state
1472 #define LAST_PARAM const struct net_device *in, const struct net_device *out, \
1473 int (*okfn)(struct sk_buff *)
1476 static unsigned int nf_ip_hook(FIRST_PARAM, struct sk_buff *skb, LAST_PARAM)
1478 struct stt_dev *stt_dev;
1481 if (ip_hdr(skb)->protocol != IPPROTO_TCP)
1484 ip_hdr_len = ip_hdrlen(skb);
1485 if (unlikely(!pskb_may_pull(skb, ip_hdr_len + sizeof(struct tcphdr))))
1488 skb_set_transport_header(skb, ip_hdr_len);
1490 stt_dev = stt_find_up_dev(dev_net(skb->dev), tcp_hdr(skb)->dest);
1494 __skb_pull(skb, ip_hdr_len);
1495 stt_rcv(stt_dev, skb);
1499 static struct nf_hook_ops nf_hook_ops __read_mostly = {
1501 .owner = THIS_MODULE,
1503 .hooknum = NF_INET_LOCAL_IN,
1504 .priority = INT_MAX,
1507 static int stt_start(struct net *net)
1509 struct stt_net *sn = net_generic(net, stt_net_id);
1517 get_random_bytes(&frag_hash_seed, sizeof(u32));
1519 stt_percpu_data = alloc_percpu(struct stt_percpu);
1520 if (!stt_percpu_data) {
1525 for_each_possible_cpu(i) {
1526 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1527 struct flex_array *frag_hash;
1529 spin_lock_init(&stt_percpu->lock);
1530 INIT_LIST_HEAD(&stt_percpu->frag_lru);
1531 get_random_bytes(&per_cpu(pkt_seq_counter, i), sizeof(u32));
1533 frag_hash = flex_array_alloc(sizeof(struct pkt_frag),
1535 GFP_KERNEL | __GFP_ZERO);
1540 stt_percpu->frag_hash = frag_hash;
1542 err = flex_array_prealloc(stt_percpu->frag_hash, 0,
1544 GFP_KERNEL | __GFP_ZERO);
1548 schedule_clean_percpu();
1551 if (sn->n_tunnels) {
1555 #ifdef HAVE_NF_REGISTER_NET_HOOK
1556 /* On kernels that support per-net nf-hooks, nf_register_hook() takes
1557 * the rtnl lock, which results in a deadlock in STT device creation. Therefore
1561 if (sn->nf_hook_reg_done)
1564 err = nf_register_net_hook(net, &nf_hook_ops);
1566 sn->nf_hook_reg_done = true;
1568 /* Register the STT netfilter hook only on the very first STT device addition. */
1569 if (!list_empty(&nf_hook_ops.list))
1572 err = nf_register_hook(&nf_hook_ops);
1583 for_each_possible_cpu(i) {
1584 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1586 if (stt_percpu->frag_hash)
1587 flex_array_free(stt_percpu->frag_hash);
1590 free_percpu(stt_percpu_data);
1596 static void stt_cleanup(struct net *net)
1598 struct stt_net *sn = net_generic(net, stt_net_id);
1609 cancel_delayed_work_sync(&clean_percpu_wq);
1610 for_each_possible_cpu(i) {
1611 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1614 for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
1615 struct pkt_frag *frag;
1617 frag = flex_array_get(stt_percpu->frag_hash, j);
1618 kfree_skb_list(frag->skbs);
1621 flex_array_free(stt_percpu->frag_hash);
1624 free_percpu(stt_percpu_data);
1627 static netdev_tx_t stt_dev_xmit(struct sk_buff *skb, struct net_device *dev)
1629 #ifdef HAVE_METADATA_DST
1630 return ovs_stt_xmit(skb);
1632 /* Drop all packets coming from the networking stack; the OVS CB is
1633 * not initialized for these packets.
1636 dev->stats.tx_dropped++;
1637 return NETDEV_TX_OK;
1641 /* Set up stats when the device is created */
1642 static int stt_init(struct net_device *dev)
1644 dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1651 static void stt_uninit(struct net_device *dev)
1653 free_percpu(dev->tstats);
1656 static int stt_open(struct net_device *dev)
1658 struct stt_dev *stt = netdev_priv(dev);
1659 struct net *net = stt->net;
1660 struct stt_net *sn = net_generic(net, stt_net_id);
1663 err = stt_start(net);
1667 err = tcp_sock_create4(net, stt->dst_port, &stt->sock);
1670 list_add_rcu(&stt->up_next, &sn->stt_up_list);
1674 static int stt_stop(struct net_device *dev)
1676 struct stt_dev *stt_dev = netdev_priv(dev);
1677 struct net *net = stt_dev->net;
1679 list_del_rcu(&stt_dev->up_next);
1681 tcp_sock_release(stt_dev->sock);
1682 stt_dev->sock = NULL;
1687 static int __stt_change_mtu(struct net_device *dev, int new_mtu, bool strict)
1689 int max_mtu = IP_MAX_MTU - STT_HEADER_LEN - sizeof(struct iphdr)
1690 - dev->hard_header_len;
1695 if (new_mtu > max_mtu) {
1706 static int stt_change_mtu(struct net_device *dev, int new_mtu)
1708 return __stt_change_mtu(dev, new_mtu, true);
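/* Worked example (illustrative): the largest MTU accepted above is
 *	IP_MAX_MTU (65535) - STT_HEADER_LEN - sizeof(struct iphdr)
 *		- dev->hard_header_len,
 * i.e. the inner frame plus the STT, TCP and IP encapsulation headers must
 * still fit within a single 16-bit IP total length.
 */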
1711 static const struct net_device_ops stt_netdev_ops = {
1712 .ndo_init = stt_init,
1713 .ndo_uninit = stt_uninit,
1714 .ndo_open = stt_open,
1715 .ndo_stop = stt_stop,
1716 .ndo_start_xmit = stt_dev_xmit,
1717 .ndo_get_stats64 = ip_tunnel_get_stats64,
1718 .ndo_change_mtu = stt_change_mtu,
1719 .ndo_validate_addr = eth_validate_addr,
1720 .ndo_set_mac_address = eth_mac_addr,
1723 static void stt_get_drvinfo(struct net_device *dev,
1724 struct ethtool_drvinfo *drvinfo)
1726 strlcpy(drvinfo->version, STT_NETDEV_VER, sizeof(drvinfo->version));
1727 strlcpy(drvinfo->driver, "stt", sizeof(drvinfo->driver));
1730 static const struct ethtool_ops stt_ethtool_ops = {
1731 .get_drvinfo = stt_get_drvinfo,
1732 .get_link = ethtool_op_get_link,
1735 /* Info for udev: this is a virtual tunnel endpoint */
1736 static struct device_type stt_type = {
1740 /* Initialize the device structure. */
1741 static void stt_setup(struct net_device *dev)
1745 dev->netdev_ops = &stt_netdev_ops;
1746 dev->ethtool_ops = &stt_ethtool_ops;
1747 dev->destructor = free_netdev;
1749 SET_NETDEV_DEVTYPE(dev, &stt_type);
1751 dev->features |= NETIF_F_LLTX | NETIF_F_NETNS_LOCAL;
1752 dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM;
1753 dev->features |= NETIF_F_RXCSUM;
1754 dev->features |= NETIF_F_GSO_SOFTWARE;
1756 dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
1757 dev->hw_features |= NETIF_F_GSO_SOFTWARE;
1759 #ifdef HAVE_METADATA_DST
1760 netif_keep_dst(dev);
1762 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
1763 eth_hw_addr_random(dev);
1766 static const struct nla_policy stt_policy[IFLA_STT_MAX + 1] = {
1767 [IFLA_STT_PORT] = { .type = NLA_U16 },
1770 static int stt_validate(struct nlattr *tb[], struct nlattr *data[])
1772 if (tb[IFLA_ADDRESS]) {
1773 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1776 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1777 return -EADDRNOTAVAIL;
1783 static struct stt_dev *find_dev(struct net *net, __be16 dst_port)
1785 struct stt_net *sn = net_generic(net, stt_net_id);
1786 struct stt_dev *dev;
1788 list_for_each_entry(dev, &sn->stt_list, next) {
1789 if (dev->dst_port == dst_port)
1795 static int stt_configure(struct net *net, struct net_device *dev,
1798 struct stt_net *sn = net_generic(net, stt_net_id);
1799 struct stt_dev *stt = netdev_priv(dev);
1805 stt->dst_port = dst_port;
1807 if (find_dev(net, dst_port))
1810 err = __stt_change_mtu(dev, IP_MAX_MTU, false);
1814 err = register_netdevice(dev);
1818 list_add(&stt->next, &sn->stt_list);
1822 static int stt_newlink(struct net *net, struct net_device *dev,
1823 struct nlattr *tb[], struct nlattr *data[])
1825 __be16 dst_port = htons(STT_DST_PORT);
1827 if (data[IFLA_STT_PORT])
1828 dst_port = nla_get_be16(data[IFLA_STT_PORT]);
1830 return stt_configure(net, dev, dst_port);
1833 static void stt_dellink(struct net_device *dev, struct list_head *head)
1835 struct stt_dev *stt = netdev_priv(dev);
1837 list_del(&stt->next);
1838 unregister_netdevice_queue(dev, head);
1841 static size_t stt_get_size(const struct net_device *dev)
1843 return nla_total_size(sizeof(__be32)); /* IFLA_STT_PORT */
1846 static int stt_fill_info(struct sk_buff *skb, const struct net_device *dev)
1848 struct stt_dev *stt = netdev_priv(dev);
1850 if (nla_put_be16(skb, IFLA_STT_PORT, stt->dst_port))
1851 goto nla_put_failure;
1859 static struct rtnl_link_ops stt_link_ops __read_mostly = {
1861 .maxtype = IFLA_STT_MAX,
1862 .policy = stt_policy,
1863 .priv_size = sizeof(struct stt_dev),
1865 .validate = stt_validate,
1866 .newlink = stt_newlink,
1867 .dellink = stt_dellink,
1868 .get_size = stt_get_size,
1869 .fill_info = stt_fill_info,
1872 struct net_device *ovs_stt_dev_create_fb(struct net *net, const char *name,
1873 u8 name_assign_type, u16 dst_port)
1875 struct nlattr *tb[IFLA_MAX + 1];
1876 struct net_device *dev;
1879 memset(tb, 0, sizeof(tb));
1880 dev = rtnl_create_link(net, (char *) name, name_assign_type,
1885 err = stt_configure(net, dev, htons(dst_port));
1888 return ERR_PTR(err);
1892 EXPORT_SYMBOL_GPL(ovs_stt_dev_create_fb);
1894 static int stt_init_net(struct net *net)
1896 struct stt_net *sn = net_generic(net, stt_net_id);
1898 INIT_LIST_HEAD(&sn->stt_list);
1899 INIT_LIST_HEAD(&sn->stt_up_list);
1900 #ifdef HAVE_NF_REGISTER_NET_HOOK
1901 sn->nf_hook_reg_done = false;
1906 static void stt_exit_net(struct net *net)
1908 struct stt_net *sn = net_generic(net, stt_net_id);
1909 struct stt_dev *stt, *next;
1910 struct net_device *dev, *aux;
1913 #ifdef HAVE_NF_REGISTER_NET_HOOK
1914 /* Ideally this should be done from stt_stop(), but on some kernels
1915 * the nf unregister operation needs the RTNL lock, which can cause a
1916 * deadlock, so it is done from here instead. */
1917 if (sn->nf_hook_reg_done)
1918 nf_unregister_net_hook(net, &nf_hook_ops);
1923 /* gather any stt devices that were moved into this ns */
1924 for_each_netdev_safe(net, dev, aux)
1925 if (dev->rtnl_link_ops == &stt_link_ops)
1926 unregister_netdevice_queue(dev, &list);
1928 list_for_each_entry_safe(stt, next, &sn->stt_list, next) {
1929 /* If stt->dev is in the same netns, it was already added
1930 * to the list by the previous loop.
1932 if (!net_eq(dev_net(stt->dev), net))
1933 unregister_netdevice_queue(stt->dev, &list);
1936 /* unregister the devices gathered above */
1937 unregister_netdevice_many(&list);
1941 static struct pernet_operations stt_net_ops = {
1942 .init = stt_init_net,
1943 .exit = stt_exit_net,
1945 .size = sizeof(struct stt_net),
1948 int stt_init_module(void)
1952 rc = register_pernet_subsys(&stt_net_ops);
1956 rc = rtnl_link_register(&stt_link_ops);
1960 INIT_LIST_HEAD(&nf_hook_ops.list);
1961 pr_info("STT tunneling driver\n");
1964 unregister_pernet_subsys(&stt_net_ops);
1969 void stt_cleanup_module(void)
1971 #ifndef HAVE_NF_REGISTER_NET_HOOK
1972 if (!list_empty(&nf_hook_ops.list))
1973 nf_unregister_hook(&nf_hook_ops);
1975 rtnl_link_unregister(&stt_link_ops);
1976 unregister_pernet_subsys(&stt_net_ops);