+/*
+ * Stateless TCP Tunnel (STT) vport.
+ *
+ * Copyright (c) 2015 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/unaligned.h>
+
+#include <linux/delay.h>
+#include <linux/flex_array.h>
+#include <linux/if.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/jhash.h>
+#include <linux/list.h>
+#include <linux/log2.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/percpu.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+#include <linux/workqueue.h>
+
+#include <net/icmp.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/sock.h>
+#include <net/stt.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+#include "gso.h"
+
+#ifdef OVS_STT
+#define STT_VER 0
+
+#define STT_CSUM_VERIFIED BIT(0)
+#define STT_CSUM_PARTIAL BIT(1)
+#define STT_PROTO_IPV4 BIT(2)
+#define STT_PROTO_TCP BIT(3)
+#define STT_PROTO_TYPES (STT_PROTO_IPV4 | STT_PROTO_TCP)
+
+#define SUPPORTED_GSO_TYPES (SKB_GSO_TCPV4 | SKB_GSO_UDP | SKB_GSO_DODGY | \
+ SKB_GSO_TCPV6)
+
+/* The length and offset of a fragment are encoded in the sequence number.
+ * STT_SEQ_LEN_SHIFT is the left shift needed to store the length.
+ * STT_SEQ_OFFSET_MASK is the mask to extract the offset.
+ */
+#define STT_SEQ_LEN_SHIFT 16
+#define STT_SEQ_OFFSET_MASK (BIT(STT_SEQ_LEN_SHIFT) - 1)
+
+/* The maximum amount of memory used to store packets waiting to be reassembled
+ * on a given CPU. Once this threshold is exceeded we will begin freeing the
+ * least recently used fragments.
+ */
+#define REASM_HI_THRESH (4 * 1024 * 1024)
+/* The target for the high memory evictor. Once we have exceeded
+ * REASM_HI_THRESH, we will continue freeing fragments until we hit
+ * this limit.
+ */
+#define REASM_LO_THRESH (3 * 1024 * 1024)
+/* The length of time a given packet has to be reassembled from the time the
+ * first fragment arrives. Once this limit is exceeded it becomes available
+ * for cleaning.
+ */
+#define FRAG_EXP_TIME (30 * HZ)
+/* Number of hash entries. Each entry has only a single slot to hold a packet
+ * so if there are collisions, we will drop packets. This is allocated
+ * per-cpu and each entry consists of struct pkt_frag.
+ */
+#define FRAG_HASH_SHIFT 8
+#define FRAG_HASH_ENTRIES BIT(FRAG_HASH_SHIFT)
+#define FRAG_HASH_SEGS ((sizeof(u32) * 8) / FRAG_HASH_SHIFT)
+
+#define CLEAN_PERCPU_INTERVAL (30 * HZ)
+
+/* Identity of a partially reassembled packet: outer IP addresses, the
+ * per-packet sequence value carried in the outer TCP ack field, and the
+ * skb mark.
+ */
+struct pkt_key {
+	__be32 saddr;
+	__be32 daddr;
+	__be32 pkt_seq;
+	u32 mark;
+};
+
+/* One reassembly hash slot: the fragment chain for a single packet plus
+ * LRU and expiry bookkeeping.
+ */
+struct pkt_frag {
+	struct sk_buff *skbs;		/* Fragment chain; NULL when the slot is free. */
+	unsigned long timestamp;	/* Arrival time of the first fragment (jiffies). */
+	struct list_head lru_node;
+	struct pkt_key key;
+};
+
+/* Per-CPU reassembly state. */
+struct stt_percpu {
+	struct flex_array *frag_hash;	/* FRAG_HASH_ENTRIES of struct pkt_frag. */
+	struct list_head frag_lru;	/* Oldest entries first. */
+	unsigned int frag_mem_used;
+
+	/* Protect frags table. */
+	spinlock_t lock;
+};
+
+/* Reassembly progress, tracked only on the first skb of a chain. */
+struct first_frag {
+	struct sk_buff *last_skb;	/* Tail of the chain, for fast in-order append. */
+	unsigned int mem_used;
+	u16 tot_len;			/* Expected total length of the full packet. */
+	u16 rcvd_len;			/* Bytes received so far. */
+	bool set_ecn_ce;		/* Any fragment arrived with ECN CE set. */
+};
+
+/* Per-fragment control block state, stored in skb->cb. */
+struct frag_skb_cb {
+	u16 offset;
+
+	/* Only valid for the first skb in the chain. */
+	struct first_frag first;
+};
+
+#define FRAG_CB(skb) ((struct frag_skb_cb *)(skb)->cb)
+
+/* per-network namespace private data for this module */
+struct stt_net {
+	struct list_head sock_list;
+};
+
+static int stt_net_id;
+
+static struct stt_percpu __percpu *stt_percpu_data __read_mostly;
+static u32 frag_hash_seed __read_mostly;	/* Randomized when the first tunnel is created. */
+
+/* Protects sock-hash and refcounts. */
+static DEFINE_MUTEX(stt_mutex);
+
+/* Number of live STT tunnels; module-wide state exists only while > 0. */
+static int n_tunnels;
+static DEFINE_PER_CPU(u32, pkt_seq_counter);
+
+static void clean_percpu(struct work_struct *work);
+static DECLARE_DELAYED_WORK(clean_percpu_wq, clean_percpu);
+
+/* Walk the per-namespace socket list under RCU and return the STT
+ * socket bound to the given source port, or NULL if none matches.
+ */
+static struct stt_sock *stt_find_sock(struct net *net, __be16 port)
+{
+	struct stt_net *sn = net_generic(net, stt_net_id);
+	struct stt_sock *cur;
+
+	list_for_each_entry_rcu(cur, &sn->sock_list, list) {
+		if (inet_sk(cur->sock->sk)->inet_sport != port)
+			continue;
+		return cur;
+	}
+	return NULL;
+}
+
+/* Generate a node-unique per-packet id for the outer TCP ack field:
+ * a per-CPU counter in the high bits and the CPU id in the low bits,
+ * so concurrent CPUs can never produce the same value.
+ */
+static __be32 ack_seq(void)
+{
+#if NR_CPUS <= 65536
+	u32 counter;
+	u32 ack;
+
+	counter = this_cpu_read(pkt_seq_counter);
+	this_cpu_inc(pkt_seq_counter);
+	ack = (counter << ilog2(NR_CPUS)) | smp_processor_id();
+
+	return (__force __be32)ack;
+#else
+#error "Support for greater than 64k CPUs not implemented"
+#endif
+}
+
+/* Reset the skb's GSO state, taking a private copy of the shared info
+ * first if necessary.  Returns 0 on success or a negative errno.
+ */
+static int clear_gso(struct sk_buff *skb)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	int err;
+
+	/* Fast path: nothing to clear. */
+	if (!shinfo->gso_type && !shinfo->gso_size && !shinfo->gso_segs)
+		return 0;
+
+	/* The shared info may belong to a clone; unshare before writing. */
+	err = skb_unclone(skb, GFP_ATOMIC);
+	if (unlikely(err))
+		return err;
+
+	shinfo = skb_shinfo(skb);
+	shinfo->gso_type = 0;
+	shinfo->gso_size = 0;
+	shinfo->gso_segs = 0;
+	return 0;
+}
+
+/* Recursively flatten the frag_lists hanging off every skb in the list
+ * at *skbp into a single list chained through skb->next.  "head" is the
+ * skb whose frag_list the current list came from (NULL at top level);
+ * its length/truesize accounting is reduced as members are detached.
+ * Returns the last skb of the flattened list, or an ERR_PTR on
+ * allocation failure.
+ */
+static struct sk_buff *normalize_frag_list(struct sk_buff *head,
+					   struct sk_buff **skbp)
+{
+	struct sk_buff *skb = *skbp;
+	struct sk_buff *last;
+
+	do {
+		struct sk_buff *frags;
+
+		if (skb_shared(skb)) {
+			/* We rewrite ->next below, so a shared skb must be
+			 * replaced by a private clone first.
+			 */
+			struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+			if (unlikely(!nskb))
+				return ERR_PTR(-ENOMEM);
+
+			nskb->next = skb->next;
+			consume_skb(skb);
+			skb = nskb;
+			*skbp = skb;
+		}
+
+		if (head) {
+			/* This member is leaving head's frag_list; remove
+			 * its contribution from head's accounting.
+			 */
+			head->len -= skb->len;
+			head->data_len -= skb->len;
+			head->truesize -= skb->truesize;
+		}
+
+		frags = skb_shinfo(skb)->frag_list;
+		if (frags) {
+			int err;
+
+			err = skb_unclone(skb, GFP_ATOMIC);
+			if (unlikely(err))
+				return ERR_PTR(err);
+
+			last = normalize_frag_list(skb, &frags);
+			if (IS_ERR(last))
+				return last;
+
+			/* Splice the flattened child list in place of this
+			 * skb's frag_list, directly after the skb itself.
+			 */
+			skb_shinfo(skb)->frag_list = NULL;
+			last->next = skb->next;
+			skb->next = frags;
+		} else {
+			last = skb;
+		}
+
+		skbp = &skb->next;
+	} while ((skb = skb->next));
+
+	return last;
+}
+
+/* Takes a linked list of skbs, which potentially contain frag_list
+ * (whose members in turn potentially contain frag_lists, etc.) and
+ * converts them into a single linear linked list.
+ */
+static int straighten_frag_list(struct sk_buff **skbp)
+{
+	struct sk_buff *last;
+
+	last = normalize_frag_list(NULL, skbp);
+	return IS_ERR(last) ? PTR_ERR(last) : 0;
+}
+
+/* Propagate per-packet metadata (protocol, priority, mark, timestamps,
+ * VLAN state, security mark) from one skb to another.
+ */
+static void copy_skb_metadata(struct sk_buff *to, struct sk_buff *from)
+{
+	to->mark = from->mark;
+	to->priority = from->priority;
+	to->protocol = from->protocol;
+	to->tstamp = from->tstamp;
+	to->vlan_tci = from->vlan_tci;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
+	to->vlan_proto = from->vlan_proto;
+#endif
+	skb_copy_secmark(to, from);
+}
+
+/* Fix up the inner IP and TCP headers of one software-produced segment:
+ * rewrite the IP length field, patch the TCP sequence number and flags,
+ * and incrementally adjust the TCP checksum for the length change.
+ * "head" is true only for the first segment, whose sequence number is
+ * already correct.
+ */
+static void update_headers(struct sk_buff *skb, bool head,
+			   unsigned int l4_offset, unsigned int hdr_len,
+			   bool ipv4, u32 tcp_seq)
+{
+	u16 old_len, new_len;
+	__be32 delta;
+	struct tcphdr *tcph;
+	int gso_size;
+
+	if (ipv4) {
+		struct iphdr *iph = (struct iphdr *)(skb->data + ETH_HLEN);
+
+		old_len = ntohs(iph->tot_len);
+		new_len = skb->len - ETH_HLEN;
+		iph->tot_len = htons(new_len);
+
+		ip_send_check(iph);
+	} else {
+		struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + ETH_HLEN);
+
+		old_len = ntohs(ip6h->payload_len);
+		new_len = skb->len - ETH_HLEN - sizeof(struct ipv6hdr);
+		ip6h->payload_len = htons(new_len);
+	}
+
+	tcph = (struct tcphdr *)(skb->data + l4_offset);
+	if (!head) {
+		/* Non-first segments get their recomputed sequence number
+		 * and must not repeat the CWR flag.
+		 */
+		tcph->seq = htonl(tcp_seq);
+		tcph->cwr = 0;
+	}
+
+	if (skb->next) {
+		/* FIN/PSH belong only on the last segment of the chain. */
+		tcph->fin = 0;
+		tcph->psh = 0;
+	}
+
+	/* Incremental checksum adjustment for the changed length field. */
+	delta = htonl(~old_len + new_len);
+	tcph->check = ~csum_fold((__force __wsum)((__force u32)tcph->check +
+				 (__force u32)delta));
+
+	/* Drop GSO state once a segment no longer needs further splitting. */
+	gso_size = skb_shinfo(skb)->gso_size;
+	if (gso_size && skb->len - hdr_len <= gso_size)
+		BUG_ON(clear_gso(skb));
+}
+
+/* Decide whether a frag-listed skb can be segmented in software; when
+ * this returns false the caller must linearize instead.
+ */
+static bool can_segment(struct sk_buff *head, bool ipv4, bool tcp, bool csum_partial)
+{
+	/* If no offloading is in use then we don't have enough information
+	 * to process the headers.
+	 */
+	if (!csum_partial)
+		return false;
+
+	/* Handling UDP packets requires IP fragmentation, which means that
+	 * the L4 checksum can no longer be calculated by hardware (since the
+	 * fragments are in different packets.  If we have to compute the
+	 * checksum it's faster just to linearize and large UDP packets are
+	 * pretty uncommon anyways, so it's not worth dealing with for now.
+	 */
+	if (!tcp)
+		return false;
+
+	if (ipv4) {
+		struct iphdr *iph = (struct iphdr *)(head->data + ETH_HLEN);
+
+		/* It's difficult to get the IP IDs exactly right here due to
+		 * varying segment sizes and potentially multiple layers of
+		 * segmentation.  IP ID isn't important when DF is set and DF
+		 * is generally set for TCP packets, so just linearize if it's
+		 * not.
+		 */
+		return (iph->frag_off & htons(IP_DF)) != 0;
+	} else {
+		struct ipv6hdr *ip6h = (struct ipv6hdr *)(head->data + ETH_HLEN);
+
+		/* Jumbograms require more processing to update and we'll
+		 * probably never see them, so just linearize.
+		 */
+		return ip6h->payload_len != 0;
+	}
+}
+
+/* Give fragment "frag" its own copy of "head"'s first hdr_len bytes of
+ * protocol headers, along with head's checksum-offload and GSO state,
+ * so it can be emitted as an independent segment.  Returns 0 or
+ * -ENOMEM.
+ */
+static int copy_headers(struct sk_buff *head, struct sk_buff *frag,
+			int hdr_len)
+{
+	u16 csum_start;
+
+	if (skb_cloned(frag) || skb_headroom(frag) < hdr_len) {
+		int extra_head = hdr_len - skb_headroom(frag);
+
+		/* Need private, sufficiently large headroom before pushing
+		 * the copied headers.
+		 */
+		extra_head = extra_head > 0 ? extra_head : 0;
+		if (unlikely(pskb_expand_head(frag, extra_head, 0,
+					      GFP_ATOMIC)))
+			return -ENOMEM;
+	}
+
+	memcpy(__skb_push(frag, hdr_len), head->data, hdr_len);
+
+	/* Rebase csum_start from head's buffer layout onto frag's. */
+	csum_start = head->csum_start - skb_headroom(head);
+	frag->csum_start = skb_headroom(frag) + csum_start;
+	frag->csum_offset = head->csum_offset;
+	frag->ip_summed = head->ip_summed;
+
+	skb_shinfo(frag)->gso_size = skb_shinfo(head)->gso_size;
+	skb_shinfo(frag)->gso_type = skb_shinfo(head)->gso_type;
+	skb_shinfo(frag)->gso_segs = 0;
+
+	copy_skb_metadata(frag, head);
+	return 0;
+}
+
+/* Convert "head" and its frag_list members into a chain of standalone
+ * TCP segments linked through ->next: each member receives a copy of
+ * head's protocol headers, a recomputed sequence number, and fixed-up
+ * length/checksum fields.  Returns 0 or a negative errno.
+ */
+static int skb_list_segment(struct sk_buff *head, bool ipv4, int l4_offset)
+{
+	struct sk_buff *skb;
+	struct tcphdr *tcph;
+	int seg_len;
+	int hdr_len;
+	int tcp_len;
+	u32 seq;
+
+	if (unlikely(!pskb_may_pull(head, l4_offset + sizeof(*tcph))))
+		return -ENOMEM;
+
+	tcph = (struct tcphdr *)(head->data + l4_offset);
+	tcp_len = tcph->doff * 4;
+	hdr_len = l4_offset + tcp_len;
+
+	if (unlikely((tcp_len < sizeof(struct tcphdr)) ||
+		     (head->len < hdr_len)))
+		return -EINVAL;
+
+	if (unlikely(!pskb_may_pull(head, hdr_len)))
+		return -ENOMEM;
+
+	/* Re-read the header pointer: pskb_may_pull() may move skb->data. */
+	tcph = (struct tcphdr *)(head->data + l4_offset);
+	/* Update header of each segment. */
+	seq = ntohl(tcph->seq);
+	seg_len = skb_pagelen(head) - hdr_len;
+
+	skb = skb_shinfo(head)->frag_list;
+	skb_shinfo(head)->frag_list = NULL;
+	head->next = skb;
+	for (; skb; skb = skb->next) {
+		int err;
+
+		/* Members become independent segments; deduct their
+		 * contribution from head's accounting.
+		 */
+		head->len -= skb->len;
+		head->data_len -= skb->len;
+		head->truesize -= skb->truesize;
+
+		/* Each segment starts where the previous payload ended. */
+		seq += seg_len;
+		seg_len = skb->len;
+		err = copy_headers(head, skb, hdr_len);
+		if (err)
+			return err;
+		update_headers(skb, false, l4_offset, hdr_len, ipv4, seq);
+	}
+	update_headers(head, true, l4_offset, hdr_len, ipv4, 0);
+	return 0;
+}
+
+/* Flatten the skb chain at *headp (see straighten_frag_list()) and merge
+ * adjacent members with skb_try_coalesce() where possible.  Members that
+ * could not be merged are re-attached as head's frag_list.  On success
+ * *headp is a single skb (possibly carrying a frag_list).
+ */
+static int coalesce_skb(struct sk_buff **headp)
+{
+	struct sk_buff *frag, *head, *prev;
+	int err;
+
+	err = straighten_frag_list(headp);
+	if (unlikely(err))
+		return err;
+	head = *headp;
+
+	/* Coalesce frag list. */
+	prev = head;
+	for (frag = head->next; frag; frag = frag->next) {
+		bool headstolen;
+		int delta;
+
+		if (unlikely(skb_unclone(prev, GFP_ATOMIC)))
+			return -ENOMEM;
+
+		if (!skb_try_coalesce(prev, frag, &headstolen, &delta)) {
+			prev = frag;
+			continue;
+		}
+
+		/* frag was absorbed into prev: unlink and release it, then
+		 * continue scanning from prev.
+		 */
+		prev->next = frag->next;
+		frag->len = 0;
+		frag->data_len = 0;
+		frag->truesize -= delta;
+		kfree_skb_partial(frag, headstolen);
+		frag = prev;
+	}
+
+	if (!head->next)
+		return 0;
+
+	/* Fold the remaining members' accounting back into head before
+	 * attaching them as its frag_list.
+	 */
+	for (frag = head->next; frag; frag = frag->next) {
+		head->len += frag->len;
+		head->data_len += frag->len;
+		head->truesize += frag->truesize;
+	}
+
+	skb_shinfo(head)->frag_list = head->next;
+	head->next = NULL;
+	return 0;
+}
+
+/* Segment a frag-listed skb in software when possible, otherwise fall
+ * back to linearizing it.
+ */
+static int __try_to_segment(struct sk_buff *skb, bool csum_partial,
+			    bool ipv4, bool tcp, int l4_offset)
+{
+	if (!can_segment(skb, ipv4, tcp, csum_partial))
+		return skb_linearize(skb);
+
+	return skb_list_segment(skb, ipv4, l4_offset);
+}
+
+/* Segment a received STT frame using the offload hints carried in its
+ * STT header.
+ */
+static int try_to_segment(struct sk_buff *skb)
+{
+	struct stthdr *stth = stt_hdr(skb);
+
+	return __try_to_segment(skb,
+				!!(stth->flags & STT_CSUM_PARTIAL),
+				!!(stth->flags & STT_PROTO_IPV4),
+				!!(stth->flags & STT_PROTO_TCP),
+				stth->l4_offset);
+}
+
+/* Coalesce the skb chain at *headp and, if a frag_list remains, segment
+ * (or linearize) the result so it can be transmitted.
+ */
+static int segment_skb(struct sk_buff **headp, bool csum_partial,
+		       bool ipv4, bool tcp, int l4_offset)
+{
+	int err = coalesce_skb(headp);
+
+	if (err)
+		return err;
+
+	if (!skb_shinfo(*headp)->frag_list)
+		return 0;
+
+	return __try_to_segment(*headp, csum_partial, ipv4, tcp, l4_offset);
+}
+
+/* Build the STT encapsulation for one skb: push the STT header plus a
+ * TCP-like outer header, record the inner packet's offload state in the
+ * STT header, set up outer checksum offload, and arm outer TSO when the
+ * frame exceeds the path MTU.  Returns 0 or a negative errno.
+ */
+static int __push_stt_header(struct sk_buff *skb, __be64 tun_id,
+			     __be16 s_port, __be16 d_port,
+			     __be32 saddr, __be32 dst,
+			     __be16 l3_proto, u8 l4_proto,
+			     int dst_mtu)
+{
+	/* Length of the whole STT frame: payload plus STT header and pad. */
+	int data_len = skb->len + sizeof(struct stthdr) + STT_ETH_PAD;
+	unsigned short encap_mss;
+	struct tcphdr *tcph;
+	struct stthdr *stth;
+
+	skb_push(skb, STT_HEADER_LEN);
+	skb_reset_transport_header(skb);
+	tcph = tcp_hdr(skb);
+	memset(tcph, 0, STT_HEADER_LEN);
+	stth = stt_hdr(skb);
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		/* Preserve the inner checksum-offload state so the receiver
+		 * can restore it (see set_offloads()).
+		 */
+		stth->flags |= STT_CSUM_PARTIAL;
+
+		stth->l4_offset = skb->csum_start -
+					(skb_headroom(skb) +
+					STT_HEADER_LEN);
+
+		if (l3_proto == htons(ETH_P_IP))
+			stth->flags |= STT_PROTO_IPV4;
+
+		if (l4_proto == IPPROTO_TCP)
+			stth->flags |= STT_PROTO_TCP;
+
+		stth->mss = htons(skb_shinfo(skb)->gso_size);
+	} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
+		stth->flags |= STT_CSUM_VERIFIED;
+	}
+
+	stth->vlan_tci = htons(skb->vlan_tci);
+	skb->vlan_tci = 0;
+	put_unaligned(tun_id, &stth->key);
+
+	/* Fake TCP header: the frame length travels in the upper 16 bits of
+	 * the sequence number (fragment offset 0 in the lower bits) and the
+	 * ack field carries the per-packet id used for reassembly.
+	 */
+	tcph->source = s_port;
+	tcph->dest = d_port;
+	tcph->doff = sizeof(struct tcphdr) / 4;
+	tcph->ack = 1;
+	tcph->psh = 1;
+	tcph->window = htons(USHRT_MAX);
+	tcph->seq = htonl(data_len << STT_SEQ_LEN_SHIFT);
+	tcph->ack_seq = ack_seq();
+	tcph->check = ~tcp_v4_check(skb->len, saddr, dst, 0);
+
+	/* Offload the outer TCP checksum. */
+	skb->csum_start = skb_transport_header(skb) - skb->head;
+	skb->csum_offset = offsetof(struct tcphdr, check);
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	encap_mss = dst_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
+	if (data_len > encap_mss) {
+		/* Frame exceeds the path MTU: let TSO/GSO split the outer
+		 * TCP stream into MTU-sized packets.
+		 */
+		if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
+			return -EINVAL;
+
+		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+		skb_shinfo(skb)->gso_size = encap_mss;
+		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(data_len, encap_mss);
+	} else {
+		if (unlikely(clear_gso(skb)))
+			return -EINVAL;
+	}
+	return 0;
+}
+
+/* Segment "head" in software if it carries a frag_list, then push the
+ * STT + outer TCP header onto every resulting skb.  Returns the head of
+ * the encapsulated chain, or NULL on failure (in which case the whole
+ * chain has been freed).
+ */
+static struct sk_buff *push_stt_header(struct sk_buff *head, __be64 tun_id,
+				       __be16 s_port, __be16 d_port,
+				       __be32 saddr, __be32 dst,
+				       __be16 l3_proto, u8 l4_proto,
+				       int dst_mtu)
+{
+	struct sk_buff *skb;
+
+	if (skb_shinfo(head)->frag_list) {
+		bool ipv4 = (l3_proto == htons(ETH_P_IP));
+		bool tcp = (l4_proto == IPPROTO_TCP);
+		bool csum_partial = (head->ip_summed == CHECKSUM_PARTIAL);
+		int l4_offset = skb_transport_offset(head);
+
+		/* Need to call skb_orphan() to report correct truesize.
+		 * Calling skb_orphan() in this layer is odd, but an SKB
+		 * with a frag-list should not be associated with any
+		 * socket, so skb_orphan() should be a no-op. */
+		skb_orphan(head);
+		if (unlikely(segment_skb(&head, csum_partial,
+					 ipv4, tcp, l4_offset)))
+			goto error;
+	}
+
+	for (skb = head; skb; skb = skb->next) {
+		if (__push_stt_header(skb, tun_id, s_port, d_port, saddr, dst,
+				      l3_proto, l4_proto, dst_mtu))
+			goto error;
+	}
+
+	return head;
+error:
+	kfree_skb_list(head);
+	return NULL;
+}
+
+/* Decide whether this packet's offload state can be carried across STT
+ * encapsulation.  Returns 1 if it can, 0 if the caller must resolve
+ * offloads in software first, or a negative errno for malformed
+ * packets.
+ */
+static int stt_can_offload(struct sk_buff *skb, __be16 l3_proto, u8 l4_proto)
+{
+	if (skb_is_gso(skb) && skb->ip_summed != CHECKSUM_PARTIAL) {
+		int csum_offset;
+		__sum16 *csum;
+		int len;
+
+		if (l4_proto == IPPROTO_TCP)
+			csum_offset = offsetof(struct tcphdr, check);
+		else if (l4_proto == IPPROTO_UDP)
+			csum_offset = offsetof(struct udphdr, check);
+		else
+			return 0;
+
+		if (unlikely(!pskb_may_pull(skb, skb_transport_offset(skb) +
+					    csum_offset + sizeof(*csum))))
+			return -EINVAL;
+
+		/* Compute these only after pskb_may_pull(): the pull may
+		 * reallocate the header, which would invalidate any pointer
+		 * into skb->data taken earlier.
+		 */
+		len = skb->len - skb_transport_offset(skb);
+		csum = (__sum16 *)(skb_transport_header(skb) + csum_offset);
+
+		/* Seed the checksum field with the pseudo-header so the
+		 * device (or software fallback) only needs to sum payload.
+		 */
+		if (l3_proto == htons(ETH_P_IP)) {
+			struct iphdr *iph = ip_hdr(skb);
+
+			*csum = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+						   len, l4_proto, 0);
+		} else if (l3_proto == htons(ETH_P_IPV6)) {
+			struct ipv6hdr *ip6h = ipv6_hdr(skb);
+
+			*csum = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+						 len, l4_proto, 0);
+		} else {
+			return 0;
+		}
+		skb->csum_start = skb_transport_header(skb) - skb->head;
+		skb->csum_offset = csum_offset;
+		skb->ip_summed = CHECKSUM_PARTIAL;
+	}
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		/* Assume receiver can only offload TCP/UDP over IPv4/6,
+		 * and require 802.1Q VLANs to be accelerated.
+		 */
+		if (l3_proto != htons(ETH_P_IP) &&
+		    l3_proto != htons(ETH_P_IPV6))
+			return 0;
+
+		if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP)
+			return 0;
+
+		/* L4 offset must fit in a 1-byte field. */
+		if (skb->csum_start - skb_headroom(skb) > 255)
+			return 0;
+
+		if (skb_shinfo(skb)->gso_type & ~SUPPORTED_GSO_TYPES)
+			return 0;
+	}
+	/* Total size of encapsulated packet must fit in 16 bits. */
+	if (skb->len + STT_HEADER_LEN + sizeof(struct iphdr) > 65535)
+		return 0;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
+	if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q))
+		return 0;
+#endif
+	return 1;
+}
+
+/* Report whether the skb must be linearized before computing a software
+ * checksum: any frag_list, or any paged fragment whose page is visible
+ * to someone else (refcount > 1), makes the data unstable.
+ */
+static bool need_linearize(const struct sk_buff *skb)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	int i;
+
+	if (unlikely(shinfo->frag_list))
+		return true;
+
+	/* Generally speaking we should linearize if there are paged frags.
+	 * However, if all of the refcounts are 1 we know nobody else can
+	 * change them from underneath us and we can skip the linearization.
+	 */
+	for (i = 0; i < shinfo->nr_frags; i++) {
+		struct page *page = skb_frag_page(&shinfo->frags[i]);
+
+		if (unlikely(page_count(page) > 1))
+			return true;
+	}
+
+	return false;
+}
+
+/* Resolve offloads that cannot survive STT encapsulation: push
+ * non-802.1Q VLAN tags into the packet data, software-segment GSO skbs
+ * (preserving skb->cb across segmentation), or compute the L4 checksum
+ * in software.  Returns the resulting skb (possibly a ->next chain of
+ * segments) or an ERR_PTR; the input skb is consumed either way.
+ */
+static struct sk_buff *handle_offloads(struct sk_buff *skb, int min_headroom)
+{
+	int err;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
+	if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q)) {
+
+		/* Un-accelerated tag needs VLAN_HLEN extra headroom. */
+		min_headroom += VLAN_HLEN;
+		if (skb_headroom(skb) < min_headroom) {
+			int head_delta = SKB_DATA_ALIGN(min_headroom -
+							skb_headroom(skb) + 16);
+
+			err = pskb_expand_head(skb, max_t(int, head_delta, 0),
+					       0, GFP_ATOMIC);
+			if (unlikely(err))
+				goto error;
+		}
+
+		skb = __vlan_hwaccel_push_inside(skb);
+		if (!skb) {
+			err = -ENOMEM;
+			goto error;
+		}
+	}
+#endif
+
+	if (skb_is_gso(skb)) {
+		struct sk_buff *nskb;
+		char cb[sizeof(skb->cb)];
+
+		/* Save skb->cb and copy it onto every resulting segment. */
+		memcpy(cb, skb->cb, sizeof(cb));
+
+		nskb = __skb_gso_segment(skb, 0, false);
+		if (IS_ERR(nskb)) {
+			err = PTR_ERR(nskb);
+			goto error;
+		}
+
+		consume_skb(skb);
+		skb = nskb;
+		while (nskb) {
+			memcpy(nskb->cb, cb, sizeof(cb));
+			nskb = nskb->next;
+		}
+	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		/* Pages aren't locked and could change at any time.
+		 * If this happens after we compute the checksum, the
+		 * checksum will be wrong.  We linearize now to avoid
+		 * this problem.
+		 */
+		if (unlikely(need_linearize(skb))) {
+			err = __skb_linearize(skb);
+			if (unlikely(err))
+				goto error;
+		}
+
+		err = skb_checksum_help(skb);
+		if (unlikely(err))
+			goto error;
+	}
+	skb->ip_summed = CHECKSUM_NONE;
+
+	return skb;
+error:
+	kfree_skb(skb);
+	return ERR_PTR(err);
+}
+
+/* Transmit each skb of a linked list as its own IP tunnel packet,
+ * taking one extra route reference for every packet after the first
+ * (iptunnel_xmit() consumes a reference per call).  Returns the total
+ * length handed to iptunnel_xmit().
+ */
+static int skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src,
+			 __be32 dst, __u8 tos, __u8 ttl, __be16 df)
+{
+	struct sk_buff *next;
+	int len = 0;
+
+	for (; skb; skb = next) {
+		next = skb->next;
+
+		if (next)
+			dst_clone(&rt->dst);
+
+		skb_clear_ovs_gso_cb(skb);
+		skb->next = NULL;
+		len += iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP,
+				     tos, ttl, df, false);
+	}
+	return len;
+}
+
+/* Return the upper-layer protocol number of an IPv6 packet, skipping
+ * any extension headers; 0 on any parse failure.
+ */
+static u8 parse_ipv6_l4_proto(struct sk_buff *skb)
+{
+	unsigned int nh_ofs = skb_network_offset(skb);
+	struct ipv6hdr *ip6h;
+	uint8_t proto;
+	__be16 frag_off;
+	int ofs;
+
+	if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct ipv6hdr))))
+		return 0;
+
+	ip6h = ipv6_hdr(skb);
+	proto = ip6h->nexthdr;
+	ofs = (u8 *)(ip6h + 1) - skb->data;
+
+	ofs = ipv6_skip_exthdr(skb, ofs, &proto, &frag_off);
+	if (unlikely(ofs < 0))
+		return 0;
+
+	return proto;
+}
+
+/* Return the L4 protocol number of an IPv4 or IPv6 frame, or 0 if it
+ * cannot be determined.
+ */
+static u8 skb_get_l4_proto(struct sk_buff *skb, __be16 l3_proto)
+{
+	if (l3_proto == htons(ETH_P_IPV6))
+		return parse_ipv6_l4_proto(skb);
+
+	if (l3_proto == htons(ETH_P_IP)) {
+		unsigned int nh_ofs = skb_network_offset(skb);
+
+		if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct iphdr))))
+			return 0;
+
+		return ip_hdr(skb)->protocol;
+	}
+
+	return 0;
+}
+
+/* Encapsulate "skb" (possibly a GSO packet) in STT and transmit it via
+ * route "rt".  Consumes the skb and the route reference on all paths.
+ * Returns the number of bytes transmitted, or a negative errno.
+ */
+int rpl_stt_xmit_skb(struct sk_buff *skb, struct rtable *rt,
+		     __be32 src, __be32 dst, __u8 tos,
+		     __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
+		     __be64 tun_id)
+{
+	struct ethhdr *eh = eth_hdr(skb);
+	int ret = 0, min_headroom;
+	__be16 inner_l3_proto;
+	u8 inner_l4_proto;
+
+	inner_l3_proto = eh->h_proto;
+	inner_l4_proto = skb_get_l4_proto(skb, inner_l3_proto);
+
+	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+			+ STT_HEADER_LEN + sizeof(struct iphdr);
+
+	/* Make room for the outer headers up front. */
+	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
+		int head_delta = SKB_DATA_ALIGN(min_headroom -
+						skb_headroom(skb) +
+						16);
+
+		ret = pskb_expand_head(skb, max_t(int, head_delta, 0),
+				       0, GFP_ATOMIC);
+		if (unlikely(ret))
+			goto err_free_rt;
+	}
+
+	/* If the packet's offload state cannot travel inside STT, resolve
+	 * it in software first (may turn skb into a ->next chain).
+	 */
+	ret = stt_can_offload(skb, inner_l3_proto, inner_l4_proto);
+	if (ret < 0)
+		goto err_free_rt;
+	if (!ret) {
+		skb = handle_offloads(skb, min_headroom);
+		if (IS_ERR(skb)) {
+			ret = PTR_ERR(skb);
+			skb = NULL;
+			goto err_free_rt;
+		}
+	}
+
+	ret = 0;
+	while (skb) {
+		struct sk_buff *next_skb = skb->next;
+
+		skb->next = NULL;
+
+		/* Each transmitted chain consumes one route reference. */
+		if (next_skb)
+			dst_clone(&rt->dst);
+
+		/* Push STT and TCP header. */
+		skb = push_stt_header(skb, tun_id, src_port, dst_port, src,
+				      dst, inner_l3_proto, inner_l4_proto,
+				      dst_mtu(&rt->dst));
+		if (unlikely(!skb)) {
+			/* Chain already freed; just drop its route ref. */
+			ip_rt_put(rt);
+			goto next;
+		}
+
+		/* Push IP header. */
+		ret += skb_list_xmit(rt, skb, src, dst, tos, ttl, df);
+
+next:
+		skb = next_skb;
+	}
+
+	return ret;
+
+err_free_rt:
+	ip_rt_put(rt);
+	kfree_skb(skb);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rpl_stt_xmit_skb);
+
+/* Release a reassembly slot: free its fragment chain, return its memory
+ * to the per-CPU budget and unlink it from the LRU.  Caller holds
+ * stt_percpu->lock.
+ */
+static void free_frag(struct stt_percpu *stt_percpu,
+		      struct pkt_frag *frag)
+{
+	stt_percpu->frag_mem_used -= FRAG_CB(frag->skbs)->first.mem_used;
+	kfree_skb_list(frag->skbs);
+	list_del(&frag->lru_node);
+	frag->skbs = NULL;
+}
+
+/* Free least-recently-used reassembly entries until memory use drops
+ * back below REASM_LO_THRESH.  Caller holds stt_percpu->lock.
+ */
+static void evict_frags(struct stt_percpu *stt_percpu)
+{
+	while (stt_percpu->frag_mem_used > REASM_LO_THRESH &&
+	       !list_empty(&stt_percpu->frag_lru)) {
+		struct pkt_frag *victim;
+
+		victim = list_first_entry(&stt_percpu->frag_lru,
+					  struct pkt_frag, lru_node);
+		free_frag(stt_percpu, victim);
+	}
+}
+
+/* True if reassembly entry "a" belongs to the packet identified by key
+ * "b" within namespace "net".
+ */
+static bool pkt_key_match(struct net *net,
+			  const struct pkt_frag *a, const struct pkt_key *b)
+{
+	if (a->key.saddr != b->saddr || a->key.daddr != b->daddr)
+		return false;
+	if (a->key.pkt_seq != b->pkt_seq || a->key.mark != b->mark)
+		return false;
+	return net_eq(dev_net(a->skbs->dev), net);
+}
+
+/* Hash a reassembly key, mixing in the namespace pointer, the skb mark
+ * and the per-boot random seed.
+ */
+static u32 pkt_key_hash(const struct net *net, const struct pkt_key *key)
+{
+	u32 seed = frag_hash_seed ^ (u32)(unsigned long)net ^ key->mark;
+
+	return jhash_3words((__force u32)key->saddr,
+			    (__force u32)key->daddr,
+			    (__force u32)key->pkt_seq, seed);
+}
+
+/* Locate the reassembly slot for "key", probing FRAG_HASH_SEGS buckets
+ * taken from successive FRAG_HASH_SHIFT-bit chunks of "hash".  If no
+ * unexpired match is found, the best victim among the probed slots (an
+ * empty one if possible, otherwise the oldest occupied one) is freed
+ * and returned for reuse.  Caller holds stt_percpu->lock.
+ */
+static struct pkt_frag *lookup_frag(struct net *net,
+				    struct stt_percpu *stt_percpu,
+				    const struct pkt_key *key, u32 hash)
+{
+	struct pkt_frag *frag, *victim_frag = NULL;
+	int i;
+
+	for (i = 0; i < FRAG_HASH_SEGS; i++) {
+		frag = flex_array_get(stt_percpu->frag_hash,
+				      hash & (FRAG_HASH_ENTRIES - 1));
+
+		if (frag->skbs &&
+		    time_before(jiffies, frag->timestamp + FRAG_EXP_TIME) &&
+		    pkt_key_match(net, frag, key))
+			return frag;
+
+		/* Track the best victim: prefer an empty slot, then the
+		 * slot with the oldest fragments.
+		 */
+		if (!victim_frag ||
+		    (victim_frag->skbs &&
+		     (!frag->skbs ||
+		      time_before(frag->timestamp, victim_frag->timestamp))))
+			victim_frag = frag;
+
+		hash >>= FRAG_HASH_SHIFT;
+	}
+
+	if (victim_frag->skbs)
+		free_frag(stt_percpu, victim_frag);
+
+	return victim_frag;
+}
+
+/* Accept one STT fragment.  The outer TCP sequence number encodes the
+ * full packet's total length (upper 16 bits) and this fragment's byte
+ * offset (lower 16 bits); the ack field identifies the packet.  Returns
+ * the complete packet once every byte has arrived, or NULL if the
+ * fragment was queued or dropped.
+ */
+static struct sk_buff *reassemble(struct sk_buff *skb)
+{
+	struct iphdr *iph = ip_hdr(skb);
+	struct tcphdr *tcph = tcp_hdr(skb);
+	u32 seq = ntohl(tcph->seq);
+	struct stt_percpu *stt_percpu;
+	struct sk_buff *last_skb;
+	struct pkt_frag *frag;
+	struct pkt_key key;
+	int tot_len;
+	u32 hash;
+
+	tot_len = seq >> STT_SEQ_LEN_SHIFT;
+	FRAG_CB(skb)->offset = seq & STT_SEQ_OFFSET_MASK;
+
+	if (unlikely(skb->len == 0))
+		goto out_free;
+
+	/* A fragment may not extend past the advertised total length. */
+	if (unlikely(FRAG_CB(skb)->offset + skb->len > tot_len))
+		goto out_free;
+
+	/* Unfragmented packet: nothing to reassemble. */
+	if (tot_len == skb->len)
+		goto out;
+
+	key.saddr = iph->saddr;
+	key.daddr = iph->daddr;
+	key.pkt_seq = tcph->ack_seq;
+	key.mark = skb->mark;
+	hash = pkt_key_hash(dev_net(skb->dev), &key);
+
+	stt_percpu = per_cpu_ptr(stt_percpu_data, smp_processor_id());
+
+	spin_lock(&stt_percpu->lock);
+
+	/* Reclaim LRU entries if this fragment would bust the budget. */
+	if (unlikely(stt_percpu->frag_mem_used + skb->truesize > REASM_HI_THRESH))
+		evict_frags(stt_percpu);
+
+	frag = lookup_frag(dev_net(skb->dev), stt_percpu, &key, hash);
+	if (!frag->skbs) {
+		/* First fragment of this packet: claim the slot. */
+		frag->skbs = skb;
+		frag->key = key;
+		frag->timestamp = jiffies;
+		FRAG_CB(skb)->first.last_skb = skb;
+		FRAG_CB(skb)->first.mem_used = skb->truesize;
+		FRAG_CB(skb)->first.tot_len = tot_len;
+		FRAG_CB(skb)->first.rcvd_len = skb->len;
+		FRAG_CB(skb)->first.set_ecn_ce = false;
+		list_add_tail(&frag->lru_node, &stt_percpu->frag_lru);
+		stt_percpu->frag_mem_used += skb->truesize;
+
+		skb = NULL;
+		goto unlock;
+	}
+
+	/* Optimize for the common case where fragments are received in-order
+	 * and not overlapping.
+	 */
+	last_skb = FRAG_CB(frag->skbs)->first.last_skb;
+	if (likely(FRAG_CB(last_skb)->offset + last_skb->len ==
+		   FRAG_CB(skb)->offset)) {
+		last_skb->next = skb;
+		FRAG_CB(frag->skbs)->first.last_skb = skb;
+	} else {
+		struct sk_buff *prev = NULL, *next;
+
+		/* Out of order: find the insertion point by offset. */
+		for (next = frag->skbs; next; next = next->next) {
+			if (FRAG_CB(next)->offset >= FRAG_CB(skb)->offset)
+				break;
+			prev = next;
+		}
+
+		/* Overlapping fragments aren't allowed.  We shouldn't start
+		 * before the end of the previous fragment.
+		 */
+		if (prev &&
+		    FRAG_CB(prev)->offset + prev->len > FRAG_CB(skb)->offset)
+			goto unlock_free;
+
+		/* We also shouldn't end after the beginning of the next
+		 * fragment.
+		 */
+		if (next &&
+		    FRAG_CB(skb)->offset + skb->len > FRAG_CB(next)->offset)
+			goto unlock_free;
+
+		if (prev) {
+			prev->next = skb;
+		} else {
+			/* New chain head: move the bookkeeping onto it. */
+			FRAG_CB(skb)->first = FRAG_CB(frag->skbs)->first;
+			frag->skbs = skb;
+		}
+
+		if (next)
+			skb->next = next;
+		else
+			FRAG_CB(frag->skbs)->first.last_skb = skb;
+	}
+
+	FRAG_CB(frag->skbs)->first.set_ecn_ce |= INET_ECN_is_ce(iph->tos);
+	FRAG_CB(frag->skbs)->first.rcvd_len += skb->len;
+	FRAG_CB(frag->skbs)->first.mem_used += skb->truesize;
+	stt_percpu->frag_mem_used += skb->truesize;
+
+	if (FRAG_CB(frag->skbs)->first.tot_len ==
+	    FRAG_CB(frag->skbs)->first.rcvd_len) {
+		/* Packet complete: detach the chain and hand it back. */
+		struct sk_buff *frag_head = frag->skbs;
+
+		frag_head->tstamp = skb->tstamp;
+		if (FRAG_CB(frag_head)->first.set_ecn_ce)
+			INET_ECN_set_ce(frag_head);
+
+		list_del(&frag->lru_node);
+		stt_percpu->frag_mem_used -= FRAG_CB(frag_head)->first.mem_used;
+		frag->skbs = NULL;
+		skb = frag_head;
+	} else {
+		/* Still incomplete: refresh this packet's LRU position. */
+		list_move_tail(&frag->lru_node, &stt_percpu->frag_lru);
+		skb = NULL;
+	}
+
+	goto unlock;
+
+unlock_free:
+	kfree_skb(skb);
+	skb = NULL;
+unlock:
+	spin_unlock(&stt_percpu->lock);
+	return skb;
+out_free:
+	kfree_skb(skb);
+	skb = NULL;
+out:
+	return skb;
+}
+
+/* Verify the outer TCP checksum of a received STT frame, using the
+ * hardware verdict when available and falling back to a software
+ * computation otherwise.
+ */
+static bool validate_checksum(struct sk_buff *skb)
+{
+	struct iphdr *iph = ip_hdr(skb);
+
+	if (skb_csum_unnecessary(skb))
+		return true;
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE &&
+	    !tcp_v4_check(skb->len, iph->saddr, iph->daddr, skb->csum))
+		return true;
+
+	/* Software fallback: seed with the pseudo-header and let
+	 * __tcp_checksum_complete() fold in the payload.
+	 */
+	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len,
+				       IPPROTO_TCP, 0);
+
+	return __tcp_checksum_complete(skb) == 0;
+}
+
+/* Translate the receive-side STT header hints back into skb offload
+ * state: restore the accelerated VLAN tag, the checksum mode and — when
+ * the sender recorded an MSS — the GSO parameters of the inner packet.
+ * Returns false if the header is malformed.
+ */
+static bool set_offloads(struct sk_buff *skb)
+{
+	struct stthdr *stth = stt_hdr(skb);
+	unsigned short gso_type;
+	int l3_header_size;
+	int l4_header_size;
+	u16 csum_offset;
+	u8 proto_type;
+
+	if (stth->vlan_tci)
+		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
+				       ntohs(stth->vlan_tci));
+
+	if (!(stth->flags & STT_CSUM_PARTIAL)) {
+		if (stth->flags & STT_CSUM_VERIFIED)
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+		else
+			skb->ip_summed = CHECKSUM_NONE;
+
+		return clear_gso(skb) == 0;
+	}
+
+	/* Sender preserved partial-checksum state; reconstruct it from the
+	 * protocol bits.
+	 */
+	proto_type = stth->flags & STT_PROTO_TYPES;
+
+	switch (proto_type) {
+	case (STT_PROTO_IPV4 | STT_PROTO_TCP):
+		/* TCP/IPv4 */
+		csum_offset = offsetof(struct tcphdr, check);
+		gso_type = SKB_GSO_TCPV4;
+		l3_header_size = sizeof(struct iphdr);
+		l4_header_size = sizeof(struct tcphdr);
+		skb->protocol = htons(ETH_P_IP);
+		break;
+	case STT_PROTO_TCP:
+		/* TCP/IPv6 */
+		csum_offset = offsetof(struct tcphdr, check);
+		gso_type = SKB_GSO_TCPV6;
+		l3_header_size = sizeof(struct ipv6hdr);
+		l4_header_size = sizeof(struct tcphdr);
+		skb->protocol = htons(ETH_P_IPV6);
+		break;
+	case STT_PROTO_IPV4:
+		/* UDP/IPv4 */
+		csum_offset = offsetof(struct udphdr, check);
+		gso_type = SKB_GSO_UDP;
+		l3_header_size = sizeof(struct iphdr);
+		l4_header_size = sizeof(struct udphdr);
+		skb->protocol = htons(ETH_P_IP);
+		break;
+	default:
+		/* UDP/IPv6 */
+		csum_offset = offsetof(struct udphdr, check);
+		gso_type = SKB_GSO_UDP;
+		l3_header_size = sizeof(struct ipv6hdr);
+		l4_header_size = sizeof(struct udphdr);
+		skb->protocol = htons(ETH_P_IPV6);
+	}
+
+	/* Sanity-check the recorded L4 offset against the minimum headers. */
+	if (unlikely(stth->l4_offset < ETH_HLEN + l3_header_size))
+		return false;
+
+	if (unlikely(!pskb_may_pull(skb, stth->l4_offset + l4_header_size)))
+		return false;
+
+	/* Re-read: pskb_may_pull() may have moved the header. */
+	stth = stt_hdr(skb);
+
+	skb->csum_start = skb_headroom(skb) + stth->l4_offset;
+	skb->csum_offset = csum_offset;
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	if (stth->mss) {
+		if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
+			return false;
+
+		/* DODGY: the GSO metadata came off the wire and must be
+		 * re-validated by the stack before it is trusted.
+		 */
+		skb_shinfo(skb)->gso_type = gso_type | SKB_GSO_DODGY;
+		skb_shinfo(skb)->gso_size = ntohs(stth->mss);
+		skb_shinfo(skb)->gso_segs = 0;
+	} else {
+		if (unlikely(clear_gso(skb)))
+			return false;
+	}
+
+	return true;
+}
+/* Receive one STT frame: verify the outer checksum, reassemble
+ * fragments, strip the encapsulation, restore the inner offload state
+ * and deliver the inner packet via the socket's rcv callback.
+ */
+static void stt_rcv(struct stt_sock *stt_sock, struct sk_buff *skb)
+{
+	int err;
+
+	if (unlikely(!validate_checksum(skb)))
+		goto drop;
+
+	/* NULL means the fragment was queued (or dropped) internally. */
+	skb = reassemble(skb);
+	if (!skb)
+		return;
+
+	if (skb->next && coalesce_skb(&skb))
+		goto drop;
+
+	err = iptunnel_pull_header(skb,
+				   sizeof(struct stthdr) + STT_ETH_PAD,
+				   htons(ETH_P_TEB));
+	if (unlikely(err))
+		goto drop;
+
+	if (unlikely(stt_hdr(skb)->version != 0))
+		goto drop;
+
+	if (unlikely(!set_offloads(skb)))
+		goto drop;
+
+	if (skb_shinfo(skb)->frag_list && try_to_segment(skb))
+		goto drop;
+
+	stt_sock->rcv(stt_sock, skb);
+	return;
+drop:
+	/* Consume bad packet */
+	kfree_skb_list(skb);
+}
+
+/* Shut down and destroy a kernel TCP socket created by
+ * tcp_sock_create4().
+ */
+static void tcp_sock_release(struct socket *sock)
+{
+	kernel_sock_shutdown(sock, SHUT_RDWR);
+	sk_release_kernel(sock->sk);
+}
+
+/* Create a kernel TCP socket in namespace "net" bound to "port" on all
+ * addresses.  No listen() is issued — inbound STT traffic is stolen by
+ * nf_ip_hook() before the TCP stack sees it, so the socket appears to
+ * exist only to reserve the port.  Returns 0 with *sockp set, or a
+ * negative errno with *sockp = NULL.
+ */
+static int tcp_sock_create4(struct net *net, __be16 port,
+			    struct socket **sockp)
+{
+	struct sockaddr_in tcp_addr;
+	struct socket *sock = NULL;
+	int err;
+
+	err = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (err < 0)
+		goto error;
+
+	/* Move the socket into the target namespace. */
+	sk_change_net(sock->sk, net);
+
+	memset(&tcp_addr, 0, sizeof(tcp_addr));
+	tcp_addr.sin_family = AF_INET;
+	tcp_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+	tcp_addr.sin_port = port;
+	err = kernel_bind(sock, (struct sockaddr *)&tcp_addr,
+			  sizeof(tcp_addr));
+	if (err < 0)
+		goto error;
+
+	*sockp = sock;
+	return 0;
+
+error:
+	if (sock)
+		tcp_sock_release(sock);
+	*sockp = NULL;
+	return err;
+}
+
+/* (Re)arm the periodic reassembly-table cleaner. */
+static void schedule_clean_percpu(void)
+{
+	schedule_delayed_work(&clean_percpu_wq, CLEAN_PERCPU_INTERVAL);
+}
+
+/* Periodic work item: scan every CPU's reassembly table and free
+ * entries whose packets have been waiting longer than FRAG_EXP_TIME,
+ * then re-arm itself.
+ */
+static void clean_percpu(struct work_struct *work)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
+		int j;
+
+		for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
+			struct pkt_frag *frag;
+
+			/* Cheap lockless pre-check to skip live slots. */
+			frag = flex_array_get(stt_percpu->frag_hash, j);
+			if (!frag->skbs ||
+			    time_before(jiffies, frag->timestamp + FRAG_EXP_TIME))
+				continue;
+
+			spin_lock_bh(&stt_percpu->lock);
+
+			/* Re-check under the lock before freeing. */
+			if (frag->skbs &&
+			    time_after(jiffies, frag->timestamp + FRAG_EXP_TIME))
+				free_frag(stt_percpu, frag);
+
+			spin_unlock_bh(&stt_percpu->lock);
+		}
+	}
+	schedule_clean_percpu();
+}
+
+/* Linux 3.13 changed the first argument of netfilter hook functions
+ * from the hook number to the nf_hook_ops pointer; paper over the
+ * difference so one function body serves both kernel ranges.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,0)
+#define FIRST_PARAM const struct nf_hook_ops *ops,
+#else
+#define FIRST_PARAM unsigned int hooknum,
+#endif
+
+/* Netfilter LOCAL_IN hook: steal TCP packets whose destination port
+ * has a registered STT socket and feed them to stt_rcv().  Everything
+ * else passes through untouched (NF_ACCEPT).
+ */
+static unsigned int nf_ip_hook(FIRST_PARAM
+			       struct sk_buff *skb,
+			       const struct net_device *in,
+			       const struct net_device *out,
+			       int (*okfn)(struct sk_buff *))
+{
+	struct stt_sock *stt_sock;
+	int ip_hdr_len;
+
+	if (ip_hdr(skb)->protocol != IPPROTO_TCP)
+		return NF_ACCEPT;
+
+	/* Make sure the base TCP header is in the linear area before
+	 * reading the port.
+	 */
+	ip_hdr_len = ip_hdrlen(skb);
+	if (unlikely(!pskb_may_pull(skb, ip_hdr_len + sizeof(struct tcphdr))))
+		return NF_ACCEPT;
+
+	skb_set_transport_header(skb, ip_hdr_len);
+
+	stt_sock = stt_find_sock(dev_net(skb->dev), tcp_hdr(skb)->dest);
+	if (!stt_sock)
+		return NF_ACCEPT;
+
+	/* Strip IP + TCP headers; stt_rcv() consumes the skb, so tell
+	 * netfilter we took ownership.
+	 */
+	__skb_pull(skb, ip_hdr_len + sizeof(struct tcphdr));
+	stt_rcv(stt_sock, skb);
+	return NF_STOLEN;
+}
+
+/* Hook at LOCAL_IN with the lowest priority (INT_MAX) so STT
+ * interception runs after every other LOCAL_IN hook has had a look.
+ */
+static struct nf_hook_ops nf_hook_ops __read_mostly = {
+	.hook = nf_ip_hook,
+	.owner = THIS_MODULE,
+	.pf = NFPROTO_IPV4,
+	.hooknum = NF_INET_LOCAL_IN,
+	.priority = INT_MAX,
+};
+
+/* Bring up the shared STT receive machinery on first use.
+ *
+ * n_tunnels acts as a reference count: subsequent callers just bump
+ * it.  The first caller seeds the fragment hash, allocates the
+ * per-CPU reassembly state, registers the netfilter hook and starts
+ * the periodic cleaner.  Returns 0 on success or a negative errno
+ * with all partially-allocated state torn down.
+ *
+ * NOTE(review): n_tunnels is read and modified without a lock here
+ * (rpl_stt_sock_add() calls this before taking stt_mutex); presumably
+ * callers serialize at a higher level (e.g. RTNL) -- confirm.
+ */
+static int stt_start(void)
+{
+	int err;
+	int i;
+
+	if (n_tunnels) {
+		n_tunnels++;
+		return 0;
+	}
+	get_random_bytes(&frag_hash_seed, sizeof(u32));
+
+	stt_percpu_data = alloc_percpu(struct stt_percpu);
+	if (!stt_percpu_data) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	for_each_possible_cpu(i) {
+		struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
+		struct flex_array *frag_hash;
+
+		spin_lock_init(&stt_percpu->lock);
+		INIT_LIST_HEAD(&stt_percpu->frag_lru);
+		get_random_bytes(&per_cpu(pkt_seq_counter, i), sizeof(u32));
+
+		frag_hash = flex_array_alloc(sizeof(struct pkt_frag),
+					     FRAG_HASH_ENTRIES,
+					     GFP_KERNEL | __GFP_ZERO);
+		if (!frag_hash) {
+			err = -ENOMEM;
+			goto free_percpu;
+		}
+		stt_percpu->frag_hash = frag_hash;
+
+		/* flex_array_alloc() only sets up metadata; prealloc
+		 * the parts so receive-path lookups never allocate.
+		 */
+		err = flex_array_prealloc(stt_percpu->frag_hash, 0,
+					  FRAG_HASH_ENTRIES,
+					  GFP_KERNEL | __GFP_ZERO);
+		if (err)
+			goto free_percpu;
+	}
+	err = nf_register_hook(&nf_hook_ops);
+	if (err)
+		goto free_percpu;
+
+	schedule_clean_percpu();
+	n_tunnels++;
+	return 0;
+
+free_percpu:
+	/* alloc_percpu() zero-fills, so frag_hash is NULL for CPUs
+	 * whose table was never allocated; the check below is safe.
+	 */
+	for_each_possible_cpu(i) {
+		struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
+
+		if (stt_percpu->frag_hash)
+			flex_array_free(stt_percpu->frag_hash);
+	}
+
+	free_percpu(stt_percpu_data);
+
+error:
+	return err;
+}
+
+/* Drop one reference on the shared STT machinery; the last user stops
+ * the periodic cleaner, unregisters the netfilter hook (so no new
+ * fragments can be queued) and then frees all per-CPU reassembly
+ * state including any still-queued fragment lists.
+ *
+ * NOTE(review): like stt_start(), this relies on the caller to
+ * serialize updates to n_tunnels -- confirm.
+ */
+static void stt_cleanup(void)
+{
+	int i;
+
+	n_tunnels--;
+	if (n_tunnels)
+		return;
+
+	cancel_delayed_work_sync(&clean_percpu_wq);
+	nf_unregister_hook(&nf_hook_ops);
+
+	for_each_possible_cpu(i) {
+		struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
+		int j;
+
+		/* The hook is gone, so no new fragments can appear;
+		 * free whatever reassembly state is left.
+		 */
+		for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
+			struct pkt_frag *frag;
+
+			frag = flex_array_get(stt_percpu->frag_hash, j);
+			kfree_skb_list(frag->skbs);
+		}
+
+		flex_array_free(stt_percpu->frag_hash);
+	}
+
+	free_percpu(stt_percpu_data);
+}
+
+/* Allocate an stt_sock, back it with a bound kernel TCP socket on
+ * @port in namespace @net, and publish it on the per-namespace socket
+ * list (RCU).  @rcv/@data are the delivery callback and its cookie.
+ * Returns the new stt_sock or an ERR_PTR() on failure.
+ */
+static struct stt_sock *stt_socket_create(struct net *net, __be16 port,
+					  stt_rcv_t *rcv, void *data)
+{
+	struct stt_net *sn = net_generic(net, stt_net_id);
+	struct stt_sock *stt_sock;
+	struct socket *sock;
+	int err;
+
+	stt_sock = kzalloc(sizeof(*stt_sock), GFP_KERNEL);
+	if (!stt_sock)
+		return ERR_PTR(-ENOMEM);
+
+	err = tcp_sock_create4(net, port, &sock);
+	if (err)
+		goto free_stt_sock;
+
+	stt_sock->sock = sock;
+	stt_sock->rcv = rcv;
+	stt_sock->rcv_data = data;
+
+	/* Publish last, once fully initialized, so RCU readers never
+	 * see a half-built entry.
+	 */
+	list_add_rcu(&stt_sock->list, &sn->sock_list);
+
+	return stt_sock;
+
+free_stt_sock:
+	kfree(stt_sock);
+	return ERR_PTR(err);
+}
+
+/* Unpublish and destroy an stt_sock.  The list removal is RCU-safe
+ * and the struct itself is freed only after a grace period
+ * (kfree_rcu), so concurrent readers of the socket list stay valid.
+ * Caller holds stt_mutex.
+ */
+static void __stt_sock_release(struct stt_sock *stt_sock)
+{
+	list_del_rcu(&stt_sock->list);
+	tcp_sock_release(stt_sock->sock);
+	kfree_rcu(stt_sock, rcu);
+}
+
+/* Public entry point: register an STT listener on @port in namespace
+ * @net, delivering decapsulated packets to @rcv with cookie @data.
+ *
+ * Takes a reference on the shared STT machinery first; if socket
+ * creation fails (including -EBUSY when the port already has a
+ * listener) the reference is dropped again so module state stays
+ * balanced.  Returns the new stt_sock or an ERR_PTR().
+ */
+struct stt_sock *rpl_stt_sock_add(struct net *net, __be16 port,
+				  stt_rcv_t *rcv, void *data)
+{
+	struct stt_sock *stt_sock;
+	int err;
+
+	err = stt_start();
+	if (err)
+		return ERR_PTR(err);
+
+	mutex_lock(&stt_mutex);
+	/* Lookup under RCU; stt_mutex prevents a concurrent add from
+	 * racing the check-then-create below.
+	 */
+	rcu_read_lock();
+	stt_sock = stt_find_sock(net, port);
+	rcu_read_unlock();
+	if (stt_sock)
+		stt_sock = ERR_PTR(-EBUSY);
+	else
+		stt_sock = stt_socket_create(net, port, rcv, data);
+
+	mutex_unlock(&stt_mutex);
+
+	/* Undo the stt_start() reference on any failure. */
+	if (IS_ERR(stt_sock))
+		stt_cleanup();
+
+	return stt_sock;
+}
+EXPORT_SYMBOL_GPL(rpl_stt_sock_add);
+
+/* Public entry point: tear down a listener created by
+ * rpl_stt_sock_add() and drop its reference on the shared STT
+ * machinery.  NULL is tolerated and does nothing.
+ */
+void rpl_stt_sock_release(struct stt_sock *stt_sock)
+{
+	mutex_lock(&stt_mutex);
+	if (stt_sock) {
+		__stt_sock_release(stt_sock);
+		stt_cleanup();
+	}
+	mutex_unlock(&stt_mutex);
+}
+EXPORT_SYMBOL_GPL(rpl_stt_sock_release);
+
+/* Per-namespace init: start with an empty STT socket list. */
+static int stt_init_net(struct net *net)
+{
+	struct stt_net *sn = net_generic(net, stt_net_id);
+
+	INIT_LIST_HEAD(&sn->sock_list);
+	return 0;
+}
+
+/* Pernet registration: the core allocates a zeroed struct stt_net per
+ * namespace and hands it back via net_generic(net, stt_net_id).
+ */
+static struct pernet_operations stt_net_ops = {
+	.init = stt_init_net,
+	.id = &stt_net_id,
+	.size = sizeof(struct stt_net),
+};
+
+/* Module init: register the per-namespace state.  The heavier global
+ * state is created lazily by stt_start() when the first tunnel is
+ * added.
+ */
+int ovs_stt_init_module(void)
+{
+	return register_pernet_subsys(&stt_net_ops);
+}
+EXPORT_SYMBOL_GPL(ovs_stt_init_module);
+
+/* Module exit: unregister the per-namespace state. */
+void ovs_stt_cleanup_module(void)
+{
+	unregister_pernet_subsys(&stt_net_ops);
+}
+EXPORT_SYMBOL_GPL(ovs_stt_cleanup_module);
+#endif