+/*
+ * Stateless TCP Tunnel (STT) vport.
+ *
+ * Copyright (c) 2015 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/unaligned.h>
+
+#include <linux/delay.h>
+#include <linux/flex_array.h>
+#include <linux/if.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/jhash.h>
+#include <linux/list.h>
+#include <linux/log2.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/percpu.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+#include <linux/workqueue.h>
+
+#include <net/icmp.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/sock.h>
+#include <net/stt.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+#include "gso.h"
+
+#ifdef OVS_STT
+#define STT_VER 0
+
+#define STT_CSUM_VERIFIED BIT(0)
+#define STT_CSUM_PARTIAL BIT(1)
+#define STT_PROTO_IPV4 BIT(2)
+#define STT_PROTO_TCP BIT(3)
+#define STT_PROTO_TYPES (STT_PROTO_IPV4 | STT_PROTO_TCP)
+
+#define SUPPORTED_GSO_TYPES (SKB_GSO_TCPV4 | SKB_GSO_UDP | SKB_GSO_DODGY | \
+ SKB_GSO_TCPV6)
+
+/* The length and offset of a fragment are encoded in the sequence number.
+ * STT_SEQ_LEN_SHIFT is the left shift needed to store the length.
+ * STT_SEQ_OFFSET_MASK is the mask to extract the offset.
+ */
+#define STT_SEQ_LEN_SHIFT 16
+#define STT_SEQ_OFFSET_MASK (BIT(STT_SEQ_LEN_SHIFT) - 1)
+
+/* The maximum amount of memory used to store packets waiting to be reassembled
+ * on a given CPU. Once this threshold is exceeded we will begin freeing the
+ * least recently used fragments.
+ */
+#define REASM_HI_THRESH (4 * 1024 * 1024)
+/* The target for the high memory evictor. Once we have exceeded
+ * REASM_HI_THRESH, we will continue freeing fragments until we hit
+ * this limit.
+ */
+#define REASM_LO_THRESH (3 * 1024 * 1024)
+/* The length of time a given packet has to be reassembled from the time the
+ * first fragment arrives. Once this limit is exceeded it becomes available
+ * for cleaning.
+ */
+#define FRAG_EXP_TIME (30 * HZ)
+/* Number of hash entries. Each entry has only a single slot to hold a packet
+ * so if there are collisions, we will drop packets. This is allocated
+ * per-cpu and each entry consists of struct pkt_frag.
+ */
+#define FRAG_HASH_SHIFT 8
+#define FRAG_HASH_ENTRIES BIT(FRAG_HASH_SHIFT)
+#define FRAG_HASH_SEGS ((sizeof(u32) * 8) / FRAG_HASH_SHIFT)
+
+#define CLEAN_PERCPU_INTERVAL (30 * HZ)
+
+/* Identity of a partially reassembled packet: outer IP addresses, the
+ * per-packet sequence value carried in the outer TCP ack field, and the
+ * skb mark.
+ */
+struct pkt_key {
+	__be32 saddr;
+	__be32 daddr;
+	__be32 pkt_seq;
+	u32 mark;
+};
+
+/* One reassembly hash slot: the fragment chain for a single packet plus
+ * LRU and expiry bookkeeping.
+ */
+struct pkt_frag {
+	struct sk_buff *skbs;		/* Fragment chain; NULL when the slot is free. */
+	unsigned long timestamp;	/* Arrival time of the first fragment (jiffies). */
+	struct list_head lru_node;
+	struct pkt_key key;
+};
+
+/* Per-CPU reassembly state. */
+struct stt_percpu {
+	struct flex_array *frag_hash;	/* FRAG_HASH_ENTRIES of struct pkt_frag. */
+	struct list_head frag_lru;	/* Oldest entries first. */
+	unsigned int frag_mem_used;
+
+	/* Protect frags table. */
+	spinlock_t lock;
+};
+
+/* Reassembly progress, tracked only on the first skb of a chain. */
+struct first_frag {
+	struct sk_buff *last_skb;	/* Tail of the chain, for fast in-order append. */
+	unsigned int mem_used;
+	u16 tot_len;			/* Expected total length of the full packet. */
+	u16 rcvd_len;			/* Bytes received so far. */
+	bool set_ecn_ce;		/* Any fragment arrived with ECN CE set. */
+};
+
+/* Per-fragment control block state, stored in skb->cb. */
+struct frag_skb_cb {
+	u16 offset;
+
+	/* Only valid for the first skb in the chain. */
+	struct first_frag first;
+};
+
+#define FRAG_CB(skb) ((struct frag_skb_cb *)(skb)->cb)
+
+/* per-network namespace private data for this module */
+struct stt_net {
+	struct list_head sock_list;
+};
+
+static int stt_net_id;
+
+static struct stt_percpu __percpu *stt_percpu_data __read_mostly;
+static u32 frag_hash_seed __read_mostly;	/* Randomized when the first tunnel is created. */
+
+/* Protects sock-hash and refcounts. */
+static DEFINE_MUTEX(stt_mutex);
+
+/* Number of live STT tunnels; module-wide state exists only while > 0. */
+static int n_tunnels;
+static DEFINE_PER_CPU(u32, pkt_seq_counter);
+
+static void clean_percpu(struct work_struct *work);
+static DECLARE_DELAYED_WORK(clean_percpu_wq, clean_percpu);
+
+/* Walk the per-namespace socket list under RCU and return the STT
+ * socket bound to the given source port, or NULL if none matches.
+ */
+static struct stt_sock *stt_find_sock(struct net *net, __be16 port)
+{
+	struct stt_net *sn = net_generic(net, stt_net_id);
+	struct stt_sock *cur;
+
+	list_for_each_entry_rcu(cur, &sn->sock_list, list) {
+		if (inet_sk(cur->sock->sk)->inet_sport != port)
+			continue;
+		return cur;
+	}
+	return NULL;
+}
+
+/* Generate a node-unique per-packet id for the outer TCP ack field:
+ * a per-CPU counter in the high bits and the CPU id in the low bits,
+ * so concurrent CPUs can never produce the same value.
+ */
+static __be32 ack_seq(void)
+{
+#if NR_CPUS <= 65536
+	u32 counter;
+	u32 ack;
+
+	counter = this_cpu_read(pkt_seq_counter);
+	this_cpu_inc(pkt_seq_counter);
+	ack = (counter << ilog2(NR_CPUS)) | smp_processor_id();
+
+	return (__force __be32)ack;
+#else
+#error "Support for greater than 64k CPUs not implemented"
+#endif
+}
+
+/* Reset the skb's GSO state, taking a private copy of the shared info
+ * first if necessary.  Returns 0 on success or a negative errno.
+ */
+static int clear_gso(struct sk_buff *skb)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	int err;
+
+	/* Fast path: nothing to clear. */
+	if (!shinfo->gso_type && !shinfo->gso_size && !shinfo->gso_segs)
+		return 0;
+
+	/* The shared info may belong to a clone; unshare before writing. */
+	err = skb_unclone(skb, GFP_ATOMIC);
+	if (unlikely(err))
+		return err;
+
+	shinfo = skb_shinfo(skb);
+	shinfo->gso_type = 0;
+	shinfo->gso_size = 0;
+	shinfo->gso_segs = 0;
+	return 0;
+}
+
+/* Recursively flatten the frag_lists hanging off every skb in the list
+ * at *skbp into a single list chained through skb->next.  "head" is the
+ * skb whose frag_list the current list came from (NULL at top level);
+ * its length/truesize accounting is reduced as members are detached.
+ * Returns the last skb of the flattened list, or an ERR_PTR on
+ * allocation failure.
+ */
+static struct sk_buff *normalize_frag_list(struct sk_buff *head,
+					   struct sk_buff **skbp)
+{
+	struct sk_buff *skb = *skbp;
+	struct sk_buff *last;
+
+	do {
+		struct sk_buff *frags;
+
+		if (skb_shared(skb)) {
+			/* We rewrite ->next below, so a shared skb must be
+			 * replaced by a private clone first.
+			 */
+			struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+			if (unlikely(!nskb))
+				return ERR_PTR(-ENOMEM);
+
+			nskb->next = skb->next;
+			consume_skb(skb);
+			skb = nskb;
+			*skbp = skb;
+		}
+
+		if (head) {
+			/* This member is leaving head's frag_list; remove
+			 * its contribution from head's accounting.
+			 */
+			head->len -= skb->len;
+			head->data_len -= skb->len;
+			head->truesize -= skb->truesize;
+		}
+
+		frags = skb_shinfo(skb)->frag_list;
+		if (frags) {
+			int err;
+
+			err = skb_unclone(skb, GFP_ATOMIC);
+			if (unlikely(err))
+				return ERR_PTR(err);
+
+			last = normalize_frag_list(skb, &frags);
+			if (IS_ERR(last))
+				return last;
+
+			/* Splice the flattened child list in place of this
+			 * skb's frag_list, directly after the skb itself.
+			 */
+			skb_shinfo(skb)->frag_list = NULL;
+			last->next = skb->next;
+			skb->next = frags;
+		} else {
+			last = skb;
+		}
+
+		skbp = &skb->next;
+	} while ((skb = skb->next));
+
+	return last;
+}
+
+/* Takes a linked list of skbs, which potentially contain frag_list
+ * (whose members in turn potentially contain frag_lists, etc.) and
+ * converts them into a single linear linked list.
+ */
+static int straighten_frag_list(struct sk_buff **skbp)
+{
+	struct sk_buff *last;
+
+	last = normalize_frag_list(NULL, skbp);
+	return IS_ERR(last) ? PTR_ERR(last) : 0;
+}
+
+/* Propagate per-packet metadata (protocol, priority, mark, timestamps,
+ * VLAN state, security mark) from one skb to another.
+ */
+static void copy_skb_metadata(struct sk_buff *to, struct sk_buff *from)
+{
+	to->mark = from->mark;
+	to->priority = from->priority;
+	to->protocol = from->protocol;
+	to->tstamp = from->tstamp;
+	to->vlan_tci = from->vlan_tci;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
+	to->vlan_proto = from->vlan_proto;
+#endif
+	skb_copy_secmark(to, from);
+}
+
+/* Fix up the inner IP and TCP headers of one software-produced segment:
+ * rewrite the IP length field, patch the TCP sequence number and flags,
+ * and incrementally adjust the TCP checksum for the length change.
+ * "head" is true only for the first segment, whose sequence number is
+ * already correct.
+ */
+static void update_headers(struct sk_buff *skb, bool head,
+			   unsigned int l4_offset, unsigned int hdr_len,
+			   bool ipv4, u32 tcp_seq)
+{
+	u16 old_len, new_len;
+	__be32 delta;
+	struct tcphdr *tcph;
+	int gso_size;
+
+	if (ipv4) {
+		struct iphdr *iph = (struct iphdr *)(skb->data + ETH_HLEN);
+
+		old_len = ntohs(iph->tot_len);
+		new_len = skb->len - ETH_HLEN;
+		iph->tot_len = htons(new_len);
+
+		ip_send_check(iph);
+	} else {
+		struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + ETH_HLEN);
+
+		old_len = ntohs(ip6h->payload_len);
+		new_len = skb->len - ETH_HLEN - sizeof(struct ipv6hdr);
+		ip6h->payload_len = htons(new_len);
+	}
+
+	tcph = (struct tcphdr *)(skb->data + l4_offset);
+	if (!head) {
+		/* Non-first segments get their recomputed sequence number
+		 * and must not repeat the CWR flag.
+		 */
+		tcph->seq = htonl(tcp_seq);
+		tcph->cwr = 0;
+	}
+
+	if (skb->next) {
+		/* FIN/PSH belong only on the last segment of the chain. */
+		tcph->fin = 0;
+		tcph->psh = 0;
+	}
+
+	/* Incremental checksum adjustment for the changed length field. */
+	delta = htonl(~old_len + new_len);
+	tcph->check = ~csum_fold((__force __wsum)((__force u32)tcph->check +
+				 (__force u32)delta));
+
+	/* Drop GSO state once a segment no longer needs further splitting. */
+	gso_size = skb_shinfo(skb)->gso_size;
+	if (gso_size && skb->len - hdr_len <= gso_size)
+		BUG_ON(clear_gso(skb));
+}
+
+/* Decide whether a frag-listed skb can be segmented in software; when
+ * this returns false the caller must linearize instead.
+ */
+static bool can_segment(struct sk_buff *head, bool ipv4, bool tcp, bool csum_partial)
+{
+	/* If no offloading is in use then we don't have enough information
+	 * to process the headers.
+	 */
+	if (!csum_partial)
+		return false;
+
+	/* Handling UDP packets requires IP fragmentation, which means that
+	 * the L4 checksum can no longer be calculated by hardware (since the
+	 * fragments are in different packets.  If we have to compute the
+	 * checksum it's faster just to linearize and large UDP packets are
+	 * pretty uncommon anyways, so it's not worth dealing with for now.
+	 */
+	if (!tcp)
+		return false;
+
+	if (ipv4) {
+		struct iphdr *iph = (struct iphdr *)(head->data + ETH_HLEN);
+
+		/* It's difficult to get the IP IDs exactly right here due to
+		 * varying segment sizes and potentially multiple layers of
+		 * segmentation.  IP ID isn't important when DF is set and DF
+		 * is generally set for TCP packets, so just linearize if it's
+		 * not.
+		 */
+		return (iph->frag_off & htons(IP_DF)) != 0;
+	} else {
+		struct ipv6hdr *ip6h = (struct ipv6hdr *)(head->data + ETH_HLEN);
+
+		/* Jumbograms require more processing to update and we'll
+		 * probably never see them, so just linearize.
+		 */
+		return ip6h->payload_len != 0;
+	}
+}
+
+/* Give fragment "frag" its own copy of "head"'s first hdr_len bytes of
+ * protocol headers, along with head's checksum-offload and GSO state,
+ * so it can be emitted as an independent segment.  Returns 0 or
+ * -ENOMEM.
+ */
+static int copy_headers(struct sk_buff *head, struct sk_buff *frag,
+			int hdr_len)
+{
+	u16 csum_start;
+
+	if (skb_cloned(frag) || skb_headroom(frag) < hdr_len) {
+		int extra_head = hdr_len - skb_headroom(frag);
+
+		/* Need private, sufficiently large headroom before pushing
+		 * the copied headers.
+		 */
+		extra_head = extra_head > 0 ? extra_head : 0;
+		if (unlikely(pskb_expand_head(frag, extra_head, 0,
+					      GFP_ATOMIC)))
+			return -ENOMEM;
+	}
+
+	memcpy(__skb_push(frag, hdr_len), head->data, hdr_len);
+
+	/* Rebase csum_start from head's buffer layout onto frag's. */
+	csum_start = head->csum_start - skb_headroom(head);
+	frag->csum_start = skb_headroom(frag) + csum_start;
+	frag->csum_offset = head->csum_offset;
+	frag->ip_summed = head->ip_summed;
+
+	skb_shinfo(frag)->gso_size = skb_shinfo(head)->gso_size;
+	skb_shinfo(frag)->gso_type = skb_shinfo(head)->gso_type;
+	skb_shinfo(frag)->gso_segs = 0;
+
+	copy_skb_metadata(frag, head);
+	return 0;
+}
+
+/* Convert "head" and its frag_list members into a chain of standalone
+ * TCP segments linked through ->next: each member receives a copy of
+ * head's protocol headers, a recomputed sequence number, and fixed-up
+ * length/checksum fields.  Returns 0 or a negative errno.
+ */
+static int skb_list_segment(struct sk_buff *head, bool ipv4, int l4_offset)
+{
+	struct sk_buff *skb;
+	struct tcphdr *tcph;
+	int seg_len;
+	int hdr_len;
+	int tcp_len;
+	u32 seq;
+
+	if (unlikely(!pskb_may_pull(head, l4_offset + sizeof(*tcph))))
+		return -ENOMEM;
+
+	tcph = (struct tcphdr *)(head->data + l4_offset);
+	tcp_len = tcph->doff * 4;
+	hdr_len = l4_offset + tcp_len;
+
+	if (unlikely((tcp_len < sizeof(struct tcphdr)) ||
+		     (head->len < hdr_len)))
+		return -EINVAL;
+
+	if (unlikely(!pskb_may_pull(head, hdr_len)))
+		return -ENOMEM;
+
+	/* Re-read the header pointer: pskb_may_pull() may move skb->data. */
+	tcph = (struct tcphdr *)(head->data + l4_offset);
+	/* Update header of each segment. */
+	seq = ntohl(tcph->seq);
+	seg_len = skb_pagelen(head) - hdr_len;
+
+	skb = skb_shinfo(head)->frag_list;
+	skb_shinfo(head)->frag_list = NULL;
+	head->next = skb;
+	for (; skb; skb = skb->next) {
+		int err;
+
+		/* Members become independent segments; deduct their
+		 * contribution from head's accounting.
+		 */
+		head->len -= skb->len;
+		head->data_len -= skb->len;
+		head->truesize -= skb->truesize;
+
+		/* Each segment starts where the previous payload ended. */
+		seq += seg_len;
+		seg_len = skb->len;
+		err = copy_headers(head, skb, hdr_len);
+		if (err)
+			return err;
+		update_headers(skb, false, l4_offset, hdr_len, ipv4, seq);
+	}
+	update_headers(head, true, l4_offset, hdr_len, ipv4, 0);
+	return 0;
+}
+
+/* Flatten the skb chain at *headp (see straighten_frag_list()) and merge
+ * adjacent members with skb_try_coalesce() where possible.  Members that
+ * could not be merged are re-attached as head's frag_list.  On success
+ * *headp is a single skb (possibly carrying a frag_list).
+ */
+static int coalesce_skb(struct sk_buff **headp)
+{
+	struct sk_buff *frag, *head, *prev;
+	int err;
+
+	err = straighten_frag_list(headp);
+	if (unlikely(err))
+		return err;
+	head = *headp;
+
+	/* Coalesce frag list. */
+	prev = head;
+	for (frag = head->next; frag; frag = frag->next) {
+		bool headstolen;
+		int delta;
+
+		if (unlikely(skb_unclone(prev, GFP_ATOMIC)))
+			return -ENOMEM;
+
+		if (!skb_try_coalesce(prev, frag, &headstolen, &delta)) {
+			prev = frag;
+			continue;
+		}
+
+		/* frag was absorbed into prev: unlink and release it, then
+		 * continue scanning from prev.
+		 */
+		prev->next = frag->next;
+		frag->len = 0;
+		frag->data_len = 0;
+		frag->truesize -= delta;
+		kfree_skb_partial(frag, headstolen);
+		frag = prev;
+	}
+
+	if (!head->next)
+		return 0;
+
+	/* Fold the remaining members' accounting back into head before
+	 * attaching them as its frag_list.
+	 */
+	for (frag = head->next; frag; frag = frag->next) {
+		head->len += frag->len;
+		head->data_len += frag->len;
+		head->truesize += frag->truesize;
+	}
+
+	skb_shinfo(head)->frag_list = head->next;
+	head->next = NULL;
+	return 0;
+}
+
+/* Segment a frag-listed skb in software when possible, otherwise fall
+ * back to linearizing it.
+ */
+static int __try_to_segment(struct sk_buff *skb, bool csum_partial,
+			    bool ipv4, bool tcp, int l4_offset)
+{
+	if (!can_segment(skb, ipv4, tcp, csum_partial))
+		return skb_linearize(skb);
+
+	return skb_list_segment(skb, ipv4, l4_offset);
+}
+
+/* Segment a received STT frame using the offload hints carried in its
+ * STT header.
+ */
+static int try_to_segment(struct sk_buff *skb)
+{
+	struct stthdr *stth = stt_hdr(skb);
+
+	return __try_to_segment(skb,
+				!!(stth->flags & STT_CSUM_PARTIAL),
+				!!(stth->flags & STT_PROTO_IPV4),
+				!!(stth->flags & STT_PROTO_TCP),
+				stth->l4_offset);
+}
+
+/* Coalesce the skb chain at *headp and, if a frag_list remains, segment
+ * (or linearize) the result so it can be transmitted.
+ */
+static int segment_skb(struct sk_buff **headp, bool csum_partial,
+		       bool ipv4, bool tcp, int l4_offset)
+{
+	int err = coalesce_skb(headp);
+
+	if (err)
+		return err;
+
+	if (!skb_shinfo(*headp)->frag_list)
+		return 0;
+
+	return __try_to_segment(*headp, csum_partial, ipv4, tcp, l4_offset);
+}
+
+/* Build the STT encapsulation for one skb: push the STT header plus a
+ * TCP-like outer header, record the inner packet's offload state in the
+ * STT header, set up outer checksum offload, and arm outer TSO when the
+ * frame exceeds the path MTU.  Returns 0 or a negative errno.
+ */
+static int __push_stt_header(struct sk_buff *skb, __be64 tun_id,
+			     __be16 s_port, __be16 d_port,
+			     __be32 saddr, __be32 dst,
+			     __be16 l3_proto, u8 l4_proto,
+			     int dst_mtu)
+{
+	/* Length of the whole STT frame: payload plus STT header and pad. */
+	int data_len = skb->len + sizeof(struct stthdr) + STT_ETH_PAD;
+	unsigned short encap_mss;
+	struct tcphdr *tcph;
+	struct stthdr *stth;
+
+	skb_push(skb, STT_HEADER_LEN);
+	skb_reset_transport_header(skb);
+	tcph = tcp_hdr(skb);
+	memset(tcph, 0, STT_HEADER_LEN);
+	stth = stt_hdr(skb);
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		/* Preserve the inner checksum-offload state so the receiver
+		 * can restore it (see set_offloads()).
+		 */
+		stth->flags |= STT_CSUM_PARTIAL;
+
+		stth->l4_offset = skb->csum_start -
+					(skb_headroom(skb) +
+					STT_HEADER_LEN);
+
+		if (l3_proto == htons(ETH_P_IP))
+			stth->flags |= STT_PROTO_IPV4;
+
+		if (l4_proto == IPPROTO_TCP)
+			stth->flags |= STT_PROTO_TCP;
+
+		stth->mss = htons(skb_shinfo(skb)->gso_size);
+	} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
+		stth->flags |= STT_CSUM_VERIFIED;
+	}
+
+	stth->vlan_tci = htons(skb->vlan_tci);
+	skb->vlan_tci = 0;
+	put_unaligned(tun_id, &stth->key);
+
+	/* Fake TCP header: the frame length travels in the upper 16 bits of
+	 * the sequence number (fragment offset 0 in the lower bits) and the
+	 * ack field carries the per-packet id used for reassembly.
+	 */
+	tcph->source = s_port;
+	tcph->dest = d_port;
+	tcph->doff = sizeof(struct tcphdr) / 4;
+	tcph->ack = 1;
+	tcph->psh = 1;
+	tcph->window = htons(USHRT_MAX);
+	tcph->seq = htonl(data_len << STT_SEQ_LEN_SHIFT);
+	tcph->ack_seq = ack_seq();
+	tcph->check = ~tcp_v4_check(skb->len, saddr, dst, 0);
+
+	/* Offload the outer TCP checksum. */
+	skb->csum_start = skb_transport_header(skb) - skb->head;
+	skb->csum_offset = offsetof(struct tcphdr, check);
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	encap_mss = dst_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
+	if (data_len > encap_mss) {
+		/* Frame exceeds the path MTU: let TSO/GSO split the outer
+		 * TCP stream into MTU-sized packets.
+		 */
+		if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
+			return -EINVAL;
+
+		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+		skb_shinfo(skb)->gso_size = encap_mss;
+		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(data_len, encap_mss);
+	} else {
+		if (unlikely(clear_gso(skb)))
+			return -EINVAL;
+	}
+	return 0;
+}
+
+/* Segment "head" in software if it carries a frag_list, then push the
+ * STT + outer TCP header onto every resulting skb.  Returns the head of
+ * the encapsulated chain, or NULL on failure (in which case the whole
+ * chain has been freed).
+ */
+static struct sk_buff *push_stt_header(struct sk_buff *head, __be64 tun_id,
+				       __be16 s_port, __be16 d_port,
+				       __be32 saddr, __be32 dst,
+				       __be16 l3_proto, u8 l4_proto,
+				       int dst_mtu)
+{
+	struct sk_buff *skb;
+
+	if (skb_shinfo(head)->frag_list) {
+		bool ipv4 = (l3_proto == htons(ETH_P_IP));
+		bool tcp = (l4_proto == IPPROTO_TCP);
+		bool csum_partial = (head->ip_summed == CHECKSUM_PARTIAL);
+		int l4_offset = skb_transport_offset(head);
+
+		/* Need to call skb_orphan() to report correct truesize.
+		 * Calling skb_orphan() in this layer is odd, but an SKB
+		 * with a frag-list should not be associated with any
+		 * socket, so skb_orphan() should be a no-op. */
+		skb_orphan(head);
+		if (unlikely(segment_skb(&head, csum_partial,
+					 ipv4, tcp, l4_offset)))
+			goto error;
+	}
+
+	for (skb = head; skb; skb = skb->next) {
+		if (__push_stt_header(skb, tun_id, s_port, d_port, saddr, dst,
+				      l3_proto, l4_proto, dst_mtu))
+			goto error;
+	}
+
+	return head;
+error:
+	kfree_skb_list(head);
+	return NULL;
+}
+
+/* Decide whether this packet's offload state can be carried across STT
+ * encapsulation.  Returns 1 if it can, 0 if the caller must resolve
+ * offloads in software first, or a negative errno for malformed
+ * packets.
+ */
+static int stt_can_offload(struct sk_buff *skb, __be16 l3_proto, u8 l4_proto)
+{
+	if (skb_is_gso(skb) && skb->ip_summed != CHECKSUM_PARTIAL) {
+		int csum_offset;
+		__sum16 *csum;
+		int len;
+
+		if (l4_proto == IPPROTO_TCP)
+			csum_offset = offsetof(struct tcphdr, check);
+		else if (l4_proto == IPPROTO_UDP)
+			csum_offset = offsetof(struct udphdr, check);
+		else
+			return 0;
+
+		if (unlikely(!pskb_may_pull(skb, skb_transport_offset(skb) +
+					    csum_offset + sizeof(*csum))))
+			return -EINVAL;
+
+		/* Compute these only after pskb_may_pull(): the pull may
+		 * reallocate the header, which would invalidate any pointer
+		 * into skb->data taken earlier.
+		 */
+		len = skb->len - skb_transport_offset(skb);
+		csum = (__sum16 *)(skb_transport_header(skb) + csum_offset);
+
+		/* Seed the checksum field with the pseudo-header so the
+		 * device (or software fallback) only needs to sum payload.
+		 */
+		if (l3_proto == htons(ETH_P_IP)) {
+			struct iphdr *iph = ip_hdr(skb);
+
+			*csum = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+						   len, l4_proto, 0);
+		} else if (l3_proto == htons(ETH_P_IPV6)) {
+			struct ipv6hdr *ip6h = ipv6_hdr(skb);
+
+			*csum = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+						 len, l4_proto, 0);
+		} else {
+			return 0;
+		}
+		skb->csum_start = skb_transport_header(skb) - skb->head;
+		skb->csum_offset = csum_offset;
+		skb->ip_summed = CHECKSUM_PARTIAL;
+	}
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		/* Assume receiver can only offload TCP/UDP over IPv4/6,
+		 * and require 802.1Q VLANs to be accelerated.
+		 */
+		if (l3_proto != htons(ETH_P_IP) &&
+		    l3_proto != htons(ETH_P_IPV6))
+			return 0;
+
+		if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP)
+			return 0;
+
+		/* L4 offset must fit in a 1-byte field. */
+		if (skb->csum_start - skb_headroom(skb) > 255)
+			return 0;
+
+		if (skb_shinfo(skb)->gso_type & ~SUPPORTED_GSO_TYPES)
+			return 0;
+	}
+	/* Total size of encapsulated packet must fit in 16 bits. */
+	if (skb->len + STT_HEADER_LEN + sizeof(struct iphdr) > 65535)
+		return 0;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
+	if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q))
+		return 0;
+#endif
+	return 1;
+}
+
+/* Report whether the skb must be linearized before computing a software
+ * checksum: any frag_list, or any paged fragment whose page is visible
+ * to someone else (refcount > 1), makes the data unstable.
+ */
+static bool need_linearize(const struct sk_buff *skb)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	int i;
+
+	if (unlikely(shinfo->frag_list))
+		return true;
+
+	/* Generally speaking we should linearize if there are paged frags.
+	 * However, if all of the refcounts are 1 we know nobody else can
+	 * change them from underneath us and we can skip the linearization.
+	 */
+	for (i = 0; i < shinfo->nr_frags; i++) {
+		struct page *page = skb_frag_page(&shinfo->frags[i]);
+
+		if (unlikely(page_count(page) > 1))
+			return true;
+	}
+
+	return false;
+}
+
+/* Resolve offloads that cannot survive STT encapsulation: push
+ * non-802.1Q VLAN tags into the packet data, software-segment GSO skbs
+ * (preserving skb->cb across segmentation), or compute the L4 checksum
+ * in software.  Returns the resulting skb (possibly a ->next chain of
+ * segments) or an ERR_PTR; the input skb is consumed either way.
+ */
+static struct sk_buff *handle_offloads(struct sk_buff *skb, int min_headroom)
+{
+	int err;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
+	if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q)) {
+
+		/* Un-accelerated tag needs VLAN_HLEN extra headroom. */
+		min_headroom += VLAN_HLEN;
+		if (skb_headroom(skb) < min_headroom) {
+			int head_delta = SKB_DATA_ALIGN(min_headroom -
+							skb_headroom(skb) + 16);
+
+			err = pskb_expand_head(skb, max_t(int, head_delta, 0),
+					       0, GFP_ATOMIC);
+			if (unlikely(err))
+				goto error;
+		}
+
+		skb = __vlan_hwaccel_push_inside(skb);
+		if (!skb) {
+			err = -ENOMEM;
+			goto error;
+		}
+	}
+#endif
+
+	if (skb_is_gso(skb)) {
+		struct sk_buff *nskb;
+		char cb[sizeof(skb->cb)];
+
+		/* Save skb->cb and copy it onto every resulting segment. */
+		memcpy(cb, skb->cb, sizeof(cb));
+
+		nskb = __skb_gso_segment(skb, 0, false);
+		if (IS_ERR(nskb)) {
+			err = PTR_ERR(nskb);
+			goto error;
+		}
+
+		consume_skb(skb);
+		skb = nskb;
+		while (nskb) {
+			memcpy(nskb->cb, cb, sizeof(cb));
+			nskb = nskb->next;
+		}
+	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		/* Pages aren't locked and could change at any time.
+		 * If this happens after we compute the checksum, the
+		 * checksum will be wrong.  We linearize now to avoid
+		 * this problem.
+		 */
+		if (unlikely(need_linearize(skb))) {
+			err = __skb_linearize(skb);
+			if (unlikely(err))
+				goto error;
+		}
+
+		err = skb_checksum_help(skb);
+		if (unlikely(err))
+			goto error;
+	}
+	skb->ip_summed = CHECKSUM_NONE;
+
+	return skb;
+error:
+	kfree_skb(skb);
+	return ERR_PTR(err);
+}
+
+/* Transmit each skb of a linked list as its own IP tunnel packet,
+ * taking one extra route reference for every packet after the first
+ * (iptunnel_xmit() consumes a reference per call).  Returns the total
+ * length handed to iptunnel_xmit().
+ */
+static int skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src,
+			 __be32 dst, __u8 tos, __u8 ttl, __be16 df)
+{
+	struct sk_buff *next;
+	int len = 0;
+
+	for (; skb; skb = next) {
+		next = skb->next;
+
+		if (next)
+			dst_clone(&rt->dst);
+
+		skb_clear_ovs_gso_cb(skb);
+		skb->next = NULL;
+		len += iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP,
+				     tos, ttl, df, false);
+	}
+	return len;
+}
+
+/* Return the upper-layer protocol number of an IPv6 packet, skipping
+ * any extension headers; 0 on any parse failure.
+ */
+static u8 parse_ipv6_l4_proto(struct sk_buff *skb)
+{
+	unsigned int nh_ofs = skb_network_offset(skb);
+	struct ipv6hdr *ip6h;
+	uint8_t proto;
+	__be16 frag_off;
+	int ofs;
+
+	if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct ipv6hdr))))
+		return 0;
+
+	ip6h = ipv6_hdr(skb);
+	proto = ip6h->nexthdr;
+	ofs = (u8 *)(ip6h + 1) - skb->data;
+
+	ofs = ipv6_skip_exthdr(skb, ofs, &proto, &frag_off);
+	if (unlikely(ofs < 0))
+		return 0;
+
+	return proto;
+}
+
+/* Return the L4 protocol number of an IPv4 or IPv6 frame, or 0 if it
+ * cannot be determined.
+ */
+static u8 skb_get_l4_proto(struct sk_buff *skb, __be16 l3_proto)
+{
+	if (l3_proto == htons(ETH_P_IPV6))
+		return parse_ipv6_l4_proto(skb);
+
+	if (l3_proto == htons(ETH_P_IP)) {
+		unsigned int nh_ofs = skb_network_offset(skb);
+
+		if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct iphdr))))
+			return 0;
+
+		return ip_hdr(skb)->protocol;
+	}
+
+	return 0;
+}
+
+/* Encapsulate "skb" (possibly a GSO packet) in STT and transmit it via
+ * route "rt".  Consumes the skb and the route reference on all paths.
+ * Returns the number of bytes transmitted, or a negative errno.
+ */
+int rpl_stt_xmit_skb(struct sk_buff *skb, struct rtable *rt,
+		     __be32 src, __be32 dst, __u8 tos,
+		     __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
+		     __be64 tun_id)
+{
+	struct ethhdr *eh = eth_hdr(skb);
+	int ret = 0, min_headroom;
+	__be16 inner_l3_proto;
+	u8 inner_l4_proto;
+
+	inner_l3_proto = eh->h_proto;
+	inner_l4_proto = skb_get_l4_proto(skb, inner_l3_proto);
+
+	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+			+ STT_HEADER_LEN + sizeof(struct iphdr);
+
+	/* Make room for the outer headers up front. */
+	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
+		int head_delta = SKB_DATA_ALIGN(min_headroom -
+						skb_headroom(skb) +
+						16);
+
+		ret = pskb_expand_head(skb, max_t(int, head_delta, 0),
+				       0, GFP_ATOMIC);
+		if (unlikely(ret))
+			goto err_free_rt;
+	}
+
+	/* If the packet's offload state cannot travel inside STT, resolve
+	 * it in software first (may turn skb into a ->next chain).
+	 */
+	ret = stt_can_offload(skb, inner_l3_proto, inner_l4_proto);
+	if (ret < 0)
+		goto err_free_rt;
+	if (!ret) {
+		skb = handle_offloads(skb, min_headroom);
+		if (IS_ERR(skb)) {
+			ret = PTR_ERR(skb);
+			skb = NULL;
+			goto err_free_rt;
+		}
+	}
+
+	ret = 0;
+	while (skb) {
+		struct sk_buff *next_skb = skb->next;
+
+		skb->next = NULL;
+
+		/* Each transmitted chain consumes one route reference. */
+		if (next_skb)
+			dst_clone(&rt->dst);
+
+		/* Push STT and TCP header. */
+		skb = push_stt_header(skb, tun_id, src_port, dst_port, src,
+				      dst, inner_l3_proto, inner_l4_proto,
+				      dst_mtu(&rt->dst));
+		if (unlikely(!skb)) {
+			/* Chain already freed; just drop its route ref. */
+			ip_rt_put(rt);
+			goto next;
+		}
+
+		/* Push IP header. */
+		ret += skb_list_xmit(rt, skb, src, dst, tos, ttl, df);
+
+next:
+		skb = next_skb;
+	}
+
+	return ret;
+
+err_free_rt:
+	ip_rt_put(rt);
+	kfree_skb(skb);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rpl_stt_xmit_skb);
+
+/* Release a reassembly slot: free its fragment chain, return its memory
+ * to the per-CPU budget and unlink it from the LRU.  Caller holds
+ * stt_percpu->lock.
+ */
+static void free_frag(struct stt_percpu *stt_percpu,
+		      struct pkt_frag *frag)
+{
+	stt_percpu->frag_mem_used -= FRAG_CB(frag->skbs)->first.mem_used;
+	kfree_skb_list(frag->skbs);
+	list_del(&frag->lru_node);
+	frag->skbs = NULL;
+}
+
+/* Free least-recently-used reassembly entries until memory use drops
+ * back below REASM_LO_THRESH.  Caller holds stt_percpu->lock.
+ */
+static void evict_frags(struct stt_percpu *stt_percpu)
+{
+	while (stt_percpu->frag_mem_used > REASM_LO_THRESH &&
+	       !list_empty(&stt_percpu->frag_lru)) {
+		struct pkt_frag *victim;
+
+		victim = list_first_entry(&stt_percpu->frag_lru,
+					  struct pkt_frag, lru_node);
+		free_frag(stt_percpu, victim);
+	}
+}
+
+/* True if reassembly entry "a" belongs to the packet identified by key
+ * "b" within namespace "net".
+ */
+static bool pkt_key_match(struct net *net,
+			  const struct pkt_frag *a, const struct pkt_key *b)
+{
+	if (a->key.saddr != b->saddr || a->key.daddr != b->daddr)
+		return false;
+	if (a->key.pkt_seq != b->pkt_seq || a->key.mark != b->mark)
+		return false;
+	return net_eq(dev_net(a->skbs->dev), net);
+}
+
+/* Hash a reassembly key, mixing in the namespace pointer, the skb mark
+ * and the per-boot random seed.
+ */
+static u32 pkt_key_hash(const struct net *net, const struct pkt_key *key)
+{
+	u32 seed = frag_hash_seed ^ (u32)(unsigned long)net ^ key->mark;
+
+	return jhash_3words((__force u32)key->saddr,
+			    (__force u32)key->daddr,
+			    (__force u32)key->pkt_seq, seed);
+}
+
+/* Locate the reassembly slot for "key", probing FRAG_HASH_SEGS buckets
+ * taken from successive FRAG_HASH_SHIFT-bit chunks of "hash".  If no
+ * unexpired match is found, the best victim among the probed slots (an
+ * empty one if possible, otherwise the oldest occupied one) is freed
+ * and returned for reuse.  Caller holds stt_percpu->lock.
+ */
+static struct pkt_frag *lookup_frag(struct net *net,
+				    struct stt_percpu *stt_percpu,
+				    const struct pkt_key *key, u32 hash)
+{
+	struct pkt_frag *frag, *victim_frag = NULL;
+	int i;
+
+	for (i = 0; i < FRAG_HASH_SEGS; i++) {
+		frag = flex_array_get(stt_percpu->frag_hash,
+				      hash & (FRAG_HASH_ENTRIES - 1));
+
+		if (frag->skbs &&
+		    time_before(jiffies, frag->timestamp + FRAG_EXP_TIME) &&
+		    pkt_key_match(net, frag, key))
+			return frag;
+
+		/* Track the best victim: prefer an empty slot, then the
+		 * slot with the oldest fragments.
+		 */
+		if (!victim_frag ||
+		    (victim_frag->skbs &&
+		     (!frag->skbs ||
+		      time_before(frag->timestamp, victim_frag->timestamp))))
+			victim_frag = frag;
+
+		hash >>= FRAG_HASH_SHIFT;
+	}
+
+	if (victim_frag->skbs)
+		free_frag(stt_percpu, victim_frag);
+
+	return victim_frag;
+}
+
+/* Accept one STT fragment.  The outer TCP sequence number encodes the
+ * full packet's total length (upper 16 bits) and this fragment's byte
+ * offset (lower 16 bits); the ack field identifies the packet.  Returns
+ * the complete packet once every byte has arrived, or NULL if the
+ * fragment was queued or dropped.
+ */
+static struct sk_buff *reassemble(struct sk_buff *skb)
+{
+	struct iphdr *iph = ip_hdr(skb);
+	struct tcphdr *tcph = tcp_hdr(skb);
+	u32 seq = ntohl(tcph->seq);
+	struct stt_percpu *stt_percpu;
+	struct sk_buff *last_skb;
+	struct pkt_frag *frag;
+	struct pkt_key key;
+	int tot_len;
+	u32 hash;
+
+	tot_len = seq >> STT_SEQ_LEN_SHIFT;
+	FRAG_CB(skb)->offset = seq & STT_SEQ_OFFSET_MASK;
+
+	if (unlikely(skb->len == 0))
+		goto out_free;
+
+	/* A fragment may not extend past the advertised total length. */
+	if (unlikely(FRAG_CB(skb)->offset + skb->len > tot_len))
+		goto out_free;
+
+	/* Unfragmented packet: nothing to reassemble. */
+	if (tot_len == skb->len)
+		goto out;
+
+	key.saddr = iph->saddr;
+	key.daddr = iph->daddr;
+	key.pkt_seq = tcph->ack_seq;
+	key.mark = skb->mark;
+	hash = pkt_key_hash(dev_net(skb->dev), &key);
+
+	stt_percpu = per_cpu_ptr(stt_percpu_data, smp_processor_id());
+
+	spin_lock(&stt_percpu->lock);
+
+	/* Reclaim LRU entries if this fragment would bust the budget. */
+	if (unlikely(stt_percpu->frag_mem_used + skb->truesize > REASM_HI_THRESH))
+		evict_frags(stt_percpu);
+
+	frag = lookup_frag(dev_net(skb->dev), stt_percpu, &key, hash);
+	if (!frag->skbs) {
+		/* First fragment of this packet: claim the slot. */
+		frag->skbs = skb;
+		frag->key = key;
+		frag->timestamp = jiffies;
+		FRAG_CB(skb)->first.last_skb = skb;
+		FRAG_CB(skb)->first.mem_used = skb->truesize;
+		FRAG_CB(skb)->first.tot_len = tot_len;
+		FRAG_CB(skb)->first.rcvd_len = skb->len;
+		FRAG_CB(skb)->first.set_ecn_ce = false;
+		list_add_tail(&frag->lru_node, &stt_percpu->frag_lru);
+		stt_percpu->frag_mem_used += skb->truesize;
+
+		skb = NULL;
+		goto unlock;
+	}
+
+	/* Optimize for the common case where fragments are received in-order
+	 * and not overlapping.
+	 */
+	last_skb = FRAG_CB(frag->skbs)->first.last_skb;
+	if (likely(FRAG_CB(last_skb)->offset + last_skb->len ==
+		   FRAG_CB(skb)->offset)) {
+		last_skb->next = skb;
+		FRAG_CB(frag->skbs)->first.last_skb = skb;
+	} else {
+		struct sk_buff *prev = NULL, *next;
+
+		/* Out of order: find the insertion point by offset. */
+		for (next = frag->skbs; next; next = next->next) {
+			if (FRAG_CB(next)->offset >= FRAG_CB(skb)->offset)
+				break;
+			prev = next;
+		}
+
+		/* Overlapping fragments aren't allowed.  We shouldn't start
+		 * before the end of the previous fragment.
+		 */
+		if (prev &&
+		    FRAG_CB(prev)->offset + prev->len > FRAG_CB(skb)->offset)
+			goto unlock_free;
+
+		/* We also shouldn't end after the beginning of the next
+		 * fragment.
+		 */
+		if (next &&
+		    FRAG_CB(skb)->offset + skb->len > FRAG_CB(next)->offset)
+			goto unlock_free;
+
+		if (prev) {
+			prev->next = skb;
+		} else {
+			/* New chain head: move the bookkeeping onto it. */
+			FRAG_CB(skb)->first = FRAG_CB(frag->skbs)->first;
+			frag->skbs = skb;
+		}
+
+		if (next)
+			skb->next = next;
+		else
+			FRAG_CB(frag->skbs)->first.last_skb = skb;
+	}
+
+	FRAG_CB(frag->skbs)->first.set_ecn_ce |= INET_ECN_is_ce(iph->tos);
+	FRAG_CB(frag->skbs)->first.rcvd_len += skb->len;
+	FRAG_CB(frag->skbs)->first.mem_used += skb->truesize;
+	stt_percpu->frag_mem_used += skb->truesize;
+
+	if (FRAG_CB(frag->skbs)->first.tot_len ==
+	    FRAG_CB(frag->skbs)->first.rcvd_len) {
+		/* Packet complete: detach the chain and hand it back. */
+		struct sk_buff *frag_head = frag->skbs;
+
+		frag_head->tstamp = skb->tstamp;
+		if (FRAG_CB(frag_head)->first.set_ecn_ce)
+			INET_ECN_set_ce(frag_head);
+
+		list_del(&frag->lru_node);
+		stt_percpu->frag_mem_used -= FRAG_CB(frag_head)->first.mem_used;
+		frag->skbs = NULL;
+		skb = frag_head;
+	} else {
+		/* Still incomplete: refresh this packet's LRU position. */
+		list_move_tail(&frag->lru_node, &stt_percpu->frag_lru);
+		skb = NULL;
+	}
+
+	goto unlock;
+
+unlock_free:
+	kfree_skb(skb);
+	skb = NULL;
+unlock:
+	spin_unlock(&stt_percpu->lock);
+	return skb;
+out_free:
+	kfree_skb(skb);
+	skb = NULL;
+out:
+	return skb;
+}
+
+/* Verify the outer TCP checksum of a received STT frame, using the
+ * hardware verdict when available and falling back to a software
+ * computation otherwise.
+ */
+static bool validate_checksum(struct sk_buff *skb)
+{
+	struct iphdr *iph = ip_hdr(skb);
+
+	if (skb_csum_unnecessary(skb))
+		return true;
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE &&
+	    !tcp_v4_check(skb->len, iph->saddr, iph->daddr, skb->csum))
+		return true;
+
+	/* Software fallback: seed with the pseudo-header and let
+	 * __tcp_checksum_complete() fold in the payload.
+	 */
+	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len,
+				       IPPROTO_TCP, 0);
+
+	return __tcp_checksum_complete(skb) == 0;
+}
+
+/* Translate the receive-side STT header hints back into skb offload
+ * state: restore the accelerated VLAN tag, the checksum mode and — when
+ * the sender recorded an MSS — the GSO parameters of the inner packet.
+ * Returns false if the header is malformed.
+ */
+static bool set_offloads(struct sk_buff *skb)
+{
+	struct stthdr *stth = stt_hdr(skb);
+	unsigned short gso_type;
+	int l3_header_size;
+	int l4_header_size;
+	u16 csum_offset;
+	u8 proto_type;
+
+	if (stth->vlan_tci)
+		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
+				       ntohs(stth->vlan_tci));
+
+	if (!(stth->flags & STT_CSUM_PARTIAL)) {
+		if (stth->flags & STT_CSUM_VERIFIED)
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+		else
+			skb->ip_summed = CHECKSUM_NONE;
+
+		return clear_gso(skb) == 0;
+	}
+
+	/* Sender preserved partial-checksum state; reconstruct it from the
+	 * protocol bits.
+	 */
+	proto_type = stth->flags & STT_PROTO_TYPES;
+
+	switch (proto_type) {
+	case (STT_PROTO_IPV4 | STT_PROTO_TCP):
+		/* TCP/IPv4 */
+		csum_offset = offsetof(struct tcphdr, check);
+		gso_type = SKB_GSO_TCPV4;
+		l3_header_size = sizeof(struct iphdr);
+		l4_header_size = sizeof(struct tcphdr);
+		skb->protocol = htons(ETH_P_IP);
+		break;
+	case STT_PROTO_TCP:
+		/* TCP/IPv6 */
+		csum_offset = offsetof(struct tcphdr, check);
+		gso_type = SKB_GSO_TCPV6;
+		l3_header_size = sizeof(struct ipv6hdr);
+		l4_header_size = sizeof(struct tcphdr);
+		skb->protocol = htons(ETH_P_IPV6);
+		break;
+	case STT_PROTO_IPV4:
+		/* UDP/IPv4 */
+		csum_offset = offsetof(struct udphdr, check);
+		gso_type = SKB_GSO_UDP;
+		l3_header_size = sizeof(struct iphdr);
+		l4_header_size = sizeof(struct udphdr);
+		skb->protocol = htons(ETH_P_IP);
+		break;
+	default:
+		/* UDP/IPv6 */
+		csum_offset = offsetof(struct udphdr, check);
+		gso_type = SKB_GSO_UDP;
+		l3_header_size = sizeof(struct ipv6hdr);
+		l4_header_size = sizeof(struct udphdr);
+		skb->protocol = htons(ETH_P_IPV6);
+	}
+
+	/* Sanity-check the recorded L4 offset against the minimum headers. */
+	if (unlikely(stth->l4_offset < ETH_HLEN + l3_header_size))
+		return false;
+
+	if (unlikely(!pskb_may_pull(skb, stth->l4_offset + l4_header_size)))
+		return false;
+
+	/* Re-read: pskb_may_pull() may have moved the header. */
+	stth = stt_hdr(skb);
+
+	skb->csum_start = skb_headroom(skb) + stth->l4_offset;
+	skb->csum_offset = csum_offset;
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	if (stth->mss) {
+		if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
+			return false;
+
+		/* DODGY: the GSO metadata came off the wire and must be
+		 * re-validated by the stack before it is trusted.
+		 */
+		skb_shinfo(skb)->gso_type = gso_type | SKB_GSO_DODGY;
+		skb_shinfo(skb)->gso_size = ntohs(stth->mss);
+		skb_shinfo(skb)->gso_segs = 0;
+	} else {
+		if (unlikely(clear_gso(skb)))
+			return false;
+	}
+
+	return true;
+}
+/* Receive one STT frame: verify the outer checksum, reassemble
+ * fragments, strip the encapsulation, restore the inner offload state
+ * and deliver the inner packet via the socket's rcv callback.
+ */
+static void stt_rcv(struct stt_sock *stt_sock, struct sk_buff *skb)
+{
+	int err;
+
+	if (unlikely(!validate_checksum(skb)))
+		goto drop;
+
+	/* NULL means the fragment was queued (or dropped) internally. */
+	skb = reassemble(skb);
+	if (!skb)
+		return;
+
+	if (skb->next && coalesce_skb(&skb))
+		goto drop;
+
+	err = iptunnel_pull_header(skb,
+				   sizeof(struct stthdr) + STT_ETH_PAD,
+				   htons(ETH_P_TEB));
+	if (unlikely(err))
+		goto drop;
+
+	if (unlikely(stt_hdr(skb)->version != 0))
+		goto drop;
+
+	if (unlikely(!set_offloads(skb)))
+		goto drop;
+
+	if (skb_shinfo(skb)->frag_list && try_to_segment(skb))
+		goto drop;
+
+	stt_sock->rcv(stt_sock, skb);
+	return;
+drop:
+	/* Consume bad packet */
+	kfree_skb_list(skb);
+}
+
+/* Shut down and destroy a kernel TCP socket created by
+ * tcp_sock_create4().
+ */
+static void tcp_sock_release(struct socket *sock)
+{
+	kernel_sock_shutdown(sock, SHUT_RDWR);
+	sk_release_kernel(sock->sk);
+}
+
+/* Create a kernel TCP socket in namespace "net" bound to "port" on all
+ * addresses.  No listen() is issued — inbound STT traffic is stolen by
+ * nf_ip_hook() before the TCP stack sees it, so the socket appears to
+ * exist only to reserve the port.  Returns 0 with *sockp set, or a
+ * negative errno with *sockp = NULL.
+ */
+static int tcp_sock_create4(struct net *net, __be16 port,
+			    struct socket **sockp)
+{
+	struct sockaddr_in tcp_addr;
+	struct socket *sock = NULL;
+	int err;
+
+	err = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (err < 0)
+		goto error;
+
+	/* Move the socket into the target namespace. */
+	sk_change_net(sock->sk, net);
+
+	memset(&tcp_addr, 0, sizeof(tcp_addr));
+	tcp_addr.sin_family = AF_INET;
+	tcp_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+	tcp_addr.sin_port = port;
+	err = kernel_bind(sock, (struct sockaddr *)&tcp_addr,
+			  sizeof(tcp_addr));
+	if (err < 0)
+		goto error;
+
+	*sockp = sock;
+	return 0;
+
+error:
+	if (sock)
+		tcp_sock_release(sock);
+	*sockp = NULL;
+	return err;
+}
+
+/* (Re)arm the periodic reassembly-table cleaner. */
+static void schedule_clean_percpu(void)
+{
+	schedule_delayed_work(&clean_percpu_wq, CLEAN_PERCPU_INTERVAL);
+}
+
+/* Periodic work item: scan every CPU's reassembly table and free
+ * entries whose packets have been waiting longer than FRAG_EXP_TIME,
+ * then re-arm itself.
+ */
+static void clean_percpu(struct work_struct *work)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
+		int j;
+
+		for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
+			struct pkt_frag *frag;
+
+			/* Cheap lockless pre-check to skip live slots. */
+			frag = flex_array_get(stt_percpu->frag_hash, j);
+			if (!frag->skbs ||
+			    time_before(jiffies, frag->timestamp + FRAG_EXP_TIME))
+				continue;
+
+			spin_lock_bh(&stt_percpu->lock);
+
+			/* Re-check under the lock before freeing. */
+			if (frag->skbs &&
+			    time_after(jiffies, frag->timestamp + FRAG_EXP_TIME))
+				free_frag(stt_percpu, frag);
+
+			spin_unlock_bh(&stt_percpu->lock);
+		}
+	}
+	schedule_clean_percpu();
+}
+
+/* Linux 3.13 changed the first argument of netfilter hook functions
+ * from the hook number to the nf_hook_ops pointer; paper over the
+ * difference so one function body serves both kernel ranges.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,0)
+#define FIRST_PARAM const struct nf_hook_ops *ops,
+#else
+#define FIRST_PARAM unsigned int hooknum,
+#endif
+
+/* Netfilter LOCAL_IN hook: steal TCP packets whose destination port
+ * has a registered STT socket and feed them to stt_rcv().  Everything
+ * else passes through untouched (NF_ACCEPT).
+ */
+static unsigned int nf_ip_hook(FIRST_PARAM
+			       struct sk_buff *skb,
+			       const struct net_device *in,
+			       const struct net_device *out,
+			       int (*okfn)(struct sk_buff *))
+{
+	struct stt_sock *stt_sock;
+	int ip_hdr_len;
+
+	if (ip_hdr(skb)->protocol != IPPROTO_TCP)
+		return NF_ACCEPT;
+
+	/* Make sure the base TCP header is in the linear area before
+	 * reading the port.
+	 */
+	ip_hdr_len = ip_hdrlen(skb);
+	if (unlikely(!pskb_may_pull(skb, ip_hdr_len + sizeof(struct tcphdr))))
+		return NF_ACCEPT;
+
+	skb_set_transport_header(skb, ip_hdr_len);
+
+	stt_sock = stt_find_sock(dev_net(skb->dev), tcp_hdr(skb)->dest);
+	if (!stt_sock)
+		return NF_ACCEPT;
+
+	/* Strip IP + TCP headers; stt_rcv() consumes the skb, so tell
+	 * netfilter we took ownership.
+	 */
+	__skb_pull(skb, ip_hdr_len + sizeof(struct tcphdr));
+	stt_rcv(stt_sock, skb);
+	return NF_STOLEN;
+}
+
+/* Hook at LOCAL_IN with the lowest priority (INT_MAX) so STT
+ * interception runs after every other LOCAL_IN hook has had a look.
+ */
+static struct nf_hook_ops nf_hook_ops __read_mostly = {
+	.hook = nf_ip_hook,
+	.owner = THIS_MODULE,
+	.pf = NFPROTO_IPV4,
+	.hooknum = NF_INET_LOCAL_IN,
+	.priority = INT_MAX,
+};
+
+/* Bring up the shared STT receive machinery on first use.
+ *
+ * n_tunnels acts as a reference count: subsequent callers just bump
+ * it.  The first caller seeds the fragment hash, allocates the
+ * per-CPU reassembly state, registers the netfilter hook and starts
+ * the periodic cleaner.  Returns 0 on success or a negative errno
+ * with all partially-allocated state torn down.
+ *
+ * NOTE(review): n_tunnels is read and modified without a lock here
+ * (rpl_stt_sock_add() calls this before taking stt_mutex); presumably
+ * callers serialize at a higher level (e.g. RTNL) -- confirm.
+ */
+static int stt_start(void)
+{
+	int err;
+	int i;
+
+	if (n_tunnels) {
+		n_tunnels++;
+		return 0;
+	}
+	get_random_bytes(&frag_hash_seed, sizeof(u32));
+
+	stt_percpu_data = alloc_percpu(struct stt_percpu);
+	if (!stt_percpu_data) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	for_each_possible_cpu(i) {
+		struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
+		struct flex_array *frag_hash;
+
+		spin_lock_init(&stt_percpu->lock);
+		INIT_LIST_HEAD(&stt_percpu->frag_lru);
+		get_random_bytes(&per_cpu(pkt_seq_counter, i), sizeof(u32));
+
+		frag_hash = flex_array_alloc(sizeof(struct pkt_frag),
+					     FRAG_HASH_ENTRIES,
+					     GFP_KERNEL | __GFP_ZERO);
+		if (!frag_hash) {
+			err = -ENOMEM;
+			goto free_percpu;
+		}
+		stt_percpu->frag_hash = frag_hash;
+
+		/* flex_array_alloc() only sets up metadata; prealloc
+		 * the parts so receive-path lookups never allocate.
+		 */
+		err = flex_array_prealloc(stt_percpu->frag_hash, 0,
+					  FRAG_HASH_ENTRIES,
+					  GFP_KERNEL | __GFP_ZERO);
+		if (err)
+			goto free_percpu;
+	}
+	err = nf_register_hook(&nf_hook_ops);
+	if (err)
+		goto free_percpu;
+
+	schedule_clean_percpu();
+	n_tunnels++;
+	return 0;
+
+free_percpu:
+	/* alloc_percpu() zero-fills, so frag_hash is NULL for CPUs
+	 * whose table was never allocated; the check below is safe.
+	 */
+	for_each_possible_cpu(i) {
+		struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
+
+		if (stt_percpu->frag_hash)
+			flex_array_free(stt_percpu->frag_hash);
+	}
+
+	free_percpu(stt_percpu_data);
+
+error:
+	return err;
+}
+
+/* Drop one reference on the shared STT machinery; the last user stops
+ * the periodic cleaner, unregisters the netfilter hook (so no new
+ * fragments can be queued) and then frees all per-CPU reassembly
+ * state including any still-queued fragment lists.
+ *
+ * NOTE(review): like stt_start(), this relies on the caller to
+ * serialize updates to n_tunnels -- confirm.
+ */
+static void stt_cleanup(void)
+{
+	int i;
+
+	n_tunnels--;
+	if (n_tunnels)
+		return;
+
+	cancel_delayed_work_sync(&clean_percpu_wq);
+	nf_unregister_hook(&nf_hook_ops);
+
+	for_each_possible_cpu(i) {
+		struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
+		int j;
+
+		/* The hook is gone, so no new fragments can appear;
+		 * free whatever reassembly state is left.
+		 */
+		for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
+			struct pkt_frag *frag;
+
+			frag = flex_array_get(stt_percpu->frag_hash, j);
+			kfree_skb_list(frag->skbs);
+		}
+
+		flex_array_free(stt_percpu->frag_hash);
+	}
+
+	free_percpu(stt_percpu_data);
+}
+
+/* Allocate an stt_sock, back it with a bound kernel TCP socket on
+ * @port in namespace @net, and publish it on the per-namespace socket
+ * list (RCU).  @rcv/@data are the delivery callback and its cookie.
+ * Returns the new stt_sock or an ERR_PTR() on failure.
+ */
+static struct stt_sock *stt_socket_create(struct net *net, __be16 port,
+					  stt_rcv_t *rcv, void *data)
+{
+	struct stt_net *sn = net_generic(net, stt_net_id);
+	struct stt_sock *stt_sock;
+	struct socket *sock;
+	int err;
+
+	stt_sock = kzalloc(sizeof(*stt_sock), GFP_KERNEL);
+	if (!stt_sock)
+		return ERR_PTR(-ENOMEM);
+
+	err = tcp_sock_create4(net, port, &sock);
+	if (err)
+		goto free_stt_sock;
+
+	stt_sock->sock = sock;
+	stt_sock->rcv = rcv;
+	stt_sock->rcv_data = data;
+
+	/* Publish last, once fully initialized, so RCU readers never
+	 * see a half-built entry.
+	 */
+	list_add_rcu(&stt_sock->list, &sn->sock_list);
+
+	return stt_sock;
+
+free_stt_sock:
+	kfree(stt_sock);
+	return ERR_PTR(err);
+}
+
+/* Unpublish and destroy an stt_sock.  The list removal is RCU-safe
+ * and the struct itself is freed only after a grace period
+ * (kfree_rcu), so concurrent readers of the socket list stay valid.
+ * Caller holds stt_mutex.
+ */
+static void __stt_sock_release(struct stt_sock *stt_sock)
+{
+	list_del_rcu(&stt_sock->list);
+	tcp_sock_release(stt_sock->sock);
+	kfree_rcu(stt_sock, rcu);
+}
+
+/* Public entry point: register an STT listener on @port in namespace
+ * @net, delivering decapsulated packets to @rcv with cookie @data.
+ *
+ * Takes a reference on the shared STT machinery first; if socket
+ * creation fails (including -EBUSY when the port already has a
+ * listener) the reference is dropped again so module state stays
+ * balanced.  Returns the new stt_sock or an ERR_PTR().
+ */
+struct stt_sock *rpl_stt_sock_add(struct net *net, __be16 port,
+				  stt_rcv_t *rcv, void *data)
+{
+	struct stt_sock *stt_sock;
+	int err;
+
+	err = stt_start();
+	if (err)
+		return ERR_PTR(err);
+
+	mutex_lock(&stt_mutex);
+	/* Lookup under RCU; stt_mutex prevents a concurrent add from
+	 * racing the check-then-create below.
+	 */
+	rcu_read_lock();
+	stt_sock = stt_find_sock(net, port);
+	rcu_read_unlock();
+	if (stt_sock)
+		stt_sock = ERR_PTR(-EBUSY);
+	else
+		stt_sock = stt_socket_create(net, port, rcv, data);
+
+	mutex_unlock(&stt_mutex);
+
+	/* Undo the stt_start() reference on any failure. */
+	if (IS_ERR(stt_sock))
+		stt_cleanup();
+
+	return stt_sock;
+}
+EXPORT_SYMBOL_GPL(rpl_stt_sock_add);
+
+/* Public entry point: tear down a listener created by
+ * rpl_stt_sock_add() and drop its reference on the shared STT
+ * machinery.  NULL is tolerated and does nothing.
+ */
+void rpl_stt_sock_release(struct stt_sock *stt_sock)
+{
+	mutex_lock(&stt_mutex);
+	if (stt_sock) {
+		__stt_sock_release(stt_sock);
+		stt_cleanup();
+	}
+	mutex_unlock(&stt_mutex);
+}
+EXPORT_SYMBOL_GPL(rpl_stt_sock_release);
+
+/* Per-namespace init: start with an empty STT socket list. */
+static int stt_init_net(struct net *net)
+{
+	struct stt_net *sn = net_generic(net, stt_net_id);
+
+	INIT_LIST_HEAD(&sn->sock_list);
+	return 0;
+}
+
+/* Pernet registration: the core allocates a zeroed struct stt_net per
+ * namespace and hands it back via net_generic(net, stt_net_id).
+ */
+static struct pernet_operations stt_net_ops = {
+	.init = stt_init_net,
+	.id = &stt_net_id,
+	.size = sizeof(struct stt_net),
+};
+
+/* Module init: register the per-namespace state.  The heavier global
+ * state is created lazily by stt_start() when the first tunnel is
+ * added.
+ */
+int ovs_stt_init_module(void)
+{
+	return register_pernet_subsys(&stt_net_ops);
+}
+EXPORT_SYMBOL_GPL(ovs_stt_init_module);
+
+/* Module exit: unregister the per-namespace state. */
+void ovs_stt_cleanup_module(void)
+{
+	unregister_pernet_subsys(&stt_net_ops);
+}
+EXPORT_SYMBOL_GPL(ovs_stt_cleanup_module);
+#endif