datapath: Add support for Geneve tunneling.
author Jesse Gross <jesse@nicira.com>
Fri, 6 Jun 2014 02:07:32 +0000 (19:07 -0700)
committer Jesse Gross <jesse@nicira.com>
Fri, 20 Jun 2014 22:19:35 +0000 (15:19 -0700)
This adds support for Geneve - Generic Network Virtualization
Encapsulation. The protocol is documented at
http://tools.ietf.org/html/draft-gross-geneve-00

The kernel implementation is completely agnostic to the options
that are in use and can handle newly defined options without
further work. It does this by simply matching on a byte array
of options and allowing userspace to setup flows on this array.

Userspace currently implements support for only the basic version of
Geneve. It can work with the base header (including the VNI) and
is capable of parsing options but does not currently support any
particular option definitions. Over time, the intention is to
allow options to be matched through OpenFlow without requiring
explicit support in OVS userspace.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
25 files changed:
NEWS
datapath/Modules.mk
datapath/datapath.c
datapath/flow.c
datapath/flow.h
datapath/flow_netlink.c
datapath/flow_netlink.h
datapath/linux/Modules.mk
datapath/linux/compat/include/net/geneve.h [new file with mode: 0644]
datapath/linux/compat/include/net/ip_tunnels.h
datapath/vport-geneve.c [new file with mode: 0644]
datapath/vport-gre.c
datapath/vport-lisp.c
datapath/vport-vxlan.c
datapath/vport.c
datapath/vport.h
include/linux/openvswitch.h
lib/dpif-linux.c
lib/netdev-vport.c
lib/odp-util.c
lib/odp-util.h
lib/packets.h
tests/ovs-vsctl.at
tests/tunnel.at
vswitchd/vswitch.xml

diff --git a/NEWS b/NEWS
index 23d0523..26b0d74 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,10 @@ Post-v2.3.0
    - The "learn" action supports a new flag "delete_learned" that causes
      the learned flows to be deleted when the flow with the "learn" action
      is deleted.
+   - Basic support for the Geneve tunneling protocol. It is not yet
+     possible to generate or match options. This is planned for a future
+     release. The protocol is documented at
+     http://tools.ietf.org/html/draft-gross-geneve-00
 
 
 v2.3.0 - xx xxx xxxx
index b652411..41ffbea 100644 (file)
@@ -14,6 +14,7 @@ openvswitch_sources = \
        flow_netlink.c \
        flow_table.c \
        vport.c \
+       vport-geneve.c \
        vport-gre.c \
        vport-internal_dev.c \
        vport-lisp.c \
index 37e3243..6f4236b 100644 (file)
@@ -394,6 +394,7 @@ static size_t key_attr_size(void)
                  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */
                  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_CSUM */
                  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_OAM */
+                 + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */
                + nla_total_size(4)   /* OVS_KEY_ATTR_IN_PORT */
                + nla_total_size(4)   /* OVS_KEY_ATTR_SKB_MARK */
                + nla_total_size(4)   /* OVS_KEY_ATTR_DP_HASH */
@@ -488,7 +489,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
        upcall->dp_ifindex = dp_ifindex;
 
        nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
-       err = ovs_nla_put_flow(upcall_info->key, upcall_info->key, user_skb);
+       err = ovs_nla_put_flow(dp, upcall_info->key,
+                              upcall_info->key, user_skb);
        BUG_ON(err);
        nla_nest_end(user_skb, nla);
 
@@ -696,7 +698,8 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts)
 }
 
 /* Called with ovs_mutex or RCU read lock. */
-static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
+static int ovs_flow_cmd_fill_info(struct datapath *dp,
+                                 const struct sw_flow *flow, int dp_ifindex,
                                  struct sk_buff *skb, u32 portid,
                                  u32 seq, u32 flags, u8 cmd)
 {
@@ -720,7 +723,8 @@ static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
        if (!nla)
                goto nla_put_failure;
 
-       err = ovs_nla_put_flow(&flow->unmasked_key, &flow->unmasked_key, skb);
+       err = ovs_nla_put_flow(dp, &flow->unmasked_key,
+                              &flow->unmasked_key, skb);
        if (err)
                goto error;
        nla_nest_end(skb, nla);
@@ -729,7 +733,7 @@ static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
        if (!nla)
                goto nla_put_failure;
 
-       err = ovs_nla_put_flow(&flow->key, &flow->mask->key, skb);
+       err = ovs_nla_put_flow(dp, &flow->key, &flow->mask->key, skb);
        if (err)
                goto error;
 
@@ -806,7 +810,8 @@ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *act
 }
 
 /* Called with ovs_mutex. */
-static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow,
+static struct sk_buff *ovs_flow_cmd_build_info(struct datapath *dp,
+                                              const struct sw_flow *flow,
                                               int dp_ifindex,
                                               struct genl_info *info, u8 cmd,
                                               bool always)
@@ -819,7 +824,7 @@ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow,
        if (!skb || IS_ERR(skb))
                return skb;
 
-       retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb,
+       retval = ovs_flow_cmd_fill_info(dp, flow, dp_ifindex, skb,
                                        info->snd_portid, info->snd_seq, 0,
                                        cmd);
        BUG_ON(retval < 0);
@@ -900,7 +905,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
                }
 
                if (unlikely(reply)) {
-                       error = ovs_flow_cmd_fill_info(new_flow,
+                       error = ovs_flow_cmd_fill_info(dp, new_flow,
                                                       ovs_header->dp_ifindex,
                                                       reply, info->snd_portid,
                                                       info->snd_seq, 0,
@@ -932,7 +937,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
                rcu_assign_pointer(flow->sf_acts, acts);
 
                if (unlikely(reply)) {
-                       error = ovs_flow_cmd_fill_info(flow,
+                       error = ovs_flow_cmd_fill_info(dp, flow,
                                                       ovs_header->dp_ifindex,
                                                       reply, info->snd_portid,
                                                       info->snd_seq, 0,
@@ -1048,7 +1053,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
                rcu_assign_pointer(flow->sf_acts, acts);
 
                if (unlikely(reply)) {
-                       error = ovs_flow_cmd_fill_info(flow,
+                       error = ovs_flow_cmd_fill_info(dp, flow,
                                                       ovs_header->dp_ifindex,
                                                       reply, info->snd_portid,
                                                       info->snd_seq, 0,
@@ -1057,7 +1062,8 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
                }
        } else {
                /* Could not alloc without acts before locking. */
-               reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex,
+               reply = ovs_flow_cmd_build_info(dp, flow,
+                                               ovs_header->dp_ifindex,
                                                info, OVS_FLOW_CMD_NEW, false);
                if (unlikely(IS_ERR(reply))) {
                        error = PTR_ERR(reply);
@@ -1119,7 +1125,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
                goto unlock;
        }
 
-       reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info,
+       reply = ovs_flow_cmd_build_info(dp, flow, ovs_header->dp_ifindex, info,
                                        OVS_FLOW_CMD_NEW, true);
        if (IS_ERR(reply)) {
                err = PTR_ERR(reply);
@@ -1176,7 +1182,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
        if (likely(reply)) {
                if (likely(!IS_ERR(reply))) {
                        rcu_read_lock(); /* Keep RCU checker happy. */
-                       err = ovs_flow_cmd_fill_info(flow,
+                       err = ovs_flow_cmd_fill_info(dp, flow,
                                                     ovs_header->dp_ifindex,
                                                     reply, info->snd_portid,
                                                     info->snd_seq, 0,
@@ -1222,7 +1228,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
                if (!flow)
                        break;
 
-               if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb,
+               if (ovs_flow_cmd_fill_info(dp, flow, ovs_header->dp_ifindex, skb,
                                           NETLINK_CB(cb->skb).portid,
                                           cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                           OVS_FLOW_CMD_NEW) < 0)
index f1bb95d..e90f99a 100644 (file)
@@ -455,7 +455,17 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
                struct ovs_tunnel_info *tun_info = OVS_CB(skb)->tun_info;
                memcpy(&key->tun_key, &tun_info->tunnel,
                        sizeof(key->tun_key));
+               if (tun_info->options) {
+                       BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * 8)) - 1
+                                       > sizeof(key->tun_opts));
+                       memcpy(GENEVE_OPTS(key, tun_info->options_len),
+                               tun_info->options, tun_info->options_len);
+                       key->tun_opts_len = tun_info->options_len;
+               } else {
+                       key->tun_opts_len = 0;
+               }
        } else {
+               key->tun_opts_len = 0;
                memset(&key->tun_key, 0, sizeof(key->tun_key));
        }
 
index 0ecf78b..9414869 100644 (file)
@@ -53,11 +53,24 @@ struct ovs_key_ipv4_tunnel {
 
 struct ovs_tunnel_info {
        struct ovs_key_ipv4_tunnel tunnel;
+       struct geneve_opt *options;     /* Geneve options; may be NULL. */
+       u8 options_len;                 /* Size of the options in bytes. */
 };
 
+/* Store options at the end of the array if they are less than the
+ * maximum size. This allows us to get the benefits of variable length
+ * matching for small options.
+ *
+ * Evaluates to a pointer to the first of the opt_len option bytes, which
+ * occupy the tail of (flow_key)->tun_opts. The expansion and its arguments
+ * are fully parenthesized so the macro can be used inside any expression
+ * (e.g. followed by "->" or with an arithmetic expression as opt_len).
+ */
+#define GENEVE_OPTS(flow_key, opt_len) \
+       ((struct geneve_opt *)((flow_key)->tun_opts + \
+                              FIELD_SIZEOF(struct sw_flow_key, tun_opts) - \
+                              (opt_len)))
+
 static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
                                         const struct iphdr *iph, __be64 tun_id,
-                                        __be16 tun_flags)
+                                        __be16 tun_flags,
+                                        struct geneve_opt *opts,
+                                        u8 opts_len)
 {
        tun_info->tunnel.tun_id = tun_id;
        tun_info->tunnel.ipv4_src = iph->saddr;
@@ -69,9 +82,14 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
        /* clear struct padding. */
        memset((unsigned char *) &tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, 0,
               sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE);
+
+       tun_info->options = opts;
+       tun_info->options_len = opts_len;
 }
 
 struct sw_flow_key {
+       u8 tun_opts[255];
+       u8 tun_opts_len;
        struct ovs_key_ipv4_tunnel tun_key;  /* Encapsulating tunnel key. */
        struct {
                u32     priority;       /* Packet QoS priority. */
index c5ca2f4..22ad2d0 100644 (file)
@@ -42,6 +42,7 @@
 #include <linux/icmp.h>
 #include <linux/icmpv6.h>
 #include <linux/rculist.h>
+#include <net/geneve.h>
 #include <net/ip.h>
 #include <net/ip_tunnels.h>
 #include <net/ipv6.h>
@@ -89,18 +90,21 @@ static void update_range__(struct sw_flow_match *match,
                }                                                           \
        } while (0)
 
-#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \
+/* Copy len bytes from value_p to byte position offset inside the match key
+ * or, when is_mask is set and a mask is present, inside the mask key. The
+ * range is first recorded through update_range__(). offset is evaluated
+ * more than once, so it must be free of side effects.
+ */
+#define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask) \
        do { \
-               update_range__(match, offsetof(struct sw_flow_key, field),  \
-                               len, is_mask);                              \
+               update_range__(match, offset, len, is_mask);                \
                if (is_mask) {                                              \
                        if ((match)->mask)                                  \
-                               memcpy(&(match)->mask->key.field, value_p, len);\
+                               memcpy((u8 *)&(match)->mask->key + (offset), value_p, len);\
                } else {                                                    \
-                       memcpy(&(match)->key->field, value_p, len);         \
+                       memcpy((u8 *)(match)->key + (offset), value_p, len);       \
                }                                                           \
        } while (0)
 
+/* Same as SW_FLOW_KEY_MEMCPY_OFFSET, addressing the destination by the name
+ * of a struct sw_flow_key member instead of a byte offset.
+ */
+#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \
+       SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \
+                                 value_p, len, is_mask)
+
 static u16 range_n_bytes(const struct sw_flow_key_range *range)
 {
        return range->end - range->start;
@@ -348,6 +352,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
                        [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0,
                        [OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
                        [OVS_TUNNEL_KEY_ATTR_OAM] = 0,
+                       [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = -1,
                };
 
                if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
@@ -356,7 +361,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
                        return -EINVAL;
                }
 
-               if (ovs_tunnel_key_lens[type] != nla_len(a)) {
+               if (ovs_tunnel_key_lens[type] != nla_len(a) &&
+                   ovs_tunnel_key_lens[type] != -1) {
                        OVS_NLERR("IPv4 tunnel attribute type has unexpected "
                                  " length (type=%d, length=%d, expected=%d).\n",
                                  type, nla_len(a), ovs_tunnel_key_lens[type]);
@@ -395,6 +401,56 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
                case OVS_TUNNEL_KEY_ATTR_OAM:
                        tun_flags |= TUNNEL_OAM;
                        break;
+               case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS:
+                       if (nla_len(a) > sizeof(match->key->tun_opts)) {
+                               OVS_NLERR("Geneve option length exceeds "
+                                         "maximum size (len %d, max %zu).\n",
+                                         nla_len(a),
+                                         sizeof(match->key->tun_opts));
+                               return -EINVAL;
+                       }
+
+                       if (nla_len(a) % 4 != 0) {
+                               OVS_NLERR("Geneve option length is not "
+                                         "a multiple of 4 (len %d).\n",
+                                         nla_len(a));
+                               return -EINVAL;
+                       }
+
+                       /* We need to record the length of the options passed
+                        * down, otherwise packets with the same format but
+                        * additional options will be silently matched.
+                        */
+                       if (!is_mask) {
+                               SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a),
+                                               false);
+                       } else {
+                               /* This is somewhat unusual because it looks at
+                                * both the key and mask while parsing the
+                                * attributes (and by extension assumes the key
+                                * is parsed first). Normally, we would verify
+                                * that each is the correct length and that the
+                                * attributes line up in the validate function.
+                                * However, that is difficult because this is
+                                * variable length and we won't have the
+                                * information later.
+                                */
+                               if (match->key->tun_opts_len != nla_len(a)) {
+                                       OVS_NLERR("Geneve option key length (%d)"
+                                          " is different from mask length (%d).",
+                                          match->key->tun_opts_len, nla_len(a));
+                                       return -EINVAL;
+                               }
+
+                               SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff,
+                                               true);
+                       }
+
+                       SW_FLOW_KEY_MEMCPY_OFFSET(match,
+                               (unsigned long)GENEVE_OPTS((struct sw_flow_key *)0,
+                                                          nla_len(a)),
+                               nla_data(a), nla_len(a), is_mask);
+                       break;
                default:
                        return -EINVAL;
                }
@@ -423,8 +479,9 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 }
 
 static int ipv4_tun_to_nlattr(struct sk_buff *skb,
-                             const struct ovs_key_ipv4_tunnel *tun_key,
-                             const struct ovs_key_ipv4_tunnel *output)
+                             const struct ovs_key_ipv4_tunnel *output,
+                             const struct geneve_opt *tun_opts,
+                             int swkey_tun_opts_len)
 {
        struct nlattr *nla;
 
@@ -455,6 +512,9 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb,
        if ((output->tun_flags & TUNNEL_OAM) &&
                nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM))
                return -EMSGSIZE;
+       /* Report failure when the Geneve options cannot be added to skb,
+        * exactly like the other tunnel attributes above.
+        */
+       if (tun_opts &&
+           nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,
+                   swkey_tun_opts_len, tun_opts))
+               return -EMSGSIZE;
 
        nla_nest_end(skb, nla);
        return 0;
@@ -900,7 +960,7 @@ int ovs_nla_get_flow_metadata(struct sw_flow *flow,
        return 0;
 }
 
-int ovs_nla_put_flow(const struct sw_flow_key *swkey,
+int ovs_nla_put_flow(struct datapath *dp, const struct sw_flow_key *swkey,
                     const struct sw_flow_key *output, struct sk_buff *skb)
 {
        struct ovs_key_ethernet *eth_key;
@@ -916,9 +976,24 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey,
        if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
                goto nla_put_failure;
 
-       if ((swkey->tun_key.ipv4_dst || is_mask) &&
-           ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key))
-               goto nla_put_failure;
+       if ((swkey->tun_key.ipv4_dst || is_mask)) {
+               const struct geneve_opt *opts = NULL;
+
+               if (!is_mask) {
+                       struct vport *in_port;
+
+                       /* NOTE(review): the vport lookup presumably returns
+                        * NULL when in_port no longer names a port (confirm
+                        * against ovs_vport_ovsl_rcu()); in that case emit no
+                        * Geneve options rather than dereferencing NULL.
+                        */
+                       in_port = ovs_vport_ovsl_rcu(dp, swkey->phy.in_port);
+                       if (in_port &&
+                           in_port->ops->type == OVS_VPORT_TYPE_GENEVE)
+                               opts = GENEVE_OPTS(output, swkey->tun_opts_len);
+               } else {
+                       if (output->tun_opts_len)
+                               opts = GENEVE_OPTS(output, swkey->tun_opts_len);
+               }
+
+               if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts,
+                                       swkey->tun_opts_len))
+                       goto nla_put_failure;
+       }
 
        if (swkey->phy.in_port == DP_MAX_PORTS) {
                if (is_mask && (output->phy.in_port == 0xffff))
@@ -1309,17 +1384,55 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
        if (err)
                return err;
 
+       if (key.tun_opts_len) {
+               struct geneve_opt *option = GENEVE_OPTS(&key,
+                                                       key.tun_opts_len);
+               int opts_len = key.tun_opts_len;
+               bool crit_opt = false;
+
+               while (opts_len > 0) {
+                       int len;
+
+                       if (opts_len < sizeof(*option))
+                               return -EINVAL;
+
+                       len = sizeof(*option) + option->length * 4;
+                       if (len > opts_len)
+                               return -EINVAL;
+
+                       crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE);
+
+                       option = (struct geneve_opt *)((u8 *)option + len);
+                       opts_len -= len;
+               };
+
+               key.tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0;
+       };
+
        start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET);
        if (start < 0)
                return start;
 
        a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL,
-                       sizeof(*tun_info));
+                       sizeof(*tun_info) + key.tun_opts_len);
        if (IS_ERR(a))
                return PTR_ERR(a);
 
        tun_info = nla_data(a);
        tun_info->tunnel = key.tun_key;
+       tun_info->options_len = key.tun_opts_len;
+
+       if (tun_info->options_len) {
+               /* We need to store the options in the action itself since
+                * everything else will go away after flow setup. We can append
+                * it to tun_info and then point there.
+                */
+               tun_info->options = (struct geneve_opt *)(tun_info + 1);
+               memcpy(tun_info->options, GENEVE_OPTS(&key, key.tun_opts_len),
+                       key.tun_opts_len);
+       } else {
+               tun_info->options = NULL;
+       }
 
        add_nested_action_end(*sfa, start);
 
@@ -1611,7 +1724,9 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
                        return -EMSGSIZE;
 
                err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel,
-                                        &tun_info->tunnel);
+                                        tun_info->options_len ?
+                                               tun_info->options : NULL,
+                                        tun_info->options_len);
                if (err)
                        return err;
                nla_nest_end(skb, start);
index 4401510..42de456 100644 (file)
@@ -40,7 +40,7 @@
 void ovs_match_init(struct sw_flow_match *match,
                    struct sw_flow_key *key, struct sw_flow_mask *mask);
 
-int ovs_nla_put_flow(const struct sw_flow_key *,
+int ovs_nla_put_flow(struct datapath *dp, const struct sw_flow_key *,
                     const struct sw_flow_key *, struct sk_buff *);
 int ovs_nla_get_flow_metadata(struct sw_flow *flow,
                              const struct nlattr *attr);
index 224eb02..46aa1f6 100644 (file)
@@ -63,6 +63,7 @@ openvswitch_headers += \
        linux/compat/include/net/dst.h \
        linux/compat/include/net/flow_keys.h \
        linux/compat/include/net/genetlink.h \
+       linux/compat/include/net/geneve.h \
        linux/compat/include/net/gre.h \
        linux/compat/include/net/inet_frag.h \
        linux/compat/include/net/ip.h \
diff --git a/datapath/linux/compat/include/net/geneve.h b/datapath/linux/compat/include/net/geneve.h
new file mode 100644 (file)
index 0000000..2cb294f
--- /dev/null
@@ -0,0 +1,23 @@
+#ifndef __NET_GENEVE_WRAPPER_H
+#define __NET_GENEVE_WRAPPER_H  1
+
+/* Not yet upstream. */
+#define GENEVE_CRIT_OPT_TYPE (1 << 7)  /* Bit of geneve_opt.type that marks a critical option. */
+struct geneve_opt {
+       __be16  opt_class;      /* "Option Class" field of the option header. */
+       u8      type;           /* Option type; see GENEVE_CRIT_OPT_TYPE. */
+#ifdef __LITTLE_ENDIAN_BITFIELD
+       u8      length:5;       /* Size of opt_data in 4-byte units. */
+       u8      r3:1;
+       u8      r2:1;
+       u8      r1:1;
+#else
+       u8      r1:1;
+       u8      r2:1;
+       u8      r3:1;
+       u8      length:5;       /* Size of opt_data in 4-byte units. */
+#endif
+        u8     opt_data[];     /* Variable-length option data. */
+};
+
+#endif
index e2f3c30..c7a14ef 100644 (file)
@@ -47,5 +47,6 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto);
 
 /* Not yet upstream */
 #define TUNNEL_OAM     __cpu_to_be16(0x0200)
+#define TUNNEL_CRIT_OPT        __cpu_to_be16(0x0400)
 
 #endif /* __NET_IP_TUNNELS_H */
diff --git a/datapath/vport-geneve.c b/datapath/vport-geneve.c
new file mode 100644 (file)
index 0000000..969e812
--- /dev/null
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/version.h>
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <linux/rculist.h>
+#include <linux/udp.h>
+
+#include <net/geneve.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/udp.h>
+#include <net/vxlan.h>
+#include <net/xfrm.h>
+
+#include "datapath.h"
+#include "gso.h"
+#include "vport.h"
+
+/*
+ * Geneve Header:
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |Ver|  Opt Len  |O|C|    Rsvd.  |          Protocol Type        |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |        Virtual Network Identifier (VNI)       |    Reserved   |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                    Variable Length Options                    |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Option Header:
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |          Option Class         |      Type     |R|R|R| Length  |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                      Variable Option Data                     |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+
+struct genevehdr {
+#ifdef __LITTLE_ENDIAN_BITFIELD
+       u8 opt_len:6;           /* Length of the options in 4-byte units. */
+       u8 ver:2;               /* Protocol version; GENEVE_VER is expected. */
+       u8 rsvd1:6;
+       u8 critical:1;          /* Carried as the TUNNEL_CRIT_OPT tunnel flag. */
+       u8 oam:1;               /* Carried as the TUNNEL_OAM tunnel flag. */
+#else
+       u8 ver:2;
+       u8 opt_len:6;
+       u8 oam:1;
+       u8 critical:1;
+       u8 rsvd1:6;
+#endif
+       __be16 proto_type;      /* Payload protocol; this file uses ETH_P_TEB. */
+       u8 vni[3];              /* 24-bit Virtual Network Identifier. */
+       u8 rsvd2;
+       struct geneve_opt options[];    /* opt_len * 4 bytes of options. */
+};
+
+/* Version written on transmit; received packets with another version are
+ * dropped.
+ */
+#define GENEVE_VER 0
+
+/* Size of the UDP header plus the fixed Geneve header, options excluded. */
+#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr))
+
+/**
+ * struct geneve_port - Keeps track of open UDP ports
+ * @sock: The socket created for this port number.
+ * @name: vport name.
+ *
+ * This is the type geneve_vport() returns for a vport's private data.
+ */
+struct geneve_port {
+       struct socket *sock;
+       char name[IFNAMSIZ];
+};
+
+static LIST_HEAD(geneve_ports);        /* NOTE(review): no user of this list is visible in this part of the file - confirm it is needed. */
+
+/* Return @vport's private data viewed as a struct geneve_port. */
+static inline struct geneve_port *geneve_vport(const struct vport *vport)
+{
+       struct geneve_port *priv = vport_priv(vport);
+
+       return priv;
+}
+
+/* Return the Geneve header, located directly after the UDP header of @skb. */
+static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
+{
+       struct udphdr *udph = udp_hdr(skb);
+
+       return (struct genevehdr *)&udph[1];
+}
+
+/* Convert 64 bit tunnel ID to 24 bit VNI.
+ *
+ * The three least significant bytes of the (big-endian) tunnel ID are stored
+ * in vni[0..2], most significant byte first.
+ */
+static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
+{
+       const u64 id = be64_to_cpu(tun_id);
+
+       vni[0] = (__u8)(id >> 16);
+       vni[1] = (__u8)(id >> 8);
+       vni[2] = (__u8)id;
+}
+
+/* Convert 24 bit VNI to 64 bit tunnel ID.
+ *
+ * vni[0..2] become the three least significant bytes of the (big-endian)
+ * tunnel ID; all higher bytes are zero.
+ */
+static __be64 vni_to_tunnel_id(__u8 *vni)
+{
+       const u64 id = ((u64)vni[0] << 16) | ((u64)vni[1] << 8) | (u64)vni[2];
+
+       return cpu_to_be64(id);
+}
+
+/* Write the UDP and Geneve headers of an outgoing packet in place.
+ *
+ * The headers start at udp_hdr(skb). Tunnel ID, flags and options are taken
+ * from OVS_CB(skb)->tun_info; the options are copied right after the fixed
+ * Geneve header.
+ */
+static void geneve_build_header(const struct vport *vport,
+                             struct sk_buff *skb)
+{
+       const struct ovs_tunnel_info *info = OVS_CB(skb)->tun_info;
+       struct geneve_port *port = geneve_vport(vport);
+       struct udphdr *uh = udp_hdr(skb);
+       struct genevehdr *gh = (struct genevehdr *)&uh[1];
+
+       /* UDP header: the destination is the port our own socket is bound to. */
+       uh->dest = inet_sport(port->sock->sk);
+       uh->source = vxlan_src_port(1, USHRT_MAX, skb);
+       uh->check = 0;
+       uh->len = htons(skb->len - skb_transport_offset(skb));
+
+       /* Fixed Geneve header; opt_len is expressed in 4-byte units. */
+       gh->ver = GENEVE_VER;
+       gh->opt_len = info->options_len / 4;
+       gh->oam = (info->tunnel.tun_flags & TUNNEL_OAM) ? 1 : 0;
+       gh->critical = (info->tunnel.tun_flags & TUNNEL_CRIT_OPT) ? 1 : 0;
+       gh->rsvd1 = 0;
+       gh->proto_type = htons(ETH_P_TEB);
+       tunnel_id_to_vni(info->tunnel.tun_id, gh->vni);
+       gh->rsvd2 = 0;
+
+       /* Variable-length options follow the fixed header. */
+       memcpy(gh->options, info->options, info->options_len);
+}
+
+/* Receive handler for Geneve-encapsulated UDP packets; installed as the
+ * socket's encap_rcv callback by geneve_socket_init().
+ *
+ * Validates the Geneve header, strips the UDP + Geneve headers (including
+ * options) and passes the packet to ovs_vport_receive() together with the
+ * tunnel metadata. Packets that fail any check are freed. Always returns 0.
+ */
+static int geneve_rcv(struct sock *sk, struct sk_buff *skb)
+{
+       struct geneve_port *geneve_port;
+       struct genevehdr *geneveh;
+       int opts_len;
+       struct ovs_tunnel_info tun_info;
+       __be64 key;
+       __be16 flags;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0)
+       if (unlikely(udp_lib_checksum_complete(skb)))
+               goto error;
+#endif
+
+       /* The fixed headers must be accessible before they are inspected. */
+       if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
+               goto error;
+
+       geneveh = geneve_hdr(skb);
+
+       if (unlikely(geneveh->ver != GENEVE_VER))
+               goto error;
+
+       /* Only Ethernet payloads are accepted. */
+       if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
+               goto error;
+
+       /* geneve_tnl_destroy() clears the user data, so it may be NULL. */
+       geneve_port = rcu_dereference_sk_user_data(sk);
+       if (unlikely(!geneve_port))
+               goto error;
+
+       /* opt_len is in 4-byte units; pull the options with the headers. */
+       opts_len = geneveh->opt_len * 4;
+       if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
+                                htons(ETH_P_TEB)))
+               goto error;
+
+       /* Re-read the header pointer after the pull. */
+       geneveh = geneve_hdr(skb);
+
+       flags = TUNNEL_KEY |
+               (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) |
+               (geneveh->oam ? TUNNEL_OAM : 0) |
+               (geneveh->critical ? TUNNEL_CRIT_OPT : 0);
+
+       /* tun_info.options points at the options inside the packet itself. */
+       key = vni_to_tunnel_id(geneveh->vni);
+       ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, flags,
+                               geneveh->options, opts_len);
+
+       ovs_vport_receive(vport_from_priv(geneve_port), skb, &tun_info);
+       goto out;
+
+error:
+       kfree_skb(skb);
+out:
+       return 0;
+}
+
+/* Arbitrary value.  Irrelevant as long as it's not 0 since we set the handler. */
+#define UDP_ENCAP_GENEVE 1
+/* Create the kernel UDP socket for a Geneve port.
+ *
+ * The socket is created, moved to @net, bound to INADDR_ANY:@dst_port and
+ * set up so that geneve_rcv() is called for received packets, with
+ * @geneve_port as the socket's user data.
+ *
+ * Returns 0 on success. On failure a warning is logged and the error from
+ * sock_create_kern() or kernel_bind() is returned; a socket that was already
+ * created is released.
+ */
+static int geneve_socket_init(struct geneve_port *geneve_port, struct net *net,
+                             __be16 dst_port)
+{
+       struct sockaddr_in sin;
+       int err;
+
+       err = sock_create_kern(AF_INET, SOCK_DGRAM, 0,
+                              &geneve_port->sock);
+       if (err)
+               goto error;
+
+       /* release net ref. */
+       sk_change_net(geneve_port->sock->sk, net);
+
+       sin.sin_family = AF_INET;
+       sin.sin_addr.s_addr = htonl(INADDR_ANY);
+       sin.sin_port = dst_port;
+
+       err = kernel_bind(geneve_port->sock,
+                         (struct sockaddr *)&sin, sizeof(struct sockaddr_in));
+       if (err)
+               goto error_sock;
+
+       /* geneve_rcv() reads the port back from the socket's user data. */
+       rcu_assign_sk_user_data(geneve_port->sock->sk, geneve_port);
+       udp_sk(geneve_port->sock->sk)->encap_type = UDP_ENCAP_GENEVE;
+       udp_sk(geneve_port->sock->sk)->encap_rcv = geneve_rcv;
+
+       udp_encap_enable();
+
+       return 0;
+
+error_sock:
+       sk_release_kernel(geneve_port->sock->sk);
+error:
+       pr_warn("cannot register geneve protocol handler: %d\n", err);
+       return err;
+}
+
+/*
+ * Reports the port's tunnel options into the netlink message 'skb': the UDP
+ * port the Geneve socket is bound to, in host byte order, as
+ * OVS_TUNNEL_ATTR_DST_PORT.
+ *
+ * Returns 0 on success or -EMSGSIZE if the attribute could not be added.
+ */
+static int geneve_get_options(const struct vport *vport,
+                             struct sk_buff *skb)
+{
+       struct geneve_port *port = geneve_vport(vport);
+       u16 bound_port = ntohs(inet_sport(port->sock->sk));
+
+       return nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, bound_port) ?
+               -EMSGSIZE : 0;
+}
+
+/*
+ * Tears down a Geneve vport: detaches the port from its socket, releases the
+ * socket and schedules the vport itself for deferred freeing.
+ */
+static void geneve_tnl_destroy(struct vport *vport)
+{
+       struct sock *sk = geneve_vport(vport)->sock->sk;
+
+       /* Clear the back-pointer first; geneve_rcv() drops packets when it
+        * finds no port attached to the socket. */
+       rcu_assign_sk_user_data(sk, NULL);
+       sk_release_kernel(sk);
+
+       ovs_vport_deferred_free(vport);
+}
+
+/*
+ * Creates a Geneve tunnel vport described by 'parms'.
+ *
+ * 'parms->options' must contain a 16-bit OVS_TUNNEL_ATTR_DST_PORT attribute
+ * (host byte order) naming the UDP port to listen on; there is no default.
+ *
+ * Returns the new vport, or an ERR_PTR(): -EINVAL when the options or the
+ * destination port are missing or malformed, otherwise the error from
+ * ovs_vport_alloc() or geneve_socket_init().
+ */
+static struct vport *geneve_tnl_create(const struct vport_parms *parms)
+{
+       struct net *net = ovs_dp_get_net(parms->dp);
+       struct nlattr *options = parms->options;
+       struct geneve_port *geneve_port;
+       struct vport *vport;
+       struct nlattr *a;
+       int err;
+       u16 dst_port;
+
+       if (!options) {
+               err = -EINVAL;
+               goto error;
+       }
+
+       a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT);
+       if (a && nla_len(a) == sizeof(u16)) {
+               dst_port = nla_get_u16(a);
+       } else {
+               /* Require destination port from userspace. */
+               err = -EINVAL;
+               goto error;
+       }
+
+       vport = ovs_vport_alloc(sizeof(struct geneve_port),
+                               &ovs_geneve_vport_ops, parms);
+       if (IS_ERR(vport))
+               return vport;
+
+       geneve_port = geneve_vport(vport);
+       strncpy(geneve_port->name, parms->name, IFNAMSIZ);
+       /* strncpy() leaves the destination unterminated when the source is
+        * IFNAMSIZ bytes or longer; terminate explicitly so the name handed
+        * out by geneve_get_name() is always a valid C string.  (Assumes
+        * 'name' holds at least IFNAMSIZ bytes, as the bound above already
+        * does.) */
+       geneve_port->name[IFNAMSIZ - 1] = '\0';
+
+       err = geneve_socket_init(geneve_port, net, htons(dst_port));
+       if (err)
+               goto error_free;
+
+       return vport;
+
+error_free:
+       ovs_vport_free(vport);
+error:
+       return ERR_PTR(err);
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,12,0)
+
+/*
+ * GSO fix-up callback: rewrites the UDP length field so that it covers
+ * everything from the transport header to the end of 'skb'.
+ */
+static void geneve_fix_segment(struct sk_buff *skb)
+{
+       udp_hdr(skb)->len = htons(skb->len - skb_transport_offset(skb));
+}
+
+/*
+ * Prepares 'skb' for transmission on kernels older than 3.12.  GSO packets
+ * get geneve_fix_segment() registered as their per-segment fix-up; other
+ * packets have ip_summed reset to CHECKSUM_NONE unless it is
+ * CHECKSUM_PARTIAL.  Always returns 0.
+ */
+static int handle_offloads(struct sk_buff *skb)
+{
+       if (skb_is_gso(skb)) {
+               OVS_GSO_CB(skb)->fix_segment = geneve_fix_segment;
+               return 0;
+       }
+
+       if (skb->ip_summed != CHECKSUM_PARTIAL)
+               skb->ip_summed = CHECKSUM_NONE;
+       return 0;
+}
+#else
+/*
+ * Prepares 'skb' for transmission on kernels 3.12 and newer.  GSO packets are
+ * uncloned and tagged SKB_GSO_UDP_TUNNEL; other packets have ip_summed reset
+ * to CHECKSUM_NONE unless it is CHECKSUM_PARTIAL.  On success the packet is
+ * marked as encapsulated and 0 is returned; if skb_unclone() fails its error
+ * is returned and the packet is left unmarked.
+ */
+static int handle_offloads(struct sk_buff *skb)
+{
+       if (!skb_is_gso(skb)) {
+               if (skb->ip_summed != CHECKSUM_PARTIAL)
+                       skb->ip_summed = CHECKSUM_NONE;
+       } else {
+               int err = skb_unclone(skb, GFP_ATOMIC);
+
+               if (unlikely(err))
+                       return err;
+
+               skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
+       }
+
+       skb->encapsulation = 1;
+       return 0;
+}
+#endif
+
+/*
+ * Encapsulates 'skb' in Geneve/UDP/IPv4 and transmits it, using the tunnel
+ * metadata attached to the packet (OVS_CB(skb)->tun_info).
+ *
+ * Returns the iptunnel_xmit() result plus the packet's original network
+ * offset when that result is positive, the unmodified iptunnel_xmit() result
+ * otherwise, or a negative errno if the packet could not be prepared
+ * (-EINVAL when no tunnel metadata is attached).  The error paths here do
+ * not free 'skb'; presumably the caller does -- confirm.
+ */
+static int geneve_send(struct vport *vport, struct sk_buff *skb)
+{
+       struct ovs_key_ipv4_tunnel *tun_key;
+       /* Captured before any header is pushed onto the packet. */
+       int network_offset = skb_network_offset(skb);
+       struct rtable *rt;
+       int min_headroom;
+       __be32 saddr;
+       __be16 df;
+       int sent_len;
+       int err;
+
+       if (unlikely(!OVS_CB(skb)->tun_info))
+               return -EINVAL;
+
+       /* Derive the key pointer only once tun_info is known to be non-NULL;
+        * forming it through a NULL tun_info is undefined behavior. */
+       tun_key = &OVS_CB(skb)->tun_info->tunnel;
+
+       /* Route lookup */
+       saddr = tun_key->ipv4_src;
+       rt = find_route(ovs_dp_get_net(vport->dp),
+                       &saddr, tun_key->ipv4_dst,
+                       IPPROTO_UDP, tun_key->ipv4_tos,
+                       skb->mark);
+       if (IS_ERR(rt)) {
+               err = PTR_ERR(rt);
+               goto error;
+       }
+
+       /* Room needed in front of the payload: link-layer and route headers,
+        * the Geneve header with its options, the outer IP header and,
+        * if a VLAN tag is pending, the tag that gets inserted below. */
+       min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
+                       + GENEVE_BASE_HLEN + OVS_CB(skb)->tun_info->options_len
+                       + sizeof(struct iphdr)
+                       + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
+
+       if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
+               /* The extra 16 bytes look like slack on top of the strict
+                * minimum -- confirm. */
+               int head_delta = SKB_DATA_ALIGN(min_headroom -
+                                               skb_headroom(skb) +
+                                               16);
+
+               err = pskb_expand_head(skb, max_t(int, head_delta, 0),
+                                       0, GFP_ATOMIC);
+               if (unlikely(err))
+                       goto err_free_rt;
+       }
+
+       /* Move a pending VLAN tag into the packet data so that it ends up
+        * inside the tunnel. */
+       if (vlan_tx_tag_present(skb)) {
+               if (unlikely(!__vlan_put_tag(skb,
+                                            skb->vlan_proto,
+                                            vlan_tx_tag_get(skb)))) {
+                       err = -ENOMEM;
+                       goto err_free_rt;
+               }
+               vlan_set_tci(skb, 0);
+       }
+
+       skb_reset_inner_headers(skb);
+
+       __skb_push(skb, GENEVE_BASE_HLEN + OVS_CB(skb)->tun_info->options_len);
+       skb_reset_transport_header(skb);
+
+       geneve_build_header(vport, skb);
+
+       /* Offloading */
+       err = handle_offloads(skb);
+       if (err)
+               goto err_free_rt;
+
+       df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+
+       sent_len = iptunnel_xmit(rt, skb,
+                            saddr, tun_key->ipv4_dst,
+                            IPPROTO_UDP, tun_key->ipv4_tos,
+                            tun_key->ipv4_ttl,
+                            df, false);
+
+       return sent_len > 0 ? sent_len + network_offset : sent_len;
+
+err_free_rt:
+       ip_rt_put(rt);
+error:
+       return err;
+}
+
+/* Returns the name stored in the Geneve port's private data. */
+static const char *geneve_get_name(const struct vport *vport)
+{
+       return geneve_vport(vport)->name;
+}
+
+/*
+ * Operations table for Geneve tunnel vports.  geneve_tnl_create() passes it
+ * to ovs_vport_alloc() for every port it creates.
+ */
+const struct vport_ops ovs_geneve_vport_ops = {
+       .type           = OVS_VPORT_TYPE_GENEVE,
+       .create         = geneve_tnl_create,
+       .destroy        = geneve_tnl_destroy,
+       .get_name       = geneve_get_name,
+       .get_options    = geneve_get_options,
+       .send           = geneve_send,
+};
index f30f090..d2a2602 100644 (file)
@@ -111,7 +111,7 @@ static int gre_rcv(struct sk_buff *skb,
 
        key = key_to_tunnel_id(tpi->key, tpi->seq);
        ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key,
-                              filter_tnl_flags(tpi->flags));
+                              filter_tnl_flags(tpi->flags), NULL, 0);
 
        ovs_vport_receive(vport, skb, &tun_info);
        return PACKET_RCVD;
index 8f96815..a124e73 100644 (file)
@@ -245,7 +245,7 @@ static int lisp_rcv(struct sock *sk, struct sk_buff *skb)
 
        /* Save outer tunnel values */
        iph = ip_hdr(skb);
-       ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY);
+       ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY, NULL, 0);
 
        /* Drop non-IP inner packets */
        inner_iph = (struct iphdr *)(lisph + 1);
index 41c1756..8a08af8 100644 (file)
@@ -68,7 +68,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
        /* Save outer tunnel values */
        iph = ip_hdr(skb);
        key = cpu_to_be64(ntohl(vx_vni) >> 8);
-       ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY);
+       ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY, NULL, 0);
 
        ovs_vport_receive(vport, skb, &tun_info);
 }
index 5fce377..02ccc89 100644 (file)
@@ -43,6 +43,7 @@ static void ovs_vport_record_error(struct vport *,
 static const struct vport_ops *vport_ops_list[] = {
        &ovs_netdev_vport_ops,
        &ovs_internal_vport_ops,
+       &ovs_geneve_vport_ops,
 #if IS_ENABLED(CONFIG_NET_IPGRE_DEMUX)
        &ovs_gre_vport_ops,
        &ovs_gre64_vport_ops,
index c02daf5..bdd9a89 100644 (file)
@@ -217,6 +217,7 @@ void ovs_vport_receive(struct vport *, struct sk_buff *,
  * add yours to the list at the top of vport.c. */
 extern const struct vport_ops ovs_netdev_vport_ops;
 extern const struct vport_ops ovs_internal_vport_ops;
+extern const struct vport_ops ovs_geneve_vport_ops;
 extern const struct vport_ops ovs_gre_vport_ops;
 extern const struct vport_ops ovs_gre64_vport_ops;
 extern const struct vport_ops ovs_vxlan_vport_ops;
index 57d40e3..4f84045 100644 (file)
@@ -215,6 +215,7 @@ enum ovs_vport_type {
        OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */
        OVS_VPORT_TYPE_GRE,      /* GRE tunnel. */
        OVS_VPORT_TYPE_VXLAN,    /* VXLAN tunnel */
+       OVS_VPORT_TYPE_GENEVE = 6,  /* Geneve tunnel */
        OVS_VPORT_TYPE_GRE64 = 104, /* GRE tunnel with 64-bit keys */
        OVS_VPORT_TYPE_LISP = 105,  /* LISP tunnel */
        __OVS_VPORT_TYPE_MAX
@@ -341,9 +342,9 @@ enum ovs_tunnel_key_attr {
        OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT,      /* No argument, set DF. */
        OVS_TUNNEL_KEY_ATTR_CSUM,               /* No argument. CSUM packet. */
        OVS_TUNNEL_KEY_ATTR_OAM,                /* No argument, OAM frame. */
+       OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,        /* Array of Geneve options */
        __OVS_TUNNEL_KEY_ATTR_MAX
 };
-
 #define OVS_TUNNEL_KEY_ATTR_MAX (__OVS_TUNNEL_KEY_ATTR_MAX - 1)
 
 /**
index afe9340..66911c7 100644 (file)
@@ -580,6 +580,9 @@ get_vport_type(const struct dpif_linux_vport *vport)
     case OVS_VPORT_TYPE_INTERNAL:
         return "internal";
 
+    case OVS_VPORT_TYPE_GENEVE:
+        return "geneve";
+
     case OVS_VPORT_TYPE_GRE:
         return "gre";
 
@@ -611,6 +614,8 @@ netdev_to_ovs_vport_type(const struct netdev *netdev)
         return OVS_VPORT_TYPE_NETDEV;
     } else if (!strcmp(type, "internal")) {
         return OVS_VPORT_TYPE_INTERNAL;
+    } else if (!strcmp(type, "geneve")) {
+        return OVS_VPORT_TYPE_GENEVE;
     } else if (strstr(type, "gre64")) {
         return OVS_VPORT_TYPE_GRE64;
     } else if (strstr(type, "gre")) {
index 835a98c..9fa15f5 100644 (file)
@@ -42,6 +42,7 @@
 
 VLOG_DEFINE_THIS_MODULE(netdev_vport);
 
+#define GENEVE_DST_PORT 6081
 #define VXLAN_DST_PORT 4789
 #define LISP_DST_PORT 4341
 
@@ -133,7 +134,8 @@ netdev_vport_needs_dst_port(const struct netdev *dev)
     const char *type = netdev_get_type(dev);
 
     return (class->get_config == get_tunnel_config &&
-            (!strcmp("vxlan", type) || !strcmp("lisp", type)));
+            (!strcmp("geneve", type) || !strcmp("vxlan", type) ||
+             !strcmp("lisp", type)));
 }
 
 const char *
@@ -495,12 +497,15 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args)
         }
     }
 
-    /* Add a default destination port for VXLAN if none specified. */
+    /* Add a default destination port for tunnel ports if none specified. */
+    if (!strcmp(type, "geneve") && !tnl_cfg.dst_port) {
+        tnl_cfg.dst_port = htons(GENEVE_DST_PORT);
+    }
+
     if (!strcmp(type, "vxlan") && !tnl_cfg.dst_port) {
         tnl_cfg.dst_port = htons(VXLAN_DST_PORT);
     }
 
-    /* Add a default destination port for LISP if none specified. */
     if (!strcmp(type, "lisp") && !tnl_cfg.dst_port) {
         tnl_cfg.dst_port = htons(LISP_DST_PORT);
     }
@@ -628,7 +633,8 @@ get_tunnel_config(const struct netdev *dev, struct smap *args)
         uint16_t dst_port = ntohs(tnl_cfg.dst_port);
         const char *type = netdev_get_type(dev);
 
-        if ((!strcmp("vxlan", type) && dst_port != VXLAN_DST_PORT) ||
+        if ((!strcmp("geneve", type) && dst_port != GENEVE_DST_PORT) ||
+            (!strcmp("vxlan", type) && dst_port != VXLAN_DST_PORT) ||
             (!strcmp("lisp", type) && dst_port != LISP_DST_PORT)) {
             smap_add_format(args, "dst_port", "%d", dst_port);
         }
@@ -831,6 +837,7 @@ netdev_vport_tunnel_register(void)
     /* The name of the dpif_port should be short enough to accomodate adding
      * a port number to the end if one is necessary. */
     static const struct vport_class vport_classes[] = {
+        TUNNEL_CLASS("geneve", "genev_sys"),
         TUNNEL_CLASS("gre", "gre_sys"),
         TUNNEL_CLASS("ipsec_gre", "gre_sys"),
         TUNNEL_CLASS("gre64", "gre64_sys"),
index 8f71c7c..162d85a 100644 (file)
@@ -833,12 +833,46 @@ tunnel_key_attr_len(int type)
     case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: return 0;
     case OVS_TUNNEL_KEY_ATTR_CSUM: return 0;
     case OVS_TUNNEL_KEY_ATTR_OAM: return 0;
+    case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: return -2;
     case __OVS_TUNNEL_KEY_ATTR_MAX:
         return -1;
     }
     return -1;
 }
 
+/* Packs a Geneve option class and type into one value, for use as a switch
+ * label. */
+#define GENEVE_OPT(class, type) ((OVS_FORCE uint32_t)(class) << 8 | (type))
+
+/* Walks the Geneve options carried in 'attr' and checks that they are well
+ * formed.
+ *
+ * Returns 0 if every option lies entirely within the attribute and none has
+ * GENEVE_CRIT_OPT_TYPE set in its type, otherwise -EINVAL.  No option types
+ * are recognized yet, so every critical option is rejected. */
+static int
+parse_geneve_opts(const struct nlattr *attr)
+{
+    int remaining = nl_attr_get_size(attr);
+    const struct geneve_opt *cur = nl_attr_get(attr);
+    int opt_bytes;
+
+    for (; remaining > 0; remaining -= opt_bytes) {
+        /* There must be room for at least a bare option header. */
+        if (remaining < sizeof *cur) {
+            return -EINVAL;
+        }
+
+        /* 'length' is scaled by 4 to get the size of the option data. */
+        opt_bytes = sizeof *cur + cur->length * 4;
+        if (opt_bytes > remaining) {
+            return -EINVAL;
+        }
+
+        switch (GENEVE_OPT(cur->opt_class, cur->type)) {
+        default:
+            if (cur->type & GENEVE_CRIT_OPT_TYPE) {
+                return -EINVAL;
+            }
+        }
+
+        cur += opt_bytes / sizeof *cur;
+    }
+
+    return 0;
+}
+
 enum odp_key_fitness
 odp_tun_key_from_attr(const struct nlattr *attr, struct flow_tnl *tun)
 {
@@ -883,6 +917,15 @@ odp_tun_key_from_attr(const struct nlattr *attr, struct flow_tnl *tun)
         case OVS_TUNNEL_KEY_ATTR_OAM:
             tun->flags |= FLOW_TNL_F_OAM;
             break;
+        case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: {
+            if (parse_geneve_opts(a)) {
+                return ODP_FIT_ERROR;
+            }
+            /* It is necessary to reproduce options exactly (including order)
+             * so it's easiest to just echo them back. */
+            unknown = true;
+            break;
+        }
         default:
             /* Allow this to show up as unexpected, if there are unknown
              * tunnel attribute, eventually resulting in ODP_FIT_TOO_MUCH. */
index 0e912a4..ed76c92 100644 (file)
@@ -105,6 +105,7 @@ void odp_portno_names_destroy(struct hmap *portno_names);
  *  - OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT  0    --     4      4
  *  - OVS_TUNNEL_KEY_ATTR_CSUM           0    --     4      4
  *  - OVS_TUNNEL_KEY_ATTR_OAM            0    --     4      4
+ *  - OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS    256  --     4      260
  *  OVS_KEY_ATTR_IN_PORT                 4    --     4      8
  *  OVS_KEY_ATTR_SKB_MARK                4    --     4      8
  *  OVS_KEY_ATTR_DP_HASH                 4    --     4      8
@@ -118,12 +119,12 @@ void odp_portno_names_destroy(struct hmap *portno_names);
  *  OVS_KEY_ATTR_ICMPV6                  2     2     4      8
  *  OVS_KEY_ATTR_ND                     28    --     4     32
  *  ----------------------------------------------------------
- *  total                                                 228
+ *  total                                                 488
  *
  * We include some slack space in case the calculation isn't quite right or we
  * add another field and forget to adjust this value.
  */
-#define ODPUTIL_FLOW_KEY_BYTES 256
+#define ODPUTIL_FLOW_KEY_BYTES 512
 BUILD_ASSERT_DECL(FLOW_WC_SEQ == 26);
 
 /* A buffer with sufficient size and alignment to hold an nlattr-formatted flow
index 4575dd0..c04e3bb 100644 (file)
@@ -674,6 +674,24 @@ static inline bool dl_type_is_ip_any(ovs_be16 dl_type)
         || dl_type == htons(ETH_TYPE_IPV6);
 }
 
+/* Bit in 'type' that marks a Geneve option as critical. */
+#define GENEVE_CRIT_OPT_TYPE (1 << 7)
+
+/* Header of a single Geneve option, followed by 'length' * 4 bytes of data.
+ *
+ * The bit-field order is selected with WORDS_BIGENDIAN rather than
+ * LITTLE_ENDIAN: <endian.h> defines LITTLE_ENDIAN as a constant on every
+ * host, so "#ifdef LITTLE_ENDIAN" does not test the host byte order.
+ * NOTE(review): assumes WORDS_BIGENDIAN comes from the autoconf-generated
+ * config.h and is visible wherever this header is included -- confirm. */
+struct geneve_opt {
+    ovs_be16  opt_class;
+    uint8_t   type;
+#ifdef WORDS_BIGENDIAN
+    uint8_t   r1:1;
+    uint8_t   r2:1;
+    uint8_t   r3:1;
+    uint8_t   length:5;
+#else
+    uint8_t   length:5;
+    uint8_t   r3:1;
+    uint8_t   r2:1;
+    uint8_t   r1:1;
+#endif
+    uint8_t   opt_data[];
+};
+
 void format_ipv6_addr(char *addr_str, const struct in6_addr *addr);
 void print_ipv6_addr(struct ds *string, const struct in6_addr *addr);
 void print_ipv6_masked(struct ds *string, const struct in6_addr *addr,
index 8bc5f4a..1c4ce4f 100644 (file)
@@ -1203,6 +1203,7 @@ m4_foreach(
 [reserved_name],
 [[ovs-netdev],
 [ovs-dummy],
+[genev_sys],
 [gre_sys],
 [gre64_sys],
 [lisp_sys],
@@ -1233,12 +1234,15 @@ OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=gre \
                     -- add-port br0 p3 -- set Interface p3 type=lisp \
                     options:remote_ip=2.2.2.2 ofport_request=3 \
                     -- add-port br0 p4 -- set Interface p4 type=vxlan \
-                    options:remote_ip=2.2.2.2 ofport_request=4])
+                    options:remote_ip=2.2.2.2 ofport_request=4 \
+                    -- add-port br0 p5 -- set Interface p5 type=geneve \
+                    options:remote_ip=2.2.2.2 ofport_request=5])
 
 # Test creating all reserved tunnel port names
 m4_foreach(
 [reserved_name],
-[[gre_sys],
+[[genev_sys],
+[gre_sys],
 [gre64_sys],
 [lisp_sys],
 [vxlan_sys]],
index aa16d58..2ae8179 100644 (file)
@@ -310,6 +310,18 @@ Datapath actions: drop
 OVS_VSWITCHD_STOP
 AT_CLEANUP
 
+AT_SETUP([tunnel - Geneve])
+OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=geneve \
+                    options:remote_ip=1.1.1.1 ofport_request=1 options:dst_port=5000])
+
+AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl
+               br0 65534/100: (dummy)
+               p1 1/5000: (geneve: dst_port=5000, remote_ip=1.1.1.1)
+])
+
+OVS_VSWITCHD_STOP
+AT_CLEANUP
+
 AT_SETUP([tunnel - VXLAN])
 OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=vxlan \
                     options:remote_ip=1.1.1.1 ofport_request=1])
index acefed2..c3e10fb 100644 (file)
           <dt><code>tap</code></dt>
           <dd>A TUN/TAP device managed by Open vSwitch.</dd>
 
+          <dt><code>geneve</code></dt>
+          <dd>
+            An Ethernet over Geneve (<code>http://tools.ietf.org/html/draft-gross-geneve-00</code>)
+            IPv4 tunnel.
+
+            Geneve supports options as a means to transport additional metadata;
+            however, currently only the 24-bit VNI is supported. This is planned
+            to be extended in the future.
+          </dd>
+
           <dt><code>gre</code></dt>
           <dd>
             An Ethernet over RFC 2890 Generic Routing Encapsulation over IPv4
     <group title="Tunnel Options">
       <p>
         These options apply to interfaces with <ref column="type"/> of
-        <code>gre</code>, <code>ipsec_gre</code>, <code>gre64</code>,
-        <code>ipsec_gre64</code>, <code>vxlan</code>, and <code>lisp</code>.
+        <code>geneve</code>, <code>gre</code>, <code>ipsec_gre</code>,
+        <code>gre64</code>, <code>ipsec_gre64</code>, <code>vxlan</code>,
+        and <code>lisp</code>.
       </p>
 
       <p>