datapath: add skb mark matching and set action
[cascardo/ovs.git] / datapath / flow.c
index 9f93550..65d6cce 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2011 Nicira Networks.
+ * Copyright (c) 2007-2011 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -226,9 +226,7 @@ struct sw_flow *ovs_flow_alloc(void)
                return ERR_PTR(-ENOMEM);
 
        spin_lock_init(&flow->lock);
-       atomic_set(&flow->refcnt, 1);
        flow->sf_acts = NULL;
-       flow->dead = false;
 
        return flow;
 }
@@ -290,12 +288,6 @@ struct flow_table *ovs_flow_tbl_alloc(int new_size)
        return table;
 }
 
-static void flow_free(struct sw_flow *flow)
-{
-       flow->dead = true;
-       ovs_flow_put(flow);
-}
-
 void ovs_flow_tbl_destroy(struct flow_table *table)
 {
        int i;
@@ -314,7 +306,7 @@ void ovs_flow_tbl_destroy(struct flow_table *table)
 
                hlist_for_each_entry_safe(flow, node, n, head, hash_node[ver]) {
                        hlist_del_rcu(&flow->hash_node[ver]);
-                       flow_free(flow);
+                       ovs_flow_free(flow);
                }
        }
 
@@ -365,6 +357,14 @@ struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *la
        return NULL;
 }
 
+/* Links 'flow' into the bucket for flow->hash and bumps the table's count. */
+static void __flow_tbl_insert(struct flow_table *table, struct sw_flow *flow)
+{
+       hlist_add_head_rcu(&flow->hash_node[table->node_ver],
+                          find_bucket(table, flow->hash));
+       table->count++;
+}
+
 static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new)
 {
        int old_ver;
@@ -382,7 +382,7 @@ static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new
                head = flex_array_get(old->buckets, i);
 
                hlist_for_each_entry(flow, n, head, hash_node[old_ver])
-                       ovs_flow_tbl_insert(new, flow);
+                       __flow_tbl_insert(new, flow);
        }
        old->keep_flows = true;
 }
@@ -410,13 +410,21 @@ struct flow_table *ovs_flow_tbl_expand(struct flow_table *table)
        return __flow_tbl_rehash(table, table->n_buckets * 2);
 }
 
+/* Frees flow->sf_acts and then 'flow' itself; a NULL 'flow' is a no-op. */
+void ovs_flow_free(struct sw_flow *flow)
+{
+       if (likely(flow)) {
+               kfree((struct sf_flow_acts __force *)flow->sf_acts);
+               kmem_cache_free(flow_cache, flow);
+       }
+}
+
 /* RCU callback used by ovs_flow_deferred_free. */
 static void rcu_free_flow_callback(struct rcu_head *rcu)
 {
        struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
 
-       flow->dead = true;
-       ovs_flow_put(flow);
+       ovs_flow_free(flow);
 }
 
 /* Schedules 'flow' to be freed after the next RCU grace period.
@@ -426,22 +434,6 @@ void ovs_flow_deferred_free(struct sw_flow *flow)
        call_rcu(&flow->rcu, rcu_free_flow_callback);
 }
 
-void ovs_flow_hold(struct sw_flow *flow)
-{
-       atomic_inc(&flow->refcnt);
-}
-
-void ovs_flow_put(struct sw_flow *flow)
-{
-       if (unlikely(!flow))
-               return;
-
-       if (atomic_dec_and_test(&flow->refcnt)) {
-               kfree((struct sf_flow_acts __force *)flow->sf_acts);
-               kmem_cache_free(flow_cache, flow);
-       }
-}
-
 /* RCU callback used by ovs_flow_deferred_free_acts. */
 static void rcu_free_acts_callback(struct rcu_head *rcu)
 {
@@ -629,8 +621,10 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
        memset(key, 0, sizeof(*key));
 
        key->phy.priority = skb->priority;
-       key->phy.tun_id = OVS_CB(skb)->tun_id;
+       if (OVS_CB(skb)->tun_key)
+               memcpy(&key->phy.tun.tun_key, OVS_CB(skb)->tun_key, sizeof(key->phy.tun.tun_key));
        key->phy.in_port = in_port;
+       key->phy.skb_mark = skb_get_mark(skb);
 
        skb_reset_mac_header(skb);
 
@@ -716,7 +710,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
                        }
                }
 
-       } else if (key->eth.type == htons(ETH_P_ARP) && arphdr_ok(skb)) {
+       } else if ((key->eth.type == htons(ETH_P_ARP) ||
+                  key->eth.type == htons(ETH_P_RARP)) && arphdr_ok(skb)) {
                struct arp_eth_header *arp;
 
                arp = (struct arp_eth_header *)skb_network_header(skb);
@@ -729,15 +724,11 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
                        /* We only match on the lower 8 bits of the opcode. */
                        if (ntohs(arp->ar_op) <= 0xff)
                                key->ip.proto = ntohs(arp->ar_op);
-
-                       if (key->ip.proto == ARPOP_REQUEST
-                                       || key->ip.proto == ARPOP_REPLY) {
-                               memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
-                               memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
-                               memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN);
-                               memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN);
-                               key_len = SW_FLOW_KEY_OFFSET(ipv4.arp);
-                       }
+                       memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
+                       memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
+                       memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN);
+                       memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN);
+                       key_len = SW_FLOW_KEY_OFFSET(ipv4.arp);
                }
        } else if (key->eth.type == htons(ETH_P_IPV6)) {
                int nh_len;             /* IPv6 Header + Extensions */
@@ -786,9 +777,18 @@ out:
        return error;
 }
 
-u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len)
+/* Hashes 'key' bytes [key_start, key_len), rounded up to whole u32 words. */
+static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, int key_len)
+{
+       return jhash2((u32 *)((u8 *)key + key_start),
+                     DIV_ROUND_UP(key_len - key_start, sizeof(u32)), 0);
+}
+
+/* Key start offset: 0 if tun_key.ipv4_dst is set, else that of phy.priority. */
+static int flow_key_start(struct sw_flow_key *key)
 {
-       return jhash2((u32 *)key, DIV_ROUND_UP(key_len, sizeof(u32)), 0);
+       return key->phy.tun.tun_key.ipv4_dst ?
+               0 : offsetof(struct sw_flow_key, phy.priority);
 }
 
 struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
@@ -797,28 +797,31 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
        struct sw_flow *flow;
        struct hlist_node *n;
        struct hlist_head *head;
+       u8 *_key;
+       int key_start;
        u32 hash;
 
-       hash = ovs_flow_hash(key, key_len);
+       key_start = flow_key_start(key);
+       hash = ovs_flow_hash(key, key_start, key_len);
 
+       _key = (u8 *) key + key_start;
        head = find_bucket(table, hash);
        hlist_for_each_entry_rcu(flow, n, head, hash_node[table->node_ver]) {
 
                if (flow->hash == hash &&
-                   !memcmp(&flow->key, key, key_len)) {
+                   !memcmp((u8 *)&flow->key + key_start, _key, key_len - key_start)) {
                        return flow;
                }
        }
        return NULL;
 }
 
-void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow)
+void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
+                        struct sw_flow_key *key, int key_len)
 {
-       struct hlist_head *head;
-
-       head = find_bucket(table, flow->hash);
-       hlist_add_head_rcu(&flow->hash_node[table->node_ver], head);
-       table->count++;
+       flow->hash = ovs_flow_hash(key, flow_key_start(key), key_len);
+       memcpy(&flow->key, key, sizeof(flow->key));
+       __flow_tbl_insert(table, flow);
 }
 
 void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
@@ -833,6 +836,7 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
        [OVS_KEY_ATTR_ENCAP] = -1,
        [OVS_KEY_ATTR_PRIORITY] = sizeof(u32),
        [OVS_KEY_ATTR_IN_PORT] = sizeof(u32),
+       [OVS_KEY_ATTR_SKB_MARK] = sizeof(u32),
        [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet),
        [OVS_KEY_ATTR_VLAN] = sizeof(__be16),
        [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16),
@@ -844,6 +848,7 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
        [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6),
        [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
        [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd),
+       [OVS_KEY_ATTR_IPV4_TUNNEL] = sizeof(struct ovs_key_ipv4_tunnel),
 
        /* Not upstream. */
        [OVS_KEY_ATTR_TUN_ID] = sizeof(__be64),
@@ -1021,10 +1026,48 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
        } else {
                swkey->phy.in_port = DP_MAX_PORTS;
        }
+       if (attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) {
+               uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) && !defined(CONFIG_NETFILTER)
+               if (mark != 0)
+                       return -EINVAL;
+#endif
+               swkey->phy.skb_mark = mark;
+               attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK);
+       }
+
+       if (attrs & (1ULL << OVS_KEY_ATTR_TUN_ID) &&
+           attrs & (1ULL << OVS_KEY_ATTR_IPV4_TUNNEL)) {
+               struct ovs_key_ipv4_tunnel *tun_key;
+               __be64 tun_id;
+
+               tun_key = nla_data(a[OVS_KEY_ATTR_IPV4_TUNNEL]);
+
+               if (!tun_key->ipv4_dst)
+                       return -EINVAL;
+               if (!(tun_key->tun_flags & OVS_TNL_F_KEY))
+                       return -EINVAL;
+
+               tun_id = nla_get_be64(a[OVS_KEY_ATTR_TUN_ID]);
+               if (tun_id != tun_key->tun_id)
+                       return -EINVAL;
+
+               memcpy(&swkey->phy.tun.tun_key, tun_key,
+                       sizeof(swkey->phy.tun.tun_key));
 
-       if (attrs & (1ULL << OVS_KEY_ATTR_TUN_ID)) {
-               swkey->phy.tun_id = nla_get_be64(a[OVS_KEY_ATTR_TUN_ID]);
                attrs &= ~(1ULL << OVS_KEY_ATTR_TUN_ID);
+               attrs &= ~(1ULL << OVS_KEY_ATTR_IPV4_TUNNEL);
+       } else if (attrs & (1ULL << OVS_KEY_ATTR_IPV4_TUNNEL)) {
+               struct ovs_key_ipv4_tunnel *tun_key;
+               tun_key = nla_data(a[OVS_KEY_ATTR_IPV4_TUNNEL]);
+
+               if (!tun_key->ipv4_dst)
+                       return -EINVAL;
+
+               memcpy(&swkey->phy.tun.tun_key, tun_key,
+                       sizeof(swkey->phy.tun.tun_key));
+
+               attrs &= ~(1ULL << OVS_KEY_ATTR_IPV4_TUNNEL);
        }
 
        /* Data attributes. */
@@ -1125,7 +1168,8 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
                        if (err)
                                return err;
                }
-       } else if (swkey->eth.type == htons(ETH_P_ARP)) {
+       } else if (swkey->eth.type == htons(ETH_P_ARP) ||
+                  swkey->eth.type == htons(ETH_P_RARP)) {
                const struct ovs_key_arp *arp_key;
 
                if (!(attrs & (1 << OVS_KEY_ATTR_ARP)))
@@ -1162,15 +1206,18 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
  * get the metadata, that is, the parts of the flow key that cannot be
  * extracted from the packet itself.
  */
-int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port, __be64 *tun_id,
-                                  const struct nlattr *attr)
+
+int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len, const struct nlattr *attr)
 {
+       struct ovs_key_ipv4_tunnel *tun_key = &flow->key.phy.tun.tun_key;
        const struct nlattr *nla;
        int rem;
+       __be64 tun_id = 0;
 
-       *in_port = DP_MAX_PORTS;
-       *tun_id = 0;
-       *priority = 0;
+       flow->key.phy.in_port = DP_MAX_PORTS;
+       flow->key.phy.priority = 0;
+       flow->key.phy.skb_mark = 0;
+       memset(tun_key, 0, sizeof(flow->key.phy.tun.tun_key));
 
        nla_for_each_nested(nla, attr, rem) {
                int type = nla_type(nla);
@@ -1181,23 +1228,63 @@ int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port, __be64 *tun_id,
 
                        switch (type) {
                        case OVS_KEY_ATTR_PRIORITY:
-                               *priority = nla_get_u32(nla);
+                               flow->key.phy.priority = nla_get_u32(nla);
                                break;
 
                        case OVS_KEY_ATTR_TUN_ID:
-                               *tun_id = nla_get_be64(nla);
+                               tun_id = nla_get_be64(nla);
+
+                               if (tun_key->ipv4_dst) {
+                                       if (!(tun_key->tun_flags & OVS_TNL_F_KEY))
+                                               return -EINVAL;
+                                       if (tun_key->tun_id != tun_id)
+                                               return -EINVAL;
+                                       break;
+                               }
+                               tun_key->tun_id = tun_id;
+                               tun_key->tun_flags |= OVS_TNL_F_KEY;
+
+                               break;
+
+                       case OVS_KEY_ATTR_IPV4_TUNNEL:
+                               if (tun_key->tun_flags & OVS_TNL_F_KEY) {
+                                       tun_id = tun_key->tun_id;
+
+                                       memcpy(tun_key, nla_data(nla), sizeof(*tun_key));
+                                       if (!(tun_key->tun_flags & OVS_TNL_F_KEY))
+                                               return -EINVAL;
+
+                                       if (tun_key->tun_id != tun_id)
+                                               return -EINVAL;
+                               } else
+                                       memcpy(tun_key, nla_data(nla), sizeof(*tun_key));
+
+                               if (!tun_key->ipv4_dst)
+                                       return -EINVAL;
                                break;
 
                        case OVS_KEY_ATTR_IN_PORT:
                                if (nla_get_u32(nla) >= DP_MAX_PORTS)
                                        return -EINVAL;
-                               *in_port = nla_get_u32(nla);
+                               flow->key.phy.in_port = nla_get_u32(nla);
+                               break;
+
+                       case OVS_KEY_ATTR_SKB_MARK:
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) && !defined(CONFIG_NETFILTER)
+                               if (nla_get_u32(nla) != 0)
+                                       return -EINVAL;
+#endif
+                               flow->key.phy.skb_mark = nla_get_u32(nla);
                                break;
                        }
                }
        }
        if (rem)
                return -EINVAL;
+
+       flow->hash = ovs_flow_hash(&flow->key,
+                                  flow_key_start(&flow->key), key_len);
+
        return 0;
 }
 
@@ -1210,14 +1297,26 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
            nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority))
                goto nla_put_failure;
 
-       if (swkey->phy.tun_id != cpu_to_be64(0) &&
-           nla_put_be64(skb, OVS_KEY_ATTR_TUN_ID, swkey->phy.tun_id))
+       if (swkey->phy.tun.tun_key.ipv4_dst) {
+               struct ovs_key_ipv4_tunnel *tun_key;
+               nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4_TUNNEL, sizeof(*tun_key));
+               if (!nla)
+                       goto nla_put_failure;
+               tun_key = nla_data(nla);
+               memcpy(tun_key, &swkey->phy.tun.tun_key, sizeof(*tun_key));
+       }
+       if ((swkey->phy.tun.tun_key.tun_flags & OVS_TNL_F_KEY) &&
+           nla_put_be64(skb, OVS_KEY_ATTR_TUN_ID, swkey->phy.tun.tun_key.tun_id))
                goto nla_put_failure;
 
        if (swkey->phy.in_port != DP_MAX_PORTS &&
            nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port))
                goto nla_put_failure;
 
+       if (swkey->phy.skb_mark &&
+           nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, swkey->phy.skb_mark))
+               goto nla_put_failure;
+
        nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
        if (!nla)
                goto nla_put_failure;
@@ -1271,7 +1370,8 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
                ipv6_key->ipv6_tclass = swkey->ip.tos;
                ipv6_key->ipv6_hlimit = swkey->ip.ttl;
                ipv6_key->ipv6_frag = swkey->ip.frag;
-       } else if (swkey->eth.type == htons(ETH_P_ARP)) {
+       } else if (swkey->eth.type == htons(ETH_P_ARP) ||
+                  swkey->eth.type == htons(ETH_P_RARP)) {
                struct ovs_key_arp *arp_key;
 
                nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key));