datapath: compat: Do not use upstream fill-meta-data function for compat tunnel
[cascardo/ovs.git] / datapath / linux / compat / stt.c
index 5b68577..c899517 100644 (file)
 #define STT_DST_PORT 7471
 
 #ifdef OVS_STT
+#ifdef CONFIG_SLUB
+/*
+ * We saw better performance with skipping zero copy in case of SLUB.
+ * So skip zero copy for SLUB case.
+ */
+#define SKIP_ZERO_COPY
+#endif
+
 #define STT_VER 0
 
 /* @list: Per-net list of STT ports.
@@ -219,73 +227,6 @@ static int clear_gso(struct sk_buff *skb)
        return 0;
 }
 
-static struct sk_buff *normalize_frag_list(struct sk_buff *head,
-                                          struct sk_buff **skbp)
-{
-       struct sk_buff *skb = *skbp;
-       struct sk_buff *last;
-
-       do {
-               struct sk_buff *frags;
-
-               if (skb_shared(skb)) {
-                       struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
-
-                       if (unlikely(!nskb))
-                               return ERR_PTR(-ENOMEM);
-
-                       nskb->next = skb->next;
-                       consume_skb(skb);
-                       skb = nskb;
-                       *skbp = skb;
-               }
-
-               if (head) {
-                       head->len -= skb->len;
-                       head->data_len -= skb->len;
-                       head->truesize -= skb->truesize;
-               }
-
-               frags = skb_shinfo(skb)->frag_list;
-               if (frags) {
-                       int err;
-
-                       err = skb_unclone(skb, GFP_ATOMIC);
-                       if (unlikely(err))
-                               return ERR_PTR(err);
-
-                       last = normalize_frag_list(skb, &frags);
-                       if (IS_ERR(last))
-                               return last;
-
-                       skb_shinfo(skb)->frag_list = NULL;
-                       last->next = skb->next;
-                       skb->next = frags;
-               } else {
-                       last = skb;
-               }
-
-               skbp = &skb->next;
-       } while ((skb = skb->next));
-
-       return last;
-}
-
-/* Takes a linked list of skbs, which potentially contain frag_list
- * (whose members in turn potentially contain frag_lists, etc.) and
- * converts them into a single linear linked list.
- */
-static int straighten_frag_list(struct sk_buff **skbp)
-{
-       struct sk_buff *err_skb;
-
-       err_skb = normalize_frag_list(NULL, skbp);
-       if (IS_ERR(err_skb))
-               return PTR_ERR(err_skb);
-
-       return 0;
-}
-
 static void copy_skb_metadata(struct sk_buff *to, struct sk_buff *from)
 {
        to->protocol = from->protocol;
@@ -465,6 +406,74 @@ static int skb_list_segment(struct sk_buff *head, bool ipv4, int l4_offset)
        return 0;
 }
 
+#ifndef SKIP_ZERO_COPY
+static struct sk_buff *normalize_frag_list(struct sk_buff *head,
+                                          struct sk_buff **skbp)
+{
+       struct sk_buff *skb = *skbp;
+       struct sk_buff *last;
+
+       do {
+               struct sk_buff *frags;
+
+               if (skb_shared(skb)) {
+                       struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+                       if (unlikely(!nskb))
+                               return ERR_PTR(-ENOMEM);
+
+                       nskb->next = skb->next;
+                       consume_skb(skb);
+                       skb = nskb;
+                       *skbp = skb;
+               }
+
+               if (head) {
+                       head->len -= skb->len;
+                       head->data_len -= skb->len;
+                       head->truesize -= skb->truesize;
+               }
+
+               frags = skb_shinfo(skb)->frag_list;
+               if (frags) {
+                       int err;
+
+                       err = skb_unclone(skb, GFP_ATOMIC);
+                       if (unlikely(err))
+                               return ERR_PTR(err);
+
+                       last = normalize_frag_list(skb, &frags);
+                       if (IS_ERR(last))
+                               return last;
+
+                       skb_shinfo(skb)->frag_list = NULL;
+                       last->next = skb->next;
+                       skb->next = frags;
+               } else {
+                       last = skb;
+               }
+
+               skbp = &skb->next;
+       } while ((skb = skb->next));
+
+       return last;
+}
+
+/* Takes a linked list of skbs, which potentially contain frag_list
+ * (whose members in turn potentially contain frag_lists, etc.) and
+ * converts them into a single linear linked list.
+ */
+static int straighten_frag_list(struct sk_buff **skbp)
+{
+       struct sk_buff *err_skb;
+
+       err_skb = normalize_frag_list(NULL, skbp);
+       if (IS_ERR(err_skb))
+               return PTR_ERR(err_skb);
+
+       return 0;
+}
+
 static int coalesce_skb(struct sk_buff **headp)
 {
        struct sk_buff *frag, *head, *prev;
@@ -510,6 +519,34 @@ static int coalesce_skb(struct sk_buff **headp)
        head->next = NULL;
        return 0;
 }
+#else
+static int coalesce_skb(struct sk_buff **headp)
+{
+       struct sk_buff *frag, *head = *headp, *next;
+       int delta = FRAG_CB(head)->first.tot_len - skb_headlen(head);
+       int err;
+
+       if (unlikely(!head->next))
+               return 0;
+
+       err = pskb_expand_head(head, 0, delta, GFP_ATOMIC);
+       if (unlikely(err))
+               return err;
+
+       if (unlikely(!__pskb_pull_tail(head, head->data_len)))
+               BUG();
+
+       for (frag = head->next; frag; frag = next) {
+               skb_copy_bits(frag, 0, skb_put(head, frag->len), frag->len);
+               next = frag->next;
+               kfree_skb(frag);
+       }
+
+       head->next = NULL;
+       head->truesize = SKB_TRUESIZE(head->len);
+       return 0;
+}
+#endif
 
 static int __try_to_segment(struct sk_buff *skb, bool csum_partial,
                            bool ipv4, bool tcp, int l4_offset)
@@ -522,6 +559,12 @@ static int __try_to_segment(struct sk_buff *skb, bool csum_partial,
 
 static int try_to_segment(struct sk_buff *skb)
 {
+#ifdef SKIP_ZERO_COPY
+       /* Since coalesce_skb() does not generate a frag-list, there is
+        * no need to linearize it here.
+        */
+       return 0;
+#else
        struct stthdr *stth = stt_hdr(skb);
        bool csum_partial = !!(stth->flags & STT_CSUM_PARTIAL);
        bool ipv4 = !!(stth->flags & STT_PROTO_IPV4);
@@ -529,16 +572,19 @@ static int try_to_segment(struct sk_buff *skb)
        int l4_offset = stth->l4_offset;
 
        return __try_to_segment(skb, csum_partial, ipv4, tcp, l4_offset);
+#endif
 }
 
 static int segment_skb(struct sk_buff **headp, bool csum_partial,
                       bool ipv4, bool tcp, int l4_offset)
 {
+#ifndef SKIP_ZERO_COPY
        int err;
 
        err = coalesce_skb(headp);
        if (err)
                return err;
+#endif
 
        if (skb_shinfo(*headp)->frag_list)
                return __try_to_segment(*headp, csum_partial,
@@ -805,11 +851,9 @@ error:
        return ERR_PTR(err);
 }
 
-static int skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src,
-                        __be32 dst, __u8 tos, __u8 ttl, __be16 df)
+static void skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src,
+                         __be32 dst, __u8 tos, __u8 ttl, __be16 df)
 {
-       int len = 0;
-
        while (skb) {
                struct sk_buff *next = skb->next;
 
@@ -817,12 +861,11 @@ static int skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src,
                        dst_clone(&rt->dst);
 
                skb->next = NULL;
-               len += iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP,
-                                    tos, ttl, df, false);
+               iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP,
+                             tos, ttl, df, false);
 
                skb = next;
        }
-       return len;
 }
 
 static u8 parse_ipv6_l4_proto(struct sk_buff *skb)
@@ -863,9 +906,9 @@ static u8 skb_get_l4_proto(struct sk_buff *skb, __be16 l3_proto)
 }
 
 static int stt_xmit_skb(struct sk_buff *skb, struct rtable *rt,
-                __be32 src, __be32 dst, __u8 tos,
-                __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
-                __be64 tun_id)
+                       __be32 src, __be32 dst, __u8 tos,
+                       __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
+                       __be64 tun_id)
 {
        struct ethhdr *eh = eth_hdr(skb);
        int ret = 0, min_headroom;
@@ -920,13 +963,13 @@ static int stt_xmit_skb(struct sk_buff *skb, struct rtable *rt,
                }
 
                /* Push IP header. */
-               ret += skb_list_xmit(rt, skb, src, dst, tos, ttl, df);
+               skb_list_xmit(rt, skb, src, dst, tos, ttl, df);
 
 next:
                skb = next_skb;
        }
 
-       return ret;
+       return 0;
 
 err_free_rt:
        ip_rt_put(rt);
@@ -934,6 +977,24 @@ err_free_rt:
        return ret;
 }
 
+static struct rtable *stt_get_rt(struct sk_buff *skb,
+                                struct net_device *dev,
+                                struct flowi4 *fl,
+                                const struct ip_tunnel_key *key)
+{
+       struct net *net = dev_net(dev);
+
+       /* Route lookup */
+       memset(fl, 0, sizeof(*fl));
+       fl->daddr = key->u.ipv4.dst;
+       fl->saddr = key->u.ipv4.src;
+       fl->flowi4_tos = RT_TOS(key->tos);
+       fl->flowi4_mark = skb->mark;
+       fl->flowi4_proto = IPPROTO_TCP;
+
+       return ip_route_output_key(net, fl);
+}
+
 netdev_tx_t ovs_stt_xmit(struct sk_buff *skb)
 {
        struct net_device *dev = skb->dev;
@@ -956,14 +1017,7 @@ netdev_tx_t ovs_stt_xmit(struct sk_buff *skb)
 
        tun_key = &tun_info->key;
 
-       /* Route lookup */
-       memset(&fl, 0, sizeof(fl));
-       fl.daddr = tun_key->u.ipv4.dst;
-       fl.saddr = tun_key->u.ipv4.src;
-       fl.flowi4_tos = RT_TOS(tun_key->tos);
-       fl.flowi4_mark = skb->mark;
-       fl.flowi4_proto = IPPROTO_TCP;
-       rt = ip_route_output_key(net, &fl);
+       rt = stt_get_rt(skb, dev, &fl, tun_key);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                goto error;
@@ -973,10 +1027,9 @@ netdev_tx_t ovs_stt_xmit(struct sk_buff *skb)
        sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
        skb->ignore_df = 1;
 
-       err = stt_xmit_skb(skb, rt, fl.saddr, tun_key->u.ipv4.dst,
-                           tun_key->tos, tun_key->ttl,
-                           df, sport, dport, tun_key->tun_id);
-       iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *)dev->tstats);
+       stt_xmit_skb(skb, rt, fl.saddr, tun_key->u.ipv4.dst,
+                   tun_key->tos, tun_key->ttl,
+                   df, sport, dport, tun_key->tun_id);
        return NETDEV_TX_OK;
 error:
        kfree_skb(skb);
@@ -1054,16 +1107,58 @@ static struct pkt_frag *lookup_frag(struct net *net,
        return victim_frag;
 }
 
+#ifdef SKIP_ZERO_COPY
+static int __copy_skb(struct sk_buff *to, struct sk_buff *from,
+                     int *delta, bool *headstolen)
+{
+       int err;
+
+       if (unlikely(to->next))
+               return -EINVAL;
+
+       if (unlikely(FRAG_CB(to)->offset))
+               return -EINVAL;
+
+       if (unlikely(skb_unclone(to, GFP_ATOMIC)))
+               return -ENOMEM;
+
+       if (skb_try_coalesce(to, from, headstolen, delta))
+               return 0;
+
+       *headstolen = false;
+       err = pskb_expand_head(to, 0, to->data_len + from->len, GFP_ATOMIC);
+       if (unlikely(err))
+               return err;
+
+       if (unlikely(!__pskb_pull_tail(to, to->data_len)))
+               BUG();
+
+       skb_copy_bits(from, 0, skb_put(to, from->len), from->len);
+
+       *delta = from->len;
+       to->truesize += from->len;
+       return 0;
+}
+#else
+static int __copy_skb(struct sk_buff *to, struct sk_buff *from,
+                     int *delta, bool *headstolen)
+{
+       *headstolen = false;
+       return -EINVAL;
+}
+#endif
+
 static struct sk_buff *reassemble(struct sk_buff *skb)
 {
        struct iphdr *iph = ip_hdr(skb);
        struct tcphdr *tcph = tcp_hdr(skb);
        u32 seq = ntohl(tcph->seq);
        struct stt_percpu *stt_percpu;
-       struct sk_buff *last_skb;
+       struct sk_buff *last_skb, *copied_skb = NULL;
        struct pkt_frag *frag;
        struct pkt_key key;
-       int tot_len;
+       int tot_len, delta = skb->truesize;
+       bool headstolen;
        u32 hash;
 
        tot_len = seq >> STT_SEQ_LEN_SHIFT;
@@ -1103,7 +1198,6 @@ static struct sk_buff *reassemble(struct sk_buff *skb)
                FRAG_CB(skb)->first.set_ecn_ce = false;
                list_add_tail(&frag->lru_node, &stt_percpu->frag_lru);
                stt_percpu->frag_mem_used += skb->truesize;
-
                skb = NULL;
                goto unlock;
        }
@@ -1114,8 +1208,13 @@ static struct sk_buff *reassemble(struct sk_buff *skb)
        last_skb = FRAG_CB(frag->skbs)->first.last_skb;
        if (likely(FRAG_CB(last_skb)->offset + last_skb->len ==
                   FRAG_CB(skb)->offset)) {
-               last_skb->next = skb;
-               FRAG_CB(frag->skbs)->first.last_skb = skb;
+
+               if (!__copy_skb(frag->skbs, skb, &delta, &headstolen)) {
+                       copied_skb = skb;
+               } else {
+                       last_skb->next = skb;
+                       FRAG_CB(frag->skbs)->first.last_skb = skb;
+               }
        } else {
                struct sk_buff *prev = NULL, *next;
 
@@ -1154,8 +1253,8 @@ static struct sk_buff *reassemble(struct sk_buff *skb)
 
        FRAG_CB(frag->skbs)->first.set_ecn_ce |= INET_ECN_is_ce(iph->tos);
        FRAG_CB(frag->skbs)->first.rcvd_len += skb->len;
-       FRAG_CB(frag->skbs)->first.mem_used += skb->truesize;
-       stt_percpu->frag_mem_used += skb->truesize;
+       stt_percpu->frag_mem_used += delta;
+       FRAG_CB(frag->skbs)->first.mem_used += delta;
 
        if (FRAG_CB(frag->skbs)->first.tot_len ==
            FRAG_CB(frag->skbs)->first.rcvd_len) {
@@ -1174,6 +1273,8 @@ static struct sk_buff *reassemble(struct sk_buff *skb)
                skb = NULL;
        }
 
+       if (copied_skb)
+               kfree_skb_partial(copied_skb, headstolen);
        goto unlock;
 
 unlock_free:
@@ -1307,12 +1408,12 @@ static void rcv_list(struct net_device *dev, struct sk_buff *skb,
        } while ((skb = next));
 }
 
-#ifndef HAVE_METADATA_DST
+#ifndef USE_UPSTREAM_TUNNEL
 static int __stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
 {
        struct metadata_dst tun_dst;
 
-       ovs_ip_tun_rx_dst(&tun_dst.u.tun_info, skb, TUNNEL_KEY | TUNNEL_CSUM,
+       ovs_ip_tun_rx_dst(&tun_dst, skb, TUNNEL_KEY | TUNNEL_CSUM,
                          get_unaligned(&stt_hdr(skb)->key), 0);
        tun_dst.u.tun_info.key.tp_src = tcp_hdr(skb)->source;
        tun_dst.u.tun_info.key.tp_dst = tcp_hdr(skb)->dest;
@@ -1347,6 +1448,7 @@ static void stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
        if (unlikely(!validate_checksum(skb)))
                goto drop;
 
+       __skb_pull(skb, sizeof(struct tcphdr));
        skb = reassemble(skb);
        if (!skb)
                return;
@@ -1356,7 +1458,8 @@ static void stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
 
        err = iptunnel_pull_header(skb,
                                   sizeof(struct stthdr) + STT_ETH_PAD,
-                                  htons(ETH_P_TEB));
+                                  htons(ETH_P_TEB),
+                                  !net_eq(stt_dev->net, dev_net(stt_dev->dev)));
        if (unlikely(err))
                goto drop;
 
@@ -1490,7 +1593,7 @@ static unsigned int nf_ip_hook(FIRST_PARAM, struct sk_buff *skb, LAST_PARAM)
        if (!stt_dev)
                return NF_ACCEPT;
 
-       __skb_pull(skb, ip_hdr_len + sizeof(struct tcphdr));
+       __skb_pull(skb, ip_hdr_len);
        stt_rcv(stt_dev, skb);
        return NF_STOLEN;
 }
@@ -1625,7 +1728,7 @@ out:
 
 static netdev_tx_t stt_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-#ifdef HAVE_METADATA_DST
+#ifdef USE_UPSTREAM_TUNNEL
        return ovs_stt_xmit(skb);
 #else
        /* Drop All packets coming from networking stack. OVS-CB is
@@ -1707,6 +1810,31 @@ static int stt_change_mtu(struct net_device *dev, int new_mtu)
        return __stt_change_mtu(dev, new_mtu, true);
 }
 
+int ovs_stt_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
+{
+       struct ip_tunnel_info *info = skb_tunnel_info(skb);
+       struct stt_dev *stt_dev = netdev_priv(dev);
+       struct net *net = stt_dev->net;
+       __be16 dport = stt_dev->dst_port;
+       struct flowi4 fl4;
+       struct rtable *rt;
+
+       if (ip_tunnel_info_af(info) != AF_INET)
+               return -EINVAL;
+
+       rt = stt_get_rt(skb, dev, &fl4, &info->key);
+       if (IS_ERR(rt))
+               return PTR_ERR(rt);
+
+       ip_rt_put(rt);
+
+       info->key.u.ipv4.src = fl4.saddr;
+       info->key.tp_src = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
+       info->key.tp_dst = dport;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(ovs_stt_fill_metadata_dst);
+
 static const struct net_device_ops stt_netdev_ops = {
        .ndo_init               = stt_init,
        .ndo_uninit             = stt_uninit,
@@ -1717,6 +1845,11 @@ static const struct net_device_ops stt_netdev_ops = {
        .ndo_change_mtu         = stt_change_mtu,
        .ndo_validate_addr      = eth_validate_addr,
        .ndo_set_mac_address    = eth_mac_addr,
+#ifdef USE_UPSTREAM_TUNNEL
+#ifdef HAVE_NDO_FILL_METADATA_DST
+       .ndo_fill_metadata_dst  = stt_fill_metadata_dst,
+#endif
+#endif
 };
 
 static void stt_get_drvinfo(struct net_device *dev,
@@ -1755,7 +1888,7 @@ static void stt_setup(struct net_device *dev)
        dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
        dev->hw_features |= NETIF_F_GSO_SOFTWARE;
 
-#ifdef HAVE_METADATA_DST
+#ifdef USE_UPSTREAM_TUNNEL
        netif_keep_dst(dev);
 #endif
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;