#define STT_DST_PORT 7471
#ifdef OVS_STT
+#ifdef CONFIG_SLUB
+/*
+ * We saw better performance when zero copy is skipped under the SLUB
+ * allocator, so skip the zero-copy path for the SLUB case.
+ */
+#define SKIP_ZERO_COPY
+#endif
+
#define STT_VER 0
/* @list: Per-net list of STT ports.
return 0;
}
-static struct sk_buff *normalize_frag_list(struct sk_buff *head,
- struct sk_buff **skbp)
-{
- struct sk_buff *skb = *skbp;
- struct sk_buff *last;
-
- do {
- struct sk_buff *frags;
-
- if (skb_shared(skb)) {
- struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
-
- if (unlikely(!nskb))
- return ERR_PTR(-ENOMEM);
-
- nskb->next = skb->next;
- consume_skb(skb);
- skb = nskb;
- *skbp = skb;
- }
-
- if (head) {
- head->len -= skb->len;
- head->data_len -= skb->len;
- head->truesize -= skb->truesize;
- }
-
- frags = skb_shinfo(skb)->frag_list;
- if (frags) {
- int err;
-
- err = skb_unclone(skb, GFP_ATOMIC);
- if (unlikely(err))
- return ERR_PTR(err);
-
- last = normalize_frag_list(skb, &frags);
- if (IS_ERR(last))
- return last;
-
- skb_shinfo(skb)->frag_list = NULL;
- last->next = skb->next;
- skb->next = frags;
- } else {
- last = skb;
- }
-
- skbp = &skb->next;
- } while ((skb = skb->next));
-
- return last;
-}
-
-/* Takes a linked list of skbs, which potentially contain frag_list
- * (whose members in turn potentially contain frag_lists, etc.) and
- * converts them into a single linear linked list.
- */
-static int straighten_frag_list(struct sk_buff **skbp)
-{
- struct sk_buff *err_skb;
-
- err_skb = normalize_frag_list(NULL, skbp);
- if (IS_ERR(err_skb))
- return PTR_ERR(err_skb);
-
- return 0;
-}
-
static void copy_skb_metadata(struct sk_buff *to, struct sk_buff *from)
{
to->protocol = from->protocol;
return 0;
}
+#ifndef SKIP_ZERO_COPY
+static struct sk_buff *normalize_frag_list(struct sk_buff *head,
+ struct sk_buff **skbp)
+{
+ struct sk_buff *skb = *skbp;
+ struct sk_buff *last;
+
+ do {
+ struct sk_buff *frags;
+
+ if (skb_shared(skb)) {
+ struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+ if (unlikely(!nskb))
+ return ERR_PTR(-ENOMEM);
+
+ nskb->next = skb->next;
+ consume_skb(skb);
+ skb = nskb;
+ *skbp = skb;
+ }
+
+ if (head) {
+ head->len -= skb->len;
+ head->data_len -= skb->len;
+ head->truesize -= skb->truesize;
+ }
+
+ frags = skb_shinfo(skb)->frag_list;
+ if (frags) {
+ int err;
+
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(err))
+ return ERR_PTR(err);
+
+ last = normalize_frag_list(skb, &frags);
+ if (IS_ERR(last))
+ return last;
+
+ skb_shinfo(skb)->frag_list = NULL;
+ last->next = skb->next;
+ skb->next = frags;
+ } else {
+ last = skb;
+ }
+
+ skbp = &skb->next;
+ } while ((skb = skb->next));
+
+ return last;
+}
+
+/* Takes a linked list of skbs, which potentially contain frag_list
+ * (whose members in turn potentially contain frag_lists, etc.) and
+ * converts them into a single linear linked list.
+ */
+static int straighten_frag_list(struct sk_buff **skbp)
+{
+ struct sk_buff *err_skb;
+
+ err_skb = normalize_frag_list(NULL, skbp);
+ if (IS_ERR(err_skb))
+ return PTR_ERR(err_skb);
+
+ return 0;
+}
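
normalize_frag_list() splices each nested frag_list in directly after its owning skb, depth first, so the whole tree collapses into one flat ->next chain. Below is a minimal user-space sketch of that same splice; the node/flatten names are illustrative stand-ins for the sk_buff machinery, and it omits the len/data_len/truesize accounting the kernel version must keep:

/* Illustrative user-space analog of straighten_frag_list(); "child"
 * stands in for skb_shinfo(skb)->frag_list.  Not kernel API.
 */
#include <stdio.h>

struct node {
	struct node *next;   /* like skb->next */
	struct node *child;  /* like skb_shinfo(skb)->frag_list */
	int id;
};

/* Splice every child list in after its owner, depth first. */
static void flatten(struct node *n)
{
	for (; n; n = n->next) {
		if (n->child) {
			struct node *last;

			flatten(n->child);      /* flatten grandchildren first */
			last = n->child;
			while (last->next)
				last = last->next;  /* tail of the (now flat) child list */
			last->next = n->next;   /* splice tail onto the old next */
			n->next = n->child;     /* owner now points at child head */
			n->child = NULL;
		}
	}
}

int main(void)
{
	struct node c2 = { .id = 3 };
	struct node c1 = { .next = &c2, .id = 2 };
	struct node b  = { .id = 4 };
	struct node a  = { .next = &b, .child = &c1, .id = 1 };

	flatten(&a);
	for (struct node *n = &a; n; n = n->next)
		printf("%d ", n->id);       /* prints: 1 2 3 4 */
	printf("\n");
	return 0;
}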
+
static int coalesce_skb(struct sk_buff **headp)
{
struct sk_buff *frag, *head, *prev;
head->next = NULL;
return 0;
}
+#else
+static int coalesce_skb(struct sk_buff **headp)
+{
+ struct sk_buff *frag, *head = *headp, *next;
+ int delta = FRAG_CB(head)->first.tot_len - skb_headlen(head);
+ int err;
+
+ if (unlikely(!head->next))
+ return 0;
+
+ err = pskb_expand_head(head, 0, delta, GFP_ATOMIC);
+ if (unlikely(err))
+ return err;
+
+ if (unlikely(!__pskb_pull_tail(head, head->data_len)))
+ BUG();
+
+ for (frag = head->next; frag; frag = next) {
+ skb_copy_bits(frag, 0, skb_put(head, frag->len), frag->len);
+ next = frag->next;
+ kfree_skb(frag);
+ }
+
+ head->next = NULL;
+ head->truesize = SKB_TRUESIZE(head->len);
+ return 0;
+}
+#endif
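
Under SKIP_ZERO_COPY the replacement coalesce_skb() above trades the frag_list chain for one linear buffer: expand the head once by the outstanding length, then copy each trailing fragment in and free it. A user-space sketch of that pattern, with illustrative names only:

/* Sketch of the SKIP_ZERO_COPY coalescing idea: reserve the full
 * reassembled length up front, then memcpy fragments in arrival order.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct frag {
	struct frag *next;
	size_t len;
	const unsigned char *data;
};

/* Returns a malloc'd linear buffer of tot_len bytes, or NULL. */
static unsigned char *coalesce(const struct frag *head, size_t tot_len)
{
	unsigned char *buf = malloc(tot_len);       /* like pskb_expand_head() */
	size_t off = 0;

	if (!buf)
		return NULL;
	for (const struct frag *f = head; f && off + f->len <= tot_len;
	     f = f->next) {
		memcpy(buf + off, f->data, f->len); /* like skb_copy_bits() */
		off += f->len;
	}
	return buf;
}

int main(void)
{
	struct frag f2 = { NULL, 5, (const unsigned char *)"world" };
	struct frag f1 = { &f2, 6, (const unsigned char *)"hello " };
	unsigned char *buf = coalesce(&f1, 11);

	if (buf)
		printf("%.11s\n", (char *)buf);     /* hello world */
	free(buf);
	return 0;
}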
static int __try_to_segment(struct sk_buff *skb, bool csum_partial,
bool ipv4, bool tcp, int l4_offset)
static int try_to_segment(struct sk_buff *skb)
{
+#ifdef SKIP_ZERO_COPY
+	/* The SKIP_ZERO_COPY coalesce_skb() does not generate a frag_list,
+	 * so there is nothing to linearize here.
+	 */
+ return 0;
+#else
struct stthdr *stth = stt_hdr(skb);
bool csum_partial = !!(stth->flags & STT_CSUM_PARTIAL);
bool ipv4 = !!(stth->flags & STT_PROTO_IPV4);
int l4_offset = stth->l4_offset;
return __try_to_segment(skb, csum_partial, ipv4, tcp, l4_offset);
+#endif
}
static int segment_skb(struct sk_buff **headp, bool csum_partial,
bool ipv4, bool tcp, int l4_offset)
{
+#ifndef SKIP_ZERO_COPY
int err;
err = coalesce_skb(headp);
if (err)
return err;
+#endif
if (skb_shinfo(*headp)->frag_list)
return __try_to_segment(*headp, csum_partial,
return ERR_PTR(err);
}
-static int skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src,
- __be32 dst, __u8 tos, __u8 ttl, __be16 df)
+static void skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src,
+ __be32 dst, __u8 tos, __u8 ttl, __be16 df)
{
- int len = 0;
-
while (skb) {
struct sk_buff *next = skb->next;
dst_clone(&rt->dst);
skb->next = NULL;
- len += iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP,
- tos, ttl, df, false);
+ iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP,
+ tos, ttl, df, false);
skb = next;
}
- return len;
}
static u8 parse_ipv6_l4_proto(struct sk_buff *skb)
}
static int stt_xmit_skb(struct sk_buff *skb, struct rtable *rt,
- __be32 src, __be32 dst, __u8 tos,
- __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
- __be64 tun_id)
+ __be32 src, __be32 dst, __u8 tos,
+ __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
+ __be64 tun_id)
{
struct ethhdr *eh = eth_hdr(skb);
int ret = 0, min_headroom;
}
/* Push IP header. */
- ret += skb_list_xmit(rt, skb, src, dst, tos, ttl, df);
+ skb_list_xmit(rt, skb, src, dst, tos, ttl, df);
next:
skb = next_skb;
}
- return ret;
+ return 0;
err_free_rt:
ip_rt_put(rt);
return ret;
}
+static struct rtable *stt_get_rt(struct sk_buff *skb,
+ struct net_device *dev,
+ struct flowi4 *fl,
+ const struct ip_tunnel_key *key)
+{
+ struct net *net = dev_net(dev);
+
+ /* Route lookup */
+ memset(fl, 0, sizeof(*fl));
+ fl->daddr = key->u.ipv4.dst;
+ fl->saddr = key->u.ipv4.src;
+ fl->flowi4_tos = RT_TOS(key->tos);
+ fl->flowi4_mark = skb->mark;
+ fl->flowi4_proto = IPPROTO_TCP;
+
+ return ip_route_output_key(net, fl);
+}
+
netdev_tx_t ovs_stt_xmit(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
tun_key = &tun_info->key;
- /* Route lookup */
- memset(&fl, 0, sizeof(fl));
- fl.daddr = tun_key->u.ipv4.dst;
- fl.saddr = tun_key->u.ipv4.src;
- fl.flowi4_tos = RT_TOS(tun_key->tos);
- fl.flowi4_mark = skb->mark;
- fl.flowi4_proto = IPPROTO_TCP;
- rt = ip_route_output_key(net, &fl);
+ rt = stt_get_rt(skb, dev, &fl, tun_key);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
goto error;
sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
skb->ignore_df = 1;
- err = stt_xmit_skb(skb, rt, fl.saddr, tun_key->u.ipv4.dst,
- tun_key->tos, tun_key->ttl,
- df, sport, dport, tun_key->tun_id);
- iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *)dev->tstats);
+ stt_xmit_skb(skb, rt, fl.saddr, tun_key->u.ipv4.dst,
+ tun_key->tos, tun_key->ttl,
+ df, sport, dport, tun_key->tun_id);
return NETDEV_TX_OK;
error:
kfree_skb(skb);
return victim_frag;
}
+#ifdef SKIP_ZERO_COPY
+static int __copy_skb(struct sk_buff *to, struct sk_buff *from,
+ int *delta, bool *headstolen)
+{
+ int err;
+
+ if (unlikely(to->next))
+ return -EINVAL;
+
+ if (unlikely(FRAG_CB(to)->offset))
+ return -EINVAL;
+
+ if (unlikely(skb_unclone(to, GFP_ATOMIC)))
+ return -ENOMEM;
+
+ if (skb_try_coalesce(to, from, headstolen, delta))
+ return 0;
+
+ *headstolen = false;
+ err = pskb_expand_head(to, 0, to->data_len + from->len, GFP_ATOMIC);
+ if (unlikely(err))
+ return err;
+
+ if (unlikely(!__pskb_pull_tail(to, to->data_len)))
+ BUG();
+
+ skb_copy_bits(from, 0, skb_put(to, from->len), from->len);
+
+ *delta = from->len;
+ to->truesize += from->len;
+ return 0;
+}
+#else
+static int __copy_skb(struct sk_buff *to, struct sk_buff *from,
+ int *delta, bool *headstolen)
+{
+ *headstolen = false;
+ return -EINVAL;
+}
+#endif
+
static struct sk_buff *reassemble(struct sk_buff *skb)
{
struct iphdr *iph = ip_hdr(skb);
struct tcphdr *tcph = tcp_hdr(skb);
u32 seq = ntohl(tcph->seq);
struct stt_percpu *stt_percpu;
- struct sk_buff *last_skb;
+ struct sk_buff *last_skb, *copied_skb = NULL;
struct pkt_frag *frag;
struct pkt_key key;
- int tot_len;
+ int tot_len, delta = skb->truesize;
+ bool headstolen;
u32 hash;
tot_len = seq >> STT_SEQ_LEN_SHIFT;
FRAG_CB(skb)->first.set_ecn_ce = false;
list_add_tail(&frag->lru_node, &stt_percpu->frag_lru);
stt_percpu->frag_mem_used += skb->truesize;
-
skb = NULL;
goto unlock;
}
last_skb = FRAG_CB(frag->skbs)->first.last_skb;
if (likely(FRAG_CB(last_skb)->offset + last_skb->len ==
FRAG_CB(skb)->offset)) {
- last_skb->next = skb;
- FRAG_CB(frag->skbs)->first.last_skb = skb;
+
+ if (!__copy_skb(frag->skbs, skb, &delta, &headstolen)) {
+ copied_skb = skb;
+ } else {
+ last_skb->next = skb;
+ FRAG_CB(frag->skbs)->first.last_skb = skb;
+ }
} else {
struct sk_buff *prev = NULL, *next;
FRAG_CB(frag->skbs)->first.set_ecn_ce |= INET_ECN_is_ce(iph->tos);
FRAG_CB(frag->skbs)->first.rcvd_len += skb->len;
- FRAG_CB(frag->skbs)->first.mem_used += skb->truesize;
- stt_percpu->frag_mem_used += skb->truesize;
+ stt_percpu->frag_mem_used += delta;
+ FRAG_CB(frag->skbs)->first.mem_used += delta;
if (FRAG_CB(frag->skbs)->first.tot_len ==
FRAG_CB(frag->skbs)->first.rcvd_len) {
skb = NULL;
}
+ if (copied_skb)
+ kfree_skb_partial(copied_skb, headstolen);
goto unlock;
unlock_free:
} while ((skb = next));
}
-#ifndef HAVE_METADATA_DST
+#ifndef USE_UPSTREAM_TUNNEL
static int __stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
{
struct metadata_dst tun_dst;
- ovs_ip_tun_rx_dst(&tun_dst.u.tun_info, skb, TUNNEL_KEY | TUNNEL_CSUM,
+ ovs_ip_tun_rx_dst(&tun_dst, skb, TUNNEL_KEY | TUNNEL_CSUM,
get_unaligned(&stt_hdr(skb)->key), 0);
tun_dst.u.tun_info.key.tp_src = tcp_hdr(skb)->source;
tun_dst.u.tun_info.key.tp_dst = tcp_hdr(skb)->dest;
if (unlikely(!validate_checksum(skb)))
goto drop;
+ __skb_pull(skb, sizeof(struct tcphdr));
skb = reassemble(skb);
if (!skb)
return;
err = iptunnel_pull_header(skb,
sizeof(struct stthdr) + STT_ETH_PAD,
- htons(ETH_P_TEB));
+ htons(ETH_P_TEB),
+ !net_eq(stt_dev->net, dev_net(stt_dev->dev)));
if (unlikely(err))
goto drop;
if (!stt_dev)
return NF_ACCEPT;
- __skb_pull(skb, ip_hdr_len + sizeof(struct tcphdr));
+ __skb_pull(skb, ip_hdr_len);
stt_rcv(stt_dev, skb);
return NF_STOLEN;
}
static netdev_tx_t stt_dev_xmit(struct sk_buff *skb, struct net_device *dev)
{
-#ifdef HAVE_METADATA_DST
+#ifdef USE_UPSTREAM_TUNNEL
return ovs_stt_xmit(skb);
#else
/* Drop All packets coming from networking stack. OVS-CB is
return __stt_change_mtu(dev, new_mtu, true);
}
+int ovs_stt_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
+{
+ struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ struct stt_dev *stt_dev = netdev_priv(dev);
+ struct net *net = stt_dev->net;
+ __be16 dport = stt_dev->dst_port;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ if (ip_tunnel_info_af(info) != AF_INET)
+ return -EINVAL;
+
+ rt = stt_get_rt(skb, dev, &fl4, &info->key);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ ip_rt_put(rt);
+
+ info->key.u.ipv4.src = fl4.saddr;
+ info->key.tp_src = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
+ info->key.tp_dst = dport;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ovs_stt_fill_metadata_dst);
+
static const struct net_device_ops stt_netdev_ops = {
.ndo_init = stt_init,
.ndo_uninit = stt_uninit,
.ndo_change_mtu = stt_change_mtu,
.ndo_validate_addr = eth_validate_addr,
.ndo_set_mac_address = eth_mac_addr,
+#ifdef USE_UPSTREAM_TUNNEL
+#ifdef HAVE_NDO_FILL_METADATA_DST
+ .ndo_fill_metadata_dst = stt_fill_metadata_dst,
+#endif
+#endif
};
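
The ops table above references stt_fill_metadata_dst, while this excerpt only defines and exports ovs_stt_fill_metadata_dst; presumably the full patch carries a thin static wrapper bridging the two, roughly like this sketch (assumed, not shown in the hunks here):

/* Hypothetical shim, assumed to exist in the elided portion of the
 * patch: adapts the exported helper to the ndo_fill_metadata_dst slot.
 */
static int stt_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	return ovs_stt_fill_metadata_dst(dev, skb);
}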
static void stt_get_drvinfo(struct net_device *dev,
dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
dev->hw_features |= NETIF_F_GSO_SOFTWARE;
-#ifdef HAVE_METADATA_DST
+#ifdef USE_UPSTREAM_TUNNEL
netif_keep_dst(dev);
#endif
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;