From 2d3ef70b74bb69980f8f2c3fa21e812d8ca3aeda Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 2 Dec 2015 23:53:46 -0800 Subject: [PATCH] compat: Backport IPv6 fragmentation. IPv6 fragmentation functionality is not exported by most kernels, so backport this code from the upstream 4.3 development tree. Signed-off-by: Joe Stringer Acked-by: Pravin B Shelar --- acinclude.m4 | 1 + datapath/linux/Modules.mk | 2 + .../compat/include/linux/netfilter_ipv6.h | 39 ++ datapath/linux/compat/include/net/ip6_route.h | 17 +- datapath/linux/compat/include/net/ipv6.h | 7 + datapath/linux/compat/ip6_output.c | 471 ++++++++++++++++++ 6 files changed, 536 insertions(+), 1 deletion(-) create mode 100644 datapath/linux/compat/include/linux/netfilter_ipv6.h create mode 100644 datapath/linux/compat/ip6_output.c diff --git a/acinclude.m4 b/acinclude.m4 index 7793f38f0..874bf57b2 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -476,6 +476,7 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ OVS_GREP_IFELSE([$KSRC/include/net/gre.h], [gre_handle_offloads]) OVS_GREP_IFELSE([$KSRC/include/net/ipv6.h], [IP6_FH_F_SKIP_RH]) OVS_GREP_IFELSE([$KSRC/include/net/ipv6.h], [ip6_local_out_sk]) + OVS_GREP_IFELSE([$KSRC/include/net/ipv6.h], [__ipv6_addr_jhash]) OVS_GREP_IFELSE([$KSRC/include/net/ip6_route.h], [ip6_frag.*sock], [OVS_DEFINE([HAVE_IP_FRAGMENT_TAKES_SOCK])]) diff --git a/datapath/linux/Modules.mk b/datapath/linux/Modules.mk index b1a60b080..0117ff875 100644 --- a/datapath/linux/Modules.mk +++ b/datapath/linux/Modules.mk @@ -12,6 +12,7 @@ openvswitch_sources += \ linux/compat/ip_fragment.c \ linux/compat/ip_tunnel.c \ linux/compat/ip_tunnels_core.c \ + linux/compat/ip6_output.c \ linux/compat/lisp.c \ linux/compat/netdevice.c \ linux/compat/net_namespace.c \ @@ -54,6 +55,7 @@ openvswitch_headers += \ linux/compat/include/linux/random.h \ linux/compat/include/linux/netdevice.h \ linux/compat/include/linux/netdev_features.h \ + linux/compat/include/linux/netfilter_ipv6.h \ 
linux/compat/include/linux/netlink.h \ linux/compat/include/linux/openvswitch.h \ linux/compat/include/linux/poison.h \ diff --git a/datapath/linux/compat/include/linux/netfilter_ipv6.h b/datapath/linux/compat/include/linux/netfilter_ipv6.h new file mode 100644 index 000000000..3026e1fe6 --- /dev/null +++ b/datapath/linux/compat/include/linux/netfilter_ipv6.h @@ -0,0 +1,39 @@ +#ifndef __NETFILTER_IPV6_WRAPPER_H +#define __NETFILTER_IPV6_WRAPPER_H 1 + +#include_next + +#include +#include /* For OVS_VPORT_OUTPUT_PARAMS */ +#include + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,3,0) +/* Try to minimise changes required to the actions.c code for calling IPv6 + * fragmentation. We can keep the fragment() API mostly the same, except that + * the callback parameter needs to be in the form that older kernels accept. + * We don't backport the other ipv6_ops as they're currently unused by OVS. */ +struct ovs_nf_ipv6_ops { + int (*fragment)(struct sock *sk, struct sk_buff *skb, + int (*output)(OVS_VPORT_OUTPUT_PARAMS)); +}; +#define nf_ipv6_ops ovs_nf_ipv6_ops + +#if defined(OVS_FRAGMENT_BACKPORT) +static struct ovs_nf_ipv6_ops ovs_ipv6_ops = { + .fragment = ip6_fragment, +}; + +static inline struct ovs_nf_ipv6_ops *ovs_nf_get_ipv6_ops(void) +{ + return &ovs_ipv6_ops; +} +#else /* !OVS_FRAGMENT_BACKPORT || !CONFIG_NETFILTER || !CONFIG_IPV6 */ +static inline const struct ovs_nf_ipv6_ops *ovs_nf_get_ipv6_ops(void) +{ + return NULL; +} +#endif +#define nf_get_ipv6_ops ovs_nf_get_ipv6_ops + +#endif /* < 4.3 */ +#endif /* __NETFILTER_IPV6_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/ip6_route.h b/datapath/linux/compat/include/net/ip6_route.h index 3f495e783..93d70e3a7 100644 --- a/datapath/linux/compat/include/net/ip6_route.h +++ b/datapath/linux/compat/include/net/ip6_route.h @@ -28,4 +28,19 @@ struct dst_entry *rpl_ip6_route_output(struct net *net, const struct sock *sk, #define ip6_dst_hoplimit(dst) dst_metric(dst, RTAX_HOPLIMIT) #endif /* 2.6.39 */ -#endif + 
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) +static inline int rpl_ip6_fragment(struct sock *sk, struct sk_buff *skb, + int (*output)(struct sk_buff *)) +{ + kfree_skb(skb); + return -ENOTSUPP; +} +#define ip6_fragment rpl_ip6_fragment +#elif defined(OVS_FRAGMENT_BACKPORT) +int rpl_ip6_fragment(struct sock *sk, struct sk_buff *skb, + int (*output)(OVS_VPORT_OUTPUT_PARAMS)); +#define ip6_fragment rpl_ip6_fragment +#endif /* OVS_FRAGMENT_BACKPORT */ + +#endif /* _NET_IP6_ROUTE_WRAPPER */ diff --git a/datapath/linux/compat/include/net/ipv6.h b/datapath/linux/compat/include/net/ipv6.h index 18c7d301e..30b506156 100644 --- a/datapath/linux/compat/include/net/ipv6.h +++ b/datapath/linux/compat/include/net/ipv6.h @@ -47,4 +47,11 @@ static inline u32 ipv6_addr_hash(const struct in6_addr *a) } #endif +#if defined(OVS_FRAGMENT_BACKPORT) && !defined(HAVE___IPV6_ADDR_JHASH) +static inline u32 __ipv6_addr_jhash(const struct in6_addr *a, const u32 unused) +{ + return ipv6_addr_jhash(a); +} +#endif + #endif diff --git a/datapath/linux/compat/ip6_output.c b/datapath/linux/compat/ip6_output.c new file mode 100644 index 000000000..004575c5a --- /dev/null +++ b/datapath/linux/compat/ip6_output.c @@ -0,0 +1,471 @@ +/* + * Backported from upstream commit 9ef2e965e554 + * ("ipv6: drop frames with attached skb->sk in forwarding") + * + * IPv6 output functions + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * Based on linux/net/ipv4/ip_output.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * A.N.Kuznetsov : airthmetics in fragmentation. + * extension headers are implemented. + * route changes now work. + * ip6_forward does not confuse sniffers. + * etc. + * + * H. 
von Brand : Added missing #include + * Imran Patel : frag id should be in NBO + * Kazunori MIYAZAWA @USAGI + * : add ip6_append_data and related functions + * for datagram xmit + */ + +#include + +#ifdef OVS_FRAGMENT_BACKPORT + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IP_IDENTS_SZ 2048u + +static atomic_t *ip_idents __read_mostly; +static u32 *ip_tstamps __read_mostly; + +int __init ip6_output_init(void); +void ip6_output_exit(void); + +/* In order to protect privacy, we add a perturbation to identifiers + * if one generator is seldom used. This makes hard for an attacker + * to infer how many packets were sent between two points in time. + */ +static u32 rpl_ip_idents_reserve(u32 hash, int segs) +{ + u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; + atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; + u32 old = ACCESS_ONCE(*p_tstamp); + u32 now = (u32)jiffies; + u32 delta = 0; + + if (old != now && cmpxchg(p_tstamp, old, now) == old) + delta = prandom_u32_max(now - old); + + return atomic_add_return(segs + delta, p_id) - segs; +} + +static u32 __ipv6_select_ident(struct net *net, u32 hashrnd, + const struct in6_addr *dst, + const struct in6_addr *src) +{ + u32 hash, id; + + hash = __ipv6_addr_jhash(dst, hashrnd); + hash = __ipv6_addr_jhash(src, hash); + hash ^= net_hash_mix(net); + + /* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve, + * set the hight order instead thus minimizing possible future + * collisions. + */ + id = rpl_ip_idents_reserve(hash, 1); + if (unlikely(!id)) + id = 1 << 31; + + return id; +} + +/* XXX: Exported in 3.19. 
*/ +static __be32 rpl_ipv6_select_ident(struct net *net, + const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + static u32 ip6_idents_hashrnd __read_mostly; + u32 id; + + net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd)); + + id = __ipv6_select_ident(net, ip6_idents_hashrnd, daddr, saddr); + return htonl(id); +} + +static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) +{ + to->pkt_type = from->pkt_type; + to->priority = from->priority; + to->protocol = from->protocol; + skb_dst_drop(to); + skb_dst_set(to, dst_clone(skb_dst(from))); + to->dev = from->dev; + to->mark = from->mark; + +#ifdef CONFIG_NET_SCHED + to->tc_index = from->tc_index; +#endif + nf_copy(to, from); + skb_copy_secmark(to, from); +} + +#ifdef HAVE_IP_FRAGMENT_TAKES_SOCK +#define OUTPUT(skb) output(skb->sk, skb) +#else +#define OUTPUT(skb) output(skb) +#endif + +int ip6_fragment(struct sock *sk, struct sk_buff *skb, + int (*output)(OVS_VPORT_OUTPUT_PARAMS)) +{ + struct sk_buff *frag; + struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? + inet6_sk(skb->sk) : NULL; + struct ipv6hdr *tmp_hdr; + struct frag_hdr *fh; + unsigned int mtu, hlen, left, len; + int hroom, troom; + __be32 frag_id; + int ptr, offset = 0, err = 0; + u8 *prevhdr, nexthdr = 0; + struct net *net = dev_net(skb_dst(skb)->dev); + + hlen = ip6_find_1stfragopt(skb, &prevhdr); + nexthdr = *prevhdr; + + mtu = ip6_skb_dst_mtu(skb); + + /* We must not fragment if the socket is set to force MTU discovery + * or if the skb it not generated by a local socket. 
+ */ + if (unlikely(!skb->ignore_df && skb->len > mtu)) + goto fail_toobig; + + if (IP6CB(skb)->frag_max_size) { + if (IP6CB(skb)->frag_max_size > mtu) + goto fail_toobig; + + /* don't send fragments larger than what we received */ + mtu = IP6CB(skb)->frag_max_size; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + } + + if (np && np->frag_size < mtu) { + if (np->frag_size) + mtu = np->frag_size; + } + mtu -= hlen + sizeof(struct frag_hdr); + + frag_id = rpl_ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr); + + hroom = LL_RESERVED_SPACE(rt->dst.dev); + if (skb_has_frag_list(skb)) { + int first_len = skb_pagelen(skb); + struct sk_buff *frag2; + + if (first_len - hlen > mtu || + ((first_len - hlen) & 7) || + skb_cloned(skb) || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) + goto slow_path; + + skb_walk_frags(skb, frag) { + /* Correct geometry. */ + if (frag->len > mtu || + ((frag->len & 7) && frag->next) || + skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr))) + goto slow_path_clean; + + /* Partially cloned skb? 
*/ + if (skb_shared(frag)) + goto slow_path_clean; + + BUG_ON(frag->sk); + if (skb->sk) { + frag->sk = skb->sk; + frag->destructor = sock_wfree; + } + skb->truesize -= frag->truesize; + } + + err = 0; + offset = 0; + /* BUILD HEADER */ + + *prevhdr = NEXTHDR_FRAGMENT; + tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); + if (!tmp_hdr) { + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGFAILS); + err = -ENOMEM; + goto fail; + } + frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); + + __skb_pull(skb, hlen); + fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr)); + __skb_push(skb, hlen); + skb_reset_network_header(skb); + memcpy(skb_network_header(skb), tmp_hdr, hlen); + + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(IP6_MF); + fh->identification = frag_id; + + first_len = skb_pagelen(skb); + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + ipv6_hdr(skb)->payload_len = htons(first_len - + sizeof(struct ipv6hdr)); + + dst_hold(&rt->dst); + + for (;;) { + /* Prepare header of the next frame, + * before previous one went down. 
*/ + if (frag) { + frag->ip_summed = CHECKSUM_NONE; + skb_reset_transport_header(frag); + fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr)); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), tmp_hdr, + hlen); + offset += skb->len - hlen - sizeof(struct frag_hdr); + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(offset); + if (frag->next) + fh->frag_off |= htons(IP6_MF); + fh->identification = frag_id; + ipv6_hdr(frag)->payload_len = + htons(frag->len - + sizeof(struct ipv6hdr)); + ip6_copy_metadata(frag, skb); + } + + err = OUTPUT(skb); + if (!err) + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), + IPSTATS_MIB_FRAGCREATES); + + if (err || !frag) + break; + + skb = frag; + frag = skb->next; + skb->next = NULL; + } + + kfree(tmp_hdr); + + if (err == 0) { + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), + IPSTATS_MIB_FRAGOKS); + ip6_rt_put(rt); + return 0; + } + + kfree_skb_list(frag); + + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), + IPSTATS_MIB_FRAGFAILS); + ip6_rt_put(rt); + return err; + +slow_path_clean: + skb_walk_frags(skb, frag2) { + if (frag2 == frag) + break; + frag2->sk = NULL; + frag2->destructor = NULL; + skb->truesize += frag2->truesize; + } + } + +slow_path: + if ((skb->ip_summed == CHECKSUM_PARTIAL) && + skb_checksum_help(skb)) + goto fail; + + left = skb->len - hlen; /* Space per frame */ + ptr = hlen; /* Where to start from */ + + /* + * Fragment the datagram. + */ + + *prevhdr = NEXTHDR_FRAGMENT; + troom = rt->dst.dev->needed_tailroom; + + /* + * Keep copying data until we run out. 
+ */ + while (left > 0) { + len = left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > mtu) + len = mtu; + /* IF: we are not sending up to and including the packet end + then align the next start on an eight byte boundary */ + if (len < left) { + len &= ~7; + } + + /* Allocate buffer */ + frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + + hroom + troom, GFP_ATOMIC); + if (!frag) { + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGFAILS); + err = -ENOMEM; + goto fail; + } + + /* + * Set up data on packet + */ + + ip6_copy_metadata(frag, skb); + skb_reserve(frag, hroom); + skb_put(frag, len + hlen + sizeof(struct frag_hdr)); + skb_reset_network_header(frag); + fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); + frag->transport_header = (frag->network_header + hlen + + sizeof(struct frag_hdr)); + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + if (skb->sk) + skb_set_owner_w(frag, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); + + /* + * Build fragment header. + */ + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->identification = frag_id; + + /* + * Copy a block of the IP datagram. + */ + BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag), + len)); + left -= len; + + fh->frag_off = htons(offset); + if (left > 0) + fh->frag_off |= htons(IP6_MF); + ipv6_hdr(frag)->payload_len = htons(frag->len - + sizeof(struct ipv6hdr)); + + ptr += len; + offset += len; + + /* + * Put this fragment into the sending queue. 
+ */ + err = OUTPUT(frag); + if (err) + goto fail; + + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGCREATES); + } + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGOKS); + consume_skb(skb); + return err; + +fail_toobig: + if (skb->sk && dst_allfrag(skb_dst(skb))) + sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); + + skb->dev = skb_dst(skb)->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + err = -EMSGSIZE; + +fail: + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return err; +} +#undef OUTPUT + +int __init ip6_output_init(void) +{ + ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); + if (!ip_idents) { + pr_warn("IP: failed to allocate ip_idents\n"); + goto error; + } + + prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + + ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); + if (!ip_tstamps) { + pr_warn("IP: failed to allocate ip_tstamps\n"); + goto error_ip_idents_free; + } + + return 0; + +error_ip_idents_free: + kfree(ip_idents); +error: + return -ENOMEM; +} + +void ip6_output_exit(void) +{ + kfree(ip_tstamps); + kfree(ip_idents); +} + +#endif /* OVS_FRAGMENT_BACKPORT */ -- 2.20.1