inet: add IP_BIND_ADDRESS_NO_PORT to overcome bind(0) limitations
[cascardo/linux.git] / net / ipv6 / route.c
index d358888..1a1122a 100644 (file)
@@ -72,8 +72,7 @@ enum rt6_nud_state {
        RT6_NUD_SUCCEED = 1
 };
 
-static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
-                                   const struct in6_addr *dest);
+static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
 static struct dst_entry        *ip6_dst_check(struct dst_entry *dst, u32 cookie);
 static unsigned int     ip6_default_advmss(const struct dst_entry *dst);
 static unsigned int     ip6_mtu(const struct dst_entry *dst);
@@ -92,6 +91,7 @@ static void           ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
 static void            rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
+static void            rt6_dst_from_metrics_check(struct rt6_info *rt);
 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
@@ -104,65 +104,82 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *gwaddr, int ifindex);
 #endif
 
-static void rt6_bind_peer(struct rt6_info *rt, int create)
+struct uncached_list {
+       spinlock_t              lock;
+       struct list_head        head;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
+
+static void rt6_uncached_list_add(struct rt6_info *rt)
 {
-       struct inet_peer_base *base;
-       struct inet_peer *peer;
+       struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
 
-       base = inetpeer_base_ptr(rt->_rt6i_peer);
-       if (!base)
-               return;
+       rt->dst.flags |= DST_NOCACHE;
+       rt->rt6i_uncached_list = ul;
 
-       peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
-       if (peer) {
-               if (!rt6_set_peer(rt, peer))
-                       inet_putpeer(peer);
+       spin_lock_bh(&ul->lock);
+       list_add_tail(&rt->rt6i_uncached, &ul->head);
+       spin_unlock_bh(&ul->lock);
+}
+
+static void rt6_uncached_list_del(struct rt6_info *rt)
+{
+       if (!list_empty(&rt->rt6i_uncached)) {
+               struct uncached_list *ul = rt->rt6i_uncached_list;
+
+               spin_lock_bh(&ul->lock);
+               list_del(&rt->rt6i_uncached);
+               spin_unlock_bh(&ul->lock);
        }
 }
 
-static struct inet_peer *__rt6_get_peer(struct rt6_info *rt, int create)
+static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
 {
-       if (rt6_has_peer(rt))
-               return rt6_peer_ptr(rt);
+       struct net_device *loopback_dev = net->loopback_dev;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
+               struct rt6_info *rt;
+
+               spin_lock_bh(&ul->lock);
+               list_for_each_entry(rt, &ul->head, rt6i_uncached) {
+                       struct inet6_dev *rt_idev = rt->rt6i_idev;
+                       struct net_device *rt_dev = rt->dst.dev;
 
-       rt6_bind_peer(rt, create);
-       return (rt6_has_peer(rt) ? rt6_peer_ptr(rt) : NULL);
+                       if (rt_idev && (rt_idev->dev == dev || !dev) &&
+                           rt_idev->dev != loopback_dev) {
+                               rt->rt6i_idev = in6_dev_get(loopback_dev);
+                               in6_dev_put(rt_idev);
+                       }
+
+                       if (rt_dev && (rt_dev == dev || !dev) &&
+                           rt_dev != loopback_dev) {
+                               rt->dst.dev = loopback_dev;
+                               dev_hold(rt->dst.dev);
+                               dev_put(rt_dev);
+                       }
+               }
+               spin_unlock_bh(&ul->lock);
+       }
 }
 
-static struct inet_peer *rt6_get_peer_create(struct rt6_info *rt)
+static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
 {
-       return __rt6_get_peer(rt, 1);
+       return dst_metrics_write_ptr(rt->dst.from);
 }
 
 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
 {
-       struct rt6_info *rt = (struct rt6_info *) dst;
-       struct inet_peer *peer;
-       u32 *p = NULL;
+       struct rt6_info *rt = (struct rt6_info *)dst;
 
-       if (!(rt->dst.flags & DST_HOST))
+       if (rt->rt6i_flags & RTF_PCPU)
+               return rt6_pcpu_cow_metrics(rt);
+       else if (rt->rt6i_flags & RTF_CACHE)
+               return NULL;
+       else
                return dst_cow_metrics_generic(dst, old);
-
-       peer = rt6_get_peer_create(rt);
-       if (peer) {
-               u32 *old_p = __DST_METRICS_PTR(old);
-               unsigned long prev, new;
-
-               p = peer->metrics;
-               if (inet_metrics_new(peer) ||
-                   (old & DST_METRICS_FORCE_OVERWRITE))
-                       memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
-
-               new = (unsigned long) p;
-               prev = cmpxchg(&dst->_metrics, old, new);
-
-               if (prev != old) {
-                       p = __DST_METRICS_PTR(prev);
-                       if (prev & DST_METRICS_READ_ONLY)
-                               p = NULL;
-               }
-       }
-       return p;
 }
 
 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
@@ -299,10 +316,10 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
 #endif
 
 /* allocate dst with ip6_dst_ops */
-static inline struct rt6_info *ip6_dst_alloc(struct net *net,
-                                            struct net_device *dev,
-                                            int flags,
-                                            struct fib6_table *table)
+static struct rt6_info *__ip6_dst_alloc(struct net *net,
+                                       struct net_device *dev,
+                                       int flags,
+                                       struct fib6_table *table)
 {
        struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
                                        0, DST_OBSOLETE_FORCE_CHK, flags);
@@ -311,21 +328,54 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
                struct dst_entry *dst = &rt->dst;
 
                memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
-               rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
                INIT_LIST_HEAD(&rt->rt6i_siblings);
+               INIT_LIST_HEAD(&rt->rt6i_uncached);
        }
        return rt;
 }
 
+static struct rt6_info *ip6_dst_alloc(struct net *net,
+                                     struct net_device *dev,
+                                     int flags,
+                                     struct fib6_table *table)
+{
+       struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table);
+
+       if (rt) {
+               rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
+               if (rt->rt6i_pcpu) {
+                       int cpu;
+
+                       for_each_possible_cpu(cpu) {
+                               struct rt6_info **p;
+
+                               p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
+                               /* rt is not shared yet; clear every slot */
+                               *p =  NULL;
+                       }
+               } else {
+                       dst_destroy((struct dst_entry *)rt);
+                       return NULL;
+               }
+       }
+
+       return rt;
+}
+
 static void ip6_dst_destroy(struct dst_entry *dst)
 {
        struct rt6_info *rt = (struct rt6_info *)dst;
-       struct inet6_dev *idev = rt->rt6i_idev;
        struct dst_entry *from = dst->from;
+       struct inet6_dev *idev;
+
+       dst_destroy_metrics_generic(dst);
+
+       if (rt->rt6i_pcpu)
+               free_percpu(rt->rt6i_pcpu);
 
-       if (!(rt->dst.flags & DST_HOST))
-               dst_destroy_metrics_generic(dst);
+       rt6_uncached_list_del(rt);
 
+       idev = rt->rt6i_idev;
        if (idev) {
                rt->rt6i_idev = NULL;
                in6_dev_put(idev);
@@ -333,11 +383,6 @@ static void ip6_dst_destroy(struct dst_entry *dst)
 
        dst->from = NULL;
        dst_release(from);
-
-       if (rt6_has_peer(rt)) {
-               struct inet_peer *peer = rt6_peer_ptr(rt);
-               inet_putpeer(peer);
-       }
 }
 
 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -652,15 +697,33 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
                                     u32 metric, int oif, int strict,
                                     bool *do_rr)
 {
-       struct rt6_info *rt, *match;
+       struct rt6_info *rt, *match, *cont;
        int mpri = -1;
 
        match = NULL;
-       for (rt = rr_head; rt && rt->rt6i_metric == metric;
-            rt = rt->dst.rt6_next)
+       cont = NULL;
+       for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
+               if (rt->rt6i_metric != metric) {
+                       cont = rt;
+                       break;
+               }
+
                match = find_match(rt, oif, strict, &mpri, match, do_rr);
-       for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
-            rt = rt->dst.rt6_next)
+       }
+
+       for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
+               if (rt->rt6i_metric != metric) {
+                       cont = rt;
+                       break;
+               }
+
+               match = find_match(rt, oif, strict, &mpri, match, do_rr);
+       }
+
+       if (match || !cont)
+               return match;
+
+       for (rt = cont; rt; rt = rt->dst.rt6_next)
                match = find_match(rt, oif, strict, &mpri, match, do_rr);
 
        return match;
@@ -694,6 +757,11 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
        return match ? match : net->ipv6.ip6_null_entry;
 }
 
+static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
+{
+       return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
+}
+
 #ifdef CONFIG_IPV6_ROUTE_INFO
 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr)
@@ -872,9 +940,9 @@ int ip6_ins_rt(struct rt6_info *rt)
        return __ip6_ins_rt(rt, &info, &mxc);
 }
 
-static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
-                                     const struct in6_addr *daddr,
-                                     const struct in6_addr *saddr)
+static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
+                                          const struct in6_addr *daddr,
+                                          const struct in6_addr *saddr)
 {
        struct rt6_info *rt;
 
@@ -882,15 +950,26 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
         *      Clone the route.
         */
 
-       rt = ip6_rt_copy(ort, daddr);
+       if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
+               ort = (struct rt6_info *)ort->dst.from;
 
-       if (rt) {
+       rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
+                            0, ort->rt6i_table);
+
+       if (!rt)
+               return NULL;
+
+       ip6_rt_copy_init(rt, ort);
+       rt->rt6i_flags |= RTF_CACHE;
+       rt->rt6i_metric = 0;
+       rt->dst.flags |= DST_HOST;
+       rt->rt6i_dst.addr = *daddr;
+       rt->rt6i_dst.plen = 128;
+
+       if (!rt6_is_gw_or_nonexthop(ort)) {
                if (ort->rt6i_dst.plen != 128 &&
                    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
                        rt->rt6i_flags |= RTF_ANYCAST;
-
-               rt->rt6i_flags |= RTF_CACHE;
-
 #ifdef CONFIG_IPV6_SUBTREES
                if (rt->rt6i_src.plen && saddr) {
                        rt->rt6i_src.addr = *saddr;
@@ -902,30 +981,65 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
        return rt;
 }
 
-static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
-                                       const struct in6_addr *daddr)
+static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
 {
-       struct rt6_info *rt = ip6_rt_copy(ort, daddr);
+       struct rt6_info *pcpu_rt;
 
-       if (rt)
-               rt->rt6i_flags |= RTF_CACHE;
-       return rt;
+       pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
+                                 rt->dst.dev, rt->dst.flags,
+                                 rt->rt6i_table);
+
+       if (!pcpu_rt)
+               return NULL;
+       ip6_rt_copy_init(pcpu_rt, rt);
+       pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
+       pcpu_rt->rt6i_flags |= RTF_PCPU;
+       return pcpu_rt;
+}
+
+/* Caller must hold read_lock_bh(&table->tb6_lock); result has a ref held */
+static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
+{
+       struct rt6_info *pcpu_rt, *prev, **p;
+
+       p = this_cpu_ptr(rt->rt6i_pcpu);
+       pcpu_rt = *p;
+
+       if (pcpu_rt)
+               goto done;
+
+       pcpu_rt = ip6_rt_pcpu_alloc(rt);
+       if (!pcpu_rt) {
+               struct net *net = dev_net(rt->dst.dev);
+
+               pcpu_rt = net->ipv6.ip6_null_entry;
+               goto done;
+       }
+
+       prev = cmpxchg(p, NULL, pcpu_rt);
+       if (prev) {
+               /* Lost the race: drop our copy and use the installed one */
+               dst_destroy(&pcpu_rt->dst);
+               pcpu_rt = prev;
+       }
+
+done:
+       dst_hold(&pcpu_rt->dst);
+       rt6_dst_from_metrics_check(pcpu_rt);
+       return pcpu_rt;
+}
 
 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
                                      struct flowi6 *fl6, int flags)
 {
        struct fib6_node *fn, *saved_fn;
-       struct rt6_info *rt, *nrt;
+       struct rt6_info *rt;
        int strict = 0;
-       int attempts = 3;
-       int err;
 
        strict |= flags & RT6_LOOKUP_F_IFACE;
        if (net->ipv6.devconf_all->forwarding == 0)
                strict |= RT6_LOOKUP_F_REACHABLE;
 
-redo_fib6_lookup_lock:
        read_lock_bh(&table->tb6_lock);
 
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
@@ -944,51 +1058,52 @@ redo_rt6_select:
                        strict &= ~RT6_LOOKUP_F_REACHABLE;
                        fn = saved_fn;
                        goto redo_rt6_select;
-               } else {
-                       dst_hold(&rt->dst);
-                       read_unlock_bh(&table->tb6_lock);
-                       goto out2;
                }
        }
 
-       dst_hold(&rt->dst);
-       read_unlock_bh(&table->tb6_lock);
 
-       if (rt->rt6i_flags & RTF_CACHE)
-               goto out2;
+       if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
+               dst_use(&rt->dst, jiffies);
+               read_unlock_bh(&table->tb6_lock);
 
-       if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)))
-               nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
-       else if (!(rt->dst.flags & DST_HOST))
-               nrt = rt6_alloc_clone(rt, &fl6->daddr);
-       else
-               goto out2;
+               rt6_dst_from_metrics_check(rt);
+               return rt;
+       } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
+                           !(rt->rt6i_flags & RTF_GATEWAY))) {
+               /* Create an RTF_CACHE clone that will not be owned by
+                * the fib6 tree.  This handles the special case where
+                * the daddr in the skb during the neighbor look-up differs
+                * from the fl6->daddr used to look up the route here.
+                */
 
-       ip6_rt_put(rt);
-       rt = nrt ? : net->ipv6.ip6_null_entry;
+               struct rt6_info *uncached_rt;
 
-       dst_hold(&rt->dst);
-       if (nrt) {
-               err = ip6_ins_rt(nrt);
-               if (!err)
-                       goto out2;
-       }
+               dst_use(&rt->dst, jiffies);
+               read_unlock_bh(&table->tb6_lock);
 
-       if (--attempts <= 0)
-               goto out2;
+               uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
+               dst_release(&rt->dst);
 
-       /*
-        * Race condition! In the gap, when table->tb6_lock was
-        * released someone could insert this route.  Relookup.
-        */
-       ip6_rt_put(rt);
-       goto redo_fib6_lookup_lock;
+               if (uncached_rt)
+                       rt6_uncached_list_add(uncached_rt);
+               else
+                       uncached_rt = net->ipv6.ip6_null_entry;
 
-out2:
-       rt->dst.lastuse = jiffies;
-       rt->dst.__use++;
+               dst_hold(&uncached_rt->dst);
+               return uncached_rt;
 
-       return rt;
+       } else {
+               /* Get a percpu copy */
+
+               struct rt6_info *pcpu_rt;
+
+               rt->dst.lastuse = jiffies;
+               rt->dst.__use++;
+               pcpu_rt = rt6_get_pcpu_route(rt);
+               read_unlock_bh(&table->tb6_lock);
+
+               return pcpu_rt;
+       }
 }
 
 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
@@ -1059,7 +1174,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
                new = &rt->dst;
 
                memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
-               rt6_init_peer(rt, net->ipv6.peers);
 
                new->__use = 1;
                new->input = dst_discard;
@@ -1093,6 +1207,33 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
  *     Destination cache support functions
  */
 
+static void rt6_dst_from_metrics_check(struct rt6_info *rt)
+{
+       if (rt->dst.from &&
+           dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
+               dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
+}
+
+static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
+{
+       if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
+               return NULL;
+
+       if (rt6_check_expired(rt))
+               return NULL;
+
+       return &rt->dst;
+}
+
+static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
+{
+       if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
+           rt6_check((struct rt6_info *)(rt->dst.from), cookie))
+               return &rt->dst;
+       else
+               return NULL;
+}
+
 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 {
        struct rt6_info *rt;
@@ -1103,13 +1244,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         */
-       if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
-               return NULL;
 
-       if (rt6_check_expired(rt))
-               return NULL;
+       rt6_dst_from_metrics_check(rt);
 
-       return dst;
+       if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
+               return rt6_dst_from_check(rt, cookie);
+       else
+               return rt6_check(rt, cookie);
 }
 
 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
@@ -1148,24 +1289,63 @@ static void ip6_link_failure(struct sk_buff *skb)
        }
 }
 
-static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
-                              struct sk_buff *skb, u32 mtu)
+static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
+{
+       struct net *net = dev_net(rt->dst.dev);
+
+       rt->rt6i_flags |= RTF_MODIFIED;
+       rt->rt6i_pmtu = mtu;
+       rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
+}
+
+static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
+                                const struct ipv6hdr *iph, u32 mtu)
 {
        struct rt6_info *rt6 = (struct rt6_info *)dst;
 
-       dst_confirm(dst);
-       if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
-               struct net *net = dev_net(dst->dev);
+       if (rt6->rt6i_flags & RTF_LOCAL)
+               return;
 
-               rt6->rt6i_flags |= RTF_MODIFIED;
-               if (mtu < IPV6_MIN_MTU)
-                       mtu = IPV6_MIN_MTU;
+       dst_confirm(dst);
+       mtu = max_t(u32, mtu, IPV6_MIN_MTU);
+       if (mtu >= dst_mtu(dst))
+               return;
 
-               dst_metric_set(dst, RTAX_MTU, mtu);
-               rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
+       if (rt6->rt6i_flags & RTF_CACHE) {
+               rt6_do_update_pmtu(rt6, mtu);
+       } else {
+               const struct in6_addr *daddr, *saddr;
+               struct rt6_info *nrt6;
+
+               if (iph) {
+                       daddr = &iph->daddr;
+                       saddr = &iph->saddr;
+               } else if (sk) {
+                       daddr = &sk->sk_v6_daddr;
+                       saddr = &inet6_sk(sk)->saddr;
+               } else {
+                       return;
+               }
+               nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
+               if (nrt6) {
+                       rt6_do_update_pmtu(nrt6, mtu);
+
+                       /* ip6_ins_rt(nrt6) will bump
+                        * rt6->rt6i_node->fn_sernum,
+                        * which makes the next rt6_check() fail and
+                        * invalidates the sk->sk_dst_cache.
+                        */
+                       ip6_ins_rt(nrt6);
+               }
        }
 }
 
+static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+                              struct sk_buff *skb, u32 mtu)
+{
+       __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
+}
+
 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
                     int oif, u32 mark)
 {
@@ -1182,7 +1362,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
 
        dst = ip6_route_output(net, NULL, &fl6);
        if (!dst->error)
-               ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu));
+               __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
        dst_release(dst);
 }
 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
@@ -1341,9 +1521,14 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
 
 static unsigned int ip6_mtu(const struct dst_entry *dst)
 {
+       const struct rt6_info *rt = (const struct rt6_info *)dst;
+       unsigned int mtu = rt->rt6i_pmtu;
        struct inet6_dev *idev;
-       unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
 
+       if (mtu)
+               goto out;
+
+       mtu = dst_metric_raw(dst, RTAX_MTU);
        if (mtu)
                goto out;
 
@@ -1590,10 +1775,8 @@ int ip6_route_add(struct fib6_config *cfg)
 
        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->rt6i_dst.plen = cfg->fc_dst_len;
-       if (rt->rt6i_dst.plen == 128) {
+       if (rt->rt6i_dst.plen == 128)
                rt->dst.flags |= DST_HOST;
-               dst_metrics_set_force_overwrite(&rt->dst);
-       }
 
 #ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
@@ -1651,6 +1834,16 @@ int ip6_route_add(struct fib6_config *cfg)
                int gwa_type;
 
                gw_addr = &cfg->fc_gateway;
+
+               /* If gw_addr is local, we will fail to detect this while the
+                * address is still TENTATIVE (DAD in progress): rt6_lookup()
+                * returns the already-added prefix route via the interface it
+                * was assigned to, which might be non-loopback.
+                */
+               err = -EINVAL;
+               if (ipv6_chk_addr_and_flags(net, gw_addr, NULL, 0, 0))
+                       goto out;
+
                rt->rt6i_gateway = *gw_addr;
                gwa_type = ipv6_addr_type(gw_addr);
 
@@ -1664,7 +1857,6 @@ int ip6_route_add(struct fib6_config *cfg)
                           (SIT, PtP, NBMA NOARP links) it is handy to allow
                           some exceptions. --ANK
                         */
-                       err = -EINVAL;
                        if (!(gwa_type & IPV6_ADDR_UNICAST))
                                goto out;
 
@@ -1785,6 +1977,9 @@ static int ip6_route_del(struct fib6_config *cfg)
 
        if (fn) {
                for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+                       if ((rt->rt6i_flags & RTF_CACHE) &&
+                           !(cfg->fc_flags & RTF_CACHE))
+                               continue;
                        if (cfg->fc_ifindex &&
                            (!rt->dst.dev ||
                             rt->dst.dev->ifindex != cfg->fc_ifindex))
@@ -1894,7 +2089,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
                                     NEIGH_UPDATE_F_ISROUTER))
                     );
 
-       nrt = ip6_rt_copy(rt, &msg->dest);
+       nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
        if (!nrt)
                goto out;
 
@@ -1926,42 +2121,35 @@ out:
  *     Misc support functions
  */
 
-static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
-                                   const struct in6_addr *dest)
+static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
 {
-       struct net *net = dev_net(ort->dst.dev);
-       struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
-                                           ort->rt6i_table);
-
-       if (rt) {
-               rt->dst.input = ort->dst.input;
-               rt->dst.output = ort->dst.output;
-               rt->dst.flags |= DST_HOST;
-
-               rt->rt6i_dst.addr = *dest;
-               rt->rt6i_dst.plen = 128;
-               dst_copy_metrics(&rt->dst, &ort->dst);
-               rt->dst.error = ort->dst.error;
-               rt->rt6i_idev = ort->rt6i_idev;
-               if (rt->rt6i_idev)
-                       in6_dev_hold(rt->rt6i_idev);
-               rt->dst.lastuse = jiffies;
+       BUG_ON(from->dst.from);
 
-               if (ort->rt6i_flags & RTF_GATEWAY)
-                       rt->rt6i_gateway = ort->rt6i_gateway;
-               else
-                       rt->rt6i_gateway = *dest;
-               rt->rt6i_flags = ort->rt6i_flags;
-               rt6_set_from(rt, ort);
-               rt->rt6i_metric = 0;
+       rt->rt6i_flags &= ~RTF_EXPIRES;
+       dst_hold(&from->dst);
+       rt->dst.from = &from->dst;
+       dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
+}
 
+static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
+{
+       rt->dst.input = ort->dst.input;
+       rt->dst.output = ort->dst.output;
+       rt->rt6i_dst = ort->rt6i_dst;
+       rt->dst.error = ort->dst.error;
+       rt->rt6i_idev = ort->rt6i_idev;
+       if (rt->rt6i_idev)
+               in6_dev_hold(rt->rt6i_idev);
+       rt->dst.lastuse = jiffies;
+       rt->rt6i_gateway = ort->rt6i_gateway;
+       rt->rt6i_flags = ort->rt6i_flags;
+       rt6_set_from(rt, ort);
+       rt->rt6i_metric = ort->rt6i_metric;
 #ifdef CONFIG_IPV6_SUBTREES
-               memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
+       rt->rt6i_src = ort->rt6i_src;
 #endif
-               memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
-               rt->rt6i_table = ort->rt6i_table;
-       }
-       return rt;
+       rt->rt6i_prefsrc = ort->rt6i_prefsrc;
+       rt->rt6i_table = ort->rt6i_table;
 }
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
@@ -2336,6 +2524,7 @@ void rt6_ifdown(struct net *net, struct net_device *dev)
 
        fib6_clean_all(net, fib6_ifdown, &adn);
        icmp6_clean_all(fib6_ifdown, &adn);
+       rt6_uncached_list_flush_dev(net, dev);
 }
 
 struct rt6_mtu_change_arg {
@@ -2373,11 +2562,20 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
           PMTU discouvery.
         */
        if (rt->dst.dev == arg->dev &&
-           !dst_metric_locked(&rt->dst, RTAX_MTU) &&
-           (dst_mtu(&rt->dst) >= arg->mtu ||
-            (dst_mtu(&rt->dst) < arg->mtu &&
-             dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
-               dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
+           !dst_metric_locked(&rt->dst, RTAX_MTU)) {
+               if (rt->rt6i_flags & RTF_CACHE) {
+                       /* For RTF_CACHE with rt6i_pmtu == 0
+                        * (i.e. a redirected route),
+                        * the metrics of its rt->dst.from have already
+                        * been updated.
+                        */
+                       if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
+                               rt->rt6i_pmtu = arg->mtu;
+               } else if (dst_mtu(&rt->dst) >= arg->mtu ||
+                          (dst_mtu(&rt->dst) < arg->mtu &&
+                           dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
+                       dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
+               }
        }
        return 0;
 }
@@ -2434,6 +2632,9 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
        if (rtm->rtm_type == RTN_LOCAL)
                cfg->fc_flags |= RTF_LOCAL;
 
+       if (rtm->rtm_flags & RTM_F_CLONED)
+               cfg->fc_flags |= RTF_CACHE;
+
        cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
        cfg->fc_nlinfo.nlh = nlh;
        cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
@@ -2504,9 +2705,9 @@ static int ip6_route_multipath(struct fib6_config *cfg, int add)
        int attrlen;
        int err = 0, last_err = 0;
 
+       remaining = cfg->fc_mp_len;
 beginning:
        rtnh = (struct rtnexthop *)cfg->fc_mp;
-       remaining = cfg->fc_mp_len;
 
        /* Parse a Multipath Entry */
        while (rtnh_ok(rtnh, remaining)) {
@@ -2536,15 +2737,19 @@ beginning:
                                 * next hops that have been already added.
                                 */
                                add = 0;
+                               remaining = cfg->fc_mp_len - remaining;
                                goto beginning;
                        }
                }
                /* Because each route is added like a single route we remove
-                * this flag after the first nexthop (if there is a collision,
-                * we have already fail to add the first nexthop:
-                * fib6_add_rt2node() has reject it).
+                * these flags after the first nexthop: if there is a collision,
+                * we have already failed to add the first nexthop:
+                * fib6_add_rt2node() has rejected it; when replacing, the old
+                * nexthops have been replaced by the first new one, and the
+                * rest should be added to it.
                 */
-               cfg->fc_nlinfo.nlh->nlmsg_flags &= ~NLM_F_EXCL;
+               cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
+                                                    NLM_F_REPLACE);
                rtnh = rtnh_next(rtnh, &remaining);
        }
 
@@ -2604,6 +2809,7 @@ static int rt6_fill_node(struct net *net,
                         int iif, int type, u32 portid, u32 seq,
                         int prefix, int nowait, unsigned int flags)
 {
+       u32 metrics[RTAX_MAX];
        struct rtmsg *rtm;
        struct nlmsghdr *nlh;
        long expires;
@@ -2717,7 +2923,10 @@ static int rt6_fill_node(struct net *net,
                        goto nla_put_failure;
        }
 
-       if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
+       memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
+       if (rt->rt6i_pmtu)
+               metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
+       if (rtnetlink_put_metrics(skb, metrics) < 0)
                goto nla_put_failure;
 
        if (rt->rt6i_flags & RTF_GATEWAY) {
@@ -3212,6 +3421,7 @@ static struct notifier_block ip6_route_dev_notifier = {
 int __init ip6_route_init(void)
 {
        int ret;
+       int cpu;
 
        ret = -ENOMEM;
        ip6_dst_ops_template.kmem_cachep =
@@ -3271,6 +3481,13 @@ int __init ip6_route_init(void)
        if (ret)
                goto out_register_late_subsys;
 
+       for_each_possible_cpu(cpu) {
+               struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
+
+               INIT_LIST_HEAD(&ul->head);
+               spin_lock_init(&ul->lock);
+       }
+
 out:
        return ret;