net: l3mdev: remove redundant calls
[cascardo/linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66
67 #include <asm/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 enum rt6_nud_state {
74         RT6_NUD_FAIL_HARD = -3,
75         RT6_NUD_FAIL_PROBE = -2,
76         RT6_NUD_FAIL_DO_RR = -1,
77         RT6_NUD_SUCCEED = 1
78 };
79
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101
102 #ifdef CONFIG_IPV6_ROUTE_INFO
103 static struct rt6_info *rt6_add_route_info(struct net *net,
104                                            const struct in6_addr *prefix, int prefixlen,
105                                            const struct in6_addr *gwaddr, int ifindex,
106                                            unsigned int pref);
107 static struct rt6_info *rt6_get_route_info(struct net *net,
108                                            const struct in6_addr *prefix, int prefixlen,
109                                            const struct in6_addr *gwaddr, int ifindex);
110 #endif
111
112 struct uncached_list {
113         spinlock_t              lock;
114         struct list_head        head;
115 };
116
117 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
118
119 static void rt6_uncached_list_add(struct rt6_info *rt)
120 {
121         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
122
123         rt->dst.flags |= DST_NOCACHE;
124         rt->rt6i_uncached_list = ul;
125
126         spin_lock_bh(&ul->lock);
127         list_add_tail(&rt->rt6i_uncached, &ul->head);
128         spin_unlock_bh(&ul->lock);
129 }
130
131 static void rt6_uncached_list_del(struct rt6_info *rt)
132 {
133         if (!list_empty(&rt->rt6i_uncached)) {
134                 struct uncached_list *ul = rt->rt6i_uncached_list;
135
136                 spin_lock_bh(&ul->lock);
137                 list_del(&rt->rt6i_uncached);
138                 spin_unlock_bh(&ul->lock);
139         }
140 }
141
142 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
143 {
144         struct net_device *loopback_dev = net->loopback_dev;
145         int cpu;
146
147         if (dev == loopback_dev)
148                 return;
149
150         for_each_possible_cpu(cpu) {
151                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
152                 struct rt6_info *rt;
153
154                 spin_lock_bh(&ul->lock);
155                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
156                         struct inet6_dev *rt_idev = rt->rt6i_idev;
157                         struct net_device *rt_dev = rt->dst.dev;
158
159                         if (rt_idev->dev == dev) {
160                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
161                                 in6_dev_put(rt_idev);
162                         }
163
164                         if (rt_dev == dev) {
165                                 rt->dst.dev = loopback_dev;
166                                 dev_hold(rt->dst.dev);
167                                 dev_put(rt_dev);
168                         }
169                 }
170                 spin_unlock_bh(&ul->lock);
171         }
172 }
173
174 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
175 {
176         return dst_metrics_write_ptr(rt->dst.from);
177 }
178
179 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
180 {
181         struct rt6_info *rt = (struct rt6_info *)dst;
182
183         if (rt->rt6i_flags & RTF_PCPU)
184                 return rt6_pcpu_cow_metrics(rt);
185         else if (rt->rt6i_flags & RTF_CACHE)
186                 return NULL;
187         else
188                 return dst_cow_metrics_generic(dst, old);
189 }
190
191 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         struct in6_addr *p = &rt->rt6i_gateway;
196
197         if (!ipv6_addr_any(p))
198                 return (const void *) p;
199         else if (skb)
200                 return &ipv6_hdr(skb)->daddr;
201         return daddr;
202 }
203
204 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
205                                           struct sk_buff *skb,
206                                           const void *daddr)
207 {
208         struct rt6_info *rt = (struct rt6_info *) dst;
209         struct neighbour *n;
210
211         daddr = choose_neigh_daddr(rt, skb, daddr);
212         n = __ipv6_neigh_lookup(dst->dev, daddr);
213         if (n)
214                 return n;
215         return neigh_create(&nd_tbl, daddr, dst->dev);
216 }
217
218 static struct dst_ops ip6_dst_ops_template = {
219         .family                 =       AF_INET6,
220         .gc                     =       ip6_dst_gc,
221         .gc_thresh              =       1024,
222         .check                  =       ip6_dst_check,
223         .default_advmss         =       ip6_default_advmss,
224         .mtu                    =       ip6_mtu,
225         .cow_metrics            =       ipv6_cow_metrics,
226         .destroy                =       ip6_dst_destroy,
227         .ifdown                 =       ip6_dst_ifdown,
228         .negative_advice        =       ip6_negative_advice,
229         .link_failure           =       ip6_link_failure,
230         .update_pmtu            =       ip6_rt_update_pmtu,
231         .redirect               =       rt6_do_redirect,
232         .local_out              =       __ip6_local_out,
233         .neigh_lookup           =       ip6_neigh_lookup,
234 };
235
236 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
237 {
238         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
239
240         return mtu ? : dst->dev->mtu;
241 }
242
243 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
244                                          struct sk_buff *skb, u32 mtu)
245 {
246 }
247
248 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
249                                       struct sk_buff *skb)
250 {
251 }
252
253 static struct dst_ops ip6_dst_blackhole_ops = {
254         .family                 =       AF_INET6,
255         .destroy                =       ip6_dst_destroy,
256         .check                  =       ip6_dst_check,
257         .mtu                    =       ip6_blackhole_mtu,
258         .default_advmss         =       ip6_default_advmss,
259         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
260         .redirect               =       ip6_rt_blackhole_redirect,
261         .cow_metrics            =       dst_cow_metrics_generic,
262         .neigh_lookup           =       ip6_neigh_lookup,
263 };
264
265 static const u32 ip6_template_metrics[RTAX_MAX] = {
266         [RTAX_HOPLIMIT - 1] = 0,
267 };
268
269 static const struct rt6_info ip6_null_entry_template = {
270         .dst = {
271                 .__refcnt       = ATOMIC_INIT(1),
272                 .__use          = 1,
273                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
274                 .error          = -ENETUNREACH,
275                 .input          = ip6_pkt_discard,
276                 .output         = ip6_pkt_discard_out,
277         },
278         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
279         .rt6i_protocol  = RTPROT_KERNEL,
280         .rt6i_metric    = ~(u32) 0,
281         .rt6i_ref       = ATOMIC_INIT(1),
282 };
283
284 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
285
286 static const struct rt6_info ip6_prohibit_entry_template = {
287         .dst = {
288                 .__refcnt       = ATOMIC_INIT(1),
289                 .__use          = 1,
290                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
291                 .error          = -EACCES,
292                 .input          = ip6_pkt_prohibit,
293                 .output         = ip6_pkt_prohibit_out,
294         },
295         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
296         .rt6i_protocol  = RTPROT_KERNEL,
297         .rt6i_metric    = ~(u32) 0,
298         .rt6i_ref       = ATOMIC_INIT(1),
299 };
300
301 static const struct rt6_info ip6_blk_hole_entry_template = {
302         .dst = {
303                 .__refcnt       = ATOMIC_INIT(1),
304                 .__use          = 1,
305                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
306                 .error          = -EINVAL,
307                 .input          = dst_discard,
308                 .output         = dst_discard_out,
309         },
310         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
311         .rt6i_protocol  = RTPROT_KERNEL,
312         .rt6i_metric    = ~(u32) 0,
313         .rt6i_ref       = ATOMIC_INIT(1),
314 };
315
316 #endif
317
318 static void rt6_info_init(struct rt6_info *rt)
319 {
320         struct dst_entry *dst = &rt->dst;
321
322         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
323         INIT_LIST_HEAD(&rt->rt6i_siblings);
324         INIT_LIST_HEAD(&rt->rt6i_uncached);
325 }
326
327 /* allocate dst with ip6_dst_ops */
328 static struct rt6_info *__ip6_dst_alloc(struct net *net,
329                                         struct net_device *dev,
330                                         int flags)
331 {
332         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
333                                         0, DST_OBSOLETE_FORCE_CHK, flags);
334
335         if (rt)
336                 rt6_info_init(rt);
337
338         return rt;
339 }
340
341 struct rt6_info *ip6_dst_alloc(struct net *net,
342                                struct net_device *dev,
343                                int flags)
344 {
345         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
346
347         if (rt) {
348                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
349                 if (rt->rt6i_pcpu) {
350                         int cpu;
351
352                         for_each_possible_cpu(cpu) {
353                                 struct rt6_info **p;
354
355                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
356                                 /* no one shares rt */
357                                 *p =  NULL;
358                         }
359                 } else {
360                         dst_destroy((struct dst_entry *)rt);
361                         return NULL;
362                 }
363         }
364
365         return rt;
366 }
367 EXPORT_SYMBOL(ip6_dst_alloc);
368
369 static void ip6_dst_destroy(struct dst_entry *dst)
370 {
371         struct rt6_info *rt = (struct rt6_info *)dst;
372         struct dst_entry *from = dst->from;
373         struct inet6_dev *idev;
374
375         dst_destroy_metrics_generic(dst);
376         free_percpu(rt->rt6i_pcpu);
377         rt6_uncached_list_del(rt);
378
379         idev = rt->rt6i_idev;
380         if (idev) {
381                 rt->rt6i_idev = NULL;
382                 in6_dev_put(idev);
383         }
384
385         dst->from = NULL;
386         dst_release(from);
387 }
388
389 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
390                            int how)
391 {
392         struct rt6_info *rt = (struct rt6_info *)dst;
393         struct inet6_dev *idev = rt->rt6i_idev;
394         struct net_device *loopback_dev =
395                 dev_net(dev)->loopback_dev;
396
397         if (dev != loopback_dev) {
398                 if (idev && idev->dev == dev) {
399                         struct inet6_dev *loopback_idev =
400                                 in6_dev_get(loopback_dev);
401                         if (loopback_idev) {
402                                 rt->rt6i_idev = loopback_idev;
403                                 in6_dev_put(idev);
404                         }
405                 }
406         }
407 }
408
409 static bool __rt6_check_expired(const struct rt6_info *rt)
410 {
411         if (rt->rt6i_flags & RTF_EXPIRES)
412                 return time_after(jiffies, rt->dst.expires);
413         else
414                 return false;
415 }
416
417 static bool rt6_check_expired(const struct rt6_info *rt)
418 {
419         if (rt->rt6i_flags & RTF_EXPIRES) {
420                 if (time_after(jiffies, rt->dst.expires))
421                         return true;
422         } else if (rt->dst.from) {
423                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
424         }
425         return false;
426 }
427
428 /* Multipath route selection:
429  *   Hash-based function using the packet header and flow label.
430  * Adapted from fib_info_hashfn()
431  */
432 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
433                                const struct flowi6 *fl6)
434 {
435         return get_hash_from_flowi6(fl6) % candidate_count;
436 }
437
438 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
439                                              struct flowi6 *fl6, int oif,
440                                              int strict)
441 {
442         struct rt6_info *sibling, *next_sibling;
443         int route_choosen;
444
445         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
446         /* Don't change the route if route_choosen == 0
447          * (the siblings list does not include this route itself)
448          */
449         if (route_choosen)
450                 list_for_each_entry_safe(sibling, next_sibling,
451                                 &match->rt6i_siblings, rt6i_siblings) {
452                         route_choosen--;
453                         if (route_choosen == 0) {
454                                 if (rt6_score_route(sibling, oif, strict) < 0)
455                                         break;
456                                 match = sibling;
457                                 break;
458                         }
459                 }
460         return match;
461 }
462
463 /*
464  *      Route lookup. The caller is assumed to hold the relevant table->tb6_lock.
465  */
466
467 static inline struct rt6_info *rt6_device_match(struct net *net,
468                                                     struct rt6_info *rt,
469                                                     const struct in6_addr *saddr,
470                                                     int oif,
471                                                     int flags)
472 {
473         struct rt6_info *local = NULL;
474         struct rt6_info *sprt;
475
476         if (!oif && ipv6_addr_any(saddr))
477                 goto out;
478
479         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
480                 struct net_device *dev = sprt->dst.dev;
481
482                 if (oif) {
483                         if (dev->ifindex == oif)
484                                 return sprt;
485                         if (dev->flags & IFF_LOOPBACK) {
486                                 if (!sprt->rt6i_idev ||
487                                     sprt->rt6i_idev->dev->ifindex != oif) {
488                                         if (flags & RT6_LOOKUP_F_IFACE)
489                                                 continue;
490                                         if (local &&
491                                             local->rt6i_idev->dev->ifindex == oif)
492                                                 continue;
493                                 }
494                                 local = sprt;
495                         }
496                 } else {
497                         if (ipv6_chk_addr(net, saddr, dev,
498                                           flags & RT6_LOOKUP_F_IFACE))
499                                 return sprt;
500                 }
501         }
502
503         if (oif) {
504                 if (local)
505                         return local;
506
507                 if (flags & RT6_LOOKUP_F_IFACE)
508                         return net->ipv6.ip6_null_entry;
509         }
510 out:
511         return rt;
512 }
513
514 #ifdef CONFIG_IPV6_ROUTER_PREF
515 struct __rt6_probe_work {
516         struct work_struct work;
517         struct in6_addr target;
518         struct net_device *dev;
519 };
520
521 static void rt6_probe_deferred(struct work_struct *w)
522 {
523         struct in6_addr mcaddr;
524         struct __rt6_probe_work *work =
525                 container_of(w, struct __rt6_probe_work, work);
526
527         addrconf_addr_solict_mult(&work->target, &mcaddr);
528         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
529         dev_put(work->dev);
530         kfree(work);
531 }
532
533 static void rt6_probe(struct rt6_info *rt)
534 {
535         struct __rt6_probe_work *work;
536         struct neighbour *neigh;
537         /*
538          * Okay, this does not seem to be appropriate
539          * for now; however, we need to check whether it
540          * really is, aka Router Reachability Probing.
541          *
542          * Router Reachability Probe MUST be rate-limited
543          * to no more than one per minute.
544          */
545         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
546                 return;
547         rcu_read_lock_bh();
548         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
549         if (neigh) {
550                 if (neigh->nud_state & NUD_VALID)
551                         goto out;
552
553                 work = NULL;
554                 write_lock(&neigh->lock);
555                 if (!(neigh->nud_state & NUD_VALID) &&
556                     time_after(jiffies,
557                                neigh->updated +
558                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
559                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
560                         if (work)
561                                 __neigh_set_probe_once(neigh);
562                 }
563                 write_unlock(&neigh->lock);
564         } else {
565                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
566         }
567
568         if (work) {
569                 INIT_WORK(&work->work, rt6_probe_deferred);
570                 work->target = rt->rt6i_gateway;
571                 dev_hold(rt->dst.dev);
572                 work->dev = rt->dst.dev;
573                 schedule_work(&work->work);
574         }
575
576 out:
577         rcu_read_unlock_bh();
578 }
579 #else
580 static inline void rt6_probe(struct rt6_info *rt)
581 {
582 }
583 #endif
584
585 /*
586  * Default Router Selection (RFC 2461 6.3.6)
587  */
588 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
589 {
590         struct net_device *dev = rt->dst.dev;
591         if (!oif || dev->ifindex == oif)
592                 return 2;
593         if ((dev->flags & IFF_LOOPBACK) &&
594             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
595                 return 1;
596         return 0;
597 }
598
599 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
600 {
601         struct neighbour *neigh;
602         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
603
604         if (rt->rt6i_flags & RTF_NONEXTHOP ||
605             !(rt->rt6i_flags & RTF_GATEWAY))
606                 return RT6_NUD_SUCCEED;
607
608         rcu_read_lock_bh();
609         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
610         if (neigh) {
611                 read_lock(&neigh->lock);
612                 if (neigh->nud_state & NUD_VALID)
613                         ret = RT6_NUD_SUCCEED;
614 #ifdef CONFIG_IPV6_ROUTER_PREF
615                 else if (!(neigh->nud_state & NUD_FAILED))
616                         ret = RT6_NUD_SUCCEED;
617                 else
618                         ret = RT6_NUD_FAIL_PROBE;
619 #endif
620                 read_unlock(&neigh->lock);
621         } else {
622                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
623                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
624         }
625         rcu_read_unlock_bh();
626
627         return ret;
628 }
629
630 static int rt6_score_route(struct rt6_info *rt, int oif,
631                            int strict)
632 {
633         int m;
634
635         m = rt6_check_dev(rt, oif);
636         if (!m && (strict & RT6_LOOKUP_F_IFACE))
637                 return RT6_NUD_FAIL_HARD;
638 #ifdef CONFIG_IPV6_ROUTER_PREF
639         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
640 #endif
641         if (strict & RT6_LOOKUP_F_REACHABLE) {
642                 int n = rt6_check_neigh(rt);
643                 if (n < 0)
644                         return n;
645         }
646         return m;
647 }
648
649 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
650                                    int *mpri, struct rt6_info *match,
651                                    bool *do_rr)
652 {
653         int m;
654         bool match_do_rr = false;
655         struct inet6_dev *idev = rt->rt6i_idev;
656         struct net_device *dev = rt->dst.dev;
657
658         if (dev && !netif_carrier_ok(dev) &&
659             idev->cnf.ignore_routes_with_linkdown)
660                 goto out;
661
662         if (rt6_check_expired(rt))
663                 goto out;
664
665         m = rt6_score_route(rt, oif, strict);
666         if (m == RT6_NUD_FAIL_DO_RR) {
667                 match_do_rr = true;
668                 m = 0; /* lowest valid score */
669         } else if (m == RT6_NUD_FAIL_HARD) {
670                 goto out;
671         }
672
673         if (strict & RT6_LOOKUP_F_REACHABLE)
674                 rt6_probe(rt);
675
676         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
677         if (m > *mpri) {
678                 *do_rr = match_do_rr;
679                 *mpri = m;
680                 match = rt;
681         }
682 out:
683         return match;
684 }
685
686 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
687                                      struct rt6_info *rr_head,
688                                      u32 metric, int oif, int strict,
689                                      bool *do_rr)
690 {
691         struct rt6_info *rt, *match, *cont;
692         int mpri = -1;
693
694         match = NULL;
695         cont = NULL;
696         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
697                 if (rt->rt6i_metric != metric) {
698                         cont = rt;
699                         break;
700                 }
701
702                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
703         }
704
705         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
706                 if (rt->rt6i_metric != metric) {
707                         cont = rt;
708                         break;
709                 }
710
711                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
712         }
713
714         if (match || !cont)
715                 return match;
716
717         for (rt = cont; rt; rt = rt->dst.rt6_next)
718                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
719
720         return match;
721 }
722
723 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
724 {
725         struct rt6_info *match, *rt0;
726         struct net *net;
727         bool do_rr = false;
728
729         rt0 = fn->rr_ptr;
730         if (!rt0)
731                 fn->rr_ptr = rt0 = fn->leaf;
732
733         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
734                              &do_rr);
735
736         if (do_rr) {
737                 struct rt6_info *next = rt0->dst.rt6_next;
738
739                 /* no entries matched; do round-robin */
740                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
741                         next = fn->leaf;
742
743                 if (next != rt0)
744                         fn->rr_ptr = next;
745         }
746
747         net = dev_net(rt0->dst.dev);
748         return match ? match : net->ipv6.ip6_null_entry;
749 }
750
751 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
752 {
753         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
754 }
755
756 #ifdef CONFIG_IPV6_ROUTE_INFO
757 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
758                   const struct in6_addr *gwaddr)
759 {
760         struct net *net = dev_net(dev);
761         struct route_info *rinfo = (struct route_info *) opt;
762         struct in6_addr prefix_buf, *prefix;
763         unsigned int pref;
764         unsigned long lifetime;
765         struct rt6_info *rt;
766
767         if (len < sizeof(struct route_info)) {
768                 return -EINVAL;
769         }
770
771         /* Sanity check for prefix_len and length */
772         if (rinfo->length > 3) {
773                 return -EINVAL;
774         } else if (rinfo->prefix_len > 128) {
775                 return -EINVAL;
776         } else if (rinfo->prefix_len > 64) {
777                 if (rinfo->length < 2) {
778                         return -EINVAL;
779                 }
780         } else if (rinfo->prefix_len > 0) {
781                 if (rinfo->length < 1) {
782                         return -EINVAL;
783                 }
784         }
785
786         pref = rinfo->route_pref;
787         if (pref == ICMPV6_ROUTER_PREF_INVALID)
788                 return -EINVAL;
789
790         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
791
792         if (rinfo->length == 3)
793                 prefix = (struct in6_addr *)rinfo->prefix;
794         else {
795                 /* this function is safe */
796                 ipv6_addr_prefix(&prefix_buf,
797                                  (struct in6_addr *)rinfo->prefix,
798                                  rinfo->prefix_len);
799                 prefix = &prefix_buf;
800         }
801
802         if (rinfo->prefix_len == 0)
803                 rt = rt6_get_dflt_router(gwaddr, dev);
804         else
805                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
806                                         gwaddr, dev->ifindex);
807
808         if (rt && !lifetime) {
809                 ip6_del_rt(rt);
810                 rt = NULL;
811         }
812
813         if (!rt && lifetime)
814                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
815                                         pref);
816         else if (rt)
817                 rt->rt6i_flags = RTF_ROUTEINFO |
818                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
819
820         if (rt) {
821                 if (!addrconf_finite_timeout(lifetime))
822                         rt6_clean_expires(rt);
823                 else
824                         rt6_set_expires(rt, jiffies + HZ * lifetime);
825
826                 ip6_rt_put(rt);
827         }
828         return 0;
829 }
830 #endif
831
832 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
833                                         struct in6_addr *saddr)
834 {
835         struct fib6_node *pn;
836         while (1) {
837                 if (fn->fn_flags & RTN_TL_ROOT)
838                         return NULL;
839                 pn = fn->parent;
840                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
841                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
842                 else
843                         fn = pn;
844                 if (fn->fn_flags & RTN_RTINFO)
845                         return fn;
846         }
847 }
848
849 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
850                                              struct fib6_table *table,
851                                              struct flowi6 *fl6, int flags)
852 {
853         struct fib6_node *fn;
854         struct rt6_info *rt;
855
856         read_lock_bh(&table->tb6_lock);
857         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
858 restart:
859         rt = fn->leaf;
860         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
861         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
862                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
863         if (rt == net->ipv6.ip6_null_entry) {
864                 fn = fib6_backtrack(fn, &fl6->saddr);
865                 if (fn)
866                         goto restart;
867         }
868         dst_use(&rt->dst, jiffies);
869         read_unlock_bh(&table->tb6_lock);
870
871         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
872
873         return rt;
874
875 }
876
877 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
878                                     int flags)
879 {
880         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
881 }
882 EXPORT_SYMBOL_GPL(ip6_route_lookup);
883
884 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
885                             const struct in6_addr *saddr, int oif, int strict)
886 {
887         struct flowi6 fl6 = {
888                 .flowi6_oif = oif,
889                 .daddr = *daddr,
890         };
891         struct dst_entry *dst;
892         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
893
894         if (saddr) {
895                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
896                 flags |= RT6_LOOKUP_F_HAS_SADDR;
897         }
898
899         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
900         if (dst->error == 0)
901                 return (struct rt6_info *) dst;
902
903         dst_release(dst);
904
905         return NULL;
906 }
907 EXPORT_SYMBOL(rt6_lookup);
908
909 /* ip6_ins_rt is called with FREE table->tb6_lock.
910    It takes a new route entry; if the addition fails for any reason,
911    the route is freed. In any case, if the caller does not hold a
912    reference to it, it may be destroyed.
913  */
914
915 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
916                         struct mx6_config *mxc)
917 {
918         int err;
919         struct fib6_table *table;
920
921         table = rt->rt6i_table;
922         write_lock_bh(&table->tb6_lock);
923         err = fib6_add(&table->tb6_root, rt, info, mxc);
924         write_unlock_bh(&table->tb6_lock);
925
926         return err;
927 }
928
929 int ip6_ins_rt(struct rt6_info *rt)
930 {
931         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
932         struct mx6_config mxc = { .mx = NULL, };
933
934         return __ip6_ins_rt(rt, &info, &mxc);
935 }
936
937 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
938                                            const struct in6_addr *daddr,
939                                            const struct in6_addr *saddr)
940 {
941         struct rt6_info *rt;
942
943         /*
944          *      Clone the route.
945          */
946
947         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
948                 ort = (struct rt6_info *)ort->dst.from;
949
950         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
951
952         if (!rt)
953                 return NULL;
954
955         ip6_rt_copy_init(rt, ort);
956         rt->rt6i_flags |= RTF_CACHE;
957         rt->rt6i_metric = 0;
958         rt->dst.flags |= DST_HOST;
959         rt->rt6i_dst.addr = *daddr;
960         rt->rt6i_dst.plen = 128;
961
962         if (!rt6_is_gw_or_nonexthop(ort)) {
963                 if (ort->rt6i_dst.plen != 128 &&
964                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
965                         rt->rt6i_flags |= RTF_ANYCAST;
966 #ifdef CONFIG_IPV6_SUBTREES
967                 if (rt->rt6i_src.plen && saddr) {
968                         rt->rt6i_src.addr = *saddr;
969                         rt->rt6i_src.plen = 128;
970                 }
971 #endif
972         }
973
974         return rt;
975 }
976
977 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
978 {
979         struct rt6_info *pcpu_rt;
980
981         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
982                                   rt->dst.dev, rt->dst.flags);
983
984         if (!pcpu_rt)
985                 return NULL;
986         ip6_rt_copy_init(pcpu_rt, rt);
987         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
988         pcpu_rt->rt6i_flags |= RTF_PCPU;
989         return pcpu_rt;
990 }
991
992 /* It should be called with read_lock_bh(&tb6_lock) acquired */
993 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
994 {
995         struct rt6_info *pcpu_rt, **p;
996
997         p = this_cpu_ptr(rt->rt6i_pcpu);
998         pcpu_rt = *p;
999
1000         if (pcpu_rt) {
1001                 dst_hold(&pcpu_rt->dst);
1002                 rt6_dst_from_metrics_check(pcpu_rt);
1003         }
1004         return pcpu_rt;
1005 }
1006
1007 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1008 {
1009         struct fib6_table *table = rt->rt6i_table;
1010         struct rt6_info *pcpu_rt, *prev, **p;
1011
1012         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1013         if (!pcpu_rt) {
1014                 struct net *net = dev_net(rt->dst.dev);
1015
1016                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1017                 return net->ipv6.ip6_null_entry;
1018         }
1019
1020         read_lock_bh(&table->tb6_lock);
1021         if (rt->rt6i_pcpu) {
1022                 p = this_cpu_ptr(rt->rt6i_pcpu);
1023                 prev = cmpxchg(p, NULL, pcpu_rt);
1024                 if (prev) {
1025                         /* If someone did it before us, return prev instead */
1026                         dst_destroy(&pcpu_rt->dst);
1027                         pcpu_rt = prev;
1028                 }
1029         } else {
1030                 /* rt has been removed from the fib6 tree
1031                  * before we have a chance to acquire the read_lock.
1032                  * In this case, don't bother to create a pcpu rt
1033                  * since rt is going away anyway.  The next
1034                  * dst_check() will trigger a re-lookup.
1035                  */
1036                 dst_destroy(&pcpu_rt->dst);
1037                 pcpu_rt = rt;
1038         }
1039         dst_hold(&pcpu_rt->dst);
1040         rt6_dst_from_metrics_check(pcpu_rt);
1041         read_unlock_bh(&table->tb6_lock);
1042         return pcpu_rt;
1043 }
1044
1045 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1046                                int oif, struct flowi6 *fl6, int flags)
1047 {
1048         struct fib6_node *fn, *saved_fn;
1049         struct rt6_info *rt;
1050         int strict = 0;
1051
1052         strict |= flags & RT6_LOOKUP_F_IFACE;
1053         if (net->ipv6.devconf_all->forwarding == 0)
1054                 strict |= RT6_LOOKUP_F_REACHABLE;
1055
1056         read_lock_bh(&table->tb6_lock);
1057
1058         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1059         saved_fn = fn;
1060
1061         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1062                 oif = 0;
1063
1064 redo_rt6_select:
1065         rt = rt6_select(fn, oif, strict);
1066         if (rt->rt6i_nsiblings)
1067                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1068         if (rt == net->ipv6.ip6_null_entry) {
1069                 fn = fib6_backtrack(fn, &fl6->saddr);
1070                 if (fn)
1071                         goto redo_rt6_select;
1072                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1073                         /* also consider unreachable route */
1074                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1075                         fn = saved_fn;
1076                         goto redo_rt6_select;
1077                 }
1078         }
1079
1080
1081         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1082                 dst_use(&rt->dst, jiffies);
1083                 read_unlock_bh(&table->tb6_lock);
1084
1085                 rt6_dst_from_metrics_check(rt);
1086
1087                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1088                 return rt;
1089         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1090                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1091                 /* Create a RTF_CACHE clone which will not be
1092                  * owned by the fib6 tree.  It is for the special case where
1093                  * the daddr in the skb during the neighbor look-up is different
1094                  * from the fl6->daddr used to look up the route here.
1095                  */
1096
1097                 struct rt6_info *uncached_rt;
1098
1099                 dst_use(&rt->dst, jiffies);
1100                 read_unlock_bh(&table->tb6_lock);
1101
1102                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1103                 dst_release(&rt->dst);
1104
1105                 if (uncached_rt)
1106                         rt6_uncached_list_add(uncached_rt);
1107                 else
1108                         uncached_rt = net->ipv6.ip6_null_entry;
1109
1110                 dst_hold(&uncached_rt->dst);
1111
1112                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1113                 return uncached_rt;
1114
1115         } else {
1116                 /* Get a percpu copy */
1117
1118                 struct rt6_info *pcpu_rt;
1119
1120                 rt->dst.lastuse = jiffies;
1121                 rt->dst.__use++;
1122                 pcpu_rt = rt6_get_pcpu_route(rt);
1123
1124                 if (pcpu_rt) {
1125                         read_unlock_bh(&table->tb6_lock);
1126                 } else {
1127                         /* We have to do the read_unlock first
1128                          * because rt6_make_pcpu_route() may trigger
1129                          * ip6_dst_gc() which will take the write_lock.
1130                          */
1131                         dst_hold(&rt->dst);
1132                         read_unlock_bh(&table->tb6_lock);
1133                         pcpu_rt = rt6_make_pcpu_route(rt);
1134                         dst_release(&rt->dst);
1135                 }
1136
1137                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1138                 return pcpu_rt;
1139
1140         }
1141 }
1142 EXPORT_SYMBOL_GPL(ip6_pol_route);
1143
1144 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1145                                             struct flowi6 *fl6, int flags)
1146 {
1147         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1148 }
1149
1150 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1151                                                 struct net_device *dev,
1152                                                 struct flowi6 *fl6, int flags)
1153 {
1154         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1155                 flags |= RT6_LOOKUP_F_IFACE;
1156
1157         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1158 }
1159
1160 void ip6_route_input(struct sk_buff *skb)
1161 {
1162         const struct ipv6hdr *iph = ipv6_hdr(skb);
1163         struct net *net = dev_net(skb->dev);
1164         int flags = RT6_LOOKUP_F_HAS_SADDR;
1165         struct ip_tunnel_info *tun_info;
1166         struct flowi6 fl6 = {
1167                 .flowi6_iif = skb->dev->ifindex,
1168                 .daddr = iph->daddr,
1169                 .saddr = iph->saddr,
1170                 .flowlabel = ip6_flowinfo(iph),
1171                 .flowi6_mark = skb->mark,
1172                 .flowi6_proto = iph->nexthdr,
1173         };
1174
1175         tun_info = skb_tunnel_info(skb);
1176         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1177                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1178         skb_dst_drop(skb);
1179         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1180 }
1181
1182 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1183                                              struct flowi6 *fl6, int flags)
1184 {
1185         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1186 }
1187
1188 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1189                                          struct flowi6 *fl6, int flags)
1190 {
1191         bool any_src;
1192
1193         if (rt6_need_strict(&fl6->daddr)) {
1194                 struct dst_entry *dst;
1195
1196                 dst = l3mdev_link_scope_lookup(net, fl6);
1197                 if (dst)
1198                         return dst;
1199         }
1200
1201         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1202
1203         any_src = ipv6_addr_any(&fl6->saddr);
1204         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1205             (fl6->flowi6_oif && any_src))
1206                 flags |= RT6_LOOKUP_F_IFACE;
1207
1208         if (!any_src)
1209                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1210         else if (sk)
1211                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1212
1213         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1214 }
1215 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1216
1217 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1218 {
1219         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1220         struct dst_entry *new = NULL;
1221
1222         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1223         if (rt) {
1224                 rt6_info_init(rt);
1225
1226                 new = &rt->dst;
1227                 new->__use = 1;
1228                 new->input = dst_discard;
1229                 new->output = dst_discard_out;
1230
1231                 dst_copy_metrics(new, &ort->dst);
1232                 rt->rt6i_idev = ort->rt6i_idev;
1233                 if (rt->rt6i_idev)
1234                         in6_dev_hold(rt->rt6i_idev);
1235
1236                 rt->rt6i_gateway = ort->rt6i_gateway;
1237                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1238                 rt->rt6i_metric = 0;
1239
1240                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1241 #ifdef CONFIG_IPV6_SUBTREES
1242                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1243 #endif
1244
1245                 dst_free(new);
1246         }
1247
1248         dst_release(dst_orig);
1249         return new ? new : ERR_PTR(-ENOMEM);
1250 }
1251
1252 /*
1253  *      Destination cache support functions
1254  */
1255
1256 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1257 {
1258         if (rt->dst.from &&
1259             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1260                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1261 }
1262
1263 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1264 {
1265         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1266                 return NULL;
1267
1268         if (rt6_check_expired(rt))
1269                 return NULL;
1270
1271         return &rt->dst;
1272 }
1273
1274 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1275 {
1276         if (!__rt6_check_expired(rt) &&
1277             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1278             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1279                 return &rt->dst;
1280         else
1281                 return NULL;
1282 }
1283
1284 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1285 {
1286         struct rt6_info *rt;
1287
1288         rt = (struct rt6_info *) dst;
1289
1290         /* All IPv6 dsts are created with ->obsolete set to
1291          * DST_OBSOLETE_FORCE_CHK, which forces validation calls down
1292          * into this function every time.
1293          */
1294
1295         rt6_dst_from_metrics_check(rt);
1296
1297         if (rt->rt6i_flags & RTF_PCPU ||
1298             (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1299                 return rt6_dst_from_check(rt, cookie);
1300         else
1301                 return rt6_check(rt, cookie);
1302 }
1303
1304 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1305 {
1306         struct rt6_info *rt = (struct rt6_info *) dst;
1307
1308         if (rt) {
1309                 if (rt->rt6i_flags & RTF_CACHE) {
1310                         if (rt6_check_expired(rt)) {
1311                                 ip6_del_rt(rt);
1312                                 dst = NULL;
1313                         }
1314                 } else {
1315                         dst_release(dst);
1316                         dst = NULL;
1317                 }
1318         }
1319         return dst;
1320 }
1321
1322 static void ip6_link_failure(struct sk_buff *skb)
1323 {
1324         struct rt6_info *rt;
1325
1326         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1327
1328         rt = (struct rt6_info *) skb_dst(skb);
1329         if (rt) {
1330                 if (rt->rt6i_flags & RTF_CACHE) {
1331                         dst_hold(&rt->dst);
1332                         ip6_del_rt(rt);
1333                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1334                         rt->rt6i_node->fn_sernum = -1;
1335                 }
1336         }
1337 }
1338
1339 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1340 {
1341         struct net *net = dev_net(rt->dst.dev);
1342
1343         rt->rt6i_flags |= RTF_MODIFIED;
1344         rt->rt6i_pmtu = mtu;
1345         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1346 }
1347
1348 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1349 {
1350         return !(rt->rt6i_flags & RTF_CACHE) &&
1351                 (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
1352 }
1353
1354 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1355                                  const struct ipv6hdr *iph, u32 mtu)
1356 {
1357         struct rt6_info *rt6 = (struct rt6_info *)dst;
1358
1359         if (rt6->rt6i_flags & RTF_LOCAL)
1360                 return;
1361
1362         dst_confirm(dst);
1363         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1364         if (mtu >= dst_mtu(dst))
1365                 return;
1366
1367         if (!rt6_cache_allowed_for_pmtu(rt6)) {
1368                 rt6_do_update_pmtu(rt6, mtu);
1369         } else {
1370                 const struct in6_addr *daddr, *saddr;
1371                 struct rt6_info *nrt6;
1372
1373                 if (iph) {
1374                         daddr = &iph->daddr;
1375                         saddr = &iph->saddr;
1376                 } else if (sk) {
1377                         daddr = &sk->sk_v6_daddr;
1378                         saddr = &inet6_sk(sk)->saddr;
1379                 } else {
1380                         return;
1381                 }
1382                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1383                 if (nrt6) {
1384                         rt6_do_update_pmtu(nrt6, mtu);
1385
1386                         /* ip6_ins_rt(nrt6) will bump the
1387                          * rt6->rt6i_node->fn_sernum
1388                          * which will fail the next rt6_check() and
1389                          * invalidate the sk->sk_dst_cache.
1390                          */
1391                         ip6_ins_rt(nrt6);
1392                 }
1393         }
1394 }
1395
1396 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1397                                struct sk_buff *skb, u32 mtu)
1398 {
1399         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1400 }
1401
1402 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1403                      int oif, u32 mark)
1404 {
1405         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1406         struct dst_entry *dst;
1407         struct flowi6 fl6;
1408
1409         memset(&fl6, 0, sizeof(fl6));
1410         fl6.flowi6_oif = oif;
1411         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1412         fl6.daddr = iph->daddr;
1413         fl6.saddr = iph->saddr;
1414         fl6.flowlabel = ip6_flowinfo(iph);
1415
1416         dst = ip6_route_output(net, NULL, &fl6);
1417         if (!dst->error)
1418                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1419         dst_release(dst);
1420 }
1421 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1422
1423 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1424 {
1425         struct dst_entry *dst;
1426
1427         ip6_update_pmtu(skb, sock_net(sk), mtu,
1428                         sk->sk_bound_dev_if, sk->sk_mark);
1429
1430         dst = __sk_dst_get(sk);
1431         if (!dst || !dst->obsolete ||
1432             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1433                 return;
1434
1435         bh_lock_sock(sk);
1436         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1437                 ip6_datagram_dst_update(sk, false);
1438         bh_unlock_sock(sk);
1439 }
1440 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1441
1442 /* Handle redirects */
1443 struct ip6rd_flowi {
1444         struct flowi6 fl6;
1445         struct in6_addr gateway;
1446 };
1447
1448 static struct rt6_info *__ip6_route_redirect(struct net *net,
1449                                              struct fib6_table *table,
1450                                              struct flowi6 *fl6,
1451                                              int flags)
1452 {
1453         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1454         struct rt6_info *rt;
1455         struct fib6_node *fn;
1456
1457         /* Get the "current" route for this destination and
1458          * check if the redirect has come from the appropriate router.
1459          *
1460          * RFC 4861 specifies that redirects should only be
1461          * accepted if they come from the nexthop to the target.
1462          * Due to the way the routes are chosen, this notion
1463          * is a bit fuzzy and one might need to check all possible
1464          * routes.
1465          */
1466
1467         read_lock_bh(&table->tb6_lock);
1468         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1469 restart:
1470         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1471                 if (rt6_check_expired(rt))
1472                         continue;
1473                 if (rt->dst.error)
1474                         break;
1475                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1476                         continue;
1477                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1478                         continue;
1479                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1480                         continue;
1481                 break;
1482         }
1483
1484         if (!rt)
1485                 rt = net->ipv6.ip6_null_entry;
1486         else if (rt->dst.error) {
1487                 rt = net->ipv6.ip6_null_entry;
1488                 goto out;
1489         }
1490
1491         if (rt == net->ipv6.ip6_null_entry) {
1492                 fn = fib6_backtrack(fn, &fl6->saddr);
1493                 if (fn)
1494                         goto restart;
1495         }
1496
1497 out:
1498         dst_hold(&rt->dst);
1499
1500         read_unlock_bh(&table->tb6_lock);
1501
1502         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1503         return rt;
1504 }
1505
1506 static struct dst_entry *ip6_route_redirect(struct net *net,
1507                                         const struct flowi6 *fl6,
1508                                         const struct in6_addr *gateway)
1509 {
1510         int flags = RT6_LOOKUP_F_HAS_SADDR;
1511         struct ip6rd_flowi rdfl;
1512
1513         rdfl.fl6 = *fl6;
1514         rdfl.gateway = *gateway;
1515
1516         return fib6_rule_lookup(net, &rdfl.fl6,
1517                                 flags, __ip6_route_redirect);
1518 }
1519
1520 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1521 {
1522         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1523         struct dst_entry *dst;
1524         struct flowi6 fl6;
1525
1526         memset(&fl6, 0, sizeof(fl6));
1527         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1528         fl6.flowi6_oif = oif;
1529         fl6.flowi6_mark = mark;
1530         fl6.daddr = iph->daddr;
1531         fl6.saddr = iph->saddr;
1532         fl6.flowlabel = ip6_flowinfo(iph);
1533
1534         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1535         rt6_do_redirect(dst, NULL, skb);
1536         dst_release(dst);
1537 }
1538 EXPORT_SYMBOL_GPL(ip6_redirect);
1539
1540 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1541                             u32 mark)
1542 {
1543         const struct ipv6hdr *iph = ipv6_hdr(skb);
1544         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1545         struct dst_entry *dst;
1546         struct flowi6 fl6;
1547
1548         memset(&fl6, 0, sizeof(fl6));
1549         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1550         fl6.flowi6_oif = oif;
1551         fl6.flowi6_mark = mark;
1552         fl6.daddr = msg->dest;
1553         fl6.saddr = iph->daddr;
1554
1555         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1556         rt6_do_redirect(dst, NULL, skb);
1557         dst_release(dst);
1558 }
1559
1560 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1561 {
1562         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1563 }
1564 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1565
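/* Default advertised MSS for this dst: the path MTU minus the IPv6 and TCP
 * header sizes, floored at the ip6_rt_min_advmss sysctl and capped as
 * described in the comment below.
 */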
1566 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1567 {
1568         struct net_device *dev = dst->dev;
1569         unsigned int mtu = dst_mtu(dst);
1570         struct net *net = dev_net(dev);
1571
1572         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1573
1574         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1575                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1576
1577         /*
1578          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1579          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1580          * IPV6_MAXPLEN is also valid and means: "any MSS,
1581          * rely only on pmtu discovery"
1582          */
1583         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1584                 mtu = IPV6_MAXPLEN;
1585         return mtu;
1586 }
1587
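/* Effective MTU for this dst: the per-route PMTU if one is cached, else the
 * RTAX_MTU metric, else the device's IPv6 MTU; capped at IP6_MAX_MTU and
 * reduced by any lwtunnel encapsulation headroom.
 */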
1588 static unsigned int ip6_mtu(const struct dst_entry *dst)
1589 {
1590         const struct rt6_info *rt = (const struct rt6_info *)dst;
1591         unsigned int mtu = rt->rt6i_pmtu;
1592         struct inet6_dev *idev;
1593
1594         if (mtu)
1595                 goto out;
1596
1597         mtu = dst_metric_raw(dst, RTAX_MTU);
1598         if (mtu)
1599                 goto out;
1600
1601         mtu = IPV6_MIN_MTU;
1602
1603         rcu_read_lock();
1604         idev = __in6_dev_get(dst->dev);
1605         if (idev)
1606                 mtu = idev->cnf.mtu6;
1607         rcu_read_unlock();
1608
1609 out:
1610         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1611
1612         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1613 }
1614
1615 static struct dst_entry *icmp6_dst_gc_list;
1616 static DEFINE_SPINLOCK(icmp6_dst_lock);
1617
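/* Allocate a dst for an outgoing ICMPv6 packet.  The route is never inserted
 * into the FIB; it is chained on icmp6_dst_gc_list and reclaimed by
 * icmp6_dst_gc() once its refcount drops to zero.
 */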
1618 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1619                                   struct flowi6 *fl6)
1620 {
1621         struct dst_entry *dst;
1622         struct rt6_info *rt;
1623         struct inet6_dev *idev = in6_dev_get(dev);
1624         struct net *net = dev_net(dev);
1625
1626         if (unlikely(!idev))
1627                 return ERR_PTR(-ENODEV);
1628
1629         rt = ip6_dst_alloc(net, dev, 0);
1630         if (unlikely(!rt)) {
1631                 in6_dev_put(idev);
1632                 dst = ERR_PTR(-ENOMEM);
1633                 goto out;
1634         }
1635
1636         rt->dst.flags |= DST_HOST;
1637         rt->dst.output  = ip6_output;
1638         atomic_set(&rt->dst.__refcnt, 1);
1639         rt->rt6i_gateway  = fl6->daddr;
1640         rt->rt6i_dst.addr = fl6->daddr;
1641         rt->rt6i_dst.plen = 128;
1642         rt->rt6i_idev     = idev;
1643         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1644
1645         spin_lock_bh(&icmp6_dst_lock);
1646         rt->dst.next = icmp6_dst_gc_list;
1647         icmp6_dst_gc_list = &rt->dst;
1648         spin_unlock_bh(&icmp6_dst_lock);
1649
1650         fib6_force_start_gc(net);
1651
1652         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1653
1654 out:
1655         return dst;
1656 }
1657
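/* Walk icmp6_dst_gc_list and free entries whose refcount has dropped to zero.
 * Returns nonzero if any entries are still in use.
 */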
1658 int icmp6_dst_gc(void)
1659 {
1660         struct dst_entry *dst, **pprev;
1661         int more = 0;
1662
1663         spin_lock_bh(&icmp6_dst_lock);
1664         pprev = &icmp6_dst_gc_list;
1665
1666         while ((dst = *pprev) != NULL) {
1667                 if (!atomic_read(&dst->__refcnt)) {
1668                         *pprev = dst->next;
1669                         dst_free(dst);
1670                 } else {
1671                         pprev = &dst->next;
1672                         ++more;
1673                 }
1674         }
1675
1676         spin_unlock_bh(&icmp6_dst_lock);
1677
1678         return more;
1679 }
1680
1681 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1682                             void *arg)
1683 {
1684         struct dst_entry *dst, **pprev;
1685
1686         spin_lock_bh(&icmp6_dst_lock);
1687         pprev = &icmp6_dst_gc_list;
1688         while ((dst = *pprev) != NULL) {
1689                 struct rt6_info *rt = (struct rt6_info *) dst;
1690                 if (func(rt, arg)) {
1691                         *pprev = dst->next;
1692                         dst_free(dst);
1693                 } else {
1694                         pprev = &dst->next;
1695                 }
1696         }
1697         spin_unlock_bh(&icmp6_dst_lock);
1698 }
1699
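/* dst_ops garbage collector.  GC is skipped while ip6_rt_gc_min_interval has
 * not elapsed and the entry count is within ip6_rt_max_size; otherwise
 * fib6_run_gc() is run with an expire value that decays according to
 * ip6_rt_gc_elasticity.
 */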
1700 static int ip6_dst_gc(struct dst_ops *ops)
1701 {
1702         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1703         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1704         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1705         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1706         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1707         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1708         int entries;
1709
1710         entries = dst_entries_get_fast(ops);
1711         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1712             entries <= rt_max_size)
1713                 goto out;
1714
1715         net->ipv6.ip6_rt_gc_expire++;
1716         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1717         entries = dst_entries_get_slow(ops);
1718         if (entries < ops->gc_thresh)
1719                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1720 out:
1721         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1722         return entries > rt_max_size;
1723 }
1724
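/* Convert the RTA_METRICS attributes in cfg->fc_mx into the metrics array of
 * an mx6_config.  RTAX_CC_ALGO is resolved by congestion-control name,
 * RTAX_HOPLIMIT is clamped to 255, and unknown metrics fail with -EINVAL.
 */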
1725 static int ip6_convert_metrics(struct mx6_config *mxc,
1726                                const struct fib6_config *cfg)
1727 {
1728         bool ecn_ca = false;
1729         struct nlattr *nla;
1730         int remaining;
1731         u32 *mp;
1732
1733         if (!cfg->fc_mx)
1734                 return 0;
1735
1736         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1737         if (unlikely(!mp))
1738                 return -ENOMEM;
1739
1740         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1741                 int type = nla_type(nla);
1742                 u32 val;
1743
1744                 if (!type)
1745                         continue;
1746                 if (unlikely(type > RTAX_MAX))
1747                         goto err;
1748
1749                 if (type == RTAX_CC_ALGO) {
1750                         char tmp[TCP_CA_NAME_MAX];
1751
1752                         nla_strlcpy(tmp, nla, sizeof(tmp));
1753                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1754                         if (val == TCP_CA_UNSPEC)
1755                                 goto err;
1756                 } else {
1757                         val = nla_get_u32(nla);
1758                 }
1759                 if (type == RTAX_HOPLIMIT && val > 255)
1760                         val = 255;
1761                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1762                         goto err;
1763
1764                 mp[type - 1] = val;
1765                 __set_bit(type - 1, mxc->mx_valid);
1766         }
1767
1768         if (ecn_ca) {
1769                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1770                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1771         }
1772
1773         mxc->mx = mp;
1774         return 0;
1775  err:
1776         kfree(mp);
1777         return -EINVAL;
1778 }
1779
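/* Look up the gateway of a new route in the table given by cfg->fc_table.
 * Returns NULL if the lookup falls through to the null entry, so the caller
 * can fall back to a full rt6_lookup() across all tables.
 */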
1780 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1781                                             struct fib6_config *cfg,
1782                                             const struct in6_addr *gw_addr)
1783 {
1784         struct flowi6 fl6 = {
1785                 .flowi6_oif = cfg->fc_ifindex,
1786                 .daddr = *gw_addr,
1787                 .saddr = cfg->fc_prefsrc,
1788         };
1789         struct fib6_table *table;
1790         struct rt6_info *rt;
1791         int flags = RT6_LOOKUP_F_IFACE;
1792
1793         table = fib6_get_table(net, cfg->fc_table);
1794         if (!table)
1795                 return NULL;
1796
1797         if (!ipv6_addr_any(&cfg->fc_prefsrc))
1798                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1799
1800         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1801
1802         /* if table lookup failed, fall back to full lookup */
1803         if (rt == net->ipv6.ip6_null_entry) {
1804                 ip6_rt_put(rt);
1805                 rt = NULL;
1806         }
1807
1808         return rt;
1809 }
1810
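/* Build an rt6_info from a fib6_config: validate the prefix lengths, resolve
 * the output device and gateway, and set up lwtunnel state and the
 * input/output handlers.  The route is returned without being inserted into
 * the FIB; ip6_route_add() does the insertion.
 */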
1811 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
1812 {
1813         struct net *net = cfg->fc_nlinfo.nl_net;
1814         struct rt6_info *rt = NULL;
1815         struct net_device *dev = NULL;
1816         struct inet6_dev *idev = NULL;
1817         struct fib6_table *table;
1818         int addr_type;
1819         int err = -EINVAL;
1820
1821         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1822                 goto out;
1823 #ifndef CONFIG_IPV6_SUBTREES
1824         if (cfg->fc_src_len)
1825                 goto out;
1826 #endif
1827         if (cfg->fc_ifindex) {
1828                 err = -ENODEV;
1829                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1830                 if (!dev)
1831                         goto out;
1832                 idev = in6_dev_get(dev);
1833                 if (!idev)
1834                         goto out;
1835         }
1836
1837         if (cfg->fc_metric == 0)
1838                 cfg->fc_metric = IP6_RT_PRIO_USER;
1839
1840         err = -ENOBUFS;
1841         if (cfg->fc_nlinfo.nlh &&
1842             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1843                 table = fib6_get_table(net, cfg->fc_table);
1844                 if (!table) {
1845                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1846                         table = fib6_new_table(net, cfg->fc_table);
1847                 }
1848         } else {
1849                 table = fib6_new_table(net, cfg->fc_table);
1850         }
1851
1852         if (!table)
1853                 goto out;
1854
1855         rt = ip6_dst_alloc(net, NULL,
1856                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1857
1858         if (!rt) {
1859                 err = -ENOMEM;
1860                 goto out;
1861         }
1862
1863         if (cfg->fc_flags & RTF_EXPIRES)
1864                 rt6_set_expires(rt, jiffies +
1865                                 clock_t_to_jiffies(cfg->fc_expires));
1866         else
1867                 rt6_clean_expires(rt);
1868
1869         if (cfg->fc_protocol == RTPROT_UNSPEC)
1870                 cfg->fc_protocol = RTPROT_BOOT;
1871         rt->rt6i_protocol = cfg->fc_protocol;
1872
1873         addr_type = ipv6_addr_type(&cfg->fc_dst);
1874
1875         if (addr_type & IPV6_ADDR_MULTICAST)
1876                 rt->dst.input = ip6_mc_input;
1877         else if (cfg->fc_flags & RTF_LOCAL)
1878                 rt->dst.input = ip6_input;
1879         else
1880                 rt->dst.input = ip6_forward;
1881
1882         rt->dst.output = ip6_output;
1883
1884         if (cfg->fc_encap) {
1885                 struct lwtunnel_state *lwtstate;
1886
1887                 err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1888                                            cfg->fc_encap, AF_INET6, cfg,
1889                                            &lwtstate);
1890                 if (err)
1891                         goto out;
1892                 rt->dst.lwtstate = lwtstate_get(lwtstate);
1893                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1894                         rt->dst.lwtstate->orig_output = rt->dst.output;
1895                         rt->dst.output = lwtunnel_output;
1896                 }
1897                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1898                         rt->dst.lwtstate->orig_input = rt->dst.input;
1899                         rt->dst.input = lwtunnel_input;
1900                 }
1901         }
1902
1903         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1904         rt->rt6i_dst.plen = cfg->fc_dst_len;
1905         if (rt->rt6i_dst.plen == 128)
1906                 rt->dst.flags |= DST_HOST;
1907
1908 #ifdef CONFIG_IPV6_SUBTREES
1909         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1910         rt->rt6i_src.plen = cfg->fc_src_len;
1911 #endif
1912
1913         rt->rt6i_metric = cfg->fc_metric;
1914
1915         /* We cannot add true routes via loopback here;
1916            they would result in kernel looping, so promote them to reject routes
1917          */
1918         if ((cfg->fc_flags & RTF_REJECT) ||
1919             (dev && (dev->flags & IFF_LOOPBACK) &&
1920              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1921              !(cfg->fc_flags & RTF_LOCAL))) {
1922                 /* hold loopback dev/idev if we haven't done so. */
1923                 if (dev != net->loopback_dev) {
1924                         if (dev) {
1925                                 dev_put(dev);
1926                                 in6_dev_put(idev);
1927                         }
1928                         dev = net->loopback_dev;
1929                         dev_hold(dev);
1930                         idev = in6_dev_get(dev);
1931                         if (!idev) {
1932                                 err = -ENODEV;
1933                                 goto out;
1934                         }
1935                 }
1936                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1937                 switch (cfg->fc_type) {
1938                 case RTN_BLACKHOLE:
1939                         rt->dst.error = -EINVAL;
1940                         rt->dst.output = dst_discard_out;
1941                         rt->dst.input = dst_discard;
1942                         break;
1943                 case RTN_PROHIBIT:
1944                         rt->dst.error = -EACCES;
1945                         rt->dst.output = ip6_pkt_prohibit_out;
1946                         rt->dst.input = ip6_pkt_prohibit;
1947                         break;
1948                 case RTN_THROW:
1949                 case RTN_UNREACHABLE:
1950                 default:
1951                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1952                                         : (cfg->fc_type == RTN_UNREACHABLE)
1953                                         ? -EHOSTUNREACH : -ENETUNREACH;
1954                         rt->dst.output = ip6_pkt_discard_out;
1955                         rt->dst.input = ip6_pkt_discard;
1956                         break;
1957                 }
1958                 goto install_route;
1959         }
1960
1961         if (cfg->fc_flags & RTF_GATEWAY) {
1962                 const struct in6_addr *gw_addr;
1963                 int gwa_type;
1964
1965                 gw_addr = &cfg->fc_gateway;
1966                 gwa_type = ipv6_addr_type(gw_addr);
1967
1968                 /* if gw_addr is local we will fail to detect this in case the
1969                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
1970                  * will return the already-added prefix route via the interface
1971                  * that the prefix route was assigned to, which might be non-loopback.
1972                  */
1973                 err = -EINVAL;
1974                 if (ipv6_chk_addr_and_flags(net, gw_addr,
1975                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
1976                                             dev : NULL, 0, 0))
1977                         goto out;
1978
1979                 rt->rt6i_gateway = *gw_addr;
1980
1981                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1982                         struct rt6_info *grt = NULL;
1983
1984                         /* IPv6 strictly forbids using non-link-local
1985                            addresses as a nexthop address.
1986                            Otherwise, the router would not be able to send redirects.
1987                            That is generally good, but in some (rare!) circumstances
1988                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1989                            some exceptions. --ANK
1990                          */
1991                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1992                                 goto out;
1993
1994                         if (cfg->fc_table)
1995                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
1996
1997                         if (!grt)
1998                                 grt = rt6_lookup(net, gw_addr, NULL,
1999                                                  cfg->fc_ifindex, 1);
2000
2001                         err = -EHOSTUNREACH;
2002                         if (!grt)
2003                                 goto out;
2004                         if (dev) {
2005                                 if (dev != grt->dst.dev) {
2006                                         ip6_rt_put(grt);
2007                                         goto out;
2008                                 }
2009                         } else {
2010                                 dev = grt->dst.dev;
2011                                 idev = grt->rt6i_idev;
2012                                 dev_hold(dev);
2013                                 in6_dev_hold(grt->rt6i_idev);
2014                         }
2015                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2016                                 err = 0;
2017                         ip6_rt_put(grt);
2018
2019                         if (err)
2020                                 goto out;
2021                 }
2022                 err = -EINVAL;
2023                 if (!dev || (dev->flags & IFF_LOOPBACK))
2024                         goto out;
2025         }
2026
2027         err = -ENODEV;
2028         if (!dev)
2029                 goto out;
2030
2031         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2032                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2033                         err = -EINVAL;
2034                         goto out;
2035                 }
2036                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2037                 rt->rt6i_prefsrc.plen = 128;
2038         } else
2039                 rt->rt6i_prefsrc.plen = 0;
2040
2041         rt->rt6i_flags = cfg->fc_flags;
2042
2043 install_route:
2044         rt->dst.dev = dev;
2045         rt->rt6i_idev = idev;
2046         rt->rt6i_table = table;
2047
2048         cfg->fc_nlinfo.nl_net = dev_net(dev);
2049
2050         return rt;
2051 out:
2052         if (dev)
2053                 dev_put(dev);
2054         if (idev)
2055                 in6_dev_put(idev);
2056         if (rt)
2057                 dst_free(&rt->dst);
2058
2059         return ERR_PTR(err);
2060 }
2061
2062 int ip6_route_add(struct fib6_config *cfg)
2063 {
2064         struct mx6_config mxc = { .mx = NULL, };
2065         struct rt6_info *rt;
2066         int err;
2067
2068         rt = ip6_route_info_create(cfg);
2069         if (IS_ERR(rt)) {
2070                 err = PTR_ERR(rt);
2071                 rt = NULL;
2072                 goto out;
2073         }
2074
2075         err = ip6_convert_metrics(&mxc, cfg);
2076         if (err)
2077                 goto out;
2078
2079         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
2080
2081         kfree(mxc.mx);
2082
2083         return err;
2084 out:
2085         if (rt)
2086                 dst_free(&rt->dst);
2087
2088         return err;
2089 }
2090
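/* Unlink a route from its FIB table under the table write lock.  The null
 * entry and uncached (DST_NOCACHE) routes are refused with -ENOENT.
 */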
2091 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2092 {
2093         int err;
2094         struct fib6_table *table;
2095         struct net *net = dev_net(rt->dst.dev);
2096
2097         if (rt == net->ipv6.ip6_null_entry ||
2098             rt->dst.flags & DST_NOCACHE) {
2099                 err = -ENOENT;
2100                 goto out;
2101         }
2102
2103         table = rt->rt6i_table;
2104         write_lock_bh(&table->tb6_lock);
2105         err = fib6_del(rt, info);
2106         write_unlock_bh(&table->tb6_lock);
2107
2108 out:
2109         ip6_rt_put(rt);
2110         return err;
2111 }
2112
2113 int ip6_del_rt(struct rt6_info *rt)
2114 {
2115         struct nl_info info = {
2116                 .nl_net = dev_net(rt->dst.dev),
2117         };
2118         return __ip6_del_rt(rt, &info);
2119 }
2120
2121 static int ip6_route_del(struct fib6_config *cfg)
2122 {
2123         struct fib6_table *table;
2124         struct fib6_node *fn;
2125         struct rt6_info *rt;
2126         int err = -ESRCH;
2127
2128         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2129         if (!table)
2130                 return err;
2131
2132         read_lock_bh(&table->tb6_lock);
2133
2134         fn = fib6_locate(&table->tb6_root,
2135                          &cfg->fc_dst, cfg->fc_dst_len,
2136                          &cfg->fc_src, cfg->fc_src_len);
2137
2138         if (fn) {
2139                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2140                         if ((rt->rt6i_flags & RTF_CACHE) &&
2141                             !(cfg->fc_flags & RTF_CACHE))
2142                                 continue;
2143                         if (cfg->fc_ifindex &&
2144                             (!rt->dst.dev ||
2145                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2146                                 continue;
2147                         if (cfg->fc_flags & RTF_GATEWAY &&
2148                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2149                                 continue;
2150                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2151                                 continue;
2152                         dst_hold(&rt->dst);
2153                         read_unlock_bh(&table->tb6_lock);
2154
2155                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2156                 }
2157         }
2158         read_unlock_bh(&table->tb6_lock);
2159
2160         return err;
2161 }
2162
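/* Handle a received ICMPv6 Redirect: validate the message and its ND options,
 * update the neighbour entry for the target, clone a cached route to the
 * destination via the new nexthop and insert it, then notify netevent
 * listeners of the change.
 */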
2163 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2164 {
2165         struct netevent_redirect netevent;
2166         struct rt6_info *rt, *nrt = NULL;
2167         struct ndisc_options ndopts;
2168         struct inet6_dev *in6_dev;
2169         struct neighbour *neigh;
2170         struct rd_msg *msg;
2171         int optlen, on_link;
2172         u8 *lladdr;
2173
2174         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2175         optlen -= sizeof(*msg);
2176
2177         if (optlen < 0) {
2178                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2179                 return;
2180         }
2181
2182         msg = (struct rd_msg *)icmp6_hdr(skb);
2183
2184         if (ipv6_addr_is_multicast(&msg->dest)) {
2185                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2186                 return;
2187         }
2188
2189         on_link = 0;
2190         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2191                 on_link = 1;
2192         } else if (ipv6_addr_type(&msg->target) !=
2193                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2194                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2195                 return;
2196         }
2197
2198         in6_dev = __in6_dev_get(skb->dev);
2199         if (!in6_dev)
2200                 return;
2201         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2202                 return;
2203
2204         /* RFC2461 8.1:
2205          *      The IP source address of the Redirect MUST be the same as the current
2206          *      first-hop router for the specified ICMP Destination Address.
2207          */
2208
2209         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2210                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2211                 return;
2212         }
2213
2214         lladdr = NULL;
2215         if (ndopts.nd_opts_tgt_lladdr) {
2216                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2217                                              skb->dev);
2218                 if (!lladdr) {
2219                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2220                         return;
2221                 }
2222         }
2223
2224         rt = (struct rt6_info *) dst;
2225         if (rt->rt6i_flags & RTF_REJECT) {
2226                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2227                 return;
2228         }
2229
2230         /* Redirect received -> path was valid.
2231          * Redirects are sent only in response to data packets,
2232          * so this nexthop is apparently reachable. --ANK
2233          */
2234         dst_confirm(&rt->dst);
2235
2236         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2237         if (!neigh)
2238                 return;
2239
2240         /*
2241          *      We have finally decided to accept it.
2242          */
2243
2244         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2245                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2246                      NEIGH_UPDATE_F_OVERRIDE|
2247                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2248                                      NEIGH_UPDATE_F_ISROUTER)),
2249                      NDISC_REDIRECT, &ndopts);
2250
2251         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2252         if (!nrt)
2253                 goto out;
2254
2255         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2256         if (on_link)
2257                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2258
2259         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2260
2261         if (ip6_ins_rt(nrt))
2262                 goto out;
2263
2264         netevent.old = &rt->dst;
2265         netevent.new = &nrt->dst;
2266         netevent.daddr = &msg->dest;
2267         netevent.neigh = neigh;
2268         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2269
2270         if (rt->rt6i_flags & RTF_CACHE) {
2271                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2272                 ip6_del_rt(rt);
2273         }
2274
2275 out:
2276         neigh_release(neigh);
2277 }
2278
2279 /*
2280  *      Misc support functions
2281  */
2282
2283 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2284 {
2285         BUG_ON(from->dst.from);
2286
2287         rt->rt6i_flags &= ~RTF_EXPIRES;
2288         dst_hold(&from->dst);
2289         rt->dst.from = &from->dst;
2290         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2291 }
2292
2293 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2294 {
2295         rt->dst.input = ort->dst.input;
2296         rt->dst.output = ort->dst.output;
2297         rt->rt6i_dst = ort->rt6i_dst;
2298         rt->dst.error = ort->dst.error;
2299         rt->rt6i_idev = ort->rt6i_idev;
2300         if (rt->rt6i_idev)
2301                 in6_dev_hold(rt->rt6i_idev);
2302         rt->dst.lastuse = jiffies;
2303         rt->rt6i_gateway = ort->rt6i_gateway;
2304         rt->rt6i_flags = ort->rt6i_flags;
2305         rt6_set_from(rt, ort);
2306         rt->rt6i_metric = ort->rt6i_metric;
2307 #ifdef CONFIG_IPV6_SUBTREES
2308         rt->rt6i_src = ort->rt6i_src;
2309 #endif
2310         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2311         rt->rt6i_table = ort->rt6i_table;
2312         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2313 }
2314
2315 #ifdef CONFIG_IPV6_ROUTE_INFO
2316 static struct rt6_info *rt6_get_route_info(struct net *net,
2317                                            const struct in6_addr *prefix, int prefixlen,
2318                                            const struct in6_addr *gwaddr, int ifindex)
2319 {
2320         struct fib6_node *fn;
2321         struct rt6_info *rt = NULL;
2322         struct fib6_table *table;
2323
2324         table = fib6_get_table(net, RT6_TABLE_INFO);
2325         if (!table)
2326                 return NULL;
2327
2328         read_lock_bh(&table->tb6_lock);
2329         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2330         if (!fn)
2331                 goto out;
2332
2333         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2334                 if (rt->dst.dev->ifindex != ifindex)
2335                         continue;
2336                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2337                         continue;
2338                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2339                         continue;
2340                 dst_hold(&rt->dst);
2341                 break;
2342         }
2343 out:
2344         read_unlock_bh(&table->tb6_lock);
2345         return rt;
2346 }
2347
2348 static struct rt6_info *rt6_add_route_info(struct net *net,
2349                                            const struct in6_addr *prefix, int prefixlen,
2350                                            const struct in6_addr *gwaddr, int ifindex,
2351                                            unsigned int pref)
2352 {
2353         struct fib6_config cfg = {
2354                 .fc_metric      = IP6_RT_PRIO_USER,
2355                 .fc_ifindex     = ifindex,
2356                 .fc_dst_len     = prefixlen,
2357                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2358                                   RTF_UP | RTF_PREF(pref),
2359                 .fc_nlinfo.portid = 0,
2360                 .fc_nlinfo.nlh = NULL,
2361                 .fc_nlinfo.nl_net = net,
2362         };
2363
2364         cfg.fc_table = l3mdev_fib_table_by_index(net, ifindex) ? : RT6_TABLE_INFO;
2365         cfg.fc_dst = *prefix;
2366         cfg.fc_gateway = *gwaddr;
2367
2368         /* We should treat it as a default route if prefix length is 0. */
2369         if (!prefixlen)
2370                 cfg.fc_flags |= RTF_DEFAULT;
2371
2372         ip6_route_add(&cfg);
2373
2374         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2375 }
2376 #endif
2377
2378 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2379 {
2380         struct rt6_info *rt;
2381         struct fib6_table *table;
2382
2383         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2384         if (!table)
2385                 return NULL;
2386
2387         read_lock_bh(&table->tb6_lock);
2388         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2389                 if (dev == rt->dst.dev &&
2390                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2391                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2392                         break;
2393         }
2394         if (rt)
2395                 dst_hold(&rt->dst);
2396         read_unlock_bh(&table->tb6_lock);
2397         return rt;
2398 }
2399
2400 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2401                                      struct net_device *dev,
2402                                      unsigned int pref)
2403 {
2404         struct fib6_config cfg = {
2405                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2406                 .fc_metric      = IP6_RT_PRIO_USER,
2407                 .fc_ifindex     = dev->ifindex,
2408                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2409                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2410                 .fc_nlinfo.portid = 0,
2411                 .fc_nlinfo.nlh = NULL,
2412                 .fc_nlinfo.nl_net = dev_net(dev),
2413         };
2414
2415         cfg.fc_gateway = *gwaddr;
2416
2417         ip6_route_add(&cfg);
2418
2419         return rt6_get_dflt_router(gwaddr, dev);
2420 }
2421
2422 void rt6_purge_dflt_routers(struct net *net)
2423 {
2424         struct rt6_info *rt;
2425         struct fib6_table *table;
2426
2427         /* NOTE: Keep consistent with rt6_get_dflt_router */
2428         table = fib6_get_table(net, RT6_TABLE_DFLT);
2429         if (!table)
2430                 return;
2431
2432 restart:
2433         read_lock_bh(&table->tb6_lock);
2434         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2435                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2436                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2437                         dst_hold(&rt->dst);
2438                         read_unlock_bh(&table->tb6_lock);
2439                         ip6_del_rt(rt);
2440                         goto restart;
2441                 }
2442         }
2443         read_unlock_bh(&table->tb6_lock);
2444 }
2445
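/* Translate a legacy SIOCADDRT/SIOCDELRT in6_rtmsg into a fib6_config,
 * using the interface's l3mdev table if it has one, else RT6_TABLE_MAIN.
 */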
2446 static void rtmsg_to_fib6_config(struct net *net,
2447                                  struct in6_rtmsg *rtmsg,
2448                                  struct fib6_config *cfg)
2449 {
2450         memset(cfg, 0, sizeof(*cfg));
2451
2452         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2453                          : RT6_TABLE_MAIN;
2454         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2455         cfg->fc_metric = rtmsg->rtmsg_metric;
2456         cfg->fc_expires = rtmsg->rtmsg_info;
2457         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2458         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2459         cfg->fc_flags = rtmsg->rtmsg_flags;
2460
2461         cfg->fc_nlinfo.nl_net = net;
2462
2463         cfg->fc_dst = rtmsg->rtmsg_dst;
2464         cfg->fc_src = rtmsg->rtmsg_src;
2465         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2466 }
2467
2468 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2469 {
2470         struct fib6_config cfg;
2471         struct in6_rtmsg rtmsg;
2472         int err;
2473
2474         switch (cmd) {
2475         case SIOCADDRT:         /* Add a route */
2476         case SIOCDELRT:         /* Delete a route */
2477                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2478                         return -EPERM;
2479                 err = copy_from_user(&rtmsg, arg,
2480                                      sizeof(struct in6_rtmsg));
2481                 if (err)
2482                         return -EFAULT;
2483
2484                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2485
2486                 rtnl_lock();
2487                 switch (cmd) {
2488                 case SIOCADDRT:
2489                         err = ip6_route_add(&cfg);
2490                         break;
2491                 case SIOCDELRT:
2492                         err = ip6_route_del(&cfg);
2493                         break;
2494                 default:
2495                         err = -EINVAL;
2496                 }
2497                 rtnl_unlock();
2498
2499                 return err;
2500         }
2501
2502         return -EINVAL;
2503 }
2504
2505 /*
2506  *      Drop the packet on the floor
2507  */
2508
2509 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2510 {
2511         int type;
2512         struct dst_entry *dst = skb_dst(skb);
2513         switch (ipstats_mib_noroutes) {
2514         case IPSTATS_MIB_INNOROUTES:
2515                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2516                 if (type == IPV6_ADDR_ANY) {
2517                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2518                                       IPSTATS_MIB_INADDRERRORS);
2519                         break;
2520                 }
2521                 /* FALLTHROUGH */
2522         case IPSTATS_MIB_OUTNOROUTES:
2523                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2524                               ipstats_mib_noroutes);
2525                 break;
2526         }
2527         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2528         kfree_skb(skb);
2529         return 0;
2530 }
2531
2532 static int ip6_pkt_discard(struct sk_buff *skb)
2533 {
2534         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2535 }
2536
2537 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2538 {
2539         skb->dev = skb_dst(skb)->dev;
2540         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2541 }
2542
2543 static int ip6_pkt_prohibit(struct sk_buff *skb)
2544 {
2545         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2546 }
2547
2548 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2549 {
2550         skb->dev = skb_dst(skb)->dev;
2551         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2552 }
2553
2554 /*
2555  *      Allocate a dst for local (unicast / anycast) address.
2556  */
2557
2558 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2559                                     const struct in6_addr *addr,
2560                                     bool anycast)
2561 {
2562         u32 tb_id;
2563         struct net *net = dev_net(idev->dev);
2564         struct net_device *dev = net->loopback_dev;
2565         struct rt6_info *rt;
2566
2567         /* use the L3 master device as the loopback for host routes if the
2568          * device is enslaved and the address is not link-local or multicast
2569          */
2570         if (!rt6_need_strict(addr))
2571                 dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2572
2573         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2574         if (!rt)
2575                 return ERR_PTR(-ENOMEM);
2576
2577         in6_dev_hold(idev);
2578
2579         rt->dst.flags |= DST_HOST;
2580         rt->dst.input = ip6_input;
2581         rt->dst.output = ip6_output;
2582         rt->rt6i_idev = idev;
2583
2584         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2585         if (anycast)
2586                 rt->rt6i_flags |= RTF_ANYCAST;
2587         else
2588                 rt->rt6i_flags |= RTF_LOCAL;
2589
2590         rt->rt6i_gateway  = *addr;
2591         rt->rt6i_dst.addr = *addr;
2592         rt->rt6i_dst.plen = 128;
2593         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2594         rt->rt6i_table = fib6_get_table(net, tb_id);
2595         rt->dst.flags |= DST_NOCACHE;
2596
2597         atomic_set(&rt->dst.__refcnt, 1);
2598
2599         return rt;
2600 }
2601
2602 /* remove a deleted IP address from prefsrc entries */
2603 struct arg_dev_net_ip {
2604         struct net_device *dev;
2605         struct net *net;
2606         struct in6_addr *addr;
2607 };
2608
2609 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2610 {
2611         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2612         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2613         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2614
2615         if (((void *)rt->dst.dev == dev || !dev) &&
2616             rt != net->ipv6.ip6_null_entry &&
2617             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2618                 /* remove prefsrc entry */
2619                 rt->rt6i_prefsrc.plen = 0;
2620         }
2621         return 0;
2622 }
2623
2624 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2625 {
2626         struct net *net = dev_net(ifp->idev->dev);
2627         struct arg_dev_net_ip adni = {
2628                 .dev = ifp->idev->dev,
2629                 .net = net,
2630                 .addr = &ifp->addr,
2631         };
2632         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2633 }
2634
2635 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2636 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2637
2638 /* Remove routers and update dst entries when a gateway turns into a host. */
2639 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2640 {
2641         struct in6_addr *gateway = (struct in6_addr *)arg;
2642
2643         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2644              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2645              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2646                 return -1;
2647         }
2648         return 0;
2649 }
2650
2651 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2652 {
2653         fib6_clean_all(net, fib6_clean_tohost, gateway);
2654 }
2655
2656 struct arg_dev_net {
2657         struct net_device *dev;
2658         struct net *net;
2659 };
2660
2661 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2662 {
2663         const struct arg_dev_net *adn = arg;
2664         const struct net_device *dev = adn->dev;
2665
2666         if ((rt->dst.dev == dev || !dev) &&
2667             rt != adn->net->ipv6.ip6_null_entry)
2668                 return -1;
2669
2670         return 0;
2671 }
2672
2673 void rt6_ifdown(struct net *net, struct net_device *dev)
2674 {
2675         struct arg_dev_net adn = {
2676                 .dev = dev,
2677                 .net = net,
2678         };
2679
2680         fib6_clean_all(net, fib6_ifdown, &adn);
2681         icmp6_clean_all(fib6_ifdown, &adn);
2682         if (dev)
2683                 rt6_uncached_list_flush_dev(net, dev);
2684 }
2685
2686 struct rt6_mtu_change_arg {
2687         struct net_device *dev;
2688         unsigned int mtu;
2689 };
2690
2691 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2692 {
2693         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2694         struct inet6_dev *idev;
2695
2696         /* In IPv6, PMTU discovery is not optional,
2697            so an RTAX_MTU lock cannot disable it.
2698            We still use this lock to block changes
2699            caused by addrconf/ndisc.
2700         */
2701
2702         idev = __in6_dev_get(arg->dev);
2703         if (!idev)
2704                 return 0;
2705
2706         /* For an administrative MTU increase, there is no way to discover
2707            an IPv6 PMTU increase, so the PMTU increase should be updated here.
2708            Since RFC 1981 doesn't cover administrative MTU increases,
2709            updating the PMTU on an increase is a MUST (e.g. for jumbo frames).
2710          */
2711         /*
2712            If the new MTU is less than the route PMTU, the new MTU will be
2713            the lowest MTU in the path; update the route PMTU to reflect the
2714            decrease. If the new MTU is greater than the route PMTU, and the
2715            old MTU was the lowest MTU in the path, update the route PMTU
2716            to reflect the increase. In that case, if another node on the path
2717            still has the lowest MTU, a Packet Too Big message will trigger
2718            PMTU discovery again.
2719          */
2720         if (rt->dst.dev == arg->dev &&
2721             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2722                 if (rt->rt6i_flags & RTF_CACHE) {
2723                         /* For RTF_CACHE with rt6i_pmtu == 0
2724                          * (i.e. a redirected route),
2725                          * the metrics of its rt->dst.from have already
2726                          * been updated.
2727                          */
2728                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2729                                 rt->rt6i_pmtu = arg->mtu;
2730                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2731                            (dst_mtu(&rt->dst) < arg->mtu &&
2732                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2733                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2734                 }
2735         }
2736         return 0;
2737 }
2738
2739 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2740 {
2741         struct rt6_mtu_change_arg arg = {
2742                 .dev = dev,
2743                 .mtu = mtu,
2744         };
2745
2746         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2747 }
2748
2749 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2750         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2751         [RTA_OIF]               = { .type = NLA_U32 },
2752         [RTA_IIF]               = { .type = NLA_U32 },
2753         [RTA_PRIORITY]          = { .type = NLA_U32 },
2754         [RTA_METRICS]           = { .type = NLA_NESTED },
2755         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2756         [RTA_PREF]              = { .type = NLA_U8 },
2757         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2758         [RTA_ENCAP]             = { .type = NLA_NESTED },
2759         [RTA_EXPIRES]           = { .type = NLA_U32 },
2760 };
2761
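/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a fib6_config. */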
2762 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2763                               struct fib6_config *cfg)
2764 {
2765         struct rtmsg *rtm;
2766         struct nlattr *tb[RTA_MAX+1];
2767         unsigned int pref;
2768         int err;
2769
2770         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2771         if (err < 0)
2772                 goto errout;
2773
2774         err = -EINVAL;
2775         rtm = nlmsg_data(nlh);
2776         memset(cfg, 0, sizeof(*cfg));
2777
2778         cfg->fc_table = rtm->rtm_table;
2779         cfg->fc_dst_len = rtm->rtm_dst_len;
2780         cfg->fc_src_len = rtm->rtm_src_len;
2781         cfg->fc_flags = RTF_UP;
2782         cfg->fc_protocol = rtm->rtm_protocol;
2783         cfg->fc_type = rtm->rtm_type;
2784
2785         if (rtm->rtm_type == RTN_UNREACHABLE ||
2786             rtm->rtm_type == RTN_BLACKHOLE ||
2787             rtm->rtm_type == RTN_PROHIBIT ||
2788             rtm->rtm_type == RTN_THROW)
2789                 cfg->fc_flags |= RTF_REJECT;
2790
2791         if (rtm->rtm_type == RTN_LOCAL)
2792                 cfg->fc_flags |= RTF_LOCAL;
2793
2794         if (rtm->rtm_flags & RTM_F_CLONED)
2795                 cfg->fc_flags |= RTF_CACHE;
2796
2797         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2798         cfg->fc_nlinfo.nlh = nlh;
2799         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2800
2801         if (tb[RTA_GATEWAY]) {
2802                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2803                 cfg->fc_flags |= RTF_GATEWAY;
2804         }
2805
2806         if (tb[RTA_DST]) {
2807                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2808
2809                 if (nla_len(tb[RTA_DST]) < plen)
2810                         goto errout;
2811
2812                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2813         }
2814
2815         if (tb[RTA_SRC]) {
2816                 int plen = (rtm->rtm_src_len + 7) >> 3;
2817
2818                 if (nla_len(tb[RTA_SRC]) < plen)
2819                         goto errout;
2820
2821                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2822         }
2823
2824         if (tb[RTA_PREFSRC])
2825                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2826
2827         if (tb[RTA_OIF])
2828                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2829
2830         if (tb[RTA_PRIORITY])
2831                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2832
2833         if (tb[RTA_METRICS]) {
2834                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2835                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2836         }
2837
2838         if (tb[RTA_TABLE])
2839                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2840
2841         if (tb[RTA_MULTIPATH]) {
2842                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2843                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2844         }
2845
2846         if (tb[RTA_PREF]) {
2847                 pref = nla_get_u8(tb[RTA_PREF]);
2848                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2849                     pref != ICMPV6_ROUTER_PREF_HIGH)
2850                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2851                 cfg->fc_flags |= RTF_PREF(pref);
2852         }
2853
2854         if (tb[RTA_ENCAP])
2855                 cfg->fc_encap = tb[RTA_ENCAP];
2856
2857         if (tb[RTA_ENCAP_TYPE])
2858                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
2859
2860         if (tb[RTA_EXPIRES]) {
2861                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
2862
2863                 if (addrconf_finite_timeout(timeout)) {
2864                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
2865                         cfg->fc_flags |= RTF_EXPIRES;
2866                 }
2867         }
2868
2869         err = 0;
2870 errout:
2871         return err;
2872 }
2873
2874 struct rt6_nh {
2875         struct rt6_info *rt6_info;
2876         struct fib6_config r_cfg;
2877         struct mx6_config mxc;
2878         struct list_head next;
2879 };
2880
2881 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
2882 {
2883         struct rt6_nh *nh;
2884
2885         list_for_each_entry(nh, rt6_nh_list, next) {
2886                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
2887                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
2888                         nh->r_cfg.fc_ifindex);
2889         }
2890 }
2891
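/* Append a nexthop's rt6_info to rt6_nh_list unless an equivalent nexthop
 * (same device, idev and gateway) is already on the list.
 */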
2892 static int ip6_route_info_append(struct list_head *rt6_nh_list,
2893                                  struct rt6_info *rt, struct fib6_config *r_cfg)
2894 {
2895         struct rt6_nh *nh;
2896         struct rt6_info *rtnh;
2897         int err = -EEXIST;
2898
2899         list_for_each_entry(nh, rt6_nh_list, next) {
2900                 /* check if rt6_info already exists */
2901                 rtnh = nh->rt6_info;
2902
2903                 if (rtnh->dst.dev == rt->dst.dev &&
2904                     rtnh->rt6i_idev == rt->rt6i_idev &&
2905                     ipv6_addr_equal(&rtnh->rt6i_gateway,
2906                                     &rt->rt6i_gateway))
2907                         return err;
2908         }
2909
2910         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
2911         if (!nh)
2912                 return -ENOMEM;
2913         nh->rt6_info = rt;
2914         err = ip6_convert_metrics(&nh->mxc, r_cfg);
2915         if (err) {
2916                 kfree(nh);
2917                 return err;
2918         }
2919         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
2920         list_add_tail(&nh->next, rt6_nh_list);
2921
2922         return 0;
2923 }
2924
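/* Add a multipath route: create an rt6_info per RTA_MULTIPATH nexthop and
 * insert them one by one.  If an insertion fails, the nexthops already added
 * are deleted again and the remaining ones are freed.
 */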
2925 static int ip6_route_multipath_add(struct fib6_config *cfg)
2926 {
2927         struct fib6_config r_cfg;
2928         struct rtnexthop *rtnh;
2929         struct rt6_info *rt;
2930         struct rt6_nh *err_nh;
2931         struct rt6_nh *nh, *nh_safe;
2932         int remaining;
2933         int attrlen;
2934         int err = 1;
2935         int nhn = 0;
2936         int replace = (cfg->fc_nlinfo.nlh &&
2937                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
2938         LIST_HEAD(rt6_nh_list);
2939
2940         remaining = cfg->fc_mp_len;
2941         rtnh = (struct rtnexthop *)cfg->fc_mp;
2942
2943         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
2944          * rt6_info structs, one per nexthop
2945          */
2946         while (rtnh_ok(rtnh, remaining)) {
2947                 memcpy(&r_cfg, cfg, sizeof(*cfg));
2948                 if (rtnh->rtnh_ifindex)
2949                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2950
2951                 attrlen = rtnh_attrlen(rtnh);
2952                 if (attrlen > 0) {
2953                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2954
2955                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2956                         if (nla) {
2957                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2958                                 r_cfg.fc_flags |= RTF_GATEWAY;
2959                         }
2960                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
2961                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
2962                         if (nla)
2963                                 r_cfg.fc_encap_type = nla_get_u16(nla);
2964                 }
2965
2966                 rt = ip6_route_info_create(&r_cfg);
2967                 if (IS_ERR(rt)) {
2968                         err = PTR_ERR(rt);
2969                         rt = NULL;
2970                         goto cleanup;
2971                 }
2972
2973                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
2974                 if (err) {
2975                         dst_free(&rt->dst);
2976                         goto cleanup;
2977                 }
2978
2979                 rtnh = rtnh_next(rtnh, &remaining);
2980         }
2981
2982         err_nh = NULL;
2983         list_for_each_entry(nh, &rt6_nh_list, next) {
2984                 err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
2985                 /* nh->rt6_info is used or freed at this point, reset to NULL*/
2986                 /* nh->rt6_info is used or freed at this point, reset to NULL */
2987                 if (err) {
2988                         if (replace && nhn)
2989                                 ip6_print_replace_route_err(&rt6_nh_list);
2990                         err_nh = nh;
2991                         goto add_errout;
2992                 }
2993
2994                 /* Because each route is added like a single route, we remove
2995                  * these flags after the first nexthop: if there is a collision,
2996                  * we have already failed to add the first nexthop because
2997                  * fib6_add_rt2node() has rejected it; when replacing, the old
2998                  * nexthops have been replaced by the first new one, and the
2999                  * rest should be added to it.
3000                  */
3001                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3002                                                      NLM_F_REPLACE);
3003                 nhn++;
3004         }
3005
3006         goto cleanup;
3007
3008 add_errout:
3009         /* Delete routes that were already added */
3010         list_for_each_entry(nh, &rt6_nh_list, next) {
3011                 if (err_nh == nh)
3012                         break;
3013                 ip6_route_del(&nh->r_cfg);
3014         }
3015
3016 cleanup:
3017         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3018                 if (nh->rt6_info)
3019                         dst_free(&nh->rt6_info->dst);
3020                 kfree(nh->mxc.mx);
3021                 list_del(&nh->next);
3022                 kfree(nh);
3023         }
3024
3025         return err;
3026 }
3027
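/* Delete each nexthop of an RTA_MULTIPATH route individually; the last
 * error encountered, if any, is returned.
 */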
3028 static int ip6_route_multipath_del(struct fib6_config *cfg)
3029 {
3030         struct fib6_config r_cfg;
3031         struct rtnexthop *rtnh;
3032         int remaining;
3033         int attrlen;
3034         int err = 1, last_err = 0;
3035
3036         remaining = cfg->fc_mp_len;
3037         rtnh = (struct rtnexthop *)cfg->fc_mp;
3038
3039         /* Parse a Multipath Entry */
3040         while (rtnh_ok(rtnh, remaining)) {
3041                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3042                 if (rtnh->rtnh_ifindex)
3043                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3044
3045                 attrlen = rtnh_attrlen(rtnh);
3046                 if (attrlen > 0) {
3047                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3048
3049                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3050                         if (nla) {
3051                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3052                                 r_cfg.fc_flags |= RTF_GATEWAY;
3053                         }
3054                 }
3055                 err = ip6_route_del(&r_cfg);
3056                 if (err)
3057                         last_err = err;
3058
3059                 rtnh = rtnh_next(rtnh, &remaining);
3060         }
3061
3062         return last_err;
3063 }
3064
3065 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3066 {
3067         struct fib6_config cfg;
3068         int err;
3069
3070         err = rtm_to_fib6_config(skb, nlh, &cfg);
3071         if (err < 0)
3072                 return err;
3073
3074         if (cfg.fc_mp)
3075                 return ip6_route_multipath_del(&cfg);
3076         else
3077                 return ip6_route_del(&cfg);
3078 }
3079
3080 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3081 {
3082         struct fib6_config cfg;
3083         int err;
3084
3085         err = rtm_to_fib6_config(skb, nlh, &cfg);
3086         if (err < 0)
3087                 return err;
3088
3089         if (cfg.fc_mp)
3090                 return ip6_route_multipath_add(&cfg);
3091         else
3092                 return ip6_route_add(&cfg);
3093 }
3094
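/* Upper bound on the netlink message size needed by rt6_fill_node(); used
 * to size the skb allocated in inet6_rt_notify().
 */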
3095 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
3096 {
3097         return NLMSG_ALIGN(sizeof(struct rtmsg))
3098                + nla_total_size(16) /* RTA_SRC */
3099                + nla_total_size(16) /* RTA_DST */
3100                + nla_total_size(16) /* RTA_GATEWAY */
3101                + nla_total_size(16) /* RTA_PREFSRC */
3102                + nla_total_size(4) /* RTA_TABLE */
3103                + nla_total_size(4) /* RTA_IIF */
3104                + nla_total_size(4) /* RTA_OIF */
3105                + nla_total_size(4) /* RTA_PRIORITY */
3106                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3107                + nla_total_size(sizeof(struct rta_cacheinfo))
3108                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3109                + nla_total_size(1) /* RTA_PREF */
3110                + lwtunnel_get_encap_size(rt->dst.lwtstate);
3111 }
3112
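/* Fill one routing netlink message describing @rt into @skb.  Returns 0 on
 * success, 1 when the route is skipped by the prefix-only filter, or
 * -EMSGSIZE when the skb has no room left for the message.
 */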
3113 static int rt6_fill_node(struct net *net,
3114                          struct sk_buff *skb, struct rt6_info *rt,
3115                          struct in6_addr *dst, struct in6_addr *src,
3116                          int iif, int type, u32 portid, u32 seq,
3117                          int prefix, int nowait, unsigned int flags)
3118 {
3119         u32 metrics[RTAX_MAX];
3120         struct rtmsg *rtm;
3121         struct nlmsghdr *nlh;
3122         long expires;
3123         u32 table;
3124
3125         if (prefix) {   /* user wants prefix routes only */
3126                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
3127                         /* success since this is not a prefix route */
3128                         return 1;
3129                 }
3130         }
3131
3132         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3133         if (!nlh)
3134                 return -EMSGSIZE;
3135
3136         rtm = nlmsg_data(nlh);
3137         rtm->rtm_family = AF_INET6;
3138         rtm->rtm_dst_len = rt->rt6i_dst.plen;
3139         rtm->rtm_src_len = rt->rt6i_src.plen;
3140         rtm->rtm_tos = 0;
3141         if (rt->rt6i_table)
3142                 table = rt->rt6i_table->tb6_id;
3143         else
3144                 table = RT6_TABLE_UNSPEC;
3145         rtm->rtm_table = table;
3146         if (nla_put_u32(skb, RTA_TABLE, table))
3147                 goto nla_put_failure;
3148         if (rt->rt6i_flags & RTF_REJECT) {
3149                 switch (rt->dst.error) {
3150                 case -EINVAL:
3151                         rtm->rtm_type = RTN_BLACKHOLE;
3152                         break;
3153                 case -EACCES:
3154                         rtm->rtm_type = RTN_PROHIBIT;
3155                         break;
3156                 case -EAGAIN:
3157                         rtm->rtm_type = RTN_THROW;
3158                         break;
3159                 default:
3160                         rtm->rtm_type = RTN_UNREACHABLE;
3161                         break;
3162                 }
3163         }
3164         else if (rt->rt6i_flags & RTF_LOCAL)
3165                 rtm->rtm_type = RTN_LOCAL;
3166         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3167                 rtm->rtm_type = RTN_LOCAL;
3168         else
3169                 rtm->rtm_type = RTN_UNICAST;
3170         rtm->rtm_flags = 0;
3171         if (!netif_carrier_ok(rt->dst.dev)) {
3172                 rtm->rtm_flags |= RTNH_F_LINKDOWN;
3173                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3174                         rtm->rtm_flags |= RTNH_F_DEAD;
3175         }
3176         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3177         rtm->rtm_protocol = rt->rt6i_protocol;
3178         if (rt->rt6i_flags & RTF_DYNAMIC)
3179                 rtm->rtm_protocol = RTPROT_REDIRECT;
3180         else if (rt->rt6i_flags & RTF_ADDRCONF) {
3181                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3182                         rtm->rtm_protocol = RTPROT_RA;
3183                 else
3184                         rtm->rtm_protocol = RTPROT_KERNEL;
3185         }
3186
3187         if (rt->rt6i_flags & RTF_CACHE)
3188                 rtm->rtm_flags |= RTM_F_CLONED;
3189
3190         if (dst) {
3191                 if (nla_put_in6_addr(skb, RTA_DST, dst))
3192                         goto nla_put_failure;
3193                 rtm->rtm_dst_len = 128;
3194         } else if (rtm->rtm_dst_len)
3195                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3196                         goto nla_put_failure;
3197 #ifdef CONFIG_IPV6_SUBTREES
3198         if (src) {
3199                 if (nla_put_in6_addr(skb, RTA_SRC, src))
3200                         goto nla_put_failure;
3201                 rtm->rtm_src_len = 128;
3202         } else if (rtm->rtm_src_len &&
3203                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3204                 goto nla_put_failure;
3205 #endif
3206         if (iif) {
3207 #ifdef CONFIG_IPV6_MROUTE
3208                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3209                         int err = ip6mr_get_route(net, skb, rtm, nowait);
3210                         if (err <= 0) {
3211                                 if (!nowait) {
3212                                         if (err == 0)
3213                                                 return 0;
3214                                         goto nla_put_failure;
3215                                 } else {
3216                                         if (err == -EMSGSIZE)
3217                                                 goto nla_put_failure;
3218                                 }
3219                         }
3220                 } else
3221 #endif
3222                         if (nla_put_u32(skb, RTA_IIF, iif))
3223                                 goto nla_put_failure;
3224         } else if (dst) {
3225                 struct in6_addr saddr_buf;
3226                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3227                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3228                         goto nla_put_failure;
3229         }
3230
3231         if (rt->rt6i_prefsrc.plen) {
3232                 struct in6_addr saddr_buf;
3233                 saddr_buf = rt->rt6i_prefsrc.addr;
3234                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3235                         goto nla_put_failure;
3236         }
3237
3238         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3239         if (rt->rt6i_pmtu)
3240                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3241         if (rtnetlink_put_metrics(skb, metrics) < 0)
3242                 goto nla_put_failure;
3243
3244         if (rt->rt6i_flags & RTF_GATEWAY) {
3245                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3246                         goto nla_put_failure;
3247         }
3248
3249         if (rt->dst.dev &&
3250             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3251                 goto nla_put_failure;
3252         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3253                 goto nla_put_failure;
3254
3255         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3256
3257         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3258                 goto nla_put_failure;
3259
3260         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3261                 goto nla_put_failure;
3262
3263         lwtunnel_fill_encap(skb, rt->dst.lwtstate);
3264
3265         nlmsg_end(skb, nlh);
3266         return 0;
3267
3268 nla_put_failure:
3269         nlmsg_cancel(skb, nlh);
3270         return -EMSGSIZE;
3271 }
3272
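/* Per-route callback used while walking the FIB trees for a route dump;
 * honours the RTM_F_PREFIX filter from the request header.
 */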
3273 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3274 {
3275         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3276         int prefix;
3277
3278         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3279                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3280                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
3281         } else
3282                 prefix = 0;
3283
3284         return rt6_fill_node(arg->net,
3285                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3286                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3287                      prefix, 0, NLM_F_MULTI);
3288 }
3289
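/* Handle an RTM_GETROUTE request for a single route: perform an input
 * lookup when RTA_IIF is given, otherwise an output lookup, and unicast
 * the resulting route back to the requester.
 */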
3290 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3291 {
3292         struct net *net = sock_net(in_skb->sk);
3293         struct nlattr *tb[RTA_MAX+1];
3294         struct rt6_info *rt;
3295         struct sk_buff *skb;
3296         struct rtmsg *rtm;
3297         struct flowi6 fl6;
3298         int err, iif = 0, oif = 0;
3299
3300         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3301         if (err < 0)
3302                 goto errout;
3303
3304         err = -EINVAL;
3305         memset(&fl6, 0, sizeof(fl6));
3306         rtm = nlmsg_data(nlh);
3307         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3308
3309         if (tb[RTA_SRC]) {
3310                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3311                         goto errout;
3312
3313                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3314         }
3315
3316         if (tb[RTA_DST]) {
3317                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3318                         goto errout;
3319
3320                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3321         }
3322
3323         if (tb[RTA_IIF])
3324                 iif = nla_get_u32(tb[RTA_IIF]);
3325
3326         if (tb[RTA_OIF])
3327                 oif = nla_get_u32(tb[RTA_OIF]);
3328
3329         if (tb[RTA_MARK])
3330                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3331
3332         if (iif) {
3333                 struct net_device *dev;
3334                 int flags = 0;
3335
3336                 dev = __dev_get_by_index(net, iif);
3337                 if (!dev) {
3338                         err = -ENODEV;
3339                         goto errout;
3340                 }
3341
3342                 fl6.flowi6_iif = iif;
3343
3344                 if (!ipv6_addr_any(&fl6.saddr))
3345                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3346
3347                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3348                                                                flags);
3349         } else {
3350                 fl6.flowi6_oif = oif;
3351
3352                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3353         }
3354
3355         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3356         if (!skb) {
3357                 ip6_rt_put(rt);
3358                 err = -ENOBUFS;
3359                 goto errout;
3360         }
3361
3362         /* Reserve room for dummy headers; this skb can pass
3363          * through a good chunk of the routing engine.
3364          */
3365         skb_reset_mac_header(skb);
3366         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3367
3368         skb_dst_set(skb, &rt->dst);
3369
3370         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3371                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3372                             nlh->nlmsg_seq, 0, 0, 0);
3373         if (err < 0) {
3374                 kfree_skb(skb);
3375                 goto errout;
3376         }
3377
3378         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3379 errout:
3380         return err;
3381 }
3382
3383 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3384                      unsigned int nlm_flags)
3385 {
3386         struct sk_buff *skb;
3387         struct net *net = info->nl_net;
3388         u32 seq;
3389         int err;
3390
3391         err = -ENOBUFS;
3392         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3393
3394         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3395         if (!skb)
3396                 goto errout;
3397
3398         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3399                                 event, info->portid, seq, 0, 0, nlm_flags);
3400         if (err < 0) {
3401                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3402                 WARN_ON(err == -EMSGSIZE);
3403                 kfree_skb(skb);
3404                 goto errout;
3405         }
3406         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3407                     info->nlh, gfp_any());
3408         return;
3409 errout:
3410         if (err < 0)
3411                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3412 }
3413
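/* When the loopback device registers, bind the per-netns null (and, with
 * CONFIG_IPV6_MULTIPLE_TABLES, prohibit and blackhole) route entries to it.
 */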
3414 static int ip6_route_dev_notify(struct notifier_block *this,
3415                                 unsigned long event, void *ptr)
3416 {
3417         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3418         struct net *net = dev_net(dev);
3419
3420         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3421                 net->ipv6.ip6_null_entry->dst.dev = dev;
3422                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3423 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3424                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3425                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3426                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3427                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3428 #endif
3429         }
3430
3431         return NOTIFY_OK;
3432 }
3433
3434 /*
3435  *      /proc
3436  */
3437
3438 #ifdef CONFIG_PROC_FS
3439
3440 static const struct file_operations ipv6_route_proc_fops = {
3441         .owner          = THIS_MODULE,
3442         .open           = ipv6_route_open,
3443         .read           = seq_read,
3444         .llseek         = seq_lseek,
3445         .release        = seq_release_net,
3446 };
3447
3448 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3449 {
3450         struct net *net = (struct net *)seq->private;
3451         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3452                    net->ipv6.rt6_stats->fib_nodes,
3453                    net->ipv6.rt6_stats->fib_route_nodes,
3454                    net->ipv6.rt6_stats->fib_rt_alloc,
3455                    net->ipv6.rt6_stats->fib_rt_entries,
3456                    net->ipv6.rt6_stats->fib_rt_cache,
3457                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3458                    net->ipv6.rt6_stats->fib_discarded_routes);
3459
3460         return 0;
3461 }
3462
3463 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3464 {
3465         return single_open_net(inode, file, rt6_stats_seq_show);
3466 }
3467
3468 static const struct file_operations rt6_stats_seq_fops = {
3469         .owner   = THIS_MODULE,
3470         .open    = rt6_stats_seq_open,
3471         .read    = seq_read,
3472         .llseek  = seq_lseek,
3473         .release = single_release_net,
3474 };
3475 #endif  /* CONFIG_PROC_FS */
3476
3477 #ifdef CONFIG_SYSCTL
3478
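/* Write-only handler for the "flush" sysctl: a write triggers an immediate
 * fib6 garbage-collection run.
 */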
3479 static
3480 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3481                               void __user *buffer, size_t *lenp, loff_t *ppos)
3482 {
3483         struct net *net;
3484         int delay;
3485         if (!write)
3486                 return -EINVAL;
3487
3488         net = (struct net *)ctl->extra1;
3489         delay = net->ipv6.sysctl.flush_delay;
3490         proc_dointvec(ctl, write, buffer, lenp, ppos);
3491         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3492         return 0;
3493 }
3494
3495 struct ctl_table ipv6_route_table_template[] = {
3496         {
3497                 .procname       =       "flush",
3498                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3499                 .maxlen         =       sizeof(int),
3500                 .mode           =       0200,
3501                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3502         },
3503         {
3504                 .procname       =       "gc_thresh",
3505                 .data           =       &ip6_dst_ops_template.gc_thresh,
3506                 .maxlen         =       sizeof(int),
3507                 .mode           =       0644,
3508                 .proc_handler   =       proc_dointvec,
3509         },
3510         {
3511                 .procname       =       "max_size",
3512                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3513                 .maxlen         =       sizeof(int),
3514                 .mode           =       0644,
3515                 .proc_handler   =       proc_dointvec,
3516         },
3517         {
3518                 .procname       =       "gc_min_interval",
3519                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3520                 .maxlen         =       sizeof(int),
3521                 .mode           =       0644,
3522                 .proc_handler   =       proc_dointvec_jiffies,
3523         },
3524         {
3525                 .procname       =       "gc_timeout",
3526                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3527                 .maxlen         =       sizeof(int),
3528                 .mode           =       0644,
3529                 .proc_handler   =       proc_dointvec_jiffies,
3530         },
3531         {
3532                 .procname       =       "gc_interval",
3533                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3534                 .maxlen         =       sizeof(int),
3535                 .mode           =       0644,
3536                 .proc_handler   =       proc_dointvec_jiffies,
3537         },
3538         {
3539                 .procname       =       "gc_elasticity",
3540                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3541                 .maxlen         =       sizeof(int),
3542                 .mode           =       0644,
3543                 .proc_handler   =       proc_dointvec,
3544         },
3545         {
3546                 .procname       =       "mtu_expires",
3547                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3548                 .maxlen         =       sizeof(int),
3549                 .mode           =       0644,
3550                 .proc_handler   =       proc_dointvec_jiffies,
3551         },
3552         {
3553                 .procname       =       "min_adv_mss",
3554                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3555                 .maxlen         =       sizeof(int),
3556                 .mode           =       0644,
3557                 .proc_handler   =       proc_dointvec,
3558         },
3559         {
3560                 .procname       =       "gc_min_interval_ms",
3561                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3562                 .maxlen         =       sizeof(int),
3563                 .mode           =       0644,
3564                 .proc_handler   =       proc_dointvec_ms_jiffies,
3565         },
3566         { }
3567 };
3568
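/* Duplicate the sysctl template for a new network namespace and point each
 * entry at that namespace's data; the entries are hidden from namespaces
 * not owned by the initial user namespace.
 */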
3569 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3570 {
3571         struct ctl_table *table;
3572
3573         table = kmemdup(ipv6_route_table_template,
3574                         sizeof(ipv6_route_table_template),
3575                         GFP_KERNEL);
3576
3577         if (table) {
3578                 table[0].data = &net->ipv6.sysctl.flush_delay;
3579                 table[0].extra1 = net;
3580                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3581                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3582                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3583                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3584                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3585                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3586                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3587                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3588                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3589
3590                 /* Don't export sysctls to unprivileged users */
3591                 if (net->user_ns != &init_user_ns)
3592                         table[0].procname = NULL;
3593         }
3594
3595         return table;
3596 }
3597 #endif
3598
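/* Per-netns init: clone the dst_ops template and the special null (and,
 * with multiple tables, prohibit and blackhole) route entries, then set the
 * default routing sysctl values.
 */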
3599 static int __net_init ip6_route_net_init(struct net *net)
3600 {
3601         int ret = -ENOMEM;
3602
3603         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3604                sizeof(net->ipv6.ip6_dst_ops));
3605
3606         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3607                 goto out_ip6_dst_ops;
3608
3609         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3610                                            sizeof(*net->ipv6.ip6_null_entry),
3611                                            GFP_KERNEL);
3612         if (!net->ipv6.ip6_null_entry)
3613                 goto out_ip6_dst_entries;
3614         net->ipv6.ip6_null_entry->dst.path =
3615                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3616         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3617         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3618                          ip6_template_metrics, true);
3619
3620 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3621         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3622                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3623                                                GFP_KERNEL);
3624         if (!net->ipv6.ip6_prohibit_entry)
3625                 goto out_ip6_null_entry;
3626         net->ipv6.ip6_prohibit_entry->dst.path =
3627                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3628         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3629         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3630                          ip6_template_metrics, true);
3631
3632         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3633                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3634                                                GFP_KERNEL);
3635         if (!net->ipv6.ip6_blk_hole_entry)
3636                 goto out_ip6_prohibit_entry;
3637         net->ipv6.ip6_blk_hole_entry->dst.path =
3638                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3639         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3640         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3641                          ip6_template_metrics, true);
3642 #endif
3643
3644         net->ipv6.sysctl.flush_delay = 0;
3645         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3646         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3647         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3648         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3649         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3650         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3651         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3652
3653         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3654
3655         ret = 0;
3656 out:
3657         return ret;
3658
3659 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3660 out_ip6_prohibit_entry:
3661         kfree(net->ipv6.ip6_prohibit_entry);
3662 out_ip6_null_entry:
3663         kfree(net->ipv6.ip6_null_entry);
3664 #endif
3665 out_ip6_dst_entries:
3666         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3667 out_ip6_dst_ops:
3668         goto out;
3669 }
3670
3671 static void __net_exit ip6_route_net_exit(struct net *net)
3672 {
3673         kfree(net->ipv6.ip6_null_entry);
3674 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3675         kfree(net->ipv6.ip6_prohibit_entry);
3676         kfree(net->ipv6.ip6_blk_hole_entry);
3677 #endif
3678         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3679 }
3680
3681 static int __net_init ip6_route_net_init_late(struct net *net)
3682 {
3683 #ifdef CONFIG_PROC_FS
3684         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3685         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3686 #endif
3687         return 0;
3688 }
3689
3690 static void __net_exit ip6_route_net_exit_late(struct net *net)
3691 {
3692 #ifdef CONFIG_PROC_FS
3693         remove_proc_entry("ipv6_route", net->proc_net);
3694         remove_proc_entry("rt6_stats", net->proc_net);
3695 #endif
3696 }
3697
3698 static struct pernet_operations ip6_route_net_ops = {
3699         .init = ip6_route_net_init,
3700         .exit = ip6_route_net_exit,
3701 };
3702
3703 static int __net_init ipv6_inetpeer_init(struct net *net)
3704 {
3705         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3706
3707         if (!bp)
3708                 return -ENOMEM;
3709         inet_peer_base_init(bp);
3710         net->ipv6.peers = bp;
3711         return 0;
3712 }
3713
3714 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3715 {
3716         struct inet_peer_base *bp = net->ipv6.peers;
3717
3718         net->ipv6.peers = NULL;
3719         inetpeer_invalidate_tree(bp);
3720         kfree(bp);
3721 }
3722
3723 static struct pernet_operations ipv6_inetpeer_ops = {
3724         .init   =       ipv6_inetpeer_init,
3725         .exit   =       ipv6_inetpeer_exit,
3726 };
3727
3728 static struct pernet_operations ip6_route_net_late_ops = {
3729         .init = ip6_route_net_init_late,
3730         .exit = ip6_route_net_exit_late,
3731 };
3732
3733 static struct notifier_block ip6_route_dev_notifier = {
3734         .notifier_call = ip6_route_dev_notify,
3735         .priority = 0,
3736 };
3737
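/* Subsystem init: set up the dst kmem cache, per-netns state, the FIB,
 * policy routing and the netlink handlers; on failure, unwind in reverse
 * order.
 */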
3738 int __init ip6_route_init(void)
3739 {
3740         int ret;
3741         int cpu;
3742
3743         ret = -ENOMEM;
3744         ip6_dst_ops_template.kmem_cachep =
3745                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3746                                   SLAB_HWCACHE_ALIGN, NULL);
3747         if (!ip6_dst_ops_template.kmem_cachep)
3748                 goto out;
3749
3750         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3751         if (ret)
3752                 goto out_kmem_cache;
3753
3754         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3755         if (ret)
3756                 goto out_dst_entries;
3757
3758         ret = register_pernet_subsys(&ip6_route_net_ops);
3759         if (ret)
3760                 goto out_register_inetpeer;
3761
3762         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3763
3764         /* The loopback device is registered before this code runs, so the
3765          * loopback reference in rt6_info is not taken automatically;
3766          * take it manually for init_net. */
3767         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3768         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3769 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3770         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3771         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3772         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3773         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3774 #endif
3775         ret = fib6_init();
3776         if (ret)
3777                 goto out_register_subsys;
3778
3779         ret = xfrm6_init();
3780         if (ret)
3781                 goto out_fib6_init;
3782
3783         ret = fib6_rules_init();
3784         if (ret)
3785                 goto xfrm6_init;
3786
3787         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3788         if (ret)
3789                 goto fib6_rules_init;
3790
3791         ret = -ENOBUFS;
3792         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3793             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3794             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3795                 goto out_register_late_subsys;
3796
3797         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3798         if (ret)
3799                 goto out_register_late_subsys;
3800
3801         for_each_possible_cpu(cpu) {
3802                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3803
3804                 INIT_LIST_HEAD(&ul->head);
3805                 spin_lock_init(&ul->lock);
3806         }
3807
3808 out:
3809         return ret;
3810
3811 out_register_late_subsys:
3812         unregister_pernet_subsys(&ip6_route_net_late_ops);
3813 fib6_rules_init:
3814         fib6_rules_cleanup();
3815 xfrm6_init:
3816         xfrm6_fini();
3817 out_fib6_init:
3818         fib6_gc_cleanup();
3819 out_register_subsys:
3820         unregister_pernet_subsys(&ip6_route_net_ops);
3821 out_register_inetpeer:
3822         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3823 out_dst_entries:
3824         dst_entries_destroy(&ip6_dst_blackhole_ops);
3825 out_kmem_cache:
3826         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3827         goto out;
3828 }
3829
3830 void ip6_route_cleanup(void)
3831 {
3832         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3833         unregister_pernet_subsys(&ip6_route_net_late_ops);
3834         fib6_rules_cleanup();
3835         xfrm6_fini();
3836         fib6_gc_cleanup();
3837         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3838         unregister_pernet_subsys(&ip6_route_net_ops);
3839         dst_entries_destroy(&ip6_dst_blackhole_ops);
3840         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3841 }