ipv6: copy lwtstate in ip6_rt_copy_init()
net/ipv6/route.c (cascardo/linux.git)
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
61 #include <net/lwtunnel.h>
62
63 #include <asm/uaccess.h>
64
65 #ifdef CONFIG_SYSCTL
66 #include <linux/sysctl.h>
67 #endif
68
69 enum rt6_nud_state {
70         RT6_NUD_FAIL_HARD = -3,
71         RT6_NUD_FAIL_PROBE = -2,
72         RT6_NUD_FAIL_DO_RR = -1,
73         RT6_NUD_SUCCEED = 1
74 };
75
76 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
77 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
79 static unsigned int      ip6_mtu(const struct dst_entry *dst);
80 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
81 static void             ip6_dst_destroy(struct dst_entry *);
82 static void             ip6_dst_ifdown(struct dst_entry *,
83                                        struct net_device *dev, int how);
84 static int               ip6_dst_gc(struct dst_ops *ops);
85
86 static int              ip6_pkt_discard(struct sk_buff *skb);
87 static int              ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
88 static int              ip6_pkt_prohibit(struct sk_buff *skb);
89 static int              ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
90 static void             ip6_link_failure(struct sk_buff *skb);
91 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
92                                            struct sk_buff *skb, u32 mtu);
93 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
94                                         struct sk_buff *skb);
95 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
96 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
97
98 #ifdef CONFIG_IPV6_ROUTE_INFO
99 static struct rt6_info *rt6_add_route_info(struct net *net,
100                                            const struct in6_addr *prefix, int prefixlen,
101                                            const struct in6_addr *gwaddr, int ifindex,
102                                            unsigned int pref);
103 static struct rt6_info *rt6_get_route_info(struct net *net,
104                                            const struct in6_addr *prefix, int prefixlen,
105                                            const struct in6_addr *gwaddr, int ifindex);
106 #endif
107
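/* Uncached routes (DST_NOCACHE, e.g. RTF_CACHE clones that live outside the
 * fib6 tree) are kept on per-cpu lists so that rt6_uncached_list_flush_dev()
 * can re-point them at the loopback device when their device goes away.
 */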
108 struct uncached_list {
109         spinlock_t              lock;
110         struct list_head        head;
111 };
112
113 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
114
115 static void rt6_uncached_list_add(struct rt6_info *rt)
116 {
117         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
118
119         rt->dst.flags |= DST_NOCACHE;
120         rt->rt6i_uncached_list = ul;
121
122         spin_lock_bh(&ul->lock);
123         list_add_tail(&rt->rt6i_uncached, &ul->head);
124         spin_unlock_bh(&ul->lock);
125 }
126
127 static void rt6_uncached_list_del(struct rt6_info *rt)
128 {
129         if (!list_empty(&rt->rt6i_uncached)) {
130                 struct uncached_list *ul = rt->rt6i_uncached_list;
131
132                 spin_lock_bh(&ul->lock);
133                 list_del(&rt->rt6i_uncached);
134                 spin_unlock_bh(&ul->lock);
135         }
136 }
137
138 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
139 {
140         struct net_device *loopback_dev = net->loopback_dev;
141         int cpu;
142
143         for_each_possible_cpu(cpu) {
144                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
145                 struct rt6_info *rt;
146
147                 spin_lock_bh(&ul->lock);
148                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
149                         struct inet6_dev *rt_idev = rt->rt6i_idev;
150                         struct net_device *rt_dev = rt->dst.dev;
151
152                         if (rt_idev && (rt_idev->dev == dev || !dev) &&
153                             rt_idev->dev != loopback_dev) {
154                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
155                                 in6_dev_put(rt_idev);
156                         }
157
158                         if (rt_dev && (rt_dev == dev || !dev) &&
159                             rt_dev != loopback_dev) {
160                                 rt->dst.dev = loopback_dev;
161                                 dev_hold(rt->dst.dev);
162                                 dev_put(rt_dev);
163                         }
164                 }
165                 spin_unlock_bh(&ul->lock);
166         }
167 }
168
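/* Metrics copy-on-write policy: RTF_PCPU copies write through to the metrics
 * of the route they were cloned from (dst.from), RTF_CACHE clones keep their
 * metrics read-only, and everything else uses the generic COW path.
 */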
169 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
170 {
171         return dst_metrics_write_ptr(rt->dst.from);
172 }
173
174 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
175 {
176         struct rt6_info *rt = (struct rt6_info *)dst;
177
178         if (rt->rt6i_flags & RTF_PCPU)
179                 return rt6_pcpu_cow_metrics(rt);
180         else if (rt->rt6i_flags & RTF_CACHE)
181                 return NULL;
182         else
183                 return dst_cow_metrics_generic(dst, old);
184 }
185
186 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
187                                              struct sk_buff *skb,
188                                              const void *daddr)
189 {
190         struct in6_addr *p = &rt->rt6i_gateway;
191
192         if (!ipv6_addr_any(p))
193                 return (const void *) p;
194         else if (skb)
195                 return &ipv6_hdr(skb)->daddr;
196         return daddr;
197 }
198
199 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
200                                           struct sk_buff *skb,
201                                           const void *daddr)
202 {
203         struct rt6_info *rt = (struct rt6_info *) dst;
204         struct neighbour *n;
205
206         daddr = choose_neigh_daddr(rt, skb, daddr);
207         n = __ipv6_neigh_lookup(dst->dev, daddr);
208         if (n)
209                 return n;
210         return neigh_create(&nd_tbl, daddr, dst->dev);
211 }
212
213 static struct dst_ops ip6_dst_ops_template = {
214         .family                 =       AF_INET6,
215         .gc                     =       ip6_dst_gc,
216         .gc_thresh              =       1024,
217         .check                  =       ip6_dst_check,
218         .default_advmss         =       ip6_default_advmss,
219         .mtu                    =       ip6_mtu,
220         .cow_metrics            =       ipv6_cow_metrics,
221         .destroy                =       ip6_dst_destroy,
222         .ifdown                 =       ip6_dst_ifdown,
223         .negative_advice        =       ip6_negative_advice,
224         .link_failure           =       ip6_link_failure,
225         .update_pmtu            =       ip6_rt_update_pmtu,
226         .redirect               =       rt6_do_redirect,
227         .local_out              =       __ip6_local_out,
228         .neigh_lookup           =       ip6_neigh_lookup,
229 };
230
231 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
232 {
233         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
234
235         return mtu ? : dst->dev->mtu;
236 }
237
238 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
239                                          struct sk_buff *skb, u32 mtu)
240 {
241 }
242
243 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
244                                       struct sk_buff *skb)
245 {
246 }
247
248 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
249                                          unsigned long old)
250 {
251         return NULL;
252 }
253
254 static struct dst_ops ip6_dst_blackhole_ops = {
255         .family                 =       AF_INET6,
256         .destroy                =       ip6_dst_destroy,
257         .check                  =       ip6_dst_check,
258         .mtu                    =       ip6_blackhole_mtu,
259         .default_advmss         =       ip6_default_advmss,
260         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
261         .redirect               =       ip6_rt_blackhole_redirect,
262         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
263         .neigh_lookup           =       ip6_neigh_lookup,
264 };
265
266 static const u32 ip6_template_metrics[RTAX_MAX] = {
267         [RTAX_HOPLIMIT - 1] = 0,
268 };
269
270 static const struct rt6_info ip6_null_entry_template = {
271         .dst = {
272                 .__refcnt       = ATOMIC_INIT(1),
273                 .__use          = 1,
274                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
275                 .error          = -ENETUNREACH,
276                 .input          = ip6_pkt_discard,
277                 .output         = ip6_pkt_discard_out,
278         },
279         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
280         .rt6i_protocol  = RTPROT_KERNEL,
281         .rt6i_metric    = ~(u32) 0,
282         .rt6i_ref       = ATOMIC_INIT(1),
283 };
284
285 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
286
287 static const struct rt6_info ip6_prohibit_entry_template = {
288         .dst = {
289                 .__refcnt       = ATOMIC_INIT(1),
290                 .__use          = 1,
291                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
292                 .error          = -EACCES,
293                 .input          = ip6_pkt_prohibit,
294                 .output         = ip6_pkt_prohibit_out,
295         },
296         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
297         .rt6i_protocol  = RTPROT_KERNEL,
298         .rt6i_metric    = ~(u32) 0,
299         .rt6i_ref       = ATOMIC_INIT(1),
300 };
301
302 static const struct rt6_info ip6_blk_hole_entry_template = {
303         .dst = {
304                 .__refcnt       = ATOMIC_INIT(1),
305                 .__use          = 1,
306                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
307                 .error          = -EINVAL,
308                 .input          = dst_discard,
309                 .output         = dst_discard_sk,
310         },
311         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
312         .rt6i_protocol  = RTPROT_KERNEL,
313         .rt6i_metric    = ~(u32) 0,
314         .rt6i_ref       = ATOMIC_INIT(1),
315 };
316
317 #endif
318
319 /* allocate dst with ip6_dst_ops */
320 static struct rt6_info *__ip6_dst_alloc(struct net *net,
321                                         struct net_device *dev,
322                                         int flags,
323                                         struct fib6_table *table)
324 {
325         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
326                                         0, DST_OBSOLETE_FORCE_CHK, flags);
327
328         if (rt) {
329                 struct dst_entry *dst = &rt->dst;
330
331                 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
332                 INIT_LIST_HEAD(&rt->rt6i_siblings);
333                 INIT_LIST_HEAD(&rt->rt6i_uncached);
334         }
335         return rt;
336 }
337
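/* Like __ip6_dst_alloc(), but also allocates the per-cpu route cache
 * (rt6i_pcpu).  If the percpu allocation fails, the dst is destroyed and
 * NULL is returned.
 */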
338 static struct rt6_info *ip6_dst_alloc(struct net *net,
339                                       struct net_device *dev,
340                                       int flags,
341                                       struct fib6_table *table)
342 {
343         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table);
344
345         if (rt) {
346                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
347                 if (rt->rt6i_pcpu) {
348                         int cpu;
349
350                         for_each_possible_cpu(cpu) {
351                                 struct rt6_info **p;
352
353                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
354                                 /* no one shares rt */
355                                 *p =  NULL;
356                         }
357                 } else {
358                         dst_destroy((struct dst_entry *)rt);
359                         return NULL;
360                 }
361         }
362
363         return rt;
364 }
365
366 static void ip6_dst_destroy(struct dst_entry *dst)
367 {
368         struct rt6_info *rt = (struct rt6_info *)dst;
369         struct dst_entry *from = dst->from;
370         struct inet6_dev *idev;
371
372         dst_destroy_metrics_generic(dst);
373         free_percpu(rt->rt6i_pcpu);
374         rt6_uncached_list_del(rt);
375
376         idev = rt->rt6i_idev;
377         if (idev) {
378                 rt->rt6i_idev = NULL;
379                 in6_dev_put(idev);
380         }
381
382         dst->from = NULL;
383         dst_release(from);
384 }
385
386 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
387                            int how)
388 {
389         struct rt6_info *rt = (struct rt6_info *)dst;
390         struct inet6_dev *idev = rt->rt6i_idev;
391         struct net_device *loopback_dev =
392                 dev_net(dev)->loopback_dev;
393
394         if (dev != loopback_dev) {
395                 if (idev && idev->dev == dev) {
396                         struct inet6_dev *loopback_idev =
397                                 in6_dev_get(loopback_dev);
398                         if (loopback_idev) {
399                                 rt->rt6i_idev = loopback_idev;
400                                 in6_dev_put(idev);
401                         }
402                 }
403         }
404 }
405
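/* A route has expired if RTF_EXPIRES is set and dst.expires has passed.
 * Cached copies without RTF_EXPIRES inherit the expiry state of the route
 * they were cloned from (dst.from).
 */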
406 static bool rt6_check_expired(const struct rt6_info *rt)
407 {
408         if (rt->rt6i_flags & RTF_EXPIRES) {
409                 if (time_after(jiffies, rt->dst.expires))
410                         return true;
411         } else if (rt->dst.from) {
412                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
413         }
414         return false;
415 }
416
417 /* Multipath route selection:
418  *   Hash-based function using packet header fields and the flow label.
419  * Adapted from fib_info_hashfn()
420  */
421 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
422                                const struct flowi6 *fl6)
423 {
424         unsigned int val = fl6->flowi6_proto;
425
426         val ^= ipv6_addr_hash(&fl6->daddr);
427         val ^= ipv6_addr_hash(&fl6->saddr);
428
429                 /* Works only if this is not encapsulated */
430         switch (fl6->flowi6_proto) {
431         case IPPROTO_UDP:
432         case IPPROTO_TCP:
433         case IPPROTO_SCTP:
434                 val ^= (__force u16)fl6->fl6_sport;
435                 val ^= (__force u16)fl6->fl6_dport;
436                 break;
437
438         case IPPROTO_ICMPV6:
439                 val ^= (__force u16)fl6->fl6_icmp_type;
440                 val ^= (__force u16)fl6->fl6_icmp_code;
441                 break;
442         }
443         /* RFC 6438 recommends using the flow label */
444         val ^= (__force u32)fl6->flowlabel;
445
446         /* Perhaps we need to tune this function? */
447         val = val ^ (val >> 7) ^ (val >> 12);
448         return val % candidate_count;
449 }
450
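/* Pick one route out of an ECMP group: hash the flow with
 * rt6_info_hash_nhsfn() and walk that many entries along the sibling list,
 * keeping @match if the selected sibling does not score as usable.
 */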
451 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
452                                              struct flowi6 *fl6, int oif,
453                                              int strict)
454 {
455         struct rt6_info *sibling, *next_sibling;
456         int route_choosen;
457
458         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
459         /* Don't change the route if route_choosen == 0
460          * (the sibling list does not include ourself)
461          */
462         if (route_choosen)
463                 list_for_each_entry_safe(sibling, next_sibling,
464                                 &match->rt6i_siblings, rt6i_siblings) {
465                         route_choosen--;
466                         if (route_choosen == 0) {
467                                 if (rt6_score_route(sibling, oif, strict) < 0)
468                                         break;
469                                 match = sibling;
470                                 break;
471                         }
472                 }
473         return match;
474 }
475
476 /*
477  *      Route lookup. Any table->tb6_lock is implied.
478  */
479
480 static inline struct rt6_info *rt6_device_match(struct net *net,
481                                                     struct rt6_info *rt,
482                                                     const struct in6_addr *saddr,
483                                                     int oif,
484                                                     int flags)
485 {
486         struct rt6_info *local = NULL;
487         struct rt6_info *sprt;
488
489         if (!oif && ipv6_addr_any(saddr))
490                 goto out;
491
492         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
493                 struct net_device *dev = sprt->dst.dev;
494
495                 if (oif) {
496                         if (dev->ifindex == oif)
497                                 return sprt;
498                         if (dev->flags & IFF_LOOPBACK) {
499                                 if (!sprt->rt6i_idev ||
500                                     sprt->rt6i_idev->dev->ifindex != oif) {
501                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
502                                                 continue;
503                                         if (local && (!oif ||
504                                                       local->rt6i_idev->dev->ifindex == oif))
505                                                 continue;
506                                 }
507                                 local = sprt;
508                         }
509                 } else {
510                         if (ipv6_chk_addr(net, saddr, dev,
511                                           flags & RT6_LOOKUP_F_IFACE))
512                                 return sprt;
513                 }
514         }
515
516         if (oif) {
517                 if (local)
518                         return local;
519
520                 if (flags & RT6_LOOKUP_F_IFACE)
521                         return net->ipv6.ip6_null_entry;
522         }
523 out:
524         return rt;
525 }
526
527 #ifdef CONFIG_IPV6_ROUTER_PREF
528 struct __rt6_probe_work {
529         struct work_struct work;
530         struct in6_addr target;
531         struct net_device *dev;
532 };
533
534 static void rt6_probe_deferred(struct work_struct *w)
535 {
536         struct in6_addr mcaddr;
537         struct __rt6_probe_work *work =
538                 container_of(w, struct __rt6_probe_work, work);
539
540         addrconf_addr_solict_mult(&work->target, &mcaddr);
541         ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
542         dev_put(work->dev);
543         kfree(work);
544 }
545
546 static void rt6_probe(struct rt6_info *rt)
547 {
548         struct neighbour *neigh;
549         /*
550          * This does not seem to be appropriate for now;
551          * however, we need to check whether it really is,
552          * aka Router Reachability Probing.
553          *
554          * A Router Reachability Probe MUST be rate-limited
555          * to no more than one per minute.
556          */
557         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
558                 return;
559         rcu_read_lock_bh();
560         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
561         if (neigh) {
562                 write_lock(&neigh->lock);
563                 if (neigh->nud_state & NUD_VALID)
564                         goto out;
565         }
566
567         if (!neigh ||
568             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
569                 struct __rt6_probe_work *work;
570
571                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
572
573                 if (neigh && work)
574                         __neigh_set_probe_once(neigh);
575
576                 if (neigh)
577                         write_unlock(&neigh->lock);
578
579                 if (work) {
580                         INIT_WORK(&work->work, rt6_probe_deferred);
581                         work->target = rt->rt6i_gateway;
582                         dev_hold(rt->dst.dev);
583                         work->dev = rt->dst.dev;
584                         schedule_work(&work->work);
585                 }
586         } else {
587 out:
588                 write_unlock(&neigh->lock);
589         }
590         rcu_read_unlock_bh();
591 }
592 #else
593 static inline void rt6_probe(struct rt6_info *rt)
594 {
595 }
596 #endif
597
598 /*
599  * Default Router Selection (RFC 2461 6.3.6)
600  */
601 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
602 {
603         struct net_device *dev = rt->dst.dev;
604         if (!oif || dev->ifindex == oif)
605                 return 2;
606         if ((dev->flags & IFF_LOOPBACK) &&
607             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
608                 return 1;
609         return 0;
610 }
611
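/* Judge next-hop reachability from the neighbour cache: routes without a
 * gateway always succeed; otherwise a NUD_VALID (or, with router preference
 * enabled, any non-failed) neighbour entry counts as reachable.
 */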
612 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
613 {
614         struct neighbour *neigh;
615         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
616
617         if (rt->rt6i_flags & RTF_NONEXTHOP ||
618             !(rt->rt6i_flags & RTF_GATEWAY))
619                 return RT6_NUD_SUCCEED;
620
621         rcu_read_lock_bh();
622         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
623         if (neigh) {
624                 read_lock(&neigh->lock);
625                 if (neigh->nud_state & NUD_VALID)
626                         ret = RT6_NUD_SUCCEED;
627 #ifdef CONFIG_IPV6_ROUTER_PREF
628                 else if (!(neigh->nud_state & NUD_FAILED))
629                         ret = RT6_NUD_SUCCEED;
630                 else
631                         ret = RT6_NUD_FAIL_PROBE;
632 #endif
633                 read_unlock(&neigh->lock);
634         } else {
635                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
636                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
637         }
638         rcu_read_unlock_bh();
639
640         return ret;
641 }
642
643 static int rt6_score_route(struct rt6_info *rt, int oif,
644                            int strict)
645 {
646         int m;
647
648         m = rt6_check_dev(rt, oif);
649         if (!m && (strict & RT6_LOOKUP_F_IFACE))
650                 return RT6_NUD_FAIL_HARD;
651 #ifdef CONFIG_IPV6_ROUTER_PREF
652         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
653 #endif
654         if (strict & RT6_LOOKUP_F_REACHABLE) {
655                 int n = rt6_check_neigh(rt);
656                 if (n < 0)
657                         return n;
658         }
659         return m;
660 }
661
662 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
663                                    int *mpri, struct rt6_info *match,
664                                    bool *do_rr)
665 {
666         int m;
667         bool match_do_rr = false;
668
669         if (rt6_check_expired(rt))
670                 goto out;
671
672         m = rt6_score_route(rt, oif, strict);
673         if (m == RT6_NUD_FAIL_DO_RR) {
674                 match_do_rr = true;
675                 m = 0; /* lowest valid score */
676         } else if (m == RT6_NUD_FAIL_HARD) {
677                 goto out;
678         }
679
680         if (strict & RT6_LOOKUP_F_REACHABLE)
681                 rt6_probe(rt);
682
683         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
684         if (m > *mpri) {
685                 *do_rr = match_do_rr;
686                 *mpri = m;
687                 match = rt;
688         }
689 out:
690         return match;
691 }
692
693 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
694                                      struct rt6_info *rr_head,
695                                      u32 metric, int oif, int strict,
696                                      bool *do_rr)
697 {
698         struct rt6_info *rt, *match, *cont;
699         int mpri = -1;
700
701         match = NULL;
702         cont = NULL;
703         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
704                 if (rt->rt6i_metric != metric) {
705                         cont = rt;
706                         break;
707                 }
708
709                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
710         }
711
712         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
713                 if (rt->rt6i_metric != metric) {
714                         cont = rt;
715                         break;
716                 }
717
718                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
719         }
720
721         if (match || !cont)
722                 return match;
723
724         for (rt = cont; rt; rt = rt->dst.rt6_next)
725                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
726
727         return match;
728 }
729
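/* Default router selection: score the routes sharing fn->rr_ptr's metric via
 * find_rr_leaf() and, when round-robin is requested (do_rr), advance
 * fn->rr_ptr so the next lookup starts from a different sibling.
 */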
730 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
731 {
732         struct rt6_info *match, *rt0;
733         struct net *net;
734         bool do_rr = false;
735
736         rt0 = fn->rr_ptr;
737         if (!rt0)
738                 fn->rr_ptr = rt0 = fn->leaf;
739
740         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
741                              &do_rr);
742
743         if (do_rr) {
744                 struct rt6_info *next = rt0->dst.rt6_next;
745
746                 /* no entries matched; do round-robin */
747                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
748                         next = fn->leaf;
749
750                 if (next != rt0)
751                         fn->rr_ptr = next;
752         }
753
754         net = dev_net(rt0->dst.dev);
755         return match ? match : net->ipv6.ip6_null_entry;
756 }
757
758 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
759 {
760         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
761 }
762
763 #ifdef CONFIG_IPV6_ROUTE_INFO
764 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
765                   const struct in6_addr *gwaddr)
766 {
767         struct net *net = dev_net(dev);
768         struct route_info *rinfo = (struct route_info *) opt;
769         struct in6_addr prefix_buf, *prefix;
770         unsigned int pref;
771         unsigned long lifetime;
772         struct rt6_info *rt;
773
774         if (len < sizeof(struct route_info)) {
775                 return -EINVAL;
776         }
777
778         /* Sanity check for prefix_len and length */
779         if (rinfo->length > 3) {
780                 return -EINVAL;
781         } else if (rinfo->prefix_len > 128) {
782                 return -EINVAL;
783         } else if (rinfo->prefix_len > 64) {
784                 if (rinfo->length < 2) {
785                         return -EINVAL;
786                 }
787         } else if (rinfo->prefix_len > 0) {
788                 if (rinfo->length < 1) {
789                         return -EINVAL;
790                 }
791         }
792
793         pref = rinfo->route_pref;
794         if (pref == ICMPV6_ROUTER_PREF_INVALID)
795                 return -EINVAL;
796
797         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
798
799         if (rinfo->length == 3)
800                 prefix = (struct in6_addr *)rinfo->prefix;
801         else {
802                 /* this function is safe */
803                 ipv6_addr_prefix(&prefix_buf,
804                                  (struct in6_addr *)rinfo->prefix,
805                                  rinfo->prefix_len);
806                 prefix = &prefix_buf;
807         }
808
809         if (rinfo->prefix_len == 0)
810                 rt = rt6_get_dflt_router(gwaddr, dev);
811         else
812                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
813                                         gwaddr, dev->ifindex);
814
815         if (rt && !lifetime) {
816                 ip6_del_rt(rt);
817                 rt = NULL;
818         }
819
820         if (!rt && lifetime)
821                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
822                                         pref);
823         else if (rt)
824                 rt->rt6i_flags = RTF_ROUTEINFO |
825                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
826
827         if (rt) {
828                 if (!addrconf_finite_timeout(lifetime))
829                         rt6_clean_expires(rt);
830                 else
831                         rt6_set_expires(rt, jiffies + HZ * lifetime);
832
833                 ip6_rt_put(rt);
834         }
835         return 0;
836 }
837 #endif
838
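/* Walk back up the fib6 tree from @fn until a node carrying route
 * information (RTN_RTINFO) is found, descending into source-routed subtrees
 * on the way up; returns NULL once the tree root is reached.
 */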
839 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
840                                         struct in6_addr *saddr)
841 {
842         struct fib6_node *pn;
843         while (1) {
844                 if (fn->fn_flags & RTN_TL_ROOT)
845                         return NULL;
846                 pn = fn->parent;
847                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
848                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
849                 else
850                         fn = pn;
851                 if (fn->fn_flags & RTN_RTINFO)
852                         return fn;
853         }
854 }
855
856 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
857                                              struct fib6_table *table,
858                                              struct flowi6 *fl6, int flags)
859 {
860         struct fib6_node *fn;
861         struct rt6_info *rt;
862
863         read_lock_bh(&table->tb6_lock);
864         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
865 restart:
866         rt = fn->leaf;
867         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
868         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
869                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
870         if (rt == net->ipv6.ip6_null_entry) {
871                 fn = fib6_backtrack(fn, &fl6->saddr);
872                 if (fn)
873                         goto restart;
874         }
875         dst_use(&rt->dst, jiffies);
876         read_unlock_bh(&table->tb6_lock);
877         return rt;
878
879 }
880
881 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
882                                     int flags)
883 {
884         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
885 }
886 EXPORT_SYMBOL_GPL(ip6_route_lookup);
887
888 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
889                             const struct in6_addr *saddr, int oif, int strict)
890 {
891         struct flowi6 fl6 = {
892                 .flowi6_oif = oif,
893                 .daddr = *daddr,
894         };
895         struct dst_entry *dst;
896         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
897
898         if (saddr) {
899                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
900                 flags |= RT6_LOOKUP_F_HAS_SADDR;
901         }
902
903         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
904         if (dst->error == 0)
905                 return (struct rt6_info *) dst;
906
907         dst_release(dst);
908
909         return NULL;
910 }
911 EXPORT_SYMBOL(rt6_lookup);
912
913 /* ip6_ins_rt is called with table->tb6_lock NOT held.
914    It takes a new route entry; if the addition fails for any reason,
915    the route is freed. In any case, if the caller does not hold a
916    reference, the route may be destroyed.
917  */
918
919 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
920                         struct mx6_config *mxc)
921 {
922         int err;
923         struct fib6_table *table;
924
925         table = rt->rt6i_table;
926         write_lock_bh(&table->tb6_lock);
927         err = fib6_add(&table->tb6_root, rt, info, mxc);
928         write_unlock_bh(&table->tb6_lock);
929
930         return err;
931 }
932
933 int ip6_ins_rt(struct rt6_info *rt)
934 {
935         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
936         struct mx6_config mxc = { .mx = NULL, };
937
938         return __ip6_ins_rt(rt, &info, &mxc);
939 }
940
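/* Create an RTF_CACHE host route (/128) for @daddr cloned from @ort.  If
 * @ort is itself a cached or per-cpu copy, clone from the route it was
 * derived from (dst.from) instead.
 */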
941 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
942                                            const struct in6_addr *daddr,
943                                            const struct in6_addr *saddr)
944 {
945         struct rt6_info *rt;
946
947         /*
948          *      Clone the route.
949          */
950
951         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
952                 ort = (struct rt6_info *)ort->dst.from;
953
954         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
955                              0, ort->rt6i_table);
956
957         if (!rt)
958                 return NULL;
959
960         ip6_rt_copy_init(rt, ort);
961         rt->rt6i_flags |= RTF_CACHE;
962         rt->rt6i_metric = 0;
963         rt->dst.flags |= DST_HOST;
964         rt->rt6i_dst.addr = *daddr;
965         rt->rt6i_dst.plen = 128;
966
967         if (!rt6_is_gw_or_nonexthop(ort)) {
968                 if (ort->rt6i_dst.plen != 128 &&
969                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
970                         rt->rt6i_flags |= RTF_ANYCAST;
971 #ifdef CONFIG_IPV6_SUBTREES
972                 if (rt->rt6i_src.plen && saddr) {
973                         rt->rt6i_src.addr = *saddr;
974                         rt->rt6i_src.plen = 128;
975                 }
976 #endif
977         }
978
979         return rt;
980 }
981
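/* Allocate a per-cpu (RTF_PCPU) copy of @rt.  The copy shares its metrics
 * with the origin route via dst.from, see rt6_pcpu_cow_metrics().
 */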
982 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
983 {
984         struct rt6_info *pcpu_rt;
985
986         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
987                                   rt->dst.dev, rt->dst.flags,
988                                   rt->rt6i_table);
989
990         if (!pcpu_rt)
991                 return NULL;
992         ip6_rt_copy_init(pcpu_rt, rt);
993         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
994         pcpu_rt->rt6i_flags |= RTF_PCPU;
995         return pcpu_rt;
996 }
997
998 /* It should be called with read_lock_bh(&tb6_lock) acquired */
999 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1000 {
1001         struct rt6_info *pcpu_rt, *prev, **p;
1002
1003         p = this_cpu_ptr(rt->rt6i_pcpu);
1004         pcpu_rt = *p;
1005
1006         if (pcpu_rt)
1007                 goto done;
1008
1009         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1010         if (!pcpu_rt) {
1011                 struct net *net = dev_net(rt->dst.dev);
1012
1013                 pcpu_rt = net->ipv6.ip6_null_entry;
1014                 goto done;
1015         }
1016
1017         prev = cmpxchg(p, NULL, pcpu_rt);
1018         if (prev) {
1019                 /* If someone did it before us, return prev instead */
1020                 dst_destroy(&pcpu_rt->dst);
1021                 pcpu_rt = prev;
1022         }
1023
1024 done:
1025         dst_hold(&pcpu_rt->dst);
1026         rt6_dst_from_metrics_check(pcpu_rt);
1027         return pcpu_rt;
1028 }
1029
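/* Core fib6 lookup used for both input and output.  After rt6_select() and
 * backtracking, the result is returned directly for RTF_CACHE/null entries,
 * as an uncached RTF_CACHE clone when FLOWI_FLAG_KNOWN_NH is set on a
 * non-gateway route, or as a per-cpu copy otherwise.
 */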
1030 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1031                                       struct flowi6 *fl6, int flags)
1032 {
1033         struct fib6_node *fn, *saved_fn;
1034         struct rt6_info *rt;
1035         int strict = 0;
1036
1037         strict |= flags & RT6_LOOKUP_F_IFACE;
1038         if (net->ipv6.devconf_all->forwarding == 0)
1039                 strict |= RT6_LOOKUP_F_REACHABLE;
1040
1041         read_lock_bh(&table->tb6_lock);
1042
1043         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1044         saved_fn = fn;
1045
1046 redo_rt6_select:
1047         rt = rt6_select(fn, oif, strict);
1048         if (rt->rt6i_nsiblings)
1049                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1050         if (rt == net->ipv6.ip6_null_entry) {
1051                 fn = fib6_backtrack(fn, &fl6->saddr);
1052                 if (fn)
1053                         goto redo_rt6_select;
1054                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1055                         /* also consider unreachable route */
1056                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1057                         fn = saved_fn;
1058                         goto redo_rt6_select;
1059                 }
1060         }
1061
1062
1063         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1064                 dst_use(&rt->dst, jiffies);
1065                 read_unlock_bh(&table->tb6_lock);
1066
1067                 rt6_dst_from_metrics_check(rt);
1068                 return rt;
1069         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1070                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1071                 /* Create an RTF_CACHE clone which will not be
1072                  * owned by the fib6 tree.  It is for the special case where
1073                  * the daddr in the skb during the neighbor look-up is different
1074                  * from the fl6->daddr used to look up the route here.
1075                  */
1076
1077                 struct rt6_info *uncached_rt;
1078
1079                 dst_use(&rt->dst, jiffies);
1080                 read_unlock_bh(&table->tb6_lock);
1081
1082                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1083                 dst_release(&rt->dst);
1084
1085                 if (uncached_rt)
1086                         rt6_uncached_list_add(uncached_rt);
1087                 else
1088                         uncached_rt = net->ipv6.ip6_null_entry;
1089
1090                 dst_hold(&uncached_rt->dst);
1091                 return uncached_rt;
1092
1093         } else {
1094                 /* Get a percpu copy */
1095
1096                 struct rt6_info *pcpu_rt;
1097
1098                 rt->dst.lastuse = jiffies;
1099                 rt->dst.__use++;
1100                 pcpu_rt = rt6_get_pcpu_route(rt);
1101                 read_unlock_bh(&table->tb6_lock);
1102
1103                 return pcpu_rt;
1104         }
1105 }
1106
1107 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1108                                             struct flowi6 *fl6, int flags)
1109 {
1110         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1111 }
1112
1113 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1114                                                 struct net_device *dev,
1115                                                 struct flowi6 *fl6, int flags)
1116 {
1117         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1118                 flags |= RT6_LOOKUP_F_IFACE;
1119
1120         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1121 }
1122
1123 void ip6_route_input(struct sk_buff *skb)
1124 {
1125         const struct ipv6hdr *iph = ipv6_hdr(skb);
1126         struct net *net = dev_net(skb->dev);
1127         int flags = RT6_LOOKUP_F_HAS_SADDR;
1128         struct flowi6 fl6 = {
1129                 .flowi6_iif = skb->dev->ifindex,
1130                 .daddr = iph->daddr,
1131                 .saddr = iph->saddr,
1132                 .flowlabel = ip6_flowinfo(iph),
1133                 .flowi6_mark = skb->mark,
1134                 .flowi6_proto = iph->nexthdr,
1135         };
1136
1137         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1138 }
1139
1140 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1141                                              struct flowi6 *fl6, int flags)
1142 {
1143         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1144 }
1145
1146 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1147                                     struct flowi6 *fl6)
1148 {
1149         int flags = 0;
1150
1151         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1152
1153         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1154                 flags |= RT6_LOOKUP_F_IFACE;
1155
1156         if (!ipv6_addr_any(&fl6->saddr))
1157                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1158         else if (sk)
1159                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1160
1161         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1162 }
1163 EXPORT_SYMBOL(ip6_route_output);
1164
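/* Replace @dst_orig with a blackhole copy whose input/output handlers simply
 * discard packets; metrics, idev, gateway and destination keys are copied
 * from the original route, and the original reference is released.
 */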
1165 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1166 {
1167         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1168         struct dst_entry *new = NULL;
1169
1170         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1171         if (rt) {
1172                 new = &rt->dst;
1173
1174                 memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
1175
1176                 new->__use = 1;
1177                 new->input = dst_discard;
1178                 new->output = dst_discard_sk;
1179
1180                 if (dst_metrics_read_only(&ort->dst))
1181                         new->_metrics = ort->dst._metrics;
1182                 else
1183                         dst_copy_metrics(new, &ort->dst);
1184                 rt->rt6i_idev = ort->rt6i_idev;
1185                 if (rt->rt6i_idev)
1186                         in6_dev_hold(rt->rt6i_idev);
1187
1188                 rt->rt6i_gateway = ort->rt6i_gateway;
1189                 rt->rt6i_flags = ort->rt6i_flags;
1190                 rt->rt6i_metric = 0;
1191
1192                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1193 #ifdef CONFIG_IPV6_SUBTREES
1194                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1195 #endif
1196
1197                 dst_free(new);
1198         }
1199
1200         dst_release(dst_orig);
1201         return new ? new : ERR_PTR(-ENOMEM);
1202 }
1203
1204 /*
1205  *      Destination cache support functions
1206  */
1207
1208 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1209 {
1210         if (rt->dst.from &&
1211             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1212                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1213 }
1214
1215 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1216 {
1217         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1218                 return NULL;
1219
1220         if (rt6_check_expired(rt))
1221                 return NULL;
1222
1223         return &rt->dst;
1224 }
1225
1226 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1227 {
1228         if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1229             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1230                 return &rt->dst;
1231         else
1232                 return NULL;
1233 }
1234
1235 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1236 {
1237         struct rt6_info *rt;
1238
1239         rt = (struct rt6_info *) dst;
1240
1241         /* All IPv6 dsts are created with ->obsolete set to
1242          * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
1243          * down into this function.
1244          */
1245
1246         rt6_dst_from_metrics_check(rt);
1247
1248         if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1249                 return rt6_dst_from_check(rt, cookie);
1250         else
1251                 return rt6_check(rt, cookie);
1252 }
1253
1254 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1255 {
1256         struct rt6_info *rt = (struct rt6_info *) dst;
1257
1258         if (rt) {
1259                 if (rt->rt6i_flags & RTF_CACHE) {
1260                         if (rt6_check_expired(rt)) {
1261                                 ip6_del_rt(rt);
1262                                 dst = NULL;
1263                         }
1264                 } else {
1265                         dst_release(dst);
1266                         dst = NULL;
1267                 }
1268         }
1269         return dst;
1270 }
1271
1272 static void ip6_link_failure(struct sk_buff *skb)
1273 {
1274         struct rt6_info *rt;
1275
1276         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1277
1278         rt = (struct rt6_info *) skb_dst(skb);
1279         if (rt) {
1280                 if (rt->rt6i_flags & RTF_CACHE) {
1281                         dst_hold(&rt->dst);
1282                         if (ip6_del_rt(rt))
1283                                 dst_free(&rt->dst);
1284                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1285                         rt->rt6i_node->fn_sernum = -1;
1286                 }
1287         }
1288 }
1289
1290 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1291 {
1292         struct net *net = dev_net(rt->dst.dev);
1293
1294         rt->rt6i_flags |= RTF_MODIFIED;
1295         rt->rt6i_pmtu = mtu;
1296         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1297 }
1298
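/* Apply a PMTU update (RTF_LOCAL routes and non-shrinking MTUs are ignored):
 * RTF_CACHE routes are updated in place, otherwise an RTF_CACHE clone for
 * the flow's addresses is created, updated and inserted so the smaller MTU
 * only affects that destination.
 */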
1299 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1300                                  const struct ipv6hdr *iph, u32 mtu)
1301 {
1302         struct rt6_info *rt6 = (struct rt6_info *)dst;
1303
1304         if (rt6->rt6i_flags & RTF_LOCAL)
1305                 return;
1306
1307         dst_confirm(dst);
1308         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1309         if (mtu >= dst_mtu(dst))
1310                 return;
1311
1312         if (rt6->rt6i_flags & RTF_CACHE) {
1313                 rt6_do_update_pmtu(rt6, mtu);
1314         } else {
1315                 const struct in6_addr *daddr, *saddr;
1316                 struct rt6_info *nrt6;
1317
1318                 if (iph) {
1319                         daddr = &iph->daddr;
1320                         saddr = &iph->saddr;
1321                 } else if (sk) {
1322                         daddr = &sk->sk_v6_daddr;
1323                         saddr = &inet6_sk(sk)->saddr;
1324                 } else {
1325                         return;
1326                 }
1327                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1328                 if (nrt6) {
1329                         rt6_do_update_pmtu(nrt6, mtu);
1330
1331                         /* ip6_ins_rt(nrt6) will bump
1332                          * rt6->rt6i_node->fn_sernum,
1333                          * which will make the next rt6_check() fail and
1334                          * invalidate the sk->sk_dst_cache.
1335                          */
1336                         ip6_ins_rt(nrt6);
1337                 }
1338         }
1339 }
1340
1341 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1342                                struct sk_buff *skb, u32 mtu)
1343 {
1344         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1345 }
1346
1347 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1348                      int oif, u32 mark)
1349 {
1350         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1351         struct dst_entry *dst;
1352         struct flowi6 fl6;
1353
1354         memset(&fl6, 0, sizeof(fl6));
1355         fl6.flowi6_oif = oif;
1356         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1357         fl6.daddr = iph->daddr;
1358         fl6.saddr = iph->saddr;
1359         fl6.flowlabel = ip6_flowinfo(iph);
1360
1361         dst = ip6_route_output(net, NULL, &fl6);
1362         if (!dst->error)
1363                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1364         dst_release(dst);
1365 }
1366 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1367
1368 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1369 {
1370         ip6_update_pmtu(skb, sock_net(sk), mtu,
1371                         sk->sk_bound_dev_if, sk->sk_mark);
1372 }
1373 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1374
1375 /* Handle redirects */
1376 struct ip6rd_flowi {
1377         struct flowi6 fl6;
1378         struct in6_addr gateway;
1379 };
1380
1381 static struct rt6_info *__ip6_route_redirect(struct net *net,
1382                                              struct fib6_table *table,
1383                                              struct flowi6 *fl6,
1384                                              int flags)
1385 {
1386         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1387         struct rt6_info *rt;
1388         struct fib6_node *fn;
1389
1390         /* Get the "current" route for this destination and
1391          * check if the redirect has come from an appropriate router.
1392          *
1393          * RFC 4861 specifies that redirects should only be
1394          * accepted if they come from the nexthop to the target.
1395          * Due to the way the routes are chosen, this notion
1396          * is a bit fuzzy and one might need to check all possible
1397          * routes.
1398          */
1399
1400         read_lock_bh(&table->tb6_lock);
1401         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1402 restart:
1403         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1404                 if (rt6_check_expired(rt))
1405                         continue;
1406                 if (rt->dst.error)
1407                         break;
1408                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1409                         continue;
1410                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1411                         continue;
1412                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1413                         continue;
1414                 break;
1415         }
1416
1417         if (!rt)
1418                 rt = net->ipv6.ip6_null_entry;
1419         else if (rt->dst.error) {
1420                 rt = net->ipv6.ip6_null_entry;
1421                 goto out;
1422         }
1423
1424         if (rt == net->ipv6.ip6_null_entry) {
1425                 fn = fib6_backtrack(fn, &fl6->saddr);
1426                 if (fn)
1427                         goto restart;
1428         }
1429
1430 out:
1431         dst_hold(&rt->dst);
1432
1433         read_unlock_bh(&table->tb6_lock);
1434
1435         return rt;
1436 };
1437
1438 static struct dst_entry *ip6_route_redirect(struct net *net,
1439                                         const struct flowi6 *fl6,
1440                                         const struct in6_addr *gateway)
1441 {
1442         int flags = RT6_LOOKUP_F_HAS_SADDR;
1443         struct ip6rd_flowi rdfl;
1444
1445         rdfl.fl6 = *fl6;
1446         rdfl.gateway = *gateway;
1447
1448         return fib6_rule_lookup(net, &rdfl.fl6,
1449                                 flags, __ip6_route_redirect);
1450 }
1451
1452 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1453 {
1454         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1455         struct dst_entry *dst;
1456         struct flowi6 fl6;
1457
1458         memset(&fl6, 0, sizeof(fl6));
1459         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1460         fl6.flowi6_oif = oif;
1461         fl6.flowi6_mark = mark;
1462         fl6.daddr = iph->daddr;
1463         fl6.saddr = iph->saddr;
1464         fl6.flowlabel = ip6_flowinfo(iph);
1465
1466         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1467         rt6_do_redirect(dst, NULL, skb);
1468         dst_release(dst);
1469 }
1470 EXPORT_SYMBOL_GPL(ip6_redirect);
1471
1472 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1473                             u32 mark)
1474 {
1475         const struct ipv6hdr *iph = ipv6_hdr(skb);
1476         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1477         struct dst_entry *dst;
1478         struct flowi6 fl6;
1479
1480         memset(&fl6, 0, sizeof(fl6));
1481         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1482         fl6.flowi6_oif = oif;
1483         fl6.flowi6_mark = mark;
1484         fl6.daddr = msg->dest;
1485         fl6.saddr = iph->daddr;
1486
1487         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1488         rt6_do_redirect(dst, NULL, skb);
1489         dst_release(dst);
1490 }
1491
1492 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1493 {
1494         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1495 }
1496 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1497
1498 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1499 {
1500         struct net_device *dev = dst->dev;
1501         unsigned int mtu = dst_mtu(dst);
1502         struct net *net = dev_net(dev);
1503
1504         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1505
1506         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1507                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1508
1509         /*
1510          * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and the
1511          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1512          * IPV6_MAXPLEN is also valid and means: "any MSS,
1513          * rely only on pmtu discovery"
1514          */
1515         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1516                 mtu = IPV6_MAXPLEN;
1517         return mtu;
1518 }
1519
1520 static unsigned int ip6_mtu(const struct dst_entry *dst)
1521 {
1522         const struct rt6_info *rt = (const struct rt6_info *)dst;
1523         unsigned int mtu = rt->rt6i_pmtu;
1524         struct inet6_dev *idev;
1525
1526         if (mtu)
1527                 goto out;
1528
1529         mtu = dst_metric_raw(dst, RTAX_MTU);
1530         if (mtu)
1531                 goto out;
1532
1533         mtu = IPV6_MIN_MTU;
1534
1535         rcu_read_lock();
1536         idev = __in6_dev_get(dst->dev);
1537         if (idev)
1538                 mtu = idev->cnf.mtu6;
1539         rcu_read_unlock();
1540
1541 out:
1542         return min_t(unsigned int, mtu, IP6_MAX_MTU);
1543 }
1544
1545 static struct dst_entry *icmp6_dst_gc_list;
1546 static DEFINE_SPINLOCK(icmp6_dst_lock);
1547
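/* ICMPv6 dst entries are not inserted into the fib6 tree; they are chained
 * on icmp6_dst_gc_list and reclaimed by icmp6_dst_gc() once their refcount
 * drops to zero.
 */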
1548 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1549                                   struct flowi6 *fl6)
1550 {
1551         struct dst_entry *dst;
1552         struct rt6_info *rt;
1553         struct inet6_dev *idev = in6_dev_get(dev);
1554         struct net *net = dev_net(dev);
1555
1556         if (unlikely(!idev))
1557                 return ERR_PTR(-ENODEV);
1558
1559         rt = ip6_dst_alloc(net, dev, 0, NULL);
1560         if (unlikely(!rt)) {
1561                 in6_dev_put(idev);
1562                 dst = ERR_PTR(-ENOMEM);
1563                 goto out;
1564         }
1565
1566         rt->dst.flags |= DST_HOST;
1567         rt->dst.output  = ip6_output;
1568         atomic_set(&rt->dst.__refcnt, 1);
1569         rt->rt6i_gateway  = fl6->daddr;
1570         rt->rt6i_dst.addr = fl6->daddr;
1571         rt->rt6i_dst.plen = 128;
1572         rt->rt6i_idev     = idev;
1573         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1574
1575         spin_lock_bh(&icmp6_dst_lock);
1576         rt->dst.next = icmp6_dst_gc_list;
1577         icmp6_dst_gc_list = &rt->dst;
1578         spin_unlock_bh(&icmp6_dst_lock);
1579
1580         fib6_force_start_gc(net);
1581
1582         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1583
1584 out:
1585         return dst;
1586 }
1587
1588 int icmp6_dst_gc(void)
1589 {
1590         struct dst_entry *dst, **pprev;
1591         int more = 0;
1592
1593         spin_lock_bh(&icmp6_dst_lock);
1594         pprev = &icmp6_dst_gc_list;
1595
1596         while ((dst = *pprev) != NULL) {
1597                 if (!atomic_read(&dst->__refcnt)) {
1598                         *pprev = dst->next;
1599                         dst_free(dst);
1600                 } else {
1601                         pprev = &dst->next;
1602                         ++more;
1603                 }
1604         }
1605
1606         spin_unlock_bh(&icmp6_dst_lock);
1607
1608         return more;
1609 }
1610
1611 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1612                             void *arg)
1613 {
1614         struct dst_entry *dst, **pprev;
1615
1616         spin_lock_bh(&icmp6_dst_lock);
1617         pprev = &icmp6_dst_gc_list;
1618         while ((dst = *pprev) != NULL) {
1619                 struct rt6_info *rt = (struct rt6_info *) dst;
1620                 if (func(rt, arg)) {
1621                         *pprev = dst->next;
1622                         dst_free(dst);
1623                 } else {
1624                         pprev = &dst->next;
1625                 }
1626         }
1627         spin_unlock_bh(&icmp6_dst_lock);
1628 }
1629
1630 static int ip6_dst_gc(struct dst_ops *ops)
1631 {
1632         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1633         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1634         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1635         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1636         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1637         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1638         int entries;
1639
1640         entries = dst_entries_get_fast(ops);
1641         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1642             entries <= rt_max_size)
1643                 goto out;
1644
1645         net->ipv6.ip6_rt_gc_expire++;
1646         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1647         entries = dst_entries_get_slow(ops);
1648         if (entries < ops->gc_thresh)
1649                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1650 out:
1651         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1652         return entries > rt_max_size;
1653 }
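
/*
 * GC pacing sketch for ip6_dst_gc(): a forced fib6 garbage collection runs
 * only when ip6_rt_gc_min_interval has elapsed or the entry count exceeds
 * ip6_rt_max_size.  Each forced pass bumps ip6_rt_gc_expire so subsequent
 * passes expire entries more aggressively; once the entry count drops
 * below gc_thresh the expiry is reset to half of ip6_rt_gc_timeout, and
 * every call decays it by expire >> gc_elasticity (about 1/512 per call
 * with the usual default elasticity of 9).
 */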
1654
1655 static int ip6_convert_metrics(struct mx6_config *mxc,
1656                                const struct fib6_config *cfg)
1657 {
1658         struct nlattr *nla;
1659         int remaining;
1660         u32 *mp;
1661
1662         if (!cfg->fc_mx)
1663                 return 0;
1664
1665         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1666         if (unlikely(!mp))
1667                 return -ENOMEM;
1668
1669         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1670                 int type = nla_type(nla);
1671
1672                 if (type) {
1673                         u32 val;
1674
1675                         if (unlikely(type > RTAX_MAX))
1676                                 goto err;
1677                         if (type == RTAX_CC_ALGO) {
1678                                 char tmp[TCP_CA_NAME_MAX];
1679
1680                                 nla_strlcpy(tmp, nla, sizeof(tmp));
1681                                 val = tcp_ca_get_key_by_name(tmp);
1682                                 if (val == TCP_CA_UNSPEC)
1683                                         goto err;
1684                         } else {
1685                                 val = nla_get_u32(nla);
1686                         }
1687
1688                         mp[type - 1] = val;
1689                         __set_bit(type - 1, mxc->mx_valid);
1690                 }
1691         }
1692
1693         mxc->mx = mp;
1694
1695         return 0;
1696  err:
1697         kfree(mp);
1698         return -EINVAL;
1699 }
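
/*
 * Example for ip6_convert_metrics() (values illustrative): an RTA_METRICS
 * nest carrying RTAX_MTU = 1280 and RTAX_CC_ALGO = "cubic" is converted
 * into mp[RTAX_MTU - 1] = 1280 and mp[RTAX_CC_ALGO - 1] set to the key
 * returned by tcp_ca_get_key_by_name("cubic"), with the matching bits set
 * in mxc->mx_valid so fib6 knows which metrics are present.
 */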
1700
1701 int ip6_route_add(struct fib6_config *cfg)
1702 {
1703         int err;
1704         struct net *net = cfg->fc_nlinfo.nl_net;
1705         struct rt6_info *rt = NULL;
1706         struct net_device *dev = NULL;
1707         struct inet6_dev *idev = NULL;
1708         struct fib6_table *table;
1709         struct mx6_config mxc = { .mx = NULL, };
1710         int addr_type;
1711
1712         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1713                 return -EINVAL;
1714 #ifndef CONFIG_IPV6_SUBTREES
1715         if (cfg->fc_src_len)
1716                 return -EINVAL;
1717 #endif
1718         if (cfg->fc_ifindex) {
1719                 err = -ENODEV;
1720                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1721                 if (!dev)
1722                         goto out;
1723                 idev = in6_dev_get(dev);
1724                 if (!idev)
1725                         goto out;
1726         }
1727
1728         if (cfg->fc_metric == 0)
1729                 cfg->fc_metric = IP6_RT_PRIO_USER;
1730
1731         err = -ENOBUFS;
1732         if (cfg->fc_nlinfo.nlh &&
1733             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1734                 table = fib6_get_table(net, cfg->fc_table);
1735                 if (!table) {
1736                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1737                         table = fib6_new_table(net, cfg->fc_table);
1738                 }
1739         } else {
1740                 table = fib6_new_table(net, cfg->fc_table);
1741         }
1742
1743         if (!table)
1744                 goto out;
1745
1746         rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table);
1747
1748         if (!rt) {
1749                 err = -ENOMEM;
1750                 goto out;
1751         }
1752
1753         if (cfg->fc_flags & RTF_EXPIRES)
1754                 rt6_set_expires(rt, jiffies +
1755                                 clock_t_to_jiffies(cfg->fc_expires));
1756         else
1757                 rt6_clean_expires(rt);
1758
1759         if (cfg->fc_protocol == RTPROT_UNSPEC)
1760                 cfg->fc_protocol = RTPROT_BOOT;
1761         rt->rt6i_protocol = cfg->fc_protocol;
1762
1763         addr_type = ipv6_addr_type(&cfg->fc_dst);
1764
1765         if (addr_type & IPV6_ADDR_MULTICAST)
1766                 rt->dst.input = ip6_mc_input;
1767         else if (cfg->fc_flags & RTF_LOCAL)
1768                 rt->dst.input = ip6_input;
1769         else
1770                 rt->dst.input = ip6_forward;
1771
1772         rt->dst.output = ip6_output;
1773
1774         if (cfg->fc_encap) {
1775                 struct lwtunnel_state *lwtstate;
1776
1777                 err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1778                                            cfg->fc_encap, &lwtstate);
1779                 if (err)
1780                         goto out;
1781                 lwtunnel_state_get(lwtstate);
1782                 rt->rt6i_lwtstate = lwtstate;
1783                 if (lwtunnel_output_redirect(rt->rt6i_lwtstate))
1784                         rt->dst.output = lwtunnel_output6;
1785         }
1786
1787         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1788         rt->rt6i_dst.plen = cfg->fc_dst_len;
1789         if (rt->rt6i_dst.plen == 128)
1790                 rt->dst.flags |= DST_HOST;
1791
1792 #ifdef CONFIG_IPV6_SUBTREES
1793         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1794         rt->rt6i_src.plen = cfg->fc_src_len;
1795 #endif
1796
1797         rt->rt6i_metric = cfg->fc_metric;
1798
1799         /* We cannot add true routes via loopback here;
1800            they would result in kernel looping. Promote them to reject routes.
1801          */
1802         if ((cfg->fc_flags & RTF_REJECT) ||
1803             (dev && (dev->flags & IFF_LOOPBACK) &&
1804              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1805              !(cfg->fc_flags & RTF_LOCAL))) {
1806                 /* hold loopback dev/idev if we haven't done so. */
1807                 if (dev != net->loopback_dev) {
1808                         if (dev) {
1809                                 dev_put(dev);
1810                                 in6_dev_put(idev);
1811                         }
1812                         dev = net->loopback_dev;
1813                         dev_hold(dev);
1814                         idev = in6_dev_get(dev);
1815                         if (!idev) {
1816                                 err = -ENODEV;
1817                                 goto out;
1818                         }
1819                 }
1820                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1821                 switch (cfg->fc_type) {
1822                 case RTN_BLACKHOLE:
1823                         rt->dst.error = -EINVAL;
1824                         rt->dst.output = dst_discard_sk;
1825                         rt->dst.input = dst_discard;
1826                         break;
1827                 case RTN_PROHIBIT:
1828                         rt->dst.error = -EACCES;
1829                         rt->dst.output = ip6_pkt_prohibit_out;
1830                         rt->dst.input = ip6_pkt_prohibit;
1831                         break;
1832                 case RTN_THROW:
1833                 default:
1834                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1835                                         : -ENETUNREACH;
1836                         rt->dst.output = ip6_pkt_discard_out;
1837                         rt->dst.input = ip6_pkt_discard;
1838                         break;
1839                 }
1840                 goto install_route;
1841         }
1842
1843         if (cfg->fc_flags & RTF_GATEWAY) {
1844                 const struct in6_addr *gw_addr;
1845                 int gwa_type;
1846
1847                 gw_addr = &cfg->fc_gateway;
1848
1849                 /* If gw_addr is local we can fail to detect it while the
1850                  * address is still TENTATIVE (DAD in progress): rt6_lookup()
1851                  * will return the already-added prefix route via the interface
1852                  * the prefix route was assigned to, which might be non-loopback.
1853                  */
1854                 err = -EINVAL;
1855                 if (ipv6_chk_addr_and_flags(net, gw_addr, NULL, 0, 0))
1856                         goto out;
1857
1858                 rt->rt6i_gateway = *gw_addr;
1859                 gwa_type = ipv6_addr_type(gw_addr);
1860
1861                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1862                         struct rt6_info *grt;
1863
1864                         /* IPv6 strictly prohibits using non-link-local
1865                            addresses as the nexthop address.
1866                            Otherwise, the router will not be able to send redirects.
1867                            That is a good rule, but in some (rare!) circumstances
1868                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1869                            some exceptions. --ANK
1870                          */
1871                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1872                                 goto out;
1873
1874                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1875
1876                         err = -EHOSTUNREACH;
1877                         if (!grt)
1878                                 goto out;
1879                         if (dev) {
1880                                 if (dev != grt->dst.dev) {
1881                                         ip6_rt_put(grt);
1882                                         goto out;
1883                                 }
1884                         } else {
1885                                 dev = grt->dst.dev;
1886                                 idev = grt->rt6i_idev;
1887                                 dev_hold(dev);
1888                                 in6_dev_hold(grt->rt6i_idev);
1889                         }
1890                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1891                                 err = 0;
1892                         ip6_rt_put(grt);
1893
1894                         if (err)
1895                                 goto out;
1896                 }
1897                 err = -EINVAL;
1898                 if (!dev || (dev->flags & IFF_LOOPBACK))
1899                         goto out;
1900         }
1901
1902         err = -ENODEV;
1903         if (!dev)
1904                 goto out;
1905
1906         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1907                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1908                         err = -EINVAL;
1909                         goto out;
1910                 }
1911                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1912                 rt->rt6i_prefsrc.plen = 128;
1913         } else
1914                 rt->rt6i_prefsrc.plen = 0;
1915
1916         rt->rt6i_flags = cfg->fc_flags;
1917
1918 install_route:
1919         rt->dst.dev = dev;
1920         rt->rt6i_idev = idev;
1921         rt->rt6i_table = table;
1922
1923         cfg->fc_nlinfo.nl_net = dev_net(dev);
1924
1925         err = ip6_convert_metrics(&mxc, cfg);
1926         if (err)
1927                 goto out;
1928
1929         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
1930
1931         kfree(mxc.mx);
1932         return err;
1933 out:
1934         if (dev)
1935                 dev_put(dev);
1936         if (idev)
1937                 in6_dev_put(idev);
1938         if (rt)
1939                 dst_free(&rt->dst);
1940         return err;
1941 }
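
/*
 * Minimal usage sketch (illustrative only, not an existing kernel helper):
 * how a caller holding RTNL could fill a fib6_config to add the
 * equivalent of "ip -6 route add <prefix>/64 via <gw>" through
 * ip6_route_add().  Real callers build the config in
 * rtm_to_fib6_config() and rtmsg_to_fib6_config() below.
 */
static int __maybe_unused ip6_route_add_sketch(struct net *net, int ifindex,
					       const struct in6_addr *prefix,
					       const struct in6_addr *gw)
{
	struct fib6_config cfg = {
		.fc_table	= RT6_TABLE_MAIN,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= ifindex,
		.fc_dst_len	= 64,
		.fc_flags	= RTF_UP | RTF_GATEWAY,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gw;

	return ip6_route_add(&cfg);
}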
1942
1943 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1944 {
1945         int err;
1946         struct fib6_table *table;
1947         struct net *net = dev_net(rt->dst.dev);
1948
1949         if (rt == net->ipv6.ip6_null_entry) {
1950                 err = -ENOENT;
1951                 goto out;
1952         }
1953
1954         table = rt->rt6i_table;
1955         write_lock_bh(&table->tb6_lock);
1956         err = fib6_del(rt, info);
1957         write_unlock_bh(&table->tb6_lock);
1958
1959 out:
1960         ip6_rt_put(rt);
1961         return err;
1962 }
1963
1964 int ip6_del_rt(struct rt6_info *rt)
1965 {
1966         struct nl_info info = {
1967                 .nl_net = dev_net(rt->dst.dev),
1968         };
1969         return __ip6_del_rt(rt, &info);
1970 }
1971
1972 static int ip6_route_del(struct fib6_config *cfg)
1973 {
1974         struct fib6_table *table;
1975         struct fib6_node *fn;
1976         struct rt6_info *rt;
1977         int err = -ESRCH;
1978
1979         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1980         if (!table)
1981                 return err;
1982
1983         read_lock_bh(&table->tb6_lock);
1984
1985         fn = fib6_locate(&table->tb6_root,
1986                          &cfg->fc_dst, cfg->fc_dst_len,
1987                          &cfg->fc_src, cfg->fc_src_len);
1988
1989         if (fn) {
1990                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1991                         if ((rt->rt6i_flags & RTF_CACHE) &&
1992                             !(cfg->fc_flags & RTF_CACHE))
1993                                 continue;
1994                         if (cfg->fc_ifindex &&
1995                             (!rt->dst.dev ||
1996                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1997                                 continue;
1998                         if (cfg->fc_flags & RTF_GATEWAY &&
1999                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2000                                 continue;
2001                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2002                                 continue;
2003                         dst_hold(&rt->dst);
2004                         read_unlock_bh(&table->tb6_lock);
2005
2006                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2007                 }
2008         }
2009         read_unlock_bh(&table->tb6_lock);
2010
2011         return err;
2012 }
2013
2014 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2015 {
2016         struct net *net = dev_net(skb->dev);
2017         struct netevent_redirect netevent;
2018         struct rt6_info *rt, *nrt = NULL;
2019         struct ndisc_options ndopts;
2020         struct inet6_dev *in6_dev;
2021         struct neighbour *neigh;
2022         struct rd_msg *msg;
2023         int optlen, on_link;
2024         u8 *lladdr;
2025
2026         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2027         optlen -= sizeof(*msg);
2028
2029         if (optlen < 0) {
2030                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2031                 return;
2032         }
2033
2034         msg = (struct rd_msg *)icmp6_hdr(skb);
2035
2036         if (ipv6_addr_is_multicast(&msg->dest)) {
2037                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2038                 return;
2039         }
2040
2041         on_link = 0;
2042         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2043                 on_link = 1;
2044         } else if (ipv6_addr_type(&msg->target) !=
2045                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2046                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2047                 return;
2048         }
2049
2050         in6_dev = __in6_dev_get(skb->dev);
2051         if (!in6_dev)
2052                 return;
2053         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2054                 return;
2055
2056         /* RFC2461 8.1:
2057          *      The IP source address of the Redirect MUST be the same as the current
2058          *      first-hop router for the specified ICMP Destination Address.
2059          */
2060
2061         if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
2062                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2063                 return;
2064         }
2065
2066         lladdr = NULL;
2067         if (ndopts.nd_opts_tgt_lladdr) {
2068                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2069                                              skb->dev);
2070                 if (!lladdr) {
2071                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2072                         return;
2073                 }
2074         }
2075
2076         rt = (struct rt6_info *) dst;
2077         if (rt == net->ipv6.ip6_null_entry) {
2078                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2079                 return;
2080         }
2081
2082         /* Redirect received -> path was valid.
2083          * Redirects are sent only in response to data packets,
2084          * so this nexthop is apparently reachable. --ANK
2085          */
2086         dst_confirm(&rt->dst);
2087
2088         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2089         if (!neigh)
2090                 return;
2091
2092         /*
2093          *      We have finally decided to accept it.
2094          */
2095
2096         neigh_update(neigh, lladdr, NUD_STALE,
2097                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2098                      NEIGH_UPDATE_F_OVERRIDE|
2099                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2100                                      NEIGH_UPDATE_F_ISROUTER))
2101                      );
2102
2103         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2104         if (!nrt)
2105                 goto out;
2106
2107         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2108         if (on_link)
2109                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2110
2111         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2112
2113         if (ip6_ins_rt(nrt))
2114                 goto out;
2115
2116         netevent.old = &rt->dst;
2117         netevent.new = &nrt->dst;
2118         netevent.daddr = &msg->dest;
2119         netevent.neigh = neigh;
2120         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2121
2122         if (rt->rt6i_flags & RTF_CACHE) {
2123                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2124                 ip6_del_rt(rt);
2125         }
2126
2127 out:
2128         neigh_release(neigh);
2129 }
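
/*
 * Redirect example: when the current first-hop router redirects traffic
 * for destination D to target T, the code above clones the used route into
 * an RTF_CACHE | RTF_GATEWAY | RTF_DYNAMIC host route for D whose gateway
 * is T; when D == T (an on-link redirect) the RTF_GATEWAY flag is dropped
 * so D is reached directly.  The stale cached route, if any, is removed
 * after the netevent notifiers have run.
 */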
2130
2131 /*
2132  *      Misc support functions
2133  */
2134
2135 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2136 {
2137         BUG_ON(from->dst.from);
2138
2139         rt->rt6i_flags &= ~RTF_EXPIRES;
2140         dst_hold(&from->dst);
2141         rt->dst.from = &from->dst;
2142         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2143 }
2144
2145 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2146 {
2147         rt->dst.input = ort->dst.input;
2148         rt->dst.output = ort->dst.output;
2149         rt->rt6i_dst = ort->rt6i_dst;
2150         rt->dst.error = ort->dst.error;
2151         rt->rt6i_idev = ort->rt6i_idev;
2152         if (rt->rt6i_idev)
2153                 in6_dev_hold(rt->rt6i_idev);
2154         rt->dst.lastuse = jiffies;
2155         rt->rt6i_gateway = ort->rt6i_gateway;
2156         rt->rt6i_flags = ort->rt6i_flags;
2157         rt6_set_from(rt, ort);
2158         rt->rt6i_metric = ort->rt6i_metric;
2159 #ifdef CONFIG_IPV6_SUBTREES
2160         rt->rt6i_src = ort->rt6i_src;
2161 #endif
2162         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2163         rt->rt6i_table = ort->rt6i_table;
2164         if (ort->rt6i_lwtstate) {
2165                 lwtunnel_state_get(ort->rt6i_lwtstate);
2166                 rt->rt6i_lwtstate = ort->rt6i_lwtstate;
2167         }
2168 }
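
/*
 * Note: the copy also shares the source route's lightweight-tunnel state;
 * lwtunnel_state_get() takes an extra reference so both rt6_info structs
 * can drop it independently.
 */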
2169
2170 #ifdef CONFIG_IPV6_ROUTE_INFO
2171 static struct rt6_info *rt6_get_route_info(struct net *net,
2172                                            const struct in6_addr *prefix, int prefixlen,
2173                                            const struct in6_addr *gwaddr, int ifindex)
2174 {
2175         struct fib6_node *fn;
2176         struct rt6_info *rt = NULL;
2177         struct fib6_table *table;
2178
2179         table = fib6_get_table(net, RT6_TABLE_INFO);
2180         if (!table)
2181                 return NULL;
2182
2183         read_lock_bh(&table->tb6_lock);
2184         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2185         if (!fn)
2186                 goto out;
2187
2188         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2189                 if (rt->dst.dev->ifindex != ifindex)
2190                         continue;
2191                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2192                         continue;
2193                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2194                         continue;
2195                 dst_hold(&rt->dst);
2196                 break;
2197         }
2198 out:
2199         read_unlock_bh(&table->tb6_lock);
2200         return rt;
2201 }
2202
2203 static struct rt6_info *rt6_add_route_info(struct net *net,
2204                                            const struct in6_addr *prefix, int prefixlen,
2205                                            const struct in6_addr *gwaddr, int ifindex,
2206                                            unsigned int pref)
2207 {
2208         struct fib6_config cfg = {
2209                 .fc_table       = RT6_TABLE_INFO,
2210                 .fc_metric      = IP6_RT_PRIO_USER,
2211                 .fc_ifindex     = ifindex,
2212                 .fc_dst_len     = prefixlen,
2213                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2214                                   RTF_UP | RTF_PREF(pref),
2215                 .fc_nlinfo.portid = 0,
2216                 .fc_nlinfo.nlh = NULL,
2217                 .fc_nlinfo.nl_net = net,
2218         };
2219
2220         cfg.fc_dst = *prefix;
2221         cfg.fc_gateway = *gwaddr;
2222
2223         /* We should treat it as a default route if prefix length is 0. */
2224         if (!prefixlen)
2225                 cfg.fc_flags |= RTF_DEFAULT;
2226
2227         ip6_route_add(&cfg);
2228
2229         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2230 }
2231 #endif
2232
2233 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2234 {
2235         struct rt6_info *rt;
2236         struct fib6_table *table;
2237
2238         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2239         if (!table)
2240                 return NULL;
2241
2242         read_lock_bh(&table->tb6_lock);
2243         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2244                 if (dev == rt->dst.dev &&
2245                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2246                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2247                         break;
2248         }
2249         if (rt)
2250                 dst_hold(&rt->dst);
2251         read_unlock_bh(&table->tb6_lock);
2252         return rt;
2253 }
2254
2255 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2256                                      struct net_device *dev,
2257                                      unsigned int pref)
2258 {
2259         struct fib6_config cfg = {
2260                 .fc_table       = RT6_TABLE_DFLT,
2261                 .fc_metric      = IP6_RT_PRIO_USER,
2262                 .fc_ifindex     = dev->ifindex,
2263                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2264                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2265                 .fc_nlinfo.portid = 0,
2266                 .fc_nlinfo.nlh = NULL,
2267                 .fc_nlinfo.nl_net = dev_net(dev),
2268         };
2269
2270         cfg.fc_gateway = *gwaddr;
2271
2272         ip6_route_add(&cfg);
2273
2274         return rt6_get_dflt_router(gwaddr, dev);
2275 }
2276
2277 void rt6_purge_dflt_routers(struct net *net)
2278 {
2279         struct rt6_info *rt;
2280         struct fib6_table *table;
2281
2282         /* NOTE: Keep consistent with rt6_get_dflt_router */
2283         table = fib6_get_table(net, RT6_TABLE_DFLT);
2284         if (!table)
2285                 return;
2286
2287 restart:
2288         read_lock_bh(&table->tb6_lock);
2289         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2290                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2291                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2292                         dst_hold(&rt->dst);
2293                         read_unlock_bh(&table->tb6_lock);
2294                         ip6_del_rt(rt);
2295                         goto restart;
2296                 }
2297         }
2298         read_unlock_bh(&table->tb6_lock);
2299 }
2300
2301 static void rtmsg_to_fib6_config(struct net *net,
2302                                  struct in6_rtmsg *rtmsg,
2303                                  struct fib6_config *cfg)
2304 {
2305         memset(cfg, 0, sizeof(*cfg));
2306
2307         cfg->fc_table = RT6_TABLE_MAIN;
2308         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2309         cfg->fc_metric = rtmsg->rtmsg_metric;
2310         cfg->fc_expires = rtmsg->rtmsg_info;
2311         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2312         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2313         cfg->fc_flags = rtmsg->rtmsg_flags;
2314
2315         cfg->fc_nlinfo.nl_net = net;
2316
2317         cfg->fc_dst = rtmsg->rtmsg_dst;
2318         cfg->fc_src = rtmsg->rtmsg_src;
2319         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2320 }
2321
2322 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2323 {
2324         struct fib6_config cfg;
2325         struct in6_rtmsg rtmsg;
2326         int err;
2327
2328         switch (cmd) {
2329         case SIOCADDRT:         /* Add a route */
2330         case SIOCDELRT:         /* Delete a route */
2331                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2332                         return -EPERM;
2333                 err = copy_from_user(&rtmsg, arg,
2334                                      sizeof(struct in6_rtmsg));
2335                 if (err)
2336                         return -EFAULT;
2337
2338                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2339
2340                 rtnl_lock();
2341                 switch (cmd) {
2342                 case SIOCADDRT:
2343                         err = ip6_route_add(&cfg);
2344                         break;
2345                 case SIOCDELRT:
2346                         err = ip6_route_del(&cfg);
2347                         break;
2348                 default:
2349                         err = -EINVAL;
2350                 }
2351                 rtnl_unlock();
2352
2353                 return err;
2354         }
2355
2356         return -EINVAL;
2357 }
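
/*
 * Userspace sketch of the legacy ioctl path handled above (illustrative,
 * not kernel code): fill a struct in6_rtmsg and issue SIOCADDRT/SIOCDELRT
 * on any IPv6 socket, e.g.
 *
 *	struct in6_rtmsg rtmsg = { 0 };
 *	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *
 *	rtmsg.rtmsg_dst = prefix;
 *	rtmsg.rtmsg_dst_len = 64;
 *	rtmsg.rtmsg_gateway = gw;
 *	rtmsg.rtmsg_ifindex = ifindex;
 *	rtmsg.rtmsg_metric = 1;
 *	rtmsg.rtmsg_flags = RTF_UP | RTF_GATEWAY;
 *	ioctl(fd, SIOCADDRT, &rtmsg);
 *
 * Most tools use the richer rtnetlink interface (RTM_NEWROUTE) instead.
 */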
2358
2359 /*
2360  *      Drop the packet on the floor
2361  */
2362
2363 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2364 {
2365         int type;
2366         struct dst_entry *dst = skb_dst(skb);
2367         switch (ipstats_mib_noroutes) {
2368         case IPSTATS_MIB_INNOROUTES:
2369                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2370                 if (type == IPV6_ADDR_ANY) {
2371                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2372                                       IPSTATS_MIB_INADDRERRORS);
2373                         break;
2374                 }
2375                 /* FALLTHROUGH */
2376         case IPSTATS_MIB_OUTNOROUTES:
2377                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2378                               ipstats_mib_noroutes);
2379                 break;
2380         }
2381         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2382         kfree_skb(skb);
2383         return 0;
2384 }
2385
2386 static int ip6_pkt_discard(struct sk_buff *skb)
2387 {
2388         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2389 }
2390
2391 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
2392 {
2393         skb->dev = skb_dst(skb)->dev;
2394         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2395 }
2396
2397 static int ip6_pkt_prohibit(struct sk_buff *skb)
2398 {
2399         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2400 }
2401
2402 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
2403 {
2404         skb->dev = skb_dst(skb)->dev;
2405         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2406 }
2407
2408 /*
2409  *      Allocate a dst for local (unicast / anycast) address.
2410  */
2411
2412 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2413                                     const struct in6_addr *addr,
2414                                     bool anycast)
2415 {
2416         struct net *net = dev_net(idev->dev);
2417         struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2418                                             DST_NOCOUNT, NULL);
2419         if (!rt)
2420                 return ERR_PTR(-ENOMEM);
2421
2422         in6_dev_hold(idev);
2423
2424         rt->dst.flags |= DST_HOST;
2425         rt->dst.input = ip6_input;
2426         rt->dst.output = ip6_output;
2427         rt->rt6i_idev = idev;
2428
2429         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2430         if (anycast)
2431                 rt->rt6i_flags |= RTF_ANYCAST;
2432         else
2433                 rt->rt6i_flags |= RTF_LOCAL;
2434
2435         rt->rt6i_gateway  = *addr;
2436         rt->rt6i_dst.addr = *addr;
2437         rt->rt6i_dst.plen = 128;
2438         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2439
2440         atomic_set(&rt->dst.__refcnt, 1);
2441
2442         return rt;
2443 }
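
/*
 * Example: when addrconf assigns 2001:db8::1/64 to an interface, this
 * helper builds the corresponding /128 RTF_LOCAL (or RTF_ANYCAST) route.
 * The route's device is the namespace's loopback device and it goes into
 * the local table, which is why locally-terminated traffic is steered
 * through ip6_input.
 */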
2444
2445 int ip6_route_get_saddr(struct net *net,
2446                         struct rt6_info *rt,
2447                         const struct in6_addr *daddr,
2448                         unsigned int prefs,
2449                         struct in6_addr *saddr)
2450 {
2451         struct inet6_dev *idev =
2452                 rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2453         int err = 0;
2454         if (rt && rt->rt6i_prefsrc.plen)
2455                 *saddr = rt->rt6i_prefsrc.addr;
2456         else
2457                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2458                                          daddr, prefs, saddr);
2459         return err;
2460 }
2461
2462 /* remove deleted ip from prefsrc entries */
2463 struct arg_dev_net_ip {
2464         struct net_device *dev;
2465         struct net *net;
2466         struct in6_addr *addr;
2467 };
2468
2469 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2470 {
2471         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2472         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2473         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2474
2475         if (((void *)rt->dst.dev == dev || !dev) &&
2476             rt != net->ipv6.ip6_null_entry &&
2477             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2478                 /* remove prefsrc entry */
2479                 rt->rt6i_prefsrc.plen = 0;
2480         }
2481         return 0;
2482 }
2483
2484 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2485 {
2486         struct net *net = dev_net(ifp->idev->dev);
2487         struct arg_dev_net_ip adni = {
2488                 .dev = ifp->idev->dev,
2489                 .net = net,
2490                 .addr = &ifp->addr,
2491         };
2492         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2493 }
2494
2495 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2496 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2497
2498 /* Remove routers and update dst entries when a gateway turns into a host. */
2499 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2500 {
2501         struct in6_addr *gateway = (struct in6_addr *)arg;
2502
2503         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2504              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2505              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2506                 return -1;
2507         }
2508         return 0;
2509 }
2510
2511 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2512 {
2513         fib6_clean_all(net, fib6_clean_tohost, gateway);
2514 }
2515
2516 struct arg_dev_net {
2517         struct net_device *dev;
2518         struct net *net;
2519 };
2520
2521 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2522 {
2523         const struct arg_dev_net *adn = arg;
2524         const struct net_device *dev = adn->dev;
2525
2526         if ((rt->dst.dev == dev || !dev) &&
2527             rt != adn->net->ipv6.ip6_null_entry)
2528                 return -1;
2529
2530         return 0;
2531 }
2532
2533 void rt6_ifdown(struct net *net, struct net_device *dev)
2534 {
2535         struct arg_dev_net adn = {
2536                 .dev = dev,
2537                 .net = net,
2538         };
2539
2540         fib6_clean_all(net, fib6_ifdown, &adn);
2541         icmp6_clean_all(fib6_ifdown, &adn);
2542         rt6_uncached_list_flush_dev(net, dev);
2543 }
2544
2545 struct rt6_mtu_change_arg {
2546         struct net_device *dev;
2547         unsigned int mtu;
2548 };
2549
2550 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2551 {
2552         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2553         struct inet6_dev *idev;
2554
2555         /* In IPv6, PMTU discovery is not optional,
2556            so the RTAX_MTU lock cannot disable it.
2557            We still use this lock to block changes
2558            caused by addrconf/ndisc.
2559         */
2560
2561         idev = __in6_dev_get(arg->dev);
2562         if (!idev)
2563                 return 0;
2564
2565         /* After an administrative MTU increase, there is no way to
2566            discover an IPv6 PMTU increase, so the PMTU must be updated here.
2567            Since RFC 1981 does not cover administrative MTU increases,
2568            updating the PMTU on increase is a MUST (e.g. for jumbo frames).
2569          */
2570         /*
2571            If the new MTU is less than the route PMTU, the new MTU will be
2572            the lowest MTU in the path, so update the route PMTU to reflect
2573            the decrease; if the new MTU is greater than the route PMTU, and
2574            the old MTU was the lowest MTU in the path, update the route PMTU
2575            to reflect the increase. In that case, if another node still has
2576            the lowest MTU, a Packet Too Big message will trigger PMTU
2577            discovery again.
2578          */
2579         if (rt->dst.dev == arg->dev &&
2580             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2581                 if (rt->rt6i_flags & RTF_CACHE) {
2582                         /* For RTF_CACHE with rt6i_pmtu == 0
2583                          * (i.e. a redirected route),
2584                          * the metrics of its rt->dst.from has already
2585                          * been updated.
2586                          */
2587                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2588                                 rt->rt6i_pmtu = arg->mtu;
2589                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2590                            (dst_mtu(&rt->dst) < arg->mtu &&
2591                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2592                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2593                 }
2594         }
2595         return 0;
2596 }
2597
2598 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2599 {
2600         struct rt6_mtu_change_arg arg = {
2601                 .dev = dev,
2602                 .mtu = mtu,
2603         };
2604
2605         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2606 }
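
/*
 * Example: lowering a device's MTU from 1500 to 1400 walks the FIB and,
 * for routes out of that device whose MTU metric is not locked, clamps
 * cached PMTU values above 1400 and route MTU metrics of 1400 or more
 * down to 1400.  An increase is only propagated to routes whose MTU
 * matched the device MTU being replaced, so a smaller limit genuinely
 * learned from the path is preserved.
 */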
2607
2608 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2609         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2610         [RTA_OIF]               = { .type = NLA_U32 },
2611         [RTA_IIF]               = { .type = NLA_U32 },
2612         [RTA_PRIORITY]          = { .type = NLA_U32 },
2613         [RTA_METRICS]           = { .type = NLA_NESTED },
2614         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2615         [RTA_PREF]              = { .type = NLA_U8 },
2616         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2617         [RTA_ENCAP]             = { .type = NLA_NESTED },
2618 };
2619
2620 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2621                               struct fib6_config *cfg)
2622 {
2623         struct rtmsg *rtm;
2624         struct nlattr *tb[RTA_MAX+1];
2625         unsigned int pref;
2626         int err;
2627
2628         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2629         if (err < 0)
2630                 goto errout;
2631
2632         err = -EINVAL;
2633         rtm = nlmsg_data(nlh);
2634         memset(cfg, 0, sizeof(*cfg));
2635
2636         cfg->fc_table = rtm->rtm_table;
2637         cfg->fc_dst_len = rtm->rtm_dst_len;
2638         cfg->fc_src_len = rtm->rtm_src_len;
2639         cfg->fc_flags = RTF_UP;
2640         cfg->fc_protocol = rtm->rtm_protocol;
2641         cfg->fc_type = rtm->rtm_type;
2642
2643         if (rtm->rtm_type == RTN_UNREACHABLE ||
2644             rtm->rtm_type == RTN_BLACKHOLE ||
2645             rtm->rtm_type == RTN_PROHIBIT ||
2646             rtm->rtm_type == RTN_THROW)
2647                 cfg->fc_flags |= RTF_REJECT;
2648
2649         if (rtm->rtm_type == RTN_LOCAL)
2650                 cfg->fc_flags |= RTF_LOCAL;
2651
2652         if (rtm->rtm_flags & RTM_F_CLONED)
2653                 cfg->fc_flags |= RTF_CACHE;
2654
2655         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2656         cfg->fc_nlinfo.nlh = nlh;
2657         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2658
2659         if (tb[RTA_GATEWAY]) {
2660                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2661                 cfg->fc_flags |= RTF_GATEWAY;
2662         }
2663
2664         if (tb[RTA_DST]) {
2665                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2666
2667                 if (nla_len(tb[RTA_DST]) < plen)
2668                         goto errout;
2669
2670                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2671         }
2672
2673         if (tb[RTA_SRC]) {
2674                 int plen = (rtm->rtm_src_len + 7) >> 3;
2675
2676                 if (nla_len(tb[RTA_SRC]) < plen)
2677                         goto errout;
2678
2679                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2680         }
2681
2682         if (tb[RTA_PREFSRC])
2683                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2684
2685         if (tb[RTA_OIF])
2686                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2687
2688         if (tb[RTA_PRIORITY])
2689                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2690
2691         if (tb[RTA_METRICS]) {
2692                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2693                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2694         }
2695
2696         if (tb[RTA_TABLE])
2697                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2698
2699         if (tb[RTA_MULTIPATH]) {
2700                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2701                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2702         }
2703
2704         if (tb[RTA_PREF]) {
2705                 pref = nla_get_u8(tb[RTA_PREF]);
2706                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2707                     pref != ICMPV6_ROUTER_PREF_HIGH)
2708                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2709                 cfg->fc_flags |= RTF_PREF(pref);
2710         }
2711
2712         if (tb[RTA_ENCAP])
2713                 cfg->fc_encap = tb[RTA_ENCAP];
2714
2715         if (tb[RTA_ENCAP_TYPE])
2716                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
2717
2718         err = 0;
2719 errout:
2720         return err;
2721 }
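
/*
 * Mapping example (values illustrative): an RTM_NEWROUTE request for
 * "2001:db8::/64 via fe80::1 dev 2 metric 1024" arrives with
 * rtm_dst_len = 64 plus RTA_DST, RTA_GATEWAY, RTA_OIF and RTA_PRIORITY
 * attributes, and ends up as fc_dst/fc_dst_len, fc_gateway (which also
 * sets RTF_GATEWAY), fc_ifindex and fc_metric in the fib6_config that
 * ip6_route_add() consumes.
 */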
2722
2723 static int ip6_route_multipath(struct fib6_config *cfg, int add)
2724 {
2725         struct fib6_config r_cfg;
2726         struct rtnexthop *rtnh;
2727         int remaining;
2728         int attrlen;
2729         int err = 0, last_err = 0;
2730
2731         remaining = cfg->fc_mp_len;
2732 beginning:
2733         rtnh = (struct rtnexthop *)cfg->fc_mp;
2734
2735         /* Parse a Multipath Entry */
2736         while (rtnh_ok(rtnh, remaining)) {
2737                 memcpy(&r_cfg, cfg, sizeof(*cfg));
2738                 if (rtnh->rtnh_ifindex)
2739                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2740
2741                 attrlen = rtnh_attrlen(rtnh);
2742                 if (attrlen > 0) {
2743                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2744
2745                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2746                         if (nla) {
2747                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2748                                 r_cfg.fc_flags |= RTF_GATEWAY;
2749                         }
2750                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
2751                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
2752                         if (nla)
2753                                 r_cfg.fc_encap_type = nla_get_u16(nla);
2754                 }
2755                 err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2756                 if (err) {
2757                         last_err = err;
2758                         /* If we are trying to remove a route, do not stop the
2759                          * loop when ip6_route_del() fails (because the next hop
2760                          * is already gone); we should try to remove all next hops.
2761                          */
2762                         if (add) {
2763                                 /* If add fails, we should try to delete all
2764                                  * next hops that have been already added.
2765                                  */
2766                                 add = 0;
2767                                 remaining = cfg->fc_mp_len - remaining;
2768                                 goto beginning;
2769                         }
2770                 }
2771                 /* Because each route is added like a single route, we remove
2772                  * these flags after the first nexthop: if there is a collision,
2773                  * we have already failed to add the first nexthop
2774                  * (fib6_add_rt2node() has rejected it); when replacing, the old
2775                  * nexthops have been replaced by the first new one, and the rest
2776                  * should be added to it.
2777                  */
2778                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2779                                                      NLM_F_REPLACE);
2780                 rtnh = rtnh_next(rtnh, &remaining);
2781         }
2782
2783         return last_err;
2784 }
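
/*
 * Multipath example: an RTA_MULTIPATH attribute carrying two rtnexthop
 * entries (each with its own ifindex and an RTA_GATEWAY) is expanded into
 * two ip6_route_add()/ip6_route_del() calls sharing the rest of the
 * config.  NLM_F_EXCL and NLM_F_REPLACE are cleared after the first
 * nexthop so the remaining ones attach to the route just created, and a
 * failed add rolls back the nexthops that were already inserted.
 */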
2785
2786 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2787 {
2788         struct fib6_config cfg;
2789         int err;
2790
2791         err = rtm_to_fib6_config(skb, nlh, &cfg);
2792         if (err < 0)
2793                 return err;
2794
2795         if (cfg.fc_mp)
2796                 return ip6_route_multipath(&cfg, 0);
2797         else
2798                 return ip6_route_del(&cfg);
2799 }
2800
2801 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2802 {
2803         struct fib6_config cfg;
2804         int err;
2805
2806         err = rtm_to_fib6_config(skb, nlh, &cfg);
2807         if (err < 0)
2808                 return err;
2809
2810         if (cfg.fc_mp)
2811                 return ip6_route_multipath(&cfg, 1);
2812         else
2813                 return ip6_route_add(&cfg);
2814 }
2815
2816 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
2817 {
2818         return NLMSG_ALIGN(sizeof(struct rtmsg))
2819                + nla_total_size(16) /* RTA_SRC */
2820                + nla_total_size(16) /* RTA_DST */
2821                + nla_total_size(16) /* RTA_GATEWAY */
2822                + nla_total_size(16) /* RTA_PREFSRC */
2823                + nla_total_size(4) /* RTA_TABLE */
2824                + nla_total_size(4) /* RTA_IIF */
2825                + nla_total_size(4) /* RTA_OIF */
2826                + nla_total_size(4) /* RTA_PRIORITY */
2827                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2828                + nla_total_size(sizeof(struct rta_cacheinfo))
2829                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
2830                + nla_total_size(1) /* RTA_PREF */
2831                + lwtunnel_get_encap_size(rt->rt6i_lwtstate);
2832 }
2833
2834 static int rt6_fill_node(struct net *net,
2835                          struct sk_buff *skb, struct rt6_info *rt,
2836                          struct in6_addr *dst, struct in6_addr *src,
2837                          int iif, int type, u32 portid, u32 seq,
2838                          int prefix, int nowait, unsigned int flags)
2839 {
2840         u32 metrics[RTAX_MAX];
2841         struct rtmsg *rtm;
2842         struct nlmsghdr *nlh;
2843         long expires;
2844         u32 table;
2845
2846         if (prefix) {   /* user wants prefix routes only */
2847                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2848                         /* success since this is not a prefix route */
2849                         return 1;
2850                 }
2851         }
2852
2853         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
2854         if (!nlh)
2855                 return -EMSGSIZE;
2856
2857         rtm = nlmsg_data(nlh);
2858         rtm->rtm_family = AF_INET6;
2859         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2860         rtm->rtm_src_len = rt->rt6i_src.plen;
2861         rtm->rtm_tos = 0;
2862         if (rt->rt6i_table)
2863                 table = rt->rt6i_table->tb6_id;
2864         else
2865                 table = RT6_TABLE_UNSPEC;
2866         rtm->rtm_table = table;
2867         if (nla_put_u32(skb, RTA_TABLE, table))
2868                 goto nla_put_failure;
2869         if (rt->rt6i_flags & RTF_REJECT) {
2870                 switch (rt->dst.error) {
2871                 case -EINVAL:
2872                         rtm->rtm_type = RTN_BLACKHOLE;
2873                         break;
2874                 case -EACCES:
2875                         rtm->rtm_type = RTN_PROHIBIT;
2876                         break;
2877                 case -EAGAIN:
2878                         rtm->rtm_type = RTN_THROW;
2879                         break;
2880                 default:
2881                         rtm->rtm_type = RTN_UNREACHABLE;
2882                         break;
2883                 }
2884         }
2885         else if (rt->rt6i_flags & RTF_LOCAL)
2886                 rtm->rtm_type = RTN_LOCAL;
2887         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2888                 rtm->rtm_type = RTN_LOCAL;
2889         else
2890                 rtm->rtm_type = RTN_UNICAST;
2891         rtm->rtm_flags = 0;
2892         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2893         rtm->rtm_protocol = rt->rt6i_protocol;
2894         if (rt->rt6i_flags & RTF_DYNAMIC)
2895                 rtm->rtm_protocol = RTPROT_REDIRECT;
2896         else if (rt->rt6i_flags & RTF_ADDRCONF) {
2897                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2898                         rtm->rtm_protocol = RTPROT_RA;
2899                 else
2900                         rtm->rtm_protocol = RTPROT_KERNEL;
2901         }
2902
2903         if (rt->rt6i_flags & RTF_CACHE)
2904                 rtm->rtm_flags |= RTM_F_CLONED;
2905
2906         if (dst) {
2907                 if (nla_put_in6_addr(skb, RTA_DST, dst))
2908                         goto nla_put_failure;
2909                 rtm->rtm_dst_len = 128;
2910         } else if (rtm->rtm_dst_len)
2911                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
2912                         goto nla_put_failure;
2913 #ifdef CONFIG_IPV6_SUBTREES
2914         if (src) {
2915                 if (nla_put_in6_addr(skb, RTA_SRC, src))
2916                         goto nla_put_failure;
2917                 rtm->rtm_src_len = 128;
2918         } else if (rtm->rtm_src_len &&
2919                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
2920                 goto nla_put_failure;
2921 #endif
2922         if (iif) {
2923 #ifdef CONFIG_IPV6_MROUTE
2924                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2925                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2926                         if (err <= 0) {
2927                                 if (!nowait) {
2928                                         if (err == 0)
2929                                                 return 0;
2930                                         goto nla_put_failure;
2931                                 } else {
2932                                         if (err == -EMSGSIZE)
2933                                                 goto nla_put_failure;
2934                                 }
2935                         }
2936                 } else
2937 #endif
2938                         if (nla_put_u32(skb, RTA_IIF, iif))
2939                                 goto nla_put_failure;
2940         } else if (dst) {
2941                 struct in6_addr saddr_buf;
2942                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2943                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2944                         goto nla_put_failure;
2945         }
2946
2947         if (rt->rt6i_prefsrc.plen) {
2948                 struct in6_addr saddr_buf;
2949                 saddr_buf = rt->rt6i_prefsrc.addr;
2950                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2951                         goto nla_put_failure;
2952         }
2953
2954         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2955         if (rt->rt6i_pmtu)
2956                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
2957         if (rtnetlink_put_metrics(skb, metrics) < 0)
2958                 goto nla_put_failure;
2959
2960         if (rt->rt6i_flags & RTF_GATEWAY) {
2961                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
2962                         goto nla_put_failure;
2963         }
2964
2965         if (rt->dst.dev &&
2966             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2967                 goto nla_put_failure;
2968         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2969                 goto nla_put_failure;
2970
2971         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2972
2973         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2974                 goto nla_put_failure;
2975
2976         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
2977                 goto nla_put_failure;
2978
2979         lwtunnel_fill_encap(skb, rt->rt6i_lwtstate);
2980
2981         nlmsg_end(skb, nlh);
2982         return 0;
2983
2984 nla_put_failure:
2985         nlmsg_cancel(skb, nlh);
2986         return -EMSGSIZE;
2987 }
2988
2989 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2990 {
2991         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2992         int prefix;
2993
2994         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2995                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2996                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2997         } else
2998                 prefix = 0;
2999
3000         return rt6_fill_node(arg->net,
3001                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3002                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3003                      prefix, 0, NLM_F_MULTI);
3004 }
3005
3006 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3007 {
3008         struct net *net = sock_net(in_skb->sk);
3009         struct nlattr *tb[RTA_MAX+1];
3010         struct rt6_info *rt;
3011         struct sk_buff *skb;
3012         struct rtmsg *rtm;
3013         struct flowi6 fl6;
3014         int err, iif = 0, oif = 0;
3015
3016         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3017         if (err < 0)
3018                 goto errout;
3019
3020         err = -EINVAL;
3021         memset(&fl6, 0, sizeof(fl6));
3022
3023         if (tb[RTA_SRC]) {
3024                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3025                         goto errout;
3026
3027                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3028         }
3029
3030         if (tb[RTA_DST]) {
3031                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3032                         goto errout;
3033
3034                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3035         }
3036
3037         if (tb[RTA_IIF])
3038                 iif = nla_get_u32(tb[RTA_IIF]);
3039
3040         if (tb[RTA_OIF])
3041                 oif = nla_get_u32(tb[RTA_OIF]);
3042
3043         if (tb[RTA_MARK])
3044                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3045
3046         if (iif) {
3047                 struct net_device *dev;
3048                 int flags = 0;
3049
3050                 dev = __dev_get_by_index(net, iif);
3051                 if (!dev) {
3052                         err = -ENODEV;
3053                         goto errout;
3054                 }
3055
3056                 fl6.flowi6_iif = iif;
3057
3058                 if (!ipv6_addr_any(&fl6.saddr))
3059                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3060
3061                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3062                                                                flags);
3063         } else {
3064                 fl6.flowi6_oif = oif;
3065
3066                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3067         }
3068
3069         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3070         if (!skb) {
3071                 ip6_rt_put(rt);
3072                 err = -ENOBUFS;
3073                 goto errout;
3074         }
3075
3076         /* Reserve room for dummy headers; this skb can pass
3077          * through a good chunk of the routing engine.
3078          */
3079         skb_reset_mac_header(skb);
3080         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3081
3082         skb_dst_set(skb, &rt->dst);
3083
3084         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3085                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3086                             nlh->nlmsg_seq, 0, 0, 0);
3087         if (err < 0) {
3088                 kfree_skb(skb);
3089                 goto errout;
3090         }
3091
3092         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3093 errout:
3094         return err;
3095 }
3096
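/* Notify RTNLGRP_IPV6_ROUTE listeners about a route change.  The skb is
 * sized with rt6_nlmsg_size(), so a -EMSGSIZE from rt6_fill_node() indicates
 * a sizing bug rather than a transient failure.
 */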
3097 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
3098 {
3099         struct sk_buff *skb;
3100         struct net *net = info->nl_net;
3101         u32 seq;
3102         int err;
3103
3104         err = -ENOBUFS;
3105         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3106
3107         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3108         if (!skb)
3109                 goto errout;
3110
3111         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3112                                 event, info->portid, seq, 0, 0, 0);
3113         if (err < 0) {
3114                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3115                 WARN_ON(err == -EMSGSIZE);
3116                 kfree_skb(skb);
3117                 goto errout;
3118         }
3119         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3120                     info->nlh, gfp_any());
3121         return;
3122 errout:
3123         if (err < 0)
3124                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3125 }
3126
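/* When a namespace's loopback device is registered, point the special null
 * (and, with CONFIG_IPV6_MULTIPLE_TABLES, prohibit/blackhole) route entries
 * at it so they carry a valid device and inet6_dev reference.
 */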
3127 static int ip6_route_dev_notify(struct notifier_block *this,
3128                                 unsigned long event, void *ptr)
3129 {
3130         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3131         struct net *net = dev_net(dev);
3132
3133         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3134                 net->ipv6.ip6_null_entry->dst.dev = dev;
3135                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3136 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3137                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3138                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3139                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3140                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3141 #endif
3142         }
3143
3144         return NOTIFY_OK;
3145 }
3146
3147 /*
3148  *      /proc
3149  */
3150
3151 #ifdef CONFIG_PROC_FS
3152
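/* /proc/net/ipv6_route: seq_file dump of the routing table, opened via
 * ipv6_route_open() (defined with the seq_file machinery elsewhere).
 */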
3153 static const struct file_operations ipv6_route_proc_fops = {
3154         .owner          = THIS_MODULE,
3155         .open           = ipv6_route_open,
3156         .read           = seq_read,
3157         .llseek         = seq_lseek,
3158         .release        = seq_release_net,
3159 };
3160
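/* /proc/net/rt6_stats: one line of hex counters, in order: fib_nodes,
 * fib_route_nodes, fib_rt_alloc, fib_rt_entries, fib_rt_cache, the current
 * dst entry count, and fib_discarded_routes.
 */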
3161 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3162 {
3163         struct net *net = (struct net *)seq->private;
3164         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3165                    net->ipv6.rt6_stats->fib_nodes,
3166                    net->ipv6.rt6_stats->fib_route_nodes,
3167                    net->ipv6.rt6_stats->fib_rt_alloc,
3168                    net->ipv6.rt6_stats->fib_rt_entries,
3169                    net->ipv6.rt6_stats->fib_rt_cache,
3170                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3171                    net->ipv6.rt6_stats->fib_discarded_routes);
3172
3173         return 0;
3174 }
3175
3176 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3177 {
3178         return single_open_net(inode, file, rt6_stats_seq_show);
3179 }
3180
3181 static const struct file_operations rt6_stats_seq_fops = {
3182         .owner   = THIS_MODULE,
3183         .open    = rt6_stats_seq_open,
3184         .read    = seq_read,
3185         .llseek  = seq_lseek,
3186         .release = single_release_net,
3187 };
3188 #endif  /* CONFIG_PROC_FS */
3189
3190 #ifdef CONFIG_SYSCTL
3191
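/* Handler for the write-only "flush" sysctl: a write kicks off a
 * fib6_run_gc() pass for this namespace.  The table below is normally
 * exposed under net.ipv6.route.* (registration happens elsewhere in the
 * IPv6 stack); e.g. "sysctl -w net.ipv6.route.flush=1" is an illustrative
 * way to reach this handler.
 */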
3192 static
3193 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3194                               void __user *buffer, size_t *lenp, loff_t *ppos)
3195 {
3196         struct net *net;
3197         int delay;
             int ret;
3198         if (!write)
3199                 return -EINVAL;
3200
3201         net = (struct net *)ctl->extra1;
3202         delay = net->ipv6.sysctl.flush_delay;
3203         ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
             if (ret)
                     return ret;
3204         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3205         return 0;
3206 }
3207
3208 struct ctl_table ipv6_route_table_template[] = {
3209         {
3210                 .procname       =       "flush",
3211                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3212                 .maxlen         =       sizeof(int),
3213                 .mode           =       0200,
3214                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3215         },
3216         {
3217                 .procname       =       "gc_thresh",
3218                 .data           =       &ip6_dst_ops_template.gc_thresh,
3219                 .maxlen         =       sizeof(int),
3220                 .mode           =       0644,
3221                 .proc_handler   =       proc_dointvec,
3222         },
3223         {
3224                 .procname       =       "max_size",
3225                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3226                 .maxlen         =       sizeof(int),
3227                 .mode           =       0644,
3228                 .proc_handler   =       proc_dointvec,
3229         },
3230         {
3231                 .procname       =       "gc_min_interval",
3232                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3233                 .maxlen         =       sizeof(int),
3234                 .mode           =       0644,
3235                 .proc_handler   =       proc_dointvec_jiffies,
3236         },
3237         {
3238                 .procname       =       "gc_timeout",
3239                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3240                 .maxlen         =       sizeof(int),
3241                 .mode           =       0644,
3242                 .proc_handler   =       proc_dointvec_jiffies,
3243         },
3244         {
3245                 .procname       =       "gc_interval",
3246                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3247                 .maxlen         =       sizeof(int),
3248                 .mode           =       0644,
3249                 .proc_handler   =       proc_dointvec_jiffies,
3250         },
3251         {
3252                 .procname       =       "gc_elasticity",
3253                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3254                 .maxlen         =       sizeof(int),
3255                 .mode           =       0644,
3256                 .proc_handler   =       proc_dointvec,
3257         },
3258         {
3259                 .procname       =       "mtu_expires",
3260                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3261                 .maxlen         =       sizeof(int),
3262                 .mode           =       0644,
3263                 .proc_handler   =       proc_dointvec_jiffies,
3264         },
3265         {
3266                 .procname       =       "min_adv_mss",
3267                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3268                 .maxlen         =       sizeof(int),
3269                 .mode           =       0644,
3270                 .proc_handler   =       proc_dointvec,
3271         },
3272         {
3273                 .procname       =       "gc_min_interval_ms",
3274                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3275                 .maxlen         =       sizeof(int),
3276                 .mode           =       0644,
3277                 .proc_handler   =       proc_dointvec_ms_jiffies,
3278         },
3279         { }
3280 };
3281
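/* Per-namespace copy of the sysctl table: the template above is duplicated
 * with kmemdup() and each entry's .data pointer is rebound by index to the
 * namespace-local fields, so the order here must match the template.
 */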
3282 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3283 {
3284         struct ctl_table *table;
3285
3286         table = kmemdup(ipv6_route_table_template,
3287                         sizeof(ipv6_route_table_template),
3288                         GFP_KERNEL);
3289
3290         if (table) {
3291                 table[0].data = &net->ipv6.sysctl.flush_delay;
3292                 table[0].extra1 = net;
3293                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3294                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3295                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3296                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3297                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3298                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3299                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3300                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3301                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3302
3303                 /* Don't export sysctls to unprivileged users */
3304                 if (net->user_ns != &init_user_ns)
3305                         table[0].procname = NULL;
3306         }
3307
3308         return table;
3309 }
3310 #endif
3311
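/* Per-namespace route setup: copy the dst_ops template, allocate the null
 * route (plus prohibit/blackhole with CONFIG_IPV6_MULTIPLE_TABLES), and seed
 * the sysctl defaults (e.g. 4096 max routes and a 60 second GC timeout).
 */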
3312 static int __net_init ip6_route_net_init(struct net *net)
3313 {
3314         int ret = -ENOMEM;
3315
3316         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3317                sizeof(net->ipv6.ip6_dst_ops));
3318
3319         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3320                 goto out_ip6_dst_ops;
3321
3322         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3323                                            sizeof(*net->ipv6.ip6_null_entry),
3324                                            GFP_KERNEL);
3325         if (!net->ipv6.ip6_null_entry)
3326                 goto out_ip6_dst_entries;
3327         net->ipv6.ip6_null_entry->dst.path =
3328                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3329         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3330         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3331                          ip6_template_metrics, true);
3332
3333 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3334         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3335                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3336                                                GFP_KERNEL);
3337         if (!net->ipv6.ip6_prohibit_entry)
3338                 goto out_ip6_null_entry;
3339         net->ipv6.ip6_prohibit_entry->dst.path =
3340                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3341         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3342         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3343                          ip6_template_metrics, true);
3344
3345         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3346                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3347                                                GFP_KERNEL);
3348         if (!net->ipv6.ip6_blk_hole_entry)
3349                 goto out_ip6_prohibit_entry;
3350         net->ipv6.ip6_blk_hole_entry->dst.path =
3351                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3352         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3353         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3354                          ip6_template_metrics, true);
3355 #endif
3356
3357         net->ipv6.sysctl.flush_delay = 0;
3358         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3359         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3360         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3361         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3362         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3363         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3364         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3365
3366         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3367
3368         ret = 0;
3369 out:
3370         return ret;
3371
3372 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3373 out_ip6_prohibit_entry:
3374         kfree(net->ipv6.ip6_prohibit_entry);
3375 out_ip6_null_entry:
3376         kfree(net->ipv6.ip6_null_entry);
3377 #endif
3378 out_ip6_dst_entries:
3379         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3380 out_ip6_dst_ops:
3381         goto out;
3382 }
3383
3384 static void __net_exit ip6_route_net_exit(struct net *net)
3385 {
3386         kfree(net->ipv6.ip6_null_entry);
3387 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3388         kfree(net->ipv6.ip6_prohibit_entry);
3389         kfree(net->ipv6.ip6_blk_hole_entry);
3390 #endif
3391         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3392 }
3393
3394 static int __net_init ip6_route_net_init_late(struct net *net)
3395 {
3396 #ifdef CONFIG_PROC_FS
3397         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3398         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3399 #endif
3400         return 0;
3401 }
3402
3403 static void __net_exit ip6_route_net_exit_late(struct net *net)
3404 {
3405 #ifdef CONFIG_PROC_FS
3406         remove_proc_entry("ipv6_route", net->proc_net);
3407         remove_proc_entry("rt6_stats", net->proc_net);
3408 #endif
3409 }
3410
3411 static struct pernet_operations ip6_route_net_ops = {
3412         .init = ip6_route_net_init,
3413         .exit = ip6_route_net_exit,
3414 };
3415
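/* Per-namespace inet_peer base: long-lived per-destination state used by the
 * IPv6 stack, allocated here and torn down in ipv6_inetpeer_exit().
 */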
3416 static int __net_init ipv6_inetpeer_init(struct net *net)
3417 {
3418         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3419
3420         if (!bp)
3421                 return -ENOMEM;
3422         inet_peer_base_init(bp);
3423         net->ipv6.peers = bp;
3424         return 0;
3425 }
3426
3427 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3428 {
3429         struct inet_peer_base *bp = net->ipv6.peers;
3430
3431         net->ipv6.peers = NULL;
3432         inetpeer_invalidate_tree(bp);
3433         kfree(bp);
3434 }
3435
3436 static struct pernet_operations ipv6_inetpeer_ops = {
3437         .init   =       ipv6_inetpeer_init,
3438         .exit   =       ipv6_inetpeer_exit,
3439 };
3440
3441 static struct pernet_operations ip6_route_net_late_ops = {
3442         .init = ip6_route_net_init_late,
3443         .exit = ip6_route_net_exit_late,
3444 };
3445
3446 static struct notifier_block ip6_route_dev_notifier = {
3447         .notifier_call = ip6_route_dev_notify,
3448         .priority = 0,
3449 };
3450
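/* Subsystem bring-up.  Order matters: the rt6_info slab and blackhole dst
 * counters come first, then the pernet subsystems, then the init_net template
 * routes are wired to the loopback device (see the comment below), followed
 * by fib6/xfrm6/fib6_rules, the rtnetlink handlers, the netdevice notifier
 * and the per-cpu uncached lists.  Failures unwind through the goto ladder
 * at the end.
 */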
3451 int __init ip6_route_init(void)
3452 {
3453         int ret;
3454         int cpu;
3455
3456         ret = -ENOMEM;
3457         ip6_dst_ops_template.kmem_cachep =
3458                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3459                                   SLAB_HWCACHE_ALIGN, NULL);
3460         if (!ip6_dst_ops_template.kmem_cachep)
3461                 goto out;
3462
3463         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3464         if (ret)
3465                 goto out_kmem_cache;
3466
3467         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3468         if (ret)
3469                 goto out_dst_entries;
3470
3471         ret = register_pernet_subsys(&ip6_route_net_ops);
3472         if (ret)
3473                 goto out_register_inetpeer;
3474
3475         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3476
3477         /* The loopback device is registered before this portion of code runs,
3478          * so the loopback reference in rt6_info is not taken automatically;
3479          * take it manually for init_net. */
3480         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3481         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3482 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3483         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3484         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3485         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3486         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3487 #endif
3488         ret = fib6_init();
3489         if (ret)
3490                 goto out_register_subsys;
3491
3492         ret = xfrm6_init();
3493         if (ret)
3494                 goto out_fib6_init;
3495
3496         ret = fib6_rules_init();
3497         if (ret)
3498                 goto xfrm6_init;
3499
3500         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3501         if (ret)
3502                 goto fib6_rules_init;
3503
3504         ret = -ENOBUFS;
3505         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3506             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3507             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3508                 goto out_register_late_subsys;
3509
3510         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3511         if (ret)
3512                 goto out_register_late_subsys;
3513
3514         for_each_possible_cpu(cpu) {
3515                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3516
3517                 INIT_LIST_HEAD(&ul->head);
3518                 spin_lock_init(&ul->lock);
3519         }
3520
3521 out:
3522         return ret;
3523
3524 out_register_late_subsys:
3525         unregister_pernet_subsys(&ip6_route_net_late_ops);
3526 fib6_rules_init:
3527         fib6_rules_cleanup();
3528 xfrm6_init:
3529         xfrm6_fini();
3530 out_fib6_init:
3531         fib6_gc_cleanup();
3532 out_register_subsys:
3533         unregister_pernet_subsys(&ip6_route_net_ops);
3534 out_register_inetpeer:
3535         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3536 out_dst_entries:
3537         dst_entries_destroy(&ip6_dst_blackhole_ops);
3538 out_kmem_cache:
3539         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3540         goto out;
3541 }
3542
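/* Teardown: mirrors ip6_route_init(), unregistering and freeing everything
 * set up at init time.
 */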
3543 void ip6_route_cleanup(void)
3544 {
3545         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3546         unregister_pernet_subsys(&ip6_route_net_late_ops);
3547         fib6_rules_cleanup();
3548         xfrm6_fini();
3549         fib6_gc_cleanup();
3550         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3551         unregister_pernet_subsys(&ip6_route_net_ops);
3552         dst_entries_destroy(&ip6_dst_blackhole_ops);
3553         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3554 }