ipv6: route: extend flow representation with tunnel key
net/ipv6/route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64
65 #include <asm/uaccess.h>
66
67 #ifdef CONFIG_SYSCTL
68 #include <linux/sysctl.h>
69 #endif
70
71 enum rt6_nud_state {
72         RT6_NUD_FAIL_HARD = -3,
73         RT6_NUD_FAIL_PROBE = -2,
74         RT6_NUD_FAIL_DO_RR = -1,
75         RT6_NUD_SUCCEED = 1
76 };
77
78 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
79 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
80 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
81 static unsigned int      ip6_mtu(const struct dst_entry *dst);
82 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
83 static void             ip6_dst_destroy(struct dst_entry *);
84 static void             ip6_dst_ifdown(struct dst_entry *,
85                                        struct net_device *dev, int how);
86 static int               ip6_dst_gc(struct dst_ops *ops);
87
88 static int              ip6_pkt_discard(struct sk_buff *skb);
89 static int              ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
90 static int              ip6_pkt_prohibit(struct sk_buff *skb);
91 static int              ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
92 static void             ip6_link_failure(struct sk_buff *skb);
93 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
94                                            struct sk_buff *skb, u32 mtu);
95 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
96                                         struct sk_buff *skb);
97 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
98 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
99
100 #ifdef CONFIG_IPV6_ROUTE_INFO
101 static struct rt6_info *rt6_add_route_info(struct net *net,
102                                            const struct in6_addr *prefix, int prefixlen,
103                                            const struct in6_addr *gwaddr, int ifindex,
104                                            unsigned int pref);
105 static struct rt6_info *rt6_get_route_info(struct net *net,
106                                            const struct in6_addr *prefix, int prefixlen,
107                                            const struct in6_addr *gwaddr, int ifindex);
108 #endif
109
110 struct uncached_list {
111         spinlock_t              lock;
112         struct list_head        head;
113 };
114
115 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
116
117 static void rt6_uncached_list_add(struct rt6_info *rt)
118 {
119         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
120
121         rt->dst.flags |= DST_NOCACHE;
122         rt->rt6i_uncached_list = ul;
123
124         spin_lock_bh(&ul->lock);
125         list_add_tail(&rt->rt6i_uncached, &ul->head);
126         spin_unlock_bh(&ul->lock);
127 }
128
129 static void rt6_uncached_list_del(struct rt6_info *rt)
130 {
131         if (!list_empty(&rt->rt6i_uncached)) {
132                 struct uncached_list *ul = rt->rt6i_uncached_list;
133
134                 spin_lock_bh(&ul->lock);
135                 list_del(&rt->rt6i_uncached);
136                 spin_unlock_bh(&ul->lock);
137         }
138 }
139
140 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
141 {
142         struct net_device *loopback_dev = net->loopback_dev;
143         int cpu;
144
145         for_each_possible_cpu(cpu) {
146                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
147                 struct rt6_info *rt;
148
149                 spin_lock_bh(&ul->lock);
150                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
151                         struct inet6_dev *rt_idev = rt->rt6i_idev;
152                         struct net_device *rt_dev = rt->dst.dev;
153
154                         if (rt_idev && (rt_idev->dev == dev || !dev) &&
155                             rt_idev->dev != loopback_dev) {
156                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
157                                 in6_dev_put(rt_idev);
158                         }
159
160                         if (rt_dev && (rt_dev == dev || !dev) &&
161                             rt_dev != loopback_dev) {
162                                 rt->dst.dev = loopback_dev;
163                                 dev_hold(rt->dst.dev);
164                                 dev_put(rt_dev);
165                         }
166                 }
167                 spin_unlock_bh(&ul->lock);
168         }
169 }
170
171 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
172 {
173         return dst_metrics_write_ptr(rt->dst.from);
174 }
175
176 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
177 {
178         struct rt6_info *rt = (struct rt6_info *)dst;
179
180         if (rt->rt6i_flags & RTF_PCPU)
181                 return rt6_pcpu_cow_metrics(rt);
182         else if (rt->rt6i_flags & RTF_CACHE)
183                 return NULL;
184         else
185                 return dst_cow_metrics_generic(dst, old);
186 }
187
188 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
189                                              struct sk_buff *skb,
190                                              const void *daddr)
191 {
192         struct in6_addr *p = &rt->rt6i_gateway;
193
194         if (!ipv6_addr_any(p))
195                 return (const void *) p;
196         else if (skb)
197                 return &ipv6_hdr(skb)->daddr;
198         return daddr;
199 }
200
201 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
202                                           struct sk_buff *skb,
203                                           const void *daddr)
204 {
205         struct rt6_info *rt = (struct rt6_info *) dst;
206         struct neighbour *n;
207
208         daddr = choose_neigh_daddr(rt, skb, daddr);
209         n = __ipv6_neigh_lookup(dst->dev, daddr);
210         if (n)
211                 return n;
212         return neigh_create(&nd_tbl, daddr, dst->dev);
213 }
214
215 static struct dst_ops ip6_dst_ops_template = {
216         .family                 =       AF_INET6,
217         .gc                     =       ip6_dst_gc,
218         .gc_thresh              =       1024,
219         .check                  =       ip6_dst_check,
220         .default_advmss         =       ip6_default_advmss,
221         .mtu                    =       ip6_mtu,
222         .cow_metrics            =       ipv6_cow_metrics,
223         .destroy                =       ip6_dst_destroy,
224         .ifdown                 =       ip6_dst_ifdown,
225         .negative_advice        =       ip6_negative_advice,
226         .link_failure           =       ip6_link_failure,
227         .update_pmtu            =       ip6_rt_update_pmtu,
228         .redirect               =       rt6_do_redirect,
229         .local_out              =       __ip6_local_out,
230         .neigh_lookup           =       ip6_neigh_lookup,
231 };
232
233 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
234 {
235         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
236
237         return mtu ? : dst->dev->mtu;
238 }
239
240 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
241                                          struct sk_buff *skb, u32 mtu)
242 {
243 }
244
245 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
246                                       struct sk_buff *skb)
247 {
248 }
249
250 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
251                                          unsigned long old)
252 {
253         return NULL;
254 }
255
256 static struct dst_ops ip6_dst_blackhole_ops = {
257         .family                 =       AF_INET6,
258         .destroy                =       ip6_dst_destroy,
259         .check                  =       ip6_dst_check,
260         .mtu                    =       ip6_blackhole_mtu,
261         .default_advmss         =       ip6_default_advmss,
262         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
263         .redirect               =       ip6_rt_blackhole_redirect,
264         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
265         .neigh_lookup           =       ip6_neigh_lookup,
266 };
267
268 static const u32 ip6_template_metrics[RTAX_MAX] = {
269         [RTAX_HOPLIMIT - 1] = 0,
270 };
271
272 static const struct rt6_info ip6_null_entry_template = {
273         .dst = {
274                 .__refcnt       = ATOMIC_INIT(1),
275                 .__use          = 1,
276                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
277                 .error          = -ENETUNREACH,
278                 .input          = ip6_pkt_discard,
279                 .output         = ip6_pkt_discard_out,
280         },
281         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
282         .rt6i_protocol  = RTPROT_KERNEL,
283         .rt6i_metric    = ~(u32) 0,
284         .rt6i_ref       = ATOMIC_INIT(1),
285 };
286
287 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
288
289 static const struct rt6_info ip6_prohibit_entry_template = {
290         .dst = {
291                 .__refcnt       = ATOMIC_INIT(1),
292                 .__use          = 1,
293                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
294                 .error          = -EACCES,
295                 .input          = ip6_pkt_prohibit,
296                 .output         = ip6_pkt_prohibit_out,
297         },
298         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
299         .rt6i_protocol  = RTPROT_KERNEL,
300         .rt6i_metric    = ~(u32) 0,
301         .rt6i_ref       = ATOMIC_INIT(1),
302 };
303
304 static const struct rt6_info ip6_blk_hole_entry_template = {
305         .dst = {
306                 .__refcnt       = ATOMIC_INIT(1),
307                 .__use          = 1,
308                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
309                 .error          = -EINVAL,
310                 .input          = dst_discard,
311                 .output         = dst_discard_sk,
312         },
313         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
314         .rt6i_protocol  = RTPROT_KERNEL,
315         .rt6i_metric    = ~(u32) 0,
316         .rt6i_ref       = ATOMIC_INIT(1),
317 };
318
319 #endif
320
321 /* allocate dst with ip6_dst_ops */
322 static struct rt6_info *__ip6_dst_alloc(struct net *net,
323                                         struct net_device *dev,
324                                         int flags,
325                                         struct fib6_table *table)
326 {
327         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
328                                         0, DST_OBSOLETE_FORCE_CHK, flags);
329
330         if (rt) {
331                 struct dst_entry *dst = &rt->dst;
332
333                 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
334                 INIT_LIST_HEAD(&rt->rt6i_siblings);
335                 INIT_LIST_HEAD(&rt->rt6i_uncached);
336         }
337         return rt;
338 }
339
340 static struct rt6_info *ip6_dst_alloc(struct net *net,
341                                       struct net_device *dev,
342                                       int flags,
343                                       struct fib6_table *table)
344 {
345         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table);
346
347         if (rt) {
348                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
349                 if (rt->rt6i_pcpu) {
350                         int cpu;
351
352                         for_each_possible_cpu(cpu) {
353                                 struct rt6_info **p;
354
355                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
356                                 /* no one shares rt */
357                                 *p =  NULL;
358                         }
359                 } else {
360                         dst_destroy((struct dst_entry *)rt);
361                         return NULL;
362                 }
363         }
364
365         return rt;
366 }
367
368 static void ip6_dst_destroy(struct dst_entry *dst)
369 {
370         struct rt6_info *rt = (struct rt6_info *)dst;
371         struct dst_entry *from = dst->from;
372         struct inet6_dev *idev;
373
374         dst_destroy_metrics_generic(dst);
375         free_percpu(rt->rt6i_pcpu);
376         rt6_uncached_list_del(rt);
377
378         idev = rt->rt6i_idev;
379         if (idev) {
380                 rt->rt6i_idev = NULL;
381                 in6_dev_put(idev);
382         }
383
384         dst->from = NULL;
385         dst_release(from);
386 }
387
388 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
389                            int how)
390 {
391         struct rt6_info *rt = (struct rt6_info *)dst;
392         struct inet6_dev *idev = rt->rt6i_idev;
393         struct net_device *loopback_dev =
394                 dev_net(dev)->loopback_dev;
395
396         if (dev != loopback_dev) {
397                 if (idev && idev->dev == dev) {
398                         struct inet6_dev *loopback_idev =
399                                 in6_dev_get(loopback_dev);
400                         if (loopback_idev) {
401                                 rt->rt6i_idev = loopback_idev;
402                                 in6_dev_put(idev);
403                         }
404                 }
405         }
406 }
407
408 static bool rt6_check_expired(const struct rt6_info *rt)
409 {
410         if (rt->rt6i_flags & RTF_EXPIRES) {
411                 if (time_after(jiffies, rt->dst.expires))
412                         return true;
413         } else if (rt->dst.from) {
414                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
415         }
416         return false;
417 }
418
419 /* Multipath route selection:
420  *   Hash-based function using the packet header and flow label.
421  * Adapted from fib_info_hashfn()
422  */
423 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
424                                const struct flowi6 *fl6)
425 {
426         unsigned int val = fl6->flowi6_proto;
427
428         val ^= ipv6_addr_hash(&fl6->daddr);
429         val ^= ipv6_addr_hash(&fl6->saddr);
430
431                 /* This works only if the flow is not encapsulated */
432         switch (fl6->flowi6_proto) {
433         case IPPROTO_UDP:
434         case IPPROTO_TCP:
435         case IPPROTO_SCTP:
436                 val ^= (__force u16)fl6->fl6_sport;
437                 val ^= (__force u16)fl6->fl6_dport;
438                 break;
439
440         case IPPROTO_ICMPV6:
441                 val ^= (__force u16)fl6->fl6_icmp_type;
442                 val ^= (__force u16)fl6->fl6_icmp_code;
443                 break;
444         }
445         /* RFC 6438 recommends using the flow label */
446         val ^= (__force u32)fl6->flowlabel;
447
448         /* Perhaps we need to tune this function? */
449         val = val ^ (val >> 7) ^ (val >> 12);
450         return val % candidate_count;
451 }
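/* In effect, for a TCP or UDP flow the hash mixes the protocol, both
 * addresses, both ports and the flow label, folds the result with
 * val ^ (val >> 7) ^ (val >> 12), and reduces it modulo the number of
 * candidate routes, so packets of one flow keep hitting the same sibling.
 */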
452
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454                                              struct flowi6 *fl6, int oif,
455                                              int strict)
456 {
457         struct rt6_info *sibling, *next_sibling;
458         int route_choosen;
459
460         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
461         /* Don't change the route if route_choosen == 0
462          * (the siblings list does not include ourselves)
463          */
464         if (route_choosen)
465                 list_for_each_entry_safe(sibling, next_sibling,
466                                 &match->rt6i_siblings, rt6i_siblings) {
467                         route_choosen--;
468                         if (route_choosen == 0) {
469                                 if (rt6_score_route(sibling, oif, strict) < 0)
470                                         break;
471                                 match = sibling;
472                                 break;
473                         }
474                 }
475         return match;
476 }
477
478 /*
479  *      Route lookup.  The caller is assumed to hold the relevant table->tb6_lock.
480  */
481
482 static inline struct rt6_info *rt6_device_match(struct net *net,
483                                                     struct rt6_info *rt,
484                                                     const struct in6_addr *saddr,
485                                                     int oif,
486                                                     int flags)
487 {
488         struct rt6_info *local = NULL;
489         struct rt6_info *sprt;
490
491         if (!oif && ipv6_addr_any(saddr))
492                 goto out;
493
494         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
495                 struct net_device *dev = sprt->dst.dev;
496
497                 if (oif) {
498                         if (dev->ifindex == oif)
499                                 return sprt;
500                         if (dev->flags & IFF_LOOPBACK) {
501                                 if (!sprt->rt6i_idev ||
502                                     sprt->rt6i_idev->dev->ifindex != oif) {
503                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
504                                                 continue;
505                                         if (local && (!oif ||
506                                                       local->rt6i_idev->dev->ifindex == oif))
507                                                 continue;
508                                 }
509                                 local = sprt;
510                         }
511                 } else {
512                         if (ipv6_chk_addr(net, saddr, dev,
513                                           flags & RT6_LOOKUP_F_IFACE))
514                                 return sprt;
515                 }
516         }
517
518         if (oif) {
519                 if (local)
520                         return local;
521
522                 if (flags & RT6_LOOKUP_F_IFACE)
523                         return net->ipv6.ip6_null_entry;
524         }
525 out:
526         return rt;
527 }
528
529 #ifdef CONFIG_IPV6_ROUTER_PREF
530 struct __rt6_probe_work {
531         struct work_struct work;
532         struct in6_addr target;
533         struct net_device *dev;
534 };
535
536 static void rt6_probe_deferred(struct work_struct *w)
537 {
538         struct in6_addr mcaddr;
539         struct __rt6_probe_work *work =
540                 container_of(w, struct __rt6_probe_work, work);
541
542         addrconf_addr_solict_mult(&work->target, &mcaddr);
543         ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL, NULL);
544         dev_put(work->dev);
545         kfree(work);
546 }
547
548 static void rt6_probe(struct rt6_info *rt)
549 {
550         struct __rt6_probe_work *work;
551         struct neighbour *neigh;
552         /*
553          * Okay, this does not seem to be appropriate
554          * for now; however, we need to check whether it
555          * really is, aka Router Reachability Probing.
556          *
557          * Router Reachability Probe MUST be rate-limited
558          * to no more than one per minute.
559          */
560         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
561                 return;
562         rcu_read_lock_bh();
563         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
564         if (neigh) {
565                 if (neigh->nud_state & NUD_VALID)
566                         goto out;
567
568                 work = NULL;
569                 write_lock(&neigh->lock);
570                 if (!(neigh->nud_state & NUD_VALID) &&
571                     time_after(jiffies,
572                                neigh->updated +
573                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
574                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
575                         if (work)
576                                 __neigh_set_probe_once(neigh);
577                 }
578                 write_unlock(&neigh->lock);
579         } else {
580                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
581         }
582
583         if (work) {
584                 INIT_WORK(&work->work, rt6_probe_deferred);
585                 work->target = rt->rt6i_gateway;
586                 dev_hold(rt->dst.dev);
587                 work->dev = rt->dst.dev;
588                 schedule_work(&work->work);
589         }
590
591 out:
592         rcu_read_unlock_bh();
593 }
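/* The probe itself (a Neighbour Solicitation towards the gateway's
 * solicited-node address) is sent from workqueue context by
 * rt6_probe_deferred(); rt6_probe() only enforces the rtr_probe_interval
 * rate limit under the neighbour lock and schedules the work.
 */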
594 #else
595 static inline void rt6_probe(struct rt6_info *rt)
596 {
597 }
598 #endif
599
600 /*
601  * Default Router Selection (RFC 2461 6.3.6)
602  */
603 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
604 {
605         struct net_device *dev = rt->dst.dev;
606         if (!oif || dev->ifindex == oif)
607                 return 2;
608         if ((dev->flags & IFF_LOOPBACK) &&
609             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
610                 return 1;
611         return 0;
612 }
613
614 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
615 {
616         struct neighbour *neigh;
617         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
618
619         if (rt->rt6i_flags & RTF_NONEXTHOP ||
620             !(rt->rt6i_flags & RTF_GATEWAY))
621                 return RT6_NUD_SUCCEED;
622
623         rcu_read_lock_bh();
624         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
625         if (neigh) {
626                 read_lock(&neigh->lock);
627                 if (neigh->nud_state & NUD_VALID)
628                         ret = RT6_NUD_SUCCEED;
629 #ifdef CONFIG_IPV6_ROUTER_PREF
630                 else if (!(neigh->nud_state & NUD_FAILED))
631                         ret = RT6_NUD_SUCCEED;
632                 else
633                         ret = RT6_NUD_FAIL_PROBE;
634 #endif
635                 read_unlock(&neigh->lock);
636         } else {
637                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
638                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
639         }
640         rcu_read_unlock_bh();
641
642         return ret;
643 }
644
645 static int rt6_score_route(struct rt6_info *rt, int oif,
646                            int strict)
647 {
648         int m;
649
650         m = rt6_check_dev(rt, oif);
651         if (!m && (strict & RT6_LOOKUP_F_IFACE))
652                 return RT6_NUD_FAIL_HARD;
653 #ifdef CONFIG_IPV6_ROUTER_PREF
654         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
655 #endif
656         if (strict & RT6_LOOKUP_F_REACHABLE) {
657                 int n = rt6_check_neigh(rt);
658                 if (n < 0)
659                         return n;
660         }
661         return m;
662 }
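/* The returned score packs the rt6_check_dev() result (0, 1 or 2) into
 * the low bits and, with CONFIG_IPV6_ROUTER_PREF, the decoded router
 * preference shifted left by two; negative rt6_nud_state values are
 * passed through so the caller can treat reachability failures specially.
 */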
663
664 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
665                                    int *mpri, struct rt6_info *match,
666                                    bool *do_rr)
667 {
668         int m;
669         bool match_do_rr = false;
670         struct inet6_dev *idev = rt->rt6i_idev;
671         struct net_device *dev = rt->dst.dev;
672
673         if (dev && !netif_carrier_ok(dev) &&
674             idev->cnf.ignore_routes_with_linkdown)
675                 goto out;
676
677         if (rt6_check_expired(rt))
678                 goto out;
679
680         m = rt6_score_route(rt, oif, strict);
681         if (m == RT6_NUD_FAIL_DO_RR) {
682                 match_do_rr = true;
683                 m = 0; /* lowest valid score */
684         } else if (m == RT6_NUD_FAIL_HARD) {
685                 goto out;
686         }
687
688         if (strict & RT6_LOOKUP_F_REACHABLE)
689                 rt6_probe(rt);
690
691         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
692         if (m > *mpri) {
693                 *do_rr = match_do_rr;
694                 *mpri = m;
695                 match = rt;
696         }
697 out:
698         return match;
699 }
700
701 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
702                                      struct rt6_info *rr_head,
703                                      u32 metric, int oif, int strict,
704                                      bool *do_rr)
705 {
706         struct rt6_info *rt, *match, *cont;
707         int mpri = -1;
708
709         match = NULL;
710         cont = NULL;
711         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
712                 if (rt->rt6i_metric != metric) {
713                         cont = rt;
714                         break;
715                 }
716
717                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
718         }
719
720         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
721                 if (rt->rt6i_metric != metric) {
722                         cont = rt;
723                         break;
724                 }
725
726                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
727         }
728
729         if (match || !cont)
730                 return match;
731
732         for (rt = cont; rt; rt = rt->dst.rt6_next)
733                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
734
735         return match;
736 }
737
738 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
739 {
740         struct rt6_info *match, *rt0;
741         struct net *net;
742         bool do_rr = false;
743
744         rt0 = fn->rr_ptr;
745         if (!rt0)
746                 fn->rr_ptr = rt0 = fn->leaf;
747
748         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
749                              &do_rr);
750
751         if (do_rr) {
752                 struct rt6_info *next = rt0->dst.rt6_next;
753
754                 /* no entries matched; do round-robin */
755                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
756                         next = fn->leaf;
757
758                 if (next != rt0)
759                         fn->rr_ptr = next;
760         }
761
762         net = dev_net(rt0->dst.dev);
763         return match ? match : net->ipv6.ip6_null_entry;
764 }
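/* fn->rr_ptr only advances when find_match() reported
 * RT6_NUD_FAIL_DO_RR; it moves to the next route with the same metric
 * as rt0, wrapping back to fn->leaf otherwise, which implements the
 * round-robin fallback mentioned in the file header comment.
 */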
765
766 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
767 {
768         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
769 }
770
771 #ifdef CONFIG_IPV6_ROUTE_INFO
772 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
773                   const struct in6_addr *gwaddr)
774 {
775         struct net *net = dev_net(dev);
776         struct route_info *rinfo = (struct route_info *) opt;
777         struct in6_addr prefix_buf, *prefix;
778         unsigned int pref;
779         unsigned long lifetime;
780         struct rt6_info *rt;
781
782         if (len < sizeof(struct route_info)) {
783                 return -EINVAL;
784         }
785
786         /* Sanity check for prefix_len and length */
787         if (rinfo->length > 3) {
788                 return -EINVAL;
789         } else if (rinfo->prefix_len > 128) {
790                 return -EINVAL;
791         } else if (rinfo->prefix_len > 64) {
792                 if (rinfo->length < 2) {
793                         return -EINVAL;
794                 }
795         } else if (rinfo->prefix_len > 0) {
796                 if (rinfo->length < 1) {
797                         return -EINVAL;
798                 }
799         }
800
801         pref = rinfo->route_pref;
802         if (pref == ICMPV6_ROUTER_PREF_INVALID)
803                 return -EINVAL;
804
805         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
806
807         if (rinfo->length == 3)
808                 prefix = (struct in6_addr *)rinfo->prefix;
809         else {
810                 /* this function is safe */
811                 ipv6_addr_prefix(&prefix_buf,
812                                  (struct in6_addr *)rinfo->prefix,
813                                  rinfo->prefix_len);
814                 prefix = &prefix_buf;
815         }
816
817         if (rinfo->prefix_len == 0)
818                 rt = rt6_get_dflt_router(gwaddr, dev);
819         else
820                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
821                                         gwaddr, dev->ifindex);
822
823         if (rt && !lifetime) {
824                 ip6_del_rt(rt);
825                 rt = NULL;
826         }
827
828         if (!rt && lifetime)
829                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
830                                         pref);
831         else if (rt)
832                 rt->rt6i_flags = RTF_ROUTEINFO |
833                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
834
835         if (rt) {
836                 if (!addrconf_finite_timeout(lifetime))
837                         rt6_clean_expires(rt);
838                 else
839                         rt6_set_expires(rt, jiffies + HZ * lifetime);
840
841                 ip6_rt_put(rt);
842         }
843         return 0;
844 }
845 #endif
846
847 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
848                                         struct in6_addr *saddr)
849 {
850         struct fib6_node *pn;
851         while (1) {
852                 if (fn->fn_flags & RTN_TL_ROOT)
853                         return NULL;
854                 pn = fn->parent;
855                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
856                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
857                 else
858                         fn = pn;
859                 if (fn->fn_flags & RTN_RTINFO)
860                         return fn;
861         }
862 }
863
864 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
865                                              struct fib6_table *table,
866                                              struct flowi6 *fl6, int flags)
867 {
868         struct fib6_node *fn;
869         struct rt6_info *rt;
870
871         read_lock_bh(&table->tb6_lock);
872         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
873 restart:
874         rt = fn->leaf;
875         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
876         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
877                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
878         if (rt == net->ipv6.ip6_null_entry) {
879                 fn = fib6_backtrack(fn, &fl6->saddr);
880                 if (fn)
881                         goto restart;
882         }
883         dst_use(&rt->dst, jiffies);
884         read_unlock_bh(&table->tb6_lock);
885         return rt;
886
887 }
888
889 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
890                                     int flags)
891 {
892         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
893 }
894 EXPORT_SYMBOL_GPL(ip6_route_lookup);
895
896 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
897                             const struct in6_addr *saddr, int oif, int strict)
898 {
899         struct flowi6 fl6 = {
900                 .flowi6_oif = oif,
901                 .daddr = *daddr,
902         };
903         struct dst_entry *dst;
904         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
905
906         if (saddr) {
907                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
908                 flags |= RT6_LOOKUP_F_HAS_SADDR;
909         }
910
911         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
912         if (dst->error == 0)
913                 return (struct rt6_info *) dst;
914
915         dst_release(dst);
916
917         return NULL;
918 }
919 EXPORT_SYMBOL(rt6_lookup);
920
921 /* ip6_ins_rt is called with table->tb6_lock NOT held (i.e. free).
922    It takes a new route entry; if the addition fails for any
923    reason, the route is freed.  In any case, if the caller does
924    not hold a reference to it, the route may be destroyed.
925  */
926
927 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
928                         struct mx6_config *mxc)
929 {
930         int err;
931         struct fib6_table *table;
932
933         table = rt->rt6i_table;
934         write_lock_bh(&table->tb6_lock);
935         err = fib6_add(&table->tb6_root, rt, info, mxc);
936         write_unlock_bh(&table->tb6_lock);
937
938         return err;
939 }
940
941 int ip6_ins_rt(struct rt6_info *rt)
942 {
943         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
944         struct mx6_config mxc = { .mx = NULL, };
945
946         return __ip6_ins_rt(rt, &info, &mxc);
947 }
948
949 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
950                                            const struct in6_addr *daddr,
951                                            const struct in6_addr *saddr)
952 {
953         struct rt6_info *rt;
954
955         /*
956          *      Clone the route.
957          */
958
959         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
960                 ort = (struct rt6_info *)ort->dst.from;
961
962         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
963                              0, ort->rt6i_table);
964
965         if (!rt)
966                 return NULL;
967
968         ip6_rt_copy_init(rt, ort);
969         rt->rt6i_flags |= RTF_CACHE;
970         rt->rt6i_metric = 0;
971         rt->dst.flags |= DST_HOST;
972         rt->rt6i_dst.addr = *daddr;
973         rt->rt6i_dst.plen = 128;
974
975         if (!rt6_is_gw_or_nonexthop(ort)) {
976                 if (ort->rt6i_dst.plen != 128 &&
977                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
978                         rt->rt6i_flags |= RTF_ANYCAST;
979 #ifdef CONFIG_IPV6_SUBTREES
980                 if (rt->rt6i_src.plen && saddr) {
981                         rt->rt6i_src.addr = *saddr;
982                         rt->rt6i_src.plen = 128;
983                 }
984 #endif
985         }
986
987         return rt;
988 }
989
990 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
991 {
992         struct rt6_info *pcpu_rt;
993
994         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
995                                   rt->dst.dev, rt->dst.flags,
996                                   rt->rt6i_table);
997
998         if (!pcpu_rt)
999                 return NULL;
1000         ip6_rt_copy_init(pcpu_rt, rt);
1001         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1002         pcpu_rt->rt6i_flags |= RTF_PCPU;
1003         return pcpu_rt;
1004 }
1005
1006 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1007 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1008 {
1009         struct rt6_info *pcpu_rt, *prev, **p;
1010
1011         p = this_cpu_ptr(rt->rt6i_pcpu);
1012         pcpu_rt = *p;
1013
1014         if (pcpu_rt)
1015                 goto done;
1016
1017         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1018         if (!pcpu_rt) {
1019                 struct net *net = dev_net(rt->dst.dev);
1020
1021                 pcpu_rt = net->ipv6.ip6_null_entry;
1022                 goto done;
1023         }
1024
1025         prev = cmpxchg(p, NULL, pcpu_rt);
1026         if (prev) {
1027                 /* If someone did it before us, return prev instead */
1028                 dst_destroy(&pcpu_rt->dst);
1029                 pcpu_rt = prev;
1030         }
1031
1032 done:
1033         dst_hold(&pcpu_rt->dst);
1034         rt6_dst_from_metrics_check(pcpu_rt);
1035         return pcpu_rt;
1036 }
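/* The per-cpu slot is filled lock-free: cmpxchg() installs the new
 * clone only if the slot is still NULL; if someone installed an entry
 * first, our clone is destroyed and the existing one is returned instead.
 */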
1037
1038 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1039                                       struct flowi6 *fl6, int flags)
1040 {
1041         struct fib6_node *fn, *saved_fn;
1042         struct rt6_info *rt;
1043         int strict = 0;
1044
1045         strict |= flags & RT6_LOOKUP_F_IFACE;
1046         if (net->ipv6.devconf_all->forwarding == 0)
1047                 strict |= RT6_LOOKUP_F_REACHABLE;
1048
1049         read_lock_bh(&table->tb6_lock);
1050
1051         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1052         saved_fn = fn;
1053
1054 redo_rt6_select:
1055         rt = rt6_select(fn, oif, strict);
1056         if (rt->rt6i_nsiblings)
1057                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1058         if (rt == net->ipv6.ip6_null_entry) {
1059                 fn = fib6_backtrack(fn, &fl6->saddr);
1060                 if (fn)
1061                         goto redo_rt6_select;
1062                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1063                         /* also consider unreachable route */
1064                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1065                         fn = saved_fn;
1066                         goto redo_rt6_select;
1067                 }
1068         }
1069
1070
1071         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1072                 dst_use(&rt->dst, jiffies);
1073                 read_unlock_bh(&table->tb6_lock);
1074
1075                 rt6_dst_from_metrics_check(rt);
1076                 return rt;
1077         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1078                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1079                 /* Create an RTF_CACHE clone which will not be
1080                  * owned by the fib6 tree.  It is for the special case where
1081                  * the daddr in the skb during the neighbor look-up is different
1082                  * from the fl6->daddr used to look up the route here.
1083                  */
1084
1085                 struct rt6_info *uncached_rt;
1086
1087                 dst_use(&rt->dst, jiffies);
1088                 read_unlock_bh(&table->tb6_lock);
1089
1090                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1091                 dst_release(&rt->dst);
1092
1093                 if (uncached_rt)
1094                         rt6_uncached_list_add(uncached_rt);
1095                 else
1096                         uncached_rt = net->ipv6.ip6_null_entry;
1097
1098                 dst_hold(&uncached_rt->dst);
1099                 return uncached_rt;
1100
1101         } else {
1102                 /* Get a percpu copy */
1103
1104                 struct rt6_info *pcpu_rt;
1105
1106                 rt->dst.lastuse = jiffies;
1107                 rt->dst.__use++;
1108                 pcpu_rt = rt6_get_pcpu_route(rt);
1109                 read_unlock_bh(&table->tb6_lock);
1110
1111                 return pcpu_rt;
1112         }
1113 }
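/* Summary of the return paths above: reject routes and RTF_CACHE
 * entries are returned directly; FLOWI_FLAG_KNOWN_NH lookups on
 * gateway-less routes get an uncached RTF_CACHE clone tracked on
 * rt6_uncached_list; everything else is served from the per-cpu copy.
 */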
1114
1115 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1116                                             struct flowi6 *fl6, int flags)
1117 {
1118         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1119 }
1120
1121 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1122                                                 struct net_device *dev,
1123                                                 struct flowi6 *fl6, int flags)
1124 {
1125         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1126                 flags |= RT6_LOOKUP_F_IFACE;
1127
1128         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1129 }
1130
1131 void ip6_route_input(struct sk_buff *skb)
1132 {
1133         const struct ipv6hdr *iph = ipv6_hdr(skb);
1134         struct net *net = dev_net(skb->dev);
1135         int flags = RT6_LOOKUP_F_HAS_SADDR;
1136         struct ip_tunnel_info *tun_info;
1137         struct flowi6 fl6 = {
1138                 .flowi6_iif = skb->dev->ifindex,
1139                 .daddr = iph->daddr,
1140                 .saddr = iph->saddr,
1141                 .flowlabel = ip6_flowinfo(iph),
1142                 .flowi6_mark = skb->mark,
1143                 .flowi6_proto = iph->nexthdr,
1144         };
1145
1146         tun_info = skb_tunnel_info(skb);
1147         if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX)
1148                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1149         skb_dst_drop(skb);
1150         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1151 }
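/* If the packet carried receive-side tunnel metadata (e.g. from a
 * collect_md tunnel device), the tunnel id is copied into
 * fl6.flowi6_tun_key above, so it becomes part of the flow key used
 * for the route lookup.
 */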
1152
1153 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1154                                              struct flowi6 *fl6, int flags)
1155 {
1156         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1157 }
1158
1159 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1160                                     struct flowi6 *fl6)
1161 {
1162         int flags = 0;
1163
1164         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1165
1166         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1167                 flags |= RT6_LOOKUP_F_IFACE;
1168
1169         if (!ipv6_addr_any(&fl6->saddr))
1170                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1171         else if (sk)
1172                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1173
1174         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1175 }
1176 EXPORT_SYMBOL(ip6_route_output);
1177
1178 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1179 {
1180         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1181         struct dst_entry *new = NULL;
1182
1183         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1184         if (rt) {
1185                 new = &rt->dst;
1186
1187                 memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
1188
1189                 new->__use = 1;
1190                 new->input = dst_discard;
1191                 new->output = dst_discard_sk;
1192
1193                 if (dst_metrics_read_only(&ort->dst))
1194                         new->_metrics = ort->dst._metrics;
1195                 else
1196                         dst_copy_metrics(new, &ort->dst);
1197                 rt->rt6i_idev = ort->rt6i_idev;
1198                 if (rt->rt6i_idev)
1199                         in6_dev_hold(rt->rt6i_idev);
1200
1201                 rt->rt6i_gateway = ort->rt6i_gateway;
1202                 rt->rt6i_flags = ort->rt6i_flags;
1203                 rt->rt6i_metric = 0;
1204
1205                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1206 #ifdef CONFIG_IPV6_SUBTREES
1207                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1208 #endif
1209
1210                 dst_free(new);
1211         }
1212
1213         dst_release(dst_orig);
1214         return new ? new : ERR_PTR(-ENOMEM);
1215 }
1216
1217 /*
1218  *      Destination cache support functions
1219  */
1220
1221 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1222 {
1223         if (rt->dst.from &&
1224             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1225                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1226 }
1227
1228 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1229 {
1230         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1231                 return NULL;
1232
1233         if (rt6_check_expired(rt))
1234                 return NULL;
1235
1236         return &rt->dst;
1237 }
1238
1239 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1240 {
1241         if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1242             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1243                 return &rt->dst;
1244         else
1245                 return NULL;
1246 }
1247
1248 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1249 {
1250         struct rt6_info *rt;
1251
1252         rt = (struct rt6_info *) dst;
1253
1254         /* All IPv6 dsts are created with ->obsolete set to
1255          * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
1256          * down into this function.
1257          */
1258
1259         rt6_dst_from_metrics_check(rt);
1260
1261         if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1262                 return rt6_dst_from_check(rt, cookie);
1263         else
1264                 return rt6_check(rt, cookie);
1265 }
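/* RTF_PCPU clones and DST_NOCACHE dsts derive their validity from the
 * tree route they were copied from (rt->dst.from), so rt6_dst_from_check()
 * revalidates that parent; ordinary tree routes are checked directly
 * against their fib6 node's sernum via rt6_check().
 */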
1266
1267 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1268 {
1269         struct rt6_info *rt = (struct rt6_info *) dst;
1270
1271         if (rt) {
1272                 if (rt->rt6i_flags & RTF_CACHE) {
1273                         if (rt6_check_expired(rt)) {
1274                                 ip6_del_rt(rt);
1275                                 dst = NULL;
1276                         }
1277                 } else {
1278                         dst_release(dst);
1279                         dst = NULL;
1280                 }
1281         }
1282         return dst;
1283 }
1284
1285 static void ip6_link_failure(struct sk_buff *skb)
1286 {
1287         struct rt6_info *rt;
1288
1289         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1290
1291         rt = (struct rt6_info *) skb_dst(skb);
1292         if (rt) {
1293                 if (rt->rt6i_flags & RTF_CACHE) {
1294                         dst_hold(&rt->dst);
1295                         if (ip6_del_rt(rt))
1296                                 dst_free(&rt->dst);
1297                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1298                         rt->rt6i_node->fn_sernum = -1;
1299                 }
1300         }
1301 }
1302
1303 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1304 {
1305         struct net *net = dev_net(rt->dst.dev);
1306
1307         rt->rt6i_flags |= RTF_MODIFIED;
1308         rt->rt6i_pmtu = mtu;
1309         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1310 }
1311
1312 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1313                                  const struct ipv6hdr *iph, u32 mtu)
1314 {
1315         struct rt6_info *rt6 = (struct rt6_info *)dst;
1316
1317         if (rt6->rt6i_flags & RTF_LOCAL)
1318                 return;
1319
1320         dst_confirm(dst);
1321         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1322         if (mtu >= dst_mtu(dst))
1323                 return;
1324
1325         if (rt6->rt6i_flags & RTF_CACHE) {
1326                 rt6_do_update_pmtu(rt6, mtu);
1327         } else {
1328                 const struct in6_addr *daddr, *saddr;
1329                 struct rt6_info *nrt6;
1330
1331                 if (iph) {
1332                         daddr = &iph->daddr;
1333                         saddr = &iph->saddr;
1334                 } else if (sk) {
1335                         daddr = &sk->sk_v6_daddr;
1336                         saddr = &inet6_sk(sk)->saddr;
1337                 } else {
1338                         return;
1339                 }
1340                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1341                 if (nrt6) {
1342                         rt6_do_update_pmtu(nrt6, mtu);
1343
1344                         /* ip6_ins_rt(nrt6) will bump the
1345                          * rt6->rt6i_node->fn_sernum
1346                          * which will make the next rt6_check() fail and
1347                          * invalidate the sk->sk_dst_cache.
1348                          */
1349                         ip6_ins_rt(nrt6);
1350                 }
1351         }
1352 }
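/* Non-RTF_CACHE routes are never modified in place here; a host-route
 * clone carrying the learned PMTU is inserted instead, and the
 * fn_sernum bump from ip6_ins_rt() invalidates cached socket dsts.
 */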
1353
1354 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1355                                struct sk_buff *skb, u32 mtu)
1356 {
1357         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1358 }
1359
1360 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1361                      int oif, u32 mark)
1362 {
1363         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1364         struct dst_entry *dst;
1365         struct flowi6 fl6;
1366
1367         memset(&fl6, 0, sizeof(fl6));
1368         fl6.flowi6_oif = oif;
1369         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1370         fl6.daddr = iph->daddr;
1371         fl6.saddr = iph->saddr;
1372         fl6.flowlabel = ip6_flowinfo(iph);
1373
1374         dst = ip6_route_output(net, NULL, &fl6);
1375         if (!dst->error)
1376                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1377         dst_release(dst);
1378 }
1379 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1380
1381 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1382 {
1383         ip6_update_pmtu(skb, sock_net(sk), mtu,
1384                         sk->sk_bound_dev_if, sk->sk_mark);
1385 }
1386 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1387
1388 /* Handle redirects */
1389 struct ip6rd_flowi {
1390         struct flowi6 fl6;
1391         struct in6_addr gateway;
1392 };
1393
1394 static struct rt6_info *__ip6_route_redirect(struct net *net,
1395                                              struct fib6_table *table,
1396                                              struct flowi6 *fl6,
1397                                              int flags)
1398 {
1399         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1400         struct rt6_info *rt;
1401         struct fib6_node *fn;
1402
1403         /* Get the "current" route for this destination and
1404          * check if the redirect has come from an appropriate router.
1405          *
1406          * RFC 4861 specifies that redirects should only be
1407          * accepted if they come from the nexthop to the target.
1408          * Due to the way the routes are chosen, this notion
1409          * is a bit fuzzy and one might need to check all possible
1410          * routes.
1411          */
1412
1413         read_lock_bh(&table->tb6_lock);
1414         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1415 restart:
1416         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1417                 if (rt6_check_expired(rt))
1418                         continue;
1419                 if (rt->dst.error)
1420                         break;
1421                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1422                         continue;
1423                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1424                         continue;
1425                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1426                         continue;
1427                 break;
1428         }
1429
1430         if (!rt)
1431                 rt = net->ipv6.ip6_null_entry;
1432         else if (rt->dst.error) {
1433                 rt = net->ipv6.ip6_null_entry;
1434                 goto out;
1435         }
1436
1437         if (rt == net->ipv6.ip6_null_entry) {
1438                 fn = fib6_backtrack(fn, &fl6->saddr);
1439                 if (fn)
1440                         goto restart;
1441         }
1442
1443 out:
1444         dst_hold(&rt->dst);
1445
1446         read_unlock_bh(&table->tb6_lock);
1447
1448         return rt;
1449 };
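/* Per the RFC 4861 note above, a candidate route must be unexpired,
 * have a gateway, sit on the interface the redirect arrived on, and
 * name rdfl->gateway as its next hop before it is accepted.
 */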
1450
1451 static struct dst_entry *ip6_route_redirect(struct net *net,
1452                                         const struct flowi6 *fl6,
1453                                         const struct in6_addr *gateway)
1454 {
1455         int flags = RT6_LOOKUP_F_HAS_SADDR;
1456         struct ip6rd_flowi rdfl;
1457
1458         rdfl.fl6 = *fl6;
1459         rdfl.gateway = *gateway;
1460
1461         return fib6_rule_lookup(net, &rdfl.fl6,
1462                                 flags, __ip6_route_redirect);
1463 }
1464
1465 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1466 {
1467         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1468         struct dst_entry *dst;
1469         struct flowi6 fl6;
1470
1471         memset(&fl6, 0, sizeof(fl6));
1472         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1473         fl6.flowi6_oif = oif;
1474         fl6.flowi6_mark = mark;
1475         fl6.daddr = iph->daddr;
1476         fl6.saddr = iph->saddr;
1477         fl6.flowlabel = ip6_flowinfo(iph);
1478
1479         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1480         rt6_do_redirect(dst, NULL, skb);
1481         dst_release(dst);
1482 }
1483 EXPORT_SYMBOL_GPL(ip6_redirect);
1484
1485 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1486                             u32 mark)
1487 {
1488         const struct ipv6hdr *iph = ipv6_hdr(skb);
1489         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1490         struct dst_entry *dst;
1491         struct flowi6 fl6;
1492
1493         memset(&fl6, 0, sizeof(fl6));
1494         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1495         fl6.flowi6_oif = oif;
1496         fl6.flowi6_mark = mark;
1497         fl6.daddr = msg->dest;
1498         fl6.saddr = iph->daddr;
1499
1500         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1501         rt6_do_redirect(dst, NULL, skb);
1502         dst_release(dst);
1503 }
1504
1505 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1506 {
1507         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1508 }
1509 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1510
1511 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1512 {
1513         struct net_device *dev = dst->dev;
1514         unsigned int mtu = dst_mtu(dst);
1515         struct net *net = dev_net(dev);
1516
1517         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1518
1519         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1520                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1521
1522         /*
1523          * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN, and the
1524          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1525          * Advertising IPV6_MAXPLEN itself is also valid and means: "any
1526          * MSS, rely only on PMTU discovery".
1527          */
1528         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1529                 mtu = IPV6_MAXPLEN;
1530         return mtu;
1531 }
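/* Example: on a 1500 byte MTU link the advertised MSS is
 * 1500 - 40 (IPv6 header) - 20 (TCP header) = 1440 bytes, clamped from
 * below by ip6_rt_min_advmss and from above by the IPV6_MAXPLEN rule
 * described in the comment above.
 */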
1532
1533 static unsigned int ip6_mtu(const struct dst_entry *dst)
1534 {
1535         const struct rt6_info *rt = (const struct rt6_info *)dst;
1536         unsigned int mtu = rt->rt6i_pmtu;
1537         struct inet6_dev *idev;
1538
1539         if (mtu)
1540                 goto out;
1541
1542         mtu = dst_metric_raw(dst, RTAX_MTU);
1543         if (mtu)
1544                 goto out;
1545
1546         mtu = IPV6_MIN_MTU;
1547
1548         rcu_read_lock();
1549         idev = __in6_dev_get(dst->dev);
1550         if (idev)
1551                 mtu = idev->cnf.mtu6;
1552         rcu_read_unlock();
1553
1554 out:
1555         return min_t(unsigned int, mtu, IP6_MAX_MTU);
1556 }
1557
1558 static struct dst_entry *icmp6_dst_gc_list;
1559 static DEFINE_SPINLOCK(icmp6_dst_lock);
1560
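/* Allocate a dst for ICMPv6 output that is never inserted into the FIB.
 * The entry is chained on icmp6_dst_gc_list so icmp6_dst_gc() can reap
 * it once its refcount drops to zero, and the result is run through
 * xfrm_lookup() before being returned.
 */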
1561 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1562                                   struct flowi6 *fl6)
1563 {
1564         struct dst_entry *dst;
1565         struct rt6_info *rt;
1566         struct inet6_dev *idev = in6_dev_get(dev);
1567         struct net *net = dev_net(dev);
1568
1569         if (unlikely(!idev))
1570                 return ERR_PTR(-ENODEV);
1571
1572         rt = ip6_dst_alloc(net, dev, 0, NULL);
1573         if (unlikely(!rt)) {
1574                 in6_dev_put(idev);
1575                 dst = ERR_PTR(-ENOMEM);
1576                 goto out;
1577         }
1578
1579         rt->dst.flags |= DST_HOST;
1580         rt->dst.output  = ip6_output;
1581         atomic_set(&rt->dst.__refcnt, 1);
1582         rt->rt6i_gateway  = fl6->daddr;
1583         rt->rt6i_dst.addr = fl6->daddr;
1584         rt->rt6i_dst.plen = 128;
1585         rt->rt6i_idev     = idev;
1586         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1587
1588         spin_lock_bh(&icmp6_dst_lock);
1589         rt->dst.next = icmp6_dst_gc_list;
1590         icmp6_dst_gc_list = &rt->dst;
1591         spin_unlock_bh(&icmp6_dst_lock);
1592
1593         fib6_force_start_gc(net);
1594
1595         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1596
1597 out:
1598         return dst;
1599 }
1600
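/* Reap unreferenced entries from icmp6_dst_gc_list.  Returns the number
 * of entries that are still in use and therefore left on the list.
 */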
1601 int icmp6_dst_gc(void)
1602 {
1603         struct dst_entry *dst, **pprev;
1604         int more = 0;
1605
1606         spin_lock_bh(&icmp6_dst_lock);
1607         pprev = &icmp6_dst_gc_list;
1608
1609         while ((dst = *pprev) != NULL) {
1610                 if (!atomic_read(&dst->__refcnt)) {
1611                         *pprev = dst->next;
1612                         dst_free(dst);
1613                 } else {
1614                         pprev = &dst->next;
1615                         ++more;
1616                 }
1617         }
1618
1619         spin_unlock_bh(&icmp6_dst_lock);
1620
1621         return more;
1622 }
1623
1624 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1625                             void *arg)
1626 {
1627         struct dst_entry *dst, **pprev;
1628
1629         spin_lock_bh(&icmp6_dst_lock);
1630         pprev = &icmp6_dst_gc_list;
1631         while ((dst = *pprev) != NULL) {
1632                 struct rt6_info *rt = (struct rt6_info *) dst;
1633                 if (func(rt, arg)) {
1634                         *pprev = dst->next;
1635                         dst_free(dst);
1636                 } else {
1637                         pprev = &dst->next;
1638                 }
1639         }
1640         spin_unlock_bh(&icmp6_dst_lock);
1641 }
1642
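/* dst_ops garbage collector: skip the run while the entry count is
 * within ip6_rt_max_size and the minimum GC interval has not elapsed;
 * otherwise invoke fib6_run_gc() with a gradually increasing expiry.
 * Returns non-zero if the table is still over ip6_rt_max_size.
 */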
1643 static int ip6_dst_gc(struct dst_ops *ops)
1644 {
1645         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1646         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1647         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1648         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1649         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1650         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1651         int entries;
1652
1653         entries = dst_entries_get_fast(ops);
1654         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1655             entries <= rt_max_size)
1656                 goto out;
1657
1658         net->ipv6.ip6_rt_gc_expire++;
1659         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1660         entries = dst_entries_get_slow(ops);
1661         if (entries < ops->gc_thresh)
1662                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1663 out:
1664         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1665         return entries > rt_max_size;
1666 }
1667
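/* Convert the RTA_METRICS attributes in cfg into the RTAX_MAX-sized
 * metrics array of mxc.  RTAX_CC_ALGO is translated from a congestion
 * control name to its key; an out-of-range metric type or an unknown
 * congestion control name fails the conversion with -EINVAL.
 */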
1668 static int ip6_convert_metrics(struct mx6_config *mxc,
1669                                const struct fib6_config *cfg)
1670 {
1671         struct nlattr *nla;
1672         int remaining;
1673         u32 *mp;
1674
1675         if (!cfg->fc_mx)
1676                 return 0;
1677
1678         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1679         if (unlikely(!mp))
1680                 return -ENOMEM;
1681
1682         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1683                 int type = nla_type(nla);
1684
1685                 if (type) {
1686                         u32 val;
1687
1688                         if (unlikely(type > RTAX_MAX))
1689                                 goto err;
1690                         if (type == RTAX_CC_ALGO) {
1691                                 char tmp[TCP_CA_NAME_MAX];
1692
1693                                 nla_strlcpy(tmp, nla, sizeof(tmp));
1694                                 val = tcp_ca_get_key_by_name(tmp);
1695                                 if (val == TCP_CA_UNSPEC)
1696                                         goto err;
1697                         } else {
1698                                 val = nla_get_u32(nla);
1699                         }
1700
1701                         mp[type - 1] = val;
1702                         __set_bit(type - 1, mxc->mx_valid);
1703                 }
1704         }
1705
1706         mxc->mx = mp;
1707
1708         return 0;
1709  err:
1710         kfree(mp);
1711         return -EINVAL;
1712 }
1713
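/* Build an rt6_info from a fib6_config and insert it into the proper
 * FIB table: validate the prefix lengths, resolve the output device,
 * gateway and preferred source, set up lightweight tunnel state when
 * requested, convert the metrics and finally call __ip6_ins_rt().
 */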
1714 int ip6_route_add(struct fib6_config *cfg)
1715 {
1716         int err;
1717         struct net *net = cfg->fc_nlinfo.nl_net;
1718         struct rt6_info *rt = NULL;
1719         struct net_device *dev = NULL;
1720         struct inet6_dev *idev = NULL;
1721         struct fib6_table *table;
1722         struct mx6_config mxc = { .mx = NULL, };
1723         int addr_type;
1724
1725         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1726                 return -EINVAL;
1727 #ifndef CONFIG_IPV6_SUBTREES
1728         if (cfg->fc_src_len)
1729                 return -EINVAL;
1730 #endif
1731         if (cfg->fc_ifindex) {
1732                 err = -ENODEV;
1733                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1734                 if (!dev)
1735                         goto out;
1736                 idev = in6_dev_get(dev);
1737                 if (!idev)
1738                         goto out;
1739         }
1740
1741         if (cfg->fc_metric == 0)
1742                 cfg->fc_metric = IP6_RT_PRIO_USER;
1743
1744         err = -ENOBUFS;
1745         if (cfg->fc_nlinfo.nlh &&
1746             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1747                 table = fib6_get_table(net, cfg->fc_table);
1748                 if (!table) {
1749                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1750                         table = fib6_new_table(net, cfg->fc_table);
1751                 }
1752         } else {
1753                 table = fib6_new_table(net, cfg->fc_table);
1754         }
1755
1756         if (!table)
1757                 goto out;
1758
1759         rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table);
1760
1761         if (!rt) {
1762                 err = -ENOMEM;
1763                 goto out;
1764         }
1765
1766         if (cfg->fc_flags & RTF_EXPIRES)
1767                 rt6_set_expires(rt, jiffies +
1768                                 clock_t_to_jiffies(cfg->fc_expires));
1769         else
1770                 rt6_clean_expires(rt);
1771
1772         if (cfg->fc_protocol == RTPROT_UNSPEC)
1773                 cfg->fc_protocol = RTPROT_BOOT;
1774         rt->rt6i_protocol = cfg->fc_protocol;
1775
1776         addr_type = ipv6_addr_type(&cfg->fc_dst);
1777
1778         if (addr_type & IPV6_ADDR_MULTICAST)
1779                 rt->dst.input = ip6_mc_input;
1780         else if (cfg->fc_flags & RTF_LOCAL)
1781                 rt->dst.input = ip6_input;
1782         else
1783                 rt->dst.input = ip6_forward;
1784
1785         rt->dst.output = ip6_output;
1786
1787         if (cfg->fc_encap) {
1788                 struct lwtunnel_state *lwtstate;
1789
1790                 err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1791                                            cfg->fc_encap, &lwtstate);
1792                 if (err)
1793                         goto out;
1794                 rt->dst.lwtstate = lwtstate_get(lwtstate);
1795                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1796                         rt->dst.lwtstate->orig_output = rt->dst.output;
1797                         rt->dst.output = lwtunnel_output;
1798                 }
1799                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1800                         rt->dst.lwtstate->orig_input = rt->dst.input;
1801                         rt->dst.input = lwtunnel_input;
1802                 }
1803         }
1804
1805         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1806         rt->rt6i_dst.plen = cfg->fc_dst_len;
1807         if (rt->rt6i_dst.plen == 128)
1808                 rt->dst.flags |= DST_HOST;
1809
1810 #ifdef CONFIG_IPV6_SUBTREES
1811         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1812         rt->rt6i_src.plen = cfg->fc_src_len;
1813 #endif
1814
1815         rt->rt6i_metric = cfg->fc_metric;
1816
1817         /* We cannot add true routes via loopback here;
1818            they would result in kernel looping. Promote them to reject routes.
1819          */
1820         if ((cfg->fc_flags & RTF_REJECT) ||
1821             (dev && (dev->flags & IFF_LOOPBACK) &&
1822              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1823              !(cfg->fc_flags & RTF_LOCAL))) {
1824                 /* hold loopback dev/idev if we haven't done so. */
1825                 if (dev != net->loopback_dev) {
1826                         if (dev) {
1827                                 dev_put(dev);
1828                                 in6_dev_put(idev);
1829                         }
1830                         dev = net->loopback_dev;
1831                         dev_hold(dev);
1832                         idev = in6_dev_get(dev);
1833                         if (!idev) {
1834                                 err = -ENODEV;
1835                                 goto out;
1836                         }
1837                 }
1838                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1839                 switch (cfg->fc_type) {
1840                 case RTN_BLACKHOLE:
1841                         rt->dst.error = -EINVAL;
1842                         rt->dst.output = dst_discard_sk;
1843                         rt->dst.input = dst_discard;
1844                         break;
1845                 case RTN_PROHIBIT:
1846                         rt->dst.error = -EACCES;
1847                         rt->dst.output = ip6_pkt_prohibit_out;
1848                         rt->dst.input = ip6_pkt_prohibit;
1849                         break;
1850                 case RTN_THROW:
1851                 default:
1852                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1853                                         : -ENETUNREACH;
1854                         rt->dst.output = ip6_pkt_discard_out;
1855                         rt->dst.input = ip6_pkt_discard;
1856                         break;
1857                 }
1858                 goto install_route;
1859         }
1860
1861         if (cfg->fc_flags & RTF_GATEWAY) {
1862                 const struct in6_addr *gw_addr;
1863                 int gwa_type;
1864
1865                 gw_addr = &cfg->fc_gateway;
1866                 gwa_type = ipv6_addr_type(gw_addr);
1867
1868                 /* If gw_addr is local we will fail to detect this in case
1869                  * the address is still TENTATIVE (DAD in progress). rt6_lookup()
1870                  * will return the already-added prefix route via the interface
1871                  * that the prefix route was assigned to, which might be non-loopback.
1872                  */
1873                 err = -EINVAL;
1874                 if (ipv6_chk_addr_and_flags(net, gw_addr,
1875                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
1876                                             dev : NULL, 0, 0))
1877                         goto out;
1878
1879                 rt->rt6i_gateway = *gw_addr;
1880
1881                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1882                         struct rt6_info *grt;
1883
1884                         /* IPv6 strictly inhibits using non-link-local
1885                            addresses as nexthop addresses.
1886                            Otherwise, the router will not be able to send redirects.
1887                            That is a good rule, but in some (rare!) circumstances
1888                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1889                            some exceptions. --ANK
1890                          */
1891                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1892                                 goto out;
1893
1894                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1895
1896                         err = -EHOSTUNREACH;
1897                         if (!grt)
1898                                 goto out;
1899                         if (dev) {
1900                                 if (dev != grt->dst.dev) {
1901                                         ip6_rt_put(grt);
1902                                         goto out;
1903                                 }
1904                         } else {
1905                                 dev = grt->dst.dev;
1906                                 idev = grt->rt6i_idev;
1907                                 dev_hold(dev);
1908                                 in6_dev_hold(grt->rt6i_idev);
1909                         }
1910                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1911                                 err = 0;
1912                         ip6_rt_put(grt);
1913
1914                         if (err)
1915                                 goto out;
1916                 }
1917                 err = -EINVAL;
1918                 if (!dev || (dev->flags & IFF_LOOPBACK))
1919                         goto out;
1920         }
1921
1922         err = -ENODEV;
1923         if (!dev)
1924                 goto out;
1925
1926         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1927                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1928                         err = -EINVAL;
1929                         goto out;
1930                 }
1931                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1932                 rt->rt6i_prefsrc.plen = 128;
1933         } else
1934                 rt->rt6i_prefsrc.plen = 0;
1935
1936         rt->rt6i_flags = cfg->fc_flags;
1937
1938 install_route:
1939         rt->dst.dev = dev;
1940         rt->rt6i_idev = idev;
1941         rt->rt6i_table = table;
1942
1943         cfg->fc_nlinfo.nl_net = dev_net(dev);
1944
1945         err = ip6_convert_metrics(&mxc, cfg);
1946         if (err)
1947                 goto out;
1948
1949         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
1950
1951         kfree(mxc.mx);
1952         return err;
1953 out:
1954         if (dev)
1955                 dev_put(dev);
1956         if (idev)
1957                 in6_dev_put(idev);
1958         if (rt)
1959                 dst_free(&rt->dst);
1960         return err;
1961 }
1962
1963 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1964 {
1965         int err;
1966         struct fib6_table *table;
1967         struct net *net = dev_net(rt->dst.dev);
1968
1969         if (rt == net->ipv6.ip6_null_entry) {
1970                 err = -ENOENT;
1971                 goto out;
1972         }
1973
1974         table = rt->rt6i_table;
1975         write_lock_bh(&table->tb6_lock);
1976         err = fib6_del(rt, info);
1977         write_unlock_bh(&table->tb6_lock);
1978
1979 out:
1980         ip6_rt_put(rt);
1981         return err;
1982 }
1983
1984 int ip6_del_rt(struct rt6_info *rt)
1985 {
1986         struct nl_info info = {
1987                 .nl_net = dev_net(rt->dst.dev),
1988         };
1989         return __ip6_del_rt(rt, &info);
1990 }
1991
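/* Delete the first route in the configured table that matches the
 * destination/source prefix and, when given, the interface, gateway and
 * metric.  RTF_CACHE entries are only considered when the request
 * itself carries RTF_CACHE.
 */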
1992 static int ip6_route_del(struct fib6_config *cfg)
1993 {
1994         struct fib6_table *table;
1995         struct fib6_node *fn;
1996         struct rt6_info *rt;
1997         int err = -ESRCH;
1998
1999         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2000         if (!table)
2001                 return err;
2002
2003         read_lock_bh(&table->tb6_lock);
2004
2005         fn = fib6_locate(&table->tb6_root,
2006                          &cfg->fc_dst, cfg->fc_dst_len,
2007                          &cfg->fc_src, cfg->fc_src_len);
2008
2009         if (fn) {
2010                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2011                         if ((rt->rt6i_flags & RTF_CACHE) &&
2012                             !(cfg->fc_flags & RTF_CACHE))
2013                                 continue;
2014                         if (cfg->fc_ifindex &&
2015                             (!rt->dst.dev ||
2016                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2017                                 continue;
2018                         if (cfg->fc_flags & RTF_GATEWAY &&
2019                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2020                                 continue;
2021                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2022                                 continue;
2023                         dst_hold(&rt->dst);
2024                         read_unlock_bh(&table->tb6_lock);
2025
2026                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2027                 }
2028         }
2029         read_unlock_bh(&table->tb6_lock);
2030
2031         return err;
2032 }
2033
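/* Core ICMPv6 redirect processing: validate the redirect message and
 * its ND options, confirm the old dst, update the neighbour entry for
 * the new next hop, install an RTF_CACHE route toward the redirected
 * destination and raise a NETEVENT_REDIRECT notification.
 */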
2034 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2035 {
2036         struct net *net = dev_net(skb->dev);
2037         struct netevent_redirect netevent;
2038         struct rt6_info *rt, *nrt = NULL;
2039         struct ndisc_options ndopts;
2040         struct inet6_dev *in6_dev;
2041         struct neighbour *neigh;
2042         struct rd_msg *msg;
2043         int optlen, on_link;
2044         u8 *lladdr;
2045
2046         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2047         optlen -= sizeof(*msg);
2048
2049         if (optlen < 0) {
2050                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2051                 return;
2052         }
2053
2054         msg = (struct rd_msg *)icmp6_hdr(skb);
2055
2056         if (ipv6_addr_is_multicast(&msg->dest)) {
2057                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2058                 return;
2059         }
2060
2061         on_link = 0;
2062         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2063                 on_link = 1;
2064         } else if (ipv6_addr_type(&msg->target) !=
2065                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2066                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2067                 return;
2068         }
2069
2070         in6_dev = __in6_dev_get(skb->dev);
2071         if (!in6_dev)
2072                 return;
2073         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2074                 return;
2075
2076         /* RFC2461 8.1:
2077          *      The IP source address of the Redirect MUST be the same as the current
2078          *      first-hop router for the specified ICMP Destination Address.
2079          */
2080
2081         if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
2082                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2083                 return;
2084         }
2085
2086         lladdr = NULL;
2087         if (ndopts.nd_opts_tgt_lladdr) {
2088                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2089                                              skb->dev);
2090                 if (!lladdr) {
2091                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2092                         return;
2093                 }
2094         }
2095
2096         rt = (struct rt6_info *) dst;
2097         if (rt == net->ipv6.ip6_null_entry) {
2098                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2099                 return;
2100         }
2101
2102         /* Redirect received -> path was valid.
2103          * Redirects are sent only in response to data packets,
2104          * so this nexthop is apparently reachable. --ANK
2105          */
2106         dst_confirm(&rt->dst);
2107
2108         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2109         if (!neigh)
2110                 return;
2111
2112         /*
2113          *      We have finally decided to accept it.
2114          */
2115
2116         neigh_update(neigh, lladdr, NUD_STALE,
2117                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2118                      NEIGH_UPDATE_F_OVERRIDE|
2119                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2120                                      NEIGH_UPDATE_F_ISROUTER))
2121                      );
2122
2123         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2124         if (!nrt)
2125                 goto out;
2126
2127         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2128         if (on_link)
2129                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2130
2131         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2132
2133         if (ip6_ins_rt(nrt))
2134                 goto out;
2135
2136         netevent.old = &rt->dst;
2137         netevent.new = &nrt->dst;
2138         netevent.daddr = &msg->dest;
2139         netevent.neigh = neigh;
2140         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2141
2142         if (rt->rt6i_flags & RTF_CACHE) {
2143                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2144                 ip6_del_rt(rt);
2145         }
2146
2147 out:
2148         neigh_release(neigh);
2149 }
2150
2151 /*
2152  *      Misc support functions
2153  */
2154
2155 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2156 {
2157         BUG_ON(from->dst.from);
2158
2159         rt->rt6i_flags &= ~RTF_EXPIRES;
2160         dst_hold(&from->dst);
2161         rt->dst.from = &from->dst;
2162         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2163 }
2164
2165 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2166 {
2167         rt->dst.input = ort->dst.input;
2168         rt->dst.output = ort->dst.output;
2169         rt->rt6i_dst = ort->rt6i_dst;
2170         rt->dst.error = ort->dst.error;
2171         rt->rt6i_idev = ort->rt6i_idev;
2172         if (rt->rt6i_idev)
2173                 in6_dev_hold(rt->rt6i_idev);
2174         rt->dst.lastuse = jiffies;
2175         rt->rt6i_gateway = ort->rt6i_gateway;
2176         rt->rt6i_flags = ort->rt6i_flags;
2177         rt6_set_from(rt, ort);
2178         rt->rt6i_metric = ort->rt6i_metric;
2179 #ifdef CONFIG_IPV6_SUBTREES
2180         rt->rt6i_src = ort->rt6i_src;
2181 #endif
2182         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2183         rt->rt6i_table = ort->rt6i_table;
2184         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2185 }
2186
2187 #ifdef CONFIG_IPV6_ROUTE_INFO
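/* Find a route learned from an RA Route Information option in
 * RT6_TABLE_INFO matching the given prefix, gateway and interface.
 */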
2188 static struct rt6_info *rt6_get_route_info(struct net *net,
2189                                            const struct in6_addr *prefix, int prefixlen,
2190                                            const struct in6_addr *gwaddr, int ifindex)
2191 {
2192         struct fib6_node *fn;
2193         struct rt6_info *rt = NULL;
2194         struct fib6_table *table;
2195
2196         table = fib6_get_table(net, RT6_TABLE_INFO);
2197         if (!table)
2198                 return NULL;
2199
2200         read_lock_bh(&table->tb6_lock);
2201         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2202         if (!fn)
2203                 goto out;
2204
2205         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2206                 if (rt->dst.dev->ifindex != ifindex)
2207                         continue;
2208                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2209                         continue;
2210                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2211                         continue;
2212                 dst_hold(&rt->dst);
2213                 break;
2214         }
2215 out:
2216         read_unlock_bh(&table->tb6_lock);
2217         return rt;
2218 }
2219
2220 static struct rt6_info *rt6_add_route_info(struct net *net,
2221                                            const struct in6_addr *prefix, int prefixlen,
2222                                            const struct in6_addr *gwaddr, int ifindex,
2223                                            unsigned int pref)
2224 {
2225         struct fib6_config cfg = {
2226                 .fc_table       = RT6_TABLE_INFO,
2227                 .fc_metric      = IP6_RT_PRIO_USER,
2228                 .fc_ifindex     = ifindex,
2229                 .fc_dst_len     = prefixlen,
2230                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2231                                   RTF_UP | RTF_PREF(pref),
2232                 .fc_nlinfo.portid = 0,
2233                 .fc_nlinfo.nlh = NULL,
2234                 .fc_nlinfo.nl_net = net,
2235         };
2236
2237         cfg.fc_dst = *prefix;
2238         cfg.fc_gateway = *gwaddr;
2239
2240         /* We should treat it as a default route if prefix length is 0. */
2241         if (!prefixlen)
2242                 cfg.fc_flags |= RTF_DEFAULT;
2243
2244         ip6_route_add(&cfg);
2245
2246         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2247 }
2248 #endif
2249
2250 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2251 {
2252         struct rt6_info *rt;
2253         struct fib6_table *table;
2254
2255         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2256         if (!table)
2257                 return NULL;
2258
2259         read_lock_bh(&table->tb6_lock);
2260         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2261                 if (dev == rt->dst.dev &&
2262                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2263                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2264                         break;
2265         }
2266         if (rt)
2267                 dst_hold(&rt->dst);
2268         read_unlock_bh(&table->tb6_lock);
2269         return rt;
2270 }
2271
2272 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2273                                      struct net_device *dev,
2274                                      unsigned int pref)
2275 {
2276         struct fib6_config cfg = {
2277                 .fc_table       = RT6_TABLE_DFLT,
2278                 .fc_metric      = IP6_RT_PRIO_USER,
2279                 .fc_ifindex     = dev->ifindex,
2280                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2281                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2282                 .fc_nlinfo.portid = 0,
2283                 .fc_nlinfo.nlh = NULL,
2284                 .fc_nlinfo.nl_net = dev_net(dev),
2285         };
2286
2287         cfg.fc_gateway = *gwaddr;
2288
2289         ip6_route_add(&cfg);
2290
2291         return rt6_get_dflt_router(gwaddr, dev);
2292 }
2293
2294 void rt6_purge_dflt_routers(struct net *net)
2295 {
2296         struct rt6_info *rt;
2297         struct fib6_table *table;
2298
2299         /* NOTE: Keep consistent with rt6_get_dflt_router */
2300         table = fib6_get_table(net, RT6_TABLE_DFLT);
2301         if (!table)
2302                 return;
2303
2304 restart:
2305         read_lock_bh(&table->tb6_lock);
2306         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2307                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2308                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2309                         dst_hold(&rt->dst);
2310                         read_unlock_bh(&table->tb6_lock);
2311                         ip6_del_rt(rt);
2312                         goto restart;
2313                 }
2314         }
2315         read_unlock_bh(&table->tb6_lock);
2316 }
2317
2318 static void rtmsg_to_fib6_config(struct net *net,
2319                                  struct in6_rtmsg *rtmsg,
2320                                  struct fib6_config *cfg)
2321 {
2322         memset(cfg, 0, sizeof(*cfg));
2323
2324         cfg->fc_table = RT6_TABLE_MAIN;
2325         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2326         cfg->fc_metric = rtmsg->rtmsg_metric;
2327         cfg->fc_expires = rtmsg->rtmsg_info;
2328         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2329         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2330         cfg->fc_flags = rtmsg->rtmsg_flags;
2331
2332         cfg->fc_nlinfo.nl_net = net;
2333
2334         cfg->fc_dst = rtmsg->rtmsg_dst;
2335         cfg->fc_src = rtmsg->rtmsg_src;
2336         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2337 }
2338
2339 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2340 {
2341         struct fib6_config cfg;
2342         struct in6_rtmsg rtmsg;
2343         int err;
2344
2345         switch (cmd) {
2346         case SIOCADDRT:         /* Add a route */
2347         case SIOCDELRT:         /* Delete a route */
2348                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2349                         return -EPERM;
2350                 err = copy_from_user(&rtmsg, arg,
2351                                      sizeof(struct in6_rtmsg));
2352                 if (err)
2353                         return -EFAULT;
2354
2355                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2356
2357                 rtnl_lock();
2358                 switch (cmd) {
2359                 case SIOCADDRT:
2360                         err = ip6_route_add(&cfg);
2361                         break;
2362                 case SIOCDELRT:
2363                         err = ip6_route_del(&cfg);
2364                         break;
2365                 default:
2366                         err = -EINVAL;
2367                 }
2368                 rtnl_unlock();
2369
2370                 return err;
2371         }
2372
2373         return -EINVAL;
2374 }
2375
2376 /*
2377  *      Drop the packet on the floor
2378  */
2379
2380 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2381 {
2382         int type;
2383         struct dst_entry *dst = skb_dst(skb);
2384         switch (ipstats_mib_noroutes) {
2385         case IPSTATS_MIB_INNOROUTES:
2386                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2387                 if (type == IPV6_ADDR_ANY) {
2388                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2389                                       IPSTATS_MIB_INADDRERRORS);
2390                         break;
2391                 }
2392                 /* FALLTHROUGH */
2393         case IPSTATS_MIB_OUTNOROUTES:
2394                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2395                               ipstats_mib_noroutes);
2396                 break;
2397         }
2398         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2399         kfree_skb(skb);
2400         return 0;
2401 }
2402
2403 static int ip6_pkt_discard(struct sk_buff *skb)
2404 {
2405         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2406 }
2407
2408 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
2409 {
2410         skb->dev = skb_dst(skb)->dev;
2411         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2412 }
2413
2414 static int ip6_pkt_prohibit(struct sk_buff *skb)
2415 {
2416         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2417 }
2418
2419 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
2420 {
2421         skb->dev = skb_dst(skb)->dev;
2422         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2423 }
2424
2425 /*
2426  *      Allocate a dst for a local (unicast / anycast) address.
2427  */
2428
2429 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2430                                     const struct in6_addr *addr,
2431                                     bool anycast)
2432 {
2433         struct net *net = dev_net(idev->dev);
2434         struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2435                                             DST_NOCOUNT, NULL);
2436         if (!rt)
2437                 return ERR_PTR(-ENOMEM);
2438
2439         in6_dev_hold(idev);
2440
2441         rt->dst.flags |= DST_HOST;
2442         rt->dst.input = ip6_input;
2443         rt->dst.output = ip6_output;
2444         rt->rt6i_idev = idev;
2445
2446         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2447         if (anycast)
2448                 rt->rt6i_flags |= RTF_ANYCAST;
2449         else
2450                 rt->rt6i_flags |= RTF_LOCAL;
2451
2452         rt->rt6i_gateway  = *addr;
2453         rt->rt6i_dst.addr = *addr;
2454         rt->rt6i_dst.plen = 128;
2455         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2456
2457         atomic_set(&rt->dst.__refcnt, 1);
2458
2459         return rt;
2460 }
2461
2462 int ip6_route_get_saddr(struct net *net,
2463                         struct rt6_info *rt,
2464                         const struct in6_addr *daddr,
2465                         unsigned int prefs,
2466                         struct in6_addr *saddr)
2467 {
2468         struct inet6_dev *idev =
2469                 rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2470         int err = 0;
2471         if (rt && rt->rt6i_prefsrc.plen)
2472                 *saddr = rt->rt6i_prefsrc.addr;
2473         else
2474                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2475                                          daddr, prefs, saddr);
2476         return err;
2477 }
2478
2479 /* Remove a deleted IP address from prefsrc entries. */
2480 struct arg_dev_net_ip {
2481         struct net_device *dev;
2482         struct net *net;
2483         struct in6_addr *addr;
2484 };
2485
2486 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2487 {
2488         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2489         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2490         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2491
2492         if (((void *)rt->dst.dev == dev || !dev) &&
2493             rt != net->ipv6.ip6_null_entry &&
2494             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2495                 /* remove prefsrc entry */
2496                 rt->rt6i_prefsrc.plen = 0;
2497         }
2498         return 0;
2499 }
2500
2501 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2502 {
2503         struct net *net = dev_net(ifp->idev->dev);
2504         struct arg_dev_net_ip adni = {
2505                 .dev = ifp->idev->dev,
2506                 .net = net,
2507                 .addr = &ifp->addr,
2508         };
2509         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2510 }
2511
2512 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2513 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2514
2515 /* Remove routers and update dst entries when a gateway turns into a host. */
2516 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2517 {
2518         struct in6_addr *gateway = (struct in6_addr *)arg;
2519
2520         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2521              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2522              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2523                 return -1;
2524         }
2525         return 0;
2526 }
2527
2528 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2529 {
2530         fib6_clean_all(net, fib6_clean_tohost, gateway);
2531 }
2532
2533 struct arg_dev_net {
2534         struct net_device *dev;
2535         struct net *net;
2536 };
2537
2538 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2539 {
2540         const struct arg_dev_net *adn = arg;
2541         const struct net_device *dev = adn->dev;
2542
2543         if ((rt->dst.dev == dev || !dev) &&
2544             rt != adn->net->ipv6.ip6_null_entry)
2545                 return -1;
2546
2547         return 0;
2548 }
2549
2550 void rt6_ifdown(struct net *net, struct net_device *dev)
2551 {
2552         struct arg_dev_net adn = {
2553                 .dev = dev,
2554                 .net = net,
2555         };
2556
2557         fib6_clean_all(net, fib6_ifdown, &adn);
2558         icmp6_clean_all(fib6_ifdown, &adn);
2559         rt6_uncached_list_flush_dev(net, dev);
2560 }
2561
2562 struct rt6_mtu_change_arg {
2563         struct net_device *dev;
2564         unsigned int mtu;
2565 };
2566
2567 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2568 {
2569         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2570         struct inet6_dev *idev;
2571
2572         /* In IPv6, PMTU discovery is not optional,
2573            so the RTAX_MTU lock cannot disable it.
2574            We still use this lock to block changes
2575            caused by addrconf/ndisc.
2576         */
2577
2578         idev = __in6_dev_get(arg->dev);
2579         if (!idev)
2580                 return 0;
2581
2582         /* For an administrative MTU increase, there is no way to discover
2583            an IPv6 PMTU increase, so the PMTU increase should be updated here.
2584            Since RFC 1981 doesn't cover administrative MTU increases,
2585            updating the PMTU on increase is a MUST (e.g. jumbo frames).
2586          */
2587         /*
2588            If the new MTU is less than the route PMTU, this new MTU will be
2589            the lowest MTU in the path; update the route PMTU to reflect the
2590            decrease. If the new MTU is greater than the route PMTU, and the
2591            old MTU was the lowest MTU in the path, update the route PMTU
2592            to reflect the increase. In that case, if another node's MTU is
2593            also the lowest in the path, a Packet Too Big message will trigger
2594            PMTU discovery.
2595          */
2596         if (rt->dst.dev == arg->dev &&
2597             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2598                 if (rt->rt6i_flags & RTF_CACHE) {
2599                         /* For RTF_CACHE with rt6i_pmtu == 0
2600                          * (i.e. a redirected route),
2601                          * the metrics of its rt->dst.from have already
2602                          * been updated.
2603                          */
2604                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2605                                 rt->rt6i_pmtu = arg->mtu;
2606                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2607                            (dst_mtu(&rt->dst) < arg->mtu &&
2608                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2609                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2610                 }
2611         }
2612         return 0;
2613 }
2614
2615 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2616 {
2617         struct rt6_mtu_change_arg arg = {
2618                 .dev = dev,
2619                 .mtu = mtu,
2620         };
2621
2622         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2623 }
2624
2625 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2626         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2627         [RTA_OIF]               = { .type = NLA_U32 },
2628         [RTA_IIF]               = { .type = NLA_U32 },
2629         [RTA_PRIORITY]          = { .type = NLA_U32 },
2630         [RTA_METRICS]           = { .type = NLA_NESTED },
2631         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2632         [RTA_PREF]              = { .type = NLA_U8 },
2633         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2634         [RTA_ENCAP]             = { .type = NLA_NESTED },
2635 };
2636
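/* Translate an RTM_NEWROUTE/RTM_DELROUTE request into a fib6_config:
 * copy the rtmsg header fields and the RTA_* attributes (gateway,
 * destination/source prefixes, preferred source, metrics, multipath
 * nexthops, router preference and encapsulation) into the fc_* fields.
 */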
2637 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2638                               struct fib6_config *cfg)
2639 {
2640         struct rtmsg *rtm;
2641         struct nlattr *tb[RTA_MAX+1];
2642         unsigned int pref;
2643         int err;
2644
2645         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2646         if (err < 0)
2647                 goto errout;
2648
2649         err = -EINVAL;
2650         rtm = nlmsg_data(nlh);
2651         memset(cfg, 0, sizeof(*cfg));
2652
2653         cfg->fc_table = rtm->rtm_table;
2654         cfg->fc_dst_len = rtm->rtm_dst_len;
2655         cfg->fc_src_len = rtm->rtm_src_len;
2656         cfg->fc_flags = RTF_UP;
2657         cfg->fc_protocol = rtm->rtm_protocol;
2658         cfg->fc_type = rtm->rtm_type;
2659
2660         if (rtm->rtm_type == RTN_UNREACHABLE ||
2661             rtm->rtm_type == RTN_BLACKHOLE ||
2662             rtm->rtm_type == RTN_PROHIBIT ||
2663             rtm->rtm_type == RTN_THROW)
2664                 cfg->fc_flags |= RTF_REJECT;
2665
2666         if (rtm->rtm_type == RTN_LOCAL)
2667                 cfg->fc_flags |= RTF_LOCAL;
2668
2669         if (rtm->rtm_flags & RTM_F_CLONED)
2670                 cfg->fc_flags |= RTF_CACHE;
2671
2672         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2673         cfg->fc_nlinfo.nlh = nlh;
2674         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2675
2676         if (tb[RTA_GATEWAY]) {
2677                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2678                 cfg->fc_flags |= RTF_GATEWAY;
2679         }
2680
2681         if (tb[RTA_DST]) {
2682                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2683
2684                 if (nla_len(tb[RTA_DST]) < plen)
2685                         goto errout;
2686
2687                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2688         }
2689
2690         if (tb[RTA_SRC]) {
2691                 int plen = (rtm->rtm_src_len + 7) >> 3;
2692
2693                 if (nla_len(tb[RTA_SRC]) < plen)
2694                         goto errout;
2695
2696                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2697         }
2698
2699         if (tb[RTA_PREFSRC])
2700                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2701
2702         if (tb[RTA_OIF])
2703                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2704
2705         if (tb[RTA_PRIORITY])
2706                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2707
2708         if (tb[RTA_METRICS]) {
2709                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2710                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2711         }
2712
2713         if (tb[RTA_TABLE])
2714                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2715
2716         if (tb[RTA_MULTIPATH]) {
2717                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2718                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2719         }
2720
2721         if (tb[RTA_PREF]) {
2722                 pref = nla_get_u8(tb[RTA_PREF]);
2723                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2724                     pref != ICMPV6_ROUTER_PREF_HIGH)
2725                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2726                 cfg->fc_flags |= RTF_PREF(pref);
2727         }
2728
2729         if (tb[RTA_ENCAP])
2730                 cfg->fc_encap = tb[RTA_ENCAP];
2731
2732         if (tb[RTA_ENCAP_TYPE])
2733                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
2734
2735         err = 0;
2736 errout:
2737         return err;
2738 }
2739
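/* Add or delete every nexthop carried in an RTA_MULTIPATH attribute as
 * an individual route.  If adding a nexthop fails, the nexthops added
 * so far are removed again; on delete, failures are recorded but the
 * remaining nexthops are still processed.
 */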
2740 static int ip6_route_multipath(struct fib6_config *cfg, int add)
2741 {
2742         struct fib6_config r_cfg;
2743         struct rtnexthop *rtnh;
2744         int remaining;
2745         int attrlen;
2746         int err = 0, last_err = 0;
2747
2748         remaining = cfg->fc_mp_len;
2749 beginning:
2750         rtnh = (struct rtnexthop *)cfg->fc_mp;
2751
2752         /* Parse a Multipath Entry */
2753         while (rtnh_ok(rtnh, remaining)) {
2754                 memcpy(&r_cfg, cfg, sizeof(*cfg));
2755                 if (rtnh->rtnh_ifindex)
2756                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2757
2758                 attrlen = rtnh_attrlen(rtnh);
2759                 if (attrlen > 0) {
2760                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2761
2762                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2763                         if (nla) {
2764                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2765                                 r_cfg.fc_flags |= RTF_GATEWAY;
2766                         }
2767                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
2768                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
2769                         if (nla)
2770                                 r_cfg.fc_encap_type = nla_get_u16(nla);
2771                 }
2772                 err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2773                 if (err) {
2774                         last_err = err;
2775                         /* If we are trying to remove a route, do not stop the
2776                          * loop when ip6_route_del() fails (because next hop is
2777                          * already gone), we should try to remove all next hops.
2778                          */
2779                         if (add) {
2780                                 /* If add fails, we should try to delete all
2781                                  * next hops that have been already added.
2782                                  */
2783                                 add = 0;
2784                                 remaining = cfg->fc_mp_len - remaining;
2785                                 goto beginning;
2786                         }
2787                 }
2788                 /* Because each route is added as a single route, we remove
2789                  * these flags after the first nexthop: if there is a collision,
2790                  * we have already failed to add the first nexthop
2791                  * (fib6_add_rt2node() has rejected it); when replacing, the old
2792                  * nexthops have been replaced by the first new one, and the
2793                  * rest should be added to it.
2794                  */
2795                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2796                                                      NLM_F_REPLACE);
2797                 rtnh = rtnh_next(rtnh, &remaining);
2798         }
2799
2800         return last_err;
2801 }
2802
2803 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2804 {
2805         struct fib6_config cfg;
2806         int err;
2807
2808         err = rtm_to_fib6_config(skb, nlh, &cfg);
2809         if (err < 0)
2810                 return err;
2811
2812         if (cfg.fc_mp)
2813                 return ip6_route_multipath(&cfg, 0);
2814         else
2815                 return ip6_route_del(&cfg);
2816 }
2817
2818 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2819 {
2820         struct fib6_config cfg;
2821         int err;
2822
2823         err = rtm_to_fib6_config(skb, nlh, &cfg);
2824         if (err < 0)
2825                 return err;
2826
2827         if (cfg.fc_mp)
2828                 return ip6_route_multipath(&cfg, 1);
2829         else
2830                 return ip6_route_add(&cfg);
2831 }
2832
2833 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
2834 {
2835         return NLMSG_ALIGN(sizeof(struct rtmsg))
2836                + nla_total_size(16) /* RTA_SRC */
2837                + nla_total_size(16) /* RTA_DST */
2838                + nla_total_size(16) /* RTA_GATEWAY */
2839                + nla_total_size(16) /* RTA_PREFSRC */
2840                + nla_total_size(4) /* RTA_TABLE */
2841                + nla_total_size(4) /* RTA_IIF */
2842                + nla_total_size(4) /* RTA_OIF */
2843                + nla_total_size(4) /* RTA_PRIORITY */
2844                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2845                + nla_total_size(sizeof(struct rta_cacheinfo))
2846                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
2847                + nla_total_size(1) /* RTA_PREF */
2848                + lwtunnel_get_encap_size(rt->dst.lwtstate);
2849 }
2850
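/* Fill one RTM message describing rt: route type, table, flags,
 * addresses, metrics, cache info and any lwtunnel encapsulation.
 * Returns 1 when a prefix-only dump skips a non-prefix route, 0 on
 * success and -EMSGSIZE when the message does not fit into skb.
 */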
2851 static int rt6_fill_node(struct net *net,
2852                          struct sk_buff *skb, struct rt6_info *rt,
2853                          struct in6_addr *dst, struct in6_addr *src,
2854                          int iif, int type, u32 portid, u32 seq,
2855                          int prefix, int nowait, unsigned int flags)
2856 {
2857         u32 metrics[RTAX_MAX];
2858         struct rtmsg *rtm;
2859         struct nlmsghdr *nlh;
2860         long expires;
2861         u32 table;
2862
2863         if (prefix) {   /* user wants prefix routes only */
2864                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2865                         /* success since this is not a prefix route */
2866                         return 1;
2867                 }
2868         }
2869
2870         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
2871         if (!nlh)
2872                 return -EMSGSIZE;
2873
2874         rtm = nlmsg_data(nlh);
2875         rtm->rtm_family = AF_INET6;
2876         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2877         rtm->rtm_src_len = rt->rt6i_src.plen;
2878         rtm->rtm_tos = 0;
2879         if (rt->rt6i_table)
2880                 table = rt->rt6i_table->tb6_id;
2881         else
2882                 table = RT6_TABLE_UNSPEC;
2883         rtm->rtm_table = table;
2884         if (nla_put_u32(skb, RTA_TABLE, table))
2885                 goto nla_put_failure;
2886         if (rt->rt6i_flags & RTF_REJECT) {
2887                 switch (rt->dst.error) {
2888                 case -EINVAL:
2889                         rtm->rtm_type = RTN_BLACKHOLE;
2890                         break;
2891                 case -EACCES:
2892                         rtm->rtm_type = RTN_PROHIBIT;
2893                         break;
2894                 case -EAGAIN:
2895                         rtm->rtm_type = RTN_THROW;
2896                         break;
2897                 default:
2898                         rtm->rtm_type = RTN_UNREACHABLE;
2899                         break;
2900                 }
2901         }
2902         else if (rt->rt6i_flags & RTF_LOCAL)
2903                 rtm->rtm_type = RTN_LOCAL;
2904         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2905                 rtm->rtm_type = RTN_LOCAL;
2906         else
2907                 rtm->rtm_type = RTN_UNICAST;
2908         rtm->rtm_flags = 0;
2909         if (!netif_carrier_ok(rt->dst.dev)) {
2910                 rtm->rtm_flags |= RTNH_F_LINKDOWN;
2911                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
2912                         rtm->rtm_flags |= RTNH_F_DEAD;
2913         }
2914         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2915         rtm->rtm_protocol = rt->rt6i_protocol;
2916         if (rt->rt6i_flags & RTF_DYNAMIC)
2917                 rtm->rtm_protocol = RTPROT_REDIRECT;
2918         else if (rt->rt6i_flags & RTF_ADDRCONF) {
2919                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2920                         rtm->rtm_protocol = RTPROT_RA;
2921                 else
2922                         rtm->rtm_protocol = RTPROT_KERNEL;
2923         }
2924
2925         if (rt->rt6i_flags & RTF_CACHE)
2926                 rtm->rtm_flags |= RTM_F_CLONED;
2927
2928         if (dst) {
2929                 if (nla_put_in6_addr(skb, RTA_DST, dst))
2930                         goto nla_put_failure;
2931                 rtm->rtm_dst_len = 128;
2932         } else if (rtm->rtm_dst_len)
2933                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
2934                         goto nla_put_failure;
2935 #ifdef CONFIG_IPV6_SUBTREES
2936         if (src) {
2937                 if (nla_put_in6_addr(skb, RTA_SRC, src))
2938                         goto nla_put_failure;
2939                 rtm->rtm_src_len = 128;
2940         } else if (rtm->rtm_src_len &&
2941                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
2942                 goto nla_put_failure;
2943 #endif
2944         if (iif) {
2945 #ifdef CONFIG_IPV6_MROUTE
2946                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2947                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2948                         if (err <= 0) {
2949                                 if (!nowait) {
2950                                         if (err == 0)
2951                                                 return 0;
2952                                         goto nla_put_failure;
2953                                 } else {
2954                                         if (err == -EMSGSIZE)
2955                                                 goto nla_put_failure;
2956                                 }
2957                         }
2958                 } else
2959 #endif
2960                         if (nla_put_u32(skb, RTA_IIF, iif))
2961                                 goto nla_put_failure;
2962         } else if (dst) {
2963                 struct in6_addr saddr_buf;
2964                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2965                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2966                         goto nla_put_failure;
2967         }
2968
2969         if (rt->rt6i_prefsrc.plen) {
2970                 struct in6_addr saddr_buf;
2971                 saddr_buf = rt->rt6i_prefsrc.addr;
2972                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2973                         goto nla_put_failure;
2974         }
2975
2976         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2977         if (rt->rt6i_pmtu)
2978                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
2979         if (rtnetlink_put_metrics(skb, metrics) < 0)
2980                 goto nla_put_failure;
2981
2982         if (rt->rt6i_flags & RTF_GATEWAY) {
2983                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
2984                         goto nla_put_failure;
2985         }
2986
2987         if (rt->dst.dev &&
2988             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2989                 goto nla_put_failure;
2990         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2991                 goto nla_put_failure;
2992
2993         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2994
2995         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2996                 goto nla_put_failure;
2997
2998         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
2999                 goto nla_put_failure;
3000
3001         lwtunnel_fill_encap(skb, rt->dst.lwtstate);
3002
3003         nlmsg_end(skb, nlh);
3004         return 0;
3005
3006 nla_put_failure:
3007         nlmsg_cancel(skb, nlh);
3008         return -EMSGSIZE;
3009 }
3010
3011 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3012 {
3013         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3014         int prefix;
3015
3016         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3017                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3018                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
3019         } else
3020                 prefix = 0;
3021
3022         return rt6_fill_node(arg->net,
3023                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3024                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3025                      prefix, 0, NLM_F_MULTI);
3026 }
3027
3028 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3029 {
3030         struct net *net = sock_net(in_skb->sk);
3031         struct nlattr *tb[RTA_MAX+1];
3032         struct rt6_info *rt;
3033         struct sk_buff *skb;
3034         struct rtmsg *rtm;
3035         struct flowi6 fl6;
3036         int err, iif = 0, oif = 0;
3037
3038         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3039         if (err < 0)
3040                 goto errout;
3041
3042         err = -EINVAL;
3043         memset(&fl6, 0, sizeof(fl6));
3044
3045         if (tb[RTA_SRC]) {
3046                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3047                         goto errout;
3048
3049                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3050         }
3051
3052         if (tb[RTA_DST]) {
3053                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3054                         goto errout;
3055
3056                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3057         }
3058
3059         if (tb[RTA_IIF])
3060                 iif = nla_get_u32(tb[RTA_IIF]);
3061
3062         if (tb[RTA_OIF])
3063                 oif = nla_get_u32(tb[RTA_OIF]);
3064
3065         if (tb[RTA_MARK])
3066                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3067
3068         if (iif) {
3069                 struct net_device *dev;
3070                 int flags = 0;
3071
3072                 dev = __dev_get_by_index(net, iif);
3073                 if (!dev) {
3074                         err = -ENODEV;
3075                         goto errout;
3076                 }
3077
3078                 fl6.flowi6_iif = iif;
3079
3080                 if (!ipv6_addr_any(&fl6.saddr))
3081                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3082
3083                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3084                                                                flags);
3085         } else {
3086                 fl6.flowi6_oif = oif;
3087
3088                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3089         }
3090
3091         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3092         if (!skb) {
3093                 ip6_rt_put(rt);
3094                 err = -ENOBUFS;
3095                 goto errout;
3096         }
3097
3098         /* Reserve room for dummy headers; this skb can pass
3099          * through a good chunk of the routing engine.
3100          */
3101         skb_reset_mac_header(skb);
3102         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3103
3104         skb_dst_set(skb, &rt->dst);
3105
3106         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3107                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3108                             nlh->nlmsg_seq, 0, 0, 0);
3109         if (err < 0) {
3110                 kfree_skb(skb);
3111                 goto errout;
3112         }
3113
3114         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3115 errout:
3116         return err;
3117 }
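
/* Illustrative sketch (userspace, not part of this file): a minimal program
 * that exercises inet6_rtm_getroute() by sending RTM_GETROUTE with an
 * RTA_DST attribute over an rtnetlink socket and printing the size of the
 * reply (an RTM_NEWROUTE message built by rt6_fill_node(), or a netlink
 * error if the lookup fails).  The destination below is a documentation
 * address chosen arbitrarily for the example.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		char attrbuf[64];
	} req;
	char reply[4096];
	struct rtattr *rta;
	struct in6_addr dst;
	ssize_t n;
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.rtm));
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET6;

	/* RTA_DST: the address we want the route lookup performed for. */
	inet_pton(AF_INET6, "2001:db8::1", &dst);
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(sizeof(dst));
	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_ALIGN(rta->rta_len);

	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&kernel, sizeof(kernel)) < 0) {
		perror("sendto");
		return 1;
	}

	n = recv(fd, reply, sizeof(reply), 0);
	if (n < 0) {
		perror("recv");
		return 1;
	}
	printf("received %zd bytes in reply\n", n);

	close(fd);
	return 0;
}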
3118
3119 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
3120 {
3121         struct sk_buff *skb;
3122         struct net *net = info->nl_net;
3123         u32 seq;
3124         int err;
3125
3126         err = -ENOBUFS;
3127         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3128
3129         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3130         if (!skb)
3131                 goto errout;
3132
3133         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3134                                 event, info->portid, seq, 0, 0, 0);
3135         if (err < 0) {
3136                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3137                 WARN_ON(err == -EMSGSIZE);
3138                 kfree_skb(skb);
3139                 goto errout;
3140         }
3141         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3142                     info->nlh, gfp_any());
3143         return;
3144 errout:
3145         if (err < 0)
3146                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3147 }
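
/* Illustrative sketch (userspace, not part of this file): a minimal listener
 * for the RTNLGRP_IPV6_ROUTE notifications that inet6_rt_notify() sends.
 * It joins the group via the legacy nl_groups bitmask (RTMGRP_IPV6_ROUTE)
 * and prints the type of every message it receives.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
	struct sockaddr_nl sa;
	char buf[8192];
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&sa, 0, sizeof(sa));
	sa.nl_family = AF_NETLINK;
	sa.nl_groups = RTMGRP_IPV6_ROUTE;	/* IPv6 route add/del/change events */
	if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
		perror("bind");
		return 1;
	}

	for (;;) {
		struct nlmsghdr *nlh;
		int n = recv(fd, buf, sizeof(buf), 0);

		if (n < 0) {
			perror("recv");
			return 1;
		}
		for (nlh = (struct nlmsghdr *)buf; NLMSG_OK(nlh, n);
		     nlh = NLMSG_NEXT(nlh, n))
			printf("message type %u (%s)\n", nlh->nlmsg_type,
			       nlh->nlmsg_type == RTM_NEWROUTE ? "RTM_NEWROUTE" :
			       nlh->nlmsg_type == RTM_DELROUTE ? "RTM_DELROUTE" :
			       "other");
	}
}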
3148
3149 static int ip6_route_dev_notify(struct notifier_block *this,
3150                                 unsigned long event, void *ptr)
3151 {
3152         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3153         struct net *net = dev_net(dev);
3154
3155         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3156                 net->ipv6.ip6_null_entry->dst.dev = dev;
3157                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3158 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3159                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3160                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3161                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3162                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3163 #endif
3164         }
3165
3166         return NOTIFY_OK;
3167 }
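
/* Illustrative sketch (a standalone module, not part of this file): the same
 * netdevice notifier pattern used by ip6_route_dev_notify(), reduced to a
 * minimal example that logs NETDEV_REGISTER events.  Names prefixed with
 * "example_" are invented for the sketch.
 */
#include <linux/module.h>
#include <linux/netdevice.h>

static int example_netdev_event(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_REGISTER)
		pr_info("example: device %s registered\n", dev->name);

	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call = example_netdev_event,
};

static int __init example_notifier_init(void)
{
	return register_netdevice_notifier(&example_netdev_notifier);
}

static void __exit example_notifier_exit(void)
{
	unregister_netdevice_notifier(&example_netdev_notifier);
}

module_init(example_notifier_init);
module_exit(example_notifier_exit);
MODULE_LICENSE("GPL");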
3168
3169 /*
3170  *      /proc
3171  */
3172
3173 #ifdef CONFIG_PROC_FS
3174
3175 static const struct file_operations ipv6_route_proc_fops = {
3176         .owner          = THIS_MODULE,
3177         .open           = ipv6_route_open,
3178         .read           = seq_read,
3179         .llseek         = seq_lseek,
3180         .release        = seq_release_net,
3181 };
3182
3183 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3184 {
3185         struct net *net = (struct net *)seq->private;
3186         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3187                    net->ipv6.rt6_stats->fib_nodes,
3188                    net->ipv6.rt6_stats->fib_route_nodes,
3189                    net->ipv6.rt6_stats->fib_rt_alloc,
3190                    net->ipv6.rt6_stats->fib_rt_entries,
3191                    net->ipv6.rt6_stats->fib_rt_cache,
3192                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3193                    net->ipv6.rt6_stats->fib_discarded_routes);
3194
3195         return 0;
3196 }
3197
3198 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3199 {
3200         return single_open_net(inode, file, rt6_stats_seq_show);
3201 }
3202
3203 static const struct file_operations rt6_stats_seq_fops = {
3204         .owner   = THIS_MODULE,
3205         .open    = rt6_stats_seq_open,
3206         .read    = seq_read,
3207         .llseek  = seq_lseek,
3208         .release = single_release_net,
3209 };
3210 #endif  /* CONFIG_PROC_FS */
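
/* Illustrative sketch (userspace, not part of this file): reading the seven
 * hexadecimal counters that rt6_stats_seq_show() prints to
 * /proc/net/rt6_stats.  The field order follows the seq_printf() call above.
 */
#include <stdio.h>

int main(void)
{
	unsigned int nodes, route_nodes, rt_alloc, rt_entries;
	unsigned int rt_cache, dst_entries, discarded;
	FILE *f = fopen("/proc/net/rt6_stats", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%x %x %x %x %x %x %x", &nodes, &route_nodes, &rt_alloc,
		   &rt_entries, &rt_cache, &dst_entries, &discarded) == 7)
		printf("fib nodes: %u, route entries: %u, dst entries: %u\n",
		       nodes, rt_entries, dst_entries);
	fclose(f);
	return 0;
}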
3211
3212 #ifdef CONFIG_SYSCTL
3213
3214 static
3215 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3216                               void __user *buffer, size_t *lenp, loff_t *ppos)
3217 {
3218         struct net *net;
3219         int delay;
3220         if (!write)
3221                 return -EINVAL;
3222
3223         net = (struct net *)ctl->extra1;
3224         delay = net->ipv6.sysctl.flush_delay;
3225         proc_dointvec(ctl, write, buffer, lenp, ppos);
3226         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3227         return 0;
3228 }
3229
3230 struct ctl_table ipv6_route_table_template[] = {
3231         {
3232                 .procname       =       "flush",
3233                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3234                 .maxlen         =       sizeof(int),
3235                 .mode           =       0200,
3236                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3237         },
3238         {
3239                 .procname       =       "gc_thresh",
3240                 .data           =       &ip6_dst_ops_template.gc_thresh,
3241                 .maxlen         =       sizeof(int),
3242                 .mode           =       0644,
3243                 .proc_handler   =       proc_dointvec,
3244         },
3245         {
3246                 .procname       =       "max_size",
3247                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3248                 .maxlen         =       sizeof(int),
3249                 .mode           =       0644,
3250                 .proc_handler   =       proc_dointvec,
3251         },
3252         {
3253                 .procname       =       "gc_min_interval",
3254                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3255                 .maxlen         =       sizeof(int),
3256                 .mode           =       0644,
3257                 .proc_handler   =       proc_dointvec_jiffies,
3258         },
3259         {
3260                 .procname       =       "gc_timeout",
3261                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3262                 .maxlen         =       sizeof(int),
3263                 .mode           =       0644,
3264                 .proc_handler   =       proc_dointvec_jiffies,
3265         },
3266         {
3267                 .procname       =       "gc_interval",
3268                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3269                 .maxlen         =       sizeof(int),
3270                 .mode           =       0644,
3271                 .proc_handler   =       proc_dointvec_jiffies,
3272         },
3273         {
3274                 .procname       =       "gc_elasticity",
3275                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3276                 .maxlen         =       sizeof(int),
3277                 .mode           =       0644,
3278                 .proc_handler   =       proc_dointvec,
3279         },
3280         {
3281                 .procname       =       "mtu_expires",
3282                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3283                 .maxlen         =       sizeof(int),
3284                 .mode           =       0644,
3285                 .proc_handler   =       proc_dointvec_jiffies,
3286         },
3287         {
3288                 .procname       =       "min_adv_mss",
3289                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3290                 .maxlen         =       sizeof(int),
3291                 .mode           =       0644,
3292                 .proc_handler   =       proc_dointvec,
3293         },
3294         {
3295                 .procname       =       "gc_min_interval_ms",
3296                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3297                 .maxlen         =       sizeof(int),
3298                 .mode           =       0644,
3299                 .proc_handler   =       proc_dointvec_ms_jiffies,
3300         },
3301         { }
3302 };
3303
3304 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3305 {
3306         struct ctl_table *table;
3307
3308         table = kmemdup(ipv6_route_table_template,
3309                         sizeof(ipv6_route_table_template),
3310                         GFP_KERNEL);
3311
3312         if (table) {
3313                 table[0].data = &net->ipv6.sysctl.flush_delay;
3314                 table[0].extra1 = net;
3315                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3316                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3317                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3318                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3319                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3320                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3321                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3322                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3323                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3324
3325                 /* Don't export sysctls to unprivileged users */
3326                 if (net->user_ns != &init_user_ns)
3327                         table[0].procname = NULL;
3328         }
3329
3330         return table;
3331 }
3332 #endif
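
/* Illustrative sketch (userspace, not part of this file): the table above is
 * exposed under /proc/sys/net/ipv6/route/.  This reads the world-readable
 * "gc_thresh" entry and, given sufficient privilege, writes to the
 * write-only "flush" entry, which ends up in ipv6_sysctl_rtcache_flush()
 * and triggers fib6_run_gc().
 */
#include <stdio.h>

int main(void)
{
	int gc_thresh;
	FILE *f = fopen("/proc/sys/net/ipv6/route/gc_thresh", "r");

	if (f) {
		if (fscanf(f, "%d", &gc_thresh) == 1)
			printf("gc_thresh = %d\n", gc_thresh);
		fclose(f);
	}

	f = fopen("/proc/sys/net/ipv6/route/flush", "w");	/* mode 0200 */
	if (!f) {
		perror("flush");
		return 1;
	}
	fprintf(f, "1\n");
	fclose(f);
	return 0;
}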
3333
3334 static int __net_init ip6_route_net_init(struct net *net)
3335 {
3336         int ret = -ENOMEM;
3337
3338         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3339                sizeof(net->ipv6.ip6_dst_ops));
3340
3341         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3342                 goto out_ip6_dst_ops;
3343
3344         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3345                                            sizeof(*net->ipv6.ip6_null_entry),
3346                                            GFP_KERNEL);
3347         if (!net->ipv6.ip6_null_entry)
3348                 goto out_ip6_dst_entries;
3349         net->ipv6.ip6_null_entry->dst.path =
3350                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3351         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3352         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3353                          ip6_template_metrics, true);
3354
3355 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3356         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3357                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3358                                                GFP_KERNEL);
3359         if (!net->ipv6.ip6_prohibit_entry)
3360                 goto out_ip6_null_entry;
3361         net->ipv6.ip6_prohibit_entry->dst.path =
3362                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3363         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3364         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3365                          ip6_template_metrics, true);
3366
3367         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3368                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3369                                                GFP_KERNEL);
3370         if (!net->ipv6.ip6_blk_hole_entry)
3371                 goto out_ip6_prohibit_entry;
3372         net->ipv6.ip6_blk_hole_entry->dst.path =
3373                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3374         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3375         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3376                          ip6_template_metrics, true);
3377 #endif
3378
3379         net->ipv6.sysctl.flush_delay = 0;
3380         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3381         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3382         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3383         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3384         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3385         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3386         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3387
3388         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3389
3390         ret = 0;
3391 out:
3392         return ret;
3393
3394 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3395 out_ip6_prohibit_entry:
3396         kfree(net->ipv6.ip6_prohibit_entry);
3397 out_ip6_null_entry:
3398         kfree(net->ipv6.ip6_null_entry);
3399 #endif
3400 out_ip6_dst_entries:
3401         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3402 out_ip6_dst_ops:
3403         goto out;
3404 }
3405
3406 static void __net_exit ip6_route_net_exit(struct net *net)
3407 {
3408         kfree(net->ipv6.ip6_null_entry);
3409 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3410         kfree(net->ipv6.ip6_prohibit_entry);
3411         kfree(net->ipv6.ip6_blk_hole_entry);
3412 #endif
3413         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3414 }
3415
3416 static int __net_init ip6_route_net_init_late(struct net *net)
3417 {
3418 #ifdef CONFIG_PROC_FS
3419         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3420         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3421 #endif
3422         return 0;
3423 }
3424
3425 static void __net_exit ip6_route_net_exit_late(struct net *net)
3426 {
3427 #ifdef CONFIG_PROC_FS
3428         remove_proc_entry("ipv6_route", net->proc_net);
3429         remove_proc_entry("rt6_stats", net->proc_net);
3430 #endif
3431 }
3432
3433 static struct pernet_operations ip6_route_net_ops = {
3434         .init = ip6_route_net_init,
3435         .exit = ip6_route_net_exit,
3436 };
3437
3438 static int __net_init ipv6_inetpeer_init(struct net *net)
3439 {
3440         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3441
3442         if (!bp)
3443                 return -ENOMEM;
3444         inet_peer_base_init(bp);
3445         net->ipv6.peers = bp;
3446         return 0;
3447 }
3448
3449 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3450 {
3451         struct inet_peer_base *bp = net->ipv6.peers;
3452
3453         net->ipv6.peers = NULL;
3454         inetpeer_invalidate_tree(bp);
3455         kfree(bp);
3456 }
3457
3458 static struct pernet_operations ipv6_inetpeer_ops = {
3459         .init   =       ipv6_inetpeer_init,
3460         .exit   =       ipv6_inetpeer_exit,
3461 };
3462
3463 static struct pernet_operations ip6_route_net_late_ops = {
3464         .init = ip6_route_net_init_late,
3465         .exit = ip6_route_net_exit_late,
3466 };
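
/* Illustrative sketch (a standalone module, not part of this file): the
 * per-network-namespace init/exit pattern used by the pernet_operations
 * above, reduced to a minimal example.  "example_net_ops" and the log
 * messages are invented for the sketch.
 */
#include <linux/module.h>
#include <net/net_namespace.h>

static int __net_init example_net_init(struct net *net)
{
	pr_info("example: new network namespace created\n");
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	pr_info("example: network namespace going away\n");
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
};

static int __init example_pernet_init(void)
{
	return register_pernet_subsys(&example_net_ops);
}

static void __exit example_pernet_exit(void)
{
	unregister_pernet_subsys(&example_net_ops);
}

module_init(example_pernet_init);
module_exit(example_pernet_exit);
MODULE_LICENSE("GPL");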
3467
3468 static struct notifier_block ip6_route_dev_notifier = {
3469         .notifier_call = ip6_route_dev_notify,
3470         .priority = 0,
3471 };
3472
3473 int __init ip6_route_init(void)
3474 {
3475         int ret;
3476         int cpu;
3477
3478         ret = -ENOMEM;
3479         ip6_dst_ops_template.kmem_cachep =
3480                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3481                                   SLAB_HWCACHE_ALIGN, NULL);
3482         if (!ip6_dst_ops_template.kmem_cachep)
3483                 goto out;
3484
3485         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3486         if (ret)
3487                 goto out_kmem_cache;
3488
3489         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3490         if (ret)
3491                 goto out_dst_entries;
3492
3493         ret = register_pernet_subsys(&ip6_route_net_ops);
3494         if (ret)
3495                 goto out_register_inetpeer;
3496
3497         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3498
3499         /* Registration of the loopback device happens before this code runs,
3500          * so the loopback reference in rt6_info is not taken automatically;
3501          * take it manually for init_net. */
3502         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3503         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3504 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3505         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3506         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3507         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3508         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3509 #endif
3510         ret = fib6_init();
3511         if (ret)
3512                 goto out_register_subsys;
3513
3514         ret = xfrm6_init();
3515         if (ret)
3516                 goto out_fib6_init;
3517
3518         ret = fib6_rules_init();
3519         if (ret)
3520                 goto xfrm6_init;
3521
3522         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3523         if (ret)
3524                 goto fib6_rules_init;
3525
3526         ret = -ENOBUFS;
3527         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3528             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3529             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3530                 goto out_register_late_subsys;
3531
3532         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3533         if (ret)
3534                 goto out_register_late_subsys;
3535
3536         for_each_possible_cpu(cpu) {
3537                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3538
3539                 INIT_LIST_HEAD(&ul->head);
3540                 spin_lock_init(&ul->lock);
3541         }
3542
3543 out:
3544         return ret;
3545
3546 out_register_late_subsys:
3547         unregister_pernet_subsys(&ip6_route_net_late_ops);
3548 fib6_rules_init:
3549         fib6_rules_cleanup();
3550 xfrm6_init:
3551         xfrm6_fini();
3552 out_fib6_init:
3553         fib6_gc_cleanup();
3554 out_register_subsys:
3555         unregister_pernet_subsys(&ip6_route_net_ops);
3556 out_register_inetpeer:
3557         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3558 out_dst_entries:
3559         dst_entries_destroy(&ip6_dst_blackhole_ops);
3560 out_kmem_cache:
3561         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3562         goto out;
3563 }
3564
3565 void ip6_route_cleanup(void)
3566 {
3567         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3568         unregister_pernet_subsys(&ip6_route_net_late_ops);
3569         fib6_rules_cleanup();
3570         xfrm6_fini();
3571         fib6_gc_cleanup();
3572         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3573         unregister_pernet_subsys(&ip6_route_net_ops);
3574         dst_entries_destroy(&ip6_dst_blackhole_ops);
3575         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3576 }