2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
55 #include <linux/rtnetlink.h>
57 #include <net/dst_metadata.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
67 #include <asm/uaccess.h>
70 #include <linux/sysctl.h>
/* Neighbour-reachability verdicts returned by rt6_check_neigh() and consumed
 * by rt6_score_route()/find_match().  Negative values are progressively
 * "softer" failures: FAIL_HARD drops the route outright, FAIL_PROBE keeps it
 * as a last resort, FAIL_DO_RR triggers round-robin to a sibling router.
 * (The enum head and its success member lie outside this view.)
 */
74 RT6_NUD_FAIL_HARD = -3,
75 RT6_NUD_FAIL_PROBE = -2,
76 RT6_NUD_FAIL_DO_RR = -1,
/* Forward declarations for the dst_ops callbacks and internal helpers defined
 * later in this file.  The ip6_dst_* functions implement the generic
 * dst_entry operations for IPv6 routes.
 */
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void ip6_dst_destroy(struct dst_entry *);
86 static void ip6_dst_ifdown(struct dst_entry *,
87 struct net_device *dev, int how);
88 static int ip6_dst_gc(struct dst_ops *ops);
90 static int ip6_pkt_discard(struct sk_buff *skb);
91 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int ip6_pkt_prohibit(struct sk_buff *skb);
93 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void ip6_link_failure(struct sk_buff *skb);
95 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96 struct sk_buff *skb, u32 mtu);
97 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99 static void rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
/* Route Information option (RFC 4191) support, compiled in only with
 * CONFIG_IPV6_ROUTE_INFO.
 */
102 #ifdef CONFIG_IPV6_ROUTE_INFO
103 static struct rt6_info *rt6_add_route_info(struct net *net,
104 const struct in6_addr *prefix, int prefixlen,
105 const struct in6_addr *gwaddr,
106 struct net_device *dev,
108 static struct rt6_info *rt6_get_route_info(struct net *net,
109 const struct in6_addr *prefix, int prefixlen,
110 const struct in6_addr *gwaddr,
111 struct net_device *dev);
/* Per-CPU list of RTF_CACHE/uncached rt6_info entries that live outside the
 * fib6 tree.  Membership lets rt6_uncached_list_flush_dev() retarget them when
 * their device goes away.  (The spinlock member is elided in this view.)
 */
114 struct uncached_list {
116 struct list_head head;
119 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/* Add @rt to this CPU's uncached list and mark its dst DST_NOCACHE. */
121 static void rt6_uncached_list_add(struct rt6_info *rt)
123 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
125 rt->dst.flags |= DST_NOCACHE;
/* Remember which CPU's list we joined so deletion can find the lock. */
126 rt->rt6i_uncached_list = ul;
128 spin_lock_bh(&ul->lock);
129 list_add_tail(&rt->rt6i_uncached, &ul->head);
130 spin_unlock_bh(&ul->lock);
/* Remove @rt from its uncached list, if it was ever added. */
133 static void rt6_uncached_list_del(struct rt6_info *rt)
135 if (!list_empty(&rt->rt6i_uncached)) {
/* Use the list recorded at add time; may differ from the current CPU. */
136 struct uncached_list *ul = rt->rt6i_uncached_list;
138 spin_lock_bh(&ul->lock);
139 list_del(&rt->rt6i_uncached);
140 spin_unlock_bh(&ul->lock);
/* When @dev is going away, walk every CPU's uncached list and repoint any
 * entry still referencing @dev (idev or dst.dev) at the netns loopback device,
 * so the stale device reference can be dropped.
 */
144 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
146 struct net_device *loopback_dev = net->loopback_dev;
/* Nothing to retarget if the loopback device itself is being removed. */
149 if (dev == loopback_dev)
152 for_each_possible_cpu(cpu) {
153 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
156 spin_lock_bh(&ul->lock);
157 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
158 struct inet6_dev *rt_idev = rt->rt6i_idev;
159 struct net_device *rt_dev = rt->dst.dev;
/* Swap the inet6_dev reference over to loopback. */
161 if (rt_idev->dev == dev) {
162 rt->rt6i_idev = in6_dev_get(loopback_dev);
163 in6_dev_put(rt_idev);
/* Swap the raw net_device reference as well; the matching dev_put()
 * of the old device happens in code elided from this view.
 */
167 rt->dst.dev = loopback_dev;
168 dev_hold(rt->dst.dev);
172 spin_unlock_bh(&ul->lock);
/* For a per-cpu clone, metrics are copy-on-write from the parent route that
 * dst.from points at.
 */
176 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
178 return dst_metrics_write_ptr(rt->dst.from);
/* dst_ops->cow_metrics: pick the metric-write strategy by route type —
 * RTF_PCPU clones write through their parent, RTF_CACHE routes take a
 * different path (elided here), everything else uses the generic COW.
 */
181 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
183 struct rt6_info *rt = (struct rt6_info *)dst;
185 if (rt->rt6i_flags & RTF_PCPU)
186 return rt6_pcpu_cow_metrics(rt);
187 else if (rt->rt6i_flags & RTF_CACHE)
190 return dst_cow_metrics_generic(dst, old);
/* Pick the address to resolve the next hop against: the route's gateway when
 * one is set, otherwise the packet's destination address.
 */
193 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
197 struct in6_addr *p = &rt->rt6i_gateway;
199 if (!ipv6_addr_any(p))
200 return (const void *) p;
202 return &ipv6_hdr(skb)->daddr;
/* dst_ops->neigh_lookup: find (or create) the ND neighbour entry for the
 * chosen next-hop address on the dst's device.
 */
206 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
210 struct rt6_info *rt = (struct rt6_info *) dst;
213 daddr = choose_neigh_daddr(rt, skb, daddr);
214 n = __ipv6_neigh_lookup(dst->dev, daddr);
/* Fall back to creating a fresh entry in the ND table (lookup miss path;
 * the intervening check is elided in this view).
 */
217 return neigh_create(&nd_tbl, daddr, dst->dev);
/* The dst_ops vtable used for normal IPv6 routes; cloned per-netns into
 * net->ipv6.ip6_dst_ops.
 */
220 static struct dst_ops ip6_dst_ops_template = {
224 .check = ip6_dst_check,
225 .default_advmss = ip6_default_advmss,
227 .cow_metrics = ipv6_cow_metrics,
228 .destroy = ip6_dst_destroy,
229 .ifdown = ip6_dst_ifdown,
230 .negative_advice = ip6_negative_advice,
231 .link_failure = ip6_link_failure,
232 .update_pmtu = ip6_rt_update_pmtu,
233 .redirect = rt6_do_redirect,
234 .local_out = __ip6_local_out,
235 .neigh_lookup = ip6_neigh_lookup,
/* Blackhole dsts report their raw MTU metric, falling back to the device MTU
 * when the metric is unset.
 */
238 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
240 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
242 return mtu ? : dst->dev->mtu;
/* PMTU updates and redirects are deliberately ignored on blackhole routes;
 * these are empty-body stubs.
 */
245 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
246 struct sk_buff *skb, u32 mtu)
250 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
/* dst_ops vtable for blackhole routes created by ip6_blackhole_route(). */
255 static struct dst_ops ip6_dst_blackhole_ops = {
257 .destroy = ip6_dst_destroy,
258 .check = ip6_dst_check,
259 .mtu = ip6_blackhole_mtu,
260 .default_advmss = ip6_default_advmss,
261 .update_pmtu = ip6_rt_blackhole_update_pmtu,
262 .redirect = ip6_rt_blackhole_redirect,
263 .cow_metrics = dst_cow_metrics_generic,
264 .neigh_lookup = ip6_neigh_lookup,
/* Shared metrics for the template routes below; hop limit 0 means "use the
 * per-device/ sysctl default".
 */
267 static const u32 ip6_template_metrics[RTAX_MAX] = {
268 [RTAX_HOPLIMIT - 1] = 0,
/* Template for the per-netns "null" route: rejects traffic with
 * net-unreachable.  Initial refcount 1 keeps it permanently alive; metric
 * ~0 sorts it after every real route.
 */
271 static const struct rt6_info ip6_null_entry_template = {
273 .__refcnt = ATOMIC_INIT(1),
275 .obsolete = DST_OBSOLETE_FORCE_CHK,
276 .error = -ENETUNREACH,
277 .input = ip6_pkt_discard,
278 .output = ip6_pkt_discard_out,
280 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
281 .rt6i_protocol = RTPROT_KERNEL,
282 .rt6i_metric = ~(u32) 0,
283 .rt6i_ref = ATOMIC_INIT(1),
/* With policy routing, two more terminal templates exist: "prohibit"
 * (admin-prohibited ICMP error) and "blackhole" (silent discard).
 */
286 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
288 static const struct rt6_info ip6_prohibit_entry_template = {
290 .__refcnt = ATOMIC_INIT(1),
292 .obsolete = DST_OBSOLETE_FORCE_CHK,
294 .input = ip6_pkt_prohibit,
295 .output = ip6_pkt_prohibit_out,
297 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
298 .rt6i_protocol = RTPROT_KERNEL,
299 .rt6i_metric = ~(u32) 0,
300 .rt6i_ref = ATOMIC_INIT(1),
303 static const struct rt6_info ip6_blk_hole_entry_template = {
305 .__refcnt = ATOMIC_INIT(1),
307 .obsolete = DST_OBSOLETE_FORCE_CHK,
309 .input = dst_discard,
310 .output = dst_discard_out,
312 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
313 .rt6i_protocol = RTPROT_KERNEL,
314 .rt6i_metric = ~(u32) 0,
315 .rt6i_ref = ATOMIC_INIT(1),
/* Zero the rt6_info-specific tail of a freshly allocated dst (everything past
 * the embedded dst_entry) and initialise its list heads.
 */
320 static void rt6_info_init(struct rt6_info *rt)
322 struct dst_entry *dst = &rt->dst;
324 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
325 INIT_LIST_HEAD(&rt->rt6i_siblings);
326 INIT_LIST_HEAD(&rt->rt6i_uncached);
329 /* allocate dst with ip6_dst_ops */
330 static struct rt6_info *__ip6_dst_alloc(struct net *net,
331 struct net_device *dev,
/* DST_OBSOLETE_FORCE_CHK makes every cached user revalidate via
 * ip6_dst_check().
 */
334 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
335 0, DST_OBSOLETE_FORCE_CHK, flags);
/* Public allocator: like __ip6_dst_alloc() but also sets up the per-cpu
 * clone pointer array; on per-cpu allocation failure the dst is destroyed
 * (error path partly elided in this view).
 */
343 struct rt6_info *ip6_dst_alloc(struct net *net,
344 struct net_device *dev,
347 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
350 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
354 for_each_possible_cpu(cpu) {
357 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
358 /* no one shares rt */
362 dst_destroy((struct dst_entry *)rt);
369 EXPORT_SYMBOL(ip6_dst_alloc);
/* dst_ops->destroy: release everything an rt6_info owns — generic metrics,
 * the per-cpu clone array, uncached-list membership, the inet6_dev reference,
 * and (in lines elided here) the dst.from parent reference.
 */
371 static void ip6_dst_destroy(struct dst_entry *dst)
373 struct rt6_info *rt = (struct rt6_info *)dst;
374 struct dst_entry *from = dst->from;
375 struct inet6_dev *idev;
377 dst_destroy_metrics_generic(dst);
378 free_percpu(rt->rt6i_pcpu);
379 rt6_uncached_list_del(rt);
381 idev = rt->rt6i_idev;
383 rt->rt6i_idev = NULL;
/* dst_ops->ifdown: the route's device is going down; migrate the inet6_dev
 * reference to the netns loopback device so @dev can be released.
 */
391 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
394 struct rt6_info *rt = (struct rt6_info *)dst;
395 struct inet6_dev *idev = rt->rt6i_idev;
396 struct net_device *loopback_dev =
397 dev_net(dev)->loopback_dev;
399 if (dev != loopback_dev) {
400 if (idev && idev->dev == dev) {
401 struct inet6_dev *loopback_idev =
402 in6_dev_get(loopback_dev);
404 rt->rt6i_idev = loopback_idev;
/* Non-recursive expiry test: only this route's own RTF_EXPIRES timestamp. */
411 static bool __rt6_check_expired(const struct rt6_info *rt)
413 if (rt->rt6i_flags & RTF_EXPIRES)
414 return time_after(jiffies, rt->dst.expires)
/* Full expiry test: checks this route, and for clones (dst.from set) recurses
 * into the parent route's expiry.
 */
419 static bool rt6_check_expired(const struct rt6_info *rt)
421 if (rt->rt6i_flags & RTF_EXPIRES) {
422 if (time_after(jiffies, rt->dst.expires))
424 } else if (rt->dst.from) {
425 return rt6_check_expired((struct rt6_info *) rt->dst.from);
430 /* Multipath route selection:
431 * Hash based function using packet header and flowlabel.
432 * Adapted from fib_info_hashfn()
/* Map the flow hash onto one of @candidate_count equal-cost candidates. */
434 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
435 const struct flowi6 *fl6)
437 return get_hash_from_flowi6(fl6) % candidate_count;
/* Given the first matching route @match, pick among it and its ECMP siblings
 * by flow hash; a sibling is only taken if rt6_score_route() accepts it.
 */
440 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
441 struct flowi6 *fl6, int oif,
444 struct rt6_info *sibling, *next_sibling;
447 route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
448 /* Don't change the route, if route_choosen == 0
449 * (siblings does not include ourself)
452 list_for_each_entry_safe(sibling, next_sibling,
453 &match->rt6i_siblings, rt6i_siblings) {
/* Countdown reached the hashed sibling; skip it if unusable. */
455 if (route_choosen == 0) {
456 if (rt6_score_route(sibling, oif, strict) < 0)
466 * Route lookup. Any table->tb6_lock is implied.
/* Walk the same-prefix route chain starting at @rt and return the entry whose
 * device matches @oif (or whose device owns @saddr).  Loopback routes are
 * handled specially, and with RT6_LOOKUP_F_IFACE an unmatched oif yields the
 * null entry instead of a weaker match.
 */
469 static inline struct rt6_info *rt6_device_match(struct net *net,
471 const struct in6_addr *saddr,
475 struct rt6_info *local = NULL;
476 struct rt6_info *sprt;
/* No constraints at all: the head of the chain is fine (early-out path;
 * the return is elided in this view).
 */
478 if (!oif && ipv6_addr_any(saddr))
481 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
482 struct net_device *dev = sprt->dst.dev;
485 if (dev->ifindex == oif)
487 if (dev->flags & IFF_LOOPBACK) {
488 if (!sprt->rt6i_idev ||
489 sprt->rt6i_idev->dev->ifindex != oif) {
490 if (flags & RT6_LOOKUP_F_IFACE)
493 local->rt6i_idev->dev->ifindex == oif)
/* Source-address match: does @dev own @saddr in this netns? */
499 if (ipv6_chk_addr(net, saddr, dev,
500 flags & RT6_LOOKUP_F_IFACE))
/* Strict interface lookup with no device match: fail hard. */
509 if (flags & RT6_LOOKUP_F_IFACE)
510 return net->ipv6.ip6_null_entry;
516 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work context for sending a router reachability probe outside of
 * the bh-disabled lookup path.
 */
517 struct __rt6_probe_work {
518 struct work_struct work;
519 struct in6_addr target;
520 struct net_device *dev;
/* Workqueue callback: send a Neighbour Solicitation to the target router's
 * solicited-node multicast address.
 */
523 static void rt6_probe_deferred(struct work_struct *w)
525 struct in6_addr mcaddr;
526 struct __rt6_probe_work *work =
527 container_of(w, struct __rt6_probe_work, work);
529 addrconf_addr_solict_mult(&work->target, &mcaddr);
530 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
/* Router Reachability Probing (RFC 4191 ~3.5): if the gateway's neighbour
 * entry is not NUD_VALID and the per-idev rate limit allows it, schedule a
 * deferred NS probe.  Runs under rcu_read_lock_bh() from the lookup path.
 */
535 static void rt6_probe(struct rt6_info *rt)
537 struct __rt6_probe_work *work;
538 struct neighbour *neigh;
540 * Okay, this does not seem to be appropriate
541 * for now, however, we need to check if it
542 * is really so; aka Router Reachability Probing.
544 * Router Reachability Probe MUST be rate-limited
545 * to no more than one per minute.
/* Only gateway routes have a router to probe. */
547 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
550 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
552 if (neigh->nud_state & NUD_VALID)
556 write_lock(&neigh->lock);
/* Rate-limit against the neighbour's last-probe timestamp using the
 * per-device rtr_probe_interval.
 */
557 if (!(neigh->nud_state & NUD_VALID) &&
560 rt->rt6i_idev->cnf.rtr_probe_interval)) {
561 work = kmalloc(sizeof(*work), GFP_ATOMIC);
563 __neigh_set_probe_once(neigh);
565 write_unlock(&neigh->lock);
/* No neighbour entry yet: probe unconditionally. */
567 work = kmalloc(sizeof(*work), GFP_ATOMIC);
571 INIT_WORK(&work->work, rt6_probe_deferred);
572 work->target = rt->rt6i_gateway;
/* Hold the device across the deferred work; released by code elided
 * from this view.
 */
573 dev_hold(rt->dst.dev);
574 work->dev = rt->dst.dev;
575 schedule_work(&work->work);
579 rcu_read_unlock_bh();
/* !CONFIG_IPV6_ROUTER_PREF: probing compiled out, empty stub. */
582 static inline void rt6_probe(struct rt6_info *rt)
588 * Default Router Selection (RFC 2461 6.3.6)
/* Device component of the route score: matches when no oif constraint exists,
 * the device is the oif, or a loopback route's idev is the oif.
 */
590 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
592 struct net_device *dev = rt->dst.dev;
593 if (!oif || dev->ifindex == oif)
595 if ((dev->flags & IFF_LOOPBACK) &&
596 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Neighbour component of the route score: translate the gateway's NUD state
 * into an rt6_nud_state verdict.  Non-gateway routes always succeed.
 */
601 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
603 struct neighbour *neigh;
604 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
606 if (rt->rt6i_flags & RTF_NONEXTHOP ||
607 !(rt->rt6i_flags & RTF_GATEWAY))
608 return RT6_NUD_SUCCEED;
611 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
613 read_lock(&neigh->lock);
614 if (neigh->nud_state & NUD_VALID)
615 ret = RT6_NUD_SUCCEED;
616 #ifdef CONFIG_IPV6_ROUTER_PREF
/* With router preference, anything short of NUD_FAILED is still usable;
 * a failed neighbour asks for a probe rather than a hard failure.
 */
617 else if (!(neigh->nud_state & NUD_FAILED))
618 ret = RT6_NUD_SUCCEED;
620 ret = RT6_NUD_FAIL_PROBE;
622 read_unlock(&neigh->lock);
/* No neighbour entry: with router preference, optimistically succeed;
 * otherwise ask the caller to round-robin.
 */
624 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
625 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
627 rcu_read_unlock_bh();
/* Combine device match, RFC 4191 route preference bits, and (under
 * RT6_LOOKUP_F_REACHABLE) neighbour reachability into a single score;
 * negative values are rt6_nud_state failures.
 */
632 static int rt6_score_route(struct rt6_info *rt, int oif,
637 m = rt6_check_dev(rt, oif);
638 if (!m && (strict & RT6_LOOKUP_F_IFACE))
639 return RT6_NUD_FAIL_HARD;
640 #ifdef CONFIG_IPV6_ROUTER_PREF
641 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
643 if (strict & RT6_LOOKUP_F_REACHABLE) {
644 int n = rt6_check_neigh(rt);
/* Score @rt and fold it into the running best (@match/@mpri); *do_rr is set
 * when the winner asked for round-robin.  Routes on carrier-less links (when
 * ignore_routes_with_linkdown is set) and expired routes are skipped.
 */
651 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
652 int *mpri, struct rt6_info *match,
656 bool match_do_rr = false;
657 struct inet6_dev *idev = rt->rt6i_idev;
658 struct net_device *dev = rt->dst.dev;
660 if (dev && !netif_carrier_ok(dev) &&
661 idev->cnf.ignore_routes_with_linkdown)
664 if (rt6_check_expired(rt))
667 m = rt6_score_route(rt, oif, strict);
668 if (m == RT6_NUD_FAIL_DO_RR) {
670 m = 0; /* lowest valid score */
671 } else if (m == RT6_NUD_FAIL_HARD) {
/* Under REACHABLE, kick off a probe for the failing gateway. */
675 if (strict & RT6_LOOKUP_F_REACHABLE)
678 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
680 *do_rr = match_do_rr;
/* Scan the same-metric routes of @fn for the best match, starting at the
 * round-robin head @rr_head and wrapping around through fn->leaf; @cont
 * collects the tail of higher-metric routes scanned last.
 */
688 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
689 struct rt6_info *rr_head,
690 u32 metric, int oif, int strict,
693 struct rt6_info *rt, *match, *cont;
698 for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
699 if (rt->rt6i_metric != metric) {
704 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* Second half of the wrap-around: from the leaf up to rr_head. */
707 for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
708 if (rt->rt6i_metric != metric) {
713 match = find_match(rt, oif, strict, &mpri, match, do_rr);
719 for (rt = cont; rt; rt = rt->dst.rt6_next)
720 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* Default router selection for @fn: pick the best route at the lowest metric,
 * rotating fn->rr_ptr to the next sibling when round-robin was requested.
 * Falls back to the netns null entry when nothing matches.
 */
725 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
727 struct rt6_info *match, *rt0;
733 fn->rr_ptr = rt0 = fn->leaf;
735 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
739 struct rt6_info *next = rt0->dst.rt6_next;
741 /* no entries matched; do round-robin */
742 if (!next || next->rt6i_metric != rt0->rt6i_metric)
749 net = dev_net(rt0->dst.dev);
750 return match ? match : net->ipv6.ip6_null_entry;
/* True when the route has a gateway or is marked no-next-hop; such routes
 * never get the anycast/source-specific cloning in ip6_rt_cache_alloc().
 */
753 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
755 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
758 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option (RFC 4191) received in a Router
 * Advertisement from @gwaddr on @dev: validate it, then add, refresh, or
 * expire the corresponding RTF_ROUTEINFO route.
 */
759 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
760 const struct in6_addr *gwaddr)
762 struct net *net = dev_net(dev);
763 struct route_info *rinfo = (struct route_info *) opt;
764 struct in6_addr prefix_buf, *prefix;
766 unsigned long lifetime;
769 if (len < sizeof(struct route_info)) {
773 /* Sanity check for prefix_len and length */
/* Option length is in units of 8 octets; prefix_len>64 needs length>=2,
 * prefix_len>0 needs length>=1, and 3 is the maximum.
 */
774 if (rinfo->length > 3) {
776 } else if (rinfo->prefix_len > 128) {
778 } else if (rinfo->prefix_len > 64) {
779 if (rinfo->length < 2) {
782 } else if (rinfo->prefix_len > 0) {
783 if (rinfo->length < 1) {
788 pref = rinfo->route_pref;
/* Reserved (10) preference value: the option must be ignored. */
789 if (pref == ICMPV6_ROUTER_PREF_INVALID)
792 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* A full-length option carries the whole prefix; otherwise copy only
 * prefix_len bits into a zero-padded local buffer.
 */
794 if (rinfo->length == 3)
795 prefix = (struct in6_addr *)rinfo->prefix;
797 /* this function is safe */
798 ipv6_addr_prefix(&prefix_buf,
799 (struct in6_addr *)rinfo->prefix,
801 prefix = &prefix_buf;
/* prefix_len 0 means the default route via this router. */
804 if (rinfo->prefix_len == 0)
805 rt = rt6_get_dflt_router(gwaddr, dev);
807 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
/* Zero lifetime withdraws an existing route (deletion elided here). */
810 if (rt && !lifetime) {
816 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
/* Refresh the preference bits on the (new or existing) route. */
819 rt->rt6i_flags = RTF_ROUTEINFO |
820 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
823 if (!addrconf_finite_timeout(lifetime))
824 rt6_clean_expires(rt);
826 rt6_set_expires(rt, jiffies + HZ * lifetime);
/* After a lookup dead-ends, climb toward the tree root looking for the next
 * candidate node; descends into source-routed subtrees (FIB6_SUBTREE) along
 * the way.  Returns NULL at the tree root (RTN_TL_ROOT).
 */
834 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
835 struct in6_addr *saddr)
837 struct fib6_node *pn;
839 if (fn->fn_flags & RTN_TL_ROOT)
842 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
843 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
/* A node carrying actual route info terminates the backtrack. */
846 if (fn->fn_flags & RTN_RTINFO)
/* Fast-path table lookup (no caching/cloning): find the fib6 node, filter by
 * device/multipath, and backtrack on a null-entry miss.  Takes tb6_lock for
 * reading and bumps the result's use counter before returning.
 */
851 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
852 struct fib6_table *table,
853 struct flowi6 *fl6, int flags)
855 struct fib6_node *fn;
858 read_lock_bh(&table->tb6_lock);
859 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
862 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
/* ECMP: only spread flows when no oif pins the route. */
863 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
864 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
865 if (rt == net->ipv6.ip6_null_entry) {
866 fn = fib6_backtrack(fn, &fl6->saddr);
870 dst_use(&rt->dst, jiffies);
871 read_unlock_bh(&table->tb6_lock);
873 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
/* Policy-rule entry point wrapping ip6_pol_route_lookup(). */
879 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
882 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
884 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/* Convenience lookup by bare addresses: builds a flowi6 from daddr/saddr/oif
 * and runs the fast-path lookup; @strict maps to RT6_LOOKUP_F_IFACE.
 */
886 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
887 const struct in6_addr *saddr, int oif, int strict)
889 struct flowi6 fl6 = {
893 struct dst_entry *dst;
894 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
897 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
898 flags |= RT6_LOOKUP_F_HAS_SADDR;
901 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
903 return (struct rt6_info *) dst;
909 EXPORT_SYMBOL(rt6_lookup);
911 /* ip6_ins_rt is called with FREE table->tb6_lock.
912 It takes new route entry, the addition fails by any reason the
913 route is freed. In any case, if caller does not hold it, it may
/* Insert @rt into its fib6 table under the table write lock; @mxc carries
 * the netlink-supplied metrics.
 */
917 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
918 struct mx6_config *mxc)
921 struct fib6_table *table;
923 table = rt->rt6i_table;
924 write_lock_bh(&table->tb6_lock);
925 err = fib6_add(&table->tb6_root, rt, info, mxc);
926 write_unlock_bh(&table->tb6_lock);
/* Kernel-internal insertion helper: no netlink attributes, empty metrics. */
931 int ip6_ins_rt(struct rt6_info *rt)
933 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
934 struct mx6_config mxc = { .mx = NULL, };
936 return __ip6_ins_rt(rt, &info, &mxc);
/* Create an RTF_CACHE clone of @ort pinned to the exact /128 destination
 * @daddr (and, with subtrees, /128 source @saddr).  If @ort is itself a
 * clone, climb to its parent first so the new clone hangs off the fib route.
 */
939 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
940 const struct in6_addr *daddr,
941 const struct in6_addr *saddr)
949 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
950 ort = (struct rt6_info *)ort->dst.from;
952 rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
957 ip6_rt_copy_init(rt, ort);
958 rt->rt6i_flags |= RTF_CACHE;
960 rt->dst.flags |= DST_HOST;
961 rt->rt6i_dst.addr = *daddr;
962 rt->rt6i_dst.plen = 128;
964 if (!rt6_is_gw_or_nonexthop(ort)) {
/* Cloning a non-host route to its own prefix address: the clone is an
 * anycast destination.
 */
965 if (ort->rt6i_dst.plen != 128 &&
966 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
967 rt->rt6i_flags |= RTF_ANYCAST;
968 #ifdef CONFIG_IPV6_SUBTREES
969 if (rt->rt6i_src.plen && saddr) {
970 rt->rt6i_src.addr = *saddr;
971 rt->rt6i_src.plen = 128;
/* Create a per-cpu (RTF_PCPU) clone of @rt sharing its device and flags. */
979 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
981 struct rt6_info *pcpu_rt;
983 pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
984 rt->dst.dev, rt->dst.flags);
988 ip6_rt_copy_init(pcpu_rt, rt);
989 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
990 pcpu_rt->rt6i_flags |= RTF_PCPU;
994 /* It should be called with read_lock_bh(&tb6_lock) acquired */
/* Return this CPU's cached clone of @rt, taking a reference and syncing its
 * metrics from the parent; the NULL-slot case is handled by the caller.
 */
995 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
997 struct rt6_info *pcpu_rt, **p;
999 p = this_cpu_ptr(rt->rt6i_pcpu);
1003 dst_hold(&pcpu_rt->dst);
1004 rt6_dst_from_metrics_check(pcpu_rt);
/* Allocate and install a per-cpu clone for @rt.  Must be called without
 * tb6_lock (allocation may trigger GC which takes the write lock); retakes
 * the read lock to publish.  Falls back to the null entry on OOM.
 */
1009 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1011 struct fib6_table *table = rt->rt6i_table;
1012 struct rt6_info *pcpu_rt, *prev, **p;
1014 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1016 struct net *net = dev_net(rt->dst.dev);
1018 dst_hold(&net->ipv6.ip6_null_entry->dst);
1019 return net->ipv6.ip6_null_entry;
1022 read_lock_bh(&table->tb6_lock);
/* rt6i_pcpu still present means rt is still in the tree; race-free
 * publish via cmpxchg on this CPU's slot.
 */
1023 if (rt->rt6i_pcpu) {
1024 p = this_cpu_ptr(rt->rt6i_pcpu);
1025 prev = cmpxchg(p, NULL, pcpu_rt);
1027 /* If someone did it before us, return prev instead */
1028 dst_destroy(&pcpu_rt->dst);
1032 /* rt has been removed from the fib6 tree
1033 * before we have a chance to acquire the read_lock.
1034 * In this case, don't brother to create a pcpu rt
1035 * since rt is going away anyway. The next
1036 * dst_check() will trigger a re-lookup.
1038 dst_destroy(&pcpu_rt->dst);
1041 dst_hold(&pcpu_rt->dst);
1042 rt6_dst_from_metrics_check(pcpu_rt);
1043 read_unlock_bh(&table->tb6_lock);
/* The main policy-routing lookup: walk the fib6 tree with rt6_select(),
 * backtracking and relaxing RT6_LOOKUP_F_REACHABLE as needed, then hand back
 * one of three result flavours: the fib entry itself (null/RTF_CACHE), a
 * one-off uncached RTF_CACHE clone (KNOWN_NH without gateway), or this CPU's
 * RTF_PCPU clone.
 */
1047 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1048 int oif, struct flowi6 *fl6, int flags)
1050 struct fib6_node *fn, *saved_fn;
1051 struct rt6_info *rt;
1054 strict |= flags & RT6_LOOKUP_F_IFACE;
/* When forwarding is off we are a host: RFC 4861 says prefer reachable
 * routers, so start strict and relax later if nothing matches.
 */
1055 if (net->ipv6.devconf_all->forwarding == 0)
1056 strict |= RT6_LOOKUP_F_REACHABLE;
1058 read_lock_bh(&table->tb6_lock);
1060 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1063 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1067 rt = rt6_select(fn, oif, strict);
1068 if (rt->rt6i_nsiblings)
1069 rt = rt6_multipath_select(rt, fl6, oif, strict);
1070 if (rt == net->ipv6.ip6_null_entry) {
1071 fn = fib6_backtrack(fn, &fl6->saddr);
1073 goto redo_rt6_select;
1074 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1075 /* also consider unreachable route */
1076 strict &= ~RT6_LOOKUP_F_REACHABLE;
1078 goto redo_rt6_select;
/* Case 1: null entry or an already-cached clone — return as-is. */
1083 if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1084 dst_use(&rt->dst, jiffies);
1085 read_unlock_bh(&table->tb6_lock);
1087 rt6_dst_from_metrics_check(rt);
1089 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1091 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1092 !(rt->rt6i_flags & RTF_GATEWAY))) {
1093 /* Create a RTF_CACHE clone which will not be
1094 * owned by the fib6 tree. It is for the special case where
1095 * the daddr in the skb during the neighbor look-up is different
1096 * from the fl6->daddr used to look-up route here.
1099 struct rt6_info *uncached_rt;
1101 dst_use(&rt->dst, jiffies);
1102 read_unlock_bh(&table->tb6_lock);
1104 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1105 dst_release(&rt->dst);
/* Track the orphan clone so device-down can retarget it. */
1108 rt6_uncached_list_add(uncached_rt);
1110 uncached_rt = net->ipv6.ip6_null_entry;
1112 dst_hold(&uncached_rt->dst);
1114 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1118 /* Get a percpu copy */
1120 struct rt6_info *pcpu_rt;
1122 rt->dst.lastuse = jiffies;
1124 pcpu_rt = rt6_get_pcpu_route(rt);
1127 read_unlock_bh(&table->tb6_lock);
1129 /* We have to do the read_unlock first
1130 * because rt6_make_pcpu_route() may trigger
1131 * ip6_dst_gc() which will take the write_lock.
1134 read_unlock_bh(&table->tb6_lock);
1135 pcpu_rt = rt6_make_pcpu_route(rt);
1136 dst_release(&rt->dst);
1139 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1144 EXPORT_SYMBOL_GPL(ip6_pol_route);
/* Input-path adapter: route by the incoming interface (flowi6_iif). */
1146 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1147 struct flowi6 *fl6, int flags)
1149 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/* Input lookup through the policy-rule framework; link-local/multicast
 * destinations force strict interface matching (except PIM register devices).
 */
1152 struct dst_entry *ip6_route_input_lookup(struct net *net,
1153 struct net_device *dev,
1154 struct flowi6 *fl6, int flags)
1156 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1157 flags |= RT6_LOOKUP_F_IFACE;
1159 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1161 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
/* Route an incoming skb: build the flowi6 from its IPv6 header (plus any rx
 * tunnel key) and attach the resulting dst to the skb.
 */
1163 void ip6_route_input(struct sk_buff *skb)
1165 const struct ipv6hdr *iph = ipv6_hdr(skb);
1166 struct net *net = dev_net(skb->dev);
1167 int flags = RT6_LOOKUP_F_HAS_SADDR;
1168 struct ip_tunnel_info *tun_info;
1169 struct flowi6 fl6 = {
1170 .flowi6_iif = skb->dev->ifindex,
1171 .daddr = iph->daddr,
1172 .saddr = iph->saddr,
1173 .flowlabel = ip6_flowinfo(iph),
1174 .flowi6_mark = skb->mark,
1175 .flowi6_proto = iph->nexthdr,
/* Propagate a receive-side tunnel id into the flow key for lwtunnel
 * route matching.
 */
1178 tun_info = skb_tunnel_info(skb);
1179 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1180 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1182 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/* Output-path adapter: route by the outgoing interface (flowi6_oif). */
1185 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1186 struct flowi6 *fl6, int flags)
1188 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/* Output route lookup for locally generated traffic.  Link-local and
 * multicast destinations go through the l3mdev link-scope path; otherwise the
 * flow runs through the policy rules with flags derived from the socket.
 */
1191 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1192 struct flowi6 *fl6, int flags)
1196 if (rt6_need_strict(&fl6->daddr)) {
1197 struct dst_entry *dst;
1199 dst = l3mdev_link_scope_lookup(net, fl6);
/* Locally generated traffic "arrives" on loopback. */
1204 fl6->flowi6_iif = LOOPBACK_IFINDEX;
1206 any_src = ipv6_addr_any(&fl6->saddr);
/* Strict interface match when the socket is bound to a device, the
 * destination needs it, or an oif is set without a source address.
 */
1207 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1208 (fl6->flowi6_oif && any_src))
1209 flags |= RT6_LOOKUP_F_IFACE;
1212 flags |= RT6_LOOKUP_F_HAS_SADDR;
1214 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1216 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1218 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
/* Clone @dst_orig into a "blackhole" dst that discards all traffic but keeps
 * the original's metrics, idev, gateway and keys (used e.g. while an xfrm
 * bundle is being resolved).  Consumes the reference on @dst_orig.
 */
1220 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1222 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1223 struct dst_entry *new = NULL;
1225 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1231 new->input = dst_discard;
1232 new->output = dst_discard_out;
1234 dst_copy_metrics(new, &ort->dst);
1235 rt->rt6i_idev = ort->rt6i_idev;
1237 in6_dev_hold(rt->rt6i_idev);
1239 rt->rt6i_gateway = ort->rt6i_gateway;
/* The clone is not a per-cpu route even if the original was. */
1240 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1241 rt->rt6i_metric = 0;
1243 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1244 #ifdef CONFIG_IPV6_SUBTREES
1245 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1251 dst_release(dst_orig);
1252 return new ? new : ERR_PTR(-ENOMEM);
1256 * Destination cache support functions
/* Keep a clone's metrics pointer in sync with its parent's (the parent may
 * have COW'd its metrics since the clone was created).
 */
1259 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1262 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1263 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
/* Validate a tree-owned route: still linked into a fib6 node, cookie (tree
 * serial number) unchanged, and not expired.
 */
1266 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1268 if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1271 if (rt6_check_expired(rt))
/* Validate a clone by checking itself for expiry and then validating its
 * parent (dst.from) against the cookie.
 */
1277 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1279 if (!__rt6_check_expired(rt) &&
1280 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1281 rt6_check((struct rt6_info *)(rt->dst.from), cookie))
/* dst_ops->check: dispatch to the clone or tree validation path. */
1287 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1289 struct rt6_info *rt;
1291 rt = (struct rt6_info *) dst;
1293 /* All IPV6 dsts are created with ->obsolete set to the value
1294 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1295 * into this function always.
1298 rt6_dst_from_metrics_check(rt);
1300 if (rt->rt6i_flags & RTF_PCPU ||
1301 (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1302 return rt6_dst_from_check(rt, cookie);
1304 return rt6_check(rt, cookie);
/* dst_ops->negative_advice: sockets call this when a dst misbehaves; expired
 * cache entries are dropped (handling partly elided in this view).
 */
1307 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1309 struct rt6_info *rt = (struct rt6_info *) dst;
1312 if (rt->rt6i_flags & RTF_CACHE) {
1313 if (rt6_check_expired(rt)) {
/* dst_ops->link_failure: report address-unreachable to the sender and
 * invalidate the route — expire cache entries, bump the node serial for
 * default routes so cached users re-look-up.
 */
1325 static void ip6_link_failure(struct sk_buff *skb)
1327 struct rt6_info *rt;
1329 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1331 rt = (struct rt6_info *) skb_dst(skb);
1333 if (rt->rt6i_flags & RTF_CACHE) {
1336 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1337 rt->rt6i_node->fn_sernum = -1;
/* Record a learned path MTU on @rt and arm the sysctl-controlled expiry. */
1342 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1344 struct net *net = dev_net(rt->dst.dev);
1346 rt->rt6i_flags |= RTF_MODIFIED;
1347 rt->rt6i_pmtu = mtu;
1348 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* A PMTU update may be stored directly on @rt only if it is not already a
 * cache entry but is a pcpu clone or tree-linked route; otherwise a separate
 * RTF_CACHE clone must carry it.
 */
1351 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1353 return !(rt->rt6i_flags & RTF_CACHE) &&
1354 (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
/* Core PMTU handler: clamp @mtu to IPV6_MIN_MTU, ignore increases and local
 * routes, then either update @dst in place or spawn an RTF_CACHE clone (keyed
 * by the packet/socket addresses) carrying the new MTU.
 */
1357 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1358 const struct ipv6hdr *iph, u32 mtu)
1360 struct rt6_info *rt6 = (struct rt6_info *)dst;
1362 if (rt6->rt6i_flags & RTF_LOCAL)
1366 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
/* Only ever shrink the path MTU. */
1367 if (mtu >= dst_mtu(dst))
1370 if (!rt6_cache_allowed_for_pmtu(rt6)) {
1371 rt6_do_update_pmtu(rt6, mtu);
1373 const struct in6_addr *daddr, *saddr;
1374 struct rt6_info *nrt6;
/* Address pair comes from the triggering packet header when present,
 * otherwise from the connected socket.
 */
1377 daddr = &iph->daddr;
1378 saddr = &iph->saddr;
1380 daddr = &sk->sk_v6_daddr;
1381 saddr = &inet6_sk(sk)->saddr;
1385 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1387 rt6_do_update_pmtu(nrt6, mtu);
1389 /* ip6_ins_rt(nrt6) will bump the
1390 * rt6->rt6i_node->fn_sernum
1391 * which will fail the next rt6_check() and
1392 * invalidate the sk->sk_dst_cache.
/* dst_ops->update_pmtu: thin wrapper feeding the skb's IPv6 header (if any)
 * into the core handler.
 */
1399 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1400 struct sk_buff *skb, u32 mtu)
1402 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
/* Handle an ICMPv6 Packet Too Big for a flow we can reconstruct from the
 * embedded packet header: look up the route and apply the new MTU.
 */
1405 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1408 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1409 struct dst_entry *dst;
1412 memset(&fl6, 0, sizeof(fl6));
1413 fl6.flowi6_oif = oif;
1414 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1415 fl6.daddr = iph->daddr;
1416 fl6.saddr = iph->saddr;
1417 fl6.flowlabel = ip6_flowinfo(iph);
1419 dst = ip6_route_output(net, NULL, &fl6);
1421 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1424 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* Socket-aware PMTU update: apply the new MTU for the socket's flow, then, if
 * the socket's cached dst was invalidated by it, refresh the (datagram)
 * socket's route.
 */
1426 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1428 struct dst_entry *dst;
1430 ip6_update_pmtu(skb, sock_net(sk), mtu,
1431 sk->sk_bound_dev_if, sk->sk_mark);
1433 dst = __sk_dst_get(sk);
/* Cached dst still validates against the socket cookie: nothing to do. */
1434 if (!dst || !dst->obsolete ||
1435 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
/* Re-route only when safe: socket not owned by user context and not an
 * IPv4-mapped destination.
 */
1439 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1440 ip6_datagram_dst_update(sk, false);
1443 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
/* ip6rd_flowi: flowi6 extended with the redirecting router's address, so the
 * table-lookup callback below can validate the redirect source. */
1445 /* Handle redirects */
1446 struct ip6rd_flowi {
1448 struct in6_addr gateway;
/* Table-lookup callback for redirect processing: walk the fib6 node for the
 * destination and accept only a non-expired gateway route whose outgoing
 * device and gateway match the redirect source (per RFC 4861, redirects are
 * only accepted from the current next hop). Falls back via fib6_backtrack()
 * when only the null entry matches.
 * NOTE(review): lines missing in this extraction; code kept byte-identical. */
1451 static struct rt6_info *__ip6_route_redirect(struct net *net,
1452 struct fib6_table *table,
1456 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1457 struct rt6_info *rt;
1458 struct fib6_node *fn;
1460 /* Get the "current" route for this destination and
1461 * check if the redirect has come from approriate router.
1463 * RFC 4861 specifies that redirects should only be
1464 * accepted if they come from the nexthop to the target.
1465 * Due to the way the routes are chosen, this notion
1466 * is a bit fuzzy and one might need to check all possible
1470 read_lock_bh(&table->tb6_lock);
1471 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1473 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1474 if (rt6_check_expired(rt))
1478 if (!(rt->rt6i_flags & RTF_GATEWAY))
1480 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1482 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1488 rt = net->ipv6.ip6_null_entry;
1489 else if (rt->dst.error) {
1490 rt = net->ipv6.ip6_null_entry;
1494 if (rt == net->ipv6.ip6_null_entry) {
1495 fn = fib6_backtrack(fn, &fl6->saddr);
1503 read_unlock_bh(&table->tb6_lock);
1505 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
/* Wrap a flow + redirecting-gateway pair into an ip6rd_flowi and run the
 * policy-routing lookup with __ip6_route_redirect as the table callback.
 * NOTE(review): lines missing in this extraction; code kept byte-identical. */
1509 static struct dst_entry *ip6_route_redirect(struct net *net,
1510 const struct flowi6 *fl6,
1511 const struct in6_addr *gateway)
1513 int flags = RT6_LOOKUP_F_HAS_SADDR;
1514 struct ip6rd_flowi rdfl;
1517 rdfl.gateway = *gateway;
1519 return fib6_rule_lookup(net, &rdfl.fl6,
1520 flags, __ip6_route_redirect);
/* Process an ICMPv6 redirect for the flow described by skb's inner IPv6
 * header: build the flow, find the affected route via ip6_route_redirect()
 * (gateway = the redirect's source address), and apply it with
 * rt6_do_redirect().
 * NOTE(review): lines missing in this extraction; code kept byte-identical. */
1523 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1525 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1526 struct dst_entry *dst;
1529 memset(&fl6, 0, sizeof(fl6));
1530 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1531 fl6.flowi6_oif = oif;
1532 fl6.flowi6_mark = mark;
1533 fl6.daddr = iph->daddr;
1534 fl6.saddr = iph->saddr;
1535 fl6.flowlabel = ip6_flowinfo(iph);
1537 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1538 rt6_do_redirect(dst, NULL, skb);
1541 EXPORT_SYMBOL_GPL(ip6_redirect);
/* Variant of ip6_redirect() used when the redirect carries no embedded
 * packet header: take the destination from the rd_msg itself and the source
 * from the outer IPv6 header.
 * NOTE(review): lines missing in this extraction; code kept byte-identical. */
1543 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1546 const struct ipv6hdr *iph = ipv6_hdr(skb);
1547 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1548 struct dst_entry *dst;
1551 memset(&fl6, 0, sizeof(fl6));
1552 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1553 fl6.flowi6_oif = oif;
1554 fl6.flowi6_mark = mark;
1555 fl6.daddr = msg->dest;
1556 fl6.saddr = iph->daddr;
1558 dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1559 rt6_do_redirect(dst, NULL, skb);
/* Socket convenience wrapper: redirect using the socket's bound device
 * and mark. */
1563 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1565 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1567 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* dst_ops->default_advmss hook: advertised MSS = path MTU minus IPv6 and TCP
 * header sizes, clamped below by the ip6_rt_min_advmss sysctl and above by
 * the maximal non-jumbo payload (see comment in the body).
 * NOTE(review): lines missing in this extraction; code kept byte-identical. */
1569 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1571 struct net_device *dev = dst->dev;
1572 unsigned int mtu = dst_mtu(dst);
1573 struct net *net = dev_net(dev);
1575 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1577 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1578 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1581 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1582 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1583 * IPV6_MAXPLEN is also valid and means: "any MSS,
1584 * rely only on pmtu discovery"
1586 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* dst_ops->mtu hook: prefer the per-route learned PMTU (rt6i_pmtu), then the
 * RTAX_MTU metric, then the inet6_dev's configured mtu6; cap at IP6_MAX_MTU
 * and subtract any lwtunnel encapsulation headroom.
 * NOTE(review): lines missing in this extraction; code kept byte-identical. */
1591 static unsigned int ip6_mtu(const struct dst_entry *dst)
1593 const struct rt6_info *rt = (const struct rt6_info *)dst;
1594 unsigned int mtu = rt->rt6i_pmtu;
1595 struct inet6_dev *idev;
1600 mtu = dst_metric_raw(dst, RTAX_MTU);
1607 idev = __in6_dev_get(dst->dev);
1609 mtu = idev->cnf.mtu6;
1613 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1615 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
/* Singly linked list of ICMPv6 dst entries awaiting GC, protected by
 * icmp6_dst_lock (BH-safe spinlock — list is touched from softirq context). */
1618 static struct dst_entry *icmp6_dst_gc_list;
1619 static DEFINE_SPINLOCK(icmp6_dst_lock);
/* Allocate a standalone host route for sending an ICMPv6 packet toward
 * fl6->daddr on dev. The route is not inserted into any fib6 table; instead
 * it is chained onto icmp6_dst_gc_list and reaped by icmp6_dst_gc(). Returns
 * ERR_PTR on failure; the result is passed through xfrm_lookup() before
 * being returned.
 * NOTE(review): lines missing in this extraction (error labels, returns);
 * code kept byte-identical. */
1621 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1624 struct dst_entry *dst;
1625 struct rt6_info *rt;
1626 struct inet6_dev *idev = in6_dev_get(dev);
1627 struct net *net = dev_net(dev);
1629 if (unlikely(!idev))
1630 return ERR_PTR(-ENODEV);
1632 rt = ip6_dst_alloc(net, dev, 0);
1633 if (unlikely(!rt)) {
1635 dst = ERR_PTR(-ENOMEM);
1639 rt->dst.flags |= DST_HOST;
1640 rt->dst.output = ip6_output;
1641 atomic_set(&rt->dst.__refcnt, 1);
1642 rt->rt6i_gateway = fl6->daddr;
1643 rt->rt6i_dst.addr = fl6->daddr;
1644 rt->rt6i_dst.plen = 128;
1645 rt->rt6i_idev = idev;
/* HOPLIMIT 0 here: presumably "use default"; TODO confirm metric semantics. */
1646 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1648 spin_lock_bh(&icmp6_dst_lock);
1649 rt->dst.next = icmp6_dst_gc_list;
1650 icmp6_dst_gc_list = &rt->dst;
1651 spin_unlock_bh(&icmp6_dst_lock);
1653 fib6_force_start_gc(net);
1655 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* Reap entries on icmp6_dst_gc_list whose refcount has dropped to zero.
 * NOTE(review): the unlink/free statements are missing from this extraction;
 * code kept byte-identical. */
1661 int icmp6_dst_gc(void)
1663 struct dst_entry *dst, **pprev;
1666 spin_lock_bh(&icmp6_dst_lock);
1667 pprev = &icmp6_dst_gc_list;
1669 while ((dst = *pprev) != NULL) {
1670 if (!atomic_read(&dst->__refcnt)) {
1679 spin_unlock_bh(&icmp6_dst_lock);
/* Apply func(rt, arg) to every entry on icmp6_dst_gc_list; the action taken
 * when func returns nonzero is in lines missing from this extraction. */
1684 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1687 struct dst_entry *dst, **pprev;
1689 spin_lock_bh(&icmp6_dst_lock);
1690 pprev = &icmp6_dst_gc_list;
1691 while ((dst = *pprev) != NULL) {
1692 struct rt6_info *rt = (struct rt6_info *) dst;
1693 if (func(rt, arg)) {
1700 spin_unlock_bh(&icmp6_dst_lock);
/* dst_ops->gc hook: run fib6 garbage collection when the entry count exceeds
 * ip6_rt_max_size or the minimum GC interval has elapsed. ip6_rt_gc_expire
 * grows on each pass and decays by the elasticity factor, so GC becomes more
 * aggressive under sustained pressure. Returns nonzero while still over the
 * size limit.
 * NOTE(review): lines missing in this extraction; code kept byte-identical. */
1703 static int ip6_dst_gc(struct dst_ops *ops)
1705 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1706 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1707 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1708 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1709 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1710 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1713 entries = dst_entries_get_fast(ops);
1714 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1715 entries <= rt_max_size)
1718 net->ipv6.ip6_rt_gc_expire++;
1719 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1720 entries = dst_entries_get_slow(ops);
1721 if (entries < ops->gc_thresh)
1722 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1724 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1725 return entries > rt_max_size;
/* Convert the netlink RTA_METRICS attributes in cfg->fc_mx into a kernel
 * metrics array for mxc. RTAX_CC_ALGO is resolved from a congestion-control
 * name to its key; RTAX_HOPLIMIT is range-checked (<= 255) and RTAX_FEATURES
 * masked against RTAX_FEATURE_MASK. When the chosen CC algorithm requires
 * ECN (ecn_ca), DST_FEATURE_ECN_CA is folded into RTAX_FEATURES.
 * NOTE(review): allocation-failure and error paths are missing from this
 * extraction; code kept byte-identical. */
1728 static int ip6_convert_metrics(struct mx6_config *mxc,
1729 const struct fib6_config *cfg)
1731 bool ecn_ca = false;
1739 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1743 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1744 int type = nla_type(nla);
1749 if (unlikely(type > RTAX_MAX))
1752 if (type == RTAX_CC_ALGO) {
1753 char tmp[TCP_CA_NAME_MAX];
1755 nla_strlcpy(tmp, nla, sizeof(tmp));
1756 val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1757 if (val == TCP_CA_UNSPEC)
1760 val = nla_get_u32(nla);
1762 if (type == RTAX_HOPLIMIT && val > 255)
1764 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1768 __set_bit(type - 1, mxc->mx_valid);
1772 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1773 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
/* Resolve a next-hop gateway address within the specific table named by
 * cfg->fc_table (used when validating a new route's gateway). Honors the
 * configured output interface and preferred source; on a null-entry result
 * the caller falls back to a full lookup (see comment at line 1805).
 * NOTE(review): lines missing in this extraction; code kept byte-identical. */
1783 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1784 struct fib6_config *cfg,
1785 const struct in6_addr *gw_addr)
1787 struct flowi6 fl6 = {
1788 .flowi6_oif = cfg->fc_ifindex,
1790 .saddr = cfg->fc_prefsrc,
1792 struct fib6_table *table;
1793 struct rt6_info *rt;
1794 int flags = RT6_LOOKUP_F_IFACE;
1796 table = fib6_get_table(net, cfg->fc_table);
1800 if (!ipv6_addr_any(&cfg->fc_prefsrc))
1801 flags |= RT6_LOOKUP_F_HAS_SADDR;
1803 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1805 /* if table lookup failed, fall back to full lookup */
1806 if (rt == net->ipv6.ip6_null_entry) {
1814 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
/* Build (but do not insert) a rt6_info from a fib6_config: validate prefix
 * lengths, resolve device/table, set expiry/protocol/input-output handlers,
 * lwtunnel state, destination/source prefixes, reject-route promotion,
 * gateway validation, and preferred source address. Returns the new route
 * or ERR_PTR(err).
 * NOTE(review): many interior lines (error checks, gotos, returns) are
 * missing from this extraction; code kept byte-identical, including the
 * stray numeric line prefixes. */
1814 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
1816 struct net *net = cfg->fc_nlinfo.nl_net;
1817 struct rt6_info *rt = NULL;
1818 struct net_device *dev = NULL;
1819 struct inet6_dev *idev = NULL;
1820 struct fib6_table *table;
/* Prefix lengths beyond 128 bits are invalid for IPv6. */
1824 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1826 #ifndef CONFIG_IPV6_SUBTREES
/* Source-prefix routes require subtree support. */
1827 if (cfg->fc_src_len)
1830 if (cfg->fc_ifindex) {
1832 dev = dev_get_by_index(net, cfg->fc_ifindex);
1835 idev = in6_dev_get(dev);
1840 if (cfg->fc_metric == 0)
1841 cfg->fc_metric = IP6_RT_PRIO_USER;
/* Without NLM_F_CREATE, only look up an existing table; warn (once the
 * lookup fails) but still create the table for compatibility. */
1844 if (cfg->fc_nlinfo.nlh &&
1845 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1846 table = fib6_get_table(net, cfg->fc_table);
1848 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1849 table = fib6_new_table(net, cfg->fc_table);
1852 table = fib6_new_table(net, cfg->fc_table);
/* Addrconf routes are counted against the dst entry limit; others not. */
1858 rt = ip6_dst_alloc(net, NULL,
1859 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1866 if (cfg->fc_flags & RTF_EXPIRES)
1867 rt6_set_expires(rt, jiffies +
1868 clock_t_to_jiffies(cfg->fc_expires));
1870 rt6_clean_expires(rt);
1872 if (cfg->fc_protocol == RTPROT_UNSPEC)
1873 cfg->fc_protocol = RTPROT_BOOT;
1874 rt->rt6i_protocol = cfg->fc_protocol;
1876 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* Select the input handler by destination class: multicast, local, or
 * forwarded traffic. */
1878 if (addr_type & IPV6_ADDR_MULTICAST)
1879 rt->dst.input = ip6_mc_input;
1880 else if (cfg->fc_flags & RTF_LOCAL)
1881 rt->dst.input = ip6_input;
1883 rt->dst.input = ip6_forward;
1885 rt->dst.output = ip6_output;
/* Lightweight tunnel encapsulation: build state and redirect the output
 * and/or input paths through the lwtunnel handlers. */
1887 if (cfg->fc_encap) {
1888 struct lwtunnel_state *lwtstate;
1890 err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1891 cfg->fc_encap, AF_INET6, cfg,
1895 rt->dst.lwtstate = lwtstate_get(lwtstate);
1896 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1897 rt->dst.lwtstate->orig_output = rt->dst.output;
1898 rt->dst.output = lwtunnel_output;
1900 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1901 rt->dst.lwtstate->orig_input = rt->dst.input;
1902 rt->dst.input = lwtunnel_input;
1906 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1907 rt->rt6i_dst.plen = cfg->fc_dst_len;
1908 if (rt->rt6i_dst.plen == 128)
1909 rt->dst.flags |= DST_HOST;
1911 #ifdef CONFIG_IPV6_SUBTREES
1912 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1913 rt->rt6i_src.plen = cfg->fc_src_len;
1916 rt->rt6i_metric = cfg->fc_metric;
1918 /* We cannot add true routes via loopback here,
1919 they would result in kernel looping; promote them to reject routes
1921 if ((cfg->fc_flags & RTF_REJECT) ||
1922 (dev && (dev->flags & IFF_LOOPBACK) &&
1923 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1924 !(cfg->fc_flags & RTF_LOCAL))) {
1925 /* hold loopback dev/idev if we haven't done so. */
1926 if (dev != net->loopback_dev) {
1931 dev = net->loopback_dev;
1933 idev = in6_dev_get(dev);
1939 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
/* Map the route type to the dst error code and discard handlers. */
1940 switch (cfg->fc_type) {
1942 rt->dst.error = -EINVAL;
1943 rt->dst.output = dst_discard_out;
1944 rt->dst.input = dst_discard;
1947 rt->dst.error = -EACCES;
1948 rt->dst.output = ip6_pkt_prohibit_out;
1949 rt->dst.input = ip6_pkt_prohibit;
1952 case RTN_UNREACHABLE:
1954 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1955 : (cfg->fc_type == RTN_UNREACHABLE)
1956 ? -EHOSTUNREACH : -ENETUNREACH;
1957 rt->dst.output = ip6_pkt_discard_out;
1958 rt->dst.input = ip6_pkt_discard;
/* Gateway routes: validate the next-hop address is usable (not a local
 * address, unicast, reachable via some route). */
1964 if (cfg->fc_flags & RTF_GATEWAY) {
1965 const struct in6_addr *gw_addr;
1968 gw_addr = &cfg->fc_gateway;
1969 gwa_type = ipv6_addr_type(gw_addr);
1971 /* if gw_addr is local we will fail to detect this in case
1972 * address is still TENTATIVE (DAD in progress). rt6_lookup()
1973 * will return already-added prefix route via interface that
1974 * prefix route was assigned to, which might be non-loopback.
1977 if (ipv6_chk_addr_and_flags(net, gw_addr,
1978 gwa_type & IPV6_ADDR_LINKLOCAL ?
1982 rt->rt6i_gateway = *gw_addr;
1984 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1985 struct rt6_info *grt = NULL;
1987 /* IPv6 strictly inhibits using not link-local
1988 addresses as nexthop address.
1989 Otherwise, router will not able to send redirects.
1990 It is very good, but in some (rare!) circumstances
1991 (SIT, PtP, NBMA NOARP links) it is handy to allow
1992 some exceptions. --ANK
1994 if (!(gwa_type & IPV6_ADDR_UNICAST))
/* Resolve the gateway in the route's own table first, then fall back
 * to a global rt6_lookup(). */
1997 if (cfg->fc_table) {
1998 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2001 if (grt->rt6i_flags & RTF_GATEWAY ||
2002 (dev && dev != grt->dst.dev)) {
2010 grt = rt6_lookup(net, gw_addr, NULL,
2011 cfg->fc_ifindex, 1);
2013 err = -EHOSTUNREACH;
2017 if (dev != grt->dst.dev) {
2023 idev = grt->rt6i_idev;
2025 in6_dev_hold(grt->rt6i_idev);
2027 if (!(grt->rt6i_flags & RTF_GATEWAY))
2035 if (!dev || (dev->flags & IFF_LOOPBACK))
/* Preferred source address must actually be assigned to the device. */
2043 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2044 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2048 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2049 rt->rt6i_prefsrc.plen = 128;
2051 rt->rt6i_prefsrc.plen = 0;
2053 rt->rt6i_flags = cfg->fc_flags;
2057 rt->rt6i_idev = idev;
2058 rt->rt6i_table = table;
2060 cfg->fc_nlinfo.nl_net = dev_net(dev);
2071 return ERR_PTR(err);
/* Public entry point for adding a route: create the rt6_info from cfg,
 * convert netlink metrics, then insert with __ip6_ins_rt().
 * NOTE(review): error/cleanup lines missing in this extraction; code kept
 * byte-identical. */
2074 int ip6_route_add(struct fib6_config *cfg)
2076 struct mx6_config mxc = { .mx = NULL, };
2077 struct rt6_info *rt;
2080 rt = ip6_route_info_create(cfg);
2087 err = ip6_convert_metrics(&mxc, cfg);
2091 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
/* Delete rt from its fib6 table under the table write lock. The null entry
 * and uncached (DST_NOCACHE) routes are rejected — they are not in a table.
 * NOTE(review): lines missing in this extraction; code kept byte-identical. */
2103 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2106 struct fib6_table *table;
2107 struct net *net = dev_net(rt->dst.dev);
2109 if (rt == net->ipv6.ip6_null_entry ||
2110 rt->dst.flags & DST_NOCACHE) {
2115 table = rt->rt6i_table;
2116 write_lock_bh(&table->tb6_lock);
2117 err = fib6_del(rt, info);
2118 write_unlock_bh(&table->tb6_lock);
/* Convenience wrapper: delete with a default nl_info carrying only the
 * route's network namespace. */
2125 int ip6_del_rt(struct rt6_info *rt)
2127 struct nl_info info = {
2128 .nl_net = dev_net(rt->dst.dev),
2130 return __ip6_del_rt(rt, &info);
/* Delete a route matching cfg: locate the fib6 node for the dst/src prefixes
 * and scan its leaf list for a route whose cache flag, interface, gateway,
 * and metric all match the request; delete the match via __ip6_del_rt().
 * RTF_CACHE entries are skipped unless the request asks for cached routes.
 * NOTE(review): lines missing in this extraction; code kept byte-identical. */
2133 static int ip6_route_del(struct fib6_config *cfg)
2135 struct fib6_table *table;
2136 struct fib6_node *fn;
2137 struct rt6_info *rt;
2140 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2144 read_lock_bh(&table->tb6_lock);
2146 fn = fib6_locate(&table->tb6_root,
2147 &cfg->fc_dst, cfg->fc_dst_len,
2148 &cfg->fc_src, cfg->fc_src_len);
2151 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2152 if ((rt->rt6i_flags & RTF_CACHE) &&
2153 !(cfg->fc_flags & RTF_CACHE))
2155 if (cfg->fc_ifindex &&
2157 rt->dst.dev->ifindex != cfg->fc_ifindex))
2159 if (cfg->fc_flags & RTF_GATEWAY &&
2160 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2162 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
/* Read lock is dropped before deleting: __ip6_del_rt() takes the write
 * lock itself. Presumably a reference is held across the gap — the
 * dst_hold is in lines missing from this extraction; TODO confirm. */
2165 read_unlock_bh(&table->tb6_lock);
2167 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2170 read_unlock_bh(&table->tb6_lock);
/* Core ICMPv6 redirect handler: validate the rd_msg (length, non-multicast
 * destination, link-local unicast target per RFC 4861), check the interface
 * accepts redirects, parse ND options for a target link-layer address,
 * update the neighbour cache, clone a cache route toward the new gateway,
 * insert it, and fire the NETEVENT_REDIRECT notifier.
 * NOTE(review): several interior lines (returns, on_link assignment, gotos)
 * are missing from this extraction; code kept byte-identical. */
2175 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2177 struct netevent_redirect netevent;
2178 struct rt6_info *rt, *nrt = NULL;
2179 struct ndisc_options ndopts;
2180 struct inet6_dev *in6_dev;
2181 struct neighbour *neigh;
2183 int optlen, on_link;
2186 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2187 optlen -= sizeof(*msg);
2190 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2194 msg = (struct rd_msg *)icmp6_hdr(skb);
2196 if (ipv6_addr_is_multicast(&msg->dest)) {
2197 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
/* dest == target means an on-link redirect (no gateway). */
2202 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2204 } else if (ipv6_addr_type(&msg->target) !=
2205 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2206 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2210 in6_dev = __in6_dev_get(skb->dev);
/* Forwarding interfaces and interfaces with accept_redirects disabled
 * ignore redirects. */
2213 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2217 * The IP source address of the Redirect MUST be the same as the current
2218 * first-hop router for the specified ICMP Destination Address.
2221 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2222 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2227 if (ndopts.nd_opts_tgt_lladdr) {
2228 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2231 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2236 rt = (struct rt6_info *) dst;
2237 if (rt->rt6i_flags & RTF_REJECT) {
2238 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2242 /* Redirect received -> path was valid.
2243 * Look, redirects are sent only in response to data packets,
2244 * so that this nexthop apparently is reachable. --ANK
2246 dst_confirm(&rt->dst);
2248 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2253 * We have finally decided to accept it.
/* Update the neighbour entry; an off-link redirect marks the target as a
 * router (ISROUTER flags). */
2256 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2257 NEIGH_UPDATE_F_WEAK_OVERRIDE|
2258 NEIGH_UPDATE_F_OVERRIDE|
2259 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2260 NEIGH_UPDATE_F_ISROUTER)),
2261 NDISC_REDIRECT, &ndopts);
2263 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2267 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2269 nrt->rt6i_flags &= ~RTF_GATEWAY;
2271 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2273 if (ip6_ins_rt(nrt))
2276 netevent.old = &rt->dst;
2277 netevent.new = &nrt->dst;
2278 netevent.daddr = &msg->dest;
2279 netevent.neigh = neigh;
2280 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2282 if (rt->rt6i_flags & RTF_CACHE) {
2283 rt = (struct rt6_info *) dst_clone(&rt->dst);
2288 neigh_release(neigh);
2292 * Misc support functions
/* Link rt to its parent route: hold a reference on from's dst, record it in
 * rt->dst.from, and share the parent's metrics (read-only). BUG if the
 * parent itself already has a "from" — chains are not allowed. */
2295 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2297 rt->rt6i_flags &= ~RTF_EXPIRES;
2298 BUG_ON(from->dst.from);
2300 dst_hold(&from->dst);
2301 rt->dst.from = &from->dst;
2302 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
/* Copy the routing-relevant fields of ort into a freshly allocated rt
 * (handlers, prefixes, idev with an extra hold, gateway, flags, metric,
 * prefsrc, table, lwtunnel state) and tie rt to ort via rt6_set_from().
 * NOTE(review): lines missing in this extraction; code kept byte-identical,
 * including the stray numeric line prefixes. */
2305 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2307 rt->dst.input = ort->dst.input;
2308 rt->dst.output = ort->dst.output;
2309 rt->rt6i_dst = ort->rt6i_dst;
2310 rt->dst.error = ort->dst.error;
2311 rt->rt6i_idev = ort->rt6i_idev;
2313 in6_dev_hold(rt->rt6i_idev);
2314 rt->dst.lastuse = jiffies;
2315 rt->rt6i_gateway = ort->rt6i_gateway;
2316 rt->rt6i_flags = ort->rt6i_flags;
2317 rt6_set_from(rt, ort);
2318 rt->rt6i_metric = ort->rt6i_metric;
2319 #ifdef CONFIG_IPV6_SUBTREES
2320 rt->rt6i_src = ort->rt6i_src;
2322 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2323 rt->rt6i_table = ort->rt6i_table;
2324 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2327 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an existing RA Route Information route for prefix/prefixlen learned
 * via gwaddr on dev: the route must carry both RTF_ROUTEINFO and RTF_GATEWAY
 * and match the interface and gateway.
 * NOTE(review): lines missing in this extraction; code kept byte-identical. */
2328 static struct rt6_info *rt6_get_route_info(struct net *net,
2329 const struct in6_addr *prefix, int prefixlen,
2330 const struct in6_addr *gwaddr,
2331 struct net_device *dev)
2333 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2334 int ifindex = dev->ifindex;
2335 struct fib6_node *fn;
2336 struct rt6_info *rt = NULL;
2337 struct fib6_table *table;
2339 table = fib6_get_table(net, tb_id);
2343 read_lock_bh(&table->tb6_lock);
2344 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2348 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2349 if (rt->dst.dev->ifindex != ifindex)
2351 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2353 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2359 read_unlock_bh(&table->tb6_lock);
/* Install a route learned from an RA Route Information option and return it
 * via rt6_get_route_info(). A zero prefix length is treated as a default
 * route (RTF_DEFAULT). */
2363 static struct rt6_info *rt6_add_route_info(struct net *net,
2364 const struct in6_addr *prefix, int prefixlen,
2365 const struct in6_addr *gwaddr,
2366 struct net_device *dev,
2369 struct fib6_config cfg = {
2370 .fc_metric = IP6_RT_PRIO_USER,
2371 .fc_ifindex = dev->ifindex,
2372 .fc_dst_len = prefixlen,
2373 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2374 RTF_UP | RTF_PREF(pref),
2375 .fc_nlinfo.portid = 0,
2376 .fc_nlinfo.nlh = NULL,
2377 .fc_nlinfo.nl_net = net,
2380 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
2381 cfg.fc_dst = *prefix;
2382 cfg.fc_gateway = *gwaddr;
2384 /* We should treat it as a default route if prefix length is 0. */
2386 cfg.fc_flags |= RTF_DEFAULT;
2388 ip6_route_add(&cfg);
2390 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
/* Find the RA-learned default-router route (RTF_ADDRCONF|RTF_DEFAULT) via
 * gateway addr on dev, in the device's l3mdev table or RT6_TABLE_DFLT.
 * NOTE(review): lines missing in this extraction; code kept byte-identical. */
2394 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2396 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2397 struct rt6_info *rt;
2398 struct fib6_table *table;
2400 table = fib6_get_table(dev_net(dev), tb_id);
2404 read_lock_bh(&table->tb6_lock);
2405 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2406 if (dev == rt->dst.dev &&
2407 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2408 ipv6_addr_equal(&rt->rt6i_gateway, addr))
2413 read_unlock_bh(&table->tb6_lock);
/* Install an RA-learned default router route with the given router
 * preference, mark the table as holding a default router, and return the
 * freshly looked-up route. */
2417 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2418 struct net_device *dev,
2421 struct fib6_config cfg = {
2422 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2423 .fc_metric = IP6_RT_PRIO_USER,
2424 .fc_ifindex = dev->ifindex,
2425 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2426 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2427 .fc_nlinfo.portid = 0,
2428 .fc_nlinfo.nlh = NULL,
2429 .fc_nlinfo.nl_net = dev_net(dev),
2432 cfg.fc_gateway = *gwaddr;
2434 if (!ip6_route_add(&cfg)) {
2435 struct fib6_table *table;
2437 table = fib6_get_table(dev_net(dev), cfg.fc_table);
2439 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2442 return rt6_get_dflt_router(gwaddr, dev);
/* Remove all RA-learned default/addrconf routes from one table, except on
 * interfaces with accept_ra == 2 (which keep router routes even when
 * forwarding). The read lock is dropped before each deletion and the scan
 * restarts — the restart lines are missing from this extraction.
 * Code kept byte-identical. */
2445 static void __rt6_purge_dflt_routers(struct fib6_table *table)
2447 struct rt6_info *rt;
2450 read_lock_bh(&table->tb6_lock);
2451 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2452 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2453 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2455 read_unlock_bh(&table->tb6_lock);
2460 read_unlock_bh(&table->tb6_lock);
2462 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
/* Purge default routers from every fib6 table in the namespace that has the
 * RT6_TABLE_HAS_DFLT_ROUTER flag set (RCU hash-table walk). */
2465 void rt6_purge_dflt_routers(struct net *net)
2467 struct fib6_table *table;
2468 struct hlist_head *head;
2473 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2474 head = &net->ipv6.fib_table_hash[h];
2475 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2476 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2477 __rt6_purge_dflt_routers(table);
/* Translate a legacy ioctl in6_rtmsg into a fib6_config, selecting the
 * l3mdev table for the interface when one applies.
 * NOTE(review): lines missing in this extraction; code kept byte-identical. */
2484 static void rtmsg_to_fib6_config(struct net *net,
2485 struct in6_rtmsg *rtmsg,
2486 struct fib6_config *cfg)
2488 memset(cfg, 0, sizeof(*cfg));
2490 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2492 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2493 cfg->fc_metric = rtmsg->rtmsg_metric;
2494 cfg->fc_expires = rtmsg->rtmsg_info;
2495 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2496 cfg->fc_src_len = rtmsg->rtmsg_src_len;
2497 cfg->fc_flags = rtmsg->rtmsg_flags;
2499 cfg->fc_nlinfo.nl_net = net;
2501 cfg->fc_dst = rtmsg->rtmsg_dst;
2502 cfg->fc_src = rtmsg->rtmsg_src;
2503 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/* SIOCADDRT/SIOCDELRT ioctl handler: requires CAP_NET_ADMIN in the
 * namespace, copies the in6_rtmsg from userspace, converts it to a
 * fib6_config, and dispatches to ip6_route_add()/ip6_route_del().
 * NOTE(review): lines missing in this extraction (rtnl locking, default
 * case, returns); code kept byte-identical. */
2506 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2508 struct fib6_config cfg;
2509 struct in6_rtmsg rtmsg;
2513 case SIOCADDRT: /* Add a route */
2514 case SIOCDELRT: /* Delete a route */
2515 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2517 err = copy_from_user(&rtmsg, arg,
2518 sizeof(struct in6_rtmsg));
2522 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2527 err = ip6_route_add(&cfg);
2530 err = ip6_route_del(&cfg);
2544 * Drop the packet on the floor
/* Shared drop helper: bump the appropriate SNMP counter (in-addr-errors for
 * unspecified destinations on input, otherwise the supplied no-route
 * counter), send an ICMPv6 Destination Unreachable with the given code,
 * and free the skb (free in lines missing from this extraction).
 * Code kept byte-identical. */
2547 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2550 struct dst_entry *dst = skb_dst(skb);
2551 switch (ipstats_mib_noroutes) {
2552 case IPSTATS_MIB_INNOROUTES:
2553 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2554 if (type == IPV6_ADDR_ANY) {
2555 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2556 IPSTATS_MIB_INADDRERRORS);
/* fallthrough into the OUTNOROUTES accounting for other cases. */
2560 case IPSTATS_MIB_OUTNOROUTES:
2561 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2562 ipstats_mib_noroutes);
2565 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst.input/output handlers for blackhole (NOROUTE) routes. */
2570 static int ip6_pkt_discard(struct sk_buff *skb)
2572 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2575 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2577 skb->dev = skb_dst(skb)->dev;
2578 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
/* dst.input/output handlers for prohibit (ADM_PROHIBITED) routes. */
2581 static int ip6_pkt_prohibit(struct sk_buff *skb)
2583 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2586 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2588 skb->dev = skb_dst(skb)->dev;
2589 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2593 * Allocate a dst for local (unicast / anycast) address.
/* Build a /128 local (or anycast) route for addr, normally on the loopback
 * device but on the L3 master device when the interface is enslaved and the
 * address is not link-local/multicast. The route is created with DST_NOCACHE
 * and an initial refcount of 1; it is not inserted into a table here.
 * NOTE(review): lines missing in this extraction; code kept byte-identical. */
2596 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2597 const struct in6_addr *addr,
2601 struct net *net = dev_net(idev->dev);
2602 struct net_device *dev = net->loopback_dev;
2603 struct rt6_info *rt;
2605 /* use L3 Master device as loopback for host routes if device
2606 * is enslaved and address is not link local or multicast
2608 if (!rt6_need_strict(addr))
2609 dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2611 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2613 return ERR_PTR(-ENOMEM);
2617 rt->dst.flags |= DST_HOST;
2618 rt->dst.input = ip6_input;
2619 rt->dst.output = ip6_output;
2620 rt->rt6i_idev = idev;
2622 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2624 rt->rt6i_flags |= RTF_ANYCAST;
2626 rt->rt6i_flags |= RTF_LOCAL;
2628 rt->rt6i_gateway = *addr;
2629 rt->rt6i_dst.addr = *addr;
2630 rt->rt6i_dst.plen = 128;
2631 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2632 rt->rt6i_table = fib6_get_table(net, tb_id);
2633 rt->dst.flags |= DST_NOCACHE;
2635 atomic_set(&rt->dst.__refcnt, 1);
2640 /* remove deleted ip from prefsrc entries */
/* Argument bundle for the fib6_clean_all() walk below: match routes on this
 * device (or all devices when dev is NULL) whose prefsrc equals addr. */
2641 struct arg_dev_net_ip {
2642 struct net_device *dev;
2644 struct in6_addr *addr;
/* fib6_clean_all callback: clear the preferred-source entry (plen = 0) on
 * matching routes; the null entry is always skipped. */
2647 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2649 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2650 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2651 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2653 if (((void *)rt->dst.dev == dev || !dev) &&
2654 rt != net->ipv6.ip6_null_entry &&
2655 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2656 /* remove prefsrc entry */
2657 rt->rt6i_prefsrc.plen = 0;
/* Called when the address ifp is deleted: scrub it from all routes'
 * preferred-source fields in the namespace. */
2662 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2664 struct net *net = dev_net(ifp->idev->dev);
2665 struct arg_dev_net_ip adni = {
2666 .dev = ifp->idev->dev,
2670 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
/* Flag combinations identifying RA-installed router routes and cached
 * gateway routes — both become invalid when the gateway stops being a
 * router. */
2673 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2674 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
2676 /* Remove routers and update dst entries when gateway turn into host. */
/* fib6_clean_all callback: match RA router routes or cached gateway routes
 * whose gateway equals the address that just became a host; the action on
 * match is in lines missing from this extraction. */
2677 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2679 struct in6_addr *gateway = (struct in6_addr *)arg;
2681 if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2682 ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2683 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2689 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2691 fib6_clean_all(net, fib6_clean_tohost, gateway);
/* Argument bundle for the interface-down cleanup walk below. */
2694 struct arg_dev_net {
2695 struct net_device *dev;
/* fib6_clean_all / icmp6_clean_all callback: match routes on the downed
 * device (or all devices when dev is NULL), sparing the null entry. */
2699 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2701 const struct arg_dev_net *adn = arg;
2702 const struct net_device *dev = adn->dev;
2704 if ((rt->dst.dev == dev || !dev) &&
2705 rt != adn->net->ipv6.ip6_null_entry)
/* Device teardown: purge matching routes from the fib, the ICMPv6 dst list,
 * and the per-cpu uncached route list. */
2711 void rt6_ifdown(struct net *net, struct net_device *dev)
2713 struct arg_dev_net adn = {
2718 fib6_clean_all(net, fib6_ifdown, &adn);
2719 icmp6_clean_all(fib6_ifdown, &adn);
2721 rt6_uncached_list_flush_dev(net, dev);
/* Argument bundle for the device-MTU-change walk below. */
2724 struct rt6_mtu_change_arg {
2725 struct net_device *dev;
/* fib6_clean_all callback for a device MTU change: for unlocked routes on
 * the device, lower a cached route's learned PMTU when it now exceeds the
 * new MTU, or update the RTAX_MTU metric when the route tracked the device
 * MTU (see the long RFC 1981 rationale in the preserved comments).
 * NOTE(review): lines missing in this extraction; code kept byte-identical. */
2729 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2731 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2732 struct inet6_dev *idev;
2734 /* In IPv6 pmtu discovery is not optional,
2735 so that RTAX_MTU lock cannot disable it.
2736 We still use this lock to block changes
2737 caused by addrconf/ndisc.
2740 idev = __in6_dev_get(arg->dev);
2744 /* For administrative MTU increase, there is no way to discover
2745 IPv6 PMTU increase, so PMTU increase should be updated here.
2746 Since RFC 1981 doesn't include administrative MTU increase
2747 update PMTU increase is a MUST. (i.e. jumbo frame)
2750 If new MTU is less than route PMTU, this new MTU will be the
2751 lowest MTU in the path, update the route PMTU to reflect PMTU
2752 decreases; if new MTU is greater than route PMTU, and the
2753 old MTU is the lowest MTU in the path, update the route PMTU
2754 to reflect the increase. In this case if the other nodes' MTU
2755 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2758 if (rt->dst.dev == arg->dev &&
2759 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2760 if (rt->rt6i_flags & RTF_CACHE) {
2761 /* For RTF_CACHE with rt6i_pmtu == 0
2762 * (i.e. a redirected route),
2763 * the metrics of its rt->dst.from has already
2766 if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2767 rt->rt6i_pmtu = arg->mtu;
2768 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2769 (dst_mtu(&rt->dst) < arg->mtu &&
2770 dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2771 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/* Entry point: walk every route in dev's namespace applying the update. */
2777 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2779 struct rt6_mtu_change_arg arg = {
2784 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/* Netlink attribute validation policy for IPv6 RTM_NEWROUTE/RTM_DELROUTE
 * messages, consumed by nlmsg_parse() in rtm_to_fib6_config(). */
2787 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2788 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2789 [RTA_OIF] = { .type = NLA_U32 },
2790 [RTA_IIF] = { .type = NLA_U32 },
2791 [RTA_PRIORITY] = { .type = NLA_U32 },
2792 [RTA_METRICS] = { .type = NLA_NESTED },
2793 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
2794 [RTA_PREF] = { .type = NLA_U8 },
2795 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
2796 [RTA_ENCAP] = { .type = NLA_NESTED },
2797 [RTA_EXPIRES] = { .type = NLA_U32 },
/* Parse an rtnetlink route message (rtmsg header + attributes validated by
 * rtm_ipv6_policy) into a fib6_config: route type flags, addresses, metrics,
 * multipath nexthops, router preference, encapsulation, and expiry.
 * NOTE(review): lines missing in this extraction (error paths, some
 * attribute guards); code kept byte-identical. */
2800 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2801 struct fib6_config *cfg)
2804 struct nlattr *tb[RTA_MAX+1];
2808 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2813 rtm = nlmsg_data(nlh);
2814 memset(cfg, 0, sizeof(*cfg));
2816 cfg->fc_table = rtm->rtm_table;
2817 cfg->fc_dst_len = rtm->rtm_dst_len;
2818 cfg->fc_src_len = rtm->rtm_src_len;
2819 cfg->fc_flags = RTF_UP;
2820 cfg->fc_protocol = rtm->rtm_protocol;
2821 cfg->fc_type = rtm->rtm_type;
/* Negative route types all become reject routes; the specific dst.error
 * is chosen later from fc_type (see ip6_route_info_create). */
2823 if (rtm->rtm_type == RTN_UNREACHABLE ||
2824 rtm->rtm_type == RTN_BLACKHOLE ||
2825 rtm->rtm_type == RTN_PROHIBIT ||
2826 rtm->rtm_type == RTN_THROW)
2827 cfg->fc_flags |= RTF_REJECT;
2829 if (rtm->rtm_type == RTN_LOCAL)
2830 cfg->fc_flags |= RTF_LOCAL;
2832 if (rtm->rtm_flags & RTM_F_CLONED)
2833 cfg->fc_flags |= RTF_CACHE;
2835 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2836 cfg->fc_nlinfo.nlh = nlh;
2837 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2839 if (tb[RTA_GATEWAY]) {
2840 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2841 cfg->fc_flags |= RTF_GATEWAY;
/* Destination/source prefixes are copied at their prefix length rounded
 * up to whole bytes; attribute length is checked first. */
2845 int plen = (rtm->rtm_dst_len + 7) >> 3;
2847 if (nla_len(tb[RTA_DST]) < plen)
2850 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2854 int plen = (rtm->rtm_src_len + 7) >> 3;
2856 if (nla_len(tb[RTA_SRC]) < plen)
2859 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2862 if (tb[RTA_PREFSRC])
2863 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2866 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2868 if (tb[RTA_PRIORITY])
2869 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2871 if (tb[RTA_METRICS]) {
2872 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2873 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2877 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2879 if (tb[RTA_MULTIPATH]) {
2880 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2881 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
/* Unknown router preferences are coerced to MEDIUM per RFC 4191. */
2885 pref = nla_get_u8(tb[RTA_PREF]);
2886 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2887 pref != ICMPV6_ROUTER_PREF_HIGH)
2888 pref = ICMPV6_ROUTER_PREF_MEDIUM;
2889 cfg->fc_flags |= RTF_PREF(pref);
2893 cfg->fc_encap = tb[RTA_ENCAP];
2895 if (tb[RTA_ENCAP_TYPE])
2896 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
2898 if (tb[RTA_EXPIRES]) {
2899 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
2901 if (addrconf_finite_timeout(timeout)) {
2902 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
2903 cfg->fc_flags |= RTF_EXPIRES;
/* Per-nexthop bookkeeping node used while building a multipath route:
 * the created rt6_info, the per-nexthop config, its converted metrics,
 * and the list linkage.
 * NOTE(review): the struct's opening line is missing from this extraction;
 * code kept byte-identical. */
2913 struct rt6_info *rt6_info;
2914 struct fib6_config r_cfg;
2915 struct mx6_config mxc;
2916 struct list_head next;
/* Log every nexthop of a failed multipath replace so the admin can audit
 * what was (partially) installed. */
2919 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
2923 list_for_each_entry(nh, rt6_nh_list, next) {
2924 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
2925 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
2926 nh->r_cfg.fc_ifindex);
/* Append rt to the pending-nexthop list unless an equivalent entry (same
 * device, idev, and gateway) is already present; converts the nexthop's
 * metrics into nh->mxc on the way. */
2930 static int ip6_route_info_append(struct list_head *rt6_nh_list,
2931 struct rt6_info *rt, struct fib6_config *r_cfg)
2934 struct rt6_info *rtnh;
2937 list_for_each_entry(nh, rt6_nh_list, next) {
2938 /* check if rt6_info already exists */
2939 rtnh = nh->rt6_info;
2941 if (rtnh->dst.dev == rt->dst.dev &&
2942 rtnh->rt6i_idev == rt->rt6i_idev &&
2943 ipv6_addr_equal(&rtnh->rt6i_gateway,
2948 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
2952 err = ip6_convert_metrics(&nh->mxc, r_cfg);
2957 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
2958 list_add_tail(&nh->next, rt6_nh_list);
/* Install a multipath (RTA_MULTIPATH) route in two phases:
 *  1) walk the rtnexthop array, build one rt6_info per nexthop and queue
 *     them on rt6_nh_list (so a parse failure aborts before any table
 *     change);
 *  2) insert each queued route with __ip6_ins_rt(), unwinding the ones
 *     already added if a later insert fails.
 * Several error checks/labels are elided in this excerpt.
 */
2963 static int ip6_route_multipath_add(struct fib6_config *cfg)
2965 struct fib6_config r_cfg;
2966 struct rtnexthop *rtnh;
2967 struct rt6_info *rt;
2968 struct rt6_nh *err_nh;
2969 struct rt6_nh *nh, *nh_safe;
/* NLM_F_REPLACE only applies to the first inserted nexthop; see below. */
2974 int replace = (cfg->fc_nlinfo.nlh &&
2975 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
2976 LIST_HEAD(rt6_nh_list);
2978 remaining = cfg->fc_mp_len;
2979 rtnh = (struct rtnexthop *)cfg->fc_mp;
2981 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
2982 * rt6_info structs per nexthop
2984 while (rtnh_ok(rtnh, remaining)) {
/* Each nexthop starts from a copy of the base config, then overrides
 * ifindex/gateway/encap from the per-nexthop attributes. */
2985 memcpy(&r_cfg, cfg, sizeof(*cfg));
2986 if (rtnh->rtnh_ifindex)
2987 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2989 attrlen = rtnh_attrlen(rtnh);
2991 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2993 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2995 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2996 r_cfg.fc_flags |= RTF_GATEWAY;
2998 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
2999 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3001 r_cfg.fc_encap_type = nla_get_u16(nla);
3004 rt = ip6_route_info_create(&r_cfg);
3011 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3017 rtnh = rtnh_next(rtnh, &remaining);
/* Phase 2: insert every queued route into the FIB. */
3021 list_for_each_entry(nh, &rt6_nh_list, next) {
3022 err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
3023 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3024 nh->rt6_info = NULL;
3027 ip6_print_replace_route_err(&rt6_nh_list);
3032 /* Because each route is added like a single route we remove
3033 * these flags after the first nexthop: if there is a collision,
3034 * we have already failed to add the first nexthop:
3035 * fib6_add_rt2node() has rejected it; when replacing, old
3036 * nexthops have been replaced by first new, the rest should
3039 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3047 /* Delete routes that were already added */
3048 list_for_each_entry(nh, &rt6_nh_list, next) {
3051 ip6_route_del(&nh->r_cfg);
/* Final cleanup: free any routes never handed to the FIB, then the
 * rt6_nh nodes themselves. */
3055 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3057 dst_free(&nh->rt6_info->dst);
3059 list_del(&nh->next);
/* Delete a multipath route: walk the rtnexthop array and issue one
 * ip6_route_del() per nexthop, each with the base config overridden by
 * the nexthop's ifindex/gateway.  Deletion continues past individual
 * failures (last_err tracking partially elided in this excerpt).
 */
3066 static int ip6_route_multipath_del(struct fib6_config *cfg)
3068 struct fib6_config r_cfg;
3069 struct rtnexthop *rtnh;
3072 int err = 1, last_err = 0;
3074 remaining = cfg->fc_mp_len;
3075 rtnh = (struct rtnexthop *)cfg->fc_mp;
3077 /* Parse a Multipath Entry */
3078 while (rtnh_ok(rtnh, remaining)) {
3079 memcpy(&r_cfg, cfg, sizeof(*cfg));
3080 if (rtnh->rtnh_ifindex)
3081 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3083 attrlen = rtnh_attrlen(rtnh);
3085 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3087 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
/* 16 == sizeof(struct in6_addr): copy the gateway address. */
3089 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3090 r_cfg.fc_flags |= RTF_GATEWAY;
3093 err = ip6_route_del(&r_cfg);
3097 rtnh = rtnh_next(rtnh, &remaining);
/* RTM_DELROUTE netlink handler: parse the message into a fib6_config
 * and dispatch to the multipath or single-route delete path depending
 * on whether RTA_MULTIPATH was present (cfg.fc_mp set).
 */
3103 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3105 struct fib6_config cfg;
3108 err = rtm_to_fib6_config(skb, nlh, &cfg);
3113 return ip6_route_multipath_del(&cfg);
3115 return ip6_route_del(&cfg);
/* RTM_NEWROUTE netlink handler: parse the message into a fib6_config
 * and dispatch to the multipath or single-route add path depending on
 * whether RTA_MULTIPATH was present (cfg.fc_mp set).
 */
3118 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3120 struct fib6_config cfg;
3123 err = rtm_to_fib6_config(skb, nlh, &cfg);
3128 return ip6_route_multipath_add(&cfg);
3130 return ip6_route_add(&cfg);
/* Worst-case netlink message size for one route notification: rtmsg
 * header plus every attribute rt6_fill_node() may emit.  Must stay in
 * sync with rt6_fill_node(); undersizing triggers the -EMSGSIZE WARN_ON
 * in inet6_rt_notify().
 */
3133 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
3135 return NLMSG_ALIGN(sizeof(struct rtmsg))
3136 + nla_total_size(16) /* RTA_SRC */
3137 + nla_total_size(16) /* RTA_DST */
3138 + nla_total_size(16) /* RTA_GATEWAY */
3139 + nla_total_size(16) /* RTA_PREFSRC */
3140 + nla_total_size(4) /* RTA_TABLE */
3141 + nla_total_size(4) /* RTA_IIF */
3142 + nla_total_size(4) /* RTA_OIF */
3143 + nla_total_size(4) /* RTA_PRIORITY */
3144 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3145 + nla_total_size(sizeof(struct rta_cacheinfo))
3146 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3147 + nla_total_size(1) /* RTA_PREF */
3148 + lwtunnel_get_encap_size(rt->dst.lwtstate);
/* Serialize one rt6_info into a netlink RTM_* message on @skb.
 * @dst/@src: optional exact addresses (RTM_GETROUTE replies) — when set,
 *            the prefix length is reported as /128.
 * @iif:      input interface for replies to input-path lookups.
 * @prefix:   dump filter — emit only RTF_PREFIX_RT routes.
 * @nowait:   passed to ip6mr_get_route() for multicast resolution.
 * On attribute overflow every path jumps to nla_put_failure, which
 * cancels the message.  Some lines elided in this excerpt.
 */
3151 static int rt6_fill_node(struct net *net,
3152 struct sk_buff *skb, struct rt6_info *rt,
3153 struct in6_addr *dst, struct in6_addr *src,
3154 int iif, int type, u32 portid, u32 seq,
3155 int prefix, int nowait, unsigned int flags)
3157 u32 metrics[RTAX_MAX];
3159 struct nlmsghdr *nlh;
3163 if (prefix) { /* user wants prefix routes only */
3164 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
3165 /* success since this is not a prefix route */
3170 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3174 rtm = nlmsg_data(nlh);
3175 rtm->rtm_family = AF_INET6;
3176 rtm->rtm_dst_len = rt->rt6i_dst.plen;
3177 rtm->rtm_src_len = rt->rt6i_src.plen;
3180 table = rt->rt6i_table->tb6_id;
3182 table = RT6_TABLE_UNSPEC;
3183 rtm->rtm_table = table;
3184 if (nla_put_u32(skb, RTA_TABLE, table))
3185 goto nla_put_failure;
/* Map the reject route's dst.error onto the visible route type. */
3186 if (rt->rt6i_flags & RTF_REJECT) {
3187 switch (rt->dst.error) {
3189 rtm->rtm_type = RTN_BLACKHOLE;
3192 rtm->rtm_type = RTN_PROHIBIT;
3195 rtm->rtm_type = RTN_THROW;
3198 rtm->rtm_type = RTN_UNREACHABLE;
3202 else if (rt->rt6i_flags & RTF_LOCAL)
3203 rtm->rtm_type = RTN_LOCAL;
3204 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3205 rtm->rtm_type = RTN_LOCAL;
3207 rtm->rtm_type = RTN_UNICAST;
/* Surface link state: carrier-less devices are reported LINKDOWN, and
 * DEAD as well when the ignore_routes_with_linkdown sysctl is on. */
3209 if (!netif_carrier_ok(rt->dst.dev)) {
3210 rtm->rtm_flags |= RTNH_F_LINKDOWN;
3211 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3212 rtm->rtm_flags |= RTNH_F_DEAD;
3214 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3215 rtm->rtm_protocol = rt->rt6i_protocol;
3216 if (rt->rt6i_flags & RTF_DYNAMIC)
3217 rtm->rtm_protocol = RTPROT_REDIRECT;
3218 else if (rt->rt6i_flags & RTF_ADDRCONF) {
3219 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3220 rtm->rtm_protocol = RTPROT_RA;
3222 rtm->rtm_protocol = RTPROT_KERNEL;
3225 if (rt->rt6i_flags & RTF_CACHE)
3226 rtm->rtm_flags |= RTM_F_CLONED;
/* Exact destination supplied (getroute reply): report it as /128. */
3229 if (nla_put_in6_addr(skb, RTA_DST, dst))
3230 goto nla_put_failure;
3231 rtm->rtm_dst_len = 128;
3232 } else if (rtm->rtm_dst_len)
3233 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3234 goto nla_put_failure;
3235 #ifdef CONFIG_IPV6_SUBTREES
3237 if (nla_put_in6_addr(skb, RTA_SRC, src))
3238 goto nla_put_failure;
3239 rtm->rtm_src_len = 128;
3240 } else if (rtm->rtm_src_len &&
3241 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3242 goto nla_put_failure;
3245 #ifdef CONFIG_IPV6_MROUTE
/* Multicast destinations are resolved through the mroute engine. */
3246 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3247 int err = ip6mr_get_route(net, skb, rtm, nowait,
3254 goto nla_put_failure;
3256 if (err == -EMSGSIZE)
3257 goto nla_put_failure;
3262 if (nla_put_u32(skb, RTA_IIF, iif))
3263 goto nla_put_failure;
3265 struct in6_addr saddr_buf;
3266 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3267 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3268 goto nla_put_failure;
3271 if (rt->rt6i_prefsrc.plen) {
3272 struct in6_addr saddr_buf;
3273 saddr_buf = rt->rt6i_prefsrc.addr;
3274 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3275 goto nla_put_failure;
/* Metrics: copy from the dst, overriding MTU with the cached PMTU. */
3278 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3280 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3281 if (rtnetlink_put_metrics(skb, metrics) < 0)
3282 goto nla_put_failure;
3284 if (rt->rt6i_flags & RTF_GATEWAY) {
3285 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3286 goto nla_put_failure;
3290 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3291 goto nla_put_failure;
3292 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3293 goto nla_put_failure;
3295 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3297 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3298 goto nla_put_failure;
3300 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3301 goto nla_put_failure;
/* NOTE(review): lwtunnel_fill_encap() return value is not checked here
 * unlike the other put helpers — confirm intended in this version. */
3303 lwtunnel_fill_encap(skb, rt->dst.lwtstate);
3305 nlmsg_end(skb, nlh);
3309 nlmsg_cancel(skb, nlh);
/* fib6 tree-walk callback for RTM_GETROUTE dumps: derive the RTM_F_PREFIX
 * filter from the request header (when one was supplied) and emit the
 * route via rt6_fill_node() with NLM_F_MULTI.
 */
3313 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3315 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3318 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3319 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3320 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
3324 return rt6_fill_node(arg->net,
3325 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3326 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3327 prefix, 0, NLM_F_MULTI);
/* RTM_GETROUTE handler: parse the request attributes into a flowi6, do
 * an input-path lookup when RTA_IIF was given (resolving the device by
 * ifindex) or an output lookup otherwise, then serialize the result with
 * rt6_fill_node() and unicast it back to the requester.  Several error
 * checks are elided in this excerpt.
 */
3330 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3332 struct net *net = sock_net(in_skb->sk);
3333 struct nlattr *tb[RTA_MAX+1];
3334 struct rt6_info *rt;
3335 struct sk_buff *skb;
3338 int err, iif = 0, oif = 0;
3340 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3345 memset(&fl6, 0, sizeof(fl6));
3346 rtm = nlmsg_data(nlh);
3347 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
/* Validate attribute sizes before copying the addresses. */
3350 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3353 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3357 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3360 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3364 iif = nla_get_u32(tb[RTA_IIF]);
3367 oif = nla_get_u32(tb[RTA_OIF]);
3370 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
/* Input-path lookup: emulate reception on the given interface. */
3373 struct net_device *dev;
3376 dev = __dev_get_by_index(net, iif);
3382 fl6.flowi6_iif = iif;
3384 if (!ipv6_addr_any(&fl6.saddr))
3385 flags |= RT6_LOOKUP_F_HAS_SADDR;
3387 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3390 fl6.flowi6_oif = oif;
3392 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3395 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3402 /* Reserve room for dummy headers, this skb can pass
3403 through good chunk of routing engine.
3405 skb_reset_mac_header(skb);
3406 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3408 skb_dst_set(skb, &rt->dst);
3410 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3411 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3412 nlh->nlmsg_seq, 0, 0, 0);
3418 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/* Broadcast a route change (@event, e.g. RTM_NEWROUTE/RTM_DELROUTE) to
 * RTNLGRP_IPV6_ROUTE listeners.  Allocates a message sized by
 * rt6_nlmsg_size(); -EMSGSIZE from rt6_fill_node() therefore indicates a
 * size-accounting bug and is WARNed on.  On failure the group error is
 * recorded via rtnl_set_sk_err().
 */
3423 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3424 unsigned int nlm_flags)
3426 struct sk_buff *skb;
3427 struct net *net = info->nl_net;
3432 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3434 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3438 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3439 event, info->portid, seq, 0, 0, nlm_flags);
3441 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3442 WARN_ON(err == -EMSGSIZE);
3446 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3447 info->nlh, gfp_any());
3451 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/* Netdevice notifier: when a netns's loopback device registers, bind the
 * special template routes (null entry, and the prohibit/blackhole entries
 * under CONFIG_IPV6_MULTIPLE_TABLES) to it and take idev references.
 */
3454 static int ip6_route_dev_notify(struct notifier_block *this,
3455 unsigned long event, void *ptr)
3457 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3458 struct net *net = dev_net(dev);
3460 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3461 net->ipv6.ip6_null_entry->dst.dev = dev;
3462 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3463 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3464 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3465 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3466 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3467 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3478 #ifdef CONFIG_PROC_FS
/* /proc/net/ipv6_route: seq_file listing of the IPv6 routing table. */
3480 static const struct file_operations ipv6_route_proc_fops = {
3481 .owner = THIS_MODULE,
3482 .open = ipv6_route_open,
3484 .llseek = seq_lseek,
3485 .release = seq_release_net,
/* /proc/net/rt6_stats: one line of hex counters — fib nodes, route
 * nodes, allocated routes, route entries, cached routes, live dst
 * entries, and discarded routes — for this netns.
 */
3488 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3490 struct net *net = (struct net *)seq->private;
3491 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3492 net->ipv6.rt6_stats->fib_nodes,
3493 net->ipv6.rt6_stats->fib_route_nodes,
3494 net->ipv6.rt6_stats->fib_rt_alloc,
3495 net->ipv6.rt6_stats->fib_rt_entries,
3496 net->ipv6.rt6_stats->fib_rt_cache,
3497 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3498 net->ipv6.rt6_stats->fib_discarded_routes);
/* open() for /proc/net/rt6_stats — single-record, netns-aware seq file. */
3503 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3505 return single_open_net(inode, file, rt6_stats_seq_show);
/* file_operations for /proc/net/rt6_stats. */
3508 static const struct file_operations rt6_stats_seq_fops = {
3509 .owner = THIS_MODULE,
3510 .open = rt6_stats_seq_open,
3512 .llseek = seq_lseek,
3513 .release = single_release_net,
3515 #endif /* CONFIG_PROC_FS */
3517 #ifdef CONFIG_SYSCTL
/* sysctl handler for net.ipv6.route.flush: writing triggers an immediate
 * fib6 garbage-collection run using the flush_delay value.
 * NOTE(review): delay is sampled *before* proc_dointvec() stores the new
 * value, so the GC uses the previously-written delay — confirm this is
 * intended (later kernels read it after the write).
 */
3520 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3521 void __user *buffer, size_t *lenp, loff_t *ppos)
3528 net = (struct net *)ctl->extra1;
3529 delay = net->ipv6.sysctl.flush_delay;
3530 proc_dointvec(ctl, write, buffer, lenp, ppos);
3531 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/* Template for the per-netns net.ipv6.route.* sysctl table; the .data
 * pointers reference init_net and are rewritten per-namespace in
 * ipv6_route_sysctl_init() by table index — keep entry ORDER in sync
 * with the table[N].data assignments there.
 */
3535 struct ctl_table ipv6_route_table_template[] = {
3537 .procname = "flush",
3538 .data = &init_net.ipv6.sysctl.flush_delay,
3539 .maxlen = sizeof(int),
3541 .proc_handler = ipv6_sysctl_rtcache_flush
3544 .procname = "gc_thresh",
3545 .data = &ip6_dst_ops_template.gc_thresh,
3546 .maxlen = sizeof(int),
3548 .proc_handler = proc_dointvec,
3551 .procname = "max_size",
3552 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
3553 .maxlen = sizeof(int),
3555 .proc_handler = proc_dointvec,
3558 .procname = "gc_min_interval",
3559 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3560 .maxlen = sizeof(int),
3562 .proc_handler = proc_dointvec_jiffies,
3565 .procname = "gc_timeout",
3566 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3567 .maxlen = sizeof(int),
3569 .proc_handler = proc_dointvec_jiffies,
3572 .procname = "gc_interval",
3573 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3574 .maxlen = sizeof(int),
3576 .proc_handler = proc_dointvec_jiffies,
3579 .procname = "gc_elasticity",
3580 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3581 .maxlen = sizeof(int),
3583 .proc_handler = proc_dointvec,
3586 .procname = "mtu_expires",
3587 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3588 .maxlen = sizeof(int),
3590 .proc_handler = proc_dointvec_jiffies,
3593 .procname = "min_adv_mss",
3594 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3595 .maxlen = sizeof(int),
3597 .proc_handler = proc_dointvec,
/* Same variable as gc_min_interval but exposed in milliseconds. */
3600 .procname = "gc_min_interval_ms",
3601 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3602 .maxlen = sizeof(int),
3604 .proc_handler = proc_dointvec_ms_jiffies,
/* Duplicate the sysctl template for @net and retarget each entry's .data
 * at the namespace's own variables.  The indices must match the entry
 * order of ipv6_route_table_template.  For non-init user namespaces the
 * "flush" entry is disabled by clearing its procname.
 */
3609 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3611 struct ctl_table *table;
3613 table = kmemdup(ipv6_route_table_template,
3614 sizeof(ipv6_route_table_template),
3618 table[0].data = &net->ipv6.sysctl.flush_delay;
3619 table[0].extra1 = net;
3620 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3621 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3622 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3623 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3624 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3625 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3626 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3627 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3628 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3630 /* Don't export sysctls to unprivileged users */
3631 if (net->user_ns != &init_user_ns)
3632 table[0].procname = NULL;
/* Per-netns setup: clone the dst_ops template, init its entry counter,
 * duplicate the special template routes (null entry, plus prohibit and
 * blackhole under CONFIG_IPV6_MULTIPLE_TABLES) with self-referencing
 * dst.path and template metrics, then seed the routing sysctl defaults.
 * Unwinds allocations in reverse on failure via the labels below.
 */
3639 static int __net_init ip6_route_net_init(struct net *net)
3643 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3644 sizeof(net->ipv6.ip6_dst_ops));
3646 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3647 goto out_ip6_dst_ops;
3649 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3650 sizeof(*net->ipv6.ip6_null_entry),
3652 if (!net->ipv6.ip6_null_entry)
3653 goto out_ip6_dst_entries;
3654 net->ipv6.ip6_null_entry->dst.path =
3655 (struct dst_entry *)net->ipv6.ip6_null_entry;
3656 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3657 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3658 ip6_template_metrics, true);
3660 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3661 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3662 sizeof(*net->ipv6.ip6_prohibit_entry),
3664 if (!net->ipv6.ip6_prohibit_entry)
3665 goto out_ip6_null_entry;
3666 net->ipv6.ip6_prohibit_entry->dst.path =
3667 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3668 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3669 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3670 ip6_template_metrics, true);
3672 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3673 sizeof(*net->ipv6.ip6_blk_hole_entry),
3675 if (!net->ipv6.ip6_blk_hole_entry)
3676 goto out_ip6_prohibit_entry;
3677 net->ipv6.ip6_blk_hole_entry->dst.path =
3678 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3679 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3680 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3681 ip6_template_metrics, true);
/* Default tunables for this namespace (overridable via sysctl). */
3684 net->ipv6.sysctl.flush_delay = 0;
3685 net->ipv6.sysctl.ip6_rt_max_size = 4096;
3686 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3687 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3688 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3689 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3690 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3691 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3693 net->ipv6.ip6_rt_gc_expire = 30*HZ;
/* Error unwind: free in reverse order of allocation. */
3699 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3700 out_ip6_prohibit_entry:
3701 kfree(net->ipv6.ip6_prohibit_entry);
3703 kfree(net->ipv6.ip6_null_entry);
3705 out_ip6_dst_entries:
3706 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-netns teardown: free the special template routes and destroy the
 * dst entry counter (mirror of ip6_route_net_init()).
 */
3711 static void __net_exit ip6_route_net_exit(struct net *net)
3713 kfree(net->ipv6.ip6_null_entry);
3714 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3715 kfree(net->ipv6.ip6_prohibit_entry);
3716 kfree(net->ipv6.ip6_blk_hole_entry);
3718 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Late per-netns init: register the /proc/net/ipv6_route and
 * /proc/net/rt6_stats entries (proc only).
 */
3721 static int __net_init ip6_route_net_init_late(struct net *net)
3723 #ifdef CONFIG_PROC_FS
3724 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3725 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
/* Late per-netns teardown: remove the proc entries created in
 * ip6_route_net_init_late().
 */
3730 static void __net_exit ip6_route_net_exit_late(struct net *net)
3732 #ifdef CONFIG_PROC_FS
3733 remove_proc_entry("ipv6_route", net->proc_net);
3734 remove_proc_entry("rt6_stats", net->proc_net);
/* Main pernet ops: core route state lifetime per namespace. */
3738 static struct pernet_operations ip6_route_net_ops = {
3739 .init = ip6_route_net_init,
3740 .exit = ip6_route_net_exit,
/* Allocate and initialize this namespace's IPv6 inet_peer base. */
3743 static int __net_init ipv6_inetpeer_init(struct net *net)
3745 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3749 inet_peer_base_init(bp);
3750 net->ipv6.peers = bp;
/* Tear down the namespace's inet_peer tree; the base itself is freed
 * after invalidation (kfree elided in this excerpt).
 */
3754 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3756 struct inet_peer_base *bp = net->ipv6.peers;
3758 net->ipv6.peers = NULL;
3759 inetpeer_invalidate_tree(bp);
/* Pernet ops for the IPv6 inet_peer base. */
3763 static struct pernet_operations ipv6_inetpeer_ops = {
3764 .init = ipv6_inetpeer_init,
3765 .exit = ipv6_inetpeer_exit,
/* Late pernet ops: proc entries, registered after the core route state. */
3768 static struct pernet_operations ip6_route_net_late_ops = {
3769 .init = ip6_route_net_init_late,
3770 .exit = ip6_route_net_exit_late,
/* Notifier binding loopback registration to the special routes. */
3773 static struct notifier_block ip6_route_dev_notifier = {
3774 .notifier_call = ip6_route_dev_notify,
/* Subsystem init: create the rt6_info slab, register the pernet subsys
 * for inetpeer and core routing, bind init_net's special routes to the
 * loopback device (registered before us), bring up fib6/rules, register
 * the rtnetlink route handlers and the netdevice notifier, and init the
 * per-CPU uncached-route lists.  Errors unwind in reverse through the
 * labels at the bottom.
 */
3778 int __init ip6_route_init(void)
3784 ip6_dst_ops_template.kmem_cachep =
3785 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3786 SLAB_HWCACHE_ALIGN, NULL);
3787 if (!ip6_dst_ops_template.kmem_cachep)
3790 ret = dst_entries_init(&ip6_dst_blackhole_ops);
3792 goto out_kmem_cache;
3794 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3796 goto out_dst_entries;
3798 ret = register_pernet_subsys(&ip6_route_net_ops);
3800 goto out_register_inetpeer;
3802 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3804 /* Registering of the loopback is done before this portion of code,
3805 * the loopback reference in rt6_info will not be taken, do it
3806 * manually for init_net */
3807 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3808 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3809 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3810 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3811 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3812 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3813 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3817 goto out_register_subsys;
3823 ret = fib6_rules_init();
3827 ret = register_pernet_subsys(&ip6_route_net_late_ops);
3829 goto fib6_rules_init;
/* rtnetlink message handlers for route add/del/get. */
3832 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3833 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3834 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3835 goto out_register_late_subsys;
3837 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3839 goto out_register_late_subsys;
3841 for_each_possible_cpu(cpu) {
3842 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3844 INIT_LIST_HEAD(&ul->head);
3845 spin_lock_init(&ul->lock);
/* Error unwind, reverse order of the setup above. */
3851 out_register_late_subsys:
3852 unregister_pernet_subsys(&ip6_route_net_late_ops);
3854 fib6_rules_cleanup();
3859 out_register_subsys:
3860 unregister_pernet_subsys(&ip6_route_net_ops);
3861 out_register_inetpeer:
3862 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3864 dst_entries_destroy(&ip6_dst_blackhole_ops);
3866 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
/* Subsystem teardown: undo ip6_route_init() in reverse — notifier,
 * late pernet subsys, fib rules, pernet subsystems, blackhole dst
 * counters, and finally the rt6_info slab cache.
 */
3870 void ip6_route_cleanup(void)
3872 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3873 unregister_pernet_subsys(&ip6_route_net_late_ops);
3874 fib6_rules_cleanup();
3877 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3878 unregister_pernet_subsys(&ip6_route_net_ops);
3879 dst_entries_destroy(&ip6_dst_blackhole_ops);
3880 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);