net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/workqueue.h>
  84 #include <linux/skbuff.h>
  85 #include <linux/inetdevice.h>
  86 #include <linux/igmp.h>
  87 #include <linux/pkt_sched.h>
  88 #include <linux/mroute.h>
  89 #include <linux/netfilter_ipv4.h>
  90 #include <linux/random.h>
  91 #include <linux/jhash.h>
  92 #include <linux/rcupdate.h>
  93 #include <linux/times.h>
  94 #include <linux/slab.h>
  95 #include <linux/prefetch.h>
  96 #include <net/dst.h>
  97 #include <net/net_namespace.h>
  98 #include <net/protocol.h>
  99 #include <net/ip.h>
 100 #include <net/route.h>
 101 #include <net/inetpeer.h>
 102 #include <net/sock.h>
 103 #include <net/ip_fib.h>
 104 #include <net/arp.h>
 105 #include <net/tcp.h>
 106 #include <net/icmp.h>
 107 #include <net/xfrm.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115
 116 #define RT_FL_TOS(oldflp4) \
 117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 118
 119 #define IP_MAX_MTU      0xFFF0
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 127 static int ip_rt_redirect_number __read_mostly  = 9;
 128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 130 static int ip_rt_error_cost __read_mostly       = HZ;
 131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 132 static int ip_rt_gc_elasticity __read_mostly    = 8;
 133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 135 static int ip_rt_min_advmss __read_mostly       = 256;
 136
 137 /*
 138  *      Interface to generic destination cache.
 139  */
 140
 141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 142 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 143 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 145 static void              ipv4_link_failure(struct sk_buff *skb);
 146 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 147                                            struct sk_buff *skb, u32 mtu);
 148 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 149                                         struct sk_buff *skb);
 150 static void             ipv4_dst_destroy(struct dst_entry *dst);
 151
 152 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 153                             int how)
 154 {
 155 }
 156
 157 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 158 {
 159         WARN_ON(1);
 160         return NULL;
 161 }
 162
 163 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 164                                            struct sk_buff *skb,
 165                                            const void *daddr);
 166
 167 static struct dst_ops ipv4_dst_ops = {
 168         .family =               AF_INET,
 169         .protocol =             cpu_to_be16(ETH_P_IP),
 170         .check =                ipv4_dst_check,
 171         .default_advmss =       ipv4_default_advmss,
 172         .mtu =                  ipv4_mtu,
 173         .cow_metrics =          ipv4_cow_metrics,
 174         .destroy =              ipv4_dst_destroy,
 175         .ifdown =               ipv4_dst_ifdown,
 176         .negative_advice =      ipv4_negative_advice,
 177         .link_failure =         ipv4_link_failure,
 178         .update_pmtu =          ip_rt_update_pmtu,
 179         .redirect =             ip_do_redirect,
 180         .local_out =            __ip_local_out,
 181         .neigh_lookup =         ipv4_neigh_lookup,
 182 };
 183
 184 #define ECN_OR_COST(class)      TC_PRIO_##class
 185
 186 const __u8 ip_tos2prio[16] = {
 187         TC_PRIO_BESTEFFORT,
 188         ECN_OR_COST(BESTEFFORT),
 189         TC_PRIO_BESTEFFORT,
 190         ECN_OR_COST(BESTEFFORT),
 191         TC_PRIO_BULK,
 192         ECN_OR_COST(BULK),
 193         TC_PRIO_BULK,
 194         ECN_OR_COST(BULK),
 195         TC_PRIO_INTERACTIVE,
 196         ECN_OR_COST(INTERACTIVE),
 197         TC_PRIO_INTERACTIVE,
 198         ECN_OR_COST(INTERACTIVE),
 199         TC_PRIO_INTERACTIVE_BULK,
 200         ECN_OR_COST(INTERACTIVE_BULK),
 201         TC_PRIO_INTERACTIVE_BULK,
 202         ECN_OR_COST(INTERACTIVE_BULK)
 203 };
 204 EXPORT_SYMBOL(ip_tos2prio);
 205
 206 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 207 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 208
 209 static inline int rt_genid(struct net *net)
 210 {
 211         return atomic_read(&net->ipv4.rt_genid);
 212 }
 213
 214 #ifdef CONFIG_PROC_FS
 215 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 216 {
 217         if (*pos)
 218                 return NULL;
 219         return SEQ_START_TOKEN;
 220 }
 221
 222 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 223 {
 224         ++*pos;
 225         return NULL;
 226 }
 227
 228 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 229 {
 230 }
 231
 232 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 233 {
 234         if (v == SEQ_START_TOKEN)
 235                 seq_printf(seq, "%-127s\n",
 236                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 237                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 238                            "HHUptod\tSpecDst");
 239         return 0;
 240 }
 241
 242 static const struct seq_operations rt_cache_seq_ops = {
 243         .start  = rt_cache_seq_start,
 244         .next   = rt_cache_seq_next,
 245         .stop   = rt_cache_seq_stop,
 246         .show   = rt_cache_seq_show,
 247 };
 248
 249 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 250 {
 251         return seq_open(file, &rt_cache_seq_ops);
 252 }
 253
 254 static const struct file_operations rt_cache_seq_fops = {
 255         .owner   = THIS_MODULE,
 256         .open    = rt_cache_seq_open,
 257         .read    = seq_read,
 258         .llseek  = seq_lseek,
 259         .release = seq_release,
 260 };
 261
 262
 263 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 264 {
 265         int cpu;
 266
 267         if (*pos == 0)
 268                 return SEQ_START_TOKEN;
 269
 270         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 271                 if (!cpu_possible(cpu))
 272                         continue;
 273                 *pos = cpu+1;
 274                 return &per_cpu(rt_cache_stat, cpu);
 275         }
 276         return NULL;
 277 }
 278
 279 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 280 {
 281         int cpu;
 282
 283         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 284                 if (!cpu_possible(cpu))
 285                         continue;
 286                 *pos = cpu+1;
 287                 return &per_cpu(rt_cache_stat, cpu);
 288         }
 289         return NULL;
 290
 291 }
 292
 293 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 294 {
 295
 296 }
 297
 298 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 299 {
 300         struct rt_cache_stat *st = v;
 301
 302         if (v == SEQ_START_TOKEN) {
 303                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 304                 return 0;
 305         }
 306
 307         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 308                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 309                    dst_entries_get_slow(&ipv4_dst_ops),
 310                    st->in_hit,
 311                    st->in_slow_tot,
 312                    st->in_slow_mc,
 313                    st->in_no_route,
 314                    st->in_brd,
 315                    st->in_martian_dst,
 316                    st->in_martian_src,
 317
 318                    st->out_hit,
 319                    st->out_slow_tot,
 320                    st->out_slow_mc,
 321
 322                    st->gc_total,
 323                    st->gc_ignored,
 324                    st->gc_goal_miss,
 325                    st->gc_dst_overflow,
 326                    st->in_hlist_search,
 327                    st->out_hlist_search
 328                 );
 329         return 0;
 330 }
 331
 332 static const struct seq_operations rt_cpu_seq_ops = {
 333         .start  = rt_cpu_seq_start,
 334         .next   = rt_cpu_seq_next,
 335         .stop   = rt_cpu_seq_stop,
 336         .show   = rt_cpu_seq_show,
 337 };
 338
 339
 340 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 341 {
 342         return seq_open(file, &rt_cpu_seq_ops);
 343 }
 344
 345 static const struct file_operations rt_cpu_seq_fops = {
 346         .owner   = THIS_MODULE,
 347         .open    = rt_cpu_seq_open,
 348         .read    = seq_read,
 349         .llseek  = seq_lseek,
 350         .release = seq_release,
 351 };
 352
 353 #ifdef CONFIG_IP_ROUTE_CLASSID
 354 static int rt_acct_proc_show(struct seq_file *m, void *v)
 355 {
 356         struct ip_rt_acct *dst, *src;
 357         unsigned int i, j;
 358
 359         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 360         if (!dst)
 361                 return -ENOMEM;
 362
 363         for_each_possible_cpu(i) {
 364                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 365                 for (j = 0; j < 256; j++) {
 366                         dst[j].o_bytes   += src[j].o_bytes;
 367                         dst[j].o_packets += src[j].o_packets;
 368                         dst[j].i_bytes   += src[j].i_bytes;
 369                         dst[j].i_packets += src[j].i_packets;
 370                 }
 371         }
 372
 373         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 374         kfree(dst);
 375         return 0;
 376 }
 377
 378 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 379 {
 380         return single_open(file, rt_acct_proc_show, NULL);
 381 }
 382
 383 static const struct file_operations rt_acct_proc_fops = {
 384         .owner          = THIS_MODULE,
 385         .open           = rt_acct_proc_open,
 386         .read           = seq_read,
 387         .llseek         = seq_lseek,
 388         .release        = single_release,
 389 };
 390 #endif
 391
 392 static int __net_init ip_rt_do_proc_init(struct net *net)
 393 {
 394         struct proc_dir_entry *pde;
 395
 396         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 397                         &rt_cache_seq_fops);
 398         if (!pde)
 399                 goto err1;
 400
 401         pde = proc_create("rt_cache", S_IRUGO,
 402                           net->proc_net_stat, &rt_cpu_seq_fops);
 403         if (!pde)
 404                 goto err2;
 405
 406 #ifdef CONFIG_IP_ROUTE_CLASSID
 407         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 408         if (!pde)
 409                 goto err3;
 410 #endif
 411         return 0;
 412
 413 #ifdef CONFIG_IP_ROUTE_CLASSID
 414 err3:
 415         remove_proc_entry("rt_cache", net->proc_net_stat);
 416 #endif
 417 err2:
 418         remove_proc_entry("rt_cache", net->proc_net);
 419 err1:
 420         return -ENOMEM;
 421 }
 422
 423 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 424 {
 425         remove_proc_entry("rt_cache", net->proc_net_stat);
 426         remove_proc_entry("rt_cache", net->proc_net);
 427 #ifdef CONFIG_IP_ROUTE_CLASSID
 428         remove_proc_entry("rt_acct", net->proc_net);
 429 #endif
 430 }
 431
 432 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 433         .init = ip_rt_do_proc_init,
 434         .exit = ip_rt_do_proc_exit,
 435 };
 436
 437 static int __init ip_rt_proc_init(void)
 438 {
 439         return register_pernet_subsys(&ip_rt_proc_ops);
 440 }
 441
 442 #else
 443 static inline int ip_rt_proc_init(void)
 444 {
 445         return 0;
 446 }
 447 #endif /* CONFIG_PROC_FS */
 448
 449 static inline bool rt_is_expired(const struct rtable *rth)
 450 {
 451         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 452 }
 453
 454 /*
 455  * Perturbation of rt_genid by a small quantity [1..256]
 456  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 457  * many times (2^24) without giving recent rt_genid.
 458  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 459  */
 460 static void rt_cache_invalidate(struct net *net)
 461 {
 462         unsigned char shuffle;
 463
 464         get_random_bytes(&shuffle, sizeof(shuffle));
 465         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 466 }
 467
 468 /*
 469  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 470  * delay >= 0 : invalidate & flush cache (can be long)
 471  */
 472 void rt_cache_flush(struct net *net, int delay)
 473 {
 474         rt_cache_invalidate(net);
 475 }
 476
 477 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 478                                            struct sk_buff *skb,
 479                                            const void *daddr)
 480 {
 481         struct net_device *dev = dst->dev;
 482         const __be32 *pkey = daddr;
 483         const struct rtable *rt;
 484         struct neighbour *n;
 485
 486         rt = (const struct rtable *) dst;
 487         if (rt->rt_gateway)
 488                 pkey = (const __be32 *) &rt->rt_gateway;
 489         else if (skb)
 490                 pkey = &ip_hdr(skb)->daddr;
 491
 492         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 493         if (n)
 494                 return n;
 495         return neigh_create(&arp_tbl, pkey, dev);
 496 }
 497
 498 /*
 499  * Peer allocation may fail only in serious out-of-memory conditions.  However
 500  * we still can generate some output.
 501  * Random ID selection looks a bit dangerous because we have no chances to
 502  * select ID being unique in a reasonable period of time.
 503  * But broken packet identifier may be better than no packet at all.
 504  */
 505 static void ip_select_fb_ident(struct iphdr *iph)
 506 {
 507         static DEFINE_SPINLOCK(ip_fb_id_lock);
 508         static u32 ip_fallback_id;
 509         u32 salt;
 510
 511         spin_lock_bh(&ip_fb_id_lock);
 512         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
 513         iph->id = htons(salt & 0xFFFF);
 514         ip_fallback_id = salt;
 515         spin_unlock_bh(&ip_fb_id_lock);
 516 }
 517
 518 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 519 {
 520         struct net *net = dev_net(dst->dev);
 521         struct inet_peer *peer;
 522
 523         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
 524         if (peer) {
 525                 iph->id = htons(inet_getid(peer, more));
 526                 inet_putpeer(peer);
 527                 return;
 528         }
 529
 530         ip_select_fb_ident(iph);
 531 }
 532 EXPORT_SYMBOL(__ip_select_ident);
 533
 534 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 535                              const struct iphdr *iph,
 536                              int oif, u8 tos,
 537                              u8 prot, u32 mark, int flow_flags)
 538 {
 539         if (sk) {
 540                 const struct inet_sock *inet = inet_sk(sk);
 541
 542                 oif = sk->sk_bound_dev_if;
 543                 mark = sk->sk_mark;
 544                 tos = RT_CONN_FLAGS(sk);
 545                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 546         }
 547         flowi4_init_output(fl4, oif, mark, tos,
 548                            RT_SCOPE_UNIVERSE, prot,
 549                            flow_flags,
 550                            iph->daddr, iph->saddr, 0, 0);
 551 }
 552
 553 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 554                                const struct sock *sk)
 555 {
 556         const struct iphdr *iph = ip_hdr(skb);
 557         int oif = skb->dev->ifindex;
 558         u8 tos = RT_TOS(iph->tos);
 559         u8 prot = iph->protocol;
 560         u32 mark = skb->mark;
 561
 562         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 563 }
 564
 565 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 566 {
 567         const struct inet_sock *inet = inet_sk(sk);
 568         const struct ip_options_rcu *inet_opt;
 569         __be32 daddr = inet->inet_daddr;
 570
 571         rcu_read_lock();
 572         inet_opt = rcu_dereference(inet->inet_opt);
 573         if (inet_opt && inet_opt->opt.srr)
 574                 daddr = inet_opt->opt.faddr;
 575         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 576                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 577                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 578                            inet_sk_flowi_flags(sk),
 579                            daddr, inet->inet_saddr, 0, 0);
 580         rcu_read_unlock();
 581 }
 582
 583 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 584                                  const struct sk_buff *skb)
 585 {
 586         if (skb)
 587                 build_skb_flow_key(fl4, skb, sk);
 588         else
 589                 build_sk_flow_key(fl4, sk);
 590 }
 591
 592 static inline void rt_free(struct rtable *rt)
 593 {
 594         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 595 }
 596
 597 static DEFINE_SPINLOCK(fnhe_lock);
 598
 599 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 600 {
 601         struct fib_nh_exception *fnhe, *oldest;
 602         struct rtable *orig;
 603
 604         oldest = rcu_dereference(hash->chain);
 605         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 606              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 607                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 608                         oldest = fnhe;
 609         }
 610         orig = rcu_dereference(oldest->fnhe_rth);
 611         if (orig) {
 612                 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
 613                 rt_free(orig);
 614         }
 615         return oldest;
 616 }
 617
 618 static inline u32 fnhe_hashfun(__be32 daddr)
 619 {
 620         u32 hval;
 621
 622         hval = (__force u32) daddr;
 623         hval ^= (hval >> 11) ^ (hval >> 22);
 624
 625         return hval & (FNHE_HASH_SIZE - 1);
 626 }
 627
 628 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 629                                   u32 pmtu, unsigned long expires)
 630 {
 631         struct fnhe_hash_bucket *hash;
 632         struct fib_nh_exception *fnhe;
 633         int depth;
 634         u32 hval = fnhe_hashfun(daddr);
 635
 636         spin_lock_bh(&fnhe_lock);
 637
 638         hash = nh->nh_exceptions;
 639         if (!hash) {
 640                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 641                 if (!hash)
 642                         goto out_unlock;
 643                 nh->nh_exceptions = hash;
 644         }
 645
 646         hash += hval;
 647
 648         depth = 0;
 649         for (fnhe = rcu_dereference(hash->chain); fnhe;
 650              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 651                 if (fnhe->fnhe_daddr == daddr)
 652                         break;
 653                 depth++;
 654         }
 655
 656         if (fnhe) {
 657                 if (gw)
 658                         fnhe->fnhe_gw = gw;
 659                 if (pmtu) {
 660                         fnhe->fnhe_pmtu = pmtu;
 661                         fnhe->fnhe_expires = expires;
 662                 }
 663         } else {
 664                 if (depth > FNHE_RECLAIM_DEPTH)
 665                         fnhe = fnhe_oldest(hash);
 666                 else {
 667                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 668                         if (!fnhe)
 669                                 goto out_unlock;
 670
 671                         fnhe->fnhe_next = hash->chain;
 672                         rcu_assign_pointer(hash->chain, fnhe);
 673                 }
 674                 fnhe->fnhe_daddr = daddr;
 675                 fnhe->fnhe_gw = gw;
 676                 fnhe->fnhe_pmtu = pmtu;
 677                 fnhe->fnhe_expires = expires;
 678         }
 679
 680         fnhe->fnhe_stamp = jiffies;
 681
 682 out_unlock:
 683         spin_unlock_bh(&fnhe_lock);
 684         return;
 685 }
 686
 687 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 688                              bool kill_route)
 689 {
 690         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 691         __be32 old_gw = ip_hdr(skb)->saddr;
 692         struct net_device *dev = skb->dev;
 693         struct in_device *in_dev;
 694         struct fib_result res;
 695         struct neighbour *n;
 696         struct net *net;
 697
 698         switch (icmp_hdr(skb)->code & 7) {
 699         case ICMP_REDIR_NET:
 700         case ICMP_REDIR_NETTOS:
 701         case ICMP_REDIR_HOST:
 702         case ICMP_REDIR_HOSTTOS:
 703                 break;
 704
 705         default:
 706                 return;
 707         }
 708
 709         if (rt->rt_gateway != old_gw)
 710                 return;
 711
 712         in_dev = __in_dev_get_rcu(dev);
 713         if (!in_dev)
 714                 return;
 715
 716         net = dev_net(dev);
 717         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 718             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 719             ipv4_is_zeronet(new_gw))
 720                 goto reject_redirect;
 721
 722         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 723                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 724                         goto reject_redirect;
 725                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 726                         goto reject_redirect;
 727         } else {
 728                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 729                         goto reject_redirect;
 730         }
 731
 732         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
 733         if (n) {
 734                 if (!(n->nud_state & NUD_VALID)) {
 735                         neigh_event_send(n, NULL);
 736                 } else {
 737                         if (fib_lookup(net, fl4, &res) == 0) {
 738                                 struct fib_nh *nh = &FIB_RES_NH(res);
 739
 740                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 741                                                       0, 0);
 742                         }
 743                         if (kill_route)
 744                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 745                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 746                 }
 747                 neigh_release(n);
 748         }
 749         return;
 750
 751 reject_redirect:
 752 #ifdef CONFIG_IP_ROUTE_VERBOSE
 753         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 754                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 755                 __be32 daddr = iph->daddr;
 756                 __be32 saddr = iph->saddr;
 757
 758                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 759                                      "  Advised path = %pI4 -> %pI4\n",
 760                                      &old_gw, dev->name, &new_gw,
 761                                      &saddr, &daddr);
 762         }
 763 #endif
 764         ;
 765 }
 766
 767 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 768 {
 769         struct rtable *rt;
 770         struct flowi4 fl4;
 771
 772         rt = (struct rtable *) dst;
 773
 774         ip_rt_build_flow_key(&fl4, sk, skb);
 775         __ip_do_redirect(rt, skb, &fl4, true);
 776 }
 777
 778 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 779 {
 780         struct rtable *rt = (struct rtable *)dst;
 781         struct dst_entry *ret = dst;
 782
 783         if (rt) {
 784                 if (dst->obsolete > 0) {
 785                         ip_rt_put(rt);
 786                         ret = NULL;
 787                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 788                            rt->dst.expires) {
 789                         ip_rt_put(rt);
 790                         ret = NULL;
 791                 }
 792         }
 793         return ret;
 794 }
 795
 796 /*
 797  * Algorithm:
 798  *      1. The first ip_rt_redirect_number redirects are sent
 799  *         with exponential backoff, then we stop sending them at all,
 800  *         assuming that the host ignores our redirects.
 801  *      2. If we did not see packets requiring redirects
 802  *         during ip_rt_redirect_silence, we assume that the host
 803  *         forgot redirected route and start to send redirects again.
 804  *
 805  * This algorithm is much cheaper and more intelligent than dumb load limiting
 806  * in icmp.c.
 807  *
 808  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 809  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 810  */
 811
 812 void ip_rt_send_redirect(struct sk_buff *skb)
 813 {
 814         struct rtable *rt = skb_rtable(skb);
 815         struct in_device *in_dev;
 816         struct inet_peer *peer;
 817         struct net *net;
 818         int log_martians;
 819
 820         rcu_read_lock();
 821         in_dev = __in_dev_get_rcu(rt->dst.dev);
 822         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 823                 rcu_read_unlock();
 824                 return;
 825         }
 826         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 827         rcu_read_unlock();
 828
 829         net = dev_net(rt->dst.dev);
 830         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 831         if (!peer) {
 832                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
 833                 return;
 834         }
 835
 836         /* No redirected packets during ip_rt_redirect_silence;
 837          * reset the algorithm.
 838          */
 839         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 840                 peer->rate_tokens = 0;
 841
 842         /* Too many ignored redirects; do not send anything
 843          * set dst.rate_last to the last seen redirected packet.
 844          */
 845         if (peer->rate_tokens >= ip_rt_redirect_number) {
 846                 peer->rate_last = jiffies;
 847                 goto out_put_peer;
 848         }
 849
 850         /* Check for load limit; set rate_last to the latest sent
 851          * redirect.
 852          */
 853         if (peer->rate_tokens == 0 ||
 854             time_after(jiffies,
 855                        (peer->rate_last +
 856                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 857                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
 858                 peer->rate_last = jiffies;
 859                 ++peer->rate_tokens;
 860 #ifdef CONFIG_IP_ROUTE_VERBOSE
 861                 if (log_martians &&
 862                     peer->rate_tokens == ip_rt_redirect_number)
 863                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 864                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 865                                              &ip_hdr(skb)->daddr, &rt->rt_gateway);
 866 #endif
 867         }
 868 out_put_peer:
 869         inet_putpeer(peer);
 870 }
 871
 872 static int ip_error(struct sk_buff *skb)
 873 {
 874         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 875         struct rtable *rt = skb_rtable(skb);
 876         struct inet_peer *peer;
 877         unsigned long now;
 878         struct net *net;
 879         bool send;
 880         int code;
 881
 882         net = dev_net(rt->dst.dev);
 883         if (!IN_DEV_FORWARD(in_dev)) {
 884                 switch (rt->dst.error) {
 885                 case EHOSTUNREACH:
 886                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
 887                         break;
 888
 889                 case ENETUNREACH:
 890                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 891                         break;
 892                 }
 893                 goto out;
 894         }
 895
 896         switch (rt->dst.error) {
 897         case EINVAL:
 898         default:
 899                 goto out;
 900         case EHOSTUNREACH:
 901                 code = ICMP_HOST_UNREACH;
 902                 break;
 903         case ENETUNREACH:
 904                 code = ICMP_NET_UNREACH;
 905                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 906                 break;
 907         case EACCES:
 908                 code = ICMP_PKT_FILTERED;
 909                 break;
 910         }
 911
 912         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 913
 914         send = true;
 915         if (peer) {
 916                 now = jiffies;
 917                 peer->rate_tokens += now - peer->rate_last;
 918                 if (peer->rate_tokens > ip_rt_error_burst)
 919                         peer->rate_tokens = ip_rt_error_burst;
 920                 peer->rate_last = now;
 921                 if (peer->rate_tokens >= ip_rt_error_cost)
 922                         peer->rate_tokens -= ip_rt_error_cost;
 923                 else
 924                         send = false;
 925                 inet_putpeer(peer);
 926         }
 927         if (send)
 928                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 929
 930 out:    kfree_skb(skb);
 931         return 0;
 932 }
 933
 934 static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 935 {
 936         struct fib_result res;
 937
 938         if (mtu < ip_rt_min_pmtu)
 939                 mtu = ip_rt_min_pmtu;
 940
 941         if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
 942                 struct fib_nh *nh = &FIB_RES_NH(res);
 943
 944                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 945                                       jiffies + ip_rt_mtu_expires);
 946         }
 947         return mtu;
 948 }
 949
 950 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 951                               struct sk_buff *skb, u32 mtu)
 952 {
 953         struct rtable *rt = (struct rtable *) dst;
 954         struct flowi4 fl4;
 955
 956         ip_rt_build_flow_key(&fl4, sk, skb);
 957         mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
 958
 959         if (!rt->rt_pmtu) {
 960                 dst->obsolete = DST_OBSOLETE_KILL;
 961         } else {
 962                 rt->rt_pmtu = mtu;
 963                 dst_set_expires(&rt->dst, ip_rt_mtu_expires);
 964         }
 965 }
 966
 967 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 968                       int oif, u32 mark, u8 protocol, int flow_flags)
 969 {
 970         const struct iphdr *iph = (const struct iphdr *) skb->data;
 971         struct flowi4 fl4;
 972         struct rtable *rt;
 973
 974         __build_flow_key(&fl4, NULL, iph, oif,
 975                          RT_TOS(iph->tos), protocol, mark, flow_flags);
 976         rt = __ip_route_output_key(net, &fl4);
 977         if (!IS_ERR(rt)) {
 978                 __ip_rt_update_pmtu(rt, &fl4, mtu);
 979                 ip_rt_put(rt);
 980         }
 981 }
 982 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
 983
 984 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 985 {
 986         const struct iphdr *iph = (const struct iphdr *) skb->data;
 987         struct flowi4 fl4;
 988         struct rtable *rt;
 989
 990         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
 991         rt = __ip_route_output_key(sock_net(sk), &fl4);
 992         if (!IS_ERR(rt)) {
 993                 __ip_rt_update_pmtu(rt, &fl4, mtu);
 994                 ip_rt_put(rt);
 995         }
 996 }
 997 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
 998
 999 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1000                    int oif, u32 mark, u8 protocol, int flow_flags)
1001 {
1002         const struct iphdr *iph = (const struct iphdr *) skb->data;
1003         struct flowi4 fl4;
1004         struct rtable *rt;
1005
1006         __build_flow_key(&fl4, NULL, iph, oif,
1007                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1008         rt = __ip_route_output_key(net, &fl4);
1009         if (!IS_ERR(rt)) {
1010                 __ip_do_redirect(rt, skb, &fl4, false);
1011                 ip_rt_put(rt);
1012         }
1013 }
1014 EXPORT_SYMBOL_GPL(ipv4_redirect);
1015
1016 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1017 {
1018         const struct iphdr *iph = (const struct iphdr *) skb->data;
1019         struct flowi4 fl4;
1020         struct rtable *rt;
1021
1022         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1023         rt = __ip_route_output_key(sock_net(sk), &fl4);
1024         if (!IS_ERR(rt)) {
1025                 __ip_do_redirect(rt, skb, &fl4, false);
1026                 ip_rt_put(rt);
1027         }
1028 }
1029 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1030
1031 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1032 {
1033         struct rtable *rt = (struct rtable *) dst;
1034
1035         /* All IPV4 dsts are created with ->obsolete set to the value
1036          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1037          * into this function always.
1038          *
1039          * When a PMTU/redirect information update invalidates a
1040          * route, this is indicated by setting obsolete to
1041          * DST_OBSOLETE_KILL.
1042          */
1043         if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1044                 return NULL;
1045         return dst;
1046 }
1047
1048 static void ipv4_link_failure(struct sk_buff *skb)
1049 {
1050         struct rtable *rt;
1051
1052         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1053
1054         rt = skb_rtable(skb);
1055         if (rt)
1056                 dst_set_expires(&rt->dst, 0);
1057 }
1058
1059 static int ip_rt_bug(struct sk_buff *skb)
1060 {
1061         pr_debug("%s: %pI4 -> %pI4, %s\n",
1062                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1063                  skb->dev ? skb->dev->name : "?");
1064         kfree_skb(skb);
1065         WARN_ON(1);
1066         return 0;
1067 }
1068
1069 /*
1070    We do not cache source address of outgoing interface,
1071    because it is used only by IP RR, TS and SRR options,
1072    so that it out of fast path.
1073
1074    BTW remember: "addr" is allowed to be not aligned
1075    in IP options!
1076  */
1077
1078 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1079 {
1080         __be32 src;
1081
1082         if (rt_is_output_route(rt))
1083                 src = ip_hdr(skb)->saddr;
1084         else {
1085                 struct fib_result res;
1086                 struct flowi4 fl4;
1087                 struct iphdr *iph;
1088
1089                 iph = ip_hdr(skb);
1090
1091                 memset(&fl4, 0, sizeof(fl4));
1092                 fl4.daddr = iph->daddr;
1093                 fl4.saddr = iph->saddr;
1094                 fl4.flowi4_tos = RT_TOS(iph->tos);
1095                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1096                 fl4.flowi4_iif = skb->dev->ifindex;
1097                 fl4.flowi4_mark = skb->mark;
1098
1099                 rcu_read_lock();
1100                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1101                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1102                 else
1103                         src = inet_select_addr(rt->dst.dev,
1104                                                rt_nexthop(rt, iph->daddr),
1105                                                RT_SCOPE_UNIVERSE);
1106                 rcu_read_unlock();
1107         }
1108         memcpy(addr, &src, 4);
1109 }
1110
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Fill in whichever 16-bit half of rt->dst.tclassid is still zero with
 * the corresponding half of tag; a half that is already set is kept.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;
	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1120
1121 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1122 {
1123         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1124
1125         if (advmss == 0) {
1126                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1127                                ip_rt_min_advmss);
1128                 if (advmss > 65535 - 40)
1129                         advmss = 65535 - 40;
1130         }
1131         return advmss;
1132 }
1133
1134 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1135 {
1136         const struct rtable *rt = (const struct rtable *) dst;
1137         unsigned int mtu = rt->rt_pmtu;
1138
1139         if (mtu && time_after_eq(jiffies, rt->dst.expires))
1140                 mtu = 0;
1141
1142         if (!mtu)
1143                 mtu = dst_metric_raw(dst, RTAX_MTU);
1144
1145         if (mtu && rt_is_output_route(rt))
1146                 return mtu;
1147
1148         mtu = dst->dev->mtu;
1149
1150         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1151                 if (rt->rt_gateway && mtu > 576)
1152                         mtu = 576;
1153         }
1154
1155         if (mtu > IP_MAX_MTU)
1156                 mtu = IP_MAX_MTU;
1157
1158         return mtu;
1159 }
1160
1161 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1162 {
1163         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1164         struct fib_nh_exception *fnhe;
1165         u32 hval;
1166
1167         if (!hash)
1168                 return NULL;
1169
1170         hval = fnhe_hashfun(daddr);
1171
1172         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1173              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1174                 if (fnhe->fnhe_daddr == daddr)
1175                         return fnhe;
1176         }
1177         return NULL;
1178 }
1179
1180 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1181                               __be32 daddr)
1182 {
1183         bool ret = false;
1184
1185         spin_lock_bh(&fnhe_lock);
1186
1187         if (daddr == fnhe->fnhe_daddr) {
1188                 struct rtable *orig;
1189
1190                 if (fnhe->fnhe_pmtu) {
1191                         unsigned long expires = fnhe->fnhe_expires;
1192                         unsigned long diff = expires - jiffies;
1193
1194                         if (time_before(jiffies, expires)) {
1195                                 rt->rt_pmtu = fnhe->fnhe_pmtu;
1196                                 dst_set_expires(&rt->dst, diff);
1197                         }
1198                 }
1199                 if (fnhe->fnhe_gw) {
1200                         rt->rt_flags |= RTCF_REDIRECTED;
1201                         rt->rt_gateway = fnhe->fnhe_gw;
1202                 }
1203
1204                 orig = rcu_dereference(fnhe->fnhe_rth);
1205                 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1206                 if (orig)
1207                         rt_free(orig);
1208
1209                 fnhe->fnhe_stamp = jiffies;
1210                 ret = true;
1211         } else {
1212                 /* Routes we intend to cache in nexthop exception have
1213                  * the DST_NOCACHE bit clear.  However, if we are
1214                  * unsuccessful at storing this route into the cache
1215                  * we really need to set it.
1216                  */
1217                 rt->dst.flags |= DST_NOCACHE;
1218         }
1219         spin_unlock_bh(&fnhe_lock);
1220
1221         return ret;
1222 }
1223
1224 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1225 {
1226         struct rtable *orig, *prev, **p;
1227         bool ret = true;
1228
1229         if (rt_is_input_route(rt)) {
1230                 p = (struct rtable **)&nh->nh_rth_input;
1231         } else {
1232                 if (!nh->nh_pcpu_rth_output)
1233                         goto nocache;
1234                 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1235         }
1236         orig = *p;
1237
1238         prev = cmpxchg(p, orig, rt);
1239         if (prev == orig) {
1240                 if (orig)
1241                         rt_free(orig);
1242         } else {
1243                 /* Routes we intend to cache in the FIB nexthop have
1244                  * the DST_NOCACHE bit clear.  However, if we are
1245                  * unsuccessful at storing this route into the cache
1246                  * we really need to set it.
1247                  */
1248 nocache:
1249                 rt->dst.flags |= DST_NOCACHE;
1250                 ret = false;
1251         }
1252
1253         return ret;
1254 }
1255
1256 static DEFINE_SPINLOCK(rt_uncached_lock);
1257 static LIST_HEAD(rt_uncached_list);
1258
1259 static void rt_add_uncached_list(struct rtable *rt)
1260 {
1261         spin_lock_bh(&rt_uncached_lock);
1262         list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1263         spin_unlock_bh(&rt_uncached_lock);
1264 }
1265
1266 static void ipv4_dst_destroy(struct dst_entry *dst)
1267 {
1268         struct rtable *rt = (struct rtable *) dst;
1269
1270         if (dst->flags & DST_NOCACHE) {
1271                 spin_lock_bh(&rt_uncached_lock);
1272                 list_del(&rt->rt_uncached);
1273                 spin_unlock_bh(&rt_uncached_lock);
1274         }
1275 }
1276
1277 void rt_flush_dev(struct net_device *dev)
1278 {
1279         if (!list_empty(&rt_uncached_list)) {
1280                 struct net *net = dev_net(dev);
1281                 struct rtable *rt;
1282
1283                 spin_lock_bh(&rt_uncached_lock);
1284                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1285                         if (rt->dst.dev != dev)
1286                                 continue;
1287                         rt->dst.dev = net->loopback_dev;
1288                         dev_hold(rt->dst.dev);
1289                         dev_put(dev);
1290                 }
1291                 spin_unlock_bh(&rt_uncached_lock);
1292         }
1293 }
1294
1295 static bool rt_cache_valid(const struct rtable *rt)
1296 {
1297         return  rt &&
1298                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1299                 !rt_is_expired(rt);
1300 }
1301
1302 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1303                            const struct fib_result *res,
1304                            struct fib_nh_exception *fnhe,
1305                            struct fib_info *fi, u16 type, u32 itag)
1306 {
1307         bool cached = false;
1308
1309         if (fi) {
1310                 struct fib_nh *nh = &FIB_RES_NH(*res);
1311
1312                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1313                         rt->rt_gateway = nh->nh_gw;
1314                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1315 #ifdef CONFIG_IP_ROUTE_CLASSID
1316                 rt->dst.tclassid = nh->nh_tclassid;
1317 #endif
1318                 if (unlikely(fnhe))
1319                         cached = rt_bind_exception(rt, fnhe, daddr);
1320                 else if (!(rt->dst.flags & DST_NOCACHE))
1321                         cached = rt_cache_route(nh, rt);
1322         }
1323         if (unlikely(!cached))
1324                 rt_add_uncached_list(rt);
1325
1326 #ifdef CONFIG_IP_ROUTE_CLASSID
1327 #ifdef CONFIG_IP_MULTIPLE_TABLES
1328         set_class_tag(rt, res->tclassid);
1329 #endif
1330         set_class_tag(rt, itag);
1331 #endif
1332 }
1333
1334 static struct rtable *rt_dst_alloc(struct net_device *dev,
1335                                    bool nopolicy, bool noxfrm, bool will_cache)
1336 {
1337         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1338                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1339                          (nopolicy ? DST_NOPOLICY : 0) |
1340                          (noxfrm ? DST_NOXFRM : 0));
1341 }
1342
1343 /* called in rcu_read_lock() section */
1344 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1345                                 u8 tos, struct net_device *dev, int our)
1346 {
1347         struct rtable *rth;
1348         struct in_device *in_dev = __in_dev_get_rcu(dev);
1349         u32 itag = 0;
1350         int err;
1351
1352         /* Primary sanity checks. */
1353
1354         if (in_dev == NULL)
1355                 return -EINVAL;
1356
1357         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1358             skb->protocol != htons(ETH_P_IP))
1359                 goto e_inval;
1360
1361         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1362                 if (ipv4_is_loopback(saddr))
1363                         goto e_inval;
1364
1365         if (ipv4_is_zeronet(saddr)) {
1366                 if (!ipv4_is_local_multicast(daddr))
1367                         goto e_inval;
1368         } else {
1369                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1370                                           in_dev, &itag);
1371                 if (err < 0)
1372                         goto e_err;
1373         }
1374         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1375                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1376         if (!rth)
1377                 goto e_nobufs;
1378
1379 #ifdef CONFIG_IP_ROUTE_CLASSID
1380         rth->dst.tclassid = itag;
1381 #endif
1382         rth->dst.output = ip_rt_bug;
1383
1384         rth->rt_genid   = rt_genid(dev_net(dev));
1385         rth->rt_flags   = RTCF_MULTICAST;
1386         rth->rt_type    = RTN_MULTICAST;
1387         rth->rt_is_input= 1;
1388         rth->rt_iif     = 0;
1389         rth->rt_pmtu    = 0;
1390         rth->rt_gateway = 0;
1391         INIT_LIST_HEAD(&rth->rt_uncached);
1392         if (our) {
1393                 rth->dst.input= ip_local_deliver;
1394                 rth->rt_flags |= RTCF_LOCAL;
1395         }
1396
1397 #ifdef CONFIG_IP_MROUTE
1398         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1399                 rth->dst.input = ip_mr_input;
1400 #endif
1401         RT_CACHE_STAT_INC(in_slow_mc);
1402
1403         skb_dst_set(skb, &rth->dst);
1404         return 0;
1405
1406 e_nobufs:
1407         return -ENOBUFS;
1408 e_inval:
1409         return -EINVAL;
1410 e_err:
1411         return err;
1412 }
1413
1414
1415 static void ip_handle_martian_source(struct net_device *dev,
1416                                      struct in_device *in_dev,
1417                                      struct sk_buff *skb,
1418                                      __be32 daddr,
1419                                      __be32 saddr)
1420 {
1421         RT_CACHE_STAT_INC(in_martian_src);
1422 #ifdef CONFIG_IP_ROUTE_VERBOSE
1423         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1424                 /*
1425                  *      RFC1812 recommendation, if source is martian,
1426                  *      the only hint is MAC header.
1427                  */
1428                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1429                         &daddr, &saddr, dev->name);
1430                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1431                         print_hex_dump(KERN_WARNING, "ll header: ",
1432                                        DUMP_PREFIX_OFFSET, 16, 1,
1433                                        skb_mac_header(skb),
1434                                        dev->hard_header_len, true);
1435                 }
1436         }
1437 #endif
1438 }
1439
1440 /* called in rcu_read_lock() section */
1441 static int __mkroute_input(struct sk_buff *skb,
1442                            const struct fib_result *res,
1443                            struct in_device *in_dev,
1444                            __be32 daddr, __be32 saddr, u32 tos)
1445 {
1446         struct rtable *rth;
1447         int err;
1448         struct in_device *out_dev;
1449         unsigned int flags = 0;
1450         bool do_cache;
1451         u32 itag;
1452
1453         /* get a working reference to the output device */
1454         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1455         if (out_dev == NULL) {
1456                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1457                 return -EINVAL;
1458         }
1459
1460
1461         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1462                                   in_dev->dev, in_dev, &itag);
1463         if (err < 0) {
1464                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1465                                          saddr);
1466
1467                 goto cleanup;
1468         }
1469
1470         if (out_dev == in_dev && err &&
1471             (IN_DEV_SHARED_MEDIA(out_dev) ||
1472              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1473                 flags |= RTCF_DOREDIRECT;
1474
1475         if (skb->protocol != htons(ETH_P_IP)) {
1476                 /* Not IP (i.e. ARP). Do not create route, if it is
1477                  * invalid for proxy arp. DNAT routes are always valid.
1478                  *
1479                  * Proxy arp feature have been extended to allow, ARP
1480                  * replies back to the same interface, to support
1481                  * Private VLAN switch technologies. See arp.c.
1482                  */
1483                 if (out_dev == in_dev &&
1484                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1485                         err = -EINVAL;
1486                         goto cleanup;
1487                 }
1488         }
1489
1490         do_cache = false;
1491         if (res->fi) {
1492                 if (!itag) {
1493                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1494                         if (rt_cache_valid(rth)) {
1495                                 skb_dst_set_noref(skb, &rth->dst);
1496                                 goto out;
1497                         }
1498                         do_cache = true;
1499                 }
1500         }
1501
1502         rth = rt_dst_alloc(out_dev->dev,
1503                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1504                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1505         if (!rth) {
1506                 err = -ENOBUFS;
1507                 goto cleanup;
1508         }
1509
1510         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1511         rth->rt_flags = flags;
1512         rth->rt_type = res->type;
1513         rth->rt_is_input = 1;
1514         rth->rt_iif     = 0;
1515         rth->rt_pmtu    = 0;
1516         rth->rt_gateway = 0;
1517         INIT_LIST_HEAD(&rth->rt_uncached);
1518
1519         rth->dst.input = ip_forward;
1520         rth->dst.output = ip_output;
1521
1522         rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1523         skb_dst_set(skb, &rth->dst);
1524 out:
1525         err = 0;
1526  cleanup:
1527         return err;
1528 }
1529
1530 static int ip_mkroute_input(struct sk_buff *skb,
1531                             struct fib_result *res,
1532                             const struct flowi4 *fl4,
1533                             struct in_device *in_dev,
1534                             __be32 daddr, __be32 saddr, u32 tos)
1535 {
1536 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1537         if (res->fi && res->fi->fib_nhs > 1)
1538                 fib_select_multipath(res);
1539 #endif
1540
1541         /* create a routing cache entry */
1542         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1543 }
1544
1545 /*
1546  *      NOTE. We drop all the packets that has local source
1547  *      addresses, because every properly looped back packet
1548  *      must have correct destination already attached by output routine.
1549  *
1550  *      Such approach solves two big problems:
1551  *      1. Not simplex devices are handled properly.
1552  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1553  *      called with rcu_read_lock()
1554  */
1555
1556 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1557                                u8 tos, struct net_device *dev)
1558 {
1559         struct fib_result res;
1560         struct in_device *in_dev = __in_dev_get_rcu(dev);
1561         struct flowi4   fl4;
1562         unsigned int    flags = 0;
1563         u32             itag = 0;
1564         struct rtable   *rth;
1565         int             err = -EINVAL;
1566         struct net    *net = dev_net(dev);
1567         bool do_cache;
1568
1569         /* IP on this device is disabled. */
1570
1571         if (!in_dev)
1572                 goto out;
1573
1574         /* Check for the most weird martians, which can be not detected
1575            by fib_lookup.
1576          */
1577
1578         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1579                 goto martian_source;
1580
1581         res.fi = NULL;
1582         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1583                 goto brd_input;
1584
1585         /* Accept zero addresses only to limited broadcast;
1586          * I even do not know to fix it or not. Waiting for complains :-)
1587          */
1588         if (ipv4_is_zeronet(saddr))
1589                 goto martian_source;
1590
1591         if (ipv4_is_zeronet(daddr))
1592                 goto martian_destination;
1593
1594         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
1595                 if (ipv4_is_loopback(daddr))
1596                         goto martian_destination;
1597
1598                 if (ipv4_is_loopback(saddr))
1599                         goto martian_source;
1600         }
1601
1602         /*
1603          *      Now we are ready to route packet.
1604          */
1605         fl4.flowi4_oif = 0;
1606         fl4.flowi4_iif = dev->ifindex;
1607         fl4.flowi4_mark = skb->mark;
1608         fl4.flowi4_tos = tos;
1609         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1610         fl4.daddr = daddr;
1611         fl4.saddr = saddr;
1612         err = fib_lookup(net, &fl4, &res);
1613         if (err != 0)
1614                 goto no_route;
1615
1616         RT_CACHE_STAT_INC(in_slow_tot);
1617
1618         if (res.type == RTN_BROADCAST)
1619                 goto brd_input;
1620
1621         if (res.type == RTN_LOCAL) {
1622                 err = fib_validate_source(skb, saddr, daddr, tos,
1623                                           net->loopback_dev->ifindex,
1624                                           dev, in_dev, &itag);
1625                 if (err < 0)
1626                         goto martian_source_keep_err;
1627                 goto local_input;
1628         }
1629
1630         if (!IN_DEV_FORWARD(in_dev))
1631                 goto no_route;
1632         if (res.type != RTN_UNICAST)
1633                 goto martian_destination;
1634
1635         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1636 out:    return err;
1637
1638 brd_input:
1639         if (skb->protocol != htons(ETH_P_IP))
1640                 goto e_inval;
1641
1642         if (!ipv4_is_zeronet(saddr)) {
1643                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1644                                           in_dev, &itag);
1645                 if (err < 0)
1646                         goto martian_source_keep_err;
1647         }
1648         flags |= RTCF_BROADCAST;
1649         res.type = RTN_BROADCAST;
1650         RT_CACHE_STAT_INC(in_brd);
1651
1652 local_input:
1653         do_cache = false;
1654         if (res.fi) {
1655                 if (!itag) {
1656                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1657                         if (rt_cache_valid(rth)) {
1658                                 skb_dst_set_noref(skb, &rth->dst);
1659                                 err = 0;
1660                                 goto out;
1661                         }
1662                         do_cache = true;
1663                 }
1664         }
1665
1666         rth = rt_dst_alloc(net->loopback_dev,
1667                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1668         if (!rth)
1669                 goto e_nobufs;
1670
1671         rth->dst.input= ip_local_deliver;
1672         rth->dst.output= ip_rt_bug;
1673 #ifdef CONFIG_IP_ROUTE_CLASSID
1674         rth->dst.tclassid = itag;
1675 #endif
1676
1677         rth->rt_genid = rt_genid(net);
1678         rth->rt_flags   = flags|RTCF_LOCAL;
1679         rth->rt_type    = res.type;
1680         rth->rt_is_input = 1;
1681         rth->rt_iif     = 0;
1682         rth->rt_pmtu    = 0;
1683         rth->rt_gateway = 0;
1684         INIT_LIST_HEAD(&rth->rt_uncached);
1685         if (res.type == RTN_UNREACHABLE) {
1686                 rth->dst.input= ip_error;
1687                 rth->dst.error= -err;
1688                 rth->rt_flags   &= ~RTCF_LOCAL;
1689         }
1690         if (do_cache)
1691                 rt_cache_route(&FIB_RES_NH(res), rth);
1692         skb_dst_set(skb, &rth->dst);
1693         err = 0;
1694         goto out;
1695
1696 no_route:
1697         RT_CACHE_STAT_INC(in_no_route);
1698         res.type = RTN_UNREACHABLE;
1699         if (err == -ESRCH)
1700                 err = -ENETUNREACH;
1701         goto local_input;
1702
1703         /*
1704          *      Do not cache martian addresses: they should be logged (RFC1812)
1705          */
1706 martian_destination:
1707         RT_CACHE_STAT_INC(in_martian_dst);
1708 #ifdef CONFIG_IP_ROUTE_VERBOSE
1709         if (IN_DEV_LOG_MARTIANS(in_dev))
1710                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1711                                      &daddr, &saddr, dev->name);
1712 #endif
1713
1714 e_inval:
1715         err = -EINVAL;
1716         goto out;
1717
1718 e_nobufs:
1719         err = -ENOBUFS;
1720         goto out;
1721
1722 martian_source:
1723         err = -EINVAL;
1724 martian_source_keep_err:
1725         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1726         goto out;
1727 }
1728
1729 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1730                          u8 tos, struct net_device *dev)
1731 {
1732         int res;
1733
1734         rcu_read_lock();
1735
1736         /* Multicast recognition logic is moved from route cache to here.
1737            The problem was that too many Ethernet cards have broken/missing
1738            hardware multicast filters :-( As result the host on multicasting
1739            network acquires a lot of useless route cache entries, sort of
1740            SDR messages from all the world. Now we try to get rid of them.
1741            Really, provided software IP multicast filter is organized
1742            reasonably (at least, hashed), it does not result in a slowdown
1743            comparing with route cache reject entries.
1744            Note, that multicast routers are not affected, because
1745            route cache entry is created eventually.
1746          */
1747         if (ipv4_is_multicast(daddr)) {
1748                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1749
1750                 if (in_dev) {
1751                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1752                                                   ip_hdr(skb)->protocol);
1753                         if (our
1754 #ifdef CONFIG_IP_MROUTE
1755                                 ||
1756                             (!ipv4_is_local_multicast(daddr) &&
1757                              IN_DEV_MFORWARD(in_dev))
1758 #endif
1759                            ) {
1760                                 int res = ip_route_input_mc(skb, daddr, saddr,
1761                                                             tos, dev, our);
1762                                 rcu_read_unlock();
1763                                 return res;
1764                         }
1765                 }
1766                 rcu_read_unlock();
1767                 return -EINVAL;
1768         }
1769         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1770         rcu_read_unlock();
1771         return res;
1772 }
1773 EXPORT_SYMBOL(ip_route_input_noref);
1774
1775 /* called with rcu_read_lock() */
1776 static struct rtable *__mkroute_output(const struct fib_result *res,
1777                                        const struct flowi4 *fl4, int orig_oif,
1778                                        struct net_device *dev_out,
1779                                        unsigned int flags)
1780 {
1781         struct fib_info *fi = res->fi;
1782         struct fib_nh_exception *fnhe;
1783         struct in_device *in_dev;
1784         u16 type = res->type;
1785         struct rtable *rth;
1786
1787         in_dev = __in_dev_get_rcu(dev_out);
1788         if (!in_dev)
1789                 return ERR_PTR(-EINVAL);
1790
1791         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1792                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1793                         return ERR_PTR(-EINVAL);
1794
1795         if (ipv4_is_lbcast(fl4->daddr))
1796                 type = RTN_BROADCAST;
1797         else if (ipv4_is_multicast(fl4->daddr))
1798                 type = RTN_MULTICAST;
1799         else if (ipv4_is_zeronet(fl4->daddr))
1800                 return ERR_PTR(-EINVAL);
1801
1802         if (dev_out->flags & IFF_LOOPBACK)
1803                 flags |= RTCF_LOCAL;
1804
1805         if (type == RTN_BROADCAST) {
1806                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1807                 fi = NULL;
1808         } else if (type == RTN_MULTICAST) {
1809                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1810                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1811                                      fl4->flowi4_proto))
1812                         flags &= ~RTCF_LOCAL;
1813                 /* If multicast route do not exist use
1814                  * default one, but do not gateway in this case.
1815                  * Yes, it is hack.
1816                  */
1817                 if (fi && res->prefixlen < 4)
1818                         fi = NULL;
1819         }
1820
1821         fnhe = NULL;
1822         if (fi) {
1823                 struct rtable __rcu **prth;
1824
1825                 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1826                 if (fnhe)
1827                         prth = &fnhe->fnhe_rth;
1828                 else
1829                         prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
1830                 rth = rcu_dereference(*prth);
1831                 if (rt_cache_valid(rth)) {
1832                         dst_hold(&rth->dst);
1833                         return rth;
1834                 }
1835         }
1836         rth = rt_dst_alloc(dev_out,
1837                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1838                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1839                            fi);
1840         if (!rth)
1841                 return ERR_PTR(-ENOBUFS);
1842
1843         rth->dst.output = ip_output;
1844
1845         rth->rt_genid = rt_genid(dev_net(dev_out));
1846         rth->rt_flags   = flags;
1847         rth->rt_type    = type;
1848         rth->rt_is_input = 0;
1849         rth->rt_iif     = orig_oif ? : 0;
1850         rth->rt_pmtu    = 0;
1851         rth->rt_gateway = 0;
1852         INIT_LIST_HEAD(&rth->rt_uncached);
1853
1854         RT_CACHE_STAT_INC(out_slow_tot);
1855
1856         if (flags & RTCF_LOCAL)
1857                 rth->dst.input = ip_local_deliver;
1858         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1859                 if (flags & RTCF_LOCAL &&
1860                     !(dev_out->flags & IFF_LOOPBACK)) {
1861                         rth->dst.output = ip_mc_output;
1862                         RT_CACHE_STAT_INC(out_slow_mc);
1863                 }
1864 #ifdef CONFIG_IP_MROUTE
1865                 if (type == RTN_MULTICAST) {
1866                         if (IN_DEV_MFORWARD(in_dev) &&
1867                             !ipv4_is_local_multicast(fl4->daddr)) {
1868                                 rth->dst.input = ip_mr_input;
1869                                 rth->dst.output = ip_mc_output;
1870                         }
1871                 }
1872 #endif
1873         }
1874
1875         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1876
1877         return rth;
1878 }
1879
1880 /*
1881  * Major route resolver routine.
1882  */
1883
1884 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1885 {
1886         struct net_device *dev_out = NULL;
1887         __u8 tos = RT_FL_TOS(fl4);
1888         unsigned int flags = 0;
1889         struct fib_result res;
1890         struct rtable *rth;
1891         int orig_oif;
1892
1893         res.tclassid    = 0;
1894         res.fi          = NULL;
1895         res.table       = NULL;
1896
1897         orig_oif = fl4->flowi4_oif;
1898
1899         fl4->flowi4_iif = net->loopback_dev->ifindex;
1900         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1901         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1902                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1903
1904         rcu_read_lock();
1905         if (fl4->saddr) {
1906                 rth = ERR_PTR(-EINVAL);
1907                 if (ipv4_is_multicast(fl4->saddr) ||
1908                     ipv4_is_lbcast(fl4->saddr) ||
1909                     ipv4_is_zeronet(fl4->saddr))
1910                         goto out;
1911
1912                 /* I removed check for oif == dev_out->oif here.
1913                    It was wrong for two reasons:
1914                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1915                       is assigned to multiple interfaces.
1916                    2. Moreover, we are allowed to send packets with saddr
1917                       of another iface. --ANK
1918                  */
1919
1920                 if (fl4->flowi4_oif == 0 &&
1921                     (ipv4_is_multicast(fl4->daddr) ||
1922                      ipv4_is_lbcast(fl4->daddr))) {
1923                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1924                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1925                         if (dev_out == NULL)
1926                                 goto out;
1927
1928                         /* Special hack: user can direct multicasts
1929                            and limited broadcast via necessary interface
1930                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1931                            This hack is not just for fun, it allows
1932                            vic,vat and friends to work.
1933                            They bind socket to loopback, set ttl to zero
1934                            and expect that it will work.
1935                            From the viewpoint of routing cache they are broken,
1936                            because we are not allowed to build multicast path
1937                            with loopback source addr (look, routing cache
1938                            cannot know, that ttl is zero, so that packet
1939                            will not leave this host and route is valid).
1940                            Luckily, this hack is good workaround.
1941                          */
1942
1943                         fl4->flowi4_oif = dev_out->ifindex;
1944                         goto make_route;
1945                 }
1946
1947                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1948                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1949                         if (!__ip_dev_find(net, fl4->saddr, false))
1950                                 goto out;
1951                 }
1952         }
1953
1954
1955         if (fl4->flowi4_oif) {
1956                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1957                 rth = ERR_PTR(-ENODEV);
1958                 if (dev_out == NULL)
1959                         goto out;
1960
1961                 /* RACE: Check return value of inet_select_addr instead. */
1962                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1963                         rth = ERR_PTR(-ENETUNREACH);
1964                         goto out;
1965                 }
1966                 if (ipv4_is_local_multicast(fl4->daddr) ||
1967                     ipv4_is_lbcast(fl4->daddr)) {
1968                         if (!fl4->saddr)
1969                                 fl4->saddr = inet_select_addr(dev_out, 0,
1970                                                               RT_SCOPE_LINK);
1971                         goto make_route;
1972                 }
1973                 if (fl4->saddr) {
1974                         if (ipv4_is_multicast(fl4->daddr))
1975                                 fl4->saddr = inet_select_addr(dev_out, 0,
1976                                                               fl4->flowi4_scope);
1977                         else if (!fl4->daddr)
1978                                 fl4->saddr = inet_select_addr(dev_out, 0,
1979                                                               RT_SCOPE_HOST);
1980                 }
1981         }
1982
1983         if (!fl4->daddr) {
1984                 fl4->daddr = fl4->saddr;
1985                 if (!fl4->daddr)
1986                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1987                 dev_out = net->loopback_dev;
1988                 fl4->flowi4_oif = net->loopback_dev->ifindex;
1989                 res.type = RTN_LOCAL;
1990                 flags |= RTCF_LOCAL;
1991                 goto make_route;
1992         }
1993
1994         if (fib_lookup(net, fl4, &res)) {
1995                 res.fi = NULL;
1996                 res.table = NULL;
1997                 if (fl4->flowi4_oif) {
1998                         /* Apparently, routing tables are wrong. Assume,
1999                            that the destination is on link.
2000
2001                            WHY? DW.
2002                            Because we are allowed to send to iface
2003                            even if it has NO routes and NO assigned
2004                            addresses. When oif is specified, routing
2005                            tables are looked up with only one purpose:
2006                            to catch if destination is gatewayed, rather than
2007                            direct. Moreover, if MSG_DONTROUTE is set,
2008                            we send packet, ignoring both routing tables
2009                            and ifaddr state. --ANK
2010
2011
2012                            We could make it even if oif is unknown,
2013                            likely IPv6, but we do not.
2014                          */
2015
2016                         if (fl4->saddr == 0)
2017                                 fl4->saddr = inet_select_addr(dev_out, 0,
2018                                                               RT_SCOPE_LINK);
2019                         res.type = RTN_UNICAST;
2020                         goto make_route;
2021                 }
2022                 rth = ERR_PTR(-ENETUNREACH);
2023                 goto out;
2024         }
2025
2026         if (res.type == RTN_LOCAL) {
2027                 if (!fl4->saddr) {
2028                         if (res.fi->fib_prefsrc)
2029                                 fl4->saddr = res.fi->fib_prefsrc;
2030                         else
2031                                 fl4->saddr = fl4->daddr;
2032                 }
2033                 dev_out = net->loopback_dev;
2034                 fl4->flowi4_oif = dev_out->ifindex;
2035                 res.fi = NULL;
2036                 flags |= RTCF_LOCAL;
2037                 goto make_route;
2038         }
2039
2040 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2041         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2042                 fib_select_multipath(&res);
2043         else
2044 #endif
2045         if (!res.prefixlen &&
2046             res.table->tb_num_default > 1 &&
2047             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2048                 fib_select_default(&res);
2049
2050         if (!fl4->saddr)
2051                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2052
2053         dev_out = FIB_RES_DEV(res);
2054         fl4->flowi4_oif = dev_out->ifindex;
2055
2056
2057 make_route:
2058         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2059
2060 out:
2061         rcu_read_unlock();
2062         return rth;
2063 }
2064 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2065
2066 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2067 {
2068         return NULL;
2069 }
2070
2071 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2072 {
2073         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2074
2075         return mtu ? : dst->dev->mtu;
2076 }
2077
2078 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2079                                           struct sk_buff *skb, u32 mtu)
2080 {
2081 }
2082
/* .redirect hook of the blackhole dst ops: intentionally does nothing. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
	/* redirects are ignored */
}
2087
2088 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2089                                           unsigned long old)
2090 {
2091         return NULL;
2092 }
2093
2094 static struct dst_ops ipv4_dst_blackhole_ops = {
2095         .family                 =       AF_INET,
2096         .protocol               =       cpu_to_be16(ETH_P_IP),
2097         .check                  =       ipv4_blackhole_dst_check,
2098         .mtu                    =       ipv4_blackhole_mtu,
2099         .default_advmss         =       ipv4_default_advmss,
2100         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2101         .redirect               =       ipv4_rt_blackhole_redirect,
2102         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2103         .neigh_lookup           =       ipv4_neigh_lookup,
2104 };
2105
2106 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2107 {
2108         struct rtable *ort = (struct rtable *) dst_orig;
2109         struct rtable *rt;
2110
2111         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2112         if (rt) {
2113                 struct dst_entry *new = &rt->dst;
2114
2115                 new->__use = 1;
2116                 new->input = dst_discard;
2117                 new->output = dst_discard;
2118
2119                 new->dev = ort->dst.dev;
2120                 if (new->dev)
2121                         dev_hold(new->dev);
2122
2123                 rt->rt_is_input = ort->rt_is_input;
2124                 rt->rt_iif = ort->rt_iif;
2125                 rt->rt_pmtu = ort->rt_pmtu;
2126
2127                 rt->rt_genid = rt_genid(net);
2128                 rt->rt_flags = ort->rt_flags;
2129                 rt->rt_type = ort->rt_type;
2130                 rt->rt_gateway = ort->rt_gateway;
2131
2132                 INIT_LIST_HEAD(&rt->rt_uncached);
2133
2134                 dst_free(new);
2135         }
2136
2137         dst_release(dst_orig);
2138
2139         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2140 }
2141
2142 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2143                                     struct sock *sk)
2144 {
2145         struct rtable *rt = __ip_route_output_key(net, flp4);
2146
2147         if (IS_ERR(rt))
2148                 return rt;
2149
2150         if (flp4->flowi4_proto)
2151                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2152                                                    flowi4_to_flowi(flp4),
2153                                                    sk, 0);
2154
2155         return rt;
2156 }
2157 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2158
2159 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2160                         struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
2161                         u32 seq, int event, int nowait, unsigned int flags)
2162 {
2163         struct rtable *rt = skb_rtable(skb);
2164         struct rtmsg *r;
2165         struct nlmsghdr *nlh;
2166         unsigned long expires = 0;
2167         u32 error;
2168         u32 metrics[RTAX_MAX];
2169
2170         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2171         if (nlh == NULL)
2172                 return -EMSGSIZE;
2173
2174         r = nlmsg_data(nlh);
2175         r->rtm_family    = AF_INET;
2176         r->rtm_dst_len  = 32;
2177         r->rtm_src_len  = 0;
2178         r->rtm_tos      = fl4->flowi4_tos;
2179         r->rtm_table    = RT_TABLE_MAIN;
2180         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2181                 goto nla_put_failure;
2182         r->rtm_type     = rt->rt_type;
2183         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2184         r->rtm_protocol = RTPROT_UNSPEC;
2185         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2186         if (rt->rt_flags & RTCF_NOTIFY)
2187                 r->rtm_flags |= RTM_F_NOTIFY;
2188
2189         if (nla_put_be32(skb, RTA_DST, dst))
2190                 goto nla_put_failure;
2191         if (src) {
2192                 r->rtm_src_len = 32;
2193                 if (nla_put_be32(skb, RTA_SRC, src))
2194                         goto nla_put_failure;
2195         }
2196         if (rt->dst.dev &&
2197             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2198                 goto nla_put_failure;
2199 #ifdef CONFIG_IP_ROUTE_CLASSID
2200         if (rt->dst.tclassid &&
2201             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2202                 goto nla_put_failure;
2203 #endif
2204         if (!rt_is_input_route(rt) &&
2205             fl4->saddr != src) {
2206                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2207                         goto nla_put_failure;
2208         }
2209         if (rt->rt_gateway &&
2210             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2211                 goto nla_put_failure;
2212
2213         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2214         if (rt->rt_pmtu)
2215                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2216         if (rtnetlink_put_metrics(skb, metrics) < 0)
2217                 goto nla_put_failure;
2218
2219         if (fl4->flowi4_mark &&
2220             nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
2221                 goto nla_put_failure;
2222
2223         error = rt->dst.error;
2224         expires = rt->dst.expires;
2225         if (expires) {
2226                 if (time_before(jiffies, expires))
2227                         expires -= jiffies;
2228                 else
2229                         expires = 0;
2230         }
2231
2232         if (rt_is_input_route(rt)) {
2233                 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2234                         goto nla_put_failure;
2235         }
2236
2237         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2238                 goto nla_put_failure;
2239
2240         return nlmsg_end(skb, nlh);
2241
2242 nla_put_failure:
2243         nlmsg_cancel(skb, nlh);
2244         return -EMSGSIZE;
2245 }
2246
2247 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2248 {
2249         struct net *net = sock_net(in_skb->sk);
2250         struct rtmsg *rtm;
2251         struct nlattr *tb[RTA_MAX+1];
2252         struct rtable *rt = NULL;
2253         struct flowi4 fl4;
2254         __be32 dst = 0;
2255         __be32 src = 0;
2256         u32 iif;
2257         int err;
2258         int mark;
2259         struct sk_buff *skb;
2260
2261         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2262         if (err < 0)
2263                 goto errout;
2264
2265         rtm = nlmsg_data(nlh);
2266
2267         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2268         if (skb == NULL) {
2269                 err = -ENOBUFS;
2270                 goto errout;
2271         }
2272
2273         /* Reserve room for dummy headers, this skb can pass
2274            through good chunk of routing engine.
2275          */
2276         skb_reset_mac_header(skb);
2277         skb_reset_network_header(skb);
2278
2279         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2280         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2281         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2282
2283         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2284         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2285         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2286         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2287
2288         memset(&fl4, 0, sizeof(fl4));
2289         fl4.daddr = dst;
2290         fl4.saddr = src;
2291         fl4.flowi4_tos = rtm->rtm_tos;
2292         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2293         fl4.flowi4_mark = mark;
2294
2295         if (iif) {
2296                 struct net_device *dev;
2297
2298                 dev = __dev_get_by_index(net, iif);
2299                 if (dev == NULL) {
2300                         err = -ENODEV;
2301                         goto errout_free;
2302                 }
2303
2304                 skb->protocol   = htons(ETH_P_IP);
2305                 skb->dev        = dev;
2306                 skb->mark       = mark;
2307                 local_bh_disable();
2308                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2309                 local_bh_enable();
2310
2311                 rt = skb_rtable(skb);
2312                 if (err == 0 && rt->dst.error)
2313                         err = -rt->dst.error;
2314         } else {
2315                 rt = ip_route_output_key(net, &fl4);
2316
2317                 err = 0;
2318                 if (IS_ERR(rt))
2319                         err = PTR_ERR(rt);
2320         }
2321
2322         if (err)
2323                 goto errout_free;
2324
2325         skb_dst_set(skb, &rt->dst);
2326         if (rtm->rtm_flags & RTM_F_NOTIFY)
2327                 rt->rt_flags |= RTCF_NOTIFY;
2328
2329         err = rt_fill_info(net, dst, src, &fl4, skb,
2330                            NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2331                            RTM_NEWROUTE, 0, 0);
2332         if (err <= 0)
2333                 goto errout_free;
2334
2335         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2336 errout:
2337         return err;
2338
2339 errout_free:
2340         kfree_skb(skb);
2341         goto errout;
2342 }
2343
2344 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2345 {
2346         return skb->len;
2347 }
2348
2349 void ip_rt_multicast_event(struct in_device *in_dev)
2350 {
2351         rt_cache_flush(dev_net(in_dev->dev), 0);
2352 }
2353
2354 #ifdef CONFIG_SYSCTL
2355 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2356                                         void __user *buffer,
2357                                         size_t *lenp, loff_t *ppos)
2358 {
2359         if (write) {
2360                 int flush_delay;
2361                 ctl_table ctl;
2362                 struct net *net;
2363
2364                 memcpy(&ctl, __ctl, sizeof(ctl));
2365                 ctl.data = &flush_delay;
2366                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
2367
2368                 net = (struct net *)__ctl->extra1;
2369                 rt_cache_flush(net, flush_delay);
2370                 return 0;
2371         }
2372
2373         return -EINVAL;
2374 }
2375
2376 static ctl_table ipv4_route_table[] = {
2377         {
2378                 .procname       = "gc_thresh",
2379                 .data           = &ipv4_dst_ops.gc_thresh,
2380                 .maxlen         = sizeof(int),
2381                 .mode           = 0644,
2382                 .proc_handler   = proc_dointvec,
2383         },
2384         {
2385                 .procname       = "max_size",
2386                 .data           = &ip_rt_max_size,
2387                 .maxlen         = sizeof(int),
2388                 .mode           = 0644,
2389                 .proc_handler   = proc_dointvec,
2390         },
2391         {
2392                 /*  Deprecated. Use gc_min_interval_ms */
2393
2394                 .procname       = "gc_min_interval",
2395                 .data           = &ip_rt_gc_min_interval,
2396                 .maxlen         = sizeof(int),
2397                 .mode           = 0644,
2398                 .proc_handler   = proc_dointvec_jiffies,
2399         },
2400         {
2401                 .procname       = "gc_min_interval_ms",
2402                 .data           = &ip_rt_gc_min_interval,
2403                 .maxlen         = sizeof(int),
2404                 .mode           = 0644,
2405                 .proc_handler   = proc_dointvec_ms_jiffies,
2406         },
2407         {
2408                 .procname       = "gc_timeout",
2409                 .data           = &ip_rt_gc_timeout,
2410                 .maxlen         = sizeof(int),
2411                 .mode           = 0644,
2412                 .proc_handler   = proc_dointvec_jiffies,
2413         },
2414         {
2415                 .procname       = "gc_interval",
2416                 .data           = &ip_rt_gc_interval,
2417                 .maxlen         = sizeof(int),
2418                 .mode           = 0644,
2419                 .proc_handler   = proc_dointvec_jiffies,
2420         },
2421         {
2422                 .procname       = "redirect_load",
2423                 .data           = &ip_rt_redirect_load,
2424                 .maxlen         = sizeof(int),
2425                 .mode           = 0644,
2426                 .proc_handler   = proc_dointvec,
2427         },
2428         {
2429                 .procname       = "redirect_number",
2430                 .data           = &ip_rt_redirect_number,
2431                 .maxlen         = sizeof(int),
2432                 .mode           = 0644,
2433                 .proc_handler   = proc_dointvec,
2434         },
2435         {
2436                 .procname       = "redirect_silence",
2437                 .data           = &ip_rt_redirect_silence,
2438                 .maxlen         = sizeof(int),
2439                 .mode           = 0644,
2440                 .proc_handler   = proc_dointvec,
2441         },
2442         {
2443                 .procname       = "error_cost",
2444                 .data           = &ip_rt_error_cost,
2445                 .maxlen         = sizeof(int),
2446                 .mode           = 0644,
2447                 .proc_handler   = proc_dointvec,
2448         },
2449         {
2450                 .procname       = "error_burst",
2451                 .data           = &ip_rt_error_burst,
2452                 .maxlen         = sizeof(int),
2453                 .mode           = 0644,
2454                 .proc_handler   = proc_dointvec,
2455         },
2456         {
2457                 .procname       = "gc_elasticity",
2458                 .data           = &ip_rt_gc_elasticity,
2459                 .maxlen         = sizeof(int),
2460                 .mode           = 0644,
2461                 .proc_handler   = proc_dointvec,
2462         },
2463         {
2464                 .procname       = "mtu_expires",
2465                 .data           = &ip_rt_mtu_expires,
2466                 .maxlen         = sizeof(int),
2467                 .mode           = 0644,
2468                 .proc_handler   = proc_dointvec_jiffies,
2469         },
2470         {
2471                 .procname       = "min_pmtu",
2472                 .data           = &ip_rt_min_pmtu,
2473                 .maxlen         = sizeof(int),
2474                 .mode           = 0644,
2475                 .proc_handler   = proc_dointvec,
2476         },
2477         {
2478                 .procname       = "min_adv_mss",
2479                 .data           = &ip_rt_min_advmss,
2480                 .maxlen         = sizeof(int),
2481                 .mode           = 0644,
2482                 .proc_handler   = proc_dointvec,
2483         },
2484         { }
2485 };
2486
2487 static struct ctl_table ipv4_route_flush_table[] = {
2488         {
2489                 .procname       = "flush",
2490                 .maxlen         = sizeof(int),
2491                 .mode           = 0200,
2492                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2493         },
2494         { },
2495 };
2496
2497 static __net_init int sysctl_route_net_init(struct net *net)
2498 {
2499         struct ctl_table *tbl;
2500
2501         tbl = ipv4_route_flush_table;
2502         if (!net_eq(net, &init_net)) {
2503                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2504                 if (tbl == NULL)
2505                         goto err_dup;
2506         }
2507         tbl[0].extra1 = net;
2508
2509         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2510         if (net->ipv4.route_hdr == NULL)
2511                 goto err_reg;
2512         return 0;
2513
2514 err_reg:
2515         if (tbl != ipv4_route_flush_table)
2516                 kfree(tbl);
2517 err_dup:
2518         return -ENOMEM;
2519 }
2520
2521 static __net_exit void sysctl_route_net_exit(struct net *net)
2522 {
2523         struct ctl_table *tbl;
2524
2525         tbl = net->ipv4.route_hdr->ctl_table_arg;
2526         unregister_net_sysctl_table(net->ipv4.route_hdr);
2527         BUG_ON(tbl == ipv4_route_flush_table);
2528         kfree(tbl);
2529 }
2530
2531 static __net_initdata struct pernet_operations sysctl_route_ops = {
2532         .init = sysctl_route_net_init,
2533         .exit = sysctl_route_net_exit,
2534 };
2535 #endif
2536
2537 static __net_init int rt_genid_init(struct net *net)
2538 {
2539         get_random_bytes(&net->ipv4.rt_genid,
2540                          sizeof(net->ipv4.rt_genid));
2541         get_random_bytes(&net->ipv4.dev_addr_genid,
2542                          sizeof(net->ipv4.dev_addr_genid));
2543         return 0;
2544 }
2545
2546 static __net_initdata struct pernet_operations rt_genid_ops = {
2547         .init = rt_genid_init,
2548 };
2549
2550 static int __net_init ipv4_inetpeer_init(struct net *net)
2551 {
2552         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2553
2554         if (!bp)
2555                 return -ENOMEM;
2556         inet_peer_base_init(bp);
2557         net->ipv4.peers = bp;
2558         return 0;
2559 }
2560
2561 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2562 {
2563         struct inet_peer_base *bp = net->ipv4.peers;
2564
2565         net->ipv4.peers = NULL;
2566         inetpeer_invalidate_tree(bp);
2567         kfree(bp);
2568 }
2569
2570 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2571         .init   =       ipv4_inetpeer_init,
2572         .exit   =       ipv4_inetpeer_exit,
2573 };
2574
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU accounting area; ip_rt_init() allocates it with room for 256
 * struct ip_rt_acct entries.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2578
2579 int __init ip_rt_init(void)
2580 {
2581         int rc = 0;
2582
2583 #ifdef CONFIG_IP_ROUTE_CLASSID
2584         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2585         if (!ip_rt_acct)
2586                 panic("IP: failed to allocate ip_rt_acct\n");
2587 #endif
2588
2589         ipv4_dst_ops.kmem_cachep =
2590                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2591                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2592
2593         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2594
2595         if (dst_entries_init(&ipv4_dst_ops) < 0)
2596                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2597
2598         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2599                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2600
2601         ipv4_dst_ops.gc_thresh = ~0;
2602         ip_rt_max_size = INT_MAX;
2603
2604         devinet_init();
2605         ip_fib_init();
2606
2607         if (ip_rt_proc_init())
2608                 pr_err("Unable to create route proc files\n");
2609 #ifdef CONFIG_XFRM
2610         xfrm_init();
2611         xfrm4_init(ip_rt_max_size);
2612 #endif
2613         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2614
2615 #ifdef CONFIG_SYSCTL
2616         register_pernet_subsys(&sysctl_route_ops);
2617 #endif
2618         register_pernet_subsys(&rt_genid_ops);
2619         register_pernet_subsys(&ipv4_inetpeer_ops);
2620         return rc;
2621 }
2622
2623 #ifdef CONFIG_SYSCTL
2624 /*
2625  * We really need to sanitize the damn ipv4 init order, then all
2626  * this nonsense will go away.
2627  */
2628 void __init ip_static_sysctl_init(void)
2629 {
2630         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2631 }
2632 #endif