Merge branch 'irq-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[cascardo/linux.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #include <linux/kmemleak.h>
110 #endif
111 #include <net/secure_seq.h>
112
/* TOS bits of a flow key that matter for routing, plus the RTO_ONLINK flag. */
#define RT_FL_TOS(fl4p) \
	((fl4p)->flowi4_tos & (RTO_ONLINK | IPTOS_RT_MASK))

/* 300 seconds expressed in jiffies. */
#define RT_GC_TIMEOUT (300*HZ)
117
118 static int ip_rt_max_size;
119 static int ip_rt_redirect_number __read_mostly  = 9;
120 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
121 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
122 static int ip_rt_error_cost __read_mostly       = HZ;
123 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
124 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
125 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
126 static int ip_rt_min_advmss __read_mostly       = 256;
127
128 /*
129  *      Interface to generic destination cache.
130  */
131
132 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
133 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
134 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
135 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
136 static void              ipv4_link_failure(struct sk_buff *skb);
137 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
138                                            struct sk_buff *skb, u32 mtu);
139 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
140                                         struct sk_buff *skb);
141 static void             ipv4_dst_destroy(struct dst_entry *dst);
142
143 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
144 {
145         WARN_ON(1);
146         return NULL;
147 }
148
149 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
150                                            struct sk_buff *skb,
151                                            const void *daddr);
152
153 static struct dst_ops ipv4_dst_ops = {
154         .family =               AF_INET,
155         .protocol =             cpu_to_be16(ETH_P_IP),
156         .check =                ipv4_dst_check,
157         .default_advmss =       ipv4_default_advmss,
158         .mtu =                  ipv4_mtu,
159         .cow_metrics =          ipv4_cow_metrics,
160         .destroy =              ipv4_dst_destroy,
161         .negative_advice =      ipv4_negative_advice,
162         .link_failure =         ipv4_link_failure,
163         .update_pmtu =          ip_rt_update_pmtu,
164         .redirect =             ip_do_redirect,
165         .local_out =            __ip_local_out,
166         .neigh_lookup =         ipv4_neigh_lookup,
167 };
168
169 #define ECN_OR_COST(class)      TC_PRIO_##class
170
171 const __u8 ip_tos2prio[16] = {
172         TC_PRIO_BESTEFFORT,
173         ECN_OR_COST(BESTEFFORT),
174         TC_PRIO_BESTEFFORT,
175         ECN_OR_COST(BESTEFFORT),
176         TC_PRIO_BULK,
177         ECN_OR_COST(BULK),
178         TC_PRIO_BULK,
179         ECN_OR_COST(BULK),
180         TC_PRIO_INTERACTIVE,
181         ECN_OR_COST(INTERACTIVE),
182         TC_PRIO_INTERACTIVE,
183         ECN_OR_COST(INTERACTIVE),
184         TC_PRIO_INTERACTIVE_BULK,
185         ECN_OR_COST(INTERACTIVE_BULK),
186         TC_PRIO_INTERACTIVE_BULK,
187         ECN_OR_COST(INTERACTIVE_BULK)
188 };
189 EXPORT_SYMBOL(ip_tos2prio);
190
191 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
192 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
193
194 #ifdef CONFIG_PROC_FS
195 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
196 {
197         if (*pos)
198                 return NULL;
199         return SEQ_START_TOKEN;
200 }
201
202 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
203 {
204         ++*pos;
205         return NULL;
206 }
207
/* seq_file ->stop for "rt_cache": ->start acquires nothing, so nothing to undo. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
211
212 static int rt_cache_seq_show(struct seq_file *seq, void *v)
213 {
214         if (v == SEQ_START_TOKEN)
215                 seq_printf(seq, "%-127s\n",
216                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
217                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
218                            "HHUptod\tSpecDst");
219         return 0;
220 }
221
222 static const struct seq_operations rt_cache_seq_ops = {
223         .start  = rt_cache_seq_start,
224         .next   = rt_cache_seq_next,
225         .stop   = rt_cache_seq_stop,
226         .show   = rt_cache_seq_show,
227 };
228
229 static int rt_cache_seq_open(struct inode *inode, struct file *file)
230 {
231         return seq_open(file, &rt_cache_seq_ops);
232 }
233
234 static const struct file_operations rt_cache_seq_fops = {
235         .owner   = THIS_MODULE,
236         .open    = rt_cache_seq_open,
237         .read    = seq_read,
238         .llseek  = seq_lseek,
239         .release = seq_release,
240 };
241
242
243 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
244 {
245         int cpu;
246
247         if (*pos == 0)
248                 return SEQ_START_TOKEN;
249
250         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
251                 if (!cpu_possible(cpu))
252                         continue;
253                 *pos = cpu+1;
254                 return &per_cpu(rt_cache_stat, cpu);
255         }
256         return NULL;
257 }
258
259 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
260 {
261         int cpu;
262
263         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
264                 if (!cpu_possible(cpu))
265                         continue;
266                 *pos = cpu+1;
267                 return &per_cpu(rt_cache_stat, cpu);
268         }
269         return NULL;
270
271 }
272
/* seq_file ->stop for the per-CPU statistics file: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
277
278 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
279 {
280         struct rt_cache_stat *st = v;
281
282         if (v == SEQ_START_TOKEN) {
283                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
284                 return 0;
285         }
286
287         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
288                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
289                    dst_entries_get_slow(&ipv4_dst_ops),
290                    0, /* st->in_hit */
291                    st->in_slow_tot,
292                    st->in_slow_mc,
293                    st->in_no_route,
294                    st->in_brd,
295                    st->in_martian_dst,
296                    st->in_martian_src,
297
298                    0, /* st->out_hit */
299                    st->out_slow_tot,
300                    st->out_slow_mc,
301
302                    0, /* st->gc_total */
303                    0, /* st->gc_ignored */
304                    0, /* st->gc_goal_miss */
305                    0, /* st->gc_dst_overflow */
306                    0, /* st->in_hlist_search */
307                    0  /* st->out_hlist_search */
308                 );
309         return 0;
310 }
311
312 static const struct seq_operations rt_cpu_seq_ops = {
313         .start  = rt_cpu_seq_start,
314         .next   = rt_cpu_seq_next,
315         .stop   = rt_cpu_seq_stop,
316         .show   = rt_cpu_seq_show,
317 };
318
319
320 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
321 {
322         return seq_open(file, &rt_cpu_seq_ops);
323 }
324
325 static const struct file_operations rt_cpu_seq_fops = {
326         .owner   = THIS_MODULE,
327         .open    = rt_cpu_seq_open,
328         .read    = seq_read,
329         .llseek  = seq_lseek,
330         .release = seq_release,
331 };
332
333 #ifdef CONFIG_IP_ROUTE_CLASSID
334 static int rt_acct_proc_show(struct seq_file *m, void *v)
335 {
336         struct ip_rt_acct *dst, *src;
337         unsigned int i, j;
338
339         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
340         if (!dst)
341                 return -ENOMEM;
342
343         for_each_possible_cpu(i) {
344                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
345                 for (j = 0; j < 256; j++) {
346                         dst[j].o_bytes   += src[j].o_bytes;
347                         dst[j].o_packets += src[j].o_packets;
348                         dst[j].i_bytes   += src[j].i_bytes;
349                         dst[j].i_packets += src[j].i_packets;
350                 }
351         }
352
353         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
354         kfree(dst);
355         return 0;
356 }
357
358 static int rt_acct_proc_open(struct inode *inode, struct file *file)
359 {
360         return single_open(file, rt_acct_proc_show, NULL);
361 }
362
363 static const struct file_operations rt_acct_proc_fops = {
364         .owner          = THIS_MODULE,
365         .open           = rt_acct_proc_open,
366         .read           = seq_read,
367         .llseek         = seq_lseek,
368         .release        = single_release,
369 };
370 #endif
371
372 static int __net_init ip_rt_do_proc_init(struct net *net)
373 {
374         struct proc_dir_entry *pde;
375
376         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
377                           &rt_cache_seq_fops);
378         if (!pde)
379                 goto err1;
380
381         pde = proc_create("rt_cache", S_IRUGO,
382                           net->proc_net_stat, &rt_cpu_seq_fops);
383         if (!pde)
384                 goto err2;
385
386 #ifdef CONFIG_IP_ROUTE_CLASSID
387         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
388         if (!pde)
389                 goto err3;
390 #endif
391         return 0;
392
393 #ifdef CONFIG_IP_ROUTE_CLASSID
394 err3:
395         remove_proc_entry("rt_cache", net->proc_net_stat);
396 #endif
397 err2:
398         remove_proc_entry("rt_cache", net->proc_net);
399 err1:
400         return -ENOMEM;
401 }
402
403 static void __net_exit ip_rt_do_proc_exit(struct net *net)
404 {
405         remove_proc_entry("rt_cache", net->proc_net_stat);
406         remove_proc_entry("rt_cache", net->proc_net);
407 #ifdef CONFIG_IP_ROUTE_CLASSID
408         remove_proc_entry("rt_acct", net->proc_net);
409 #endif
410 }
411
412 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
413         .init = ip_rt_do_proc_init,
414         .exit = ip_rt_do_proc_exit,
415 };
416
417 static int __init ip_rt_proc_init(void)
418 {
419         return register_pernet_subsys(&ip_rt_proc_ops);
420 }
421
422 #else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
427 #endif /* CONFIG_PROC_FS */
428
429 static inline bool rt_is_expired(const struct rtable *rth)
430 {
431         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
432 }
433
/*
 * Flush cached routes of @net by bumping the namespace's IPv4 generation
 * id; routes carrying the previous id then fail rt_is_expired()'s check.
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
438
439 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
440                                            struct sk_buff *skb,
441                                            const void *daddr)
442 {
443         struct net_device *dev = dst->dev;
444         const __be32 *pkey = daddr;
445         const struct rtable *rt;
446         struct neighbour *n;
447
448         rt = (const struct rtable *) dst;
449         if (rt->rt_gateway)
450                 pkey = (const __be32 *) &rt->rt_gateway;
451         else if (skb)
452                 pkey = &ip_hdr(skb)->daddr;
453
454         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
455         if (n)
456                 return n;
457         return neigh_create(&arp_tbl, pkey, dev);
458 }
459
460 atomic_t *ip_idents __read_mostly;
461 EXPORT_SYMBOL(ip_idents);
462
463 void __ip_select_ident(struct iphdr *iph, int segs)
464 {
465         static u32 ip_idents_hashrnd __read_mostly;
466         u32 hash, id;
467
468         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
469
470         hash = jhash_1word((__force u32)iph->daddr, ip_idents_hashrnd);
471         id = ip_idents_reserve(hash, segs);
472         iph->id = htons(id);
473 }
474 EXPORT_SYMBOL(__ip_select_ident);
475
476 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
477                              const struct iphdr *iph,
478                              int oif, u8 tos,
479                              u8 prot, u32 mark, int flow_flags)
480 {
481         if (sk) {
482                 const struct inet_sock *inet = inet_sk(sk);
483
484                 oif = sk->sk_bound_dev_if;
485                 mark = sk->sk_mark;
486                 tos = RT_CONN_FLAGS(sk);
487                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
488         }
489         flowi4_init_output(fl4, oif, mark, tos,
490                            RT_SCOPE_UNIVERSE, prot,
491                            flow_flags,
492                            iph->daddr, iph->saddr, 0, 0);
493 }
494
495 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
496                                const struct sock *sk)
497 {
498         const struct iphdr *iph = ip_hdr(skb);
499         int oif = skb->dev->ifindex;
500         u8 tos = RT_TOS(iph->tos);
501         u8 prot = iph->protocol;
502         u32 mark = skb->mark;
503
504         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
505 }
506
507 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
508 {
509         const struct inet_sock *inet = inet_sk(sk);
510         const struct ip_options_rcu *inet_opt;
511         __be32 daddr = inet->inet_daddr;
512
513         rcu_read_lock();
514         inet_opt = rcu_dereference(inet->inet_opt);
515         if (inet_opt && inet_opt->opt.srr)
516                 daddr = inet_opt->opt.faddr;
517         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
518                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
519                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
520                            inet_sk_flowi_flags(sk),
521                            daddr, inet->inet_saddr, 0, 0);
522         rcu_read_unlock();
523 }
524
/* Build @fl4 from the packet when there is one, otherwise from the socket. */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}
	build_skb_flow_key(fl4, skb, sk);
}
533
534 static inline void rt_free(struct rtable *rt)
535 {
536         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
537 }
538
539 static DEFINE_SPINLOCK(fnhe_lock);
540
541 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
542 {
543         struct rtable *rt;
544
545         rt = rcu_dereference(fnhe->fnhe_rth_input);
546         if (rt) {
547                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
548                 rt_free(rt);
549         }
550         rt = rcu_dereference(fnhe->fnhe_rth_output);
551         if (rt) {
552                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
553                 rt_free(rt);
554         }
555 }
556
557 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
558 {
559         struct fib_nh_exception *fnhe, *oldest;
560
561         oldest = rcu_dereference(hash->chain);
562         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
563              fnhe = rcu_dereference(fnhe->fnhe_next)) {
564                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
565                         oldest = fnhe;
566         }
567         fnhe_flush_routes(oldest);
568         return oldest;
569 }
570
571 static inline u32 fnhe_hashfun(__be32 daddr)
572 {
573         u32 hval;
574
575         hval = (__force u32) daddr;
576         hval ^= (hval >> 11) ^ (hval >> 22);
577
578         return hval & (FNHE_HASH_SIZE - 1);
579 }
580
581 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
582 {
583         rt->rt_pmtu = fnhe->fnhe_pmtu;
584         rt->dst.expires = fnhe->fnhe_expires;
585
586         if (fnhe->fnhe_gw) {
587                 rt->rt_flags |= RTCF_REDIRECTED;
588                 rt->rt_gateway = fnhe->fnhe_gw;
589                 rt->rt_uses_gateway = 1;
590         }
591 }
592
593 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
594                                   u32 pmtu, unsigned long expires)
595 {
596         struct fnhe_hash_bucket *hash;
597         struct fib_nh_exception *fnhe;
598         struct rtable *rt;
599         unsigned int i;
600         int depth;
601         u32 hval = fnhe_hashfun(daddr);
602
603         spin_lock_bh(&fnhe_lock);
604
605         hash = nh->nh_exceptions;
606         if (!hash) {
607                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
608                 if (!hash)
609                         goto out_unlock;
610                 nh->nh_exceptions = hash;
611         }
612
613         hash += hval;
614
615         depth = 0;
616         for (fnhe = rcu_dereference(hash->chain); fnhe;
617              fnhe = rcu_dereference(fnhe->fnhe_next)) {
618                 if (fnhe->fnhe_daddr == daddr)
619                         break;
620                 depth++;
621         }
622
623         if (fnhe) {
624                 if (gw)
625                         fnhe->fnhe_gw = gw;
626                 if (pmtu) {
627                         fnhe->fnhe_pmtu = pmtu;
628                         fnhe->fnhe_expires = max(1UL, expires);
629                 }
630                 /* Update all cached dsts too */
631                 rt = rcu_dereference(fnhe->fnhe_rth_input);
632                 if (rt)
633                         fill_route_from_fnhe(rt, fnhe);
634                 rt = rcu_dereference(fnhe->fnhe_rth_output);
635                 if (rt)
636                         fill_route_from_fnhe(rt, fnhe);
637         } else {
638                 if (depth > FNHE_RECLAIM_DEPTH)
639                         fnhe = fnhe_oldest(hash);
640                 else {
641                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
642                         if (!fnhe)
643                                 goto out_unlock;
644
645                         fnhe->fnhe_next = hash->chain;
646                         rcu_assign_pointer(hash->chain, fnhe);
647                 }
648                 fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
649                 fnhe->fnhe_daddr = daddr;
650                 fnhe->fnhe_gw = gw;
651                 fnhe->fnhe_pmtu = pmtu;
652                 fnhe->fnhe_expires = expires;
653
654                 /* Exception created; mark the cached routes for the nexthop
655                  * stale, so anyone caching it rechecks if this exception
656                  * applies to them.
657                  */
658                 rt = rcu_dereference(nh->nh_rth_input);
659                 if (rt)
660                         rt->dst.obsolete = DST_OBSOLETE_KILL;
661
662                 for_each_possible_cpu(i) {
663                         struct rtable __rcu **prt;
664                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
665                         rt = rcu_dereference(*prt);
666                         if (rt)
667                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
668                 }
669         }
670
671         fnhe->fnhe_stamp = jiffies;
672
673 out_unlock:
674         spin_unlock_bh(&fnhe_lock);
675 }
676
677 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
678                              bool kill_route)
679 {
680         __be32 new_gw = icmp_hdr(skb)->un.gateway;
681         __be32 old_gw = ip_hdr(skb)->saddr;
682         struct net_device *dev = skb->dev;
683         struct in_device *in_dev;
684         struct fib_result res;
685         struct neighbour *n;
686         struct net *net;
687
688         switch (icmp_hdr(skb)->code & 7) {
689         case ICMP_REDIR_NET:
690         case ICMP_REDIR_NETTOS:
691         case ICMP_REDIR_HOST:
692         case ICMP_REDIR_HOSTTOS:
693                 break;
694
695         default:
696                 return;
697         }
698
699         if (rt->rt_gateway != old_gw)
700                 return;
701
702         in_dev = __in_dev_get_rcu(dev);
703         if (!in_dev)
704                 return;
705
706         net = dev_net(dev);
707         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
708             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
709             ipv4_is_zeronet(new_gw))
710                 goto reject_redirect;
711
712         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
713                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
714                         goto reject_redirect;
715                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
716                         goto reject_redirect;
717         } else {
718                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
719                         goto reject_redirect;
720         }
721
722         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
723         if (n) {
724                 if (!(n->nud_state & NUD_VALID)) {
725                         neigh_event_send(n, NULL);
726                 } else {
727                         if (fib_lookup(net, fl4, &res) == 0) {
728                                 struct fib_nh *nh = &FIB_RES_NH(res);
729
730                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
731                                                       0, 0);
732                         }
733                         if (kill_route)
734                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
735                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
736                 }
737                 neigh_release(n);
738         }
739         return;
740
741 reject_redirect:
742 #ifdef CONFIG_IP_ROUTE_VERBOSE
743         if (IN_DEV_LOG_MARTIANS(in_dev)) {
744                 const struct iphdr *iph = (const struct iphdr *) skb->data;
745                 __be32 daddr = iph->daddr;
746                 __be32 saddr = iph->saddr;
747
748                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
749                                      "  Advised path = %pI4 -> %pI4\n",
750                                      &old_gw, dev->name, &new_gw,
751                                      &saddr, &daddr);
752         }
753 #endif
754         ;
755 }
756
757 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
758 {
759         struct rtable *rt;
760         struct flowi4 fl4;
761         const struct iphdr *iph = (const struct iphdr *) skb->data;
762         int oif = skb->dev->ifindex;
763         u8 tos = RT_TOS(iph->tos);
764         u8 prot = iph->protocol;
765         u32 mark = skb->mark;
766
767         rt = (struct rtable *) dst;
768
769         __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
770         __ip_do_redirect(rt, skb, &fl4, true);
771 }
772
773 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
774 {
775         struct rtable *rt = (struct rtable *)dst;
776         struct dst_entry *ret = dst;
777
778         if (rt) {
779                 if (dst->obsolete > 0) {
780                         ip_rt_put(rt);
781                         ret = NULL;
782                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
783                            rt->dst.expires) {
784                         ip_rt_put(rt);
785                         ret = NULL;
786                 }
787         }
788         return ret;
789 }
790
791 /*
792  * Algorithm:
793  *      1. The first ip_rt_redirect_number redirects are sent
794  *         with exponential backoff, then we stop sending them at all,
795  *         assuming that the host ignores our redirects.
796  *      2. If we did not see packets requiring redirects
797  *         during ip_rt_redirect_silence, we assume that the host
798  *         forgot redirected route and start to send redirects again.
799  *
800  * This algorithm is much cheaper and more intelligent than dumb load limiting
801  * in icmp.c.
802  *
803  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
804  * and "frag. need" (breaks PMTU discovery) in icmp.c.
805  */
806
807 void ip_rt_send_redirect(struct sk_buff *skb)
808 {
809         struct rtable *rt = skb_rtable(skb);
810         struct in_device *in_dev;
811         struct inet_peer *peer;
812         struct net *net;
813         int log_martians;
814
815         rcu_read_lock();
816         in_dev = __in_dev_get_rcu(rt->dst.dev);
817         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
818                 rcu_read_unlock();
819                 return;
820         }
821         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
822         rcu_read_unlock();
823
824         net = dev_net(rt->dst.dev);
825         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
826         if (!peer) {
827                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
828                           rt_nexthop(rt, ip_hdr(skb)->daddr));
829                 return;
830         }
831
832         /* No redirected packets during ip_rt_redirect_silence;
833          * reset the algorithm.
834          */
835         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
836                 peer->rate_tokens = 0;
837
838         /* Too many ignored redirects; do not send anything
839          * set dst.rate_last to the last seen redirected packet.
840          */
841         if (peer->rate_tokens >= ip_rt_redirect_number) {
842                 peer->rate_last = jiffies;
843                 goto out_put_peer;
844         }
845
846         /* Check for load limit; set rate_last to the latest sent
847          * redirect.
848          */
849         if (peer->rate_tokens == 0 ||
850             time_after(jiffies,
851                        (peer->rate_last +
852                         (ip_rt_redirect_load << peer->rate_tokens)))) {
853                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
854
855                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
856                 peer->rate_last = jiffies;
857                 ++peer->rate_tokens;
858 #ifdef CONFIG_IP_ROUTE_VERBOSE
859                 if (log_martians &&
860                     peer->rate_tokens == ip_rt_redirect_number)
861                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
862                                              &ip_hdr(skb)->saddr, inet_iif(skb),
863                                              &ip_hdr(skb)->daddr, &gw);
864 #endif
865         }
866 out_put_peer:
867         inet_putpeer(peer);
868 }
869
870 static int ip_error(struct sk_buff *skb)
871 {
872         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
873         struct rtable *rt = skb_rtable(skb);
874         struct inet_peer *peer;
875         unsigned long now;
876         struct net *net;
877         bool send;
878         int code;
879
880         net = dev_net(rt->dst.dev);
881         if (!IN_DEV_FORWARD(in_dev)) {
882                 switch (rt->dst.error) {
883                 case EHOSTUNREACH:
884                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
885                         break;
886
887                 case ENETUNREACH:
888                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
889                         break;
890                 }
891                 goto out;
892         }
893
894         switch (rt->dst.error) {
895         case EINVAL:
896         default:
897                 goto out;
898         case EHOSTUNREACH:
899                 code = ICMP_HOST_UNREACH;
900                 break;
901         case ENETUNREACH:
902                 code = ICMP_NET_UNREACH;
903                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
904                 break;
905         case EACCES:
906                 code = ICMP_PKT_FILTERED;
907                 break;
908         }
909
910         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
911
912         send = true;
913         if (peer) {
914                 now = jiffies;
915                 peer->rate_tokens += now - peer->rate_last;
916                 if (peer->rate_tokens > ip_rt_error_burst)
917                         peer->rate_tokens = ip_rt_error_burst;
918                 peer->rate_last = now;
919                 if (peer->rate_tokens >= ip_rt_error_cost)
920                         peer->rate_tokens -= ip_rt_error_cost;
921                 else
922                         send = false;
923                 inet_putpeer(peer);
924         }
925         if (send)
926                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
927
928 out:    kfree_skb(skb);
929         return 0;
930 }
931
932 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
933 {
934         struct dst_entry *dst = &rt->dst;
935         struct fib_result res;
936
937         if (dst_metric_locked(dst, RTAX_MTU))
938                 return;
939
940         if (dst->dev->mtu < mtu)
941                 return;
942
943         if (mtu < ip_rt_min_pmtu)
944                 mtu = ip_rt_min_pmtu;
945
946         if (rt->rt_pmtu == mtu &&
947             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
948                 return;
949
950         rcu_read_lock();
951         if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
952                 struct fib_nh *nh = &FIB_RES_NH(res);
953
954                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
955                                       jiffies + ip_rt_mtu_expires);
956         }
957         rcu_read_unlock();
958 }
959
960 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
961                               struct sk_buff *skb, u32 mtu)
962 {
963         struct rtable *rt = (struct rtable *) dst;
964         struct flowi4 fl4;
965
966         ip_rt_build_flow_key(&fl4, sk, skb);
967         __ip_rt_update_pmtu(rt, &fl4, mtu);
968 }
969
970 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
971                       int oif, u32 mark, u8 protocol, int flow_flags)
972 {
973         const struct iphdr *iph = (const struct iphdr *) skb->data;
974         struct flowi4 fl4;
975         struct rtable *rt;
976
977         if (!mark)
978                 mark = IP4_REPLY_MARK(net, skb->mark);
979
980         __build_flow_key(&fl4, NULL, iph, oif,
981                          RT_TOS(iph->tos), protocol, mark, flow_flags);
982         rt = __ip_route_output_key(net, &fl4);
983         if (!IS_ERR(rt)) {
984                 __ip_rt_update_pmtu(rt, &fl4, mtu);
985                 ip_rt_put(rt);
986         }
987 }
988 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
989
990 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
991 {
992         const struct iphdr *iph = (const struct iphdr *) skb->data;
993         struct flowi4 fl4;
994         struct rtable *rt;
995
996         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
997
998         if (!fl4.flowi4_mark)
999                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1000
1001         rt = __ip_route_output_key(sock_net(sk), &fl4);
1002         if (!IS_ERR(rt)) {
1003                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1004                 ip_rt_put(rt);
1005         }
1006 }
1007
1008 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1009 {
1010         const struct iphdr *iph = (const struct iphdr *) skb->data;
1011         struct flowi4 fl4;
1012         struct rtable *rt;
1013         struct dst_entry *odst = NULL;
1014         bool new = false;
1015
1016         bh_lock_sock(sk);
1017
1018         if (!ip_sk_accept_pmtu(sk))
1019                 goto out;
1020
1021         odst = sk_dst_get(sk);
1022
1023         if (sock_owned_by_user(sk) || !odst) {
1024                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1025                 goto out;
1026         }
1027
1028         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1029
1030         rt = (struct rtable *)odst;
1031         if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
1032                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1033                 if (IS_ERR(rt))
1034                         goto out;
1035
1036                 new = true;
1037         }
1038
1039         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1040
1041         if (!dst_check(&rt->dst, 0)) {
1042                 if (new)
1043                         dst_release(&rt->dst);
1044
1045                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1046                 if (IS_ERR(rt))
1047                         goto out;
1048
1049                 new = true;
1050         }
1051
1052         if (new)
1053                 sk_dst_set(sk, &rt->dst);
1054
1055 out:
1056         bh_unlock_sock(sk);
1057         dst_release(odst);
1058 }
1059 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1060
1061 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1062                    int oif, u32 mark, u8 protocol, int flow_flags)
1063 {
1064         const struct iphdr *iph = (const struct iphdr *) skb->data;
1065         struct flowi4 fl4;
1066         struct rtable *rt;
1067
1068         __build_flow_key(&fl4, NULL, iph, oif,
1069                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1070         rt = __ip_route_output_key(net, &fl4);
1071         if (!IS_ERR(rt)) {
1072                 __ip_do_redirect(rt, skb, &fl4, false);
1073                 ip_rt_put(rt);
1074         }
1075 }
1076 EXPORT_SYMBOL_GPL(ipv4_redirect);
1077
1078 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1079 {
1080         const struct iphdr *iph = (const struct iphdr *) skb->data;
1081         struct flowi4 fl4;
1082         struct rtable *rt;
1083
1084         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1085         rt = __ip_route_output_key(sock_net(sk), &fl4);
1086         if (!IS_ERR(rt)) {
1087                 __ip_do_redirect(rt, skb, &fl4, false);
1088                 ip_rt_put(rt);
1089         }
1090 }
1091 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1092
1093 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1094 {
1095         struct rtable *rt = (struct rtable *) dst;
1096
1097         /* All IPV4 dsts are created with ->obsolete set to the value
1098          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1099          * into this function always.
1100          *
1101          * When a PMTU/redirect information update invalidates a route,
1102          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1103          * DST_OBSOLETE_DEAD by dst_free().
1104          */
1105         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1106                 return NULL;
1107         return dst;
1108 }
1109
1110 static void ipv4_link_failure(struct sk_buff *skb)
1111 {
1112         struct rtable *rt;
1113
1114         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1115
1116         rt = skb_rtable(skb);
1117         if (rt)
1118                 dst_set_expires(&rt->dst, 0);
1119 }
1120
1121 static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
1122 {
1123         pr_debug("%s: %pI4 -> %pI4, %s\n",
1124                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1125                  skb->dev ? skb->dev->name : "?");
1126         kfree_skb(skb);
1127         WARN_ON(1);
1128         return 0;
1129 }
1130
1131 /*
1132    We do not cache source address of outgoing interface,
1133    because it is used only by IP RR, TS and SRR options,
1134    so that it out of fast path.
1135
1136    BTW remember: "addr" is allowed to be not aligned
1137    in IP options!
1138  */
1139
1140 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1141 {
1142         __be32 src;
1143
1144         if (rt_is_output_route(rt))
1145                 src = ip_hdr(skb)->saddr;
1146         else {
1147                 struct fib_result res;
1148                 struct flowi4 fl4;
1149                 struct iphdr *iph;
1150
1151                 iph = ip_hdr(skb);
1152
1153                 memset(&fl4, 0, sizeof(fl4));
1154                 fl4.daddr = iph->daddr;
1155                 fl4.saddr = iph->saddr;
1156                 fl4.flowi4_tos = RT_TOS(iph->tos);
1157                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1158                 fl4.flowi4_iif = skb->dev->ifindex;
1159                 fl4.flowi4_mark = skb->mark;
1160
1161                 rcu_read_lock();
1162                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1163                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1164                 else
1165                         src = inet_select_addr(rt->dst.dev,
1166                                                rt_nexthop(rt, iph->daddr),
1167                                                RT_SCOPE_UNIVERSE);
1168                 rcu_read_unlock();
1169         }
1170         memcpy(addr, &src, 4);
1171 }
1172
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and the high 16 bits are
 * handled independently: a half is taken from @tag only if that half of
 * the route's tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        static const u32 halves[] = { 0xFFFF, 0xFFFF0000 };
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(halves); i++) {
                if (!(rt->dst.tclassid & halves[i]))
                        rt->dst.tclassid |= tag & halves[i];
        }
}
#endif
1182
1183 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1184 {
1185         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1186
1187         if (advmss == 0) {
1188                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1189                                ip_rt_min_advmss);
1190                 if (advmss > 65535 - 40)
1191                         advmss = 65535 - 40;
1192         }
1193         return advmss;
1194 }
1195
1196 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1197 {
1198         const struct rtable *rt = (const struct rtable *) dst;
1199         unsigned int mtu = rt->rt_pmtu;
1200
1201         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1202                 mtu = dst_metric_raw(dst, RTAX_MTU);
1203
1204         if (mtu)
1205                 return mtu;
1206
1207         mtu = dst->dev->mtu;
1208
1209         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1210                 if (rt->rt_uses_gateway && mtu > 576)
1211                         mtu = 576;
1212         }
1213
1214         return min_t(unsigned int, mtu, IP_MAX_MTU);
1215 }
1216
1217 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1218 {
1219         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1220         struct fib_nh_exception *fnhe;
1221         u32 hval;
1222
1223         if (!hash)
1224                 return NULL;
1225
1226         hval = fnhe_hashfun(daddr);
1227
1228         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1229              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1230                 if (fnhe->fnhe_daddr == daddr)
1231                         return fnhe;
1232         }
1233         return NULL;
1234 }
1235
1236 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1237                               __be32 daddr)
1238 {
1239         bool ret = false;
1240
1241         spin_lock_bh(&fnhe_lock);
1242
1243         if (daddr == fnhe->fnhe_daddr) {
1244                 struct rtable __rcu **porig;
1245                 struct rtable *orig;
1246                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1247
1248                 if (rt_is_input_route(rt))
1249                         porig = &fnhe->fnhe_rth_input;
1250                 else
1251                         porig = &fnhe->fnhe_rth_output;
1252                 orig = rcu_dereference(*porig);
1253
1254                 if (fnhe->fnhe_genid != genid) {
1255                         fnhe->fnhe_genid = genid;
1256                         fnhe->fnhe_gw = 0;
1257                         fnhe->fnhe_pmtu = 0;
1258                         fnhe->fnhe_expires = 0;
1259                         fnhe_flush_routes(fnhe);
1260                         orig = NULL;
1261                 }
1262                 fill_route_from_fnhe(rt, fnhe);
1263                 if (!rt->rt_gateway)
1264                         rt->rt_gateway = daddr;
1265
1266                 if (!(rt->dst.flags & DST_NOCACHE)) {
1267                         rcu_assign_pointer(*porig, rt);
1268                         if (orig)
1269                                 rt_free(orig);
1270                         ret = true;
1271                 }
1272
1273                 fnhe->fnhe_stamp = jiffies;
1274         }
1275         spin_unlock_bh(&fnhe_lock);
1276
1277         return ret;
1278 }
1279
1280 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1281 {
1282         struct rtable *orig, *prev, **p;
1283         bool ret = true;
1284
1285         if (rt_is_input_route(rt)) {
1286                 p = (struct rtable **)&nh->nh_rth_input;
1287         } else {
1288                 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1289         }
1290         orig = *p;
1291
1292         prev = cmpxchg(p, orig, rt);
1293         if (prev == orig) {
1294                 if (orig)
1295                         rt_free(orig);
1296         } else
1297                 ret = false;
1298
1299         return ret;
1300 }
1301
1302 static DEFINE_SPINLOCK(rt_uncached_lock);
1303 static LIST_HEAD(rt_uncached_list);
1304
1305 static void rt_add_uncached_list(struct rtable *rt)
1306 {
1307         spin_lock_bh(&rt_uncached_lock);
1308         list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1309         spin_unlock_bh(&rt_uncached_lock);
1310 }
1311
1312 static void ipv4_dst_destroy(struct dst_entry *dst)
1313 {
1314         struct rtable *rt = (struct rtable *) dst;
1315
1316         if (!list_empty(&rt->rt_uncached)) {
1317                 spin_lock_bh(&rt_uncached_lock);
1318                 list_del(&rt->rt_uncached);
1319                 spin_unlock_bh(&rt_uncached_lock);
1320         }
1321 }
1322
1323 void rt_flush_dev(struct net_device *dev)
1324 {
1325         if (!list_empty(&rt_uncached_list)) {
1326                 struct net *net = dev_net(dev);
1327                 struct rtable *rt;
1328
1329                 spin_lock_bh(&rt_uncached_lock);
1330                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1331                         if (rt->dst.dev != dev)
1332                                 continue;
1333                         rt->dst.dev = net->loopback_dev;
1334                         dev_hold(rt->dst.dev);
1335                         dev_put(dev);
1336                 }
1337                 spin_unlock_bh(&rt_uncached_lock);
1338         }
1339 }
1340
1341 static bool rt_cache_valid(const struct rtable *rt)
1342 {
1343         return  rt &&
1344                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1345                 !rt_is_expired(rt);
1346 }
1347
1348 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1349                            const struct fib_result *res,
1350                            struct fib_nh_exception *fnhe,
1351                            struct fib_info *fi, u16 type, u32 itag)
1352 {
1353         bool cached = false;
1354
1355         if (fi) {
1356                 struct fib_nh *nh = &FIB_RES_NH(*res);
1357
1358                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1359                         rt->rt_gateway = nh->nh_gw;
1360                         rt->rt_uses_gateway = 1;
1361                 }
1362                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1363 #ifdef CONFIG_IP_ROUTE_CLASSID
1364                 rt->dst.tclassid = nh->nh_tclassid;
1365 #endif
1366                 if (unlikely(fnhe))
1367                         cached = rt_bind_exception(rt, fnhe, daddr);
1368                 else if (!(rt->dst.flags & DST_NOCACHE))
1369                         cached = rt_cache_route(nh, rt);
1370                 if (unlikely(!cached)) {
1371                         /* Routes we intend to cache in nexthop exception or
1372                          * FIB nexthop have the DST_NOCACHE bit clear.
1373                          * However, if we are unsuccessful at storing this
1374                          * route into the cache we really need to set it.
1375                          */
1376                         rt->dst.flags |= DST_NOCACHE;
1377                         if (!rt->rt_gateway)
1378                                 rt->rt_gateway = daddr;
1379                         rt_add_uncached_list(rt);
1380                 }
1381         } else
1382                 rt_add_uncached_list(rt);
1383
1384 #ifdef CONFIG_IP_ROUTE_CLASSID
1385 #ifdef CONFIG_IP_MULTIPLE_TABLES
1386         set_class_tag(rt, res->tclassid);
1387 #endif
1388         set_class_tag(rt, itag);
1389 #endif
1390 }
1391
1392 static struct rtable *rt_dst_alloc(struct net_device *dev,
1393                                    bool nopolicy, bool noxfrm, bool will_cache)
1394 {
1395         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1396                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1397                          (nopolicy ? DST_NOPOLICY : 0) |
1398                          (noxfrm ? DST_NOXFRM : 0));
1399 }
1400
1401 /* called in rcu_read_lock() section */
1402 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1403                                 u8 tos, struct net_device *dev, int our)
1404 {
1405         struct rtable *rth;
1406         struct in_device *in_dev = __in_dev_get_rcu(dev);
1407         u32 itag = 0;
1408         int err;
1409
1410         /* Primary sanity checks. */
1411
1412         if (in_dev == NULL)
1413                 return -EINVAL;
1414
1415         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1416             skb->protocol != htons(ETH_P_IP))
1417                 goto e_inval;
1418
1419         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1420                 if (ipv4_is_loopback(saddr))
1421                         goto e_inval;
1422
1423         if (ipv4_is_zeronet(saddr)) {
1424                 if (!ipv4_is_local_multicast(daddr))
1425                         goto e_inval;
1426         } else {
1427                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1428                                           in_dev, &itag);
1429                 if (err < 0)
1430                         goto e_err;
1431         }
1432         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1433                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1434         if (!rth)
1435                 goto e_nobufs;
1436
1437 #ifdef CONFIG_IP_ROUTE_CLASSID
1438         rth->dst.tclassid = itag;
1439 #endif
1440         rth->dst.output = ip_rt_bug;
1441
1442         rth->rt_genid   = rt_genid_ipv4(dev_net(dev));
1443         rth->rt_flags   = RTCF_MULTICAST;
1444         rth->rt_type    = RTN_MULTICAST;
1445         rth->rt_is_input= 1;
1446         rth->rt_iif     = 0;
1447         rth->rt_pmtu    = 0;
1448         rth->rt_gateway = 0;
1449         rth->rt_uses_gateway = 0;
1450         INIT_LIST_HEAD(&rth->rt_uncached);
1451         if (our) {
1452                 rth->dst.input= ip_local_deliver;
1453                 rth->rt_flags |= RTCF_LOCAL;
1454         }
1455
1456 #ifdef CONFIG_IP_MROUTE
1457         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1458                 rth->dst.input = ip_mr_input;
1459 #endif
1460         RT_CACHE_STAT_INC(in_slow_mc);
1461
1462         skb_dst_set(skb, &rth->dst);
1463         return 0;
1464
1465 e_nobufs:
1466         return -ENOBUFS;
1467 e_inval:
1468         return -EINVAL;
1469 e_err:
1470         return err;
1471 }
1472
1473
1474 static void ip_handle_martian_source(struct net_device *dev,
1475                                      struct in_device *in_dev,
1476                                      struct sk_buff *skb,
1477                                      __be32 daddr,
1478                                      __be32 saddr)
1479 {
1480         RT_CACHE_STAT_INC(in_martian_src);
1481 #ifdef CONFIG_IP_ROUTE_VERBOSE
1482         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1483                 /*
1484                  *      RFC1812 recommendation, if source is martian,
1485                  *      the only hint is MAC header.
1486                  */
1487                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1488                         &daddr, &saddr, dev->name);
1489                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1490                         print_hex_dump(KERN_WARNING, "ll header: ",
1491                                        DUMP_PREFIX_OFFSET, 16, 1,
1492                                        skb_mac_header(skb),
1493                                        dev->hard_header_len, true);
1494                 }
1495         }
1496 #endif
1497 }
1498
1499 /* called in rcu_read_lock() section */
1500 static int __mkroute_input(struct sk_buff *skb,
1501                            const struct fib_result *res,
1502                            struct in_device *in_dev,
1503                            __be32 daddr, __be32 saddr, u32 tos)
1504 {
1505         struct fib_nh_exception *fnhe;
1506         struct rtable *rth;
1507         int err;
1508         struct in_device *out_dev;
1509         unsigned int flags = 0;
1510         bool do_cache;
1511         u32 itag = 0;
1512
1513         /* get a working reference to the output device */
1514         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1515         if (out_dev == NULL) {
1516                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1517                 return -EINVAL;
1518         }
1519
1520         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1521                                   in_dev->dev, in_dev, &itag);
1522         if (err < 0) {
1523                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1524                                          saddr);
1525
1526                 goto cleanup;
1527         }
1528
1529         do_cache = res->fi && !itag;
1530         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1531             (IN_DEV_SHARED_MEDIA(out_dev) ||
1532              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1533                 flags |= RTCF_DOREDIRECT;
1534                 do_cache = false;
1535         }
1536
1537         if (skb->protocol != htons(ETH_P_IP)) {
1538                 /* Not IP (i.e. ARP). Do not create route, if it is
1539                  * invalid for proxy arp. DNAT routes are always valid.
1540                  *
1541                  * Proxy arp feature have been extended to allow, ARP
1542                  * replies back to the same interface, to support
1543                  * Private VLAN switch technologies. See arp.c.
1544                  */
1545                 if (out_dev == in_dev &&
1546                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1547                         err = -EINVAL;
1548                         goto cleanup;
1549                 }
1550         }
1551
1552         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1553         if (do_cache) {
1554                 if (fnhe != NULL)
1555                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1556                 else
1557                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1558
1559                 if (rt_cache_valid(rth)) {
1560                         skb_dst_set_noref(skb, &rth->dst);
1561                         goto out;
1562                 }
1563         }
1564
1565         rth = rt_dst_alloc(out_dev->dev,
1566                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1567                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1568         if (!rth) {
1569                 err = -ENOBUFS;
1570                 goto cleanup;
1571         }
1572
1573         rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1574         rth->rt_flags = flags;
1575         rth->rt_type = res->type;
1576         rth->rt_is_input = 1;
1577         rth->rt_iif     = 0;
1578         rth->rt_pmtu    = 0;
1579         rth->rt_gateway = 0;
1580         rth->rt_uses_gateway = 0;
1581         INIT_LIST_HEAD(&rth->rt_uncached);
1582         RT_CACHE_STAT_INC(in_slow_tot);
1583
1584         rth->dst.input = ip_forward;
1585         rth->dst.output = ip_output;
1586
1587         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1588         skb_dst_set(skb, &rth->dst);
1589 out:
1590         err = 0;
1591  cleanup:
1592         return err;
1593 }
1594
1595 static int ip_mkroute_input(struct sk_buff *skb,
1596                             struct fib_result *res,
1597                             const struct flowi4 *fl4,
1598                             struct in_device *in_dev,
1599                             __be32 daddr, __be32 saddr, u32 tos)
1600 {
1601 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1602         if (res->fi && res->fi->fib_nhs > 1)
1603                 fib_select_multipath(res);
1604 #endif
1605
1606         /* create a routing cache entry */
1607         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1608 }
1609
1610 /*
1611  *      NOTE. We drop all the packets that has local source
1612  *      addresses, because every properly looped back packet
1613  *      must have correct destination already attached by output routine.
1614  *
1615  *      Such approach solves two big problems:
1616  *      1. Not simplex devices are handled properly.
1617  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1618  *      called with rcu_read_lock()
1619  */
1620
1621 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1622                                u8 tos, struct net_device *dev)
1623 {
1624         struct fib_result res;
1625         struct in_device *in_dev = __in_dev_get_rcu(dev);
1626         struct flowi4   fl4;
1627         unsigned int    flags = 0;
1628         u32             itag = 0;
1629         struct rtable   *rth;
1630         int             err = -EINVAL;
1631         struct net    *net = dev_net(dev);
1632         bool do_cache;
1633
1634         /* IP on this device is disabled. */
1635
1636         if (!in_dev)
1637                 goto out;
1638
1639         /* Check for the most weird martians, which can be not detected
1640            by fib_lookup.
1641          */
1642
1643         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1644                 goto martian_source;
1645
1646         res.fi = NULL;
1647         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1648                 goto brd_input;
1649
1650         /* Accept zero addresses only to limited broadcast;
1651          * I even do not know to fix it or not. Waiting for complains :-)
1652          */
1653         if (ipv4_is_zeronet(saddr))
1654                 goto martian_source;
1655
1656         if (ipv4_is_zeronet(daddr))
1657                 goto martian_destination;
1658
1659         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1660          * and call it once if daddr or/and saddr are loopback addresses
1661          */
1662         if (ipv4_is_loopback(daddr)) {
1663                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1664                         goto martian_destination;
1665         } else if (ipv4_is_loopback(saddr)) {
1666                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1667                         goto martian_source;
1668         }
1669
1670         /*
1671          *      Now we are ready to route packet.
1672          */
1673         fl4.flowi4_oif = 0;
1674         fl4.flowi4_iif = dev->ifindex;
1675         fl4.flowi4_mark = skb->mark;
1676         fl4.flowi4_tos = tos;
1677         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1678         fl4.daddr = daddr;
1679         fl4.saddr = saddr;
1680         err = fib_lookup(net, &fl4, &res);
1681         if (err != 0) {
1682                 if (!IN_DEV_FORWARD(in_dev))
1683                         err = -EHOSTUNREACH;
1684                 goto no_route;
1685         }
1686
1687         if (res.type == RTN_BROADCAST)
1688                 goto brd_input;
1689
1690         if (res.type == RTN_LOCAL) {
1691                 err = fib_validate_source(skb, saddr, daddr, tos,
1692                                           0, dev, in_dev, &itag);
1693                 if (err < 0)
1694                         goto martian_source_keep_err;
1695                 goto local_input;
1696         }
1697
1698         if (!IN_DEV_FORWARD(in_dev)) {
1699                 err = -EHOSTUNREACH;
1700                 goto no_route;
1701         }
1702         if (res.type != RTN_UNICAST)
1703                 goto martian_destination;
1704
1705         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1706 out:    return err;
1707
1708 brd_input:
1709         if (skb->protocol != htons(ETH_P_IP))
1710                 goto e_inval;
1711
1712         if (!ipv4_is_zeronet(saddr)) {
1713                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1714                                           in_dev, &itag);
1715                 if (err < 0)
1716                         goto martian_source_keep_err;
1717         }
1718         flags |= RTCF_BROADCAST;
1719         res.type = RTN_BROADCAST;
1720         RT_CACHE_STAT_INC(in_brd);
1721
1722 local_input:
1723         do_cache = false;
1724         if (res.fi) {
1725                 if (!itag) {
1726                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1727                         if (rt_cache_valid(rth)) {
1728                                 skb_dst_set_noref(skb, &rth->dst);
1729                                 err = 0;
1730                                 goto out;
1731                         }
1732                         do_cache = true;
1733                 }
1734         }
1735
1736         rth = rt_dst_alloc(net->loopback_dev,
1737                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1738         if (!rth)
1739                 goto e_nobufs;
1740
1741         rth->dst.input= ip_local_deliver;
1742         rth->dst.output= ip_rt_bug;
1743 #ifdef CONFIG_IP_ROUTE_CLASSID
1744         rth->dst.tclassid = itag;
1745 #endif
1746
1747         rth->rt_genid = rt_genid_ipv4(net);
1748         rth->rt_flags   = flags|RTCF_LOCAL;
1749         rth->rt_type    = res.type;
1750         rth->rt_is_input = 1;
1751         rth->rt_iif     = 0;
1752         rth->rt_pmtu    = 0;
1753         rth->rt_gateway = 0;
1754         rth->rt_uses_gateway = 0;
1755         INIT_LIST_HEAD(&rth->rt_uncached);
1756         RT_CACHE_STAT_INC(in_slow_tot);
1757         if (res.type == RTN_UNREACHABLE) {
1758                 rth->dst.input= ip_error;
1759                 rth->dst.error= -err;
1760                 rth->rt_flags   &= ~RTCF_LOCAL;
1761         }
1762         if (do_cache) {
1763                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1764                         rth->dst.flags |= DST_NOCACHE;
1765                         rt_add_uncached_list(rth);
1766                 }
1767         }
1768         skb_dst_set(skb, &rth->dst);
1769         err = 0;
1770         goto out;
1771
1772 no_route:
1773         RT_CACHE_STAT_INC(in_no_route);
1774         res.type = RTN_UNREACHABLE;
1775         if (err == -ESRCH)
1776                 err = -ENETUNREACH;
1777         goto local_input;
1778
1779         /*
1780          *      Do not cache martian addresses: they should be logged (RFC1812)
1781          */
1782 martian_destination:
1783         RT_CACHE_STAT_INC(in_martian_dst);
1784 #ifdef CONFIG_IP_ROUTE_VERBOSE
1785         if (IN_DEV_LOG_MARTIANS(in_dev))
1786                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1787                                      &daddr, &saddr, dev->name);
1788 #endif
1789
1790 e_inval:
1791         err = -EINVAL;
1792         goto out;
1793
1794 e_nobufs:
1795         err = -ENOBUFS;
1796         goto out;
1797
1798 martian_source:
1799         err = -EINVAL;
1800 martian_source_keep_err:
1801         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1802         goto out;
1803 }
1804
1805 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1806                          u8 tos, struct net_device *dev)
1807 {
1808         int res;
1809
1810         rcu_read_lock();
1811
1812         /* Multicast recognition logic is moved from route cache to here.
1813            The problem was that too many Ethernet cards have broken/missing
1814            hardware multicast filters :-( As result the host on multicasting
1815            network acquires a lot of useless route cache entries, sort of
1816            SDR messages from all the world. Now we try to get rid of them.
1817            Really, provided software IP multicast filter is organized
1818            reasonably (at least, hashed), it does not result in a slowdown
1819            comparing with route cache reject entries.
1820            Note, that multicast routers are not affected, because
1821            route cache entry is created eventually.
1822          */
1823         if (ipv4_is_multicast(daddr)) {
1824                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1825
1826                 if (in_dev) {
1827                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1828                                                   ip_hdr(skb)->protocol);
1829                         if (our
1830 #ifdef CONFIG_IP_MROUTE
1831                                 ||
1832                             (!ipv4_is_local_multicast(daddr) &&
1833                              IN_DEV_MFORWARD(in_dev))
1834 #endif
1835                            ) {
1836                                 int res = ip_route_input_mc(skb, daddr, saddr,
1837                                                             tos, dev, our);
1838                                 rcu_read_unlock();
1839                                 return res;
1840                         }
1841                 }
1842                 rcu_read_unlock();
1843                 return -EINVAL;
1844         }
1845         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1846         rcu_read_unlock();
1847         return res;
1848 }
1849 EXPORT_SYMBOL(ip_route_input_noref);
1850
1851 /* called with rcu_read_lock() */
1852 static struct rtable *__mkroute_output(const struct fib_result *res,
1853                                        const struct flowi4 *fl4, int orig_oif,
1854                                        struct net_device *dev_out,
1855                                        unsigned int flags)
1856 {
1857         struct fib_info *fi = res->fi;
1858         struct fib_nh_exception *fnhe;
1859         struct in_device *in_dev;
1860         u16 type = res->type;
1861         struct rtable *rth;
1862         bool do_cache;
1863
1864         in_dev = __in_dev_get_rcu(dev_out);
1865         if (!in_dev)
1866                 return ERR_PTR(-EINVAL);
1867
1868         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1869                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1870                         return ERR_PTR(-EINVAL);
1871
1872         if (ipv4_is_lbcast(fl4->daddr))
1873                 type = RTN_BROADCAST;
1874         else if (ipv4_is_multicast(fl4->daddr))
1875                 type = RTN_MULTICAST;
1876         else if (ipv4_is_zeronet(fl4->daddr))
1877                 return ERR_PTR(-EINVAL);
1878
1879         if (dev_out->flags & IFF_LOOPBACK)
1880                 flags |= RTCF_LOCAL;
1881
1882         do_cache = true;
1883         if (type == RTN_BROADCAST) {
1884                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1885                 fi = NULL;
1886         } else if (type == RTN_MULTICAST) {
1887                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1888                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1889                                      fl4->flowi4_proto))
1890                         flags &= ~RTCF_LOCAL;
1891                 else
1892                         do_cache = false;
1893                 /* If multicast route do not exist use
1894                  * default one, but do not gateway in this case.
1895                  * Yes, it is hack.
1896                  */
1897                 if (fi && res->prefixlen < 4)
1898                         fi = NULL;
1899         }
1900
1901         fnhe = NULL;
1902         do_cache &= fi != NULL;
1903         if (do_cache) {
1904                 struct rtable __rcu **prth;
1905                 struct fib_nh *nh = &FIB_RES_NH(*res);
1906
1907                 fnhe = find_exception(nh, fl4->daddr);
1908                 if (fnhe)
1909                         prth = &fnhe->fnhe_rth_output;
1910                 else {
1911                         if (unlikely(fl4->flowi4_flags &
1912                                      FLOWI_FLAG_KNOWN_NH &&
1913                                      !(nh->nh_gw &&
1914                                        nh->nh_scope == RT_SCOPE_LINK))) {
1915                                 do_cache = false;
1916                                 goto add;
1917                         }
1918                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1919                 }
1920                 rth = rcu_dereference(*prth);
1921                 if (rt_cache_valid(rth)) {
1922                         dst_hold(&rth->dst);
1923                         return rth;
1924                 }
1925         }
1926
1927 add:
1928         rth = rt_dst_alloc(dev_out,
1929                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1930                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1931                            do_cache);
1932         if (!rth)
1933                 return ERR_PTR(-ENOBUFS);
1934
1935         rth->dst.output = ip_output;
1936
1937         rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1938         rth->rt_flags   = flags;
1939         rth->rt_type    = type;
1940         rth->rt_is_input = 0;
1941         rth->rt_iif     = orig_oif ? : 0;
1942         rth->rt_pmtu    = 0;
1943         rth->rt_gateway = 0;
1944         rth->rt_uses_gateway = 0;
1945         INIT_LIST_HEAD(&rth->rt_uncached);
1946
1947         RT_CACHE_STAT_INC(out_slow_tot);
1948
1949         if (flags & RTCF_LOCAL)
1950                 rth->dst.input = ip_local_deliver;
1951         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1952                 if (flags & RTCF_LOCAL &&
1953                     !(dev_out->flags & IFF_LOOPBACK)) {
1954                         rth->dst.output = ip_mc_output;
1955                         RT_CACHE_STAT_INC(out_slow_mc);
1956                 }
1957 #ifdef CONFIG_IP_MROUTE
1958                 if (type == RTN_MULTICAST) {
1959                         if (IN_DEV_MFORWARD(in_dev) &&
1960                             !ipv4_is_local_multicast(fl4->daddr)) {
1961                                 rth->dst.input = ip_mr_input;
1962                                 rth->dst.output = ip_mc_output;
1963                         }
1964                 }
1965 #endif
1966         }
1967
1968         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1969
1970         return rth;
1971 }
1972
1973 /*
1974  * Major route resolver routine.
1975  */
1976
1977 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1978 {
1979         struct net_device *dev_out = NULL;
1980         __u8 tos = RT_FL_TOS(fl4);
1981         unsigned int flags = 0;
1982         struct fib_result res;
1983         struct rtable *rth;
1984         int orig_oif;
1985
1986         res.tclassid    = 0;
1987         res.fi          = NULL;
1988         res.table       = NULL;
1989
1990         orig_oif = fl4->flowi4_oif;
1991
1992         fl4->flowi4_iif = LOOPBACK_IFINDEX;
1993         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1994         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1995                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1996
1997         rcu_read_lock();
1998         if (fl4->saddr) {
1999                 rth = ERR_PTR(-EINVAL);
2000                 if (ipv4_is_multicast(fl4->saddr) ||
2001                     ipv4_is_lbcast(fl4->saddr) ||
2002                     ipv4_is_zeronet(fl4->saddr))
2003                         goto out;
2004
2005                 /* I removed check for oif == dev_out->oif here.
2006                    It was wrong for two reasons:
2007                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2008                       is assigned to multiple interfaces.
2009                    2. Moreover, we are allowed to send packets with saddr
2010                       of another iface. --ANK
2011                  */
2012
2013                 if (fl4->flowi4_oif == 0 &&
2014                     (ipv4_is_multicast(fl4->daddr) ||
2015                      ipv4_is_lbcast(fl4->daddr))) {
2016                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2017                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2018                         if (dev_out == NULL)
2019                                 goto out;
2020
2021                         /* Special hack: user can direct multicasts
2022                            and limited broadcast via necessary interface
2023                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2024                            This hack is not just for fun, it allows
2025                            vic,vat and friends to work.
2026                            They bind socket to loopback, set ttl to zero
2027                            and expect that it will work.
2028                            From the viewpoint of routing cache they are broken,
2029                            because we are not allowed to build multicast path
2030                            with loopback source addr (look, routing cache
2031                            cannot know, that ttl is zero, so that packet
2032                            will not leave this host and route is valid).
2033                            Luckily, this hack is good workaround.
2034                          */
2035
2036                         fl4->flowi4_oif = dev_out->ifindex;
2037                         goto make_route;
2038                 }
2039
2040                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2041                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2042                         if (!__ip_dev_find(net, fl4->saddr, false))
2043                                 goto out;
2044                 }
2045         }
2046
2047
2048         if (fl4->flowi4_oif) {
2049                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2050                 rth = ERR_PTR(-ENODEV);
2051                 if (dev_out == NULL)
2052                         goto out;
2053
2054                 /* RACE: Check return value of inet_select_addr instead. */
2055                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2056                         rth = ERR_PTR(-ENETUNREACH);
2057                         goto out;
2058                 }
2059                 if (ipv4_is_local_multicast(fl4->daddr) ||
2060                     ipv4_is_lbcast(fl4->daddr)) {
2061                         if (!fl4->saddr)
2062                                 fl4->saddr = inet_select_addr(dev_out, 0,
2063                                                               RT_SCOPE_LINK);
2064                         goto make_route;
2065                 }
2066                 if (!fl4->saddr) {
2067                         if (ipv4_is_multicast(fl4->daddr))
2068                                 fl4->saddr = inet_select_addr(dev_out, 0,
2069                                                               fl4->flowi4_scope);
2070                         else if (!fl4->daddr)
2071                                 fl4->saddr = inet_select_addr(dev_out, 0,
2072                                                               RT_SCOPE_HOST);
2073                 }
2074         }
2075
2076         if (!fl4->daddr) {
2077                 fl4->daddr = fl4->saddr;
2078                 if (!fl4->daddr)
2079                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2080                 dev_out = net->loopback_dev;
2081                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2082                 res.type = RTN_LOCAL;
2083                 flags |= RTCF_LOCAL;
2084                 goto make_route;
2085         }
2086
2087         if (fib_lookup(net, fl4, &res)) {
2088                 res.fi = NULL;
2089                 res.table = NULL;
2090                 if (fl4->flowi4_oif) {
2091                         /* Apparently, routing tables are wrong. Assume,
2092                            that the destination is on link.
2093
2094                            WHY? DW.
2095                            Because we are allowed to send to iface
2096                            even if it has NO routes and NO assigned
2097                            addresses. When oif is specified, routing
2098                            tables are looked up with only one purpose:
2099                            to catch if destination is gatewayed, rather than
2100                            direct. Moreover, if MSG_DONTROUTE is set,
2101                            we send packet, ignoring both routing tables
2102                            and ifaddr state. --ANK
2103
2104
2105                            We could make it even if oif is unknown,
2106                            likely IPv6, but we do not.
2107                          */
2108
2109                         if (fl4->saddr == 0)
2110                                 fl4->saddr = inet_select_addr(dev_out, 0,
2111                                                               RT_SCOPE_LINK);
2112                         res.type = RTN_UNICAST;
2113                         goto make_route;
2114                 }
2115                 rth = ERR_PTR(-ENETUNREACH);
2116                 goto out;
2117         }
2118
2119         if (res.type == RTN_LOCAL) {
2120                 if (!fl4->saddr) {
2121                         if (res.fi->fib_prefsrc)
2122                                 fl4->saddr = res.fi->fib_prefsrc;
2123                         else
2124                                 fl4->saddr = fl4->daddr;
2125                 }
2126                 dev_out = net->loopback_dev;
2127                 fl4->flowi4_oif = dev_out->ifindex;
2128                 flags |= RTCF_LOCAL;
2129                 goto make_route;
2130         }
2131
2132 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2133         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2134                 fib_select_multipath(&res);
2135         else
2136 #endif
2137         if (!res.prefixlen &&
2138             res.table->tb_num_default > 1 &&
2139             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2140                 fib_select_default(&res);
2141
2142         if (!fl4->saddr)
2143                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2144
2145         dev_out = FIB_RES_DEV(res);
2146         fl4->flowi4_oif = dev_out->ifindex;
2147
2148
2149 make_route:
2150         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2151
2152 out:
2153         rcu_read_unlock();
2154         return rth;
2155 }
2156 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2157
2158 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2159 {
2160         return NULL;
2161 }
2162
2163 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2164 {
2165         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2166
2167         return mtu ? : dst->dev->mtu;
2168 }
2169
2170 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2171                                           struct sk_buff *skb, u32 mtu)
2172 {
2173 }
2174
/* ->redirect() for blackhole routes: intentionally does nothing. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
}
2179
2180 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2181                                           unsigned long old)
2182 {
2183         return NULL;
2184 }
2185
2186 static struct dst_ops ipv4_dst_blackhole_ops = {
2187         .family                 =       AF_INET,
2188         .protocol               =       cpu_to_be16(ETH_P_IP),
2189         .check                  =       ipv4_blackhole_dst_check,
2190         .mtu                    =       ipv4_blackhole_mtu,
2191         .default_advmss         =       ipv4_default_advmss,
2192         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2193         .redirect               =       ipv4_rt_blackhole_redirect,
2194         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2195         .neigh_lookup           =       ipv4_neigh_lookup,
2196 };
2197
2198 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2199 {
2200         struct rtable *ort = (struct rtable *) dst_orig;
2201         struct rtable *rt;
2202
2203         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2204         if (rt) {
2205                 struct dst_entry *new = &rt->dst;
2206
2207                 new->__use = 1;
2208                 new->input = dst_discard;
2209                 new->output = dst_discard_sk;
2210
2211                 new->dev = ort->dst.dev;
2212                 if (new->dev)
2213                         dev_hold(new->dev);
2214
2215                 rt->rt_is_input = ort->rt_is_input;
2216                 rt->rt_iif = ort->rt_iif;
2217                 rt->rt_pmtu = ort->rt_pmtu;
2218
2219                 rt->rt_genid = rt_genid_ipv4(net);
2220                 rt->rt_flags = ort->rt_flags;
2221                 rt->rt_type = ort->rt_type;
2222                 rt->rt_gateway = ort->rt_gateway;
2223                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2224
2225                 INIT_LIST_HEAD(&rt->rt_uncached);
2226
2227                 dst_free(new);
2228         }
2229
2230         dst_release(dst_orig);
2231
2232         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2233 }
2234
2235 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2236                                     struct sock *sk)
2237 {
2238         struct rtable *rt = __ip_route_output_key(net, flp4);
2239
2240         if (IS_ERR(rt))
2241                 return rt;
2242
2243         if (flp4->flowi4_proto)
2244                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2245                                                    flowi4_to_flowi(flp4),
2246                                                    sk, 0);
2247
2248         return rt;
2249 }
2250 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2251
2252 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2253                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2254                         u32 seq, int event, int nowait, unsigned int flags)
2255 {
2256         struct rtable *rt = skb_rtable(skb);
2257         struct rtmsg *r;
2258         struct nlmsghdr *nlh;
2259         unsigned long expires = 0;
2260         u32 error;
2261         u32 metrics[RTAX_MAX];
2262
2263         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2264         if (nlh == NULL)
2265                 return -EMSGSIZE;
2266
2267         r = nlmsg_data(nlh);
2268         r->rtm_family    = AF_INET;
2269         r->rtm_dst_len  = 32;
2270         r->rtm_src_len  = 0;
2271         r->rtm_tos      = fl4->flowi4_tos;
2272         r->rtm_table    = RT_TABLE_MAIN;
2273         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2274                 goto nla_put_failure;
2275         r->rtm_type     = rt->rt_type;
2276         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2277         r->rtm_protocol = RTPROT_UNSPEC;
2278         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2279         if (rt->rt_flags & RTCF_NOTIFY)
2280                 r->rtm_flags |= RTM_F_NOTIFY;
2281
2282         if (nla_put_be32(skb, RTA_DST, dst))
2283                 goto nla_put_failure;
2284         if (src) {
2285                 r->rtm_src_len = 32;
2286                 if (nla_put_be32(skb, RTA_SRC, src))
2287                         goto nla_put_failure;
2288         }
2289         if (rt->dst.dev &&
2290             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2291                 goto nla_put_failure;
2292 #ifdef CONFIG_IP_ROUTE_CLASSID
2293         if (rt->dst.tclassid &&
2294             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2295                 goto nla_put_failure;
2296 #endif
2297         if (!rt_is_input_route(rt) &&
2298             fl4->saddr != src) {
2299                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2300                         goto nla_put_failure;
2301         }
2302         if (rt->rt_uses_gateway &&
2303             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2304                 goto nla_put_failure;
2305
2306         expires = rt->dst.expires;
2307         if (expires) {
2308                 unsigned long now = jiffies;
2309
2310                 if (time_before(now, expires))
2311                         expires -= now;
2312                 else
2313                         expires = 0;
2314         }
2315
2316         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2317         if (rt->rt_pmtu && expires)
2318                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2319         if (rtnetlink_put_metrics(skb, metrics) < 0)
2320                 goto nla_put_failure;
2321
2322         if (fl4->flowi4_mark &&
2323             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2324                 goto nla_put_failure;
2325
2326         error = rt->dst.error;
2327
2328         if (rt_is_input_route(rt)) {
2329 #ifdef CONFIG_IP_MROUTE
2330                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2331                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2332                         int err = ipmr_get_route(net, skb,
2333                                                  fl4->saddr, fl4->daddr,
2334                                                  r, nowait);
2335                         if (err <= 0) {
2336                                 if (!nowait) {
2337                                         if (err == 0)
2338                                                 return 0;
2339                                         goto nla_put_failure;
2340                                 } else {
2341                                         if (err == -EMSGSIZE)
2342                                                 goto nla_put_failure;
2343                                         error = err;
2344                                 }
2345                         }
2346                 } else
2347 #endif
2348                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2349                                 goto nla_put_failure;
2350         }
2351
2352         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2353                 goto nla_put_failure;
2354
2355         return nlmsg_end(skb, nlh);
2356
2357 nla_put_failure:
2358         nlmsg_cancel(skb, nlh);
2359         return -EMSGSIZE;
2360 }
2361
2362 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2363 {
2364         struct net *net = sock_net(in_skb->sk);
2365         struct rtmsg *rtm;
2366         struct nlattr *tb[RTA_MAX+1];
2367         struct rtable *rt = NULL;
2368         struct flowi4 fl4;
2369         __be32 dst = 0;
2370         __be32 src = 0;
2371         u32 iif;
2372         int err;
2373         int mark;
2374         struct sk_buff *skb;
2375
2376         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2377         if (err < 0)
2378                 goto errout;
2379
2380         rtm = nlmsg_data(nlh);
2381
2382         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2383         if (skb == NULL) {
2384                 err = -ENOBUFS;
2385                 goto errout;
2386         }
2387
2388         /* Reserve room for dummy headers, this skb can pass
2389            through good chunk of routing engine.
2390          */
2391         skb_reset_mac_header(skb);
2392         skb_reset_network_header(skb);
2393
2394         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2395         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2396         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2397
2398         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2399         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2400         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2401         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2402
2403         memset(&fl4, 0, sizeof(fl4));
2404         fl4.daddr = dst;
2405         fl4.saddr = src;
2406         fl4.flowi4_tos = rtm->rtm_tos;
2407         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2408         fl4.flowi4_mark = mark;
2409
2410         if (iif) {
2411                 struct net_device *dev;
2412
2413                 dev = __dev_get_by_index(net, iif);
2414                 if (dev == NULL) {
2415                         err = -ENODEV;
2416                         goto errout_free;
2417                 }
2418
2419                 skb->protocol   = htons(ETH_P_IP);
2420                 skb->dev        = dev;
2421                 skb->mark       = mark;
2422                 local_bh_disable();
2423                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2424                 local_bh_enable();
2425
2426                 rt = skb_rtable(skb);
2427                 if (err == 0 && rt->dst.error)
2428                         err = -rt->dst.error;
2429         } else {
2430                 rt = ip_route_output_key(net, &fl4);
2431
2432                 err = 0;
2433                 if (IS_ERR(rt))
2434                         err = PTR_ERR(rt);
2435         }
2436
2437         if (err)
2438                 goto errout_free;
2439
2440         skb_dst_set(skb, &rt->dst);
2441         if (rtm->rtm_flags & RTM_F_NOTIFY)
2442                 rt->rt_flags |= RTCF_NOTIFY;
2443
2444         err = rt_fill_info(net, dst, src, &fl4, skb,
2445                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2446                            RTM_NEWROUTE, 0, 0);
2447         if (err <= 0)
2448                 goto errout_free;
2449
2450         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2451 errout:
2452         return err;
2453
2454 errout_free:
2455         kfree_skb(skb);
2456         goto errout;
2457 }
2458
2459 void ip_rt_multicast_event(struct in_device *in_dev)
2460 {
2461         rt_cache_flush(dev_net(in_dev->dev));
2462 }
2463
2464 #ifdef CONFIG_SYSCTL
2465 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2466 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2467 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2468 static int ip_rt_gc_elasticity __read_mostly    = 8;
2469
2470 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2471                                         void __user *buffer,
2472                                         size_t *lenp, loff_t *ppos)
2473 {
2474         struct net *net = (struct net *)__ctl->extra1;
2475
2476         if (write) {
2477                 rt_cache_flush(net);
2478                 fnhe_genid_bump(net);
2479                 return 0;
2480         }
2481
2482         return -EINVAL;
2483 }
2484
2485 static struct ctl_table ipv4_route_table[] = {
2486         {
2487                 .procname       = "gc_thresh",
2488                 .data           = &ipv4_dst_ops.gc_thresh,
2489                 .maxlen         = sizeof(int),
2490                 .mode           = 0644,
2491                 .proc_handler   = proc_dointvec,
2492         },
2493         {
2494                 .procname       = "max_size",
2495                 .data           = &ip_rt_max_size,
2496                 .maxlen         = sizeof(int),
2497                 .mode           = 0644,
2498                 .proc_handler   = proc_dointvec,
2499         },
2500         {
2501                 /*  Deprecated. Use gc_min_interval_ms */
2502
2503                 .procname       = "gc_min_interval",
2504                 .data           = &ip_rt_gc_min_interval,
2505                 .maxlen         = sizeof(int),
2506                 .mode           = 0644,
2507                 .proc_handler   = proc_dointvec_jiffies,
2508         },
2509         {
2510                 .procname       = "gc_min_interval_ms",
2511                 .data           = &ip_rt_gc_min_interval,
2512                 .maxlen         = sizeof(int),
2513                 .mode           = 0644,
2514                 .proc_handler   = proc_dointvec_ms_jiffies,
2515         },
2516         {
2517                 .procname       = "gc_timeout",
2518                 .data           = &ip_rt_gc_timeout,
2519                 .maxlen         = sizeof(int),
2520                 .mode           = 0644,
2521                 .proc_handler   = proc_dointvec_jiffies,
2522         },
2523         {
2524                 .procname       = "gc_interval",
2525                 .data           = &ip_rt_gc_interval,
2526                 .maxlen         = sizeof(int),
2527                 .mode           = 0644,
2528                 .proc_handler   = proc_dointvec_jiffies,
2529         },
2530         {
2531                 .procname       = "redirect_load",
2532                 .data           = &ip_rt_redirect_load,
2533                 .maxlen         = sizeof(int),
2534                 .mode           = 0644,
2535                 .proc_handler   = proc_dointvec,
2536         },
2537         {
2538                 .procname       = "redirect_number",
2539                 .data           = &ip_rt_redirect_number,
2540                 .maxlen         = sizeof(int),
2541                 .mode           = 0644,
2542                 .proc_handler   = proc_dointvec,
2543         },
2544         {
2545                 .procname       = "redirect_silence",
2546                 .data           = &ip_rt_redirect_silence,
2547                 .maxlen         = sizeof(int),
2548                 .mode           = 0644,
2549                 .proc_handler   = proc_dointvec,
2550         },
2551         {
2552                 .procname       = "error_cost",
2553                 .data           = &ip_rt_error_cost,
2554                 .maxlen         = sizeof(int),
2555                 .mode           = 0644,
2556                 .proc_handler   = proc_dointvec,
2557         },
2558         {
2559                 .procname       = "error_burst",
2560                 .data           = &ip_rt_error_burst,
2561                 .maxlen         = sizeof(int),
2562                 .mode           = 0644,
2563                 .proc_handler   = proc_dointvec,
2564         },
2565         {
2566                 .procname       = "gc_elasticity",
2567                 .data           = &ip_rt_gc_elasticity,
2568                 .maxlen         = sizeof(int),
2569                 .mode           = 0644,
2570                 .proc_handler   = proc_dointvec,
2571         },
2572         {
2573                 .procname       = "mtu_expires",
2574                 .data           = &ip_rt_mtu_expires,
2575                 .maxlen         = sizeof(int),
2576                 .mode           = 0644,
2577                 .proc_handler   = proc_dointvec_jiffies,
2578         },
2579         {
2580                 .procname       = "min_pmtu",
2581                 .data           = &ip_rt_min_pmtu,
2582                 .maxlen         = sizeof(int),
2583                 .mode           = 0644,
2584                 .proc_handler   = proc_dointvec,
2585         },
2586         {
2587                 .procname       = "min_adv_mss",
2588                 .data           = &ip_rt_min_advmss,
2589                 .maxlen         = sizeof(int),
2590                 .mode           = 0644,
2591                 .proc_handler   = proc_dointvec,
2592         },
2593         { }
2594 };
2595
2596 static struct ctl_table ipv4_route_flush_table[] = {
2597         {
2598                 .procname       = "flush",
2599                 .maxlen         = sizeof(int),
2600                 .mode           = 0200,
2601                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2602         },
2603         { },
2604 };
2605
2606 static __net_init int sysctl_route_net_init(struct net *net)
2607 {
2608         struct ctl_table *tbl;
2609
2610         tbl = ipv4_route_flush_table;
2611         if (!net_eq(net, &init_net)) {
2612                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2613                 if (tbl == NULL)
2614                         goto err_dup;
2615
2616                 /* Don't export sysctls to unprivileged users */
2617                 if (net->user_ns != &init_user_ns)
2618                         tbl[0].procname = NULL;
2619         }
2620         tbl[0].extra1 = net;
2621
2622         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2623         if (net->ipv4.route_hdr == NULL)
2624                 goto err_reg;
2625         return 0;
2626
2627 err_reg:
2628         if (tbl != ipv4_route_flush_table)
2629                 kfree(tbl);
2630 err_dup:
2631         return -ENOMEM;
2632 }
2633
2634 static __net_exit void sysctl_route_net_exit(struct net *net)
2635 {
2636         struct ctl_table *tbl;
2637
2638         tbl = net->ipv4.route_hdr->ctl_table_arg;
2639         unregister_net_sysctl_table(net->ipv4.route_hdr);
2640         BUG_ON(tbl == ipv4_route_flush_table);
2641         kfree(tbl);
2642 }
2643
2644 static __net_initdata struct pernet_operations sysctl_route_ops = {
2645         .init = sysctl_route_net_init,
2646         .exit = sysctl_route_net_exit,
2647 };
2648 #endif
2649
2650 static __net_init int rt_genid_init(struct net *net)
2651 {
2652         atomic_set(&net->ipv4.rt_genid, 0);
2653         atomic_set(&net->fnhe_genid, 0);
2654         get_random_bytes(&net->ipv4.dev_addr_genid,
2655                          sizeof(net->ipv4.dev_addr_genid));
2656         return 0;
2657 }
2658
2659 static __net_initdata struct pernet_operations rt_genid_ops = {
2660         .init = rt_genid_init,
2661 };
2662
2663 static int __net_init ipv4_inetpeer_init(struct net *net)
2664 {
2665         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2666
2667         if (!bp)
2668                 return -ENOMEM;
2669         inet_peer_base_init(bp);
2670         net->ipv4.peers = bp;
2671         return 0;
2672 }
2673
2674 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2675 {
2676         struct inet_peer_base *bp = net->ipv4.peers;
2677
2678         net->ipv4.peers = NULL;
2679         inetpeer_invalidate_tree(bp);
2680         kfree(bp);
2681 }
2682
2683 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2684         .init   =       ipv4_inetpeer_init,
2685         .exit   =       ipv4_inetpeer_exit,
2686 };
2687
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU accounting area, only present with CONFIG_IP_ROUTE_CLASSID.
 * ip_rt_init() allocates it with room for 256 struct ip_rt_acct entries.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2691
2692 int __init ip_rt_init(void)
2693 {
2694         int rc = 0;
2695
2696         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2697         if (!ip_idents)
2698                 panic("IP: failed to allocate ip_idents\n");
2699
2700         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2701
2702 #ifdef CONFIG_IP_ROUTE_CLASSID
2703         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2704         if (!ip_rt_acct)
2705                 panic("IP: failed to allocate ip_rt_acct\n");
2706 #endif
2707
2708         ipv4_dst_ops.kmem_cachep =
2709                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2710                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2711
2712         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2713
2714         if (dst_entries_init(&ipv4_dst_ops) < 0)
2715                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2716
2717         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2718                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2719
2720         ipv4_dst_ops.gc_thresh = ~0;
2721         ip_rt_max_size = INT_MAX;
2722
2723         devinet_init();
2724         ip_fib_init();
2725
2726         if (ip_rt_proc_init())
2727                 pr_err("Unable to create route proc files\n");
2728 #ifdef CONFIG_XFRM
2729         xfrm_init();
2730         xfrm4_init();
2731 #endif
2732         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2733
2734 #ifdef CONFIG_SYSCTL
2735         register_pernet_subsys(&sysctl_route_ops);
2736 #endif
2737         register_pernet_subsys(&rt_genid_ops);
2738         register_pernet_subsys(&ipv4_inetpeer_ops);
2739         return rc;
2740 }
2741
2742 #ifdef CONFIG_SYSCTL
2743 /*
2744  * We really need to sanitize the damn ipv4 init order, then all
2745  * this nonsense will go away.
2746  */
2747 void __init ip_static_sysctl_init(void)
2748 {
2749         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2750 }
2751 #endif