/* net/sched/sch_api.c — from cascardo/linux.git (merge of davem/net) */
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32 #include <linux/hashtable.h>
33
34 #include <net/net_namespace.h>
35 #include <net/sock.h>
36 #include <net/netlink.h>
37 #include <net/pkt_sched.h>
38
/* Forward declarations: rtnetlink notification helpers defined later in
 * this file, needed by the graft/change paths above their definitions.
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event);
45
46 /*
47
48    Short review.
49    -------------
50
51    This file consists of two interrelated parts:
52
53    1. queueing disciplines manager frontend.
54    2. traffic classes manager frontend.
55
56    Generally, queueing discipline ("qdisc") is a black box,
57    which is able to enqueue packets and to dequeue them (when
58    device is ready to send something) in order and at times
59    determined by algorithm hidden in it.
60
   qdiscs are divided into two categories:
62    - "queues", which have no internal structure visible from outside.
63    - "schedulers", which split all the packets to "traffic classes",
64      using "packet classifiers" (look at cls_api.c)
65
66    In turn, classes may have child qdiscs (as rule, queues)
67    attached to them etc. etc. etc.
68
69    The goal of the routines in this file is to translate
70    information supplied by user in the form of handles
71    to more intelligible for kernel form, to make some sanity
72    checks and part of work, which is common to all qdiscs
73    and to provide rtnetlink notifications.
74
75    All real intelligent work is done inside qdisc modules.
76
77
78
79    Every discipline has two major routines: enqueue and dequeue.
80
81    ---dequeue
82
83    dequeue usually returns a skb to send. It is allowed to return NULL,
84    but it does not mean that queue is empty, it just means that
85    discipline does not want to send anything this time.
86    Queue is really empty if q->q.qlen == 0.
87    For complicated disciplines with multiple queues q->q is not
88    real packet queue, but however q->q.qlen must be valid.
89
90    ---enqueue
91
92    enqueue returns 0, if packet was enqueued successfully.
93    If packet (this one or another one) was dropped, it returns
94    not zero error code.
95    NET_XMIT_DROP        - this packet dropped
96      Expected action: do not backoff, but wait until queue will clear.
97    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
98      Expected action: backoff or ignore
99
100    Auxiliary routines:
101
102    ---peek
103
104    like dequeue but without removing a packet from the queue
105
106    ---reset
107
108    returns qdisc to initial state: purge all buffers, clear all
109    timers, counters (except for statistics) etc.
110
111    ---init
112
113    initializes newly created qdisc.
114
115    ---destroy
116
117    destroys resources allocated by init and during lifetime of qdisc.
118
119    ---change
120
121    changes qdisc parameters.
122  */
123
124 /* Protects list of registered TC modules. It is pure SMP lock. */
125 static DEFINE_RWLOCK(qdisc_mod_lock);
126
127
128 /************************************************
129  *      Queueing disciplines manipulation.      *
130  ************************************************/
131
132
133 /* The list of all installed queueing disciplines. */
134
135 static struct Qdisc_ops *qdisc_base;
136
137 /* Register/unregister queueing discipline */
138
/* Register a queueing discipline with the global list of TC modules.
 *
 * Returns 0 on success, -EEXIST if an ops with the same id string is
 * already registered, or -EINVAL if the ops table is inconsistent
 * (a peek hook missing while dequeue exists, or classful ops missing
 * mandatory class callbacks).  Missing enqueue/dequeue/peek hooks are
 * filled in from noop_qdisc_ops so callers never see NULL pointers.
 */
int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	/* Reject duplicate ids; on loop exit qp points at the list tail. */
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		/* A real dequeue without a matching peek is invalid. */
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		/* Classful qdiscs must implement the core class hooks. */
		if (!(cops->get && cops->put && cops->walk && cops->leaf))
			goto out_einval;

		/* Filter attach support requires the bind/unbind pair too. */
		if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);
182
183 int unregister_qdisc(struct Qdisc_ops *qops)
184 {
185         struct Qdisc_ops *q, **qp;
186         int err = -ENOENT;
187
188         write_lock(&qdisc_mod_lock);
189         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
190                 if (q == qops)
191                         break;
192         if (q) {
193                 *qp = q->next;
194                 q->next = NULL;
195                 err = 0;
196         }
197         write_unlock(&qdisc_mod_lock);
198         return err;
199 }
200 EXPORT_SYMBOL(unregister_qdisc);
201
202 /* Get default qdisc if not otherwise specified */
/* Get default qdisc if not otherwise specified: copy the id of the
 * current default qdisc ops into @name (at most @len bytes,
 * NUL-terminated by strlcpy).
 */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}
209
/* Look up registered qdisc ops by id and take a module reference.
 * Returns NULL if no match is found or the owning module is unloading.
 * Walks qdisc_base, so the caller must hold qdisc_mod_lock.
 */
static struct Qdisc_ops *qdisc_lookup_default(const char *name)
{
	struct Qdisc_ops *q = NULL;

	for (q = qdisc_base; q; q = q->next) {
		if (!strcmp(name, q->id)) {
			/* Found but module going away: report "not found". */
			if (!try_module_get(q->owner))
				q = NULL;
			break;
		}
	}

	return q;
}
224
225 /* Set new default qdisc to use */
/* Set new default qdisc to use, loading the sch_<name> module if it is
 * not yet registered.  Returns 0 on success, -EPERM without
 * CAP_NET_ADMIN, or -ENOENT if the qdisc cannot be found or loaded.
 */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default; drop the module ref held on the old one. */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}
253
/* We know handle. Find qdisc among all qdisc's attached to device
 * (root qdisc, all its children, children of children etc.)
 * Note: caller either uses rtnl or rcu_read_lock()
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	/* The root itself matches unless it is a builtin placeholder. */
	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	/* All other attached qdiscs are in the per-device handle hash. */
	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}
273
/* Insert @q into the per-device handle hash so qdisc_match_from_root()
 * can find it.  Root qdiscs (parent == TC_H_ROOT) and ingress qdiscs
 * are deliberately not hashed; they are looked up by other means
 * (see qdisc_lookup()).
 */
void qdisc_hash_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		struct Qdisc *root = qdisc_dev(q)->qdisc;

		/* Adding under noop_qdisc would make q unreachable. */
		WARN_ON_ONCE(root == &noop_qdisc);
		ASSERT_RTNL();
		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
	}
}
EXPORT_SYMBOL(qdisc_hash_add);
285
/* Remove @q from the per-device handle hash; mirror of qdisc_hash_add()
 * including its root/ingress exclusions.  Requires RTNL.
 */
void qdisc_hash_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_del_rcu(&q->hash);
	}
}
EXPORT_SYMBOL(qdisc_hash_del);
294
295 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
296 {
297         struct Qdisc *q;
298
299         q = qdisc_match_from_root(dev->qdisc, handle);
300         if (q)
301                 goto out;
302
303         if (dev_ingress_queue(dev))
304                 q = qdisc_match_from_root(
305                         dev_ingress_queue(dev)->qdisc_sleeping,
306                         handle);
307 out:
308         return q;
309 }
310
/* Resolve @classid within qdisc @p and return the child qdisc grafted
 * onto that class.  Returns NULL when @p is classless, the class does
 * not exist, or nothing is attached.  The temporary class reference is
 * dropped before returning.
 */
static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}
327
/* Find queueing discipline by name (a netlink TCA_KIND attribute).
 * On success a module reference is taken; returns NULL if @kind is
 * NULL, no match exists, or the owning module is unloading.
 */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				/* Module unloading: treat as not found. */
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}
347
348 /* The linklayer setting were not transferred from iproute2, in older
349  * versions, and the rate tables lookup systems have been dropped in
350  * the kernel. To keep backward compatible with older iproute2 tc
351  * utils, we detect the linklayer setting by detecting if the rate
352  * table were modified.
353  *
354  * For linklayer ATM table entries, the rate table will be aligned to
355  * 48 bytes, thus some table entries will contain the same value.  The
356  * mpu (min packet unit) is also encoded into the old rate table, thus
357  * starting from the mpu, we find low and high table entries for
 * mapping this cell.  If these entries contain the same value, then
 * the rate tables have been modified for linklayer ATM.
360  *
361  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
362  * and then roundup to the next cell, calc the table entry one below,
363  * and compare.
364  */
365 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
366 {
367         int low       = roundup(r->mpu, 48);
368         int high      = roundup(low+1, 48);
369         int cell_low  = low >> r->cell_log;
370         int cell_high = (high >> r->cell_log) - 1;
371
372         /* rtab is too inaccurate at rates > 100Mbit/s */
373         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
374                 pr_debug("TC linklayer: Giving up ATM detection\n");
375                 return TC_LINKLAYER_ETHERNET;
376         }
377
378         if ((cell_high > cell_low) && (cell_high < 256)
379             && (rtab[cell_low] == rtab[cell_high])) {
380                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
381                          cell_low, cell_high, rtab[cell_high]);
382                 return TC_LINKLAYER_ATM;
383         }
384         return TC_LINKLAYER_ETHERNET;
385 }
386
/* Singly-linked list of shared, refcounted rate tables. */
static struct qdisc_rate_table *qdisc_rtab_list;

/* Look up or create a rate table for ratespec @r with the TC_RTAB_SIZE
 * (1024-byte) payload in @tab.  Identical tables are shared via refcnt.
 * As a side effect, fills in r->linklayer for old iproute2 binaries
 * that left it TC_LINKLAYER_UNAWARE (see __detect_linklayer()).
 * Returns NULL on invalid input or allocation failure.
 */
struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	/* Reuse an existing table with identical spec and data. */
	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);
418
419 void qdisc_put_rtab(struct qdisc_rate_table *tab)
420 {
421         struct qdisc_rate_table *rtab, **rtabp;
422
423         if (!tab || --tab->refcnt)
424                 return;
425
426         for (rtabp = &qdisc_rtab_list;
427              (rtab = *rtabp) != NULL;
428              rtabp = &rtab->next) {
429                 if (rtab == tab) {
430                         *rtabp = rtab->next;
431                         kfree(rtab);
432                         return;
433                 }
434         }
435 }
436 EXPORT_SYMBOL(qdisc_put_rtab);
437
/* Global list of shared size tables, protected by qdisc_stab_lock. */
static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

/* Netlink validation policy for nested TCA_STAB attributes. */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
445
/* Parse a nested TCA_STAB attribute and return a refcounted size
 * table, sharing an existing entry from qdisc_stab_list when the spec
 * and data match.  Returns ERR_PTR(-EINVAL) on malformed input,
 * ERR_PTR(-ENOMEM) on allocation failure.
 */
static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		/* tsize derived from the actual payload length ... */
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	/* ... must agree with the advertised size, guarding later reads. */
	if (tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	/* Share an existing table with identical spec and data. */
	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}
502
/* RCU callback: free a size table after the grace period. */
static void stab_kfree_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct qdisc_size_table, rcu));
}
507
/* Drop one reference on size table @tab; on the last reference unlink
 * it and free it via RCU (readers may still be walking it lock-free).
 * NULL @tab is a no-op.
 */
void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);
523
/* Dump the size-table spec (TCA_STAB_BASE only, not the data) as a
 * nested TCA_STAB attribute.  Returns skb->len on success, -1 if the
 * skb ran out of room.
 */
static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}
540
/* Compute the scheduled packet length of @skb through size table @stab
 * and store it in qdisc_skb_cb(skb)->pkt_len.  The raw length plus
 * configured overhead selects a table slot; slots beyond the table are
 * extrapolated from the last entry.  The result is clamped to >= 1.
 */
void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	/* Empty table: only the overhead adjustment applies. */
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		/* Past the end: scale the last cell and add the remainder. */
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
568
/* Warn (once per qdisc, gated by TCQ_F_WARN_NONWC) that @qdisc appears
 * to be non-work-conserving.  The handle's major number is printed.
 */
void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);
578
/* hrtimer callback for qdisc watchdogs: kick the root qdisc so the
 * device requeues a transmit attempt.  The timer never self-restarts.
 */
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}
590
/* Initialize a watchdog for @qdisc: a pinned, absolute monotonic
 * hrtimer that fires qdisc_watchdog().
 */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);
598
/* (Re)arm the watchdog to fire at absolute time @expires (ns).
 * No-op if the root qdisc is deactivated or the timer is already set
 * to the same expiry.
 */
void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	if (wd->last_expires == expires)
		return;

	wd->last_expires = expires;
	hrtimer_start(&wd->timer,
		      ns_to_ktime(expires),
		      HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
614
/* Cancel a pending watchdog timer, waiting for a running callback. */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
620
621 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
622 {
623         unsigned int size = n * sizeof(struct hlist_head), i;
624         struct hlist_head *h;
625
626         if (size <= PAGE_SIZE)
627                 h = kmalloc(size, GFP_KERNEL);
628         else
629                 h = (struct hlist_head *)
630                         __get_free_pages(GFP_KERNEL, get_order(size));
631
632         if (h != NULL) {
633                 for (i = 0; i < n; i++)
634                         INIT_HLIST_HEAD(&h[i]);
635         }
636         return h;
637 }
638
639 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
640 {
641         unsigned int size = n * sizeof(struct hlist_head);
642
643         if (size <= PAGE_SIZE)
644                 kfree(h);
645         else
646                 free_pages((unsigned long)h, get_order(size));
647 }
648
/* Double the class hash of @sch when its load factor exceeds 0.75.
 * The new table is allocated outside the qdisc tree lock; the rehash
 * itself runs under sch_tree_lock() so classifiers see a consistent
 * table.  Allocation failure silently keeps the old table.
 */
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
684
/* Initialize a class hash with 4 empty buckets.
 * Returns 0 on success or -ENOMEM on allocation failure.
 */
int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);
698
/* Free the bucket array of a class hash (entries must already be gone). */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
704
/* Insert class @cl into @clhash, bucketed by its classid. */
void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);
716
/* Remove class @cl from @clhash and update the element count. */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
724
/* Allocate an unique handle from space managed by kernel
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 *
 * Tries up to 0x8000 candidate major numbers, continuing from the last
 * one handed out (static autohandle).  Returns 0 when the space is
 * exhausted.
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		/* Wrap around before colliding with the TC_H_ROOT value. */
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while	(--i > 0);

	return 0;
}
744
/* Propagate a decrease of @n packets / @len bytes up the qdisc tree
 * from @sch, fixing each ancestor's qlen/backlog and drop counters.
 * Classful ancestors get a qlen_notify() callback so they can
 * deactivate now-empty classes.  Stops at the root, at ingress, and
 * at TCQ_F_NOPARENT qdiscs.
 */
void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
			       unsigned int len)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	int drops;

	if (n == 0 && len == 0)
		return;
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
782
/* Send a qdisc change notification (when either qdisc exists) and then
 * destroy the old qdisc, if any.
 */
static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}
793
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 *
 * With parent == NULL this replaces the device root (or the ingress
 * qdisc); otherwise it delegates to the parent's class ops graft hook.
 * The device is deactivated around a root replacement.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		/* Ingress graft touches exactly one queue. */
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev))
				return -ENOENT;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		/* Qdiscs with an attach hook distribute themselves
		 * over the tx queues; skip the per-queue graft loop.
		 */
		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* One reference per tx queue beyond the first. */
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

skip:
		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		/* Grafting into a class requires classful parent support. */
		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}
877
/* lockdep annotation is needed for ingress; egress gets it only for name.
 * Separate lock classes keep lockdep from conflating rx and tx qdisc locks.
 */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;
881
882 /*
883    Allocate and initialize new qdisc.
884
885    Parameters are passed via opt.
886  */
887
888 static struct Qdisc *
889 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
890              struct Qdisc *p, u32 parent, u32 handle,
891              struct nlattr **tca, int *errp)
892 {
893         int err;
894         struct nlattr *kind = tca[TCA_KIND];
895         struct Qdisc *sch;
896         struct Qdisc_ops *ops;
897         struct qdisc_size_table *stab;
898
899         ops = qdisc_lookup_ops(kind);
900 #ifdef CONFIG_MODULES
901         if (ops == NULL && kind != NULL) {
902                 char name[IFNAMSIZ];
903                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
904                         /* We dropped the RTNL semaphore in order to
905                          * perform the module load.  So, even if we
906                          * succeeded in loading the module we have to
907                          * tell the caller to replay the request.  We
908                          * indicate this using -EAGAIN.
909                          * We replay the request because the device may
910                          * go away in the mean time.
911                          */
912                         rtnl_unlock();
913                         request_module("sch_%s", name);
914                         rtnl_lock();
915                         ops = qdisc_lookup_ops(kind);
916                         if (ops != NULL) {
917                                 /* We will try again qdisc_lookup_ops,
918                                  * so don't keep a reference.
919                                  */
920                                 module_put(ops->owner);
921                                 err = -EAGAIN;
922                                 goto err_out;
923                         }
924                 }
925         }
926 #endif
927
928         err = -ENOENT;
929         if (ops == NULL)
930                 goto err_out;
931
932         sch = qdisc_alloc(dev_queue, ops);
933         if (IS_ERR(sch)) {
934                 err = PTR_ERR(sch);
935                 goto err_out2;
936         }
937
938         sch->parent = parent;
939
940         if (handle == TC_H_INGRESS) {
941                 sch->flags |= TCQ_F_INGRESS;
942                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
943                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
944         } else {
945                 if (handle == 0) {
946                         handle = qdisc_alloc_handle(dev);
947                         err = -ENOMEM;
948                         if (handle == 0)
949                                 goto err_out3;
950                 }
951                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
952                 if (!netif_is_multiqueue(dev))
953                         sch->flags |= TCQ_F_ONETXQUEUE;
954         }
955
956         sch->handle = handle;
957
958         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
959                 if (qdisc_is_percpu_stats(sch)) {
960                         sch->cpu_bstats =
961                                 netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
962                         if (!sch->cpu_bstats)
963                                 goto err_out4;
964
965                         sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
966                         if (!sch->cpu_qstats)
967                                 goto err_out4;
968                 }
969
970                 if (tca[TCA_STAB]) {
971                         stab = qdisc_get_stab(tca[TCA_STAB]);
972                         if (IS_ERR(stab)) {
973                                 err = PTR_ERR(stab);
974                                 goto err_out4;
975                         }
976                         rcu_assign_pointer(sch->stab, stab);
977                 }
978                 if (tca[TCA_RATE]) {
979                         seqcount_t *running;
980
981                         err = -EOPNOTSUPP;
982                         if (sch->flags & TCQ_F_MQROOT)
983                                 goto err_out4;
984
985                         if ((sch->parent != TC_H_ROOT) &&
986                             !(sch->flags & TCQ_F_INGRESS) &&
987                             (!p || !(p->flags & TCQ_F_MQROOT)))
988                                 running = qdisc_root_sleeping_running(sch);
989                         else
990                                 running = &sch->running;
991
992                         err = gen_new_estimator(&sch->bstats,
993                                                 sch->cpu_bstats,
994                                                 &sch->rate_est,
995                                                 NULL,
996                                                 running,
997                                                 tca[TCA_RATE]);
998                         if (err)
999                                 goto err_out4;
1000                 }
1001
1002                 qdisc_hash_add(sch);
1003
1004                 return sch;
1005         }
1006 err_out3:
1007         dev_put(dev);
1008         kfree((char *) sch - sch->padded);
1009 err_out2:
1010         module_put(ops->owner);
1011 err_out:
1012         *errp = err;
1013         return NULL;
1014
1015 err_out4:
1016         free_percpu(sch->cpu_bstats);
1017         free_percpu(sch->cpu_qstats);
1018         /*
1019          * Any broken qdiscs that would require a ops->reset() here?
1020          * The qdisc was never in action so it shouldn't be necessary.
1021          */
1022         qdisc_put_stab(rtnl_dereference(sch->stab));
1023         if (ops->destroy)
1024                 ops->destroy(sch);
1025         goto err_out3;
1026 }
1027
/* Apply an RTM_NEWQDISC "change" request to an existing qdisc: new
 * options (via ops->change), a new size table and/or a replacement
 * rate estimator, all taken from @tca.
 *
 * Returns 0 on success or a negative errno from the options/stab path;
 * estimator replacement errors are deliberately swallowed (see below).
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
        struct qdisc_size_table *ostab, *stab = NULL;
        int err = 0;

        if (tca[TCA_OPTIONS]) {
                /* Qdiscs without ops->change cannot be reconfigured. */
                if (sch->ops->change == NULL)
                        return -EINVAL;
                err = sch->ops->change(sch, tca[TCA_OPTIONS]);
                if (err)
                        return err;
        }

        if (tca[TCA_STAB]) {
                stab = qdisc_get_stab(tca[TCA_STAB]);
                if (IS_ERR(stab))
                        return PTR_ERR(stab);
        }

        /* Publish the new size table (NULL when none was supplied) and
         * release the old one.
         */
        ostab = rtnl_dereference(sch->stab);
        rcu_assign_pointer(sch->stab, stab);
        qdisc_put_stab(ostab);

        if (tca[TCA_RATE]) {
                /* NB: ignores errors from replace_estimator
                   because change can't be undone. */
                if (sch->flags & TCQ_F_MQROOT)
                        goto out;
                gen_replace_estimator(&sch->bstats,
                                      sch->cpu_bstats,
                                      &sch->rate_est,
                                      NULL,
                                      qdisc_root_sleeping_running(sch),
                                      tca[TCA_RATE]);
        }
out:
        return 0;
}
1066
/* Cookie threaded through the class walker by check_loop();
 * check_loop_fn() casts the embedded walker back to this.
 */
struct check_loop_arg {
        struct qdisc_walker     w;      /* must stay first for the cast */
        struct Qdisc            *p;     /* qdisc we search for in the tree */
        int                     depth;  /* recursion depth, capped at 7 */
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
1074
1075 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1076 {
1077         struct check_loop_arg   arg;
1078
1079         if (q->ops->cl_ops == NULL)
1080                 return 0;
1081
1082         arg.w.stop = arg.w.skip = arg.w.count = 0;
1083         arg.w.fn = check_loop_fn;
1084         arg.depth = depth;
1085         arg.p = p;
1086         q->ops->cl_ops->walk(q, &arg.w);
1087         return arg.w.stop ? -ELOOP : 0;
1088 }
1089
1090 static int
1091 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1092 {
1093         struct Qdisc *leaf;
1094         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1095         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1096
1097         leaf = cops->leaf(q, cl);
1098         if (leaf) {
1099                 if (leaf == arg->p || arg->depth > 7)
1100                         return -ELOOP;
1101                 return check_loop(leaf, arg->p, arg->depth + 1);
1102         }
1103         return 0;
1104 }
1105
1106 /*
1107  * Delete/get qdisc.
1108  */
1109
/* RTM_DELQDISC / RTM_GETQDISC handler: resolve the qdisc addressed by
 * tcm_parent/tcm_handle, then either detach it (delete) or report it
 * back to the requester (get).
 */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
        struct net *net = sock_net(skb->sk);
        struct tcmsg *tcm = nlmsg_data(n);
        struct nlattr *tca[TCA_MAX + 1];
        struct net_device *dev;
        u32 clid;
        struct Qdisc *q = NULL;
        struct Qdisc *p = NULL;
        int err;

        /* Deleting needs CAP_NET_ADMIN; a plain get does not. */
        if ((n->nlmsg_type != RTM_GETQDISC) &&
            !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
        if (err < 0)
                return err;

        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
        if (!dev)
                return -ENODEV;

        clid = tcm->tcm_parent;
        if (clid) {
                /* Addressed via parent: locate the child hanging off it. */
                if (clid != TC_H_ROOT) {
                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
                                if (!p)
                                        return -ENOENT;
                                q = qdisc_leaf(p, clid);
                        } else if (dev_ingress_queue(dev)) {
                                q = dev_ingress_queue(dev)->qdisc_sleeping;
                        }
                } else {
                        q = dev->qdisc;
                }
                if (!q)
                        return -ENOENT;

                /* When a handle is also supplied it must agree. */
                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
                        return -EINVAL;
        } else {
                /* No parent given: address the qdisc by handle alone. */
                q = qdisc_lookup(dev, tcm->tcm_handle);
                if (!q)
                        return -ENOENT;
        }

        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
                return -EINVAL;

        if (n->nlmsg_type == RTM_DELQDISC) {
                /* Grafting NULL in place of q detaches and destroys it. */
                if (!clid)
                        return -EINVAL;
                if (q->handle == 0)
                        return -ENOENT;
                err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
                if (err != 0)
                        return err;
        } else {
                qdisc_notify(net, skb, n, clid, NULL, q);
        }
        return 0;
}
1174
1175 /*
1176  * Create/change qdisc.
1177  */
1178
/* RTM_NEWQDISC handler: depending on the netlink flags and on what is
 * already attached at tcm_parent/tcm_handle, this either changes the
 * existing qdisc in place, grafts an existing one to a new position,
 * or creates a new one and grafts it.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
        struct net *net = sock_net(skb->sk);
        struct tcmsg *tcm;
        struct nlattr *tca[TCA_MAX + 1];
        struct net_device *dev;
        u32 clid;
        struct Qdisc *q, *p;
        int err;

        if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

replay:
        /* Reinit, just in case something touches this.
         * We come back here after qdisc_create() returned -EAGAIN
         * (module autoload dropped RTNL), so everything derived from
         * the message must be recomputed from scratch.
         */
        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
        if (err < 0)
                return err;

        tcm = nlmsg_data(n);
        clid = tcm->tcm_parent;
        q = p = NULL;

        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
        if (!dev)
                return -ENODEV;


        if (clid) {
                if (clid != TC_H_ROOT) {
                        if (clid != TC_H_INGRESS) {
                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
                                if (!p)
                                        return -ENOENT;
                                q = qdisc_leaf(p, clid);
                        } else if (dev_ingress_queue_create(dev)) {
                                q = dev_ingress_queue(dev)->qdisc_sleeping;
                        }
                } else {
                        q = dev->qdisc;
                }

                /* It may be default qdisc, ignore it */
                if (q && q->handle == 0)
                        q = NULL;

                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
                        if (tcm->tcm_handle) {
                                if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
                                        return -EEXIST;
                                if (TC_H_MIN(tcm->tcm_handle))
                                        return -EINVAL;
                                q = qdisc_lookup(dev, tcm->tcm_handle);
                                if (!q)
                                        goto create_n_graft;
                                if (n->nlmsg_flags & NLM_F_EXCL)
                                        return -EEXIST;
                                if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
                                        return -EINVAL;
                                /* Moving an existing qdisc under p must not
                                 * create a cycle in the hierarchy.
                                 */
                                if (q == p ||
                                    (p && check_loop(q, p, 0)))
                                        return -ELOOP;
                                /* Grafting consumes a reference on q. */
                                atomic_inc(&q->refcnt);
                                goto graft;
                        } else {
                                if (!q)
                                        goto create_n_graft;

                                /* This magic test requires explanation.
                                 *
                                 *   We know, that some child q is already
                                 *   attached to this parent and have choice:
                                 *   either to change it or to create/graft new one.
                                 *
                                 *   1. We are allowed to create/graft only
                                 *   if CREATE and REPLACE flags are set.
                                 *
                                 *   2. If EXCL is set, requestor wanted to say,
                                 *   that qdisc tcm_handle is not expected
                                 *   to exist, so that we choose create/graft too.
                                 *
                                 *   3. The last case is when no flags are set.
                                 *   Alas, it is sort of hole in API, we
                                 *   cannot decide what to do unambiguously.
                                 *   For now we select create/graft, if
                                 *   user gave KIND, which does not match existing.
                                 */
                                if ((n->nlmsg_flags & NLM_F_CREATE) &&
                                    (n->nlmsg_flags & NLM_F_REPLACE) &&
                                    ((n->nlmsg_flags & NLM_F_EXCL) ||
                                     (tca[TCA_KIND] &&
                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
                                        goto create_n_graft;
                        }
                }
        } else {
                if (!tcm->tcm_handle)
                        return -EINVAL;
                q = qdisc_lookup(dev, tcm->tcm_handle);
        }

        /* Change qdisc parameters */
        if (q == NULL)
                return -ENOENT;
        if (n->nlmsg_flags & NLM_F_EXCL)
                return -EEXIST;
        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
                return -EINVAL;
        err = qdisc_change(q, tca);
        if (err == 0)
                qdisc_notify(net, skb, n, clid, NULL, q);
        return err;

create_n_graft:
        if (!(n->nlmsg_flags & NLM_F_CREATE))
                return -ENOENT;
        if (clid == TC_H_INGRESS) {
                if (dev_ingress_queue(dev))
                        q = qdisc_create(dev, dev_ingress_queue(dev), p,
                                         tcm->tcm_parent, tcm->tcm_parent,
                                         tca, &err);
                else
                        err = -ENOENT;
        } else {
                struct netdev_queue *dev_queue;

                /* Let a classful parent pick the queue, fall back to the
                 * parent's own queue, or to tx queue 0 for a root qdisc.
                 */
                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
                else if (p)
                        dev_queue = p->dev_queue;
                else
                        dev_queue = netdev_get_tx_queue(dev, 0);

                q = qdisc_create(dev, dev_queue, p,
                                 tcm->tcm_parent, tcm->tcm_handle,
                                 tca, &err);
        }
        if (q == NULL) {
                if (err == -EAGAIN)
                        goto replay;
                return err;
        }

graft:
        err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
        if (err) {
                if (q)
                        qdisc_destroy(q);
                return err;
        }

        return 0;
}
1332
/* Fill @skb with one netlink message of type @event describing qdisc @q
 * (identity, kind, options, size table and statistics).  Returns
 * skb->len on success, or -1 after trimming the partial message when
 * the skb ran out of room.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
                         u32 portid, u32 seq, u16 flags, int event)
{
        struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
        struct gnet_stats_queue __percpu *cpu_qstats = NULL;
        struct tcmsg *tcm;
        struct nlmsghdr  *nlh;
        unsigned char *b = skb_tail_pointer(skb);
        struct gnet_dump d;
        struct qdisc_size_table *stab;
        __u32 qlen;

        cond_resched();
        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
        if (!nlh)
                goto out_nlmsg_trim;
        tcm = nlmsg_data(nlh);
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm__pad1 = 0;
        tcm->tcm__pad2 = 0;
        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
        tcm->tcm_parent = clid;
        tcm->tcm_handle = q->handle;
        /* tcm_info carries the refcount out to userspace. */
        tcm->tcm_info = atomic_read(&q->refcnt);
        if (nla_put_string(skb, TCA_KIND, q->ops->id))
                goto nla_put_failure;
        if (q->ops->dump && q->ops->dump(q, skb) < 0)
                goto nla_put_failure;
        /* Snapshot the queue length before copying statistics. */
        qlen = q->q.qlen;

        stab = rtnl_dereference(q->stab);
        if (stab && qdisc_dump_stab(skb, stab) < 0)
                goto nla_put_failure;

        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
                                         NULL, &d, TCA_PAD) < 0)
                goto nla_put_failure;

        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
                goto nla_put_failure;

        if (qdisc_is_percpu_stats(q)) {
                cpu_bstats = q->cpu_bstats;
                cpu_qstats = q->cpu_qstats;
        }

        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
                                  &d, cpu_bstats, &q->bstats) < 0 ||
            gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
            gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
                goto nla_put_failure;

        if (gnet_stats_finish_copy(&d) < 0)
                goto nla_put_failure;

        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
        return skb->len;

out_nlmsg_trim:
nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}
1396
1397 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1398 {
1399         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1400 }
1401
/* Send RTM_DELQDISC for @old and/or RTM_NEWQDISC for @new to the
 * requester and the RTNLGRP_TC multicast group.  Built-in qdiscs are
 * never reported.
 * NOTE(review): when both qdiscs are skipped (or NULL) nothing is sent
 * and -EINVAL is returned -- presumably callers never hit that; verify.
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
                        struct nlmsghdr *n, u32 clid,
                        struct Qdisc *old, struct Qdisc *new)
{
        struct sk_buff *skb;
        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
                return -ENOBUFS;

        if (old && !tc_qdisc_dump_ignore(old)) {
                if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
                                  0, RTM_DELQDISC) < 0)
                        goto err_out;
        }
        if (new && !tc_qdisc_dump_ignore(new)) {
                if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
                                  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
                        goto err_out;
        }

        if (skb->len)
                return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
                                      n->nlmsg_flags & NLM_F_ECHO);

err_out:
        kfree_skb(skb);
        return -EINVAL;
}
1432
/* Dump @root plus every qdisc hashed on its device into @skb, skipping
 * the first @s_q_idx entries (netlink dump resume).  *q_idx_p is
 * advanced past every entry visited or skipped.  Returns 0, or -1 when
 * the skb filled up and the dump must be resumed later.
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
                              struct netlink_callback *cb,
                              int *q_idx_p, int s_q_idx)
{
        int ret = 0, q_idx = *q_idx_p;
        struct Qdisc *q;
        int b;

        if (!root)
                return 0;

        /* The root itself is entry 0 of this tree. */
        q = root;
        if (q_idx < s_q_idx) {
                q_idx++;
        } else {
                if (!tc_qdisc_dump_ignore(q) &&
                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
                        goto done;
                q_idx++;
        }
        hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
                if (q_idx < s_q_idx) {
                        q_idx++;
                        continue;
                }
                if (!tc_qdisc_dump_ignore(q) &&
                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
                        goto done;
                q_idx++;
        }

out:
        *q_idx_p = q_idx;
        return ret;
done:
        ret = -1;
        goto out;
}
1473
/* Netlink dump callback for RTM_GETQDISC: walk every device in the
 * namespace and dump its root and ingress qdisc trees.  cb->args[0]
 * holds the device index to resume from, cb->args[1] the qdisc index
 * within that device.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        int idx, q_idx;
        int s_idx, s_q_idx;
        struct net_device *dev;

        s_idx = cb->args[0];
        s_q_idx = q_idx = cb->args[1];

        idx = 0;
        ASSERT_RTNL();
        for_each_netdev(net, dev) {
                struct netdev_queue *dev_queue;

                if (idx < s_idx)
                        goto cont;
                /* Only the resumed device keeps its saved qdisc offset. */
                if (idx > s_idx)
                        s_q_idx = 0;
                q_idx = 0;

                if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
                        goto done;

                dev_queue = dev_ingress_queue(dev);
                if (dev_queue &&
                    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
                                       &q_idx, s_q_idx) < 0)
                        goto done;

cont:
                idx++;
        }

done:
        cb->args[0] = idx;
        cb->args[1] = q_idx;

        return skb->len;
}
1514
1515
1516
1517 /************************************************
1518  *      Traffic classes manipulation.           *
1519  ************************************************/
1520
1521
1522
/* RTM_NEWTCLASS / RTM_DELTCLASS / RTM_GETTCLASS handler: resolve the
 * owning qdisc and the class addressed by tcm_parent/tcm_handle, then
 * dispatch to the qdisc's class ops (change/delete/notify).
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
{
        struct net *net = sock_net(skb->sk);
        struct tcmsg *tcm = nlmsg_data(n);
        struct nlattr *tca[TCA_MAX + 1];
        struct net_device *dev;
        struct Qdisc *q = NULL;
        const struct Qdisc_class_ops *cops;
        unsigned long cl = 0;
        unsigned long new_cl;
        u32 portid;
        u32 clid;
        u32 qid;
        int err;

        /* Getting a class is unprivileged; everything else needs
         * CAP_NET_ADMIN.
         */
        if ((n->nlmsg_type != RTM_GETTCLASS) &&
            !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
        if (err < 0)
                return err;

        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
        if (!dev)
                return -ENODEV;

        /*
           parent == TC_H_UNSPEC - unspecified parent.
           parent == TC_H_ROOT   - class is root, which has no parent.
           parent == X:0         - parent is root class.
           parent == X:Y         - parent is a node in hierarchy.
           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

           handle == 0:0         - generate handle from kernel pool.
           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
           handle == X:Y         - clear.
           handle == X:0         - root class.
         */

        /* Step 1. Determine qdisc handle X:0 */

        /* NB: despite its name, "portid" holds the parent class handle
         * here (historical naming), not a netlink port id.
         */
        portid = tcm->tcm_parent;
        clid = tcm->tcm_handle;
        qid = TC_H_MAJ(clid);

        if (portid != TC_H_ROOT) {
                u32 qid1 = TC_H_MAJ(portid);

                if (qid && qid1) {
                        /* If both majors are known, they must be identical. */
                        if (qid != qid1)
                                return -EINVAL;
                } else if (qid1) {
                        qid = qid1;
                } else if (qid == 0)
                        qid = dev->qdisc->handle;

                /* Now qid is genuine qdisc handle consistent
                 * both with parent and child.
                 *
                 * TC_H_MAJ(portid) still may be unspecified, complete it now.
                 */
                if (portid)
                        portid = TC_H_MAKE(qid, portid);
        } else {
                if (qid == 0)
                        qid = dev->qdisc->handle;
        }

        /* OK. Locate qdisc */
        q = qdisc_lookup(dev, qid);
        if (!q)
                return -ENOENT;

        /* And check that it supports classes */
        cops = q->ops->cl_ops;
        if (cops == NULL)
                return -EINVAL;

        /* Now try to get class */
        if (clid == 0) {
                if (portid == TC_H_ROOT)
                        clid = qid;
        } else
                clid = TC_H_MAKE(qid, clid);

        if (clid)
                cl = cops->get(q, clid);

        if (cl == 0) {
                /* Unknown class: only NEWTCLASS with NLM_F_CREATE may
                 * proceed (to create it below).
                 */
                err = -ENOENT;
                if (n->nlmsg_type != RTM_NEWTCLASS ||
                    !(n->nlmsg_flags & NLM_F_CREATE))
                        goto out;
        } else {
                switch (n->nlmsg_type) {
                case RTM_NEWTCLASS:
                        err = -EEXIST;
                        if (n->nlmsg_flags & NLM_F_EXCL)
                                goto out;
                        break;
                case RTM_DELTCLASS:
                        err = -EOPNOTSUPP;
                        if (cops->delete)
                                err = cops->delete(q, cl);
                        if (err == 0)
                                tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
                        goto out;
                case RTM_GETTCLASS:
                        err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
                        goto out;
                default:
                        err = -EINVAL;
                        goto out;
                }
        }

        /* Create or change the class via the qdisc's class ops. */
        new_cl = cl;
        err = -EOPNOTSUPP;
        if (cops->change)
                err = cops->change(q, clid, portid, tca, &new_cl);
        if (err == 0)
                tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
        if (cl)
                cops->put(q, cl);

        return err;
}
1654
1655
/* Fill @skb with one netlink message of type @event describing class
 * @cl of qdisc @q.  Returns skb->len on success, or -1 after trimming
 * the partial message when the skb ran out of room.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
                          unsigned long cl,
                          u32 portid, u32 seq, u16 flags, int event)
{
        struct tcmsg *tcm;
        struct nlmsghdr  *nlh;
        unsigned char *b = skb_tail_pointer(skb);
        struct gnet_dump d;
        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

        cond_resched();
        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
        if (!nlh)
                goto out_nlmsg_trim;
        tcm = nlmsg_data(nlh);
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm__pad1 = 0;
        tcm->tcm__pad2 = 0;
        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
        tcm->tcm_parent = q->handle;
        /* tcm_handle is overwritten by cl_ops->dump() below with the
         * real class id; it defaults to the qdisc handle.
         */
        tcm->tcm_handle = q->handle;
        tcm->tcm_info = 0;
        if (nla_put_string(skb, TCA_KIND, q->ops->id))
                goto nla_put_failure;
        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
                goto nla_put_failure;

        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
                                         NULL, &d, TCA_PAD) < 0)
                goto nla_put_failure;

        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
                goto nla_put_failure;

        if (gnet_stats_finish_copy(&d) < 0)
                goto nla_put_failure;

        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
        return skb->len;

out_nlmsg_trim:
nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}
1701
1702 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1703                          struct nlmsghdr *n, struct Qdisc *q,
1704                          unsigned long cl, int event)
1705 {
1706         struct sk_buff *skb;
1707         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1708
1709         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1710         if (!skb)
1711                 return -ENOBUFS;
1712
1713         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1714                 kfree_skb(skb);
1715                 return -EINVAL;
1716         }
1717
1718         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1719                               n->nlmsg_flags & NLM_F_ECHO);
1720 }
1721
/* Cookie passed to the class walker by tc_dump_tclass_qdisc();
 * qdisc_class_dump() casts the embedded walker back to this.
 */
struct qdisc_dump_args {
        struct qdisc_walker     w;      /* must stay first for the cast */
        struct sk_buff          *skb;   /* dump skb being filled */
        struct netlink_callback *cb;    /* netlink dump state */
};
1727
1728 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1729 {
1730         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1731
1732         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1733                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1734 }
1735
/* Dump all classes of one qdisc @q, honouring the dump resume state:
 * qdiscs before @s_t are skipped, and cb->args[1] tracks how many
 * classes were already dumped within the current qdisc.  Returns 0, or
 * -1 when the skb filled up.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
                                struct tcmsg *tcm, struct netlink_callback *cb,
                                int *t_p, int s_t)
{
        struct qdisc_dump_args arg;

        /* Skip builtins, already-dumped qdiscs, classless qdiscs and
         * qdiscs filtered out by the requested parent.
         */
        if (tc_qdisc_dump_ignore(q) ||
            *t_p < s_t || !q->ops->cl_ops ||
            (tcm->tcm_parent &&
             TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
                (*t_p)++;
                return 0;
        }
        /* Entering a new qdisc: reset the per-qdisc resume counters. */
        if (*t_p > s_t)
                memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
        arg.w.fn = qdisc_class_dump;
        arg.skb = skb;
        arg.cb = cb;
        arg.w.stop  = 0;
        arg.w.skip = cb->args[1];
        arg.w.count = 0;
        q->ops->cl_ops->walk(q, &arg.w);
        cb->args[1] = arg.w.count;
        if (arg.w.stop)
                return -1;
        (*t_p)++;
        return 0;
}
1764
1765 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1766                                struct tcmsg *tcm, struct netlink_callback *cb,
1767                                int *t_p, int s_t)
1768 {
1769         struct Qdisc *q;
1770         int b;
1771
1772         if (!root)
1773                 return 0;
1774
1775         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1776                 return -1;
1777
1778         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1779                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1780                         return -1;
1781         }
1782
1783         return 0;
1784 }
1785
1786 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1787 {
1788         struct tcmsg *tcm = nlmsg_data(cb->nlh);
1789         struct net *net = sock_net(skb->sk);
1790         struct netdev_queue *dev_queue;
1791         struct net_device *dev;
1792         int t, s_t;
1793
1794         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1795                 return 0;
1796         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1797         if (!dev)
1798                 return 0;
1799
1800         s_t = cb->args[0];
1801         t = 0;
1802
1803         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1804                 goto done;
1805
1806         dev_queue = dev_ingress_queue(dev);
1807         if (dev_queue &&
1808             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1809                                 &t, s_t) < 0)
1810                 goto done;
1811
1812 done:
1813         cb->args[0] = t;
1814
1815         dev_put(dev);
1816         return skb->len;
1817 }
1818
/* Main classifier routine: scans the classifier chain attached
 * to this qdisc, (optionally) tests for protocol and asks
 * specific classifiers.
 *
 * Returns the first classifier verdict >= 0 (a TC_ACT_* code), or
 * TC_ACT_UNSPEC when no classifier claimed the packet. With
 * CONFIG_NET_CLS_ACT, a TC_ACT_RECLASSIFY verdict (outside compat
 * mode) restarts the scan from the head of the chain, bounded by
 * MAX_REC_LOOP rounds.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
                struct tcf_result *res, bool compat_mode)
{
        __be16 protocol = tc_skb_protocol(skb);
#ifdef CONFIG_NET_CLS_ACT
        const struct tcf_proto *old_tp = tp;    /* chain head, for restarts */
        int limit = 0;                          /* counts reclassify rounds */

reclassify:
#endif
        for (; tp; tp = rcu_dereference_bh(tp->next)) {
                int err;

                /* Skip classifiers bound to a different protocol. */
                if (tp->protocol != protocol &&
                    tp->protocol != htons(ETH_P_ALL))
                        continue;

                err = tp->classify(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
                if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode))
                        goto reset;
#endif
                /* Negative means "no match, try the next classifier". */
                if (err >= 0)
                        return err;
        }

        return TC_ACT_UNSPEC; /* signal: continue lookup */
#ifdef CONFIG_NET_CLS_ACT
reset:
        /* An action requested reclassification: restart at the chain
         * head with the (possibly rewritten) skb protocol, but give
         * up after MAX_REC_LOOP rounds so a misconfigured ruleset
         * cannot loop forever.
         */
        if (unlikely(limit++ >= MAX_REC_LOOP)) {
                net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n",
                                       tp->q->ops->id, tp->prio & 0xffff,
                                       ntohs(tp->protocol));
                return TC_ACT_SHOT;
        }

        tp = old_tp;
        protocol = tc_skb_protocol(skb);
        goto reclassify;
#endif
}
EXPORT_SYMBOL(tc_classify);
1865
1866 bool tcf_destroy(struct tcf_proto *tp, bool force)
1867 {
1868         if (tp->ops->destroy(tp, force)) {
1869                 module_put(tp->ops->owner);
1870                 kfree_rcu(tp, rcu);
1871                 return true;
1872         }
1873
1874         return false;
1875 }
1876
1877 void tcf_destroy_chain(struct tcf_proto __rcu **fl)
1878 {
1879         struct tcf_proto *tp;
1880
1881         while ((tp = rtnl_dereference(*fl)) != NULL) {
1882                 RCU_INIT_POINTER(*fl, tp->next);
1883                 tcf_destroy(tp, true);
1884         }
1885 }
1886 EXPORT_SYMBOL(tcf_destroy_chain);
1887
#ifdef CONFIG_PROC_FS
/* Emit the four /proc/net/psched conversion constants as fixed-width
 * hex words. NOTE(review): the exact fields and format look like a
 * userspace-visible ABI — confirm before changing anything here.
 */
static int psched_show(struct seq_file *seq, void *v)
{
        seq_printf(seq, "%08x %08x %08x %08x\n",
                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
                   1000000,
                   (u32)NSEC_PER_SEC / hrtimer_resolution);

        return 0;
}

/* Open handler: single-shot seq_file, no private data. */
static int psched_open(struct inode *inode, struct file *file)
{
        return single_open(file, psched_show, NULL);
}

static const struct file_operations psched_fops = {
        .owner = THIS_MODULE,
        .open = psched_open,
        .read  = seq_read,
        .llseek = seq_lseek,
        .release = single_release,
};

/* Create /proc/net/psched for this network namespace. */
static int __net_init psched_net_init(struct net *net)
{
        struct proc_dir_entry *e;

        e = proc_create("psched", 0, net->proc_net, &psched_fops);
        if (e == NULL)
                return -ENOMEM;

        return 0;
}

/* Remove /proc/net/psched when the namespace goes away. */
static void __net_exit psched_net_exit(struct net *net)
{
        remove_proc_entry("psched", net->proc_net);
}
#else
/* No procfs: the per-netns hooks become no-ops. */
static int __net_init psched_net_init(struct net *net)
{
        return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif
1937
/* Per-netns setup/teardown of the psched proc entry (no-ops when
 * CONFIG_PROC_FS is off).
 */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};
1942
1943 static int __init pktsched_init(void)
1944 {
1945         int err;
1946
1947         err = register_pernet_subsys(&psched_net_ops);
1948         if (err) {
1949                 pr_err("pktsched_init: "
1950                        "cannot initialize per netns operations\n");
1951                 return err;
1952         }
1953
1954         register_qdisc(&pfifo_fast_ops);
1955         register_qdisc(&pfifo_qdisc_ops);
1956         register_qdisc(&bfifo_qdisc_ops);
1957         register_qdisc(&pfifo_head_drop_qdisc_ops);
1958         register_qdisc(&mq_qdisc_ops);
1959         register_qdisc(&noqueue_qdisc_ops);
1960
1961         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1962         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1963         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1964         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1965         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1966         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1967
1968         return 0;
1969 }
1970
1971 subsys_initcall(pktsched_init);