rxrpc: Split sendmsg from packet transmission code
[cascardo/linux.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32 #include <linux/hashtable.h>
33
34 #include <net/net_namespace.h>
35 #include <net/sock.h>
36 #include <net/netlink.h>
37 #include <net/pkt_sched.h>
38
39 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
40                         struct nlmsghdr *n, u32 clid,
41                         struct Qdisc *old, struct Qdisc *new);
42 static int tclass_notify(struct net *net, struct sk_buff *oskb,
43                          struct nlmsghdr *n, struct Qdisc *q,
44                          unsigned long cl, int event);
45
46 /*
47
48    Short review.
49    -------------
50
51    This file consists of two interrelated parts:
52
53    1. queueing disciplines manager frontend.
54    2. traffic classes manager frontend.
55
56    Generally, queueing discipline ("qdisc") is a black box,
57    which is able to enqueue packets and to dequeue them (when
58    device is ready to send something) in order and at times
59    determined by algorithm hidden in it.
60
61    qdiscs are divided into two categories:
62    - "queues", which have no internal structure visible from outside.
63    - "schedulers", which split all the packets into "traffic classes",
64      using "packet classifiers" (look at cls_api.c)
65
66    In turn, classes may have child qdiscs (as rule, queues)
67    attached to them etc. etc. etc.
68
69    The goal of the routines in this file is to translate
70    information supplied by the user in the form of handles
71    into a form more intelligible to the kernel, to perform some
72    sanity checks and the part of the work that is common to all
73    qdiscs, and to provide rtnetlink notifications.
74
75    All real intelligent work is done inside qdisc modules.
76
77
78
79    Every discipline has two major routines: enqueue and dequeue.
80
81    ---dequeue
82
83    dequeue usually returns a skb to send. It is allowed to return NULL,
84    but it does not mean that queue is empty, it just means that
85    discipline does not want to send anything this time.
86    Queue is really empty if q->q.qlen == 0.
87    For complicated disciplines with multiple queues q->q is not
88    real packet queue, but however q->q.qlen must be valid.
89
90    ---enqueue
91
92    enqueue returns 0 if the packet was enqueued successfully.
93    If a packet (this one or another one) was dropped, it returns
94    a non-zero error code.
95    NET_XMIT_DROP        - this packet dropped
96      Expected action: do not backoff, but wait until queue will clear.
97    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
98      Expected action: backoff or ignore
99
100    Auxiliary routines:
101
102    ---peek
103
104    like dequeue but without removing a packet from the queue
105
106    ---reset
107
108    returns qdisc to initial state: purge all buffers, clear all
109    timers, counters (except for statistics) etc.
110
111    ---init
112
113    initializes newly created qdisc.
114
115    ---destroy
116
117    destroys resources allocated by init and during lifetime of qdisc.
118
119    ---change
120
121    changes qdisc parameters.
122  */
123
124 /* Protects list of registered TC modules. It is pure SMP lock. */
125 static DEFINE_RWLOCK(qdisc_mod_lock);
126
127
128 /************************************************
129  *      Queueing disciplines manipulation.      *
130  ************************************************/
131
132
133 /* The list of all installed queueing disciplines. */
134
135 static struct Qdisc_ops *qdisc_base;
136
137 /* Register/unregister queueing discipline */
138
139 int register_qdisc(struct Qdisc_ops *qops)
140 {
141         struct Qdisc_ops *q, **qp;
142         int rc = -EEXIST;
143
144         write_lock(&qdisc_mod_lock);
145         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
146                 if (!strcmp(qops->id, q->id))
147                         goto out;
148
149         if (qops->enqueue == NULL)
150                 qops->enqueue = noop_qdisc_ops.enqueue;
151         if (qops->peek == NULL) {
152                 if (qops->dequeue == NULL)
153                         qops->peek = noop_qdisc_ops.peek;
154                 else
155                         goto out_einval;
156         }
157         if (qops->dequeue == NULL)
158                 qops->dequeue = noop_qdisc_ops.dequeue;
159
160         if (qops->cl_ops) {
161                 const struct Qdisc_class_ops *cops = qops->cl_ops;
162
163                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
164                         goto out_einval;
165
166                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
167                         goto out_einval;
168         }
169
170         qops->next = NULL;
171         *qp = qops;
172         rc = 0;
173 out:
174         write_unlock(&qdisc_mod_lock);
175         return rc;
176
177 out_einval:
178         rc = -EINVAL;
179         goto out;
180 }
181 EXPORT_SYMBOL(register_qdisc);
182
183 int unregister_qdisc(struct Qdisc_ops *qops)
184 {
185         struct Qdisc_ops *q, **qp;
186         int err = -ENOENT;
187
188         write_lock(&qdisc_mod_lock);
189         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
190                 if (q == qops)
191                         break;
192         if (q) {
193                 *qp = q->next;
194                 q->next = NULL;
195                 err = 0;
196         }
197         write_unlock(&qdisc_mod_lock);
198         return err;
199 }
200 EXPORT_SYMBOL(unregister_qdisc);
201
202 /* Get default qdisc if not otherwise specified */
203 void qdisc_get_default(char *name, size_t len)
204 {
205         read_lock(&qdisc_mod_lock);
206         strlcpy(name, default_qdisc_ops->id, len);
207         read_unlock(&qdisc_mod_lock);
208 }
209
210 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
211 {
212         struct Qdisc_ops *q = NULL;
213
214         for (q = qdisc_base; q; q = q->next) {
215                 if (!strcmp(name, q->id)) {
216                         if (!try_module_get(q->owner))
217                                 q = NULL;
218                         break;
219                 }
220         }
221
222         return q;
223 }
224
225 /* Set new default qdisc to use */
226 int qdisc_set_default(const char *name)
227 {
228         const struct Qdisc_ops *ops;
229
230         if (!capable(CAP_NET_ADMIN))
231                 return -EPERM;
232
233         write_lock(&qdisc_mod_lock);
234         ops = qdisc_lookup_default(name);
235         if (!ops) {
236                 /* Not found, drop lock and try to load module */
237                 write_unlock(&qdisc_mod_lock);
238                 request_module("sch_%s", name);
239                 write_lock(&qdisc_mod_lock);
240
241                 ops = qdisc_lookup_default(name);
242         }
243
244         if (ops) {
245                 /* Set new default */
246                 module_put(default_qdisc_ops->owner);
247                 default_qdisc_ops = ops;
248         }
249         write_unlock(&qdisc_mod_lock);
250
251         return ops ? 0 : -ENOENT;
252 }
253
254 /* We know handle. Find qdisc among all qdisc's attached to device
255  * (root qdisc, all its children, children of children etc.)
256  * Note: caller either uses rtnl or rcu_read_lock()
257  */
258
259 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
260 {
261         struct Qdisc *q;
262
263         if (!qdisc_dev(root))
264                 return (root->handle == handle ? root : NULL);
265
266         if (!(root->flags & TCQ_F_BUILTIN) &&
267             root->handle == handle)
268                 return root;
269
270         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
271                 if (q->handle == handle)
272                         return q;
273         }
274         return NULL;
275 }
276
277 void qdisc_hash_add(struct Qdisc *q)
278 {
279         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
280                 struct Qdisc *root = qdisc_dev(q)->qdisc;
281
282                 WARN_ON_ONCE(root == &noop_qdisc);
283                 ASSERT_RTNL();
284                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285         }
286 }
287 EXPORT_SYMBOL(qdisc_hash_add);
288
289 void qdisc_hash_del(struct Qdisc *q)
290 {
291         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
292                 ASSERT_RTNL();
293                 hash_del_rcu(&q->hash);
294         }
295 }
296 EXPORT_SYMBOL(qdisc_hash_del);
297
298 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
299 {
300         struct Qdisc *q;
301
302         q = qdisc_match_from_root(dev->qdisc, handle);
303         if (q)
304                 goto out;
305
306         if (dev_ingress_queue(dev))
307                 q = qdisc_match_from_root(
308                         dev_ingress_queue(dev)->qdisc_sleeping,
309                         handle);
310 out:
311         return q;
312 }
313
314 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
315 {
316         unsigned long cl;
317         struct Qdisc *leaf;
318         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
319
320         if (cops == NULL)
321                 return NULL;
322         cl = cops->get(p, classid);
323
324         if (cl == 0)
325                 return NULL;
326         leaf = cops->leaf(p, cl);
327         cops->put(p, cl);
328         return leaf;
329 }
330
331 /* Find queueing discipline by name */
332
333 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
334 {
335         struct Qdisc_ops *q = NULL;
336
337         if (kind) {
338                 read_lock(&qdisc_mod_lock);
339                 for (q = qdisc_base; q; q = q->next) {
340                         if (nla_strcmp(kind, q->id) == 0) {
341                                 if (!try_module_get(q->owner))
342                                         q = NULL;
343                                 break;
344                         }
345                 }
346                 read_unlock(&qdisc_mod_lock);
347         }
348         return q;
349 }
350
351 /* The linklayer setting were not transferred from iproute2, in older
352  * versions, and the rate tables lookup systems have been dropped in
353  * the kernel. To keep backward compatible with older iproute2 tc
354  * utils, we detect the linklayer setting by detecting if the rate
355  * table were modified.
356  *
357  * For linklayer ATM table entries, the rate table will be aligned to
358  * 48 bytes, thus some table entries will contain the same value.  The
359  * mpu (min packet unit) is also encoded into the old rate table, thus
360  * starting from the mpu, we find low and high table entries for
361  * mapping this cell.  If these entries contain the same value, when
362  * the rate tables have been modified for linklayer ATM.
363  *
364  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
365  * and then roundup to the next cell, calc the table entry one below,
366  * and compare.
367  */
368 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
369 {
370         int low       = roundup(r->mpu, 48);
371         int high      = roundup(low+1, 48);
372         int cell_low  = low >> r->cell_log;
373         int cell_high = (high >> r->cell_log) - 1;
374
375         /* rtab is too inaccurate at rates > 100Mbit/s */
376         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
377                 pr_debug("TC linklayer: Giving up ATM detection\n");
378                 return TC_LINKLAYER_ETHERNET;
379         }
380
381         if ((cell_high > cell_low) && (cell_high < 256)
382             && (rtab[cell_low] == rtab[cell_high])) {
383                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
384                          cell_low, cell_high, rtab[cell_high]);
385                 return TC_LINKLAYER_ATM;
386         }
387         return TC_LINKLAYER_ETHERNET;
388 }
389
390 static struct qdisc_rate_table *qdisc_rtab_list;
391
392 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
393 {
394         struct qdisc_rate_table *rtab;
395
396         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
397             nla_len(tab) != TC_RTAB_SIZE)
398                 return NULL;
399
400         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
401                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
402                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
403                         rtab->refcnt++;
404                         return rtab;
405                 }
406         }
407
408         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
409         if (rtab) {
410                 rtab->rate = *r;
411                 rtab->refcnt = 1;
412                 memcpy(rtab->data, nla_data(tab), 1024);
413                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
414                         r->linklayer = __detect_linklayer(r, rtab->data);
415                 rtab->next = qdisc_rtab_list;
416                 qdisc_rtab_list = rtab;
417         }
418         return rtab;
419 }
420 EXPORT_SYMBOL(qdisc_get_rtab);
421
422 void qdisc_put_rtab(struct qdisc_rate_table *tab)
423 {
424         struct qdisc_rate_table *rtab, **rtabp;
425
426         if (!tab || --tab->refcnt)
427                 return;
428
429         for (rtabp = &qdisc_rtab_list;
430              (rtab = *rtabp) != NULL;
431              rtabp = &rtab->next) {
432                 if (rtab == tab) {
433                         *rtabp = rtab->next;
434                         kfree(rtab);
435                         return;
436                 }
437         }
438 }
439 EXPORT_SYMBOL(qdisc_put_rtab);
440
441 static LIST_HEAD(qdisc_stab_list);
442 static DEFINE_SPINLOCK(qdisc_stab_lock);
443
444 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
445         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
446         [TCA_STAB_DATA] = { .type = NLA_BINARY },
447 };
448
449 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
450 {
451         struct nlattr *tb[TCA_STAB_MAX + 1];
452         struct qdisc_size_table *stab;
453         struct tc_sizespec *s;
454         unsigned int tsize = 0;
455         u16 *tab = NULL;
456         int err;
457
458         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
459         if (err < 0)
460                 return ERR_PTR(err);
461         if (!tb[TCA_STAB_BASE])
462                 return ERR_PTR(-EINVAL);
463
464         s = nla_data(tb[TCA_STAB_BASE]);
465
466         if (s->tsize > 0) {
467                 if (!tb[TCA_STAB_DATA])
468                         return ERR_PTR(-EINVAL);
469                 tab = nla_data(tb[TCA_STAB_DATA]);
470                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
471         }
472
473         if (tsize != s->tsize || (!tab && tsize > 0))
474                 return ERR_PTR(-EINVAL);
475
476         spin_lock(&qdisc_stab_lock);
477
478         list_for_each_entry(stab, &qdisc_stab_list, list) {
479                 if (memcmp(&stab->szopts, s, sizeof(*s)))
480                         continue;
481                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
482                         continue;
483                 stab->refcnt++;
484                 spin_unlock(&qdisc_stab_lock);
485                 return stab;
486         }
487
488         spin_unlock(&qdisc_stab_lock);
489
490         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
491         if (!stab)
492                 return ERR_PTR(-ENOMEM);
493
494         stab->refcnt = 1;
495         stab->szopts = *s;
496         if (tsize > 0)
497                 memcpy(stab->data, tab, tsize * sizeof(u16));
498
499         spin_lock(&qdisc_stab_lock);
500         list_add_tail(&stab->list, &qdisc_stab_list);
501         spin_unlock(&qdisc_stab_lock);
502
503         return stab;
504 }
505
506 static void stab_kfree_rcu(struct rcu_head *head)
507 {
508         kfree(container_of(head, struct qdisc_size_table, rcu));
509 }
510
511 void qdisc_put_stab(struct qdisc_size_table *tab)
512 {
513         if (!tab)
514                 return;
515
516         spin_lock(&qdisc_stab_lock);
517
518         if (--tab->refcnt == 0) {
519                 list_del(&tab->list);
520                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
521         }
522
523         spin_unlock(&qdisc_stab_lock);
524 }
525 EXPORT_SYMBOL(qdisc_put_stab);
526
527 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
528 {
529         struct nlattr *nest;
530
531         nest = nla_nest_start(skb, TCA_STAB);
532         if (nest == NULL)
533                 goto nla_put_failure;
534         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
535                 goto nla_put_failure;
536         nla_nest_end(skb, nest);
537
538         return skb->len;
539
540 nla_put_failure:
541         return -1;
542 }
543
544 void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
545 {
546         int pkt_len, slot;
547
548         pkt_len = skb->len + stab->szopts.overhead;
549         if (unlikely(!stab->szopts.tsize))
550                 goto out;
551
552         slot = pkt_len + stab->szopts.cell_align;
553         if (unlikely(slot < 0))
554                 slot = 0;
555
556         slot >>= stab->szopts.cell_log;
557         if (likely(slot < stab->szopts.tsize))
558                 pkt_len = stab->data[slot];
559         else
560                 pkt_len = stab->data[stab->szopts.tsize - 1] *
561                                 (slot / stab->szopts.tsize) +
562                                 stab->data[slot % stab->szopts.tsize];
563
564         pkt_len <<= stab->szopts.size_log;
565 out:
566         if (unlikely(pkt_len < 1))
567                 pkt_len = 1;
568         qdisc_skb_cb(skb)->pkt_len = pkt_len;
569 }
570 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
571
572 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
573 {
574         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
575                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
576                         txt, qdisc->ops->id, qdisc->handle >> 16);
577                 qdisc->flags |= TCQ_F_WARN_NONWC;
578         }
579 }
580 EXPORT_SYMBOL(qdisc_warn_nonwc);
581
582 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
583 {
584         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
585                                                  timer);
586
587         rcu_read_lock();
588         __netif_schedule(qdisc_root(wd->qdisc));
589         rcu_read_unlock();
590
591         return HRTIMER_NORESTART;
592 }
593
594 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
595 {
596         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
597         wd->timer.function = qdisc_watchdog;
598         wd->qdisc = qdisc;
599 }
600 EXPORT_SYMBOL(qdisc_watchdog_init);
601
602 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
603 {
604         if (test_bit(__QDISC_STATE_DEACTIVATED,
605                      &qdisc_root_sleeping(wd->qdisc)->state))
606                 return;
607
608         if (wd->last_expires == expires)
609                 return;
610
611         wd->last_expires = expires;
612         hrtimer_start(&wd->timer,
613                       ns_to_ktime(expires),
614                       HRTIMER_MODE_ABS_PINNED);
615 }
616 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
617
618 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
619 {
620         hrtimer_cancel(&wd->timer);
621 }
622 EXPORT_SYMBOL(qdisc_watchdog_cancel);
623
624 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
625 {
626         unsigned int size = n * sizeof(struct hlist_head), i;
627         struct hlist_head *h;
628
629         if (size <= PAGE_SIZE)
630                 h = kmalloc(size, GFP_KERNEL);
631         else
632                 h = (struct hlist_head *)
633                         __get_free_pages(GFP_KERNEL, get_order(size));
634
635         if (h != NULL) {
636                 for (i = 0; i < n; i++)
637                         INIT_HLIST_HEAD(&h[i]);
638         }
639         return h;
640 }
641
642 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
643 {
644         unsigned int size = n * sizeof(struct hlist_head);
645
646         if (size <= PAGE_SIZE)
647                 kfree(h);
648         else
649                 free_pages((unsigned long)h, get_order(size));
650 }
651
652 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
653 {
654         struct Qdisc_class_common *cl;
655         struct hlist_node *next;
656         struct hlist_head *nhash, *ohash;
657         unsigned int nsize, nmask, osize;
658         unsigned int i, h;
659
660         /* Rehash when load factor exceeds 0.75 */
661         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
662                 return;
663         nsize = clhash->hashsize * 2;
664         nmask = nsize - 1;
665         nhash = qdisc_class_hash_alloc(nsize);
666         if (nhash == NULL)
667                 return;
668
669         ohash = clhash->hash;
670         osize = clhash->hashsize;
671
672         sch_tree_lock(sch);
673         for (i = 0; i < osize; i++) {
674                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
675                         h = qdisc_class_hash(cl->classid, nmask);
676                         hlist_add_head(&cl->hnode, &nhash[h]);
677                 }
678         }
679         clhash->hash     = nhash;
680         clhash->hashsize = nsize;
681         clhash->hashmask = nmask;
682         sch_tree_unlock(sch);
683
684         qdisc_class_hash_free(ohash, osize);
685 }
686 EXPORT_SYMBOL(qdisc_class_hash_grow);
687
688 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
689 {
690         unsigned int size = 4;
691
692         clhash->hash = qdisc_class_hash_alloc(size);
693         if (clhash->hash == NULL)
694                 return -ENOMEM;
695         clhash->hashsize  = size;
696         clhash->hashmask  = size - 1;
697         clhash->hashelems = 0;
698         return 0;
699 }
700 EXPORT_SYMBOL(qdisc_class_hash_init);
701
702 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
703 {
704         qdisc_class_hash_free(clhash->hash, clhash->hashsize);
705 }
706 EXPORT_SYMBOL(qdisc_class_hash_destroy);
707
708 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
709                              struct Qdisc_class_common *cl)
710 {
711         unsigned int h;
712
713         INIT_HLIST_NODE(&cl->hnode);
714         h = qdisc_class_hash(cl->classid, clhash->hashmask);
715         hlist_add_head(&cl->hnode, &clhash->hash[h]);
716         clhash->hashelems++;
717 }
718 EXPORT_SYMBOL(qdisc_class_hash_insert);
719
720 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
721                              struct Qdisc_class_common *cl)
722 {
723         hlist_del(&cl->hnode);
724         clhash->hashelems--;
725 }
726 EXPORT_SYMBOL(qdisc_class_hash_remove);
727
728 /* Allocate an unique handle from space managed by kernel
729  * Possible range is [8000-FFFF]:0000 (0x8000 values)
730  */
731 static u32 qdisc_alloc_handle(struct net_device *dev)
732 {
733         int i = 0x8000;
734         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
735
736         do {
737                 autohandle += TC_H_MAKE(0x10000U, 0);
738                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
739                         autohandle = TC_H_MAKE(0x80000000U, 0);
740                 if (!qdisc_lookup(dev, autohandle))
741                         return autohandle;
742                 cond_resched();
743         } while (--i > 0);
744
745         return 0;
746 }
747
748 void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
749                                unsigned int len)
750 {
751         const struct Qdisc_class_ops *cops;
752         unsigned long cl;
753         u32 parentid;
754         int drops;
755
756         if (n == 0 && len == 0)
757                 return;
758         drops = max_t(int, n, 0);
759         rcu_read_lock();
760         while ((parentid = sch->parent)) {
761                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
762                         break;
763
764                 if (sch->flags & TCQ_F_NOPARENT)
765                         break;
766                 /* TODO: perform the search on a per txq basis */
767                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
768                 if (sch == NULL) {
769                         WARN_ON_ONCE(parentid != TC_H_ROOT);
770                         break;
771                 }
772                 cops = sch->ops->cl_ops;
773                 if (cops->qlen_notify) {
774                         cl = cops->get(sch, parentid);
775                         cops->qlen_notify(sch, cl);
776                         cops->put(sch, cl);
777                 }
778                 sch->q.qlen -= n;
779                 sch->qstats.backlog -= len;
780                 __qdisc_qstats_drop(sch, drops);
781         }
782         rcu_read_unlock();
783 }
784 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
785
786 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
787                                struct nlmsghdr *n, u32 clid,
788                                struct Qdisc *old, struct Qdisc *new)
789 {
790         if (new || old)
791                 qdisc_notify(net, skb, n, clid, old, new);
792
793         if (old)
794                 qdisc_destroy(old);
795 }
796
797 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
798  * to device "dev".
799  *
800  * When appropriate send a netlink notification using 'skb'
801  * and "n".
802  *
803  * On success, destroy old qdisc.
804  */
805
806 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
807                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
808                        struct Qdisc *new, struct Qdisc *old)
809 {
810         struct Qdisc *q = old;
811         struct net *net = dev_net(dev);
812         int err = 0;
813
814         if (parent == NULL) {
815                 unsigned int i, num_q, ingress;
816
817                 ingress = 0;
818                 num_q = dev->num_tx_queues;
819                 if ((q && q->flags & TCQ_F_INGRESS) ||
820                     (new && new->flags & TCQ_F_INGRESS)) {
821                         num_q = 1;
822                         ingress = 1;
823                         if (!dev_ingress_queue(dev))
824                                 return -ENOENT;
825                 }
826
827                 if (dev->flags & IFF_UP)
828                         dev_deactivate(dev);
829
830                 if (new && new->ops->attach)
831                         goto skip;
832
833                 for (i = 0; i < num_q; i++) {
834                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
835
836                         if (!ingress)
837                                 dev_queue = netdev_get_tx_queue(dev, i);
838
839                         old = dev_graft_qdisc(dev_queue, new);
840                         if (new && i > 0)
841                                 atomic_inc(&new->refcnt);
842
843                         if (!ingress)
844                                 qdisc_destroy(old);
845                 }
846
847 skip:
848                 if (!ingress) {
849                         notify_and_destroy(net, skb, n, classid,
850                                            dev->qdisc, new);
851                         if (new && !new->ops->attach)
852                                 atomic_inc(&new->refcnt);
853                         dev->qdisc = new ? : &noop_qdisc;
854
855                         if (new && new->ops->attach)
856                                 new->ops->attach(new);
857                 } else {
858                         notify_and_destroy(net, skb, n, classid, old, new);
859                 }
860
861                 if (dev->flags & IFF_UP)
862                         dev_activate(dev);
863         } else {
864                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
865
866                 err = -EOPNOTSUPP;
867                 if (cops && cops->graft) {
868                         unsigned long cl = cops->get(parent, classid);
869                         if (cl) {
870                                 err = cops->graft(parent, cl, new, &old);
871                                 cops->put(parent, cl);
872                         } else
873                                 err = -ENOENT;
874                 }
875                 if (!err)
876                         notify_and_destroy(net, skb, n, classid, old, new);
877         }
878         return err;
879 }
880
881 /* lockdep annotation is needed for ingress; egress gets it only for name */
882 static struct lock_class_key qdisc_tx_lock;
883 static struct lock_class_key qdisc_rx_lock;
884
885 /*
886    Allocate and initialize new qdisc.
887
888    Parameters are passed via opt.
889  */
890
891 static struct Qdisc *
892 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
893              struct Qdisc *p, u32 parent, u32 handle,
894              struct nlattr **tca, int *errp)
895 {
896         int err;
897         struct nlattr *kind = tca[TCA_KIND];
898         struct Qdisc *sch;
899         struct Qdisc_ops *ops;
900         struct qdisc_size_table *stab;
901
902         ops = qdisc_lookup_ops(kind);
903 #ifdef CONFIG_MODULES
904         if (ops == NULL && kind != NULL) {
905                 char name[IFNAMSIZ];
906                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
907                         /* We dropped the RTNL semaphore in order to
908                          * perform the module load.  So, even if we
909                          * succeeded in loading the module we have to
910                          * tell the caller to replay the request.  We
911                          * indicate this using -EAGAIN.
912                          * We replay the request because the device may
913                          * go away in the mean time.
914                          */
915                         rtnl_unlock();
916                         request_module("sch_%s", name);
917                         rtnl_lock();
918                         ops = qdisc_lookup_ops(kind);
919                         if (ops != NULL) {
920                                 /* We will try again qdisc_lookup_ops,
921                                  * so don't keep a reference.
922                                  */
923                                 module_put(ops->owner);
924                                 err = -EAGAIN;
925                                 goto err_out;
926                         }
927                 }
928         }
929 #endif
930
931         err = -ENOENT;
932         if (ops == NULL)
933                 goto err_out;
934
935         sch = qdisc_alloc(dev_queue, ops);
936         if (IS_ERR(sch)) {
937                 err = PTR_ERR(sch);
938                 goto err_out2;
939         }
940
941         sch->parent = parent;
942
943         if (handle == TC_H_INGRESS) {
944                 sch->flags |= TCQ_F_INGRESS;
945                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
946                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
947         } else {
948                 if (handle == 0) {
949                         handle = qdisc_alloc_handle(dev);
950                         err = -ENOMEM;
951                         if (handle == 0)
952                                 goto err_out3;
953                 }
954                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
955                 if (!netif_is_multiqueue(dev))
956                         sch->flags |= TCQ_F_ONETXQUEUE;
957         }
958
959         sch->handle = handle;
960
961         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
962                 if (qdisc_is_percpu_stats(sch)) {
963                         sch->cpu_bstats =
964                                 netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
965                         if (!sch->cpu_bstats)
966                                 goto err_out4;
967
968                         sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
969                         if (!sch->cpu_qstats)
970                                 goto err_out4;
971                 }
972
973                 if (tca[TCA_STAB]) {
974                         stab = qdisc_get_stab(tca[TCA_STAB]);
975                         if (IS_ERR(stab)) {
976                                 err = PTR_ERR(stab);
977                                 goto err_out4;
978                         }
979                         rcu_assign_pointer(sch->stab, stab);
980                 }
981                 if (tca[TCA_RATE]) {
982                         seqcount_t *running;
983
984                         err = -EOPNOTSUPP;
985                         if (sch->flags & TCQ_F_MQROOT)
986                                 goto err_out4;
987
988                         if ((sch->parent != TC_H_ROOT) &&
989                             !(sch->flags & TCQ_F_INGRESS) &&
990                             (!p || !(p->flags & TCQ_F_MQROOT)))
991                                 running = qdisc_root_sleeping_running(sch);
992                         else
993                                 running = &sch->running;
994
995                         err = gen_new_estimator(&sch->bstats,
996                                                 sch->cpu_bstats,
997                                                 &sch->rate_est,
998                                                 NULL,
999                                                 running,
1000                                                 tca[TCA_RATE]);
1001                         if (err)
1002                                 goto err_out4;
1003                 }
1004
1005                 qdisc_hash_add(sch);
1006
1007                 return sch;
1008         }
1009 err_out3:
1010         dev_put(dev);
1011         kfree((char *) sch - sch->padded);
1012 err_out2:
1013         module_put(ops->owner);
1014 err_out:
1015         *errp = err;
1016         return NULL;
1017
1018 err_out4:
1019         free_percpu(sch->cpu_bstats);
1020         free_percpu(sch->cpu_qstats);
1021         /*
1022          * Any broken qdiscs that would require a ops->reset() here?
1023          * The qdisc was never in action so it shouldn't be necessary.
1024          */
1025         qdisc_put_stab(rtnl_dereference(sch->stab));
1026         if (ops->destroy)
1027                 ops->destroy(sch);
1028         goto err_out3;
1029 }
1030
1031 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
1032 {
1033         struct qdisc_size_table *ostab, *stab = NULL;
1034         int err = 0;
1035
1036         if (tca[TCA_OPTIONS]) {
1037                 if (sch->ops->change == NULL)
1038                         return -EINVAL;
1039                 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
1040                 if (err)
1041                         return err;
1042         }
1043
1044         if (tca[TCA_STAB]) {
1045                 stab = qdisc_get_stab(tca[TCA_STAB]);
1046                 if (IS_ERR(stab))
1047                         return PTR_ERR(stab);
1048         }
1049
1050         ostab = rtnl_dereference(sch->stab);
1051         rcu_assign_pointer(sch->stab, stab);
1052         qdisc_put_stab(ostab);
1053
1054         if (tca[TCA_RATE]) {
1055                 /* NB: ignores errors from replace_estimator
1056                    because change can't be undone. */
1057                 if (sch->flags & TCQ_F_MQROOT)
1058                         goto out;
1059                 gen_replace_estimator(&sch->bstats,
1060                                       sch->cpu_bstats,
1061                                       &sch->rate_est,
1062                                       NULL,
1063                                       qdisc_root_sleeping_running(sch),
1064                                       tca[TCA_RATE]);
1065         }
1066 out:
1067         return 0;
1068 }
1069
/*
 * Walker state for check_loop().  The walker must stay the first member:
 * check_loop_fn() casts the struct qdisc_walker pointer it receives back
 * to a struct check_loop_arg pointer.
 */
struct check_loop_arg {
        struct qdisc_walker     w;      /* walker handed to cl_ops->walk() */
        struct Qdisc            *p;     /* qdisc that must not show up as a leaf */
        int                     depth;  /* current recursion depth */
};
1075
1076 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
1077
1078 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1079 {
1080         struct check_loop_arg   arg;
1081
1082         if (q->ops->cl_ops == NULL)
1083                 return 0;
1084
1085         arg.w.stop = arg.w.skip = arg.w.count = 0;
1086         arg.w.fn = check_loop_fn;
1087         arg.depth = depth;
1088         arg.p = p;
1089         q->ops->cl_ops->walk(q, &arg.w);
1090         return arg.w.stop ? -ELOOP : 0;
1091 }
1092
1093 static int
1094 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1095 {
1096         struct Qdisc *leaf;
1097         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1098         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1099
1100         leaf = cops->leaf(q, cl);
1101         if (leaf) {
1102                 if (leaf == arg->p || arg->depth > 7)
1103                         return -ELOOP;
1104                 return check_loop(leaf, arg->p, arg->depth + 1);
1105         }
1106         return 0;
1107 }
1108
1109 /*
1110  * Delete/get qdisc.
1111  */
1112
1113 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1114 {
1115         struct net *net = sock_net(skb->sk);
1116         struct tcmsg *tcm = nlmsg_data(n);
1117         struct nlattr *tca[TCA_MAX + 1];
1118         struct net_device *dev;
1119         u32 clid;
1120         struct Qdisc *q = NULL;
1121         struct Qdisc *p = NULL;
1122         int err;
1123
1124         if ((n->nlmsg_type != RTM_GETQDISC) &&
1125             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1126                 return -EPERM;
1127
1128         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1129         if (err < 0)
1130                 return err;
1131
1132         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1133         if (!dev)
1134                 return -ENODEV;
1135
1136         clid = tcm->tcm_parent;
1137         if (clid) {
1138                 if (clid != TC_H_ROOT) {
1139                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1140                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1141                                 if (!p)
1142                                         return -ENOENT;
1143                                 q = qdisc_leaf(p, clid);
1144                         } else if (dev_ingress_queue(dev)) {
1145                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1146                         }
1147                 } else {
1148                         q = dev->qdisc;
1149                 }
1150                 if (!q)
1151                         return -ENOENT;
1152
1153                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1154                         return -EINVAL;
1155         } else {
1156                 q = qdisc_lookup(dev, tcm->tcm_handle);
1157                 if (!q)
1158                         return -ENOENT;
1159         }
1160
1161         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1162                 return -EINVAL;
1163
1164         if (n->nlmsg_type == RTM_DELQDISC) {
1165                 if (!clid)
1166                         return -EINVAL;
1167                 if (q->handle == 0)
1168                         return -ENOENT;
1169                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1170                 if (err != 0)
1171                         return err;
1172         } else {
1173                 qdisc_notify(net, skb, n, clid, NULL, q);
1174         }
1175         return 0;
1176 }
1177
1178 /*
1179  * Create/change qdisc.
1180  */
1181
/*
 * Netlink handler that creates, replaces or changes a qdisc.
 *
 * Requires CAP_NET_ADMIN in the netns' user namespace.  Depending on
 * tcm_parent, tcm_handle and the NLM_F_CREATE/REPLACE/EXCL flags the
 * request ends in one of three ways:
 *   - the located qdisc has its parameters changed via qdisc_change();
 *   - an already existing qdisc is grafted under the given parent
 *     (label "graft");
 *   - a new qdisc is created with qdisc_create() and then grafted
 *     (label "create_n_graft").
 * If qdisc_create() reports -EAGAIN the whole request is re-parsed and
 * replayed from the "replay" label.
 *
 * Returns 0 or a negative errno.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
        struct net *net = sock_net(skb->sk);
        struct tcmsg *tcm;
        struct nlattr *tca[TCA_MAX + 1];
        struct net_device *dev;
        u32 clid;
        struct Qdisc *q, *p;
        int err;

        if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

replay:
        /* Reinit, just in case something touches this. */
        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
        if (err < 0)
                return err;

        tcm = nlmsg_data(n);
        clid = tcm->tcm_parent;
        q = p = NULL;

        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
        if (!dev)
                return -ENODEV;


        if (clid) {
                /* Find the qdisc currently attached at the requested parent. */
                if (clid != TC_H_ROOT) {
                        if (clid != TC_H_INGRESS) {
                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
                                if (!p)
                                        return -ENOENT;
                                q = qdisc_leaf(p, clid);
                        } else if (dev_ingress_queue_create(dev)) {
                                q = dev_ingress_queue(dev)->qdisc_sleeping;
                        }
                } else {
                        q = dev->qdisc;
                }

                /* It may be default qdisc, ignore it */
                if (q && q->handle == 0)
                        q = NULL;

                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
                        if (tcm->tcm_handle) {
                                /* A handle was given and it is not the one
                                 * of the qdisc attached here (if any).
                                 */
                                if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
                                        return -EEXIST;
                                if (TC_H_MIN(tcm->tcm_handle))
                                        return -EINVAL;
                                q = qdisc_lookup(dev, tcm->tcm_handle);
                                if (!q)
                                        goto create_n_graft;
                                if (n->nlmsg_flags & NLM_F_EXCL)
                                        return -EEXIST;
                                if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
                                        return -EINVAL;
                                /* Refuse to graft a qdisc under itself or
                                 * under one of its own descendants.
                                 */
                                if (q == p ||
                                    (p && check_loop(q, p, 0)))
                                        return -ELOOP;
                                /* An existing qdisc gets grafted: take an
                                 * extra reference before jumping to graft.
                                 */
                                atomic_inc(&q->refcnt);
                                goto graft;
                        } else {
                                if (!q)
                                        goto create_n_graft;

                                /* This magic test requires explanation.
                                 *
                                 *   We know, that some child q is already
                                 *   attached to this parent and have choice:
                                 *   either to change it or to create/graft new one.
                                 *
                                 *   1. We are allowed to create/graft only
                                 *   if CREATE and REPLACE flags are set.
                                 *
                                 *   2. If EXCL is set, requestor wanted to say,
                                 *   that qdisc tcm_handle is not expected
                                 *   to exist, so that we choose create/graft too.
                                 *
                                 *   3. The last case is when no flags are set.
                                 *   Alas, it is sort of hole in API, we
                                 *   cannot decide what to do unambiguously.
                                 *   For now we select create/graft, if
                                 *   user gave KIND, which does not match existing.
                                 */
                                if ((n->nlmsg_flags & NLM_F_CREATE) &&
                                    (n->nlmsg_flags & NLM_F_REPLACE) &&
                                    ((n->nlmsg_flags & NLM_F_EXCL) ||
                                     (tca[TCA_KIND] &&
                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
                                        goto create_n_graft;
                        }
                }
        } else {
                /* No parent: the qdisc can only be addressed by handle. */
                if (!tcm->tcm_handle)
                        return -EINVAL;
                q = qdisc_lookup(dev, tcm->tcm_handle);
        }

        /* Change qdisc parameters */
        if (q == NULL)
                return -ENOENT;
        if (n->nlmsg_flags & NLM_F_EXCL)
                return -EEXIST;
        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
                return -EINVAL;
        err = qdisc_change(q, tca);
        if (err == 0)
                qdisc_notify(net, skb, n, clid, NULL, q);
        return err;

create_n_graft:
        if (!(n->nlmsg_flags & NLM_F_CREATE))
                return -ENOENT;
        if (clid == TC_H_INGRESS) {
                /* Ingress: tcm_parent doubles as the handle argument. */
                if (dev_ingress_queue(dev))
                        q = qdisc_create(dev, dev_ingress_queue(dev), p,
                                         tcm->tcm_parent, tcm->tcm_parent,
                                         tca, &err);
                else
                        err = -ENOENT;
        } else {
                struct netdev_queue *dev_queue;

                /* Pick the queue: let the parent choose if it can, else
                 * use the parent's own queue, else tx queue 0.
                 */
                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
                else if (p)
                        dev_queue = p->dev_queue;
                else
                        dev_queue = netdev_get_tx_queue(dev, 0);

                q = qdisc_create(dev, dev_queue, p,
                                 tcm->tcm_parent, tcm->tcm_handle,
                                 tca, &err);
        }
        if (q == NULL) {
                /* -EAGAIN: qdisc_create() asks for the request to be replayed. */
                if (err == -EAGAIN)
                        goto replay;
                return err;
        }

graft:
        err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
        if (err) {
                /* Graft failed: give up the qdisc created above, or the
                 * reference taken before jumping here.
                 */
                if (q)
                        qdisc_destroy(q);
                return err;
        }

        return 0;
}
1335
1336 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1337                          u32 portid, u32 seq, u16 flags, int event)
1338 {
1339         struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
1340         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
1341         struct tcmsg *tcm;
1342         struct nlmsghdr  *nlh;
1343         unsigned char *b = skb_tail_pointer(skb);
1344         struct gnet_dump d;
1345         struct qdisc_size_table *stab;
1346         __u32 qlen;
1347
1348         cond_resched();
1349         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1350         if (!nlh)
1351                 goto out_nlmsg_trim;
1352         tcm = nlmsg_data(nlh);
1353         tcm->tcm_family = AF_UNSPEC;
1354         tcm->tcm__pad1 = 0;
1355         tcm->tcm__pad2 = 0;
1356         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1357         tcm->tcm_parent = clid;
1358         tcm->tcm_handle = q->handle;
1359         tcm->tcm_info = atomic_read(&q->refcnt);
1360         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1361                 goto nla_put_failure;
1362         if (q->ops->dump && q->ops->dump(q, skb) < 0)
1363                 goto nla_put_failure;
1364         qlen = q->q.qlen;
1365
1366         stab = rtnl_dereference(q->stab);
1367         if (stab && qdisc_dump_stab(skb, stab) < 0)
1368                 goto nla_put_failure;
1369
1370         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1371                                          NULL, &d, TCA_PAD) < 0)
1372                 goto nla_put_failure;
1373
1374         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1375                 goto nla_put_failure;
1376
1377         if (qdisc_is_percpu_stats(q)) {
1378                 cpu_bstats = q->cpu_bstats;
1379                 cpu_qstats = q->cpu_qstats;
1380         }
1381
1382         if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
1383                                   &d, cpu_bstats, &q->bstats) < 0 ||
1384             gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1385             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
1386                 goto nla_put_failure;
1387
1388         if (gnet_stats_finish_copy(&d) < 0)
1389                 goto nla_put_failure;
1390
1391         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1392         return skb->len;
1393
1394 out_nlmsg_trim:
1395 nla_put_failure:
1396         nlmsg_trim(skb, b);
1397         return -1;
1398 }
1399
1400 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1401 {
1402         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1403 }
1404
1405 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1406                         struct nlmsghdr *n, u32 clid,
1407                         struct Qdisc *old, struct Qdisc *new)
1408 {
1409         struct sk_buff *skb;
1410         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1411
1412         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1413         if (!skb)
1414                 return -ENOBUFS;
1415
1416         if (old && !tc_qdisc_dump_ignore(old)) {
1417                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1418                                   0, RTM_DELQDISC) < 0)
1419                         goto err_out;
1420         }
1421         if (new && !tc_qdisc_dump_ignore(new)) {
1422                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1423                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1424                         goto err_out;
1425         }
1426
1427         if (skb->len)
1428                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1429                                       n->nlmsg_flags & NLM_F_ECHO);
1430
1431 err_out:
1432         kfree_skb(skb);
1433         return -EINVAL;
1434 }
1435
1436 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1437                               struct netlink_callback *cb,
1438                               int *q_idx_p, int s_q_idx, bool recur)
1439 {
1440         int ret = 0, q_idx = *q_idx_p;
1441         struct Qdisc *q;
1442         int b;
1443
1444         if (!root)
1445                 return 0;
1446
1447         q = root;
1448         if (q_idx < s_q_idx) {
1449                 q_idx++;
1450         } else {
1451                 if (!tc_qdisc_dump_ignore(q) &&
1452                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1453                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1454                         goto done;
1455                 q_idx++;
1456         }
1457
1458         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1459          * itself has already been dumped.
1460          *
1461          * If we've already dumped the top-level (ingress) qdisc above and the global
1462          * qdisc hashtable, we don't want to hit it again
1463          */
1464         if (!qdisc_dev(root) || !recur)
1465                 goto out;
1466
1467         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1468                 if (q_idx < s_q_idx) {
1469                         q_idx++;
1470                         continue;
1471                 }
1472                 if (!tc_qdisc_dump_ignore(q) &&
1473                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1474                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1475                         goto done;
1476                 q_idx++;
1477         }
1478
1479 out:
1480         *q_idx_p = q_idx;
1481         return ret;
1482 done:
1483         ret = -1;
1484         goto out;
1485 }
1486
1487 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1488 {
1489         struct net *net = sock_net(skb->sk);
1490         int idx, q_idx;
1491         int s_idx, s_q_idx;
1492         struct net_device *dev;
1493
1494         s_idx = cb->args[0];
1495         s_q_idx = q_idx = cb->args[1];
1496
1497         idx = 0;
1498         ASSERT_RTNL();
1499         for_each_netdev(net, dev) {
1500                 struct netdev_queue *dev_queue;
1501
1502                 if (idx < s_idx)
1503                         goto cont;
1504                 if (idx > s_idx)
1505                         s_q_idx = 0;
1506                 q_idx = 0;
1507
1508                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx, true) < 0)
1509                         goto done;
1510
1511                 dev_queue = dev_ingress_queue(dev);
1512                 if (dev_queue &&
1513                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1514                                        &q_idx, s_q_idx, false) < 0)
1515                         goto done;
1516
1517 cont:
1518                 idx++;
1519         }
1520
1521 done:
1522         cb->args[0] = idx;
1523         cb->args[1] = q_idx;
1524
1525         return skb->len;
1526 }
1527
1528
1529
1530 /************************************************
1531  *      Traffic classes manipulation.           *
1532  ************************************************/
1533
1534
1535
1536 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
1537 {
1538         struct net *net = sock_net(skb->sk);
1539         struct tcmsg *tcm = nlmsg_data(n);
1540         struct nlattr *tca[TCA_MAX + 1];
1541         struct net_device *dev;
1542         struct Qdisc *q = NULL;
1543         const struct Qdisc_class_ops *cops;
1544         unsigned long cl = 0;
1545         unsigned long new_cl;
1546         u32 portid;
1547         u32 clid;
1548         u32 qid;
1549         int err;
1550
1551         if ((n->nlmsg_type != RTM_GETTCLASS) &&
1552             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1553                 return -EPERM;
1554
1555         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1556         if (err < 0)
1557                 return err;
1558
1559         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1560         if (!dev)
1561                 return -ENODEV;
1562
1563         /*
1564            parent == TC_H_UNSPEC - unspecified parent.
1565            parent == TC_H_ROOT   - class is root, which has no parent.
1566            parent == X:0         - parent is root class.
1567            parent == X:Y         - parent is a node in hierarchy.
1568            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1569
1570            handle == 0:0         - generate handle from kernel pool.
1571            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1572            handle == X:Y         - clear.
1573            handle == X:0         - root class.
1574          */
1575
1576         /* Step 1. Determine qdisc handle X:0 */
1577
1578         portid = tcm->tcm_parent;
1579         clid = tcm->tcm_handle;
1580         qid = TC_H_MAJ(clid);
1581
1582         if (portid != TC_H_ROOT) {
1583                 u32 qid1 = TC_H_MAJ(portid);
1584
1585                 if (qid && qid1) {
1586                         /* If both majors are known, they must be identical. */
1587                         if (qid != qid1)
1588                                 return -EINVAL;
1589                 } else if (qid1) {
1590                         qid = qid1;
1591                 } else if (qid == 0)
1592                         qid = dev->qdisc->handle;
1593
1594                 /* Now qid is genuine qdisc handle consistent
1595                  * both with parent and child.
1596                  *
1597                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
1598                  */
1599                 if (portid)
1600                         portid = TC_H_MAKE(qid, portid);
1601         } else {
1602                 if (qid == 0)
1603                         qid = dev->qdisc->handle;
1604         }
1605
1606         /* OK. Locate qdisc */
1607         q = qdisc_lookup(dev, qid);
1608         if (!q)
1609                 return -ENOENT;
1610
1611         /* An check that it supports classes */
1612         cops = q->ops->cl_ops;
1613         if (cops == NULL)
1614                 return -EINVAL;
1615
1616         /* Now try to get class */
1617         if (clid == 0) {
1618                 if (portid == TC_H_ROOT)
1619                         clid = qid;
1620         } else
1621                 clid = TC_H_MAKE(qid, clid);
1622
1623         if (clid)
1624                 cl = cops->get(q, clid);
1625
1626         if (cl == 0) {
1627                 err = -ENOENT;
1628                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1629                     !(n->nlmsg_flags & NLM_F_CREATE))
1630                         goto out;
1631         } else {
1632                 switch (n->nlmsg_type) {
1633                 case RTM_NEWTCLASS:
1634                         err = -EEXIST;
1635                         if (n->nlmsg_flags & NLM_F_EXCL)
1636                                 goto out;
1637                         break;
1638                 case RTM_DELTCLASS:
1639                         err = -EOPNOTSUPP;
1640                         if (cops->delete)
1641                                 err = cops->delete(q, cl);
1642                         if (err == 0)
1643                                 tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1644                         goto out;
1645                 case RTM_GETTCLASS:
1646                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1647                         goto out;
1648                 default:
1649                         err = -EINVAL;
1650                         goto out;
1651                 }
1652         }
1653
1654         new_cl = cl;
1655         err = -EOPNOTSUPP;
1656         if (cops->change)
1657                 err = cops->change(q, clid, portid, tca, &new_cl);
1658         if (err == 0)
1659                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1660
1661 out:
1662         if (cl)
1663                 cops->put(q, cl);
1664
1665         return err;
1666 }
1667
1668
1669 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1670                           unsigned long cl,
1671                           u32 portid, u32 seq, u16 flags, int event)
1672 {
1673         struct tcmsg *tcm;
1674         struct nlmsghdr  *nlh;
1675         unsigned char *b = skb_tail_pointer(skb);
1676         struct gnet_dump d;
1677         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1678
1679         cond_resched();
1680         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1681         if (!nlh)
1682                 goto out_nlmsg_trim;
1683         tcm = nlmsg_data(nlh);
1684         tcm->tcm_family = AF_UNSPEC;
1685         tcm->tcm__pad1 = 0;
1686         tcm->tcm__pad2 = 0;
1687         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1688         tcm->tcm_parent = q->handle;
1689         tcm->tcm_handle = q->handle;
1690         tcm->tcm_info = 0;
1691         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1692                 goto nla_put_failure;
1693         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1694                 goto nla_put_failure;
1695
1696         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1697                                          NULL, &d, TCA_PAD) < 0)
1698                 goto nla_put_failure;
1699
1700         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1701                 goto nla_put_failure;
1702
1703         if (gnet_stats_finish_copy(&d) < 0)
1704                 goto nla_put_failure;
1705
1706         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1707         return skb->len;
1708
1709 out_nlmsg_trim:
1710 nla_put_failure:
1711         nlmsg_trim(skb, b);
1712         return -1;
1713 }
1714
1715 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1716                          struct nlmsghdr *n, struct Qdisc *q,
1717                          unsigned long cl, int event)
1718 {
1719         struct sk_buff *skb;
1720         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1721
1722         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1723         if (!skb)
1724                 return -ENOBUFS;
1725
1726         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1727                 kfree_skb(skb);
1728                 return -EINVAL;
1729         }
1730
1731         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1732                               n->nlmsg_flags & NLM_F_ECHO);
1733 }
1734
/*
 * Walker state for class dumps.  The walker must stay the first member:
 * qdisc_class_dump() casts the struct qdisc_walker pointer it receives
 * back to a struct qdisc_dump_args pointer.
 */
struct qdisc_dump_args {
        struct qdisc_walker     w;      /* walker handed to cl_ops->walk() */
        struct sk_buff          *skb;   /* skb the class messages are written to */
        struct netlink_callback *cb;    /* dump callback; supplies portid and seq */
};
1740
1741 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1742 {
1743         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1744
1745         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1746                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1747 }
1748
1749 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1750                                 struct tcmsg *tcm, struct netlink_callback *cb,
1751                                 int *t_p, int s_t)
1752 {
1753         struct qdisc_dump_args arg;
1754
1755         if (tc_qdisc_dump_ignore(q) ||
1756             *t_p < s_t || !q->ops->cl_ops ||
1757             (tcm->tcm_parent &&
1758              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1759                 (*t_p)++;
1760                 return 0;
1761         }
1762         if (*t_p > s_t)
1763                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1764         arg.w.fn = qdisc_class_dump;
1765         arg.skb = skb;
1766         arg.cb = cb;
1767         arg.w.stop  = 0;
1768         arg.w.skip = cb->args[1];
1769         arg.w.count = 0;
1770         q->ops->cl_ops->walk(q, &arg.w);
1771         cb->args[1] = arg.w.count;
1772         if (arg.w.stop)
1773                 return -1;
1774         (*t_p)++;
1775         return 0;
1776 }
1777
1778 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1779                                struct tcmsg *tcm, struct netlink_callback *cb,
1780                                int *t_p, int s_t)
1781 {
1782         struct Qdisc *q;
1783         int b;
1784
1785         if (!root)
1786                 return 0;
1787
1788         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1789                 return -1;
1790
1791         if (!qdisc_dev(root))
1792                 return 0;
1793
1794         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1795                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1796                         return -1;
1797         }
1798
1799         return 0;
1800 }
1801
1802 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1803 {
1804         struct tcmsg *tcm = nlmsg_data(cb->nlh);
1805         struct net *net = sock_net(skb->sk);
1806         struct netdev_queue *dev_queue;
1807         struct net_device *dev;
1808         int t, s_t;
1809
1810         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1811                 return 0;
1812         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1813         if (!dev)
1814                 return 0;
1815
1816         s_t = cb->args[0];
1817         t = 0;
1818
1819         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1820                 goto done;
1821
1822         dev_queue = dev_ingress_queue(dev);
1823         if (dev_queue &&
1824             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1825                                 &t, s_t) < 0)
1826                 goto done;
1827
1828 done:
1829         cb->args[0] = t;
1830
1831         dev_put(dev);
1832         return skb->len;
1833 }
1834
1835 /* Main classifier routine: scans classifier chain attached
1836  * to this qdisc, (optionally) tests for protocol and asks
1837  * specific classifiers.
1838  */
1839 int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
1840                 struct tcf_result *res, bool compat_mode)
1841 {
1842         __be16 protocol = tc_skb_protocol(skb);
1843 #ifdef CONFIG_NET_CLS_ACT
1844         const struct tcf_proto *old_tp = tp;
1845         int limit = 0;
1846
1847 reclassify:
1848 #endif
1849         for (; tp; tp = rcu_dereference_bh(tp->next)) {
1850                 int err;
1851
1852                 if (tp->protocol != protocol &&
1853                     tp->protocol != htons(ETH_P_ALL))
1854                         continue;
1855
1856                 err = tp->classify(skb, tp, res);
1857 #ifdef CONFIG_NET_CLS_ACT
1858                 if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode))
1859                         goto reset;
1860 #endif
1861                 if (err >= 0)
1862                         return err;
1863         }
1864
1865         return TC_ACT_UNSPEC; /* signal: continue lookup */
1866 #ifdef CONFIG_NET_CLS_ACT
1867 reset:
1868         if (unlikely(limit++ >= MAX_REC_LOOP)) {
1869                 net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n",
1870                                        tp->q->ops->id, tp->prio & 0xffff,
1871                                        ntohs(tp->protocol));
1872                 return TC_ACT_SHOT;
1873         }
1874
1875         tp = old_tp;
1876         protocol = tc_skb_protocol(skb);
1877         goto reclassify;
1878 #endif
1879 }
1880 EXPORT_SYMBOL(tc_classify);
1881
1882 bool tcf_destroy(struct tcf_proto *tp, bool force)
1883 {
1884         if (tp->ops->destroy(tp, force)) {
1885                 module_put(tp->ops->owner);
1886                 kfree_rcu(tp, rcu);
1887                 return true;
1888         }
1889
1890         return false;
1891 }
1892
1893 void tcf_destroy_chain(struct tcf_proto __rcu **fl)
1894 {
1895         struct tcf_proto *tp;
1896
1897         while ((tp = rtnl_dereference(*fl)) != NULL) {
1898                 RCU_INIT_POINTER(*fl, tp->next);
1899                 tcf_destroy(tp, true);
1900         }
1901 }
1902 EXPORT_SYMBOL(tcf_destroy_chain);
1903
1904 #ifdef CONFIG_PROC_FS
1905 static int psched_show(struct seq_file *seq, void *v)
1906 {
1907         seq_printf(seq, "%08x %08x %08x %08x\n",
1908                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1909                    1000000,
1910                    (u32)NSEC_PER_SEC / hrtimer_resolution);
1911
1912         return 0;
1913 }
1914
1915 static int psched_open(struct inode *inode, struct file *file)
1916 {
1917         return single_open(file, psched_show, NULL);
1918 }
1919
1920 static const struct file_operations psched_fops = {
1921         .owner = THIS_MODULE,
1922         .open = psched_open,
1923         .read  = seq_read,
1924         .llseek = seq_lseek,
1925         .release = single_release,
1926 };
1927
1928 static int __net_init psched_net_init(struct net *net)
1929 {
1930         struct proc_dir_entry *e;
1931
1932         e = proc_create("psched", 0, net->proc_net, &psched_fops);
1933         if (e == NULL)
1934                 return -ENOMEM;
1935
1936         return 0;
1937 }
1938
1939 static void __net_exit psched_net_exit(struct net *net)
1940 {
1941         remove_proc_entry("psched", net->proc_net);
1942 }
1943 #else
1944 static int __net_init psched_net_init(struct net *net)
1945 {
1946         return 0;
1947 }
1948
1949 static void __net_exit psched_net_exit(struct net *net)
1950 {
1951 }
1952 #endif
1953
1954 static struct pernet_operations psched_net_ops = {
1955         .init = psched_net_init,
1956         .exit = psched_net_exit,
1957 };
1958
1959 static int __init pktsched_init(void)
1960 {
1961         int err;
1962
1963         err = register_pernet_subsys(&psched_net_ops);
1964         if (err) {
1965                 pr_err("pktsched_init: "
1966                        "cannot initialize per netns operations\n");
1967                 return err;
1968         }
1969
1970         register_qdisc(&pfifo_fast_ops);
1971         register_qdisc(&pfifo_qdisc_ops);
1972         register_qdisc(&bfifo_qdisc_ops);
1973         register_qdisc(&pfifo_head_drop_qdisc_ops);
1974         register_qdisc(&mq_qdisc_ops);
1975         register_qdisc(&noqueue_qdisc_ops);
1976
1977         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1978         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1979         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1980         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1981         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1982         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1983
1984         return 0;
1985 }
1986
1987 subsys_initcall(pktsched_init);