2 * net/sched/sch_generic.c Generic packet scheduler routines.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 * Jamal Hadi Salim, <hadi@cyberus.ca> 990601
14 #include <asm/uaccess.h>
15 #include <asm/system.h>
16 #include <linux/bitops.h>
17 #include <linux/module.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/string.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/netdevice.h>
29 #include <linux/skbuff.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/init.h>
32 #include <linux/rcupdate.h>
33 #include <linux/list.h>
35 #include <net/pkt_sched.h>
/* Internal return codes used by the TX path helpers below.
 * They are negative so they cannot collide with qdisc_qlen() results
 * (which are >= 0). DROP means the skb was freed; QUEUE means it was
 * (or should be) put back on the qdisc for a later retry.
 */
37 #define SCHED_TX_DROP -2
38 #define SCHED_TX_QUEUE -3
40 /* Main transmission queue. */
42 /* Modifications to data participating in scheduling must be protected with
43 * dev->queue_lock spinlock.
45 * The idea is the following:
46 * - enqueue, dequeue are serialized via top level device
47 * spinlock dev->queue_lock.
48 * - ingress filtering is serialized via top level device
49 * spinlock dev->ingress_lock.
50 * - updates to tree and tree walking are only done under the rtnl mutex.
/* Lock the whole qdisc tree of a device for modification.
 * Takes dev->queue_lock with BHs disabled, then dev->ingress_lock —
 * qdisc_unlock_tree() must release them in the reverse order.
 */
53 void qdisc_lock_tree(struct net_device *dev)
55 spin_lock_bh(&dev->queue_lock);
56 spin_lock(&dev->ingress_lock);
/* Counterpart of qdisc_lock_tree(): drop ingress_lock first, then
 * queue_lock (re-enabling BHs) — strict reverse of acquisition order.
 */
59 void qdisc_unlock_tree(struct net_device *dev)
61 spin_unlock(&dev->ingress_lock);
62 spin_unlock_bh(&dev->queue_lock);
/* Sanity-checked queue length of a qdisc. A negative qlen means the
 * enqueue/dequeue accounting is corrupted, so BUG out loudly.
 * NOTE(review): the return statement (presumably "return q->q.qlen;")
 * appears to have been dropped from this copy — confirm against the
 * original file.
 */
65 static inline int qdisc_qlen(struct Qdisc *q)
67 BUG_ON((int) q->q.qlen < 0);
/* Called when netif_tx_trylock() failed on an LLTX driver.
 * If this CPU already owns the xmit lock we are recursing into our own
 * transmit path (a dead loop, usually a buggy driver) — warn loudly.
 * Otherwise another CPU legitimately holds the lock: count the
 * collision in the per-CPU softnet stats and ask the caller to requeue.
 * NOTE(review): the dead-loop branch's return (likely SCHED_TX_DROP)
 * is missing from this copy — verify before relying on it.
 */
71 static inline int handle_dev_cpu_collision(struct net_device *dev)
73 if (unlikely(dev->xmit_lock_owner == smp_processor_id())) {
76 "Dead loop on netdevice %s, fix it urgently!\n",
80 __get_cpu_var(netdev_rx_stat).cpu_collision++;
81 return SCHED_TX_QUEUE;
/* Put an skb back on its qdisc after a failed transmit attempt.
 * A non-NULL skb->next would indicate a GSO segment list, which is
 * handled specially (that path is truncated in this copy).
 */
85 do_dev_requeue(struct sk_buff *skb, struct net_device *dev, struct Qdisc *q)
88 if (unlikely(skb->next))
91 q->ops->requeue(skb, q);
92 /* XXX: Could netif_schedule fail? Or is the fact we are
93 * requeueing imply the hardware path is closed
94 * and even if we fail, some interrupt will wake us
/* Fetch the next packet to transmit: a previously stashed GSO skb
 * takes priority; otherwise the qdisc is presumably dequeued — the
 * dequeue fallback lines are missing from this copy (TODO confirm).
 */
100 static inline struct sk_buff *
101 try_get_tx_pkt(struct net_device *dev, struct Qdisc *q)
103 struct sk_buff *skb = dev->gso_skb;
/* The driver's TX lock was contended. Decide the skb's fate via
 * handle_dev_cpu_collision(): on SCHED_TX_DROP the packet is freed
 * (drop lines truncated here) and the current qlen is returned;
 * otherwise the packet is requeued for a later attempt.
 */
114 tx_islocked(struct sk_buff *skb, struct net_device *dev, struct Qdisc *q)
116 int ret = handle_dev_cpu_collision(dev);
118 if (ret == SCHED_TX_DROP) {
120 return qdisc_qlen(q);
123 return do_dev_requeue(skb, dev, q);
128 NOTE: Called under dev->queue_lock with locally disabled BH.
130 __LINK_STATE_QDISC_RUNNING guarantees only one CPU
131 can enter this region at a time.
133 dev->queue_lock serializes queue accesses for this device
134 AND dev->qdisc pointer itself.
136 netif_tx_lock serializes accesses to device driver.
138 dev->queue_lock and netif_tx_lock are mutually exclusive,
139 if one is grabbed, another must be free.
141 Multiple CPUs may contend for the two locks.
143 Note, that this procedure can be called by a watchdog timer
145 Returns to the caller:
146 Returns: 0 - queue is empty or throttled.
147 >0 - queue is not empty.
/* Dequeue one packet and hand it to the driver.
 * Called under dev->queue_lock with BHs disabled (see the locking
 * comment above this function in the original file). queue_lock is
 * dropped around the actual hard_start_xmit and retaken afterwards,
 * so no qdisc state may be trusted across that window.
 * Return: qdisc_qlen(q) style value — 0 if empty/throttled, >0 if
 * more packets are pending (caller loops in __qdisc_run()).
 * NOTE(review): declarations of skb/ret and several closing braces
 * are missing from this copy of the file.
 */
151 static inline int qdisc_restart(struct net_device *dev)
153 struct Qdisc *q = dev->qdisc;
154 unsigned lockless = (dev->features & NETIF_F_LLTX);
158 skb = try_get_tx_pkt(dev, q);
162 /* we have a packet to send */
/* Driver TX lock contended: drop or requeue via the collision path. */
164 if (!netif_tx_trylock(dev))
165 return tx_islocked(skb, dev, q);
/* Release queue_lock while the driver transmits — the two locks are
 * never held together. */
168 spin_unlock(&dev->queue_lock);
170 ret = NETDEV_TX_BUSY;
171 if (!netif_queue_stopped(dev))
172 /* churn baby churn .. */
173 ret = dev_hard_start_xmit(skb, dev);
176 netif_tx_unlock(dev);
178 spin_lock(&dev->queue_lock);
180 /* we need to refresh q because it may be invalid since
181 * we dropped dev->queue_lock earlier ...
182 * So don't try to be clever grasshopper
185 /* most likely result, packet went ok */
186 if (ret == NETDEV_TX_OK)
187 return qdisc_qlen(q);
188 /* only for lockless drivers .. */
189 if (ret == NETDEV_TX_LOCKED && lockless)
190 return tx_islocked(skb, dev, q);
/* Anything other than TX_BUSY at this point is a driver bug. */
192 if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
193 printk(KERN_WARNING " BUG %s code %d qlen %d\n",dev->name, ret, q->q.qlen);
/* Driver was busy: put the packet back for a later retry. */
195 return do_dev_requeue(skb, dev, q);
/* Drain the device's qdisc: keep calling qdisc_restart() until the
 * queue is empty/throttled or the driver stops the queue, then clear
 * the QDISC_RUNNING bit that guarantees single-CPU entry.
 */
199 void __qdisc_run(struct net_device *dev)
202 if (!qdisc_restart(dev))
204 } while (!netif_queue_stopped(dev));
206 clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
/* TX watchdog timer callback. If the device is present, running and
 * has carrier, and its queue has been stopped for longer than
 * watchdog_timeo since the last transmit, declare a transmit timeout
 * and invoke the driver's tx_timeout() handler; then re-arm the timer.
 * Runs under netif_tx_lock (the lock acquisition line is missing from
 * this copy; the unlock at the end is visible).
 */
209 static void dev_watchdog(unsigned long arg)
211 struct net_device *dev = (struct net_device *)arg;
214 if (dev->qdisc != &noop_qdisc) {
215 if (netif_device_present(dev) &&
216 netif_running(dev) &&
217 netif_carrier_ok(dev)) {
218 if (netif_queue_stopped(dev) &&
219 time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {
221 printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
223 dev->tx_timeout(dev);
/* mod_timer() returning 0 means the timer was not already pending —
 * the (truncated) branch presumably takes a dev reference. */
225 if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
229 netif_tx_unlock(dev);
/* One-time setup of the per-device TX watchdog timer (not armed here;
 * see __netdev_watchdog_up()).
 */
234 static void dev_watchdog_init(struct net_device *dev)
236 init_timer(&dev->watchdog_timer);
237 dev->watchdog_timer.data = (unsigned long)dev;
238 dev->watchdog_timer.function = dev_watchdog;
/* Arm the TX watchdog if the driver has a tx_timeout handler.
 * A missing/invalid watchdog_timeo defaults to 5 seconds.
 */
241 void __netdev_watchdog_up(struct net_device *dev)
243 if (dev->tx_timeout) {
244 if (dev->watchdog_timeo <= 0)
245 dev->watchdog_timeo = 5*HZ;
246 if (!mod_timer(&dev->watchdog_timer,
247 round_jiffies(jiffies + dev->watchdog_timeo)))
/* Thin wrapper used from dev_activate(); delegates to
 * __netdev_watchdog_up().
 */
252 static void dev_watchdog_up(struct net_device *dev)
254 __netdev_watchdog_up(dev);
/* Stop the TX watchdog under netif_tx_lock. The (truncated) body of
 * the del_timer branch presumably drops the dev reference taken when
 * the timer was armed.
 */
257 static void dev_watchdog_down(struct net_device *dev)
259 netif_tx_lock_bh(dev);
260 if (del_timer(&dev->watchdog_timer))
262 netif_tx_unlock_bh(dev);
/* Mark the device as having link carrier. Fires a linkwatch event
 * only on an actual off->on transition, and (re)arms the TX watchdog
 * if the interface is up.
 */
265 void netif_carrier_on(struct net_device *dev)
267 if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
268 linkwatch_fire_event(dev);
269 if (netif_running(dev))
270 __netdev_watchdog_up(dev);
/* Mark the device as having lost carrier; fires a linkwatch event
 * only on an actual on->off transition.
 */
273 void netif_carrier_off(struct net_device *dev)
275 if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
276 linkwatch_fire_event(dev);
279 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
280 under all circumstances. It is difficult to invent anything faster or
/* The noop qdisc ops: enqueue drops everything (body truncated here),
 * dequeue never returns a packet, and requeue logs — hitting requeue
 * on a noop qdisc indicates a bug elsewhere.
 */
284 static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
290 static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
295 static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
298 printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
/* Ops table and singleton instance of the noop qdisc — the default
 * "black hole" discipline attached while a device is inactive.
 * TCQ_F_BUILTIN marks it as a static object that must never be freed
 * by qdisc_destroy().
 */
304 struct Qdisc_ops noop_qdisc_ops = {
307 .enqueue = noop_enqueue,
308 .dequeue = noop_dequeue,
309 .requeue = noop_requeue,
310 .owner = THIS_MODULE,
313 struct Qdisc noop_qdisc = {
314 .enqueue = noop_enqueue,
315 .dequeue = noop_dequeue,
316 .flags = TCQ_F_BUILTIN,
317 .ops = &noop_qdisc_ops,
318 .list = LIST_HEAD_INIT(noop_qdisc.list),
/* The noqueue qdisc: used for virtual devices (tx_queue_len == 0)
 * that transmit directly without queueing. Note the instance's
 * .enqueue is deliberately left unset (NULL) — that is how the fast
 * path recognizes "no queueing" — while the ops table reuses the
 * noop stubs.
 */
321 static struct Qdisc_ops noqueue_qdisc_ops = {
324 .enqueue = noop_enqueue,
325 .dequeue = noop_dequeue,
326 .requeue = noop_requeue,
327 .owner = THIS_MODULE,
330 static struct Qdisc noqueue_qdisc = {
332 .dequeue = noop_dequeue,
333 .flags = TCQ_F_BUILTIN,
334 .ops = &noqueue_qdisc_ops,
335 .list = LIST_HEAD_INIT(noqueue_qdisc.list),
/* Maps the 16 TC_PRIO_* values to one of the 3 pfifo_fast bands
 * (0 = highest priority, 2 = lowest).
 */
339 static const u8 prio2band[TC_PRIO_MAX+1] =
340 { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
342 /* 3-band FIFO queue: old style, but should be a bit faster than
343 generic prio+fifo combination.
346 #define PFIFO_FAST_BANDS 3
/* Select the pfifo_fast band list for an skb from its priority.
 * The priv area is an array of PFIFO_FAST_BANDS sk_buff_heads;
 * skb->priority is masked with TC_PRIO_MAX before the table lookup.
 */
348 static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
351 struct sk_buff_head *list = qdisc_priv(qdisc);
352 return list + prio2band[skb->priority & TC_PRIO_MAX];
/* Enqueue on the band chosen by the skb's priority, bounded by the
 * device's tx_queue_len; over-limit packets are dropped with stats
 * accounting via qdisc_drop().
 */
355 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
357 struct sk_buff_head *list = prio2list(skb, qdisc);
359 if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
361 return __qdisc_enqueue_tail(skb, qdisc, list);
364 return qdisc_drop(skb, qdisc);
/* Strict-priority dequeue: scan bands 0..2 in order and take the head
 * of the first non-empty one (lower band index = higher priority).
 */
367 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
370 struct sk_buff_head *list = qdisc_priv(qdisc);
372 for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
373 if (!skb_queue_empty(list + prio)) {
375 return __qdisc_dequeue_head(qdisc, list + prio);
/* Requeue at the head of the skb's priority band so it is retried
 * first on the next dequeue.
 */
382 static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
385 return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
/* Purge all three bands and zero the backlog byte count (the
 * per-band reset also frees queued skbs and fixes up qlen).
 */
388 static void pfifo_fast_reset(struct Qdisc* qdisc)
391 struct sk_buff_head *list = qdisc_priv(qdisc);
393 for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
394 __qdisc_reset_queue(qdisc, list + prio);
396 qdisc->qstats.backlog = 0;
/* Dump the qdisc config to userspace: band count plus the static
 * prio->band map, emitted as a TCA_OPTIONS attribute (RTA_PUT jumps
 * to the rtattr_failure label on overflow; that label is truncated
 * in this copy).
 */
400 static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
402 struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
404 memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
405 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
/* Initialize the three band queues. The rtattr argument is ignored —
 * pfifo_fast takes no configuration.
 */
412 static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
415 struct sk_buff_head *list = qdisc_priv(qdisc);
417 for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
418 skb_queue_head_init(list + prio);
/* Ops table for the default 3-band pfifo_fast qdisc; priv_size
 * reserves one sk_buff_head per band in the qdisc's private area.
 */
423 static struct Qdisc_ops pfifo_fast_ops = {
425 .priv_size = PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
426 .enqueue = pfifo_fast_enqueue,
427 .dequeue = pfifo_fast_dequeue,
428 .requeue = pfifo_fast_requeue,
429 .init = pfifo_fast_init,
430 .reset = pfifo_fast_reset,
431 .dump = pfifo_fast_dump,
432 .owner = THIS_MODULE,
/* Allocate and minimally initialize a Qdisc plus its ops-private
 * area in one zeroed allocation. The raw pointer is over-allocated by
 * QDISC_ALIGNTO-1 so the Qdisc itself can be aligned; sch->padded
 * records the alignment offset so __qdisc_destroy() can free the
 * original pointer. Returns the Qdisc or ERR_PTR(-err) on failure
 * (the error-path setup lines are truncated in this copy).
 */
435 struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
442 /* ensure that the Qdisc and the private data are 32-byte aligned */
443 size = QDISC_ALIGN(sizeof(*sch));
444 size += ops->priv_size + (QDISC_ALIGNTO - 1);
446 p = kzalloc(size, GFP_KERNEL);
449 sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
450 sch->padded = (char *) sch - (char *) p;
452 INIT_LIST_HEAD(&sch->list);
453 skb_queue_head_init(&sch->q);
/* Cache enqueue/dequeue in the Qdisc itself for the hot path. */
455 sch->enqueue = ops->enqueue;
456 sch->dequeue = ops->dequeue;
459 atomic_set(&sch->refcnt, 1);
463 return ERR_PTR(-err);
/* Allocate a qdisc via qdisc_alloc() and run its ops->init (with a
 * NULL config, i.e. defaults). On init failure the qdisc is torn down
 * (that path is truncated in this copy). Stats are protected by the
 * device's queue_lock.
 */
466 struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
467 unsigned int parentid)
471 sch = qdisc_alloc(dev, ops);
474 sch->stats_lock = &dev->queue_lock;
475 sch->parent = parentid;
477 if (!ops->init || ops->init(sch, NULL) == 0)
485 /* Under dev->queue_lock and BH! */
/* Reset a qdisc to its initial (empty) state; presumably calls
 * ops->reset when present — the body is truncated in this copy.
 */
487 void qdisc_reset(struct Qdisc *qdisc)
489 struct Qdisc_ops *ops = qdisc->ops;
495 /* this is the rcu callback function to clean up a qdisc when there
496 * are no further references to it */
/* Frees the original allocation: subtract the alignment padding
 * recorded by qdisc_alloc() to recover the pointer kzalloc returned.
 */
498 static void __qdisc_destroy(struct rcu_head *head)
500 struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
501 kfree((char *) qdisc - qdisc->padded);
504 /* Under dev->queue_lock and BH! */
/* Drop a reference to a qdisc; on the last reference, unlink it,
 * kill its rate estimator, release the ops module, and defer the
 * actual free to an RCU grace period so concurrent lockless readers
 * of dev->qdisc stay safe. Builtin (static) qdiscs are never freed.
 */
506 void qdisc_destroy(struct Qdisc *qdisc)
508 struct Qdisc_ops *ops = qdisc->ops;
510 if (qdisc->flags & TCQ_F_BUILTIN ||
511 !atomic_dec_and_test(&qdisc->refcnt))
514 list_del(&qdisc->list);
515 #ifdef CONFIG_NET_ESTIMATOR
516 gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
523 module_put(ops->owner);
525 call_rcu(&qdisc->q_rcu, __qdisc_destroy);
/* Bring the device's queueing discipline online. On first activation
 * (qdisc_sleeping is still the noop placeholder) attach a default:
 * pfifo_fast for real devices (tx_queue_len > 0), the static
 * noqueue_qdisc for virtual ones. Activation is skipped until carrier
 * comes up; otherwise dev->qdisc is switched to the sleeping qdisc
 * under queue_lock (rcu_assign_pointer pairs with the RCU-deferred
 * free in qdisc_destroy) and the TX watchdog is started.
 */
528 void dev_activate(struct net_device *dev)
530 /* No queueing discipline is attached to device;
531 create default one i.e. pfifo_fast for devices,
532 which need queueing and noqueue_qdisc for
536 if (dev->qdisc_sleeping == &noop_qdisc) {
538 if (dev->tx_queue_len) {
539 qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
542 printk(KERN_INFO "%s: activation failed\n", dev->name);
545 list_add_tail(&qdisc->list, &dev->qdisc_list);
547 qdisc = &noqueue_qdisc;
549 dev->qdisc_sleeping = qdisc;
552 if (!netif_carrier_ok(dev))
553 /* Delay activation until next carrier-on event */
556 spin_lock_bh(&dev->queue_lock);
557 rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
558 if (dev->qdisc != &noqueue_qdisc) {
559 dev->trans_start = jiffies;
560 dev_watchdog_up(dev);
562 spin_unlock_bh(&dev->queue_lock);
/* Take the device's qdisc offline: swap in the noop qdisc under
 * queue_lock (the reset of the old qdisc is truncated in this copy),
 * stop the watchdog, then wait for any in-flight dev_queue_xmit and
 * for the QDISC_RUNNING flag to clear so no CPU is still inside
 * qdisc_run when we return.
 */
565 void dev_deactivate(struct net_device *dev)
570 spin_lock_bh(&dev->queue_lock);
572 dev->qdisc = &noop_qdisc;
578 spin_unlock_bh(&dev->queue_lock);
582 dev_watchdog_down(dev);
584 /* Wait for outstanding dev_queue_xmit calls. */
587 /* Wait for outstanding qdisc_run calls. */
588 while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
/* Called at device registration: install the noop qdisc as both the
 * active and sleeping discipline, init the per-device qdisc list
 * (all under the tree locks), and set up the TX watchdog timer.
 */
592 void dev_init_scheduler(struct net_device *dev)
594 qdisc_lock_tree(dev);
595 dev->qdisc = &noop_qdisc;
596 dev->qdisc_sleeping = &noop_qdisc;
597 INIT_LIST_HEAD(&dev->qdisc_list);
598 qdisc_unlock_tree(dev);
600 dev_watchdog_init(dev);
/* Called at device unregistration: destroy the attached (sleeping)
 * qdisc and any ingress qdisc, leaving the noop placeholders in
 * place. Runs under the qdisc tree locks; the watchdog must already
 * be stopped (BUG_TRAP).
 */
603 void dev_shutdown(struct net_device *dev)
607 qdisc_lock_tree(dev);
608 qdisc = dev->qdisc_sleeping;
609 dev->qdisc = &noop_qdisc;
610 dev->qdisc_sleeping = &noop_qdisc;
611 qdisc_destroy(qdisc);
612 #if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
613 if ((qdisc = dev->qdisc_ingress) != NULL) {
614 dev->qdisc_ingress = NULL;
615 qdisc_destroy(qdisc);
618 BUG_TRAP(!timer_pending(&dev->watchdog_timer));
619 qdisc_unlock_tree(dev);
622 EXPORT_SYMBOL(netif_carrier_on);
623 EXPORT_SYMBOL(netif_carrier_off);
624 EXPORT_SYMBOL(noop_qdisc);
625 EXPORT_SYMBOL(qdisc_create_dflt);
626 EXPORT_SYMBOL(qdisc_destroy);
627 EXPORT_SYMBOL(qdisc_reset);
628 EXPORT_SYMBOL(qdisc_lock_tree);
629 EXPORT_SYMBOL(qdisc_unlock_tree);