net/sched/sch_generic.c

   1 /*
   2  * net/sched/sch_generic.c      Generic packet scheduler routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
  11  *              - Ingress support
  12  */
  13
  14 #include <asm/uaccess.h>
  15 #include <asm/system.h>
  16 #include <linux/bitops.h>
  17 #include <linux/module.h>
  18 #include <linux/types.h>
  19 #include <linux/kernel.h>
  20 #include <linux/sched.h>
  21 #include <linux/string.h>
  22 #include <linux/mm.h>
  23 #include <linux/socket.h>
  24 #include <linux/sockios.h>
  25 #include <linux/in.h>
  26 #include <linux/errno.h>
  27 #include <linux/interrupt.h>
  28 #include <linux/netdevice.h>
  29 #include <linux/skbuff.h>
  30 #include <linux/rtnetlink.h>
  31 #include <linux/init.h>
  32 #include <linux/rcupdate.h>
  33 #include <linux/list.h>
  34 #include <net/sock.h>
  35 #include <net/pkt_sched.h>
  36
  37 #define SCHED_TX_DROP -2
  38 #define SCHED_TX_QUEUE -3
  39
  40 /* Main transmission queue. */
  41
  42 /* Modifications to data participating in scheduling must be protected with
  43  * dev->queue_lock spinlock.
  44  *
  45  * The idea is the following:
  46  * - enqueue, dequeue are serialized via top level device
  47  *   spinlock dev->queue_lock.
  48  * - ingress filtering is serialized via top level device
  49  *   spinlock dev->ingress_lock.
  50  * - updates to tree and tree walking are only done under the rtnl mutex.
  51  */
  52
  53 void qdisc_lock_tree(struct net_device *dev)
  54 {
  55         spin_lock_bh(&dev->queue_lock);
  56         spin_lock(&dev->ingress_lock);
  57 }
  58
  59 void qdisc_unlock_tree(struct net_device *dev)
  60 {
  61         spin_unlock(&dev->ingress_lock);
  62         spin_unlock_bh(&dev->queue_lock);
  63 }
  64
  65 static inline int qdisc_qlen(struct Qdisc *q)
  66 {
  67         BUG_ON((int) q->q.qlen < 0);
  68         return q->q.qlen;
  69 }
  70
  71 static inline int handle_dev_cpu_collision(struct net_device *dev)
  72 {
  73         if (unlikely(dev->xmit_lock_owner == smp_processor_id())) {
  74                 if (net_ratelimit())
  75                         printk(KERN_WARNING
  76                                "Dead loop on netdevice %s, fix it urgently!\n",
  77                                dev->name);
  78                 return SCHED_TX_DROP;
  79         }
  80         __get_cpu_var(netdev_rx_stat).cpu_collision++;
  81         return SCHED_TX_QUEUE;
  82 }
  83
  84 static inline int
  85 do_dev_requeue(struct sk_buff *skb, struct net_device *dev, struct Qdisc *q)
  86 {
  87
  88         if (unlikely(skb->next))
  89                 dev->gso_skb = skb;
  90         else
  91                 q->ops->requeue(skb, q);
  92         /* XXX: Could netif_schedule fail? Or is the fact we are
  93          * requeueing imply the hardware path is closed
  94          * and even if we fail, some interupt will wake us
  95          */
  96         netif_schedule(dev);
  97         return 0;
  98 }
  99
 100 static inline struct sk_buff *
 101 try_get_tx_pkt(struct net_device *dev, struct Qdisc *q)
 102 {
 103         struct sk_buff *skb = dev->gso_skb;
 104
 105         if (skb)
 106                 dev->gso_skb = NULL;
 107         else
 108                 skb = q->dequeue(q);
 109
 110         return skb;
 111 }
 112
 113 static inline int
 114 tx_islocked(struct sk_buff *skb, struct net_device *dev, struct Qdisc *q)
 115 {
 116         int ret = handle_dev_cpu_collision(dev);
 117
 118         if (ret == SCHED_TX_DROP) {
 119                 kfree_skb(skb);
 120                 return qdisc_qlen(q);
 121         }
 122
 123         return do_dev_requeue(skb, dev, q);
 124 }
 125
 126
 127 /*
 128    NOTE: Called under dev->queue_lock with locally disabled BH.
 129
 130    __LINK_STATE_QDISC_RUNNING guarantees only one CPU
 131    can enter this region at a time.
 132
 133    dev->queue_lock serializes queue accesses for this device
 134    AND dev->qdisc pointer itself.
 135
 136    netif_tx_lock serializes accesses to device driver.
 137
 138    dev->queue_lock and netif_tx_lock are mutually exclusive,
 139    if one is grabbed, another must be free.
 140
 141    Multiple CPUs may contend for the two locks.
 142
 143    Note, that this procedure can be called by a watchdog timer
 144
 145    Returns to the caller:
 146    Returns:  0  - queue is empty or throttled.
 147             >0  - queue is not empty.
 148
 149 */
 150
 151 static inline int qdisc_restart(struct net_device *dev)
 152 {
 153         struct Qdisc *q = dev->qdisc;
 154         unsigned lockless = (dev->features & NETIF_F_LLTX);
 155         struct sk_buff *skb;
 156         int ret;
 157
 158         skb = try_get_tx_pkt(dev, q);
 159         if (skb == NULL)
 160                 return 0;
 161
 162         /* we have a packet to send */
 163         if (!lockless) {
 164                 if (!netif_tx_trylock(dev))
 165                         return tx_islocked(skb, dev, q);
 166         }
 167         /* all clear .. */
 168         spin_unlock(&dev->queue_lock);
 169
 170         ret = NETDEV_TX_BUSY;
 171         if (!netif_queue_stopped(dev))
 172                 /* churn baby churn .. */
 173                 ret = dev_hard_start_xmit(skb, dev);
 174
 175         if (!lockless)
 176                 netif_tx_unlock(dev);
 177
 178         spin_lock(&dev->queue_lock);
 179
 180         /* we need to refresh q because it may be invalid since
 181          * we dropped  dev->queue_lock earlier ...
 182          * So dont try to be clever grasshopper
 183          */
 184         q = dev->qdisc;
 185         /* most likely result, packet went ok */
 186         if (ret == NETDEV_TX_OK)
 187                 return qdisc_qlen(q);
 188         /* only for lockless drivers .. */
 189         if (ret == NETDEV_TX_LOCKED && lockless)
 190                 return tx_islocked(skb, dev, q);
 191
 192         if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
 193                 printk(KERN_WARNING " BUG %s code %d qlen %d\n",dev->name, ret, q->q.qlen);
 194
 195         return do_dev_requeue(skb, dev, q);
 196 }
 197
 198
 199 void __qdisc_run(struct net_device *dev)
 200 {
 201         do {
 202                 if (!qdisc_restart(dev))
 203                         break;
 204         } while (!netif_queue_stopped(dev));
 205
 206         clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
 207 }
 208
 209 static void dev_watchdog(unsigned long arg)
 210 {
 211         struct net_device *dev = (struct net_device *)arg;
 212
 213         netif_tx_lock(dev);
 214         if (dev->qdisc != &noop_qdisc) {
 215                 if (netif_device_present(dev) &&
 216                     netif_running(dev) &&
 217                     netif_carrier_ok(dev)) {
 218                         if (netif_queue_stopped(dev) &&
 219                             time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {
 220
 221                                 printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
 222                                        dev->name);
 223                                 dev->tx_timeout(dev);
 224                         }
 225                         if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
 226                                 dev_hold(dev);
 227                 }
 228         }
 229         netif_tx_unlock(dev);
 230
 231         dev_put(dev);
 232 }
 233
 234 static void dev_watchdog_init(struct net_device *dev)
 235 {
 236         init_timer(&dev->watchdog_timer);
 237         dev->watchdog_timer.data = (unsigned long)dev;
 238         dev->watchdog_timer.function = dev_watchdog;
 239 }
 240
 241 void __netdev_watchdog_up(struct net_device *dev)
 242 {
 243         if (dev->tx_timeout) {
 244                 if (dev->watchdog_timeo <= 0)
 245                         dev->watchdog_timeo = 5*HZ;
 246                 if (!mod_timer(&dev->watchdog_timer,
 247                                round_jiffies(jiffies + dev->watchdog_timeo)))
 248                         dev_hold(dev);
 249         }
 250 }
 251
 252 static void dev_watchdog_up(struct net_device *dev)
 253 {
 254         __netdev_watchdog_up(dev);
 255 }
 256
 257 static void dev_watchdog_down(struct net_device *dev)
 258 {
 259         netif_tx_lock_bh(dev);
 260         if (del_timer(&dev->watchdog_timer))
 261                 dev_put(dev);
 262         netif_tx_unlock_bh(dev);
 263 }
 264
 265 void netif_carrier_on(struct net_device *dev)
 266 {
 267         if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
 268                 linkwatch_fire_event(dev);
 269         if (netif_running(dev))
 270                 __netdev_watchdog_up(dev);
 271 }
 272
 273 void netif_carrier_off(struct net_device *dev)
 274 {
 275         if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
 276                 linkwatch_fire_event(dev);
 277 }
 278
 279 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
 280    under all circumstances. It is difficult to invent anything faster or
 281    cheaper.
 282  */
 283
 284 static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
 285 {
 286         kfree_skb(skb);
 287         return NET_XMIT_CN;
 288 }
 289
 290 static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
 291 {
 292         return NULL;
 293 }
 294
 295 static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
 296 {
 297         if (net_ratelimit())
 298                 printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
 299                        skb->dev->name);
 300         kfree_skb(skb);
 301         return NET_XMIT_CN;
 302 }
 303
 304 struct Qdisc_ops noop_qdisc_ops = {
 305         .id             =       "noop",
 306         .priv_size      =       0,
 307         .enqueue        =       noop_enqueue,
 308         .dequeue        =       noop_dequeue,
 309         .requeue        =       noop_requeue,
 310         .owner          =       THIS_MODULE,
 311 };
 312
 313 struct Qdisc noop_qdisc = {
 314         .enqueue        =       noop_enqueue,
 315         .dequeue        =       noop_dequeue,
 316         .flags          =       TCQ_F_BUILTIN,
 317         .ops            =       &noop_qdisc_ops,
 318         .list           =       LIST_HEAD_INIT(noop_qdisc.list),
 319 };
 320
 321 static struct Qdisc_ops noqueue_qdisc_ops = {
 322         .id             =       "noqueue",
 323         .priv_size      =       0,
 324         .enqueue        =       noop_enqueue,
 325         .dequeue        =       noop_dequeue,
 326         .requeue        =       noop_requeue,
 327         .owner          =       THIS_MODULE,
 328 };
 329
 330 static struct Qdisc noqueue_qdisc = {
 331         .enqueue        =       NULL,
 332         .dequeue        =       noop_dequeue,
 333         .flags          =       TCQ_F_BUILTIN,
 334         .ops            =       &noqueue_qdisc_ops,
 335         .list           =       LIST_HEAD_INIT(noqueue_qdisc.list),
 336 };
 337
 338
 339 static const u8 prio2band[TC_PRIO_MAX+1] =
 340         { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
 341
 342 /* 3-band FIFO queue: old style, but should be a bit faster than
 343    generic prio+fifo combination.
 344  */
 345
 346 #define PFIFO_FAST_BANDS 3
 347
 348 static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
 349                                              struct Qdisc *qdisc)
 350 {
 351         struct sk_buff_head *list = qdisc_priv(qdisc);
 352         return list + prio2band[skb->priority & TC_PRIO_MAX];
 353 }
 354
 355 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
 356 {
 357         struct sk_buff_head *list = prio2list(skb, qdisc);
 358
 359         if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
 360                 qdisc->q.qlen++;
 361                 return __qdisc_enqueue_tail(skb, qdisc, list);
 362         }
 363
 364         return qdisc_drop(skb, qdisc);
 365 }
 366
 367 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
 368 {
 369         int prio;
 370         struct sk_buff_head *list = qdisc_priv(qdisc);
 371
 372         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
 373                 if (!skb_queue_empty(list + prio)) {
 374                         qdisc->q.qlen--;
 375                         return __qdisc_dequeue_head(qdisc, list + prio);
 376                 }
 377         }
 378
 379         return NULL;
 380 }
 381
 382 static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
 383 {
 384         qdisc->q.qlen++;
 385         return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
 386 }
 387
 388 static void pfifo_fast_reset(struct Qdisc* qdisc)
 389 {
 390         int prio;
 391         struct sk_buff_head *list = qdisc_priv(qdisc);
 392
 393         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
 394                 __qdisc_reset_queue(qdisc, list + prio);
 395
 396         qdisc->qstats.backlog = 0;
 397         qdisc->q.qlen = 0;
 398 }
 399
 400 static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
 401 {
 402         struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
 403
 404         memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
 405         RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
 406         return skb->len;
 407
 408 rtattr_failure:
 409         return -1;
 410 }
 411
 412 static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
 413 {
 414         int prio;
 415         struct sk_buff_head *list = qdisc_priv(qdisc);
 416
 417         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
 418                 skb_queue_head_init(list + prio);
 419
 420         return 0;
 421 }
 422
 423 static struct Qdisc_ops pfifo_fast_ops = {
 424         .id             =       "pfifo_fast",
 425         .priv_size      =       PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
 426         .enqueue        =       pfifo_fast_enqueue,
 427         .dequeue        =       pfifo_fast_dequeue,
 428         .requeue        =       pfifo_fast_requeue,
 429         .init           =       pfifo_fast_init,
 430         .reset          =       pfifo_fast_reset,
 431         .dump           =       pfifo_fast_dump,
 432         .owner          =       THIS_MODULE,
 433 };
 434
 435 struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
 436 {
 437         void *p;
 438         struct Qdisc *sch;
 439         unsigned int size;
 440         int err = -ENOBUFS;
 441
 442         /* ensure that the Qdisc and the private data are 32-byte aligned */
 443         size = QDISC_ALIGN(sizeof(*sch));
 444         size += ops->priv_size + (QDISC_ALIGNTO - 1);
 445
 446         p = kzalloc(size, GFP_KERNEL);
 447         if (!p)
 448                 goto errout;
 449         sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
 450         sch->padded = (char *) sch - (char *) p;
 451
 452         INIT_LIST_HEAD(&sch->list);
 453         skb_queue_head_init(&sch->q);
 454         sch->ops = ops;
 455         sch->enqueue = ops->enqueue;
 456         sch->dequeue = ops->dequeue;
 457         sch->dev = dev;
 458         dev_hold(dev);
 459         atomic_set(&sch->refcnt, 1);
 460
 461         return sch;
 462 errout:
 463         return ERR_PTR(-err);
 464 }
 465
 466 struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
 467                                  unsigned int parentid)
 468 {
 469         struct Qdisc *sch;
 470
 471         sch = qdisc_alloc(dev, ops);
 472         if (IS_ERR(sch))
 473                 goto errout;
 474         sch->stats_lock = &dev->queue_lock;
 475         sch->parent = parentid;
 476
 477         if (!ops->init || ops->init(sch, NULL) == 0)
 478                 return sch;
 479
 480         qdisc_destroy(sch);
 481 errout:
 482         return NULL;
 483 }
 484
 485 /* Under dev->queue_lock and BH! */
 486
 487 void qdisc_reset(struct Qdisc *qdisc)
 488 {
 489         struct Qdisc_ops *ops = qdisc->ops;
 490
 491         if (ops->reset)
 492                 ops->reset(qdisc);
 493 }
 494
 495 /* this is the rcu callback function to clean up a qdisc when there
 496  * are no further references to it */
 497
 498 static void __qdisc_destroy(struct rcu_head *head)
 499 {
 500         struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
 501         kfree((char *) qdisc - qdisc->padded);
 502 }
 503
 504 /* Under dev->queue_lock and BH! */
 505
 506 void qdisc_destroy(struct Qdisc *qdisc)
 507 {
 508         struct Qdisc_ops  *ops = qdisc->ops;
 509
 510         if (qdisc->flags & TCQ_F_BUILTIN ||
 511             !atomic_dec_and_test(&qdisc->refcnt))
 512                 return;
 513
 514         list_del(&qdisc->list);
 515 #ifdef CONFIG_NET_ESTIMATOR
 516         gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
 517 #endif
 518         if (ops->reset)
 519                 ops->reset(qdisc);
 520         if (ops->destroy)
 521                 ops->destroy(qdisc);
 522
 523         module_put(ops->owner);
 524         dev_put(qdisc->dev);
 525         call_rcu(&qdisc->q_rcu, __qdisc_destroy);
 526 }
 527
 528 void dev_activate(struct net_device *dev)
 529 {
 530         /* No queueing discipline is attached to device;
 531            create default one i.e. pfifo_fast for devices,
 532            which need queueing and noqueue_qdisc for
 533            virtual interfaces
 534          */
 535
 536         if (dev->qdisc_sleeping == &noop_qdisc) {
 537                 struct Qdisc *qdisc;
 538                 if (dev->tx_queue_len) {
 539                         qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
 540                                                   TC_H_ROOT);
 541                         if (qdisc == NULL) {
 542                                 printk(KERN_INFO "%s: activation failed\n", dev->name);
 543                                 return;
 544                         }
 545                         list_add_tail(&qdisc->list, &dev->qdisc_list);
 546                 } else {
 547                         qdisc =  &noqueue_qdisc;
 548                 }
 549                 dev->qdisc_sleeping = qdisc;
 550         }
 551
 552         if (!netif_carrier_ok(dev))
 553                 /* Delay activation until next carrier-on event */
 554                 return;
 555
 556         spin_lock_bh(&dev->queue_lock);
 557         rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
 558         if (dev->qdisc != &noqueue_qdisc) {
 559                 dev->trans_start = jiffies;
 560                 dev_watchdog_up(dev);
 561         }
 562         spin_unlock_bh(&dev->queue_lock);
 563 }
 564
 565 void dev_deactivate(struct net_device *dev)
 566 {
 567         struct Qdisc *qdisc;
 568         struct sk_buff *skb;
 569
 570         spin_lock_bh(&dev->queue_lock);
 571         qdisc = dev->qdisc;
 572         dev->qdisc = &noop_qdisc;
 573
 574         qdisc_reset(qdisc);
 575
 576         skb = dev->gso_skb;
 577         dev->gso_skb = NULL;
 578         spin_unlock_bh(&dev->queue_lock);
 579
 580         kfree_skb(skb);
 581
 582         dev_watchdog_down(dev);
 583
 584         /* Wait for outstanding dev_queue_xmit calls. */
 585         synchronize_rcu();
 586
 587         /* Wait for outstanding qdisc_run calls. */
 588         while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
 589                 yield();
 590 }
 591
 592 void dev_init_scheduler(struct net_device *dev)
 593 {
 594         qdisc_lock_tree(dev);
 595         dev->qdisc = &noop_qdisc;
 596         dev->qdisc_sleeping = &noop_qdisc;
 597         INIT_LIST_HEAD(&dev->qdisc_list);
 598         qdisc_unlock_tree(dev);
 599
 600         dev_watchdog_init(dev);
 601 }
 602
 603 void dev_shutdown(struct net_device *dev)
 604 {
 605         struct Qdisc *qdisc;
 606
 607         qdisc_lock_tree(dev);
 608         qdisc = dev->qdisc_sleeping;
 609         dev->qdisc = &noop_qdisc;
 610         dev->qdisc_sleeping = &noop_qdisc;
 611         qdisc_destroy(qdisc);
 612 #if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
 613         if ((qdisc = dev->qdisc_ingress) != NULL) {
 614                 dev->qdisc_ingress = NULL;
 615                 qdisc_destroy(qdisc);
 616         }
 617 #endif
 618         BUG_TRAP(!timer_pending(&dev->watchdog_timer));
 619         qdisc_unlock_tree(dev);
 620 }
 621
 622 EXPORT_SYMBOL(netif_carrier_on);
 623 EXPORT_SYMBOL(netif_carrier_off);
 624 EXPORT_SYMBOL(noop_qdisc);
 625 EXPORT_SYMBOL(qdisc_create_dflt);
 626 EXPORT_SYMBOL(qdisc_destroy);
 627 EXPORT_SYMBOL(qdisc_reset);
 628 EXPORT_SYMBOL(qdisc_lock_tree);
 629 EXPORT_SYMBOL(qdisc_unlock_tree);