/*
 * net/sched/sch_choke.c	CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/reciprocal_div.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <net/ipv6.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows
   but maintains no flow state. The difference from RED is an additional
   step during enqueue: if the average queue size is over the low
   threshold (qmin), a packet is chosen at random from the queue. If both
   the new and the chosen packet are from the same flow, both are dropped.
   Unlike RED, CHOKe is not really a "classful" qdisc because it needs to
   access packets in the queue at random positions. It has a minimal class
   interface to allow overriding the builtin flow classifier with
   filters.

   Source:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
   Active Queue Management Scheme for Approximating Fair Bandwidth
   Allocation", IEEE INFOCOM, 2000.

   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
   Characteristics", IEEE/ACM Transactions on Networking, 2004
 */

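/*
 * Example configuration (illustrative): a CHOKe instance can be attached
 * with iproute2's tc. Option names follow tc-choke(8); the numeric values
 * here are arbitrary and would be tuned per link:
 *
 *   tc qdisc add dev eth0 root choke limit 1000 bandwidth 10mbit \
 *           min 83 max 250 avpkt 1000 ecn
 */
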
/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE	(128*1024 - 1)

struct choke_sched_data {
/* Parameters */
        u32              limit;
        unsigned char    flags;

        struct red_parms parms;

/* Variables */
        struct tcf_proto *filter_list;
        struct {
                u32     prob_drop;      /* Early probability drops */
                u32     prob_mark;      /* Early probability marks */
                u32     forced_drop;    /* Forced drops, qavg > max_thresh */
                u32     forced_mark;    /* Forced marks, qavg > max_thresh */
                u32     pdrop;          /* Drops due to queue limits */
                u32     other;          /* Drops due to drop() calls */
                u32     matched;        /* Drops due to flow match */
        } stats;

        unsigned int     head;
        unsigned int     tail;

        unsigned int     tab_mask; /* size - 1 */

        struct sk_buff **tab;
};

/* deliver a random number between 0 and N - 1 */
static u32 random_N(unsigned int N)
{
        return reciprocal_divide(random32(), N);
}

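/*
 * Note: reciprocal_divide(A, R) evaluates to (u32)(((u64)A * R) >> 32),
 * so random_N() maps a 32-bit random value into [0, N) without an
 * integer division in the fast path.
 */
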
/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
        return (q->tail - q->head) & q->tab_mask;
}

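/*
 * Example: with tab_mask = 7, head = 6 and tail = 2, choke_len() returns
 * (2 - 6) & 7 = 4; slots 6, 7, 0 and 1 hold packets or holes, and slot 2
 * is the next free tail position.
 */
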
/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
        return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
        return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
        do {
                q->head = (q->head + 1) & q->tab_mask;
                if (q->head == q->tail)
                        break;
        } while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
        do {
                q->tail = (q->tail - 1) & q->tab_mask;
                if (q->head == q->tail)
                        break;
        } while (q->tab[q->tail] == NULL);
}

/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb = q->tab[idx];

        q->tab[idx] = NULL;

        if (idx == q->head)
                choke_zap_head_holes(q);
        if (idx == q->tail)
                choke_zap_tail_holes(q);

        sch->qstats.backlog -= qdisc_pkt_len(skb);
        qdisc_drop(skb, sch);
        qdisc_tree_decrease_qlen(sch, 1);
        --sch->q.qlen;
}

/*
 * Compare the flows of two packets.
 * Returns true only if source and destination addresses and ports match;
 * false for special cases (different protocols, fragments, hash mismatch).
 */
static bool choke_match_flow(struct sk_buff *skb1,
                             struct sk_buff *skb2)
{
        int off1, off2, poff;
        const u32 *ports1, *ports2;
        u8 ip_proto;
        __u32 hash1;

        if (skb1->protocol != skb2->protocol)
                return false;

        /* Use hash value as quick check
         * Assumes that __skb_get_rxhash makes IP header and ports linear
         */
        hash1 = skb_get_rxhash(skb1);
        if (!hash1 || hash1 != skb_get_rxhash(skb2))
                return false;

        /* Probable match, but be sure to avoid hash collisions */
        off1 = skb_network_offset(skb1);
        off2 = skb_network_offset(skb2);

        switch (skb1->protocol) {
        case __constant_htons(ETH_P_IP): {
                const struct iphdr *ip1, *ip2;

                ip1 = (const struct iphdr *) (skb1->data + off1);
                ip2 = (const struct iphdr *) (skb2->data + off2);

                ip_proto = ip1->protocol;
                if (ip_proto != ip2->protocol ||
                    ip1->saddr != ip2->saddr || ip1->daddr != ip2->daddr)
                        return false;

                /* Skip the port comparison for fragments */
                if ((ip1->frag_off | ip2->frag_off) & htons(IP_MF | IP_OFFSET))
                        ip_proto = 0;
                off1 += ip1->ihl * 4;
                off2 += ip2->ihl * 4;
                break;
        }

        case __constant_htons(ETH_P_IPV6): {
                const struct ipv6hdr *ip1, *ip2;

                ip1 = (const struct ipv6hdr *) (skb1->data + off1);
                ip2 = (const struct ipv6hdr *) (skb2->data + off2);

                ip_proto = ip1->nexthdr;
                if (ip_proto != ip2->nexthdr ||
                    ipv6_addr_cmp(&ip1->saddr, &ip2->saddr) ||
                    ipv6_addr_cmp(&ip1->daddr, &ip2->daddr))
                        return false;
                off1 += 40;     /* sizeof(struct ipv6hdr) */
                off2 += 40;
                break;
        }

        default: /* Maybe compare MAC header here? */
                return false;
        }

        poff = proto_ports_offset(ip_proto);
        if (poff < 0)
                return true;

        off1 += poff;
        off2 += poff;

        /* Both 16-bit ports are compared with a single 32-bit load */
        ports1 = (__force u32 *)(skb1->data + off1);
        ports2 = (__force u32 *)(skb2->data + off2);
        return *ports1 == *ports2;
}

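/*
 * Note: proto_ports_offset() is negative for protocols that carry no port
 * numbers (and for fragments, where ip_proto was forced to 0 above), in
 * which case choke_match_flow() matches on addresses alone.
 */
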
static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
{
        *(unsigned int *)(qdisc_skb_cb(skb)->data) = classid;
}

static u16 choke_get_classid(const struct sk_buff *skb)
{
        return *(unsigned int *)(qdisc_skb_cb(skb)->data);
}

/*
 * Classify flow using either:
 *  1. pre-existing classification result in skb
 *  2. fast internal classification
 *  3. TC filter based classification
 */
static bool choke_classify(struct sk_buff *skb,
                           struct Qdisc *sch, int *qerr)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct tcf_result res;
        int result;

        result = tc_classify(skb, q->filter_list, &res);
        if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
                switch (result) {
                case TC_ACT_STOLEN:
                case TC_ACT_QUEUED:
                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
                        /* fall through */
                case TC_ACT_SHOT:
                        return false;
                }
#endif
                choke_set_classid(skb, TC_H_MIN(res.classid));
                return true;
        }

        return false;
}

/*
 * Select a packet at random from the queue.
 * HACK: since the queue can have holes from previous deletions, retry
 * several times to find a random skb, then give up and return the head.
 * Will return NULL if the queue is empty (q->head == q->tail).
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
                                         unsigned int *pidx)
{
        struct sk_buff *skb;
        int retries = 3;

        do {
                *pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
                skb = q->tab[*pidx];
                if (skb)
                        return skb;
        } while (--retries > 0);

        return q->tab[*pidx = q->head];
}

/*
 * Compare a new packet with a random packet in the queue;
 * returns true if they match (and sets *pidx).
 */
static bool choke_match_random(const struct choke_sched_data *q,
                               struct sk_buff *nskb,
                               unsigned int *pidx)
{
        struct sk_buff *oskb;

        if (q->head == q->tail)
                return false;

        oskb = choke_peek_random(q, pidx);
        if (q->filter_list)
                return choke_get_classid(nskb) == choke_get_classid(oskb);

        return choke_match_flow(oskb, nskb);
}

static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct red_parms *p = &q->parms;
        int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;

        if (q->filter_list) {
                /* If using external classifiers, get result and record it. */
                if (!choke_classify(skb, sch, &ret))
                        goto other_drop;        /* Packet was eaten by filter */
        }

        /* Compute average queue usage (see RED) */
        p->qavg = red_calc_qavg(p, sch->q.qlen);
        if (red_is_idling(p))
                red_end_of_idle_period(p);

        /* Is queue small? */
        if (p->qavg <= p->qth_min)
                p->qcount = -1;
        else {
                unsigned int idx;

                /* Draw a packet at random from queue and compare flow */
                if (choke_match_random(q, skb, &idx)) {
                        q->stats.matched++;
                        choke_drop_by_idx(sch, idx);
                        goto congestion_drop;
                }

                /* Queue is large, always mark/drop */
                if (p->qavg > p->qth_max) {
                        p->qcount = -1;

                        sch->qstats.overlimits++;
                        if (use_harddrop(q) || !use_ecn(q) ||
                            !INET_ECN_set_ce(skb)) {
                                q->stats.forced_drop++;
                                goto congestion_drop;
                        }

                        q->stats.forced_mark++;
                } else if (++p->qcount) {
                        if (red_mark_probability(p, p->qavg)) {
                                p->qcount = 0;
                                p->qR = red_random(p);

                                sch->qstats.overlimits++;
                                if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
                                        q->stats.prob_drop++;
                                        goto congestion_drop;
                                }

                                q->stats.prob_mark++;
                        }
                } else
                        p->qR = red_random(p);
        }

        /* Admit new packet */
        if (sch->q.qlen < q->limit) {
                q->tab[q->tail] = skb;
                q->tail = (q->tail + 1) & q->tab_mask;
                ++sch->q.qlen;
                sch->qstats.backlog += qdisc_pkt_len(skb);
                return NET_XMIT_SUCCESS;
        }

        q->stats.pdrop++;
        sch->qstats.drops++;
        kfree_skb(skb);
        return NET_XMIT_DROP;

congestion_drop:
        qdisc_drop(skb, sch);
        return NET_XMIT_CN;

other_drop:
        if (ret & __NET_XMIT_BYPASS)
                sch->qstats.drops++;
        kfree_skb(skb);
        return ret;
}

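/*
 * Summary: whenever qavg > qth_min, choke_enqueue() first draws a random
 * packet from the queue and drops both it and the arriving packet if they
 * belong to the same flow. Otherwise it behaves like RED: probabilistic
 * mark/drop while qth_min < qavg <= qth_max, forced mark/drop once
 * qavg > qth_max.
 */
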
static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;

        if (q->head == q->tail) {
                if (!red_is_idling(&q->parms))
                        red_start_of_idle_period(&q->parms);
                return NULL;
        }

        skb = q->tab[q->head];
        q->tab[q->head] = NULL;
        choke_zap_head_holes(q);
        --sch->q.qlen;
        sch->qstats.backlog -= qdisc_pkt_len(skb);
        qdisc_bstats_update(sch, skb);

        return skb;
}

static unsigned int choke_drop(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        unsigned int len;

        len = qdisc_queue_drop(sch);
        if (len > 0)
                q->stats.other++;
        else {
                if (!red_is_idling(&q->parms))
                        red_start_of_idle_period(&q->parms);
        }

        return len;
}

static void choke_reset(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);

        red_restart(&q->parms);
}

static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
        [TCA_CHOKE_PARMS]       = { .len = sizeof(struct tc_red_qopt) },
        [TCA_CHOKE_STAB]        = { .len = RED_STAB_SIZE },
};

/* Free a table allocated by either kcalloc() or vzalloc() */
static void choke_free(void *addr)
{
        if (addr) {
                if (is_vmalloc_addr(addr))
                        vfree(addr);
                else
                        kfree(addr);
        }
}

static int choke_change(struct Qdisc *sch, struct nlattr *opt)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct nlattr *tb[TCA_CHOKE_MAX + 1];
        const struct tc_red_qopt *ctl;
        int err;
        struct sk_buff **old = NULL;
        unsigned int mask;

        if (opt == NULL)
                return -EINVAL;

        err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
        if (err < 0)
                return err;

        if (tb[TCA_CHOKE_PARMS] == NULL ||
            tb[TCA_CHOKE_STAB] == NULL)
                return -EINVAL;

        ctl = nla_data(tb[TCA_CHOKE_PARMS]);

        if (ctl->limit > CHOKE_MAX_QUEUE)
                return -EINVAL;

        /* Round the table size up to a power of two so index arithmetic
         * can use a mask (e.g. limit 1000 -> table of 1024, mask 1023).
         */
        mask = roundup_pow_of_two(ctl->limit + 1) - 1;
        if (mask != q->tab_mask) {
                struct sk_buff **ntab;

                ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
                if (!ntab)
                        ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
                if (!ntab)
                        return -ENOMEM;

                sch_tree_lock(sch);
                old = q->tab;
                if (old) {
                        unsigned int oqlen = sch->q.qlen, tail = 0;

                        /* Migrate packets to the new table, dropping any
                         * that no longer fit.
                         */
                        while (q->head != q->tail) {
                                struct sk_buff *skb = q->tab[q->head];

                                q->head = (q->head + 1) & q->tab_mask;
                                if (!skb)
                                        continue;
                                if (tail < mask) {
                                        ntab[tail++] = skb;
                                        continue;
                                }
                                sch->qstats.backlog -= qdisc_pkt_len(skb);
                                --sch->q.qlen;
                                qdisc_drop(skb, sch);
                        }
                        qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
                        q->head = 0;
                        q->tail = tail;
                }

                q->tab_mask = mask;
                q->tab = ntab;
        } else
                sch_tree_lock(sch);

        q->flags = ctl->flags;
        q->limit = ctl->limit;

        red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
                      ctl->Plog, ctl->Scell_log,
                      nla_data(tb[TCA_CHOKE_STAB]));

        if (q->head == q->tail)
                red_end_of_idle_period(&q->parms);

        sch_tree_unlock(sch);
        choke_free(old);
        return 0;
}

static int choke_init(struct Qdisc *sch, struct nlattr *opt)
{
        return choke_change(sch, opt);
}

static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct nlattr *opts = NULL;
        struct tc_red_qopt opt = {
                .limit          = q->limit,
                .flags          = q->flags,
                .qth_min        = q->parms.qth_min >> q->parms.Wlog,
                .qth_max        = q->parms.qth_max >> q->parms.Wlog,
                .Wlog           = q->parms.Wlog,
                .Plog           = q->parms.Plog,
                .Scell_log      = q->parms.Scell_log,
        };

        opts = nla_nest_start(skb, TCA_OPTIONS);
        if (opts == NULL)
                goto nla_put_failure;

        NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt);
        return nla_nest_end(skb, opts);

nla_put_failure:
        nla_nest_cancel(skb, opts);
        return -EMSGSIZE;
}

static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct tc_choke_xstats st = {
                .early  = q->stats.prob_drop + q->stats.forced_drop,
                .marked = q->stats.prob_mark + q->stats.forced_mark,
                .pdrop  = q->stats.pdrop,
                .other  = q->stats.other,
                .matched = q->stats.matched,
        };

        return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void choke_destroy(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);

        tcf_destroy_chain(&q->filter_list);
        choke_free(q->tab);
}

static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
{
        return NULL;
}

static unsigned long choke_get(struct Qdisc *sch, u32 classid)
{
        return 0;
}

static void choke_put(struct Qdisc *q, unsigned long cl)
{
}

static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
                                u32 classid)
{
        return 0;
}

static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
{
        struct choke_sched_data *q = qdisc_priv(sch);

        if (cl)
                return NULL;
        return &q->filter_list;
}

static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
                            struct sk_buff *skb, struct tcmsg *tcm)
{
        tcm->tcm_handle |= TC_H_MIN(cl);
        return 0;
}

static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
        if (!arg->stop) {
                if (arg->fn(sch, 1, arg) < 0) {
                        arg->stop = 1;
                        return;
                }
                arg->count++;
        }
}

static const struct Qdisc_class_ops choke_class_ops = {
        .leaf           =       choke_leaf,
        .get            =       choke_get,
        .put            =       choke_put,
        .tcf_chain      =       choke_find_tcf,
        .bind_tcf       =       choke_bind,
        .unbind_tcf     =       choke_put,
        .dump           =       choke_dump_class,
        .walk           =       choke_walk,
};

static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);

        return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
        .id             =       "choke",
        .priv_size      =       sizeof(struct choke_sched_data),
        .cl_ops         =       &choke_class_ops,
        .enqueue        =       choke_enqueue,
        .dequeue        =       choke_dequeue,
        .peek           =       choke_peek_head,
        .drop           =       choke_drop,
        .init           =       choke_init,
        .destroy        =       choke_destroy,
        .reset          =       choke_reset,
        .change         =       choke_change,
        .dump           =       choke_dump,
        .dump_stats     =       choke_dump_stats,
        .owner          =       THIS_MODULE,
};

static int __init choke_module_init(void)
{
        return register_qdisc(&choke_qdisc_ops);
}

static void __exit choke_module_exit(void)
{
        unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");