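// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
 *
 * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
 *
 * Meant to be mostly used for locally generated traffic :
 * Fast classification depends on skb->sk being set before reaching us.
 * If not (router workload), we use rxhash as fallback, with 32 bits wide hash.
 * All packets belonging to a socket are considered as a 'flow'.
 *
 * Flows are dynamically allocated and stored in a hash table of RB trees.
 * They are also part of one Round Robin 'queue' (new or old flows).
 *
 * Burst avoidance (aka pacing) capability :
 *
 * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
 * bunch of packets, and this packet scheduler adds delay between
 * packets to respect rate limitation.
 *
 * enqueue() :
 *   - lookup one RB tree (out of 1024 or more) to find the flow.
 *     If non existent flow, create it, add it to the tree.
 *     Add skb to the per flow list of skb (fifo).
 *   - Use a special fifo for high prio packets
 *
 * dequeue() : serves flows in Round Robin
 * Note : When a flow becomes empty, we do not immediately remove it from
 * rb trees, for performance reasons (its expected to send additional packets,
 * or SLAB cache will reuse socket for another flow)
 */
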
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/hash.h>
#include <linux/prefetch.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/tcp.h>

struct fq_skb_cb {
	u64 time_to_send;
};

static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
	return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
}

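/*
 * Per flow structure, dynamically allocated.
 * If packets have monotonically increasing time_to_send, they are placed in O(1)
 * in the linear list (head, tail), otherwise they are placed in a rbtree (t_root).
 */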
struct fq_flow {
/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
	struct rb_root t_root;
	struct sk_buff *head;		/* list of skbs for this flow : first skb */
	union {
		struct sk_buff *tail;	/* last skb in the list */
		unsigned long age;	/* (jiffies | 1UL) when flow was emptied, for gc */
	};
	struct rb_node fq_node;		/* anchor in fq_root[] trees */
	struct sock *sk;
	u32 socket_hash;		/* sk_hash */
	int qlen;			/* number of packets in flow queue */

/* Second cache line, used in fq_dequeue() */
	int credit;
	/* 32bit hole on 64bit arches */

	struct fq_flow *next;		/* next pointer in RR lists */

	struct rb_node rate_node;	/* anchor in q->delayed tree */
	u64 time_next_packet;
} ____cacheline_aligned_in_smp;

struct fq_flow_head {
	struct fq_flow *first;
	struct fq_flow *last;
};

struct fq_sched_data {
	struct fq_flow_head new_flows;

	struct fq_flow_head old_flows;

	struct rb_root delayed;		/* for rate limited flows */
	u64 time_next_delayed_flow;
	u64 ktime_cache;		/* copy of last ktime_get_ns() */
	unsigned long unthrottle_latency_ns;

	struct fq_flow internal;	/* for non classified or high prio packets */
	u32 quantum;
	u32 initial_quantum;
	u32 flow_refill_delay;
	u32 flow_plimit;		/* max packets per flow */
	unsigned long flow_max_rate;	/* optional max rate per flow */
	u64 ce_threshold;
	u64 horizon;			/* horizon in ns */
	u32 orphan_mask;		/* mask for orphaned skb */
	u32 low_rate_threshold;
	struct rb_root *fq_root;
	u8 rate_enable;
	u8 fq_trees_log;
	u8 horizon_drop;
	u32 flows;
	u32 inactive_flows;
	u32 throttled_flows;

	u64 stat_gc_flows;
	u64 stat_internal_packets;
	u64 stat_throttled;
	u64 stat_ce_mark;
	u64 stat_horizon_drops;
	u64 stat_horizon_caps;
	u64 stat_flows_plimit;
	u64 stat_pkts_too_long;
	u64 stat_allocation_errors;

	u32 timer_slack;		/* hrtimer slack in ns */
	struct qdisc_watchdog watchdog;
};

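/*
 * f->tail and f->age share the same location.
 * We can use the low order bit to differentiate if this location points
 * to a sk_buff or contains a jiffies value, if we force this value to be odd.
 * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2
 */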
static void fq_flow_set_detached(struct fq_flow *f)
{
	f->age = jiffies | 1UL;
}

static bool fq_flow_is_detached(const struct fq_flow *f)
{
	return !!(f->age & 1UL);
}

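/* special value to mark a throttled flow (not on old/new list) */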
static struct fq_flow throttled;

static bool fq_flow_is_throttled(const struct fq_flow *f)
{
	return f->next == &throttled;
}

static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
{
	if (head->first)
		head->last->next = flow;
	else
		head->first = flow;
	head->last = flow;
	flow->next = NULL;
}

static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	rb_erase(&f->rate_node, &q->delayed);
	q->throttled_flows--;
	fq_flow_add_tail(&q->old_flows, f);
}

static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	struct rb_node **p = &q->delayed.rb_node, *parent = NULL;

	while (*p) {
		struct fq_flow *aux;

		parent = *p;
		aux = rb_entry(parent, struct fq_flow, rate_node);
		if (f->time_next_packet >= aux->time_next_packet)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&f->rate_node, parent, p);
	rb_insert_color(&f->rate_node, &q->delayed);
	q->throttled_flows++;
	q->stat_throttled++;

	f->next = &throttled;
	if (q->time_next_delayed_flow > f->time_next_packet)
		q->time_next_delayed_flow = f->time_next_packet;
}


static struct kmem_cache *fq_flow_cachep __read_mostly;

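/* limit number of collected flows per round */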
#define FQ_GC_MAX 8
#define FQ_GC_AGE (3*HZ) /* no gc for alive flows older than 3 seconds */

static bool fq_gc_candidate(const struct fq_flow *f)
{
	return fq_flow_is_detached(f) &&
	       time_after(jiffies, f->age + FQ_GC_AGE);
}

static void fq_gc(struct fq_sched_data *q,
		  struct rb_root *root,
		  struct sock *sk)
{
	struct rb_node **p, *parent;
	void *tofree[FQ_GC_MAX];
	struct fq_flow *f;
	int i, fcnt = 0;

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk)
			break;

		if (fq_gc_candidate(f)) {
			tofree[fcnt++] = f;
			if (fcnt == FQ_GC_MAX)
				break;
		}

		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	if (!fcnt)
		return;

	for (i = fcnt; i > 0; ) {
		f = tofree[--i];
		rb_erase(&f->fq_node, root);
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;

	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
}

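/* Lookup (or create) the flow this skb belongs to.
 * TC_PRIO_CONTROL packets and allocation failures are directed to the
 * internal (highest priority) flow.
 */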
static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
{
	struct rb_node **p, *parent;
	struct sock *sk = skb->sk;
	struct rb_root *root;
	struct fq_flow *f;

	/* warning: no starvation prevention... */
	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
		return &q->internal;

	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
	 * or a listener (SYNCOOKIE mode)
	 * 1) request sockets are not full blown,
	 *    they do not contain sk_pacing_rate
	 * 2) They are not part of a 'flow' yet
	 * 3) We do not want to rate limit them (eg SYNFLOOD attack),
	 *    especially if the listener set SO_MAX_PACING_RATE
	 * 4) We pretend they are orphaned
	 */
	if (!sk || sk_listener(sk)) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;

		/* By forcing low order bit to 1, we make sure to not
		 * collide with a local flow (socket pointers are word aligned)
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
		skb_orphan(skb);
	} else if (sk->sk_state == TCP_CLOSE) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
		/*
		 * Sockets in TCP_CLOSE are non connected.
		 * Typical use case is UDP sockets, they can send packets
		 * with sendto() to many different destinations.
		 * We probably could use a generic bit advertising
		 * non connected sockets, instead of sk_state == TCP_CLOSE,
		 * if we care enough.
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
	}

	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];

	if (q->flows >= (2U << q->fq_trees_log) &&
	    q->inactive_flows > q->flows/2)
		fq_gc(q, root, sk);

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk) {
			/* socket might have been reallocated, so check
			 * if its sk_hash is the same.
			 * If not, we need to refill credit with
			 * initial quantum
			 */
			if (unlikely(skb->sk == sk &&
				     f->socket_hash != sk->sk_hash)) {
				f->credit = q->initial_quantum;
				f->socket_hash = sk->sk_hash;
				if (q->rate_enable)
					smp_store_release(&sk->sk_pacing_status,
							  SK_PACING_FQ);
				if (fq_flow_is_throttled(f))
					fq_flow_unset_throttled(q, f);
				f->time_next_packet = 0ULL;
			}
			return f;
		}
		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!f)) {
		q->stat_allocation_errors++;
		return &q->internal;
	}
	/* f->t_root is already zeroed after kmem_cache_zalloc() */

	fq_flow_set_detached(f);
	f->sk = sk;
	if (skb->sk == sk) {
		f->socket_hash = sk->sk_hash;
		if (q->rate_enable)
			smp_store_release(&sk->sk_pacing_status,
					  SK_PACING_FQ);
	}
	f->credit = q->initial_quantum;

	rb_link_node(&f->fq_node, parent, p);
	rb_insert_color(&f->fq_node, root);

	q->flows++;
	q->inactive_flows++;
	return f;
}

static struct sk_buff *fq_peek(struct fq_flow *flow)
{
	struct sk_buff *skb = skb_rb_first(&flow->t_root);
	struct sk_buff *head = flow->head;

	if (!skb)
		return head;

	if (!head)
		return skb;

	if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
		return skb;
	return head;
}

static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
			  struct sk_buff *skb)
{
	if (skb == flow->head) {
		flow->head = skb->next;
	} else {
		rb_erase(&skb->rbnode, &flow->t_root);
		skb->dev = qdisc_dev(sch);
	}
}

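/* Remove one skb from flow queue.
 * This skb must be the return value of prior fq_peek().
 */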
static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
			   struct sk_buff *skb)
{
	fq_erase_head(sch, flow, skb);
	skb_mark_not_on_list(skb);
	flow->qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	sch->q.qlen--;
}

static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
	struct rb_node **p, *parent;
	struct sk_buff *head, *aux;

	head = flow->head;
	if (!head ||
	    fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
		if (!head)
			flow->head = skb;
		else
			flow->tail->next = skb;
		flow->tail = skb;
		skb->next = NULL;
		return;
	}

	p = &flow->t_root.rb_node;
	parent = NULL;

	while (*p) {
		parent = *p;
		aux = rb_to_skb(parent);
		if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, &flow->t_root);
}

static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
				     const struct fq_sched_data *q)
{
	return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon));
}

static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		      struct sk_buff **to_free)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_flow *f;

	if (unlikely(sch->q.qlen >= sch->limit))
		return qdisc_drop(skb, sch, to_free);

	if (!skb->tstamp) {
		fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
	} else {
		/* Check if packet timestamp is too far in the future.
		 * Check first against our cached value, to avoid
		 * ktime_get_ns() cost in most cases.
		 */
		if (fq_packet_beyond_horizon(skb, q)) {
			/* Refresh our cache and check another time */
			q->ktime_cache = ktime_get_ns();
			if (fq_packet_beyond_horizon(skb, q)) {
				if (q->horizon_drop) {
					q->stat_horizon_drops++;
					return qdisc_drop(skb, sch, to_free);
				}
				q->stat_horizon_caps++;
				skb->tstamp = q->ktime_cache + q->horizon;
			}
		}
		fq_skb_cb(skb)->time_to_send = skb->tstamp;
	}

	f = fq_classify(skb, q);
	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
		q->stat_flows_plimit++;
		return qdisc_drop(skb, sch, to_free);
	}

	f->qlen++;
	qdisc_qstats_backlog_inc(sch, skb);
	if (fq_flow_is_detached(f)) {
		fq_flow_add_tail(&q->new_flows, f);
		if (time_after(jiffies, f->age + q->flow_refill_delay))
			f->credit = max_t(u32, f->credit, q->quantum);
		q->inactive_flows--;
	}

	/* Note: this overwrites f->age */
	flow_queue_add(f, skb);

	if (unlikely(f == &q->internal)) {
		q->stat_internal_packets++;
	}
	sch->q.qlen++;

	return NET_XMIT_SUCCESS;
}

0498
0499 static void fq_check_throttled(struct fq_sched_data *q, u64 now)
0500 {
0501 unsigned long sample;
0502 struct rb_node *p;
0503
0504 if (q->time_next_delayed_flow > now)
0505 return;
0506
0507
0508
0509
0510 sample = (unsigned long)(now - q->time_next_delayed_flow);
0511 q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
0512 q->unthrottle_latency_ns += sample >> 3;
0513
0514 q->time_next_delayed_flow = ~0ULL;
0515 while ((p = rb_first(&q->delayed)) != NULL) {
0516 struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);
0517
0518 if (f->time_next_packet > now) {
0519 q->time_next_delayed_flow = f->time_next_packet;
0520 break;
0521 }
0522 fq_flow_unset_throttled(q, f);
0523 }
0524 }
0525
static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_flow_head *head;
	struct sk_buff *skb;
	struct fq_flow *f;
	unsigned long rate;
	u32 plen;
	u64 now;

	if (!sch->q.qlen)
		return NULL;

	skb = fq_peek(&q->internal);
	if (unlikely(skb)) {
		fq_dequeue_skb(sch, &q->internal, skb);
		goto out;
	}

	q->ktime_cache = now = ktime_get_ns();
	fq_check_throttled(q, now);
begin:
	head = &q->new_flows;
	if (!head->first) {
		head = &q->old_flows;
		if (!head->first) {
			if (q->time_next_delayed_flow != ~0ULL)
				qdisc_watchdog_schedule_range_ns(&q->watchdog,
							q->time_next_delayed_flow,
							q->timer_slack);
			return NULL;
		}
	}
	f = head->first;

	if (f->credit <= 0) {
		f->credit += q->quantum;
		head->first = f->next;
		fq_flow_add_tail(&q->old_flows, f);
		goto begin;
	}

	skb = fq_peek(f);
	if (skb) {
		u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
					     f->time_next_packet);

		if (now < time_next_packet) {
			head->first = f->next;
			f->time_next_packet = time_next_packet;
			fq_flow_set_throttled(q, f);
			goto begin;
		}
		prefetch(&skb->end);
		if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
			INET_ECN_set_ce(skb);
			q->stat_ce_mark++;
		}
		fq_dequeue_skb(sch, f, skb);
	} else {
		head->first = f->next;
		/* force a pass through old_flows to prevent starvation */
		if ((head == &q->new_flows) && q->old_flows.first) {
			fq_flow_add_tail(&q->old_flows, f);
		} else {
			fq_flow_set_detached(f);
			q->inactive_flows++;
		}
		goto begin;
	}
	plen = qdisc_pkt_len(skb);
	f->credit -= plen;

	if (!q->rate_enable)
		goto out;

	rate = q->flow_max_rate;

	/* If EDT time was provided for this skb, we need to
	 * update f->time_next_packet only if this qdisc enforces
	 * a flow max rate.
	 */
	if (!skb->tstamp) {
		if (skb->sk)
			rate = min(skb->sk->sk_pacing_rate, rate);

		if (rate <= q->low_rate_threshold) {
			f->credit = 0;
		} else {
			plen = max(plen, q->quantum);
			if (f->credit > 0)
				goto out;
		}
	}
	if (rate != ~0UL) {
		u64 len = (u64)plen * NSEC_PER_SEC;

		if (likely(rate))
			len = div64_ul(len, rate);
		/* Since socket rate can change later,
		 * clamp the delay to 1 second.
		 * Really, providers of too big packets should be fixed !
		 */
		if (unlikely(len > NSEC_PER_SEC)) {
			len = NSEC_PER_SEC;
			q->stat_pkts_too_long++;
		}
		/* Account for schedule/timers drifts.
		 * f->time_next_packet was set when prior packet was sent,
		 * and current time (@now) can be too late by tens of us.
		 */
		if (f->time_next_packet)
			len -= min(len/2, now - f->time_next_packet);
		f->time_next_packet = now + len;
	}
out:
	qdisc_bstats_update(sch, skb);
	return skb;
}

static void fq_flow_purge(struct fq_flow *flow)
{
	struct rb_node *p = rb_first(&flow->t_root);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		rb_erase(&skb->rbnode, &flow->t_root);
		rtnl_kfree_skbs(skb, skb);
	}
	rtnl_kfree_skbs(flow->head, flow->tail);
	flow->head = NULL;
	flow->qlen = 0;
}

static void fq_reset(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *root;
	struct rb_node *p;
	struct fq_flow *f;
	unsigned int idx;

	sch->q.qlen = 0;
	sch->qstats.backlog = 0;

	fq_flow_purge(&q->internal);

	if (!q->fq_root)
		return;

	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
		root = &q->fq_root[idx];
		while ((p = rb_first(root)) != NULL) {
			f = rb_entry(p, struct fq_flow, fq_node);
			rb_erase(p, root);

			fq_flow_purge(f);

			kmem_cache_free(fq_flow_cachep, f);
		}
	}
	q->new_flows.first = NULL;
	q->old_flows.first = NULL;
	q->delayed = RB_ROOT;
	q->flows = 0;
	q->inactive_flows = 0;
	q->throttled_flows = 0;
}

static void fq_rehash(struct fq_sched_data *q,
		      struct rb_root *old_array, u32 old_log,
		      struct rb_root *new_array, u32 new_log)
{
	struct rb_node *op, **np, *parent;
	struct rb_root *oroot, *nroot;
	struct fq_flow *of, *nf;
	int fcnt = 0;
	u32 idx;

	for (idx = 0; idx < (1U << old_log); idx++) {
		oroot = &old_array[idx];
		while ((op = rb_first(oroot)) != NULL) {
			rb_erase(op, oroot);
			of = rb_entry(op, struct fq_flow, fq_node);
			if (fq_gc_candidate(of)) {
				fcnt++;
				kmem_cache_free(fq_flow_cachep, of);
				continue;
			}
			nroot = &new_array[hash_ptr(of->sk, new_log)];

			np = &nroot->rb_node;
			parent = NULL;
			while (*np) {
				parent = *np;

				nf = rb_entry(parent, struct fq_flow, fq_node);
				BUG_ON(nf->sk == of->sk);

				if (nf->sk > of->sk)
					np = &parent->rb_right;
				else
					np = &parent->rb_left;
			}

			rb_link_node(&of->fq_node, parent, np);
			rb_insert_color(&of->fq_node, nroot);
		}
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;
}

static void fq_free(void *addr)
{
	kvfree(addr);
}

static int fq_resize(struct Qdisc *sch, u32 log)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *array;
	void *old_fq_root;
	u32 idx;

	if (q->fq_root && log == q->fq_trees_log)
		return 0;

	/* If XPS was setup, we can allocate memory on right NUMA node */
	array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
			      netdev_queue_numa_node_read(sch->dev_queue));
	if (!array)
		return -ENOMEM;

	for (idx = 0; idx < (1U << log); idx++)
		array[idx] = RB_ROOT;

	sch_tree_lock(sch);

	old_fq_root = q->fq_root;
	if (old_fq_root)
		fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);

	q->fq_root = array;
	q->fq_trees_log = log;

	sch_tree_unlock(sch);

	fq_free(old_fq_root);

	return 0;
}

static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
	[TCA_FQ_UNSPEC] = { .strict_start_type = TCA_FQ_TIMER_SLACK },

	[TCA_FQ_PLIMIT] = { .type = NLA_U32 },
	[TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
	[TCA_FQ_QUANTUM] = { .type = NLA_U32 },
	[TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 },
	[TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 },
	[TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 },
	[TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
	[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
	[TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
	[TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 },
	[TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
	[TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 },
	[TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 },
	[TCA_FQ_HORIZON] = { .type = NLA_U32 },
	[TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 },
};

static int fq_change(struct Qdisc *sch, struct nlattr *opt,
		     struct netlink_ext_ack *extack)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_FQ_MAX + 1];
	int err, drop_count = 0;
	unsigned drop_len = 0;
	u32 fq_log;

	if (!opt)
		return -EINVAL;

	err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
					  NULL);
	if (err < 0)
		return err;

	sch_tree_lock(sch);

	fq_log = q->fq_trees_log;

	if (tb[TCA_FQ_BUCKETS_LOG]) {
		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);

		if (nval >= 1 && nval <= ilog2(256*1024))
			fq_log = nval;
		else
			err = -EINVAL;
	}
	if (tb[TCA_FQ_PLIMIT])
		sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);

	if (tb[TCA_FQ_FLOW_PLIMIT])
		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);

	if (tb[TCA_FQ_QUANTUM]) {
		u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);

		if (quantum > 0 && quantum <= (1 << 20)) {
			q->quantum = quantum;
		} else {
			NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
			err = -EINVAL;
		}
	}

	if (tb[TCA_FQ_INITIAL_QUANTUM])
		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);

	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));

	if (tb[TCA_FQ_FLOW_MAX_RATE]) {
		u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);

		q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
	}
	if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
		q->low_rate_threshold =
			nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);

	if (tb[TCA_FQ_RATE_ENABLE]) {
		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);

		if (enable <= 1)
			q->rate_enable = enable;
		else
			err = -EINVAL;
	}

	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]);

		q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
	}

	if (tb[TCA_FQ_ORPHAN_MASK])
		q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);

	if (tb[TCA_FQ_CE_THRESHOLD])
		q->ce_threshold = (u64)NSEC_PER_USEC *
				  nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);

	if (tb[TCA_FQ_TIMER_SLACK])
		q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);

	if (tb[TCA_FQ_HORIZON])
		q->horizon = (u64)NSEC_PER_USEC *
			     nla_get_u32(tb[TCA_FQ_HORIZON]);

	if (tb[TCA_FQ_HORIZON_DROP])
		q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]);

	if (!err) {
		sch_tree_unlock(sch);
		err = fq_resize(sch, fq_log);
		sch_tree_lock(sch);
	}
	while (sch->q.qlen > sch->limit) {
		struct sk_buff *skb = fq_dequeue(sch);

		if (!skb)
			break;
		drop_len += qdisc_pkt_len(skb);
		rtnl_kfree_skbs(skb, skb);
		drop_count++;
	}
	qdisc_tree_reduce_backlog(sch, drop_count, drop_len);

	sch_tree_unlock(sch);
	return err;
}

static void fq_destroy(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);

	fq_reset(sch);
	fq_free(q->fq_root);
	qdisc_watchdog_cancel(&q->watchdog);
}

static int fq_init(struct Qdisc *sch, struct nlattr *opt,
		   struct netlink_ext_ack *extack)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	int err;

	sch->limit = 10000;
	q->flow_plimit = 100;
	q->quantum = 2 * psched_mtu(qdisc_dev(sch));
	q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
	q->flow_refill_delay = msecs_to_jiffies(40);
	q->flow_max_rate = ~0UL;
	q->time_next_delayed_flow = ~0ULL;
	q->rate_enable = 1;
	q->new_flows.first = NULL;
	q->old_flows.first = NULL;
	q->delayed = RB_ROOT;
	q->fq_root = NULL;
	q->fq_trees_log = ilog2(1024);
	q->orphan_mask = 1024 - 1;
	q->low_rate_threshold = 550000 / 8;

	q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */

	q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */
	q->horizon_drop = 1; /* by default, drop packets beyond horizon */

	/* Default ce_threshold of 4294 seconds */
	q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;

	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);

	if (opt)
		err = fq_change(sch, opt, extack);
	else
		err = fq_resize(sch, q->fq_trees_log);

	return err;
}

static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	u64 ce_threshold = q->ce_threshold;
	u64 horizon = q->horizon;
	struct nlattr *opts;

	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */

	do_div(ce_threshold, NSEC_PER_USEC);
	do_div(horizon, NSEC_PER_USEC);

	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
			min_t(unsigned long, q->flow_max_rate, ~0U)) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
			jiffies_to_usecs(q->flow_refill_delay)) ||
	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
	    nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
			q->low_rate_threshold) ||
	    nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
	    nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) ||
	    nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
	    nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	return -1;
}

static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct tc_fq_qd_stats st;

	sch_tree_lock(sch);

	st.gc_flows = q->stat_gc_flows;
	st.highprio_packets = q->stat_internal_packets;
	st.tcp_retrans = 0;
	st.throttled = q->stat_throttled;
	st.flows_plimit = q->stat_flows_plimit;
	st.pkts_too_long = q->stat_pkts_too_long;
	st.allocation_errors = q->stat_allocation_errors;
	st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
				    ktime_get_ns();
	st.flows = q->flows;
	st.inactive_flows = q->inactive_flows;
	st.throttled_flows = q->throttled_flows;
	st.unthrottle_latency_ns = min_t(unsigned long,
					 q->unthrottle_latency_ns, ~0U);
	st.ce_mark = q->stat_ce_mark;
	st.horizon_drops = q->stat_horizon_drops;
	st.horizon_caps = q->stat_horizon_caps;
	sch_tree_unlock(sch);

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
	.id = "fq",
	.priv_size = sizeof(struct fq_sched_data),

	.enqueue = fq_enqueue,
	.dequeue = fq_dequeue,
	.peek = qdisc_peek_dequeued,
	.init = fq_init,
	.reset = fq_reset,
	.destroy = fq_destroy,
	.change = fq_change,
	.dump = fq_dump,
	.dump_stats = fq_dump_stats,
	.owner = THIS_MODULE,
};

static int __init fq_module_init(void)
{
	int ret;

	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
					   sizeof(struct fq_flow),
					   0, 0, NULL);
	if (!fq_flow_cachep)
		return -ENOMEM;

	ret = register_qdisc(&fq_qdisc_ops);
	if (ret)
		kmem_cache_destroy(fq_flow_cachep);
	return ret;
}

static void __exit fq_module_exit(void)
{
	unregister_qdisc(&fq_qdisc_ops);
	kmem_cache_destroy(fq_flow_cachep);
}

module_init(fq_module_init)
module_exit(fq_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Fair Queue Packet Scheduler");