0001 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
0002 
0003 /* COMMON Applications Kept Enhanced (CAKE) discipline
0004  *
0005  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
0006  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
0007  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
0008  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
0009  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
0010  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
0011  *
0012  * The CAKE Principles:
0013  *         (or, how to have your cake and eat it too)
0014  *
0015  * This is a combination of several shaping, AQM and FQ techniques into one
0016  * easy-to-use package:
0017  *
0018  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
0019  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
0020  *   eliminating the need for any sort of burst parameter (eg. token bucket
0021  *   depth).  Burst support is limited to that necessary to overcome scheduling
0022  *   latency.
0023  *
0024  * - A Diffserv-aware priority queue, giving more priority to certain classes,
0025  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
0026  *   the priority is reduced to avoid starving other tins.
0027  *
0028  * - Each priority tin has a separate Flow Queue system, to isolate traffic
0029  *   flows from each other.  This prevents a burst on one flow from increasing
0030  *   the delay to another.  Flows are distributed to queues using a
0031  *   set-associative hash function.
0032  *
0033  * - Each queue is actively managed by Cobalt, which is a combination of the
0034  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
0035  *   congestion early via ECN (if available) and/or packet drops, to keep
0036  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
0037  *   setting, as is necessary at low bandwidths.
0038  *
0039  * The configuration parameters are kept deliberately simple for ease of use.
0040  * Everything has sane defaults.  Complete generality of configuration is *not*
0041  * a goal.
0042  *
0043  * The priority queue operates according to a weighted DRR scheme, combined with
0044  * a bandwidth tracker which reuses the shaper logic to detect which side of the
0045  * bandwidth sharing threshold the tin is operating.  This determines whether a
0046  * priority-based weight (high) or a bandwidth-based weight (low) is used for
0047  * that tin in the current pass.
0048  *
0049  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
0050  * granted us permission to leverage.
0051  */
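
/* For illustration, assuming the usual iproute2 tc-cake(8) front-end syntax,
 * a typical configuration is:
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv4 nat
 *
 * which enables the shaper, four-tin Diffserv handling and NAT-aware flow
 * hashing, leaving everything else at its default.
 */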
0052 
0053 #include <linux/module.h>
0054 #include <linux/types.h>
0055 #include <linux/kernel.h>
0056 #include <linux/jiffies.h>
0057 #include <linux/string.h>
0058 #include <linux/in.h>
0059 #include <linux/errno.h>
0060 #include <linux/init.h>
0061 #include <linux/skbuff.h>
0062 #include <linux/jhash.h>
0063 #include <linux/slab.h>
0064 #include <linux/vmalloc.h>
0065 #include <linux/reciprocal_div.h>
0066 #include <net/netlink.h>
0067 #include <linux/if_vlan.h>
0068 #include <net/pkt_sched.h>
0069 #include <net/pkt_cls.h>
0070 #include <net/tcp.h>
0071 #include <net/flow_dissector.h>
0072 
0073 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
0074 #include <net/netfilter/nf_conntrack_core.h>
0075 #endif
0076 
0077 #define CAKE_SET_WAYS (8)
0078 #define CAKE_MAX_TINS (8)
0079 #define CAKE_QUEUES (1024)
0080 #define CAKE_FLOW_MASK 63
0081 #define CAKE_FLOW_NAT_FLAG 64
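
/* Each tin's CAKE_QUEUES (1024) flow queues are treated by cake_hash() below
 * as a set-associative hash table of 128 sets x CAKE_SET_WAYS (8) ways.
 * CAKE_FLOW_MASK and CAKE_FLOW_NAT_FLAG split the flow_mode value into the
 * flow-isolation mode proper and the NAT-lookup flag.
 */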
0082 
0083 /* struct cobalt_params - contains codel and blue parameters
0084  * @interval:   codel interval (initial spacing between congestion signals)
0085  * @target:     maximum persistent sojourn time & blue update rate
0086  * @mtu_time:   serialisation delay of maximum-size packet
0087  * @p_inc:      increment of blue drop probability (0.32 fxp)
0088  * @p_dec:      decrement of blue drop probability (0.32 fxp)
0089  */
0090 struct cobalt_params {
0091     u64 interval;
0092     u64 target;
0093     u64 mtu_time;
0094     u32 p_inc;
0095     u32 p_dec;
0096 };
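
/* p_inc and p_dec (like p_drop below) are probabilities in 0.32 fixed point:
 * e.g. a p_drop of 0x80000000 means a 50% drop probability, applied by
 * comparing p_drop against a random 32-bit value at dequeue time in
 * cobalt_should_drop().
 */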
0097 
0098 /* struct cobalt_vars - contains codel and blue variables
0099  * @count:      codel dropping frequency
0100  * @rec_inv_sqrt:   reciprocal value of sqrt(count) >> 1
0101  * @drop_next:      time to drop next packet, or when we dropped last
0102  * @blue_timer:     Blue time to next drop
0103  * @p_drop:     BLUE drop probability (0.32 fxp)
0104  * @dropping:       set if in dropping state
0105  * @ecn_marked:     set if marked
0106  */
0107 struct cobalt_vars {
0108     u32 count;
0109     u32 rec_inv_sqrt;
0110     ktime_t drop_next;
0111     ktime_t blue_timer;
0112     u32     p_drop;
0113     bool    dropping;
0114     bool    ecn_marked;
0115 };
0116 
0117 enum {
0118     CAKE_SET_NONE = 0,
0119     CAKE_SET_SPARSE,
0120     CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
0121     CAKE_SET_BULK,
0122     CAKE_SET_DECAYING
0123 };
0124 
0125 struct cake_flow {
0126     /* this stuff is all needed per-flow at dequeue time */
0127     struct sk_buff    *head;
0128     struct sk_buff    *tail;
0129     struct list_head  flowchain;
0130     s32       deficit;
0131     u32       dropped;
0132     struct cobalt_vars cvars;
0133     u16       srchost; /* index into cake_host table */
0134     u16       dsthost;
0135     u8        set;
0136 }; /* please try to keep this structure <= 64 bytes */
0137 
0138 struct cake_host {
0139     u32 srchost_tag;
0140     u32 dsthost_tag;
0141     u16 srchost_bulk_flow_count;
0142     u16 dsthost_bulk_flow_count;
0143 };
0144 
0145 struct cake_heap_entry {
0146     u16 t:3, b:10;
0147 };
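
/* t:3 indexes one of up to CAKE_MAX_TINS (8) tins and b:10 one of
 * CAKE_QUEUES (1024) flow queues, so a heap entry packs into a single u16.
 */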
0148 
0149 struct cake_tin_data {
0150     struct cake_flow flows[CAKE_QUEUES];
0151     u32 backlogs[CAKE_QUEUES];
0152     u32 tags[CAKE_QUEUES]; /* for set association */
0153     u16 overflow_idx[CAKE_QUEUES];
0154     struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
0155     u16 flow_quantum;
0156 
0157     struct cobalt_params cparams;
0158     u32 drop_overlimit;
0159     u16 bulk_flow_count;
0160     u16 sparse_flow_count;
0161     u16 decaying_flow_count;
0162     u16 unresponsive_flow_count;
0163 
0164     u32 max_skblen;
0165 
0166     struct list_head new_flows;
0167     struct list_head old_flows;
0168     struct list_head decaying_flows;
0169 
0170     /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
0171     ktime_t time_next_packet;
0172     u64 tin_rate_ns;
0173     u64 tin_rate_bps;
0174     u16 tin_rate_shft;
0175 
0176     u16 tin_quantum;
0177     s32 tin_deficit;
0178     u32 tin_backlog;
0179     u32 tin_dropped;
0180     u32 tin_ecn_mark;
0181 
0182     u32 packets;
0183     u64 bytes;
0184 
0185     u32 ack_drops;
0186 
0187     /* moving averages */
0188     u64 avge_delay;
0189     u64 peak_delay;
0190     u64 base_delay;
0191 
0192     /* hash function stats */
0193     u32 way_directs;
0194     u32 way_hits;
0195     u32 way_misses;
0196     u32 way_collisions;
0197 }; /* number of tins is small, so size of this struct doesn't matter much */
0198 
0199 struct cake_sched_data {
0200     struct tcf_proto __rcu *filter_list; /* optional external classifier */
0201     struct tcf_block *block;
0202     struct cake_tin_data *tins;
0203 
0204     struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
0205     u16     overflow_timeout;
0206 
0207     u16     tin_cnt;
0208     u8      tin_mode;
0209     u8      flow_mode;
0210     u8      ack_filter;
0211     u8      atm_mode;
0212 
0213     u32     fwmark_mask;
0214     u16     fwmark_shft;
0215 
0216     /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
0217     u16     rate_shft;
0218     ktime_t     time_next_packet;
0219     ktime_t     failsafe_next_packet;
0220     u64     rate_ns;
0221     u64     rate_bps;
0222     u16     rate_flags;
0223     s16     rate_overhead;
0224     u16     rate_mpu;
0225     u64     interval;
0226     u64     target;
0227 
0228     /* resource tracking */
0229     u32     buffer_used;
0230     u32     buffer_max_used;
0231     u32     buffer_limit;
0232     u32     buffer_config_limit;
0233 
0234     /* indices for dequeue */
0235     u16     cur_tin;
0236     u16     cur_flow;
0237 
0238     struct qdisc_watchdog watchdog;
0239     const u8    *tin_index;
0240     const u8    *tin_order;
0241 
0242     /* bandwidth capacity estimate */
0243     ktime_t     last_packet_time;
0244     ktime_t     avg_window_begin;
0245     u64     avg_packet_interval;
0246     u64     avg_window_bytes;
0247     u64     avg_peak_bandwidth;
0248     ktime_t     last_reconfig_time;
0249 
0250     /* packet length stats */
0251     u32     avg_netoff;
0252     u16     max_netlen;
0253     u16     max_adjlen;
0254     u16     min_netlen;
0255     u16     min_adjlen;
0256 };
0257 
0258 enum {
0259     CAKE_FLAG_OVERHEAD     = BIT(0),
0260     CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
0261     CAKE_FLAG_INGRESS      = BIT(2),
0262     CAKE_FLAG_WASH         = BIT(3),
0263     CAKE_FLAG_SPLIT_GSO    = BIT(4)
0264 };
0265 
0266 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
0267  * obtain the best features of each.  Codel is excellent on flows which
0268  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
0269  * unresponsive flows.
0270  */
0271 
0272 struct cobalt_skb_cb {
0273     ktime_t enqueue_time;
0274     u32     adjusted_len;
0275 };
0276 
0277 static u64 us_to_ns(u64 us)
0278 {
0279     return us * NSEC_PER_USEC;
0280 }
0281 
0282 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
0283 {
0284     qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
0285     return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
0286 }
0287 
0288 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
0289 {
0290     return get_cobalt_cb(skb)->enqueue_time;
0291 }
0292 
0293 static void cobalt_set_enqueue_time(struct sk_buff *skb,
0294                     ktime_t now)
0295 {
0296     get_cobalt_cb(skb)->enqueue_time = now;
0297 }
0298 
0299 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
0300 
0301 /* Diffserv lookup tables */
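
/* Each table below is indexed by a packet's 6-bit DSCP value and yields the
 * tin number used in that Diffserv mode.  For example, in diffserv4 mode
 * tin 0 is best effort, tin 1 bulk/background (CS1, LE), tin 2 streaming
 * media and tin 3 latency-sensitive traffic (EF and the high CS codepoints);
 * the per-tin rates and thresholds are set up via cake_reconfigure().
 */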
0302 
0303 static const u8 precedence[] = {
0304     0, 0, 0, 0, 0, 0, 0, 0,
0305     1, 1, 1, 1, 1, 1, 1, 1,
0306     2, 2, 2, 2, 2, 2, 2, 2,
0307     3, 3, 3, 3, 3, 3, 3, 3,
0308     4, 4, 4, 4, 4, 4, 4, 4,
0309     5, 5, 5, 5, 5, 5, 5, 5,
0310     6, 6, 6, 6, 6, 6, 6, 6,
0311     7, 7, 7, 7, 7, 7, 7, 7,
0312 };
0313 
0314 static const u8 diffserv8[] = {
0315     2, 0, 1, 2, 4, 2, 2, 2,
0316     1, 2, 1, 2, 1, 2, 1, 2,
0317     5, 2, 4, 2, 4, 2, 4, 2,
0318     3, 2, 3, 2, 3, 2, 3, 2,
0319     6, 2, 3, 2, 3, 2, 3, 2,
0320     6, 2, 2, 2, 6, 2, 6, 2,
0321     7, 2, 2, 2, 2, 2, 2, 2,
0322     7, 2, 2, 2, 2, 2, 2, 2,
0323 };
0324 
0325 static const u8 diffserv4[] = {
0326     0, 1, 0, 0, 2, 0, 0, 0,
0327     1, 0, 0, 0, 0, 0, 0, 0,
0328     2, 0, 2, 0, 2, 0, 2, 0,
0329     2, 0, 2, 0, 2, 0, 2, 0,
0330     3, 0, 2, 0, 2, 0, 2, 0,
0331     3, 0, 0, 0, 3, 0, 3, 0,
0332     3, 0, 0, 0, 0, 0, 0, 0,
0333     3, 0, 0, 0, 0, 0, 0, 0,
0334 };
0335 
0336 static const u8 diffserv3[] = {
0337     0, 1, 0, 0, 2, 0, 0, 0,
0338     1, 0, 0, 0, 0, 0, 0, 0,
0339     0, 0, 0, 0, 0, 0, 0, 0,
0340     0, 0, 0, 0, 0, 0, 0, 0,
0341     0, 0, 0, 0, 0, 0, 0, 0,
0342     0, 0, 0, 0, 2, 0, 2, 0,
0343     2, 0, 0, 0, 0, 0, 0, 0,
0344     2, 0, 0, 0, 0, 0, 0, 0,
0345 };
0346 
0347 static const u8 besteffort[] = {
0348     0, 0, 0, 0, 0, 0, 0, 0,
0349     0, 0, 0, 0, 0, 0, 0, 0,
0350     0, 0, 0, 0, 0, 0, 0, 0,
0351     0, 0, 0, 0, 0, 0, 0, 0,
0352     0, 0, 0, 0, 0, 0, 0, 0,
0353     0, 0, 0, 0, 0, 0, 0, 0,
0354     0, 0, 0, 0, 0, 0, 0, 0,
0355     0, 0, 0, 0, 0, 0, 0, 0,
0356 };
0357 
0358 /* tin priority order for stats dumping */
0359 
0360 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
0361 static const u8 bulk_order[] = {1, 0, 2, 3};
0362 
0363 #define REC_INV_SQRT_CACHE (16)
0364 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
0365 
0366 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
0367  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
0368  *
0369  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
0370  */
0371 
0372 static void cobalt_newton_step(struct cobalt_vars *vars)
0373 {
0374     u32 invsqrt, invsqrt2;
0375     u64 val;
0376 
0377     invsqrt = vars->rec_inv_sqrt;
0378     invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
0379     val = (3LL << 32) - ((u64)vars->count * invsqrt2);
0380 
0381     val >>= 2; /* avoid overflow in following multiply */
0382     val = (val * invsqrt) >> (32 - 2 + 1);
0383 
0384     vars->rec_inv_sqrt = val;
0385 }
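
/* Worked example in Q0.32: with count == 4 the iteration converges on
 * roughly 0x80000000, i.e. 0.5, since 1/sqrt(4) = 1/2.  The >> 2 pre-shift
 * keeps the 64-bit multiply from overflowing, and the final shift of
 * (32 - 2 + 1) accounts for the Q0.32 scale (32), the two bits already
 * shifted off (-2), and the division by two in the formula above (+1).
 */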
0386 
0387 static void cobalt_invsqrt(struct cobalt_vars *vars)
0388 {
0389     if (vars->count < REC_INV_SQRT_CACHE)
0390         vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
0391     else
0392         cobalt_newton_step(vars);
0393 }
0394 
0395 /* There is a big difference in timing between the accurate values placed in
0396  * the cache and the approximations given by a single Newton step for small
0397  * count values, particularly when stepping from count 1 to 2 or vice versa.
0398  * Above 16, a single Newton step gives sufficient accuracy in either
0399  * direction, given the precision stored.
0400  *
0401  * The magnitude of the error when stepping up to count 2 is such as to give
0402  * the value that *should* have been produced at count 4.
0403  */
0404 
0405 static void cobalt_cache_init(void)
0406 {
0407     struct cobalt_vars v;
0408 
0409     memset(&v, 0, sizeof(v));
0410     v.rec_inv_sqrt = ~0U;
0411     cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
0412 
0413     for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
0414         cobalt_newton_step(&v);
0415         cobalt_newton_step(&v);
0416         cobalt_newton_step(&v);
0417         cobalt_newton_step(&v);
0418 
0419         cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
0420     }
0421 }
0422 
0423 static void cobalt_vars_init(struct cobalt_vars *vars)
0424 {
0425     memset(vars, 0, sizeof(*vars));
0426 
0427     if (!cobalt_rec_inv_sqrt_cache[0]) {
0428         cobalt_cache_init();
0429         cobalt_rec_inv_sqrt_cache[0] = ~0;
0430     }
0431 }
0432 
0433 /* CoDel control_law is t + interval/sqrt(count)
0434  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
0435  * both sqrt() and divide operation.
0436  */
0437 static ktime_t cobalt_control(ktime_t t,
0438                   u64 interval,
0439                   u32 rec_inv_sqrt)
0440 {
0441     return ktime_add_ns(t, reciprocal_scale(interval,
0442                         rec_inv_sqrt));
0443 }
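
/* reciprocal_scale(interval, rec_inv_sqrt) evaluates
 * (interval * rec_inv_sqrt) >> 32, i.e. interval / sqrt(count) with
 * rec_inv_sqrt held in Q0.32.  For example, at the usual 100 ms interval and
 * count == 4 (rec_inv_sqrt of about 0.5), the next signal is scheduled
 * roughly 50 ms after t.
 */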
0444 
0445 /* Call this when a packet had to be dropped due to queue overflow.  Returns
0446  * true if the BLUE state was quiescent before but active after this call.
0447  */
0448 static bool cobalt_queue_full(struct cobalt_vars *vars,
0449                   struct cobalt_params *p,
0450                   ktime_t now)
0451 {
0452     bool up = false;
0453 
0454     if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
0455         up = !vars->p_drop;
0456         vars->p_drop += p->p_inc;
0457         if (vars->p_drop < p->p_inc)
0458             vars->p_drop = ~0;
0459         vars->blue_timer = now;
0460     }
0461     vars->dropping = true;
0462     vars->drop_next = now;
0463     if (!vars->count)
0464         vars->count = 1;
0465 
0466     return up;
0467 }
0468 
0469 /* Call this when the queue was serviced but turned out to be empty.  Returns
0470  * true if the BLUE state was active before but quiescent after this call.
0471  */
0472 static bool cobalt_queue_empty(struct cobalt_vars *vars,
0473                    struct cobalt_params *p,
0474                    ktime_t now)
0475 {
0476     bool down = false;
0477 
0478     if (vars->p_drop &&
0479         ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
0480         if (vars->p_drop < p->p_dec)
0481             vars->p_drop = 0;
0482         else
0483             vars->p_drop -= p->p_dec;
0484         vars->blue_timer = now;
0485         down = !vars->p_drop;
0486     }
0487     vars->dropping = false;
0488 
0489     if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
0490         vars->count--;
0491         cobalt_invsqrt(vars);
0492         vars->drop_next = cobalt_control(vars->drop_next,
0493                          p->interval,
0494                          vars->rec_inv_sqrt);
0495     }
0496 
0497     return down;
0498 }
0499 
0500 /* Call this with a freshly dequeued packet for possible congestion marking.
0501  * Returns true as an instruction to drop the packet, false for delivery.
0502  */
0503 static bool cobalt_should_drop(struct cobalt_vars *vars,
0504                    struct cobalt_params *p,
0505                    ktime_t now,
0506                    struct sk_buff *skb,
0507                    u32 bulk_flows)
0508 {
0509     bool next_due, over_target, drop = false;
0510     ktime_t schedule;
0511     u64 sojourn;
0512 
0513 /* The 'schedule' variable records, in its sign, whether 'now' is before or
0514  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
0515  * scheduling decision is actually branched, without destroying that
0516  * information.  Similarly, the first 'schedule' value calculated is preserved
0517  * in the boolean 'next_due'.
0518  *
0519  * As for 'drop_next', we take advantage of the fact that 'interval' is both
0520  * the delay between first exceeding 'target' and the first signalling event,
0521  * *and* the scaling factor for the signalling frequency.  It's therefore very
0522  * natural to use a single mechanism for both purposes, and eliminates a
0523  * significant amount of reference Codel's spaghetti code.  To help with this,
0524  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
0525  * as possible to 1.0 in fixed-point.
0526  */
0527 
0528     sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
0529     schedule = ktime_sub(now, vars->drop_next);
0530     over_target = sojourn > p->target &&
0531               sojourn > p->mtu_time * bulk_flows * 2 &&
0532               sojourn > p->mtu_time * 4;
0533     next_due = vars->count && ktime_to_ns(schedule) >= 0;
0534 
0535     vars->ecn_marked = false;
0536 
0537     if (over_target) {
0538         if (!vars->dropping) {
0539             vars->dropping = true;
0540             vars->drop_next = cobalt_control(now,
0541                              p->interval,
0542                              vars->rec_inv_sqrt);
0543         }
0544         if (!vars->count)
0545             vars->count = 1;
0546     } else if (vars->dropping) {
0547         vars->dropping = false;
0548     }
0549 
0550     if (next_due && vars->dropping) {
0551         /* Use ECN mark if possible, otherwise drop */
0552         drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
0553 
0554         vars->count++;
0555         if (!vars->count)
0556             vars->count--;
0557         cobalt_invsqrt(vars);
0558         vars->drop_next = cobalt_control(vars->drop_next,
0559                          p->interval,
0560                          vars->rec_inv_sqrt);
0561         schedule = ktime_sub(now, vars->drop_next);
0562     } else {
0563         while (next_due) {
0564             vars->count--;
0565             cobalt_invsqrt(vars);
0566             vars->drop_next = cobalt_control(vars->drop_next,
0567                              p->interval,
0568                              vars->rec_inv_sqrt);
0569             schedule = ktime_sub(now, vars->drop_next);
0570             next_due = vars->count && ktime_to_ns(schedule) >= 0;
0571         }
0572     }
0573 
0574     /* Simple BLUE implementation.  Lack of ECN is deliberate. */
0575     if (vars->p_drop)
0576         drop |= (prandom_u32() < vars->p_drop);
0577 
0578     /* Overload the drop_next field as an activity timeout */
0579     if (!vars->count)
0580         vars->drop_next = ktime_add_ns(now, p->interval);
0581     else if (ktime_to_ns(schedule) > 0 && !drop)
0582         vars->drop_next = now;
0583 
0584     return drop;
0585 }
0586 
0587 static bool cake_update_flowkeys(struct flow_keys *keys,
0588                  const struct sk_buff *skb)
0589 {
0590 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
0591     struct nf_conntrack_tuple tuple = {};
0592     bool rev = !skb->_nfct, upd = false;
0593     __be32 ip;
0594 
0595     if (skb_protocol(skb, true) != htons(ETH_P_IP))
0596         return false;
0597 
0598     if (!nf_ct_get_tuple_skb(&tuple, skb))
0599         return false;
0600 
0601     ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
0602     if (ip != keys->addrs.v4addrs.src) {
0603         keys->addrs.v4addrs.src = ip;
0604         upd = true;
0605     }
0606     ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
0607     if (ip != keys->addrs.v4addrs.dst) {
0608         keys->addrs.v4addrs.dst = ip;
0609         upd = true;
0610     }
0611 
0612     if (keys->ports.ports) {
0613         __be16 port;
0614 
0615         port = rev ? tuple.dst.u.all : tuple.src.u.all;
0616         if (port != keys->ports.src) {
0617             keys->ports.src = port;
0618             upd = true;
0619         }
0620         port = rev ? tuple.src.u.all : tuple.dst.u.all;
0621         if (port != keys->ports.dst) {
0622             keys->ports.dst = port;
0623             upd = true;
0624         }
0625     }
0626     return upd;
0627 #else
0628     return false;
0629 #endif
0630 }
0631 
0632 /* Cake's flow_mode values can set several bits at once; CAKE_FLOW_TRIPLE
0633  *  includes both DUAL_SRC and DUAL_DST, so these checks also match it.
0634  */
0635 
0636 static bool cake_dsrc(int flow_mode)
0637 {
0638     return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
0639 }
0640 
0641 static bool cake_ddst(int flow_mode)
0642 {
0643     return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
0644 }
0645 
0646 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
0647              int flow_mode, u16 flow_override, u16 host_override)
0648 {
0649     bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
0650     bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
0651     bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
0652     u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
0653     u16 reduced_hash, srchost_idx, dsthost_idx;
0654     struct flow_keys keys, host_keys;
0655     bool use_skbhash = skb->l4_hash;
0656 
0657     if (unlikely(flow_mode == CAKE_FLOW_NONE))
0658         return 0;
0659 
0660     /* If both overrides are set, or we can use the SKB hash and nat mode is
0661      * disabled, we can skip packet dissection entirely. If nat mode is
0662      * enabled there's another check below after doing the conntrack lookup.
0663      */
0664     if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
0665         goto skip_hash;
0666 
0667     skb_flow_dissect_flow_keys(skb, &keys,
0668                    FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
0669 
0670     /* Don't use the SKB hash if we change the lookup keys from conntrack */
0671     if (nat_enabled && cake_update_flowkeys(&keys, skb))
0672         use_skbhash = false;
0673 
0674     /* If we can still use the SKB hash and don't need the host hash, we can
0675      * skip the rest of the hashing procedure
0676      */
0677     if (use_skbhash && !hash_hosts)
0678         goto skip_hash;
0679 
0680     /* flow_hash_from_keys() sorts the addresses by value, so we have
0681      * to preserve their order in a separate data structure to treat
0682      * src and dst host addresses as independently selectable.
0683      */
0684     host_keys = keys;
0685     host_keys.ports.ports     = 0;
0686     host_keys.basic.ip_proto  = 0;
0687     host_keys.keyid.keyid     = 0;
0688     host_keys.tags.flow_label = 0;
0689 
0690     switch (host_keys.control.addr_type) {
0691     case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
0692         host_keys.addrs.v4addrs.src = 0;
0693         dsthost_hash = flow_hash_from_keys(&host_keys);
0694         host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
0695         host_keys.addrs.v4addrs.dst = 0;
0696         srchost_hash = flow_hash_from_keys(&host_keys);
0697         break;
0698 
0699     case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
0700         memset(&host_keys.addrs.v6addrs.src, 0,
0701                sizeof(host_keys.addrs.v6addrs.src));
0702         dsthost_hash = flow_hash_from_keys(&host_keys);
0703         host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
0704         memset(&host_keys.addrs.v6addrs.dst, 0,
0705                sizeof(host_keys.addrs.v6addrs.dst));
0706         srchost_hash = flow_hash_from_keys(&host_keys);
0707         break;
0708 
0709     default:
0710         dsthost_hash = 0;
0711         srchost_hash = 0;
0712     }
0713 
0714     /* This *must* be after the above switch, since as a
0715      * side-effect it sorts the src and dst addresses.
0716      */
0717     if (hash_flows && !use_skbhash)
0718         flow_hash = flow_hash_from_keys(&keys);
0719 
0720 skip_hash:
0721     if (flow_override)
0722         flow_hash = flow_override - 1;
0723     else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
0724         flow_hash = skb->hash;
0725     if (host_override) {
0726         dsthost_hash = host_override - 1;
0727         srchost_hash = host_override - 1;
0728     }
0729 
0730     if (!(flow_mode & CAKE_FLOW_FLOWS)) {
0731         if (flow_mode & CAKE_FLOW_SRC_IP)
0732             flow_hash ^= srchost_hash;
0733 
0734         if (flow_mode & CAKE_FLOW_DST_IP)
0735             flow_hash ^= dsthost_hash;
0736     }
0737 
0738     reduced_hash = flow_hash % CAKE_QUEUES;
0739 
0740     /* set-associative hashing */
0741     /* fast path if no hash collision (direct lookup succeeds) */
0742     if (likely(q->tags[reduced_hash] == flow_hash &&
0743            q->flows[reduced_hash].set)) {
0744         q->way_directs++;
0745     } else {
0746         u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
0747         u32 outer_hash = reduced_hash - inner_hash;
0748         bool allocate_src = false;
0749         bool allocate_dst = false;
0750         u32 i, k;
0751 
0752         /* check if any active queue in the set is reserved for
0753          * this flow.
0754          */
0755         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
0756              i++, k = (k + 1) % CAKE_SET_WAYS) {
0757             if (q->tags[outer_hash + k] == flow_hash) {
0758                 if (i)
0759                     q->way_hits++;
0760 
0761                 if (!q->flows[outer_hash + k].set) {
0762                     /* need to increment host refcnts */
0763                     allocate_src = cake_dsrc(flow_mode);
0764                     allocate_dst = cake_ddst(flow_mode);
0765                 }
0766 
0767                 goto found;
0768             }
0769         }
0770 
0771         /* no queue is reserved for this flow, look for an
0772          * empty one.
0773          */
0774         for (i = 0; i < CAKE_SET_WAYS;
0775              i++, k = (k + 1) % CAKE_SET_WAYS) {
0776             if (!q->flows[outer_hash + k].set) {
0777                 q->way_misses++;
0778                 allocate_src = cake_dsrc(flow_mode);
0779                 allocate_dst = cake_ddst(flow_mode);
0780                 goto found;
0781             }
0782         }
0783 
0784         /* With no empty queues, default to the original
0785          * queue, accept the collision, update the host tags.
0786          */
0787         q->way_collisions++;
0788         if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
0789             q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
0790             q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
0791         }
0792         allocate_src = cake_dsrc(flow_mode);
0793         allocate_dst = cake_ddst(flow_mode);
0794 found:
0795         /* reserve queue for future packets in same flow */
0796         reduced_hash = outer_hash + k;
0797         q->tags[reduced_hash] = flow_hash;
0798 
0799         if (allocate_src) {
0800             srchost_idx = srchost_hash % CAKE_QUEUES;
0801             inner_hash = srchost_idx % CAKE_SET_WAYS;
0802             outer_hash = srchost_idx - inner_hash;
0803             for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
0804                 i++, k = (k + 1) % CAKE_SET_WAYS) {
0805                 if (q->hosts[outer_hash + k].srchost_tag ==
0806                     srchost_hash)
0807                     goto found_src;
0808             }
0809             for (i = 0; i < CAKE_SET_WAYS;
0810                 i++, k = (k + 1) % CAKE_SET_WAYS) {
0811                 if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
0812                     break;
0813             }
0814             q->hosts[outer_hash + k].srchost_tag = srchost_hash;
0815 found_src:
0816             srchost_idx = outer_hash + k;
0817             if (q->flows[reduced_hash].set == CAKE_SET_BULK)
0818                 q->hosts[srchost_idx].srchost_bulk_flow_count++;
0819             q->flows[reduced_hash].srchost = srchost_idx;
0820         }
0821 
0822         if (allocate_dst) {
0823             dsthost_idx = dsthost_hash % CAKE_QUEUES;
0824             inner_hash = dsthost_idx % CAKE_SET_WAYS;
0825             outer_hash = dsthost_idx - inner_hash;
0826             for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
0827                  i++, k = (k + 1) % CAKE_SET_WAYS) {
0828                 if (q->hosts[outer_hash + k].dsthost_tag ==
0829                     dsthost_hash)
0830                     goto found_dst;
0831             }
0832             for (i = 0; i < CAKE_SET_WAYS;
0833                  i++, k = (k + 1) % CAKE_SET_WAYS) {
0834                 if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
0835                     break;
0836             }
0837             q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
0838 found_dst:
0839             dsthost_idx = outer_hash + k;
0840             if (q->flows[reduced_hash].set == CAKE_SET_BULK)
0841                 q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
0842             q->flows[reduced_hash].dsthost = dsthost_idx;
0843         }
0844     }
0845 
0846     return reduced_hash;
0847 }
0848 
0849 /* helper functions: might be changed when/if skb uses a standard list_head */
0850 /* remove one skb from head of slot queue */
0851 
0852 static struct sk_buff *dequeue_head(struct cake_flow *flow)
0853 {
0854     struct sk_buff *skb = flow->head;
0855 
0856     if (skb) {
0857         flow->head = skb->next;
0858         skb_mark_not_on_list(skb);
0859     }
0860 
0861     return skb;
0862 }
0863 
0864 /* add skb to flow queue (tail add) */
0865 
0866 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
0867 {
0868     if (!flow->head)
0869         flow->head = skb;
0870     else
0871         flow->tail->next = skb;
0872     flow->tail = skb;
0873     skb->next = NULL;
0874 }
0875 
0876 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
0877                     struct ipv6hdr *buf)
0878 {
0879     unsigned int offset = skb_network_offset(skb);
0880     struct iphdr *iph;
0881 
0882     iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
0883 
0884     if (!iph)
0885         return NULL;
0886 
0887     if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
0888         return skb_header_pointer(skb, offset + iph->ihl * 4,
0889                       sizeof(struct ipv6hdr), buf);
0890 
0891     else if (iph->version == 4)
0892         return iph;
0893 
0894     else if (iph->version == 6)
0895         return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
0896                       buf);
0897 
0898     return NULL;
0899 }
0900 
0901 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
0902                       void *buf, unsigned int bufsize)
0903 {
0904     unsigned int offset = skb_network_offset(skb);
0905     const struct ipv6hdr *ipv6h;
0906     const struct tcphdr *tcph;
0907     const struct iphdr *iph;
0908     struct ipv6hdr _ipv6h;
0909     struct tcphdr _tcph;
0910 
0911     ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
0912 
0913     if (!ipv6h)
0914         return NULL;
0915 
0916     if (ipv6h->version == 4) {
0917         iph = (struct iphdr *)ipv6h;
0918         offset += iph->ihl * 4;
0919 
0920         /* special-case 6in4 tunnelling, as that is a common way to get
0921          * v6 connectivity in the home
0922          */
0923         if (iph->protocol == IPPROTO_IPV6) {
0924             ipv6h = skb_header_pointer(skb, offset,
0925                            sizeof(_ipv6h), &_ipv6h);
0926 
0927             if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
0928                 return NULL;
0929 
0930             offset += sizeof(struct ipv6hdr);
0931 
0932         } else if (iph->protocol != IPPROTO_TCP) {
0933             return NULL;
0934         }
0935 
0936     } else if (ipv6h->version == 6) {
0937         if (ipv6h->nexthdr != IPPROTO_TCP)
0938             return NULL;
0939 
0940         offset += sizeof(struct ipv6hdr);
0941     } else {
0942         return NULL;
0943     }
0944 
0945     tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
0946     if (!tcph || tcph->doff < 5)
0947         return NULL;
0948 
0949     return skb_header_pointer(skb, offset,
0950                   min(__tcp_hdrlen(tcph), bufsize), buf);
0951 }
0952 
0953 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
0954                    int code, int *oplen)
0955 {
0956     /* inspired by tcp_parse_options in tcp_input.c */
0957     int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
0958     const u8 *ptr = (const u8 *)(tcph + 1);
0959 
0960     while (length > 0) {
0961         int opcode = *ptr++;
0962         int opsize;
0963 
0964         if (opcode == TCPOPT_EOL)
0965             break;
0966         if (opcode == TCPOPT_NOP) {
0967             length--;
0968             continue;
0969         }
0970         if (length < 2)
0971             break;
0972         opsize = *ptr++;
0973         if (opsize < 2 || opsize > length)
0974             break;
0975 
0976         if (opcode == code) {
0977             *oplen = opsize;
0978             return ptr;
0979         }
0980 
0981         ptr += opsize - 2;
0982         length -= opsize;
0983     }
0984 
0985     return NULL;
0986 }
0987 
0988 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
0989  * bytes than the other. In the case where both sequences ACK bytes that the
0990  * other doesn't, A is considered greater. DSACKs in A also make A be
0991  * considered greater.
0992  *
0993  * @return -1, 0 or 1 as normal compare functions
0994  */
0995 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
0996                   const struct tcphdr *tcph_b)
0997 {
0998     const struct tcp_sack_block_wire *sack_a, *sack_b;
0999     u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1000     u32 bytes_a = 0, bytes_b = 0;
1001     int oplen_a, oplen_b;
1002     bool first = true;
1003 
1004     sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1005     sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1006 
1007     /* pointers point to option contents */
1008     oplen_a -= TCPOLEN_SACK_BASE;
1009     oplen_b -= TCPOLEN_SACK_BASE;
1010 
1011     if (sack_a && oplen_a >= sizeof(*sack_a) &&
1012         (!sack_b || oplen_b < sizeof(*sack_b)))
1013         return -1;
1014     else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1015          (!sack_a || oplen_a < sizeof(*sack_a)))
1016         return 1;
1017     else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1018          (!sack_b || oplen_b < sizeof(*sack_b)))
1019         return 0;
1020 
1021     while (oplen_a >= sizeof(*sack_a)) {
1022         const struct tcp_sack_block_wire *sack_tmp = sack_b;
1023         u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1024         u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1025         int oplen_tmp = oplen_b;
1026         bool found = false;
1027 
1028         /* DSACK; always considered greater to prevent dropping */
1029         if (before(start_a, ack_seq_a))
1030             return -1;
1031 
1032         bytes_a += end_a - start_a;
1033 
1034         while (oplen_tmp >= sizeof(*sack_tmp)) {
1035             u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1036             u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1037 
1038             /* first time through we count the total size */
1039             if (first)
1040                 bytes_b += end_b - start_b;
1041 
1042             if (!after(start_b, start_a) && !before(end_b, end_a)) {
1043                 found = true;
1044                 if (!first)
1045                     break;
1046             }
1047             oplen_tmp -= sizeof(*sack_tmp);
1048             sack_tmp++;
1049         }
1050 
1051         if (!found)
1052             return -1;
1053 
1054         oplen_a -= sizeof(*sack_a);
1055         sack_a++;
1056         first = false;
1057     }
1058 
1059     /* If we made it this far, all ranges SACKed by A are covered by B, so
1060      * either the SACKs are equal, or B SACKs more bytes.
1061      */
1062     return bytes_b > bytes_a ? 1 : 0;
1063 }
1064 
1065 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1066                  u32 *tsval, u32 *tsecr)
1067 {
1068     const u8 *ptr;
1069     int opsize;
1070 
1071     ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1072 
1073     if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1074         *tsval = get_unaligned_be32(ptr);
1075         *tsecr = get_unaligned_be32(ptr + 4);
1076     }
1077 }
1078 
1079 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1080                    u32 tstamp_new, u32 tsecr_new)
1081 {
1082     /* inspired by tcp_parse_options in tcp_input.c */
1083     int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1084     const u8 *ptr = (const u8 *)(tcph + 1);
1085     u32 tstamp, tsecr;
1086 
1087     /* 3 reserved flags must be unset to avoid future breakage
1088      * ACK must be set
1089      * ECE/CWR are handled separately
1090      * All other flags URG/PSH/RST/SYN/FIN must be unset
1091      * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1092      * 0x00C00000 = CWR/ECE (handled separately)
1093      * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1094      */
1095     if (((tcp_flag_word(tcph) &
1096           cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1097         return false;
1098 
1099     while (length > 0) {
1100         int opcode = *ptr++;
1101         int opsize;
1102 
1103         if (opcode == TCPOPT_EOL)
1104             break;
1105         if (opcode == TCPOPT_NOP) {
1106             length--;
1107             continue;
1108         }
1109         if (length < 2)
1110             break;
1111         opsize = *ptr++;
1112         if (opsize < 2 || opsize > length)
1113             break;
1114 
1115         switch (opcode) {
1116         case TCPOPT_MD5SIG: /* doesn't influence state */
1117             break;
1118 
1119         case TCPOPT_SACK: /* stricter checking performed later */
1120             if (opsize % 8 != 2)
1121                 return false;
1122             break;
1123 
1124         case TCPOPT_TIMESTAMP:
1125             /* only drop timestamps lower than new */
1126             if (opsize != TCPOLEN_TIMESTAMP)
1127                 return false;
1128             tstamp = get_unaligned_be32(ptr);
1129             tsecr = get_unaligned_be32(ptr + 4);
1130             if (after(tstamp, tstamp_new) ||
1131                 after(tsecr, tsecr_new))
1132                 return false;
1133             break;
1134 
1135         case TCPOPT_MSS:  /* these should only be set on SYN */
1136         case TCPOPT_WINDOW:
1137         case TCPOPT_SACK_PERM:
1138         case TCPOPT_FASTOPEN:
1139         case TCPOPT_EXP:
1140         default: /* don't drop if any unknown options are present */
1141             return false;
1142         }
1143 
1144         ptr += opsize - 2;
1145         length -= opsize;
1146     }
1147 
1148     return true;
1149 }
1150 
1151 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1152                        struct cake_flow *flow)
1153 {
1154     bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1155     struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1156     struct sk_buff *skb_check, *skb_prev = NULL;
1157     const struct ipv6hdr *ipv6h, *ipv6h_check;
1158     unsigned char _tcph[64], _tcph_check[64];
1159     const struct tcphdr *tcph, *tcph_check;
1160     const struct iphdr *iph, *iph_check;
1161     struct ipv6hdr _iph, _iph_check;
1162     const struct sk_buff *skb;
1163     int seglen, num_found = 0;
1164     u32 tstamp = 0, tsecr = 0;
1165     __be32 elig_flags = 0;
1166     int sack_comp;
1167 
1168     /* no other possible ACKs to filter */
1169     if (flow->head == flow->tail)
1170         return NULL;
1171 
1172     skb = flow->tail;
1173     tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1174     iph = cake_get_iphdr(skb, &_iph);
1175     if (!tcph)
1176         return NULL;
1177 
1178     cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1179 
1180     /* the 'triggering' packet need only have the ACK flag set.
1181      * also check that SYN is not set, as there won't be any previous ACKs.
1182      */
1183     if ((tcp_flag_word(tcph) &
1184          (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1185         return NULL;
1186 
1187     /* the 'triggering' ACK is at the tail of the queue, we have already
1188      * returned if it is the only packet in the flow. loop through the rest
1189      * of the queue looking for pure ACKs with the same 5-tuple as the
1190      * triggering one.
1191      */
1192     for (skb_check = flow->head;
1193          skb_check && skb_check != skb;
1194          skb_prev = skb_check, skb_check = skb_check->next) {
1195         iph_check = cake_get_iphdr(skb_check, &_iph_check);
1196         tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1197                          sizeof(_tcph_check));
1198 
1199         /* only TCP packets with matching 5-tuple are eligible, and only
1200          * drop safe headers
1201          */
1202         if (!tcph_check || iph->version != iph_check->version ||
1203             tcph_check->source != tcph->source ||
1204             tcph_check->dest != tcph->dest)
1205             continue;
1206 
1207         if (iph_check->version == 4) {
1208             if (iph_check->saddr != iph->saddr ||
1209                 iph_check->daddr != iph->daddr)
1210                 continue;
1211 
1212             seglen = ntohs(iph_check->tot_len) -
1213                        (4 * iph_check->ihl);
1214         } else if (iph_check->version == 6) {
1215             ipv6h = (struct ipv6hdr *)iph;
1216             ipv6h_check = (struct ipv6hdr *)iph_check;
1217 
1218             if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1219                 ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1220                 continue;
1221 
1222             seglen = ntohs(ipv6h_check->payload_len);
1223         } else {
1224             WARN_ON(1);  /* shouldn't happen */
1225             continue;
1226         }
1227 
1228         /* If the ECE/CWR flags changed from the previous eligible
1229          * packet in the same flow, we should no longer be dropping that
1230          * previous packet as this would lose information.
1231          */
1232         if (elig_ack && (tcp_flag_word(tcph_check) &
1233                  (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1234             elig_ack = NULL;
1235             elig_ack_prev = NULL;
1236             num_found--;
1237         }
1238 
1239         /* Check TCP options and flags, don't drop ACKs with segment
1240          * data, and don't drop ACKs with a higher cumulative ACK
1241          * counter than the triggering packet. Check ACK seqno here to
1242          * avoid parsing SACK options of packets we are going to exclude
1243          * anyway.
1244          */
1245         if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1246             (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1247             after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1248             continue;
1249 
1250         /* Check SACK options. The triggering packet must SACK more data
1251          * than the ACK under consideration, or SACK the same range but
1252          * have a larger cumulative ACK counter. The latter is a
1253          * pathological case, but is contained in the following check
1254          * anyway, just to be safe.
1255          */
1256         sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1257 
1258         if (sack_comp < 0 ||
1259             (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1260              sack_comp == 0))
1261             continue;
1262 
1263         /* At this point we have found an eligible pure ACK to drop; if
1264          * we are in aggressive mode, we are done. Otherwise, keep
1265          * searching unless this is the second eligible ACK we
1266          * found.
1267          *
1268          * Since we want to drop ACK closest to the head of the queue,
1269          * save the first eligible ACK we find, even if we need to loop
1270          * again.
1271          */
1272         if (!elig_ack) {
1273             elig_ack = skb_check;
1274             elig_ack_prev = skb_prev;
1275             elig_flags = (tcp_flag_word(tcph_check)
1276                       & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1277         }
1278 
1279         if (num_found++ > 0)
1280             goto found;
1281     }
1282 
1283     /* We made it through the queue without finding two eligible ACKs. If
1284      * we found a single eligible ACK we can drop it in aggressive mode if
1285      * we can guarantee that this does not interfere with ECN flag
1286      * information. We ensure this by dropping it only if the enqueued
1287      * packet is consecutive with the eligible ACK, and their flags match.
1288      */
1289     if (elig_ack && aggressive && elig_ack->next == skb &&
1290         (elig_flags == (tcp_flag_word(tcph) &
1291                 (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1292         goto found;
1293 
1294     return NULL;
1295 
1296 found:
1297     if (elig_ack_prev)
1298         elig_ack_prev->next = elig_ack->next;
1299     else
1300         flow->head = elig_ack->next;
1301 
1302     skb_mark_not_on_list(elig_ack);
1303 
1304     return elig_ack;
1305 }
1306 
1307 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1308 {
1309     avg -= avg >> shift;
1310     avg += sample >> shift;
1311     return avg;
1312 }
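
/* Exponentially weighted moving average with weight 1/2^shift:
 * avg' = avg - avg/2^shift + sample/2^shift.  Since the sample is shifted
 * before being added, callers pre-scale small inputs (e.g. avg_netoff is
 * kept as the network offset << 16) so the fractional part isn't lost.
 */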
1313 
1314 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1315 {
1316     if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1317         len -= off;
1318 
1319     if (q->max_netlen < len)
1320         q->max_netlen = len;
1321     if (q->min_netlen > len)
1322         q->min_netlen = len;
1323 
1324     len += q->rate_overhead;
1325 
1326     if (len < q->rate_mpu)
1327         len = q->rate_mpu;
1328 
1329     if (q->atm_mode == CAKE_ATM_ATM) {
1330         len += 47;
1331         len /= 48;
1332         len *= 53;
1333     } else if (q->atm_mode == CAKE_ATM_PTM) {
1334         /* Add one byte per 64 bytes or part thereof.
1335          * This is conservative and easier to calculate than the
1336          * precise value.
1337          */
1338         len += (len + 63) / 64;
1339     }
1340 
1341     if (q->max_adjlen < len)
1342         q->max_adjlen = len;
1343     if (q->min_adjlen > len)
1344         q->min_adjlen = len;
1345 
1346     return len;
1347 }
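
/* Worked example: an adjusted length of 100 bytes becomes, in ATM mode,
 * (100 + 47) / 48 = 3 cells * 53 bytes = 159 bytes on the wire; in PTM mode
 * it becomes 100 + ceil(100 / 64) = 102 bytes, a conservative approximation
 * of 64b/65b encoding.
 */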
1348 
1349 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1350 {
1351     const struct skb_shared_info *shinfo = skb_shinfo(skb);
1352     unsigned int hdr_len, last_len = 0;
1353     u32 off = skb_network_offset(skb);
1354     u32 len = qdisc_pkt_len(skb);
1355     u16 segs = 1;
1356 
1357     q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1358 
1359     if (!shinfo->gso_size)
1360         return cake_calc_overhead(q, len, off);
1361 
1362     /* borrowed from qdisc_pkt_len_init() */
1363     hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1364 
1365     /* + transport layer */
1366     if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1367                         SKB_GSO_TCPV6))) {
1368         const struct tcphdr *th;
1369         struct tcphdr _tcphdr;
1370 
1371         th = skb_header_pointer(skb, skb_transport_offset(skb),
1372                     sizeof(_tcphdr), &_tcphdr);
1373         if (likely(th))
1374             hdr_len += __tcp_hdrlen(th);
1375     } else {
1376         struct udphdr _udphdr;
1377 
1378         if (skb_header_pointer(skb, skb_transport_offset(skb),
1379                        sizeof(_udphdr), &_udphdr))
1380             hdr_len += sizeof(struct udphdr);
1381     }
1382 
1383     if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1384         segs = DIV_ROUND_UP(skb->len - hdr_len,
1385                     shinfo->gso_size);
1386     else
1387         segs = shinfo->gso_segs;
1388 
1389     len = shinfo->gso_size + hdr_len;
1390     last_len = skb->len - shinfo->gso_size * (segs - 1);
1391 
1392     return (cake_calc_overhead(q, len, off) * (segs - 1) +
1393         cake_calc_overhead(q, last_len, off));
1394 }
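
/* A GSO super-packet is charged as (segs - 1) segments of gso_size + hdr_len
 * bytes plus one final segment carrying the remainder, each with per-packet
 * overhead applied, so the shaper accounts for what will actually appear on
 * the wire after segmentation.
 */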
1395 
1396 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1397 {
1398     struct cake_heap_entry ii = q->overflow_heap[i];
1399     struct cake_heap_entry jj = q->overflow_heap[j];
1400 
1401     q->overflow_heap[i] = jj;
1402     q->overflow_heap[j] = ii;
1403 
1404     q->tins[ii.t].overflow_idx[ii.b] = j;
1405     q->tins[jj.t].overflow_idx[jj.b] = i;
1406 }
1407 
1408 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1409 {
1410     struct cake_heap_entry ii = q->overflow_heap[i];
1411 
1412     return q->tins[ii.t].backlogs[ii.b];
1413 }
1414 
1415 static void cake_heapify(struct cake_sched_data *q, u16 i)
1416 {
1417     static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1418     u32 mb = cake_heap_get_backlog(q, i);
1419     u32 m = i;
1420 
1421     while (m < a) {
1422         u32 l = m + m + 1;
1423         u32 r = l + 1;
1424 
1425         if (l < a) {
1426             u32 lb = cake_heap_get_backlog(q, l);
1427 
1428             if (lb > mb) {
1429                 m  = l;
1430                 mb = lb;
1431             }
1432         }
1433 
1434         if (r < a) {
1435             u32 rb = cake_heap_get_backlog(q, r);
1436 
1437             if (rb > mb) {
1438                 m  = r;
1439                 mb = rb;
1440             }
1441         }
1442 
1443         if (m != i) {
1444             cake_heap_swap(q, i, m);
1445             i = m;
1446         } else {
1447             break;
1448         }
1449     }
1450 }
1451 
1452 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1453 {
1454     while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1455         u16 p = (i - 1) >> 1;
1456         u32 ib = cake_heap_get_backlog(q, i);
1457         u32 pb = cake_heap_get_backlog(q, p);
1458 
1459         if (ib > pb) {
1460             cake_heap_swap(q, i, p);
1461             i = p;
1462         } else {
1463             break;
1464         }
1465     }
1466 }
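
/* The overflow heap is an implicit binary max-heap over every (tin, queue)
 * pair, keyed on queue backlog; cake_drop() below uses its root to find the
 * longest queue in the whole qdisc when the buffer limit is exceeded.
 */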
1467 
1468 static int cake_advance_shaper(struct cake_sched_data *q,
1469                    struct cake_tin_data *b,
1470                    struct sk_buff *skb,
1471                    ktime_t now, bool drop)
1472 {
1473     u32 len = get_cobalt_cb(skb)->adjusted_len;
1474 
1475     /* charge packet bandwidth to this tin
1476      * and to the global shaper.
1477      */
1478     if (q->rate_ns) {
1479         u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1480         u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1481         u64 failsafe_dur = global_dur + (global_dur >> 1);
1482 
1483         if (ktime_before(b->time_next_packet, now))
1484             b->time_next_packet = ktime_add_ns(b->time_next_packet,
1485                                tin_dur);
1486 
1487         else if (ktime_before(b->time_next_packet,
1488                       ktime_add_ns(now, tin_dur)))
1489             b->time_next_packet = ktime_add_ns(now, tin_dur);
1490 
1491         q->time_next_packet = ktime_add_ns(q->time_next_packet,
1492                            global_dur);
1493         if (!drop)
1494             q->failsafe_next_packet = \
1495                 ktime_add_ns(q->failsafe_next_packet,
1496                          failsafe_dur);
1497     }
1498     return len;
1499 }
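
/* The shaped serialisation time of a packet is
 * (adjusted_len * rate_ns) >> rate_shft nanoseconds, charged both to the tin
 * and to the global shaper; for packets that are actually delivered the
 * failsafe deadline also advances, by 1.5x the global duration, and acts as
 * a backstop for the dequeue path's watchdog.
 */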
1500 
1501 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1502 {
1503     struct cake_sched_data *q = qdisc_priv(sch);
1504     ktime_t now = ktime_get();
1505     u32 idx = 0, tin = 0, len;
1506     struct cake_heap_entry qq;
1507     struct cake_tin_data *b;
1508     struct cake_flow *flow;
1509     struct sk_buff *skb;
1510 
1511     if (!q->overflow_timeout) {
1512         int i;
1513         /* Build fresh max-heap */
1514         for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1515             cake_heapify(q, i);
1516     }
1517     q->overflow_timeout = 65535;
1518 
1519     /* select longest queue for pruning */
1520     qq  = q->overflow_heap[0];
1521     tin = qq.t;
1522     idx = qq.b;
1523 
1524     b = &q->tins[tin];
1525     flow = &b->flows[idx];
1526     skb = dequeue_head(flow);
1527     if (unlikely(!skb)) {
1528         /* heap has gone wrong, rebuild it next time */
1529         q->overflow_timeout = 0;
1530         return idx + (tin << 16);
1531     }
1532 
1533     if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1534         b->unresponsive_flow_count++;
1535 
1536     len = qdisc_pkt_len(skb);
1537     q->buffer_used      -= skb->truesize;
1538     b->backlogs[idx]    -= len;
1539     b->tin_backlog      -= len;
1540     sch->qstats.backlog -= len;
1541     qdisc_tree_reduce_backlog(sch, 1, len);
1542 
1543     flow->dropped++;
1544     b->tin_dropped++;
1545     sch->qstats.drops++;
1546 
1547     if (q->rate_flags & CAKE_FLAG_INGRESS)
1548         cake_advance_shaper(q, b, skb, now, true);
1549 
1550     __qdisc_drop(skb, to_free);
1551     sch->q.qlen--;
1552 
1553     cake_heapify(q, 0);
1554 
1555     return idx + (tin << 16);
1556 }
1557 
1558 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1559 {
1560     const int offset = skb_network_offset(skb);
1561     u16 *buf, buf_;
1562     u8 dscp;
1563 
1564     switch (skb_protocol(skb, true)) {
1565     case htons(ETH_P_IP):
1566         buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1567         if (unlikely(!buf))
1568             return 0;
1569 
1570         /* ToS is in the second byte of iphdr */
1571         dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1572 
1573         if (wash && dscp) {
1574             const int wlen = offset + sizeof(struct iphdr);
1575 
1576             if (!pskb_may_pull(skb, wlen) ||
1577                 skb_try_make_writable(skb, wlen))
1578                 return 0;
1579 
1580             ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1581         }
1582 
1583         return dscp;
1584 
1585     case htons(ETH_P_IPV6):
1586         buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1587         if (unlikely(!buf))
1588             return 0;
1589 
1590         /* Traffic class is in the first and second bytes of ipv6hdr */
1591         dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1592 
1593         if (wash && dscp) {
1594             const int wlen = offset + sizeof(struct ipv6hdr);
1595 
1596             if (!pskb_may_pull(skb, wlen) ||
1597                 skb_try_make_writable(skb, wlen))
1598                 return 0;
1599 
1600             ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1601         }
1602 
1603         return dscp;
1604 
1605     case htons(ETH_P_ARP):
1606         return 0x38;  /* CS7 - Net Control */
1607 
1608     default:
1609         /* If there is no Diffserv field, treat as best-effort */
1610         return 0;
1611     }
1612 }
1613 
1614 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1615                          struct sk_buff *skb)
1616 {
1617     struct cake_sched_data *q = qdisc_priv(sch);
1618     u32 tin, mark;
1619     bool wash;
1620     u8 dscp;
1621 
1622     /* Tin selection: default to diffserv-based selection, allowing overrides
1623      * via firewall marks or skb->priority.  Parse DSCP early when washing is
1624      * enabled (so it is always rewritten); otherwise defer it until needed.
1625      */
1626     mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1627     wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1628     if (wash)
1629         dscp = cake_handle_diffserv(skb, wash);
1630 
1631     if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1632         tin = 0;
1633 
1634     else if (mark && mark <= q->tin_cnt)
1635         tin = q->tin_order[mark - 1];
1636 
1637     else if (TC_H_MAJ(skb->priority) == sch->handle &&
1638          TC_H_MIN(skb->priority) > 0 &&
1639          TC_H_MIN(skb->priority) <= q->tin_cnt)
1640         tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1641 
1642     else {
1643         if (!wash)
1644             dscp = cake_handle_diffserv(skb, wash);
1645         tin = q->tin_index[dscp];
1646 
1647         if (unlikely(tin >= q->tin_cnt))
1648             tin = 0;
1649     }
1650 
1651     return &q->tins[tin];
1652 }
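     /* Precedence implemented above: an explicit besteffort mode wins, then a
      * firewall mark selected by fwmark_mask, then a matching skb->priority
      * minor class id, and only then the DSCP-to-tin lookup table; a table
      * result outside the configured tin count falls back to tin 0.
      */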
1653 
1654 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1655              struct sk_buff *skb, int flow_mode, int *qerr)
1656 {
1657     struct cake_sched_data *q = qdisc_priv(sch);
1658     struct tcf_proto *filter;
1659     struct tcf_result res;
1660     u16 flow = 0, host = 0;
1661     int result;
1662 
1663     filter = rcu_dereference_bh(q->filter_list);
1664     if (!filter)
1665         goto hash;
1666 
1667     *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1668     result = tcf_classify(skb, NULL, filter, &res, false);
1669 
1670     if (result >= 0) {
1671 #ifdef CONFIG_NET_CLS_ACT
1672         switch (result) {
1673         case TC_ACT_STOLEN:
1674         case TC_ACT_QUEUED:
1675         case TC_ACT_TRAP:
1676             *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1677             fallthrough;
1678         case TC_ACT_SHOT:
1679             return 0;
1680         }
1681 #endif
1682         if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1683             flow = TC_H_MIN(res.classid);
1684         if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1685             host = TC_H_MAJ(res.classid) >> 16;
1686     }
1687 hash:
1688     *t = cake_select_tin(sch, skb);
1689     return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1690 }
1691 
1692 static void cake_reconfigure(struct Qdisc *sch);
1693 
1694 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1695             struct sk_buff **to_free)
1696 {
1697     struct cake_sched_data *q = qdisc_priv(sch);
1698     int len = qdisc_pkt_len(skb);
1699     int ret;
1700     struct sk_buff *ack = NULL;
1701     ktime_t now = ktime_get();
1702     struct cake_tin_data *b;
1703     struct cake_flow *flow;
1704     u32 idx;
1705 
1706     /* choose flow to insert into */
1707     idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1708     if (idx == 0) {
1709         if (ret & __NET_XMIT_BYPASS)
1710             qdisc_qstats_drop(sch);
1711         __qdisc_drop(skb, to_free);
1712         return ret;
1713     }
1714     idx--;
1715     flow = &b->flows[idx];
1716 
1717     /* ensure shaper state isn't stale */
1718     if (!b->tin_backlog) {
1719         if (ktime_before(b->time_next_packet, now))
1720             b->time_next_packet = now;
1721 
1722         if (!sch->q.qlen) {
1723             if (ktime_before(q->time_next_packet, now)) {
1724                 q->failsafe_next_packet = now;
1725                 q->time_next_packet = now;
1726             } else if (ktime_after(q->time_next_packet, now) &&
1727                    ktime_after(q->failsafe_next_packet, now)) {
1728                 u64 next = \
1729                     min(ktime_to_ns(q->time_next_packet),
1730                         ktime_to_ns(
1731                            q->failsafe_next_packet));
1732                 sch->qstats.overlimits++;
1733                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1734             }
1735         }
1736     }
1737 
1738     if (unlikely(len > b->max_skblen))
1739         b->max_skblen = len;
1740 
1741     if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1742         struct sk_buff *segs, *nskb;
1743         netdev_features_t features = netif_skb_features(skb);
1744         unsigned int slen = 0, numsegs = 0;
1745 
1746         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1747         if (IS_ERR_OR_NULL(segs))
1748             return qdisc_drop(skb, sch, to_free);
1749 
1750         skb_list_walk_safe(segs, segs, nskb) {
1751             skb_mark_not_on_list(segs);
1752             qdisc_skb_cb(segs)->pkt_len = segs->len;
1753             cobalt_set_enqueue_time(segs, now);
1754             get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1755                                       segs);
1756             flow_queue_add(flow, segs);
1757 
1758             sch->q.qlen++;
1759             numsegs++;
1760             slen += segs->len;
1761             q->buffer_used += segs->truesize;
1762             b->packets++;
1763         }
1764 
1765         /* stats */
1766         b->bytes        += slen;
1767         b->backlogs[idx]    += slen;
1768         b->tin_backlog      += slen;
1769         sch->qstats.backlog += slen;
1770         q->avg_window_bytes += slen;
1771 
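             /* One skb became numsegs segments totalling slen bytes; passing
              * negative deltas to qdisc_tree_reduce_backlog() tells the
              * parent qdiscs to *increase* their packet and byte counts by
              * the difference.
              */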
1772         qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1773         consume_skb(skb);
1774     } else {
1775         /* not splitting */
1776         cobalt_set_enqueue_time(skb, now);
1777         get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1778         flow_queue_add(flow, skb);
1779 
1780         if (q->ack_filter)
1781             ack = cake_ack_filter(q, flow);
1782 
1783         if (ack) {
1784             b->ack_drops++;
1785             sch->qstats.drops++;
1786             b->bytes += qdisc_pkt_len(ack);
1787             len -= qdisc_pkt_len(ack);
1788             q->buffer_used += skb->truesize - ack->truesize;
1789             if (q->rate_flags & CAKE_FLAG_INGRESS)
1790                 cake_advance_shaper(q, b, ack, now, true);
1791 
1792             qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1793             consume_skb(ack);
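                 /* len is reduced by the filtered ACK's size so the backlog
                  * accounting below records only the net change from
                  * replacing the old ACK with this packet.
                  */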
1794         } else {
1795             sch->q.qlen++;
1796             q->buffer_used      += skb->truesize;
1797         }
1798 
1799         /* stats */
1800         b->packets++;
1801         b->bytes        += len;
1802         b->backlogs[idx]    += len;
1803         b->tin_backlog      += len;
1804         sch->qstats.backlog += len;
1805         q->avg_window_bytes += len;
1806     }
1807 
1808     if (q->overflow_timeout)
1809         cake_heapify_up(q, b->overflow_idx[idx]);
1810 
1811     /* incoming bandwidth capacity estimate */
1812     if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1813         u64 packet_interval = \
1814             ktime_to_ns(ktime_sub(now, q->last_packet_time));
1815 
1816         if (packet_interval > NSEC_PER_SEC)
1817             packet_interval = NSEC_PER_SEC;
1818 
1819         /* filter out short-term bursts, eg. wifi aggregation */
1820         q->avg_packet_interval = \
1821             cake_ewma(q->avg_packet_interval,
1822                   packet_interval,
1823                   (packet_interval > q->avg_packet_interval ?
1824                       2 : 8));
1825 
1826         q->last_packet_time = now;
1827 
1828         if (packet_interval > q->avg_packet_interval) {
1829             u64 window_interval = \
1830                 ktime_to_ns(ktime_sub(now,
1831                               q->avg_window_begin));
1832             u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1833 
1834             b = div64_u64(b, window_interval);
1835             q->avg_peak_bandwidth =
1836                 cake_ewma(q->avg_peak_bandwidth, b,
1837                       b > q->avg_peak_bandwidth ? 2 : 8);
1838             q->avg_window_bytes = 0;
1839             q->avg_window_begin = now;
1840 
1841             if (ktime_after(now,
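                 /* At most once every 250ms, set the shaper rate to 15/16
                  * of the peak estimate so that this qdisc stays just below
                  * the measured capacity and remains the bottleneck.
                  */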
1842                     ktime_add_ms(q->last_reconfig_time,
1843                              250))) {
1844                 q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1845                 cake_reconfigure(sch);
1846             }
1847         }
1848     } else {
1849         q->avg_window_bytes = 0;
1850         q->last_packet_time = now;
1851     }
1852 
1853     /* flowchain */
1854     if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1855         struct cake_host *srchost = &b->hosts[flow->srchost];
1856         struct cake_host *dsthost = &b->hosts[flow->dsthost];
1857         u16 host_load = 1;
1858 
1859         if (!flow->set) {
1860             list_add_tail(&flow->flowchain, &b->new_flows);
1861         } else {
1862             b->decaying_flow_count--;
1863             list_move_tail(&flow->flowchain, &b->new_flows);
1864         }
1865         flow->set = CAKE_SET_SPARSE;
1866         b->sparse_flow_count++;
1867 
1868         if (cake_dsrc(q->flow_mode))
1869             host_load = max(host_load, srchost->srchost_bulk_flow_count);
1870 
1871         if (cake_ddst(q->flow_mode))
1872             host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1873 
1874         flow->deficit = (b->flow_quantum *
1875                  quantum_div[host_load]) >> 16;
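             /* quantum_div[n] is roughly 65536/n, so this is
              * flow_quantum / host_load in 16.16 fixed point: a host that
              * already has n bulk flows starts each new sparse flow with
              * about 1/n of the normal quantum.
              */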
1876     } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1877         struct cake_host *srchost = &b->hosts[flow->srchost];
1878         struct cake_host *dsthost = &b->hosts[flow->dsthost];
1879 
1880         /* this flow was empty, accounted as a sparse flow, but actually
1881          * in the bulk rotation.
1882          */
1883         flow->set = CAKE_SET_BULK;
1884         b->sparse_flow_count--;
1885         b->bulk_flow_count++;
1886 
1887         if (cake_dsrc(q->flow_mode))
1888             srchost->srchost_bulk_flow_count++;
1889 
1890         if (cake_ddst(q->flow_mode))
1891             dsthost->dsthost_bulk_flow_count++;
1892 
1893     }
1894 
1895     if (q->buffer_used > q->buffer_max_used)
1896         q->buffer_max_used = q->buffer_used;
1897 
1898     if (q->buffer_used > q->buffer_limit) {
1899         u32 dropped = 0;
1900 
1901         while (q->buffer_used > q->buffer_limit) {
1902             dropped++;
1903             cake_drop(sch, to_free);
1904         }
1905         b->drop_overlimit += dropped;
1906     }
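         /* cake_drop() prunes from the longest queue in the whole qdisc
          * (via the overflow heap), which may belong to a different tin or
          * flow than the packet just enqueued; only the overlimit count is
          * charged to this tin.
          */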
1907     return NET_XMIT_SUCCESS;
1908 }
1909 
1910 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1911 {
1912     struct cake_sched_data *q = qdisc_priv(sch);
1913     struct cake_tin_data *b = &q->tins[q->cur_tin];
1914     struct cake_flow *flow = &b->flows[q->cur_flow];
1915     struct sk_buff *skb = NULL;
1916     u32 len;
1917 
1918     if (flow->head) {
1919         skb = dequeue_head(flow);
1920         len = qdisc_pkt_len(skb);
1921         b->backlogs[q->cur_flow] -= len;
1922         b->tin_backlog       -= len;
1923         sch->qstats.backlog      -= len;
1924         q->buffer_used       -= skb->truesize;
1925         sch->q.qlen--;
1926 
1927         if (q->overflow_timeout)
1928             cake_heapify(q, b->overflow_idx[q->cur_flow]);
1929     }
1930     return skb;
1931 }
1932 
1933 /* Discard leftover packets from a tin no longer in use. */
1934 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1935 {
1936     struct cake_sched_data *q = qdisc_priv(sch);
1937     struct sk_buff *skb;
1938 
1939     q->cur_tin = tin;
1940     for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1941         while (!!(skb = cake_dequeue_one(sch)))
1942             kfree_skb(skb);
1943 }
1944 
1945 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1946 {
1947     struct cake_sched_data *q = qdisc_priv(sch);
1948     struct cake_tin_data *b = &q->tins[q->cur_tin];
1949     struct cake_host *srchost, *dsthost;
1950     ktime_t now = ktime_get();
1951     struct cake_flow *flow;
1952     struct list_head *head;
1953     bool first_flow = true;
1954     struct sk_buff *skb;
1955     u16 host_load;
1956     u64 delay;
1957     u32 len;
1958 
1959 begin:
1960     if (!sch->q.qlen)
1961         return NULL;
1962 
1963     /* global hard shaper */
1964     if (ktime_after(q->time_next_packet, now) &&
1965         ktime_after(q->failsafe_next_packet, now)) {
1966         u64 next = min(ktime_to_ns(q->time_next_packet),
1967                    ktime_to_ns(q->failsafe_next_packet));
1968 
1969         sch->qstats.overlimits++;
1970         qdisc_watchdog_schedule_ns(&q->watchdog, next);
1971         return NULL;
1972     }
1973 
1974     /* Choose a class to work on. */
1975     if (!q->rate_ns) {
1976         /* In unlimited mode, can't rely on shaper timings, just balance
1977          * with DRR
1978          */
1979         bool wrapped = false, empty = true;
1980 
1981         while (b->tin_deficit < 0 ||
1982                !(b->sparse_flow_count + b->bulk_flow_count)) {
1983             if (b->tin_deficit <= 0)
1984                 b->tin_deficit += b->tin_quantum;
1985             if (b->sparse_flow_count + b->bulk_flow_count)
1986                 empty = false;
1987 
1988             q->cur_tin++;
1989             b++;
1990             if (q->cur_tin >= q->tin_cnt) {
1991                 q->cur_tin = 0;
1992                 b = q->tins;
1993 
1994                 if (wrapped) {
1995                     /* It's possible for q->qlen to be
1996                      * nonzero when we actually have no
1997                      * packets anywhere.
1998                      */
1999                     if (empty)
2000                         return NULL;
2001                 } else {
2002                     wrapped = true;
2003                 }
2004             }
2005         }
2006     } else {
2007         /* In shaped mode, choose:
2008          * - Highest-priority tin with queue and meeting schedule, or
2009          * - The earliest-scheduled tin with queue.
2010          */
2011         ktime_t best_time = KTIME_MAX;
2012         int tin, best_tin = 0;
2013 
2014         for (tin = 0; tin < q->tin_cnt; tin++) {
2015             b = q->tins + tin;
2016             if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2017                 ktime_t time_to_pkt = \
2018                     ktime_sub(b->time_next_packet, now);
2019 
2020                 if (ktime_to_ns(time_to_pkt) <= 0 ||
2021                     ktime_compare(time_to_pkt,
2022                           best_time) <= 0) {
2023                     best_time = time_to_pkt;
2024                     best_tin = tin;
2025                 }
2026             }
2027         }
2028 
2029         q->cur_tin = best_tin;
2030         b = q->tins + best_tin;
2031 
2032         /* No point in going further if no packets to deliver. */
2033         if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2034             return NULL;
2035     }
2036 
2037 retry:
2038     /* service this class */
2039     head = &b->decaying_flows;
2040     if (!first_flow || list_empty(head)) {
2041         head = &b->new_flows;
2042         if (list_empty(head)) {
2043             head = &b->old_flows;
2044             if (unlikely(list_empty(head))) {
2045                 head = &b->decaying_flows;
2046                 if (unlikely(list_empty(head)))
2047                     goto begin;
2048             }
2049         }
2050     }
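         /* Rotation order: decaying flows get one look on the first pass
          * (so their COBALT state can decay and empty queues be retired),
          * then sparse (new) flows are served ahead of bulk (old) flows,
          * much like fq_codel's new/old flow lists.
          */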
2051     flow = list_first_entry(head, struct cake_flow, flowchain);
2052     q->cur_flow = flow - b->flows;
2053     first_flow = false;
2054 
2055     /* triple isolation (modified DRR++) */
2056     srchost = &b->hosts[flow->srchost];
2057     dsthost = &b->hosts[flow->dsthost];
2058     host_load = 1;
2059 
2060     /* flow isolation (DRR++) */
2061     if (flow->deficit <= 0) {
2062         /* Keep any flow that has exhausted its deficit out of the sparse
2063          * and decaying rotations.  No non-empty flow can enter the decaying
2064          * rotation, so flows there never owe a deficit.
2065          */
2066         if (flow->set == CAKE_SET_SPARSE) {
2067             if (flow->head) {
2068                 b->sparse_flow_count--;
2069                 b->bulk_flow_count++;
2070 
2071                 if (cake_dsrc(q->flow_mode))
2072                     srchost->srchost_bulk_flow_count++;
2073 
2074                 if (cake_ddst(q->flow_mode))
2075                     dsthost->dsthost_bulk_flow_count++;
2076 
2077                 flow->set = CAKE_SET_BULK;
2078             } else {
2079                 /* we've moved it to the bulk rotation for
2080                  * correct deficit accounting but we still want
2081                  * to count it as a sparse flow, not a bulk one.
2082                  */
2083                 flow->set = CAKE_SET_SPARSE_WAIT;
2084             }
2085         }
2086 
2087         if (cake_dsrc(q->flow_mode))
2088             host_load = max(host_load, srchost->srchost_bulk_flow_count);
2089 
2090         if (cake_ddst(q->flow_mode))
2091             host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2092 
2093         WARN_ON(host_load > CAKE_QUEUES);
2094 
2095         /* The shifted prandom_u32() is a way to apply dithering to
2096          * avoid accumulating roundoff errors
2097          */
2098         flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2099                   (prandom_u32() >> 16)) >> 16;
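             /* Adding a uniform value in [0, 2^16) before the shift makes
              * the truncation round up with probability equal to the
              * discarded fraction, so the refill averages out to
              * flow_quantum * quantum_div[host_load] / 2^16 over time.
              */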
2100         list_move_tail(&flow->flowchain, &b->old_flows);
2101 
2102         goto retry;
2103     }
2104 
2105     /* Retrieve a packet via the AQM */
2106     while (1) {
2107         skb = cake_dequeue_one(sch);
2108         if (!skb) {
2109             /* this queue was actually empty */
2110             if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2111                 b->unresponsive_flow_count--;
2112 
2113             if (flow->cvars.p_drop || flow->cvars.count ||
2114                 ktime_before(now, flow->cvars.drop_next)) {
2115                 /* keep in the flowchain until the state has
2116                  * decayed to rest
2117                  */
2118                 list_move_tail(&flow->flowchain,
2119                            &b->decaying_flows);
2120                 if (flow->set == CAKE_SET_BULK) {
2121                     b->bulk_flow_count--;
2122 
2123                     if (cake_dsrc(q->flow_mode))
2124                         srchost->srchost_bulk_flow_count--;
2125 
2126                     if (cake_ddst(q->flow_mode))
2127                         dsthost->dsthost_bulk_flow_count--;
2128 
2129                     b->decaying_flow_count++;
2130                 } else if (flow->set == CAKE_SET_SPARSE ||
2131                        flow->set == CAKE_SET_SPARSE_WAIT) {
2132                     b->sparse_flow_count--;
2133                     b->decaying_flow_count++;
2134                 }
2135                 flow->set = CAKE_SET_DECAYING;
2136             } else {
2137                 /* remove empty queue from the flowchain */
2138                 list_del_init(&flow->flowchain);
2139                 if (flow->set == CAKE_SET_SPARSE ||
2140                     flow->set == CAKE_SET_SPARSE_WAIT)
2141                     b->sparse_flow_count--;
2142                 else if (flow->set == CAKE_SET_BULK) {
2143                     b->bulk_flow_count--;
2144 
2145                     if (cake_dsrc(q->flow_mode))
2146                         srchost->srchost_bulk_flow_count--;
2147 
2148                     if (cake_ddst(q->flow_mode))
2149                         dsthost->dsthost_bulk_flow_count--;
2150 
2151                 } else
2152                     b->decaying_flow_count--;
2153 
2154                 flow->set = CAKE_SET_NONE;
2155             }
2156             goto begin;
2157         }
2158 
2159         /* Last packet in queue may be marked, shouldn't be dropped */
2160         if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2161                     (b->bulk_flow_count *
2162                      !!(q->rate_flags &
2163                         CAKE_FLAG_INGRESS))) ||
2164             !flow->head)
2165             break;
2166 
2167         /* drop this packet, get another one */
2168         if (q->rate_flags & CAKE_FLAG_INGRESS) {
2169             len = cake_advance_shaper(q, b, skb,
2170                           now, true);
2171             flow->deficit -= len;
2172             b->tin_deficit -= len;
2173         }
2174         flow->dropped++;
2175         b->tin_dropped++;
2176         qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2177         qdisc_qstats_drop(sch);
2178         kfree_skb(skb);
2179         if (q->rate_flags & CAKE_FLAG_INGRESS)
2180             goto retry;
2181     }
2182 
2183     b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2184     qdisc_bstats_update(sch, skb);
2185 
2186     /* collect delay stats */
2187     delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2188     b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2189     b->peak_delay = cake_ewma(b->peak_delay, delay,
2190                   delay > b->peak_delay ? 2 : 8);
2191     b->base_delay = cake_ewma(b->base_delay, delay,
2192                   delay < b->base_delay ? 2 : 8);
2193 
2194     len = cake_advance_shaper(q, b, skb, now, false);
2195     flow->deficit -= len;
2196     b->tin_deficit -= len;
2197 
2198     if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2199         u64 next = min(ktime_to_ns(q->time_next_packet),
2200                    ktime_to_ns(q->failsafe_next_packet));
2201 
2202         qdisc_watchdog_schedule_ns(&q->watchdog, next);
2203     } else if (!sch->q.qlen) {
2204         int i;
2205 
2206         for (i = 0; i < q->tin_cnt; i++) {
2207             if (q->tins[i].decaying_flow_count) {
2208                 ktime_t next = \
2209                     ktime_add_ns(now,
2210                              q->tins[i].cparams.target);
2211 
2212                 qdisc_watchdog_schedule_ns(&q->watchdog,
2213                                ktime_to_ns(next));
2214                 break;
2215             }
2216         }
2217     }
2218 
2219     if (q->overflow_timeout)
2220         q->overflow_timeout--;
2221 
2222     return skb;
2223 }
2224 
2225 static void cake_reset(struct Qdisc *sch)
2226 {
2227     u32 c;
2228 
2229     for (c = 0; c < CAKE_MAX_TINS; c++)
2230         cake_clear_tin(sch, c);
2231 }
2232 
2233 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2234     [TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2235     [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2236     [TCA_CAKE_ATM]       = { .type = NLA_U32 },
2237     [TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2238     [TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2239     [TCA_CAKE_RTT]       = { .type = NLA_U32 },
2240     [TCA_CAKE_TARGET]    = { .type = NLA_U32 },
2241     [TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2242     [TCA_CAKE_MEMORY]    = { .type = NLA_U32 },
2243     [TCA_CAKE_NAT]       = { .type = NLA_U32 },
2244     [TCA_CAKE_RAW]       = { .type = NLA_U32 },
2245     [TCA_CAKE_WASH]      = { .type = NLA_U32 },
2246     [TCA_CAKE_MPU]       = { .type = NLA_U32 },
2247     [TCA_CAKE_INGRESS]   = { .type = NLA_U32 },
2248     [TCA_CAKE_ACK_FILTER]    = { .type = NLA_U32 },
2249     [TCA_CAKE_SPLIT_GSO]     = { .type = NLA_U32 },
2250     [TCA_CAKE_FWMARK]    = { .type = NLA_U32 },
2251 };
2252 
2253 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2254               u64 target_ns, u64 rtt_est_ns)
2255 {
2256     /* Convert byte-rate into time-per-byte, clamping the rate to MIN_RATE
2257      * so that even a tiny configured rate unwedges in reasonable time.
2258      */
2259     static const u64 MIN_RATE = 64;
2260     u32 byte_target = mtu;
2261     u64 byte_target_ns;
2262     u8  rate_shft = 0;
2263     u64 rate_ns = 0;
2264 
2265     b->flow_quantum = 1514;
2266     if (rate) {
2267         b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2268         rate_shft = 34;
2269         rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2270         rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2271         while (!!(rate_ns >> 34)) {
2272             rate_ns >>= 1;
2273             rate_shft--;
2274         }
2275     } /* else unlimited, ie. zero delay */
2276 
2277     b->tin_rate_bps  = rate;
2278     b->tin_rate_ns   = rate_ns;
2279     b->tin_rate_shft = rate_shft;
2280 
2281     byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2282 
2283     b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2284     b->cparams.interval = max(rtt_est_ns +
2285                      b->cparams.target - target_ns,
2286                      b->cparams.target * 2);
2287     b->cparams.mtu_time = byte_target_ns;
2288     b->cparams.p_inc = 1 << 24; /* 1/256 */
2289     b->cparams.p_dec = 1 << 20; /* 1/4096 */
2290 }
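     /* Worked example of the conversion above: for rate = 12,500,000 bytes/s
      * (~100 Mbit/s) the ideal cost is 80 ns per byte.  rate_ns starts out as
      * 80 << 34 and is halved (with rate_shft decremented) until it fits in
      * 34 bits, leaving rate_ns >> rate_shft == 80, so the shaper charges
      * 80 ns of transmission time for every byte it releases.
      */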
2291 
2292 static int cake_config_besteffort(struct Qdisc *sch)
2293 {
2294     struct cake_sched_data *q = qdisc_priv(sch);
2295     struct cake_tin_data *b = &q->tins[0];
2296     u32 mtu = psched_mtu(qdisc_dev(sch));
2297     u64 rate = q->rate_bps;
2298 
2299     q->tin_cnt = 1;
2300 
2301     q->tin_index = besteffort;
2302     q->tin_order = normal_order;
2303 
2304     cake_set_rate(b, rate, mtu,
2305               us_to_ns(q->target), us_to_ns(q->interval));
2306     b->tin_quantum = 65535;
2307 
2308     return 0;
2309 }
2310 
2311 static int cake_config_precedence(struct Qdisc *sch)
2312 {
2313     /* convert high-level (user visible) parameters into internal format */
2314     struct cake_sched_data *q = qdisc_priv(sch);
2315     u32 mtu = psched_mtu(qdisc_dev(sch));
2316     u64 rate = q->rate_bps;
2317     u32 quantum = 256;
2318     u32 i;
2319 
2320     q->tin_cnt = 8;
2321     q->tin_index = precedence;
2322     q->tin_order = normal_order;
2323 
2324     for (i = 0; i < q->tin_cnt; i++) {
2325         struct cake_tin_data *b = &q->tins[i];
2326 
2327         cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2328                   us_to_ns(q->interval));
2329 
2330         b->tin_quantum = max_t(u16, 1U, quantum);
2331 
2332         /* calculate next class's parameters */
2333         rate  *= 7;
2334         rate >>= 3;
2335 
2336         quantum  *= 7;
2337         quantum >>= 3;
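             /* i.e. each successive precedence tin receives 7/8 of the
              * previous tin's rate threshold and DRR weight, giving a
              * geometric taper across the eight tins.
              */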
2338     }
2339 
2340     return 0;
2341 }
2342 
2343 /*  List of known Diffserv codepoints:
2344  *
2345  *  Default Forwarding (DF/CS0) - Best Effort
2346  *  Max Throughput (TOS2)
2347  *  Min Delay (TOS4)
2348  *  LLT "La" (TOS5)
2349  *  Assured Forwarding 1 (AF1x) - x3
2350  *  Assured Forwarding 2 (AF2x) - x3
2351  *  Assured Forwarding 3 (AF3x) - x3
2352  *  Assured Forwarding 4 (AF4x) - x3
2353  *  Precedence Class 1 (CS1)
2354  *  Precedence Class 2 (CS2)
2355  *  Precedence Class 3 (CS3)
2356  *  Precedence Class 4 (CS4)
2357  *  Precedence Class 5 (CS5)
2358  *  Precedence Class 6 (CS6)
2359  *  Precedence Class 7 (CS7)
2360  *  Voice Admit (VA)
2361  *  Expedited Forwarding (EF)
2362  *  Lower Effort (LE)
2363  *
2364  *  Total 26 codepoints.
2365  */
2366 
2367 /*  List of traffic classes in RFC 4594, updated by RFC 8622:
2368  *      (roughly descending order of contended priority)
2369  *      (roughly ascending order of uncontended throughput)
2370  *
2371  *  Network Control (CS6,CS7)      - routing traffic
2372  *  Telephony (EF,VA)         - aka. VoIP streams
2373  *  Signalling (CS5)               - VoIP setup
2374  *  Multimedia Conferencing (AF4x) - aka. video calls
2375  *  Realtime Interactive (CS4)     - eg. games
2376  *  Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2377  *  Broadcast Video (CS3)
2378  *  Low-Latency Data (AF2x,TOS4)      - eg. database
2379  *  Ops, Admin, Management (CS2)      - eg. ssh
2380  *  Standard Service (DF & unrecognised codepoints)
2381  *  High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2382  *  Low-Priority Data (LE,CS1)        - eg. BitTorrent
2383  *
2384  *  Total 12 traffic classes.
2385  */
2386 
2387 static int cake_config_diffserv8(struct Qdisc *sch)
2388 {
2389 /*  Pruned list of traffic classes for typical applications:
2390  *
2391  *      Network Control          (CS6, CS7)
2392  *      Minimum Latency          (EF, VA, CS5, CS4)
2393  *      Interactive Shell        (CS2)
2394  *      Low Latency Transactions (AF2x, TOS4)
2395  *      Video Streaming          (AF4x, AF3x, CS3)
2396  *      Bog Standard             (DF etc.)
2397  *      High Throughput          (AF1x, TOS2, CS1)
2398  *      Background Traffic       (LE)
2399  *
2400  *      Total 8 traffic classes.
2401  */
2402 
2403     struct cake_sched_data *q = qdisc_priv(sch);
2404     u32 mtu = psched_mtu(qdisc_dev(sch));
2405     u64 rate = q->rate_bps;
2406     u32 quantum = 256;
2407     u32 i;
2408 
2409     q->tin_cnt = 8;
2410 
2411     /* codepoint to class mapping */
2412     q->tin_index = diffserv8;
2413     q->tin_order = normal_order;
2414 
2415     /* class characteristics */
2416     for (i = 0; i < q->tin_cnt; i++) {
2417         struct cake_tin_data *b = &q->tins[i];
2418 
2419         cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2420                   us_to_ns(q->interval));
2421 
2422         b->tin_quantum = max_t(u16, 1U, quantum);
2423 
2424         /* calculate next class's parameters */
2425         rate  *= 7;
2426         rate >>= 3;
2427 
2428         quantum  *= 7;
2429         quantum >>= 3;
2430     }
2431 
2432     return 0;
2433 }
2434 
2435 static int cake_config_diffserv4(struct Qdisc *sch)
2436 {
2437 /*  Further pruned list of traffic classes for four-class system:
2438  *
2439  *      Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2440  *      Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2441  *      Best Effort        (DF, AF1x, TOS2, and those not specified)
2442  *      Background Traffic (LE, CS1)
2443  *
2444  *      Total 4 traffic classes.
2445  */
2446 
2447     struct cake_sched_data *q = qdisc_priv(sch);
2448     u32 mtu = psched_mtu(qdisc_dev(sch));
2449     u64 rate = q->rate_bps;
2450     u32 quantum = 1024;
2451 
2452     q->tin_cnt = 4;
2453 
2454     /* codepoint to class mapping */
2455     q->tin_index = diffserv4;
2456     q->tin_order = bulk_order;
2457 
2458     /* class characteristics */
2459     cake_set_rate(&q->tins[0], rate, mtu,
2460               us_to_ns(q->target), us_to_ns(q->interval));
2461     cake_set_rate(&q->tins[1], rate >> 4, mtu,
2462               us_to_ns(q->target), us_to_ns(q->interval));
2463     cake_set_rate(&q->tins[2], rate >> 1, mtu,
2464               us_to_ns(q->target), us_to_ns(q->interval));
2465     cake_set_rate(&q->tins[3], rate >> 2, mtu,
2466               us_to_ns(q->target), us_to_ns(q->interval));
2467 
2468     /* bandwidth-sharing weights */
2469     q->tins[0].tin_quantum = quantum;
2470     q->tins[1].tin_quantum = quantum >> 4;
2471     q->tins[2].tin_quantum = quantum >> 1;
2472     q->tins[3].tin_quantum = quantum >> 2;
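         /* i.e. the four tins receive shaping thresholds of 100%, 6.25%,
          * 50% and 25% of the base rate respectively, with bandwidth-sharing
          * weights in the same proportions.
          */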
2473 
2474     return 0;
2475 }
2476 
2477 static int cake_config_diffserv3(struct Qdisc *sch)
2478 {
2479 /*  Simplified Diffserv structure with 3 tins.
2480  *      Latency Sensitive   (CS7, CS6, EF, VA, TOS4)
2481  *      Best Effort
2482  *      Low Priority        (LE, CS1)
2483  */
2484     struct cake_sched_data *q = qdisc_priv(sch);
2485     u32 mtu = psched_mtu(qdisc_dev(sch));
2486     u64 rate = q->rate_bps;
2487     u32 quantum = 1024;
2488 
2489     q->tin_cnt = 3;
2490 
2491     /* codepoint to class mapping */
2492     q->tin_index = diffserv3;
2493     q->tin_order = bulk_order;
2494 
2495     /* class characteristics */
2496     cake_set_rate(&q->tins[0], rate, mtu,
2497               us_to_ns(q->target), us_to_ns(q->interval));
2498     cake_set_rate(&q->tins[1], rate >> 4, mtu,
2499               us_to_ns(q->target), us_to_ns(q->interval));
2500     cake_set_rate(&q->tins[2], rate >> 2, mtu,
2501               us_to_ns(q->target), us_to_ns(q->interval));
2502 
2503     /* bandwidth-sharing weights */
2504     q->tins[0].tin_quantum = quantum;
2505     q->tins[1].tin_quantum = quantum >> 4;
2506     q->tins[2].tin_quantum = quantum >> 2;
2507 
2508     return 0;
2509 }
2510 
2511 static void cake_reconfigure(struct Qdisc *sch)
2512 {
2513     struct cake_sched_data *q = qdisc_priv(sch);
2514     int c, ft;
2515 
2516     switch (q->tin_mode) {
2517     case CAKE_DIFFSERV_BESTEFFORT:
2518         ft = cake_config_besteffort(sch);
2519         break;
2520 
2521     case CAKE_DIFFSERV_PRECEDENCE:
2522         ft = cake_config_precedence(sch);
2523         break;
2524 
2525     case CAKE_DIFFSERV_DIFFSERV8:
2526         ft = cake_config_diffserv8(sch);
2527         break;
2528 
2529     case CAKE_DIFFSERV_DIFFSERV4:
2530         ft = cake_config_diffserv4(sch);
2531         break;
2532 
2533     case CAKE_DIFFSERV_DIFFSERV3:
2534     default:
2535         ft = cake_config_diffserv3(sch);
2536         break;
2537     }
2538 
2539     for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2540         cake_clear_tin(sch, c);
2541         q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2542     }
2543 
2544     q->rate_ns   = q->tins[ft].tin_rate_ns;
2545     q->rate_shft = q->tins[ft].tin_rate_shft;
2546 
2547     if (q->buffer_config_limit) {
2548         q->buffer_limit = q->buffer_config_limit;
2549     } else if (q->rate_bps) {
2550         u64 t = q->rate_bps * q->interval;
2551 
2552         do_div(t, USEC_PER_SEC / 4);
2553         q->buffer_limit = max_t(u32, t, 4U << 20);
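             /* With rate_bps in bytes per second (see the byte-rate comment
              * in cake_set_rate()) and interval in microseconds, this works
              * out to four times the implied bandwidth-delay product,
              * floored at 4 MiB.
              */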
2554     } else {
2555         q->buffer_limit = ~0;
2556     }
2557 
2558     sch->flags &= ~TCQ_F_CAN_BYPASS;
2559 
2560     q->buffer_limit = min(q->buffer_limit,
2561                   max(sch->limit * psched_mtu(qdisc_dev(sch)),
2562                   q->buffer_config_limit));
2563 }
2564 
2565 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2566                struct netlink_ext_ack *extack)
2567 {
2568     struct cake_sched_data *q = qdisc_priv(sch);
2569     struct nlattr *tb[TCA_CAKE_MAX + 1];
2570     int err;
2571 
2572     if (!opt)
2573         return -EINVAL;
2574 
2575     err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2576                       extack);
2577     if (err < 0)
2578         return err;
2579 
2580     if (tb[TCA_CAKE_NAT]) {
2581 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2582         q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2583         q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2584             !!nla_get_u32(tb[TCA_CAKE_NAT]);
2585 #else
2586         NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2587                     "No conntrack support in kernel");
2588         return -EOPNOTSUPP;
2589 #endif
2590     }
2591 
2592     if (tb[TCA_CAKE_BASE_RATE64])
2593         q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2594 
2595     if (tb[TCA_CAKE_DIFFSERV_MODE])
2596         q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2597 
2598     if (tb[TCA_CAKE_WASH]) {
2599         if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2600             q->rate_flags |= CAKE_FLAG_WASH;
2601         else
2602             q->rate_flags &= ~CAKE_FLAG_WASH;
2603     }
2604 
2605     if (tb[TCA_CAKE_FLOW_MODE])
2606         q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2607                 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2608                     CAKE_FLOW_MASK));
2609 
2610     if (tb[TCA_CAKE_ATM])
2611         q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2612 
2613     if (tb[TCA_CAKE_OVERHEAD]) {
2614         q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2615         q->rate_flags |= CAKE_FLAG_OVERHEAD;
2616 
2617         q->max_netlen = 0;
2618         q->max_adjlen = 0;
2619         q->min_netlen = ~0;
2620         q->min_adjlen = ~0;
2621     }
2622 
2623     if (tb[TCA_CAKE_RAW]) {
2624         q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2625 
2626         q->max_netlen = 0;
2627         q->max_adjlen = 0;
2628         q->min_netlen = ~0;
2629         q->min_adjlen = ~0;
2630     }
2631 
2632     if (tb[TCA_CAKE_MPU])
2633         q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2634 
2635     if (tb[TCA_CAKE_RTT]) {
2636         q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2637 
2638         if (!q->interval)
2639             q->interval = 1;
2640     }
2641 
2642     if (tb[TCA_CAKE_TARGET]) {
2643         q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2644 
2645         if (!q->target)
2646             q->target = 1;
2647     }
2648 
2649     if (tb[TCA_CAKE_AUTORATE]) {
2650         if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2651             q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2652         else
2653             q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2654     }
2655 
2656     if (tb[TCA_CAKE_INGRESS]) {
2657         if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2658             q->rate_flags |= CAKE_FLAG_INGRESS;
2659         else
2660             q->rate_flags &= ~CAKE_FLAG_INGRESS;
2661     }
2662 
2663     if (tb[TCA_CAKE_ACK_FILTER])
2664         q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2665 
2666     if (tb[TCA_CAKE_MEMORY])
2667         q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2668 
2669     if (tb[TCA_CAKE_SPLIT_GSO]) {
2670         if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2671             q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2672         else
2673             q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2674     }
2675 
2676     if (tb[TCA_CAKE_FWMARK]) {
2677         q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2678         q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
2679     }
2680 
2681     if (q->tins) {
2682         sch_tree_lock(sch);
2683         cake_reconfigure(sch);
2684         sch_tree_unlock(sch);
2685     }
2686 
2687     return 0;
2688 }
2689 
2690 static void cake_destroy(struct Qdisc *sch)
2691 {
2692     struct cake_sched_data *q = qdisc_priv(sch);
2693 
2694     qdisc_watchdog_cancel(&q->watchdog);
2695     tcf_block_put(q->block);
2696     kvfree(q->tins);
2697 }
2698 
2699 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2700              struct netlink_ext_ack *extack)
2701 {
2702     struct cake_sched_data *q = qdisc_priv(sch);
2703     int i, j, err;
2704 
2705     sch->limit = 10240;
2706     q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2707     q->flow_mode  = CAKE_FLOW_TRIPLE;
2708 
2709     q->rate_bps = 0; /* unlimited by default */
2710 
2711     q->interval = 100000; /* 100ms default */
2712     q->target   =   5000; /* 5ms: codel RFC argues
2713                    * for 5 to 10% of interval
2714                    */
2715     q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2716     q->cur_tin = 0;
2717     q->cur_flow  = 0;
2718 
2719     qdisc_watchdog_init(&q->watchdog, sch);
2720 
2721     if (opt) {
2722         err = cake_change(sch, opt, extack);
2723 
2724         if (err)
2725             return err;
2726     }
2727 
2728     err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2729     if (err)
2730         return err;
2731 
2732     quantum_div[0] = ~0;
2733     for (i = 1; i <= CAKE_QUEUES; i++)
2734         quantum_div[i] = 65535 / i;
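         /* quantum_div[] is a 16.16 reciprocal table: multiplying by
          * quantum_div[n] and shifting right by 16 approximates dividing by
          * n without a divide on the fast path.  Index 0 is a guard value;
          * host_load is always at least 1.
          */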
2735 
2736     q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2737                GFP_KERNEL);
2738     if (!q->tins)
2739         return -ENOMEM;
2740 
2741     for (i = 0; i < CAKE_MAX_TINS; i++) {
2742         struct cake_tin_data *b = q->tins + i;
2743 
2744         INIT_LIST_HEAD(&b->new_flows);
2745         INIT_LIST_HEAD(&b->old_flows);
2746         INIT_LIST_HEAD(&b->decaying_flows);
2747         b->sparse_flow_count = 0;
2748         b->bulk_flow_count = 0;
2749         b->decaying_flow_count = 0;
2750 
2751         for (j = 0; j < CAKE_QUEUES; j++) {
2752             struct cake_flow *flow = b->flows + j;
2753             u32 k = j * CAKE_MAX_TINS + i;
2754 
2755             INIT_LIST_HEAD(&flow->flowchain);
2756             cobalt_vars_init(&flow->cvars);
2757 
2758             q->overflow_heap[k].t = i;
2759             q->overflow_heap[k].b = j;
2760             b->overflow_idx[j] = k;
2761         }
2762     }
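         /* The overflow heap holds one entry per (tin, flow) pair, with k
          * interleaving flows across tins; overflow_idx[] is the back-pointer
          * that lets enqueue/dequeue re-heapify just the entry whose backlog
          * changed.
          */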
2763 
2764     cake_reconfigure(sch);
2765     q->avg_peak_bandwidth = q->rate_bps;
2766     q->min_netlen = ~0;
2767     q->min_adjlen = ~0;
2768     return 0;
2769 }
2770 
2771 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2772 {
2773     struct cake_sched_data *q = qdisc_priv(sch);
2774     struct nlattr *opts;
2775 
2776     opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2777     if (!opts)
2778         goto nla_put_failure;
2779 
2780     if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2781                   TCA_CAKE_PAD))
2782         goto nla_put_failure;
2783 
2784     if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2785             q->flow_mode & CAKE_FLOW_MASK))
2786         goto nla_put_failure;
2787 
2788     if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2789         goto nla_put_failure;
2790 
2791     if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2792         goto nla_put_failure;
2793 
2794     if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2795         goto nla_put_failure;
2796 
2797     if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2798             !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2799         goto nla_put_failure;
2800 
2801     if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2802             !!(q->rate_flags & CAKE_FLAG_INGRESS)))
2803         goto nla_put_failure;
2804 
2805     if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2806         goto nla_put_failure;
2807 
2808     if (nla_put_u32(skb, TCA_CAKE_NAT,
2809             !!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2810         goto nla_put_failure;
2811 
2812     if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2813         goto nla_put_failure;
2814 
2815     if (nla_put_u32(skb, TCA_CAKE_WASH,
2816             !!(q->rate_flags & CAKE_FLAG_WASH)))
2817         goto nla_put_failure;
2818 
2819     if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2820         goto nla_put_failure;
2821 
2822     if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2823         if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2824             goto nla_put_failure;
2825 
2826     if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2827         goto nla_put_failure;
2828 
2829     if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2830         goto nla_put_failure;
2831 
2832     if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2833             !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2834         goto nla_put_failure;
2835 
2836     if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
2837         goto nla_put_failure;
2838 
2839     return nla_nest_end(skb, opts);
2840 
2841 nla_put_failure:
2842     return -1;
2843 }
2844 
2845 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2846 {
2847     struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2848     struct cake_sched_data *q = qdisc_priv(sch);
2849     struct nlattr *tstats, *ts;
2850     int i;
2851 
2852     if (!stats)
2853         return -1;
2854 
2855 #define PUT_STAT_U32(attr, data) do {                      \
2856         if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2857             goto nla_put_failure;                  \
2858     } while (0)
2859 #define PUT_STAT_U64(attr, data) do {                      \
2860         if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2861                     data, TCA_CAKE_STATS_PAD)) \
2862             goto nla_put_failure;                  \
2863     } while (0)
2864 
2865     PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2866     PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2867     PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2868     PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2869     PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2870     PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2871     PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2872     PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2873 
2874 #undef PUT_STAT_U32
2875 #undef PUT_STAT_U64
2876 
2877     tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2878     if (!tstats)
2879         goto nla_put_failure;
2880 
2881 #define PUT_TSTAT_U32(attr, data) do {                  \
2882         if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2883             goto nla_put_failure;               \
2884     } while (0)
2885 #define PUT_TSTAT_U64(attr, data) do {                  \
2886         if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2887                     data, TCA_CAKE_TIN_STATS_PAD))  \
2888             goto nla_put_failure;               \
2889     } while (0)
2890 
2891     for (i = 0; i < q->tin_cnt; i++) {
2892         struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2893 
2894         ts = nla_nest_start_noflag(d->skb, i + 1);
2895         if (!ts)
2896             goto nla_put_failure;
2897 
2898         PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2899         PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2900         PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2901 
2902         PUT_TSTAT_U32(TARGET_US,
2903                   ktime_to_us(ns_to_ktime(b->cparams.target)));
2904         PUT_TSTAT_U32(INTERVAL_US,
2905                   ktime_to_us(ns_to_ktime(b->cparams.interval)));
2906 
2907         PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2908         PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2909         PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2910         PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2911 
2912         PUT_TSTAT_U32(PEAK_DELAY_US,
2913                   ktime_to_us(ns_to_ktime(b->peak_delay)));
2914         PUT_TSTAT_U32(AVG_DELAY_US,
2915                   ktime_to_us(ns_to_ktime(b->avge_delay)));
2916         PUT_TSTAT_U32(BASE_DELAY_US,
2917                   ktime_to_us(ns_to_ktime(b->base_delay)));
2918 
2919         PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2920         PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2921         PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2922 
2923         PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2924                         b->decaying_flow_count);
2925         PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2926         PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2927         PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2928 
2929         PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2930         nla_nest_end(d->skb, ts);
2931     }
2932 
2933 #undef PUT_TSTAT_U32
2934 #undef PUT_TSTAT_U64
2935 
2936     nla_nest_end(d->skb, tstats);
2937     return nla_nest_end(d->skb, stats);
2938 
2939 nla_put_failure:
2940     nla_nest_cancel(d->skb, stats);
2941     return -1;
2942 }
2943 
2944 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2945 {
2946     return NULL;
2947 }
2948 
2949 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2950 {
2951     return 0;
2952 }
2953 
2954 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2955                    u32 classid)
2956 {
2957     return 0;
2958 }
2959 
2960 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2961 {
2962 }
2963 
2964 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2965                     struct netlink_ext_ack *extack)
2966 {
2967     struct cake_sched_data *q = qdisc_priv(sch);
2968 
2969     if (cl)
2970         return NULL;
2971     return q->block;
2972 }
2973 
2974 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2975                struct sk_buff *skb, struct tcmsg *tcm)
2976 {
2977     tcm->tcm_handle |= TC_H_MIN(cl);
2978     return 0;
2979 }
2980 
2981 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2982                  struct gnet_dump *d)
2983 {
2984     struct cake_sched_data *q = qdisc_priv(sch);
2985     const struct cake_flow *flow = NULL;
2986     struct gnet_stats_queue qs = { 0 };
2987     struct nlattr *stats;
2988     u32 idx = cl - 1;
2989 
2990     if (idx < CAKE_QUEUES * q->tin_cnt) {
2991         const struct cake_tin_data *b = \
2992             &q->tins[q->tin_order[idx / CAKE_QUEUES]];
2993         const struct sk_buff *skb;
2994 
2995         flow = &b->flows[idx % CAKE_QUEUES];
2996 
2997         if (flow->head) {
2998             sch_tree_lock(sch);
2999             skb = flow->head;
3000             while (skb) {
3001                 qs.qlen++;
3002                 skb = skb->next;
3003             }
3004             sch_tree_unlock(sch);
3005         }
3006         qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3007         qs.drops = flow->dropped;
3008     }
3009     if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3010         return -1;
3011     if (flow) {
3012         ktime_t now = ktime_get();
3013 
3014         stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3015         if (!stats)
3016             return -1;
3017 
3018 #define PUT_STAT_U32(attr, data) do {                      \
3019         if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3020             goto nla_put_failure;                  \
3021     } while (0)
3022 #define PUT_STAT_S32(attr, data) do {                      \
3023         if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3024             goto nla_put_failure;                  \
3025     } while (0)
3026 
3027         PUT_STAT_S32(DEFICIT, flow->deficit);
3028         PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3029         PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3030         PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3031         if (flow->cvars.p_drop) {
3032             PUT_STAT_S32(BLUE_TIMER_US,
3033                      ktime_to_us(
3034                          ktime_sub(now,
3035                                flow->cvars.blue_timer)));
3036         }
3037         if (flow->cvars.dropping) {
3038             PUT_STAT_S32(DROP_NEXT_US,
3039                      ktime_to_us(
3040                          ktime_sub(now,
3041                                flow->cvars.drop_next)));
3042         }
3043 
3044         if (nla_nest_end(d->skb, stats) < 0)
3045             return -1;
3046     }
3047 
3048     return 0;
3049 
3050 nla_put_failure:
3051     nla_nest_cancel(d->skb, stats);
3052     return -1;
3053 }
3054 
3055 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3056 {
3057     struct cake_sched_data *q = qdisc_priv(sch);
3058     unsigned int i, j;
3059 
3060     if (arg->stop)
3061         return;
3062 
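         /* Class ids are 1-based: class (i * CAKE_QUEUES + j + 1) denotes
          * flow j of the i-th tin in display order, matching the decoding
          * in cake_dump_class_stats().
          */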
3063     for (i = 0; i < q->tin_cnt; i++) {
3064         struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3065 
3066         for (j = 0; j < CAKE_QUEUES; j++) {
3067             if (list_empty(&b->flows[j].flowchain) ||
3068                 arg->count < arg->skip) {
3069                 arg->count++;
3070                 continue;
3071             }
3072             if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
3073                 arg->stop = 1;
3074                 break;
3075             }
3076             arg->count++;
3077         }
3078     }
3079 }
3080 
3081 static const struct Qdisc_class_ops cake_class_ops = {
3082     .leaf       =   cake_leaf,
3083     .find       =   cake_find,
3084     .tcf_block  =   cake_tcf_block,
3085     .bind_tcf   =   cake_bind,
3086     .unbind_tcf =   cake_unbind,
3087     .dump       =   cake_dump_class,
3088     .dump_stats =   cake_dump_class_stats,
3089     .walk       =   cake_walk,
3090 };
3091 
3092 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3093     .cl_ops     =   &cake_class_ops,
3094     .id     =   "cake",
3095     .priv_size  =   sizeof(struct cake_sched_data),
3096     .enqueue    =   cake_enqueue,
3097     .dequeue    =   cake_dequeue,
3098     .peek       =   qdisc_peek_dequeued,
3099     .init       =   cake_init,
3100     .reset      =   cake_reset,
3101     .destroy    =   cake_destroy,
3102     .change     =   cake_change,
3103     .dump       =   cake_dump,
3104     .dump_stats =   cake_dump_stats,
3105     .owner      =   THIS_MODULE,
3106 };
3107 
3108 static int __init cake_module_init(void)
3109 {
3110     return register_qdisc(&cake_qdisc_ops);
3111 }
3112 
3113 static void __exit cake_module_exit(void)
3114 {
3115     unregister_qdisc(&cake_qdisc_ops);
3116 }
3117 
3118 module_init(cake_module_init)
3119 module_exit(cake_module_exit)
3120 MODULE_AUTHOR("Jonathan Morton");
3121 MODULE_LICENSE("Dual BSD/GPL");
3122 MODULE_DESCRIPTION("The CAKE shaper.");