0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * net/sched/sch_netem.c    Network emulator
0004  *
0005  *          Many of the algorithms and ideas for this came from
0006  *      NIST Net which is not copyrighted.
0007  *
0008  * Authors: Stephen Hemminger <shemminger@osdl.org>
0009  *      Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
0010  */
0011 
0012 #include <linux/mm.h>
0013 #include <linux/module.h>
0014 #include <linux/slab.h>
0015 #include <linux/types.h>
0016 #include <linux/kernel.h>
0017 #include <linux/errno.h>
0018 #include <linux/skbuff.h>
0019 #include <linux/vmalloc.h>
0020 #include <linux/rtnetlink.h>
0021 #include <linux/reciprocal_div.h>
0022 #include <linux/rbtree.h>
0023 
0024 #include <net/netlink.h>
0025 #include <net/pkt_sched.h>
0026 #include <net/inet_ecn.h>
0027 
0028 #define VERSION "1.3"
0029 
0030 /*  Network Emulation Queuing algorithm.
0031     ====================================
0032 
0033     Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
0034          Network Emulation Tool"
0035          [2] Luigi Rizzo, DummyNet for FreeBSD
0036 
0037      ----------------------------------------------------------------
0038 
0039      This started out as a simple way to delay outgoing packets to
0040      test TCP but has grown to include most of the functionality
0041      of a full-blown network emulator like NIST Net. It can delay
0042      packets and add random jitter (and correlation). The random
0043      distribution can be loaded from a table as well to provide
0044      normal, Pareto, or experimental curves. Packet loss,
0045      duplication, and reordering can also be emulated.
0046 
0047      This qdisc does not do classification; that can be handled by
0048      layering other disciplines.  It does not need to do bandwidth
0049      control either, since that can be handled by using a token
0050      bucket or other rate control.
0051 
0052      Correlated Loss Generator models
0053 
0054     Added generation of correlated loss according to a 4-state
0055     Markov model and the "Gilbert-Elliot" model.
0056 
0057     References:
0058     [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
0059     [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
0060     and intuitive loss model for packet networks and its implementation
0061     in the Netem module in the Linux kernel", available in [1]
0062 
0063     Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
0064          Fabio Ludovici <fabio.ludovici at yahoo.it>
0065 */
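/* Editor's note (illustrative, not part of the original source): the knobs
 * described above are normally configured from user space via tc(8).  A
 * typical invocation, assuming an interface named eth0, might look like:
 *
 *   tc qdisc add dev eth0 root netem delay 100ms 10ms 25% loss 0.5% 25%
 *
 * i.e. 100 ms base delay, +/- 10 ms jitter with 25% correlation, and 0.5%
 * loss with 25% correlation.  See the tc-netem(8) man page for the
 * authoritative syntax; the interface name and values here are arbitrary.
 */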
0066 
0067 struct disttable {
0068     u32  size;
0069     s16 table[];
0070 };
0071 
0072 struct netem_sched_data {
0073     /* internal t(ime)fifo qdisc uses t_root and sch->limit */
0074     struct rb_root t_root;
0075 
0076     /* a linear queue; reduces rbtree rebalancing when jitter is low */
0077     struct sk_buff  *t_head;
0078     struct sk_buff  *t_tail;
0079 
0080     /* optional qdisc for classful handling (NULL at netem init) */
0081     struct Qdisc    *qdisc;
0082 
0083     struct qdisc_watchdog watchdog;
0084 
0085     s64 latency;
0086     s64 jitter;
0087 
0088     u32 loss;
0089     u32 ecn;
0090     u32 limit;
0091     u32 counter;
0092     u32 gap;
0093     u32 duplicate;
0094     u32 reorder;
0095     u32 corrupt;
0096     u64 rate;
0097     s32 packet_overhead;
0098     u32 cell_size;
0099     struct reciprocal_value cell_size_reciprocal;
0100     s32 cell_overhead;
0101 
0102     struct crndstate {
0103         u32 last;
0104         u32 rho;
0105     } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
0106 
0107     struct disttable *delay_dist;
0108 
0109     enum  {
0110         CLG_RANDOM,
0111         CLG_4_STATES,
0112         CLG_GILB_ELL,
0113     } loss_model;
0114 
0115     enum {
0116         TX_IN_GAP_PERIOD = 1,
0117         TX_IN_BURST_PERIOD,
0118         LOST_IN_GAP_PERIOD,
0119         LOST_IN_BURST_PERIOD,
0120     } _4_state_model;
0121 
0122     enum {
0123         GOOD_STATE = 1,
0124         BAD_STATE,
0125     } GE_state_model;
0126 
0127     /* Correlated Loss Generation models */
0128     struct clgstate {
0129         /* state of the Markov chain */
0130         u8 state;
0131 
0132         /* 4-states and Gilbert-Elliot models */
0133         u32 a1; /* p13 for 4-states or p for GE */
0134         u32 a2; /* p31 for 4-states or r for GE */
0135         u32 a3; /* p32 for 4-states or h for GE */
0136         u32 a4; /* p14 for 4-states or 1-k for GE */
0137         u32 a5; /* p23 used only in 4-states */
0138     } clg;
0139 
0140     struct tc_netem_slot slot_config;
0141     struct slotstate {
0142         u64 slot_next;
0143         s32 packets_left;
0144         s32 bytes_left;
0145     } slot;
0146 
0147     struct disttable *slot_dist;
0148 };
0149 
0150 /* Time stamp put into socket buffer control block
0151  * Only valid when skbs are in our internal t(ime)fifo queue.
0152  *
0153  * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
0154  * and skb->next & skb->prev are scratch space for a qdisc,
0155  * we save skb->tstamp value in skb->cb[] before destroying it.
0156  */
0157 struct netem_skb_cb {
0158     u64         time_to_send;
0159 };
0160 
0161 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
0162 {
0163     /* we assume we can use skb next/prev/tstamp as storage for rb_node */
0164     qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
0165     return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
0166 }
0167 
0168 /* init_crandom - initialize correlated random number generator
0169  * Use entropy source for initial seed.
0170  */
0171 static void init_crandom(struct crndstate *state, unsigned long rho)
0172 {
0173     state->rho = rho;
0174     state->last = prandom_u32();
0175 }
0176 
0177 /* get_crandom - correlated random number generator
0178  * Next number depends on last value.
0179  * rho is scaled to avoid floating point.
0180  */
0181 static u32 get_crandom(struct crndstate *state)
0182 {
0183     u64 value, rho;
0184     unsigned long answer;
0185 
0186     if (!state || state->rho == 0)  /* no correlation */
0187         return prandom_u32();
0188 
0189     value = prandom_u32();
0190     rho = (u64)state->rho + 1;
0191     answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
0192     state->last = answer;
0193     return answer;
0194 }
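/* Editor's note (illustrative only): get_crandom() blends the fresh random
 * value with the previous output in fixed point, so rho acts as the
 * correlation weight: with rho = 0xC0000000 (about 0.75 * 2^32) the result
 * is roughly 0.75 * state->last + 0.25 * value.  A hedged, userspace-style
 * sketch of the same arithmetic, deliberately kept out of the build:
 */
#if 0	/* not compiled; mirrors the blend in get_crandom() above */
static u32 crandom_blend(u32 last, u32 fresh, u32 rho)
{
	u64 r = (u64)rho + 1;

	/* weighted average of "last" and "fresh" in 32.32 fixed point */
	return (u32)(((u64)fresh * ((1ULL << 32) - r) + (u64)last * r) >> 32);
}
#endif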
0195 
0196 /* loss_4state - 4-state model loss generator
0197  * Generates losses according to the 4-state Markov chain adopted in
0198  * the GI (General and Intuitive) loss model.
0199  */
0200 static bool loss_4state(struct netem_sched_data *q)
0201 {
0202     struct clgstate *clg = &q->clg;
0203     u32 rnd = prandom_u32();
0204 
0205     /*
0206      * Compares rnd with the transition probabilities leaving the
0207      * current state, then decides the next state and whether the
0208      * next packet has to be transmitted or lost.
0209      * The four states correspond to:
0210      *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
0211      *   LOST_IN_GAP_PERIOD => isolated losses within a gap period
0212      *   LOST_IN_BURST_PERIOD => lost packets within a burst period
0213      *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
0214      */
0215     switch (clg->state) {
0216     case TX_IN_GAP_PERIOD:
0217         if (rnd < clg->a4) {
0218             clg->state = LOST_IN_GAP_PERIOD;
0219             return true;
0220         } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
0221             clg->state = LOST_IN_BURST_PERIOD;
0222             return true;
0223         } else if (clg->a1 + clg->a4 < rnd) {
0224             clg->state = TX_IN_GAP_PERIOD;
0225         }
0226 
0227         break;
0228     case TX_IN_BURST_PERIOD:
0229         if (rnd < clg->a5) {
0230             clg->state = LOST_IN_BURST_PERIOD;
0231             return true;
0232         } else {
0233             clg->state = TX_IN_BURST_PERIOD;
0234         }
0235 
0236         break;
0237     case LOST_IN_BURST_PERIOD:
0238         if (rnd < clg->a3)
0239             clg->state = TX_IN_BURST_PERIOD;
0240         else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
0241             clg->state = TX_IN_GAP_PERIOD;
0242         } else if (clg->a2 + clg->a3 < rnd) {
0243             clg->state = LOST_IN_BURST_PERIOD;
0244             return true;
0245         }
0246         break;
0247     case LOST_IN_GAP_PERIOD:
0248         clg->state = TX_IN_GAP_PERIOD;
0249         break;
0250     }
0251 
0252     return false;
0253 }
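/* Editor's note (summary of the mapping used above, illustrative only):
 * a packet in TX_IN_GAP_PERIOD suffers an isolated loss with probability
 * a4 (p14) and starts a loss burst with probability a1 (p13); from
 * LOST_IN_BURST_PERIOD the chain resumes transmission inside the burst
 * with probability a3 (p32) and ends the burst with probability a2 (p31);
 * a5 (p23) triggers further losses from TX_IN_BURST_PERIOD.  All
 * probabilities are u32 fractions of 2^32, matching prandom_u32().
 */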
0254 
0255 /* loss_gilb_ell - Gilbert-Elliot model loss generator
0256  * Generates losses according to the Gilbert-Elliot loss model or
0257  * its special cases (Gilbert or Simple Gilbert).
0258  *
0259  * Makes a comparison between random number and the transition
0260  * probabilities outgoing from the current state, then decides the
0261  * next state. A second random number is extracted and the comparison
0262  * with the loss probability of the current state decides if the next
0263  * packet will be transmitted or lost.
0264  */
0265 static bool loss_gilb_ell(struct netem_sched_data *q)
0266 {
0267     struct clgstate *clg = &q->clg;
0268 
0269     switch (clg->state) {
0270     case GOOD_STATE:
0271         if (prandom_u32() < clg->a1)
0272             clg->state = BAD_STATE;
0273         if (prandom_u32() < clg->a4)
0274             return true;
0275         break;
0276     case BAD_STATE:
0277         if (prandom_u32() < clg->a2)
0278             clg->state = GOOD_STATE;
0279         if (prandom_u32() > clg->a3)
0280             return true;
0281     }
0282 
0283     return false;
0284 }
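/* Editor's note (summary of the mapping used above, illustrative only):
 * in GOOD_STATE a packet is lost with probability a4 (1-k) and the chain
 * moves to BAD_STATE with probability a1 (p); in BAD_STATE a packet is
 * lost with probability 1 - a3 (i.e. 1-h) and the chain returns to
 * GOOD_STATE with probability a2 (r).  All values are u32 fractions of
 * 2^32.
 */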
0285 
0286 static bool loss_event(struct netem_sched_data *q)
0287 {
0288     switch (q->loss_model) {
0289     case CLG_RANDOM:
0290         /* Random packet drop 0 => none, ~0 => all */
0291         return q->loss && q->loss >= get_crandom(&q->loss_cor);
0292 
0293     case CLG_4_STATES:
0294         /* 4-state loss model algorithm (also used for the GI model).
0295         * Extracts a value from the 4-state Markov loss generator;
0296         * a true return value means the next packet is lost
0297         * (dropped).
0298         */
0299         return loss_4state(q);
0300 
0301     case CLG_GILB_ELL:
0302         /* Gilbert-Elliot loss model algorithm.
0303         * Extracts a value from the Gilbert-Elliot loss generator;
0304         * a true return value means the next packet is lost
0305         * (dropped).
0306         */
0307         return loss_gilb_ell(q);
0308     }
0309 
0310     return false;   /* not reached */
0311 }
0312 
0313 
0314 /* tabledist - return a pseudo-randomly distributed value with mean mu and
0315  * std deviation sigma.  Uses table lookup to approximate the desired
0316  * distribution, and a uniformly-distributed pseudo-random source.
0317  */
0318 static s64 tabledist(s64 mu, s32 sigma,
0319              struct crndstate *state,
0320              const struct disttable *dist)
0321 {
0322     s64 x;
0323     long t;
0324     u32 rnd;
0325 
0326     if (sigma == 0)
0327         return mu;
0328 
0329     rnd = get_crandom(state);
0330 
0331     /* default uniform distribution */
0332     if (dist == NULL)
0333         return ((rnd % (2 * (u32)sigma)) + mu) - sigma;
0334 
0335     t = dist->table[rnd % dist->size];
0336     x = (sigma % NETEM_DIST_SCALE) * t;
0337     if (x >= 0)
0338         x += NETEM_DIST_SCALE/2;
0339     else
0340         x -= NETEM_DIST_SCALE/2;
0341 
0342     return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
0343 }
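/* Editor's note (illustrative only): NETEM_DIST_SCALE is 8192, so a table
 * entry t expresses a deviation of t/8192 standard deviations; the
 * quotient/remainder split of sigma keeps the multiply-and-scale in
 * integer arithmetic with rounding.  For example, with mu = 0,
 * sigma = 10,000,000 ns and t = 8192 (one sigma in table units), the
 * function returns approximately 10,000,000 ns, as expected.
 */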
0344 
0345 static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
0346 {
0347     len += q->packet_overhead;
0348 
0349     if (q->cell_size) {
0350         u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
0351 
0352         if (len > cells * q->cell_size) /* extra cell needed for remainder */
0353             cells++;
0354         len = cells * (q->cell_size + q->cell_overhead);
0355     }
0356 
0357     return div64_u64(len * NSEC_PER_SEC, q->rate);
0358 }
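/* Editor's note (illustrative only, arbitrary numbers): with
 * packet_overhead = 14, cell_size = 48, cell_overhead = 5 and a 1000-byte
 * packet, len becomes 1014 bytes, which needs 22 cells (21 * 48 = 1008 is
 * not enough), so the billed size is 22 * (48 + 5) = 1166 bytes; at
 * rate = 1,000,000 bytes/sec that is 1,166,000 ns of transmit time.
 */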
0359 
0360 static void tfifo_reset(struct Qdisc *sch)
0361 {
0362     struct netem_sched_data *q = qdisc_priv(sch);
0363     struct rb_node *p = rb_first(&q->t_root);
0364 
0365     while (p) {
0366         struct sk_buff *skb = rb_to_skb(p);
0367 
0368         p = rb_next(p);
0369         rb_erase(&skb->rbnode, &q->t_root);
0370         rtnl_kfree_skbs(skb, skb);
0371     }
0372 
0373     rtnl_kfree_skbs(q->t_head, q->t_tail);
0374     q->t_head = NULL;
0375     q->t_tail = NULL;
0376 }
0377 
0378 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
0379 {
0380     struct netem_sched_data *q = qdisc_priv(sch);
0381     u64 tnext = netem_skb_cb(nskb)->time_to_send;
0382 
0383     if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
0384         if (q->t_tail)
0385             q->t_tail->next = nskb;
0386         else
0387             q->t_head = nskb;
0388         q->t_tail = nskb;
0389     } else {
0390         struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
0391 
0392         while (*p) {
0393             struct sk_buff *skb;
0394 
0395             parent = *p;
0396             skb = rb_to_skb(parent);
0397             if (tnext >= netem_skb_cb(skb)->time_to_send)
0398                 p = &parent->rb_right;
0399             else
0400                 p = &parent->rb_left;
0401         }
0402         rb_link_node(&nskb->rbnode, parent, p);
0403         rb_insert_color(&nskb->rbnode, &q->t_root);
0404     }
0405     sch->q.qlen++;
0406 }
0407 
0408 /* netem can't properly corrupt a megapacket (like we get from GSO), so
0409  * when we statistically choose to corrupt one, we instead segment it, returning
0410  * the first packet to be corrupted and re-enqueueing the remaining frames.
0411  */
0412 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
0413                      struct sk_buff **to_free)
0414 {
0415     struct sk_buff *segs;
0416     netdev_features_t features = netif_skb_features(skb);
0417 
0418     segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
0419 
0420     if (IS_ERR_OR_NULL(segs)) {
0421         qdisc_drop(skb, sch, to_free);
0422         return NULL;
0423     }
0424     consume_skb(skb);
0425     return segs;
0426 }
0427 
0428 /*
0429  * Insert one skb into qdisc.
0430  * Note: parent depends on return value to account for queue length.
0431  *  NET_XMIT_DROP: queue length didn't change.
0432  *      NET_XMIT_SUCCESS: one skb was queued.
0433  */
0434 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
0435              struct sk_buff **to_free)
0436 {
0437     struct netem_sched_data *q = qdisc_priv(sch);
0438     /* We don't fill cb now as skb_unshare() may invalidate it */
0439     struct netem_skb_cb *cb;
0440     struct sk_buff *skb2;
0441     struct sk_buff *segs = NULL;
0442     unsigned int prev_len = qdisc_pkt_len(skb);
0443     int count = 1;
0444     int rc = NET_XMIT_SUCCESS;
0445     int rc_drop = NET_XMIT_DROP;
0446 
0447     /* Do not fool qdisc_drop_all() */
0448     skb->prev = NULL;
0449 
0450     /* Random duplication */
0451     if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
0452         ++count;
0453 
0454     /* Drop packet? */
0455     if (loss_event(q)) {
0456         if (q->ecn && INET_ECN_set_ce(skb))
0457             qdisc_qstats_drop(sch); /* mark packet */
0458         else
0459             --count;
0460     }
0461     if (count == 0) {
0462         qdisc_qstats_drop(sch);
0463         __qdisc_drop(skb, to_free);
0464         return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
0465     }
0466 
0467     /* If a delay is expected, orphan the skb. (orphaning usually takes
0468      * place at TX completion time, so _before_ the link transit delay)
0469      */
0470     if (q->latency || q->jitter || q->rate)
0471         skb_orphan_partial(skb);
0472 
0473     /*
0474      * If we need to duplicate packet, then re-insert at top of the
0475      * qdisc tree, since parent queuer expects that only one
0476      * skb will be queued.
0477      */
0478     if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
0479         struct Qdisc *rootq = qdisc_root_bh(sch);
0480         u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
0481 
0482         q->duplicate = 0;
0483         rootq->enqueue(skb2, rootq, to_free);
0484         q->duplicate = dupsave;
0485         rc_drop = NET_XMIT_SUCCESS;
0486     }
0487 
0488     /*
0489      * Randomized packet corruption.
0490      * Make copy if needed since we are modifying
0491      * If packet is going to be hardware checksummed, then
0492      * do it now in software before we mangle it.
0493      */
0494     if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
0495         if (skb_is_gso(skb)) {
0496             skb = netem_segment(skb, sch, to_free);
0497             if (!skb)
0498                 return rc_drop;
0499             segs = skb->next;
0500             skb_mark_not_on_list(skb);
0501             qdisc_skb_cb(skb)->pkt_len = skb->len;
0502         }
0503 
0504         skb = skb_unshare(skb, GFP_ATOMIC);
0505         if (unlikely(!skb)) {
0506             qdisc_qstats_drop(sch);
0507             goto finish_segs;
0508         }
0509         if (skb->ip_summed == CHECKSUM_PARTIAL &&
0510             skb_checksum_help(skb)) {
0511             qdisc_drop(skb, sch, to_free);
0512             skb = NULL;
0513             goto finish_segs;
0514         }
0515 
0516         skb->data[prandom_u32() % skb_headlen(skb)] ^=
0517             1<<(prandom_u32() % 8);
0518     }
0519 
0520     if (unlikely(sch->q.qlen >= sch->limit)) {
0521         /* re-link segs, so that qdisc_drop_all() frees them all */
0522         skb->next = segs;
0523         qdisc_drop_all(skb, sch, to_free);
0524         return rc_drop;
0525     }
0526 
0527     qdisc_qstats_backlog_inc(sch, skb);
0528 
0529     cb = netem_skb_cb(skb);
0530     if (q->gap == 0 ||      /* not doing reordering */
0531         q->counter < q->gap - 1 ||  /* inside last reordering gap */
0532         q->reorder < get_crandom(&q->reorder_cor)) {
0533         u64 now;
0534         s64 delay;
0535 
0536         delay = tabledist(q->latency, q->jitter,
0537                   &q->delay_cor, q->delay_dist);
0538 
0539         now = ktime_get_ns();
0540 
0541         if (q->rate) {
0542             struct netem_skb_cb *last = NULL;
0543 
0544             if (sch->q.tail)
0545                 last = netem_skb_cb(sch->q.tail);
0546             if (q->t_root.rb_node) {
0547                 struct sk_buff *t_skb;
0548                 struct netem_skb_cb *t_last;
0549 
0550                 t_skb = skb_rb_last(&q->t_root);
0551                 t_last = netem_skb_cb(t_skb);
0552                 if (!last ||
0553                     t_last->time_to_send > last->time_to_send)
0554                     last = t_last;
0555             }
0556             if (q->t_tail) {
0557                 struct netem_skb_cb *t_last =
0558                     netem_skb_cb(q->t_tail);
0559 
0560                 if (!last ||
0561                     t_last->time_to_send > last->time_to_send)
0562                     last = t_last;
0563             }
0564 
0565             if (last) {
0566                 /*
0567                  * The last packet in the queue is the reference
0568                  * point (now); credit the time until it is sent
0569                  * against the delay.
0570                  */
0571                 delay -= last->time_to_send - now;
0572                 delay = max_t(s64, 0, delay);
0573                 now = last->time_to_send;
0574             }
0575 
0576             delay += packet_time_ns(qdisc_pkt_len(skb), q);
0577         }
0578 
0579         cb->time_to_send = now + delay;
0580         ++q->counter;
0581         tfifo_enqueue(skb, sch);
0582     } else {
0583         /*
0584          * Do re-ordering by putting one out of N packets at the front
0585          * of the queue.
0586          */
0587         cb->time_to_send = ktime_get_ns();
0588         q->counter = 0;
0589 
0590         __qdisc_enqueue_head(skb, &sch->q);
0591         sch->qstats.requeues++;
0592     }
0593 
0594 finish_segs:
0595     if (segs) {
0596         unsigned int len, last_len;
0597         int nb;
0598 
0599         len = skb ? skb->len : 0;
0600         nb = skb ? 1 : 0;
0601 
0602         while (segs) {
0603             skb2 = segs->next;
0604             skb_mark_not_on_list(segs);
0605             qdisc_skb_cb(segs)->pkt_len = segs->len;
0606             last_len = segs->len;
0607             rc = qdisc_enqueue(segs, sch, to_free);
0608             if (rc != NET_XMIT_SUCCESS) {
0609                 if (net_xmit_drop_count(rc))
0610                     qdisc_qstats_drop(sch);
0611             } else {
0612                 nb++;
0613                 len += last_len;
0614             }
0615             segs = skb2;
0616         }
0617         /* Parent qdiscs accounted for 1 skb of size @prev_len */
0618         qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
0619     } else if (!skb) {
0620         return NET_XMIT_DROP;
0621     }
0622     return NET_XMIT_SUCCESS;
0623 }
0624 
0625 /* Schedule the next slot at a future time and reset the
0626  * per-slot packet and byte budgets.
0627  */
0628 
0629 static void get_slot_next(struct netem_sched_data *q, u64 now)
0630 {
0631     s64 next_delay;
0632 
0633     if (!q->slot_dist)
0634         next_delay = q->slot_config.min_delay +
0635                 (prandom_u32() *
0636                  (q->slot_config.max_delay -
0637                   q->slot_config.min_delay) >> 32);
0638     else
0639         next_delay = tabledist(q->slot_config.dist_delay,
0640                        (s32)(q->slot_config.dist_jitter),
0641                        NULL, q->slot_dist);
0642 
0643     q->slot.slot_next = now + next_delay;
0644     q->slot.packets_left = q->slot_config.max_packets;
0645     q->slot.bytes_left = q->slot_config.max_bytes;
0646 }
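/* Editor's note (illustrative only): without a slot distribution table the
 * next slot starts after a delay drawn uniformly from
 * [min_delay, max_delay): prandom_u32() * (max - min) >> 32 is a uniform
 * offset in [0, max - min).  E.g. min_delay = 800,000 ns and
 * max_delay = 1,000,000 ns give a gap of 800-1000 us between slots, and
 * each slot releases at most max_packets packets / max_bytes bytes before
 * the next one is scheduled.
 */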
0647 
0648 static struct sk_buff *netem_peek(struct netem_sched_data *q)
0649 {
0650     struct sk_buff *skb = skb_rb_first(&q->t_root);
0651     u64 t1, t2;
0652 
0653     if (!skb)
0654         return q->t_head;
0655     if (!q->t_head)
0656         return skb;
0657 
0658     t1 = netem_skb_cb(skb)->time_to_send;
0659     t2 = netem_skb_cb(q->t_head)->time_to_send;
0660     if (t1 < t2)
0661         return skb;
0662     return q->t_head;
0663 }
0664 
0665 static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
0666 {
0667     if (skb == q->t_head) {
0668         q->t_head = skb->next;
0669         if (!q->t_head)
0670             q->t_tail = NULL;
0671     } else {
0672         rb_erase(&skb->rbnode, &q->t_root);
0673     }
0674 }
0675 
0676 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
0677 {
0678     struct netem_sched_data *q = qdisc_priv(sch);
0679     struct sk_buff *skb;
0680 
0681 tfifo_dequeue:
0682     skb = __qdisc_dequeue_head(&sch->q);
0683     if (skb) {
0684         qdisc_qstats_backlog_dec(sch, skb);
0685 deliver:
0686         qdisc_bstats_update(sch, skb);
0687         return skb;
0688     }
0689     skb = netem_peek(q);
0690     if (skb) {
0691         u64 time_to_send;
0692         u64 now = ktime_get_ns();
0693 
0694         /* is there still time remaining before it must be sent? */
0695         time_to_send = netem_skb_cb(skb)->time_to_send;
0696         if (q->slot.slot_next && q->slot.slot_next < time_to_send)
0697             get_slot_next(q, now);
0698 
0699         if (time_to_send <= now && q->slot.slot_next <= now) {
0700             netem_erase_head(q, skb);
0701             sch->q.qlen--;
0702             qdisc_qstats_backlog_dec(sch, skb);
0703             skb->next = NULL;
0704             skb->prev = NULL;
0705             /* skb->dev shares skb->rbnode area,
0706              * we need to restore its value.
0707              */
0708             skb->dev = qdisc_dev(sch);
0709 
0710             if (q->slot.slot_next) {
0711                 q->slot.packets_left--;
0712                 q->slot.bytes_left -= qdisc_pkt_len(skb);
0713                 if (q->slot.packets_left <= 0 ||
0714                     q->slot.bytes_left <= 0)
0715                     get_slot_next(q, now);
0716             }
0717 
0718             if (q->qdisc) {
0719                 unsigned int pkt_len = qdisc_pkt_len(skb);
0720                 struct sk_buff *to_free = NULL;
0721                 int err;
0722 
0723                 err = qdisc_enqueue(skb, q->qdisc, &to_free);
0724                 kfree_skb_list(to_free);
0725                 if (err != NET_XMIT_SUCCESS &&
0726                     net_xmit_drop_count(err)) {
0727                     qdisc_qstats_drop(sch);
0728                     qdisc_tree_reduce_backlog(sch, 1,
0729                                   pkt_len);
0730                 }
0731                 goto tfifo_dequeue;
0732             }
0733             goto deliver;
0734         }
0735 
0736         if (q->qdisc) {
0737             skb = q->qdisc->ops->dequeue(q->qdisc);
0738             if (skb)
0739                 goto deliver;
0740         }
0741 
0742         qdisc_watchdog_schedule_ns(&q->watchdog,
0743                        max(time_to_send,
0744                            q->slot.slot_next));
0745     }
0746 
0747     if (q->qdisc) {
0748         skb = q->qdisc->ops->dequeue(q->qdisc);
0749         if (skb)
0750             goto deliver;
0751     }
0752     return NULL;
0753 }
0754 
0755 static void netem_reset(struct Qdisc *sch)
0756 {
0757     struct netem_sched_data *q = qdisc_priv(sch);
0758 
0759     qdisc_reset_queue(sch);
0760     tfifo_reset(sch);
0761     if (q->qdisc)
0762         qdisc_reset(q->qdisc);
0763     qdisc_watchdog_cancel(&q->watchdog);
0764 }
0765 
0766 static void dist_free(struct disttable *d)
0767 {
0768     kvfree(d);
0769 }
0770 
0771 /*
0772  * Distribution data is a variable size payload containing
0773  * signed 16 bit values.
0774  */
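/* Editor's note (illustrative only): as consumed by tabledist() above, each
 * 16-bit entry is effectively an inverse-CDF sample expressed in units of
 * NETEM_DIST_SCALE per standard deviation (8192 == +1 sigma).  Tables such
 * as the normal and pareto distributions shipped with iproute2 are
 * generated off-line and passed in via the TCA_NETEM_DELAY_DIST and
 * TCA_NETEM_SLOT_DIST attributes.
 */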
0775 
0776 static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
0777               const struct nlattr *attr)
0778 {
0779     size_t n = nla_len(attr)/sizeof(__s16);
0780     const __s16 *data = nla_data(attr);
0781     spinlock_t *root_lock;
0782     struct disttable *d;
0783     int i;
0784 
0785     if (!n || n > NETEM_DIST_MAX)
0786         return -EINVAL;
0787 
0788     d = kvmalloc(struct_size(d, table, n), GFP_KERNEL);
0789     if (!d)
0790         return -ENOMEM;
0791 
0792     d->size = n;
0793     for (i = 0; i < n; i++)
0794         d->table[i] = data[i];
0795 
0796     root_lock = qdisc_root_sleeping_lock(sch);
0797 
0798     spin_lock_bh(root_lock);
0799     swap(*tbl, d);
0800     spin_unlock_bh(root_lock);
0801 
0802     dist_free(d);
0803     return 0;
0804 }
0805 
0806 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
0807 {
0808     const struct tc_netem_slot *c = nla_data(attr);
0809 
0810     q->slot_config = *c;
0811     if (q->slot_config.max_packets == 0)
0812         q->slot_config.max_packets = INT_MAX;
0813     if (q->slot_config.max_bytes == 0)
0814         q->slot_config.max_bytes = INT_MAX;
0815 
0816     /* capping dist_jitter to the range acceptable by tabledist() */
0817     q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));
0818 
0819     q->slot.packets_left = q->slot_config.max_packets;
0820     q->slot.bytes_left = q->slot_config.max_bytes;
0821     if (q->slot_config.min_delay | q->slot_config.max_delay |
0822         q->slot_config.dist_jitter)
0823         q->slot.slot_next = ktime_get_ns();
0824     else
0825         q->slot.slot_next = 0;
0826 }
0827 
0828 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
0829 {
0830     const struct tc_netem_corr *c = nla_data(attr);
0831 
0832     init_crandom(&q->delay_cor, c->delay_corr);
0833     init_crandom(&q->loss_cor, c->loss_corr);
0834     init_crandom(&q->dup_cor, c->dup_corr);
0835 }
0836 
0837 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
0838 {
0839     const struct tc_netem_reorder *r = nla_data(attr);
0840 
0841     q->reorder = r->probability;
0842     init_crandom(&q->reorder_cor, r->correlation);
0843 }
0844 
0845 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
0846 {
0847     const struct tc_netem_corrupt *r = nla_data(attr);
0848 
0849     q->corrupt = r->probability;
0850     init_crandom(&q->corrupt_cor, r->correlation);
0851 }
0852 
0853 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
0854 {
0855     const struct tc_netem_rate *r = nla_data(attr);
0856 
0857     q->rate = r->rate;
0858     q->packet_overhead = r->packet_overhead;
0859     q->cell_size = r->cell_size;
0860     q->cell_overhead = r->cell_overhead;
0861     if (q->cell_size)
0862         q->cell_size_reciprocal = reciprocal_value(q->cell_size);
0863     else
0864         q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
0865 }
0866 
0867 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
0868 {
0869     const struct nlattr *la;
0870     int rem;
0871 
0872     nla_for_each_nested(la, attr, rem) {
0873         u16 type = nla_type(la);
0874 
0875         switch (type) {
0876         case NETEM_LOSS_GI: {
0877             const struct tc_netem_gimodel *gi = nla_data(la);
0878 
0879             if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
0880                 pr_info("netem: incorrect gi model size\n");
0881                 return -EINVAL;
0882             }
0883 
0884             q->loss_model = CLG_4_STATES;
0885 
0886             q->clg.state = TX_IN_GAP_PERIOD;
0887             q->clg.a1 = gi->p13;
0888             q->clg.a2 = gi->p31;
0889             q->clg.a3 = gi->p32;
0890             q->clg.a4 = gi->p14;
0891             q->clg.a5 = gi->p23;
0892             break;
0893         }
0894 
0895         case NETEM_LOSS_GE: {
0896             const struct tc_netem_gemodel *ge = nla_data(la);
0897 
0898             if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
0899                 pr_info("netem: incorrect ge model size\n");
0900                 return -EINVAL;
0901             }
0902 
0903             q->loss_model = CLG_GILB_ELL;
0904             q->clg.state = GOOD_STATE;
0905             q->clg.a1 = ge->p;
0906             q->clg.a2 = ge->r;
0907             q->clg.a3 = ge->h;
0908             q->clg.a4 = ge->k1;
0909             break;
0910         }
0911 
0912         default:
0913             pr_info("netem: unknown loss type %u\n", type);
0914             return -EINVAL;
0915         }
0916     }
0917 
0918     return 0;
0919 }
0920 
0921 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
0922     [TCA_NETEM_CORR]    = { .len = sizeof(struct tc_netem_corr) },
0923     [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
0924     [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
0925     [TCA_NETEM_RATE]    = { .len = sizeof(struct tc_netem_rate) },
0926     [TCA_NETEM_LOSS]    = { .type = NLA_NESTED },
0927     [TCA_NETEM_ECN]     = { .type = NLA_U32 },
0928     [TCA_NETEM_RATE64]  = { .type = NLA_U64 },
0929     [TCA_NETEM_LATENCY64]   = { .type = NLA_S64 },
0930     [TCA_NETEM_JITTER64]    = { .type = NLA_S64 },
0931     [TCA_NETEM_SLOT]    = { .len = sizeof(struct tc_netem_slot) },
0932 };
0933 
0934 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
0935               const struct nla_policy *policy, int len)
0936 {
0937     int nested_len = nla_len(nla) - NLA_ALIGN(len);
0938 
0939     if (nested_len < 0) {
0940         pr_info("netem: invalid attributes len %d\n", nested_len);
0941         return -EINVAL;
0942     }
0943 
0944     if (nested_len >= nla_attr_size(0))
0945         return nla_parse_deprecated(tb, maxtype,
0946                         nla_data(nla) + NLA_ALIGN(len),
0947                         nested_len, policy, NULL);
0948 
0949     memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
0950     return 0;
0951 }
0952 
0953 /* Parse netlink message to set options */
0954 static int netem_change(struct Qdisc *sch, struct nlattr *opt,
0955             struct netlink_ext_ack *extack)
0956 {
0957     struct netem_sched_data *q = qdisc_priv(sch);
0958     struct nlattr *tb[TCA_NETEM_MAX + 1];
0959     struct tc_netem_qopt *qopt;
0960     struct clgstate old_clg;
0961     int old_loss_model = CLG_RANDOM;
0962     int ret;
0963 
0964     if (opt == NULL)
0965         return -EINVAL;
0966 
0967     qopt = nla_data(opt);
0968     ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
0969     if (ret < 0)
0970         return ret;
0971 
0972     /* backup q->clg and q->loss_model */
0973     old_clg = q->clg;
0974     old_loss_model = q->loss_model;
0975 
0976     if (tb[TCA_NETEM_LOSS]) {
0977         ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
0978         if (ret) {
0979             q->loss_model = old_loss_model;
0980             return ret;
0981         }
0982     } else {
0983         q->loss_model = CLG_RANDOM;
0984     }
0985 
0986     if (tb[TCA_NETEM_DELAY_DIST]) {
0987         ret = get_dist_table(sch, &q->delay_dist,
0988                      tb[TCA_NETEM_DELAY_DIST]);
0989         if (ret)
0990             goto get_table_failure;
0991     }
0992 
0993     if (tb[TCA_NETEM_SLOT_DIST]) {
0994         ret = get_dist_table(sch, &q->slot_dist,
0995                      tb[TCA_NETEM_SLOT_DIST]);
0996         if (ret)
0997             goto get_table_failure;
0998     }
0999 
1000     sch->limit = qopt->limit;
1001 
1002     q->latency = PSCHED_TICKS2NS(qopt->latency);
1003     q->jitter = PSCHED_TICKS2NS(qopt->jitter);
1004     q->limit = qopt->limit;
1005     q->gap = qopt->gap;
1006     q->counter = 0;
1007     q->loss = qopt->loss;
1008     q->duplicate = qopt->duplicate;
1009 
1010     /* for compatibility with earlier versions.
1011      * if gap is set, need to assume 100% probability
1012      */
1013     if (q->gap)
1014         q->reorder = ~0;
1015 
1016     if (tb[TCA_NETEM_CORR])
1017         get_correlation(q, tb[TCA_NETEM_CORR]);
1018 
1019     if (tb[TCA_NETEM_REORDER])
1020         get_reorder(q, tb[TCA_NETEM_REORDER]);
1021 
1022     if (tb[TCA_NETEM_CORRUPT])
1023         get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
1024 
1025     if (tb[TCA_NETEM_RATE])
1026         get_rate(q, tb[TCA_NETEM_RATE]);
1027 
1028     if (tb[TCA_NETEM_RATE64])
1029         q->rate = max_t(u64, q->rate,
1030                 nla_get_u64(tb[TCA_NETEM_RATE64]));
1031 
1032     if (tb[TCA_NETEM_LATENCY64])
1033         q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
1034 
1035     if (tb[TCA_NETEM_JITTER64])
1036         q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
1037 
1038     if (tb[TCA_NETEM_ECN])
1039         q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
1040 
1041     if (tb[TCA_NETEM_SLOT])
1042         get_slot(q, tb[TCA_NETEM_SLOT]);
1043 
1044     /* capping jitter to the range acceptable by tabledist() */
1045     q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
1046 
1047     return ret;
1048 
1049 get_table_failure:
1050     /* recover clg and loss_model, in case
1051      * q->clg and q->loss_model were modified
1052      * in get_loss_clg()
1053      */
1054     q->clg = old_clg;
1055     q->loss_model = old_loss_model;
1056     return ret;
1057 }
1058 
1059 static int netem_init(struct Qdisc *sch, struct nlattr *opt,
1060               struct netlink_ext_ack *extack)
1061 {
1062     struct netem_sched_data *q = qdisc_priv(sch);
1063     int ret;
1064 
1065     qdisc_watchdog_init(&q->watchdog, sch);
1066 
1067     if (!opt)
1068         return -EINVAL;
1069 
1070     q->loss_model = CLG_RANDOM;
1071     ret = netem_change(sch, opt, extack);
1072     if (ret)
1073         pr_info("netem: change failed\n");
1074     return ret;
1075 }
1076 
1077 static void netem_destroy(struct Qdisc *sch)
1078 {
1079     struct netem_sched_data *q = qdisc_priv(sch);
1080 
1081     qdisc_watchdog_cancel(&q->watchdog);
1082     if (q->qdisc)
1083         qdisc_put(q->qdisc);
1084     dist_free(q->delay_dist);
1085     dist_free(q->slot_dist);
1086 }
1087 
1088 static int dump_loss_model(const struct netem_sched_data *q,
1089                struct sk_buff *skb)
1090 {
1091     struct nlattr *nest;
1092 
1093     nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
1094     if (nest == NULL)
1095         goto nla_put_failure;
1096 
1097     switch (q->loss_model) {
1098     case CLG_RANDOM:
1099         /* legacy loss model */
1100         nla_nest_cancel(skb, nest);
1101         return 0;   /* no data */
1102 
1103     case CLG_4_STATES: {
1104         struct tc_netem_gimodel gi = {
1105             .p13 = q->clg.a1,
1106             .p31 = q->clg.a2,
1107             .p32 = q->clg.a3,
1108             .p14 = q->clg.a4,
1109             .p23 = q->clg.a5,
1110         };
1111 
1112         if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
1113             goto nla_put_failure;
1114         break;
1115     }
1116     case CLG_GILB_ELL: {
1117         struct tc_netem_gemodel ge = {
1118             .p = q->clg.a1,
1119             .r = q->clg.a2,
1120             .h = q->clg.a3,
1121             .k1 = q->clg.a4,
1122         };
1123 
1124         if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
1125             goto nla_put_failure;
1126         break;
1127     }
1128     }
1129 
1130     nla_nest_end(skb, nest);
1131     return 0;
1132 
1133 nla_put_failure:
1134     nla_nest_cancel(skb, nest);
1135     return -1;
1136 }
1137 
1138 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1139 {
1140     const struct netem_sched_data *q = qdisc_priv(sch);
1141     struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1142     struct tc_netem_qopt qopt;
1143     struct tc_netem_corr cor;
1144     struct tc_netem_reorder reorder;
1145     struct tc_netem_corrupt corrupt;
1146     struct tc_netem_rate rate;
1147     struct tc_netem_slot slot;
1148 
1149     qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency),
1150                  UINT_MAX);
1151     qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter),
1152                 UINT_MAX);
1153     qopt.limit = q->limit;
1154     qopt.loss = q->loss;
1155     qopt.gap = q->gap;
1156     qopt.duplicate = q->duplicate;
1157     if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1158         goto nla_put_failure;
1159 
1160     if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
1161         goto nla_put_failure;
1162 
1163     if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
1164         goto nla_put_failure;
1165 
1166     cor.delay_corr = q->delay_cor.rho;
1167     cor.loss_corr = q->loss_cor.rho;
1168     cor.dup_corr = q->dup_cor.rho;
1169     if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1170         goto nla_put_failure;
1171 
1172     reorder.probability = q->reorder;
1173     reorder.correlation = q->reorder_cor.rho;
1174     if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1175         goto nla_put_failure;
1176 
1177     corrupt.probability = q->corrupt;
1178     corrupt.correlation = q->corrupt_cor.rho;
1179     if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1180         goto nla_put_failure;
1181 
1182     if (q->rate >= (1ULL << 32)) {
1183         if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1184                       TCA_NETEM_PAD))
1185             goto nla_put_failure;
1186         rate.rate = ~0U;
1187     } else {
1188         rate.rate = q->rate;
1189     }
1190     rate.packet_overhead = q->packet_overhead;
1191     rate.cell_size = q->cell_size;
1192     rate.cell_overhead = q->cell_overhead;
1193     if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1194         goto nla_put_failure;
1195 
1196     if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1197         goto nla_put_failure;
1198 
1199     if (dump_loss_model(q, skb) != 0)
1200         goto nla_put_failure;
1201 
1202     if (q->slot_config.min_delay | q->slot_config.max_delay |
1203         q->slot_config.dist_jitter) {
1204         slot = q->slot_config;
1205         if (slot.max_packets == INT_MAX)
1206             slot.max_packets = 0;
1207         if (slot.max_bytes == INT_MAX)
1208             slot.max_bytes = 0;
1209         if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
1210             goto nla_put_failure;
1211     }
1212 
1213     return nla_nest_end(skb, nla);
1214 
1215 nla_put_failure:
1216     nlmsg_trim(skb, nla);
1217     return -1;
1218 }
1219 
1220 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1221               struct sk_buff *skb, struct tcmsg *tcm)
1222 {
1223     struct netem_sched_data *q = qdisc_priv(sch);
1224 
1225     if (cl != 1 || !q->qdisc)   /* only one class */
1226         return -ENOENT;
1227 
1228     tcm->tcm_handle |= TC_H_MIN(1);
1229     tcm->tcm_info = q->qdisc->handle;
1230 
1231     return 0;
1232 }
1233 
1234 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1235              struct Qdisc **old, struct netlink_ext_ack *extack)
1236 {
1237     struct netem_sched_data *q = qdisc_priv(sch);
1238 
1239     *old = qdisc_replace(sch, new, &q->qdisc);
1240     return 0;
1241 }
1242 
1243 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1244 {
1245     struct netem_sched_data *q = qdisc_priv(sch);
1246     return q->qdisc;
1247 }
1248 
1249 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1250 {
1251     return 1;
1252 }
1253 
1254 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1255 {
1256     if (!walker->stop) {
1257         if (walker->count >= walker->skip)
1258             if (walker->fn(sch, 1, walker) < 0) {
1259                 walker->stop = 1;
1260                 return;
1261             }
1262         walker->count++;
1263     }
1264 }
1265 
1266 static const struct Qdisc_class_ops netem_class_ops = {
1267     .graft      =   netem_graft,
1268     .leaf       =   netem_leaf,
1269     .find       =   netem_find,
1270     .walk       =   netem_walk,
1271     .dump       =   netem_dump_class,
1272 };
1273 
1274 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1275     .id     =   "netem",
1276     .cl_ops     =   &netem_class_ops,
1277     .priv_size  =   sizeof(struct netem_sched_data),
1278     .enqueue    =   netem_enqueue,
1279     .dequeue    =   netem_dequeue,
1280     .peek       =   qdisc_peek_dequeued,
1281     .init       =   netem_init,
1282     .reset      =   netem_reset,
1283     .destroy    =   netem_destroy,
1284     .change     =   netem_change,
1285     .dump       =   netem_dump,
1286     .owner      =   THIS_MODULE,
1287 };
1288 
1289 
1290 static int __init netem_module_init(void)
1291 {
1292     pr_info("netem: version " VERSION "\n");
1293     return register_qdisc(&netem_qdisc_ops);
1294 }
1295 static void __exit netem_module_exit(void)
1296 {
1297     unregister_qdisc(&netem_qdisc_ops);
1298 }
1299 module_init(netem_module_init)
1300 module_exit(netem_module_exit)
1301 MODULE_LICENSE("GPL");