0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * net/sched/sch_netem.c    Network emulator
0004  *
0005  *          Many of the algorithms and ideas for this came from
0006  *      NIST Net which is not copyrighted.
0007  *
0008  * Authors: Stephen Hemminger <shemminger@osdl.org>
0009  *      Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
0010  */
0011 
0012 #include <linux/mm.h>
0013 #include <linux/module.h>
0014 #include <linux/slab.h>
0015 #include <linux/types.h>
0016 #include <linux/kernel.h>
0017 #include <linux/errno.h>
0018 #include <linux/skbuff.h>
0019 #include <linux/vmalloc.h>
0020 #include <linux/rtnetlink.h>
0021 #include <linux/reciprocal_div.h>
0022 #include <linux/rbtree.h>
0023 
0024 #include <net/netlink.h>
0025 #include <net/pkt_sched.h>
0026 #include <net/inet_ecn.h>
0027 
0028 #define VERSION "1.3"
0029 
0030 /*  Network Emulation Queuing algorithm.
0031     ====================================
0032 
0033     Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
0034          Network Emulation Tool"
0035          [2] Luigi Rizzo, DummyNet for FreeBSD
0036 
0037      ----------------------------------------------------------------
0038 
0039      This started out as a simple way to delay outgoing packets to
0040      test TCP but has grown to include most of the functionality
0041      of a full-blown network emulator like NIST Net. It can delay
0042      packets and add random jitter (and correlation). The random
0043      distribution can be loaded from a table as well to provide
0044      normal, Pareto, or experimental curves. Packet loss,
0045      duplication, and reordering can also be emulated.
0046 
0047      This qdisc does not do classification; that can be handled by
0048      layering other disciplines.  It does not need to do bandwidth
0049      control either, since that can be handled by using a token
0050      bucket or other rate control.
0051 
0052      Correlated Loss Generator models
0053 
0054     Added generation of correlated loss according to a 4-state
0055     Markov model and the "Gilbert-Elliot" model.
0056 
0057     References:
0058     [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
0059     [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
0060     and intuitive loss model for packet networks and its implementation
0061     in the Netem module in the Linux kernel", available in [1]
0062 
0063     Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
0064          Fabio Ludovici <fabio.ludovici at yahoo.it>
0065 */
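/* Editor's note (illustrative, not part of the original source): the knobs
 * described above are normally configured from user space via tc(8).  A
 * typical invocation, assuming an interface named eth0, might look like:
 *
 *   tc qdisc add dev eth0 root netem delay 100ms 10ms 25% loss 0.5% 25%
 *
 * i.e. 100 ms base delay, +/- 10 ms jitter with 25% correlation, and 0.5%
 * loss with 25% correlation.  See the tc-netem(8) man page for the
 * authoritative syntax; the interface name and values here are arbitrary.
 */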
0066 
0067 struct disttable {
0068     u32  size;
0069     s16 table[];
0070 };
0071 
0072 struct netem_sched_data {
0073     /* internal t(ime)fifo qdisc uses t_root and sch->limit */
0074     struct rb_root t_root;
0075 
0076     /* a linear queue; reduces rbtree rebalancing when jitter is low */
0077     struct sk_buff  *t_head;
0078     struct sk_buff  *t_tail;
0079 
0080     /* optional qdisc for classful handling (NULL at netem init) */
0081     struct Qdisc    *qdisc;
0082 
0083     struct qdisc_watchdog watchdog;
0084 
0085     s64 latency;
0086     s64 jitter;
0087 
0088     u32 loss;
0089     u32 ecn;
0090     u32 limit;
0091     u32 counter;
0092     u32 gap;
0093     u32 duplicate;
0094     u32 reorder;
0095     u32 corrupt;
0096     u64 rate;
0097     s32 packet_overhead;
0098     u32 cell_size;
0099     struct reciprocal_value cell_size_reciprocal;
0100     s32 cell_overhead;
0101 
0102     struct crndstate {
0103         u32 last;
0104         u32 rho;
0105     } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
0106 
0107     struct disttable *delay_dist;
0108 
0109     enum  {
0110         CLG_RANDOM,
0111         CLG_4_STATES,
0112         CLG_GILB_ELL,
0113     } loss_model;
0114 
0115     enum {
0116         TX_IN_GAP_PERIOD = 1,
0117         TX_IN_BURST_PERIOD,
0118         LOST_IN_GAP_PERIOD,
0119         LOST_IN_BURST_PERIOD,
0120     } _4_state_model;
0121 
0122     enum {
0123         GOOD_STATE = 1,
0124         BAD_STATE,
0125     } GE_state_model;
0126 
0127     /* Correlated Loss Generation models */
0128     struct clgstate {
0129         /* state of the Markov chain */
0130         u8 state;
0131 
0132         /* 4-states and Gilbert-Elliot models */
0133         u32 a1; /* p13 for 4-states or p for GE */
0134         u32 a2; /* p31 for 4-states or r for GE */
0135         u32 a3; /* p32 for 4-states or h for GE */
0136         u32 a4; /* p14 for 4-states or 1-k for GE */
0137         u32 a5; /* p23 used only in 4-states */
0138     } clg;
0139 
0140     struct tc_netem_slot slot_config;
0141     struct slotstate {
0142         u64 slot_next;
0143         s32 packets_left;
0144         s32 bytes_left;
0145     } slot;
0146 
0147     struct disttable *slot_dist;
0148 };
0149 
0150 /* Time stamp put into socket buffer control block
0151  * Only valid when skbs are in our internal t(ime)fifo queue.
0152  *
0153  * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
0154  * and skb->next & skb->prev are scratch space for a qdisc,
0155  * we save skb->tstamp value in skb->cb[] before destroying it.
0156  */
0157 struct netem_skb_cb {
0158     u64         time_to_send;
0159 };
0160 
0161 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
0162 {
0163     /* we assume we can use skb next/prev/tstamp as storage for rb_node */
0164     qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
0165     return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
0166 }
0167 
0168 /* init_crandom - initialize correlated random number generator
0169  * Use entropy source for initial seed.
0170  */
0171 static void init_crandom(struct crndstate *state, unsigned long rho)
0172 {
0173     state->rho = rho;
0174     state->last = prandom_u32();
0175 }
0176 
0177 /* get_crandom - correlated random number generator
0178  * Next number depends on last value.
0179  * rho is scaled to avoid floating point.
0180  */
0181 static u32 get_crandom(struct crndstate *state)
0182 {
0183     u64 value, rho;
0184     unsigned long answer;
0185 
0186     if (!state || state->rho == 0)  /* no correlation */
0187         return prandom_u32();
0188 
0189     value = prandom_u32();
0190     rho = (u64)state->rho + 1;
0191     answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
0192     state->last = answer;
0193     return answer;
0194 }
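/* Editor's note (illustrative only): get_crandom() blends the fresh random
 * value with the previous output in fixed point, so rho acts as the
 * correlation weight: with rho = 0xC0000000 (about 0.75 * 2^32) the result
 * is roughly 0.75 * state->last + 0.25 * value.  A hedged, userspace-style
 * sketch of the same arithmetic, deliberately kept out of the build:
 */
#if 0	/* not compiled; mirrors the blend in get_crandom() above */
static u32 crandom_blend(u32 last, u32 fresh, u32 rho)
{
	u64 r = (u64)rho + 1;

	/* weighted average of "last" and "fresh" in 32.32 fixed point */
	return (u32)(((u64)fresh * ((1ULL << 32) - r) + (u64)last * r) >> 32);
}
#endif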
0195 
0196 /* loss_4state - 4-state model loss generator
0197  * Generates losses according to the 4-state Markov chain adopted in
0198  * the GI (General and Intuitive) loss model.
0199  */
0200 static bool loss_4state(struct netem_sched_data *q)
0201 {
0202     struct clgstate *clg = &q->clg;
0203     u32 rnd = prandom_u32();
0204 
0205     /*
0206      * Compares rnd with the transition probabilities leaving the
0207      * current state, then decides the next state and whether the
0208      * next packet has to be transmitted or lost.
0209      * The four states correspond to:
0210      *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
0211      *   LOST_IN_GAP_PERIOD => isolated losses within a gap period
0212      *   LOST_IN_BURST_PERIOD => lost packets within a burst period
0213      *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
0214      */
0215     switch (clg->state) {
0216     case TX_IN_GAP_PERIOD:
0217         if (rnd < clg->a4) {
0218             clg->state = LOST_IN_GAP_PERIOD;
0219             return true;
0220         } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
0221             clg->state = LOST_IN_BURST_PERIOD;
0222             return true;
0223         } else if (clg->a1 + clg->a4 < rnd) {
0224             clg->state = TX_IN_GAP_PERIOD;
0225         }
0226 
0227         break;
0228     case TX_IN_BURST_PERIOD:
0229         if (rnd < clg->a5) {
0230             clg->state = LOST_IN_BURST_PERIOD;
0231             return true;
0232         } else {
0233             clg->state = TX_IN_BURST_PERIOD;
0234         }
0235 
0236         break;
0237     case LOST_IN_BURST_PERIOD:
0238         if (rnd < clg->a3)
0239             clg->state = TX_IN_BURST_PERIOD;
0240         else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
0241             clg->state = TX_IN_GAP_PERIOD;
0242         } else if (clg->a2 + clg->a3 < rnd) {
0243             clg->state = LOST_IN_BURST_PERIOD;
0244             return true;
0245         }
0246         break;
0247     case LOST_IN_GAP_PERIOD:
0248         clg->state = TX_IN_GAP_PERIOD;
0249         break;
0250     }
0251 
0252     return false;
0253 }
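/* Editor's note (summary of the mapping used above, illustrative only):
 * a packet in TX_IN_GAP_PERIOD suffers an isolated loss with probability
 * a4 (p14) and starts a loss burst with probability a1 (p13); from
 * LOST_IN_BURST_PERIOD the chain resumes transmission inside the burst
 * with probability a3 (p32) and ends the burst with probability a2 (p31);
 * a5 (p23) triggers further losses from TX_IN_BURST_PERIOD.  All
 * probabilities are u32 fractions of 2^32, matching prandom_u32().
 */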
0254 
0255 /* loss_gilb_ell - Gilbert-Elliot model loss generator
0256  * Generates losses according to the Gilbert-Elliot loss model or
0257  * its special cases (Gilbert or Simple Gilbert).
0258  *
0259  * Makes a comparison between random number and the transition
0260  * probabilities outgoing from the current state, then decides the
0261  * next state. A second random number is extracted and the comparison
0262  * with the loss probability of the current state decides if the next
0263  * packet will be transmitted or lost.
0264  */
0265 static bool loss_gilb_ell(struct netem_sched_data *q)
0266 {
0267     struct clgstate *clg = &q->clg;
0268 
0269     switch (clg->state) {
0270     case GOOD_STATE:
0271         if (prandom_u32() < clg->a1)
0272             clg->state = BAD_STATE;
0273         if (prandom_u32() < clg->a4)
0274             return true;
0275         break;
0276     case BAD_STATE:
0277         if (prandom_u32() < clg->a2)
0278             clg->state = GOOD_STATE;
0279         if (prandom_u32() > clg->a3)
0280             return true;
0281     }
0282 
0283     return false;
0284 }
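/* Editor's note (summary of the mapping used above, illustrative only):
 * in GOOD_STATE a packet is lost with probability a4 (1-k) and the chain
 * moves to BAD_STATE with probability a1 (p); in BAD_STATE a packet is
 * lost with probability 1 - a3 (i.e. 1-h) and the chain returns to
 * GOOD_STATE with probability a2 (r).  All values are u32 fractions of
 * 2^32.
 */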
0285 
0286 static bool loss_event(struct netem_sched_data *q)
0287 {
0288     switch (q->loss_model) {
0289     case CLG_RANDOM:
0290         /* Random packet drop 0 => none, ~0 => all */
0291         return q->loss && q->loss >= get_crandom(&q->loss_cor);
0292 
0293     case CLG_4_STATES:
0294         /* 4-state loss model algorithm (also used for the GI model).
0295         * Extracts a value from the 4-state Markov loss generator;
0296         * a true return value means the next packet is lost
0297         * (dropped).
0298         */
0299         return loss_4state(q);
0300 
0301     case CLG_GILB_ELL:
0302         /* Gilbert-Elliot loss model algorithm.
0303         * Extracts a value from the Gilbert-Elliot loss generator;
0304         * a true return value means the next packet is lost
0305         * (dropped).
0306         */
0307         return loss_gilb_ell(q);
0308     }
0309 
0310     return false;   /* not reached */
0311 }
0312 
0313 
0314 /* tabledist - return a pseudo-randomly distributed value with mean mu and
0315  * std deviation sigma.  Uses table lookup to approximate the desired
0316  * distribution, and a uniformly-distributed pseudo-random source.
0317  */
0318 static s64 tabledist(s64 mu, s32 sigma,
0319              struct crndstate *state,
0320              const struct disttable *dist)
0321 {
0322     s64 x;
0323     long t;
0324     u32 rnd;
0325 
0326     if (sigma == 0)
0327         return mu;
0328 
0329     rnd = get_crandom(state);
0330 
0331     /* default uniform distribution */
0332     if (dist == NULL)
0333         return ((rnd % (2 * (u32)sigma)) + mu) - sigma;
0334 
0335     t = dist->table[rnd % dist->size];
0336     x = (sigma % NETEM_DIST_SCALE) * t;
0337     if (x >= 0)
0338         x += NETEM_DIST_SCALE/2;
0339     else
0340         x -= NETEM_DIST_SCALE/2;
0341 
0342     return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
0343 }
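/* Editor's note (illustrative only): NETEM_DIST_SCALE is 8192, so a table
 * entry t expresses a deviation of t/8192 standard deviations; the
 * quotient/remainder split of sigma keeps the multiply-and-scale in
 * integer arithmetic with rounding.  For example, with mu = 0,
 * sigma = 10,000,000 ns and t = 8192 (one sigma in table units), the
 * function returns approximately 10,000,000 ns, as expected.
 */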
0344 
0345 static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
0346 {
0347     len += q->packet_overhead;
0348 
0349     if (q->cell_size) {
0350         u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
0351 
0352         if (len > cells * q->cell_size) /* extra cell needed for remainder */
0353             cells++;
0354         len = cells * (q->cell_size + q->cell_overhead);
0355     }
0356 
0357     return div64_u64(len * NSEC_PER_SEC, q->rate);
0358 }
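/* Editor's note (illustrative only, arbitrary numbers): with
 * packet_overhead = 14, cell_size = 48, cell_overhead = 5 and a 1000-byte
 * packet, len becomes 1014 bytes, which needs 22 cells (21 * 48 = 1008 is
 * not enough), so the billed size is 22 * (48 + 5) = 1166 bytes; at
 * rate = 1,000,000 bytes/sec that is 1,166,000 ns of transmit time.
 */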
0359 
0360 static void tfifo_reset(struct Qdisc *sch)
0361 {
0362     struct netem_sched_data *q = qdisc_priv(sch);
0363     struct rb_node *p = rb_first(&q->t_root);
0364 
0365     while (p) {
0366         struct sk_buff *skb = rb_to_skb(p);
0367 
0368         p = rb_next(p);
0369         rb_erase(&skb->rbnode, &q->t_root);
0370         rtnl_kfree_skbs(skb, skb);
0371     }
0372 
0373     rtnl_kfree_skbs(q->t_head, q->t_tail);
0374     q->t_head = NULL;
0375     q->t_tail = NULL;
0376 }
0377 
0378 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
0379 {
0380     struct netem_sched_data *q = qdisc_priv(sch);
0381     u64 tnext = netem_skb_cb(nskb)->time_to_send;
0382 
0383     if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
0384         if (q->t_tail)
0385             q->t_tail->next = nskb;
0386         else
0387             q->t_head = nskb;
0388         q->t_tail = nskb;
0389     } else {
0390         struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
0391 
0392         while (*p) {
0393             struct sk_buff *skb;
0394 
0395             parent = *p;
0396             skb = rb_to_skb(parent);
0397             if (tnext >= netem_skb_cb(skb)->time_to_send)
0398                 p = &parent->rb_right;
0399             else
0400                 p = &parent->rb_left;
0401         }
0402         rb_link_node(&nskb->rbnode, parent, p);
0403         rb_insert_color(&nskb->rbnode, &q->t_root);
0404     }
0405     sch->q.qlen++;
0406 }
0407 
0408 /* netem can't properly corrupt a megapacket (like we get from GSO), so
0409  * when we statistically choose to corrupt one, we instead segment it, returning
0410  * the first packet to be corrupted and re-enqueueing the remaining frames.
0411  */
0412 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
0413                      struct sk_buff **to_free)
0414 {
0415     struct sk_buff *segs;
0416     netdev_features_t features = netif_skb_features(skb);
0417 
0418     segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
0419 
0420     if (IS_ERR_OR_NULL(segs)) {
0421         qdisc_drop(skb, sch, to_free);
0422         return NULL;
0423     }
0424     consume_skb(skb);
0425     return segs;
0426 }
0427 
0428 /*
0429  * Insert one skb into qdisc.
0430  * Note: parent depends on return value to account for queue length.
0431  *  NET_XMIT_DROP: queue length didn't change.
0432  *      NET_XMIT_SUCCESS: one skb was queued.
0433  */
0434 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
0435              struct sk_buff **to_free)
0436 {
0437     struct netem_sched_data *q = qdisc_priv(sch);
0438     /* We don't fill cb now as skb_unshare() may invalidate it */
0439     struct netem_skb_cb *cb;
0440     struct sk_buff *skb2;
0441     struct sk_buff *segs = NULL;
0442     unsigned int prev_len = qdisc_pkt_len(skb);
0443     int count = 1;
0444     int rc = NET_XMIT_SUCCESS;
0445     int rc_drop = NET_XMIT_DROP;
0446 
0447     /* Do not fool qdisc_drop_all() */
0448     skb->prev = NULL;
0449 
0450     /* Random duplication */
0451     if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
0452         ++count;
0453 
0454     /* Drop packet? */
0455     if (loss_event(q)) {
0456         if (q->ecn && INET_ECN_set_ce(skb))
0457             qdisc_qstats_drop(sch); /* mark packet */
0458         else
0459             --count;
0460     }
0461     if (count == 0) {
0462         qdisc_qstats_drop(sch);
0463         __qdisc_drop(skb, to_free);
0464         return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
0465     }
0466 
0467     /* If a delay is expected, orphan the skb. (orphaning usually takes
0468      * place at TX completion time, so _before_ the link transit delay)
0469      */
0470     if (q->latency || q->jitter || q->rate)
0471         skb_orphan_partial(skb);
0472 
0473     /*
0474      * If we need to duplicate packet, then re-insert at top of the
0475      * qdisc tree, since parent queuer expects that only one
0476      * skb will be queued.
0477      */
0478     if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
0479         struct Qdisc *rootq = qdisc_root_bh(sch);
0480         u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
0481 
0482         q->duplicate = 0;
0483         rootq->enqueue(skb2, rootq, to_free);
0484         q->duplicate = dupsave;
0485         rc_drop = NET_XMIT_SUCCESS;
0486     }
0487 
0488     /*
0489      * Randomized packet corruption.
0490      * Make copy if needed since we are modifying
0491      * If packet is going to be hardware checksummed, then
0492      * do it now in software before we mangle it.
0493      */
0494     if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
0495         if (skb_is_gso(skb)) {
0496             skb = netem_segment(skb, sch, to_free);
0497             if (!skb)
0498                 return rc_drop;
0499             segs = skb->next;
0500             skb_mark_not_on_list(skb);
0501             qdisc_skb_cb(skb)->pkt_len = skb->len;
0502         }
0503 
0504         skb = skb_unshare(skb, GFP_ATOMIC);
0505         if (unlikely(!skb)) {
0506             qdisc_qstats_drop(sch);
0507             goto finish_segs;
0508         }
0509         if (skb->ip_summed == CHECKSUM_PARTIAL &&
0510             skb_checksum_help(skb)) {
0511             qdisc_drop(skb, sch, to_free);
0512             skb = NULL;
0513             goto finish_segs;
0514         }
0515 
0516         skb->data[prandom_u32() % skb_headlen(skb)] ^=
0517             1<<(prandom_u32() % 8);
0518     }
0519 
0520     if (unlikely(sch->q.qlen >= sch->limit)) {
0521         /* re-link segs, so that qdisc_drop_all() frees them all */
0522         skb->next = segs;
0523         qdisc_drop_all(skb, sch, to_free);
0524         return rc_drop;
0525     }
0526 
0527     qdisc_qstats_backlog_inc(sch, skb);
0528 
0529     cb = netem_skb_cb(skb);
0530     if (q->gap == 0 ||      /* not doing reordering */
0531         q->counter < q->gap - 1 ||  /* inside last reordering gap */
0532         q->reorder < get_crandom(&q->reorder_cor)) {
0533         u64 now;
0534         s64 delay;
0535 
0536         delay = tabledist(q->latency, q->jitter,
0537                   &q->delay_cor, q->delay_dist);
0538 
0539         now = ktime_get_ns();
0540 
0541         if (q->rate) {
0542             struct netem_skb_cb *last = NULL;
0543 
0544             if (sch->q.tail)
0545                 last = netem_skb_cb(sch->q.tail);
0546             if (q->t_root.rb_node) {
0547                 struct sk_buff *t_skb;
0548                 struct netem_skb_cb *t_last;
0549 
0550                 t_skb = skb_rb_last(&q->t_root);
0551                 t_last = netem_skb_cb(t_skb);
0552                 if (!last ||
0553                     t_last->time_to_send > last->time_to_send)
0554                     last = t_last;
0555             }
0556             if (q->t_tail) {
0557                 struct netem_skb_cb *t_last =
0558                     netem_skb_cb(q->t_tail);
0559 
0560                 if (!last ||
0561                     t_last->time_to_send > last->time_to_send)
0562                     last = t_last;
0563             }
0564 
0565             if (last) {
0566                 /*
0567                  * The last packet in the queue is the reference
0568                  * point (now); credit the time until it is sent
0569                  * against the delay.
0570                  */
0571                 delay -= last->time_to_send - now;
0572                 delay = max_t(s64, 0, delay);
0573                 now = last->time_to_send;
0574             }
0575 
0576             delay += packet_time_ns(qdisc_pkt_len(skb), q);
0577         }
0578 
0579         cb->time_to_send = now + delay;
0580         ++q->counter;
0581         tfifo_enqueue(skb, sch);
0582     } else {
0583         /*
0584          * Do re-ordering by putting one out of N packets at the front
0585          * of the queue.
0586          */
0587         cb->time_to_send = ktime_get_ns();
0588         q->counter = 0;
0589 
0590         __qdisc_enqueue_head(skb, &sch->q);
0591         sch->qstats.requeues++;
0592     }
0593 
0594 finish_segs:
0595     if (segs) {
0596         unsigned int len, last_len;
0597         int nb;
0598 
0599         len = skb ? skb->len : 0;
0600         nb = skb ? 1 : 0;
0601 
0602         while (segs) {
0603             skb2 = segs->next;
0604             skb_mark_not_on_list(segs);
0605             qdisc_skb_cb(segs)->pkt_len = segs->len;
0606             last_len = segs->len;
0607             rc = qdisc_enqueue(segs, sch, to_free);
0608             if (rc != NET_XMIT_SUCCESS) {
0609                 if (net_xmit_drop_count(rc))
0610                     qdisc_qstats_drop(sch);
0611             } else {
0612                 nb++;
0613                 len += last_len;
0614             }
0615             segs = skb2;
0616         }
0617         /* Parent qdiscs accounted for 1 skb of size @prev_len */
0618         qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
0619     } else if (!skb) {
0620         return NET_XMIT_DROP;
0621     }
0622     return NET_XMIT_SUCCESS;
0623 }
0624 
0625 /* Schedule the next slot at a future time and reset the
0626  * per-slot packet and byte budgets.
0627  */
0628 
0629 static void get_slot_next(struct netem_sched_data *q, u64 now)
0630 {
0631     s64 next_delay;
0632 
0633     if (!q->slot_dist)
0634         next_delay = q->slot_config.min_delay +
0635                 (prandom_u32() *
0636                  (q->slot_config.max_delay -
0637                   q->slot_config.min_delay) >> 32);
0638     else
0639         next_delay = tabledist(q->slot_config.dist_delay,
0640                        (s32)(q->slot_config.dist_jitter),
0641                        NULL, q->slot_dist);
0642 
0643     q->slot.slot_next = now + next_delay;
0644     q->slot.packets_left = q->slot_config.max_packets;
0645     q->slot.bytes_left = q->slot_config.max_bytes;
0646 }
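/* Editor's note (illustrative only): without a slot distribution table the
 * next slot starts after a delay drawn uniformly from
 * [min_delay, max_delay): prandom_u32() * (max - min) >> 32 is a uniform
 * offset in [0, max - min).  E.g. min_delay = 800,000 ns and
 * max_delay = 1,000,000 ns give a gap of 800-1000 us between slots, and
 * each slot releases at most max_packets packets / max_bytes bytes before
 * the next one is scheduled.
 */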
0647 
0648 static struct sk_buff *netem_peek(struct netem_sched_data *q)
0649 {
0650     struct sk_buff *skb = skb_rb_first(&q->t_root);
0651     u64 t1, t2;
0652 
0653     if (!skb)
0654         return q->t_head;
0655     if (!q->t_head)
0656         return skb;
0657 
0658     t1 = netem_skb_cb(skb)->time_to_send;
0659     t2 = netem_skb_cb(q->t_head)->time_to_send;
0660     if (t1 < t2)
0661         return skb;
0662     return q->t_head;
0663 }
0664 
0665 static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
0666 {
0667     if (skb == q->t_head) {
0668         q->t_head = skb->next;
0669         if (!q->t_head)
0670             q->t_tail = NULL;
0671     } else {
0672         rb_erase(&skb->rbnode, &q->t_root);
0673     }
0674 }
0675 
0676 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
0677 {
0678     struct netem_sched_data *q = qdisc_priv(sch);
0679     struct sk_buff *skb;
0680 
0681 tfifo_dequeue:
0682     skb = __qdisc_dequeue_head(&sch->q);
0683     if (skb) {
0684         qdisc_qstats_backlog_dec(sch, skb);
0685 deliver:
0686         qdisc_bstats_update(sch, skb);
0687         return skb;
0688     }
0689     skb = netem_peek(q);
0690     if (skb) {
0691         u64 time_to_send;
0692         u64 now = ktime_get_ns();
0693 
0694         /* is there still time remaining before it must be sent? */
0695         time_to_send = netem_skb_cb(skb)->time_to_send;
0696         if (q->slot.slot_next && q->slot.slot_next < time_to_send)
0697             get_slot_next(q, now);
0698 
0699         if (time_to_send <= now && q->slot.slot_next <= now) {
0700             netem_erase_head(q, skb);
0701             sch->q.qlen--;
0702             qdisc_qstats_backlog_dec(sch, skb);
0703             skb->next = NULL;
0704             skb->prev = NULL;
0705             /* skb->dev shares skb->rbnode area,
0706              * we need to restore its value.
0707              */
0708             skb->dev = qdisc_dev(sch);
0709 
0710             if (q->slot.slot_next) {
0711                 q->slot.packets_left--;
0712                 q->slot.bytes_left -= qdisc_pkt_len(skb);
0713                 if (q->slot.packets_left <= 0 ||
0714                     q->slot.bytes_left <= 0)
0715                     get_slot_next(q, now);
0716             }
0717 
0718             if (q->qdisc) {
0719                 unsigned int pkt_len = qdisc_pkt_len(skb);
0720                 struct sk_buff *to_free = NULL;
0721                 int err;
0722 
0723                 err = qdisc_enqueue(skb, q->qdisc, &to_free);
0724                 kfree_skb_list(to_free);
0725                 if (err != NET_XMIT_SUCCESS &&
0726                     net_xmit_drop_count(err)) {
0727                     qdisc_qstats_drop(sch);
0728                     qdisc_tree_reduce_backlog(sch, 1,
0729                                   pkt_len);
0730                 }
0731                 goto tfifo_dequeue;
0732             }
0733             goto deliver;
0734         }
0735 
0736         if (q->qdisc) {
0737             skb = q->qdisc->ops->dequeue(q->qdisc);
0738             if (skb)
0739                 goto deliver;
0740         }
0741 
0742         qdisc_watchdog_schedule_ns(&q->watchdog,
0743                        max(time_to_send,
0744                            q->slot.slot_next));
0745     }
0746 
0747     if (q->qdisc) {
0748         skb = q->qdisc->ops->dequeue(q->qdisc);
0749         if (skb)
0750             goto deliver;
0751     }
0752     return NULL;
0753 }
0754 
0755 static void netem_reset(struct Qdisc *sch)
0756 {
0757     struct netem_sched_data *q = qdisc_priv(sch);
0758 
0759     qdisc_reset_queue(sch);
0760     tfifo_reset(sch);
0761     if (q->qdisc)
0762         qdisc_reset(q->qdisc);
0763     qdisc_watchdog_cancel(&q->watchdog);
0764 }
0765 
0766 static void dist_free(struct disttable *d)
0767 {
0768     kvfree(d);
0769 }
0770 
0771 /*
0772  * Distribution data is a variable size payload containing
0773  * signed 16 bit values.
0774  */
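/* Editor's note (illustrative only): as consumed by tabledist() above, each
 * 16-bit entry is effectively an inverse-CDF sample expressed in units of
 * NETEM_DIST_SCALE per standard deviation (8192 == +1 sigma).  Tables such
 * as the normal and pareto distributions shipped with iproute2 are
 * generated off-line and passed in via the TCA_NETEM_DELAY_DIST and
 * TCA_NETEM_SLOT_DIST attributes.
 */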
0775 
0776 static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
0777               const struct nlattr *attr)
0778 {
0779     size_t n = nla_len(attr)/sizeof(__s16);
0780     const __s16 *data = nla_data(attr);
0781     spinlock_t *root_lock;
0782     struct disttable *d;
0783     int i;
0784 
0785     if (!n || n > NETEM_DIST_MAX)
0786         return -EINVAL;
0787 
0788     d = kvmalloc(struct_size(d, table, n), GFP_KERNEL);
0789     if (!d)
0790         return -ENOMEM;
0791 
0792     d->size = n;
0793     for (i = 0; i < n; i++)
0794         d->table[i] = data[i];
0795 
0796     root_lock = qdisc_root_sleeping_lock(sch);
0797 
0798     spin_lock_bh(root_lock);
0799     swap(*tbl, d);
0800     spin_unlock_bh(root_lock);
0801 
0802     dist_free(d);
0803     return 0;
0804 }
0805 
0806 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
0807 {
0808     const struct tc_netem_slot *c = nla_data(attr);
0809 
0810     q->slot_config = *c;
0811     if (q->slot_config.max_packets == 0)
0812         q->slot_config.max_packets = INT_MAX;
0813     if (q->slot_config.max_bytes == 0)
0814         q->slot_config.max_bytes = INT_MAX;
0815 
0816     /* capping dist_jitter to the range acceptable by tabledist() */
0817     q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));
0818 
0819     q->slot.packets_left = q->slot_config.max_packets;
0820     q->slot.bytes_left = q->slot_config.max_bytes;
0821     if (q->slot_config.min_delay | q->slot_config.max_delay |
0822         q->slot_config.dist_jitter)
0823         q->slot.slot_next = ktime_get_ns();
0824     else
0825         q->slot.slot_next = 0;
0826 }
0827 
0828 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
0829 {
0830     const struct tc_netem_corr *c = nla_data(attr);
0831 
0832     init_crandom(&q->delay_cor, c->delay_corr);
0833     init_crandom(&q->loss_cor, c->loss_corr);
0834     init_crandom(&q->dup_cor, c->dup_corr);
0835 }
0836 
0837 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
0838 {
0839     const struct tc_netem_reorder *r = nla_data(attr);
0840 
0841     q->reorder = r->probability;
0842     init_crandom(&q->reorder_cor, r->correlation);
0843 }
0844 
0845 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
0846 {
0847     const struct tc_netem_corrupt *r = nla_data(attr);
0848 
0849     q->corrupt = r->probability;
0850     init_crandom(&q->corrupt_cor, r->correlation);
0851 }
0852 
0853 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
0854 {
0855     const struct tc_netem_rate *r = nla_data(attr);
0856 
0857     q->rate = r->rate;
0858     q->packet_overhead = r->packet_overhead;
0859     q->cell_size = r->cell_size;
0860     q->cell_overhead = r->cell_overhead;
0861     if (q->cell_size)
0862         q->cell_size_reciprocal = reciprocal_value(q->cell_size);
0863     else
0864         q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
0865 }
0866 
0867 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
0868 {
0869     const struct nlattr *la;
0870     int rem;
0871 
0872     nla_for_each_nested(la, attr, rem) {
0873         u16 type = nla_type(la);
0874 
0875         switch (type) {
0876         case NETEM_LOSS_GI: {
0877             const struct tc_netem_gimodel *gi = nla_data(la);
0878 
0879             if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
0880                 pr_info("netem: incorrect gi model size\n");
0881                 return -EINVAL;
0882             }
0883 
0884             q->loss_model = CLG_4_STATES;
0885 
0886             q->clg.state = TX_IN_GAP_PERIOD;
0887             q->clg.a1 = gi->p13;
0888             q->clg.a2 = gi->p31;
0889             q->clg.a3 = gi->p32;
0890             q->clg.a4 = gi->p14;
0891             q->clg.a5 = gi->p23;
0892             break;
0893         }
0894 
0895         case NETEM_LOSS_GE: {
0896             const struct tc_netem_gemodel *ge = nla_data(la);
0897 
0898             if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
0899                 pr_info("netem: incorrect ge model size\n");
0900                 return -EINVAL;
0901             }
0902 
0903             q->loss_model = CLG_GILB_ELL;
0904             q->clg.state = GOOD_STATE;
0905             q->clg.a1 = ge->p;
0906             q->clg.a2 = ge->r;
0907             q->clg.a3 = ge->h;
0908             q->clg.a4 = ge->k1;
0909             break;
0910         }
0911 
0912         default:
0913             pr_info("netem: unknown loss type %u\n", type);
0914             return -EINVAL;
0915         }
0916     }
0917 
0918     return 0;
0919 }
0920 
0921 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
0922     [TCA_NETEM_CORR]    = { .len = sizeof(struct tc_netem_corr) },
0923     [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
0924     [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
0925     [TCA_NETEM_RATE]    = { .len = sizeof(struct tc_netem_rate) },
0926     [TCA_NETEM_LOSS]    = { .type = NLA_NESTED },
0927     [TCA_NETEM_ECN]     = { .type = NLA_U32 },
0928     [TCA_NETEM_RATE64]  = { .type = NLA_U64 },
0929     [TCA_NETEM_LATENCY64]   = { .type = NLA_S64 },
0930     [TCA_NETEM_JITTER64]    = { .type = NLA_S64 },
0931     [TCA_NETEM_SLOT]    = { .len = sizeof(struct tc_netem_slot) },
0932 };
0933 
0934 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
0935               const struct nla_policy *policy, int len)
0936 {
0937     int nested_len = nla_len(nla) - NLA_ALIGN(len);
0938 
0939     if (nested_len < 0) {
0940         pr_info("netem: invalid attributes len %d\n", nested_len);
0941         return -EINVAL;
0942     }
0943 
0944     if (nested_len >= nla_attr_size(0))
0945         return nla_parse_deprecated(tb, maxtype,
0946                         nla_data(nla) + NLA_ALIGN(len),
0947                         nested_len, policy, NULL);
0948 
0949     memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
0950     return 0;
0951 }
0952 
0953 /* Parse netlink message to set options */
0954 static int netem_change(struct Qdisc *sch, struct nlattr *opt,
0955             struct netlink_ext_ack *extack)
0956 {
0957     struct netem_sched_data *q = qdisc_priv(sch);
0958     struct nlattr *tb[TCA_NETEM_MAX + 1];
0959     struct tc_netem_qopt *qopt;
0960     struct clgstate old_clg;
0961     int old_loss_model = CLG_RANDOM;
0962     int ret;
0963 
0964     if (opt == NULL)
0965         return -EINVAL;
0966 
0967     qopt = nla_data(opt);
0968     ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
0969     if (ret < 0)
0970         return ret;
0971 
0972     /* backup q->clg and q->loss_model */
0973     old_clg = q->clg;
0974     old_loss_model = q->loss_model;
0975 
0976     if (tb[TCA_NETEM_LOSS]) {
0977         ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
0978         if (ret) {
0979             q->loss_model = old_loss_model;
0980             return ret;
0981         }
0982     } else {
0983         q->loss_model = CLG_RANDOM;
0984     }
0985 
0986     if (tb[TCA_NETEM_DELAY_DIST]) {
0987         ret = get_dist_table(sch, &q->delay_dist,
0988                      tb[TCA_NETEM_DELAY_DIST]);
0989         if (ret)
0990             goto get_table_failure;
0991     }
0992 
0993     if (tb[TCA_NETEM_SLOT_DIST]) {
0994         ret = get_dist_table(sch, &q->slot_dist,
0995                      tb[TCA_NETEM_SLOT_DIST]);
0996         if (ret)
0997             goto get_table_failure;
0998     }
0999 
1000     sch->limit = qopt->limit;
1001 
1002     q->latency = PSCHED_TICKS2NS(qopt->latency);
1003     q->jitter = PSCHED_TICKS2NS(qopt->jitter);
1004     q->limit = qopt->limit;
1005     q->gap = qopt->gap;
1006     q->counter = 0;
1007     q->loss = qopt->loss;
1008     q->duplicate = qopt->duplicate;
1009 
1010     /* for compatibility with earlier versions.
1011      * if gap is set, need to assume 100% probability
1012      */
1013     if (q->gap)
1014         q->reorder = ~0;
1015 
1016     if (tb[TCA_NETEM_CORR])
1017         get_correlation(q, tb[TCA_NETEM_CORR]);
1018 
1019     if (tb[TCA_NETEM_REORDER])
1020         get_reorder(q, tb[TCA_NETEM_REORDER]);
1021 
1022     if (tb[TCA_NETEM_CORRUPT])
1023         get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
1024 
1025     if (tb[TCA_NETEM_RATE])
1026         get_rate(q, tb[TCA_NETEM_RATE]);
1027 
1028     if (tb[TCA_NETEM_RATE64])
1029         q->rate = max_t(u64, q->rate,
1030                 nla_get_u64(tb[TCA_NETEM_RATE64]));
1031 
1032     if (tb[TCA_NETEM_LATENCY64])
1033         q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
1034 
1035     if (tb[TCA_NETEM_JITTER64])
1036         q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
1037 
1038     if (tb[TCA_NETEM_ECN])
1039         q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
1040 
1041     if (tb[TCA_NETEM_SLOT])
1042         get_slot(q, tb[TCA_NETEM_SLOT]);
1043 
1044     /* capping jitter to the range acceptable by tabledist() */
1045     q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
1046 
1047     return ret;
1048 
1049 get_table_failure:
1050     /* recover clg and loss_model, in case
1051      * q->clg and q->loss_model were modified
1052      * in get_loss_clg()
1053      */
1054     q->clg = old_clg;
1055     q->loss_model = old_loss_model;
1056     return ret;
1057 }
1058 
1059 static int netem_init(struct Qdisc *sch, struct nlattr *opt,
1060               struct netlink_ext_ack *extack)
1061 {
1062     struct netem_sched_data *q = qdisc_priv(sch);
1063     int ret;
1064 
1065     qdisc_watchdog_init(&q->watchdog, sch);
1066 
1067     if (!opt)
1068         return -EINVAL;
1069 
1070     q->loss_model = CLG_RANDOM;
1071     ret = netem_change(sch, opt, extack);
1072     if (ret)
1073         pr_info("netem: change failed\n");
1074     return ret;
1075 }
1076 
1077 static void netem_destroy(struct Qdisc *sch)
1078 {
1079     struct netem_sched_data *q = qdisc_priv(sch);
1080 
1081     qdisc_watchdog_cancel(&q->watchdog);
1082     if (q->qdisc)
1083         qdisc_put(q->qdisc);
1084     dist_free(q->delay_dist);
1085     dist_free(q->slot_dist);
1086 }
1087 
1088 static int dump_loss_model(const struct netem_sched_data *q,
1089                struct sk_buff *skb)
1090 {
1091     struct nlattr *nest;
1092 
1093     nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
1094     if (nest == NULL)
1095         goto nla_put_failure;
1096 
1097     switch (q->loss_model) {
1098     case CLG_RANDOM:
1099         /* legacy loss model */
1100         nla_nest_cancel(skb, nest);
1101         return 0;   /* no data */
1102 
1103     case CLG_4_STATES: {
1104         struct tc_netem_gimodel gi = {
1105             .p13 = q->clg.a1,
1106             .p31 = q->clg.a2,
1107             .p32 = q->clg.a3,
1108             .p14 = q->clg.a4,
1109             .p23 = q->clg.a5,
1110         };
1111 
1112         if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
1113             goto nla_put_failure;
1114         break;
1115     }
1116     case CLG_GILB_ELL: {
1117         struct tc_netem_gemodel ge = {
1118             .p = q->clg.a1,
1119             .r = q->clg.a2,
1120             .h = q->clg.a3,
1121             .k1 = q->clg.a4,
1122         };
1123 
1124         if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
1125             goto nla_put_failure;
1126         break;
1127     }
1128     }
1129 
1130     nla_nest_end(skb, nest);
1131     return 0;
1132 
1133 nla_put_failure:
1134     nla_nest_cancel(skb, nest);
1135     return -1;
1136 }
1137 
1138 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1139 {
1140     const struct netem_sched_data *q = qdisc_priv(sch);
1141     struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1142     struct tc_netem_qopt qopt;
1143     struct tc_netem_corr cor;
1144     struct tc_netem_reorder reorder;
1145     struct tc_netem_corrupt corrupt;
1146     struct tc_netem_rate rate;
1147     struct tc_netem_slot slot;
1148 
1149     qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency),
1150                  UINT_MAX);
1151     qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter),
1152                 UINT_MAX);
1153     qopt.limit = q->limit;
1154     qopt.loss = q->loss;
1155     qopt.gap = q->gap;
1156     qopt.duplicate = q->duplicate;
1157     if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1158         goto nla_put_failure;
1159 
1160     if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
1161         goto nla_put_failure;
1162 
1163     if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
1164         goto nla_put_failure;
1165 
1166     cor.delay_corr = q->delay_cor.rho;
1167     cor.loss_corr = q->loss_cor.rho;
1168     cor.dup_corr = q->dup_cor.rho;
1169     if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1170         goto nla_put_failure;
1171 
1172     reorder.probability = q->reorder;
1173     reorder.correlation = q->reorder_cor.rho;
1174     if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1175         goto nla_put_failure;
1176 
1177     corrupt.probability = q->corrupt;
1178     corrupt.correlation = q->corrupt_cor.rho;
1179     if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1180         goto nla_put_failure;
1181 
1182     if (q->rate >= (1ULL << 32)) {
1183         if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1184                       TCA_NETEM_PAD))
1185             goto nla_put_failure;
1186         rate.rate = ~0U;
1187     } else {
1188         rate.rate = q->rate;
1189     }
1190     rate.packet_overhead = q->packet_overhead;
1191     rate.cell_size = q->cell_size;
1192     rate.cell_overhead = q->cell_overhead;
1193     if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1194         goto nla_put_failure;
1195 
1196     if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1197         goto nla_put_failure;
1198 
1199     if (dump_loss_model(q, skb) != 0)
1200         goto nla_put_failure;
1201 
1202     if (q->slot_config.min_delay | q->slot_config.max_delay |
1203         q->slot_config.dist_jitter) {
1204         slot = q->slot_config;
1205         if (slot.max_packets == INT_MAX)
1206             slot.max_packets = 0;
1207         if (slot.max_bytes == INT_MAX)
1208             slot.max_bytes = 0;
1209         if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
1210             goto nla_put_failure;
1211     }
1212 
1213     return nla_nest_end(skb, nla);
1214 
1215 nla_put_failure:
1216     nlmsg_trim(skb, nla);
1217     return -1;
1218 }
1219 
1220 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1221               struct sk_buff *skb, struct tcmsg *tcm)
1222 {
1223     struct netem_sched_data *q = qdisc_priv(sch);
1224 
1225     if (cl != 1 || !q->qdisc)   /* only one class */
1226         return -ENOENT;
1227 
1228     tcm->tcm_handle |= TC_H_MIN(1);
1229     tcm->tcm_info = q->qdisc->handle;
1230 
1231     return 0;
1232 }
1233 
1234 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1235              struct Qdisc **old, struct netlink_ext_ack *extack)
1236 {
1237     struct netem_sched_data *q = qdisc_priv(sch);
1238 
1239     *old = qdisc_replace(sch, new, &q->qdisc);
1240     return 0;
1241 }
1242 
1243 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1244 {
1245     struct netem_sched_data *q = qdisc_priv(sch);
1246     return q->qdisc;
1247 }
1248 
1249 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1250 {
1251     return 1;
1252 }
1253 
1254 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1255 {
1256     if (!walker->stop) {
1257         if (walker->count >= walker->skip)
1258             if (walker->fn(sch, 1, walker) < 0) {
1259                 walker->stop = 1;
1260                 return;
1261             }
1262         walker->count++;
1263     }
1264 }
1265 
1266 static const struct Qdisc_class_ops netem_class_ops = {
1267     .graft      =   netem_graft,
1268     .leaf       =   netem_leaf,
1269     .find       =   netem_find,
1270     .walk       =   netem_walk,
1271     .dump       =   netem_dump_class,
1272 };
1273 
1274 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1275     .id     =   "netem",
1276     .cl_ops     =   &netem_class_ops,
1277     .priv_size  =   sizeof(struct netem_sched_data),
1278     .enqueue    =   netem_enqueue,
1279     .dequeue    =   netem_dequeue,
1280     .peek       =   qdisc_peek_dequeued,
1281     .init       =   netem_init,
1282     .reset      =   netem_reset,
1283     .destroy    =   netem_destroy,
1284     .change     =   netem_change,
1285     .dump       =   netem_dump,
1286     .owner      =   THIS_MODULE,
1287 };
1288 
1289 
1290 static int __init netem_module_init(void)
1291 {
1292     pr_info("netem: version " VERSION "\n");
1293     return register_qdisc(&netem_qdisc_ops);
1294 }
1295 static void __exit netem_module_exit(void)
1296 {
1297     unregister_qdisc(&netem_qdisc_ops);
1298 }
1299 module_init(netem_module_init)
1300 module_exit(netem_module_exit)
1301 MODULE_LICENSE("GPL");