// SPDX-License-Identifier: GPL-2.0-only
/* Flow Queue PIE discipline
 *
 * Copyright (C) 2019 Mohit P. Tahiliani <tahiliani@nitk.edu.in>
 * Copyright (C) 2019 Sachin D. Patil <sdp.sachin@gmail.com>
 * Copyright (C) 2019 V. Saicharan <vsaicharan1998@gmail.com>
 * Copyright (C) 2019 Mohit Bhasi <mohitbhasi1998@gmail.com>
 * Copyright (C) 2019 Leslie Monis <lesliemonis@gmail.com>
 * Copyright (C) 2019 Gautam Ramakrishnan <gautamramk@gmail.com>
 */

#include <linux/jhash.h>
#include <linux/sizes.h>
#include <linux/vmalloc.h>
#include <net/pkt_cls.h>
#include <net/pie.h>

/* Flow Queue PIE
 *
 * Principles:
 *   - Packets are classified into flows.
 *   - This is a stochastic model (since a hash is used, several flows
 *     may be hashed to the same slot).
 *   - Each flow has a PIE-managed queue.
 *   - Flows are linked onto two (Round Robin) lists,
 *     so that new flows have priority over old ones.
 *   - For a given flow, packets are not reordered.
 *   - Drops happen during enqueue only.
 *   - ECN capability is off by default.
 *   - The ECN threshold (if ECN is enabled) is at 10% by default.
 *   - Uses timestamps to calculate queue delay by default.
 */

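/* Illustrative configuration via iproute2 (not part of the original
 * source; option names per tc-fq_pie(8), values here are arbitrary):
 *
 *   tc qdisc add dev eth0 root fq_pie flows 1024 limit 10240 \
 *       target 15ms tupdate 15ms alpha 2 beta 20 ecn
 */
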
/**
 * struct fq_pie_flow - contains data for each flow
 * @vars:   pie vars associated with the flow
 * @deficit:    number of remaining byte credits
 * @backlog:    size of data in the flow
 * @qlen:   number of packets in the flow
 * @flowchain:  flowchain for the flow
 * @head:   first packet in the flow
 * @tail:   last packet in the flow
 */
struct fq_pie_flow {
    struct pie_vars vars;
    s32 deficit;
    u32 backlog;
    u32 qlen;
    struct list_head flowchain;
    struct sk_buff *head;
    struct sk_buff *tail;
};

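/* Per-qdisc state: the flow table, the two DRR service lists, the PIE
 * parameters shared by all flows, and global accounting/statistics.
 */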
struct fq_pie_sched_data {
    struct tcf_proto __rcu *filter_list; /* optional external classifier */
    struct tcf_block *block;
    struct fq_pie_flow *flows;
    struct Qdisc *sch;
    struct list_head old_flows;
    struct list_head new_flows;
    struct pie_params p_params;
    u32 ecn_prob;
    u32 flows_cnt;
    u32 quantum;
    u32 memory_limit;
    u32 new_flow_count;
    u32 memory_usage;
    u32 overmemory;
    struct pie_stats stats;
    struct timer_list adapt_timer;
};

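/* Map the skb's flow hash uniformly onto [0, flows_cnt) */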
static unsigned int fq_pie_hash(const struct fq_pie_sched_data *q,
                struct sk_buff *skb)
{
    return reciprocal_scale(skb_get_hash(skb), q->flows_cnt);
}

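/* Return a flow index in [1, flows_cnt], or 0 to signal a drop/bypass.
 * skb->priority may select a flow directly; otherwise any attached
 * classifier is consulted, falling back to the flow hash.
 */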
static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch,
                    int *qerr)
{
    struct fq_pie_sched_data *q = qdisc_priv(sch);
    struct tcf_proto *filter;
    struct tcf_result res;
    int result;

    if (TC_H_MAJ(skb->priority) == sch->handle &&
        TC_H_MIN(skb->priority) > 0 &&
        TC_H_MIN(skb->priority) <= q->flows_cnt)
        return TC_H_MIN(skb->priority);

    filter = rcu_dereference_bh(q->filter_list);
    if (!filter)
        return fq_pie_hash(q, skb) + 1;

    *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
    result = tcf_classify(skb, NULL, filter, &res, false);
    if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
        switch (result) {
        case TC_ACT_STOLEN:
        case TC_ACT_QUEUED:
        case TC_ACT_TRAP:
            *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
            fallthrough;
        case TC_ACT_SHOT:
            return 0;
        }
#endif
        if (TC_H_MIN(res.classid) <= q->flows_cnt)
            return TC_H_MIN(res.classid);
    }
    return 0;
}

/* add skb to flow queue (tail add) */
static inline void flow_queue_add(struct fq_pie_flow *flow,
                  struct sk_buff *skb)
{
    if (!flow->head)
        flow->head = skb;
    else
        flow->tail->next = skb;
    flow->tail = skb;
    skb->next = NULL;
}

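/* Enqueue: classify the packet to a flow, let PIE decide to drop,
 * ECN-mark or accept it, then tail-add it to the flow's queue and
 * update backlog and memory accounting.
 */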
static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                struct sk_buff **to_free)
{
    struct fq_pie_sched_data *q = qdisc_priv(sch);
    struct fq_pie_flow *sel_flow;
    int ret;
    u8 memory_limited = false;
    u8 enqueue = false;
    u32 pkt_len;
    u32 idx;

    /* Classify packet into the corresponding flow */
    idx = fq_pie_classify(skb, sch, &ret);
    if (idx == 0) {
        if (ret & __NET_XMIT_BYPASS)
            qdisc_qstats_drop(sch);
        __qdisc_drop(skb, to_free);
        return ret;
    }
    idx--;

    sel_flow = &q->flows[idx];
    /* Check whether adding a new packet would exceed the memory limit */
    get_pie_cb(skb)->mem_usage = skb->truesize;
    memory_limited = q->memory_usage > q->memory_limit + skb->truesize;

    /* Check if the qdisc is full */
    if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
        q->stats.overlimit++;
        goto out;
    } else if (unlikely(memory_limited)) {
        q->overmemory++;
    }

    if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars,
                sel_flow->backlog, skb->len)) {
        enqueue = true;
    } else if (q->p_params.ecn &&
           sel_flow->vars.prob <= (MAX_PROB / 100) * q->ecn_prob &&
           INET_ECN_set_ce(skb)) {
        /* If the packet is ECN capable, mark it when the drop
         * probability is lower than the ecn_prob parameter, else
         * drop it.
         */
        q->stats.ecn_mark++;
        enqueue = true;
    }
    if (enqueue) {
        /* Set enqueue time only when dq_rate_estimator is disabled. */
        if (!q->p_params.dq_rate_estimator)
            pie_set_enqueue_time(skb);

        pkt_len = qdisc_pkt_len(skb);
        q->stats.packets_in++;
        q->memory_usage += skb->truesize;
        sch->qstats.backlog += pkt_len;
        sch->q.qlen++;
        flow_queue_add(sel_flow, skb);
        if (list_empty(&sel_flow->flowchain)) {
            list_add_tail(&sel_flow->flowchain, &q->new_flows);
            q->new_flow_count++;
            sel_flow->deficit = q->quantum;
            sel_flow->qlen = 0;
            sel_flow->backlog = 0;
        }
        sel_flow->qlen++;
        sel_flow->backlog += pkt_len;
        return NET_XMIT_SUCCESS;
    }
out:
    q->stats.dropped++;
    sel_flow->vars.accu_prob = 0;
    __qdisc_drop(skb, to_free);
    qdisc_qstats_drop(sch);
    return NET_XMIT_CN;
}

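/* Netlink attribute policy: every FQ-PIE parameter is a 32-bit value */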
static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = {
    [TCA_FQ_PIE_LIMIT]      = {.type = NLA_U32},
    [TCA_FQ_PIE_FLOWS]      = {.type = NLA_U32},
    [TCA_FQ_PIE_TARGET]     = {.type = NLA_U32},
    [TCA_FQ_PIE_TUPDATE]        = {.type = NLA_U32},
    [TCA_FQ_PIE_ALPHA]      = {.type = NLA_U32},
    [TCA_FQ_PIE_BETA]       = {.type = NLA_U32},
    [TCA_FQ_PIE_QUANTUM]        = {.type = NLA_U32},
    [TCA_FQ_PIE_MEMORY_LIMIT]   = {.type = NLA_U32},
    [TCA_FQ_PIE_ECN_PROB]       = {.type = NLA_U32},
    [TCA_FQ_PIE_ECN]        = {.type = NLA_U32},
    [TCA_FQ_PIE_BYTEMODE]       = {.type = NLA_U32},
    [TCA_FQ_PIE_DQ_RATE_ESTIMATOR]  = {.type = NLA_U32},
};

static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow)
{
    struct sk_buff *skb = flow->head;

    flow->head = skb->next;
    skb->next = NULL;
    return skb;
}

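/* Dequeue using deficit round robin: new flows are served before old
 * ones, and a flow that exhausts its byte credits is refilled with one
 * quantum and moved to the tail of old_flows.
 */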
static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch)
{
    struct fq_pie_sched_data *q = qdisc_priv(sch);
    struct sk_buff *skb = NULL;
    struct fq_pie_flow *flow;
    struct list_head *head;
    u32 pkt_len;

begin:
    head = &q->new_flows;
    if (list_empty(head)) {
        head = &q->old_flows;
        if (list_empty(head))
            return NULL;
    }

    flow = list_first_entry(head, struct fq_pie_flow, flowchain);
    /* Flow has exhausted all its credits */
    if (flow->deficit <= 0) {
        flow->deficit += q->quantum;
        list_move_tail(&flow->flowchain, &q->old_flows);
        goto begin;
    }

    if (flow->head) {
        skb = dequeue_head(flow);
        pkt_len = qdisc_pkt_len(skb);
        sch->qstats.backlog -= pkt_len;
        sch->q.qlen--;
        qdisc_bstats_update(sch, skb);
    }

    if (!skb) {
        /* force a pass through old_flows to prevent starvation */
        if (head == &q->new_flows && !list_empty(&q->old_flows))
            list_move_tail(&flow->flowchain, &q->old_flows);
        else
            list_del_init(&flow->flowchain);
        goto begin;
    }

    flow->qlen--;
    flow->deficit -= pkt_len;
    flow->backlog -= pkt_len;
    q->memory_usage -= get_pie_cb(skb)->mem_usage;
    pie_process_dequeue(skb, &q->p_params, &flow->vars, flow->backlog);
    return skb;
}

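/* Parse netlink attributes and apply parameter changes under the qdisc
 * tree lock; the number of flows cannot be changed once allocated.
 */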
static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
             struct netlink_ext_ack *extack)
{
    struct fq_pie_sched_data *q = qdisc_priv(sch);
    struct nlattr *tb[TCA_FQ_PIE_MAX + 1];
    unsigned int len_dropped = 0;
    unsigned int num_dropped = 0;
    int err;

    if (!opt)
        return -EINVAL;

    err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack);
    if (err < 0)
        return err;

    sch_tree_lock(sch);
    if (tb[TCA_FQ_PIE_LIMIT]) {
        u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]);

        q->p_params.limit = limit;
        sch->limit = limit;
    }
    if (tb[TCA_FQ_PIE_FLOWS]) {
        if (q->flows) {
            NL_SET_ERR_MSG_MOD(extack,
                       "Number of flows cannot be changed");
            goto flow_error;
        }
        q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]);
        if (!q->flows_cnt || q->flows_cnt > 65536) {
            NL_SET_ERR_MSG_MOD(extack,
                       "Number of flows must range in [1..65536]");
            goto flow_error;
        }
    }

    /* convert target from microseconds to pschedtime */
    if (tb[TCA_FQ_PIE_TARGET]) {
        u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]);

        q->p_params.target =
            PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
    }

    /* tupdate is in jiffies */
    if (tb[TCA_FQ_PIE_TUPDATE])
        q->p_params.tupdate =
            usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE]));

    if (tb[TCA_FQ_PIE_ALPHA])
        q->p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]);

    if (tb[TCA_FQ_PIE_BETA])
        q->p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]);

    if (tb[TCA_FQ_PIE_QUANTUM])
        q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]);

    if (tb[TCA_FQ_PIE_MEMORY_LIMIT])
        q->memory_limit = nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]);

    if (tb[TCA_FQ_PIE_ECN_PROB])
        q->ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]);

    if (tb[TCA_FQ_PIE_ECN])
        q->p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]);

    if (tb[TCA_FQ_PIE_BYTEMODE])
        q->p_params.bytemode = nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]);

    if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR])
        q->p_params.dq_rate_estimator =
            nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]);

    /* Drop excess packets if the new limit is lower */
    while (sch->q.qlen > sch->limit) {
        struct sk_buff *skb = fq_pie_qdisc_dequeue(sch);

        len_dropped += qdisc_pkt_len(skb);
        num_dropped += 1;
        rtnl_kfree_skbs(skb, skb);
    }
    qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped);

    sch_tree_unlock(sch);
    return 0;

flow_error:
    sch_tree_unlock(sch);
    return -EINVAL;
}

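/* Periodic timer: recompute every flow's PIE drop probability, then
 * re-arm itself to fire again after tupdate jiffies.
 */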
static void fq_pie_timer(struct timer_list *t)
{
    struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer);
    struct Qdisc *sch = q->sch;
    spinlock_t *root_lock; /* to lock qdisc for probability calculations */
    u32 idx;

    root_lock = qdisc_lock(qdisc_root_sleeping(sch));
    spin_lock(root_lock);

    for (idx = 0; idx < q->flows_cnt; idx++)
        pie_calculate_probability(&q->p_params, &q->flows[idx].vars,
                      q->flows[idx].backlog);

    /* reset the timer to fire after 'tupdate' jiffies. */
    if (q->p_params.tupdate)
        mod_timer(&q->adapt_timer, jiffies + q->p_params.tupdate);

    spin_unlock(root_lock);
}

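/* Initialize defaults (1024 flows, 10240 packet limit, 32 MB memory
 * limit), apply any netlink options, then allocate the flow table.
 */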
static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt,
               struct netlink_ext_ack *extack)
{
    struct fq_pie_sched_data *q = qdisc_priv(sch);
    int err;
    u32 idx;

    pie_params_init(&q->p_params);
    sch->limit = 10 * 1024;
    q->p_params.limit = sch->limit;
    q->quantum = psched_mtu(qdisc_dev(sch));
    q->sch = sch;
    q->ecn_prob = 10;
    q->flows_cnt = 1024;
    q->memory_limit = SZ_32M;

    INIT_LIST_HEAD(&q->new_flows);
    INIT_LIST_HEAD(&q->old_flows);
    timer_setup(&q->adapt_timer, fq_pie_timer, 0);

    if (opt) {
        err = fq_pie_change(sch, opt, extack);

        if (err)
            return err;
    }

    err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
    if (err)
        goto init_failure;

    q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_pie_flow),
                GFP_KERNEL);
    if (!q->flows) {
        err = -ENOMEM;
        goto init_failure;
    }
    for (idx = 0; idx < q->flows_cnt; idx++) {
        struct fq_pie_flow *flow = q->flows + idx;

        INIT_LIST_HEAD(&flow->flowchain);
        pie_vars_init(&flow->vars);
    }

    mod_timer(&q->adapt_timer, jiffies + HZ / 2);

    return 0;

init_failure:
    q->flows_cnt = 0;

    return err;
}

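/* Dump the current configuration as netlink attributes */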
static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb)
{
    struct fq_pie_sched_data *q = qdisc_priv(sch);
    struct nlattr *opts;

    opts = nla_nest_start(skb, TCA_OPTIONS);
    if (!opts)
        return -EMSGSIZE;

    /* convert target from pschedtime to us */
    if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) ||
        nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->flows_cnt) ||
        nla_put_u32(skb, TCA_FQ_PIE_TARGET,
            ((u32)PSCHED_TICKS2NS(q->p_params.target)) /
            NSEC_PER_USEC) ||
        nla_put_u32(skb, TCA_FQ_PIE_TUPDATE,
            jiffies_to_usecs(q->p_params.tupdate)) ||
        nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->p_params.alpha) ||
        nla_put_u32(skb, TCA_FQ_PIE_BETA, q->p_params.beta) ||
        nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) ||
        nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, q->memory_limit) ||
        nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->ecn_prob) ||
        nla_put_u32(skb, TCA_FQ_PIE_ECN, q->p_params.ecn) ||
        nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, q->p_params.bytemode) ||
        nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR,
            q->p_params.dq_rate_estimator))
        goto nla_put_failure;

    return nla_nest_end(skb, opts);

nla_put_failure:
    nla_nest_cancel(skb, opts);
    return -EMSGSIZE;
}

static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
    struct fq_pie_sched_data *q = qdisc_priv(sch);
    struct tc_fq_pie_xstats st = {
        .packets_in = q->stats.packets_in,
        .overlimit  = q->stats.overlimit,
        .overmemory = q->overmemory,
        .dropped    = q->stats.dropped,
        .ecn_mark   = q->stats.ecn_mark,
        .new_flow_count = q->new_flow_count,
        .memory_usage   = q->memory_usage,
    };
    struct list_head *pos;

    sch_tree_lock(sch);
    list_for_each(pos, &q->new_flows)
        st.new_flows_len++;

    list_for_each(pos, &q->old_flows)
        st.old_flows_len++;
    sch_tree_unlock(sch);

    return gnet_stats_copy_app(d, &st, sizeof(st));
}

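/* Purge all queued packets and reinitialize per-flow state */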
static void fq_pie_reset(struct Qdisc *sch)
{
    struct fq_pie_sched_data *q = qdisc_priv(sch);
    u32 idx;

    INIT_LIST_HEAD(&q->new_flows);
    INIT_LIST_HEAD(&q->old_flows);
    for (idx = 0; idx < q->flows_cnt; idx++) {
        struct fq_pie_flow *flow = q->flows + idx;

        /* Removes all packets from flow */
        rtnl_kfree_skbs(flow->head, flow->tail);
        flow->head = NULL;

        INIT_LIST_HEAD(&flow->flowchain);
        pie_vars_init(&flow->vars);
    }

    sch->q.qlen = 0;
    sch->qstats.backlog = 0;
}

static void fq_pie_destroy(struct Qdisc *sch)
{
    struct fq_pie_sched_data *q = qdisc_priv(sch);

    tcf_block_put(q->block);
    q->p_params.tupdate = 0;
    del_timer_sync(&q->adapt_timer);
    kvfree(q->flows);
}

static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = {
    .id     = "fq_pie",
    .priv_size  = sizeof(struct fq_pie_sched_data),
    .enqueue    = fq_pie_qdisc_enqueue,
    .dequeue    = fq_pie_qdisc_dequeue,
    .peek       = qdisc_peek_dequeued,
    .init       = fq_pie_init,
    .destroy    = fq_pie_destroy,
    .reset      = fq_pie_reset,
    .change     = fq_pie_change,
    .dump       = fq_pie_dump,
    .dump_stats = fq_pie_dump_stats,
    .owner      = THIS_MODULE,
};

static int __init fq_pie_module_init(void)
{
    return register_qdisc(&fq_pie_qdisc_ops);
}

static void __exit fq_pie_module_exit(void)
{
    unregister_qdisc(&fq_pie_qdisc_ops);
}

module_init(fq_pie_module_init);
module_exit(fq_pie_module_exit);

MODULE_DESCRIPTION("Flow Queue Proportional Integral controller Enhanced (FQ-PIE)");
MODULE_AUTHOR("Mohit P. Tahiliani");
MODULE_LICENSE("GPL");