Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /*
0003  * net/sched/sch_api.c  Packet scheduler API.
0004  *
0005  * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
0006  *
0007  * Fixes:
0008  *
0009  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
0010  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
0011  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
0012  */
0013 
0014 #include <linux/module.h>
0015 #include <linux/types.h>
0016 #include <linux/kernel.h>
0017 #include <linux/string.h>
0018 #include <linux/errno.h>
0019 #include <linux/skbuff.h>
0020 #include <linux/init.h>
0021 #include <linux/proc_fs.h>
0022 #include <linux/seq_file.h>
0023 #include <linux/kmod.h>
0024 #include <linux/list.h>
0025 #include <linux/hrtimer.h>
0026 #include <linux/slab.h>
0027 #include <linux/hashtable.h>
0028 
0029 #include <net/net_namespace.h>
0030 #include <net/sock.h>
0031 #include <net/netlink.h>
0032 #include <net/pkt_sched.h>
0033 #include <net/pkt_cls.h>
0034 
0035 #include <trace/events/qdisc.h>
0036 
0037 /*
0038 
0039    Short review.
0040    -------------
0041 
0042    This file consists of two interrelated parts:
0043 
0044    1. queueing disciplines manager frontend.
0045    2. traffic classes manager frontend.
0046 
0047    Generally, queueing discipline ("qdisc") is a black box,
0048    which is able to enqueue packets and to dequeue them (when
0049    device is ready to send something) in order and at times
0050    determined by algorithm hidden in it.
0051 
0052    qdiscs are divided into two categories:
0053    - "queues", which have no internal structure visible from outside.
0054    - "schedulers", which split all the packets into "traffic classes",
0055      using "packet classifiers" (see cls_api.c)
0056 
0057    In turn, classes may have child qdiscs (as rule, queues)
0058    attached to them etc. etc. etc.
0059 
0060    The goal of the routines in this file is to translate
0061    information supplied by the user in the form of handles
0062    into a form more intelligible to the kernel, to perform sanity
0063    checks and the part of the work that is common to all qdiscs,
0064    and to provide rtnetlink notifications.
0065 
0066    All real intelligent work is done inside qdisc modules.
0067 
0068 
0069 
0070    Every discipline has two major routines: enqueue and dequeue.
0071 
0072    ---dequeue
0073 
0074    dequeue usually returns an skb to send. It is allowed to return NULL,
0075    but that does not mean that the queue is empty; it just means that the
0076    discipline does not want to send anything this time.
0077    The queue is really empty only if q->q.qlen == 0.
0078    For complicated disciplines with multiple queues, q->q is not a
0079    real packet queue, but q->q.qlen must nevertheless be valid.
0080 
0081    ---enqueue
0082 
0083    enqueue returns 0 if the packet was enqueued successfully.
0084    If a packet (this one or another one) was dropped, it returns
0085    a non-zero error code.
0086    NET_XMIT_DROP    - this packet was dropped
0087      Expected action: do not back off, but wait until the queue clears.
0088    NET_XMIT_CN      - this packet was probably enqueued, but another one was dropped.
0089      Expected action: back off or ignore
0090 
0091    Auxiliary routines:
0092 
0093    ---peek
0094 
0095    like dequeue but without removing a packet from the queue
0096 
0097    ---reset
0098 
0099    returns qdisc to initial state: purge all buffers, clear all
0100    timers, counters (except for statistics) etc.
0101 
0102    ---init
0103 
0104    initializes newly created qdisc.
0105 
0106    ---destroy
0107 
0108    destroys resources allocated by init and during lifetime of qdisc.
0109 
0110    ---change
0111 
0112    changes qdisc parameters.
0113  */
0114 
0115 /* Protects the list of registered qdisc ops (qdisc_base) and the default_qdisc_ops pointer. It is a pure SMP lock. */
0116 static DEFINE_RWLOCK(qdisc_mod_lock);
0117 
0118 
0119 /************************************************
0120  *  Queueing disciplines manipulation.  *
0121  ************************************************/
0122 
0123 
0124 /* Head of the singly linked list (chained through ->next) of all registered queueing discipline ops; guarded by qdisc_mod_lock. */
0125 
0126 static struct Qdisc_ops *qdisc_base;
0127 
0128 /* Register/unregister queueing discipline */
0129 
0130 int register_qdisc(struct Qdisc_ops *qops)
0131 {
0132     struct Qdisc_ops *q, **qp;
0133     int rc = -EEXIST;
0134 
0135     write_lock(&qdisc_mod_lock);
0136     for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
0137         if (!strcmp(qops->id, q->id))
0138             goto out;
0139 
0140     if (qops->enqueue == NULL)
0141         qops->enqueue = noop_qdisc_ops.enqueue;
0142     if (qops->peek == NULL) {
0143         if (qops->dequeue == NULL)
0144             qops->peek = noop_qdisc_ops.peek;
0145         else
0146             goto out_einval;
0147     }
0148     if (qops->dequeue == NULL)
0149         qops->dequeue = noop_qdisc_ops.dequeue;
0150 
0151     if (qops->cl_ops) {
0152         const struct Qdisc_class_ops *cops = qops->cl_ops;
0153 
0154         if (!(cops->find && cops->walk && cops->leaf))
0155             goto out_einval;
0156 
0157         if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
0158             goto out_einval;
0159     }
0160 
0161     qops->next = NULL;
0162     *qp = qops;
0163     rc = 0;
0164 out:
0165     write_unlock(&qdisc_mod_lock);
0166     return rc;
0167 
0168 out_einval:
0169     rc = -EINVAL;
0170     goto out;
0171 }
0172 EXPORT_SYMBOL(register_qdisc);
0173 
0174 int unregister_qdisc(struct Qdisc_ops *qops)
0175 {
0176     struct Qdisc_ops *q, **qp;
0177     int err = -ENOENT;
0178 
0179     write_lock(&qdisc_mod_lock);
0180     for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
0181         if (q == qops)
0182             break;
0183     if (q) {
0184         *qp = q->next;
0185         q->next = NULL;
0186         err = 0;
0187     }
0188     write_unlock(&qdisc_mod_lock);
0189     return err;
0190 }
0191 EXPORT_SYMBOL(unregister_qdisc);
0192 
0193 /* Get default qdisc if not otherwise specified */
0194 void qdisc_get_default(char *name, size_t len)
0195 {
0196     read_lock(&qdisc_mod_lock);
0197     strlcpy(name, default_qdisc_ops->id, len);
0198     read_unlock(&qdisc_mod_lock);
0199 }
0200 
0201 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
0202 {
0203     struct Qdisc_ops *q = NULL;
0204 
0205     for (q = qdisc_base; q; q = q->next) {
0206         if (!strcmp(name, q->id)) {
0207             if (!try_module_get(q->owner))
0208                 q = NULL;
0209             break;
0210         }
0211     }
0212 
0213     return q;
0214 }
0215 
0216 /* Set new default qdisc to use */
0217 int qdisc_set_default(const char *name)
0218 {
0219     const struct Qdisc_ops *ops;
0220 
0221     if (!capable(CAP_NET_ADMIN))
0222         return -EPERM;
0223 
0224     write_lock(&qdisc_mod_lock);
0225     ops = qdisc_lookup_default(name);
0226     if (!ops) {
0227         /* Not found, drop lock and try to load module */
0228         write_unlock(&qdisc_mod_lock);
0229         request_module("sch_%s", name);
0230         write_lock(&qdisc_mod_lock);
0231 
0232         ops = qdisc_lookup_default(name);
0233     }
0234 
0235     if (ops) {
0236         /* Set new default */
0237         module_put(default_qdisc_ops->owner);
0238         default_qdisc_ops = ops;
0239     }
0240     write_unlock(&qdisc_mod_lock);
0241 
0242     return ops ? 0 : -ENOENT;
0243 }
0244 
0245 #ifdef CONFIG_NET_SCH_DEFAULT
0246 /* Late initcall: install the qdisc named by CONFIG_DEFAULT_NET_SCH as the default; returns qdisc_set_default()'s result. */
0247 static int __init sch_default_qdisc(void)
0248 {
0249     return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
0250 }
0251 late_initcall(sch_default_qdisc);
0252 #endif
0253 
0254 /* We know handle. Find qdisc among all qdisc's attached to device
0255  * (root qdisc, all its children, children of children etc.)
0256  * Note: caller either uses rtnl or rcu_read_lock()
0257  */
0258 
0259 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
0260 {
0261     struct Qdisc *q;
0262 
0263     if (!qdisc_dev(root))
0264         return (root->handle == handle ? root : NULL);
0265 
0266     if (!(root->flags & TCQ_F_BUILTIN) &&
0267         root->handle == handle)
0268         return root;
0269 
0270     hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
0271                    lockdep_rtnl_is_held()) {
0272         if (q->handle == handle)
0273             return q;
0274     }
0275     return NULL;
0276 }
0277 
0278 void qdisc_hash_add(struct Qdisc *q, bool invisible)
0279 {
0280     if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
0281         ASSERT_RTNL();
0282         hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
0283         if (invisible)
0284             q->flags |= TCQ_F_INVISIBLE;
0285     }
0286 }
0287 EXPORT_SYMBOL(qdisc_hash_add);
0288 
0289 void qdisc_hash_del(struct Qdisc *q)
0290 {
0291     if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
0292         ASSERT_RTNL();
0293         hash_del_rcu(&q->hash);
0294     }
0295 }
0296 EXPORT_SYMBOL(qdisc_hash_del);
0297 
0298 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
0299 {
0300     struct Qdisc *q;
0301 
0302     if (!handle)
0303         return NULL;
0304     q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
0305     if (q)
0306         goto out;
0307 
0308     if (dev_ingress_queue(dev))
0309         q = qdisc_match_from_root(
0310             dev_ingress_queue(dev)->qdisc_sleeping,
0311             handle);
0312 out:
0313     return q;
0314 }
0315 
0316 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
0317 {
0318     struct netdev_queue *nq;
0319     struct Qdisc *q;
0320 
0321     if (!handle)
0322         return NULL;
0323     q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
0324     if (q)
0325         goto out;
0326 
0327     nq = dev_ingress_queue_rcu(dev);
0328     if (nq)
0329         q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
0330 out:
0331     return q;
0332 }
0333 
0334 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
0335 {
0336     unsigned long cl;
0337     const struct Qdisc_class_ops *cops = p->ops->cl_ops;
0338 
0339     if (cops == NULL)
0340         return NULL;
0341     cl = cops->find(p, classid);
0342 
0343     if (cl == 0)
0344         return NULL;
0345     return cops->leaf(p, cl);
0346 }
0347 
0348 /* Find queueing discipline by name */
0349 
0350 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
0351 {
0352     struct Qdisc_ops *q = NULL;
0353 
0354     if (kind) {
0355         read_lock(&qdisc_mod_lock);
0356         for (q = qdisc_base; q; q = q->next) {
0357             if (nla_strcmp(kind, q->id) == 0) {
0358                 if (!try_module_get(q->owner))
0359                     q = NULL;
0360                 break;
0361             }
0362         }
0363         read_unlock(&qdisc_mod_lock);
0364     }
0365     return q;
0366 }
0367 
0368 /* The linklayer setting were not transferred from iproute2, in older
0369  * versions, and the rate tables lookup systems have been dropped in
0370  * the kernel. To keep backward compatible with older iproute2 tc
0371  * utils, we detect the linklayer setting by detecting if the rate
0372  * table were modified.
0373  *
0374  * For linklayer ATM table entries, the rate table will be aligned to
0375  * 48 bytes, thus some table entries will contain the same value.  The
0376  * mpu (min packet unit) is also encoded into the old rate table, thus
0377  * starting from the mpu, we find low and high table entries for
0378  * mapping this cell.  If these entries contain the same value, when
0379  * the rate tables have been modified for linklayer ATM.
0380  *
0381  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
0382  * and then roundup to the next cell, calc the table entry one below,
0383  * and compare.
0384  */
0385 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
0386 {
0387     int low       = roundup(r->mpu, 48);
0388     int high      = roundup(low+1, 48);
0389     int cell_low  = low >> r->cell_log;
0390     int cell_high = (high >> r->cell_log) - 1;
0391 
0392     /* rtab is too inaccurate at rates > 100Mbit/s */
0393     if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
0394         pr_debug("TC linklayer: Giving up ATM detection\n");
0395         return TC_LINKLAYER_ETHERNET;
0396     }
0397 
0398     if ((cell_high > cell_low) && (cell_high < 256)
0399         && (rtab[cell_low] == rtab[cell_high])) {
0400         pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
0401              cell_low, cell_high, rtab[cell_high]);
0402         return TC_LINKLAYER_ATM;
0403     }
0404     return TC_LINKLAYER_ETHERNET;
0405 }
0406 
0407 static struct qdisc_rate_table *qdisc_rtab_list;
0408 
0409 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
0410                     struct nlattr *tab,
0411                     struct netlink_ext_ack *extack)
0412 {
0413     struct qdisc_rate_table *rtab;
0414 
0415     if (tab == NULL || r->rate == 0 ||
0416         r->cell_log == 0 || r->cell_log >= 32 ||
0417         nla_len(tab) != TC_RTAB_SIZE) {
0418         NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
0419         return NULL;
0420     }
0421 
0422     for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
0423         if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
0424             !memcmp(&rtab->data, nla_data(tab), 1024)) {
0425             rtab->refcnt++;
0426             return rtab;
0427         }
0428     }
0429 
0430     rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
0431     if (rtab) {
0432         rtab->rate = *r;
0433         rtab->refcnt = 1;
0434         memcpy(rtab->data, nla_data(tab), 1024);
0435         if (r->linklayer == TC_LINKLAYER_UNAWARE)
0436             r->linklayer = __detect_linklayer(r, rtab->data);
0437         rtab->next = qdisc_rtab_list;
0438         qdisc_rtab_list = rtab;
0439     } else {
0440         NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
0441     }
0442     return rtab;
0443 }
0444 EXPORT_SYMBOL(qdisc_get_rtab);
0445 
0446 void qdisc_put_rtab(struct qdisc_rate_table *tab)
0447 {
0448     struct qdisc_rate_table *rtab, **rtabp;
0449 
0450     if (!tab || --tab->refcnt)
0451         return;
0452 
0453     for (rtabp = &qdisc_rtab_list;
0454          (rtab = *rtabp) != NULL;
0455          rtabp = &rtab->next) {
0456         if (rtab == tab) {
0457             *rtabp = rtab->next;
0458             kfree(rtab);
0459             return;
0460         }
0461     }
0462 }
0463 EXPORT_SYMBOL(qdisc_put_rtab);
0464 
0465 static LIST_HEAD(qdisc_stab_list);
0466 
0467 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
0468     [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
0469     [TCA_STAB_DATA] = { .type = NLA_BINARY },
0470 };
0471 
0472 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
0473                            struct netlink_ext_ack *extack)
0474 {
0475     struct nlattr *tb[TCA_STAB_MAX + 1];
0476     struct qdisc_size_table *stab;
0477     struct tc_sizespec *s;
0478     unsigned int tsize = 0;
0479     u16 *tab = NULL;
0480     int err;
0481 
0482     err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
0483                       extack);
0484     if (err < 0)
0485         return ERR_PTR(err);
0486     if (!tb[TCA_STAB_BASE]) {
0487         NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
0488         return ERR_PTR(-EINVAL);
0489     }
0490 
0491     s = nla_data(tb[TCA_STAB_BASE]);
0492 
0493     if (s->tsize > 0) {
0494         if (!tb[TCA_STAB_DATA]) {
0495             NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
0496             return ERR_PTR(-EINVAL);
0497         }
0498         tab = nla_data(tb[TCA_STAB_DATA]);
0499         tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
0500     }
0501 
0502     if (tsize != s->tsize || (!tab && tsize > 0)) {
0503         NL_SET_ERR_MSG(extack, "Invalid size of size table");
0504         return ERR_PTR(-EINVAL);
0505     }
0506 
0507     list_for_each_entry(stab, &qdisc_stab_list, list) {
0508         if (memcmp(&stab->szopts, s, sizeof(*s)))
0509             continue;
0510         if (tsize > 0 &&
0511             memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
0512             continue;
0513         stab->refcnt++;
0514         return stab;
0515     }
0516 
0517     if (s->size_log > STAB_SIZE_LOG_MAX ||
0518         s->cell_log > STAB_SIZE_LOG_MAX) {
0519         NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
0520         return ERR_PTR(-EINVAL);
0521     }
0522 
0523     stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
0524     if (!stab)
0525         return ERR_PTR(-ENOMEM);
0526 
0527     stab->refcnt = 1;
0528     stab->szopts = *s;
0529     if (tsize > 0)
0530         memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
0531 
0532     list_add_tail(&stab->list, &qdisc_stab_list);
0533 
0534     return stab;
0535 }
0536 
0537 void qdisc_put_stab(struct qdisc_size_table *tab)
0538 {
0539     if (!tab)
0540         return;
0541 
0542     if (--tab->refcnt == 0) {
0543         list_del(&tab->list);
0544         kfree_rcu(tab, rcu);
0545     }
0546 }
0547 EXPORT_SYMBOL(qdisc_put_stab);
0548 
0549 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
0550 {
0551     struct nlattr *nest;
0552 
0553     nest = nla_nest_start_noflag(skb, TCA_STAB);
0554     if (nest == NULL)
0555         goto nla_put_failure;
0556     if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
0557         goto nla_put_failure;
0558     nla_nest_end(skb, nest);
0559 
0560     return skb->len;
0561 
0562 nla_put_failure:
0563     return -1;
0564 }
0565 
0566 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
0567                    const struct qdisc_size_table *stab)
0568 {
0569     int pkt_len, slot;
0570 
0571     pkt_len = skb->len + stab->szopts.overhead;
0572     if (unlikely(!stab->szopts.tsize))
0573         goto out;
0574 
0575     slot = pkt_len + stab->szopts.cell_align;
0576     if (unlikely(slot < 0))
0577         slot = 0;
0578 
0579     slot >>= stab->szopts.cell_log;
0580     if (likely(slot < stab->szopts.tsize))
0581         pkt_len = stab->data[slot];
0582     else
0583         pkt_len = stab->data[stab->szopts.tsize - 1] *
0584                 (slot / stab->szopts.tsize) +
0585                 stab->data[slot % stab->szopts.tsize];
0586 
0587     pkt_len <<= stab->szopts.size_log;
0588 out:
0589     if (unlikely(pkt_len < 1))
0590         pkt_len = 1;
0591     qdisc_skb_cb(skb)->pkt_len = pkt_len;
0592 }
0593 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
0594 
0595 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
0596 {
0597     if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
0598         pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
0599             txt, qdisc->ops->id, qdisc->handle >> 16);
0600         qdisc->flags |= TCQ_F_WARN_NONWC;
0601     }
0602 }
0603 EXPORT_SYMBOL(qdisc_warn_nonwc);
0604 
0605 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
0606 {
0607     struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
0608                          timer);
0609 
0610     rcu_read_lock();
0611     __netif_schedule(qdisc_root(wd->qdisc));
0612     rcu_read_unlock();
0613 
0614     return HRTIMER_NORESTART;
0615 }
0616 
0617 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
0618                  clockid_t clockid)
0619 {
0620     hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
0621     wd->timer.function = qdisc_watchdog;
0622     wd->qdisc = qdisc;
0623 }
0624 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
0625 
/* Initialise watchdog @wd for @qdisc using CLOCK_MONOTONIC; see
 * qdisc_watchdog_init_clockid() for the actual setup.
 */
0626 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
0627 {
0628     qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
0629 }
0630 EXPORT_SYMBOL(qdisc_watchdog_init);
0631 
0632 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
0633                       u64 delta_ns)
0634 {
0635     if (test_bit(__QDISC_STATE_DEACTIVATED,
0636              &qdisc_root_sleeping(wd->qdisc)->state))
0637         return;
0638 
0639     if (hrtimer_is_queued(&wd->timer)) {
0640         /* If timer is already set in [expires, expires + delta_ns],
0641          * do not reprogram it.
0642          */
0643         if (wd->last_expires - expires <= delta_ns)
0644             return;
0645     }
0646 
0647     wd->last_expires = expires;
0648     hrtimer_start_range_ns(&wd->timer,
0649                    ns_to_ktime(expires),
0650                    delta_ns,
0651                    HRTIMER_MODE_ABS_PINNED);
0652 }
0653 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
0654 
/* Cancel the hrtimer of watchdog @wd via hrtimer_cancel(). */
0655 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
0656 {
0657     hrtimer_cancel(&wd->timer);
0658 }
0659 EXPORT_SYMBOL(qdisc_watchdog_cancel);
0660 
0661 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
0662 {
0663     struct hlist_head *h;
0664     unsigned int i;
0665 
0666     h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
0667 
0668     if (h != NULL) {
0669         for (i = 0; i < n; i++)
0670             INIT_HLIST_HEAD(&h[i]);
0671     }
0672     return h;
0673 }
0674 
0675 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
0676 {
0677     struct Qdisc_class_common *cl;
0678     struct hlist_node *next;
0679     struct hlist_head *nhash, *ohash;
0680     unsigned int nsize, nmask, osize;
0681     unsigned int i, h;
0682 
0683     /* Rehash when load factor exceeds 0.75 */
0684     if (clhash->hashelems * 4 <= clhash->hashsize * 3)
0685         return;
0686     nsize = clhash->hashsize * 2;
0687     nmask = nsize - 1;
0688     nhash = qdisc_class_hash_alloc(nsize);
0689     if (nhash == NULL)
0690         return;
0691 
0692     ohash = clhash->hash;
0693     osize = clhash->hashsize;
0694 
0695     sch_tree_lock(sch);
0696     for (i = 0; i < osize; i++) {
0697         hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
0698             h = qdisc_class_hash(cl->classid, nmask);
0699             hlist_add_head(&cl->hnode, &nhash[h]);
0700         }
0701     }
0702     clhash->hash     = nhash;
0703     clhash->hashsize = nsize;
0704     clhash->hashmask = nmask;
0705     sch_tree_unlock(sch);
0706 
0707     kvfree(ohash);
0708 }
0709 EXPORT_SYMBOL(qdisc_class_hash_grow);
0710 
0711 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
0712 {
0713     unsigned int size = 4;
0714 
0715     clhash->hash = qdisc_class_hash_alloc(size);
0716     if (!clhash->hash)
0717         return -ENOMEM;
0718     clhash->hashsize  = size;
0719     clhash->hashmask  = size - 1;
0720     clhash->hashelems = 0;
0721     return 0;
0722 }
0723 EXPORT_SYMBOL(qdisc_class_hash_init);
0724 
/* Free the bucket array of @clhash with kvfree(); the other fields of
 * @clhash are not touched.
 */
0725 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
0726 {
0727     kvfree(clhash->hash);
0728 }
0729 EXPORT_SYMBOL(qdisc_class_hash_destroy);
0730 
0731 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
0732                  struct Qdisc_class_common *cl)
0733 {
0734     unsigned int h;
0735 
0736     INIT_HLIST_NODE(&cl->hnode);
0737     h = qdisc_class_hash(cl->classid, clhash->hashmask);
0738     hlist_add_head(&cl->hnode, &clhash->hash[h]);
0739     clhash->hashelems++;
0740 }
0741 EXPORT_SYMBOL(qdisc_class_hash_insert);
0742 
/* Unlink class @cl from its hash bucket and decrement the element count
 * of @clhash.
 */
0743 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
0744                  struct Qdisc_class_common *cl)
0745 {
0746     hlist_del(&cl->hnode);
0747     clhash->hashelems--;
0748 }
0749 EXPORT_SYMBOL(qdisc_class_hash_remove);
0750 
0751 /* Allocate an unique handle from space managed by kernel
0752  * Possible range is [8000-FFFF]:0000 (0x8000 values)
0753  */
0754 static u32 qdisc_alloc_handle(struct net_device *dev)
0755 {
0756     int i = 0x8000;
0757     static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
0758 
0759     do {
0760         autohandle += TC_H_MAKE(0x10000U, 0);
0761         if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
0762             autohandle = TC_H_MAKE(0x80000000U, 0);
0763         if (!qdisc_lookup(dev, autohandle))
0764             return autohandle;
0765         cond_resched();
0766     } while (--i > 0);
0767 
0768     return 0;
0769 }
0770 
0771 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
0772 {
0773     bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
0774     const struct Qdisc_class_ops *cops;
0775     unsigned long cl;
0776     u32 parentid;
0777     bool notify;
0778     int drops;
0779 
0780     if (n == 0 && len == 0)
0781         return;
0782     drops = max_t(int, n, 0);
0783     rcu_read_lock();
0784     while ((parentid = sch->parent)) {
0785         if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
0786             break;
0787 
0788         if (sch->flags & TCQ_F_NOPARENT)
0789             break;
0790         /* Notify parent qdisc only if child qdisc becomes empty.
0791          *
0792          * If child was empty even before update then backlog
0793          * counter is screwed and we skip notification because
0794          * parent class is already passive.
0795          *
0796          * If the original child was offloaded then it is allowed
0797          * to be seem as empty, so the parent is notified anyway.
0798          */
0799         notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
0800                                !qdisc_is_offloaded);
0801         /* TODO: perform the search on a per txq basis */
0802         sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
0803         if (sch == NULL) {
0804             WARN_ON_ONCE(parentid != TC_H_ROOT);
0805             break;
0806         }
0807         cops = sch->ops->cl_ops;
0808         if (notify && cops->qlen_notify) {
0809             cl = cops->find(sch, parentid);
0810             cops->qlen_notify(sch, cl);
0811         }
0812         sch->q.qlen -= n;
0813         sch->qstats.backlog -= len;
0814         __qdisc_qstats_drop(sch, drops);
0815     }
0816     rcu_read_unlock();
0817 }
0818 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
0819 
0820 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
0821                   void *type_data)
0822 {
0823     struct net_device *dev = qdisc_dev(sch);
0824     int err;
0825 
0826     sch->flags &= ~TCQ_F_OFFLOADED;
0827     if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
0828         return 0;
0829 
0830     err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
0831     if (err == -EOPNOTSUPP)
0832         return 0;
0833 
0834     if (!err)
0835         sch->flags |= TCQ_F_OFFLOADED;
0836 
0837     return err;
0838 }
0839 EXPORT_SYMBOL(qdisc_offload_dump_helper);
0840 
0841 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
0842                 struct Qdisc *new, struct Qdisc *old,
0843                 enum tc_setup_type type, void *type_data,
0844                 struct netlink_ext_ack *extack)
0845 {
0846     bool any_qdisc_is_offloaded;
0847     int err;
0848 
0849     if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
0850         return;
0851 
0852     err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
0853 
0854     /* Don't report error if the graft is part of destroy operation. */
0855     if (!err || !new || new == &noop_qdisc)
0856         return;
0857 
0858     /* Don't report error if the parent, the old child and the new
0859      * one are not offloaded.
0860      */
0861     any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
0862     any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
0863     any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
0864 
0865     if (any_qdisc_is_offloaded)
0866         NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
0867 }
0868 EXPORT_SYMBOL(qdisc_offload_graft_helper);
0869 
0870 static void qdisc_offload_graft_root(struct net_device *dev,
0871                      struct Qdisc *new, struct Qdisc *old,
0872                      struct netlink_ext_ack *extack)
0873 {
0874     struct tc_root_qopt_offload graft_offload = {
0875         .command    = TC_ROOT_GRAFT,
0876         .handle     = new ? new->handle : 0,
0877         .ingress    = (new && new->flags & TCQ_F_INGRESS) ||
0878                   (old && old->flags & TCQ_F_INGRESS),
0879     };
0880 
0881     qdisc_offload_graft_helper(dev, NULL, new, old,
0882                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
0883 }
0884 
0885 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
0886              u32 portid, u32 seq, u16 flags, int event)
0887 {
0888     struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
0889     struct gnet_stats_queue __percpu *cpu_qstats = NULL;
0890     struct tcmsg *tcm;
0891     struct nlmsghdr  *nlh;
0892     unsigned char *b = skb_tail_pointer(skb);
0893     struct gnet_dump d;
0894     struct qdisc_size_table *stab;
0895     u32 block_index;
0896     __u32 qlen;
0897 
0898     cond_resched();
0899     nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
0900     if (!nlh)
0901         goto out_nlmsg_trim;
0902     tcm = nlmsg_data(nlh);
0903     tcm->tcm_family = AF_UNSPEC;
0904     tcm->tcm__pad1 = 0;
0905     tcm->tcm__pad2 = 0;
0906     tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
0907     tcm->tcm_parent = clid;
0908     tcm->tcm_handle = q->handle;
0909     tcm->tcm_info = refcount_read(&q->refcnt);
0910     if (nla_put_string(skb, TCA_KIND, q->ops->id))
0911         goto nla_put_failure;
0912     if (q->ops->ingress_block_get) {
0913         block_index = q->ops->ingress_block_get(q);
0914         if (block_index &&
0915             nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
0916             goto nla_put_failure;
0917     }
0918     if (q->ops->egress_block_get) {
0919         block_index = q->ops->egress_block_get(q);
0920         if (block_index &&
0921             nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
0922             goto nla_put_failure;
0923     }
0924     if (q->ops->dump && q->ops->dump(q, skb) < 0)
0925         goto nla_put_failure;
0926     if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
0927         goto nla_put_failure;
0928     qlen = qdisc_qlen_sum(q);
0929 
0930     stab = rtnl_dereference(q->stab);
0931     if (stab && qdisc_dump_stab(skb, stab) < 0)
0932         goto nla_put_failure;
0933 
0934     if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
0935                      NULL, &d, TCA_PAD) < 0)
0936         goto nla_put_failure;
0937 
0938     if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
0939         goto nla_put_failure;
0940 
0941     if (qdisc_is_percpu_stats(q)) {
0942         cpu_bstats = q->cpu_bstats;
0943         cpu_qstats = q->cpu_qstats;
0944     }
0945 
0946     if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
0947         gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
0948         gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
0949         goto nla_put_failure;
0950 
0951     if (gnet_stats_finish_copy(&d) < 0)
0952         goto nla_put_failure;
0953 
0954     nlh->nlmsg_len = skb_tail_pointer(skb) - b;
0955     return skb->len;
0956 
0957 out_nlmsg_trim:
0958 nla_put_failure:
0959     nlmsg_trim(skb, b);
0960     return -1;
0961 }
0962 
0963 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
0964 {
0965     if (q->flags & TCQ_F_BUILTIN)
0966         return true;
0967     if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
0968         return true;
0969 
0970     return false;
0971 }
0972 
0973 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
0974             struct nlmsghdr *n, u32 clid,
0975             struct Qdisc *old, struct Qdisc *new)
0976 {
0977     struct sk_buff *skb;
0978     u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
0979 
0980     skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
0981     if (!skb)
0982         return -ENOBUFS;
0983 
0984     if (old && !tc_qdisc_dump_ignore(old, false)) {
0985         if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
0986                   0, RTM_DELQDISC) < 0)
0987             goto err_out;
0988     }
0989     if (new && !tc_qdisc_dump_ignore(new, false)) {
0990         if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
0991                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
0992             goto err_out;
0993     }
0994 
0995     if (skb->len)
0996         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
0997                       n->nlmsg_flags & NLM_F_ECHO);
0998 
0999 err_out:
1000     kfree_skb(skb);
1001     return -EINVAL;
1002 }
1003 
1004 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1005                    struct nlmsghdr *n, u32 clid,
1006                    struct Qdisc *old, struct Qdisc *new)
1007 {
1008     if (new || old)
1009         qdisc_notify(net, skb, n, clid, old, new);
1010 
1011     if (old)
1012         qdisc_put(old);
1013 }
1014 
1015 static void qdisc_clear_nolock(struct Qdisc *sch)
1016 {
1017     sch->flags &= ~TCQ_F_NOLOCK;
1018     if (!(sch->flags & TCQ_F_CPUSTATS))
1019         return;
1020 
1021     free_percpu(sch->cpu_bstats);
1022     free_percpu(sch->cpu_qstats);
1023     sch->cpu_bstats = NULL;
1024     sch->cpu_qstats = NULL;
1025     sch->flags &= ~TCQ_F_CPUSTATS;
1026 }
1027 
1028 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1029  * to device "dev".
1030  *
1031  * When appropriate send a netlink notification using 'skb'
1032  * and "n".
1033  *
1034  * On success, destroy old qdisc.
1035  */
1036 
1037 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1038                struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1039                struct Qdisc *new, struct Qdisc *old,
1040                struct netlink_ext_ack *extack)
1041 {
1042     struct Qdisc *q = old;
1043     struct net *net = dev_net(dev);
1044 
1045     if (parent == NULL) {
1046         unsigned int i, num_q, ingress;
1047 
1048         ingress = 0;
1049         num_q = dev->num_tx_queues;
1050         if ((q && q->flags & TCQ_F_INGRESS) ||
1051             (new && new->flags & TCQ_F_INGRESS)) {
1052             num_q = 1;
1053             ingress = 1;
1054             if (!dev_ingress_queue(dev)) {
1055                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1056                 return -ENOENT;
1057             }
1058         }
1059 
1060         if (dev->flags & IFF_UP)
1061             dev_deactivate(dev);
1062 
1063         qdisc_offload_graft_root(dev, new, old, extack);
1064 
1065         if (new && new->ops->attach && !ingress)
1066             goto skip;
1067 
1068         for (i = 0; i < num_q; i++) {
1069             struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1070 
1071             if (!ingress)
1072                 dev_queue = netdev_get_tx_queue(dev, i);
1073 
1074             old = dev_graft_qdisc(dev_queue, new);
1075             if (new && i > 0)
1076                 qdisc_refcount_inc(new);
1077 
1078             if (!ingress)
1079                 qdisc_put(old);
1080         }
1081 
1082 skip:
1083         if (!ingress) {
1084             notify_and_destroy(net, skb, n, classid,
1085                        rtnl_dereference(dev->qdisc), new);
1086             if (new && !new->ops->attach)
1087                 qdisc_refcount_inc(new);
1088             rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1089 
1090             if (new && new->ops->attach)
1091                 new->ops->attach(new);
1092         } else {
1093             notify_and_destroy(net, skb, n, classid, old, new);
1094         }
1095 
1096         if (dev->flags & IFF_UP)
1097             dev_activate(dev);
1098     } else {
1099         const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1100         unsigned long cl;
1101         int err;
1102 
1103         /* Only support running class lockless if parent is lockless */
1104         if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1105             qdisc_clear_nolock(new);
1106 
1107         if (!cops || !cops->graft)
1108             return -EOPNOTSUPP;
1109 
1110         cl = cops->find(parent, classid);
1111         if (!cl) {
1112             NL_SET_ERR_MSG(extack, "Specified class not found");
1113             return -ENOENT;
1114         }
1115 
1116         err = cops->graft(parent, cl, new, &old, extack);
1117         if (err)
1118             return err;
1119         notify_and_destroy(net, skb, n, classid, old, new);
1120     }
1121     return 0;
1122 }
1123 
1124 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1125                    struct netlink_ext_ack *extack)
1126 {
1127     u32 block_index;
1128 
1129     if (tca[TCA_INGRESS_BLOCK]) {
1130         block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1131 
1132         if (!block_index) {
1133             NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1134             return -EINVAL;
1135         }
1136         if (!sch->ops->ingress_block_set) {
1137             NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1138             return -EOPNOTSUPP;
1139         }
1140         sch->ops->ingress_block_set(sch, block_index);
1141     }
1142     if (tca[TCA_EGRESS_BLOCK]) {
1143         block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1144 
1145         if (!block_index) {
1146             NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1147             return -EINVAL;
1148         }
1149         if (!sch->ops->egress_block_set) {
1150             NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1151             return -EOPNOTSUPP;
1152         }
1153         sch->ops->egress_block_set(sch, block_index);
1154     }
1155     return 0;
1156 }
1157 
1158 /*
1159    Allocate and initialize new qdisc.
1160 
1161    Parameters are passed via opt.
1162  */
1163 
1164 static struct Qdisc *qdisc_create(struct net_device *dev,
1165                   struct netdev_queue *dev_queue,
1166                   struct Qdisc *p, u32 parent, u32 handle,
1167                   struct nlattr **tca, int *errp,
1168                   struct netlink_ext_ack *extack)
1169 {
1170     int err;
1171     struct nlattr *kind = tca[TCA_KIND];
1172     struct Qdisc *sch;
1173     struct Qdisc_ops *ops;
1174     struct qdisc_size_table *stab;
1175 
1176     ops = qdisc_lookup_ops(kind);
1177 #ifdef CONFIG_MODULES
1178     if (ops == NULL && kind != NULL) {
1179         char name[IFNAMSIZ];
1180         if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
1181             /* We dropped the RTNL semaphore in order to
1182              * perform the module load.  So, even if we
1183              * succeeded in loading the module we have to
1184              * tell the caller to replay the request.  We
1185              * indicate this using -EAGAIN.
1186              * We replay the request because the device may
1187              * go away in the mean time.
1188              */
1189             rtnl_unlock();
1190             request_module("sch_%s", name);
1191             rtnl_lock();
1192             ops = qdisc_lookup_ops(kind);
1193             if (ops != NULL) {
1194                 /* We will try again qdisc_lookup_ops,
1195                  * so don't keep a reference.
1196                  */
1197                 module_put(ops->owner);
1198                 err = -EAGAIN;
1199                 goto err_out;
1200             }
1201         }
1202     }
1203 #endif
1204 
1205     err = -ENOENT;
1206     if (!ops) {
1207         NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
1208         goto err_out;
1209     }
1210 
1211     sch = qdisc_alloc(dev_queue, ops, extack);
1212     if (IS_ERR(sch)) {
1213         err = PTR_ERR(sch);
1214         goto err_out2;
1215     }
1216 
1217     sch->parent = parent;
1218 
1219     if (handle == TC_H_INGRESS) {
1220         sch->flags |= TCQ_F_INGRESS;
1221         handle = TC_H_MAKE(TC_H_INGRESS, 0);
1222     } else {
1223         if (handle == 0) {
1224             handle = qdisc_alloc_handle(dev);
1225             if (handle == 0) {
1226                 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1227                 err = -ENOSPC;
1228                 goto err_out3;
1229             }
1230         }
1231         if (!netif_is_multiqueue(dev))
1232             sch->flags |= TCQ_F_ONETXQUEUE;
1233     }
1234 
1235     sch->handle = handle;
1236 
1237     /* This exist to keep backward compatible with a userspace
1238      * loophole, what allowed userspace to get IFF_NO_QUEUE
1239      * facility on older kernels by setting tx_queue_len=0 (prior
1240      * to qdisc init), and then forgot to reinit tx_queue_len
1241      * before again attaching a qdisc.
1242      */
1243     if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1244         dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1245         netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1246     }
1247 
1248     err = qdisc_block_indexes_set(sch, tca, extack);
1249     if (err)
1250         goto err_out3;
1251 
1252     if (ops->init) {
1253         err = ops->init(sch, tca[TCA_OPTIONS], extack);
1254         if (err != 0)
1255             goto err_out5;
1256     }
1257 
1258     if (tca[TCA_STAB]) {
1259         stab = qdisc_get_stab(tca[TCA_STAB], extack);
1260         if (IS_ERR(stab)) {
1261             err = PTR_ERR(stab);
1262             goto err_out4;
1263         }
1264         rcu_assign_pointer(sch->stab, stab);
1265     }
1266     if (tca[TCA_RATE]) {
1267         err = -EOPNOTSUPP;
1268         if (sch->flags & TCQ_F_MQROOT) {
1269             NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1270             goto err_out4;
1271         }
1272 
1273         err = gen_new_estimator(&sch->bstats,
1274                     sch->cpu_bstats,
1275                     &sch->rate_est,
1276                     NULL,
1277                     true,
1278                     tca[TCA_RATE]);
1279         if (err) {
1280             NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1281             goto err_out4;
1282         }
1283     }
1284 
1285     qdisc_hash_add(sch, false);
1286     trace_qdisc_create(ops, dev, parent);
1287 
1288     return sch;
1289 
1290 err_out5:
1291     /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1292     if (ops->destroy)
1293         ops->destroy(sch);
1294 err_out3:
1295     netdev_put(dev, &sch->dev_tracker);
1296     qdisc_free(sch);
1297 err_out2:
1298     module_put(ops->owner);
1299 err_out:
1300     *errp = err;
1301     return NULL;
1302 
1303 err_out4:
1304     /*
1305      * Any broken qdiscs that would require a ops->reset() here?
1306      * The qdisc was never in action so it shouldn't be necessary.
1307      */
1308     qdisc_put_stab(rtnl_dereference(sch->stab));
1309     if (ops->destroy)
1310         ops->destroy(sch);
1311     goto err_out3;
1312 }
1313 
1314 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1315             struct netlink_ext_ack *extack)
1316 {
1317     struct qdisc_size_table *ostab, *stab = NULL;
1318     int err = 0;
1319 
1320     if (tca[TCA_OPTIONS]) {
1321         if (!sch->ops->change) {
1322             NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1323             return -EINVAL;
1324         }
1325         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1326             NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1327             return -EOPNOTSUPP;
1328         }
1329         err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1330         if (err)
1331             return err;
1332     }
1333 
1334     if (tca[TCA_STAB]) {
1335         stab = qdisc_get_stab(tca[TCA_STAB], extack);
1336         if (IS_ERR(stab))
1337             return PTR_ERR(stab);
1338     }
1339 
1340     ostab = rtnl_dereference(sch->stab);
1341     rcu_assign_pointer(sch->stab, stab);
1342     qdisc_put_stab(ostab);
1343 
1344     if (tca[TCA_RATE]) {
1345         /* NB: ignores errors from replace_estimator
1346            because change can't be undone. */
1347         if (sch->flags & TCQ_F_MQROOT)
1348             goto out;
1349         gen_replace_estimator(&sch->bstats,
1350                       sch->cpu_bstats,
1351                       &sch->rate_est,
1352                       NULL,
1353                       true,
1354                       tca[TCA_RATE]);
1355     }
1356 out:
1357     return 0;
1358 }
1359 
1360 struct check_loop_arg {
1361     struct qdisc_walker w;
1362     struct Qdisc        *p;
1363     int         depth;
1364 };
1365 
1366 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1367              struct qdisc_walker *w);
1368 
1369 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1370 {
1371     struct check_loop_arg   arg;
1372 
1373     if (q->ops->cl_ops == NULL)
1374         return 0;
1375 
1376     arg.w.stop = arg.w.skip = arg.w.count = 0;
1377     arg.w.fn = check_loop_fn;
1378     arg.depth = depth;
1379     arg.p = p;
1380     q->ops->cl_ops->walk(q, &arg.w);
1381     return arg.w.stop ? -ELOOP : 0;
1382 }
1383 
1384 static int
1385 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1386 {
1387     struct Qdisc *leaf;
1388     const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1389     struct check_loop_arg *arg = (struct check_loop_arg *)w;
1390 
1391     leaf = cops->leaf(q, cl);
1392     if (leaf) {
1393         if (leaf == arg->p || arg->depth > 7)
1394             return -ELOOP;
1395         return check_loop(leaf, arg->p, arg->depth + 1);
1396     }
1397     return 0;
1398 }
1399 
1400 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1401     [TCA_KIND]      = { .type = NLA_STRING },
1402     [TCA_RATE]      = { .type = NLA_BINARY,
1403                     .len = sizeof(struct tc_estimator) },
1404     [TCA_STAB]      = { .type = NLA_NESTED },
1405     [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1406     [TCA_CHAIN]     = { .type = NLA_U32 },
1407     [TCA_INGRESS_BLOCK] = { .type = NLA_U32 },
1408     [TCA_EGRESS_BLOCK]  = { .type = NLA_U32 },
1409 };
1410 
1411 /*
1412  * Delete/get qdisc.
1413  */
1414 
1415 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1416             struct netlink_ext_ack *extack)
1417 {
1418     struct net *net = sock_net(skb->sk);
1419     struct tcmsg *tcm = nlmsg_data(n);
1420     struct nlattr *tca[TCA_MAX + 1];
1421     struct net_device *dev;
1422     u32 clid;
1423     struct Qdisc *q = NULL;
1424     struct Qdisc *p = NULL;
1425     int err;
1426 
1427     if ((n->nlmsg_type != RTM_GETQDISC) &&
1428         !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1429         return -EPERM;
1430 
1431     err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1432                      rtm_tca_policy, extack);
1433     if (err < 0)
1434         return err;
1435 
1436     dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1437     if (!dev)
1438         return -ENODEV;
1439 
1440     clid = tcm->tcm_parent;
1441     if (clid) {
1442         if (clid != TC_H_ROOT) {
1443             if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1444                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1445                 if (!p) {
1446                     NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1447                     return -ENOENT;
1448                 }
1449                 q = qdisc_leaf(p, clid);
1450             } else if (dev_ingress_queue(dev)) {
1451                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1452             }
1453         } else {
1454             q = rtnl_dereference(dev->qdisc);
1455         }
1456         if (!q) {
1457             NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1458             return -ENOENT;
1459         }
1460 
1461         if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1462             NL_SET_ERR_MSG(extack, "Invalid handle");
1463             return -EINVAL;
1464         }
1465     } else {
1466         q = qdisc_lookup(dev, tcm->tcm_handle);
1467         if (!q) {
1468             NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1469             return -ENOENT;
1470         }
1471     }
1472 
1473     if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1474         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1475         return -EINVAL;
1476     }
1477 
1478     if (n->nlmsg_type == RTM_DELQDISC) {
1479         if (!clid) {
1480             NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1481             return -EINVAL;
1482         }
1483         if (q->handle == 0) {
1484             NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1485             return -ENOENT;
1486         }
1487         err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1488         if (err != 0)
1489             return err;
1490     } else {
1491         qdisc_notify(net, skb, n, clid, NULL, q);
1492     }
1493     return 0;
1494 }
1495 
1496 /*
1497  * Create/change qdisc.
1498  */
1499 
1500 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1501                struct netlink_ext_ack *extack)
1502 {
1503     struct net *net = sock_net(skb->sk);
1504     struct tcmsg *tcm;
1505     struct nlattr *tca[TCA_MAX + 1];
1506     struct net_device *dev;
1507     u32 clid;
1508     struct Qdisc *q, *p;
1509     int err;
1510 
1511     if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1512         return -EPERM;
1513 
1514 replay:
1515     /* Reinit, just in case something touches this. */
1516     err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1517                      rtm_tca_policy, extack);
1518     if (err < 0)
1519         return err;
1520 
1521     tcm = nlmsg_data(n);
1522     clid = tcm->tcm_parent;
1523     q = p = NULL;
1524 
1525     dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1526     if (!dev)
1527         return -ENODEV;
1528 
1529 
1530     if (clid) {
1531         if (clid != TC_H_ROOT) {
1532             if (clid != TC_H_INGRESS) {
1533                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1534                 if (!p) {
1535                     NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1536                     return -ENOENT;
1537                 }
1538                 q = qdisc_leaf(p, clid);
1539             } else if (dev_ingress_queue_create(dev)) {
1540                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1541             }
1542         } else {
1543             q = rtnl_dereference(dev->qdisc);
1544         }
1545 
1546         /* It may be default qdisc, ignore it */
1547         if (q && q->handle == 0)
1548             q = NULL;
1549 
1550         if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1551             if (tcm->tcm_handle) {
1552                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1553                     NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1554                     return -EEXIST;
1555                 }
1556                 if (TC_H_MIN(tcm->tcm_handle)) {
1557                     NL_SET_ERR_MSG(extack, "Invalid minor handle");
1558                     return -EINVAL;
1559                 }
1560                 q = qdisc_lookup(dev, tcm->tcm_handle);
1561                 if (!q)
1562                     goto create_n_graft;
1563                 if (n->nlmsg_flags & NLM_F_EXCL) {
1564                     NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1565                     return -EEXIST;
1566                 }
1567                 if (tca[TCA_KIND] &&
1568                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1569                     NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1570                     return -EINVAL;
1571                 }
1572                 if (q == p ||
1573                     (p && check_loop(q, p, 0))) {
1574                     NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1575                     return -ELOOP;
1576                 }
1577                 qdisc_refcount_inc(q);
1578                 goto graft;
1579             } else {
1580                 if (!q)
1581                     goto create_n_graft;
1582 
1583                 /* This magic test requires explanation.
1584                  *
1585                  *   We know, that some child q is already
1586                  *   attached to this parent and have choice:
1587                  *   either to change it or to create/graft new one.
1588                  *
1589                  *   1. We are allowed to create/graft only
1590                  *   if CREATE and REPLACE flags are set.
1591                  *
1592                  *   2. If EXCL is set, requestor wanted to say,
1593                  *   that qdisc tcm_handle is not expected
1594                  *   to exist, so that we choose create/graft too.
1595                  *
1596                  *   3. The last case is when no flags are set.
1597                  *   Alas, it is sort of hole in API, we
1598                  *   cannot decide what to do unambiguously.
1599                  *   For now we select create/graft, if
1600                  *   user gave KIND, which does not match existing.
1601                  */
1602                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1603                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1604                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1605                      (tca[TCA_KIND] &&
1606                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1607                     goto create_n_graft;
1608             }
1609         }
1610     } else {
1611         if (!tcm->tcm_handle) {
1612             NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1613             return -EINVAL;
1614         }
1615         q = qdisc_lookup(dev, tcm->tcm_handle);
1616     }
1617 
1618     /* Change qdisc parameters */
1619     if (!q) {
1620         NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1621         return -ENOENT;
1622     }
1623     if (n->nlmsg_flags & NLM_F_EXCL) {
1624         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1625         return -EEXIST;
1626     }
1627     if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1628         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1629         return -EINVAL;
1630     }
1631     err = qdisc_change(q, tca, extack);
1632     if (err == 0)
1633         qdisc_notify(net, skb, n, clid, NULL, q);
1634     return err;
1635 
1636 create_n_graft:
1637     if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1638         NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1639         return -ENOENT;
1640     }
1641     if (clid == TC_H_INGRESS) {
1642         if (dev_ingress_queue(dev)) {
1643             q = qdisc_create(dev, dev_ingress_queue(dev), p,
1644                      tcm->tcm_parent, tcm->tcm_parent,
1645                      tca, &err, extack);
1646         } else {
1647             NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1648             err = -ENOENT;
1649         }
1650     } else {
1651         struct netdev_queue *dev_queue;
1652 
1653         if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1654             dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1655         else if (p)
1656             dev_queue = p->dev_queue;
1657         else
1658             dev_queue = netdev_get_tx_queue(dev, 0);
1659 
1660         q = qdisc_create(dev, dev_queue, p,
1661                  tcm->tcm_parent, tcm->tcm_handle,
1662                  tca, &err, extack);
1663     }
1664     if (q == NULL) {
1665         if (err == -EAGAIN)
1666             goto replay;
1667         return err;
1668     }
1669 
1670 graft:
1671     err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1672     if (err) {
1673         if (q)
1674             qdisc_put(q);
1675         return err;
1676     }
1677 
1678     return 0;
1679 }
1680 
1681 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1682                   struct netlink_callback *cb,
1683                   int *q_idx_p, int s_q_idx, bool recur,
1684                   bool dump_invisible)
1685 {
1686     int ret = 0, q_idx = *q_idx_p;
1687     struct Qdisc *q;
1688     int b;
1689 
1690     if (!root)
1691         return 0;
1692 
1693     q = root;
1694     if (q_idx < s_q_idx) {
1695         q_idx++;
1696     } else {
1697         if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1698             tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1699                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1700                   RTM_NEWQDISC) <= 0)
1701             goto done;
1702         q_idx++;
1703     }
1704 
1705     /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1706      * itself has already been dumped.
1707      *
1708      * If we've already dumped the top-level (ingress) qdisc above and the global
1709      * qdisc hashtable, we don't want to hit it again
1710      */
1711     if (!qdisc_dev(root) || !recur)
1712         goto out;
1713 
1714     hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1715         if (q_idx < s_q_idx) {
1716             q_idx++;
1717             continue;
1718         }
1719         if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1720             tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1721                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1722                   RTM_NEWQDISC) <= 0)
1723             goto done;
1724         q_idx++;
1725     }
1726 
1727 out:
1728     *q_idx_p = q_idx;
1729     return ret;
1730 done:
1731     ret = -1;
1732     goto out;
1733 }
1734 
1735 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1736 {
1737     struct net *net = sock_net(skb->sk);
1738     int idx, q_idx;
1739     int s_idx, s_q_idx;
1740     struct net_device *dev;
1741     const struct nlmsghdr *nlh = cb->nlh;
1742     struct nlattr *tca[TCA_MAX + 1];
1743     int err;
1744 
1745     s_idx = cb->args[0];
1746     s_q_idx = q_idx = cb->args[1];
1747 
1748     idx = 0;
1749     ASSERT_RTNL();
1750 
1751     err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1752                      rtm_tca_policy, cb->extack);
1753     if (err < 0)
1754         return err;
1755 
1756     for_each_netdev(net, dev) {
1757         struct netdev_queue *dev_queue;
1758 
1759         if (idx < s_idx)
1760             goto cont;
1761         if (idx > s_idx)
1762             s_q_idx = 0;
1763         q_idx = 0;
1764 
1765         if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1766                        skb, cb, &q_idx, s_q_idx,
1767                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1768             goto done;
1769 
1770         dev_queue = dev_ingress_queue(dev);
1771         if (dev_queue &&
1772             tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1773                        &q_idx, s_q_idx, false,
1774                        tca[TCA_DUMP_INVISIBLE]) < 0)
1775             goto done;
1776 
1777 cont:
1778         idx++;
1779     }
1780 
1781 done:
1782     cb->args[0] = idx;
1783     cb->args[1] = q_idx;
1784 
1785     return skb->len;
1786 }
1787 
1788 
1789 
1790 /************************************************
1791  *  Traffic classes manipulation.       *
1792  ************************************************/
1793 
1794 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1795               unsigned long cl,
1796               u32 portid, u32 seq, u16 flags, int event)
1797 {
1798     struct tcmsg *tcm;
1799     struct nlmsghdr  *nlh;
1800     unsigned char *b = skb_tail_pointer(skb);
1801     struct gnet_dump d;
1802     const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1803 
1804     cond_resched();
1805     nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1806     if (!nlh)
1807         goto out_nlmsg_trim;
1808     tcm = nlmsg_data(nlh);
1809     tcm->tcm_family = AF_UNSPEC;
1810     tcm->tcm__pad1 = 0;
1811     tcm->tcm__pad2 = 0;
1812     tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1813     tcm->tcm_parent = q->handle;
1814     tcm->tcm_handle = q->handle;
1815     tcm->tcm_info = 0;
1816     if (nla_put_string(skb, TCA_KIND, q->ops->id))
1817         goto nla_put_failure;
1818     if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1819         goto nla_put_failure;
1820 
1821     if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1822                      NULL, &d, TCA_PAD) < 0)
1823         goto nla_put_failure;
1824 
1825     if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1826         goto nla_put_failure;
1827 
1828     if (gnet_stats_finish_copy(&d) < 0)
1829         goto nla_put_failure;
1830 
1831     nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1832     return skb->len;
1833 
1834 out_nlmsg_trim:
1835 nla_put_failure:
1836     nlmsg_trim(skb, b);
1837     return -1;
1838 }
1839 
1840 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1841              struct nlmsghdr *n, struct Qdisc *q,
1842              unsigned long cl, int event)
1843 {
1844     struct sk_buff *skb;
1845     u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1846 
1847     skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1848     if (!skb)
1849         return -ENOBUFS;
1850 
1851     if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1852         kfree_skb(skb);
1853         return -EINVAL;
1854     }
1855 
1856     return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1857                   n->nlmsg_flags & NLM_F_ECHO);
1858 }
1859 
1860 static int tclass_del_notify(struct net *net,
1861                  const struct Qdisc_class_ops *cops,
1862                  struct sk_buff *oskb, struct nlmsghdr *n,
1863                  struct Qdisc *q, unsigned long cl,
1864                  struct netlink_ext_ack *extack)
1865 {
1866     u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1867     struct sk_buff *skb;
1868     int err = 0;
1869 
1870     if (!cops->delete)
1871         return -EOPNOTSUPP;
1872 
1873     skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1874     if (!skb)
1875         return -ENOBUFS;
1876 
1877     if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1878                RTM_DELTCLASS) < 0) {
1879         kfree_skb(skb);
1880         return -EINVAL;
1881     }
1882 
1883     err = cops->delete(q, cl, extack);
1884     if (err) {
1885         kfree_skb(skb);
1886         return err;
1887     }
1888 
1889     err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1890                  n->nlmsg_flags & NLM_F_ECHO);
1891     return err;
1892 }
1893 
1894 #ifdef CONFIG_NET_CLS
1895 
1896 struct tcf_bind_args {
1897     struct tcf_walker w;
1898     unsigned long base;
1899     unsigned long cl;
1900     u32 classid;
1901 };
1902 
1903 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1904 {
1905     struct tcf_bind_args *a = (void *)arg;
1906 
1907     if (tp->ops->bind_class) {
1908         struct Qdisc *q = tcf_block_q(tp->chain->block);
1909 
1910         sch_tree_lock(q);
1911         tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1912         sch_tree_unlock(q);
1913     }
1914     return 0;
1915 }
1916 
1917 struct tc_bind_class_args {
1918     struct qdisc_walker w;
1919     unsigned long new_cl;
1920     u32 portid;
1921     u32 clid;
1922 };
1923 
1924 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1925                 struct qdisc_walker *w)
1926 {
1927     struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1928     const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1929     struct tcf_block *block;
1930     struct tcf_chain *chain;
1931 
1932     block = cops->tcf_block(q, cl, NULL);
1933     if (!block)
1934         return 0;
1935     for (chain = tcf_get_next_chain(block, NULL);
1936          chain;
1937          chain = tcf_get_next_chain(block, chain)) {
1938         struct tcf_proto *tp;
1939 
1940         for (tp = tcf_get_next_proto(chain, NULL);
1941              tp; tp = tcf_get_next_proto(chain, tp)) {
1942             struct tcf_bind_args arg = {};
1943 
1944             arg.w.fn = tcf_node_bind;
1945             arg.classid = a->clid;
1946             arg.base = cl;
1947             arg.cl = a->new_cl;
1948             tp->ops->walk(tp, &arg.w, true);
1949         }
1950     }
1951 
1952     return 0;
1953 }
1954 
1955 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1956                unsigned long new_cl)
1957 {
1958     const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1959     struct tc_bind_class_args args = {};
1960 
1961     if (!cops->tcf_block)
1962         return;
1963     args.portid = portid;
1964     args.clid = clid;
1965     args.new_cl = new_cl;
1966     args.w.fn = tc_bind_class_walker;
1967     q->ops->cl_ops->walk(q, &args.w);
1968 }
1969 
1970 #else
1971 
1972 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1973                unsigned long new_cl)
1974 {
1975 }
1976 
1977 #endif
1978 
1979 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1980              struct netlink_ext_ack *extack)
1981 {
1982     struct net *net = sock_net(skb->sk);
1983     struct tcmsg *tcm = nlmsg_data(n);
1984     struct nlattr *tca[TCA_MAX + 1];
1985     struct net_device *dev;
1986     struct Qdisc *q = NULL;
1987     const struct Qdisc_class_ops *cops;
1988     unsigned long cl = 0;
1989     unsigned long new_cl;
1990     u32 portid;
1991     u32 clid;
1992     u32 qid;
1993     int err;
1994 
1995     if ((n->nlmsg_type != RTM_GETTCLASS) &&
1996         !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1997         return -EPERM;
1998 
1999     err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2000                      rtm_tca_policy, extack);
2001     if (err < 0)
2002         return err;
2003 
2004     dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2005     if (!dev)
2006         return -ENODEV;
2007 
2008     /*
2009        parent == TC_H_UNSPEC - unspecified parent.
2010        parent == TC_H_ROOT   - class is root, which has no parent.
2011        parent == X:0     - parent is root class.
2012        parent == X:Y     - parent is a node in hierarchy.
2013        parent == 0:Y     - parent is X:Y, where X:0 is qdisc.
2014 
2015        handle == 0:0     - generate handle from kernel pool.
2016        handle == 0:Y     - class is X:Y, where X:0 is qdisc.
2017        handle == X:Y     - clear.
2018        handle == X:0     - root class.
2019      */
2020 
2021     /* Step 1. Determine qdisc handle X:0 */
2022 
2023     portid = tcm->tcm_parent;
2024     clid = tcm->tcm_handle;
2025     qid = TC_H_MAJ(clid);
2026 
2027     if (portid != TC_H_ROOT) {
2028         u32 qid1 = TC_H_MAJ(portid);
2029 
2030         if (qid && qid1) {
2031             /* If both majors are known, they must be identical. */
2032             if (qid != qid1)
2033                 return -EINVAL;
2034         } else if (qid1) {
2035             qid = qid1;
2036         } else if (qid == 0)
2037             qid = rtnl_dereference(dev->qdisc)->handle;
2038 
2039         /* Now qid is genuine qdisc handle consistent
2040          * both with parent and child.
2041          *
2042          * TC_H_MAJ(portid) still may be unspecified, complete it now.
2043          */
2044         if (portid)
2045             portid = TC_H_MAKE(qid, portid);
2046     } else {
2047         if (qid == 0)
2048             qid = rtnl_dereference(dev->qdisc)->handle;
2049     }
2050 
2051     /* OK. Locate qdisc */
2052     q = qdisc_lookup(dev, qid);
2053     if (!q)
2054         return -ENOENT;
2055 
2056     /* An check that it supports classes */
2057     cops = q->ops->cl_ops;
2058     if (cops == NULL)
2059         return -EINVAL;
2060 
2061     /* Now try to get class */
2062     if (clid == 0) {
2063         if (portid == TC_H_ROOT)
2064             clid = qid;
2065     } else
2066         clid = TC_H_MAKE(qid, clid);
2067 
2068     if (clid)
2069         cl = cops->find(q, clid);
2070 
2071     if (cl == 0) {
2072         err = -ENOENT;
2073         if (n->nlmsg_type != RTM_NEWTCLASS ||
2074             !(n->nlmsg_flags & NLM_F_CREATE))
2075             goto out;
2076     } else {
2077         switch (n->nlmsg_type) {
2078         case RTM_NEWTCLASS:
2079             err = -EEXIST;
2080             if (n->nlmsg_flags & NLM_F_EXCL)
2081                 goto out;
2082             break;
2083         case RTM_DELTCLASS:
2084             err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2085             /* Unbind the class with flilters with 0 */
2086             tc_bind_tclass(q, portid, clid, 0);
2087             goto out;
2088         case RTM_GETTCLASS:
2089             err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2090             goto out;
2091         default:
2092             err = -EINVAL;
2093             goto out;
2094         }
2095     }
2096 
2097     if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2098         NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2099         return -EOPNOTSUPP;
2100     }
2101 
2102     new_cl = cl;
2103     err = -EOPNOTSUPP;
2104     if (cops->change)
2105         err = cops->change(q, clid, portid, tca, &new_cl, extack);
2106     if (err == 0) {
2107         tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2108         /* We just create a new class, need to do reverse binding. */
2109         if (cl != new_cl)
2110             tc_bind_tclass(q, portid, clid, new_cl);
2111     }
2112 out:
2113     return err;
2114 }
2115 
2116 struct qdisc_dump_args {
2117     struct qdisc_walker w;
2118     struct sk_buff      *skb;
2119     struct netlink_callback *cb;
2120 };
2121 
2122 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2123                 struct qdisc_walker *arg)
2124 {
2125     struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2126 
2127     return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2128                   a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2129                   RTM_NEWTCLASS);
2130 }
2131 
2132 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2133                 struct tcmsg *tcm, struct netlink_callback *cb,
2134                 int *t_p, int s_t)
2135 {
2136     struct qdisc_dump_args arg;
2137 
2138     if (tc_qdisc_dump_ignore(q, false) ||
2139         *t_p < s_t || !q->ops->cl_ops ||
2140         (tcm->tcm_parent &&
2141          TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2142         (*t_p)++;
2143         return 0;
2144     }
2145     if (*t_p > s_t)
2146         memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2147     arg.w.fn = qdisc_class_dump;
2148     arg.skb = skb;
2149     arg.cb = cb;
2150     arg.w.stop  = 0;
2151     arg.w.skip = cb->args[1];
2152     arg.w.count = 0;
2153     q->ops->cl_ops->walk(q, &arg.w);
2154     cb->args[1] = arg.w.count;
2155     if (arg.w.stop)
2156         return -1;
2157     (*t_p)++;
2158     return 0;
2159 }
2160 
2161 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2162                    struct tcmsg *tcm, struct netlink_callback *cb,
2163                    int *t_p, int s_t, bool recur)
2164 {
2165     struct Qdisc *q;
2166     int b;
2167 
2168     if (!root)
2169         return 0;
2170 
2171     if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2172         return -1;
2173 
2174     if (!qdisc_dev(root) || !recur)
2175         return 0;
2176 
2177     if (tcm->tcm_parent) {
2178         q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2179         if (q && q != root &&
2180             tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2181             return -1;
2182         return 0;
2183     }
2184     hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2185         if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2186             return -1;
2187     }
2188 
2189     return 0;
2190 }
2191 
2192 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2193 {
2194     struct tcmsg *tcm = nlmsg_data(cb->nlh);
2195     struct net *net = sock_net(skb->sk);
2196     struct netdev_queue *dev_queue;
2197     struct net_device *dev;
2198     int t, s_t;
2199 
2200     if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2201         return 0;
2202     dev = dev_get_by_index(net, tcm->tcm_ifindex);
2203     if (!dev)
2204         return 0;
2205 
2206     s_t = cb->args[0];
2207     t = 0;
2208 
2209     if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2210                 skb, tcm, cb, &t, s_t, true) < 0)
2211         goto done;
2212 
2213     dev_queue = dev_ingress_queue(dev);
2214     if (dev_queue &&
2215         tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2216                 &t, s_t, false) < 0)
2217         goto done;
2218 
2219 done:
2220     cb->args[0] = t;
2221 
2222     dev_put(dev);
2223     return skb->len;
2224 }
2225 
2226 #ifdef CONFIG_PROC_FS
2227 static int psched_show(struct seq_file *seq, void *v)
2228 {
2229     seq_printf(seq, "%08x %08x %08x %08x\n",
2230            (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2231            1000000,
2232            (u32)NSEC_PER_SEC / hrtimer_resolution);
2233 
2234     return 0;
2235 }
2236 
2237 static int __net_init psched_net_init(struct net *net)
2238 {
2239     struct proc_dir_entry *e;
2240 
2241     e = proc_create_single("psched", 0, net->proc_net, psched_show);
2242     if (e == NULL)
2243         return -ENOMEM;
2244 
2245     return 0;
2246 }
2247 
2248 static void __net_exit psched_net_exit(struct net *net)
2249 {
2250     remove_proc_entry("psched", net->proc_net);
2251 }
2252 #else
2253 static int __net_init psched_net_init(struct net *net)
2254 {
2255     return 0;
2256 }
2257 
2258 static void __net_exit psched_net_exit(struct net *net)
2259 {
2260 }
2261 #endif
2262 
2263 static struct pernet_operations psched_net_ops = {
2264     .init = psched_net_init,
2265     .exit = psched_net_exit,
2266 };
2267 
2268 static int __init pktsched_init(void)
2269 {
2270     int err;
2271 
2272     err = register_pernet_subsys(&psched_net_ops);
2273     if (err) {
2274         pr_err("pktsched_init: "
2275                "cannot initialize per netns operations\n");
2276         return err;
2277     }
2278 
2279     register_qdisc(&pfifo_fast_ops);
2280     register_qdisc(&pfifo_qdisc_ops);
2281     register_qdisc(&bfifo_qdisc_ops);
2282     register_qdisc(&pfifo_head_drop_qdisc_ops);
2283     register_qdisc(&mq_qdisc_ops);
2284     register_qdisc(&noqueue_qdisc_ops);
2285 
2286     rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2287     rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2288     rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2289               0);
2290     rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2291     rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2292     rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2293               0);
2294 
2295     return 0;
2296 }
2297 
2298 subsys_initcall(pktsched_init);