0001 /*
0002  * Common Block IO controller cgroup interface
0003  *
0004  * Based on ideas and code from CFQ, CFS and BFQ:
0005  * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
0006  *
0007  * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
0008  *            Paolo Valente <paolo.valente@unimore.it>
0009  *
0010  * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
0011  *                Nauman Rafique <nauman@google.com>
0012  *
0013  * For policy-specific per-blkcg data:
0014  * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
0015  *                    Arianna Avanzini <avanzini.arianna@gmail.com>
0016  */
0017 #include <linux/ioprio.h>
0018 #include <linux/kdev_t.h>
0019 #include <linux/module.h>
0020 #include <linux/err.h>
0021 #include <linux/blkdev.h>
0022 #include <linux/backing-dev.h>
0023 #include <linux/slab.h>
0024 #include <linux/genhd.h>
0025 #include <linux/delay.h>
0026 #include <linux/atomic.h>
0027 #include <linux/ctype.h>
0028 #include <linux/blk-cgroup.h>
0029 #include "blk.h"
0030 
0031 #define MAX_KEY_LEN 100
0032 
0033 /*
0034  * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
0035  * blkcg_pol_register_mutex nests outside of it and synchronizes entire
0036  * policy [un]register operations including cgroup file additions /
0037  * removals.  Putting cgroup file registration outside blkcg_pol_mutex
0038  * allows grabbing it from cgroup callbacks.
0039  */
0040 static DEFINE_MUTEX(blkcg_pol_register_mutex);
0041 static DEFINE_MUTEX(blkcg_pol_mutex);
0042 
0043 struct blkcg blkcg_root;
0044 EXPORT_SYMBOL_GPL(blkcg_root);
0045 
0046 struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
0047 
0048 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
0049 
0050 static LIST_HEAD(all_blkcgs);       /* protected by blkcg_pol_mutex */
0051 
0052 static bool blkcg_policy_enabled(struct request_queue *q,
0053                  const struct blkcg_policy *pol)
0054 {
0055     return pol && test_bit(pol->plid, q->blkcg_pols);
0056 }
0057 
0058 /**
0059  * blkg_free - free a blkg
0060  * @blkg: blkg to free
0061  *
0062  * Free @blkg which may be partially allocated.
0063  */
0064 static void blkg_free(struct blkcg_gq *blkg)
0065 {
0066     int i;
0067 
0068     if (!blkg)
0069         return;
0070 
0071     for (i = 0; i < BLKCG_MAX_POLS; i++)
0072         if (blkg->pd[i])
0073             blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
0074 
0075     if (blkg->blkcg != &blkcg_root)
0076         blk_exit_rl(&blkg->rl);
0077 
0078     blkg_rwstat_exit(&blkg->stat_ios);
0079     blkg_rwstat_exit(&blkg->stat_bytes);
0080     kfree(blkg);
0081 }
0082 
0083 /**
0084  * blkg_alloc - allocate a blkg
0085  * @blkcg: block cgroup the new blkg is associated with
0086  * @q: request_queue the new blkg is associated with
0087  * @gfp_mask: allocation mask to use
0088  *
0089  * Allocate a new blkg associating @blkcg and @q.
0090  */
0091 static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
0092                    gfp_t gfp_mask)
0093 {
0094     struct blkcg_gq *blkg;
0095     int i;
0096 
0097     /* alloc and init base part */
0098     blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
0099     if (!blkg)
0100         return NULL;
0101 
0102     if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
0103         blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
0104         goto err_free;
0105 
0106     blkg->q = q;
0107     INIT_LIST_HEAD(&blkg->q_node);
0108     blkg->blkcg = blkcg;
0109     atomic_set(&blkg->refcnt, 1);
0110 
0111     /* root blkg uses @q->root_rl, init rl only for !root blkgs */
0112     if (blkcg != &blkcg_root) {
0113         if (blk_init_rl(&blkg->rl, q, gfp_mask))
0114             goto err_free;
0115         blkg->rl.blkg = blkg;
0116     }
0117 
0118     for (i = 0; i < BLKCG_MAX_POLS; i++) {
0119         struct blkcg_policy *pol = blkcg_policy[i];
0120         struct blkg_policy_data *pd;
0121 
0122         if (!blkcg_policy_enabled(q, pol))
0123             continue;
0124 
0125         /* alloc per-policy data and attach it to blkg */
0126         pd = pol->pd_alloc_fn(gfp_mask, q->node);
0127         if (!pd)
0128             goto err_free;
0129 
0130         blkg->pd[i] = pd;
0131         pd->blkg = blkg;
0132         pd->plid = i;
0133     }
0134 
0135     return blkg;
0136 
0137 err_free:
0138     blkg_free(blkg);
0139     return NULL;
0140 }
0141 
0142 struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
0143                       struct request_queue *q, bool update_hint)
0144 {
0145     struct blkcg_gq *blkg;
0146 
0147     /*
0148      * Hint didn't match.  Look up from the radix tree.  Note that the
0149      * hint can only be updated under queue_lock as otherwise @blkg
0150      * could have already been removed from blkg_tree.  The caller is
0151      * responsible for grabbing queue_lock if @update_hint.
0152      */
0153     blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
0154     if (blkg && blkg->q == q) {
0155         if (update_hint) {
0156             lockdep_assert_held(q->queue_lock);
0157             rcu_assign_pointer(blkcg->blkg_hint, blkg);
0158         }
0159         return blkg;
0160     }
0161 
0162     return NULL;
0163 }
0164 EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
0165 
0166 /*
0167  * If @new_blkg is %NULL, this function tries to allocate a new one as
0168  * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
0169  */
0170 static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
0171                     struct request_queue *q,
0172                     struct blkcg_gq *new_blkg)
0173 {
0174     struct blkcg_gq *blkg;
0175     struct bdi_writeback_congested *wb_congested;
0176     int i, ret;
0177 
0178     WARN_ON_ONCE(!rcu_read_lock_held());
0179     lockdep_assert_held(q->queue_lock);
0180 
0181     /* blkg holds a reference to blkcg */
0182     if (!css_tryget_online(&blkcg->css)) {
0183         ret = -ENODEV;
0184         goto err_free_blkg;
0185     }
0186 
0187     wb_congested = wb_congested_get_create(&q->backing_dev_info,
0188                            blkcg->css.id,
0189                            GFP_NOWAIT | __GFP_NOWARN);
0190     if (!wb_congested) {
0191         ret = -ENOMEM;
0192         goto err_put_css;
0193     }
0194 
0195     /* allocate */
0196     if (!new_blkg) {
0197         new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
0198         if (unlikely(!new_blkg)) {
0199             ret = -ENOMEM;
0200             goto err_put_congested;
0201         }
0202     }
0203     blkg = new_blkg;
0204     blkg->wb_congested = wb_congested;
0205 
0206     /* link parent */
0207     if (blkcg_parent(blkcg)) {
0208         blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
0209         if (WARN_ON_ONCE(!blkg->parent)) {
0210             ret = -ENODEV;
0211             goto err_put_congested;
0212         }
0213         blkg_get(blkg->parent);
0214     }
0215 
0216     /* invoke per-policy init */
0217     for (i = 0; i < BLKCG_MAX_POLS; i++) {
0218         struct blkcg_policy *pol = blkcg_policy[i];
0219 
0220         if (blkg->pd[i] && pol->pd_init_fn)
0221             pol->pd_init_fn(blkg->pd[i]);
0222     }
0223 
0224     /* insert */
0225     spin_lock(&blkcg->lock);
0226     ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
0227     if (likely(!ret)) {
0228         hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
0229         list_add(&blkg->q_node, &q->blkg_list);
0230 
0231         for (i = 0; i < BLKCG_MAX_POLS; i++) {
0232             struct blkcg_policy *pol = blkcg_policy[i];
0233 
0234             if (blkg->pd[i] && pol->pd_online_fn)
0235                 pol->pd_online_fn(blkg->pd[i]);
0236         }
0237     }
0238     blkg->online = true;
0239     spin_unlock(&blkcg->lock);
0240 
0241     if (!ret)
0242         return blkg;
0243 
0244     /* @blkg failed to be fully initialized, use the usual release path */
0245     blkg_put(blkg);
0246     return ERR_PTR(ret);
0247 
0248 err_put_congested:
0249     wb_congested_put(wb_congested);
0250 err_put_css:
0251     css_put(&blkcg->css);
0252 err_free_blkg:
0253     blkg_free(new_blkg);
0254     return ERR_PTR(ret);
0255 }
0256 
0257 /**
0258  * blkg_lookup_create - lookup blkg, try to create one if not there
0259  * @blkcg: blkcg of interest
0260  * @q: request_queue of interest
0261  *
0262  * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
0263  * create one.  blkg creation is performed recursively from blkcg_root such
0264  * that all non-root blkgs have access to their parent blkgs.  This function
0265  * should be called under RCU read lock and @q->queue_lock.
0266  *
0267  * Returns pointer to the looked up or created blkg on success, ERR_PTR()
0268  * value on error.  If @q is dying, returns ERR_PTR(-ENODEV).  If @q is not
0269  * dying but is bypassing, returns ERR_PTR(-EBUSY).
0270  */
0271 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
0272                     struct request_queue *q)
0273 {
0274     struct blkcg_gq *blkg;
0275 
0276     WARN_ON_ONCE(!rcu_read_lock_held());
0277     lockdep_assert_held(q->queue_lock);
0278 
0279     /*
0280      * This could be the first entry point into the blkcg implementation and
0281      * we shouldn't allow anything to go through for a bypassing queue.
0282      */
0283     if (unlikely(blk_queue_bypass(q)))
0284         return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
0285 
0286     blkg = __blkg_lookup(blkcg, q, true);
0287     if (blkg)
0288         return blkg;
0289 
0290     /*
0291      * Create blkgs walking down from blkcg_root to @blkcg, so that all
0292      * non-root blkgs have access to their parents.
0293      */
0294     while (true) {
0295         struct blkcg *pos = blkcg;
0296         struct blkcg *parent = blkcg_parent(blkcg);
0297 
0298         while (parent && !__blkg_lookup(parent, q, false)) {
0299             pos = parent;
0300             parent = blkcg_parent(parent);
0301         }
0302 
0303         blkg = blkg_create(pos, q, NULL);
0304         if (pos == blkcg || IS_ERR(blkg))
0305             return blkg;
0306     }
0307 }
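
/*
 * A minimal usage sketch (not part of the original file): calling
 * blkg_lookup_create() under the locking its comment above requires.
 * The function name example_use_blkg() and the "use @blkg" step are
 * hypothetical; a real caller in this file is blkg_conf_prep() below.
 */
static void example_use_blkg(struct request_queue *q, struct blkcg *blkcg)
{
    struct blkcg_gq *blkg;

    rcu_read_lock();
    spin_lock_irq(q->queue_lock);

    blkg = blkg_lookup_create(blkcg, q);
    if (!IS_ERR(blkg)) {
        /* use @blkg only while RCU and the queue lock are held */
    }

    spin_unlock_irq(q->queue_lock);
    rcu_read_unlock();
}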
0308 
0309 static void blkg_destroy(struct blkcg_gq *blkg)
0310 {
0311     struct blkcg *blkcg = blkg->blkcg;
0312     struct blkcg_gq *parent = blkg->parent;
0313     int i;
0314 
0315     lockdep_assert_held(blkg->q->queue_lock);
0316     lockdep_assert_held(&blkcg->lock);
0317 
0318     /* Something is wrong if we are trying to remove the same group twice */
0319     WARN_ON_ONCE(list_empty(&blkg->q_node));
0320     WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
0321 
0322     for (i = 0; i < BLKCG_MAX_POLS; i++) {
0323         struct blkcg_policy *pol = blkcg_policy[i];
0324 
0325         if (blkg->pd[i] && pol->pd_offline_fn)
0326             pol->pd_offline_fn(blkg->pd[i]);
0327     }
0328 
0329     if (parent) {
0330         blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
0331         blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
0332     }
0333 
0334     blkg->online = false;
0335 
0336     radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
0337     list_del_init(&blkg->q_node);
0338     hlist_del_init_rcu(&blkg->blkcg_node);
0339 
0340     /*
0341      * Both setting the lookup hint to @blkg and clearing it are done
0342      * under queue_lock.  If the hint isn't pointing to @blkg now, it never
0343      * will.  Hint assignment itself can race safely.
0344      */
0345     if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
0346         rcu_assign_pointer(blkcg->blkg_hint, NULL);
0347 
0348     /*
0349      * Put the reference taken at the time of creation so that when all
0350      * queues are gone, group can be destroyed.
0351      */
0352     blkg_put(blkg);
0353 }
0354 
0355 /**
0356  * blkg_destroy_all - destroy all blkgs associated with a request_queue
0357  * @q: request_queue of interest
0358  *
0359  * Destroy all blkgs associated with @q.
0360  */
0361 static void blkg_destroy_all(struct request_queue *q)
0362 {
0363     struct blkcg_gq *blkg, *n;
0364 
0365     lockdep_assert_held(q->queue_lock);
0366 
0367     list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
0368         struct blkcg *blkcg = blkg->blkcg;
0369 
0370         spin_lock(&blkcg->lock);
0371         blkg_destroy(blkg);
0372         spin_unlock(&blkcg->lock);
0373     }
0374 
0375     q->root_blkg = NULL;
0376     q->root_rl.blkg = NULL;
0377 }
0378 
0379 /*
0380  * A group is RCU protected, but having an rcu lock does not mean that one
0381  * can access all the fields of blkg and assume these are valid.  For
0382  * example, don't try to follow throtl_data and request queue links.
0383  *
0384  * Having a reference to a blkg under RCU only allows access to values
0385  * local to the group, such as group stats and group rate limits.
0386  */
0387 void __blkg_release_rcu(struct rcu_head *rcu_head)
0388 {
0389     struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
0390 
0391     /* release the blkcg and parent blkg refs this blkg has been holding */
0392     css_put(&blkg->blkcg->css);
0393     if (blkg->parent)
0394         blkg_put(blkg->parent);
0395 
0396     wb_congested_put(blkg->wb_congested);
0397 
0398     blkg_free(blkg);
0399 }
0400 EXPORT_SYMBOL_GPL(__blkg_release_rcu);
0401 
0402 /*
0403  * The 'next' function used by blk_queue_for_each_rl().  It's a bit tricky
0404  * because the root blkg uses @q->root_rl instead of its own rl.
0405  */
0406 struct request_list *__blk_queue_next_rl(struct request_list *rl,
0407                      struct request_queue *q)
0408 {
0409     struct list_head *ent;
0410     struct blkcg_gq *blkg;
0411 
0412     /*
0413      * Determine the current blkg list_head.  The first entry is
0414      * root_rl which is off @q->blkg_list and mapped to the head.
0415      */
0416     if (rl == &q->root_rl) {
0417         ent = &q->blkg_list;
0418         /* There are no more block groups, hence no request lists */
0419         if (list_empty(ent))
0420             return NULL;
0421     } else {
0422         blkg = container_of(rl, struct blkcg_gq, rl);
0423         ent = &blkg->q_node;
0424     }
0425 
0426     /* walk to the next list_head, skip the root blkg */
0427     ent = ent->next;
0428     if (ent == &q->root_blkg->q_node)
0429         ent = ent->next;
0430     if (ent == &q->blkg_list)
0431         return NULL;
0432 
0433     blkg = container_of(ent, struct blkcg_gq, q_node);
0434     return &blkg->rl;
0435 }
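
/*
 * A minimal sketch (not part of the original file) of the iteration
 * this helper enables.  It assumes the blk_queue_for_each_rl() macro
 * from the blkcg headers, which starts at @q->root_rl and advances via
 * __blk_queue_next_rl(); the function name is hypothetical.
 */
static void example_walk_request_lists(struct request_queue *q)
{
    struct request_list *rl;

    blk_queue_for_each_rl(rl, q) {
        /* inspect or drain @rl here */
    }
}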
0436 
0437 static int blkcg_reset_stats(struct cgroup_subsys_state *css,
0438                  struct cftype *cftype, u64 val)
0439 {
0440     struct blkcg *blkcg = css_to_blkcg(css);
0441     struct blkcg_gq *blkg;
0442     int i;
0443 
0444     mutex_lock(&blkcg_pol_mutex);
0445     spin_lock_irq(&blkcg->lock);
0446 
0447     /*
0448      * Note that stat reset is racy - it doesn't synchronize against
0449      * stat updates.  This is a debug feature which shouldn't exist
0450      * anyway.  If you get hit by a race, retry.
0451      */
0452     hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
0453         blkg_rwstat_reset(&blkg->stat_bytes);
0454         blkg_rwstat_reset(&blkg->stat_ios);
0455 
0456         for (i = 0; i < BLKCG_MAX_POLS; i++) {
0457             struct blkcg_policy *pol = blkcg_policy[i];
0458 
0459             if (blkg->pd[i] && pol->pd_reset_stats_fn)
0460                 pol->pd_reset_stats_fn(blkg->pd[i]);
0461         }
0462     }
0463 
0464     spin_unlock_irq(&blkcg->lock);
0465     mutex_unlock(&blkcg_pol_mutex);
0466     return 0;
0467 }
0468 
0469 const char *blkg_dev_name(struct blkcg_gq *blkg)
0470 {
0471     /* some drivers (floppy) instantiate a queue w/o disk registered */
0472     if (blkg->q->backing_dev_info.dev)
0473         return dev_name(blkg->q->backing_dev_info.dev);
0474     return NULL;
0475 }
0476 EXPORT_SYMBOL_GPL(blkg_dev_name);
0477 
0478 /**
0479  * blkcg_print_blkgs - helper for printing per-blkg data
0480  * @sf: seq_file to print to
0481  * @blkcg: blkcg of interest
0482  * @prfill: fill function to print out a blkg
0483  * @pol: policy in question
0484  * @data: data to be passed to @prfill
0485  * @show_total: to print out sum of prfill return values or not
0486  *
0487  * This function invokes @prfill on each blkg of @blkcg if pd for the
0488  * policy specified by @pol exists.  @prfill is invoked with @sf, the
0489  * policy data and @data and the matching queue lock held.  If @show_total
0490  * is %true, the sum of the return values from @prfill is printed with
0491  * "Total" label at the end.
0492  *
0493  * This is to be used to construct print functions for the
0494  * cftype->seq_show method.
0495  */
0496 void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
0497                u64 (*prfill)(struct seq_file *,
0498                      struct blkg_policy_data *, int),
0499                const struct blkcg_policy *pol, int data,
0500                bool show_total)
0501 {
0502     struct blkcg_gq *blkg;
0503     u64 total = 0;
0504 
0505     rcu_read_lock();
0506     hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
0507         spin_lock_irq(blkg->q->queue_lock);
0508         if (blkcg_policy_enabled(blkg->q, pol))
0509             total += prfill(sf, blkg->pd[pol->plid], data);
0510         spin_unlock_irq(blkg->q->queue_lock);
0511     }
0512     rcu_read_unlock();
0513 
0514     if (show_total)
0515         seq_printf(sf, "Total %llu\n", (unsigned long long)total);
0516 }
0517 EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
0518 
0519 /**
0520  * __blkg_prfill_u64 - prfill helper for a single u64 value
0521  * @sf: seq_file to print to
0522  * @pd: policy private data of interest
0523  * @v: value to print
0524  *
0525  * Print @v to @sf for the device associated with @pd.
0526  */
0527 u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
0528 {
0529     const char *dname = blkg_dev_name(pd->blkg);
0530 
0531     if (!dname)
0532         return 0;
0533 
0534     seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
0535     return v;
0536 }
0537 EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
0538 
0539 /**
0540  * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
0541  * @sf: seq_file to print to
0542  * @pd: policy private data of interest
0543  * @rwstat: rwstat to print
0544  *
0545  * Print @rwstat to @sf for the device associated with @pd.
0546  */
0547 u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
0548              const struct blkg_rwstat *rwstat)
0549 {
0550     static const char *rwstr[] = {
0551         [BLKG_RWSTAT_READ]  = "Read",
0552         [BLKG_RWSTAT_WRITE] = "Write",
0553         [BLKG_RWSTAT_SYNC]  = "Sync",
0554         [BLKG_RWSTAT_ASYNC] = "Async",
0555     };
0556     const char *dname = blkg_dev_name(pd->blkg);
0557     u64 v;
0558     int i;
0559 
0560     if (!dname)
0561         return 0;
0562 
0563     for (i = 0; i < BLKG_RWSTAT_NR; i++)
0564         seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
0565                (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
0566 
0567     v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
0568         atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
0569     seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
0570     return v;
0571 }
0572 EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
0573 
0574 /**
0575  * blkg_prfill_stat - prfill callback for blkg_stat
0576  * @sf: seq_file to print to
0577  * @pd: policy private data of interest
0578  * @off: offset to the blkg_stat in @pd
0579  *
0580  * prfill callback for printing a blkg_stat.
0581  */
0582 u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
0583 {
0584     return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
0585 }
0586 EXPORT_SYMBOL_GPL(blkg_prfill_stat);
0587 
0588 /**
0589  * blkg_prfill_rwstat - prfill callback for blkg_rwstat
0590  * @sf: seq_file to print to
0591  * @pd: policy private data of interest
0592  * @off: offset to the blkg_rwstat in @pd
0593  *
0594  * prfill callback for printing a blkg_rwstat.
0595  */
0596 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
0597                int off)
0598 {
0599     struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
0600 
0601     return __blkg_prfill_rwstat(sf, pd, &rwstat);
0602 }
0603 EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
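
/*
 * A sketch (not part of the original file) of how a policy would build
 * a seq_show callback from blkcg_print_blkgs() and the prfill helpers
 * above.  "example_blkcg_policy" and "struct example_pd" are
 * hypothetical; the offsetof() only matches the pd pointer if the
 * embedded struct blkg_policy_data is the first member of example_pd.
 */
static int example_print_rwstat(struct seq_file *sf, void *v)
{
    blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
              &example_blkcg_policy,
              offsetof(struct example_pd, rwstat), true);
    return 0;
}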
0604 
0605 static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
0606                     struct blkg_policy_data *pd, int off)
0607 {
0608     struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
0609 
0610     return __blkg_prfill_rwstat(sf, pd, &rwstat);
0611 }
0612 
0613 /**
0614  * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
0615  * @sf: seq_file to print to
0616  * @v: unused
0617  *
0618  * To be used as cftype->seq_show to print blkg->stat_bytes.
0619  * cftype->private must be set to the blkcg_policy.
0620  */
0621 int blkg_print_stat_bytes(struct seq_file *sf, void *v)
0622 {
0623     blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
0624               blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
0625               offsetof(struct blkcg_gq, stat_bytes), true);
0626     return 0;
0627 }
0628 EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
0629 
0630 /**
0631  * blkg_print_stat_ios - seq_show callback for blkg->stat_ios
0632  * @sf: seq_file to print to
0633  * @v: unused
0634  *
0635  * To be used as cftype->seq_show to print blkg->stat_ios.  cftype->private
0636  * must be set to the blkcg_policy.
0637  */
0638 int blkg_print_stat_ios(struct seq_file *sf, void *v)
0639 {
0640     blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
0641               blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
0642               offsetof(struct blkcg_gq, stat_ios), true);
0643     return 0;
0644 }
0645 EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
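
/*
 * A sketch (not part of the original file) of wiring the two generic
 * printers above into a policy's cftypes.  As their comments note,
 * cftype->private must carry the blkcg_policy pointer.  The names
 * "example_files" and "example_blkcg_policy" are hypothetical.
 */
static struct cftype example_files[] = {
    {
        .name = "io_service_bytes",
        .private = (unsigned long)&example_blkcg_policy,
        .seq_show = blkg_print_stat_bytes,
    },
    {
        .name = "io_serviced",
        .private = (unsigned long)&example_blkcg_policy,
        .seq_show = blkg_print_stat_ios,
    },
    { } /* terminate */
};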
0646 
0647 static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
0648                           struct blkg_policy_data *pd,
0649                           int off)
0650 {
0651     struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
0652                                   NULL, off);
0653     return __blkg_prfill_rwstat(sf, pd, &rwstat);
0654 }
0655 
0656 /**
0657  * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
0658  * @sf: seq_file to print to
0659  * @v: unused
0660  */
0661 int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
0662 {
0663     blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
0664               blkg_prfill_rwstat_field_recursive,
0665               (void *)seq_cft(sf)->private,
0666               offsetof(struct blkcg_gq, stat_bytes), true);
0667     return 0;
0668 }
0669 EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
0670 
0671 /**
0672  * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
0673  * @sf: seq_file to print to
0674  * @v: unused
0675  */
0676 int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
0677 {
0678     blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
0679               blkg_prfill_rwstat_field_recursive,
0680               (void *)seq_cft(sf)->private,
0681               offsetof(struct blkcg_gq, stat_ios), true);
0682     return 0;
0683 }
0684 EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
0685 
0686 /**
0687  * blkg_stat_recursive_sum - collect hierarchical blkg_stat
0688  * @blkg: blkg of interest
0689  * @pol: blkcg_policy which contains the blkg_stat
0690  * @off: offset to the blkg_stat in blkg_policy_data or @blkg
0691  *
0692  * Collect the blkg_stat specified by @blkg, @pol and @off and all its
0693  * online descendants and their aux counts.  The caller must be holding the
0694  * queue lock for online tests.
0695  *
0696  * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
0697  * at @off bytes into @blkg's blkg_policy_data of the policy.
0698  */
0699 u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
0700                 struct blkcg_policy *pol, int off)
0701 {
0702     struct blkcg_gq *pos_blkg;
0703     struct cgroup_subsys_state *pos_css;
0704     u64 sum = 0;
0705 
0706     lockdep_assert_held(blkg->q->queue_lock);
0707 
0708     rcu_read_lock();
0709     blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
0710         struct blkg_stat *stat;
0711 
0712         if (!pos_blkg->online)
0713             continue;
0714 
0715         if (pol)
0716             stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
0717         else
0718             stat = (void *)blkg + off;
0719 
0720         sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
0721     }
0722     rcu_read_unlock();
0723 
0724     return sum;
0725 }
0726 EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
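
/*
 * A sketch (not part of the original file): a prfill callback that
 * prints the hierarchical total of a blkg_stat embedded in a policy's
 * per-blkg data.  blkcg_print_blkgs() invokes it with the queue lock
 * held, satisfying the requirement documented above.  The names
 * "example_prfill_stat_recursive" and "example_blkcg_policy" are
 * hypothetical.
 */
static u64 example_prfill_stat_recursive(struct seq_file *sf,
                     struct blkg_policy_data *pd, int off)
{
    u64 sum = blkg_stat_recursive_sum(pd->blkg, &example_blkcg_policy, off);

    return __blkg_prfill_u64(sf, pd, sum);
}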
0727 
0728 /**
0729  * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
0730  * @blkg: blkg of interest
0731  * @pol: blkcg_policy which contains the blkg_rwstat
0732  * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
0733  *
0734  * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
0735  * online descendants and their aux counts.  The caller must be holding the
0736  * queue lock for online tests.
0737  *
0738  * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
0739  * is at @off bytes into @blkg's blkg_policy_data of the policy.
0740  */
0741 struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
0742                          struct blkcg_policy *pol, int off)
0743 {
0744     struct blkcg_gq *pos_blkg;
0745     struct cgroup_subsys_state *pos_css;
0746     struct blkg_rwstat sum = { };
0747     int i;
0748 
0749     lockdep_assert_held(blkg->q->queue_lock);
0750 
0751     rcu_read_lock();
0752     blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
0753         struct blkg_rwstat *rwstat;
0754 
0755         if (!pos_blkg->online)
0756             continue;
0757 
0758         if (pol)
0759             rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
0760         else
0761             rwstat = (void *)pos_blkg + off;
0762 
0763         for (i = 0; i < BLKG_RWSTAT_NR; i++)
0764             atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
0765                 percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
0766                 &sum.aux_cnt[i]);
0767     }
0768     rcu_read_unlock();
0769 
0770     return sum;
0771 }
0772 EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
0773 
0774 /**
0775  * blkg_conf_prep - parse and prepare for per-blkg config update
0776  * @blkcg: target block cgroup
0777  * @pol: target policy
0778  * @input: input string
0779  * @ctx: blkg_conf_ctx to be filled
0780  *
0781  * Parse per-blkg config update from @input and initialize @ctx with the
0782  * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
0783  * part of @input following MAJ:MIN.  On success, returns with the RCU
0784  * read lock and queue lock held; must be paired with blkg_conf_finish().
0785  */
0786 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
0787            char *input, struct blkg_conf_ctx *ctx)
0788     __acquires(rcu) __acquires(disk->queue->queue_lock)
0789 {
0790     struct gendisk *disk;
0791     struct blkcg_gq *blkg;
0792     struct module *owner;
0793     unsigned int major, minor;
0794     int key_len, part, ret;
0795     char *body;
0796 
0797     if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
0798         return -EINVAL;
0799 
0800     body = input + key_len;
0801     if (!isspace(*body))
0802         return -EINVAL;
0803     body = skip_spaces(body);
0804 
0805     disk = get_gendisk(MKDEV(major, minor), &part);
0806     if (!disk)
0807         return -ENODEV;
0808     if (part) {
0809         owner = disk->fops->owner;
0810         put_disk(disk);
0811         module_put(owner);
0812         return -ENODEV;
0813     }
0814 
0815     rcu_read_lock();
0816     spin_lock_irq(disk->queue->queue_lock);
0817 
0818     if (blkcg_policy_enabled(disk->queue, pol))
0819         blkg = blkg_lookup_create(blkcg, disk->queue);
0820     else
0821         blkg = ERR_PTR(-EOPNOTSUPP);
0822 
0823     if (IS_ERR(blkg)) {
0824         ret = PTR_ERR(blkg);
0825         rcu_read_unlock();
0826         spin_unlock_irq(disk->queue->queue_lock);
0827         owner = disk->fops->owner;
0828         put_disk(disk);
0829         module_put(owner);
0830         /*
0831          * If the queue was bypassing, we should retry.  Do so after a
0832          * short msleep().  It isn't strictly necessary but the queue
0833          * can be bypassing for some time and it's always nice to
0834          * avoid busy looping.
0835          */
0836         if (ret == -EBUSY) {
0837             msleep(10);
0838             ret = restart_syscall();
0839         }
0840         return ret;
0841     }
0842 
0843     ctx->disk = disk;
0844     ctx->blkg = blkg;
0845     ctx->body = body;
0846     return 0;
0847 }
0848 EXPORT_SYMBOL_GPL(blkg_conf_prep);
0849 
0850 /**
0851  * blkg_conf_finish - finish up per-blkg config update
0852  * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
0853  *
0854  * Finish up after per-blkg config update.  This function must be paired
0855  * with blkg_conf_prep().
0856  */
0857 void blkg_conf_finish(struct blkg_conf_ctx *ctx)
0858     __releases(ctx->disk->queue->queue_lock) __releases(rcu)
0859 {
0860     struct module *owner;
0861 
0862     spin_unlock_irq(ctx->disk->queue->queue_lock);
0863     rcu_read_unlock();
0864     owner = ctx->disk->fops->owner;
0865     put_disk(ctx->disk);
0866     module_put(owner);
0867 }
0868 EXPORT_SYMBOL_GPL(blkg_conf_finish);
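
/*
 * A sketch (not part of the original file) of the intended pairing of
 * blkg_conf_prep() and blkg_conf_finish() in a policy's cftype write
 * callback.  "example_set_limit", "example_blkcg_policy" and the limit
 * update step are hypothetical.
 */
static ssize_t example_set_limit(struct kernfs_open_file *of,
                 char *buf, size_t nbytes, loff_t off)
{
    struct blkcg *blkcg = css_to_blkcg(of_css(of));
    struct blkg_conf_ctx ctx;
    u64 limit;
    int ret;

    /* parses "MAJ:MIN ..."; returns with RCU and the queue lock held */
    ret = blkg_conf_prep(blkcg, &example_blkcg_policy, buf, &ctx);
    if (ret)
        return ret;

    ret = -EINVAL;
    if (sscanf(ctx.body, "%llu", &limit) == 1) {
        /* ctx.blkg is the blkg of @blkcg on the named device */
        /* ... update the policy's per-blkg data here ... */
        ret = 0;
    }

    blkg_conf_finish(&ctx);     /* drops the queue lock and RCU */
    return ret ?: nbytes;
}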
0869 
0870 static int blkcg_print_stat(struct seq_file *sf, void *v)
0871 {
0872     struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
0873     struct blkcg_gq *blkg;
0874 
0875     rcu_read_lock();
0876 
0877     hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
0878         const char *dname;
0879         struct blkg_rwstat rwstat;
0880         u64 rbytes, wbytes, rios, wios;
0881 
0882         dname = blkg_dev_name(blkg);
0883         if (!dname)
0884             continue;
0885 
0886         spin_lock_irq(blkg->q->queue_lock);
0887 
0888         rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
0889                     offsetof(struct blkcg_gq, stat_bytes));
0890         rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
0891         wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
0892 
0893         rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
0894                     offsetof(struct blkcg_gq, stat_ios));
0895         rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
0896         wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
0897 
0898         spin_unlock_irq(blkg->q->queue_lock);
0899 
0900         if (rbytes || wbytes || rios || wios)
0901             seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
0902                    dname, rbytes, wbytes, rios, wios);
0903     }
0904 
0905     rcu_read_unlock();
0906     return 0;
0907 }
0908 
0909 static struct cftype blkcg_files[] = {
0910     {
0911         .name = "stat",
0912         .flags = CFTYPE_NOT_ON_ROOT,
0913         .seq_show = blkcg_print_stat,
0914     },
0915     { } /* terminate */
0916 };
0917 
0918 static struct cftype blkcg_legacy_files[] = {
0919     {
0920         .name = "reset_stats",
0921         .write_u64 = blkcg_reset_stats,
0922     },
0923     { } /* terminate */
0924 };
0925 
0926 /**
0927  * blkcg_css_offline - cgroup css_offline callback
0928  * @css: css of interest
0929  *
0930  * This function is called when @css is about to go away and responsible
0931  * for shooting down all blkgs associated with @css.  blkgs should be
0932  * removed while holding both q and blkcg locks.  As blkcg lock is nested
0933  * inside q lock, this function performs reverse double lock dancing.
0934  *
0935  * This is the blkcg counterpart of ioc_release_fn().
0936  */
0937 static void blkcg_css_offline(struct cgroup_subsys_state *css)
0938 {
0939     struct blkcg *blkcg = css_to_blkcg(css);
0940 
0941     spin_lock_irq(&blkcg->lock);
0942 
0943     while (!hlist_empty(&blkcg->blkg_list)) {
0944         struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
0945                         struct blkcg_gq, blkcg_node);
0946         struct request_queue *q = blkg->q;
0947 
0948         if (spin_trylock(q->queue_lock)) {
0949             blkg_destroy(blkg);
0950             spin_unlock(q->queue_lock);
0951         } else {
0952             spin_unlock_irq(&blkcg->lock);
0953             cpu_relax();
0954             spin_lock_irq(&blkcg->lock);
0955         }
0956     }
0957 
0958     spin_unlock_irq(&blkcg->lock);
0959 
0960     wb_blkcg_offline(blkcg);
0961 }
0962 
0963 static void blkcg_css_free(struct cgroup_subsys_state *css)
0964 {
0965     struct blkcg *blkcg = css_to_blkcg(css);
0966     int i;
0967 
0968     mutex_lock(&blkcg_pol_mutex);
0969 
0970     list_del(&blkcg->all_blkcgs_node);
0971 
0972     for (i = 0; i < BLKCG_MAX_POLS; i++)
0973         if (blkcg->cpd[i])
0974             blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
0975 
0976     mutex_unlock(&blkcg_pol_mutex);
0977 
0978     kfree(blkcg);
0979 }
0980 
0981 static struct cgroup_subsys_state *
0982 blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
0983 {
0984     struct blkcg *blkcg;
0985     struct cgroup_subsys_state *ret;
0986     int i;
0987 
0988     mutex_lock(&blkcg_pol_mutex);
0989 
0990     if (!parent_css) {
0991         blkcg = &blkcg_root;
0992     } else {
0993         blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
0994         if (!blkcg) {
0995             ret = ERR_PTR(-ENOMEM);
0996             goto free_blkcg;
0997         }
0998     }
0999 
1000     for (i = 0; i < BLKCG_MAX_POLS ; i++) {
1001         struct blkcg_policy *pol = blkcg_policy[i];
1002         struct blkcg_policy_data *cpd;
1003 
1004         /*
1005          * If the policy hasn't been registered yet, skip it; the
1006          * policy will allocate per-cgroup data for existing blkcgs
1007          * when it registers.  Otherwise, if the policy requires
1008          * per-cgroup data, allocate and initialize it here.
1009          */
1010         if (!pol || !pol->cpd_alloc_fn)
1011             continue;
1012 
1013         cpd = pol->cpd_alloc_fn(GFP_KERNEL);
1014         if (!cpd) {
1015             ret = ERR_PTR(-ENOMEM);
1016             goto free_pd_blkcg;
1017         }
1018         blkcg->cpd[i] = cpd;
1019         cpd->blkcg = blkcg;
1020         cpd->plid = i;
1021         if (pol->cpd_init_fn)
1022             pol->cpd_init_fn(cpd);
1023     }
1024 
1025     spin_lock_init(&blkcg->lock);
1026     INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
1027     INIT_HLIST_HEAD(&blkcg->blkg_list);
1028 #ifdef CONFIG_CGROUP_WRITEBACK
1029     INIT_LIST_HEAD(&blkcg->cgwb_list);
1030 #endif
1031     list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
1032 
1033     mutex_unlock(&blkcg_pol_mutex);
1034     return &blkcg->css;
1035 
1036 free_pd_blkcg:
1037     for (i--; i >= 0; i--)
1038         if (blkcg->cpd[i])
1039             blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
1040 free_blkcg:
1041     kfree(blkcg);
1042     mutex_unlock(&blkcg_pol_mutex);
1043     return ret;
1044 }
1045 
1046 /**
1047  * blkcg_init_queue - initialize blkcg part of request queue
1048  * @q: request_queue to initialize
1049  *
1050  * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
1051  * part of new request_queue @q.
1052  *
1053  * RETURNS:
1054  * 0 on success, -errno on failure.
1055  */
1056 int blkcg_init_queue(struct request_queue *q)
1057 {
1058     struct blkcg_gq *new_blkg, *blkg;
1059     bool preloaded;
1060     int ret;
1061 
1062     new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
1063     if (!new_blkg)
1064         return -ENOMEM;
1065 
1066     preloaded = !radix_tree_preload(GFP_KERNEL);
1067 
1068     /*
1069      * Make sure the root blkg exists.  As @q is bypassing at this
1070      * point, blkg_lookup_create() can't be used; open code the
1071      * insertion.
1072      */
1073     rcu_read_lock();
1074     spin_lock_irq(q->queue_lock);
1075     blkg = blkg_create(&blkcg_root, q, new_blkg);
1076     spin_unlock_irq(q->queue_lock);
1077     rcu_read_unlock();
1078 
1079     if (preloaded)
1080         radix_tree_preload_end();
1081 
1082     if (IS_ERR(blkg)) {
1083         blkg_free(new_blkg);
1084         return PTR_ERR(blkg);
1085     }
1086 
1087     q->root_blkg = blkg;
1088     q->root_rl.blkg = blkg;
1089 
1090     ret = blk_throtl_init(q);
1091     if (ret) {
1092         spin_lock_irq(q->queue_lock);
1093         blkg_destroy_all(q);
1094         spin_unlock_irq(q->queue_lock);
1095     }
1096     return ret;
1097 }
1098 
1099 /**
1100  * blkcg_drain_queue - drain blkcg part of request_queue
1101  * @q: request_queue to drain
1102  *
1103  * Called from blk_drain_queue().  Responsible for draining blkcg part.
1104  */
1105 void blkcg_drain_queue(struct request_queue *q)
1106 {
1107     lockdep_assert_held(q->queue_lock);
1108 
1109     /*
1110      * @q could be exiting and already have destroyed all blkgs as
1111      * indicated by NULL root_blkg.  If so, don't confuse policies.
1112      */
1113     if (!q->root_blkg)
1114         return;
1115 
1116     blk_throtl_drain(q);
1117 }
1118 
1119 /**
1120  * blkcg_exit_queue - exit and release blkcg part of request_queue
1121  * @q: request_queue being released
1122  *
1123  * Called from blk_release_queue().  Responsible for exiting blkcg part.
1124  */
1125 void blkcg_exit_queue(struct request_queue *q)
1126 {
1127     spin_lock_irq(q->queue_lock);
1128     blkg_destroy_all(q);
1129     spin_unlock_irq(q->queue_lock);
1130 
1131     blk_throtl_exit(q);
1132 }
1133 
1134 /*
1135  * We cannot support shared io contexts, as we have no means to support
1136  * two tasks with the same ioc in two different groups without major rework
1137  * of the main cic data structures.  For now we allow a task to change
1138  * its cgroup only if it's the only owner of its ioc.
1139  */
1140 static int blkcg_can_attach(struct cgroup_taskset *tset)
1141 {
1142     struct task_struct *task;
1143     struct cgroup_subsys_state *dst_css;
1144     struct io_context *ioc;
1145     int ret = 0;
1146 
1147     /* task_lock() is needed to avoid races with exit_io_context() */
1148     cgroup_taskset_for_each(task, dst_css, tset) {
1149         task_lock(task);
1150         ioc = task->io_context;
1151         if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1152             ret = -EINVAL;
1153         task_unlock(task);
1154         if (ret)
1155             break;
1156     }
1157     return ret;
1158 }
1159 
1160 static void blkcg_bind(struct cgroup_subsys_state *root_css)
1161 {
1162     int i;
1163 
1164     mutex_lock(&blkcg_pol_mutex);
1165 
1166     for (i = 0; i < BLKCG_MAX_POLS; i++) {
1167         struct blkcg_policy *pol = blkcg_policy[i];
1168         struct blkcg *blkcg;
1169 
1170         if (!pol || !pol->cpd_bind_fn)
1171             continue;
1172 
1173         list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
1174             if (blkcg->cpd[pol->plid])
1175                 pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
1176     }
1177     mutex_unlock(&blkcg_pol_mutex);
1178 }
1179 
1180 struct cgroup_subsys io_cgrp_subsys = {
1181     .css_alloc = blkcg_css_alloc,
1182     .css_offline = blkcg_css_offline,
1183     .css_free = blkcg_css_free,
1184     .can_attach = blkcg_can_attach,
1185     .bind = blkcg_bind,
1186     .dfl_cftypes = blkcg_files,
1187     .legacy_cftypes = blkcg_legacy_files,
1188     .legacy_name = "blkio",
1189 #ifdef CONFIG_MEMCG
1190     /*
1191      * This ensures that, if available, memcg is automatically enabled
1192      * together on the default hierarchy so that the owner cgroup can
1193      * be retrieved from writeback pages.
1194      */
1195     .depends_on = 1 << memory_cgrp_id,
1196 #endif
1197 };
1198 EXPORT_SYMBOL_GPL(io_cgrp_subsys);
1199 
1200 /**
1201  * blkcg_activate_policy - activate a blkcg policy on a request_queue
1202  * @q: request_queue of interest
1203  * @pol: blkcg policy to activate
1204  *
1205  * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
1206  * bypass mode to populate its blkgs with policy_data for @pol.
1207  *
1208  * Activation happens with @q bypassed, so nobody would be accessing blkgs
1209  * from IO path.  Update of each blkg is protected by both queue and blkcg
1210  * locks so that holding either lock and testing blkcg_policy_enabled() is
1211  * always enough for dereferencing policy data.
1212  *
1213  * The caller is responsible for synchronizing [de]activations and policy
1214  * [un]registrations.  Returns 0 on success, -errno on failure.
1215  */
1216 int blkcg_activate_policy(struct request_queue *q,
1217               const struct blkcg_policy *pol)
1218 {
1219     struct blkg_policy_data *pd_prealloc = NULL;
1220     struct blkcg_gq *blkg;
1221     int ret;
1222 
1223     if (blkcg_policy_enabled(q, pol))
1224         return 0;
1225 
1226     blk_queue_bypass_start(q);
1227 pd_prealloc:
1228     if (!pd_prealloc) {
1229         pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
1230         if (!pd_prealloc) {
1231             ret = -ENOMEM;
1232             goto out_bypass_end;
1233         }
1234     }
1235 
1236     spin_lock_irq(q->queue_lock);
1237 
1238     list_for_each_entry(blkg, &q->blkg_list, q_node) {
1239         struct blkg_policy_data *pd;
1240 
1241         if (blkg->pd[pol->plid])
1242             continue;
1243 
1244         pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
1245         if (!pd)
1246             swap(pd, pd_prealloc);
1247         if (!pd) {
1248             spin_unlock_irq(q->queue_lock);
1249             goto pd_prealloc;
1250         }
1251 
1252         blkg->pd[pol->plid] = pd;
1253         pd->blkg = blkg;
1254         pd->plid = pol->plid;
1255         if (pol->pd_init_fn)
1256             pol->pd_init_fn(pd);
1257     }
1258 
1259     __set_bit(pol->plid, q->blkcg_pols);
1260     ret = 0;
1261 
1262     spin_unlock_irq(q->queue_lock);
1263 out_bypass_end:
1264     blk_queue_bypass_end(q);
1265     if (pd_prealloc)
1266         pol->pd_free_fn(pd_prealloc);
1267     return ret;
1268 }
1269 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
1270 
1271 /**
1272  * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
1273  * @q: request_queue of interest
1274  * @pol: blkcg policy to deactivate
1275  *
1276  * Deactivate @pol on @q.  Follows the same synchronization rules as
1277  * blkcg_activate_policy().
1278  */
1279 void blkcg_deactivate_policy(struct request_queue *q,
1280                  const struct blkcg_policy *pol)
1281 {
1282     struct blkcg_gq *blkg;
1283 
1284     if (!blkcg_policy_enabled(q, pol))
1285         return;
1286 
1287     blk_queue_bypass_start(q);
1288     spin_lock_irq(q->queue_lock);
1289 
1290     __clear_bit(pol->plid, q->blkcg_pols);
1291 
1292     list_for_each_entry(blkg, &q->blkg_list, q_node) {
1293         /* grab blkcg lock too while removing @pd from @blkg */
1294         spin_lock(&blkg->blkcg->lock);
1295 
1296         if (blkg->pd[pol->plid]) {
1297             if (pol->pd_offline_fn)
1298                 pol->pd_offline_fn(blkg->pd[pol->plid]);
1299             pol->pd_free_fn(blkg->pd[pol->plid]);
1300             blkg->pd[pol->plid] = NULL;
1301         }
1302 
1303         spin_unlock(&blkg->blkcg->lock);
1304     }
1305 
1306     spin_unlock_irq(q->queue_lock);
1307     blk_queue_bypass_end(q);
1308 }
1309 EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
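
/*
 * A sketch (not part of the original file): a policy typically
 * activates itself on a queue while setting up its per-queue state and
 * deactivates on teardown, relying on the two functions above.  The
 * example_* names are hypothetical.
 */
static int example_init_queue(struct request_queue *q)
{
    int ret;

    /* populates blkg->pd[] for every blkg already on @q */
    ret = blkcg_activate_policy(q, &example_blkcg_policy);
    if (ret)
        return ret;

    /* ... allocate and initialize per-queue data here ... */
    return 0;
}

static void example_exit_queue(struct request_queue *q)
{
    /* frees blkg->pd[] and clears the bit in q->blkcg_pols */
    blkcg_deactivate_policy(q, &example_blkcg_policy);
}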
1310 
1311 /**
1312  * blkcg_policy_register - register a blkcg policy
1313  * @pol: blkcg policy to register
1314  *
1315  * Register @pol with blkcg core.  Might sleep and @pol may be modified on
1316  * successful registration.  Returns 0 on success and -errno on failure.
1317  */
1318 int blkcg_policy_register(struct blkcg_policy *pol)
1319 {
1320     struct blkcg *blkcg;
1321     int i, ret;
1322 
1323     mutex_lock(&blkcg_pol_register_mutex);
1324     mutex_lock(&blkcg_pol_mutex);
1325 
1326     /* find an empty slot */
1327     ret = -ENOSPC;
1328     for (i = 0; i < BLKCG_MAX_POLS; i++)
1329         if (!blkcg_policy[i])
1330             break;
1331     if (i >= BLKCG_MAX_POLS)
1332         goto err_unlock;
1333 
1334     /* register @pol */
1335     pol->plid = i;
1336     blkcg_policy[pol->plid] = pol;
1337 
1338     /* allocate and install cpd's */
1339     if (pol->cpd_alloc_fn) {
1340         list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1341             struct blkcg_policy_data *cpd;
1342 
1343             cpd = pol->cpd_alloc_fn(GFP_KERNEL);
1344             if (!cpd)
1345                 goto err_free_cpds;
1346 
1347             blkcg->cpd[pol->plid] = cpd;
1348             cpd->blkcg = blkcg;
1349             cpd->plid = pol->plid;
1350             pol->cpd_init_fn(cpd);
1351         }
1352     }
1353 
1354     mutex_unlock(&blkcg_pol_mutex);
1355 
1356     /* everything is in place, add intf files for the new policy */
1357     if (pol->dfl_cftypes)
1358         WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
1359                            pol->dfl_cftypes));
1360     if (pol->legacy_cftypes)
1361         WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
1362                           pol->legacy_cftypes));
1363     mutex_unlock(&blkcg_pol_register_mutex);
1364     return 0;
1365 
1366 err_free_cpds:
1367     if (pol->cpd_alloc_fn) {
1368         list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1369             if (blkcg->cpd[pol->plid]) {
1370                 pol->cpd_free_fn(blkcg->cpd[pol->plid]);
1371                 blkcg->cpd[pol->plid] = NULL;
1372             }
1373         }
1374     }
1375     blkcg_policy[pol->plid] = NULL;
1376 err_unlock:
1377     mutex_unlock(&blkcg_pol_mutex);
1378     mutex_unlock(&blkcg_pol_register_mutex);
1379     return ret;
1380 }
1381 EXPORT_SYMBOL_GPL(blkcg_policy_register);
1382 
1383 /**
1384  * blkcg_policy_unregister - unregister a blkcg policy
1385  * @pol: blkcg policy to unregister
1386  *
1387  * Undo blkcg_policy_register(@pol).  Might sleep.
1388  */
1389 void blkcg_policy_unregister(struct blkcg_policy *pol)
1390 {
1391     struct blkcg *blkcg;
1392 
1393     mutex_lock(&blkcg_pol_register_mutex);
1394 
1395     if (WARN_ON(blkcg_policy[pol->plid] != pol))
1396         goto out_unlock;
1397 
1398     /* kill the intf files first */
1399     if (pol->dfl_cftypes)
1400         cgroup_rm_cftypes(pol->dfl_cftypes);
1401     if (pol->legacy_cftypes)
1402         cgroup_rm_cftypes(pol->legacy_cftypes);
1403 
1404     /* remove cpds and unregister */
1405     mutex_lock(&blkcg_pol_mutex);
1406 
1407     if (pol->cpd_alloc_fn) {
1408         list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1409             if (blkcg->cpd[pol->plid]) {
1410                 pol->cpd_free_fn(blkcg->cpd[pol->plid]);
1411                 blkcg->cpd[pol->plid] = NULL;
1412             }
1413         }
1414     }
1415     blkcg_policy[pol->plid] = NULL;
1416 
1417     mutex_unlock(&blkcg_pol_mutex);
1418 out_unlock:
1419     mutex_unlock(&blkcg_pol_register_mutex);
1420 }
1421 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
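
/*
 * A sketch (not part of the original file) tying the pieces together:
 * a minimal policy definition and its registration from module
 * init/exit.  All example_* names and struct example_pd are
 * hypothetical; real policies also set cftypes and the various
 * optional pd_ and cpd_ callbacks.
 */
struct example_pd {
    struct blkg_policy_data pd; /* embedded policy data */
    u64 limit;                  /* hypothetical per-blkg setting */
};

static struct blkg_policy_data *example_pd_alloc(gfp_t gfp, int node)
{
    struct example_pd *epd = kzalloc_node(sizeof(*epd), gfp, node);

    return epd ? &epd->pd : NULL;
}

static void example_pd_free(struct blkg_policy_data *pd)
{
    kfree(container_of(pd, struct example_pd, pd));
}

static struct blkcg_policy example_blkcg_policy = {
    .pd_alloc_fn    = example_pd_alloc,
    .pd_free_fn     = example_pd_free,
};

static int __init example_init(void)
{
    /* claims a slot in blkcg_policy[] and adds any interface files */
    return blkcg_policy_register(&example_blkcg_policy);
}
module_init(example_init);

static void __exit example_exit(void)
{
    blkcg_policy_unregister(&example_blkcg_policy);
}
module_exit(example_exit);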