0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Common Block IO controller cgroup interface
0004  *
0005  * Based on ideas and code from CFQ, CFS and BFQ:
0006  * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
0007  *
0008  * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
0009  *                    Paolo Valente <paolo.valente@unimore.it>
0010  *
0011  * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
0012  *                    Nauman Rafique <nauman@google.com>
0013  *
0014  * For policy-specific per-blkcg data:
0015  * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
0016  *                    Arianna Avanzini <avanzini.arianna@gmail.com>
0017  */
0018 #include <linux/ioprio.h>
0019 #include <linux/kdev_t.h>
0020 #include <linux/module.h>
0021 #include <linux/sched/signal.h>
0022 #include <linux/err.h>
0023 #include <linux/blkdev.h>
0024 #include <linux/backing-dev.h>
0025 #include <linux/slab.h>
0026 #include <linux/delay.h>
0027 #include <linux/atomic.h>
0028 #include <linux/ctype.h>
0029 #include <linux/resume_user_mode.h>
0030 #include <linux/psi.h>
0031 #include <linux/part_stat.h>
0032 #include "blk.h"
0033 #include "blk-cgroup.h"
0034 #include "blk-ioprio.h"
0035 #include "blk-throttle.h"
0036
0037 /*
0038  * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
0039  * blkcg_pol_register_mutex nests outside of it and synchronizes entire
0040  * policy [un]register operations including cgroup file additions /
0041  * removals.  Putting cgroup file registration outside blkcg_pol_mutex
0042  * protects against policy register/unregister.
0043  */
0044 static DEFINE_MUTEX(blkcg_pol_register_mutex);
0045 static DEFINE_MUTEX(blkcg_pol_mutex);
0046
0047 struct blkcg blkcg_root;
0048 EXPORT_SYMBOL_GPL(blkcg_root);
0049
0050 struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
0051 EXPORT_SYMBOL_GPL(blkcg_root_css);
0052
0053 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
0054
0055 static LIST_HEAD(all_blkcgs);
0056
0057 bool blkcg_debug_stats = false;
0058 static struct workqueue_struct *blkcg_punt_bio_wq;
0059
0060 #define BLKG_DESTROY_BATCH_SIZE 64
0061
0062 /**
0063  * blkcg_css - find the current css
0064  *
0065  * Find the css associated with either the kthread or the current task.
0066  * This may return a dying css, so it is up to the caller to use tryget
0067  * logic to confirm it is alive and well.
0068  */
0069 static struct cgroup_subsys_state *blkcg_css(void)
0070 {
0071 struct cgroup_subsys_state *css;
0072
0073 css = kthread_blkcg();
0074 if (css)
0075 return css;
0076 return task_css(current, io_cgrp_id);
0077 }
0078
0079 static bool blkcg_policy_enabled(struct request_queue *q,
0080 const struct blkcg_policy *pol)
0081 {
0082 return pol && test_bit(pol->plid, q->blkcg_pols);
0083 }
0084
0085 static void blkg_free_workfn(struct work_struct *work)
0086 {
0087 struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
0088 free_work);
0089 int i;
0090
0091 for (i = 0; i < BLKCG_MAX_POLS; i++)
0092 if (blkg->pd[i])
0093 blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
0094
0095 if (blkg->q)
0096 blk_put_queue(blkg->q);
0097 free_percpu(blkg->iostat_cpu);
0098 percpu_ref_exit(&blkg->refcnt);
0099 kfree(blkg);
0100 }
0101
0102 /**
0103  * blkg_free - free a blkg
0104  * @blkg: blkg to free
0105  *
0106  * Free @blkg which may be partially allocated.
0107  */
0108 static void blkg_free(struct blkcg_gq *blkg)
0109 {
0110 if (!blkg)
0111 return;
0112
0113
0114
0115
0116
0117 INIT_WORK(&blkg->free_work, blkg_free_workfn);
0118 schedule_work(&blkg->free_work);
0119 }
0120
0121 static void __blkg_release(struct rcu_head *rcu)
0122 {
0123 struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
0124
0125 WARN_ON(!bio_list_empty(&blkg->async_bios));
0126
0127
0128 css_put(&blkg->blkcg->css);
0129 if (blkg->parent)
0130 blkg_put(blkg->parent);
0131 blkg_free(blkg);
0132 }
0133
0134
0135
0136
0137
0138
0139
0140
0141
0142 static void blkg_release(struct percpu_ref *ref)
0143 {
0144 struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);
0145
0146 call_rcu(&blkg->rcu_head, __blkg_release);
0147 }
0148
0149 static void blkg_async_bio_workfn(struct work_struct *work)
0150 {
0151 struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
0152 async_bio_work);
0153 struct bio_list bios = BIO_EMPTY_LIST;
0154 struct bio *bio;
0155 struct blk_plug plug;
0156 bool need_plug = false;
0157
0158
0159 spin_lock_bh(&blkg->async_bio_lock);
0160 bio_list_merge(&bios, &blkg->async_bios);
0161 bio_list_init(&blkg->async_bios);
0162 spin_unlock_bh(&blkg->async_bio_lock);
0163
0164
0165 if (bios.head && bios.head->bi_next) {
0166 need_plug = true;
0167 blk_start_plug(&plug);
0168 }
0169 while ((bio = bio_list_pop(&bios)))
0170 submit_bio(bio);
0171 if (need_plug)
0172 blk_finish_plug(&plug);
0173 }
0174
0175 /**
0176  * bio_blkcg_css - return the blkcg CSS associated with a bio
0177  * @bio: target bio
0178  *
0179  * This returns the CSS for the blkcg associated with a bio, or %NULL if not
0180  * associated.  Callers are expected to either handle %NULL or know that the
0181  * association has been done prior to calling this.
0182  */
0183 struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio)
0184 {
0185 if (!bio || !bio->bi_blkg)
0186 return NULL;
0187 return &bio->bi_blkg->blkcg->css;
0188 }
0189 EXPORT_SYMBOL_GPL(bio_blkcg_css);
0190
0191
0192
0193
0194
0195
0196
0197 static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
0198 {
0199 return css_to_blkcg(blkcg->css.parent);
0200 }
0201
0202 /**
0203  * blkg_alloc - allocate a blkg
0204  * @blkcg: block cgroup the new blkg is associated with
0205  * @q: request_queue the new blkg is associated with
0206  * @gfp_mask: allocation mask to use
0207  *
0208  * Allocate a new blkg associating @blkcg and @q.
0209  */
0210 static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
0211 gfp_t gfp_mask)
0212 {
0213 struct blkcg_gq *blkg;
0214 int i, cpu;
0215
0216
0217 blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
0218 if (!blkg)
0219 return NULL;
0220
0221 if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
0222 goto err_free;
0223
0224 blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
0225 if (!blkg->iostat_cpu)
0226 goto err_free;
0227
0228 if (!blk_get_queue(q))
0229 goto err_free;
0230
0231 blkg->q = q;
0232 INIT_LIST_HEAD(&blkg->q_node);
0233 spin_lock_init(&blkg->async_bio_lock);
0234 bio_list_init(&blkg->async_bios);
0235 INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
0236 blkg->blkcg = blkcg;
0237
0238 u64_stats_init(&blkg->iostat.sync);
0239 for_each_possible_cpu(cpu)
0240 u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
0241
0242 for (i = 0; i < BLKCG_MAX_POLS; i++) {
0243 struct blkcg_policy *pol = blkcg_policy[i];
0244 struct blkg_policy_data *pd;
0245
0246 if (!blkcg_policy_enabled(q, pol))
0247 continue;
0248
0249
0250 pd = pol->pd_alloc_fn(gfp_mask, q, blkcg);
0251 if (!pd)
0252 goto err_free;
0253
0254 blkg->pd[i] = pd;
0255 pd->blkg = blkg;
0256 pd->plid = i;
0257 }
0258
0259 return blkg;
0260
0261 err_free:
0262 blkg_free(blkg);
0263 return NULL;
0264 }
0265
0266 struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
0267 struct request_queue *q, bool update_hint)
0268 {
0269 struct blkcg_gq *blkg;
0270
0271
0272
0273
0274
0275
0276
0277 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
0278 if (blkg && blkg->q == q) {
0279 if (update_hint) {
0280 lockdep_assert_held(&q->queue_lock);
0281 rcu_assign_pointer(blkcg->blkg_hint, blkg);
0282 }
0283 return blkg;
0284 }
0285
0286 return NULL;
0287 }
0288 EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
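/*
 * For reference (illustrative sketch, the real helper lives in
 * blk-cgroup.h): the fast-path blkg_lookup() is expected to try the
 * per-blkcg hint under RCU before falling back to this slowpath, roughly:
 *
 *	blkg = rcu_dereference(blkcg->blkg_hint);
 *	if (blkg && blkg->q == q)
 *		return blkg;
 *	return blkg_lookup_slowpath(blkcg, q, false);
 */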
0289
0290 /*
0291  * If @new_blkg is %NULL, this function tries to allocate a new one as
0292  * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
0293  */
0294 static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
0295 struct request_queue *q,
0296 struct blkcg_gq *new_blkg)
0297 {
0298 struct blkcg_gq *blkg;
0299 int i, ret;
0300
0301 lockdep_assert_held(&q->queue_lock);
0302
0303
0304 if (blk_queue_dying(q)) {
0305 ret = -ENODEV;
0306 goto err_free_blkg;
0307 }
0308
0309
0310 if (!css_tryget_online(&blkcg->css)) {
0311 ret = -ENODEV;
0312 goto err_free_blkg;
0313 }
0314
0315
0316 if (!new_blkg) {
0317 new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
0318 if (unlikely(!new_blkg)) {
0319 ret = -ENOMEM;
0320 goto err_put_css;
0321 }
0322 }
0323 blkg = new_blkg;
0324
0325
0326 if (blkcg_parent(blkcg)) {
0327 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
0328 if (WARN_ON_ONCE(!blkg->parent)) {
0329 ret = -ENODEV;
0330 goto err_put_css;
0331 }
0332 blkg_get(blkg->parent);
0333 }
0334
0335
0336 for (i = 0; i < BLKCG_MAX_POLS; i++) {
0337 struct blkcg_policy *pol = blkcg_policy[i];
0338
0339 if (blkg->pd[i] && pol->pd_init_fn)
0340 pol->pd_init_fn(blkg->pd[i]);
0341 }
0342
0343
0344 spin_lock(&blkcg->lock);
0345 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
0346 if (likely(!ret)) {
0347 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
0348 list_add(&blkg->q_node, &q->blkg_list);
0349
0350 for (i = 0; i < BLKCG_MAX_POLS; i++) {
0351 struct blkcg_policy *pol = blkcg_policy[i];
0352
0353 if (blkg->pd[i] && pol->pd_online_fn)
0354 pol->pd_online_fn(blkg->pd[i]);
0355 }
0356 }
0357 blkg->online = true;
0358 spin_unlock(&blkcg->lock);
0359
0360 if (!ret)
0361 return blkg;
0362
0363
0364 blkg_put(blkg);
0365 return ERR_PTR(ret);
0366
0367 err_put_css:
0368 css_put(&blkcg->css);
0369 err_free_blkg:
0370 blkg_free(new_blkg);
0371 return ERR_PTR(ret);
0372 }
0373
0374 /**
0375  * blkg_lookup_create - lookup blkg, try to create one if not there
0376  * @blkcg: blkcg of interest
0377  * @q: request_queue of interest
0378  *
0379  * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
0380  * create one.  blkg creation is performed recursively from blkcg_root such
0381  * that all non-root blkgs have access to the parent blkg.  This function
0382  * should be called under RCU read lock and takes @q->queue_lock.
0383  *
0384  * Returns the blkg or the closest blkg if blkg_create() fails as it walks
0385  * down from root.
0386  */
0387 static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
0388 struct request_queue *q)
0389 {
0390 struct blkcg_gq *blkg;
0391 unsigned long flags;
0392
0393 WARN_ON_ONCE(!rcu_read_lock_held());
0394
0395 blkg = blkg_lookup(blkcg, q);
0396 if (blkg)
0397 return blkg;
0398
0399 spin_lock_irqsave(&q->queue_lock, flags);
0400 blkg = __blkg_lookup(blkcg, q, true);
0401 if (blkg)
0402 goto found;
0403
0404
0405
0406
0407
0408
0409 while (true) {
0410 struct blkcg *pos = blkcg;
0411 struct blkcg *parent = blkcg_parent(blkcg);
0412 struct blkcg_gq *ret_blkg = q->root_blkg;
0413
0414 while (parent) {
0415 blkg = __blkg_lookup(parent, q, false);
0416 if (blkg) {
0417
0418 ret_blkg = blkg;
0419 break;
0420 }
0421 pos = parent;
0422 parent = blkcg_parent(parent);
0423 }
0424
0425 blkg = blkg_create(pos, q, NULL);
0426 if (IS_ERR(blkg)) {
0427 blkg = ret_blkg;
0428 break;
0429 }
0430 if (pos == blkcg)
0431 break;
0432 }
0433
0434 found:
0435 spin_unlock_irqrestore(&q->queue_lock, flags);
0436 return blkg;
0437 }
0438
0439 static void blkg_destroy(struct blkcg_gq *blkg)
0440 {
0441 struct blkcg *blkcg = blkg->blkcg;
0442 int i;
0443
0444 lockdep_assert_held(&blkg->q->queue_lock);
0445 lockdep_assert_held(&blkcg->lock);
0446
0447
0448 WARN_ON_ONCE(list_empty(&blkg->q_node));
0449 WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
0450
0451 for (i = 0; i < BLKCG_MAX_POLS; i++) {
0452 struct blkcg_policy *pol = blkcg_policy[i];
0453
0454 if (blkg->pd[i] && pol->pd_offline_fn)
0455 pol->pd_offline_fn(blkg->pd[i]);
0456 }
0457
0458 blkg->online = false;
0459
0460 radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
0461 list_del_init(&blkg->q_node);
0462 hlist_del_init_rcu(&blkg->blkcg_node);
0463
0464
0465
0466
0467
0468
0469 if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
0470 rcu_assign_pointer(blkcg->blkg_hint, NULL);
0471
0472
0473
0474
0475
0476 percpu_ref_kill(&blkg->refcnt);
0477 }
0478
0479 /**
0480  * blkg_destroy_all - destroy all blkgs associated with a request_queue
0481  * @q: request_queue of interest
0482  *
0483  * Destroy all blkgs associated with @q.
0484  */
0485 static void blkg_destroy_all(struct request_queue *q)
0486 {
0487 struct blkcg_gq *blkg, *n;
0488 int count = BLKG_DESTROY_BATCH_SIZE;
0489
0490 restart:
0491 spin_lock_irq(&q->queue_lock);
0492 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
0493 struct blkcg *blkcg = blkg->blkcg;
0494
0495 spin_lock(&blkcg->lock);
0496 blkg_destroy(blkg);
0497 spin_unlock(&blkcg->lock);
0498
0499
0500
0501
0502
0503 if (!(--count)) {
0504 count = BLKG_DESTROY_BATCH_SIZE;
0505 spin_unlock_irq(&q->queue_lock);
0506 cond_resched();
0507 goto restart;
0508 }
0509 }
0510
0511 q->root_blkg = NULL;
0512 spin_unlock_irq(&q->queue_lock);
0513 }
0514
0515 static int blkcg_reset_stats(struct cgroup_subsys_state *css,
0516 struct cftype *cftype, u64 val)
0517 {
0518 struct blkcg *blkcg = css_to_blkcg(css);
0519 struct blkcg_gq *blkg;
0520 int i, cpu;
0521
0522 mutex_lock(&blkcg_pol_mutex);
0523 spin_lock_irq(&blkcg->lock);
0524
0525 /*
0526  * Note that stat reset is racy - it doesn't synchronize against
0527  * stat updates.  This is a debug feature which shouldn't exist
0528  * anyway.  If you get hit by a race, retry.
0529  */
0530 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
0531 for_each_possible_cpu(cpu) {
0532 struct blkg_iostat_set *bis =
0533 per_cpu_ptr(blkg->iostat_cpu, cpu);
0534 memset(bis, 0, sizeof(*bis));
0535 }
0536 memset(&blkg->iostat, 0, sizeof(blkg->iostat));
0537
0538 for (i = 0; i < BLKCG_MAX_POLS; i++) {
0539 struct blkcg_policy *pol = blkcg_policy[i];
0540
0541 if (blkg->pd[i] && pol->pd_reset_stats_fn)
0542 pol->pd_reset_stats_fn(blkg->pd[i]);
0543 }
0544 }
0545
0546 spin_unlock_irq(&blkcg->lock);
0547 mutex_unlock(&blkcg_pol_mutex);
0548 return 0;
0549 }
0550
0551 const char *blkg_dev_name(struct blkcg_gq *blkg)
0552 {
0553 if (!blkg->q->disk || !blkg->q->disk->bdi->dev)
0554 return NULL;
0555 return bdi_dev_name(blkg->q->disk->bdi);
0556 }
0557
0558 /**
0559  * blkcg_print_blkgs - helper for printing per-blkg data
0560  * @sf: seq_file to print to
0561  * @blkcg: blkcg of interest
0562  * @prfill: fill function to print out a blkg
0563  * @pol: policy in question
0564  * @data: data to be passed to @prfill
0565  * @show_total: to print out sum of prfill return values or not
0566  *
0567  * This function invokes @prfill on each blkg of @blkcg if pd for the
0568  * policy specified by @pol exists.  @prfill is invoked with @sf, the
0569  * policy data and @data.  If @show_total is %true, the sum of the return
0570  * values from @prfill is printed with "Total" label at the end.
0571  *
0572  * This is to be used to construct any kind of printing which requires
0573  * iterating all blkgs of a blkcg while holding each blkg's queue lock,
0574  * e.g. policy specific per-device stat or configuration dumps.
0575  */
0576 void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
0577 u64 (*prfill)(struct seq_file *,
0578 struct blkg_policy_data *, int),
0579 const struct blkcg_policy *pol, int data,
0580 bool show_total)
0581 {
0582 struct blkcg_gq *blkg;
0583 u64 total = 0;
0584
0585 rcu_read_lock();
0586 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
0587 spin_lock_irq(&blkg->q->queue_lock);
0588 if (blkcg_policy_enabled(blkg->q, pol))
0589 total += prfill(sf, blkg->pd[pol->plid], data);
0590 spin_unlock_irq(&blkg->q->queue_lock);
0591 }
0592 rcu_read_unlock();
0593
0594 if (show_total)
0595 seq_printf(sf, "Total %llu\n", (unsigned long long)total);
0596 }
0597 EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
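/*
 * Illustrative only: a policy's seq_show callback typically pairs
 * blkcg_print_blkgs() with a prfill helper such as __blkg_prfill_u64()
 * below.  "blkcg_policy_foo" and foo_pd_to_weight() are hypothetical
 * stand-ins:
 *
 *	static u64 foo_prfill_weight(struct seq_file *sf,
 *				     struct blkg_policy_data *pd, int off)
 *	{
 *		return __blkg_prfill_u64(sf, pd, foo_pd_to_weight(pd));
 *	}
 *
 *	static int foo_print_weight(struct seq_file *sf, void *v)
 *	{
 *		blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
 *				  foo_prfill_weight, &blkcg_policy_foo,
 *				  0, false);
 *		return 0;
 *	}
 */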
0598
0599 /**
0600  * __blkg_prfill_u64 - prfill helper for a single u64 value
0601  * @sf: seq_file to print to
0602  * @pd: policy private data of interest
0603  * @v: value to print
0604  *
0605  * Print @v to @sf for the device associated with @pd.
0606  */
0607 u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
0608 {
0609 const char *dname = blkg_dev_name(pd->blkg);
0610
0611 if (!dname)
0612 return 0;
0613
0614 seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
0615 return v;
0616 }
0617 EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
0618
0619
0620 static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
0621 const struct blkcg_policy *pol,
0622 struct request_queue *q)
0623 {
0624 WARN_ON_ONCE(!rcu_read_lock_held());
0625 lockdep_assert_held(&q->queue_lock);
0626
0627 if (!blkcg_policy_enabled(q, pol))
0628 return ERR_PTR(-EOPNOTSUPP);
0629 return __blkg_lookup(blkcg, q, true /* update_hint */);
0630 }
0631
0632 /**
0633  * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update
0634  * @inputp: input string pointer
0635  *
0636  * Parse the device node prefix part, MAJ:MIN, of per-blkg config update
0637  * from @input and get and return the matching bdev.  *@inputp is
0638  * updated to point past the device node prefix.  Returns an ERR_PTR()
0639  * value on error.
0640  *
0641  * Use this function iff blkg_conf_prep() can't be used for some reason.
0642  */
0643 struct block_device *blkcg_conf_open_bdev(char **inputp)
0644 {
0645 char *input = *inputp;
0646 unsigned int major, minor;
0647 struct block_device *bdev;
0648 int key_len;
0649
0650 if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
0651 return ERR_PTR(-EINVAL);
0652
0653 input += key_len;
0654 if (!isspace(*input))
0655 return ERR_PTR(-EINVAL);
0656 input = skip_spaces(input);
0657
0658 bdev = blkdev_get_no_open(MKDEV(major, minor));
0659 if (!bdev)
0660 return ERR_PTR(-ENODEV);
0661 if (bdev_is_partition(bdev)) {
0662 blkdev_put_no_open(bdev);
0663 return ERR_PTR(-ENODEV);
0664 }
0665
0666 *inputp = input;
0667 return bdev;
0668 }
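/*
 * Example input accepted by the parser above (values are made up):
 * writing "8:32 100" selects the whole device with MAJ:MIN 8:32 and
 * leaves *inputp pointing at "100" for the caller to parse as the body.
 * Partitions are rejected with -ENODEV.
 */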
0669
0670 /**
0671  * blkg_conf_prep - parse and prepare for per-blkg config update
0672  * @blkcg: target block cgroup
0673  * @pol: target policy
0674  * @input: input string
0675  * @ctx: blkg_conf_ctx to be filled
0676  *
0677  * Parse per-blkg config update from @input and initialize @ctx with the
0678  * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
0679  * remaining part of @input following MAJ:MIN.  Returns 0 on success and
0680  * -errno on failure.  On success the caller must pair with blkg_conf_finish().
0681  */
0682 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
0683 char *input, struct blkg_conf_ctx *ctx)
0684 __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock)
0685 {
0686 struct block_device *bdev;
0687 struct request_queue *q;
0688 struct blkcg_gq *blkg;
0689 int ret;
0690
0691 bdev = blkcg_conf_open_bdev(&input);
0692 if (IS_ERR(bdev))
0693 return PTR_ERR(bdev);
0694
0695 q = bdev_get_queue(bdev);
0696
0697
0698
0699
0700
0701 ret = blk_queue_enter(q, 0);
0702 if (ret)
0703 goto fail;
0704
0705 rcu_read_lock();
0706 spin_lock_irq(&q->queue_lock);
0707
0708 blkg = blkg_lookup_check(blkcg, pol, q);
0709 if (IS_ERR(blkg)) {
0710 ret = PTR_ERR(blkg);
0711 goto fail_unlock;
0712 }
0713
0714 if (blkg)
0715 goto success;
0716
0717
0718
0719
0720
0721 while (true) {
0722 struct blkcg *pos = blkcg;
0723 struct blkcg *parent;
0724 struct blkcg_gq *new_blkg;
0725
0726 parent = blkcg_parent(blkcg);
0727 while (parent && !__blkg_lookup(parent, q, false)) {
0728 pos = parent;
0729 parent = blkcg_parent(parent);
0730 }
0731
0732
0733 spin_unlock_irq(&q->queue_lock);
0734 rcu_read_unlock();
0735
0736 new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
0737 if (unlikely(!new_blkg)) {
0738 ret = -ENOMEM;
0739 goto fail_exit_queue;
0740 }
0741
0742 if (radix_tree_preload(GFP_KERNEL)) {
0743 blkg_free(new_blkg);
0744 ret = -ENOMEM;
0745 goto fail_exit_queue;
0746 }
0747
0748 rcu_read_lock();
0749 spin_lock_irq(&q->queue_lock);
0750
0751 blkg = blkg_lookup_check(pos, pol, q);
0752 if (IS_ERR(blkg)) {
0753 ret = PTR_ERR(blkg);
0754 blkg_free(new_blkg);
0755 goto fail_preloaded;
0756 }
0757
0758 if (blkg) {
0759 blkg_free(new_blkg);
0760 } else {
0761 blkg = blkg_create(pos, q, new_blkg);
0762 if (IS_ERR(blkg)) {
0763 ret = PTR_ERR(blkg);
0764 goto fail_preloaded;
0765 }
0766 }
0767
0768 radix_tree_preload_end();
0769
0770 if (pos == blkcg)
0771 goto success;
0772 }
0773 success:
0774 blk_queue_exit(q);
0775 ctx->bdev = bdev;
0776 ctx->blkg = blkg;
0777 ctx->body = input;
0778 return 0;
0779
0780 fail_preloaded:
0781 radix_tree_preload_end();
0782 fail_unlock:
0783 spin_unlock_irq(&q->queue_lock);
0784 rcu_read_unlock();
0785 fail_exit_queue:
0786 blk_queue_exit(q);
0787 fail:
0788 blkdev_put_no_open(bdev);
0789
0790
0791
0792
0793
0794
0795 if (ret == -EBUSY) {
0796 msleep(10);
0797 ret = restart_syscall();
0798 }
0799 return ret;
0800 }
0801 EXPORT_SYMBOL_GPL(blkg_conf_prep);
0802
0803 /**
0804  * blkg_conf_finish - finish up per-blkg config update
0805  * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
0806  *
0807  * Finish up after per-blkg config update.  This function must be paired
0808  * with blkg_conf_prep().
0809  */
0810 void blkg_conf_finish(struct blkg_conf_ctx *ctx)
0811 __releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu)
0812 {
0813 spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
0814 rcu_read_unlock();
0815 blkdev_put_no_open(ctx->bdev);
0816 }
0817 EXPORT_SYMBOL_GPL(blkg_conf_finish);
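/*
 * Illustrative pairing of blkg_conf_prep()/blkg_conf_finish() in a
 * hypothetical policy file write handler ("blkcg_policy_foo" and
 * foo_parse_and_apply() are stand-ins; real policies such as
 * blk-throttle follow this shape):
 *
 *	static ssize_t foo_set_limit(struct kernfs_open_file *of, char *buf,
 *				     size_t nbytes, loff_t off)
 *	{
 *		struct blkcg *blkcg = css_to_blkcg(of_css(of));
 *		struct blkg_conf_ctx ctx;
 *		int ret;
 *
 *		ret = blkg_conf_prep(blkcg, &blkcg_policy_foo, buf, &ctx);
 *		if (ret)
 *			return ret;
 *		ret = foo_parse_and_apply(ctx.blkg, ctx.body);
 *		blkg_conf_finish(&ctx);
 *		return ret ?: nbytes;
 *	}
 */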
0818
0819 static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
0820 {
0821 int i;
0822
0823 for (i = 0; i < BLKG_IOSTAT_NR; i++) {
0824 dst->bytes[i] = src->bytes[i];
0825 dst->ios[i] = src->ios[i];
0826 }
0827 }
0828
0829 static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
0830 {
0831 int i;
0832
0833 for (i = 0; i < BLKG_IOSTAT_NR; i++) {
0834 dst->bytes[i] += src->bytes[i];
0835 dst->ios[i] += src->ios[i];
0836 }
0837 }
0838
0839 static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
0840 {
0841 int i;
0842
0843 for (i = 0; i < BLKG_IOSTAT_NR; i++) {
0844 dst->bytes[i] -= src->bytes[i];
0845 dst->ios[i] -= src->ios[i];
0846 }
0847 }
0848
0849 static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
0850 struct blkg_iostat *last)
0851 {
0852 struct blkg_iostat delta;
0853 unsigned long flags;
0854
0855
0856 flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
0857 blkg_iostat_set(&delta, cur);
0858 blkg_iostat_sub(&delta, last);
0859 blkg_iostat_add(&blkg->iostat.cur, &delta);
0860 blkg_iostat_add(last, &delta);
0861 u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
0862 }
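/*
 * Worked example of the delta propagation above (numbers are arbitrary):
 * if a per-cpu counter's cur.bytes[READ] is 4096 while last.bytes[READ]
 * is 1024, the delta of 3072 is added to the blkg's cumulative
 * iostat.cur and last is advanced to 4096, so the same bytes are never
 * counted twice on the next flush.
 */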
0863
0864 static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
0865 {
0866 struct blkcg *blkcg = css_to_blkcg(css);
0867 struct blkcg_gq *blkg;
0868
0869
0870 if (!cgroup_parent(css->cgroup))
0871 return;
0872
0873 rcu_read_lock();
0874
0875 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
0876 struct blkcg_gq *parent = blkg->parent;
0877 struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
0878 struct blkg_iostat cur;
0879 unsigned int seq;
0880
0881
0882 do {
0883 seq = u64_stats_fetch_begin(&bisc->sync);
0884 blkg_iostat_set(&cur, &bisc->cur);
0885 } while (u64_stats_fetch_retry(&bisc->sync, seq));
0886
0887 blkcg_iostat_update(blkg, &cur, &bisc->last);
0888
0889
0890 if (parent && parent->parent)
0891 blkcg_iostat_update(parent, &blkg->iostat.cur,
0892 &blkg->iostat.last);
0893 }
0894
0895 rcu_read_unlock();
0896 }
0897
0898 /*
0899  * Root cgroup stats are sourced from the system-wide disk stats instead
0900  * of being tracked separately, to avoid double accounting and to keep
0901  * the no-cgroup case free of overhead.  Because of that,
0902  * cgroup_rstat_flush() in blkcg_print_stat() does not fill out the
0903  * iostat in the root cgroup's blkcg_gq.
0904  *
0905  * To reuse the printing code between the root and non-root cgroups, a
0906  * flush of the root cgroup's stats is simulated by explicitly filling in
0907  * its iostat from the system-wide per-cpu disk stats whenever
0908  * blkcg_print_stat() is called for the root cgroup.
0909  */
0910 static void blkcg_fill_root_iostats(void)
0911 {
0912 struct class_dev_iter iter;
0913 struct device *dev;
0914
0915 class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
0916 while ((dev = class_dev_iter_next(&iter))) {
0917 struct block_device *bdev = dev_to_bdev(dev);
0918 struct blkcg_gq *blkg =
0919 blk_queue_root_blkg(bdev_get_queue(bdev));
0920 struct blkg_iostat tmp;
0921 int cpu;
0922 unsigned long flags;
0923
0924 memset(&tmp, 0, sizeof(tmp));
0925 for_each_possible_cpu(cpu) {
0926 struct disk_stats *cpu_dkstats;
0927
0928 cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
0929 tmp.ios[BLKG_IOSTAT_READ] +=
0930 cpu_dkstats->ios[STAT_READ];
0931 tmp.ios[BLKG_IOSTAT_WRITE] +=
0932 cpu_dkstats->ios[STAT_WRITE];
0933 tmp.ios[BLKG_IOSTAT_DISCARD] +=
0934 cpu_dkstats->ios[STAT_DISCARD];
0935
0936 tmp.bytes[BLKG_IOSTAT_READ] +=
0937 cpu_dkstats->sectors[STAT_READ] << 9;
0938 tmp.bytes[BLKG_IOSTAT_WRITE] +=
0939 cpu_dkstats->sectors[STAT_WRITE] << 9;
0940 tmp.bytes[BLKG_IOSTAT_DISCARD] +=
0941 cpu_dkstats->sectors[STAT_DISCARD] << 9;
0942 }
0943
0944 flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
0945 blkg_iostat_set(&blkg->iostat.cur, &tmp);
0946 u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
0947 }
0948 }
0949
0950 static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
0951 {
0952 struct blkg_iostat_set *bis = &blkg->iostat;
0953 u64 rbytes, wbytes, rios, wios, dbytes, dios;
0954 const char *dname;
0955 unsigned seq;
0956 int i;
0957
0958 if (!blkg->online)
0959 return;
0960
0961 dname = blkg_dev_name(blkg);
0962 if (!dname)
0963 return;
0964
0965 seq_printf(s, "%s ", dname);
0966
0967 do {
0968 seq = u64_stats_fetch_begin(&bis->sync);
0969
0970 rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
0971 wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
0972 dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
0973 rios = bis->cur.ios[BLKG_IOSTAT_READ];
0974 wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
0975 dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
0976 } while (u64_stats_fetch_retry(&bis->sync, seq));
0977
0978 if (rbytes || wbytes || rios || wios) {
0979 seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
0980 rbytes, wbytes, rios, wios,
0981 dbytes, dios);
0982 }
0983
0984 if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
0985 seq_printf(s, " use_delay=%d delay_nsec=%llu",
0986 atomic_read(&blkg->use_delay),
0987 atomic64_read(&blkg->delay_nsec));
0988 }
0989
0990 for (i = 0; i < BLKCG_MAX_POLS; i++) {
0991 struct blkcg_policy *pol = blkcg_policy[i];
0992
0993 if (!blkg->pd[i] || !pol->pd_stat_fn)
0994 continue;
0995
0996 pol->pd_stat_fn(blkg->pd[i], s);
0997 }
0998
0999 seq_puts(s, "\n");
1000 }
1001
1002 static int blkcg_print_stat(struct seq_file *sf, void *v)
1003 {
1004 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1005 struct blkcg_gq *blkg;
1006
1007 if (!seq_css(sf)->parent)
1008 blkcg_fill_root_iostats();
1009 else
1010 cgroup_rstat_flush(blkcg->css.cgroup);
1011
1012 rcu_read_lock();
1013 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
1014 spin_lock_irq(&blkg->q->queue_lock);
1015 blkcg_print_one_stat(blkg, sf);
1016 spin_unlock_irq(&blkg->q->queue_lock);
1017 }
1018 rcu_read_unlock();
1019 return 0;
1020 }
1021
1022 static struct cftype blkcg_files[] = {
1023 {
1024 .name = "stat",
1025 .seq_show = blkcg_print_stat,
1026 },
1027 { }
1028 };
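/*
 * Example io.stat output produced by blkcg_print_one_stat() above, one
 * line per online blkg; the device name comes from the backing bdi and
 * is typically "MAJ:MIN" (the numbers below are made up):
 *
 *	8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
 *
 * The use_delay/delay_nsec fields are appended only when
 * blkcg_debug_stats is enabled and a delay is currently applied.
 */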
1029
1030 static struct cftype blkcg_legacy_files[] = {
1031 {
1032 .name = "reset_stats",
1033 .write_u64 = blkcg_reset_stats,
1034 },
1035 { }
1036 };
1037
1038 #ifdef CONFIG_CGROUP_WRITEBACK
1039 struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css)
1040 {
1041 return &css_to_blkcg(css)->cgwb_list;
1042 }
1043 #endif
1044
1045 /*
1046  * blkcg destruction is a three-stage process.
1047  *
1048  * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
1049  *    which offlines writeback.  Here we tie the next stage of blkg
1050  *    destruction to the completion of writeback associated with the blkcg.
1051  *    This lets us avoid punting potentially large amounts of outstanding
1052  *    writeback to root while maintaining any ongoing policies.  The next
1053  *    stage is triggered when the nr_cgwbs count goes to zero.
1054  *
1055  * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called
1056  *    and handles the destruction of blkgs.  Here the css reference held by
1057  *    the blkg is put back eventually allowing blkcg_css_free() to run.
1058  *    This work may occur in cgwb_release_workfn() on the cgwb_release
1059  *    workqueue.  Any submitted ios that fail to get the blkg ref will be
1060  *    punted to the root_blkg.
1061  *
1062  * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called.
1063  *    This finally frees the blkcg.
1064  */
1065
1066 /**
1067  * blkcg_destroy_blkgs - responsible for shooting down blkgs
1068  * @blkcg: blkcg of interest
1069  *
1070  * blkgs should be removed while holding both q and blkcg locks.  As blkcg
1071  * lock is nested inside q lock, this function performs reverse double lock
1072  * dancing.  Destroying the blkgs releases the reference held on the
1073  * blkcg's css, allowing blkcg_css_free() to eventually be called.
1074  *
1075  * This is the blkcg counterpart of ioc_release_fn().
1076  */
1077 static void blkcg_destroy_blkgs(struct blkcg *blkcg)
1078 {
1079 might_sleep();
1080
1081 spin_lock_irq(&blkcg->lock);
1082
1083 while (!hlist_empty(&blkcg->blkg_list)) {
1084 struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
1085 struct blkcg_gq, blkcg_node);
1086 struct request_queue *q = blkg->q;
1087
1088 if (need_resched() || !spin_trylock(&q->queue_lock)) {
1089
1090
1091
1092
1093
1094 spin_unlock_irq(&blkcg->lock);
1095 cond_resched();
1096 spin_lock_irq(&blkcg->lock);
1097 continue;
1098 }
1099
1100 blkg_destroy(blkg);
1101 spin_unlock(&q->queue_lock);
1102 }
1103
1104 spin_unlock_irq(&blkcg->lock);
1105 }
1106
1107 /**
1108  * blkcg_pin_online - pin online state
1109  * @blkcg_css: blkcg of interest
1110  *
1111  * While pinned, a blkcg is kept online.  This is primarily used to
1112  * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline
1113  * while an associated cgwb is still active.
1114  */
1115 void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css)
1116 {
1117 refcount_inc(&css_to_blkcg(blkcg_css)->online_pin);
1118 }
1119
1120 /**
1121  * blkcg_unpin_online - unpin online state
1122  * @blkcg_css: blkcg of interest
1123  *
1124  * This is primarily used to impedance-match blkg and cgwb lifetimes so
1125  * that blkg doesn't go offline while an associated cgwb is still active.
1126  * When the pin count of a blkcg (or one of its ancestors) reaches zero,
1127  * blkg destruction for that blkcg proceeds via blkcg_destroy_blkgs().
1128  */
1129 void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css)
1130 {
1131 struct blkcg *blkcg = css_to_blkcg(blkcg_css);
1132
1133 do {
1134 if (!refcount_dec_and_test(&blkcg->online_pin))
1135 break;
1136 blkcg_destroy_blkgs(blkcg);
1137 blkcg = blkcg_parent(blkcg);
1138 } while (blkcg);
1139 }
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149 static void blkcg_css_offline(struct cgroup_subsys_state *css)
1150 {
1151
1152 wb_blkcg_offline(css);
1153
1154
1155 blkcg_unpin_online(css);
1156 }
1157
1158 static void blkcg_css_free(struct cgroup_subsys_state *css)
1159 {
1160 struct blkcg *blkcg = css_to_blkcg(css);
1161 int i;
1162
1163 mutex_lock(&blkcg_pol_mutex);
1164
1165 list_del(&blkcg->all_blkcgs_node);
1166
1167 for (i = 0; i < BLKCG_MAX_POLS; i++)
1168 if (blkcg->cpd[i])
1169 blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
1170
1171 mutex_unlock(&blkcg_pol_mutex);
1172
1173 kfree(blkcg);
1174 }
1175
1176 static struct cgroup_subsys_state *
1177 blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
1178 {
1179 struct blkcg *blkcg;
1180 struct cgroup_subsys_state *ret;
1181 int i;
1182
1183 mutex_lock(&blkcg_pol_mutex);
1184
1185 if (!parent_css) {
1186 blkcg = &blkcg_root;
1187 } else {
1188 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1189 if (!blkcg) {
1190 ret = ERR_PTR(-ENOMEM);
1191 goto unlock;
1192 }
1193 }
1194
1195 for (i = 0; i < BLKCG_MAX_POLS; i++) {
1196 struct blkcg_policy *pol = blkcg_policy[i];
1197 struct blkcg_policy_data *cpd;
1198
1199
1200
1201
1202
1203
1204
1205 if (!pol || !pol->cpd_alloc_fn)
1206 continue;
1207
1208 cpd = pol->cpd_alloc_fn(GFP_KERNEL);
1209 if (!cpd) {
1210 ret = ERR_PTR(-ENOMEM);
1211 goto free_pd_blkcg;
1212 }
1213 blkcg->cpd[i] = cpd;
1214 cpd->blkcg = blkcg;
1215 cpd->plid = i;
1216 if (pol->cpd_init_fn)
1217 pol->cpd_init_fn(cpd);
1218 }
1219
1220 spin_lock_init(&blkcg->lock);
1221 refcount_set(&blkcg->online_pin, 1);
1222 INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
1223 INIT_HLIST_HEAD(&blkcg->blkg_list);
1224 #ifdef CONFIG_CGROUP_WRITEBACK
1225 INIT_LIST_HEAD(&blkcg->cgwb_list);
1226 #endif
1227 list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
1228
1229 mutex_unlock(&blkcg_pol_mutex);
1230 return &blkcg->css;
1231
1232 free_pd_blkcg:
1233 for (i--; i >= 0; i--)
1234 if (blkcg->cpd[i])
1235 blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
1236
1237 if (blkcg != &blkcg_root)
1238 kfree(blkcg);
1239 unlock:
1240 mutex_unlock(&blkcg_pol_mutex);
1241 return ret;
1242 }
1243
1244 static int blkcg_css_online(struct cgroup_subsys_state *css)
1245 {
1246 struct blkcg *parent = blkcg_parent(css_to_blkcg(css));
1247
1248
1249
1250
1251
1252
1253 if (parent)
1254 blkcg_pin_online(css);
1255 return 0;
1256 }
1257
1258 /**
1259  * blkcg_init_queue - initialize blkcg part of request queue
1260  * @q: request_queue to initialize
1261  *
1262  * Called from blk_alloc_queue().  Responsible for initializing blkcg
1263  * part of new request_queue @q.
1264  *
1265  * RETURNS:
1266  * 0 on success, -errno on failure.
1267  */
1268 int blkcg_init_queue(struct request_queue *q)
1269 {
1270 struct blkcg_gq *new_blkg, *blkg;
1271 bool preloaded;
1272 int ret;
1273
1274 INIT_LIST_HEAD(&q->blkg_list);
1275
1276 new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
1277 if (!new_blkg)
1278 return -ENOMEM;
1279
1280 preloaded = !radix_tree_preload(GFP_KERNEL);
1281
1282
1283
1284 spin_lock_irq(&q->queue_lock);
1285 blkg = blkg_create(&blkcg_root, q, new_blkg);
1286 if (IS_ERR(blkg))
1287 goto err_unlock;
1288 q->root_blkg = blkg;
1289 spin_unlock_irq(&q->queue_lock);
1290
1291 if (preloaded)
1292 radix_tree_preload_end();
1293
1294 ret = blk_ioprio_init(q);
1295 if (ret)
1296 goto err_destroy_all;
1297
1298 ret = blk_throtl_init(q);
1299 if (ret)
1300 goto err_destroy_all;
1301
1302 ret = blk_iolatency_init(q);
1303 if (ret) {
1304 blk_throtl_exit(q);
1305 blk_ioprio_exit(q);
1306 goto err_destroy_all;
1307 }
1308
1309 return 0;
1310
1311 err_destroy_all:
1312 blkg_destroy_all(q);
1313 return ret;
1314 err_unlock:
1315 spin_unlock_irq(&q->queue_lock);
1316 if (preloaded)
1317 radix_tree_preload_end();
1318 return PTR_ERR(blkg);
1319 }
1320
1321
1322
1323
1324
1325
1326
1327 void blkcg_exit_queue(struct request_queue *q)
1328 {
1329 blkg_destroy_all(q);
1330 blk_throtl_exit(q);
1331 }
1332
1333 static void blkcg_bind(struct cgroup_subsys_state *root_css)
1334 {
1335 int i;
1336
1337 mutex_lock(&blkcg_pol_mutex);
1338
1339 for (i = 0; i < BLKCG_MAX_POLS; i++) {
1340 struct blkcg_policy *pol = blkcg_policy[i];
1341 struct blkcg *blkcg;
1342
1343 if (!pol || !pol->cpd_bind_fn)
1344 continue;
1345
1346 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
1347 if (blkcg->cpd[pol->plid])
1348 pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
1349 }
1350 mutex_unlock(&blkcg_pol_mutex);
1351 }
1352
1353 static void blkcg_exit(struct task_struct *tsk)
1354 {
1355 if (tsk->throttle_queue)
1356 blk_put_queue(tsk->throttle_queue);
1357 tsk->throttle_queue = NULL;
1358 }
1359
1360 struct cgroup_subsys io_cgrp_subsys = {
1361 .css_alloc = blkcg_css_alloc,
1362 .css_online = blkcg_css_online,
1363 .css_offline = blkcg_css_offline,
1364 .css_free = blkcg_css_free,
1365 .css_rstat_flush = blkcg_rstat_flush,
1366 .bind = blkcg_bind,
1367 .dfl_cftypes = blkcg_files,
1368 .legacy_cftypes = blkcg_legacy_files,
1369 .legacy_name = "blkio",
1370 .exit = blkcg_exit,
1371 #ifdef CONFIG_MEMCG
1372
1373
1374
1375
1376
1377 .depends_on = 1 << memory_cgrp_id,
1378 #endif
1379 };
1380 EXPORT_SYMBOL_GPL(io_cgrp_subsys);
1381
1382 /**
1383  * blkcg_activate_policy - activate a blkcg policy on a request_queue
1384  * @q: request_queue of interest
1385  * @pol: blkcg policy to activate
1386  *
1387  * Activate @pol on @q.  Requires %GFP_KERNEL context as it may sleep
1388  * allocating per-blkg policy data.
1389  *
1390  * For mq queues, @q is frozen while blkgs are being populated so that
1391  * nobody accesses the new policy data concurrently.  If @pol requires
1392  * per-blkg policy data, it is allocated and installed for every existing
1393  * blkg on @q, falling back to a %GFP_KERNEL prealloc-and-retry scheme
1394  * when the %GFP_NOWAIT allocation under the queue lock fails.
1395  *
1396  * Returns 0 on success, -errno on failure.
1397  */
1398 int blkcg_activate_policy(struct request_queue *q,
1399 const struct blkcg_policy *pol)
1400 {
1401 struct blkg_policy_data *pd_prealloc = NULL;
1402 struct blkcg_gq *blkg, *pinned_blkg = NULL;
1403 int ret;
1404
1405 if (blkcg_policy_enabled(q, pol))
1406 return 0;
1407
1408 if (queue_is_mq(q))
1409 blk_mq_freeze_queue(q);
1410 retry:
1411 spin_lock_irq(&q->queue_lock);
1412
1413
1414 list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
1415 struct blkg_policy_data *pd;
1416
1417 if (blkg->pd[pol->plid])
1418 continue;
1419
1420
1421 if (blkg == pinned_blkg) {
1422 pd = pd_prealloc;
1423 pd_prealloc = NULL;
1424 } else {
1425 pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q,
1426 blkg->blkcg);
1427 }
1428
1429 if (!pd) {
1430
1431
1432
1433
1434 if (pinned_blkg)
1435 blkg_put(pinned_blkg);
1436 blkg_get(blkg);
1437 pinned_blkg = blkg;
1438
1439 spin_unlock_irq(&q->queue_lock);
1440
1441 if (pd_prealloc)
1442 pol->pd_free_fn(pd_prealloc);
1443 pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q,
1444 blkg->blkcg);
1445 if (pd_prealloc)
1446 goto retry;
1447 else
1448 goto enomem;
1449 }
1450
1451 blkg->pd[pol->plid] = pd;
1452 pd->blkg = blkg;
1453 pd->plid = pol->plid;
1454 }
1455
1456
1457 if (pol->pd_init_fn)
1458 list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
1459 pol->pd_init_fn(blkg->pd[pol->plid]);
1460
1461 __set_bit(pol->plid, q->blkcg_pols);
1462 ret = 0;
1463
1464 spin_unlock_irq(&q->queue_lock);
1465 out:
1466 if (queue_is_mq(q))
1467 blk_mq_unfreeze_queue(q);
1468 if (pinned_blkg)
1469 blkg_put(pinned_blkg);
1470 if (pd_prealloc)
1471 pol->pd_free_fn(pd_prealloc);
1472 return ret;
1473
1474 enomem:
1475
1476 spin_lock_irq(&q->queue_lock);
1477 list_for_each_entry(blkg, &q->blkg_list, q_node) {
1478 struct blkcg *blkcg = blkg->blkcg;
1479
1480 spin_lock(&blkcg->lock);
1481 if (blkg->pd[pol->plid]) {
1482 pol->pd_free_fn(blkg->pd[pol->plid]);
1483 blkg->pd[pol->plid] = NULL;
1484 }
1485 spin_unlock(&blkcg->lock);
1486 }
1487 spin_unlock_irq(&q->queue_lock);
1488 ret = -ENOMEM;
1489 goto out;
1490 }
1491 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
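/*
 * Typical usage sketch: a policy activates itself when it is enabled on
 * a queue and deactivates on teardown.  "blkcg_policy_foo" and the
 * foo_init_queue()/foo_exit_queue() hooks are hypothetical:
 *
 *	static int foo_init_queue(struct request_queue *q)
 *	{
 *		return blkcg_activate_policy(q, &blkcg_policy_foo);
 *	}
 *
 *	static void foo_exit_queue(struct request_queue *q)
 *	{
 *		blkcg_deactivate_policy(q, &blkcg_policy_foo);
 *	}
 */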
1492
1493 /**
1494  * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
1495  * @q: request_queue of interest
1496  * @pol: blkcg policy to deactivate
1497  *
1498  * Deactivate @pol on @q.  Follows the same synchronization rules as
1499  * blkcg_activate_policy().
1500  */
1501 void blkcg_deactivate_policy(struct request_queue *q,
1502 const struct blkcg_policy *pol)
1503 {
1504 struct blkcg_gq *blkg;
1505
1506 if (!blkcg_policy_enabled(q, pol))
1507 return;
1508
1509 if (queue_is_mq(q))
1510 blk_mq_freeze_queue(q);
1511
1512 spin_lock_irq(&q->queue_lock);
1513
1514 __clear_bit(pol->plid, q->blkcg_pols);
1515
1516 list_for_each_entry(blkg, &q->blkg_list, q_node) {
1517 struct blkcg *blkcg = blkg->blkcg;
1518
1519 spin_lock(&blkcg->lock);
1520 if (blkg->pd[pol->plid]) {
1521 if (pol->pd_offline_fn)
1522 pol->pd_offline_fn(blkg->pd[pol->plid]);
1523 pol->pd_free_fn(blkg->pd[pol->plid]);
1524 blkg->pd[pol->plid] = NULL;
1525 }
1526 spin_unlock(&blkcg->lock);
1527 }
1528
1529 spin_unlock_irq(&q->queue_lock);
1530
1531 if (queue_is_mq(q))
1532 blk_mq_unfreeze_queue(q);
1533 }
1534 EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
1535
1536 static void blkcg_free_all_cpd(struct blkcg_policy *pol)
1537 {
1538 struct blkcg *blkcg;
1539
1540 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1541 if (blkcg->cpd[pol->plid]) {
1542 pol->cpd_free_fn(blkcg->cpd[pol->plid]);
1543 blkcg->cpd[pol->plid] = NULL;
1544 }
1545 }
1546 }
1547
1548 /**
1549  * blkcg_policy_register - register a blkcg policy
1550  * @pol: blkcg policy to register
1551  *
1552  * Register @pol with blkcg core.  Might sleep and @pol may be modified on
1553  * successful registration.  Returns 0 on success and -errno on failure.
1554  */
1555 int blkcg_policy_register(struct blkcg_policy *pol)
1556 {
1557 struct blkcg *blkcg;
1558 int i, ret;
1559
1560 mutex_lock(&blkcg_pol_register_mutex);
1561 mutex_lock(&blkcg_pol_mutex);
1562
1563
1564 ret = -ENOSPC;
1565 for (i = 0; i < BLKCG_MAX_POLS; i++)
1566 if (!blkcg_policy[i])
1567 break;
1568 if (i >= BLKCG_MAX_POLS) {
1569 pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
1570 goto err_unlock;
1571 }
1572
1573
1574 if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
1575 (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
1576 goto err_unlock;
1577
1578
1579 pol->plid = i;
1580 blkcg_policy[pol->plid] = pol;
1581
1582
1583 if (pol->cpd_alloc_fn) {
1584 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1585 struct blkcg_policy_data *cpd;
1586
1587 cpd = pol->cpd_alloc_fn(GFP_KERNEL);
1588 if (!cpd)
1589 goto err_free_cpds;
1590
1591 blkcg->cpd[pol->plid] = cpd;
1592 cpd->blkcg = blkcg;
1593 cpd->plid = pol->plid;
1594 if (pol->cpd_init_fn)
1595 pol->cpd_init_fn(cpd);
1596 }
1597 }
1598
1599 mutex_unlock(&blkcg_pol_mutex);
1600
1601
1602 if (pol->dfl_cftypes)
1603 WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
1604 pol->dfl_cftypes));
1605 if (pol->legacy_cftypes)
1606 WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
1607 pol->legacy_cftypes));
1608 mutex_unlock(&blkcg_pol_register_mutex);
1609 return 0;
1610
1611 err_free_cpds:
1612 if (pol->cpd_free_fn)
1613 blkcg_free_all_cpd(pol);
1614
1615 blkcg_policy[pol->plid] = NULL;
1616 err_unlock:
1617 mutex_unlock(&blkcg_pol_mutex);
1618 mutex_unlock(&blkcg_pol_register_mutex);
1619 return ret;
1620 }
1621 EXPORT_SYMBOL_GPL(blkcg_policy_register);
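/*
 * Sketch of a policy registering itself (names are hypothetical; see
 * real policies such as blk-iolatency or blk-iocost for working
 * examples):
 *
 *	static struct blkcg_policy blkcg_policy_foo = {
 *		.dfl_cftypes	= foo_files,
 *		.pd_alloc_fn	= foo_pd_alloc,
 *		.pd_init_fn	= foo_pd_init,
 *		.pd_free_fn	= foo_pd_free,
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return blkcg_policy_register(&blkcg_policy_foo);
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		blkcg_policy_unregister(&blkcg_policy_foo);
 *	}
 */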
1622
1623 /**
1624  * blkcg_policy_unregister - unregister a blkcg policy
1625  * @pol: blkcg policy to unregister
1626  *
1627  * Undo blkcg_policy_register(@pol).  Might sleep.
1628  */
1629 void blkcg_policy_unregister(struct blkcg_policy *pol)
1630 {
1631 mutex_lock(&blkcg_pol_register_mutex);
1632
1633 if (WARN_ON(blkcg_policy[pol->plid] != pol))
1634 goto out_unlock;
1635
1636
1637 if (pol->dfl_cftypes)
1638 cgroup_rm_cftypes(pol->dfl_cftypes);
1639 if (pol->legacy_cftypes)
1640 cgroup_rm_cftypes(pol->legacy_cftypes);
1641
1642
1643 mutex_lock(&blkcg_pol_mutex);
1644
1645 if (pol->cpd_free_fn)
1646 blkcg_free_all_cpd(pol);
1647
1648 blkcg_policy[pol->plid] = NULL;
1649
1650 mutex_unlock(&blkcg_pol_mutex);
1651 out_unlock:
1652 mutex_unlock(&blkcg_pol_register_mutex);
1653 }
1654 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
1655
1656 bool __blkcg_punt_bio_submit(struct bio *bio)
1657 {
1658 struct blkcg_gq *blkg = bio->bi_blkg;
1659
1660
1661 bio->bi_opf &= ~REQ_CGROUP_PUNT;
1662
1663
1664 if (!blkg->parent)
1665 return false;
1666
1667 spin_lock_bh(&blkg->async_bio_lock);
1668 bio_list_add(&blkg->async_bios, bio);
1669 spin_unlock_bh(&blkg->async_bio_lock);
1670
1671 queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
1672 return true;
1673 }
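/*
 * The blkcg_punt_bio_submit() wrapper (in blk-cgroup.h) presumably gates
 * this on REQ_CGROUP_PUNT so the common submission path stays cheap,
 * along the lines of:
 *
 *	if (bio->bi_opf & REQ_CGROUP_PUNT)
 *		return __blkcg_punt_bio_submit(bio);
 *	return false;
 */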
1674
1675
1676
1677
1678
1679
1680
1681 static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
1682 {
1683 u64 old = atomic64_read(&blkg->delay_start);
1684
1685
1686 if (atomic_read(&blkg->use_delay) < 0)
1687 return;
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702 if (time_before64(old + NSEC_PER_SEC, now) &&
1703 atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) {
1704 u64 cur = atomic64_read(&blkg->delay_nsec);
1705 u64 sub = min_t(u64, blkg->last_delay, now - old);
1706 int cur_use = atomic_read(&blkg->use_delay);
1707
1708
1709
1710
1711
1712 if (cur_use < blkg->last_use)
1713 sub = max_t(u64, sub, blkg->last_delay >> 1);
1714
1715
1716
1717
1718
1719
1720
1721 if (unlikely(cur < sub)) {
1722 atomic64_set(&blkg->delay_nsec, 0);
1723 blkg->last_delay = 0;
1724 } else {
1725 atomic64_sub(sub, &blkg->delay_nsec);
1726 blkg->last_delay = cur - sub;
1727 }
1728 blkg->last_use = cur_use;
1729 }
1730 }
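/*
 * Worked example for the decay above (arbitrary numbers): with
 * delay_nsec at 3 * NSEC_PER_SEC, last_delay at 1 * NSEC_PER_SEC and
 * more than a second elapsed since delay_start, sub becomes
 * min(last_delay, now - old) = 1s, delay_nsec decays to 2s and
 * last_delay is set to 2s for the next window.  If use_delay dropped
 * since the previous window, at least half of last_delay is shaved off
 * instead.
 */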
1731
1732
1733
1734
1735
1736
1737
1738 static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
1739 {
1740 unsigned long pflags;
1741 bool clamp;
1742 u64 now = ktime_to_ns(ktime_get());
1743 u64 exp;
1744 u64 delay_nsec = 0;
1745 int tok;
1746
1747 while (blkg->parent) {
1748 int use_delay = atomic_read(&blkg->use_delay);
1749
1750 if (use_delay) {
1751 u64 this_delay;
1752
1753 blkcg_scale_delay(blkg, now);
1754 this_delay = atomic64_read(&blkg->delay_nsec);
1755 if (this_delay > delay_nsec) {
1756 delay_nsec = this_delay;
1757 clamp = use_delay > 0;
1758 }
1759 }
1760 blkg = blkg->parent;
1761 }
1762
1763 if (!delay_nsec)
1764 return;
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775 if (clamp)
1776 delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
1777
1778 if (use_memdelay)
1779 psi_memstall_enter(&pflags);
1780
1781 exp = ktime_add_ns(now, delay_nsec);
1782 tok = io_schedule_prepare();
1783 do {
1784 __set_current_state(TASK_KILLABLE);
1785 if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
1786 break;
1787 } while (!fatal_signal_pending(current));
1788 io_schedule_finish(tok);
1789
1790 if (use_memdelay)
1791 psi_memstall_leave(&pflags);
1792 }
1793
1794 /**
1795  * blkcg_maybe_throttle_current - throttle the current task if it has been marked
1796  *
1797  * This is only called if we've been marked with set_notify_resume().  Obviously
1798  * we can be set_notify_resume() for reasons other than blkcg throttling, so we
1799  * check to see if current->throttle_queue is set and if not this doesn't do
1800  * anything.  This should only ever be called by the resume code, it's not meant
1801  * to be called by people willy-nilly as it will actually do the work to
1802  * throttle the task if it is setup for throttling.
1803  */
1804 void blkcg_maybe_throttle_current(void)
1805 {
1806 struct request_queue *q = current->throttle_queue;
1807 struct blkcg *blkcg;
1808 struct blkcg_gq *blkg;
1809 bool use_memdelay = current->use_memdelay;
1810
1811 if (!q)
1812 return;
1813
1814 current->throttle_queue = NULL;
1815 current->use_memdelay = false;
1816
1817 rcu_read_lock();
1818 blkcg = css_to_blkcg(blkcg_css());
1819 if (!blkcg)
1820 goto out;
1821 blkg = blkg_lookup(blkcg, q);
1822 if (!blkg)
1823 goto out;
1824 if (!blkg_tryget(blkg))
1825 goto out;
1826 rcu_read_unlock();
1827
1828 blkcg_maybe_throttle_blkg(blkg, use_memdelay);
1829 blkg_put(blkg);
1830 blk_put_queue(q);
1831 return;
1832 out:
1833 rcu_read_unlock();
1834 blk_put_queue(q);
1835 }
1836
1837 /**
1838  * blkcg_schedule_throttle - this task needs to check for throttling
1839  * @q: the request queue IO was submitted on
1840  * @use_memdelay: do we charge this to memory delay for PSI
1841  *
1842  * This is called by the IO controller when we know there's delay accumulated
1843  * for the blkg for this task.  We do not pass the blkg because there are places
1844  * we call this that may not have that information, the swapping code for
1845  * instance will only have a request_queue at that point.  This sets the
1846  * notify_resume for the task to check and see if it requires throttling before
1847  * returning to user space.
1848  *
1849  * We will only schedule once per syscall.  You can call this over and over
1850  * again and it will only do the check once upon return to user space, and only
1851  * throttle once.  If the task needs to be throttled again it'll need to be
1852  * rescheduled with another call to this function.
1853  */
1854 void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
1855 {
1856 if (unlikely(current->flags & PF_KTHREAD))
1857 return;
1858
1859 if (current->throttle_queue != q) {
1860 if (!blk_get_queue(q))
1861 return;
1862
1863 if (current->throttle_queue)
1864 blk_put_queue(current->throttle_queue);
1865 current->throttle_queue = q;
1866 }
1867
1868 if (use_memdelay)
1869 current->use_memdelay = use_memdelay;
1870 set_notify_resume(current);
1871 }
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882 void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
1883 {
1884 if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
1885 return;
1886 blkcg_scale_delay(blkg, now);
1887 atomic64_add(delta, &blkg->delay_nsec);
1888 }
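/*
 * Sketch of how an IO controller is expected to combine the two hooks
 * above (the condition and penalty are hypothetical):
 *
 *	u64 now = ktime_to_ns(ktime_get());
 *
 *	if (foo_latency_target_missed(blkg)) {
 *		blkcg_add_delay(blkg, now, penalty_nsec);
 *		blkcg_schedule_throttle(blkg->q, true);
 *	}
 *
 * The accumulated delay is then applied on return to user space via
 * blkcg_maybe_throttle_current().
 */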
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899 static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
1900 struct cgroup_subsys_state *css)
1901 {
1902 struct blkcg_gq *blkg, *ret_blkg = NULL;
1903
1904 rcu_read_lock();
1905 blkg = blkg_lookup_create(css_to_blkcg(css),
1906 bdev_get_queue(bio->bi_bdev));
1907 while (blkg) {
1908 if (blkg_tryget(blkg)) {
1909 ret_blkg = blkg;
1910 break;
1911 }
1912 blkg = blkg->parent;
1913 }
1914 rcu_read_unlock();
1915
1916 return ret_blkg;
1917 }
1918
1919 /**
1920  * bio_associate_blkg_from_css - associate a bio with a specified css
1921  * @bio: target bio
1922  * @css: target css
1923  *
1924  * Associate @bio with the blkg found by combining the css's blkg and the
1925  * request_queue of the @bio.  An association failure is handled by walking
1926  * up the blkg tree.  Therefore, the blkg associated can be anything between
1927  * the target blkg and the root_blkg.  This situation only happens when a
1928  * cgroup is dying and then the remaining bios will spill to the closest
1929  * alive blkg.
1930  *
1931  * A reference will be taken on the blkg and released when @bio is freed.
1932  */
1933 void bio_associate_blkg_from_css(struct bio *bio,
1934 struct cgroup_subsys_state *css)
1935 {
1936 if (bio->bi_blkg)
1937 blkg_put(bio->bi_blkg);
1938
1939 if (css && css->parent) {
1940 bio->bi_blkg = blkg_tryget_closest(bio, css);
1941 } else {
1942 blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg);
1943 bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg;
1944 }
1945 }
1946 EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957 void bio_associate_blkg(struct bio *bio)
1958 {
1959 struct cgroup_subsys_state *css;
1960
1961 rcu_read_lock();
1962
1963 if (bio->bi_blkg)
1964 css = bio_blkcg_css(bio);
1965 else
1966 css = blkcg_css();
1967
1968 bio_associate_blkg_from_css(bio, css);
1969
1970 rcu_read_unlock();
1971 }
1972 EXPORT_SYMBOL_GPL(bio_associate_blkg);
1973
1974
1975
1976
1977
1978
1979 void bio_clone_blkg_association(struct bio *dst, struct bio *src)
1980 {
1981 if (src->bi_blkg)
1982 bio_associate_blkg_from_css(dst, bio_blkcg_css(src));
1983 }
1984 EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
1985
1986 static int blk_cgroup_io_type(struct bio *bio)
1987 {
1988 if (op_is_discard(bio->bi_opf))
1989 return BLKG_IOSTAT_DISCARD;
1990 if (op_is_write(bio->bi_opf))
1991 return BLKG_IOSTAT_WRITE;
1992 return BLKG_IOSTAT_READ;
1993 }
1994
1995 void blk_cgroup_bio_start(struct bio *bio)
1996 {
1997 int rwd = blk_cgroup_io_type(bio), cpu;
1998 struct blkg_iostat_set *bis;
1999 unsigned long flags;
2000
2001 cpu = get_cpu();
2002 bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
2003 flags = u64_stats_update_begin_irqsave(&bis->sync);
2004
2005
2006
2007
2008
2009 if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
2010 bio_set_flag(bio, BIO_CGROUP_ACCT);
2011 bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
2012 }
2013 bis->cur.ios[rwd]++;
2014
2015 u64_stats_update_end_irqrestore(&bis->sync, flags);
2016 if (cgroup_subsys_on_dfl(io_cgrp_subsys))
2017 cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
2018 put_cpu();
2019 }
2020
2021 bool blk_cgroup_congested(void)
2022 {
2023 struct cgroup_subsys_state *css;
2024 bool ret = false;
2025
2026 rcu_read_lock();
2027 for (css = blkcg_css(); css; css = css->parent) {
2028 if (atomic_read(&css->cgroup->congestion_count)) {
2029 ret = true;
2030 break;
2031 }
2032 }
2033 rcu_read_unlock();
2034 return ret;
2035 }
2036
2037 static int __init blkcg_init(void)
2038 {
2039 blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
2040 WQ_MEM_RECLAIM | WQ_FREEZABLE |
2041 WQ_UNBOUND | WQ_SYSFS, 0);
2042 if (!blkcg_punt_bio_wq)
2043 return -ENOMEM;
2044 return 0;
2045 }
2046 subsys_initcall(blkcg_init);
2047
2048 module_param(blkcg_debug_stats, bool, 0644);
2049 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
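/*
 * blkcg_debug_stats can be toggled at runtime; with a typical build the
 * knob most likely appears as
 * /sys/module/blk_cgroup/parameters/blkcg_debug_stats (the exact path
 * depends on how this file is built into the kernel), e.g.:
 *
 *	echo 1 > /sys/module/blk_cgroup/parameters/blkcg_debug_stats
 *
 * which makes io.stat include the use_delay/delay_nsec fields whenever a
 * delay is being applied to a blkg.
 */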