// SPDX-License-Identifier: GPL-2.0
/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include "blk.h"
#include "blk-cgroup-rwstat.h"
#include "blk-stat.h"
#include "blk-throttle.h"

/* Max dispatch from a group in 1 round */
#define THROTL_GRP_QUANTUM 8

/* Total max dispatch from all groups in 1 round */
#define THROTL_QUANTUM 32

/* Throttling is performed over a slice and after that slice is renewed */
#define DFL_THROTL_SLICE_HD (HZ / 10)
#define DFL_THROTL_SLICE_SSD (HZ / 50)
#define MAX_THROTL_SLICE (HZ)
#define MAX_IDLE_TIME (5L * 1000 * 1000)
#define MIN_THROTL_BPS (320 * 1024)
#define MIN_THROTL_IOPS (10)
#define DFL_LATENCY_TARGET (-1L)
#define DFL_IDLE_THRESHOLD (0)
#define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */
#define LATENCY_FILTERED_SSD (0)
/*
 * For HD, very small latency comes from sequential IO. Such IO is helpless to
 * help determine if its IO is impacted by others, hence we ignore the IO
 */
#define LATENCY_FILTERED_HD (1000L) /* 1ms */

/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;

#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)

/* We measure latency for request size from <= 4k to >= 1M */
#define LATENCY_BUCKET_SIZE 9
0048
0049 struct latency_bucket {
0050 unsigned long total_latency;
0051 int samples;
0052 };
0053
0054 struct avg_latency_bucket {
0055 unsigned long latency;
0056 bool valid;
0057 };
0058
0059 struct throtl_data
0060 {
	/* service tree for active throtl groups */
0062 struct throtl_service_queue service_queue;
0063
0064 struct request_queue *queue;
0065
	/* Total Number of queued bios on READ and WRITE lists */
0067 unsigned int nr_queued[2];
0068
0069 unsigned int throtl_slice;
0070
	/* Work for dispatching throttled bios */
0072 struct work_struct dispatch_work;
0073 unsigned int limit_index;
0074 bool limit_valid[LIMIT_CNT];
0075
0076 unsigned long low_upgrade_time;
0077 unsigned long low_downgrade_time;
0078
0079 unsigned int scale;
0080
0081 struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
0082 struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
0083 struct latency_bucket __percpu *latency_buckets[2];
0084 unsigned long last_calculate_time;
0085 unsigned long filtered_latency;
0086
0087 bool track_bio_latency;
0088 };
0089
0090 static void throtl_pending_timer_fn(struct timer_list *t);
0091
0092 static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
0093 {
0094 return pd_to_blkg(&tg->pd);
0095 }
0096
/*
 * sq_to_tg - return the throtl_grp the specified service queue belongs to
 * @sq: the throtl_service_queue of interest
 *
 * Return the throtl_grp @sq belongs to.  If @sq is the top-level one
 * embedded in throtl_data, %NULL is returned.
 */
0104 static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
0105 {
0106 if (sq && sq->parent_sq)
0107 return container_of(sq, struct throtl_grp, service_queue);
0108 else
0109 return NULL;
0110 }
0111
/*
 * sq_to_td - return throtl_data the specified service queue belongs to
 * @sq: the throtl_service_queue of interest
 *
 * A service_queue can be embedded in either a throtl_grp or throtl_data.
 * Determine the associated throtl_data accordingly and return it.
 */
0119 static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
0120 {
0121 struct throtl_grp *tg = sq_to_tg(sq);
0122
0123 if (tg)
0124 return tg->td;
0125 else
0126 return container_of(sq, struct throtl_data, service_queue);
0127 }
0128
/*
 * When the device has been upgraded to LIMIT_MAX but a cgroup still has a
 * low limit configured, its effective limit is scaled up gradually: every
 * throtl_slice since the upgrade adds another half of the low limit, until
 * the configured max limit caps it.  This smooths the transition between
 * the two limit states.
 */
0137 static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
0138 {
0139
0140 if (td->scale < 4096 && time_after_eq(jiffies,
0141 td->low_upgrade_time + td->scale * td->throtl_slice))
0142 td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;
0143
0144 return low + (low >> 1) * td->scale;
0145 }
0146
0147 static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
0148 {
0149 struct blkcg_gq *blkg = tg_to_blkg(tg);
0150 struct throtl_data *td;
0151 uint64_t ret;
0152
0153 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
0154 return U64_MAX;
0155
0156 td = tg->td;
0157 ret = tg->bps[rw][td->limit_index];
0158 if (ret == 0 && td->limit_index == LIMIT_LOW) {
		/* intermediate node or iops isn't 0 */
0160 if (!list_empty(&blkg->blkcg->css.children) ||
0161 tg->iops[rw][td->limit_index])
0162 return U64_MAX;
0163 else
0164 return MIN_THROTL_BPS;
0165 }
0166
0167 if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
0168 tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
0169 uint64_t adjusted;
0170
0171 adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td);
0172 ret = min(tg->bps[rw][LIMIT_MAX], adjusted);
0173 }
0174 return ret;
0175 }
0176
0177 static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
0178 {
0179 struct blkcg_gq *blkg = tg_to_blkg(tg);
0180 struct throtl_data *td;
0181 unsigned int ret;
0182
0183 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
0184 return UINT_MAX;
0185
0186 td = tg->td;
0187 ret = tg->iops[rw][td->limit_index];
0188 if (ret == 0 && tg->td->limit_index == LIMIT_LOW) {
		/* intermediate node or bps isn't 0 */
0190 if (!list_empty(&blkg->blkcg->css.children) ||
0191 tg->bps[rw][td->limit_index])
0192 return UINT_MAX;
0193 else
0194 return MIN_THROTL_IOPS;
0195 }
0196
0197 if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
0198 tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
0199 uint64_t adjusted;
0200
0201 adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td);
0202 if (adjusted > UINT_MAX)
0203 adjusted = UINT_MAX;
0204 ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted);
0205 }
0206 return ret;
0207 }
0208
0209 #define request_bucket_index(sectors) \
0210 clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)
0211
/**
 * throtl_log - log debug message via blktrace
 * @sq: the service_queue being reported
 * @fmt: printf format string
 * @args: printf args
 *
 * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
 * throtl_grp; otherwise, just "throtl".
 */
0221 #define throtl_log(sq, fmt, args...) do { \
0222 struct throtl_grp *__tg = sq_to_tg((sq)); \
0223 struct throtl_data *__td = sq_to_td((sq)); \
0224 \
0225 (void)__td; \
0226 if (likely(!blk_trace_note_message_enabled(__td->queue))) \
0227 break; \
0228 if ((__tg)) { \
0229 blk_add_cgroup_trace_msg(__td->queue, \
0230 &tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\
0231 } else { \
0232 blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \
0233 } \
0234 } while (0)
0235
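/*
 * Bytes to charge for @bio.  Discards are charged a flat 512 bytes
 * regardless of their size.
 */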
0236 static inline unsigned int throtl_bio_data_size(struct bio *bio)
0237 {
0238
0239 if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
0240 return 512;
0241 return bio->bi_iter.bi_size;
0242 }
0243
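/* Initialize @qn: empty bio list, not on any queued list, owned by @tg. */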
0244 static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
0245 {
0246 INIT_LIST_HEAD(&qn->node);
0247 bio_list_init(&qn->bios);
0248 qn->tg = tg;
0249 }
0250
/**
 * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
 * @bio: bio being added
 * @qn: qnode to add bio to
 * @queued: the service_queue->queued[] list @qn belongs to
 *
 * Add @bio to @qn and put @qn on @queued if it's not already on.
 * @qn->tg's reference count is bumped when @qn is activated.
 */
0261 static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
0262 struct list_head *queued)
0263 {
0264 bio_list_add(&qn->bios, bio);
0265 if (list_empty(&qn->node)) {
0266 list_add_tail(&qn->node, queued);
0267 blkg_get(tg_to_blkg(qn->tg));
0268 }
0269 }
0270
/**
 * throtl_peek_queued - peek the first bio on a qnode list
 * @queued: the qnode list to peek
 */
0275 static struct bio *throtl_peek_queued(struct list_head *queued)
0276 {
0277 struct throtl_qnode *qn;
0278 struct bio *bio;
0279
0280 if (list_empty(queued))
0281 return NULL;
0282
0283 qn = list_first_entry(queued, struct throtl_qnode, node);
0284 bio = bio_list_peek(&qn->bios);
0285 WARN_ON_ONCE(!bio);
0286 return bio;
0287 }
0288
/**
 * throtl_pop_queued - pop the first bio from a qnode list
 * @queued: the qnode list to pop a bio from
 * @tg_to_put: optional out argument for throtl_grp to put
 *
 * Pop the first bio from the qnode list @queued.  After popping, the first
 * qnode is removed from @queued if empty or moved to the end of @queued so
 * that the popping order is round-robin.
 *
 * When the first qnode is removed, its associated throtl_grp should be put
 * too.  If @tg_to_put is NULL, this function automatically puts it;
 * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
 * responsible for putting it.
 */
0303 static struct bio *throtl_pop_queued(struct list_head *queued,
0304 struct throtl_grp **tg_to_put)
0305 {
0306 struct throtl_qnode *qn;
0307 struct bio *bio;
0308
0309 if (list_empty(queued))
0310 return NULL;
0311
0312 qn = list_first_entry(queued, struct throtl_qnode, node);
0313 bio = bio_list_pop(&qn->bios);
0314 WARN_ON_ONCE(!bio);
0315
0316 if (bio_list_empty(&qn->bios)) {
0317 list_del_init(&qn->node);
0318 if (tg_to_put)
0319 *tg_to_put = qn->tg;
0320 else
0321 blkg_put(tg_to_blkg(qn->tg));
0322 } else {
0323 list_move_tail(&qn->node, queued);
0324 }
0325
0326 return bio;
0327 }
0328
0329
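/* init a service_queue, assumes the caller zeroed it */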
0330 static void throtl_service_queue_init(struct throtl_service_queue *sq)
0331 {
0332 INIT_LIST_HEAD(&sq->queued[0]);
0333 INIT_LIST_HEAD(&sq->queued[1]);
0334 sq->pending_tree = RB_ROOT_CACHED;
0335 timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
0336 }
0337
0338 static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp,
0339 struct request_queue *q,
0340 struct blkcg *blkcg)
0341 {
0342 struct throtl_grp *tg;
0343 int rw;
0344
0345 tg = kzalloc_node(sizeof(*tg), gfp, q->node);
0346 if (!tg)
0347 return NULL;
0348
0349 if (blkg_rwstat_init(&tg->stat_bytes, gfp))
0350 goto err_free_tg;
0351
0352 if (blkg_rwstat_init(&tg->stat_ios, gfp))
0353 goto err_exit_stat_bytes;
0354
0355 throtl_service_queue_init(&tg->service_queue);
0356
0357 for (rw = READ; rw <= WRITE; rw++) {
0358 throtl_qnode_init(&tg->qnode_on_self[rw], tg);
0359 throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
0360 }
0361
0362 RB_CLEAR_NODE(&tg->rb_node);
0363 tg->bps[READ][LIMIT_MAX] = U64_MAX;
0364 tg->bps[WRITE][LIMIT_MAX] = U64_MAX;
0365 tg->iops[READ][LIMIT_MAX] = UINT_MAX;
0366 tg->iops[WRITE][LIMIT_MAX] = UINT_MAX;
0367 tg->bps_conf[READ][LIMIT_MAX] = U64_MAX;
0368 tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
0369 tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
0370 tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
0371
0372
0373 tg->latency_target = DFL_LATENCY_TARGET;
0374 tg->latency_target_conf = DFL_LATENCY_TARGET;
0375 tg->idletime_threshold = DFL_IDLE_THRESHOLD;
0376 tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
0377
0378 return &tg->pd;
0379
0380 err_exit_stat_bytes:
0381 blkg_rwstat_exit(&tg->stat_bytes);
0382 err_free_tg:
0383 kfree(tg);
0384 return NULL;
0385 }
0386
0387 static void throtl_pd_init(struct blkg_policy_data *pd)
0388 {
0389 struct throtl_grp *tg = pd_to_tg(pd);
0390 struct blkcg_gq *blkg = tg_to_blkg(tg);
0391 struct throtl_data *td = blkg->q->td;
0392 struct throtl_service_queue *sq = &tg->service_queue;
0393
	/*
	 * If on the default hierarchy, we switch to properly hierarchical
	 * behavior where limits on a given throtl_grp are applied to the
	 * whole subtree rather than just the group itself.  e.g. If 16M
	 * read_bps limit is set on the root group, the whole system can't
	 * exceed 16M for the device.
	 *
	 * If not on the default hierarchy, the broken flat hierarchy
	 * behavior is retained where all throtl_grps are treated as if
	 * they're all separate root groups right below throtl_data.
	 * Limits of a group don't interact with limits of other groups
	 * regardless of the position of the group in the hierarchy.
	 */
0407 sq->parent_sq = &td->service_queue;
0408 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
0409 sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
0410 tg->td = td;
0411 }
0412
/*
 * Set has_rules[] if @tg or any of its parents have limits configured.
 * This doesn't require walking up to the top of the hierarchy as the
 * parent's has_rules[] is guaranteed to be correct.
 */
0418 static void tg_update_has_rules(struct throtl_grp *tg)
0419 {
0420 struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
0421 struct throtl_data *td = tg->td;
0422 int rw;
0423 int has_iops_limit = 0;
0424
0425 for (rw = READ; rw <= WRITE; rw++) {
0426 unsigned int iops_limit = tg_iops_limit(tg, rw);
0427
0428 tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
0429 (td->limit_valid[td->limit_index] &&
0430 (tg_bps_limit(tg, rw) != U64_MAX ||
0431 iops_limit != UINT_MAX));
0432
0433 if (iops_limit != UINT_MAX)
0434 has_iops_limit = 1;
0435 }
0436
0437 if (has_iops_limit)
0438 tg->flags |= THROTL_TG_HAS_IOPS_LIMIT;
0439 else
0440 tg->flags &= ~THROTL_TG_HAS_IOPS_LIMIT;
0441 }
0442
0443 static void throtl_pd_online(struct blkg_policy_data *pd)
0444 {
0445 struct throtl_grp *tg = pd_to_tg(pd);
0446
	/*
	 * We don't want new groups to escape the limits of their ancestors.
	 * Update has_rules[] after a new group is brought online.
	 */
0450 tg_update_has_rules(tg);
0451 }
0452
0453 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
0454 static void blk_throtl_update_limit_valid(struct throtl_data *td)
0455 {
0456 struct cgroup_subsys_state *pos_css;
0457 struct blkcg_gq *blkg;
0458 bool low_valid = false;
0459
0460 rcu_read_lock();
0461 blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
0462 struct throtl_grp *tg = blkg_to_tg(blkg);
0463
0464 if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
0465 tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) {
0466 low_valid = true;
0467 break;
0468 }
0469 }
0470 rcu_read_unlock();
0471
0472 td->limit_valid[LIMIT_LOW] = low_valid;
0473 }
0474 #else
0475 static inline void blk_throtl_update_limit_valid(struct throtl_data *td)
0476 {
0477 }
0478 #endif
0479
0480 static void throtl_upgrade_state(struct throtl_data *td);
0481 static void throtl_pd_offline(struct blkg_policy_data *pd)
0482 {
0483 struct throtl_grp *tg = pd_to_tg(pd);
0484
0485 tg->bps[READ][LIMIT_LOW] = 0;
0486 tg->bps[WRITE][LIMIT_LOW] = 0;
0487 tg->iops[READ][LIMIT_LOW] = 0;
0488 tg->iops[WRITE][LIMIT_LOW] = 0;
0489
0490 blk_throtl_update_limit_valid(tg->td);
0491
0492 if (!tg->td->limit_valid[tg->td->limit_index])
0493 throtl_upgrade_state(tg->td);
0494 }
0495
0496 static void throtl_pd_free(struct blkg_policy_data *pd)
0497 {
0498 struct throtl_grp *tg = pd_to_tg(pd);
0499
0500 del_timer_sync(&tg->service_queue.pending_timer);
0501 blkg_rwstat_exit(&tg->stat_bytes);
0502 blkg_rwstat_exit(&tg->stat_ios);
0503 kfree(tg);
0504 }
0505
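/* Return the pending group with the earliest dispatch time, or NULL. */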
0506 static struct throtl_grp *
0507 throtl_rb_first(struct throtl_service_queue *parent_sq)
0508 {
0509 struct rb_node *n;
0510
0511 n = rb_first_cached(&parent_sq->pending_tree);
0512 WARN_ON_ONCE(!n);
0513 if (!n)
0514 return NULL;
0515 return rb_entry_tg(n);
0516 }
0517
0518 static void throtl_rb_erase(struct rb_node *n,
0519 struct throtl_service_queue *parent_sq)
0520 {
0521 rb_erase_cached(n, &parent_sq->pending_tree);
0522 RB_CLEAR_NODE(n);
0523 --parent_sq->nr_pending;
0524 }
0525
0526 static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
0527 {
0528 struct throtl_grp *tg;
0529
0530 tg = throtl_rb_first(parent_sq);
0531 if (!tg)
0532 return;
0533
0534 parent_sq->first_pending_disptime = tg->disptime;
0535 }
0536
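/*
 * Insert @tg into its parent's pending tree, keyed by dispatch time so the
 * leftmost (cached) node is always the next group due for dispatch.
 */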
0537 static void tg_service_queue_add(struct throtl_grp *tg)
0538 {
0539 struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
0540 struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node;
0541 struct rb_node *parent = NULL;
0542 struct throtl_grp *__tg;
0543 unsigned long key = tg->disptime;
0544 bool leftmost = true;
0545
0546 while (*node != NULL) {
0547 parent = *node;
0548 __tg = rb_entry_tg(parent);
0549
0550 if (time_before(key, __tg->disptime))
0551 node = &parent->rb_left;
0552 else {
0553 node = &parent->rb_right;
0554 leftmost = false;
0555 }
0556 }
0557
0558 rb_link_node(&tg->rb_node, parent, node);
0559 rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree,
0560 leftmost);
0561 }
0562
0563 static void throtl_enqueue_tg(struct throtl_grp *tg)
0564 {
0565 if (!(tg->flags & THROTL_TG_PENDING)) {
0566 tg_service_queue_add(tg);
0567 tg->flags |= THROTL_TG_PENDING;
0568 tg->service_queue.parent_sq->nr_pending++;
0569 }
0570 }
0571
0572 static void throtl_dequeue_tg(struct throtl_grp *tg)
0573 {
0574 if (tg->flags & THROTL_TG_PENDING) {
0575 throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
0576 tg->flags &= ~THROTL_TG_PENDING;
0577 }
0578 }
0579
0580
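/*
 * Arm @sq's pending timer for @expires, capped at eight throttle slices from
 * now so that a far-off dispatch time cannot delay dispatching for too long
 * after a limit change.
 */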
0581 static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
0582 unsigned long expires)
0583 {
0584 unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice;
0585
0586
0587
0588
0589
0590
0591
0592
0593 if (time_after(expires, max_expire))
0594 expires = max_expire;
0595 mod_timer(&sq->pending_timer, expires);
0596 throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
0597 expires - jiffies, jiffies);
0598 }
0599
/**
 * throtl_schedule_next_dispatch - schedule the next dispatch cycle
 * @sq: the service_queue to schedule dispatch for
 * @force: force scheduling
 *
 * Arm @sq->pending_timer so that the next dispatch cycle starts on the
 * dispatch time of the first pending child.  Returns %true if either the
 * timer is armed or there's no pending child left.  %false if the current
 * dispatch window is still open and the caller should continue
 * dispatching.
 *
 * If @force is %true, the dispatch timer is always scheduled and this
 * function is guaranteed to return %true.  This is to be used when the
 * caller can't dispatch itself and needs to invoke pending_timer
 * unconditionally.  Note that forced scheduling is likely to induce a short
 * delay before dispatch starts even if @sq->first_pending_disptime is not
 * in the future and thus shouldn't be used in hot paths.
 */
0618 static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
0619 bool force)
0620 {
0621
0622 if (!sq->nr_pending)
0623 return true;
0624
0625 update_min_dispatch_time(sq);
0626
0627
0628 if (force || time_after(sq->first_pending_disptime, jiffies)) {
0629 throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
0630 return true;
0631 }
0632
0633
0634 return false;
0635 }
0636
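/*
 * Reset dispatch counters and start a new slice whose start is taken from
 * @start (never moving the existing slice start backward), so the group
 * keeps credit for the time already elapsed since @start.
 */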
0637 static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
0638 bool rw, unsigned long start)
0639 {
0640 tg->bytes_disp[rw] = 0;
0641 tg->io_disp[rw] = 0;
0642
0643
0644
0645
0646
0647
0648
0649 if (time_after_eq(start, tg->slice_start[rw]))
0650 tg->slice_start[rw] = start;
0651
0652 tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
0653 throtl_log(&tg->service_queue,
0654 "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
0655 rw == READ ? 'R' : 'W', tg->slice_start[rw],
0656 tg->slice_end[rw], jiffies);
0657 }
0658
0659 static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
0660 {
0661 tg->bytes_disp[rw] = 0;
0662 tg->io_disp[rw] = 0;
0663 tg->slice_start[rw] = jiffies;
0664 tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
0665
0666 throtl_log(&tg->service_queue,
0667 "[%c] new slice start=%lu end=%lu jiffies=%lu",
0668 rw == READ ? 'R' : 'W', tg->slice_start[rw],
0669 tg->slice_end[rw], jiffies);
0670 }
0671
0672 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
0673 unsigned long jiffy_end)
0674 {
0675 tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
0676 }
0677
0678 static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
0679 unsigned long jiffy_end)
0680 {
0681 throtl_set_slice_end(tg, rw, jiffy_end);
0682 throtl_log(&tg->service_queue,
0683 "[%c] extend slice start=%lu end=%lu jiffies=%lu",
0684 rw == READ ? 'R' : 'W', tg->slice_start[rw],
0685 tg->slice_end[rw], jiffies);
0686 }
0687
0688
0689 static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
0690 {
0691 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
0692 return false;
0693
0694 return true;
0695 }
0696
0697
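/* Trim the used slices and adjust slice start accordingly */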
0698 static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
0699 {
0700 unsigned long nr_slices, time_elapsed, io_trim;
0701 u64 bytes_trim, tmp;
0702
0703 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
0704
0705
0706
0707
0708
0709
0710 if (throtl_slice_used(tg, rw))
0711 return;
0712
0713
0714
0715
0716
0717
0718
0719
0720
0721 throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
0722
0723 time_elapsed = jiffies - tg->slice_start[rw];
0724
0725 nr_slices = time_elapsed / tg->td->throtl_slice;
0726
0727 if (!nr_slices)
0728 return;
0729 tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices;
0730 do_div(tmp, HZ);
0731 bytes_trim = tmp;
0732
0733 io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) /
0734 HZ;
0735
0736 if (!bytes_trim && !io_trim)
0737 return;
0738
0739 if (tg->bytes_disp[rw] >= bytes_trim)
0740 tg->bytes_disp[rw] -= bytes_trim;
0741 else
0742 tg->bytes_disp[rw] = 0;
0743
0744 if (tg->io_disp[rw] >= io_trim)
0745 tg->io_disp[rw] -= io_trim;
0746 else
0747 tg->io_disp[rw] = 0;
0748
0749 tg->slice_start[rw] += nr_slices * tg->td->throtl_slice;
0750
0751 throtl_log(&tg->service_queue,
0752 "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
0753 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
0754 tg->slice_start[rw], tg->slice_end[rw], jiffies);
0755 }
0756
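/*
 * Check whether one more IO fits within @iops_limit for the current slice.
 * If not, *@wait is set to the remaining time until it would.
 */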
0757 static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
0758 u32 iops_limit, unsigned long *wait)
0759 {
0760 bool rw = bio_data_dir(bio);
0761 unsigned int io_allowed;
0762 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
0763 u64 tmp;
0764
0765 if (iops_limit == UINT_MAX) {
0766 if (wait)
0767 *wait = 0;
0768 return true;
0769 }
0770
0771 jiffy_elapsed = jiffies - tg->slice_start[rw];
0772
0773
0774 jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
0775
0776
0777
0778
0779
0780
0781
0782
0783 tmp = (u64)iops_limit * jiffy_elapsed_rnd;
0784 do_div(tmp, HZ);
0785
0786 if (tmp > UINT_MAX)
0787 io_allowed = UINT_MAX;
0788 else
0789 io_allowed = tmp;
0790
0791 if (tg->io_disp[rw] + 1 <= io_allowed) {
0792 if (wait)
0793 *wait = 0;
0794 return true;
0795 }
0796
0797
0798 jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed;
0799
0800 if (wait)
0801 *wait = jiffy_wait;
0802 return false;
0803 }
0804
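/*
 * Check whether @bio's bytes fit within @bps_limit for the current slice.
 * Bios already flagged BIO_THROTTLED pass through so their bytes are not
 * charged twice; otherwise *@wait is set to the extra delay needed.
 */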
0805 static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
0806 u64 bps_limit, unsigned long *wait)
0807 {
0808 bool rw = bio_data_dir(bio);
0809 u64 bytes_allowed, extra_bytes, tmp;
0810 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
0811 unsigned int bio_size = throtl_bio_data_size(bio);
0812
0813
0814 if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED)) {
0815 if (wait)
0816 *wait = 0;
0817 return true;
0818 }
0819
0820 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
0821
0822
0823 if (!jiffy_elapsed)
0824 jiffy_elapsed_rnd = tg->td->throtl_slice;
0825
0826 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
0827
0828 tmp = bps_limit * jiffy_elapsed_rnd;
0829 do_div(tmp, HZ);
0830 bytes_allowed = tmp;
0831
0832 if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
0833 if (wait)
0834 *wait = 0;
0835 return true;
0836 }
0837
0838
0839 extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
0840 jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit);
0841
0842 if (!jiffy_wait)
0843 jiffy_wait = 1;
0844
0845
0846
0847
0848
0849 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
0850 if (wait)
0851 *wait = jiffy_wait;
0852 return false;
0853 }
0854
/*
 * Returns whether one can dispatch a bio or not. Also returns the
 * approximate number of jiffies to wait before this bio is within the IO
 * rate and can be dispatched.
 */
0859 static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
0860 unsigned long *wait)
0861 {
0862 bool rw = bio_data_dir(bio);
0863 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
0864 u64 bps_limit = tg_bps_limit(tg, rw);
0865 u32 iops_limit = tg_iops_limit(tg, rw);
0866
0867
0868
0869
0870
0871
0872
0873 BUG_ON(tg->service_queue.nr_queued[rw] &&
0874 bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
0875
0876
0877 if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) ||
0878 tg->flags & THROTL_TG_CANCELING) {
0879 if (wait)
0880 *wait = 0;
0881 return true;
0882 }
0883
0884
0885
0886
0887
0888
0889
0890
0891 if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
0892 throtl_start_new_slice(tg, rw);
0893 else {
0894 if (time_before(tg->slice_end[rw],
0895 jiffies + tg->td->throtl_slice))
0896 throtl_extend_slice(tg, rw,
0897 jiffies + tg->td->throtl_slice);
0898 }
0899
0900 if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) &&
0901 tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) {
0902 if (wait)
0903 *wait = 0;
0904 return true;
0905 }
0906
0907 max_wait = max(bps_wait, iops_wait);
0908
0909 if (wait)
0910 *wait = max_wait;
0911
0912 if (time_before(tg->slice_end[rw], jiffies + max_wait))
0913 throtl_extend_slice(tg, rw, jiffies + max_wait);
0914
0915 return false;
0916 }
0917
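/*
 * Charge @bio's bytes (unless it is already flagged BIO_THROTTLED) and one
 * IO to @tg's dispatch counters, then set BIO_THROTTLED so the bytes are not
 * charged again higher up the hierarchy or after a split.
 */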
0918 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
0919 {
0920 bool rw = bio_data_dir(bio);
0921 unsigned int bio_size = throtl_bio_data_size(bio);
0922
0923
0924 if (!bio_flagged(bio, BIO_THROTTLED)) {
0925 tg->bytes_disp[rw] += bio_size;
0926 tg->last_bytes_disp[rw] += bio_size;
0927 }
0928
0929 tg->io_disp[rw]++;
0930 tg->last_io_disp[rw]++;
0931
0932
0933
0934
0935
0936
0937
0938 if (!bio_flagged(bio, BIO_THROTTLED))
0939 bio_set_flag(bio, BIO_THROTTLED);
0940 }
0941
/**
 * throtl_add_bio_tg - add a bio to the specified throtl_grp
 * @bio: bio to add
 * @qn: qnode to use
 * @tg: the target throtl_grp
 *
 * Add @bio to @tg's service_queue using @qn.  If @qn is not specified,
 * tg->qnode_on_self[] is used.
 */
0951 static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
0952 struct throtl_grp *tg)
0953 {
0954 struct throtl_service_queue *sq = &tg->service_queue;
0955 bool rw = bio_data_dir(bio);
0956
0957 if (!qn)
0958 qn = &tg->qnode_on_self[rw];
0959
0960
0961
0962
0963
0964
0965
0966 if (!sq->nr_queued[rw])
0967 tg->flags |= THROTL_TG_WAS_EMPTY;
0968
0969 throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
0970
0971 sq->nr_queued[rw]++;
0972 throtl_enqueue_tg(tg);
0973 }
0974
0975 static void tg_update_disptime(struct throtl_grp *tg)
0976 {
0977 struct throtl_service_queue *sq = &tg->service_queue;
0978 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
0979 struct bio *bio;
0980
0981 bio = throtl_peek_queued(&sq->queued[READ]);
0982 if (bio)
0983 tg_may_dispatch(tg, bio, &read_wait);
0984
0985 bio = throtl_peek_queued(&sq->queued[WRITE]);
0986 if (bio)
0987 tg_may_dispatch(tg, bio, &write_wait);
0988
0989 min_wait = min(read_wait, write_wait);
0990 disptime = jiffies + min_wait;
0991
0992
0993 throtl_dequeue_tg(tg);
0994 tg->disptime = disptime;
0995 throtl_enqueue_tg(tg);
0996
0997
0998 tg->flags &= ~THROTL_TG_WAS_EMPTY;
0999 }
1000
1001 static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
1002 struct throtl_grp *parent_tg, bool rw)
1003 {
1004 if (throtl_slice_used(parent_tg, rw)) {
1005 throtl_start_new_slice_with_credit(parent_tg, rw,
1006 child_tg->slice_start[rw]);
1007 }
1008
1009 }
1010
1011 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
1012 {
1013 struct throtl_service_queue *sq = &tg->service_queue;
1014 struct throtl_service_queue *parent_sq = sq->parent_sq;
1015 struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
1016 struct throtl_grp *tg_to_put = NULL;
1017 struct bio *bio;
1018
1019
1020
1021
1022
1023
1024
1025 bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
1026 sq->nr_queued[rw]--;
1027
1028 throtl_charge_bio(tg, bio);
1029
1030
1031
1032
1033
1034
1035
1036
1037 if (parent_tg) {
1038 throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
1039 start_parent_slice_with_credit(tg, parent_tg, rw);
1040 } else {
1041 throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
1042 &parent_sq->queued[rw]);
1043 BUG_ON(tg->td->nr_queued[rw] <= 0);
1044 tg->td->nr_queued[rw]--;
1045 }
1046
1047 throtl_trim_slice(tg, rw);
1048
1049 if (tg_to_put)
1050 blkg_put(tg_to_blkg(tg_to_put));
1051 }
1052
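/*
 * Dispatch queued bios from @tg towards its parent, at most
 * THROTL_GRP_QUANTUM per round split roughly 3/4 reads and 1/4 writes.
 * Returns the number of bios dispatched.
 */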
1053 static int throtl_dispatch_tg(struct throtl_grp *tg)
1054 {
1055 struct throtl_service_queue *sq = &tg->service_queue;
1056 unsigned int nr_reads = 0, nr_writes = 0;
1057 unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4;
1058 unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads;
1059 struct bio *bio;
1060
1061
1062
1063 while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
1064 tg_may_dispatch(tg, bio, NULL)) {
1065
1066 tg_dispatch_one_bio(tg, bio_data_dir(bio));
1067 nr_reads++;
1068
1069 if (nr_reads >= max_nr_reads)
1070 break;
1071 }
1072
1073 while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
1074 tg_may_dispatch(tg, bio, NULL)) {
1075
1076 tg_dispatch_one_bio(tg, bio_data_dir(bio));
1077 nr_writes++;
1078
1079 if (nr_writes >= max_nr_writes)
1080 break;
1081 }
1082
1083 return nr_reads + nr_writes;
1084 }
1085
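/*
 * Dispatch from pending groups whose dispatch time has arrived, earliest
 * first, until THROTL_QUANTUM bios have been moved or nothing is due.
 * Returns the number of bios dispatched.
 */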
1086 static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
1087 {
1088 unsigned int nr_disp = 0;
1089
1090 while (1) {
1091 struct throtl_grp *tg;
1092 struct throtl_service_queue *sq;
1093
1094 if (!parent_sq->nr_pending)
1095 break;
1096
1097 tg = throtl_rb_first(parent_sq);
1098 if (!tg)
1099 break;
1100
1101 if (time_before(jiffies, tg->disptime))
1102 break;
1103
1104 throtl_dequeue_tg(tg);
1105
1106 nr_disp += throtl_dispatch_tg(tg);
1107
1108 sq = &tg->service_queue;
1109 if (sq->nr_queued[0] || sq->nr_queued[1])
1110 tg_update_disptime(tg);
1111
1112 if (nr_disp >= THROTL_QUANTUM)
1113 break;
1114 }
1115
1116 return nr_disp;
1117 }
1118
1119 static bool throtl_can_upgrade(struct throtl_data *td,
1120 struct throtl_grp *this_tg);
1121
/**
 * throtl_pending_timer_fn - timer function for service_queue->pending_timer
 * @t: the pending_timer member of the throtl_service_queue being serviced
 *
 * This timer is armed when a child throtl_grp with active bios becomes
 * pending and queued on the service_queue's pending_tree and expires when
 * the first child throtl_grp should be dispatched.  This function
 * dispatches bios from the children throtl_grps to the parent
 * service_queue.
 *
 * If the parent's parent is another throtl_grp, dispatching is propagated
 * by either arming its pending_timer or repeating dispatch directly.  If
 * the top-level service_tree is reached, throtl_data->dispatch_work is
 * kicked so that the ready bios are issued.
 */
1136 static void throtl_pending_timer_fn(struct timer_list *t)
1137 {
1138 struct throtl_service_queue *sq = from_timer(sq, t, pending_timer);
1139 struct throtl_grp *tg = sq_to_tg(sq);
1140 struct throtl_data *td = sq_to_td(sq);
1141 struct throtl_service_queue *parent_sq;
1142 struct request_queue *q;
1143 bool dispatched;
1144 int ret;
1145
1146
1147 if (tg)
1148 q = tg->pd.blkg->q;
1149 else
1150 q = td->queue;
1151
1152 spin_lock_irq(&q->queue_lock);
1153
1154 if (!q->root_blkg)
1155 goto out_unlock;
1156
1157 if (throtl_can_upgrade(td, NULL))
1158 throtl_upgrade_state(td);
1159
1160 again:
1161 parent_sq = sq->parent_sq;
1162 dispatched = false;
1163
1164 while (true) {
1165 throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
1166 sq->nr_queued[READ] + sq->nr_queued[WRITE],
1167 sq->nr_queued[READ], sq->nr_queued[WRITE]);
1168
1169 ret = throtl_select_dispatch(sq);
1170 if (ret) {
1171 throtl_log(sq, "bios disp=%u", ret);
1172 dispatched = true;
1173 }
1174
1175 if (throtl_schedule_next_dispatch(sq, false))
1176 break;
1177
1178
1179 spin_unlock_irq(&q->queue_lock);
1180 cpu_relax();
1181 spin_lock_irq(&q->queue_lock);
1182 }
1183
1184 if (!dispatched)
1185 goto out_unlock;
1186
1187 if (parent_sq) {
1188
1189 if (tg->flags & THROTL_TG_WAS_EMPTY) {
1190 tg_update_disptime(tg);
1191 if (!throtl_schedule_next_dispatch(parent_sq, false)) {
1192
1193 sq = parent_sq;
1194 tg = sq_to_tg(sq);
1195 goto again;
1196 }
1197 }
1198 } else {
1199
1200 queue_work(kthrotld_workqueue, &td->dispatch_work);
1201 }
1202 out_unlock:
1203 spin_unlock_irq(&q->queue_lock);
1204 }
1205
/**
 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
 * @work: work item being executed
 *
 * This function is queued for execution when bios reach the queued lists
 * of throtl_data->service_queue.  Those bios are ready and issued by this
 * function.
 */
1214 static void blk_throtl_dispatch_work_fn(struct work_struct *work)
1215 {
1216 struct throtl_data *td = container_of(work, struct throtl_data,
1217 dispatch_work);
1218 struct throtl_service_queue *td_sq = &td->service_queue;
1219 struct request_queue *q = td->queue;
1220 struct bio_list bio_list_on_stack;
1221 struct bio *bio;
1222 struct blk_plug plug;
1223 int rw;
1224
1225 bio_list_init(&bio_list_on_stack);
1226
1227 spin_lock_irq(&q->queue_lock);
1228 for (rw = READ; rw <= WRITE; rw++)
1229 while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
1230 bio_list_add(&bio_list_on_stack, bio);
1231 spin_unlock_irq(&q->queue_lock);
1232
1233 if (!bio_list_empty(&bio_list_on_stack)) {
1234 blk_start_plug(&plug);
1235 while ((bio = bio_list_pop(&bio_list_on_stack)))
1236 submit_bio_noacct_nocheck(bio);
1237 blk_finish_plug(&plug);
1238 }
1239 }
1240
1241 static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
1242 int off)
1243 {
1244 struct throtl_grp *tg = pd_to_tg(pd);
1245 u64 v = *(u64 *)((void *)tg + off);
1246
1247 if (v == U64_MAX)
1248 return 0;
1249 return __blkg_prfill_u64(sf, pd, v);
1250 }
1251
1252 static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
1253 int off)
1254 {
1255 struct throtl_grp *tg = pd_to_tg(pd);
1256 unsigned int v = *(unsigned int *)((void *)tg + off);
1257
1258 if (v == UINT_MAX)
1259 return 0;
1260 return __blkg_prfill_u64(sf, pd, v);
1261 }
1262
1263 static int tg_print_conf_u64(struct seq_file *sf, void *v)
1264 {
1265 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
1266 &blkcg_policy_throtl, seq_cft(sf)->private, false);
1267 return 0;
1268 }
1269
1270 static int tg_print_conf_uint(struct seq_file *sf, void *v)
1271 {
1272 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
1273 &blkcg_policy_throtl, seq_cft(sf)->private, false);
1274 return 0;
1275 }
1276
1277 static void tg_conf_updated(struct throtl_grp *tg, bool global)
1278 {
1279 struct throtl_service_queue *sq = &tg->service_queue;
1280 struct cgroup_subsys_state *pos_css;
1281 struct blkcg_gq *blkg;
1282
1283 throtl_log(&tg->service_queue,
1284 "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
1285 tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE),
1286 tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE));
1287
1288
1289
1290
1291
1292
1293
1294
1295 blkg_for_each_descendant_pre(blkg, pos_css,
1296 global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) {
1297 struct throtl_grp *this_tg = blkg_to_tg(blkg);
1298 struct throtl_grp *parent_tg;
1299
1300 tg_update_has_rules(this_tg);
1301
1302 if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent ||
1303 !blkg->parent->parent)
1304 continue;
1305 parent_tg = blkg_to_tg(blkg->parent);
1306
1307
1308
1309
1310 this_tg->idletime_threshold = min(this_tg->idletime_threshold,
1311 parent_tg->idletime_threshold);
1312 this_tg->latency_target = max(this_tg->latency_target,
1313 parent_tg->latency_target);
1314 }
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324 throtl_start_new_slice(tg, READ);
1325 throtl_start_new_slice(tg, WRITE);
1326
1327 if (tg->flags & THROTL_TG_PENDING) {
1328 tg_update_disptime(tg);
1329 throtl_schedule_next_dispatch(sq->parent_sq, true);
1330 }
1331 }
1332
1333 static ssize_t tg_set_conf(struct kernfs_open_file *of,
1334 char *buf, size_t nbytes, loff_t off, bool is_u64)
1335 {
1336 struct blkcg *blkcg = css_to_blkcg(of_css(of));
1337 struct blkg_conf_ctx ctx;
1338 struct throtl_grp *tg;
1339 int ret;
1340 u64 v;
1341
1342 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
1343 if (ret)
1344 return ret;
1345
1346 ret = -EINVAL;
1347 if (sscanf(ctx.body, "%llu", &v) != 1)
1348 goto out_finish;
1349 if (!v)
1350 v = U64_MAX;
1351
1352 tg = blkg_to_tg(ctx.blkg);
1353
1354 if (is_u64)
1355 *(u64 *)((void *)tg + of_cft(of)->private) = v;
1356 else
1357 *(unsigned int *)((void *)tg + of_cft(of)->private) = v;
1358
1359 tg_conf_updated(tg, false);
1360 ret = 0;
1361 out_finish:
1362 blkg_conf_finish(&ctx);
1363 return ret ?: nbytes;
1364 }
1365
1366 static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
1367 char *buf, size_t nbytes, loff_t off)
1368 {
1369 return tg_set_conf(of, buf, nbytes, off, true);
1370 }
1371
1372 static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
1373 char *buf, size_t nbytes, loff_t off)
1374 {
1375 return tg_set_conf(of, buf, nbytes, off, false);
1376 }
1377
1378 static int tg_print_rwstat(struct seq_file *sf, void *v)
1379 {
1380 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1381 blkg_prfill_rwstat, &blkcg_policy_throtl,
1382 seq_cft(sf)->private, true);
1383 return 0;
1384 }
1385
1386 static u64 tg_prfill_rwstat_recursive(struct seq_file *sf,
1387 struct blkg_policy_data *pd, int off)
1388 {
1389 struct blkg_rwstat_sample sum;
1390
1391 blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off,
1392 &sum);
1393 return __blkg_prfill_rwstat(sf, pd, &sum);
1394 }
1395
1396 static int tg_print_rwstat_recursive(struct seq_file *sf, void *v)
1397 {
1398 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1399 tg_prfill_rwstat_recursive, &blkcg_policy_throtl,
1400 seq_cft(sf)->private, true);
1401 return 0;
1402 }
1403
1404 static struct cftype throtl_legacy_files[] = {
1405 {
1406 .name = "throttle.read_bps_device",
1407 .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]),
1408 .seq_show = tg_print_conf_u64,
1409 .write = tg_set_conf_u64,
1410 },
1411 {
1412 .name = "throttle.write_bps_device",
1413 .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]),
1414 .seq_show = tg_print_conf_u64,
1415 .write = tg_set_conf_u64,
1416 },
1417 {
1418 .name = "throttle.read_iops_device",
1419 .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]),
1420 .seq_show = tg_print_conf_uint,
1421 .write = tg_set_conf_uint,
1422 },
1423 {
1424 .name = "throttle.write_iops_device",
1425 .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]),
1426 .seq_show = tg_print_conf_uint,
1427 .write = tg_set_conf_uint,
1428 },
1429 {
1430 .name = "throttle.io_service_bytes",
1431 .private = offsetof(struct throtl_grp, stat_bytes),
1432 .seq_show = tg_print_rwstat,
1433 },
1434 {
1435 .name = "throttle.io_service_bytes_recursive",
1436 .private = offsetof(struct throtl_grp, stat_bytes),
1437 .seq_show = tg_print_rwstat_recursive,
1438 },
1439 {
1440 .name = "throttle.io_serviced",
1441 .private = offsetof(struct throtl_grp, stat_ios),
1442 .seq_show = tg_print_rwstat,
1443 },
1444 {
1445 .name = "throttle.io_serviced_recursive",
1446 .private = offsetof(struct throtl_grp, stat_ios),
1447 .seq_show = tg_print_rwstat_recursive,
1448 },
1449 { }
1450 };
1451
1452 static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
1453 int off)
1454 {
1455 struct throtl_grp *tg = pd_to_tg(pd);
1456 const char *dname = blkg_dev_name(pd->blkg);
1457 char bufs[4][21] = { "max", "max", "max", "max" };
1458 u64 bps_dft;
1459 unsigned int iops_dft;
1460 char idle_time[26] = "";
1461 char latency_time[26] = "";
1462
1463 if (!dname)
1464 return 0;
1465
1466 if (off == LIMIT_LOW) {
1467 bps_dft = 0;
1468 iops_dft = 0;
1469 } else {
1470 bps_dft = U64_MAX;
1471 iops_dft = UINT_MAX;
1472 }
1473
1474 if (tg->bps_conf[READ][off] == bps_dft &&
1475 tg->bps_conf[WRITE][off] == bps_dft &&
1476 tg->iops_conf[READ][off] == iops_dft &&
1477 tg->iops_conf[WRITE][off] == iops_dft &&
1478 (off != LIMIT_LOW ||
1479 (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD &&
1480 tg->latency_target_conf == DFL_LATENCY_TARGET)))
1481 return 0;
1482
1483 if (tg->bps_conf[READ][off] != U64_MAX)
1484 snprintf(bufs[0], sizeof(bufs[0]), "%llu",
1485 tg->bps_conf[READ][off]);
1486 if (tg->bps_conf[WRITE][off] != U64_MAX)
1487 snprintf(bufs[1], sizeof(bufs[1]), "%llu",
1488 tg->bps_conf[WRITE][off]);
1489 if (tg->iops_conf[READ][off] != UINT_MAX)
1490 snprintf(bufs[2], sizeof(bufs[2]), "%u",
1491 tg->iops_conf[READ][off]);
1492 if (tg->iops_conf[WRITE][off] != UINT_MAX)
1493 snprintf(bufs[3], sizeof(bufs[3]), "%u",
1494 tg->iops_conf[WRITE][off]);
1495 if (off == LIMIT_LOW) {
1496 if (tg->idletime_threshold_conf == ULONG_MAX)
1497 strcpy(idle_time, " idle=max");
1498 else
1499 snprintf(idle_time, sizeof(idle_time), " idle=%lu",
1500 tg->idletime_threshold_conf);
1501
1502 if (tg->latency_target_conf == ULONG_MAX)
1503 strcpy(latency_time, " latency=max");
1504 else
1505 snprintf(latency_time, sizeof(latency_time),
1506 " latency=%lu", tg->latency_target_conf);
1507 }
1508
1509 seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n",
1510 dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time,
1511 latency_time);
1512 return 0;
1513 }
1514
1515 static int tg_print_limit(struct seq_file *sf, void *v)
1516 {
1517 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit,
1518 &blkcg_policy_throtl, seq_cft(sf)->private, false);
1519 return 0;
1520 }
1521
1522 static ssize_t tg_set_limit(struct kernfs_open_file *of,
1523 char *buf, size_t nbytes, loff_t off)
1524 {
1525 struct blkcg *blkcg = css_to_blkcg(of_css(of));
1526 struct blkg_conf_ctx ctx;
1527 struct throtl_grp *tg;
1528 u64 v[4];
1529 unsigned long idle_time;
1530 unsigned long latency_time;
1531 int ret;
1532 int index = of_cft(of)->private;
1533
1534 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
1535 if (ret)
1536 return ret;
1537
1538 tg = blkg_to_tg(ctx.blkg);
1539
1540 v[0] = tg->bps_conf[READ][index];
1541 v[1] = tg->bps_conf[WRITE][index];
1542 v[2] = tg->iops_conf[READ][index];
1543 v[3] = tg->iops_conf[WRITE][index];
1544
1545 idle_time = tg->idletime_threshold_conf;
1546 latency_time = tg->latency_target_conf;
1547 while (true) {
1548 char tok[27];
1549 char *p;
1550 u64 val = U64_MAX;
1551 int len;
1552
1553 if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
1554 break;
1555 if (tok[0] == '\0')
1556 break;
1557 ctx.body += len;
1558
1559 ret = -EINVAL;
1560 p = tok;
1561 strsep(&p, "=");
1562 if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
1563 goto out_finish;
1564
1565 ret = -ERANGE;
1566 if (!val)
1567 goto out_finish;
1568
1569 ret = -EINVAL;
1570 if (!strcmp(tok, "rbps") && val > 1)
1571 v[0] = val;
1572 else if (!strcmp(tok, "wbps") && val > 1)
1573 v[1] = val;
1574 else if (!strcmp(tok, "riops") && val > 1)
1575 v[2] = min_t(u64, val, UINT_MAX);
1576 else if (!strcmp(tok, "wiops") && val > 1)
1577 v[3] = min_t(u64, val, UINT_MAX);
1578 else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
1579 idle_time = val;
1580 else if (off == LIMIT_LOW && !strcmp(tok, "latency"))
1581 latency_time = val;
1582 else
1583 goto out_finish;
1584 }
1585
1586 tg->bps_conf[READ][index] = v[0];
1587 tg->bps_conf[WRITE][index] = v[1];
1588 tg->iops_conf[READ][index] = v[2];
1589 tg->iops_conf[WRITE][index] = v[3];
1590
1591 if (index == LIMIT_MAX) {
1592 tg->bps[READ][index] = v[0];
1593 tg->bps[WRITE][index] = v[1];
1594 tg->iops[READ][index] = v[2];
1595 tg->iops[WRITE][index] = v[3];
1596 }
1597 tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW],
1598 tg->bps_conf[READ][LIMIT_MAX]);
1599 tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW],
1600 tg->bps_conf[WRITE][LIMIT_MAX]);
1601 tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW],
1602 tg->iops_conf[READ][LIMIT_MAX]);
1603 tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
1604 tg->iops_conf[WRITE][LIMIT_MAX]);
1605 tg->idletime_threshold_conf = idle_time;
1606 tg->latency_target_conf = latency_time;
1607
1608
1609 if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] ||
1610 tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) ||
1611 tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD ||
1612 tg->latency_target_conf == DFL_LATENCY_TARGET) {
1613 tg->bps[READ][LIMIT_LOW] = 0;
1614 tg->bps[WRITE][LIMIT_LOW] = 0;
1615 tg->iops[READ][LIMIT_LOW] = 0;
1616 tg->iops[WRITE][LIMIT_LOW] = 0;
1617 tg->idletime_threshold = DFL_IDLE_THRESHOLD;
1618 tg->latency_target = DFL_LATENCY_TARGET;
1619 } else if (index == LIMIT_LOW) {
1620 tg->idletime_threshold = tg->idletime_threshold_conf;
1621 tg->latency_target = tg->latency_target_conf;
1622 }
1623
1624 blk_throtl_update_limit_valid(tg->td);
1625 if (tg->td->limit_valid[LIMIT_LOW]) {
1626 if (index == LIMIT_LOW)
1627 tg->td->limit_index = LIMIT_LOW;
1628 } else
1629 tg->td->limit_index = LIMIT_MAX;
1630 tg_conf_updated(tg, index == LIMIT_LOW &&
1631 tg->td->limit_valid[LIMIT_LOW]);
1632 ret = 0;
1633 out_finish:
1634 blkg_conf_finish(&ctx);
1635 return ret ?: nbytes;
1636 }
1637
1638 static struct cftype throtl_files[] = {
1639 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
1640 {
1641 .name = "low",
1642 .flags = CFTYPE_NOT_ON_ROOT,
1643 .seq_show = tg_print_limit,
1644 .write = tg_set_limit,
1645 .private = LIMIT_LOW,
1646 },
1647 #endif
1648 {
1649 .name = "max",
1650 .flags = CFTYPE_NOT_ON_ROOT,
1651 .seq_show = tg_print_limit,
1652 .write = tg_set_limit,
1653 .private = LIMIT_MAX,
1654 },
1655 { }
1656 };
1657
1658 static void throtl_shutdown_wq(struct request_queue *q)
1659 {
1660 struct throtl_data *td = q->td;
1661
1662 cancel_work_sync(&td->dispatch_work);
1663 }
1664
1665 struct blkcg_policy blkcg_policy_throtl = {
1666 .dfl_cftypes = throtl_files,
1667 .legacy_cftypes = throtl_legacy_files,
1668
1669 .pd_alloc_fn = throtl_pd_alloc,
1670 .pd_init_fn = throtl_pd_init,
1671 .pd_online_fn = throtl_pd_online,
1672 .pd_offline_fn = throtl_pd_offline,
1673 .pd_free_fn = throtl_pd_free,
1674 };
1675
1676 static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
1677 {
1678 unsigned long rtime = jiffies, wtime = jiffies;
1679
1680 if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
1681 rtime = tg->last_low_overflow_time[READ];
1682 if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
1683 wtime = tg->last_low_overflow_time[WRITE];
1684 return min(rtime, wtime);
1685 }
1686
1687
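/*
 * Most recent low-limit overflow time of @tg and of any ancestor that has a
 * low limit configured.
 */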
1688 static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
1689 {
1690 struct throtl_service_queue *parent_sq;
1691 struct throtl_grp *parent = tg;
1692 unsigned long ret = __tg_last_low_overflow_time(tg);
1693
1694 while (true) {
1695 parent_sq = parent->service_queue.parent_sq;
1696 parent = sq_to_tg(parent_sq);
1697 if (!parent)
1698 break;
1699
1700
1701
1702
1703
1704 if (!parent->bps[READ][LIMIT_LOW] &&
1705 !parent->iops[READ][LIMIT_LOW] &&
1706 !parent->bps[WRITE][LIMIT_LOW] &&
1707 !parent->iops[WRITE][LIMIT_LOW])
1708 continue;
1709 if (time_after(__tg_last_low_overflow_time(parent), ret))
1710 ret = __tg_last_low_overflow_time(parent);
1711 }
1712 return ret;
1713 }
1714
1715 static bool throtl_tg_is_idle(struct throtl_grp *tg)
1716 {
	/*
	 * cgroup is idle if:
	 * - a single idle period is too long, longer than a fixed value (in
	 *   case the user configured a too big threshold) or 4 times the
	 *   idletime threshold
	 * - the average think time is more than the threshold
	 * - IO latency is largely below the target
	 */
1724 unsigned long time;
1725 bool ret;
1726
1727 time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
1728 ret = tg->latency_target == DFL_LATENCY_TARGET ||
1729 tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
1730 (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
1731 tg->avg_idletime > tg->idletime_threshold ||
1732 (tg->latency_target && tg->bio_cnt &&
1733 tg->bad_bio_cnt * 5 < tg->bio_cnt);
1734 throtl_log(&tg->service_queue,
1735 "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d",
1736 tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt,
1737 tg->bio_cnt, ret, tg->td->scale);
1738 return ret;
1739 }
1740
1741 static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
1742 {
1743 struct throtl_service_queue *sq = &tg->service_queue;
1744 bool read_limit, write_limit;
1745
	/*
	 * If the cgroup reaches its low limit (and a low limit of 0 is always
	 * reached), it's ok to upgrade to the next limit.
	 */
1750 read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
1751 write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
1752 if (!read_limit && !write_limit)
1753 return true;
1754 if (read_limit && sq->nr_queued[READ] &&
1755 (!write_limit || sq->nr_queued[WRITE]))
1756 return true;
1757 if (write_limit && sq->nr_queued[WRITE] &&
1758 (!read_limit || sq->nr_queued[READ]))
1759 return true;
1760
1761 if (time_after_eq(jiffies,
1762 tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&
1763 throtl_tg_is_idle(tg))
1764 return true;
1765 return false;
1766 }
1767
1768 static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
1769 {
1770 while (true) {
1771 if (throtl_tg_can_upgrade(tg))
1772 return true;
1773 tg = sq_to_tg(tg->service_queue.parent_sq);
1774 if (!tg || !tg_to_blkg(tg)->parent)
1775 return false;
1776 }
1777 return false;
1778 }
1779
1780 void blk_throtl_cancel_bios(struct request_queue *q)
1781 {
1782 struct cgroup_subsys_state *pos_css;
1783 struct blkcg_gq *blkg;
1784
1785 spin_lock_irq(&q->queue_lock);
1786
1787
1788
1789
1790
1791 rcu_read_lock();
1792 blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
1793 struct throtl_grp *tg = blkg_to_tg(blkg);
1794 struct throtl_service_queue *sq = &tg->service_queue;
1795
1796
1797
1798
1799
1800 blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING;
1801
1802
1803
1804
1805 tg_update_disptime(tg);
1806
1807 throtl_schedule_pending_timer(sq, jiffies + 1);
1808 }
1809 rcu_read_unlock();
1810 spin_unlock_irq(&q->queue_lock);
1811 }
1812
1813 static bool throtl_can_upgrade(struct throtl_data *td,
1814 struct throtl_grp *this_tg)
1815 {
1816 struct cgroup_subsys_state *pos_css;
1817 struct blkcg_gq *blkg;
1818
1819 if (td->limit_index != LIMIT_LOW)
1820 return false;
1821
1822 if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))
1823 return false;
1824
1825 rcu_read_lock();
1826 blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
1827 struct throtl_grp *tg = blkg_to_tg(blkg);
1828
1829 if (tg == this_tg)
1830 continue;
1831 if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
1832 continue;
1833 if (!throtl_hierarchy_can_upgrade(tg)) {
1834 rcu_read_unlock();
1835 return false;
1836 }
1837 }
1838 rcu_read_unlock();
1839 return true;
1840 }
1841
1842 static void throtl_upgrade_check(struct throtl_grp *tg)
1843 {
1844 unsigned long now = jiffies;
1845
1846 if (tg->td->limit_index != LIMIT_LOW)
1847 return;
1848
1849 if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
1850 return;
1851
1852 tg->last_check_time = now;
1853
1854 if (!time_after_eq(now,
1855 __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))
1856 return;
1857
1858 if (throtl_can_upgrade(tg->td, NULL))
1859 throtl_upgrade_state(tg->td);
1860 }
1861
1862 static void throtl_upgrade_state(struct throtl_data *td)
1863 {
1864 struct cgroup_subsys_state *pos_css;
1865 struct blkcg_gq *blkg;
1866
1867 throtl_log(&td->service_queue, "upgrade to max");
1868 td->limit_index = LIMIT_MAX;
1869 td->low_upgrade_time = jiffies;
1870 td->scale = 0;
1871 rcu_read_lock();
1872 blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
1873 struct throtl_grp *tg = blkg_to_tg(blkg);
1874 struct throtl_service_queue *sq = &tg->service_queue;
1875
1876 tg->disptime = jiffies - 1;
1877 throtl_select_dispatch(sq);
1878 throtl_schedule_next_dispatch(sq, true);
1879 }
1880 rcu_read_unlock();
1881 throtl_select_dispatch(&td->service_queue);
1882 throtl_schedule_next_dispatch(&td->service_queue, true);
1883 queue_work(kthrotld_workqueue, &td->dispatch_work);
1884 }
1885
1886 static void throtl_downgrade_state(struct throtl_data *td)
1887 {
1888 td->scale /= 2;
1889
1890 throtl_log(&td->service_queue, "downgrade, scale %d", td->scale);
1891 if (td->scale) {
1892 td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
1893 return;
1894 }
1895
1896 td->limit_index = LIMIT_LOW;
1897 td->low_downgrade_time = jiffies;
1898 }
1899
1900 static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
1901 {
1902 struct throtl_data *td = tg->td;
1903 unsigned long now = jiffies;
1904
1905
1906
1907
1908
1909 if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
1910 time_after_eq(now, tg_last_low_overflow_time(tg) +
1911 td->throtl_slice) &&
1912 (!throtl_tg_is_idle(tg) ||
1913 !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
1914 return true;
1915 return false;
1916 }
1917
1918 static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
1919 {
1920 while (true) {
1921 if (!throtl_tg_can_downgrade(tg))
1922 return false;
1923 tg = sq_to_tg(tg->service_queue.parent_sq);
1924 if (!tg || !tg_to_blkg(tg)->parent)
1925 break;
1926 }
1927 return true;
1928 }
1929
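/*
 * Once per throttle slice, measure the group's dispatch rate since the last
 * check, record when it reaches its low limits, and downgrade the device
 * back to LIMIT_LOW if the whole hierarchy qualifies.
 */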
1930 static void throtl_downgrade_check(struct throtl_grp *tg)
1931 {
1932 uint64_t bps;
1933 unsigned int iops;
1934 unsigned long elapsed_time;
1935 unsigned long now = jiffies;
1936
1937 if (tg->td->limit_index != LIMIT_MAX ||
1938 !tg->td->limit_valid[LIMIT_LOW])
1939 return;
1940 if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
1941 return;
1942 if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
1943 return;
1944
1945 elapsed_time = now - tg->last_check_time;
1946 tg->last_check_time = now;
1947
1948 if (time_before(now, tg_last_low_overflow_time(tg) +
1949 tg->td->throtl_slice))
1950 return;
1951
1952 if (tg->bps[READ][LIMIT_LOW]) {
1953 bps = tg->last_bytes_disp[READ] * HZ;
1954 do_div(bps, elapsed_time);
1955 if (bps >= tg->bps[READ][LIMIT_LOW])
1956 tg->last_low_overflow_time[READ] = now;
1957 }
1958
1959 if (tg->bps[WRITE][LIMIT_LOW]) {
1960 bps = tg->last_bytes_disp[WRITE] * HZ;
1961 do_div(bps, elapsed_time);
1962 if (bps >= tg->bps[WRITE][LIMIT_LOW])
1963 tg->last_low_overflow_time[WRITE] = now;
1964 }
1965
1966 if (tg->iops[READ][LIMIT_LOW]) {
1967 iops = tg->last_io_disp[READ] * HZ / elapsed_time;
1968 if (iops >= tg->iops[READ][LIMIT_LOW])
1969 tg->last_low_overflow_time[READ] = now;
1970 }
1971
1972 if (tg->iops[WRITE][LIMIT_LOW]) {
1973 iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
1974 if (iops >= tg->iops[WRITE][LIMIT_LOW])
1975 tg->last_low_overflow_time[WRITE] = now;
1976 }
1977
1978
1979
1980
1981
1982 if (throtl_hierarchy_can_downgrade(tg))
1983 throtl_downgrade_state(tg->td);
1984
1985 tg->last_bytes_disp[READ] = 0;
1986 tg->last_bytes_disp[WRITE] = 0;
1987 tg->last_io_disp[READ] = 0;
1988 tg->last_io_disp[WRITE] = 0;
1989 }
1990
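/*
 * Fold the idle gap since the last bio completion into avg_idletime as an
 * exponential moving average (7/8 history, 1/8 new sample).
 */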
1991 static void blk_throtl_update_idletime(struct throtl_grp *tg)
1992 {
1993 unsigned long now;
1994 unsigned long last_finish_time = tg->last_finish_time;
1995
1996 if (last_finish_time == 0)
1997 return;
1998
1999 now = ktime_get_ns() >> 10;
2000 if (now <= last_finish_time ||
2001 last_finish_time == tg->checked_last_finish_time)
2002 return;
2003
2004 tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
2005 tg->checked_last_finish_time = last_finish_time;
2006 }
2007
2008 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2009 static void throtl_update_latency_buckets(struct throtl_data *td)
2010 {
2011 struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
2012 int i, cpu, rw;
2013 unsigned long last_latency[2] = { 0 };
2014 unsigned long latency[2];
2015
2016 if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW])
2017 return;
2018 if (time_before(jiffies, td->last_calculate_time + HZ))
2019 return;
2020 td->last_calculate_time = jiffies;
2021
2022 memset(avg_latency, 0, sizeof(avg_latency));
2023 for (rw = READ; rw <= WRITE; rw++) {
2024 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
2025 struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
2026
2027 for_each_possible_cpu(cpu) {
2028 struct latency_bucket *bucket;
2029
2030
2031 bucket = per_cpu_ptr(td->latency_buckets[rw],
2032 cpu);
2033 tmp->total_latency += bucket[i].total_latency;
2034 tmp->samples += bucket[i].samples;
2035 bucket[i].total_latency = 0;
2036 bucket[i].samples = 0;
2037 }
2038
2039 if (tmp->samples >= 32) {
2040 int samples = tmp->samples;
2041
2042 latency[rw] = tmp->total_latency;
2043
2044 tmp->total_latency = 0;
2045 tmp->samples = 0;
2046 latency[rw] /= samples;
2047 if (latency[rw] == 0)
2048 continue;
2049 avg_latency[rw][i].latency = latency[rw];
2050 }
2051 }
2052 }
2053
2054 for (rw = READ; rw <= WRITE; rw++) {
2055 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
2056 if (!avg_latency[rw][i].latency) {
2057 if (td->avg_buckets[rw][i].latency < last_latency[rw])
2058 td->avg_buckets[rw][i].latency =
2059 last_latency[rw];
2060 continue;
2061 }
2062
2063 if (!td->avg_buckets[rw][i].valid)
2064 latency[rw] = avg_latency[rw][i].latency;
2065 else
2066 latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
2067 avg_latency[rw][i].latency) >> 3;
2068
2069 td->avg_buckets[rw][i].latency = max(latency[rw],
2070 last_latency[rw]);
2071 td->avg_buckets[rw][i].valid = true;
2072 last_latency[rw] = td->avg_buckets[rw][i].latency;
2073 }
2074 }
2075
2076 for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
2077 throtl_log(&td->service_queue,
2078 "Latency bucket %d: read latency=%ld, read valid=%d, "
2079 "write latency=%ld, write valid=%d", i,
2080 td->avg_buckets[READ][i].latency,
2081 td->avg_buckets[READ][i].valid,
2082 td->avg_buckets[WRITE][i].latency,
2083 td->avg_buckets[WRITE][i].valid);
2084 }
2085 #else
2086 static inline void throtl_update_latency_buckets(struct throtl_data *td)
2087 {
2088 }
2089 #endif
2090
2091 bool __blk_throtl_bio(struct bio *bio)
2092 {
2093 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
2094 struct blkcg_gq *blkg = bio->bi_blkg;
2095 struct throtl_qnode *qn = NULL;
2096 struct throtl_grp *tg = blkg_to_tg(blkg);
2097 struct throtl_service_queue *sq;
2098 bool rw = bio_data_dir(bio);
2099 bool throttled = false;
2100 struct throtl_data *td = tg->td;
2101
2102 rcu_read_lock();
2103
2104 if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
2105 blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
2106 bio->bi_iter.bi_size);
2107 blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
2108 }
2109
2110 spin_lock_irq(&q->queue_lock);
2111
2112 throtl_update_latency_buckets(td);
2113
2114 blk_throtl_update_idletime(tg);
2115
2116 sq = &tg->service_queue;
2117
2118 again:
2119 while (true) {
2120 if (tg->last_low_overflow_time[rw] == 0)
2121 tg->last_low_overflow_time[rw] = jiffies;
2122 throtl_downgrade_check(tg);
2123 throtl_upgrade_check(tg);
2124
2125 if (sq->nr_queued[rw])
2126 break;
2127
2128
2129 if (!tg_may_dispatch(tg, bio, NULL)) {
2130 tg->last_low_overflow_time[rw] = jiffies;
2131 if (throtl_can_upgrade(td, tg)) {
2132 throtl_upgrade_state(td);
2133 goto again;
2134 }
2135 break;
2136 }
2137
		/* within limits, let's charge and dispatch directly */
2139 throtl_charge_bio(tg, bio);

		/*
		 * We need to trim the slice even when bios are not being
		 * queued, otherwise it might happen that a bio is not queued
		 * for a long time and the slice keeps on extending while trim
		 * is never called.  If limits are then reduced suddenly, all
		 * the IO dispatched so far would be accounted at the new low
		 * rate and newly queued IO would get a really long dispatch
		 * time.
		 *
		 * So keep on trimming the slice even if the bio is not queued.
		 */
2152 throtl_trim_slice(tg, rw);

		/*
		 * @bio passed through this layer without being throttled.
		 * Climb up the ladder.  If we're already at the top, it
		 * can be executed directly.
		 */
2159 qn = &tg->qnode_on_parent[rw];
2160 sq = sq->parent_sq;
2161 tg = sq_to_tg(sq);
2162 if (!tg)
2163 goto out_unlock;
2164 }
2165
2166
2167 throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
2168 rw == READ ? 'R' : 'W',
2169 tg->bytes_disp[rw], bio->bi_iter.bi_size,
2170 tg_bps_limit(tg, rw),
2171 tg->io_disp[rw], tg_iops_limit(tg, rw),
2172 sq->nr_queued[READ], sq->nr_queued[WRITE]);
2173
2174 tg->last_low_overflow_time[rw] = jiffies;
2175
2176 td->nr_queued[rw]++;
2177 throtl_add_bio_tg(bio, qn, tg);
2178 throttled = true;
2179
2180
2181
2182
2183
2184
2185
2186 if (tg->flags & THROTL_TG_WAS_EMPTY) {
2187 tg_update_disptime(tg);
2188 throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
2189 }
2190
2191 out_unlock:
2192 bio_set_flag(bio, BIO_THROTTLED);
2193
2194 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2195 if (throttled || !td->track_bio_latency)
2196 bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
2197 #endif
2198 spin_unlock_irq(&q->queue_lock);
2199
2200 rcu_read_unlock();
2201 return throttled;
2202 }
2203
2204 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2205 static void throtl_track_latency(struct throtl_data *td, sector_t size,
2206 enum req_op op, unsigned long time)
2207 {
2208 const bool rw = op_is_write(op);
2209 struct latency_bucket *latency;
2210 int index;
2211
2212 if (!td || td->limit_index != LIMIT_LOW ||
2213 !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
2214 !blk_queue_nonrot(td->queue))
2215 return;
2216
2217 index = request_bucket_index(size);
2218
2219 latency = get_cpu_ptr(td->latency_buckets[rw]);
2220 latency[index].total_latency += time;
2221 latency[index].samples++;
2222 put_cpu_ptr(td->latency_buckets[rw]);
2223 }
2224
2225 void blk_throtl_stat_add(struct request *rq, u64 time_ns)
2226 {
2227 struct request_queue *q = rq->q;
2228 struct throtl_data *td = q->td;
2229
2230 throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq),
2231 time_ns >> 10);
2232 }
2233
2234 void blk_throtl_bio_endio(struct bio *bio)
2235 {
2236 struct blkcg_gq *blkg;
2237 struct throtl_grp *tg;
2238 u64 finish_time_ns;
2239 unsigned long finish_time;
2240 unsigned long start_time;
2241 unsigned long lat;
2242 int rw = bio_data_dir(bio);
2243
2244 blkg = bio->bi_blkg;
2245 if (!blkg)
2246 return;
2247 tg = blkg_to_tg(blkg);
2248 if (!tg->td->limit_valid[LIMIT_LOW])
2249 return;
2250
2251 finish_time_ns = ktime_get_ns();
2252 tg->last_finish_time = finish_time_ns >> 10;
2253
2254 start_time = bio_issue_time(&bio->bi_issue) >> 10;
2255 finish_time = __bio_issue_time(finish_time_ns) >> 10;
2256 if (!start_time || finish_time <= start_time)
2257 return;
2258
2259 lat = finish_time - start_time;
2260
2261 if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY))
2262 throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue),
2263 bio_op(bio), lat);
2264
2265 if (tg->latency_target && lat >= tg->td->filtered_latency) {
2266 int bucket;
2267 unsigned int threshold;
2268
2269 bucket = request_bucket_index(bio_issue_size(&bio->bi_issue));
2270 threshold = tg->td->avg_buckets[rw][bucket].latency +
2271 tg->latency_target;
2272 if (lat > threshold)
2273 tg->bad_bio_cnt++;
2274
2275
2276
2277
2278 tg->bio_cnt++;
2279 }
2280
2281 if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
2282 tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
2283 tg->bio_cnt /= 2;
2284 tg->bad_bio_cnt /= 2;
2285 }
2286 }
2287 #endif
2288
2289 int blk_throtl_init(struct request_queue *q)
2290 {
2291 struct throtl_data *td;
2292 int ret;
2293
2294 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
2295 if (!td)
2296 return -ENOMEM;
2297 td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
2298 LATENCY_BUCKET_SIZE, __alignof__(u64));
2299 if (!td->latency_buckets[READ]) {
2300 kfree(td);
2301 return -ENOMEM;
2302 }
2303 td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
2304 LATENCY_BUCKET_SIZE, __alignof__(u64));
2305 if (!td->latency_buckets[WRITE]) {
2306 free_percpu(td->latency_buckets[READ]);
2307 kfree(td);
2308 return -ENOMEM;
2309 }
2310
2311 INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
2312 throtl_service_queue_init(&td->service_queue);
2313
2314 q->td = td;
2315 td->queue = q;
2316
2317 td->limit_valid[LIMIT_MAX] = true;
2318 td->limit_index = LIMIT_MAX;
2319 td->low_upgrade_time = jiffies;
2320 td->low_downgrade_time = jiffies;
2321
2322
2323 ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
2324 if (ret) {
2325 free_percpu(td->latency_buckets[READ]);
2326 free_percpu(td->latency_buckets[WRITE]);
2327 kfree(td);
2328 }
2329 return ret;
2330 }
2331
2332 void blk_throtl_exit(struct request_queue *q)
2333 {
2334 BUG_ON(!q->td);
2335 del_timer_sync(&q->td->service_queue.pending_timer);
2336 throtl_shutdown_wq(q);
2337 blkcg_deactivate_policy(q, &blkcg_policy_throtl);
2338 free_percpu(q->td->latency_buckets[READ]);
2339 free_percpu(q->td->latency_buckets[WRITE]);
2340 kfree(q->td);
2341 }
2342
2343 void blk_throtl_register_queue(struct request_queue *q)
2344 {
2345 struct throtl_data *td;
2346 int i;
2347
2348 td = q->td;
2349 BUG_ON(!td);
2350
2351 if (blk_queue_nonrot(q)) {
2352 td->throtl_slice = DFL_THROTL_SLICE_SSD;
2353 td->filtered_latency = LATENCY_FILTERED_SSD;
2354 } else {
2355 td->throtl_slice = DFL_THROTL_SLICE_HD;
2356 td->filtered_latency = LATENCY_FILTERED_HD;
2357 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
2358 td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
2359 td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
2360 }
2361 }
2362 #ifndef CONFIG_BLK_DEV_THROTTLING_LOW
2363
2364 td->throtl_slice = DFL_THROTL_SLICE_HD;
2365 #endif
2366
2367 td->track_bio_latency = !queue_is_mq(q);
2368 if (!td->track_bio_latency)
2369 blk_stat_enable_accounting(q);
2370 }
2371
2372 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2373 ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
2374 {
2375 if (!q->td)
2376 return -EINVAL;
2377 return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
2378 }
2379
2380 ssize_t blk_throtl_sample_time_store(struct request_queue *q,
2381 const char *page, size_t count)
2382 {
2383 unsigned long v;
2384 unsigned long t;
2385
2386 if (!q->td)
2387 return -EINVAL;
2388 if (kstrtoul(page, 10, &v))
2389 return -EINVAL;
2390 t = msecs_to_jiffies(v);
2391 if (t == 0 || t > MAX_THROTL_SLICE)
2392 return -EINVAL;
2393 q->td->throtl_slice = t;
2394 return count;
2395 }
2396 #endif
2397
2398 static int __init throtl_init(void)
2399 {
2400 kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
2401 if (!kthrotld_workqueue)
2402 panic("Failed to create kthrotld\n");
2403
2404 return blkcg_policy_register(&blkcg_policy_throtl);
2405 }
2406
2407 module_init(throtl_init);