// SPDX-License-Identifier: GPL-2.0
/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/list_sort.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"

/*
 * Mark a hardware queue as needing a restart.
 */
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return;

	set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);

void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
	clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

	/*
	 * Order clearing SCHED_RESTART against the list_empty_careful()
	 * check of hctx->dispatch in blk_mq_run_hw_queue(); the pairing
	 * barrier is in blk_mq_dispatch_rq_list(). Without it, the dispatch
	 * path could miss both the cleared SCHED_RESTART flag and a request
	 * newly added to hctx->dispatch.
	 */
	smp_mb();

	blk_mq_run_hw_queue(hctx, true);
}

static int sched_rq_cmp(void *priv, const struct list_head *a,
			const struct list_head *b)
{
	struct request *rqa = container_of(a, struct request, queuelist);
	struct request *rqb = container_of(b, struct request, queuelist);

	return rqa->mq_hctx > rqb->mq_hctx;
}

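/*
 * Dispatch the leading run of requests on @rq_list that share the hctx of
 * the first entry, splicing them onto a local list first. Requests that
 * belong to other hctxs are left on @rq_list for the caller. Returns the
 * result of blk_mq_dispatch_rq_list() for that batch.
 */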
static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
{
	struct blk_mq_hw_ctx *hctx =
		list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
	struct request *rq;
	LIST_HEAD(hctx_list);
	unsigned int count = 0;

	list_for_each_entry(rq, rq_list, queuelist) {
		if (rq->mq_hctx != hctx) {
			list_cut_before(&hctx_list, rq_list, &rq->queuelist);
			goto dispatch;
		}
		count++;
	}
	list_splice_tail_init(rq_list, &hctx_list);

dispatch:
	return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
}

#define BLK_MQ_BUDGET_DELAY	3		/* ms units */

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart the queue if we return BLK_STS_RESOURCE.
 *
 * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has
 * to be run again. This is necessary to avoid starving flushes.
 */
static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	bool multi_hctxs = false, run_queue = false;
	bool dispatched = false, busy = false;
	unsigned int max_dispatch;
	LIST_HEAD(rq_list);
	int count = 0;

	if (hctx->dispatch_busy)
		max_dispatch = 1;
	else
		max_dispatch = hctx->queue->nr_requests;

	do {
		struct request *rq;
		int budget_token;

		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
			break;

		if (!list_empty_careful(&hctx->dispatch)) {
			busy = true;
			break;
		}

		budget_token = blk_mq_get_dispatch_budget(q);
		if (budget_token < 0)
			break;

		rq = e->type->ops.dispatch_request(hctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(q, budget_token);
			/*
			 * We're releasing without dispatching. Holding the
			 * budget could have blocked any "hctx"s with the
			 * same queue and if we didn't dispatch then there's
			 * no guarantee anyone will kick the queue. Kick it
			 * ourselves.
			 */
			run_queue = true;
			break;
		}

		blk_mq_set_rq_budget_token(rq, budget_token);

		/*
		 * Now this rq owns the budget, which has to be released
		 * if this rq won't be queued to the driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add_tail(&rq->queuelist, &rq_list);
		count++;
		if (rq->mq_hctx != hctx)
			multi_hctxs = true;

		/*
		 * If we cannot get a driver tag for the request, stop
		 * dequeueing requests from the I/O scheduler. We are unlikely
		 * to be able to submit them anyway, and it creates a false
		 * impression for scheduling heuristics that the device can
		 * take more I/O.
		 */
		if (!blk_mq_get_driver_tag(rq))
			break;
	} while (count < max_dispatch);

	if (!count) {
		if (run_queue)
			blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
	} else if (multi_hctxs) {
		/*
		 * Requests from different hctxs may be dequeued from some
		 * schedulers, such as bfq and deadline.
		 *
		 * Sort the requests in the list by hctx, then dispatch the
		 * requests belonging to the same hctx as one batch.
		 */
		list_sort(NULL, &rq_list, sched_rq_cmp);
		do {
			dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
		} while (!list_empty(&rq_list));
	} else {
		dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
	}

	if (busy)
		return -EAGAIN;
	return !!dispatched;
}

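/*
 * Repeatedly pull batches from the I/O scheduler via
 * __blk_mq_do_dispatch_sched(). Punt the remaining work to an async queue
 * run if we have been at it for about a second or a reschedule is pending,
 * so a single hctx cannot hog the CPU.
 */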
static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	unsigned long end = jiffies + HZ;
	int ret;

	do {
		ret = __blk_mq_do_dispatch_sched(hctx);
		if (ret != 1)
			break;
		if (need_resched() || time_is_before_jiffies(end)) {
			blk_mq_delay_run_hw_queue(hctx, 0);
			break;
		}
	} while (1);

	return ret;
}

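/* Return the next software queue after @ctx on @hctx, wrapping around. */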
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
					  struct blk_mq_ctx *ctx)
{
	unsigned short idx = ctx->index_hw[hctx->type];

	if (++idx == hctx->nr_ctx)
		idx = 0;

	return hctx->ctxs[idx];
}

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart the queue if we return BLK_STS_RESOURCE.
 *
 * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has
 * to be run again. This is necessary to avoid starving flushes.
 */
static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	LIST_HEAD(rq_list);
	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
	int ret = 0;
	struct request *rq;

	do {
		int budget_token;

		if (!list_empty_careful(&hctx->dispatch)) {
			ret = -EAGAIN;
			break;
		}

		if (!sbitmap_any_bit_set(&hctx->ctx_map))
			break;

		budget_token = blk_mq_get_dispatch_budget(q);
		if (budget_token < 0)
			break;

		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(q, budget_token);
			/*
			 * We're releasing without dispatching. Holding the
			 * budget could have blocked any "hctx"s with the
			 * same queue and if we didn't dispatch then there's
			 * no guarantee anyone will kick the queue. Kick it
			 * ourselves.
			 */
			blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
			break;
		}

		blk_mq_set_rq_budget_token(rq, budget_token);

		/*
		 * Now this rq owns the budget, which has to be released
		 * if this rq won't be queued to the driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);

		/* round robin for fair dispatch */
		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);

	} while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));

	WRITE_ONCE(hctx->dispatch_from, ctx);
	return ret;
}

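/*
 * Core dispatch logic: first drain any leftovers on hctx->dispatch, then
 * pull new work from the I/O scheduler, from the software queues one by
 * one if the device is busy, or from all software queues at once.
 * Returns -EAGAIN if hctx->dispatch turned out to be non-empty and the
 * caller should retry.
 */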
static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	const bool has_sched = q->elevator;
	int ret = 0;
	LIST_HEAD(rq_list);

	/*
	 * If we have previous entries on our dispatch list, grab them first
	 * for more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Only ask the scheduler for requests if we didn't have residual
	 * requests from the dispatch list. This is to avoid the case where
	 * we only ever dispatch a fraction of the requests available because
	 * of low device queue depth. Once we pull requests out of the IO
	 * scheduler, we can no longer merge or sort them. So it's best to
	 * leave them there for as long as we can. Mark the hw queue as
	 * needing a restart in that case.
	 *
	 * We want to dispatch from the scheduler only if there was nothing
	 * on the dispatch list or we were able to dispatch from the
	 * dispatch list.
	 */
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
		if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) {
			if (has_sched)
				ret = blk_mq_do_dispatch_sched(hctx);
			else
				ret = blk_mq_do_dispatch_ctx(hctx);
		}
	} else if (has_sched) {
		ret = blk_mq_do_dispatch_sched(hctx);
	} else if (hctx->dispatch_busy) {
		/* dequeue requests one by one from the sw queue if the queue is busy */
		ret = blk_mq_do_dispatch_ctx(hctx);
	} else {
		blk_mq_flush_busy_ctxs(hctx, &rq_list);
		blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
	}

	return ret;
}

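/*
 * Entry point for running a hardware queue through the scheduler. Skips
 * stopped or quiesced queues, and retries once on -EAGAIN before punting
 * to an async queue run.
 */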
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;

	/* RCU or SRCU read lock is needed before checking quiesced flag */
	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
		return;

	hctx->run++;

	/*
	 * A return of -EAGAIN is an indication that hctx->dispatch is not
	 * empty and we must run again in order to avoid starving flushes.
	 */
	if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
		if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
			blk_mq_run_hw_queue(hctx, true);
	}
}

bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
		unsigned int nr_segs)
{
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx;
	struct blk_mq_hw_ctx *hctx;
	bool ret = false;
	enum hctx_type type;

	if (e && e->type->ops.bio_merge) {
		ret = e->type->ops.bio_merge(q, bio, nr_segs);
		goto out_put;
	}

	ctx = blk_mq_get_ctx(q);
	hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
	type = hctx->type;
	if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
	    list_empty_careful(&ctx->rq_lists[type]))
		goto out_put;

	/* default per sw-queue merge */
	spin_lock(&ctx->lock);
	/*
	 * Reverse check our software queue for entries that we could
	 * potentially merge with. Currently includes a hand-wavy stop
	 * count of 8, to not spend too much time checking for merges.
	 */
	if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs))
		ret = true;

	spin_unlock(&ctx->lock);
out_put:
	return ret;
}

bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
				   struct list_head *free)
{
	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
				       struct request *rq)
{
	/*
	 * Dispatch flush and passthrough requests directly.
	 *
	 * A passthrough request has to be added to hctx->dispatch: the
	 * device may be in a state where it keeps failing normal FS
	 * requests with BLK_STS_RESOURCE, and a passthrough request may be
	 * exactly what is needed to recover from that state. If the
	 * passthrough request were queued behind the scheduler, it would
	 * never get a chance to run, since requests on hctx->dispatch are
	 * always dispatched first.
	 */
	if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq))
		return true;

	return false;
}

void blk_mq_sched_insert_request(struct request *rq, bool at_head,
				 bool run_queue, bool async)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG));

	if (blk_mq_sched_bypass_insert(hctx, rq)) {
		/*
		 * Bypassed requests (flush sequence and passthrough) go
		 * straight to hctx->dispatch instead of the scheduler or
		 * software queues.
		 *
		 * Flush-sequence requests are additionally forced to the
		 * head of the dispatch list: there is at most one in-flight
		 * flush request per hardware queue, so head insertion is
		 * safe, and it keeps a pending flush from being starved
		 * behind other requests already sitting on the dispatch
		 * list.
		 */
		at_head = (rq->rq_flags & RQF_FLUSH_SEQ) ? true : at_head;
		blk_mq_request_bypass_insert(rq, at_head, false);
		goto run;
	}

	if (e) {
		LIST_HEAD(list);

		list_add(&rq->queuelist, &list);
		e->type->ops.insert_requests(hctx, &list, at_head);
	} else {
		spin_lock(&ctx->lock);
		__blk_mq_insert_request(hctx, rq, at_head);
		spin_unlock(&ctx->lock);
	}

run:
	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}

void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
				  struct blk_mq_ctx *ctx,
				  struct list_head *list, bool run_queue_async)
{
	struct elevator_queue *e;
	struct request_queue *q = hctx->queue;

	/*
	 * blk_mq_sched_insert_requests() is called from the flush plug
	 * context only, so hold a queue usage counter to prevent the queue
	 * from being released while the requests are inserted and run.
	 */
	percpu_ref_get(&q->q_usage_counter);

	e = hctx->queue->elevator;
	if (e) {
		e->type->ops.insert_requests(hctx, list, false);
	} else {
		/*
		 * Try to issue requests directly if the hw queue isn't busy
		 * in the 'none' scheduler case; this can save an extra
		 * enqueue and dequeue on the software queue.
		 */
		if (!hctx->dispatch_busy && !run_queue_async) {
			blk_mq_run_dispatch_ops(hctx->queue,
				blk_mq_try_issue_list_directly(hctx, list));
			if (list_empty(list))
				goto out;
		}
		blk_mq_insert_requests(hctx, ctx, list);
	}

	blk_mq_run_hw_queue(hctx, run_queue_async);
out:
	percpu_ref_put(&q->q_usage_counter);
}

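/*
 * Allocate scheduler tags (and the static requests backing them) for one
 * hardware queue, or reuse the shared scheduler tags when the tag set
 * uses shared tags.
 */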
static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
					  struct blk_mq_hw_ctx *hctx,
					  unsigned int hctx_idx)
{
	if (blk_mq_is_shared_tags(q->tag_set->flags)) {
		hctx->sched_tags = q->sched_shared_tags;
		return 0;
	}

	hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
						    q->nr_requests);

	if (!hctx->sched_tags)
		return -ENOMEM;
	return 0;
}

static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
{
	blk_mq_free_rq_map(queue->sched_shared_tags);
	queue->sched_shared_tags = NULL;
}

/* called in the queue's release handler, the tag set has gone away */
static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (hctx->sched_tags) {
			if (!blk_mq_is_shared_tags(flags))
				blk_mq_free_rq_map(hctx->sched_tags);
			hctx->sched_tags = NULL;
		}
	}

	if (blk_mq_is_shared_tags(flags))
		blk_mq_exit_sched_shared_tags(q);
}

static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
{
	struct blk_mq_tag_set *set = queue->tag_set;

	/*
	 * Set the initial depth to the maximum so that we don't need to
	 * reallocate when updating nr_requests.
	 */
	queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
						BLK_MQ_NO_HCTX_IDX,
						MAX_SCHED_RQ);
	if (!queue->sched_shared_tags)
		return -ENOMEM;

	blk_mq_tag_update_sched_shared_tags(queue);

	return 0;
}

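/*
 * Attach elevator @e to queue @q: size the scheduler tag maps, allocate
 * per-hctx scheduler tags, and call the elevator's init_sched() and
 * init_hctx() hooks. With @e == NULL the queue runs without an I/O
 * scheduler.
 */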
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
	unsigned int flags = q->tag_set->flags;
	struct blk_mq_hw_ctx *hctx;
	struct elevator_queue *eq;
	unsigned long i;
	int ret;

	if (!e) {
		blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
		q->elevator = NULL;
		q->nr_requests = q->tag_set->queue_depth;
		return 0;
	}

	/*
	 * Default to double the smaller of the hw queue_depth and 128,
	 * since we don't split into sync/async like the old code did.
	 * Additionally, this is a per-hw queue depth.
	 */
	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
				   BLKDEV_DEFAULT_RQ);

	if (blk_mq_is_shared_tags(flags)) {
		ret = blk_mq_init_sched_shared_tags(q);
		if (ret)
			return ret;
	}

	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
		if (ret)
			goto err_free_map_and_rqs;
	}

	ret = e->ops.init_sched(q, e);
	if (ret)
		goto err_free_map_and_rqs;

	mutex_lock(&q->debugfs_mutex);
	blk_mq_debugfs_register_sched(q);
	mutex_unlock(&q->debugfs_mutex);

	queue_for_each_hw_ctx(q, hctx, i) {
		if (e->ops.init_hctx) {
			ret = e->ops.init_hctx(hctx, i);
			if (ret) {
				eq = q->elevator;
				blk_mq_sched_free_rqs(q);
				blk_mq_exit_sched(q, eq);
				kobject_put(&eq->kobj);
				return ret;
			}
		}
		mutex_lock(&q->debugfs_mutex);
		blk_mq_debugfs_register_sched_hctx(q, hctx);
		mutex_unlock(&q->debugfs_mutex);
	}

	return 0;

err_free_map_and_rqs:
	blk_mq_sched_free_rqs(q);
	blk_mq_sched_tags_teardown(q, flags);

	q->elevator = NULL;
	return ret;
}

/*
 * Called from either blk_queue_cleanup or elevator_switch; the tag set
 * must still be around for freeing the requests.
 */
void blk_mq_sched_free_rqs(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	if (blk_mq_is_shared_tags(q->tag_set->flags)) {
		blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
				BLK_MQ_NO_HCTX_IDX);
	} else {
		queue_for_each_hw_ctx(q, hctx, i) {
			if (hctx->sched_tags)
				blk_mq_free_rqs(q->tag_set,
						hctx->sched_tags, i);
		}
	}
}

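/*
 * Detach the elevator from @q: tear down per-hctx scheduler data, the
 * debugfs entries, the elevator instance itself, and the scheduler tag
 * maps.
 */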
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;
	unsigned int flags = 0;

	queue_for_each_hw_ctx(q, hctx, i) {
		mutex_lock(&q->debugfs_mutex);
		blk_mq_debugfs_unregister_sched_hctx(hctx);
		mutex_unlock(&q->debugfs_mutex);

		if (e->type->ops.exit_hctx && hctx->sched_data) {
			e->type->ops.exit_hctx(hctx, i);
			hctx->sched_data = NULL;
		}
		flags = hctx->flags;
	}

	mutex_lock(&q->debugfs_mutex);
	blk_mq_debugfs_unregister_sched(q);
	mutex_unlock(&q->debugfs_mutex);

	if (e->type->ops.exit_sched)
		e->type->ops.exit_sched(e);
	blk_mq_sched_tags_teardown(q, flags);
	q->elevator = NULL;
}