0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * The Kyber I/O scheduler. Controls latency by throttling queue depths using
0004  * scalable techniques.
0005  *
0006  * Copyright (C) 2017 Facebook
0007  */
0008 
0009 #include <linux/kernel.h>
0010 #include <linux/blkdev.h>
0011 #include <linux/blk-mq.h>
0012 #include <linux/module.h>
0013 #include <linux/sbitmap.h>
0014 
0015 #include <trace/events/block.h>
0016 
0017 #include "elevator.h"
0018 #include "blk.h"
0019 #include "blk-mq.h"
0020 #include "blk-mq-debugfs.h"
0021 #include "blk-mq-sched.h"
0022 #include "blk-mq-tag.h"
0023 
0024 #define CREATE_TRACE_POINTS
0025 #include <trace/events/kyber.h>
0026 
0027 /*
0028  * Scheduling domains: the device is divided into multiple domains based on the
0029  * request type.
0030  */
0031 enum {
0032     KYBER_READ,
0033     KYBER_WRITE,
0034     KYBER_DISCARD,
0035     KYBER_OTHER,
0036     KYBER_NUM_DOMAINS,
0037 };
0038 
0039 static const char *kyber_domain_names[] = {
0040     [KYBER_READ] = "READ",
0041     [KYBER_WRITE] = "WRITE",
0042     [KYBER_DISCARD] = "DISCARD",
0043     [KYBER_OTHER] = "OTHER",
0044 };
0045 
0046 enum {
0047     /*
0048      * In order to prevent starvation of synchronous requests by a flood of
0049      * asynchronous requests, we reserve 25% of requests for synchronous
0050      * operations.
0051      */
0052     KYBER_ASYNC_PERCENT = 75,
0053 };
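/*
 * Illustrative arithmetic, not part of the upstream source: kyber_depth_updated()
 * below converts this percentage into a per-word shallow depth. Assuming a
 * scheduler tag bitmap word shift of 6 (64 tags per word), async_depth =
 * 64 * 75 / 100 = 48, so asynchronous allocations are limited to a shallow
 * depth of 48 of the 64 tags in each sbitmap word, leaving headroom for
 * synchronous I/O.
 */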
0054 
0055 /*
0056  * Maximum device-wide depth for each scheduling domain.
0057  *
0058  * Even for fast devices with lots of tags like NVMe, you can saturate the
0059  * device with only a fraction of the maximum possible queue depth. So, we cap
0060  * these to a reasonable value.
0061  */
0062 static const unsigned int kyber_depth[] = {
0063     [KYBER_READ] = 256,
0064     [KYBER_WRITE] = 128,
0065     [KYBER_DISCARD] = 64,
0066     [KYBER_OTHER] = 16,
0067 };
0068 
0069 /*
0070  * Default latency targets for each scheduling domain.
0071  */
0072 static const u64 kyber_latency_targets[] = {
0073     [KYBER_READ] = 2ULL * NSEC_PER_MSEC,
0074     [KYBER_WRITE] = 10ULL * NSEC_PER_MSEC,
0075     [KYBER_DISCARD] = 5ULL * NSEC_PER_SEC,
0076 };
0077 
0078 /*
0079  * Batch size (number of requests we'll dispatch in a row) for each scheduling
0080  * domain.
0081  */
0082 static const unsigned int kyber_batch_size[] = {
0083     [KYBER_READ] = 16,
0084     [KYBER_WRITE] = 8,
0085     [KYBER_DISCARD] = 1,
0086     [KYBER_OTHER] = 1,
0087 };
0088 
0089 /*
0090  * Request latencies are recorded in a histogram with buckets defined relative
0091  * to the target latency:
0092  *
0093  * <= 1/4 * target latency
0094  * <= 1/2 * target latency
0095  * <= 3/4 * target latency
0096  * <= target latency
0097  * <= 1 1/4 * target latency
0098  * <= 1 1/2 * target latency
0099  * <= 1 3/4 * target latency
0100  * > 1 3/4 * target latency
0101  */
0102 enum {
0103     /*
0104      * The width of the latency histogram buckets is
0105      * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
0106      */
0107     KYBER_LATENCY_SHIFT = 2,
0108     /*
0109      * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
0110      * thus, "good".
0111      */
0112     KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
0113     /* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
0114     KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
0115 };
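/*
 * Worked example (illustrative, assuming the default 2 ms read target): the
 * bucket width is target >> KYBER_LATENCY_SHIFT = 2 ms / 4 = 500 us.
 * add_latency_sample() below computes bucket = min((latency - 1) / 500 us, 7),
 * so a 1.2 ms read completion lands in bucket 2 ("<= 3/4 * target latency"),
 * while a 5 ms completion saturates at bucket 7 ("> 1 3/4 * target latency").
 */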
0116 
0117 /*
0118  * We measure both the total latency and the I/O latency (i.e., latency after
0119  * submitting to the device).
0120  */
0121 enum {
0122     KYBER_TOTAL_LATENCY,
0123     KYBER_IO_LATENCY,
0124 };
0125 
0126 static const char *kyber_latency_type_names[] = {
0127     [KYBER_TOTAL_LATENCY] = "total",
0128     [KYBER_IO_LATENCY] = "I/O",
0129 };
0130 
0131 /*
0132  * Per-cpu latency histograms: total latency and I/O latency for each scheduling
0133  * domain except for KYBER_OTHER.
0134  */
0135 struct kyber_cpu_latency {
0136     atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
0137 };
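/*
 * Sizing note (illustrative): with the three latency-tracked domains
 * (KYBER_OTHER == 3) and KYBER_LATENCY_BUCKETS == 8, each CPU keeps
 * 3 domains * 2 latency types * 8 buckets = 48 atomic counters.
 */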
0138 
0139 /*
0140  * There is an identical mapping between ctx & hctx and between kcq & khd;
0141  * we use request->mq_ctx->index_hw to index the kcq in khd.
0142  */
0143 struct kyber_ctx_queue {
0144     /*
0145      * Used to ensure that operations on rq_list and kcq_map are atomic.
0146      * Also protects the requests on rq_list during merging.
0147      */
0148     spinlock_t lock;
0149     struct list_head rq_list[KYBER_NUM_DOMAINS];
0150 } ____cacheline_aligned_in_smp;
0151 
0152 struct kyber_queue_data {
0153     struct request_queue *q;
0154     dev_t dev;
0155 
0156     /*
0157      * Each scheduling domain has a limited number of in-flight requests
0158      * device-wide, enforced by these tokens.
0159      */
0160     struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
0161 
0162     /*
0163      * Async request percentage, converted to per-word depth for
0164      * sbitmap_get_shallow().
0165      */
0166     unsigned int async_depth;
0167 
0168     struct kyber_cpu_latency __percpu *cpu_latency;
0169 
0170     /* Timer for stats aggregation and adjusting domain tokens. */
0171     struct timer_list timer;
0172 
0173     unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
0174 
0175     unsigned long latency_timeout[KYBER_OTHER];
0176 
0177     int domain_p99[KYBER_OTHER];
0178 
0179     /* Target latencies in nanoseconds. */
0180     u64 latency_targets[KYBER_OTHER];
0181 };
0182 
0183 struct kyber_hctx_data {
0184     spinlock_t lock;
0185     struct list_head rqs[KYBER_NUM_DOMAINS];
0186     unsigned int cur_domain;
0187     unsigned int batching;
0188     struct kyber_ctx_queue *kcqs;
0189     struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
0190     struct sbq_wait domain_wait[KYBER_NUM_DOMAINS];
0191     struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
0192     atomic_t wait_index[KYBER_NUM_DOMAINS];
0193 };
0194 
0195 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
0196                  void *key);
0197 
0198 static unsigned int kyber_sched_domain(blk_opf_t opf)
0199 {
0200     switch (opf & REQ_OP_MASK) {
0201     case REQ_OP_READ:
0202         return KYBER_READ;
0203     case REQ_OP_WRITE:
0204         return KYBER_WRITE;
0205     case REQ_OP_DISCARD:
0206         return KYBER_DISCARD;
0207     default:
0208         return KYBER_OTHER;
0209     }
0210 }
0211 
0212 static void flush_latency_buckets(struct kyber_queue_data *kqd,
0213                   struct kyber_cpu_latency *cpu_latency,
0214                   unsigned int sched_domain, unsigned int type)
0215 {
0216     unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
0217     atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type];
0218     unsigned int bucket;
0219 
0220     for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
0221         buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0);
0222 }
0223 
0224 /*
0225  * Calculate the histogram bucket with the given percentile rank, or -1 if there
0226  * aren't enough samples yet.
0227  */
0228 static int calculate_percentile(struct kyber_queue_data *kqd,
0229                 unsigned int sched_domain, unsigned int type,
0230                 unsigned int percentile)
0231 {
0232     unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
0233     unsigned int bucket, samples = 0, percentile_samples;
0234 
0235     for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
0236         samples += buckets[bucket];
0237 
0238     if (!samples)
0239         return -1;
0240 
0241     /*
0242      * We do the calculation once we have 500 samples or one second has passed
0243      * since the first sample was recorded, whichever comes first.
0244      */
0245     if (!kqd->latency_timeout[sched_domain])
0246         kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL);
0247     if (samples < 500 &&
0248         time_is_after_jiffies(kqd->latency_timeout[sched_domain])) {
0249         return -1;
0250     }
0251     kqd->latency_timeout[sched_domain] = 0;
0252 
0253     percentile_samples = DIV_ROUND_UP(samples * percentile, 100);
0254     for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
0255         if (buckets[bucket] >= percentile_samples)
0256             break;
0257         percentile_samples -= buckets[bucket];
0258     }
0259     memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));
0260 
0261     trace_kyber_latency(kqd->dev, kyber_domain_names[sched_domain],
0262                 kyber_latency_type_names[type], percentile,
0263                 bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples);
0264 
0265     return bucket;
0266 }
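/*
 * Worked example (hypothetical counts): suppose a domain's histogram holds
 * {100, 200, 150, 100, 30, 10, 5, 5} samples across buckets 0-7, 600 in total.
 * For the p99, percentile_samples = DIV_ROUND_UP(600 * 99, 100) = 594; the walk
 * consumes 100, 200, 150, 100, 30 and 10 samples (594 -> 4 remaining) and stops
 * at bucket 6, since buckets[6] = 5 >= 4. A p99 in bucket 6 is at or above
 * KYBER_GOOD_BUCKETS, i.e. beyond the latency target.
 */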
0267 
0268 static void kyber_resize_domain(struct kyber_queue_data *kqd,
0269                 unsigned int sched_domain, unsigned int depth)
0270 {
0271     depth = clamp(depth, 1U, kyber_depth[sched_domain]);
0272     if (depth != kqd->domain_tokens[sched_domain].sb.depth) {
0273         sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
0274         trace_kyber_adjust(kqd->dev, kyber_domain_names[sched_domain],
0275                    depth);
0276     }
0277 }
0278 
0279 static void kyber_timer_fn(struct timer_list *t)
0280 {
0281     struct kyber_queue_data *kqd = from_timer(kqd, t, timer);
0282     unsigned int sched_domain;
0283     int cpu;
0284     bool bad = false;
0285 
0286     /* Sum all of the per-cpu latency histograms. */
0287     for_each_online_cpu(cpu) {
0288         struct kyber_cpu_latency *cpu_latency;
0289 
0290         cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu);
0291         for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
0292             flush_latency_buckets(kqd, cpu_latency, sched_domain,
0293                           KYBER_TOTAL_LATENCY);
0294             flush_latency_buckets(kqd, cpu_latency, sched_domain,
0295                           KYBER_IO_LATENCY);
0296         }
0297     }
0298 
0299     /*
0300      * Check if any domains have a high I/O latency, which might indicate
0301      * congestion in the device. Note that we use the p90; we don't want to
0302      * be too sensitive to outliers here.
0303      */
0304     for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
0305         int p90;
0306 
0307         p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY,
0308                        90);
0309         if (p90 >= KYBER_GOOD_BUCKETS)
0310             bad = true;
0311     }
0312 
0313     /*
0314      * Adjust the scheduling domain depths. If we determined that there was
0315      * congestion, we throttle all domains with good latencies. Either way,
0316      * we ease up on throttling domains with bad latencies.
0317      */
0318     for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
0319         unsigned int orig_depth, depth;
0320         int p99;
0321 
0322         p99 = calculate_percentile(kqd, sched_domain,
0323                        KYBER_TOTAL_LATENCY, 99);
0324         /*
0325          * This is kind of subtle: different domains will not
0326          * necessarily have enough samples to calculate the latency
0327          * percentiles during the same window, so we have to remember
0328          * the p99 for the next time we observe congestion; once we do,
0329          * we don't want to throttle again until we get more data, so we
0330          * reset it to -1.
0331          */
0332         if (bad) {
0333             if (p99 < 0)
0334                 p99 = kqd->domain_p99[sched_domain];
0335             kqd->domain_p99[sched_domain] = -1;
0336         } else if (p99 >= 0) {
0337             kqd->domain_p99[sched_domain] = p99;
0338         }
0339         if (p99 < 0)
0340             continue;
0341 
0342         /*
0343          * If this domain has bad latency, throttle less. Otherwise,
0344          * throttle more iff we determined that there is congestion.
0345          *
0346          * The new depth is scaled linearly with the p99 latency vs the
0347          * latency target. E.g., if the p99 is 3/4 of the target, then
0348          * we throttle down to 3/4 of the current depth, and if the p99
0349          * is 2x the target, then we double the depth.
0350          */
0351         if (bad || p99 >= KYBER_GOOD_BUCKETS) {
0352             orig_depth = kqd->domain_tokens[sched_domain].sb.depth;
0353             depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
0354             kyber_resize_domain(kqd, sched_domain, depth);
0355         }
0356     }
0357 }
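/*
 * Worked example of the resize math above (illustrative numbers): the new depth
 * is orig_depth * (p99 + 1) >> KYBER_LATENCY_SHIFT. With 256 read tokens and a
 * p99 in bucket 2 ("<= 3/4 * target latency") during congestion, the read
 * domain is throttled to 256 * 3 / 4 = 192 tokens; with a p99 in bucket 5
 * ("<= 1 1/2 * target latency"), the computed depth 256 * 6 / 4 = 384 is
 * clamped back to kyber_depth[KYBER_READ] = 256 by kyber_resize_domain().
 */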
0358 
0359 static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
0360 {
0361     struct kyber_queue_data *kqd;
0362     int ret = -ENOMEM;
0363     int i;
0364 
0365     kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
0366     if (!kqd)
0367         goto err;
0368 
0369     kqd->q = q;
0370     kqd->dev = disk_devt(q->disk);
0371 
0372     kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
0373                         GFP_KERNEL | __GFP_ZERO);
0374     if (!kqd->cpu_latency)
0375         goto err_kqd;
0376 
0377     timer_setup(&kqd->timer, kyber_timer_fn, 0);
0378 
0379     for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
0380         WARN_ON(!kyber_depth[i]);
0381         WARN_ON(!kyber_batch_size[i]);
0382         ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
0383                           kyber_depth[i], -1, false,
0384                           GFP_KERNEL, q->node);
0385         if (ret) {
0386             while (--i >= 0)
0387                 sbitmap_queue_free(&kqd->domain_tokens[i]);
0388             goto err_buckets;
0389         }
0390     }
0391 
0392     for (i = 0; i < KYBER_OTHER; i++) {
0393         kqd->domain_p99[i] = -1;
0394         kqd->latency_targets[i] = kyber_latency_targets[i];
0395     }
0396 
0397     return kqd;
0398 
0399 err_buckets:
0400     free_percpu(kqd->cpu_latency);
0401 err_kqd:
0402     kfree(kqd);
0403 err:
0404     return ERR_PTR(ret);
0405 }
0406 
0407 static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
0408 {
0409     struct kyber_queue_data *kqd;
0410     struct elevator_queue *eq;
0411 
0412     eq = elevator_alloc(q, e);
0413     if (!eq)
0414         return -ENOMEM;
0415 
0416     kqd = kyber_queue_data_alloc(q);
0417     if (IS_ERR(kqd)) {
0418         kobject_put(&eq->kobj);
0419         return PTR_ERR(kqd);
0420     }
0421 
0422     blk_stat_enable_accounting(q);
0423 
0424     blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
0425 
0426     eq->elevator_data = kqd;
0427     q->elevator = eq;
0428 
0429     return 0;
0430 }
0431 
0432 static void kyber_exit_sched(struct elevator_queue *e)
0433 {
0434     struct kyber_queue_data *kqd = e->elevator_data;
0435     int i;
0436 
0437     del_timer_sync(&kqd->timer);
0438     blk_stat_disable_accounting(kqd->q);
0439 
0440     for (i = 0; i < KYBER_NUM_DOMAINS; i++)
0441         sbitmap_queue_free(&kqd->domain_tokens[i]);
0442     free_percpu(kqd->cpu_latency);
0443     kfree(kqd);
0444 }
0445 
0446 static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
0447 {
0448     unsigned int i;
0449 
0450     spin_lock_init(&kcq->lock);
0451     for (i = 0; i < KYBER_NUM_DOMAINS; i++)
0452         INIT_LIST_HEAD(&kcq->rq_list[i]);
0453 }
0454 
0455 static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
0456 {
0457     struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
0458     struct blk_mq_tags *tags = hctx->sched_tags;
0459     unsigned int shift = tags->bitmap_tags.sb.shift;
0460 
0461     kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
0462 
0463     sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
0464 }
0465 
0466 static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
0467 {
0468     struct kyber_hctx_data *khd;
0469     int i;
0470 
0471     khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
0472     if (!khd)
0473         return -ENOMEM;
0474 
0475     khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
0476                        sizeof(struct kyber_ctx_queue),
0477                        GFP_KERNEL, hctx->numa_node);
0478     if (!khd->kcqs)
0479         goto err_khd;
0480 
0481     for (i = 0; i < hctx->nr_ctx; i++)
0482         kyber_ctx_queue_init(&khd->kcqs[i]);
0483 
0484     for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
0485         if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
0486                       ilog2(8), GFP_KERNEL, hctx->numa_node,
0487                       false, false)) {
0488             while (--i >= 0)
0489                 sbitmap_free(&khd->kcq_map[i]);
0490             goto err_kcqs;
0491         }
0492     }
0493 
0494     spin_lock_init(&khd->lock);
0495 
0496     for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
0497         INIT_LIST_HEAD(&khd->rqs[i]);
0498         khd->domain_wait[i].sbq = NULL;
0499         init_waitqueue_func_entry(&khd->domain_wait[i].wait,
0500                       kyber_domain_wake);
0501         khd->domain_wait[i].wait.private = hctx;
0502         INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry);
0503         atomic_set(&khd->wait_index[i], 0);
0504     }
0505 
0506     khd->cur_domain = 0;
0507     khd->batching = 0;
0508 
0509     hctx->sched_data = khd;
0510     kyber_depth_updated(hctx);
0511 
0512     return 0;
0513 
0514 err_kcqs:
0515     kfree(khd->kcqs);
0516 err_khd:
0517     kfree(khd);
0518     return -ENOMEM;
0519 }
0520 
0521 static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
0522 {
0523     struct kyber_hctx_data *khd = hctx->sched_data;
0524     int i;
0525 
0526     for (i = 0; i < KYBER_NUM_DOMAINS; i++)
0527         sbitmap_free(&khd->kcq_map[i]);
0528     kfree(khd->kcqs);
0529     kfree(hctx->sched_data);
0530 }
0531 
0532 static int rq_get_domain_token(struct request *rq)
0533 {
0534     return (long)rq->elv.priv[0];
0535 }
0536 
0537 static void rq_set_domain_token(struct request *rq, int token)
0538 {
0539     rq->elv.priv[0] = (void *)(long)token;
0540 }
0541 
0542 static void rq_clear_domain_token(struct kyber_queue_data *kqd,
0543                   struct request *rq)
0544 {
0545     unsigned int sched_domain;
0546     int nr;
0547 
0548     nr = rq_get_domain_token(rq);
0549     if (nr != -1) {
0550         sched_domain = kyber_sched_domain(rq->cmd_flags);
0551         sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
0552                     rq->mq_ctx->cpu);
0553     }
0554 }
0555 
0556 static void kyber_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
0557 {
0558     /*
0559      * We use the scheduler tags as per-hardware queue queueing tokens.
0560      * Async requests can be limited at this stage.
0561      */
0562     if (!op_is_sync(opf)) {
0563         struct kyber_queue_data *kqd = data->q->elevator->elevator_data;
0564 
0565         data->shallow_depth = kqd->async_depth;
0566     }
0567 }
0568 
0569 static bool kyber_bio_merge(struct request_queue *q, struct bio *bio,
0570         unsigned int nr_segs)
0571 {
0572     struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
0573     struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
0574     struct kyber_hctx_data *khd = hctx->sched_data;
0575     struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
0576     unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
0577     struct list_head *rq_list = &kcq->rq_list[sched_domain];
0578     bool merged;
0579 
0580     spin_lock(&kcq->lock);
0581     merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs);
0582     spin_unlock(&kcq->lock);
0583 
0584     return merged;
0585 }
0586 
0587 static void kyber_prepare_request(struct request *rq)
0588 {
0589     rq_set_domain_token(rq, -1);
0590 }
0591 
0592 static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
0593                   struct list_head *rq_list, bool at_head)
0594 {
0595     struct kyber_hctx_data *khd = hctx->sched_data;
0596     struct request *rq, *next;
0597 
0598     list_for_each_entry_safe(rq, next, rq_list, queuelist) {
0599         unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
0600         struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]];
0601         struct list_head *head = &kcq->rq_list[sched_domain];
0602 
0603         spin_lock(&kcq->lock);
0604         trace_block_rq_insert(rq);
0605         if (at_head)
0606             list_move(&rq->queuelist, head);
0607         else
0608             list_move_tail(&rq->queuelist, head);
0609         sbitmap_set_bit(&khd->kcq_map[sched_domain],
0610                 rq->mq_ctx->index_hw[hctx->type]);
0611         spin_unlock(&kcq->lock);
0612     }
0613 }
0614 
0615 static void kyber_finish_request(struct request *rq)
0616 {
0617     struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
0618 
0619     rq_clear_domain_token(kqd, rq);
0620 }
0621 
0622 static void add_latency_sample(struct kyber_cpu_latency *cpu_latency,
0623                    unsigned int sched_domain, unsigned int type,
0624                    u64 target, u64 latency)
0625 {
0626     unsigned int bucket;
0627     u64 divisor;
0628 
0629     if (latency > 0) {
0630         divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1);
0631         bucket = min_t(unsigned int, div64_u64(latency - 1, divisor),
0632                    KYBER_LATENCY_BUCKETS - 1);
0633     } else {
0634         bucket = 0;
0635     }
0636 
0637     atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]);
0638 }
0639 
0640 static void kyber_completed_request(struct request *rq, u64 now)
0641 {
0642     struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
0643     struct kyber_cpu_latency *cpu_latency;
0644     unsigned int sched_domain;
0645     u64 target;
0646 
0647     sched_domain = kyber_sched_domain(rq->cmd_flags);
0648     if (sched_domain == KYBER_OTHER)
0649         return;
0650 
0651     cpu_latency = get_cpu_ptr(kqd->cpu_latency);
0652     target = kqd->latency_targets[sched_domain];
0653     add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY,
0654                target, now - rq->start_time_ns);
0655     add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target,
0656                now - rq->io_start_time_ns);
0657     put_cpu_ptr(kqd->cpu_latency);
0658 
0659     timer_reduce(&kqd->timer, jiffies + HZ / 10);
0660 }
0661 
0662 struct flush_kcq_data {
0663     struct kyber_hctx_data *khd;
0664     unsigned int sched_domain;
0665     struct list_head *list;
0666 };
0667 
0668 static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
0669 {
0670     struct flush_kcq_data *flush_data = data;
0671     struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];
0672 
0673     spin_lock(&kcq->lock);
0674     list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
0675                   flush_data->list);
0676     sbitmap_clear_bit(sb, bitnr);
0677     spin_unlock(&kcq->lock);
0678 
0679     return true;
0680 }
0681 
0682 static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
0683                   unsigned int sched_domain,
0684                   struct list_head *list)
0685 {
0686     struct flush_kcq_data data = {
0687         .khd = khd,
0688         .sched_domain = sched_domain,
0689         .list = list,
0690     };
0691 
0692     sbitmap_for_each_set(&khd->kcq_map[sched_domain],
0693                  flush_busy_kcq, &data);
0694 }
0695 
0696 static int kyber_domain_wake(wait_queue_entry_t *wqe, unsigned mode, int flags,
0697                  void *key)
0698 {
0699     struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private);
0700     struct sbq_wait *wait = container_of(wqe, struct sbq_wait, wait);
0701 
0702     sbitmap_del_wait_queue(wait);
0703     blk_mq_run_hw_queue(hctx, true);
0704     return 1;
0705 }
0706 
0707 static int kyber_get_domain_token(struct kyber_queue_data *kqd,
0708                   struct kyber_hctx_data *khd,
0709                   struct blk_mq_hw_ctx *hctx)
0710 {
0711     unsigned int sched_domain = khd->cur_domain;
0712     struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
0713     struct sbq_wait *wait = &khd->domain_wait[sched_domain];
0714     struct sbq_wait_state *ws;
0715     int nr;
0716 
0717     nr = __sbitmap_queue_get(domain_tokens);
0718 
0719     /*
0720      * If we failed to get a domain token, make sure the hardware queue is
0721      * run when one becomes available. Note that this is serialized on
0722      * khd->lock, but we still need to be careful about the waker.
0723      */
0724     if (nr < 0 && list_empty_careful(&wait->wait.entry)) {
0725         ws = sbq_wait_ptr(domain_tokens,
0726                   &khd->wait_index[sched_domain]);
0727         khd->domain_ws[sched_domain] = ws;
0728         sbitmap_add_wait_queue(domain_tokens, ws, wait);
0729 
0730         /*
0731          * Try again in case a token was freed before we got on the wait
0732          * queue.
0733          */
0734         nr = __sbitmap_queue_get(domain_tokens);
0735     }
0736 
0737     /*
0738      * If we got a token while we were on the wait queue, remove ourselves
0739      * from the wait queue to ensure that all wake ups make forward
0740      * progress. It's possible that the waker already deleted the entry
0741      * between the !list_empty_careful() check and us grabbing the lock, but
0742      * list_del_init() is okay with that.
0743      */
0744     if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) {
0745         ws = khd->domain_ws[sched_domain];
0746         spin_lock_irq(&ws->wait.lock);
0747         sbitmap_del_wait_queue(wait);
0748         spin_unlock_irq(&ws->wait.lock);
0749     }
0750 
0751     return nr;
0752 }
0753 
0754 static struct request *
0755 kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
0756               struct kyber_hctx_data *khd,
0757               struct blk_mq_hw_ctx *hctx)
0758 {
0759     struct list_head *rqs;
0760     struct request *rq;
0761     int nr;
0762 
0763     rqs = &khd->rqs[khd->cur_domain];
0764 
0765     /*
0766      * If we already have a flushed request, then we just need to get a
0767      * token for it. Otherwise, if there are pending requests in the kcqs,
0768      * flush the kcqs, but only if we can get a token. If not, we should
0769      * leave the requests in the kcqs so that they can be merged. Note that
0770      * khd->lock serializes the flushes, so if we observed any bit set in
0771      * the kcq_map, we will always get a request.
0772      */
0773     rq = list_first_entry_or_null(rqs, struct request, queuelist);
0774     if (rq) {
0775         nr = kyber_get_domain_token(kqd, khd, hctx);
0776         if (nr >= 0) {
0777             khd->batching++;
0778             rq_set_domain_token(rq, nr);
0779             list_del_init(&rq->queuelist);
0780             return rq;
0781         } else {
0782             trace_kyber_throttled(kqd->dev,
0783                           kyber_domain_names[khd->cur_domain]);
0784         }
0785     } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
0786         nr = kyber_get_domain_token(kqd, khd, hctx);
0787         if (nr >= 0) {
0788             kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
0789             rq = list_first_entry(rqs, struct request, queuelist);
0790             khd->batching++;
0791             rq_set_domain_token(rq, nr);
0792             list_del_init(&rq->queuelist);
0793             return rq;
0794         } else {
0795             trace_kyber_throttled(kqd->dev,
0796                           kyber_domain_names[khd->cur_domain]);
0797         }
0798     }
0799 
0800     /* There were either no pending requests or no tokens. */
0801     return NULL;
0802 }
0803 
0804 static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
0805 {
0806     struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
0807     struct kyber_hctx_data *khd = hctx->sched_data;
0808     struct request *rq;
0809     int i;
0810 
0811     spin_lock(&khd->lock);
0812 
0813     /*
0814      * First, if we are still entitled to batch, try to dispatch a request
0815      * from the batch.
0816      */
0817     if (khd->batching < kyber_batch_size[khd->cur_domain]) {
0818         rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
0819         if (rq)
0820             goto out;
0821     }
0822 
0823     /*
0824      * Either,
0825      * 1. We were no longer entitled to a batch.
0826      * 2. The domain we were batching didn't have any requests.
0827      * 3. The domain we were batching was out of tokens.
0828      *
0829      * Start another batch. Note that this wraps back around to the original
0830      * domain if no other domains have requests or tokens.
0831      */
0832     khd->batching = 0;
0833     for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
0834         if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
0835             khd->cur_domain = 0;
0836         else
0837             khd->cur_domain++;
0838 
0839         rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
0840         if (rq)
0841             goto out;
0842     }
0843 
0844     rq = NULL;
0845 out:
0846     spin_unlock(&khd->lock);
0847     return rq;
0848 }
0849 
0850 static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
0851 {
0852     struct kyber_hctx_data *khd = hctx->sched_data;
0853     int i;
0854 
0855     for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
0856         if (!list_empty_careful(&khd->rqs[i]) ||
0857             sbitmap_any_bit_set(&khd->kcq_map[i]))
0858             return true;
0859     }
0860 
0861     return false;
0862 }
0863 
0864 #define KYBER_LAT_SHOW_STORE(domain, name)              \
0865 static ssize_t kyber_##name##_lat_show(struct elevator_queue *e,    \
0866                        char *page)          \
0867 {                                   \
0868     struct kyber_queue_data *kqd = e->elevator_data;        \
0869                                     \
0870     return sprintf(page, "%llu\n", kqd->latency_targets[domain]);   \
0871 }                                   \
0872                                     \
0873 static ssize_t kyber_##name##_lat_store(struct elevator_queue *e,   \
0874                     const char *page, size_t count) \
0875 {                                   \
0876     struct kyber_queue_data *kqd = e->elevator_data;        \
0877     unsigned long long nsec;                    \
0878     int ret;                            \
0879                                     \
0880     ret = kstrtoull(page, 10, &nsec);               \
0881     if (ret)                            \
0882         return ret;                     \
0883                                     \
0884     kqd->latency_targets[domain] = nsec;                \
0885                                     \
0886     return count;                           \
0887 }
0888 KYBER_LAT_SHOW_STORE(KYBER_READ, read);
0889 KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
0890 #undef KYBER_LAT_SHOW_STORE
0891 
0892 #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
0893 static struct elv_fs_entry kyber_sched_attrs[] = {
0894     KYBER_LAT_ATTR(read),
0895     KYBER_LAT_ATTR(write),
0896     __ATTR_NULL
0897 };
0898 #undef KYBER_LAT_ATTR
0899 
0900 #ifdef CONFIG_BLK_DEBUG_FS
0901 #define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name)            \
0902 static int kyber_##name##_tokens_show(void *data, struct seq_file *m)   \
0903 {                                   \
0904     struct request_queue *q = data;                 \
0905     struct kyber_queue_data *kqd = q->elevator->elevator_data;  \
0906                                     \
0907     sbitmap_queue_show(&kqd->domain_tokens[domain], m);     \
0908     return 0;                           \
0909 }                                   \
0910                                     \
0911 static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos)  \
0912     __acquires(&khd->lock)                      \
0913 {                                   \
0914     struct blk_mq_hw_ctx *hctx = m->private;            \
0915     struct kyber_hctx_data *khd = hctx->sched_data;         \
0916                                     \
0917     spin_lock(&khd->lock);                      \
0918     return seq_list_start(&khd->rqs[domain], *pos);         \
0919 }                                   \
0920                                     \
0921 static void *kyber_##name##_rqs_next(struct seq_file *m, void *v,   \
0922                      loff_t *pos)           \
0923 {                                   \
0924     struct blk_mq_hw_ctx *hctx = m->private;            \
0925     struct kyber_hctx_data *khd = hctx->sched_data;         \
0926                                     \
0927     return seq_list_next(v, &khd->rqs[domain], pos);        \
0928 }                                   \
0929                                     \
0930 static void kyber_##name##_rqs_stop(struct seq_file *m, void *v)    \
0931     __releases(&khd->lock)                      \
0932 {                                   \
0933     struct blk_mq_hw_ctx *hctx = m->private;            \
0934     struct kyber_hctx_data *khd = hctx->sched_data;         \
0935                                     \
0936     spin_unlock(&khd->lock);                    \
0937 }                                   \
0938                                     \
0939 static const struct seq_operations kyber_##name##_rqs_seq_ops = {   \
0940     .start  = kyber_##name##_rqs_start,             \
0941     .next   = kyber_##name##_rqs_next,              \
0942     .stop   = kyber_##name##_rqs_stop,              \
0943     .show   = blk_mq_debugfs_rq_show,               \
0944 };                                  \
0945                                     \
0946 static int kyber_##name##_waiting_show(void *data, struct seq_file *m)  \
0947 {                                   \
0948     struct blk_mq_hw_ctx *hctx = data;              \
0949     struct kyber_hctx_data *khd = hctx->sched_data;         \
0950     wait_queue_entry_t *wait = &khd->domain_wait[domain].wait;  \
0951                                     \
0952     seq_printf(m, "%d\n", !list_empty_careful(&wait->entry));   \
0953     return 0;                           \
0954 }
0955 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
0956 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write)
0957 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
0958 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
0959 #undef KYBER_DEBUGFS_DOMAIN_ATTRS
0960 
0961 static int kyber_async_depth_show(void *data, struct seq_file *m)
0962 {
0963     struct request_queue *q = data;
0964     struct kyber_queue_data *kqd = q->elevator->elevator_data;
0965 
0966     seq_printf(m, "%u\n", kqd->async_depth);
0967     return 0;
0968 }
0969 
0970 static int kyber_cur_domain_show(void *data, struct seq_file *m)
0971 {
0972     struct blk_mq_hw_ctx *hctx = data;
0973     struct kyber_hctx_data *khd = hctx->sched_data;
0974 
0975     seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]);
0976     return 0;
0977 }
0978 
0979 static int kyber_batching_show(void *data, struct seq_file *m)
0980 {
0981     struct blk_mq_hw_ctx *hctx = data;
0982     struct kyber_hctx_data *khd = hctx->sched_data;
0983 
0984     seq_printf(m, "%u\n", khd->batching);
0985     return 0;
0986 }
0987 
0988 #define KYBER_QUEUE_DOMAIN_ATTRS(name)  \
0989     {#name "_tokens", 0400, kyber_##name##_tokens_show}
0990 static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
0991     KYBER_QUEUE_DOMAIN_ATTRS(read),
0992     KYBER_QUEUE_DOMAIN_ATTRS(write),
0993     KYBER_QUEUE_DOMAIN_ATTRS(discard),
0994     KYBER_QUEUE_DOMAIN_ATTRS(other),
0995     {"async_depth", 0400, kyber_async_depth_show},
0996     {},
0997 };
0998 #undef KYBER_QUEUE_DOMAIN_ATTRS
0999 
1000 #define KYBER_HCTX_DOMAIN_ATTRS(name)                   \
1001     {#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops},   \
1002     {#name "_waiting", 0400, kyber_##name##_waiting_show}
1003 static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
1004     KYBER_HCTX_DOMAIN_ATTRS(read),
1005     KYBER_HCTX_DOMAIN_ATTRS(write),
1006     KYBER_HCTX_DOMAIN_ATTRS(discard),
1007     KYBER_HCTX_DOMAIN_ATTRS(other),
1008     {"cur_domain", 0400, kyber_cur_domain_show},
1009     {"batching", 0400, kyber_batching_show},
1010     {},
1011 };
1012 #undef KYBER_HCTX_DOMAIN_ATTRS
1013 #endif
1014 
1015 static struct elevator_type kyber_sched = {
1016     .ops = {
1017         .init_sched = kyber_init_sched,
1018         .exit_sched = kyber_exit_sched,
1019         .init_hctx = kyber_init_hctx,
1020         .exit_hctx = kyber_exit_hctx,
1021         .limit_depth = kyber_limit_depth,
1022         .bio_merge = kyber_bio_merge,
1023         .prepare_request = kyber_prepare_request,
1024         .insert_requests = kyber_insert_requests,
1025         .finish_request = kyber_finish_request,
1026         .requeue_request = kyber_finish_request,
1027         .completed_request = kyber_completed_request,
1028         .dispatch_request = kyber_dispatch_request,
1029         .has_work = kyber_has_work,
1030         .depth_updated = kyber_depth_updated,
1031     },
1032 #ifdef CONFIG_BLK_DEBUG_FS
1033     .queue_debugfs_attrs = kyber_queue_debugfs_attrs,
1034     .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
1035 #endif
1036     .elevator_attrs = kyber_sched_attrs,
1037     .elevator_name = "kyber",
1038     .elevator_owner = THIS_MODULE,
1039 };
1040 
1041 static int __init kyber_init(void)
1042 {
1043     return elv_register(&kyber_sched);
1044 }
1045 
1046 static void __exit kyber_exit(void)
1047 {
1048     elv_unregister(&kyber_sched);
1049 }
1050 
1051 module_init(kyber_init);
1052 module_exit(kyber_exit);
1053 
1054 MODULE_AUTHOR("Omar Sandoval");
1055 MODULE_LICENSE("GPL");
1056 MODULE_DESCRIPTION("Kyber I/O scheduler");