0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Block rq-qos base io controller
0004  *
0005  * This works similar to wbt with a few exceptions
0006  *
0007  * - It's bio based, so the latency covers the whole block layer in addition to
0008  *   the actual io.
0009  * - We will throttle all IO that comes in here if we need to.
0010  * - We use the mean latency over the 100ms window.  This is because writes can
0011  *   be particularly fast, which could give us a false sense of the impact of
0012  *   other workloads on our protected workload.
0013  * - By default there's no throttling, we set the queue_depth to UINT_MAX so
0014  *   that we can have as many outstanding bio's as we're allowed to.  Only at
0015  *   throttle time do we pay attention to the actual queue depth.
0016  *
0017  * The hierarchy works like the cpu controller does, we track the latency at
0018  * every configured node, and each configured node has its own independent
0019  * queue depth.  This means that we only care about our latency targets at the
0020  * peer level.  Some group at the bottom of the hierarchy isn't going to affect
0021  * a group at the end of some other path if we're only configured at leaf level.
0022  *
0023  * Consider the following
0024  *
0025  *                   root blkg
0026  *             /                     \
0027  *        fast (target=5ms)     slow (target=10ms)
0028  *         /     \                  /        \
0029  *       a        b          normal(15ms)   unloved
0030  *
0031  * "a" and "b" have no target, but their combined io under "fast" cannot exceed
0032  * an average latency of 5ms.  If it does then we will throttle the "slow"
0033  * group.  In the case of "normal", if it exceeds its 15ms target, we will
0034  * throttle "unloved", but nobody else.
0035  *
0036  * In this example "fast", "slow", and "normal" will be the only groups actually
0037  * accounting their io latencies.  We have to walk up the hierarchy to the root
0038  * on every submit and complete so we can do the appropriate stat recording and
0039  * adjust the queue depth of ourselves if needed.
0040  *
0041  * There are 2 ways we throttle IO.
0042  *
0043  * 1) Queue depth throttling.  As we throttle down we will adjust the maximum
0044  * number of IO's we're allowed to have in flight.  This starts at UINT_MAX down
0045  * to 1.  If the group is only ever submitting IO for itself then this is the
0046  * only way we throttle.
0047  *
0048  * 2) Induced delay throttling.  This is for the case that a group is generating
0049  * IO that has to be issued by the root cg to avoid priority inversion. So think
0050  * REQ_META or REQ_SWAP.  If we are already at qd == 1 and we're getting a lot
0051  * of work done for us on behalf of the root cg and are being asked to scale
0052  * down more, then we induce a latency at userspace return.  We accumulate the
0053  * total amount of time we need to be punished by doing
0054  *
0055  * total_time += min_lat_nsec - actual_io_completion
0056  *
0057  * and then at throttle time will do
0058  *
0059  * throttle_time = min(total_time, NSEC_PER_SEC)
0060  *
0061  * This induced delay will throttle back the activity that is generating the
0062  * root cg issued io's, whether that's some metadata intensive operation or the
0063  * group is using so much memory that it is pushing us into swap.
0064  *
0065  * Copyright (C) 2018 Josef Bacik
0066  */
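     /*
      * Editor's illustration of the induced delay math above (numbers are
      * hypothetical, not from the source): with min_lat_nsec = 5ms and a
      * root-issued metadata IO that completes in 2ms while this group is
      * already pinned at qd == 1 and being asked to scale down further, the
      * group accrues 5ms - 2ms = 3ms of owed delay.  If that accumulates to
      * 1.4s of total_time, the next return to userspace is delayed by
      * min(1.4s, NSEC_PER_SEC) = 1s, per the formulas above.
      */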
0067 #include <linux/kernel.h>
0068 #include <linux/blk_types.h>
0069 #include <linux/backing-dev.h>
0070 #include <linux/module.h>
0071 #include <linux/timer.h>
0072 #include <linux/memcontrol.h>
0073 #include <linux/sched/loadavg.h>
0074 #include <linux/sched/signal.h>
0075 #include <trace/events/block.h>
0076 #include <linux/blk-mq.h>
0077 #include "blk-rq-qos.h"
0078 #include "blk-stat.h"
0079 #include "blk-cgroup.h"
0080 #include "blk.h"
0081 
0082 #define DEFAULT_SCALE_COOKIE 1000000U
0083 
0084 static struct blkcg_policy blkcg_policy_iolatency;
0085 struct iolatency_grp;
0086 
0087 struct blk_iolatency {
0088     struct rq_qos rqos;
0089     struct timer_list timer;
0090 
0091     /*
0092      * ->enabled is the master enable switch gating the throttling logic and
0093      * inflight tracking. The number of cgroups which have iolat enabled is
0094      * tracked in ->enable_cnt, and ->enabled is flipped on/off accordingly
0095      * from ->enable_work with the request_queue frozen. For details, see
0096      * blkiolatency_enable_work_fn().
0097      */
0098     bool enabled;
0099     atomic_t enable_cnt;
0100     struct work_struct enable_work;
0101 };
0102 
0103 static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos)
0104 {
0105     return container_of(rqos, struct blk_iolatency, rqos);
0106 }
0107 
0108 struct child_latency_info {
0109     spinlock_t lock;
0110 
0111     /* Last time we adjusted the scale of everybody. */
0112     u64 last_scale_event;
0113 
0114     /* The latency that we missed. */
0115     u64 scale_lat;
0116 
0117     /* Total io's from all of our children for the last summation. */
0118     u64 nr_samples;
0119 
0120     /* The guy who actually changed the latency numbers. */
0121     struct iolatency_grp *scale_grp;
0122 
0123     /* Cookie to tell if we need to scale up or down. */
0124     atomic_t scale_cookie;
0125 };
0126 
0127 struct percentile_stats {
0128     u64 total;
0129     u64 missed;
0130 };
0131 
0132 struct latency_stat {
0133     union {
0134         struct percentile_stats ps;
0135         struct blk_rq_stat rqs;
0136     };
0137 };
0138 
0139 struct iolatency_grp {
0140     struct blkg_policy_data pd;
0141     struct latency_stat __percpu *stats;
0142     struct latency_stat cur_stat;
0143     struct blk_iolatency *blkiolat;
0144     struct rq_depth rq_depth;
0145     struct rq_wait rq_wait;
0146     atomic64_t window_start;
0147     atomic_t scale_cookie;
0148     u64 min_lat_nsec;
0149     u64 cur_win_nsec;
0150 
0151     /* total running average of our io latency. */
0152     u64 lat_avg;
0153 
0154     /* Our current number of IO's for the last summation. */
0155     u64 nr_samples;
0156 
0157     bool ssd;
0158     struct child_latency_info child_lat;
0159 };
0160 
0161 #define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC)
0162 #define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
0163 /*
0164  * These are the constants used to fake the fixed-point moving average
0165  * calculation just like load average.  The call to calc_load() folds
0166  * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg.  The sampling
0167  * window size is bucketed to try to approximately calculate average
0168  * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
0169  * elapse immediately.  Note, windows only elapse with IO activity.  Idle
0170  * periods extend the most recent window.
0171  */
0172 #define BLKIOLATENCY_NR_EXP_FACTORS 5
0173 #define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \
0174                       (BLKIOLATENCY_NR_EXP_FACTORS - 1))
0175 static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = {
0176     2045, // exp(1/600) - 600 samples
0177     2039, // exp(1/240) - 240 samples
0178     2031, // exp(1/120) - 120 samples
0179     2023, // exp(1/80)  - 80 samples
0180     2014, // exp(1/60)  - 60 samples
0181 };
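     /*
      * Editor's note (derivation, not in the original source): each factor
      * appears to be FIXED_1 * exp(-1/N) rounded to the nearest integer, e.g.
      * 2048 * exp(-1/600) ~= 2045 and 2048 * exp(-1/60) ~= 2014, so calc_load()
      * decays lat_avg at roughly a 1/N rate per elapsed window.
      */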
0182 
0183 static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd)
0184 {
0185     return pd ? container_of(pd, struct iolatency_grp, pd) : NULL;
0186 }
0187 
0188 static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg)
0189 {
0190     return pd_to_lat(blkg_to_pd(blkg, &blkcg_policy_iolatency));
0191 }
0192 
0193 static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
0194 {
0195     return pd_to_blkg(&iolat->pd);
0196 }
0197 
0198 static inline void latency_stat_init(struct iolatency_grp *iolat,
0199                      struct latency_stat *stat)
0200 {
0201     if (iolat->ssd) {
0202         stat->ps.total = 0;
0203         stat->ps.missed = 0;
0204     } else
0205         blk_rq_stat_init(&stat->rqs);
0206 }
0207 
0208 static inline void latency_stat_sum(struct iolatency_grp *iolat,
0209                     struct latency_stat *sum,
0210                     struct latency_stat *stat)
0211 {
0212     if (iolat->ssd) {
0213         sum->ps.total += stat->ps.total;
0214         sum->ps.missed += stat->ps.missed;
0215     } else
0216         blk_rq_stat_sum(&sum->rqs, &stat->rqs);
0217 }
0218 
0219 static inline void latency_stat_record_time(struct iolatency_grp *iolat,
0220                         u64 req_time)
0221 {
0222     struct latency_stat *stat = get_cpu_ptr(iolat->stats);
0223     if (iolat->ssd) {
0224         if (req_time >= iolat->min_lat_nsec)
0225             stat->ps.missed++;
0226         stat->ps.total++;
0227     } else
0228         blk_rq_stat_add(&stat->rqs, req_time);
0229     put_cpu_ptr(stat);
0230 }
0231 
0232 static inline bool latency_sum_ok(struct iolatency_grp *iolat,
0233                   struct latency_stat *stat)
0234 {
0235     if (iolat->ssd) {
0236         u64 thresh = div64_u64(stat->ps.total, 10);
0237         thresh = max(thresh, 1ULL);
0238         return stat->ps.missed < thresh;
0239     }
0240     return stat->rqs.mean <= iolat->min_lat_nsec;
0241 }
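     /*
      * Editor's illustration (hypothetical numbers): on an SSD, a window with
      * 1000 samples is "ok" as long as fewer than max(1000 / 10, 1) = 100 of
      * them missed min_lat_nsec; on a rotational device the window is "ok"
      * when the mean completion time itself stays within min_lat_nsec.
      */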
0242 
0243 static inline u64 latency_stat_samples(struct iolatency_grp *iolat,
0244                        struct latency_stat *stat)
0245 {
0246     if (iolat->ssd)
0247         return stat->ps.total;
0248     return stat->rqs.nr_samples;
0249 }
0250 
0251 static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
0252                           struct latency_stat *stat)
0253 {
0254     int exp_idx;
0255 
0256     if (iolat->ssd)
0257         return;
0258 
0259     /*
0260      * calc_load() takes in a number stored in fixed point representation.
0261      * Because we are using this for IO time in ns, the values stored
0262      * are significantly larger than the FIXED_1 denominator (2048).
0263      * Therefore, rounding errors in the calculation are negligible and
0264      * can be ignored.
0265      */
0266     exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
0267             div64_u64(iolat->cur_win_nsec,
0268                   BLKIOLATENCY_EXP_BUCKET_SIZE));
0269     iolat->lat_avg = calc_load(iolat->lat_avg,
0270                    iolatency_exp_factors[exp_idx],
0271                    stat->rqs.mean);
0272 }
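     /*
      * Editor's illustration (hypothetical window sizes): with
      * BLKIOLATENCY_EXP_BUCKET_SIZE = 1s / 4 = 250ms, a 100ms window selects
      * exp_idx 0 (factor 2045, slowest decay) while the maximum 1s window
      * selects exp_idx 4 (factor 2014, fastest decay).
      */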
0273 
0274 static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data)
0275 {
0276     atomic_dec(&rqw->inflight);
0277     wake_up(&rqw->wait);
0278 }
0279 
0280 static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data)
0281 {
0282     struct iolatency_grp *iolat = private_data;
0283     return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
0284 }
0285 
0286 static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
0287                        struct iolatency_grp *iolat,
0288                        bool issue_as_root,
0289                        bool use_memdelay)
0290 {
0291     struct rq_wait *rqw = &iolat->rq_wait;
0292     unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
0293 
0294     if (use_delay)
0295         blkcg_schedule_throttle(rqos->q, use_memdelay);
0296 
0297     /*
0298      * To avoid priority inversions we want to just take a slot if we are
0299      * issuing as root.  If we're being killed off there's no point in
0300      * delaying things, we may have been killed by OOM so throttling may
0301      * make recovery take even longer, so just let the IO's through so the
0302      * task can go away.
0303      */
0304     if (issue_as_root || fatal_signal_pending(current)) {
0305         atomic_inc(&rqw->inflight);
0306         return;
0307     }
0308 
0309     rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb);
0310 }
0311 
0312 #define SCALE_DOWN_FACTOR 2
0313 #define SCALE_UP_FACTOR 4
0314 
0315 static inline unsigned long scale_amount(unsigned long qd, bool up)
0316 {
0317     return max(up ? qd >> SCALE_UP_FACTOR : qd >> SCALE_DOWN_FACTOR, 1UL);
0318 }
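     /*
      * Editor's illustration (hypothetical queue depth): with
      * nr_requests == 128, scaling up moves by max(128 >> 4, 1) == 8 while
      * scaling down moves by max(128 >> 2, 1) == 32, i.e. pressure is applied
      * in bigger steps than it is relieved.
      */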
0319 
0320 /*
0321  * We scale the qd down faster than we scale up, so we need to use this helper
0322  * to adjust the scale_cookie accordingly so we don't prematurely push the
0323  * scale_cookie back up to DEFAULT_SCALE_COOKIE and unthrottle too much.
0324  *
0325  * Each group has its own local copy of the last scale cookie it saw, so if
0326  * the global scale cookie goes up or down it knows which way it needs to go
0327  * based on its last knowledge of it.
0328  */
0329 static void scale_cookie_change(struct blk_iolatency *blkiolat,
0330                 struct child_latency_info *lat_info,
0331                 bool up)
0332 {
0333     unsigned long qd = blkiolat->rqos.q->nr_requests;
0334     unsigned long scale = scale_amount(qd, up);
0335     unsigned long old = atomic_read(&lat_info->scale_cookie);
0336     unsigned long max_scale = qd << 1;
0337     unsigned long diff = 0;
0338 
0339     if (old < DEFAULT_SCALE_COOKIE)
0340         diff = DEFAULT_SCALE_COOKIE - old;
0341 
0342     if (up) {
0343         if (scale + old > DEFAULT_SCALE_COOKIE)
0344             atomic_set(&lat_info->scale_cookie,
0345                    DEFAULT_SCALE_COOKIE);
0346         else if (diff > qd)
0347             atomic_inc(&lat_info->scale_cookie);
0348         else
0349             atomic_add(scale, &lat_info->scale_cookie);
0350     } else {
0351         /*
0352          * We don't want to dig a hole so deep that it takes us hours to
0353          * dig out of it.  Just enough that we don't throttle/unthrottle
0354          * with jagged workloads but can still unthrottle once pressure
0355          * has sufficiently dissipated.
0356          */
0357         if (diff > qd) {
0358             if (diff < max_scale)
0359                 atomic_dec(&lat_info->scale_cookie);
0360         } else {
0361             atomic_sub(scale, &lat_info->scale_cookie);
0362         }
0363     }
0364 }
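     /*
      * Editor's walk-through of the function above (hypothetical qd == 128,
      * cookie starting at DEFAULT_SCALE_COOKIE == 1000000): each scale-down
      * event subtracts 32 until the cookie is more than qd below the default,
      * after which it only drops by 1 per event and bottoms out at
      * DEFAULT_SCALE_COOKIE - max_scale == 999744.  Scaling back up climbs by
      * 1 while the deficit still exceeds qd, then by 8, and finally snaps back
      * to DEFAULT_SCALE_COOKIE.
      */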
0365 
0366 /*
0367  * Change the queue depth of the iolatency_grp.  We add 1/16th of the queue
0368  * depth when scaling up and halve it when scaling down, so we don't get wild
0369  * swings and hopefully dial in to a fairer distribution of the overall queue depth.
0370  */
0371 static void scale_change(struct iolatency_grp *iolat, bool up)
0372 {
0373     unsigned long qd = iolat->blkiolat->rqos.q->nr_requests;
0374     unsigned long scale = scale_amount(qd, up);
0375     unsigned long old = iolat->rq_depth.max_depth;
0376 
0377     if (old > qd)
0378         old = qd;
0379 
0380     if (up) {
0381         if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat)))
0382             return;
0383 
0384         if (old < qd) {
0385             old += scale;
0386             old = min(old, qd);
0387             iolat->rq_depth.max_depth = old;
0388             wake_up_all(&iolat->rq_wait.wait);
0389         }
0390     } else {
0391         old >>= 1;
0392         iolat->rq_depth.max_depth = max(old, 1UL);
0393     }
0394 }
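     /*
      * Editor's illustration (hypothetical numbers): with nr_requests == 128
      * and a current max_depth of 64, a scale-up raises max_depth to
      * min(64 + 8, 128) == 72 and wakes waiters, while a scale-down halves it
      * to 32 (never below 1).
      */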
0395 
0396 /* Check our parent and see if the scale cookie has changed. */
0397 static void check_scale_change(struct iolatency_grp *iolat)
0398 {
0399     struct iolatency_grp *parent;
0400     struct child_latency_info *lat_info;
0401     unsigned int cur_cookie;
0402     unsigned int our_cookie = atomic_read(&iolat->scale_cookie);
0403     u64 scale_lat;
0404     int direction = 0;
0405 
0406     if (lat_to_blkg(iolat)->parent == NULL)
0407         return;
0408 
0409     parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
0410     if (!parent)
0411         return;
0412 
0413     lat_info = &parent->child_lat;
0414     cur_cookie = atomic_read(&lat_info->scale_cookie);
0415     scale_lat = READ_ONCE(lat_info->scale_lat);
0416 
0417     if (cur_cookie < our_cookie)
0418         direction = -1;
0419     else if (cur_cookie > our_cookie)
0420         direction = 1;
0421     else
0422         return;
0423 
0424     if (!atomic_try_cmpxchg(&iolat->scale_cookie, &our_cookie, cur_cookie)) {
0425         /* Somebody beat us to the punch, just bail. */
0426         return;
0427     }
0428 
0429     if (direction < 0 && iolat->min_lat_nsec) {
0430         u64 samples_thresh;
0431 
0432         if (!scale_lat || iolat->min_lat_nsec <= scale_lat)
0433             return;
0434 
0435         /*
0436          * Sometimes high priority groups are their own worst enemy, so
0437          * instead of taking it out on some poor other group that did 5%
0438          * or less of the IO's for the last summation, just skip this
0439          * scale down event.
0440          */
0441         samples_thresh = lat_info->nr_samples * 5;
0442         samples_thresh = max(1ULL, div64_u64(samples_thresh, 100));
0443         if (iolat->nr_samples <= samples_thresh)
0444             return;
0445     }
0446 
0447     /* We're as low as we can go. */
0448     if (iolat->rq_depth.max_depth == 1 && direction < 0) {
0449         blkcg_use_delay(lat_to_blkg(iolat));
0450         return;
0451     }
0452 
0453     /* We're back to the default cookie, unthrottle all the things. */
0454     if (cur_cookie == DEFAULT_SCALE_COOKIE) {
0455         blkcg_clear_delay(lat_to_blkg(iolat));
0456         iolat->rq_depth.max_depth = UINT_MAX;
0457         wake_up_all(&iolat->rq_wait.wait);
0458         return;
0459     }
0460 
0461     scale_change(iolat, direction > 0);
0462 }
0463 
0464 static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
0465 {
0466     struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
0467     struct blkcg_gq *blkg = bio->bi_blkg;
0468     bool issue_as_root = bio_issue_as_root_blkg(bio);
0469 
0470     if (!blkiolat->enabled)
0471         return;
0472 
0473     while (blkg && blkg->parent) {
0474         struct iolatency_grp *iolat = blkg_to_lat(blkg);
0475         if (!iolat) {
0476             blkg = blkg->parent;
0477             continue;
0478         }
0479 
0480         check_scale_change(iolat);
0481         __blkcg_iolatency_throttle(rqos, iolat, issue_as_root,
0482                      (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
0483         blkg = blkg->parent;
0484     }
0485     if (!timer_pending(&blkiolat->timer))
0486         mod_timer(&blkiolat->timer, jiffies + HZ);
0487 }
0488 
0489 static void iolatency_record_time(struct iolatency_grp *iolat,
0490                   struct bio_issue *issue, u64 now,
0491                   bool issue_as_root)
0492 {
0493     u64 start = bio_issue_time(issue);
0494     u64 req_time;
0495 
0496     /*
0497      * Truncate "now" to the same granularity that the issue time was
0498      * truncated to, so the two timestamps are directly comparable.
0499      */
0500     now = __bio_issue_time(now);
0501 
0502     if (now <= start)
0503         return;
0504 
0505     req_time = now - start;
0506 
0507     /*
0508      * We don't want to count issue_as_root bio's in the cgroup's latency
0509      * statistics as it could skew the numbers downwards.
0510      */
0511     if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) {
0512         u64 sub = iolat->min_lat_nsec;
0513         if (req_time < sub)
0514             blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time);
0515         return;
0516     }
0517 
0518     latency_stat_record_time(iolat, req_time);
0519 }
0520 
0521 #define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
0522 #define BLKIOLATENCY_MIN_GOOD_SAMPLES 5
0523 
0524 static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
0525 {
0526     struct blkcg_gq *blkg = lat_to_blkg(iolat);
0527     struct iolatency_grp *parent;
0528     struct child_latency_info *lat_info;
0529     struct latency_stat stat;
0530     unsigned long flags;
0531     int cpu;
0532 
0533     latency_stat_init(iolat, &stat);
0534     preempt_disable();
0535     for_each_online_cpu(cpu) {
0536         struct latency_stat *s;
0537         s = per_cpu_ptr(iolat->stats, cpu);
0538         latency_stat_sum(iolat, &stat, s);
0539         latency_stat_init(iolat, s);
0540     }
0541     preempt_enable();
0542 
0543     parent = blkg_to_lat(blkg->parent);
0544     if (!parent)
0545         return;
0546 
0547     lat_info = &parent->child_lat;
0548 
0549     iolat_update_total_lat_avg(iolat, &stat);
0550 
0551     /* Everything is ok and we don't need to adjust the scale. */
0552     if (latency_sum_ok(iolat, &stat) &&
0553         atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
0554         return;
0555 
0556     /* Fold this window's stats into the parent's accounting under its lock. */
0557     spin_lock_irqsave(&lat_info->lock, flags);
0558 
0559     latency_stat_sum(iolat, &iolat->cur_stat, &stat);
0560     lat_info->nr_samples -= iolat->nr_samples;
0561     lat_info->nr_samples += latency_stat_samples(iolat, &iolat->cur_stat);
0562     iolat->nr_samples = latency_stat_samples(iolat, &iolat->cur_stat);
0563 
0564     if ((lat_info->last_scale_event >= now ||
0565         now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME))
0566         goto out;
0567 
0568     if (latency_sum_ok(iolat, &iolat->cur_stat) &&
0569         latency_sum_ok(iolat, &stat)) {
0570         if (latency_stat_samples(iolat, &iolat->cur_stat) <
0571             BLKIOLATENCY_MIN_GOOD_SAMPLES)
0572             goto out;
0573         if (lat_info->scale_grp == iolat) {
0574             lat_info->last_scale_event = now;
0575             scale_cookie_change(iolat->blkiolat, lat_info, true);
0576         }
0577     } else if (lat_info->scale_lat == 0 ||
0578            lat_info->scale_lat >= iolat->min_lat_nsec) {
0579         lat_info->last_scale_event = now;
0580         if (!lat_info->scale_grp ||
0581             lat_info->scale_lat > iolat->min_lat_nsec) {
0582             WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec);
0583             lat_info->scale_grp = iolat;
0584         }
0585         scale_cookie_change(iolat->blkiolat, lat_info, false);
0586     }
0587     latency_stat_init(iolat, &iolat->cur_stat);
0588 out:
0589     spin_unlock_irqrestore(&lat_info->lock, flags);
0590 }
0591 
0592 static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
0593 {
0594     struct blkcg_gq *blkg;
0595     struct rq_wait *rqw;
0596     struct iolatency_grp *iolat;
0597     u64 window_start;
0598     u64 now;
0599     bool issue_as_root = bio_issue_as_root_blkg(bio);
0600     int inflight = 0;
0601 
0602     blkg = bio->bi_blkg;
0603     if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED))
0604         return;
0605 
0606     iolat = blkg_to_lat(bio->bi_blkg);
0607     if (!iolat)
0608         return;
0609 
0610     if (!iolat->blkiolat->enabled)
0611         return;
0612 
0613     now = ktime_to_ns(ktime_get());
0614     while (blkg && blkg->parent) {
0615         iolat = blkg_to_lat(blkg);
0616         if (!iolat) {
0617             blkg = blkg->parent;
0618             continue;
0619         }
0620         rqw = &iolat->rq_wait;
0621 
0622         inflight = atomic_dec_return(&rqw->inflight);
0623         WARN_ON_ONCE(inflight < 0);
0624         /*
0625          * If bi_status is BLK_STS_AGAIN, the bio wasn't actually
0626          * submitted, so do not account for it.
0627          */
0628         if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) {
0629             iolatency_record_time(iolat, &bio->bi_issue, now,
0630                           issue_as_root);
0631             window_start = atomic64_read(&iolat->window_start);
0632             if (now > window_start &&
0633                 (now - window_start) >= iolat->cur_win_nsec) {
0634                 if (atomic64_try_cmpxchg(&iolat->window_start,
0635                              &window_start, now))
0636                     iolatency_check_latencies(iolat, now);
0637             }
0638         }
0639         wake_up(&rqw->wait);
0640         blkg = blkg->parent;
0641     }
0642 }
0643 
0644 static void blkcg_iolatency_exit(struct rq_qos *rqos)
0645 {
0646     struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
0647 
0648     del_timer_sync(&blkiolat->timer);
0649     flush_work(&blkiolat->enable_work);
0650     blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency);
0651     kfree(blkiolat);
0652 }
0653 
0654 static struct rq_qos_ops blkcg_iolatency_ops = {
0655     .throttle = blkcg_iolatency_throttle,
0656     .done_bio = blkcg_iolatency_done_bio,
0657     .exit = blkcg_iolatency_exit,
0658 };
0659 
0660 static void blkiolatency_timer_fn(struct timer_list *t)
0661 {
0662     struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
0663     struct blkcg_gq *blkg;
0664     struct cgroup_subsys_state *pos_css;
0665     u64 now = ktime_to_ns(ktime_get());
0666 
0667     rcu_read_lock();
0668     blkg_for_each_descendant_pre(blkg, pos_css,
0669                      blkiolat->rqos.q->root_blkg) {
0670         struct iolatency_grp *iolat;
0671         struct child_latency_info *lat_info;
0672         unsigned long flags;
0673         u64 cookie;
0674 
0675         /*
0676          * We could be exiting, don't access the pd unless we have a
0677          * ref on the blkg.
0678          */
0679         if (!blkg_tryget(blkg))
0680             continue;
0681 
0682         iolat = blkg_to_lat(blkg);
0683         if (!iolat)
0684             goto next;
0685 
0686         lat_info = &iolat->child_lat;
0687         cookie = atomic_read(&lat_info->scale_cookie);
0688 
0689         if (cookie >= DEFAULT_SCALE_COOKIE)
0690             goto next;
0691 
0692         spin_lock_irqsave(&lat_info->lock, flags);
0693         if (lat_info->last_scale_event >= now)
0694             goto next_lock;
0695 
0696         /*
0697          * We scaled down but don't have a scale_grp, scale up and carry
0698          * on.
0699          */
0700         if (lat_info->scale_grp == NULL) {
0701             scale_cookie_change(iolat->blkiolat, lat_info, true);
0702             goto next_lock;
0703         }
0704 
0705         /*
0706          * It's been 5 seconds since our last scale event, clear the
0707          * scale grp in case the group that needed the scale down isn't
0708          * doing any IO currently.
0709          */
0710         if (now - lat_info->last_scale_event >=
0711             ((u64)NSEC_PER_SEC * 5))
0712             lat_info->scale_grp = NULL;
0713 next_lock:
0714         spin_unlock_irqrestore(&lat_info->lock, flags);
0715 next:
0716         blkg_put(blkg);
0717     }
0718     rcu_read_unlock();
0719 }
0720 
0721 /**
0722  * blkiolatency_enable_work_fn - Enable or disable iolatency on the device
0723  * @work: enable_work of the blk_iolatency of interest
0724  *
0725  * iolatency needs to keep track of the number of in-flight IOs per cgroup. This
0726  * is relatively expensive as it involves walking up the hierarchy twice for
0727  * every IO. Thus, if iolatency is not enabled in any cgroup for the device, we
0728  * want to disable the in-flight tracking.
0729  *
0730  * We have to make sure that the counting is balanced - we don't want to leak
0731  * the in-flight counts by disabling accounting in the completion path while IOs
0732  * are in flight. This is achieved by ensuring that no IO is in flight by
0733  * freezing the queue while flipping ->enabled. As this requires a sleepable
0734  * context, ->enabled flipping is punted to this work function.
0735  */
0736 static void blkiolatency_enable_work_fn(struct work_struct *work)
0737 {
0738     struct blk_iolatency *blkiolat = container_of(work, struct blk_iolatency,
0739                               enable_work);
0740     bool enabled;
0741 
0742     /*
0743      * There can only be one instance of this function running for @blkiolat
0744      * and it's guaranteed to be executed at least once after the latest
0745      * ->enable_cnt modification. Acting on the latest ->enable_cnt is
0746      * sufficient.
0747      *
0748      * Also, we know @blkiolat is safe to access as ->enable_work is flushed
0749      * in blkcg_iolatency_exit().
0750      */
0751     enabled = atomic_read(&blkiolat->enable_cnt);
0752     if (enabled != blkiolat->enabled) {
0753         blk_mq_freeze_queue(blkiolat->rqos.q);
0754         blkiolat->enabled = enabled;
0755         blk_mq_unfreeze_queue(blkiolat->rqos.q);
0756     }
0757 }
0758 
0759 int blk_iolatency_init(struct request_queue *q)
0760 {
0761     struct blk_iolatency *blkiolat;
0762     struct rq_qos *rqos;
0763     int ret;
0764 
0765     blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL);
0766     if (!blkiolat)
0767         return -ENOMEM;
0768 
0769     rqos = &blkiolat->rqos;
0770     rqos->id = RQ_QOS_LATENCY;
0771     rqos->ops = &blkcg_iolatency_ops;
0772     rqos->q = q;
0773 
0774     ret = rq_qos_add(q, rqos);
0775     if (ret)
0776         goto err_free;
0777     ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
0778     if (ret)
0779         goto err_qos_del;
0780 
0781     timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
0782     INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn);
0783 
0784     return 0;
0785 
0786 err_qos_del:
0787     rq_qos_del(q, rqos);
0788 err_free:
0789     kfree(blkiolat);
0790     return ret;
0791 }
0792 
0793 static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
0794 {
0795     struct iolatency_grp *iolat = blkg_to_lat(blkg);
0796     struct blk_iolatency *blkiolat = iolat->blkiolat;
0797     u64 oldval = iolat->min_lat_nsec;
0798 
0799     iolat->min_lat_nsec = val;
0800     iolat->cur_win_nsec = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE);
0801     iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec,
0802                     BLKIOLATENCY_MAX_WIN_SIZE);
0803 
0804     if (!oldval && val) {
0805         if (atomic_inc_return(&blkiolat->enable_cnt) == 1)
0806             schedule_work(&blkiolat->enable_work);
0807     }
0808     if (oldval && !val) {
0809         blkcg_clear_delay(blkg);
0810         if (atomic_dec_return(&blkiolat->enable_cnt) == 0)
0811             schedule_work(&blkiolat->enable_work);
0812     }
0813 }
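     /*
      * Editor's illustration (hypothetical targets): the sampling window is
      * 16 * target clamped to [100ms, 1s], so a 2ms target keeps the minimum
      * 100ms window, a 50ms target yields an 800ms window, and targets of
      * 62.5ms or more hit the 1s maximum.
      */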
0814 
0815 static void iolatency_clear_scaling(struct blkcg_gq *blkg)
0816 {
0817     if (blkg->parent) {
0818         struct iolatency_grp *iolat = blkg_to_lat(blkg->parent);
0819         struct child_latency_info *lat_info;
0820         if (!iolat)
0821             return;
0822 
0823         lat_info = &iolat->child_lat;
0824         spin_lock(&lat_info->lock);
0825         atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE);
0826         lat_info->last_scale_event = 0;
0827         lat_info->scale_grp = NULL;
0828         lat_info->scale_lat = 0;
0829         spin_unlock(&lat_info->lock);
0830     }
0831 }
0832 
0833 static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
0834                  size_t nbytes, loff_t off)
0835 {
0836     struct blkcg *blkcg = css_to_blkcg(of_css(of));
0837     struct blkcg_gq *blkg;
0838     struct blkg_conf_ctx ctx;
0839     struct iolatency_grp *iolat;
0840     char *p, *tok;
0841     u64 lat_val = 0;
0842     u64 oldval;
0843     int ret;
0844 
0845     ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
0846     if (ret)
0847         return ret;
0848 
0849     iolat = blkg_to_lat(ctx.blkg);
0850     p = ctx.body;
0851 
0852     ret = -EINVAL;
0853     while ((tok = strsep(&p, " "))) {
0854         char key[16];
0855         char val[21];   /* 18446744073709551615 */
0856 
0857         if (sscanf(tok, "%15[^=]=%20s", key, val) != 2)
0858             goto out;
0859 
0860         if (!strcmp(key, "target")) {
0861             u64 v;
0862 
0863             if (!strcmp(val, "max"))
0864                 lat_val = 0;
0865             else if (sscanf(val, "%llu", &v) == 1)
0866                 lat_val = v * NSEC_PER_USEC;
0867             else
0868                 goto out;
0869         } else {
0870             goto out;
0871         }
0872     }
0873 
0874     /* Apply the new target and reset any scaling state if it changed. */
0875     blkg = ctx.blkg;
0876     oldval = iolat->min_lat_nsec;
0877 
0878     iolatency_set_min_lat_nsec(blkg, lat_val);
0879     if (oldval != iolat->min_lat_nsec)
0880         iolatency_clear_scaling(blkg);
0881     ret = 0;
0882 out:
0883     blkg_conf_finish(&ctx);
0884     return ret ?: nbytes;
0885 }
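     /*
      * Example usage of the cgroup v2 "io.latency" file implemented above
      * (editor's illustration; the device numbers are hypothetical):
      *
      *   # set a 2ms latency target for device 8:16 in this cgroup
      *   echo "8:16 target=2000" > io.latency
      *   # drop the target again
      *   echo "8:16 target=max" > io.latency
      *
      * Targets are written in microseconds and converted to nanoseconds here;
      * reading the file prints "<dev> target=<usecs>" for each device that has
      * a target set (see iolatency_prfill_limit() below).
      */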
0886 
0887 static u64 iolatency_prfill_limit(struct seq_file *sf,
0888                   struct blkg_policy_data *pd, int off)
0889 {
0890     struct iolatency_grp *iolat = pd_to_lat(pd);
0891     const char *dname = blkg_dev_name(pd->blkg);
0892 
0893     if (!dname || !iolat->min_lat_nsec)
0894         return 0;
0895     seq_printf(sf, "%s target=%llu\n",
0896            dname, div_u64(iolat->min_lat_nsec, NSEC_PER_USEC));
0897     return 0;
0898 }
0899 
0900 static int iolatency_print_limit(struct seq_file *sf, void *v)
0901 {
0902     blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
0903               iolatency_prfill_limit,
0904               &blkcg_policy_iolatency, seq_cft(sf)->private, false);
0905     return 0;
0906 }
0907 
0908 static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s)
0909 {
0910     struct latency_stat stat;
0911     int cpu;
0912 
0913     latency_stat_init(iolat, &stat);
0914     preempt_disable();
0915     for_each_online_cpu(cpu) {
0916         struct latency_stat *s;
0917         s = per_cpu_ptr(iolat->stats, cpu);
0918         latency_stat_sum(iolat, &stat, s);
0919     }
0920     preempt_enable();
0921 
0922     if (iolat->rq_depth.max_depth == UINT_MAX)
0923         seq_printf(s, " missed=%llu total=%llu depth=max",
0924             (unsigned long long)stat.ps.missed,
0925             (unsigned long long)stat.ps.total);
0926     else
0927         seq_printf(s, " missed=%llu total=%llu depth=%u",
0928             (unsigned long long)stat.ps.missed,
0929             (unsigned long long)stat.ps.total,
0930             iolat->rq_depth.max_depth);
0931 }
0932 
0933 static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
0934 {
0935     struct iolatency_grp *iolat = pd_to_lat(pd);
0936     unsigned long long avg_lat;
0937     unsigned long long cur_win;
0938 
0939     if (!blkcg_debug_stats)
0940         return;
0941 
0942     if (iolat->ssd)
0943         return iolatency_ssd_stat(iolat, s);
0944 
0945     avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
0946     cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
0947     if (iolat->rq_depth.max_depth == UINT_MAX)
0948         seq_printf(s, " depth=max avg_lat=%llu win=%llu",
0949             avg_lat, cur_win);
0950     else
0951         seq_printf(s, " depth=%u avg_lat=%llu win=%llu",
0952             iolat->rq_depth.max_depth, avg_lat, cur_win);
0953 }
0954 
0955 static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp,
0956                            struct request_queue *q,
0957                            struct blkcg *blkcg)
0958 {
0959     struct iolatency_grp *iolat;
0960 
0961     iolat = kzalloc_node(sizeof(*iolat), gfp, q->node);
0962     if (!iolat)
0963         return NULL;
0964     iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat),
0965                        __alignof__(struct latency_stat), gfp);
0966     if (!iolat->stats) {
0967         kfree(iolat);
0968         return NULL;
0969     }
0970     return &iolat->pd;
0971 }
0972 
0973 static void iolatency_pd_init(struct blkg_policy_data *pd)
0974 {
0975     struct iolatency_grp *iolat = pd_to_lat(pd);
0976     struct blkcg_gq *blkg = lat_to_blkg(iolat);
0977     struct rq_qos *rqos = blkcg_rq_qos(blkg->q);
0978     struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
0979     u64 now = ktime_to_ns(ktime_get());
0980     int cpu;
0981 
0982     if (blk_queue_nonrot(blkg->q))
0983         iolat->ssd = true;
0984     else
0985         iolat->ssd = false;
0986 
0987     for_each_possible_cpu(cpu) {
0988         struct latency_stat *stat;
0989         stat = per_cpu_ptr(iolat->stats, cpu);
0990         latency_stat_init(iolat, stat);
0991     }
0992 
0993     latency_stat_init(iolat, &iolat->cur_stat);
0994     rq_wait_init(&iolat->rq_wait);
0995     spin_lock_init(&iolat->child_lat.lock);
0996     iolat->rq_depth.queue_depth = blkg->q->nr_requests;
0997     iolat->rq_depth.max_depth = UINT_MAX;
0998     iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
0999     iolat->blkiolat = blkiolat;
1000     iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
1001     atomic64_set(&iolat->window_start, now);
1002 
1003     /*
1004      * We init things in list order, so the pd for the parent may not be
1005      * init'ed yet for whatever reason.
1006      */
1007     if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) {
1008         struct iolatency_grp *parent = blkg_to_lat(blkg->parent);
1009         atomic_set(&iolat->scale_cookie,
1010                atomic_read(&parent->child_lat.scale_cookie));
1011     } else {
1012         atomic_set(&iolat->scale_cookie, DEFAULT_SCALE_COOKIE);
1013     }
1014 
1015     atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE);
1016 }
1017 
1018 static void iolatency_pd_offline(struct blkg_policy_data *pd)
1019 {
1020     struct iolatency_grp *iolat = pd_to_lat(pd);
1021     struct blkcg_gq *blkg = lat_to_blkg(iolat);
1022 
1023     iolatency_set_min_lat_nsec(blkg, 0);
1024     iolatency_clear_scaling(blkg);
1025 }
1026 
1027 static void iolatency_pd_free(struct blkg_policy_data *pd)
1028 {
1029     struct iolatency_grp *iolat = pd_to_lat(pd);
1030     free_percpu(iolat->stats);
1031     kfree(iolat);
1032 }
1033 
1034 static struct cftype iolatency_files[] = {
1035     {
1036         .name = "latency",
1037         .flags = CFTYPE_NOT_ON_ROOT,
1038         .seq_show = iolatency_print_limit,
1039         .write = iolatency_set_limit,
1040     },
1041     {}
1042 };
1043 
1044 static struct blkcg_policy blkcg_policy_iolatency = {
1045     .dfl_cftypes    = iolatency_files,
1046     .pd_alloc_fn    = iolatency_pd_alloc,
1047     .pd_init_fn = iolatency_pd_init,
1048     .pd_offline_fn  = iolatency_pd_offline,
1049     .pd_free_fn = iolatency_pd_free,
1050     .pd_stat_fn = iolatency_pd_stat,
1051 };
1052 
1053 static int __init iolatency_init(void)
1054 {
1055     return blkcg_policy_register(&blkcg_policy_iolatency);
1056 }
1057 
1058 static void __exit iolatency_exit(void)
1059 {
1060     blkcg_policy_unregister(&blkcg_policy_iolatency);
1061 }
1062 
1063 module_init(iolatency_init);
1064 module_exit(iolatency_exit);