/* SPDX-License-Identifier: GPL-2.0
 *
 * IO cost model based controller.
 *
 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
 * Copyright (C) 2019 Andy Newell <newella@fb.com>
 * Copyright (C) 2019 Facebook
 */
0175 #include <linux/kernel.h>
0176 #include <linux/module.h>
0177 #include <linux/timer.h>
0178 #include <linux/time64.h>
0179 #include <linux/parser.h>
0180 #include <linux/sched/signal.h>
0181 #include <asm/local.h>
0182 #include <asm/local64.h>
0183 #include "blk-rq-qos.h"
0184 #include "blk-stat.h"
0185 #include "blk-wbt.h"
0186 #include "blk-cgroup.h"
0187
0188 #ifdef CONFIG_TRACEPOINTS
0189
0190
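/* buffer and lock used to render cgroup paths for the iocost tracepoints */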
0191 #define TRACE_IOCG_PATH_LEN 1024
0192 static DEFINE_SPINLOCK(trace_iocg_path_lock);
0193 static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
0194
0195 #define TRACE_IOCG_PATH(type, iocg, ...) \
0196 do { \
0197 unsigned long flags; \
0198 if (trace_iocost_##type##_enabled()) { \
0199 spin_lock_irqsave(&trace_iocg_path_lock, flags); \
0200 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
0201 trace_iocg_path, TRACE_IOCG_PATH_LEN); \
0202 trace_iocost_##type(iocg, trace_iocg_path, \
0203 ##__VA_ARGS__); \
0204 spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
0205 } \
0206 } while (0)
0207
0208 #else
0209 #define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
0210 #endif
0211
0212 enum {
0213 MILLION = 1000000,
0214
0215
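/* the period length is derived from the QoS latency targets and clamped to this range */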
0216 MIN_PERIOD = USEC_PER_MSEC,
0217 MAX_PERIOD = USEC_PER_SEC,
0218
0219
0220
0221
0222
0223
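/*
 * Budget is how far an iocg's vtime may trail the device vtime.  These
 * margins, in percent of one period, set when an iocg is treated as a
 * surplus donor (low), when it must stop donating (min) and how much
 * budget a newly activated iocg is seeded with (target).
 */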
0224 MARGIN_MIN_PCT = 10,
0225 MARGIN_LOW_PCT = 20,
0226 MARGIN_TARGET_PCT = 50,
0227
0228 INUSE_ADJ_STEP_PCT = 25,
0229
0230
0231 TIMER_SLACK_PCT = 1,
0232
0233
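/* 1.0 in the fixed-point format used for weights and hierarchical weights */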
0234 WEIGHT_ONE = 1 << 16,
0235
0236
0237
0238
0239
0240
0241
0242
0243
0244
0245
0246
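/*
 * vtime is the cost currency.  The device vtime nominally advances by
 * VTIME_PER_USEC each microsecond (scaled by the current vrate) and each
 * IO consumes vtime according to the cost model below.
 */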
0247 VTIME_PER_SEC_SHIFT = 37,
0248 VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
0249 VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
0250 VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC,
0251
0252
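/* default vrate bounds, in parts per million of the nominal rate */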
0253 VRATE_MIN_PPM = 10000,
0254 VRATE_MAX_PPM = 100000000,
0255
0256 VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
0257 VRATE_CLAMP_ADJ_PCT = 4,
0258
0259
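/* the device is considered saturated if request wait takes more than this pct of a period */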
0260 RQ_WAIT_BUSY_PCT = 5,
0261
0262
0263 UNBUSY_THR_PCT = 75,
0264
0265
0266
0267
0268
0269
0270
0271
0272
0273
0274
0275
0276
0277
0278
0279
0280
0281
0282
0283
0284
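/*
 * iocg->delay is scaled between MIN_DELAY and MAX_DELAY (usecs) according
 * to how far vtime plus outstanding debt overruns the device vtime,
 * expressed in percent of one period.
 */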
0285 MIN_DELAY_THR_PCT = 500,
0286 MAX_DELAY_THR_PCT = 25000,
0287 MIN_DELAY = 250,
0288 MAX_DELAY = 250 * USEC_PER_MSEC,
0289
0290
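/* debt and delay are halved once per DFGV_PERIOD while usage stays below DFGV_USAGE_PCT */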
0291 DFGV_USAGE_PCT = 50,
0292 DFGV_PERIOD = 100 * USEC_PER_MSEC,
0293
0294
0295 MAX_LAGGING_PERIODS = 10,
0296
0297
0298 AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
0299
0300
0301
0302
0303
0304
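/* the cost model works on fixed 4k pages regardless of the system page size */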
0305 IOC_PAGE_SHIFT = 12,
0306 IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
0307 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
0308
0309
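/* seeks longer than this many 4k pages from the cursor are charged as random IOs */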
0310 LCOEF_RANDIO_PAGES = 4096,
0311 };
0312
0313 enum ioc_running {
0314 IOC_IDLE,
0315 IOC_RUNNING,
0316 IOC_STOP,
0317 };
0318
0319
0320 enum {
0321 QOS_ENABLE,
0322 QOS_CTRL,
0323 NR_QOS_CTRL_PARAMS,
0324 };
0325
0326
0327 enum {
0328 QOS_RPPM,
0329 QOS_RLAT,
0330 QOS_WPPM,
0331 QOS_WLAT,
0332 QOS_MIN,
0333 QOS_MAX,
0334 NR_QOS_PARAMS,
0335 };
0336
0337
0338 enum {
0339 COST_CTRL,
0340 COST_MODEL,
0341 NR_COST_CTRL_PARAMS,
0342 };
0343
0344
0345 enum {
0346 I_LCOEF_RBPS,
0347 I_LCOEF_RSEQIOPS,
0348 I_LCOEF_RRANDIOPS,
0349 I_LCOEF_WBPS,
0350 I_LCOEF_WSEQIOPS,
0351 I_LCOEF_WRANDIOPS,
0352 NR_I_LCOEFS,
0353 };
0354
0355 enum {
0356 LCOEF_RPAGE,
0357 LCOEF_RSEQIO,
0358 LCOEF_RRANDIO,
0359 LCOEF_WPAGE,
0360 LCOEF_WSEQIO,
0361 LCOEF_WRANDIO,
0362 NR_LCOEFS,
0363 };
0364
0365 enum {
0366 AUTOP_INVALID,
0367 AUTOP_HDD,
0368 AUTOP_SSD_QD1,
0369 AUTOP_SSD_DFL,
0370 AUTOP_SSD_FAST,
0371 };
0372
0373 struct ioc_params {
0374 u32 qos[NR_QOS_PARAMS];
0375 u64 i_lcoefs[NR_I_LCOEFS];
0376 u64 lcoefs[NR_LCOEFS];
0377 u32 too_fast_vrate_pct;
0378 u32 too_slow_vrate_pct;
0379 };
0380
0381 struct ioc_margins {
0382 s64 min;
0383 s64 low;
0384 s64 target;
0385 };
0386
0387 struct ioc_missed {
0388 local_t nr_met;
0389 local_t nr_missed;
0390 u32 last_met;
0391 u32 last_missed;
0392 };
0393
0394 struct ioc_pcpu_stat {
0395 struct ioc_missed missed[2];
0396
0397 local64_t rq_wait_ns;
0398 u64 last_rq_wait_ns;
0399 };
0400
0401
0402 struct ioc {
0403 struct rq_qos rqos;
0404
0405 bool enabled;
0406
0407 struct ioc_params params;
0408 struct ioc_margins margins;
0409 u32 period_us;
0410 u32 timer_slack_ns;
0411 u64 vrate_min;
0412 u64 vrate_max;
0413
0414 spinlock_t lock;
0415 struct timer_list timer;
0416 struct list_head active_iocgs;
0417 struct ioc_pcpu_stat __percpu *pcpu_stat;
0418
0419 enum ioc_running running;
0420 atomic64_t vtime_rate;
0421 u64 vtime_base_rate;
0422 s64 vtime_err;
0423
0424 seqcount_spinlock_t period_seqcount;
0425 u64 period_at;
0426 u64 period_at_vtime;
0427
0428 atomic64_t cur_period;
0429 int busy_level;
0430
0431 bool weights_updated;
0432 atomic_t hweight_gen;
0433
0434
0435 u64 dfgv_period_at;
0436 u64 dfgv_period_rem;
0437 u64 dfgv_usage_us_sum;
0438
0439 u64 autop_too_fast_at;
0440 u64 autop_too_slow_at;
0441 int autop_idx;
0442 bool user_qos_params:1;
0443 bool user_cost_model:1;
0444 };
0445
0446 struct iocg_pcpu_stat {
0447 local64_t abs_vusage;
0448 };
0449
0450 struct iocg_stat {
0451 u64 usage_us;
0452 u64 wait_us;
0453 u64 indebt_us;
0454 u64 indelay_us;
0455 };
0456
0457
0458 struct ioc_gq {
0459 struct blkg_policy_data pd;
0460 struct ioc *ioc;
0461
0462
0463
0464
0465
0466
0467
0468
0469
0470
0471
0472
0473
0474
0475
0476
0477
0478
0479
0480 u32 cfg_weight;
0481 u32 weight;
0482 u32 active;
0483 u32 inuse;
0484
0485 u32 last_inuse;
0486 s64 saved_margin;
0487
0488 sector_t cursor;
0489
0490
0491
0492
0493
0494
0495
0496
0497
0498
0499
0500 atomic64_t vtime;
0501 atomic64_t done_vtime;
0502 u64 abs_vdebt;
0503
0504
0505 u64 delay;
0506 u64 delay_at;
0507
0508
0509
0510
0511
0512 atomic64_t active_period;
0513 struct list_head active_list;
0514
0515
0516 u64 child_active_sum;
0517 u64 child_inuse_sum;
0518 u64 child_adjusted_sum;
0519 int hweight_gen;
0520 u32 hweight_active;
0521 u32 hweight_inuse;
0522 u32 hweight_donating;
0523 u32 hweight_after_donation;
0524
0525 struct list_head walk_list;
0526 struct list_head surplus_list;
0527
0528 struct wait_queue_head waitq;
0529 struct hrtimer waitq_timer;
0530
0531
0532 u64 activated_at;
0533
0534
0535 struct iocg_pcpu_stat __percpu *pcpu_stat;
0536 struct iocg_stat stat;
0537 struct iocg_stat last_stat;
0538 u64 last_stat_abs_vusage;
0539 u64 usage_delta_us;
0540 u64 wait_since;
0541 u64 indebt_since;
0542 u64 indelay_since;
0543
0544
0545 int level;
0546 struct ioc_gq *ancestors[];
0547 };
0548
0549
0550 struct ioc_cgrp {
0551 struct blkcg_policy_data cpd;
0552 unsigned int dfl_weight;
0553 };
0554
0555 struct ioc_now {
0556 u64 now_ns;
0557 u64 now;
0558 u64 vnow;
0559 u64 vrate;
0560 };
0561
0562 struct iocg_wait {
0563 struct wait_queue_entry wait;
0564 struct bio *bio;
0565 u64 abs_cost;
0566 bool committed;
0567 };
0568
0569 struct iocg_wake_ctx {
0570 struct ioc_gq *iocg;
0571 u32 hw_inuse;
0572 s64 vbudget;
0573 };
0574
0575 static const struct ioc_params autop[] = {
0576 [AUTOP_HDD] = {
0577 .qos = {
0578 [QOS_RLAT] = 250000,
0579 [QOS_WLAT] = 250000,
0580 [QOS_MIN] = VRATE_MIN_PPM,
0581 [QOS_MAX] = VRATE_MAX_PPM,
0582 },
0583 .i_lcoefs = {
0584 [I_LCOEF_RBPS] = 174019176,
0585 [I_LCOEF_RSEQIOPS] = 41708,
0586 [I_LCOEF_RRANDIOPS] = 370,
0587 [I_LCOEF_WBPS] = 178075866,
0588 [I_LCOEF_WSEQIOPS] = 42705,
0589 [I_LCOEF_WRANDIOPS] = 378,
0590 },
0591 },
0592 [AUTOP_SSD_QD1] = {
0593 .qos = {
0594 [QOS_RLAT] = 25000,
0595 [QOS_WLAT] = 25000,
0596 [QOS_MIN] = VRATE_MIN_PPM,
0597 [QOS_MAX] = VRATE_MAX_PPM,
0598 },
0599 .i_lcoefs = {
0600 [I_LCOEF_RBPS] = 245855193,
0601 [I_LCOEF_RSEQIOPS] = 61575,
0602 [I_LCOEF_RRANDIOPS] = 6946,
0603 [I_LCOEF_WBPS] = 141365009,
0604 [I_LCOEF_WSEQIOPS] = 33716,
0605 [I_LCOEF_WRANDIOPS] = 26796,
0606 },
0607 },
0608 [AUTOP_SSD_DFL] = {
0609 .qos = {
0610 [QOS_RLAT] = 25000,
0611 [QOS_WLAT] = 25000,
0612 [QOS_MIN] = VRATE_MIN_PPM,
0613 [QOS_MAX] = VRATE_MAX_PPM,
0614 },
0615 .i_lcoefs = {
0616 [I_LCOEF_RBPS] = 488636629,
0617 [I_LCOEF_RSEQIOPS] = 8932,
0618 [I_LCOEF_RRANDIOPS] = 8518,
0619 [I_LCOEF_WBPS] = 427891549,
0620 [I_LCOEF_WSEQIOPS] = 28755,
0621 [I_LCOEF_WRANDIOPS] = 21940,
0622 },
0623 .too_fast_vrate_pct = 500,
0624 },
0625 [AUTOP_SSD_FAST] = {
0626 .qos = {
0627 [QOS_RLAT] = 5000,
0628 [QOS_WLAT] = 5000,
0629 [QOS_MIN] = VRATE_MIN_PPM,
0630 [QOS_MAX] = VRATE_MAX_PPM,
0631 },
0632 .i_lcoefs = {
0633 [I_LCOEF_RBPS] = 3102524156LLU,
0634 [I_LCOEF_RSEQIOPS] = 724816,
0635 [I_LCOEF_RRANDIOPS] = 778122,
0636 [I_LCOEF_WBPS] = 1742780862LLU,
0637 [I_LCOEF_WSEQIOPS] = 425702,
0638 [I_LCOEF_WRANDIOPS] = 443193,
0639 },
0640 .too_slow_vrate_pct = 10,
0641 },
0642 };
0643
0644
0645
0646
0647
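/*
 * vrate adjustment percentages indexed by the absolute busy level; applied
 * downward when the device is saturated and upward when iocgs are running
 * short without device-side saturation.
 */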
0648 static u32 vrate_adj_pct[] =
0649 { 0, 0, 0, 0,
0650 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0651 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
0652 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
0653
0654 static struct blkcg_policy blkcg_policy_iocost;
0655
0656
0657 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
0658 {
0659 return container_of(rqos, struct ioc, rqos);
0660 }
0661
0662 static struct ioc *q_to_ioc(struct request_queue *q)
0663 {
0664 return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
0665 }
0666
0667 static const char *q_name(struct request_queue *q)
0668 {
0669 if (blk_queue_registered(q))
0670 return kobject_name(q->kobj.parent);
0671 else
0672 return "<unknown>";
0673 }
0674
0675 static const char __maybe_unused *ioc_name(struct ioc *ioc)
0676 {
0677 return q_name(ioc->rqos.q);
0678 }
0679
0680 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
0681 {
0682 return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
0683 }
0684
0685 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
0686 {
0687 return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
0688 }
0689
0690 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
0691 {
0692 return pd_to_blkg(&iocg->pd);
0693 }
0694
0695 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
0696 {
0697 return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
0698 struct ioc_cgrp, cpd);
0699 }
0700
0701
0702
0703
0704
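/*
 * Scale the absolute cost by the inverse of the hierarchical inuse weight.
 * The smaller the share, the more device vtime each IO costs the cgroup.
 */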
0705 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
0706 {
0707 return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
0708 }
0709
0710
0711
0712
0713 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
0714 {
0715 return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
0716 }
0717
0718 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
0719 u64 abs_cost, u64 cost)
0720 {
0721 struct iocg_pcpu_stat *gcs;
0722
0723 bio->bi_iocost_cost = cost;
0724 atomic64_add(cost, &iocg->vtime);
0725
0726 gcs = get_cpu_ptr(iocg->pcpu_stat);
0727 local64_add(abs_cost, &gcs->abs_vusage);
0728 put_cpu_ptr(gcs);
0729 }
0730
0731 static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
0732 {
0733 if (lock_ioc) {
0734 spin_lock_irqsave(&iocg->ioc->lock, *flags);
0735 spin_lock(&iocg->waitq.lock);
0736 } else {
0737 spin_lock_irqsave(&iocg->waitq.lock, *flags);
0738 }
0739 }
0740
0741 static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
0742 {
0743 if (unlock_ioc) {
0744 spin_unlock(&iocg->waitq.lock);
0745 spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
0746 } else {
0747 spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
0748 }
0749 }
0750
0751 #define CREATE_TRACE_POINTS
0752 #include <trace/events/iocost.h>
0753
0754 static void ioc_refresh_margins(struct ioc *ioc)
0755 {
0756 struct ioc_margins *margins = &ioc->margins;
0757 u32 period_us = ioc->period_us;
0758 u64 vrate = ioc->vtime_base_rate;
0759
0760 margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
0761 margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
0762 margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
0763 }
0764
0765
0766 static void ioc_refresh_period_us(struct ioc *ioc)
0767 {
0768 u32 ppm, lat, multi, period_us;
0769
0770 lockdep_assert_held(&ioc->lock);
0771
0772
0773 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
0774 ppm = ioc->params.qos[QOS_RPPM];
0775 lat = ioc->params.qos[QOS_RLAT];
0776 } else {
0777 ppm = ioc->params.qos[QOS_WPPM];
0778 lat = ioc->params.qos[QOS_WLAT];
0779 }
0780
0781
0782
0783
0784
0785
0786
0787
0788
0789 if (ppm)
0790 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
0791 else
0792 multi = 2;
0793 period_us = multi * lat;
0794 period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
0795
0796
0797 ioc->period_us = period_us;
0798 ioc->timer_slack_ns = div64_u64(
0799 (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
0800 100);
0801 ioc_refresh_margins(ioc);
0802 }
0803
0804 static int ioc_autop_idx(struct ioc *ioc)
0805 {
0806 int idx = ioc->autop_idx;
0807 const struct ioc_params *p = &autop[idx];
0808 u32 vrate_pct;
0809 u64 now_ns;
0810
0811
0812 if (!blk_queue_nonrot(ioc->rqos.q))
0813 return AUTOP_HDD;
0814
0815
0816 if (blk_queue_depth(ioc->rqos.q) == 1)
0817 return AUTOP_SSD_QD1;
0818
0819
0820 if (idx < AUTOP_SSD_DFL)
0821 return AUTOP_SSD_DFL;
0822
0823
0824 if (ioc->user_qos_params || ioc->user_cost_model)
0825 return idx;
0826
0827
0828 vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
0829 now_ns = ktime_get_ns();
0830
0831 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
0832 if (!ioc->autop_too_fast_at)
0833 ioc->autop_too_fast_at = now_ns;
0834 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
0835 return idx + 1;
0836 } else {
0837 ioc->autop_too_fast_at = 0;
0838 }
0839
0840 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
0841 if (!ioc->autop_too_slow_at)
0842 ioc->autop_too_slow_at = now_ns;
0843 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
0844 return idx - 1;
0845 } else {
0846 ioc->autop_too_slow_at = 0;
0847 }
0848
0849 return idx;
0850 }
0851
0852
0853
0854
0855
0856
0857
0858
0859
0860
0861
0862
0863
0864
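/*
 * Derive the linear model coefficients from the device parameters:
 *
 *  @bps        maximum sequential throughput
 *  @seqiops    maximum sequential 4k iops
 *  @randiops   maximum random 4k iops
 *
 *  *@page      per-page cost           1s / (@bps / 4096)
 *  *@seqio     base cost of a seq IO   max((1s / @seqiops) - *@page, 0)
 *  *@randio    base cost of a rand IO  max((1s / @randiops) - *@page, 0)
 *
 * e.g. at @bps of 128M/s a device moves 32768 4k pages per second, so each
 * page costs VTIME_PER_SEC / 32768 vtime units.
 */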
0865 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
0866 u64 *page, u64 *seqio, u64 *randio)
0867 {
0868 u64 v;
0869
0870 *page = *seqio = *randio = 0;
0871
0872 if (bps)
0873 *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
0874 DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
0875
0876 if (seqiops) {
0877 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
0878 if (v > *page)
0879 *seqio = v - *page;
0880 }
0881
0882 if (randiops) {
0883 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
0884 if (v > *page)
0885 *randio = v - *page;
0886 }
0887 }
0888
0889 static void ioc_refresh_lcoefs(struct ioc *ioc)
0890 {
0891 u64 *u = ioc->params.i_lcoefs;
0892 u64 *c = ioc->params.lcoefs;
0893
0894 calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
0895 &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
0896 calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
0897 &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
0898 }
0899
0900 static bool ioc_refresh_params(struct ioc *ioc, bool force)
0901 {
0902 const struct ioc_params *p;
0903 int idx;
0904
0905 lockdep_assert_held(&ioc->lock);
0906
0907 idx = ioc_autop_idx(ioc);
0908 p = &autop[idx];
0909
0910 if (idx == ioc->autop_idx && !force)
0911 return false;
0912
0913 if (idx != ioc->autop_idx)
0914 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
0915
0916 ioc->autop_idx = idx;
0917 ioc->autop_too_fast_at = 0;
0918 ioc->autop_too_slow_at = 0;
0919
0920 if (!ioc->user_qos_params)
0921 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
0922 if (!ioc->user_cost_model)
0923 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
0924
0925 ioc_refresh_period_us(ioc);
0926 ioc_refresh_lcoefs(ioc);
0927
0928 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
0929 VTIME_PER_USEC, MILLION);
0930 ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
0931 VTIME_PER_USEC, MILLION);
0932
0933 return true;
0934 }
0935
0936
0937
0938
0939
0940
0941
0942
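/*
 * ioc->vtime_err accumulates the difference between how fast vtime should
 * have advanced and how fast it actually did.  Spread the compensation over
 * the remainder of the period by temporarily offsetting vtime_rate from
 * vtime_base_rate.
 */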
0943 static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
0944 {
0945 s64 pleft = ioc->period_at + ioc->period_us - now->now;
0946 s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
0947 s64 vcomp, vcomp_min, vcomp_max;
0948
0949 lockdep_assert_held(&ioc->lock);
0950
0951
0952 if (pleft <= 0)
0953 goto done;
0954
0955
0956
0957
0958
0959
0960 vcomp = -div64_s64(ioc->vtime_err, pleft);
0961 vcomp_min = -(ioc->vtime_base_rate >> 1);
0962 vcomp_max = ioc->vtime_base_rate;
0963 vcomp = clamp(vcomp, vcomp_min, vcomp_max);
0964
0965 ioc->vtime_err += vcomp * pleft;
0966
0967 atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
0968 done:
0969
0970 ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
0971 }
0972
0973 static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
0974 int nr_lagging, int nr_shortages,
0975 int prev_busy_level, u32 *missed_ppm)
0976 {
0977 u64 vrate = ioc->vtime_base_rate;
0978 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
0979
0980 if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
0981 if (ioc->busy_level != prev_busy_level || nr_lagging)
0982 trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
0983 missed_ppm, rq_wait_pct,
0984 nr_lagging, nr_shortages);
0985
0986 return;
0987 }
0988
0989
0990
0991
0992
0993
0994 if (vrate < vrate_min) {
0995 vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
0996 vrate = min(vrate, vrate_min);
0997 } else if (vrate > vrate_max) {
0998 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
0999 vrate = max(vrate, vrate_max);
1000 } else {
1001 int idx = min_t(int, abs(ioc->busy_level),
1002 ARRAY_SIZE(vrate_adj_pct) - 1);
1003 u32 adj_pct = vrate_adj_pct[idx];
1004
1005 if (ioc->busy_level > 0)
1006 adj_pct = 100 - adj_pct;
1007 else
1008 adj_pct = 100 + adj_pct;
1009
1010 vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1011 vrate_min, vrate_max);
1012 }
1013
1014 trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1015 nr_lagging, nr_shortages);
1016
1017 ioc->vtime_base_rate = vrate;
1018 ioc_refresh_margins(ioc);
1019 }
1020
1021
1022 static void ioc_now(struct ioc *ioc, struct ioc_now *now)
1023 {
1024 unsigned seq;
1025
1026 now->now_ns = ktime_get();
1027 now->now = ktime_to_us(now->now_ns);
1028 now->vrate = atomic64_read(&ioc->vtime_rate);
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038 do {
1039 seq = read_seqcount_begin(&ioc->period_seqcount);
1040 now->vnow = ioc->period_at_vtime +
1041 (now->now - ioc->period_at) * now->vrate;
1042 } while (read_seqcount_retry(&ioc->period_seqcount, seq));
1043 }
1044
1045 static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
1046 {
1047 WARN_ON_ONCE(ioc->running != IOC_RUNNING);
1048
1049 write_seqcount_begin(&ioc->period_seqcount);
1050 ioc->period_at = now->now;
1051 ioc->period_at_vtime = now->vnow;
1052 write_seqcount_end(&ioc->period_seqcount);
1053
1054 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
1055 add_timer(&ioc->timer);
1056 }
1057
1058
1059
1060
1061
1062
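/*
 * Update @iocg's active and inuse weights and propagate the changes up the
 * ancestor chain, keeping each parent's child_active_sum and
 * child_inuse_sum in sync.  Must be followed by commit_weights().
 */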
1063 static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1064 bool save, struct ioc_now *now)
1065 {
1066 struct ioc *ioc = iocg->ioc;
1067 int lvl;
1068
1069 lockdep_assert_held(&ioc->lock);
1070
1071
1072
1073
1074
1075
1076 if (list_empty(&iocg->active_list) && iocg->child_active_sum) {
1077 inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
1078 iocg->child_active_sum);
1079 } else {
1080 inuse = clamp_t(u32, inuse, 1, active);
1081 }
1082
1083 iocg->last_inuse = iocg->inuse;
1084 if (save)
1085 iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);
1086
1087 if (active == iocg->active && inuse == iocg->inuse)
1088 return;
1089
1090 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1091 struct ioc_gq *parent = iocg->ancestors[lvl];
1092 struct ioc_gq *child = iocg->ancestors[lvl + 1];
1093 u32 parent_active = 0, parent_inuse = 0;
1094
1095
1096 parent->child_active_sum += (s32)(active - child->active);
1097 parent->child_inuse_sum += (s32)(inuse - child->inuse);
1098
1099 child->active = active;
1100 child->inuse = inuse;
1101
1102
1103
1104
1105
1106
1107 if (parent->child_active_sum) {
1108 parent_active = parent->weight;
1109 parent_inuse = DIV64_U64_ROUND_UP(
1110 parent_active * parent->child_inuse_sum,
1111 parent->child_active_sum);
1112 }
1113
1114
1115 if (parent_active == parent->active &&
1116 parent_inuse == parent->inuse)
1117 break;
1118
1119 active = parent_active;
1120 inuse = parent_inuse;
1121 }
1122
1123 ioc->weights_updated = true;
1124 }
1125
1126 static void commit_weights(struct ioc *ioc)
1127 {
1128 lockdep_assert_held(&ioc->lock);
1129
1130 if (ioc->weights_updated) {
1131
1132 smp_wmb();
1133 atomic_inc(&ioc->hweight_gen);
1134 ioc->weights_updated = false;
1135 }
1136 }
1137
1138 static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1139 bool save, struct ioc_now *now)
1140 {
1141 __propagate_weights(iocg, active, inuse, save, now);
1142 commit_weights(iocg->ioc);
1143 }
1144
1145 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
1146 {
1147 struct ioc *ioc = iocg->ioc;
1148 int lvl;
1149 u32 hwa, hwi;
1150 int ioc_gen;
1151
1152
1153 ioc_gen = atomic_read(&ioc->hweight_gen);
1154 if (ioc_gen == iocg->hweight_gen)
1155 goto out;
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167 smp_rmb();
1168
1169 hwa = hwi = WEIGHT_ONE;
1170 for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
1171 struct ioc_gq *parent = iocg->ancestors[lvl];
1172 struct ioc_gq *child = iocg->ancestors[lvl + 1];
1173 u64 active_sum = READ_ONCE(parent->child_active_sum);
1174 u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
1175 u32 active = READ_ONCE(child->active);
1176 u32 inuse = READ_ONCE(child->inuse);
1177
1178
1179 if (!active_sum || !inuse_sum)
1180 continue;
1181
1182 active_sum = max_t(u64, active, active_sum);
1183 hwa = div64_u64((u64)hwa * active, active_sum);
1184
1185 inuse_sum = max_t(u64, inuse, inuse_sum);
1186 hwi = div64_u64((u64)hwi * inuse, inuse_sum);
1187 }
1188
1189 iocg->hweight_active = max_t(u32, hwa, 1);
1190 iocg->hweight_inuse = max_t(u32, hwi, 1);
1191 iocg->hweight_gen = ioc_gen;
1192 out:
1193 if (hw_activep)
1194 *hw_activep = iocg->hweight_active;
1195 if (hw_inusep)
1196 *hw_inusep = iocg->hweight_inuse;
1197 }
1198
1199
1200
1201
1202
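/*
 * Calculate the hierarchical inuse weight @iocg would have if it raised its
 * inuse back up to its active weight.
 */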
1203 static u32 current_hweight_max(struct ioc_gq *iocg)
1204 {
1205 u32 hwm = WEIGHT_ONE;
1206 u32 inuse = iocg->active;
1207 u64 child_inuse_sum;
1208 int lvl;
1209
1210 lockdep_assert_held(&iocg->ioc->lock);
1211
1212 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1213 struct ioc_gq *parent = iocg->ancestors[lvl];
1214 struct ioc_gq *child = iocg->ancestors[lvl + 1];
1215
1216 child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
1217 hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
1218 inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
1219 parent->child_active_sum);
1220 }
1221
1222 return max_t(u32, hwm, 1);
1223 }
1224
1225 static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
1226 {
1227 struct ioc *ioc = iocg->ioc;
1228 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1229 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1230 u32 weight;
1231
1232 lockdep_assert_held(&ioc->lock);
1233
1234 weight = iocg->cfg_weight ?: iocc->dfl_weight;
1235 if (weight != iocg->weight && iocg->active)
1236 propagate_weights(iocg, weight, iocg->inuse, true, now);
1237 iocg->weight = weight;
1238 }
1239
1240 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1241 {
1242 struct ioc *ioc = iocg->ioc;
1243 u64 last_period, cur_period;
1244 u64 vtime, vtarget;
1245 int i;
1246
1247
1248
1249
1250
1251 if (!list_empty(&iocg->active_list)) {
1252 ioc_now(ioc, now);
1253 cur_period = atomic64_read(&ioc->cur_period);
1254 if (atomic64_read(&iocg->active_period) != cur_period)
1255 atomic64_set(&iocg->active_period, cur_period);
1256 return true;
1257 }
1258
1259
1260 if (iocg->child_active_sum)
1261 return false;
1262
1263 spin_lock_irq(&ioc->lock);
1264
1265 ioc_now(ioc, now);
1266
1267
1268 cur_period = atomic64_read(&ioc->cur_period);
1269 last_period = atomic64_read(&iocg->active_period);
1270 atomic64_set(&iocg->active_period, cur_period);
1271
1272
1273 if (!list_empty(&iocg->active_list))
1274 goto succeed_unlock;
1275 for (i = iocg->level - 1; i > 0; i--)
1276 if (!list_empty(&iocg->ancestors[i]->active_list))
1277 goto fail_unlock;
1278
1279 if (iocg->child_active_sum)
1280 goto fail_unlock;
1281
1282
1283
1284
1285
1286 vtarget = now->vnow - ioc->margins.target;
1287 vtime = atomic64_read(&iocg->vtime);
1288
1289 atomic64_add(vtarget - vtime, &iocg->vtime);
1290 atomic64_add(vtarget - vtime, &iocg->done_vtime);
1291 vtime = vtarget;
1292
1293
1294
1295
1296
1297
1298 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1299 list_add(&iocg->active_list, &ioc->active_iocgs);
1300
1301 propagate_weights(iocg, iocg->weight,
1302 iocg->last_inuse ?: iocg->weight, true, now);
1303
1304 TRACE_IOCG_PATH(iocg_activate, iocg, now,
1305 last_period, cur_period, vtime);
1306
1307 iocg->activated_at = now->now;
1308
1309 if (ioc->running == IOC_IDLE) {
1310 ioc->running = IOC_RUNNING;
1311 ioc->dfgv_period_at = now->now;
1312 ioc->dfgv_period_rem = 0;
1313 ioc_start_period(ioc, now);
1314 }
1315
1316 succeed_unlock:
1317 spin_unlock_irq(&ioc->lock);
1318 return true;
1319
1320 fail_unlock:
1321 spin_unlock_irq(&ioc->lock);
1322 return false;
1323 }
1324
1325 static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1326 {
1327 struct ioc *ioc = iocg->ioc;
1328 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1329 u64 tdelta, delay, new_delay;
1330 s64 vover, vover_pct;
1331 u32 hwa;
1332
1333 lockdep_assert_held(&iocg->waitq.lock);
1334
1335
1336 tdelta = now->now - iocg->delay_at;
1337 if (iocg->delay)
1338 delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC);
1339 else
1340 delay = 0;
1341
1342
1343 current_hweight(iocg, &hwa, NULL);
1344 vover = atomic64_read(&iocg->vtime) +
1345 abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
1346 vover_pct = div64_s64(100 * vover,
1347 ioc->period_us * ioc->vtime_base_rate);
1348
1349 if (vover_pct <= MIN_DELAY_THR_PCT)
1350 new_delay = 0;
1351 else if (vover_pct >= MAX_DELAY_THR_PCT)
1352 new_delay = MAX_DELAY;
1353 else
1354 new_delay = MIN_DELAY +
1355 div_u64((MAX_DELAY - MIN_DELAY) *
1356 (vover_pct - MIN_DELAY_THR_PCT),
1357 MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
1358
1359
1360 if (new_delay > delay) {
1361 iocg->delay = new_delay;
1362 iocg->delay_at = now->now;
1363 delay = new_delay;
1364 }
1365
1366 if (delay >= MIN_DELAY) {
1367 if (!iocg->indelay_since)
1368 iocg->indelay_since = now->now;
1369 blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
1370 return true;
1371 } else {
1372 if (iocg->indelay_since) {
1373 iocg->stat.indelay_us += now->now - iocg->indelay_since;
1374 iocg->indelay_since = 0;
1375 }
1376 iocg->delay = 0;
1377 blkcg_clear_delay(blkg);
1378 return false;
1379 }
1380 }
1381
1382 static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
1383 struct ioc_now *now)
1384 {
1385 struct iocg_pcpu_stat *gcs;
1386
1387 lockdep_assert_held(&iocg->ioc->lock);
1388 lockdep_assert_held(&iocg->waitq.lock);
1389 WARN_ON_ONCE(list_empty(&iocg->active_list));
1390
1391
1392
1393
1394
1395 if (!iocg->abs_vdebt && abs_cost) {
1396 iocg->indebt_since = now->now;
1397 propagate_weights(iocg, iocg->active, 0, false, now);
1398 }
1399
1400 iocg->abs_vdebt += abs_cost;
1401
1402 gcs = get_cpu_ptr(iocg->pcpu_stat);
1403 local64_add(abs_cost, &gcs->abs_vusage);
1404 put_cpu_ptr(gcs);
1405 }
1406
1407 static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
1408 struct ioc_now *now)
1409 {
1410 lockdep_assert_held(&iocg->ioc->lock);
1411 lockdep_assert_held(&iocg->waitq.lock);
1412
1413
1414 WARN_ON_ONCE(list_empty(&iocg->active_list));
1415 WARN_ON_ONCE(iocg->inuse > 1);
1416
1417 iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
1418
1419
1420 if (!iocg->abs_vdebt) {
1421 iocg->stat.indebt_us += now->now - iocg->indebt_since;
1422 iocg->indebt_since = 0;
1423
1424 propagate_weights(iocg, iocg->active, iocg->last_inuse,
1425 false, now);
1426 }
1427 }
1428
1429 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1430 int flags, void *key)
1431 {
1432 struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1433 struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1434 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1435
1436 ctx->vbudget -= cost;
1437
1438 if (ctx->vbudget < 0)
1439 return -1;
1440
1441 iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
1442 wait->committed = true;
1443
1444
1445
1446
1447
1448
1449
1450
1451 default_wake_function(wq_entry, mode, flags, key);
1452 list_del_init_careful(&wq_entry->entry);
1453 return 0;
1454 }
1455
1456
1457
1458
1459
1460
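/*
 * Pay down debt if requested and budget allows, wake up waiters in order
 * while budget lasts, and arm the waitq timer if waiters remain so they are
 * retried once enough vtime has accrued.
 */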
1461 static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
1462 struct ioc_now *now)
1463 {
1464 struct ioc *ioc = iocg->ioc;
1465 struct iocg_wake_ctx ctx = { .iocg = iocg };
1466 u64 vshortage, expires, oexpires;
1467 s64 vbudget;
1468 u32 hwa;
1469
1470 lockdep_assert_held(&iocg->waitq.lock);
1471
1472 current_hweight(iocg, &hwa, NULL);
1473 vbudget = now->vnow - atomic64_read(&iocg->vtime);
1474
1475
1476 if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
1477 u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
1478 u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
1479 u64 vpay = abs_cost_to_cost(abs_vpay, hwa);
1480
1481 lockdep_assert_held(&ioc->lock);
1482
1483 atomic64_add(vpay, &iocg->vtime);
1484 atomic64_add(vpay, &iocg->done_vtime);
1485 iocg_pay_debt(iocg, abs_vpay, now);
1486 vbudget -= vpay;
1487 }
1488
1489 if (iocg->abs_vdebt || iocg->delay)
1490 iocg_kick_delay(iocg, now);
1491
1492
1493
1494
1495
1496
1497
1498 if (iocg->abs_vdebt) {
1499 s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
1500 vbudget = min_t(s64, 0, vbudget - vdebt);
1501 }
1502
1503
1504
1505
1506
1507
1508 ctx.vbudget = vbudget;
1509 current_hweight(iocg, NULL, &ctx.hw_inuse);
1510
1511 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1512
1513 if (!waitqueue_active(&iocg->waitq)) {
1514 if (iocg->wait_since) {
1515 iocg->stat.wait_us += now->now - iocg->wait_since;
1516 iocg->wait_since = 0;
1517 }
1518 return;
1519 }
1520
1521 if (!iocg->wait_since)
1522 iocg->wait_since = now->now;
1523
1524 if (WARN_ON_ONCE(ctx.vbudget >= 0))
1525 return;
1526
1527
1528 vshortage = -ctx.vbudget;
1529 expires = now->now_ns +
1530 DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
1531 NSEC_PER_USEC;
1532 expires += ioc->timer_slack_ns;
1533
1534
1535 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1536 if (hrtimer_is_queued(&iocg->waitq_timer) &&
1537 abs(oexpires - expires) <= ioc->timer_slack_ns)
1538 return;
1539
1540 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1541 ioc->timer_slack_ns, HRTIMER_MODE_ABS);
1542 }
1543
1544 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1545 {
1546 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1547 bool pay_debt = READ_ONCE(iocg->abs_vdebt);
1548 struct ioc_now now;
1549 unsigned long flags;
1550
1551 ioc_now(iocg->ioc, &now);
1552
1553 iocg_lock(iocg, pay_debt, &flags);
1554 iocg_kick_waitq(iocg, pay_debt, &now);
1555 iocg_unlock(iocg, pay_debt, &flags);
1556
1557 return HRTIMER_NORESTART;
1558 }
1559
1560 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1561 {
1562 u32 nr_met[2] = { };
1563 u32 nr_missed[2] = { };
1564 u64 rq_wait_ns = 0;
1565 int cpu, rw;
1566
1567 for_each_online_cpu(cpu) {
1568 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1569 u64 this_rq_wait_ns;
1570
1571 for (rw = READ; rw <= WRITE; rw++) {
1572 u32 this_met = local_read(&stat->missed[rw].nr_met);
1573 u32 this_missed = local_read(&stat->missed[rw].nr_missed);
1574
1575 nr_met[rw] += this_met - stat->missed[rw].last_met;
1576 nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1577 stat->missed[rw].last_met = this_met;
1578 stat->missed[rw].last_missed = this_missed;
1579 }
1580
1581 this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
1582 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1583 stat->last_rq_wait_ns = this_rq_wait_ns;
1584 }
1585
1586 for (rw = READ; rw <= WRITE; rw++) {
1587 if (nr_met[rw] + nr_missed[rw])
1588 missed_ppm_ar[rw] =
1589 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1590 nr_met[rw] + nr_missed[rw]);
1591 else
1592 missed_ppm_ar[rw] = 0;
1593 }
1594
1595 *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1596 ioc->period_us * NSEC_PER_USEC);
1597 }
1598
1599
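/* an iocg is idle if it wasn't active this period and has no IOs in flight */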
1600 static bool iocg_is_idle(struct ioc_gq *iocg)
1601 {
1602 struct ioc *ioc = iocg->ioc;
1603
1604
1605 if (atomic64_read(&iocg->active_period) ==
1606 atomic64_read(&ioc->cur_period))
1607 return false;
1608
1609
1610 if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1611 return false;
1612
1613 return true;
1614 }
1615
1616
1617
1618
1619
1620
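/*
 * Add the not-yet-visited ancestors of @iocg to @inner_walk so that each
 * inner node of the active subtree is visited exactly once.
 */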
1621 static void iocg_build_inner_walk(struct ioc_gq *iocg,
1622 struct list_head *inner_walk)
1623 {
1624 int lvl;
1625
1626 WARN_ON_ONCE(!list_empty(&iocg->walk_list));
1627
1628
1629 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1630 if (!list_empty(&iocg->ancestors[lvl]->walk_list))
1631 break;
1632 }
1633
1634
1635 while (++lvl <= iocg->level - 1) {
1636 struct ioc_gq *inner = iocg->ancestors[lvl];
1637
1638
1639 list_add_tail(&inner->walk_list, inner_walk);
1640 }
1641 }
1642
1643
1644 static void iocg_flush_stat_upward(struct ioc_gq *iocg)
1645 {
1646 if (iocg->level > 0) {
1647 struct iocg_stat *parent_stat =
1648 &iocg->ancestors[iocg->level - 1]->stat;
1649
1650 parent_stat->usage_us +=
1651 iocg->stat.usage_us - iocg->last_stat.usage_us;
1652 parent_stat->wait_us +=
1653 iocg->stat.wait_us - iocg->last_stat.wait_us;
1654 parent_stat->indebt_us +=
1655 iocg->stat.indebt_us - iocg->last_stat.indebt_us;
1656 parent_stat->indelay_us +=
1657 iocg->stat.indelay_us - iocg->last_stat.indelay_us;
1658 }
1659
1660 iocg->last_stat = iocg->stat;
1661 }
1662
1663
1664 static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now)
1665 {
1666 struct ioc *ioc = iocg->ioc;
1667 u64 abs_vusage = 0;
1668 u64 vusage_delta;
1669 int cpu;
1670
1671 lockdep_assert_held(&iocg->ioc->lock);
1672
1673
1674 for_each_possible_cpu(cpu) {
1675 abs_vusage += local64_read(
1676 per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
1677 }
1678 vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
1679 iocg->last_stat_abs_vusage = abs_vusage;
1680
1681 iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
1682 iocg->stat.usage_us += iocg->usage_delta_us;
1683
1684 iocg_flush_stat_upward(iocg);
1685 }
1686
1687
1688 static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
1689 {
1690 LIST_HEAD(inner_walk);
1691 struct ioc_gq *iocg, *tiocg;
1692
1693
1694 list_for_each_entry(iocg, target_iocgs, active_list) {
1695 iocg_flush_stat_leaf(iocg, now);
1696 iocg_build_inner_walk(iocg, &inner_walk);
1697 }
1698
1699
1700 list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
1701 iocg_flush_stat_upward(iocg);
1702 list_del_init(&iocg->walk_list);
1703 }
1704 }
1705
1706
1707
1708
1709
1710
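/*
 * Estimate how much hierarchical inuse weight @iocg should retain after
 * donation given its recent usage and remaining budget.  Indebted iocgs get
 * the minimum while waiters and low-budget iocgs keep everything.
 */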
1711 static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
1712 u32 usage, struct ioc_now *now)
1713 {
1714 struct ioc *ioc = iocg->ioc;
1715 u64 vtime = atomic64_read(&iocg->vtime);
1716 s64 excess, delta, target, new_hwi;
1717
1718
1719 if (iocg->abs_vdebt)
1720 return 1;
1721
1722
1723 if (waitqueue_active(&iocg->waitq) ||
1724 time_after64(vtime, now->vnow - ioc->margins.min))
1725 return hwm;
1726
1727
1728 excess = now->vnow - vtime - ioc->margins.target;
1729 if (excess > 0) {
1730 atomic64_add(excess, &iocg->vtime);
1731 atomic64_add(excess, &iocg->done_vtime);
1732 vtime += excess;
1733 ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
1734 }
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752 delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
1753 now->vnow - ioc->period_at_vtime);
1754 target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
1755 new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
1756
1757 return clamp_t(s64, new_hwi, 1, hwm);
1758 }
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
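/*
 * Redistribute the hierarchical weight surpluses collected in @surpluses to
 * the iocgs running short.  Inner nodes of the donation subtree are visited
 * top-down: each gets its hweight_inuse recomputed from the global donation
 * ratio (gamma) and its own donation, its child_adjusted_sum derived to
 * preserve the leaf-level shares, and finally each donor's inuse is set
 * from its hweight_after_donation.
 */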
1817 static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
1818 {
1819 LIST_HEAD(over_hwa);
1820 LIST_HEAD(inner_walk);
1821 struct ioc_gq *iocg, *tiocg, *root_iocg;
1822 u32 after_sum, over_sum, over_target, gamma;
1823
1824
1825
1826
1827
1828
1829
1830
1831 after_sum = 0;
1832 over_sum = 0;
1833 list_for_each_entry(iocg, surpluses, surplus_list) {
1834 u32 hwa;
1835
1836 current_hweight(iocg, &hwa, NULL);
1837 after_sum += iocg->hweight_after_donation;
1838
1839 if (iocg->hweight_after_donation > hwa) {
1840 over_sum += iocg->hweight_after_donation;
1841 list_add(&iocg->walk_list, &over_hwa);
1842 }
1843 }
1844
1845 if (after_sum >= WEIGHT_ONE) {
1846
1847
1848
1849
1850 u32 over_delta = after_sum - (WEIGHT_ONE - 1);
1851 WARN_ON_ONCE(over_sum <= over_delta);
1852 over_target = over_sum - over_delta;
1853 } else {
1854 over_target = 0;
1855 }
1856
1857 list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
1858 if (over_target)
1859 iocg->hweight_after_donation =
1860 div_u64((u64)iocg->hweight_after_donation *
1861 over_target, over_sum);
1862 list_del_init(&iocg->walk_list);
1863 }
1864
1865
1866
1867
1868
1869 list_for_each_entry(iocg, surpluses, surplus_list) {
1870 iocg_build_inner_walk(iocg, &inner_walk);
1871 }
1872
1873 root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
1874 WARN_ON_ONCE(root_iocg->level > 0);
1875
1876 list_for_each_entry(iocg, &inner_walk, walk_list) {
1877 iocg->child_adjusted_sum = 0;
1878 iocg->hweight_donating = 0;
1879 iocg->hweight_after_donation = 0;
1880 }
1881
1882
1883
1884
1885
1886 list_for_each_entry(iocg, surpluses, surplus_list) {
1887 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1888
1889 parent->hweight_donating += iocg->hweight_donating;
1890 parent->hweight_after_donation += iocg->hweight_after_donation;
1891 }
1892
1893 list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
1894 if (iocg->level > 0) {
1895 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1896
1897 parent->hweight_donating += iocg->hweight_donating;
1898 parent->hweight_after_donation += iocg->hweight_after_donation;
1899 }
1900 }
1901
1902
1903
1904
1905
1906
1907 list_for_each_entry(iocg, &inner_walk, walk_list) {
1908 if (iocg->level) {
1909 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1910
1911 iocg->hweight_active = DIV64_U64_ROUND_UP(
1912 (u64)parent->hweight_active * iocg->active,
1913 parent->child_active_sum);
1914
1915 }
1916
1917 iocg->hweight_donating = min(iocg->hweight_donating,
1918 iocg->hweight_active);
1919 iocg->hweight_after_donation = min(iocg->hweight_after_donation,
1920 iocg->hweight_donating - 1);
1921 if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
1922 iocg->hweight_donating <= 1 ||
1923 iocg->hweight_after_donation == 0)) {
1924 pr_warn("iocg: invalid donation weights in ");
1925 pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
1926 pr_cont(": active=%u donating=%u after=%u\n",
1927 iocg->hweight_active, iocg->hweight_donating,
1928 iocg->hweight_after_donation);
1929 }
1930 }
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946 gamma = DIV_ROUND_UP(
1947 (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
1948 WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));
1949
1950
1951
1952
1953
1954 list_for_each_entry(iocg, &inner_walk, walk_list) {
1955 struct ioc_gq *parent;
1956 u32 inuse, wpt, wptp;
1957 u64 st, sf;
1958
1959 if (iocg->level == 0) {
1960
1961 iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
1962 iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
1963 WEIGHT_ONE - iocg->hweight_after_donation);
1964 continue;
1965 }
1966
1967 parent = iocg->ancestors[iocg->level - 1];
1968
1969
1970 iocg->hweight_inuse = DIV64_U64_ROUND_UP(
1971 (u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
1972 WEIGHT_ONE) + iocg->hweight_after_donation;
1973
1974
1975 inuse = DIV64_U64_ROUND_UP(
1976 (u64)parent->child_adjusted_sum * iocg->hweight_inuse,
1977 parent->hweight_inuse);
1978
1979
1980 st = DIV64_U64_ROUND_UP(
1981 iocg->child_active_sum * iocg->hweight_donating,
1982 iocg->hweight_active);
1983 sf = iocg->child_active_sum - st;
1984 wpt = DIV64_U64_ROUND_UP(
1985 (u64)iocg->active * iocg->hweight_donating,
1986 iocg->hweight_active);
1987 wptp = DIV64_U64_ROUND_UP(
1988 (u64)inuse * iocg->hweight_after_donation,
1989 iocg->hweight_inuse);
1990
1991 iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
1992 }
1993
1994
1995
1996
1997
1998 list_for_each_entry(iocg, surpluses, surplus_list) {
1999 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
2000 u32 inuse;
2001
2002
2003
2004
2005
2006
2007
2008
2009 if (iocg->abs_vdebt) {
2010 WARN_ON_ONCE(iocg->inuse > 1);
2011 continue;
2012 }
2013
2014
2015 inuse = DIV64_U64_ROUND_UP(
2016 parent->child_adjusted_sum * iocg->hweight_after_donation,
2017 parent->hweight_inuse);
2018
2019 TRACE_IOCG_PATH(inuse_transfer, iocg, now,
2020 iocg->inuse, inuse,
2021 iocg->hweight_inuse,
2022 iocg->hweight_after_donation);
2023
2024 __propagate_weights(iocg, iocg->active, inuse, true, now);
2025 }
2026
2027
2028 list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
2029 list_del_init(&iocg->walk_list);
2030 }
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
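/*
 * Debt and delay let IOs which can't block (e.g. issued for the root cgroup
 * or with a fatal signal pending) proceed immediately and get charged
 * later, but they shouldn't linger once pressure is gone.  While overall
 * usage stays below DFGV_USAGE_PCT, halve each indebted iocg's debt and
 * delay once per elapsed DFGV_PERIOD.
 */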
2043 static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
2044 struct ioc_now *now)
2045 {
2046 struct ioc_gq *iocg;
2047 u64 dur, usage_pct, nr_cycles;
2048
2049
2050 if (!nr_debtors) {
2051 ioc->dfgv_period_at = now->now;
2052 ioc->dfgv_period_rem = 0;
2053 ioc->dfgv_usage_us_sum = 0;
2054 return;
2055 }
2056
2057
2058
2059
2060
2061
2062
2063 if (ioc->busy_level > 0)
2064 usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us);
2065
2066 ioc->dfgv_usage_us_sum += usage_us_sum;
2067 if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD))
2068 return;
2069
2070
2071
2072
2073
2074 dur = now->now - ioc->dfgv_period_at;
2075 usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur);
2076
2077 ioc->dfgv_period_at = now->now;
2078 ioc->dfgv_usage_us_sum = 0;
2079
2080
2081 if (usage_pct > DFGV_USAGE_PCT) {
2082 ioc->dfgv_period_rem = 0;
2083 return;
2084 }
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095 nr_cycles = dur + ioc->dfgv_period_rem;
2096 ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD);
2097
2098 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2099 u64 __maybe_unused old_debt, __maybe_unused old_delay;
2100
2101 if (!iocg->abs_vdebt && !iocg->delay)
2102 continue;
2103
2104 spin_lock(&iocg->waitq.lock);
2105
2106 old_debt = iocg->abs_vdebt;
2107 old_delay = iocg->delay;
2108
2109 if (iocg->abs_vdebt)
2110 iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
2111 if (iocg->delay)
2112 iocg->delay = iocg->delay >> nr_cycles ?: 1;
2113
2114 iocg_kick_waitq(iocg, true, now);
2115
2116 TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct,
2117 old_debt, iocg->abs_vdebt,
2118 old_delay, iocg->delay);
2119
2120 spin_unlock(&iocg->waitq.lock);
2121 }
2122 }
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
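/*
 * Walk the active iocgs at the start of each period: snapshot wait, debt
 * and delay statistics, kick the waitqueues of iocgs with waiters or debt,
 * and deactivate the ones that have gone idle.  Returns the number of
 * iocgs still carrying debt or delay.
 */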
2134 static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
2135 {
2136 int nr_debtors = 0;
2137 struct ioc_gq *iocg, *tiocg;
2138
2139 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
2140 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
2141 !iocg->delay && !iocg_is_idle(iocg))
2142 continue;
2143
2144 spin_lock(&iocg->waitq.lock);
2145
2146
2147 if (iocg->wait_since) {
2148 iocg->stat.wait_us += now->now - iocg->wait_since;
2149 iocg->wait_since = now->now;
2150 }
2151 if (iocg->indebt_since) {
2152 iocg->stat.indebt_us +=
2153 now->now - iocg->indebt_since;
2154 iocg->indebt_since = now->now;
2155 }
2156 if (iocg->indelay_since) {
2157 iocg->stat.indelay_us +=
2158 now->now - iocg->indelay_since;
2159 iocg->indelay_since = now->now;
2160 }
2161
2162 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
2163 iocg->delay) {
2164
2165 iocg_kick_waitq(iocg, true, now);
2166 if (iocg->abs_vdebt || iocg->delay)
2167 nr_debtors++;
2168 } else if (iocg_is_idle(iocg)) {
2169
2170 u64 vtime = atomic64_read(&iocg->vtime);
2171 s64 excess;
2172
2173
2174
2175
2176
2177
2178
2179 excess = now->vnow - vtime - ioc->margins.target;
2180 if (excess > 0) {
2181 u32 old_hwi;
2182
2183 current_hweight(iocg, NULL, &old_hwi);
2184 ioc->vtime_err -= div64_u64(excess * old_hwi,
2185 WEIGHT_ONE);
2186 }
2187
2188 TRACE_IOCG_PATH(iocg_idle, iocg, now,
2189 atomic64_read(&iocg->active_period),
2190 atomic64_read(&ioc->cur_period), vtime);
2191 __propagate_weights(iocg, 0, 0, false, now);
2192 list_del_init(&iocg->active_list);
2193 }
2194
2195 spin_unlock(&iocg->waitq.lock);
2196 }
2197
2198 commit_weights(ioc);
2199 return nr_debtors;
2200 }
2201
2202 static void ioc_timer_fn(struct timer_list *timer)
2203 {
2204 struct ioc *ioc = container_of(timer, struct ioc, timer);
2205 struct ioc_gq *iocg, *tiocg;
2206 struct ioc_now now;
2207 LIST_HEAD(surpluses);
2208 int nr_debtors, nr_shortages = 0, nr_lagging = 0;
2209 u64 usage_us_sum = 0;
2210 u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
2211 u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
2212 u32 missed_ppm[2], rq_wait_pct;
2213 u64 period_vtime;
2214 int prev_busy_level;
2215
2216
2217 ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
2218
2219
2220 spin_lock_irq(&ioc->lock);
2221
2222 ioc_now(ioc, &now);
2223
2224 period_vtime = now.vnow - ioc->period_at_vtime;
2225 if (WARN_ON_ONCE(!period_vtime)) {
2226 spin_unlock_irq(&ioc->lock);
2227 return;
2228 }
2229
2230 nr_debtors = ioc_check_iocgs(ioc, &now);
2231
2232
2233
2234
2235
2236 iocg_flush_stat(&ioc->active_iocgs, &now);
2237
2238
2239 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2240 u64 vdone, vtime, usage_us;
2241 u32 hw_active, hw_inuse;
2242
2243
2244
2245
2246
2247 vdone = atomic64_read(&iocg->done_vtime);
2248 vtime = atomic64_read(&iocg->vtime);
2249 current_hweight(iocg, &hw_active, &hw_inuse);
2250
2251
2252
2253
2254
2255
2256
2257 if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
2258 !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
2259 time_after64(vtime, vdone) &&
2260 time_after64(vtime, now.vnow -
2261 MAX_LAGGING_PERIODS * period_vtime) &&
2262 time_before64(vdone, now.vnow - period_vtime))
2263 nr_lagging++;
2264
2265
2266
2267
2268
2269 usage_us = iocg->usage_delta_us;
2270 usage_us_sum += usage_us;
2271
2272
2273 WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
2274 if (hw_inuse < hw_active ||
2275 (!waitqueue_active(&iocg->waitq) &&
2276 time_before64(vtime, now.vnow - ioc->margins.low))) {
2277 u32 hwa, old_hwi, hwm, new_hwi, usage;
2278 u64 usage_dur;
2279
2280 if (vdone != vtime) {
2281 u64 inflight_us = DIV64_U64_ROUND_UP(
2282 cost_to_abs_cost(vtime - vdone, hw_inuse),
2283 ioc->vtime_base_rate);
2284
2285 usage_us = max(usage_us, inflight_us);
2286 }
2287
2288
2289 if (time_after64(iocg->activated_at, ioc->period_at))
2290 usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
2291 else
2292 usage_dur = max_t(u64, now.now - ioc->period_at, 1);
2293
2294 usage = clamp_t(u32,
2295 DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
2296 usage_dur),
2297 1, WEIGHT_ONE);
2298
2299
2300
2301
2302
2303 current_hweight(iocg, &hwa, &old_hwi);
2304 hwm = current_hweight_max(iocg);
2305 new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
2306 usage, &now);
2307
2308
2309
2310
2311
2312
2313
2314 if (new_hwi < hwm && hwa >= 2) {
2315 iocg->hweight_donating = hwa;
2316 iocg->hweight_after_donation = new_hwi;
2317 list_add(&iocg->surplus_list, &surpluses);
2318 } else if (!iocg->abs_vdebt) {
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329 TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
2330 iocg->inuse, iocg->active,
2331 iocg->hweight_inuse, new_hwi);
2332
2333 __propagate_weights(iocg, iocg->active,
2334 iocg->active, true, &now);
2335 nr_shortages++;
2336 }
2337 } else {
2338
2339 nr_shortages++;
2340 }
2341 }
2342
2343 if (!list_empty(&surpluses) && nr_shortages)
2344 transfer_surpluses(&surpluses, &now);
2345
2346 commit_weights(ioc);
2347
2348
2349 list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
2350 list_del_init(&iocg->surplus_list);
2351
2352
2353
2354
2355
2356
2357
2358 prev_busy_level = ioc->busy_level;
2359 if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
2360 missed_ppm[READ] > ppm_rthr ||
2361 missed_ppm[WRITE] > ppm_wthr) {
2362
2363 ioc->busy_level = max(ioc->busy_level, 0);
2364 ioc->busy_level++;
2365 } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
2366 missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
2367 missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
2368
2369 if (nr_shortages) {
2370
2371
2372
2373
2374 ioc->busy_level = min(ioc->busy_level, 0);
2375
2376
2377
2378
2379
2380 if (!nr_lagging)
2381 ioc->busy_level--;
2382 } else {
2383
2384
2385
2386
2387
2388
2389 ioc->busy_level = 0;
2390 }
2391 } else {
2392
2393 ioc->busy_level = 0;
2394 }
2395
2396 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
2397
2398 ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
2399 prev_busy_level, missed_ppm);
2400
2401 ioc_refresh_params(ioc, false);
2402
2403 ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now);
2404
2405
2406
2407
2408
2409 atomic64_inc(&ioc->cur_period);
2410
2411 if (ioc->running != IOC_STOP) {
2412 if (!list_empty(&ioc->active_iocgs)) {
2413 ioc_start_period(ioc, &now);
2414 } else {
2415 ioc->busy_level = 0;
2416 ioc->vtime_err = 0;
2417 ioc->running = IOC_IDLE;
2418 }
2419
2420 ioc_refresh_vrate(ioc, &now);
2421 }
2422
2423 spin_unlock_irq(&ioc->lock);
2424 }
2425
2426 static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
2427 u64 abs_cost, struct ioc_now *now)
2428 {
2429 struct ioc *ioc = iocg->ioc;
2430 struct ioc_margins *margins = &ioc->margins;
2431 u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
2432 u32 hwi, adj_step;
2433 s64 margin;
2434 u64 cost, new_inuse;
2435
2436 current_hweight(iocg, NULL, &hwi);
2437 old_hwi = hwi;
2438 cost = abs_cost_to_cost(abs_cost, hwi);
2439 margin = now->vnow - vtime - cost;
2440
2441
2442 if (iocg->abs_vdebt)
2443 return cost;
2444
2445
2446
2447
2448
2449 if (margin >= iocg->saved_margin || margin >= margins->low ||
2450 iocg->inuse == iocg->active)
2451 return cost;
2452
2453 spin_lock_irq(&ioc->lock);
2454
2455
2456 if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
2457 spin_unlock_irq(&ioc->lock);
2458 return cost;
2459 }
2460
2461
2462
2463
2464
2465
2466
2467
2468 new_inuse = iocg->inuse;
2469 adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
2470 do {
2471 new_inuse = new_inuse + adj_step;
2472 propagate_weights(iocg, iocg->active, new_inuse, true, now);
2473 current_hweight(iocg, NULL, &hwi);
2474 cost = abs_cost_to_cost(abs_cost, hwi);
2475 } while (time_after64(vtime + cost, now->vnow) &&
2476 iocg->inuse != iocg->active);
2477
2478 spin_unlock_irq(&ioc->lock);
2479
2480 TRACE_IOCG_PATH(inuse_adjust, iocg, now,
2481 old_inuse, iocg->inuse, old_hwi, hwi);
2482
2483 return cost;
2484 }
2485
2486 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
2487 bool is_merge, u64 *costp)
2488 {
2489 struct ioc *ioc = iocg->ioc;
2490 u64 coef_seqio, coef_randio, coef_page;
2491 u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
2492 u64 seek_pages = 0;
2493 u64 cost = 0;
2494
2495 switch (bio_op(bio)) {
2496 case REQ_OP_READ:
2497 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
2498 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
2499 coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
2500 break;
2501 case REQ_OP_WRITE:
2502 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
2503 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
2504 coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
2505 break;
2506 default:
2507 goto out;
2508 }
2509
2510 if (iocg->cursor) {
2511 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
2512 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
2513 }
2514
2515 if (!is_merge) {
2516 if (seek_pages > LCOEF_RANDIO_PAGES) {
2517 cost += coef_randio;
2518 } else {
2519 cost += coef_seqio;
2520 }
2521 }
2522 cost += pages * coef_page;
2523 out:
2524 *costp = cost;
2525 }
2526
2527 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
2528 {
2529 u64 cost;
2530
2531 calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
2532 return cost;
2533 }
2534
2535 static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
2536 u64 *costp)
2537 {
2538 unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
2539
2540 switch (req_op(rq)) {
2541 case REQ_OP_READ:
2542 *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
2543 break;
2544 case REQ_OP_WRITE:
2545 *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
2546 break;
2547 default:
2548 *costp = 0;
2549 }
2550 }
2551
2552 static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
2553 {
2554 u64 cost;
2555
2556 calc_size_vtime_cost_builtin(rq, ioc, &cost);
2557 return cost;
2558 }
2559
2560 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
2561 {
2562 struct blkcg_gq *blkg = bio->bi_blkg;
2563 struct ioc *ioc = rqos_to_ioc(rqos);
2564 struct ioc_gq *iocg = blkg_to_iocg(blkg);
2565 struct ioc_now now;
2566 struct iocg_wait wait;
2567 u64 abs_cost, cost, vtime;
2568 bool use_debt, ioc_locked;
2569 unsigned long flags;
2570
2571
2572 if (!ioc->enabled || !iocg || !iocg->level)
2573 return;
2574
2575
2576 abs_cost = calc_vtime_cost(bio, iocg, false);
2577 if (!abs_cost)
2578 return;
2579
2580 if (!iocg_activate(iocg, &now))
2581 return;
2582
2583 iocg->cursor = bio_end_sector(bio);
2584 vtime = atomic64_read(&iocg->vtime);
2585 cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
2586
2587
2588
2589
2590
2591
2592 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
2593 time_before_eq64(vtime + cost, now.vnow)) {
2594 iocg_commit_bio(iocg, bio, abs_cost, cost);
2595 return;
2596 }
2597
2598
2599
2600
2601
2602
2603
2604
2605 use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
2606 ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
2607 retry_lock:
2608 iocg_lock(iocg, ioc_locked, &flags);
2609
2610
2611
2612
2613
2614
2615
2616
2617 if (unlikely(list_empty(&iocg->active_list))) {
2618 iocg_unlock(iocg, ioc_locked, &flags);
2619 iocg_commit_bio(iocg, bio, abs_cost, cost);
2620 return;
2621 }
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640 if (use_debt) {
2641 iocg_incur_debt(iocg, abs_cost, &now);
2642 if (iocg_kick_delay(iocg, &now))
2643 blkcg_schedule_throttle(rqos->q,
2644 (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2645 iocg_unlock(iocg, ioc_locked, &flags);
2646 return;
2647 }
2648
2649
2650 if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
2651 if (!ioc_locked) {
2652 iocg_unlock(iocg, false, &flags);
2653 ioc_locked = true;
2654 goto retry_lock;
2655 }
2656 propagate_weights(iocg, iocg->active, iocg->active, true,
2657 &now);
2658 }
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673 init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
2674 wait.wait.private = current;
2675 wait.bio = bio;
2676 wait.abs_cost = abs_cost;
2677 wait.committed = false;
2678
2679 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
2680 iocg_kick_waitq(iocg, ioc_locked, &now);
2681
2682 iocg_unlock(iocg, ioc_locked, &flags);
2683
2684 while (true) {
2685 set_current_state(TASK_UNINTERRUPTIBLE);
2686 if (wait.committed)
2687 break;
2688 io_schedule();
2689 }
2690
2691 /* waker already committed us, proceed */
2692 finish_wait(&iocg->waitq, &wait.wait);
2693 }
2694
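/*
 * rq_qos merge hook.  A bio being merged into an existing request still
 * consumes device time, so charge its (cheaper, is_merge) cost: directly if
 * the cgroup is within budget, otherwise as debt.
 */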
2695 static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
2696 struct bio *bio)
2697 {
2698 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2699 struct ioc *ioc = rqos_to_ioc(rqos);
2700 sector_t bio_end = bio_end_sector(bio);
2701 struct ioc_now now;
2702 u64 vtime, abs_cost, cost;
2703 unsigned long flags;
2704
2705 /* bypass if disabled, still initializing, or for root cgroup */
2706 if (!ioc->enabled || !iocg || !iocg->level)
2707 return;
2708
2709 abs_cost = calc_vtime_cost(bio, iocg, true);
2710 if (!abs_cost)
2711 return;
2712
2713 ioc_now(ioc, &now);
2714
2715 vtime = atomic64_read(&iocg->vtime);
2716 cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
2717
2718 /* update cursor if backmerging into the request at the cursor */
2719 if (blk_rq_pos(rq) < bio_end &&
2720 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
2721 iocg->cursor = bio_end;
2722
2723 /*
2724  * Charge if there's enough vtime budget and the existing request has
2725  * cost assigned.
2726  */
2727 if (rq->bio && rq->bio->bi_iocost_cost &&
2728 time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
2729 iocg_commit_bio(iocg, bio, abs_cost, cost);
2730 return;
2731 }
2732
2733 /*
2734  * Otherwise, account it as debt if @iocg is online, which it should
2735  * be for the vast majority of cases.  See debt handling in
2736  * ioc_rqos_throttle() for details.
2737  */
2738 spin_lock_irqsave(&ioc->lock, flags);
2739 spin_lock(&iocg->waitq.lock);
2740
2741 if (likely(!list_empty(&iocg->active_list))) {
2742 iocg_incur_debt(iocg, abs_cost, &now);
2743 if (iocg_kick_delay(iocg, &now))
2744 blkcg_schedule_throttle(rqos->q,
2745 (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2746 } else {
2747 iocg_commit_bio(iocg, bio, abs_cost, cost);
2748 }
2749
2750 spin_unlock(&iocg->waitq.lock);
2751 spin_unlock_irqrestore(&ioc->lock, flags);
2752 }
2753
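/* on bio completion, transfer the bio's charged cost to the cgroup's done_vtime */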
2754 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
2755 {
2756 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2757
2758 if (iocg && bio->bi_iocost_cost)
2759 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
2760 }
2761
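/*
 * rq_qos completion hook.  Compare the request's on-queue time, minus the
 * portion explained by its size, against the configured latency target and
 * count the request as met or missed; also accumulate the time spent waiting
 * for request allocation.  The period timer consumes these per-cpu counters.
 */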
2762 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
2763 {
2764 struct ioc *ioc = rqos_to_ioc(rqos);
2765 struct ioc_pcpu_stat *ccs;
2766 u64 on_q_ns, rq_wait_ns, size_nsec;
2767 int pidx, rw;
2768
2769 if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
2770 return;
2771
2772 switch (req_op(rq)) {
2773 case REQ_OP_READ:
2774 pidx = QOS_RLAT;
2775 rw = READ;
2776 break;
2777 case REQ_OP_WRITE:
2778 pidx = QOS_WLAT;
2779 rw = WRITE;
2780 break;
2781 default:
2782 return;
2783 }
2784
2785 on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
2786 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
2787 size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
2788
2789 ccs = get_cpu_ptr(ioc->pcpu_stat);
2790
2791 if (on_q_ns <= size_nsec ||
2792 on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
2793 local_inc(&ccs->missed[rw].nr_met);
2794 else
2795 local_inc(&ccs->missed[rw].nr_missed);
2796
2797 local64_add(rq_wait_ns, &ccs->rq_wait_ns);
2798
2799 put_cpu_ptr(ccs);
2800 }
2801
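/* re-derive controller parameters when the device queue depth changes */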
2802 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
2803 {
2804 struct ioc *ioc = rqos_to_ioc(rqos);
2805
2806 spin_lock_irq(&ioc->lock);
2807 ioc_refresh_params(ioc, false);
2808 spin_unlock_irq(&ioc->lock);
2809 }
2810
2811 static void ioc_rqos_exit(struct rq_qos *rqos)
2812 {
2813 struct ioc *ioc = rqos_to_ioc(rqos);
2814
2815 blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
2816
2817 spin_lock_irq(&ioc->lock);
2818 ioc->running = IOC_STOP;
2819 spin_unlock_irq(&ioc->lock);
2820
2821 del_timer_sync(&ioc->timer);
2822 free_percpu(ioc->pcpu_stat);
2823 kfree(ioc);
2824 }
2825
2826 static struct rq_qos_ops ioc_rqos_ops = {
2827 .throttle = ioc_rqos_throttle,
2828 .merge = ioc_rqos_merge,
2829 .done_bio = ioc_rqos_done_bio,
2830 .done = ioc_rqos_done,
2831 .queue_depth_changed = ioc_rqos_queue_depth_changed,
2832 .exit = ioc_rqos_exit,
2833 };
2834
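/*
 * Set up the iocost controller for @q: allocate the ioc and per-cpu stats,
 * register the rq_qos hooks and activate the blkcg policy.  Called lazily
 * from the first write to io.cost.qos or io.cost.model.
 */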
2835 static int blk_iocost_init(struct request_queue *q)
2836 {
2837 struct ioc *ioc;
2838 struct rq_qos *rqos;
2839 int i, cpu, ret;
2840
2841 ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
2842 if (!ioc)
2843 return -ENOMEM;
2844
2845 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
2846 if (!ioc->pcpu_stat) {
2847 kfree(ioc);
2848 return -ENOMEM;
2849 }
2850
2851 for_each_possible_cpu(cpu) {
2852 struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
2853
2854 for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
2855 local_set(&ccs->missed[i].nr_met, 0);
2856 local_set(&ccs->missed[i].nr_missed, 0);
2857 }
2858 local64_set(&ccs->rq_wait_ns, 0);
2859 }
2860
2861 rqos = &ioc->rqos;
2862 rqos->id = RQ_QOS_COST;
2863 rqos->ops = &ioc_rqos_ops;
2864 rqos->q = q;
2865
2866 spin_lock_init(&ioc->lock);
2867 timer_setup(&ioc->timer, ioc_timer_fn, 0);
2868 INIT_LIST_HEAD(&ioc->active_iocgs);
2869
2870 ioc->running = IOC_IDLE;
2871 ioc->vtime_base_rate = VTIME_PER_USEC;
2872 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
2873 seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
2874 ioc->period_at = ktime_to_us(ktime_get());
2875 atomic64_set(&ioc->cur_period, 0);
2876 atomic_set(&ioc->hweight_gen, 0);
2877
2878 spin_lock_irq(&ioc->lock);
2879 ioc->autop_idx = AUTOP_INVALID;
2880 ioc_refresh_params(ioc, true);
2881 spin_unlock_irq(&ioc->lock);
2882
2883 /*
2884  * rqos must be added before the policy is activated so that
2885  * ioc_pd_init() can look up the ioc from the queue.  This means the
2886  * rqos hooks may run before activation completes; that is safe because
2887  * no cost is charged while ioc->enabled is false.
2888  */
2889 ret = rq_qos_add(q, rqos);
2890 if (ret)
2891 goto err_free_ioc;
2892
2893 ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2894 if (ret)
2895 goto err_del_qos;
2896 return 0;
2897
2898 err_del_qos:
2899 rq_qos_del(q, rqos);
2900 err_free_ioc:
2901 free_percpu(ioc->pcpu_stat);
2902 kfree(ioc);
2903 return ret;
2904 }
2905
2906 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2907 {
2908 struct ioc_cgrp *iocc;
2909
2910 iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2911 if (!iocc)
2912 return NULL;
2913
2914 iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
2915 return &iocc->cpd;
2916 }
2917
2918 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2919 {
2920 kfree(container_of(cpd, struct ioc_cgrp, cpd));
2921 }
2922
2923 static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2924 struct blkcg *blkcg)
2925 {
2926 int levels = blkcg->css.cgroup->level + 1;
2927 struct ioc_gq *iocg;
2928
2929 iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
2930 if (!iocg)
2931 return NULL;
2932
2933 iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
2934 if (!iocg->pcpu_stat) {
2935 kfree(iocg);
2936 return NULL;
2937 }
2938
2939 return &iocg->pd;
2940 }
2941
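/*
 * Initialize a newly allocated iocg: vtimes start at the current vnow,
 * hweights at WEIGHT_ONE, and the ancestors[] table is filled so that
 * hierarchical weight propagation can walk from the root to this cgroup.
 */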
2942 static void ioc_pd_init(struct blkg_policy_data *pd)
2943 {
2944 struct ioc_gq *iocg = pd_to_iocg(pd);
2945 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2946 struct ioc *ioc = q_to_ioc(blkg->q);
2947 struct ioc_now now;
2948 struct blkcg_gq *tblkg;
2949 unsigned long flags;
2950
2951 ioc_now(ioc, &now);
2952
2953 iocg->ioc = ioc;
2954 atomic64_set(&iocg->vtime, now.vnow);
2955 atomic64_set(&iocg->done_vtime, now.vnow);
2956 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2957 INIT_LIST_HEAD(&iocg->active_list);
2958 INIT_LIST_HEAD(&iocg->walk_list);
2959 INIT_LIST_HEAD(&iocg->surplus_list);
2960 iocg->hweight_active = WEIGHT_ONE;
2961 iocg->hweight_inuse = WEIGHT_ONE;
2962
2963 init_waitqueue_head(&iocg->waitq);
2964 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2965 iocg->waitq_timer.function = iocg_waitq_timer_fn;
2966
2967 iocg->level = blkg->blkcg->css.cgroup->level;
2968
2969 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2970 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2971 iocg->ancestors[tiocg->level] = tiocg;
2972 }
2973
2974 spin_lock_irqsave(&ioc->lock, flags);
2975 weight_updated(iocg, &now);
2976 spin_unlock_irqrestore(&ioc->lock, flags);
2977 }
2978
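/*
 * Tear down an iocg: drop its weight contribution and take it off the active
 * list under ioc->lock, then cancel the waitq timer and free the memory.
 */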
2979 static void ioc_pd_free(struct blkg_policy_data *pd)
2980 {
2981 struct ioc_gq *iocg = pd_to_iocg(pd);
2982 struct ioc *ioc = iocg->ioc;
2983 unsigned long flags;
2984
2985 if (ioc) {
2986 spin_lock_irqsave(&ioc->lock, flags);
2987
2988 if (!list_empty(&iocg->active_list)) {
2989 struct ioc_now now;
2990
2991 ioc_now(ioc, &now);
2992 propagate_weights(iocg, 0, 0, false, &now);
2993 list_del_init(&iocg->active_list);
2994 }
2995
2996 WARN_ON_ONCE(!list_empty(&iocg->walk_list));
2997 WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
2998
2999 spin_unlock_irqrestore(&ioc->lock, flags);
3000
3001 hrtimer_cancel(&iocg->waitq_timer);
3002 }
3003 free_percpu(iocg->pcpu_stat);
3004 kfree(iocg);
3005 }
3006
3007 static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
3008 {
3009 struct ioc_gq *iocg = pd_to_iocg(pd);
3010 struct ioc *ioc = iocg->ioc;
3011
3012 if (!ioc->enabled)
3013 return;
3014
3015 if (iocg->level == 0) {
3016 unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
3017 ioc->vtime_base_rate * 10000,
3018 VTIME_PER_USEC);
3019 seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
3020 }
3021
3022 seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);
3023
3024 if (blkcg_debug_stats)
3025 seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
3026 iocg->last_stat.wait_us,
3027 iocg->last_stat.indebt_us,
3028 iocg->last_stat.indelay_us);
3029 }
3030
3031 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3032 int off)
3033 {
3034 const char *dname = blkg_dev_name(pd->blkg);
3035 struct ioc_gq *iocg = pd_to_iocg(pd);
3036
3037 if (dname && iocg->cfg_weight)
3038 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
3039 return 0;
3040 }
3041
3042
3043 static int ioc_weight_show(struct seq_file *sf, void *v)
3044 {
3045 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3046 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3047
3048 seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
3049 blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
3050 &blkcg_policy_iocost, seq_cft(sf)->private, false);
3051 return 0;
3052 }
3053
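/*
 * io.weight accepts either "default $WEIGHT" / "$WEIGHT" to set the
 * cgroup-wide default, or "$MAJ:$MIN $WEIGHT" to set a per-device weight
 * ("$MAJ:$MIN default" clears the per-device override).
 */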
3054 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
3055 size_t nbytes, loff_t off)
3056 {
3057 struct blkcg *blkcg = css_to_blkcg(of_css(of));
3058 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3059 struct blkg_conf_ctx ctx;
3060 struct ioc_now now;
3061 struct ioc_gq *iocg;
3062 u32 v;
3063 int ret;
3064
3065 if (!strchr(buf, ':')) {
3066 struct blkcg_gq *blkg;
3067
3068 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
3069 return -EINVAL;
3070
3071 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3072 return -EINVAL;
3073
3074 spin_lock_irq(&blkcg->lock);
3075 iocc->dfl_weight = v * WEIGHT_ONE;
3076 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
3077 struct ioc_gq *iocg = blkg_to_iocg(blkg);
3078
3079 if (iocg) {
3080 spin_lock(&iocg->ioc->lock);
3081 ioc_now(iocg->ioc, &now);
3082 weight_updated(iocg, &now);
3083 spin_unlock(&iocg->ioc->lock);
3084 }
3085 }
3086 spin_unlock_irq(&blkcg->lock);
3087
3088 return nbytes;
3089 }
3090
3091 ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
3092 if (ret)
3093 return ret;
3094
3095 iocg = blkg_to_iocg(ctx.blkg);
3096
3097 if (!strncmp(ctx.body, "default", 7)) {
3098 v = 0;
3099 } else {
3100 if (!sscanf(ctx.body, "%u", &v))
3101 goto einval;
3102 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3103 goto einval;
3104 }
3105
3106 spin_lock(&iocg->ioc->lock);
3107 iocg->cfg_weight = v * WEIGHT_ONE;
3108 ioc_now(iocg->ioc, &now);
3109 weight_updated(iocg, &now);
3110 spin_unlock(&iocg->ioc->lock);
3111
3112 blkg_conf_finish(&ctx);
3113 return nbytes;
3114
3115 einval:
3116 blkg_conf_finish(&ctx);
3117 return -EINVAL;
3118 }
3119
3120 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3121 int off)
3122 {
3123 const char *dname = blkg_dev_name(pd->blkg);
3124 struct ioc *ioc = pd_to_iocg(pd)->ioc;
3125
3126 if (!dname)
3127 return 0;
3128
3129 seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
3130 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
3131 ioc->params.qos[QOS_RPPM] / 10000,
3132 ioc->params.qos[QOS_RPPM] % 10000 / 100,
3133 ioc->params.qos[QOS_RLAT],
3134 ioc->params.qos[QOS_WPPM] / 10000,
3135 ioc->params.qos[QOS_WPPM] % 10000 / 100,
3136 ioc->params.qos[QOS_WLAT],
3137 ioc->params.qos[QOS_MIN] / 10000,
3138 ioc->params.qos[QOS_MIN] % 10000 / 100,
3139 ioc->params.qos[QOS_MAX] / 10000,
3140 ioc->params.qos[QOS_MAX] % 10000 / 100);
3141 return 0;
3142 }
3143
3144 static int ioc_qos_show(struct seq_file *sf, void *v)
3145 {
3146 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3147
3148 blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
3149 &blkcg_policy_iocost, seq_cft(sf)->private, false);
3150 return 0;
3151 }
3152
3153 static const match_table_t qos_ctrl_tokens = {
3154 { QOS_ENABLE, "enable=%u" },
3155 { QOS_CTRL, "ctrl=%s" },
3156 { NR_QOS_CTRL_PARAMS, NULL },
3157 };
3158
3159 static const match_table_t qos_tokens = {
3160 { QOS_RPPM, "rpct=%s" },
3161 { QOS_RLAT, "rlat=%u" },
3162 { QOS_WPPM, "wpct=%s" },
3163 { QOS_WLAT, "wlat=%u" },
3164 { QOS_MIN, "min=%s" },
3165 { QOS_MAX, "max=%s" },
3166 { NR_QOS_PARAMS, NULL },
3167 };
3168
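/*
 * io.cost.qos accepts "$MAJ:$MIN" followed by "enable=", "ctrl=", "rpct=",
 * "rlat=", "wpct=", "wlat=", "min=" and "max=" key=value pairs.  Setting any
 * QoS parameter switches the controller to user-specified parameters;
 * "ctrl=auto" reverts to the automatically derived ones.
 */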
3169 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
3170 size_t nbytes, loff_t off)
3171 {
3172 struct block_device *bdev;
3173 struct ioc *ioc;
3174 u32 qos[NR_QOS_PARAMS];
3175 bool enable, user;
3176 char *p;
3177 int ret;
3178
3179 bdev = blkcg_conf_open_bdev(&input);
3180 if (IS_ERR(bdev))
3181 return PTR_ERR(bdev);
3182
3183 ioc = q_to_ioc(bdev_get_queue(bdev));
3184 if (!ioc) {
3185 ret = blk_iocost_init(bdev_get_queue(bdev));
3186 if (ret)
3187 goto err;
3188 ioc = q_to_ioc(bdev_get_queue(bdev));
3189 }
3190
3191 spin_lock_irq(&ioc->lock);
3192 memcpy(qos, ioc->params.qos, sizeof(qos));
3193 enable = ioc->enabled;
3194 user = ioc->user_qos_params;
3195 spin_unlock_irq(&ioc->lock);
3196
3197 while ((p = strsep(&input, " \t\n"))) {
3198 substring_t args[MAX_OPT_ARGS];
3199 char buf[32];
3200 int tok;
3201 s64 v;
3202
3203 if (!*p)
3204 continue;
3205
3206 switch (match_token(p, qos_ctrl_tokens, args)) {
3207 case QOS_ENABLE:
3208 match_u64(&args[0], &v);
3209 enable = v;
3210 continue;
3211 case QOS_CTRL:
3212 match_strlcpy(buf, &args[0], sizeof(buf));
3213 if (!strcmp(buf, "auto"))
3214 user = false;
3215 else if (!strcmp(buf, "user"))
3216 user = true;
3217 else
3218 goto einval;
3219 continue;
3220 }
3221
3222 tok = match_token(p, qos_tokens, args);
3223 switch (tok) {
3224 case QOS_RPPM:
3225 case QOS_WPPM:
3226 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3227 sizeof(buf))
3228 goto einval;
3229 if (cgroup_parse_float(buf, 2, &v))
3230 goto einval;
3231 if (v < 0 || v > 10000)
3232 goto einval;
3233 qos[tok] = v * 100;
3234 break;
3235 case QOS_RLAT:
3236 case QOS_WLAT:
3237 if (match_u64(&args[0], &v))
3238 goto einval;
3239 qos[tok] = v;
3240 break;
3241 case QOS_MIN:
3242 case QOS_MAX:
3243 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3244 sizeof(buf))
3245 goto einval;
3246 if (cgroup_parse_float(buf, 2, &v))
3247 goto einval;
3248 if (v < 0)
3249 goto einval;
3250 qos[tok] = clamp_t(s64, v * 100,
3251 VRATE_MIN_PPM, VRATE_MAX_PPM);
3252 break;
3253 default:
3254 goto einval;
3255 }
3256 user = true;
3257 }
3258
3259 if (qos[QOS_MIN] > qos[QOS_MAX])
3260 goto einval;
3261
3262 spin_lock_irq(&ioc->lock);
3263
3264 if (enable) {
3265 blk_stat_enable_accounting(ioc->rqos.q);
3266 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
3267 ioc->enabled = true;
3268 } else {
3269 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
3270 ioc->enabled = false;
3271 }
3272
3273 if (user) {
3274 memcpy(ioc->params.qos, qos, sizeof(qos));
3275 ioc->user_qos_params = true;
3276 } else {
3277 ioc->user_qos_params = false;
3278 }
3279
3280 ioc_refresh_params(ioc, true);
3281 spin_unlock_irq(&ioc->lock);
3282
3283 blkdev_put_no_open(bdev);
3284 return nbytes;
3285 einval:
3286 ret = -EINVAL;
3287 err:
3288 blkdev_put_no_open(bdev);
3289 return ret;
3290 }
3291
3292 static u64 ioc_cost_model_prfill(struct seq_file *sf,
3293 struct blkg_policy_data *pd, int off)
3294 {
3295 const char *dname = blkg_dev_name(pd->blkg);
3296 struct ioc *ioc = pd_to_iocg(pd)->ioc;
3297 u64 *u = ioc->params.i_lcoefs;
3298
3299 if (!dname)
3300 return 0;
3301
3302 seq_printf(sf, "%s ctrl=%s model=linear "
3303 "rbps=%llu rseqiops=%llu rrandiops=%llu "
3304 "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
3305 dname, ioc->user_cost_model ? "user" : "auto",
3306 u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
3307 u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
3308 return 0;
3309 }
3310
3311 static int ioc_cost_model_show(struct seq_file *sf, void *v)
3312 {
3313 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3314
3315 blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
3316 &blkcg_policy_iocost, seq_cft(sf)->private, false);
3317 return 0;
3318 }
3319
3320 static const match_table_t cost_ctrl_tokens = {
3321 { COST_CTRL, "ctrl=%s" },
3322 { COST_MODEL, "model=%s" },
3323 { NR_COST_CTRL_PARAMS, NULL },
3324 };
3325
3326 static const match_table_t i_lcoef_tokens = {
3327 { I_LCOEF_RBPS, "rbps=%u" },
3328 { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
3329 { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
3330 { I_LCOEF_WBPS, "wbps=%u" },
3331 { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
3332 { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
3333 { NR_I_LCOEFS, NULL },
3334 };
3335
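/*
 * io.cost.model accepts "$MAJ:$MIN" followed by "ctrl=", "model=linear" and
 * the six linear coefficients (rbps, rseqiops, rrandiops, wbps, wseqiops,
 * wrandiops).  Supplying any coefficient switches to a user cost model;
 * "ctrl=auto" reverts to the automatically selected parameters.
 */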
3336 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
3337 size_t nbytes, loff_t off)
3338 {
3339 struct block_device *bdev;
3340 struct ioc *ioc;
3341 u64 u[NR_I_LCOEFS];
3342 bool user;
3343 char *p;
3344 int ret;
3345
3346 bdev = blkcg_conf_open_bdev(&input);
3347 if (IS_ERR(bdev))
3348 return PTR_ERR(bdev);
3349
3350 ioc = q_to_ioc(bdev_get_queue(bdev));
3351 if (!ioc) {
3352 ret = blk_iocost_init(bdev_get_queue(bdev));
3353 if (ret)
3354 goto err;
3355 ioc = q_to_ioc(bdev_get_queue(bdev));
3356 }
3357
3358 spin_lock_irq(&ioc->lock);
3359 memcpy(u, ioc->params.i_lcoefs, sizeof(u));
3360 user = ioc->user_cost_model;
3361 spin_unlock_irq(&ioc->lock);
3362
3363 while ((p = strsep(&input, " \t\n"))) {
3364 substring_t args[MAX_OPT_ARGS];
3365 char buf[32];
3366 int tok;
3367 u64 v;
3368
3369 if (!*p)
3370 continue;
3371
3372 switch (match_token(p, cost_ctrl_tokens, args)) {
3373 case COST_CTRL:
3374 match_strlcpy(buf, &args[0], sizeof(buf));
3375 if (!strcmp(buf, "auto"))
3376 user = false;
3377 else if (!strcmp(buf, "user"))
3378 user = true;
3379 else
3380 goto einval;
3381 continue;
3382 case COST_MODEL:
3383 match_strlcpy(buf, &args[0], sizeof(buf));
3384 if (strcmp(buf, "linear"))
3385 goto einval;
3386 continue;
3387 }
3388
3389 tok = match_token(p, i_lcoef_tokens, args);
3390 if (tok == NR_I_LCOEFS)
3391 goto einval;
3392 if (match_u64(&args[0], &v))
3393 goto einval;
3394 u[tok] = v;
3395 user = true;
3396 }
3397
3398 spin_lock_irq(&ioc->lock);
3399 if (user) {
3400 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
3401 ioc->user_cost_model = true;
3402 } else {
3403 ioc->user_cost_model = false;
3404 }
3405 ioc_refresh_params(ioc, true);
3406 spin_unlock_irq(&ioc->lock);
3407
3408 blkdev_put_no_open(bdev);
3409 return nbytes;
3410
3411 einval:
3412 ret = -EINVAL;
3413 err:
3414 blkdev_put_no_open(bdev);
3415 return ret;
3416 }
3417
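/*
 * Example configuration through the cgroup files below.  The device number
 * and coefficient values are illustrative only:
 *
 *   # echo "8:16 enable=1 rpct=95.00 rlat=5000 wpct=95.00 wlat=10000" \
 *       > /sys/fs/cgroup/io.cost.qos
 *   # echo "8:16 ctrl=user model=linear rbps=500000000 rseqiops=50000 \
 *       rrandiops=30000 wbps=300000000 wseqiops=30000 wrandiops=20000" \
 *       > /sys/fs/cgroup/io.cost.model
 *   # echo "default 100" > io.weight
 */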
3418 static struct cftype ioc_files[] = {
3419 {
3420 .name = "weight",
3421 .flags = CFTYPE_NOT_ON_ROOT,
3422 .seq_show = ioc_weight_show,
3423 .write = ioc_weight_write,
3424 },
3425 {
3426 .name = "cost.qos",
3427 .flags = CFTYPE_ONLY_ON_ROOT,
3428 .seq_show = ioc_qos_show,
3429 .write = ioc_qos_write,
3430 },
3431 {
3432 .name = "cost.model",
3433 .flags = CFTYPE_ONLY_ON_ROOT,
3434 .seq_show = ioc_cost_model_show,
3435 .write = ioc_cost_model_write,
3436 },
3437 {}
3438 };
3439
3440 static struct blkcg_policy blkcg_policy_iocost = {
3441 .dfl_cftypes = ioc_files,
3442 .cpd_alloc_fn = ioc_cpd_alloc,
3443 .cpd_free_fn = ioc_cpd_free,
3444 .pd_alloc_fn = ioc_pd_alloc,
3445 .pd_init_fn = ioc_pd_init,
3446 .pd_free_fn = ioc_pd_free,
3447 .pd_stat_fn = ioc_pd_stat,
3448 };
3449
3450 static int __init ioc_init(void)
3451 {
3452 return blkcg_policy_register(&blkcg_policy_iocost);
3453 }
3454
3455 static void __exit ioc_exit(void)
3456 {
3457 blkcg_policy_unregister(&blkcg_policy_iocost);
3458 }
3459
3460 module_init(ioc_init);
3461 module_exit(ioc_exit);