Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Historical Service Time
0004  *
0005  *  Keeps a time-weighted exponential moving average of the historical
0006  *  service time. Estimates future service time based on the historical
0007  *  service time and the number of outstanding requests.
0008  *
0009  *  Marks paths stale if they have not finished within hst *
0010  *  num_paths. If a path is stale and unused, we will send a single
0011  *  request to probe in case the path has improved. This situation
0012  *  generally arises if the path is so much worse than others that it
0013  *  will never have the best estimated service time, or if the entire
0014  *  multipath device is unused. If a path is stale and in use, limit the
0015  *  number of requests it can receive with the assumption that the path
0016  *  has become degraded.
0017  *
0018  *  To avoid repeatedly calculating exponents for time weighting, times
0019  *  are split into HST_WEIGHT_COUNT buckets each (1 << HST_BUCKET_SHIFT)
0020  *  ns, and the weighting is pre-calculated.
0021  *
0022  */
0023 
0024 #include "dm.h"
0025 #include "dm-path-selector.h"
0026 
0027 #include <linux/blkdev.h>
0028 #include <linux/slab.h>
0029 #include <linux/module.h>
0030 
0031 
/* Prefix for DMERR/DMINFO log messages from this module. */
#define DM_MSG_PREFIX   "multipath historical-service-time"
#define HST_MIN_IO 1	/* default repeat_count: re-select after every I/O */
#define HST_VERSION "0.1.1"

#define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)	/* largest representable value */
#define HST_FIXED_1 (1 << HST_FIXED_SHIFT)	/* 1.0 in fixed point */
#define HST_FIXED_95 972	/* ~0.95 in fixed point; default EMA base weight */
#define HST_MAX_INFLIGHT HST_FIXED_1	/* overflow-guard threshold used in hst_compare() */
#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
#define HST_WEIGHT_COUNT 64ULL	/* number of pre-computed time-bucket weights */
0043 
/* Per path-selector instance state. */
struct selector {
    struct list_head valid_paths;	/* path_info list: usable paths */
    struct list_head failed_paths;	/* path_info list: failed paths */
    int valid_count;			/* number of entries on valid_paths */
    spinlock_t lock;			/* protects both lists and valid_count */

    /* Pre-computed EMA weights, indexed by elapsed-time bucket (see hst_weight). */
    unsigned int weights[HST_WEIGHT_COUNT];
    unsigned int threshold_multiplier;	/* similarity threshold; see hst_create() */
};
0053 
/* Per-path state; the u64 fields are protected by @lock. */
struct path_info {
    struct list_head list;	/* linked on selector valid_paths/failed_paths */
    struct dm_path *path;
    unsigned int repeat_count;	/* number of I/Os before switching path */

    spinlock_t lock;

    u64 historical_service_time; /* Fixed point */

    u64 stale_after;	/* ns timestamp after which the estimate is stale */
    u64 last_finish;	/* ns timestamp of this path's last completion */

    u64 outstanding;	/* in-flight request count */
};
0068 
0069 /**
0070  * fixed_power - compute: x^n, in O(log n) time
0071  *
0072  * @x:         base of the power
0073  * @frac_bits: fractional bits of @x
0074  * @n:         power to raise @x to.
0075  *
0076  * By exploiting the relation between the definition of the natural power
0077  * function: x^n := x*x*...*x (x multiplied by itself for n times), and
0078  * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
0079  * (where: n_i \elem {0, 1}, the binary vector representing n),
0080  * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
0081  * of course trivially computable in O(log_2 n), the length of our binary
0082  * vector.
0083  *
0084  * (see: kernel/sched/loadavg.c)
0085  */
static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
{
    unsigned long acc = 1UL << frac_bits;	/* fixed-point 1.0 */

    /* Binary exponentiation: consume one bit of n per iteration. */
    while (n) {
        if (n & 1) {
            /* Fold the current square into the accumulator, rounding. */
            acc *= x;
            acc += 1UL << (frac_bits - 1);
            acc >>= frac_bits;
        }
        n >>= 1;
        if (n) {
            /* Square the base for the next binary digit, rounding. */
            x *= x;
            x += 1UL << (frac_bits - 1);
            x >>= frac_bits;
        }
    }

    return acc;
}
0108 
0109 /*
0110  * Calculate the next value of an exponential moving average
0111  * a_1 = a_0 * e + a * (1 - e)
0112  *
0113  * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
0114  * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
0115  * @weight: [0, HST_FIXED_1]
0116  *
0117  * Note:
0118  *   To account for multiple periods in the same calculation,
0119  *   a_n = a_0 * e^n + a * (1 - e^n),
0120  *   so call fixed_ema(last, next, pow(weight, N))
0121  */
static u64 fixed_ema(u64 last, u64 next, u64 weight)
{
    /* Weighted blend of the old average and the new sample. */
    u64 acc = last * weight + next * (HST_FIXED_1 - weight);

    /* Round to nearest before dropping the fractional bits. */
    acc += 1ULL << (HST_FIXED_SHIFT - 1);
    return acc >> HST_FIXED_SHIFT;
}
0129 
0130 static struct selector *alloc_selector(void)
0131 {
0132     struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
0133 
0134     if (s) {
0135         INIT_LIST_HEAD(&s->valid_paths);
0136         INIT_LIST_HEAD(&s->failed_paths);
0137         spin_lock_init(&s->lock);
0138         s->valid_count = 0;
0139     }
0140 
0141     return s;
0142 }
0143 
0144 /*
0145  * Get the weight for a given time span.
0146  */
0147 static u64 hst_weight(struct path_selector *ps, u64 delta)
0148 {
0149     struct selector *s = ps->context;
0150     int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
0151                HST_WEIGHT_COUNT - 1);
0152 
0153     return s->weights[bucket];
0154 }
0155 
0156 /*
0157  * Set up the weights array.
0158  *
0159  * weights[len-1] = 0
0160  * weights[n] = base ^ (n + 1)
0161  */
0162 static void hst_set_weights(struct path_selector *ps, unsigned int base)
0163 {
0164     struct selector *s = ps->context;
0165     int i;
0166 
0167     if (base >= HST_FIXED_1)
0168         return;
0169 
0170     for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
0171         s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
0172     s->weights[HST_WEIGHT_COUNT - 1] = 0;
0173 }
0174 
0175 static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
0176 {
0177     struct selector *s;
0178     unsigned int base_weight = HST_FIXED_95;
0179     unsigned int threshold_multiplier = 0;
0180     char dummy;
0181 
0182     /*
0183      * Arguments: [<base_weight> [<threshold_multiplier>]]
0184      *   <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
0185      *                  value of 0 will completely ignore any history.
0186      *                  If not given, default (HST_FIXED_95) is used.
0187      *   <threshold_multiplier>: Minimum threshold multiplier for paths to
0188      *                  be considered different. That is, a path is
0189      *                  considered different iff (p1 > N * p2) where p1
0190      *                  is the path with higher service time. A threshold
0191      *                  of 1 or 0 has no effect. Defaults to 0.
0192      */
0193     if (argc > 2)
0194         return -EINVAL;
0195 
0196     if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
0197          base_weight >= HST_FIXED_1)) {
0198         return -EINVAL;
0199     }
0200 
0201     if (argc > 1 && (sscanf(argv[1], "%u%c",
0202                 &threshold_multiplier, &dummy) != 1)) {
0203         return -EINVAL;
0204     }
0205 
0206     s = alloc_selector();
0207     if (!s)
0208         return -ENOMEM;
0209 
0210     ps->context = s;
0211 
0212     hst_set_weights(ps, base_weight);
0213     s->threshold_multiplier = threshold_multiplier;
0214     return 0;
0215 }
0216 
0217 static void free_paths(struct list_head *paths)
0218 {
0219     struct path_info *pi, *next;
0220 
0221     list_for_each_entry_safe(pi, next, paths, list) {
0222         list_del(&pi->list);
0223         kfree(pi);
0224     }
0225 }
0226 
0227 static void hst_destroy(struct path_selector *ps)
0228 {
0229     struct selector *s = ps->context;
0230 
0231     free_paths(&s->valid_paths);
0232     free_paths(&s->failed_paths);
0233     kfree(s);
0234     ps->context = NULL;
0235 }
0236 
0237 static int hst_status(struct path_selector *ps, struct dm_path *path,
0238              status_type_t type, char *result, unsigned int maxlen)
0239 {
0240     unsigned int sz = 0;
0241     struct path_info *pi;
0242 
0243     if (!path) {
0244         struct selector *s = ps->context;
0245 
0246         DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
0247     } else {
0248         pi = path->pscontext;
0249 
0250         switch (type) {
0251         case STATUSTYPE_INFO:
0252             DMEMIT("%llu %llu %llu ", pi->historical_service_time,
0253                    pi->outstanding, pi->stale_after);
0254             break;
0255         case STATUSTYPE_TABLE:
0256             DMEMIT("0 ");
0257             break;
0258         case STATUSTYPE_IMA:
0259             *result = '\0';
0260             break;
0261         }
0262     }
0263 
0264     return sz;
0265 }
0266 
0267 static int hst_add_path(struct path_selector *ps, struct dm_path *path,
0268                int argc, char **argv, char **error)
0269 {
0270     struct selector *s = ps->context;
0271     struct path_info *pi;
0272     unsigned int repeat_count = HST_MIN_IO;
0273     char dummy;
0274     unsigned long flags;
0275 
0276     /*
0277      * Arguments: [<repeat_count>]
0278      *   <repeat_count>: The number of I/Os before switching path.
0279      *                   If not given, default (HST_MIN_IO) is used.
0280      */
0281     if (argc > 1) {
0282         *error = "historical-service-time ps: incorrect number of arguments";
0283         return -EINVAL;
0284     }
0285 
0286     if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
0287         *error = "historical-service-time ps: invalid repeat count";
0288         return -EINVAL;
0289     }
0290 
0291     /* allocate the path */
0292     pi = kmalloc(sizeof(*pi), GFP_KERNEL);
0293     if (!pi) {
0294         *error = "historical-service-time ps: Error allocating path context";
0295         return -ENOMEM;
0296     }
0297 
0298     pi->path = path;
0299     pi->repeat_count = repeat_count;
0300 
0301     pi->historical_service_time = HST_FIXED_1;
0302 
0303     spin_lock_init(&pi->lock);
0304     pi->outstanding = 0;
0305 
0306     pi->stale_after = 0;
0307     pi->last_finish = 0;
0308 
0309     path->pscontext = pi;
0310 
0311     spin_lock_irqsave(&s->lock, flags);
0312     list_add_tail(&pi->list, &s->valid_paths);
0313     s->valid_count++;
0314     spin_unlock_irqrestore(&s->lock, flags);
0315 
0316     return 0;
0317 }
0318 
0319 static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
0320 {
0321     struct selector *s = ps->context;
0322     struct path_info *pi = path->pscontext;
0323     unsigned long flags;
0324 
0325     spin_lock_irqsave(&s->lock, flags);
0326     list_move(&pi->list, &s->failed_paths);
0327     s->valid_count--;
0328     spin_unlock_irqrestore(&s->lock, flags);
0329 }
0330 
0331 static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
0332 {
0333     struct selector *s = ps->context;
0334     struct path_info *pi = path->pscontext;
0335     unsigned long flags;
0336 
0337     spin_lock_irqsave(&s->lock, flags);
0338     list_move_tail(&pi->list, &s->valid_paths);
0339     s->valid_count++;
0340     spin_unlock_irqrestore(&s->lock, flags);
0341 
0342     return 0;
0343 }
0344 
/*
 * Take a consistent snapshot of the fields of @pi that hst_compare()
 * reads, under the path's lock.
 */
static void hst_fill_compare(struct path_info *pi, u64 *hst,
                 u64 *out, u64 *stale)
{
    unsigned long flags;

    spin_lock_irqsave(&pi->lock, flags);
    *hst = pi->historical_service_time;
    *out = pi->outstanding;
    *stale = pi->stale_after;
    spin_unlock_irqrestore(&pi->lock, flags);
}
0356 
0357 /*
0358  * Compare the estimated service time of 2 paths, pi1 and pi2,
0359  * for the incoming I/O.
0360  *
0361  * Returns:
0362  * < 0 : pi1 is better
0363  * 0   : no difference between pi1 and pi2
0364  * > 0 : pi2 is better
0365  *
0366  */
0367 static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
0368                  u64 time_now, struct path_selector *ps)
0369 {
0370     struct selector *s = ps->context;
0371     u64 hst1, hst2;
0372     long long out1, out2, stale1, stale2;
0373     int pi2_better, over_threshold;
0374 
0375     hst_fill_compare(pi1, &hst1, &out1, &stale1);
0376     hst_fill_compare(pi2, &hst2, &out2, &stale2);
0377 
0378     /* Check here if estimated latency for two paths are too similar.
0379      * If this is the case, we skip extra calculation and just compare
0380      * outstanding requests. In this case, any unloaded paths will
0381      * be preferred.
0382      */
0383     if (hst1 > hst2)
0384         over_threshold = hst1 > (s->threshold_multiplier * hst2);
0385     else
0386         over_threshold = hst2 > (s->threshold_multiplier * hst1);
0387 
0388     if (!over_threshold)
0389         return out1 - out2;
0390 
0391     /*
0392      * If an unloaded path is stale, choose it. If both paths are unloaded,
0393      * choose path that is the most stale.
0394      * (If one path is loaded, choose the other)
0395      */
0396     if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
0397         (!out1 && !out2))
0398         return (!out2 * stale1) - (!out1 * stale2);
0399 
0400     /* Compare estimated service time. If outstanding is the same, we
0401      * don't need to multiply
0402      */
0403     if (out1 == out2) {
0404         pi2_better = hst1 > hst2;
0405     } else {
0406         /* Potential overflow with out >= 1024 */
0407         if (unlikely(out1 >= HST_MAX_INFLIGHT ||
0408                  out2 >= HST_MAX_INFLIGHT)) {
0409             /* If over 1023 in-flights, we may overflow if hst
0410              * is at max. (With this shift we still overflow at
0411              * 1048576 in-flights, which is high enough).
0412              */
0413             hst1 >>= HST_FIXED_SHIFT;
0414             hst2 >>= HST_FIXED_SHIFT;
0415         }
0416         pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
0417     }
0418 
0419     /* In the case that the 'winner' is stale, limit to equal usage. */
0420     if (pi2_better) {
0421         if (stale2 < time_now)
0422             return out1 - out2;
0423         return 1;
0424     }
0425     if (stale1 < time_now)
0426         return out1 - out2;
0427     return -1;
0428 }
0429 
0430 static struct dm_path *hst_select_path(struct path_selector *ps,
0431                        size_t nr_bytes)
0432 {
0433     struct selector *s = ps->context;
0434     struct path_info *pi = NULL, *best = NULL;
0435     u64 time_now = ktime_get_ns();
0436     struct dm_path *ret = NULL;
0437     unsigned long flags;
0438 
0439     spin_lock_irqsave(&s->lock, flags);
0440     if (list_empty(&s->valid_paths))
0441         goto out;
0442 
0443     list_for_each_entry(pi, &s->valid_paths, list) {
0444         if (!best || (hst_compare(pi, best, time_now, ps) < 0))
0445             best = pi;
0446     }
0447 
0448     if (!best)
0449         goto out;
0450 
0451     /* Move last used path to end (least preferred in case of ties) */
0452     list_move_tail(&best->list, &s->valid_paths);
0453 
0454     ret = best->path;
0455 
0456 out:
0457     spin_unlock_irqrestore(&s->lock, flags);
0458     return ret;
0459 }
0460 
0461 static int hst_start_io(struct path_selector *ps, struct dm_path *path,
0462             size_t nr_bytes)
0463 {
0464     struct path_info *pi = path->pscontext;
0465     unsigned long flags;
0466 
0467     spin_lock_irqsave(&pi->lock, flags);
0468     pi->outstanding++;
0469     spin_unlock_irqrestore(&pi->lock, flags);
0470 
0471     return 0;
0472 }
0473 
0474 static u64 path_service_time(struct path_info *pi, u64 start_time)
0475 {
0476     u64 now = ktime_get_ns();
0477 
0478     /* if a previous disk request has finished after this IO was
0479      * sent to the hardware, pretend the submission happened
0480      * serially.
0481      */
0482     if (time_after64(pi->last_finish, start_time))
0483         start_time = pi->last_finish;
0484 
0485     pi->last_finish = now;
0486     if (time_before64(now, start_time))
0487         return 0;
0488 
0489     return now - start_time;
0490 }
0491 
/*
 * Completion hook: fold this request's service time into the path's
 * moving average and refresh its staleness deadline.
 */
static int hst_end_io(struct path_selector *ps, struct dm_path *path,
              size_t nr_bytes, u64 start_time)
{
    struct path_info *pi = path->pscontext;
    struct selector *s = ps->context;
    unsigned long flags;
    u64 st;

    spin_lock_irqsave(&pi->lock, flags);

    /* Service time in ns, serialized against the path's last completion. */
    st = path_service_time(pi, start_time);
    pi->outstanding--;
    /*
     * Fold the sample into the EMA: convert to fixed point (clamped to
     * HST_FIXED_MAX so the scale-up cannot exceed the representable
     * range) and weight it by the time span the sample covers.
     */
    pi->historical_service_time =
        fixed_ema(pi->historical_service_time,
              min(st * HST_FIXED_1, HST_FIXED_MAX),
              hst_weight(ps, st));

    /*
     * On request end, mark path as fresh. If a path hasn't
     * finished any requests within the fresh period, the estimated
     * service time is considered too optimistic and we limit the
     * maximum requests on that path.
     */
    /*
     * NOTE(review): s->valid_count is read here under pi->lock only,
     * not s->lock — presumably a stale count is acceptable for this
     * heuristic; confirm against the locking rules for struct selector.
     */
    pi->stale_after = pi->last_finish +
        (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));

    spin_unlock_irqrestore(&pi->lock, flags);

    return 0;
}
0522 
/* device-mapper path-selector registration table for this module. */
static struct path_selector_type hst_ps = {
    .name       = "historical-service-time",
    .module     = THIS_MODULE,
    .features   = DM_PS_USE_HR_TIMER,
    .table_args = 1,	/* optional <repeat_count>; see hst_add_path() */
    .info_args  = 3,	/* hst, outstanding, stale_after; see hst_status() */
    .create     = hst_create,
    .destroy    = hst_destroy,
    .status     = hst_status,
    .add_path   = hst_add_path,
    .fail_path  = hst_fail_path,
    .reinstate_path = hst_reinstate_path,
    .select_path    = hst_select_path,
    .start_io   = hst_start_io,
    .end_io     = hst_end_io,
};
0539 
0540 static int __init dm_hst_init(void)
0541 {
0542     int r = dm_register_path_selector(&hst_ps);
0543 
0544     if (r < 0)
0545         DMERR("register failed %d", r);
0546 
0547     DMINFO("version " HST_VERSION " loaded");
0548 
0549     return r;
0550 }
0551 
0552 static void __exit dm_hst_exit(void)
0553 {
0554     int r = dm_unregister_path_selector(&hst_ps);
0555 
0556     if (r < 0)
0557         DMERR("unregister failed %d", r);
0558 }
0559 
/* Standard module entry/exit hooks and metadata. */
module_init(dm_hst_init);
module_exit(dm_hst_exit);

MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>");
MODULE_LICENSE("GPL");