/*
 * Copyright (C) 2015 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-cache-background-tracker.h"
#include "dm-cache-policy-internal.h"
#include "dm-cache-policy.h"
#include "dm.h"

#include <linux/hash.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/math64.h>

#define DM_MSG_PREFIX "cache-policy-smq"

/*----------------------------------------------------------------*/

/*
 * Safe division functions that return zero on divide by zero.
 */
static unsigned safe_div(unsigned n, unsigned d)
{
    return d ? n / d : 0u;
}

static unsigned safe_mod(unsigned n, unsigned d)
{
    return d ? n % d : 0u;
}

/*----------------------------------------------------------------*/

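/*
 * Entries are linked by 28 bit indexes into the shared entry_space rather
 * than by pointers, which keeps each entry small; INDEXER_NULL (all ones
 * in 28 bits) acts as the NULL index.  The 6 bit level field is enough
 * for the 64 queue levels used below.
 */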
struct entry {
    unsigned hash_next:28;
    unsigned prev:28;
    unsigned next:28;
    unsigned level:6;
    bool dirty:1;
    bool allocated:1;
    bool sentinel:1;
    bool pending_work:1;

    dm_oblock_t oblock;
};

/*----------------------------------------------------------------*/

#define INDEXER_NULL ((1u << 28u) - 1u)

/*
 * An entry_space manages a set of entries that we use for the queues.
 * The clean and dirty queues share entries, so this object is separate
 * from the queue itself.
 */
struct entry_space {
    struct entry *begin;
    struct entry *end;
};

static int space_init(struct entry_space *es, unsigned nr_entries)
{
    if (!nr_entries) {
        es->begin = es->end = NULL;
        return 0;
    }

    es->begin = vzalloc(array_size(nr_entries, sizeof(struct entry)));
    if (!es->begin)
        return -ENOMEM;

    es->end = es->begin + nr_entries;
    return 0;
}

static void space_exit(struct entry_space *es)
{
    vfree(es->begin);
}

static struct entry *__get_entry(struct entry_space *es, unsigned block)
{
    struct entry *e;

    e = es->begin + block;
    BUG_ON(e >= es->end);

    return e;
}

static unsigned to_index(struct entry_space *es, struct entry *e)
{
    BUG_ON(e < es->begin || e >= es->end);
    return e - es->begin;
}

static struct entry *to_entry(struct entry_space *es, unsigned block)
{
    if (block == INDEXER_NULL)
        return NULL;

    return __get_entry(es, block);
}

/*----------------------------------------------------------------*/

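/*
 * A doubly linked list of entries, implemented with the 28 bit indexes
 * above.  Sentinel entries may sit on the list but are not counted in
 * nr_elts.
 */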
struct ilist {
    unsigned nr_elts;   /* excluding sentinel entries */
    unsigned head, tail;
};

static void l_init(struct ilist *l)
{
    l->nr_elts = 0;
    l->head = l->tail = INDEXER_NULL;
}

static struct entry *l_head(struct entry_space *es, struct ilist *l)
{
    return to_entry(es, l->head);
}

static struct entry *l_tail(struct entry_space *es, struct ilist *l)
{
    return to_entry(es, l->tail);
}

static struct entry *l_next(struct entry_space *es, struct entry *e)
{
    return to_entry(es, e->next);
}

static struct entry *l_prev(struct entry_space *es, struct entry *e)
{
    return to_entry(es, e->prev);
}

static bool l_empty(struct ilist *l)
{
    return l->head == INDEXER_NULL;
}

static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
{
    struct entry *head = l_head(es, l);

    e->next = l->head;
    e->prev = INDEXER_NULL;

    if (head)
        head->prev = l->head = to_index(es, e);
    else
        l->head = l->tail = to_index(es, e);

    if (!e->sentinel)
        l->nr_elts++;
}

static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
{
    struct entry *tail = l_tail(es, l);

    e->next = INDEXER_NULL;
    e->prev = l->tail;

    if (tail)
        tail->next = l->tail = to_index(es, e);
    else
        l->head = l->tail = to_index(es, e);

    if (!e->sentinel)
        l->nr_elts++;
}

static void l_add_before(struct entry_space *es, struct ilist *l,
             struct entry *old, struct entry *e)
{
    struct entry *prev = l_prev(es, old);

    if (!prev)
        l_add_head(es, l, e);

    else {
        e->prev = old->prev;
        e->next = to_index(es, old);
        prev->next = old->prev = to_index(es, e);

        if (!e->sentinel)
            l->nr_elts++;
    }
}

static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
{
    struct entry *prev = l_prev(es, e);
    struct entry *next = l_next(es, e);

    if (prev)
        prev->next = e->next;
    else
        l->head = e->next;

    if (next)
        next->prev = e->prev;
    else
        l->tail = e->prev;

    if (!e->sentinel)
        l->nr_elts--;
}

static struct entry *l_pop_head(struct entry_space *es, struct ilist *l)
{
    struct entry *e;

    for (e = l_head(es, l); e; e = l_next(es, e))
        if (!e->sentinel) {
            l_del(es, l, e);
            return e;
        }

    return NULL;
}

static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
{
    struct entry *e;

    for (e = l_tail(es, l); e; e = l_prev(es, e))
        if (!e->sentinel) {
            l_del(es, l, e);
            return e;
        }

    return NULL;
}

/*----------------------------------------------------------------*/

/*
 * The stochastic-multi-queue is a set of lru lists stacked into levels.
 * Entries are moved up levels when they are used, which loosely orders the
 * most accessed entries in the top levels and least in the bottom.  This
 * structure is *much* better than a single lru list.
 */
#define MAX_LEVELS 64u

struct queue {
    struct entry_space *es;

    unsigned nr_elts;
    unsigned nr_levels;
    struct ilist qs[MAX_LEVELS];

    /*
     * We maintain a count of the number of entries we would like in each
     * level.
     */
    unsigned last_target_nr_elts;
    unsigned nr_top_levels;
    unsigned nr_in_top_levels;
    unsigned target_count[MAX_LEVELS];
};

static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
{
    unsigned i;

    q->es = es;
    q->nr_elts = 0;
    q->nr_levels = nr_levels;

    for (i = 0; i < q->nr_levels; i++) {
        l_init(q->qs + i);
        q->target_count[i] = 0u;
    }

    q->last_target_nr_elts = 0u;
    q->nr_top_levels = 0u;
    q->nr_in_top_levels = 0u;
}

static unsigned q_size(struct queue *q)
{
    return q->nr_elts;
}

/*
 * Insert an entry to the back of the given level.
 */
static void q_push(struct queue *q, struct entry *e)
{
    BUG_ON(e->pending_work);

    if (!e->sentinel)
        q->nr_elts++;

    l_add_tail(q->es, q->qs + e->level, e);
}

static void q_push_front(struct queue *q, struct entry *e)
{
    BUG_ON(e->pending_work);

    if (!e->sentinel)
        q->nr_elts++;

    l_add_head(q->es, q->qs + e->level, e);
}

static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
{
    BUG_ON(e->pending_work);

    if (!e->sentinel)
        q->nr_elts++;

    l_add_before(q->es, q->qs + e->level, old, e);
}

static void q_del(struct queue *q, struct entry *e)
{
    l_del(q->es, q->qs + e->level, e);
    if (!e->sentinel)
        q->nr_elts--;
}

/*
 * Return the oldest entry of the lowest populated level.
 */
static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
{
    unsigned level;
    struct entry *e;

    max_level = min(max_level, q->nr_levels);

    for (level = 0; level < max_level; level++)
        for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
            if (e->sentinel) {
                if (can_cross_sentinel)
                    continue;
                else
                    break;
            }

            return e;
        }

    return NULL;
}

static struct entry *q_pop(struct queue *q)
{
    struct entry *e = q_peek(q, q->nr_levels, true);

    if (e)
        q_del(q, e);

    return e;
}

/*
 * This function assumes there is a non-sentinel entry to pop.  It's only
 * used by redistribute, so we know this is true.  It also doesn't adjust
 * the q->nr_elts count.
 */
static struct entry *__redist_pop_from(struct queue *q, unsigned level)
{
    struct entry *e;

    for (; level < q->nr_levels; level++)
        for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
            if (!e->sentinel) {
                l_del(q->es, q->qs + e->level, e);
                return e;
            }

    return NULL;
}

static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
{
    unsigned level, nr_levels, entries_per_level, remainder;

    BUG_ON(lbegin > lend);
    BUG_ON(lend > q->nr_levels);
    nr_levels = lend - lbegin;
    entries_per_level = safe_div(nr_elts, nr_levels);
    remainder = safe_mod(nr_elts, nr_levels);

    for (level = lbegin; level < lend; level++)
        q->target_count[level] =
            (level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
}

/*
 * Typically we have fewer elements in the top few levels which allows us
 * to adjust the promote threshold nicely.
 */
static void q_set_targets(struct queue *q)
{
    if (q->last_target_nr_elts == q->nr_elts)
        return;

    q->last_target_nr_elts = q->nr_elts;

    if (q->nr_top_levels > q->nr_levels)
        q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);

    else {
        q_set_targets_subrange_(q, q->nr_in_top_levels,
                    q->nr_levels - q->nr_top_levels, q->nr_levels);

        if (q->nr_in_top_levels < q->nr_elts)
            q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
                        0, q->nr_levels - q->nr_top_levels);
        else
            q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
    }
}

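/*
 * Rebalance the levels so each holds roughly its target_count: a level
 * below target pulls entries down from the levels above it, and a level
 * over target pushes entries up to the level above.
 */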
static void q_redistribute(struct queue *q)
{
    unsigned target, level;
    struct ilist *l, *l_above;
    struct entry *e;

    q_set_targets(q);

    for (level = 0u; level < q->nr_levels - 1u; level++) {
        l = q->qs + level;
        target = q->target_count[level];

        /*
         * Pull down some entries from the level above.
         */
        while (l->nr_elts < target) {
            e = __redist_pop_from(q, level + 1u);
            if (!e) {
                /* bug in nr_elts */
                break;
            }

            e->level = level;
            l_add_tail(q->es, l, e);
        }

        /*
         * Push some entries up.
         */
        l_above = q->qs + level + 1u;
        while (l->nr_elts > target) {
            e = l_pop_tail(q->es, l);

            if (!e)
                /* bug in nr_elts */
                break;

            e->level = level + 1u;
            l_add_tail(q->es, l_above, e);
        }
    }
}

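/*
 * Promote an entry by up to extra_levels.  The first non-sentinel entry
 * at the head of the destination level is swapped down to e's old level,
 * and is placed relative to the sentinels s1/s2 (when supplied) according
 * to how many sentinels were skipped while finding it.
 */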
static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels,
              struct entry *s1, struct entry *s2)
{
    struct entry *de;
    unsigned sentinels_passed = 0;
    unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels);

    /* try and find an entry to swap with */
    if (extra_levels && (e->level < q->nr_levels - 1u)) {
        for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de))
            sentinels_passed++;

        if (de) {
            q_del(q, de);
            de->level = e->level;
            if (s1) {
                switch (sentinels_passed) {
                case 0:
                    q_push_before(q, s1, de);
                    break;

                case 1:
                    q_push_before(q, s2, de);
                    break;

                default:
                    q_push(q, de);
                }
            } else
                q_push(q, de);
        }
    }

    q_del(q, e);
    e->level = new_level;
    q_push(q, e);
}

/*----------------------------------------------------------------*/

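/*
 * Fixed point arithmetic for the hit ratio: FP_SHIFT of 8 scales ratios
 * by 256, so SIXTEENTH (16) corresponds to a hit ratio of 1/16 (6.25%)
 * and EIGHTH (32) to 1/8 (12.5%).
 */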
#define FP_SHIFT 8
#define SIXTEENTH (1u << (FP_SHIFT - 4u))
#define EIGHTH (1u << (FP_SHIFT - 3u))

struct stats {
    unsigned hit_threshold;
    unsigned hits;
    unsigned misses;
};

enum performance {
    Q_POOR,
    Q_FAIR,
    Q_WELL
};

static void stats_init(struct stats *s, unsigned nr_levels)
{
    s->hit_threshold = (nr_levels * 3u) / 4u;
    s->hits = 0u;
    s->misses = 0u;
}

static void stats_reset(struct stats *s)
{
    s->hits = s->misses = 0u;
}

static void stats_level_accessed(struct stats *s, unsigned level)
{
    if (level >= s->hit_threshold)
        s->hits++;
    else
        s->misses++;
}

static void stats_miss(struct stats *s)
{
    s->misses++;
}

/*
 * There are times when we don't have any confidence in the hotspot queue,
 * such as when a fresh cache is created and the blocks have been spread
 * out across the levels, or when the io load changes.  We detect this by
 * seeing how often a lookup lands in the top levels of the hotspot queue.
 */
static enum performance stats_assess(struct stats *s)
{
    unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);

    if (confidence < SIXTEENTH)
        return Q_POOR;

    else if (confidence < EIGHTH)
        return Q_FAIR;

    else
        return Q_WELL;
}

/*----------------------------------------------------------------*/

struct smq_hash_table {
    struct entry_space *es;
    unsigned long long hash_bits;
    unsigned *buckets;
};

/*
 * All cache entries are stored in a chained hash table.  To save space we
 * use indexing again, and only store indexes to the next entry.
 */
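/*
 * The bucket count is nr_entries / 4 rounded up to a power of two (with a
 * floor of 16); since it is a power of two, __ffs() gives its log2 for
 * use as the hash_64() bit count.
 */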
static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries)
{
    unsigned i, nr_buckets;

    ht->es = es;
    nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
    ht->hash_bits = __ffs(nr_buckets);

    ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets)));
    if (!ht->buckets)
        return -ENOMEM;

    for (i = 0; i < nr_buckets; i++)
        ht->buckets[i] = INDEXER_NULL;

    return 0;
}

static void h_exit(struct smq_hash_table *ht)
{
    vfree(ht->buckets);
}

static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket)
{
    return to_entry(ht->es, ht->buckets[bucket]);
}

static struct entry *h_next(struct smq_hash_table *ht, struct entry *e)
{
    return to_entry(ht->es, e->hash_next);
}

static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e)
{
    e->hash_next = ht->buckets[bucket];
    ht->buckets[bucket] = to_index(ht->es, e);
}

static void h_insert(struct smq_hash_table *ht, struct entry *e)
{
    unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
    __h_insert(ht, h, e);
}

static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock,
                struct entry **prev)
{
    struct entry *e;

    *prev = NULL;
    for (e = h_head(ht, h); e; e = h_next(ht, e)) {
        if (e->oblock == oblock)
            return e;

        *prev = e;
    }

    return NULL;
}

static void __h_unlink(struct smq_hash_table *ht, unsigned h,
               struct entry *e, struct entry *prev)
{
    if (prev)
        prev->hash_next = e->hash_next;
    else
        ht->buckets[h] = e->hash_next;
}

/*
 * Also moves the looked up entry to the front of the bucket.
 */
static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock)
{
    struct entry *e, *prev;
    unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);

    e = __h_lookup(ht, h, oblock, &prev);
    if (e && prev) {
        /*
         * Move to the front because this entry is likely
         * to be hit again.
         */
        __h_unlink(ht, h, e, prev);
        __h_insert(ht, h, e);
    }

    return e;
}

static void h_remove(struct smq_hash_table *ht, struct entry *e)
{
    unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
    struct entry *prev;

    /*
     * The downside of using a singly linked list is that we have to
     * iterate the bucket to remove an item.
     */
    e = __h_lookup(ht, h, e->oblock, &prev);
    if (e)
        __h_unlink(ht, h, e, prev);
}

/*----------------------------------------------------------------*/

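/*
 * An allocator hands out entries from a contiguous range [begin, end) of
 * the shared entry_space, keeping unused ones on a free list.
 * get_index()/get_entry() translate between an entry and its offset
 * within the range.
 */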
struct entry_alloc {
    struct entry_space *es;
    unsigned begin;

    unsigned nr_allocated;
    struct ilist free;
};

static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
               unsigned begin, unsigned end)
{
    unsigned i;

    ea->es = es;
    ea->nr_allocated = 0u;
    ea->begin = begin;

    l_init(&ea->free);
    for (i = begin; i != end; i++)
        l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
}

static void init_entry(struct entry *e)
{
    /*
     * We can't memset because that would clear the hotspot and
     * sentinel bits which remain constant.
     */
    e->hash_next = INDEXER_NULL;
    e->next = INDEXER_NULL;
    e->prev = INDEXER_NULL;
    e->level = 0u;
    e->dirty = true;    /* FIXME: audit */
    e->allocated = true;
    e->sentinel = false;
    e->pending_work = false;
}

static struct entry *alloc_entry(struct entry_alloc *ea)
{
    struct entry *e;

    if (l_empty(&ea->free))
        return NULL;

    e = l_pop_head(ea->es, &ea->free);
    init_entry(e);
    ea->nr_allocated++;

    return e;
}

/*
 * This assumes the cblock hasn't already been allocated.
 */
static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
{
    struct entry *e = __get_entry(ea->es, ea->begin + i);

    BUG_ON(e->allocated);

    l_del(ea->es, &ea->free, e);
    init_entry(e);
    ea->nr_allocated++;

    return e;
}

static void free_entry(struct entry_alloc *ea, struct entry *e)
{
    BUG_ON(!ea->nr_allocated);
    BUG_ON(!e->allocated);

    ea->nr_allocated--;
    e->allocated = false;
    l_add_tail(ea->es, &ea->free, e);
}

static bool allocator_empty(struct entry_alloc *ea)
{
    return l_empty(&ea->free);
}

static unsigned get_index(struct entry_alloc *ea, struct entry *e)
{
    return to_index(ea->es, e) - ea->begin;
}

static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
{
    return __get_entry(ea->es, ea->begin + index);
}

/*----------------------------------------------------------------*/

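/*
 * The periods below are in jiffies (HZ per second), so the writeback
 * sentinels rotate every 10s, the demote sentinels every 60s, and the
 * hotspot and cache queues are rebalanced roughly every 1s and 60s
 * respectively.
 */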
#define NR_HOTSPOT_LEVELS 64u
#define NR_CACHE_LEVELS 64u

#define WRITEBACK_PERIOD (10ul * HZ)
#define DEMOTE_PERIOD (60ul * HZ)

#define HOTSPOT_UPDATE_PERIOD (HZ)
#define CACHE_UPDATE_PERIOD (60ul * HZ)

struct smq_policy {
    struct dm_cache_policy policy;

    /* protects everything */
    spinlock_t lock;
    dm_cblock_t cache_size;
    sector_t cache_block_size;

    sector_t hotspot_block_size;
    unsigned nr_hotspot_blocks;
    unsigned cache_blocks_per_hotspot_block;
    unsigned hotspot_level_jump;

    struct entry_space es;
    struct entry_alloc writeback_sentinel_alloc;
    struct entry_alloc demote_sentinel_alloc;
    struct entry_alloc hotspot_alloc;
    struct entry_alloc cache_alloc;

    unsigned long *hotspot_hit_bits;
    unsigned long *cache_hit_bits;

    /*
     * We maintain three queues of entries.  The cache proper consists
     * of a clean and a dirty queue containing the currently active
     * mappings.  The hotspot queue uses a larger block size to track
     * blocks that are being hit frequently and are potential
     * candidates for promotion to the cache.
     */
    struct queue hotspot;
    struct queue clean;
    struct queue dirty;

    struct stats hotspot_stats;
    struct stats cache_stats;

    /*
     * Keeps track of time, incremented by the core.  We use this to
     * avoid attributing multiple hits within the same tick.
     */
    unsigned tick;

    /*
     * The hash tables allow us to quickly find an entry by origin
     * block.
     */
    struct smq_hash_table table;
    struct smq_hash_table hotspot_table;

    bool current_writeback_sentinels;
    unsigned long next_writeback_period;

    bool current_demote_sentinels;
    unsigned long next_demote_period;

    unsigned write_promote_level;
    unsigned read_promote_level;

    unsigned long next_hotspot_period;
    unsigned long next_cache_period;

    struct background_tracker *bg_work;

    bool migrations_allowed;
};

/*----------------------------------------------------------------*/

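/*
 * There are two sets of writeback and demote sentinels, one sentinel per
 * level in each set; 'which' selects the set.  update_sentinels() flips
 * the current set periodically and re-pushes it to the level tails, so
 * entries sitting in front of a sentinel have not been requeued since
 * that sentinel was pushed.
 */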
static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
{
    return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
}

static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
{
    return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
}

static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
{
    return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
}

static void __update_writeback_sentinels(struct smq_policy *mq)
{
    unsigned level;
    struct queue *q = &mq->dirty;
    struct entry *sentinel;

    for (level = 0; level < q->nr_levels; level++) {
        sentinel = writeback_sentinel(mq, level);
        q_del(q, sentinel);
        q_push(q, sentinel);
    }
}

static void __update_demote_sentinels(struct smq_policy *mq)
{
    unsigned level;
    struct queue *q = &mq->clean;
    struct entry *sentinel;

    for (level = 0; level < q->nr_levels; level++) {
        sentinel = demote_sentinel(mq, level);
        q_del(q, sentinel);
        q_push(q, sentinel);
    }
}

static void update_sentinels(struct smq_policy *mq)
{
    if (time_after(jiffies, mq->next_writeback_period)) {
        mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
        mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
        __update_writeback_sentinels(mq);
    }

    if (time_after(jiffies, mq->next_demote_period)) {
        mq->next_demote_period = jiffies + DEMOTE_PERIOD;
        mq->current_demote_sentinels = !mq->current_demote_sentinels;
        __update_demote_sentinels(mq);
    }
}

static void __sentinels_init(struct smq_policy *mq)
{
    unsigned level;
    struct entry *sentinel;

    for (level = 0; level < NR_CACHE_LEVELS; level++) {
        sentinel = writeback_sentinel(mq, level);
        sentinel->level = level;
        q_push(&mq->dirty, sentinel);

        sentinel = demote_sentinel(mq, level);
        sentinel->level = level;
        q_push(&mq->clean, sentinel);
    }
}

static void sentinels_init(struct smq_policy *mq)
{
    mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
    mq->next_demote_period = jiffies + DEMOTE_PERIOD;

    mq->current_writeback_sentinels = false;
    mq->current_demote_sentinels = false;
    __sentinels_init(mq);

    mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
    mq->current_demote_sentinels = !mq->current_demote_sentinels;
    __sentinels_init(mq);
}

/*----------------------------------------------------------------*/

static void del_queue(struct smq_policy *mq, struct entry *e)
{
    q_del(e->dirty ? &mq->dirty : &mq->clean, e);
}

static void push_queue(struct smq_policy *mq, struct entry *e)
{
    if (e->dirty)
        q_push(&mq->dirty, e);
    else
        q_push(&mq->clean, e);
}

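/*
 * The terse annotations below and in __complete_background_work() track
 * entry state: h = in the hash table, q = on a queue, a = allocated.
 */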
// !h, !q, a -> h, q, a
static void push(struct smq_policy *mq, struct entry *e)
{
    h_insert(&mq->table, e);
    if (!e->pending_work)
        push_queue(mq, e);
}

static void push_queue_front(struct smq_policy *mq, struct entry *e)
{
    if (e->dirty)
        q_push_front(&mq->dirty, e);
    else
        q_push_front(&mq->clean, e);
}

static void push_front(struct smq_policy *mq, struct entry *e)
{
    h_insert(&mq->table, e);
    if (!e->pending_work)
        push_queue_front(mq, e);
}

static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
{
    return to_cblock(get_index(&mq->cache_alloc, e));
}

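/*
 * A hit entry is bumped up a level at most once per cache period; the
 * per-cblock bit in cache_hit_bits records that it has already been
 * requeued and is cleared in end_cache_period().
 */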
static void requeue(struct smq_policy *mq, struct entry *e)
{
    /*
     * Pending work has temporarily been taken out of the queues.
     */
    if (e->pending_work)
        return;

    if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
        if (!e->dirty) {
            q_requeue(&mq->clean, e, 1u, NULL, NULL);
            return;
        }

        q_requeue(&mq->dirty, e, 1u,
              get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels),
              get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels));
    }
}

static unsigned default_promote_level(struct smq_policy *mq)
{
    /*
     * The promote level depends on the current performance of the
     * cache.
     *
     * If the cache is performing badly, then we can't afford
     * to promote much without causing performance to drop below that
     * of the origin device.
     *
     * If the cache is performing well, then we don't need to promote
     * much.  If it isn't broken, don't fix it.
     *
     * If the cache is middling then we promote more.
     *
     * This scheme reminds me of a graph of entropy vs probability of a
     * binary variable.
     */
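    /*
     * index below is (hits << 4) / (hits + misses), i.e. the hit ratio
     * scaled to 0..16, which selects from the 17 entry table.
     */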
    static const unsigned int table[] = {
        1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1
    };

    unsigned hits = mq->cache_stats.hits;
    unsigned misses = mq->cache_stats.misses;
    unsigned index = safe_div(hits << 4u, hits + misses);
    return table[index];
}

static void update_promote_levels(struct smq_policy *mq)
{
    /*
     * If there are unused cache entries then we want to be really
     * eager to promote.
     */
    unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
        default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);

    threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);

    /*
     * If the hotspot queue is performing badly then we have little
     * confidence that we know which blocks to promote.  So we cut down
     * the number of promotions.
     */
    switch (stats_assess(&mq->hotspot_stats)) {
    case Q_POOR:
        threshold_level /= 4u;
        break;

    case Q_FAIR:
        threshold_level /= 2u;
        break;

    case Q_WELL:
        break;
    }

    mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
    mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
}

/*
 * If the hotspot queue is performing badly, then we try to move entries
 * around more quickly.
 */
static void update_level_jump(struct smq_policy *mq)
{
    switch (stats_assess(&mq->hotspot_stats)) {
    case Q_POOR:
        mq->hotspot_level_jump = 4u;
        break;

    case Q_FAIR:
        mq->hotspot_level_jump = 2u;
        break;

    case Q_WELL:
        mq->hotspot_level_jump = 1u;
        break;
    }
}

static void end_hotspot_period(struct smq_policy *mq)
{
    clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
    update_promote_levels(mq);

    if (time_after(jiffies, mq->next_hotspot_period)) {
        update_level_jump(mq);
        q_redistribute(&mq->hotspot);
        stats_reset(&mq->hotspot_stats);
        mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
    }
}

static void end_cache_period(struct smq_policy *mq)
{
    if (time_after(jiffies, mq->next_cache_period)) {
        clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));

        q_redistribute(&mq->dirty);
        q_redistribute(&mq->clean);
        stats_reset(&mq->cache_stats);

        mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
    }
}

/*----------------------------------------------------------------*/

/*
 * Targets are given as a percentage.
 */
#define CLEAN_TARGET 25u
#define FREE_TARGET 25u

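/*
 * e.g. with a cache of 10000 blocks and FREE_TARGET of 25u, the free
 * target works out to 2500 blocks (free or already queued for demotion).
 */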
static unsigned percent_to_target(struct smq_policy *mq, unsigned p)
{
    return from_cblock(mq->cache_size) * p / 100u;
}

static bool clean_target_met(struct smq_policy *mq, bool idle)
{
    /*
     * Cache entries may not be populated.  So we cannot rely on the
     * size of the clean queue.
     */
    if (idle) {
        /*
         * We'd like to clean everything.
         */
        return q_size(&mq->dirty) == 0u;
    }

    /*
     * If we're busy we don't worry about cleaning at all.
     */
    return true;
}

static bool free_target_met(struct smq_policy *mq)
{
    unsigned nr_free;

    nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated;
    return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
        percent_to_target(mq, FREE_TARGET);
}

/*----------------------------------------------------------------*/

static void mark_pending(struct smq_policy *mq, struct entry *e)
{
    BUG_ON(e->sentinel);
    BUG_ON(!e->allocated);
    BUG_ON(e->pending_work);
    e->pending_work = true;
}

static void clear_pending(struct smq_policy *mq, struct entry *e)
{
    BUG_ON(!e->pending_work);
    e->pending_work = false;
}

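/*
 * The queue_* helpers below pull an entry off its queue, mark it pending
 * and hand a policy_work item to the background tracker.  If the tracker
 * refuses the work the entry is restored to the front of its queue (or,
 * for promotion, its freshly reserved cblock is freed again).
 */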
static void queue_writeback(struct smq_policy *mq, bool idle)
{
    int r;
    struct policy_work work;
    struct entry *e;

    e = q_peek(&mq->dirty, mq->dirty.nr_levels, idle);
    if (e) {
        mark_pending(mq, e);
        q_del(&mq->dirty, e);

        work.op = POLICY_WRITEBACK;
        work.oblock = e->oblock;
        work.cblock = infer_cblock(mq, e);

        r = btracker_queue(mq->bg_work, &work, NULL);
        if (r) {
            clear_pending(mq, e);
            q_push_front(&mq->dirty, e);
        }
    }
}

static void queue_demotion(struct smq_policy *mq)
{
    int r;
    struct policy_work work;
    struct entry *e;

    if (WARN_ON_ONCE(!mq->migrations_allowed))
        return;

    e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
    if (!e) {
        if (!clean_target_met(mq, true))
            queue_writeback(mq, false);
        return;
    }

    mark_pending(mq, e);
    q_del(&mq->clean, e);

    work.op = POLICY_DEMOTE;
    work.oblock = e->oblock;
    work.cblock = infer_cblock(mq, e);
    r = btracker_queue(mq->bg_work, &work, NULL);
    if (r) {
        clear_pending(mq, e);
        q_push_front(&mq->clean, e);
    }
}

static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
                struct policy_work **workp)
{
    int r;
    struct entry *e;
    struct policy_work work;

    if (!mq->migrations_allowed)
        return;

    if (allocator_empty(&mq->cache_alloc)) {
        /*
         * We always claim to be 'idle' to ensure some demotions happen
         * with continuous loads.
         */
        if (!free_target_met(mq))
            queue_demotion(mq);
        return;
    }

    if (btracker_promotion_already_present(mq->bg_work, oblock))
        return;

    /*
     * We allocate the entry now to reserve the cblock.  If the
     * background work is aborted we must remember to free it.
     */
    e = alloc_entry(&mq->cache_alloc);
    BUG_ON(!e);
    e->pending_work = true;
    work.op = POLICY_PROMOTE;
    work.oblock = oblock;
    work.cblock = infer_cblock(mq, e);
    r = btracker_queue(mq->bg_work, &work, workp);
    if (r)
        free_entry(&mq->cache_alloc, e);
}

/*----------------------------------------------------------------*/

enum promote_result {
    PROMOTE_NOT,
    PROMOTE_TEMPORARY,
    PROMOTE_PERMANENT
};

/*
 * Converts a boolean into a promote result.
 */
static enum promote_result maybe_promote(bool promote)
{
    return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
}

static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e,
                      int data_dir, bool fast_promote)
{
    if (data_dir == WRITE) {
        if (!allocator_empty(&mq->cache_alloc) && fast_promote)
            return PROMOTE_TEMPORARY;

        return maybe_promote(hs_e->level >= mq->write_promote_level);
    } else
        return maybe_promote(hs_e->level >= mq->read_promote_level);
}

static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
{
    sector_t r = from_oblock(b);
    (void) sector_div(r, mq->cache_blocks_per_hotspot_block);
    return to_oblock(r);
}

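/*
 * Called on a cache miss: map the origin block to its (larger) hotspot
 * block and bump that block in the hotspot queue by hotspot_level_jump,
 * unless it has already been hit since the hit bits were last cleared.
 * On a hotspot miss a new entry is allocated, recycling the coldest
 * hotspot entry if the allocator is exhausted.
 */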
static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b)
{
    unsigned hi;
    dm_oblock_t hb = to_hblock(mq, b);
    struct entry *e = h_lookup(&mq->hotspot_table, hb);

    if (e) {
        stats_level_accessed(&mq->hotspot_stats, e->level);

        hi = get_index(&mq->hotspot_alloc, e);
        q_requeue(&mq->hotspot, e,
              test_and_set_bit(hi, mq->hotspot_hit_bits) ?
              0u : mq->hotspot_level_jump,
              NULL, NULL);

    } else {
        stats_miss(&mq->hotspot_stats);

        e = alloc_entry(&mq->hotspot_alloc);
        if (!e) {
            e = q_pop(&mq->hotspot);
            if (e) {
                h_remove(&mq->hotspot_table, e);
                hi = get_index(&mq->hotspot_alloc, e);
                clear_bit(hi, mq->hotspot_hit_bits);
            }

        }

        if (e) {
            e->oblock = hb;
            q_push(&mq->hotspot, e);
            h_insert(&mq->hotspot_table, e);
        }
    }

    return e;
}

/*----------------------------------------------------------------*/

/*
 * Public interface, via the policy struct.  See dm-cache-policy.h for a
 * description of these.
 */

static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
{
    return container_of(p, struct smq_policy, policy);
}

static void smq_destroy(struct dm_cache_policy *p)
{
    struct smq_policy *mq = to_smq_policy(p);

    btracker_destroy(mq->bg_work);
    h_exit(&mq->hotspot_table);
    h_exit(&mq->table);
    free_bitset(mq->hotspot_hit_bits);
    free_bitset(mq->cache_hit_bits);
    space_exit(&mq->es);
    kfree(mq);
}

/*----------------------------------------------------------------*/

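/*
 * Core lookup used by both smq_lookup() and smq_lookup_with_work().  On a
 * hit the entry is requeued and its cblock returned; on a miss the
 * hotspot queue is updated and, if the block looks hot enough (or a write
 * can be fast promoted into free space), a promotion is queued and
 * *background_work is set.
 */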
static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock,
            int data_dir, bool fast_copy,
            struct policy_work **work, bool *background_work)
{
    struct entry *e, *hs_e;
    enum promote_result pr;

    *background_work = false;

    e = h_lookup(&mq->table, oblock);
    if (e) {
        stats_level_accessed(&mq->cache_stats, e->level);

        requeue(mq, e);
        *cblock = infer_cblock(mq, e);
        return 0;

    } else {
        stats_miss(&mq->cache_stats);

        /*
         * The hotspot queue only gets updated with misses.
         */
        hs_e = update_hotspot_queue(mq, oblock);

        pr = should_promote(mq, hs_e, data_dir, fast_copy);
        if (pr != PROMOTE_NOT) {
            queue_promotion(mq, oblock, work);
            *background_work = true;
        }

        return -ENOENT;
    }
}

static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
              int data_dir, bool fast_copy,
              bool *background_work)
{
    int r;
    unsigned long flags;
    struct smq_policy *mq = to_smq_policy(p);

    spin_lock_irqsave(&mq->lock, flags);
    r = __lookup(mq, oblock, cblock,
             data_dir, fast_copy,
             NULL, background_work);
    spin_unlock_irqrestore(&mq->lock, flags);

    return r;
}

static int smq_lookup_with_work(struct dm_cache_policy *p,
                dm_oblock_t oblock, dm_cblock_t *cblock,
                int data_dir, bool fast_copy,
                struct policy_work **work)
{
    int r;
    bool background_queued;
    unsigned long flags;
    struct smq_policy *mq = to_smq_policy(p);

    spin_lock_irqsave(&mq->lock, flags);
    r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued);
    spin_unlock_irqrestore(&mq->lock, flags);

    return r;
}

static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
                   struct policy_work **result)
{
    int r;
    unsigned long flags;
    struct smq_policy *mq = to_smq_policy(p);

    spin_lock_irqsave(&mq->lock, flags);
    r = btracker_issue(mq->bg_work, result);
    if (r == -ENODATA) {
        if (!clean_target_met(mq, idle)) {
            queue_writeback(mq, idle);
            r = btracker_issue(mq->bg_work, result);
        }
    }
    spin_unlock_irqrestore(&mq->lock, flags);

    return r;
}

/*
 * We need to clear any pending work flags that have been set, and in the
 * case of promotion free the entry for the destination cblock.
 */
static void __complete_background_work(struct smq_policy *mq,
                       struct policy_work *work,
                       bool success)
{
    struct entry *e = get_entry(&mq->cache_alloc,
                    from_cblock(work->cblock));

    switch (work->op) {
    case POLICY_PROMOTE:
        // !h, !q, a
        clear_pending(mq, e);
        if (success) {
            e->oblock = work->oblock;
            e->level = NR_CACHE_LEVELS - 1;
            push(mq, e);
            // h, q, a
        } else {
            free_entry(&mq->cache_alloc, e);
            // !h, !q, !a
        }
        break;

    case POLICY_DEMOTE:
        // h, !q, a
        if (success) {
            h_remove(&mq->table, e);
            free_entry(&mq->cache_alloc, e);
            // !h, !q, !a
        } else {
            clear_pending(mq, e);
            push_queue(mq, e);
            // h, q, a
        }
        break;

    case POLICY_WRITEBACK:
        // h, !q, a
        clear_pending(mq, e);
        push_queue(mq, e);
        // h, q, a
        break;
    }

    btracker_complete(mq->bg_work, work);
}

static void smq_complete_background_work(struct dm_cache_policy *p,
                     struct policy_work *work,
                     bool success)
{
    unsigned long flags;
    struct smq_policy *mq = to_smq_policy(p);

    spin_lock_irqsave(&mq->lock, flags);
    __complete_background_work(mq, work, success);
    spin_unlock_irqrestore(&mq->lock, flags);
}

// in_hash(oblock) -> in_hash(oblock)
static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
{
    struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));

    if (e->pending_work)
        e->dirty = set;
    else {
        del_queue(mq, e);
        e->dirty = set;
        push_queue(mq, e);
    }
}

static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
    unsigned long flags;
    struct smq_policy *mq = to_smq_policy(p);

    spin_lock_irqsave(&mq->lock, flags);
    __smq_set_clear_dirty(mq, cblock, true);
    spin_unlock_irqrestore(&mq->lock, flags);
}

static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
    struct smq_policy *mq = to_smq_policy(p);
    unsigned long flags;

    spin_lock_irqsave(&mq->lock, flags);
    __smq_set_clear_dirty(mq, cblock, false);
    spin_unlock_irqrestore(&mq->lock, flags);
}

static unsigned random_level(dm_cblock_t cblock)
{
    return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
}

static int smq_load_mapping(struct dm_cache_policy *p,
                dm_oblock_t oblock, dm_cblock_t cblock,
                bool dirty, uint32_t hint, bool hint_valid)
{
    struct smq_policy *mq = to_smq_policy(p);
    struct entry *e;

    e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
    e->oblock = oblock;
    e->dirty = dirty;
    e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
    e->pending_work = false;

    /*
     * When we load mappings we push ahead of both sentinels in order to
     * allow demotions and cleaning to occur immediately.
     */
    push_front(mq, e);

    return 0;
}

static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
{
    struct smq_policy *mq = to_smq_policy(p);
    struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));

    if (!e->allocated)
        return -ENODATA;

    // FIXME: what if this block has pending background work?
    del_queue(mq, e);
    h_remove(&mq->table, e);
    free_entry(&mq->cache_alloc, e);
    return 0;
}

static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
{
    struct smq_policy *mq = to_smq_policy(p);
    struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));

    if (!e->allocated)
        return 0;

    return e->level;
}

static dm_cblock_t smq_residency(struct dm_cache_policy *p)
{
    dm_cblock_t r;
    unsigned long flags;
    struct smq_policy *mq = to_smq_policy(p);

    spin_lock_irqsave(&mq->lock, flags);
    r = to_cblock(mq->cache_alloc.nr_allocated);
    spin_unlock_irqrestore(&mq->lock, flags);

    return r;
}

static void smq_tick(struct dm_cache_policy *p, bool can_block)
{
    struct smq_policy *mq = to_smq_policy(p);
    unsigned long flags;

    spin_lock_irqsave(&mq->lock, flags);
    mq->tick++;
    update_sentinels(mq);
    end_hotspot_period(mq);
    end_cache_period(mq);
    spin_unlock_irqrestore(&mq->lock, flags);
}

static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
{
    struct smq_policy *mq = to_smq_policy(p);
    mq->migrations_allowed = allow;
}

/*
 * smq has no config values, but the old mq policy did.  To avoid breaking
 * software we continue to accept these configurables for the mq policy,
 * but they have no effect.
 */
static int mq_set_config_value(struct dm_cache_policy *p,
                   const char *key, const char *value)
{
    unsigned long tmp;

    if (kstrtoul(value, 10, &tmp))
        return -EINVAL;

    if (!strcasecmp(key, "random_threshold") ||
        !strcasecmp(key, "sequential_threshold") ||
        !strcasecmp(key, "discard_promote_adjustment") ||
        !strcasecmp(key, "read_promote_adjustment") ||
        !strcasecmp(key, "write_promote_adjustment")) {
        DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key);
        return 0;
    }

    return -EINVAL;
}

static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
                 unsigned maxlen, ssize_t *sz_ptr)
{
    ssize_t sz = *sz_ptr;

    DMEMIT("10 random_threshold 0 "
           "sequential_threshold 0 "
           "discard_promote_adjustment 0 "
           "read_promote_adjustment 0 "
           "write_promote_adjustment 0 ");

    *sz_ptr = sz;
    return 0;
}

/* Init the policy plugin interface function pointers. */
static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
{
    mq->policy.destroy = smq_destroy;
    mq->policy.lookup = smq_lookup;
    mq->policy.lookup_with_work = smq_lookup_with_work;
    mq->policy.get_background_work = smq_get_background_work;
    mq->policy.complete_background_work = smq_complete_background_work;
    mq->policy.set_dirty = smq_set_dirty;
    mq->policy.clear_dirty = smq_clear_dirty;
    mq->policy.load_mapping = smq_load_mapping;
    mq->policy.invalidate_mapping = smq_invalidate_mapping;
    mq->policy.get_hint = smq_get_hint;
    mq->policy.residency = smq_residency;
    mq->policy.tick = smq_tick;
    mq->policy.allow_migrations = smq_allow_migrations;

    if (mimic_mq) {
        mq->policy.set_config_value = mq_set_config_value;
        mq->policy.emit_config_values = mq_emit_config_values;
    }
}

static bool too_many_hotspot_blocks(sector_t origin_size,
                    sector_t hotspot_block_size,
                    unsigned nr_hotspot_blocks)
{
    return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
}

static void calc_hotspot_params(sector_t origin_size,
                sector_t cache_block_size,
                unsigned nr_cache_blocks,
                sector_t *hotspot_block_size,
                unsigned *nr_hotspot_blocks)
{
    *hotspot_block_size = cache_block_size * 16u;
    *nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);

    while ((*hotspot_block_size > cache_block_size) &&
           too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
        *hotspot_block_size /= 2u;
}

static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
                        sector_t origin_size,
                        sector_t cache_block_size,
                        bool mimic_mq,
                        bool migrations_allowed)
{
    unsigned i;
    unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
    unsigned total_sentinels = 2u * nr_sentinels_per_queue;
    struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);

    if (!mq)
        return NULL;

    init_policy_functions(mq, mimic_mq);
    mq->cache_size = cache_size;
    mq->cache_block_size = cache_block_size;

    calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
                &mq->hotspot_block_size, &mq->nr_hotspot_blocks);

    mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
    mq->hotspot_level_jump = 1u;
    if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
        DMERR("couldn't initialize entry space");
        goto bad_pool_init;
    }

    init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
    for (i = 0; i < nr_sentinels_per_queue; i++)
        get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;

    init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
    for (i = 0; i < nr_sentinels_per_queue; i++)
        get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;

    init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
               total_sentinels + mq->nr_hotspot_blocks);

    init_allocator(&mq->cache_alloc, &mq->es,
               total_sentinels + mq->nr_hotspot_blocks,
               total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));

    mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
    if (!mq->hotspot_hit_bits) {
        DMERR("couldn't allocate hotspot hit bitset");
        goto bad_hotspot_hit_bits;
    }
    clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);

    if (from_cblock(cache_size)) {
        mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
        if (!mq->cache_hit_bits) {
            DMERR("couldn't allocate cache hit bitset");
            goto bad_cache_hit_bits;
        }
        clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
    } else
        mq->cache_hit_bits = NULL;

    mq->tick = 0;
    spin_lock_init(&mq->lock);

    q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
    mq->hotspot.nr_top_levels = 8;
    mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
                       from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);

    q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
    q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);

    stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
    stats_init(&mq->cache_stats, NR_CACHE_LEVELS);

    if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
        goto bad_alloc_table;

    if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
        goto bad_alloc_hotspot_table;

    sentinels_init(mq);
    mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;

    mq->next_hotspot_period = jiffies;
    mq->next_cache_period = jiffies;

    mq->bg_work = btracker_create(4096); /* FIXME: hard coded value */
    if (!mq->bg_work)
        goto bad_btracker;

    mq->migrations_allowed = migrations_allowed;

    return &mq->policy;

bad_btracker:
    h_exit(&mq->hotspot_table);
bad_alloc_hotspot_table:
    h_exit(&mq->table);
bad_alloc_table:
    free_bitset(mq->cache_hit_bits);
bad_cache_hit_bits:
    free_bitset(mq->hotspot_hit_bits);
bad_hotspot_hit_bits:
    space_exit(&mq->es);
bad_pool_init:
    kfree(mq);

    return NULL;
}

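/*
 * The three exported policies are thin wrappers around __smq_create():
 * "smq" is the policy proper, "mq" additionally mimics the old mq
 * policy's config interface, and "cleaner" disables migrations so it only
 * writes back dirty blocks.
 */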
static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
                      sector_t origin_size,
                      sector_t cache_block_size)
{
    return __smq_create(cache_size, origin_size, cache_block_size, false, true);
}

static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
                     sector_t origin_size,
                     sector_t cache_block_size)
{
    return __smq_create(cache_size, origin_size, cache_block_size, true, true);
}

static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
                          sector_t origin_size,
                          sector_t cache_block_size)
{
    return __smq_create(cache_size, origin_size, cache_block_size, false, false);
}

/*----------------------------------------------------------------*/

static struct dm_cache_policy_type smq_policy_type = {
    .name = "smq",
    .version = {2, 0, 0},
    .hint_size = 4,
    .owner = THIS_MODULE,
    .create = smq_create
};

static struct dm_cache_policy_type mq_policy_type = {
    .name = "mq",
    .version = {2, 0, 0},
    .hint_size = 4,
    .owner = THIS_MODULE,
    .create = mq_create,
};

static struct dm_cache_policy_type cleaner_policy_type = {
    .name = "cleaner",
    .version = {2, 0, 0},
    .hint_size = 4,
    .owner = THIS_MODULE,
    .create = cleaner_create,
};

static struct dm_cache_policy_type default_policy_type = {
    .name = "default",
    .version = {2, 0, 0},
    .hint_size = 4,
    .owner = THIS_MODULE,
    .create = smq_create,
    .real = &smq_policy_type
};

static int __init smq_init(void)
{
    int r;

    r = dm_cache_policy_register(&smq_policy_type);
    if (r) {
        DMERR("register failed %d", r);
        return -ENOMEM;
    }

    r = dm_cache_policy_register(&mq_policy_type);
    if (r) {
        DMERR("register failed (as mq) %d", r);
        goto out_mq;
    }

    r = dm_cache_policy_register(&cleaner_policy_type);
    if (r) {
        DMERR("register failed (as cleaner) %d", r);
        goto out_cleaner;
    }

    r = dm_cache_policy_register(&default_policy_type);
    if (r) {
        DMERR("register failed (as default) %d", r);
        goto out_default;
    }

    return 0;

out_default:
    dm_cache_policy_unregister(&cleaner_policy_type);
out_cleaner:
    dm_cache_policy_unregister(&mq_policy_type);
out_mq:
    dm_cache_policy_unregister(&smq_policy_type);

    return -ENOMEM;
}

static void __exit smq_exit(void)
{
    dm_cache_policy_unregister(&cleaner_policy_type);
    dm_cache_policy_unregister(&smq_policy_type);
    dm_cache_policy_unregister(&mq_policy_type);
    dm_cache_policy_unregister(&default_policy_type);
}

module_init(smq_init);
module_exit(smq_exit);

MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("smq cache policy");

MODULE_ALIAS("dm-cache-default");
MODULE_ALIAS("dm-cache-mq");
MODULE_ALIAS("dm-cache-cleaner");