// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the device.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. This currently is just used as
 * an LRU but in the future should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we resort the entire node.
 *
 * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"
#include "writeback.h"

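/*
 * Advance iterator entry @i to its next key; if that exhausts the set it was
 * walking, replace it with the last entry in the heap and shrink the heap.
 */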
static void sort_key_next(struct btree_iter *iter,
              struct btree_iter_set *i)
{
    i->k = bkey_next(i->k);

    if (i->k == i->end)
        *i = iter->data[--iter->used];
}

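/*
 * Heap comparison function for btree node sorts: same ordering convention as
 * bch_extent_sort_cmp() below, but whole keys are compared rather than their
 * start positions.
 */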
static bool bch_key_sort_cmp(struct btree_iter_set l,
                 struct btree_iter_set r)
{
    int64_t c = bkey_cmp(l.k, r.k);

    return c ? c > 0 : l.k < r.k;
}

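/*
 * Return true if any pointer in @k references a bucket outside the valid
 * range of its cache device, or a size that would extend past the end of a
 * bucket.
 */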
static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
{
    unsigned int i;

    for (i = 0; i < KEY_PTRS(k); i++)
        if (ptr_available(c, k, i)) {
            struct cache *ca = c->cache;
            size_t bucket = PTR_BUCKET_NR(c, k, i);
            size_t r = bucket_remainder(c, PTR_OFFSET(k, i));

            if (KEY_SIZE(k) + r > c->cache->sb.bucket_size ||
                bucket <  ca->sb.first_bucket ||
                bucket >= ca->sb.nbuckets)
                return true;
        }

    return false;
}

/* Common among btree and extent ptrs */

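/*
 * Return a human-readable status string for @k, for debug output; an empty
 * string means nothing wrong was detected.
 */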
static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
{
    unsigned int i;

    for (i = 0; i < KEY_PTRS(k); i++)
        if (ptr_available(c, k, i)) {
            struct cache *ca = c->cache;
            size_t bucket = PTR_BUCKET_NR(c, k, i);
            size_t r = bucket_remainder(c, PTR_OFFSET(k, i));

            if (KEY_SIZE(k) + r > c->cache->sb.bucket_size)
                return "bad, length too big";
            if (bucket <  ca->sb.first_bucket)
                return "bad, short offset";
            if (bucket >= ca->sb.nbuckets)
                return "bad, offset past end of device";
            if (ptr_stale(c, k, i))
                return "stale";
        }

    if (!bkey_cmp(k, &ZERO_KEY))
        return "bad, null key";
    if (!KEY_PTRS(k))
        return "bad, no pointers";
    if (!KEY_SIZE(k))
        return "zeroed key";
    return "";
}

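/*
 * Format @k as text into @buf (at most @size bytes), as
 * "inode:start len size -> [dev:offset gen g, ...]" plus dirty/csum flags.
 */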
void bch_extent_to_text(char *buf, size_t size, const struct bkey *k)
{
    unsigned int i = 0;
    char *out = buf, *end = buf + size;

#define p(...)  (out += scnprintf(out, end - out, __VA_ARGS__))

    p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k));

    for (i = 0; i < KEY_PTRS(k); i++) {
        if (i)
            p(", ");

        if (PTR_DEV(k, i) == PTR_CHECK_DEV)
            p("check dev");
        else
            p("%llu:%llu gen %llu", PTR_DEV(k, i),
              PTR_OFFSET(k, i), PTR_GEN(k, i));
    }

    p("]");

    if (KEY_DIRTY(k))
        p(" dirty");
    if (KEY_CSUM(k))
        p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
#undef p
}

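/* Dump @k and the state of the buckets it points into, via pr_cont(). */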
static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
{
    struct btree *b = container_of(keys, struct btree, keys);
    unsigned int j;
    char buf[80];

    bch_extent_to_text(buf, sizeof(buf), k);
    pr_cont(" %s", buf);

    for (j = 0; j < KEY_PTRS(k); j++) {
        size_t n = PTR_BUCKET_NR(b->c, k, j);

        pr_cont(" bucket %zu", n);
        if (n >= b->c->cache->sb.first_bucket && n < b->c->cache->sb.nbuckets)
            pr_cont(" prio %i",
                PTR_BUCKET(b->c, k, j)->prio);
    }

    pr_cont(" %s\n", bch_ptr_status(b->c, k));
}

/* Btree ptrs */

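/*
 * A valid btree node pointer has at least one pointer, a nonzero size, no
 * dirty bit set, and passes __ptr_invalid(); otherwise report it with
 * cache_bug() and return true.
 */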
bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
{
    char buf[80];

    if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
        goto bad;

    if (__ptr_invalid(c, k))
        goto bad;

    return false;
bad:
    bch_extent_to_text(buf, sizeof(buf), k);
    cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
    return true;
}

static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k)
{
    struct btree *b = container_of(bk, struct btree, keys);

    return __bch_btree_ptr_invalid(b->c, k);
}

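/*
 * Extra (optional) consistency check: the buckets a btree node pointer
 * references must look like metadata buckets (BTREE_PRIO, marked as metadata
 * by GC). Only runs if the bucket lock can be taken without blocking.
 */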
static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k)
{
    unsigned int i;
    char buf[80];
    struct bucket *g;

    if (mutex_trylock(&b->c->bucket_lock)) {
        for (i = 0; i < KEY_PTRS(k); i++)
            if (ptr_available(b->c, k, i)) {
                g = PTR_BUCKET(b->c, k, i);

                if (KEY_DIRTY(k) ||
                    g->prio != BTREE_PRIO ||
                    (b->c->gc_mark_valid &&
                     GC_MARK(g) != GC_MARK_METADATA))
                    goto err;
            }

        mutex_unlock(&b->c->bucket_lock);
    }

    return false;
err:
    mutex_unlock(&b->c->bucket_lock);
    bch_extent_to_text(buf, sizeof(buf), k);
    btree_bug(b,
"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu",
          buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
          g->prio, g->gen, g->last_gc, GC_MARK(g));
    return true;
}

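/*
 * Return true if @k should be skipped when walking the btree: null or
 * pointerless keys, invalid keys, keys with unavailable or stale pointers,
 * and (with expensive debug checks enabled) keys failing the bucket checks
 * above.
 */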
static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k)
{
    struct btree *b = container_of(bk, struct btree, keys);
    unsigned int i;

    if (!bkey_cmp(k, &ZERO_KEY) ||
        !KEY_PTRS(k) ||
        bch_ptr_invalid(bk, k))
        return true;

    for (i = 0; i < KEY_PTRS(k); i++)
        if (!ptr_available(b->c, k, i) ||
            ptr_stale(b->c, k, i))
            return true;

    if (expensive_debug_checks(b->c) &&
        btree_ptr_bad_expensive(b, k))
        return true;

    return false;
}

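/*
 * Insert hook for btree node pointers: keys with a zero offset bump the
 * current write's prio_blocked count. Returning false means the insert is
 * never rejected.
 */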
static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk,
                       struct bkey *insert,
                       struct btree_iter *iter,
                       struct bkey *replace_key)
{
    struct btree *b = container_of(bk, struct btree, keys);

    if (!KEY_OFFSET(insert))
        btree_current_write(b)->prio_blocked++;

    return false;
}

const struct btree_keys_ops bch_btree_keys_ops = {
    .sort_cmp   = bch_key_sort_cmp,
    .insert_fixup   = bch_btree_ptr_insert_fixup,
    .key_invalid    = bch_btree_ptr_invalid,
    .key_bad    = bch_btree_ptr_bad,
    .key_to_text    = bch_extent_to_text,
    .key_dump   = bch_bkey_dump,
};

/* Extents */

/*
 * Returns true if l > r - unless l == r, in which case returns true if l is
 * older than r.
 *
 * Necessary for btree_sort_fixup() - if there are multiple keys that compare
 * equal in different sets, we have to process them newest to oldest.
 */
static bool bch_extent_sort_cmp(struct btree_iter_set l,
                struct btree_iter_set r)
{
    int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));

    return c ? c > 0 : l.k < r.k;
}

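/*
 * Resolve overlapping extents from different sets while sorting: the newer
 * extent wins, so the older one is trimmed with bch_cut_front()/bch_cut_back()
 * or dropped entirely. When a newer extent lands in the middle of an older
 * one, the older extent's front half is returned via @tmp so the sort can
 * emit it.
 */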
static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
                      struct bkey *tmp)
{
    while (iter->used > 1) {
        struct btree_iter_set *top = iter->data, *i = top + 1;

        if (iter->used > 2 &&
            bch_extent_sort_cmp(i[0], i[1]))
            i++;

        if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
            break;

        if (!KEY_SIZE(i->k)) {
            sort_key_next(iter, i);
            heap_sift(iter, i - top, bch_extent_sort_cmp);
            continue;
        }

        if (top->k > i->k) {
            if (bkey_cmp(top->k, i->k) >= 0)
                sort_key_next(iter, i);
            else
                bch_cut_front(top->k, i->k);

            heap_sift(iter, i - top, bch_extent_sort_cmp);
        } else {
            /* can't happen because of comparison func */
            BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));

            if (bkey_cmp(i->k, top->k) < 0) {
                bkey_copy(tmp, top->k);

                bch_cut_back(&START_KEY(i->k), tmp);
                bch_cut_front(i->k, top->k);
                heap_sift(iter, 0, bch_extent_sort_cmp);

                return tmp;
            } else {
                bch_cut_back(&START_KEY(i->k), top->k);
            }
        }
    }

    return NULL;
}

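/*
 * If @k is dirty, subtract @sectors from the dirty sector accounting of its
 * backing device at @offset.
 */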
static void bch_subtract_dirty(struct bkey *k,
               struct cache_set *c,
               uint64_t offset,
               int sectors)
{
    if (KEY_DIRTY(k))
        bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
                         offset, -sectors);
}

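/*
 * Handle overlaps between @insert and existing extents as @insert goes in:
 * overlapping keys are trimmed, split or invalidated, and dirty sector
 * counts are kept up to date. For cache-replace operations (@replace_key
 * != NULL) the old extent must still be present and unchanged; if only part
 * of it is found, @insert is shrunk to match, and returning true means the
 * insert should be dropped entirely.
 */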
static bool bch_extent_insert_fixup(struct btree_keys *b,
                    struct bkey *insert,
                    struct btree_iter *iter,
                    struct bkey *replace_key)
{
    struct cache_set *c = container_of(b, struct btree, keys)->c;

    uint64_t old_offset;
    unsigned int old_size, sectors_found = 0;

    BUG_ON(!KEY_OFFSET(insert));
    BUG_ON(!KEY_SIZE(insert));

    while (1) {
        struct bkey *k = bch_btree_iter_next(iter);

        if (!k)
            break;

        if (bkey_cmp(&START_KEY(k), insert) >= 0) {
            if (KEY_SIZE(k))
                break;
            else
                continue;
        }

        if (bkey_cmp(k, &START_KEY(insert)) <= 0)
            continue;

        old_offset = KEY_START(k);
        old_size = KEY_SIZE(k);

        /*
         * We might overlap with 0 size extents; we can't skip these
         * because if they're in the set we're inserting to we have to
         * adjust them so they don't overlap with the key we're
         * inserting. But we don't want to check them for replace
         * operations.
         */

        if (replace_key && KEY_SIZE(k)) {
            /*
             * k might have been split since we inserted/found the
             * key we're replacing
             */
            unsigned int i;
            uint64_t offset = KEY_START(k) -
                KEY_START(replace_key);

            /* But it must be a subset of the replace key */
            if (KEY_START(k) < KEY_START(replace_key) ||
                KEY_OFFSET(k) > KEY_OFFSET(replace_key))
                goto check_failed;

            /* We didn't find a key that we were supposed to */
            if (KEY_START(k) > KEY_START(insert) + sectors_found)
                goto check_failed;

            if (!bch_bkey_equal_header(k, replace_key))
                goto check_failed;

            /* skip past gen */
            offset <<= 8;

            BUG_ON(!KEY_PTRS(replace_key));

            for (i = 0; i < KEY_PTRS(replace_key); i++)
                if (k->ptr[i] != replace_key->ptr[i] + offset)
                    goto check_failed;

            sectors_found = KEY_OFFSET(k) - KEY_START(insert);
        }

        if (bkey_cmp(insert, k) < 0 &&
            bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
            /*
             * We overlapped in the middle of an existing key: that
             * means we have to split the old key. But we have to do
             * slightly different things depending on whether the
             * old key has been written out yet.
             */

            struct bkey *top;

            bch_subtract_dirty(k, c, KEY_START(insert),
                       KEY_SIZE(insert));

            if (bkey_written(b, k)) {
                /*
                 * We insert a new key to cover the top of the
                 * old key, and the old key is modified in place
                 * to represent the bottom split.
                 *
                 * It's completely arbitrary whether the new key
                 * is the top or the bottom, but it has to match
                 * up with what btree_sort_fixup() does - it
                 * doesn't check for this kind of overlap, it
                 * depends on us inserting a new key for the top
                 * here.
                 */
                top = bch_bset_search(b, bset_tree_last(b),
                              insert);
                bch_bset_insert(b, top, k);
            } else {
                BKEY_PADDED(key) temp;
                bkey_copy(&temp.key, k);
                bch_bset_insert(b, k, &temp.key);
                top = bkey_next(k);
            }

            bch_cut_front(insert, top);
            bch_cut_back(&START_KEY(insert), k);
            bch_bset_fix_invalidated_key(b, k);
            goto out;
        }

        if (bkey_cmp(insert, k) < 0) {
            bch_cut_front(insert, k);
        } else {
            if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
                old_offset = KEY_START(insert);

            if (bkey_written(b, k) &&
                bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
                /*
                 * Completely overwrote, so we don't have to
                 * invalidate the binary search tree
                 */
                bch_cut_front(k, k);
            } else {
                __bch_cut_back(&START_KEY(insert), k);
                bch_bset_fix_invalidated_key(b, k);
            }
        }

        bch_subtract_dirty(k, c, old_offset, old_size - KEY_SIZE(k));
    }

check_failed:
    if (replace_key) {
        if (!sectors_found) {
            return true;
        } else if (sectors_found < KEY_SIZE(insert)) {
            SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
                       (KEY_SIZE(insert) - sectors_found));
            SET_KEY_SIZE(insert, sectors_found);
        }
    }
out:
    if (KEY_DIRTY(insert))
        bcache_dev_sectors_dirty_add(c, KEY_INODE(insert),
                         KEY_START(insert),
                         KEY_SIZE(insert));

    return false;
}

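/*
 * An extent is invalid if it has zero size, a size larger than its offset,
 * or pointers that fail __ptr_invalid(); except for the zero-size case, bad
 * extents are reported with cache_bug().
 */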
bool __bch_extent_invalid(struct cache_set *c, const struct bkey *k)
{
    char buf[80];

    if (!KEY_SIZE(k))
        return true;

    if (KEY_SIZE(k) > KEY_OFFSET(k))
        goto bad;

    if (__ptr_invalid(c, k))
        goto bad;

    return false;
bad:
    bch_extent_to_text(buf, sizeof(buf), k);
    cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
    return true;
}

static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
{
    struct btree *b = container_of(bk, struct btree, keys);

    return __bch_extent_invalid(b->c, k);
}

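/*
 * Extra (optional) consistency check on one pointer of an extent: the bucket
 * it references must not look like a metadata bucket, and its GC mark must
 * agree with the key's dirty bit. Only runs if the bucket lock can be taken
 * without blocking.
 */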
static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
                     unsigned int ptr)
{
    struct bucket *g = PTR_BUCKET(b->c, k, ptr);
    char buf[80];

    if (mutex_trylock(&b->c->bucket_lock)) {
        if (b->c->gc_mark_valid &&
            (!GC_MARK(g) ||
             GC_MARK(g) == GC_MARK_METADATA ||
             (GC_MARK(g) != GC_MARK_DIRTY && KEY_DIRTY(k))))
            goto err;

        if (g->prio == BTREE_PRIO)
            goto err;

        mutex_unlock(&b->c->bucket_lock);
    }

    return false;
err:
    mutex_unlock(&b->c->bucket_lock);
    bch_extent_to_text(buf, sizeof(buf), k);
    btree_bug(b,
"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu",
          buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
          g->prio, g->gen, g->last_gc, GC_MARK(g));
    return true;
}

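/*
 * Return true if @k should be skipped when reading extents: pointerless or
 * invalid keys, keys with unavailable or stale pointers, and (with expensive
 * debug checks enabled) keys failing the bucket checks above. Stale dirty
 * pointers are logged as well.
 */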
static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
{
    struct btree *b = container_of(bk, struct btree, keys);
    unsigned int i, stale;
    char buf[80];

    if (!KEY_PTRS(k) ||
        bch_extent_invalid(bk, k))
        return true;

    for (i = 0; i < KEY_PTRS(k); i++)
        if (!ptr_available(b->c, k, i))
            return true;

    for (i = 0; i < KEY_PTRS(k); i++) {
        stale = ptr_stale(b->c, k, i);

        if (stale && KEY_DIRTY(k)) {
            bch_extent_to_text(buf, sizeof(buf), k);
            pr_info("stale dirty pointer, stale %u, key: %s\n",
                stale, buf);
        }

        btree_bug_on(stale > BUCKET_GC_GEN_MAX, b,
                 "key too stale: %i, need_gc %u",
                 stale, b->c->need_gc);

        if (stale)
            return true;

        if (expensive_debug_checks(b->c) &&
            bch_extent_bad_expensive(b, k, i))
            return true;
    }

    return false;
}

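/*
 * Checksums live in the u64 slot after a key's last pointer; merging two
 * keys adds their checksums together, with the high bit masked off.
 */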
static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
{
    return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
        ~((uint64_t)1 << 63);
}

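/*
 * Try to merge @r into @l, which must be adjacent extents in the same
 * bucket(s): on success @l is extended to cover both and true is returned.
 * If the combined size would overflow KEY_SIZE, @l is grown to the maximum
 * and @r is trimmed instead, but the merge itself is refused.
 */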
static bool bch_extent_merge(struct btree_keys *bk,
                 struct bkey *l,
                 struct bkey *r)
{
    struct btree *b = container_of(bk, struct btree, keys);
    unsigned int i;

    if (key_merging_disabled(b->c))
        return false;

    for (i = 0; i < KEY_PTRS(l); i++)
        if (l->ptr[i] + MAKE_PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
            PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
            return false;

    /* Keys with no pointers aren't restricted to one bucket and could
     * overflow KEY_SIZE
     */
    if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
        SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
        SET_KEY_SIZE(l, USHRT_MAX);

        bch_cut_front(l, r);
        return false;
    }

    if (KEY_CSUM(l)) {
        if (KEY_CSUM(r))
            l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
        else
            SET_KEY_CSUM(l, 0);
    }

    SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
    SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));

    return true;
}

const struct btree_keys_ops bch_extent_keys_ops = {
    .sort_cmp   = bch_extent_sort_cmp,
    .sort_fixup = bch_extent_sort_fixup,
    .insert_fixup   = bch_extent_insert_fixup,
    .key_invalid    = bch_extent_invalid,
    .key_bad    = bch_extent_bad,
    .key_merge  = bch_extent_merge,
    .key_to_text    = bch_extent_to_text,
    .key_dump   = bch_bkey_dump,
    .is_extents = true,
};