/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHE_BTREE_H
#define _BCACHE_BTREE_H

/*
 * THE BTREE:
 *
 * At a high level, bcache's btree is a relatively standard b+ tree. All keys
 * and pointers are in the leaves; interior nodes only have pointers to the
 * child nodes.
 *
 * In the interior nodes, a struct bkey always points to a child btree node,
 * and the key is the highest key in the child node - except that the highest
 * key in an interior node is always MAX_KEY. The size field refers to the
 * size on disk of the child node - this would allow us to have variable sized
 * btree nodes (handy for keeping the depth of the btree at 1 by expanding
 * just the root).
 *
 * Btree nodes are themselves log structured, but this is hidden fairly
 * thoroughly. Btree nodes on disk will in practice have extents that overlap
 * (because they were written at different times), but in memory we never have
 * overlapping extents - when we read in a btree node from disk, the first
 * thing we do is resort all the sets of keys with a mergesort, and in the
 * same pass we check for overlapping extents and adjust them appropriately.
 *
 * struct btree_op is a central interface to the btree code. It's used for
 * specifying read vs. write locking, and the embedded closure is used for
 * waiting on IO or on a memory reserve.
 *
 * BTREE CACHE:
 *
 * Btree nodes are cached in memory; traversing the btree might require
 * reading in btree nodes, which is handled mostly transparently.
 *
 * bch_btree_node_get() looks up a btree node in the cache and reads it in
 * from disk if necessary. This function is almost never called directly
 * though - the bcache_btree() macro is used to get a btree node, call some
 * function on it, and unlock the node after the function returns.
 *
 * The root is special cased - it's taken out of the cache's lru (thus pinning
 * it in memory), so we can find the root of the btree by just dereferencing a
 * pointer instead of looking it up in the cache. This makes locking a bit
 * tricky, since the root pointer is protected by the lock in the btree node
 * it points to - the bcache_btree_root() macro handles this.
 *
 * In various places we must be able to allocate memory for multiple btree
 * nodes in order to make forward progress. To do this we use the btree cache
 * itself as a reserve; if __get_free_pages() fails, we'll find a node in the
 * btree cache we can reuse. We can't allow more than one thread to be doing
 * this at a time, so there's a lock, implemented by a pointer to the btree_op
 * closure - this allows the bcache_btree_root() macro to implicitly release
 * this lock.
 *
 * BTREE IO:
 *
 * Btree nodes never have to be explicitly read in; bch_btree_node_get()
 * handles this.
 *
 * For writing, we have two btree_write structs embedded in struct btree - one
 * write in flight, and one being set up, and we toggle between them.
 *
 * Writing is done with a single function - bch_btree_write() really serves
 * two different purposes and should be broken up into two different
 * functions. When passing now = false, it merely indicates that the node is
 * now dirty - calling it ensures that the dirty keys will be written at some
 * point in the future.
 *
 * When passing now = true, bch_btree_write() causes a write to happen
 * "immediately" (if there was already a write in flight, it'll cause the
 * write to happen as soon as the previous write completes). It returns
 * immediately though - but it takes a refcount on the closure in struct
 * btree_op you passed to it, so a closure_sync() later can be used to wait
 * for the write to complete.
 *
 * This is handy because btree_split() and garbage collection can issue writes
 * in parallel, reducing the amount of time they have to hold write locks.
 *
 * LOCKING:
 *
 * When traversing the btree, we may need write locks starting at some level -
 * inserting a key into the btree will typically only require a write lock on
 * the leaf node.
 *
 * This is specified with the lock field in struct btree_op; lock = 0 means we
 * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get()
 * checks this field and returns the node with the appropriate lock held.
 *
 * If, after traversing the btree, the insertion code discovers it has to
 * split, then it must restart from the root and take new locks - to do this
 * it changes the lock field and returns -EINTR, which causes the
 * bcache_btree_root() macro to loop.
 *
 * Handling cache misses requires a different mechanism for upgrading to a
 * write lock. We do cache lookups with only a read lock held, but if we get a
 * cache miss and we wish to insert this data into the cache, we have to
 * insert a placeholder key to detect races - otherwise, we could race with a
 * write and overwrite the data that was just written to the cache with stale
 * data from the backing device.
 *
 * For this we use a sequence number that write locks and unlocks increment -
 * to insert the check key, the btree node is unlocked, a write lock is taken,
 * and the insert fails if the sequence number doesn't match.
 */

#include "bset.h"
#include "debug.h"

struct btree_write {
    atomic_t        *journal;

    /* If btree_split() frees a btree node, it writes a new pointer to that
     * btree node indicating it was freed; it takes a refcount on
     * c->prio_blocked because we can't write the gens until the new
     * pointer is on disk. This allows btree_write_endio() to release the
     * refcount that btree_split() took.
     */
    int         prio_blocked;
};

struct btree {
    /* Hottest entries first */
    struct hlist_node   hash;

    /* Key/pointer for this btree node */
    BKEY_PADDED(key);

    unsigned long       seq;
    struct rw_semaphore lock;
    struct cache_set    *c;
    struct btree        *parent;

    struct mutex        write_lock;

    unsigned long       flags;
    uint16_t        written;    /* would be nice to kill */
    uint8_t         level;

    struct btree_keys   keys;

    /* For outstanding btree writes, used as a lock - protects write_idx */
    struct closure      io;
    struct semaphore    io_mutex;

    struct list_head    list;
    struct delayed_work work;

    struct btree_write  writes[2];
    struct bio      *bio;
};

#define BTREE_FLAG(flag)                        \
static inline bool btree_node_ ## flag(struct btree *b)         \
{   return test_bit(BTREE_NODE_ ## flag, &b->flags); }      \
                                    \
static inline void set_btree_node_ ## flag(struct btree *b)     \
{   set_bit(BTREE_NODE_ ## flag, &b->flags); }

enum btree_flags {
    BTREE_NODE_io_error,
    BTREE_NODE_dirty,
    BTREE_NODE_write_idx,
    BTREE_NODE_journal_flush,
};

BTREE_FLAG(io_error);
BTREE_FLAG(dirty);
BTREE_FLAG(write_idx);
BTREE_FLAG(journal_flush);
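
/*
 * For reference, BTREE_FLAG(dirty) above expands to roughly:
 *
 *	static inline bool btree_node_dirty(struct btree *b)
 *	{	return test_bit(BTREE_NODE_dirty, &b->flags); }
 *
 *	static inline void set_btree_node_dirty(struct btree *b)
 *	{	set_bit(BTREE_NODE_dirty, &b->flags); }
 */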

static inline struct btree_write *btree_current_write(struct btree *b)
{
    return b->writes + btree_node_write_idx(b);
}

static inline struct btree_write *btree_prev_write(struct btree *b)
{
    return b->writes + (btree_node_write_idx(b) ^ 1);
}
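
/*
 * Illustrative sketch (not part of the original header): the write_idx flag
 * picks which of the two embedded btree_write structs is currently being
 * filled. Toggling it swaps the meaning of btree_current_write() and
 * btree_prev_write(); example_toggle_write_idx() is a hypothetical name.
 */
static inline void example_toggle_write_idx(struct btree *b)
{
    /* afterwards, btree_current_write(b) returns the other btree_write */
    change_bit(BTREE_NODE_write_idx, &b->flags);
}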

static inline struct bset *btree_bset_first(struct btree *b)
{
    return b->keys.set->data;
}

static inline struct bset *btree_bset_last(struct btree *b)
{
    return bset_tree_last(&b->keys)->data;
}

static inline unsigned int bset_block_offset(struct btree *b, struct bset *i)
{
    return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
}

static inline void set_gc_sectors(struct cache_set *c)
{
    atomic_set(&c->sectors_to_gc, c->cache->sb.bucket_size * c->nbuckets / 16);
}
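
/*
 * Worked example (illustrative numbers, not from the original header): with
 * bucket_size = 1024 sectors (512 KiB buckets) and nbuckets = 262144,
 * set_gc_sectors() arms sectors_to_gc at 1024 * 262144 / 16 = 16777216
 * sectors, i.e. garbage collection is triggered after roughly 1/16th of the
 * cache's capacity (8 GiB of a 128 GiB cache here) has been written.
 */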

void bkey_put(struct cache_set *c, struct bkey *k);

/* Looping macros */

#define for_each_cached_btree(b, c, iter)               \
    for (iter = 0;                          \
         iter < ARRAY_SIZE((c)->bucket_hash);           \
         iter++)                            \
        hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)
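
/*
 * Illustrative sketch (not part of the original header): walking every btree
 * node currently in the cache, e.g. to count them. The hash chains are
 * traversed with hlist_for_each_entry_rcu(), so a real caller would hold
 * rcu_read_lock() (or the appropriate cache lock) around the loop.
 * example_count_cached_btree_nodes() is a hypothetical name.
 */
static inline unsigned int example_count_cached_btree_nodes(struct cache_set *c)
{
    struct btree *b;
    unsigned int iter, nr = 0;

    for_each_cached_btree(b, c, iter)
        nr++;
    return nr;
}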

/* Recursing down the btree */

struct btree_op {
    /* for waiting on btree reserve in btree_split() */
    wait_queue_entry_t      wait;

    /* Btree level at which we start taking write locks */
    short           lock;

    unsigned int        insert_collision:1;
};

struct btree_check_state;
struct btree_check_info {
    struct btree_check_state    *state;
    struct task_struct      *thread;
    int             result;
};

#define BCH_BTR_CHKTHREAD_MAX   12
struct btree_check_state {
    struct cache_set        *c;
    int             total_threads;
    int             key_idx;
    spinlock_t          idx_lock;
    atomic_t            started;
    atomic_t            enough;
    wait_queue_head_t       wait;
    struct btree_check_info     infos[BCH_BTR_CHKTHREAD_MAX];
};

static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
{
    memset(op, 0, sizeof(struct btree_op));
    init_wait(&op->wait);
    op->lock = write_lock_level;
}
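
/*
 * Illustrative sketch (not part of the original header): setting up a
 * btree_op for an operation that only needs write locks on leaf nodes
 * (lock = 0 means "write lock at level <= 0", per the comment at the top of
 * this file). example_init_leaf_insert_op() is a hypothetical name.
 */
static inline void example_init_leaf_insert_op(struct btree_op *op)
{
    bch_btree_op_init(op, 0);
}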

static inline void rw_lock(bool w, struct btree *b, int level)
{
    w ? down_write_nested(&b->lock, level + 1)
      : down_read_nested(&b->lock, level + 1);
    if (w)
        b->seq++;
}

static inline void rw_unlock(bool w, struct btree *b)
{
    if (w)
        b->seq++;
    (w ? up_write : up_read)(&b->lock);
}
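
/*
 * Illustrative sketch (not part of the original header) of the sequence
 * number scheme described at the top of this file: remember b->seq under a
 * read lock, drop it, retake the node write locked, and detect whether
 * anyone else write locked the node in between. example_upgrade_to_write()
 * is a hypothetical helper, not a real bcache function.
 */
static inline bool example_upgrade_to_write(struct btree *b, int level)
{
    unsigned long seq = b->seq;

    rw_unlock(false, b);
    rw_lock(true, b, level);

    /*
     * rw_lock(true, ...) bumped seq by exactly one; any other value means
     * another writer got in first and our earlier lookup may be stale.
     */
    return b->seq == seq + 1;
}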

void bch_btree_node_read_done(struct btree *b);
void __bch_btree_node_write(struct btree *b, struct closure *parent);
void bch_btree_node_write(struct btree *b, struct closure *parent);

void bch_btree_set_root(struct btree *b);
struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
                     int level, bool wait,
                     struct btree *parent);
struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
                 struct bkey *k, int level, bool write,
                 struct btree *parent);

int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
                   struct bkey *check_key);
int bch_btree_insert(struct cache_set *c, struct keylist *keys,
             atomic_t *journal_ref, struct bkey *replace_key);

int bch_gc_thread_start(struct cache_set *c);
void bch_initial_gc_finish(struct cache_set *c);
void bch_moving_gc(struct cache_set *c);
int bch_btree_check(struct cache_set *c);
void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k);

static inline void wake_up_gc(struct cache_set *c)
{
    wake_up(&c->gc_wait);
}

static inline void force_wake_up_gc(struct cache_set *c)
{
    /*
     * The garbage collection thread only runs when sectors_to_gc < 0,
     * so calling wake_up_gc() on its own won't start it while
     * sectors_to_gc is still non-negative. Set sectors_to_gc to -1
     * first, then wake the gc thread with wake_up_gc(), so that
     * gc_should_run() gets a chance to let it run. It is only a
     * chance: before gc_should_run() is evaluated, c->sectors_to_gc
     * may be set to some other positive value again, so this routine
     * does not guarantee that the gc thread will actually run.
     */
    atomic_set(&c->sectors_to_gc, -1);
    wake_up_gc(c);
}

/*
 * These macros are for recursing down the btree - they handle the details of
 * locking and looking up nodes in the cache for you. They're best treated as
 * mere syntax when reading code that uses them.
 *
 * op->lock determines whether we take a read or a write lock at a given depth.
 * If you've got a read lock and find that you need a write lock (i.e. you're
 * going to have to split), set op->lock and return -EINTR; bcache_btree_root()
 * will call you again and you'll have the correct lock.
 */

/**
 * bcache_btree - recurse down the btree on a specified key
 * @fn:     function to call, which will be passed the child node
 * @key:    key to recurse on
 * @b:      parent btree node
 * @op:     pointer to struct btree_op
 */
#define bcache_btree(fn, key, b, op, ...)               \
({                                  \
    int _r, l = (b)->level - 1;                 \
    bool _w = l <= (op)->lock;                  \
    struct btree *_child = bch_btree_node_get((b)->c, op, key, l,   \
                          _w, b);       \
    if (!IS_ERR(_child)) {                      \
        _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);   \
        rw_unlock(_w, _child);                  \
    } else                              \
        _r = PTR_ERR(_child);                   \
    _r;                             \
})
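
/*
 * Illustrative sketch (not part of the original header): with the recursion
 * helpers declared further below, a call like
 *
 *	bcache_btree(map_keys_recurse, k, b, op, from, fn, flags)
 *
 * looks up the child node that k points to (write locked iff its level is
 * <= op->lock), expands to a call to
 * bch_btree_map_keys_recurse(child, op, from, fn, flags), unlocks the child
 * again, and evaluates to that function's return value (or to PTR_ERR() if
 * the cache lookup failed).
 */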

/**
 * bcache_btree_root - call a function on the root of the btree
 * @fn:     function to call, which will be passed the root node
 * @c:      cache set
 * @op:     pointer to struct btree_op
 */
#define bcache_btree_root(fn, c, op, ...)               \
({                                  \
    int _r = -EINTR;                        \
    do {                                \
        struct btree *_b = (c)->root;               \
        bool _w = insert_lock(op, _b);              \
        rw_lock(_w, _b, _b->level);             \
        if (_b == (c)->root &&                  \
            _w == insert_lock(op, _b)) {            \
            _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);   \
        }                           \
        rw_unlock(_w, _b);                  \
        bch_cannibalize_unlock(c);              \
        if (_r == -EINTR)                   \
            schedule();                 \
    } while (_r == -EINTR);                     \
                                    \
    finish_wait(&(c)->btree_cache_wait, &(op)->wait);       \
    _r;                             \
})
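
/*
 * Illustrative sketch (not part of the original header): driving a whole
 * traversal from the root. Something like
 *
 *	bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags)
 *
 * locks the current root with whatever lock type insert_lock(op, root) asks
 * for, rechecks that it is still the root, calls
 * bch_btree_map_nodes_recurse(root, op, from, fn, flags), and retries the
 * whole thing from a fresh root whenever that call returns -EINTR (e.g.
 * after a split asked for stronger locks). The map_nodes_recurse name here
 * is an assumption for illustration.
 */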

#define MAP_DONE    0
#define MAP_CONTINUE    1

#define MAP_ALL_NODES   0
#define MAP_LEAF_NODES  1

#define MAP_END_KEY 1

typedef int (btree_map_nodes_fn)(struct btree_op *b_op, struct btree *b);
int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
              struct bkey *from, btree_map_nodes_fn *fn, int flags);

static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
                      struct bkey *from, btree_map_nodes_fn *fn)
{
    return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES);
}

static inline int bch_btree_map_leaf_nodes(struct btree_op *op,
                       struct cache_set *c,
                       struct bkey *from,
                       btree_map_nodes_fn *fn)
{
    return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES);
}
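
/*
 * Illustrative sketch (not part of the original header): a minimal
 * btree_map_nodes_fn callback plus a driver for it. The example_* names,
 * the lock level of -1 (read locks at every level, per the macros above) and
 * the use of &ZERO_KEY as the starting key are assumptions made for
 * illustration only.
 */
static inline int example_visit_node(struct btree_op *op, struct btree *b)
{
    /* a real callback would inspect b here, e.g. b->level or b->keys */
    return MAP_CONTINUE;    /* keep walking; MAP_DONE would stop early */
}

static inline int example_walk_all_nodes(struct cache_set *c)
{
    struct btree_op op;

    bch_btree_op_init(&op, -1);     /* -1: never take write locks */
    return bch_btree_map_nodes(&op, c, &ZERO_KEY, example_visit_node);
}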

typedef int (btree_map_keys_fn)(struct btree_op *op, struct btree *b,
                struct bkey *k);
int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
               struct bkey *from, btree_map_keys_fn *fn, int flags);
int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
                   struct bkey *from, btree_map_keys_fn *fn,
                   int flags);
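
/*
 * Illustrative sketch (not part of the original header): a btree_map_keys_fn
 * callback that just counts the keys it is shown, stashing the count in a
 * structure that wraps the btree_op (a common pattern: embed struct btree_op
 * in a larger op and use container_of() in the callback). The example_*
 * names are hypothetical.
 */
struct example_count_op {
    struct btree_op op;
    u64             nr_keys;
};

static inline int example_count_key(struct btree_op *b_op, struct btree *b,
                    struct bkey *k)
{
    struct example_count_op *op =
        container_of(b_op, struct example_count_op, op);

    op->nr_keys++;
    return MAP_CONTINUE;
}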

typedef bool (keybuf_pred_fn)(struct keybuf *buf, struct bkey *k);

void bch_keybuf_init(struct keybuf *buf);
void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
               struct bkey *end, keybuf_pred_fn *pred);
bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
                  struct bkey *end);
void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w);
struct keybuf_key *bch_keybuf_next(struct keybuf *buf);
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
                      struct keybuf *buf,
                      struct bkey *end,
                      keybuf_pred_fn *pred);
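
/*
 * Illustrative sketch (not part of the original header): a keybuf_pred_fn
 * that accepts only dirty keys, in the spirit of the predicate the writeback
 * code feeds to bch_refill_keybuf(). example_dirty_pred() is a hypothetical
 * name.
 */
static inline bool example_dirty_pred(struct keybuf *buf, struct bkey *k)
{
    return KEY_DIRTY(k);
}
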
void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats);

#endif /* _BCACHE_BTREE_H */