Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 #ifndef _BCACHE_JOURNAL_H
0003 #define _BCACHE_JOURNAL_H
0004 
0005 /*
0006  * THE JOURNAL:
0007  *
0008  * The journal is treated as a circular buffer of buckets - a journal entry
0009  * never spans two buckets. This means (not implemented yet) we can resize the
0010  * journal at runtime, and this will be needed for bcache on raw flash support.
0011  *
0012  * Journal entries contain a list of keys, ordered by the time they were
0013  * inserted; thus journal replay just has to reinsert the keys.
0014  *
0015  * We also keep some things in the journal header that are logically part of the
0016  * superblock - all the things that are frequently updated. This is for future
0017  * bcache on raw flash support; the superblock (which will become another
0018  * journal) can't be moved or wear leveled, so it contains just enough
0019  * information to find the main journal, and the superblock only has to be
0020  * rewritten when we want to move/wear level the main journal.
0021  *
0022  * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be
0023  * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions
0024  * from cache misses, which don't have to be journaled, and for writeback and
0025  * moving gc we work around it by flushing the btree to disk before updating the
0026  * gc information. But it is a potential issue with incremental garbage
0027  * collection, and it's fragile.
0028  *
0029  * OPEN JOURNAL ENTRIES:
0030  *
0031  * Each journal entry contains, in the header, the sequence number of the last
0032  * journal entry still open - i.e. that has keys that haven't been flushed to
0033  * disk in the btree.
0034  *
0035  * We track this by maintaining a refcount for every open journal entry, in a
0036  * fifo; each entry in the fifo corresponds to a particular journal
0037  * entry/sequence number. When the refcount at the tail of the fifo goes to
0038  * zero, we pop it off - thus, the size of the fifo tells us the number of open
0039  * journal entries.
0040  *
0041  * We take a refcount on a journal entry when we add some keys to a journal
0042  * entry that we're going to insert (held by struct btree_op), and then when we
0043  * insert those keys into the btree the btree write we're setting up takes a
0044  * copy of that refcount (held by struct btree_write). That refcount is dropped
0045  * when the btree write completes.
0046  *
0047  * A struct btree_write can only hold a refcount on a single journal entry, but
0048  * might contain keys for many journal entries - we handle this by making sure
0049  * it always has a refcount on the _oldest_ journal entry of all the journal
0050  * entries it has keys for.
0051  *
0052  * JOURNAL RECLAIM:
0053  *
0054  * As mentioned previously, our fifo of refcounts tells us the number of open
0055  * journal entries; from that and the current journal sequence number we compute
0056  * last_seq - the oldest journal entry we still need. We write last_seq in each
0057  * journal entry, and we also have to keep track of where it exists on disk so
0058  * we don't overwrite it when we loop around the journal.
0059  *
0060  * To do that we track, for each journal bucket, the sequence number of the
0061  * newest journal entry it contains - if we don't need that journal entry we
0062  * don't need anything in that bucket anymore. From that we track the last
0063  * journal bucket we still need; all this is tracked in struct journal_device
0064  * and updated by journal_reclaim().
0065  *
0066  * JOURNAL FILLING UP:
0067  *
0068  * There are two ways the journal could fill up; either we could run out of
0069  * space to write to, or we could have too many open journal entries and run out
0070  * of room in the fifo of refcounts. Since those refcounts are decremented
0071  * without any locking we can't safely resize that fifo, so we handle it the
0072  * same way.
0073  *
0074  * If the journal fills up, we start flushing dirty btree nodes until we can
0075  * allocate space for a journal write again - preferentially flushing btree
0076  * nodes that are pinning the oldest journal entries first.
0077  */
0078 
0079 /*
0080  * Only used for holding the journal entries we read in bch_journal_read()
0081  * during cache registration.
0082  */
0083 struct journal_replay {
         /*
          * List linkage. NOTE(review): presumably chained onto the list_head
          * that bch_journal_read()/bch_journal_replay() take - confirm.
          */
0084     struct list_head    list;
         /*
          * NOTE(review): presumably points at this entry's refcount in the
          * journal's pin fifo (see struct journal) - confirm.
          */
0085     atomic_t        *pin;
         /* The journal entry that was read, embedded by value as the last member */
0086     struct jset     j;
0087 };
0088 
0089 /*
0090  * We put two of these in struct journal; we use them for writes to the
0091  * journal that are being staged or in flight.
0092  */
0093 struct journal_write {
         /* The journal entry (struct jset) this write carries */
0094     struct jset     *data;
         /*
          * Not referenced anywhere else in this header. NOTE(review): presumably
          * relates to the size of the buffer behind data - confirm.
          */
0095 #define JSET_BITS       3
0096 
         /* NOTE(review): presumably the cache set that owns this journal - confirm */
0097     struct cache_set    *c;
         /* NOTE(review): presumably closures waiting on this write - confirm */
0098     struct closure_waitlist wait;
         /*
          * State flags for this write. NOTE(review): exact meaning is not
          * visible in this header - confirm against the code that sets them.
          */
0099     bool            dirty;
0100     bool            need_write;
0101 };
0102 
0103 /* Embedded in struct cache_set */
0104 struct journal {
0105     spinlock_t      lock;
0106     spinlock_t      flush_write_lock;
0107     bool            btree_flushing;
0108     bool            do_reserve;
0109     /* used when waiting because the journal was full */
0110     struct closure_waitlist wait;
0111     struct closure      io;
0112     int         io_in_flight;
0113     struct delayed_work work;
0114 
0115     /*
          * Number of blocks free in the bucket(s) we're currently writing to.
          * journal_full() reports the journal as full when this is zero.
          */
0116     unsigned int        blocks_free;
         /*
          * NOTE(review): presumably the current journal sequence number that the
          * overview comment at the top of this header refers to - confirm.
          */
0117     uint64_t        seq;
         /*
          * Fifo of atomic_t refcounts named pin; see OPEN JOURNAL ENTRIES in the
          * overview comment. journal_full() also reports full when fifo_free()
          * of this fifo is 1 or less.
          */
0118     DECLARE_FIFO(atomic_t, pin);
0119 
0120     BKEY_PADDED(key);
0121 
         /*
          * The two journal_write slots, plus a pointer to a journal_write.
          * NOTE(review): cur presumably points at whichever of w[] is currently
          * being filled - confirm.
          */
0122     struct journal_write    w[2], *cur;
0123 };
0124 
0125 /*
0126  * Embedded in struct cache. First three fields refer to the array of journal
0127  * buckets, in cache_sb.
      *
      * NOTE(review): as laid out here, seq[] comes first and is followed by the
      * three bucket indices (cur_idx, last_idx, discard_idx); "first three
      * fields" presumably means those indices - confirm.
0128  */
0129 struct journal_device {
0130     /*
0131      * For each journal bucket, contains the max sequence number of the
0132      * journal writes it contains - so we know when a bucket can be reused.
0133      */
0134     uint64_t        seq[SB_JOURNAL_BUCKETS];
0135 
0136     /* Journal bucket we're currently writing to */
0137     unsigned int        cur_idx;
0138 
0139     /* Last journal bucket that still contains an open journal entry */
0140     unsigned int        last_idx;
0141 
0142     /* Next journal bucket to be discarded */
0143     unsigned int        discard_idx;
0144 
0145 #define DISCARD_READY       0
0146 #define DISCARD_IN_FLIGHT   1
0147 #define DISCARD_DONE        2
0148     /*
          * Discard state. NOTE(review): presumably holds one of the DISCARD_*
          * values defined just above; the previous wording of this comment
          * ("1 - discard in flight, -1 - discard completed") does not match
          * DISCARD_DONE == 2 - confirm against the users of this field.
          */
0149     atomic_t        discard_in_flight;
0150 
0151     struct work_struct  discard_work;
0152     struct bio      discard_bio;
0153     struct bio_vec      discard_bv;
0154 
0155     /* Bio for journal reads/writes to this device */
0156     struct bio      bio;
         /* NOTE(review): presumably the bio_vec array backing bio above - confirm */
0157     struct bio_vec      bv[8];
0158 };
0159 
0160 #define BTREE_FLUSH_NR  8   /* not referenced elsewhere in this header */
0161 
     /*
      * True when l has a greater fifo_idx() than r within (c)->journal.pin.
      * c must point to an object with an embedded struct journal named
      * "journal" (struct cache_set, per the comment on struct journal).
      * NOTE(review): l and r are presumably pointers to pin entries - confirm
      * against fifo_idx(). c is expanded twice, so avoid side effects in it.
      */
0162 #define journal_pin_cmp(c, l, r)                \
0163     (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))
0164 
     /* NOTE(review): presumably the capacity of the journal.pin fifo - confirm */
0165 #define JOURNAL_PIN 20000
0166 
     /*
      * True when j has no free blocks left, or when fifo_free() of its pin
      * fifo is 1 or less (the two "journal filling up" cases described in the
      * overview comment). j is a struct journal pointer and is expanded twice,
      * so avoid arguments with side effects.
      */
0167 #define journal_full(j)                     \
0168     (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1)
0169 
0170 struct closure;     /* forward declarations for the pointer types used below */
0171 struct cache_set;
0172 struct btree_op;    /* not used by any declaration in this header */
0173 struct keylist;
0174 
     /*
      * NOTE(review): the functions below are only declared here; the remarks
      * on them are inferred from their signatures and from the overview comment
      * at the top of this header - confirm against the implementation.
      *
      * bch_journal() takes a keylist and returns an atomic_t pointer,
      * presumably the refcount (pin) of the journal entry the keys went into.
      */
0175 atomic_t *bch_journal(struct cache_set *c,
0176               struct keylist *keys,
0177               struct closure *parent);
0178 void bch_journal_next(struct journal *j);
0179 void bch_journal_mark(struct cache_set *c, struct list_head *list);
0180 void bch_journal_meta(struct cache_set *c, struct closure *cl);
     /*
      * Both take a list_head and return int. NOTE(review): the list is
      * presumably made of struct journal_replay entries, and the return value
      * presumably 0 or a negative error code - confirm.
      */
0181 int bch_journal_read(struct cache_set *c, struct list_head *list);
0182 int bch_journal_replay(struct cache_set *c, struct list_head *list);
0183 
     /* Setup/teardown pair operating on a cache_set; alloc returns int, free is void */
0184 void bch_journal_free(struct cache_set *c);
0185 int bch_journal_alloc(struct cache_set *c);
0186 void bch_journal_space_reserve(struct journal *j);
0187 
0188 #endif /* _BCACHE_JOURNAL_H */