md/bcache/super.c

0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * bcache setup/teardown code, and some metadata io - read a superblock and
0004  * figure out what to do with it.
0005  *
0006  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
0007  * Copyright 2012 Google, Inc.
0008  */
0009
0010 #include "bcache.h"
0011 #include "btree.h"
0012 #include "debug.h"
0013 #include "extents.h"
0014 #include "request.h"
0015 #include "writeback.h"
0016 #include "features.h"
0017
0018 #include <linux/blkdev.h>
0019 #include <linux/pagemap.h>
0020 #include <linux/debugfs.h>
0021 #include <linux/idr.h>
0022 #include <linux/kthread.h>
0023 #include <linux/workqueue.h>
0024 #include <linux/module.h>
0025 #include <linux/random.h>
0026 #include <linux/reboot.h>
0027 #include <linux/sysfs.h>
0028
0029 unsigned int bch_cutoff_writeback;
0030 unsigned int bch_cutoff_writeback_sync;
0031
0032 static const char bcache_magic[] = {
0033     0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
0034     0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
0035 };
0036
0037 static const char invalid_uuid[] = {
0038     0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
0039     0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
0040 };
0041
0042 static struct kobject *bcache_kobj;
0043 struct mutex bch_register_lock;
0044 bool bcache_is_reboot;
0045 LIST_HEAD(bch_cache_sets);
0046 static LIST_HEAD(uncached_devices);
0047
0048 static int bcache_major;
0049 static DEFINE_IDA(bcache_device_idx);
0050 static wait_queue_head_t unregister_wait;
0051 struct workqueue_struct *bcache_wq;
0052 struct workqueue_struct *bch_flush_wq;
0053 struct workqueue_struct *bch_journal_wq;
0054
0055
0056 #define BTREE_MAX_PAGES     (256 * 1024 / PAGE_SIZE)
0057 /* limitation of partitions number on single bcache device */
0058 #define BCACHE_MINORS       128
0059 /* limitation of bcache devices number on single system */
0060 #define BCACHE_DEVICE_IDX_MAX   ((1U << MINORBITS)/BCACHE_MINORS)
0061
0062 /* Superblock */
0063
0064 static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s)
0065 {
0066     unsigned int bucket_size = le16_to_cpu(s->bucket_size);
0067
0068     if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
0069         if (bch_has_feature_large_bucket(sb)) {
0070             unsigned int max, order;
0071
0072             max = sizeof(unsigned int) * BITS_PER_BYTE - 1;
0073             order = le16_to_cpu(s->bucket_size);
0074             /*
0075              * bcache tool will make sure the overflow won't
0076              * happen, an error message here is enough.
0077              */
0078             if (order > max)
0079                 pr_err("Bucket size (1 << %u) overflows\n",
0080                     order);
0081             bucket_size = 1 << order;
0082         } else if (bch_has_feature_obso_large_bucket(sb)) {
0083             bucket_size +=
0084                 le16_to_cpu(s->obso_bucket_size_hi) << 16;
0085         }
0086     }
0087
0088     return bucket_size;
0089 }
0090
0091 static const char *read_super_common(struct cache_sb *sb,  struct block_device *bdev,
0092                      struct cache_sb_disk *s)
0093 {
0094     const char *err;
0095     unsigned int i;
0096
0097     sb->first_bucket= le16_to_cpu(s->first_bucket);
0098     sb->nbuckets    = le64_to_cpu(s->nbuckets);
0099     sb->bucket_size = get_bucket_size(sb, s);
0100
0101     sb->nr_in_set   = le16_to_cpu(s->nr_in_set);
0102     sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
0103
0104     err = "Too many journal buckets";
0105     if (sb->keys > SB_JOURNAL_BUCKETS)
0106         goto err;
0107
0108     err = "Too many buckets";
0109     if (sb->nbuckets > LONG_MAX)
0110         goto err;
0111
0112     err = "Not enough buckets";
0113     if (sb->nbuckets < 1 << 7)
0114         goto err;
0115
0116     err = "Bad block size (not power of 2)";
0117     if (!is_power_of_2(sb->block_size))
0118         goto err;
0119
0120     err = "Bad block size (larger than page size)";
0121     if (sb->block_size > PAGE_SECTORS)
0122         goto err;
0123
0124     err = "Bad bucket size (not power of 2)";
0125     if (!is_power_of_2(sb->bucket_size))
0126         goto err;
0127
0128     err = "Bad bucket size (smaller than page size)";
0129     if (sb->bucket_size < PAGE_SECTORS)
0130         goto err;
0131
0132     err = "Invalid superblock: device too small";
0133     if (get_capacity(bdev->bd_disk) <
0134         sb->bucket_size * sb->nbuckets)
0135         goto err;
0136
0137     err = "Bad UUID";
0138     if (bch_is_zero(sb->set_uuid, 16))
0139         goto err;
0140
0141     err = "Bad cache device number in set";
0142     if (!sb->nr_in_set ||
0143         sb->nr_in_set <= sb->nr_this_dev ||
0144         sb->nr_in_set > MAX_CACHES_PER_SET)
0145         goto err;
0146
0147     err = "Journal buckets not sequential";
0148     for (i = 0; i < sb->keys; i++)
0149         if (sb->d[i] != sb->first_bucket + i)
0150             goto err;
0151
0152     err = "Too many journal buckets";
0153     if (sb->first_bucket + sb->keys > sb->nbuckets)
0154         goto err;
0155
0156     err = "Invalid superblock: first bucket comes before end of super";
0157     if (sb->first_bucket * sb->bucket_size < 16)
0158         goto err;
0159
0160     err = NULL;
0161 err:
0162     return err;
0163 }
0164
0165
0166 static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
0167                   struct cache_sb_disk **res)
0168 {
0169     const char *err;
0170     struct cache_sb_disk *s;
0171     struct page *page;
0172     unsigned int i;
0173
0174     page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
0175                    SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
0176     if (IS_ERR(page))
0177         return "IO error";
0178     s = page_address(page) + offset_in_page(SB_OFFSET);
0179
0180     sb->offset      = le64_to_cpu(s->offset);
0181     sb->version     = le64_to_cpu(s->version);
0182
0183     memcpy(sb->magic,   s->magic, 16);
0184     memcpy(sb->uuid,    s->uuid, 16);
0185     memcpy(sb->set_uuid,    s->set_uuid, 16);
0186     memcpy(sb->label,   s->label, SB_LABEL_SIZE);
0187
0188     sb->flags       = le64_to_cpu(s->flags);
0189     sb->seq         = le64_to_cpu(s->seq);
0190     sb->last_mount      = le32_to_cpu(s->last_mount);
0191     sb->keys        = le16_to_cpu(s->keys);
0192
0193     for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
0194         sb->d[i] = le64_to_cpu(s->d[i]);
0195
0196     pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u\n",
0197          sb->version, sb->flags, sb->seq, sb->keys);
0198
0199     err = "Not a bcache superblock (bad offset)";
0200     if (sb->offset != SB_SECTOR)
0201         goto err;
0202
0203     err = "Not a bcache superblock (bad magic)";
0204     if (memcmp(sb->magic, bcache_magic, 16))
0205         goto err;
0206
0207     err = "Bad checksum";
0208     if (s->csum != csum_set(s))
0209         goto err;
0210
0211     err = "Bad UUID";
0212     if (bch_is_zero(sb->uuid, 16))
0213         goto err;
0214
0215     sb->block_size  = le16_to_cpu(s->block_size);
0216
0217     err = "Superblock block size smaller than device block size";
0218     if (sb->block_size << 9 < bdev_logical_block_size(bdev))
0219         goto err;
0220
0221     switch (sb->version) {
0222     case BCACHE_SB_VERSION_BDEV:
0223         sb->data_offset = BDEV_DATA_START_DEFAULT;
0224         break;
0225     case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
0226     case BCACHE_SB_VERSION_BDEV_WITH_FEATURES:
0227         sb->data_offset = le64_to_cpu(s->data_offset);
0228
0229         err = "Bad data offset";
0230         if (sb->data_offset < BDEV_DATA_START_DEFAULT)
0231             goto err;
0232
0233         break;
0234     case BCACHE_SB_VERSION_CDEV:
0235     case BCACHE_SB_VERSION_CDEV_WITH_UUID:
0236         err = read_super_common(sb, bdev, s);
0237         if (err)
0238             goto err;
0239         break;
0240     case BCACHE_SB_VERSION_CDEV_WITH_FEATURES:
0241         /*
0242          * Feature bits are needed in read_super_common(),
0243          * convert them firstly.
0244          */
0245         sb->feature_compat = le64_to_cpu(s->feature_compat);
0246         sb->feature_incompat = le64_to_cpu(s->feature_incompat);
0247         sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat);
0248
0249         /* Check incompatible features */
0250         err = "Unsupported compatible feature found";
0251         if (bch_has_unknown_compat_features(sb))
0252             goto err;
0253
0254         err = "Unsupported read-only compatible feature found";
0255         if (bch_has_unknown_ro_compat_features(sb))
0256             goto err;
0257
0258         err = "Unsupported incompatible feature found";
0259         if (bch_has_unknown_incompat_features(sb))
0260             goto err;
0261
0262         err = read_super_common(sb, bdev, s);
0263         if (err)
0264             goto err;
0265         break;
0266     default:
0267         err = "Unsupported superblock version";
0268         goto err;
0269     }
0270
0271     sb->last_mount = (u32)ktime_get_real_seconds();
0272     *res = s;
0273     return NULL;
0274 err:
0275     put_page(page);
0276     return err;
0277 }
0278
0279 static void write_bdev_super_endio(struct bio *bio)
0280 {
0281     struct cached_dev *dc = bio->bi_private;
0282
0283     if (bio->bi_status)
0284         bch_count_backing_io_errors(dc, bio);
0285
0286     closure_put(&dc->sb_write);
0287 }
0288
0289 static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
0290         struct bio *bio)
0291 {
0292     unsigned int i;
0293
0294     bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;
0295     bio->bi_iter.bi_sector  = SB_SECTOR;
0296     __bio_add_page(bio, virt_to_page(out), SB_SIZE,
0297             offset_in_page(out));
0298
0299     out->offset     = cpu_to_le64(sb->offset);
0300
0301     memcpy(out->uuid,   sb->uuid, 16);
0302     memcpy(out->set_uuid,   sb->set_uuid, 16);
0303     memcpy(out->label,  sb->label, SB_LABEL_SIZE);
0304
0305     out->flags      = cpu_to_le64(sb->flags);
0306     out->seq        = cpu_to_le64(sb->seq);
0307
0308     out->last_mount     = cpu_to_le32(sb->last_mount);
0309     out->first_bucket   = cpu_to_le16(sb->first_bucket);
0310     out->keys       = cpu_to_le16(sb->keys);
0311
0312     for (i = 0; i < sb->keys; i++)
0313         out->d[i] = cpu_to_le64(sb->d[i]);
0314
0315     if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
0316         out->feature_compat    = cpu_to_le64(sb->feature_compat);
0317         out->feature_incompat  = cpu_to_le64(sb->feature_incompat);
0318         out->feature_ro_compat = cpu_to_le64(sb->feature_ro_compat);
0319     }
0320
0321     out->version        = cpu_to_le64(sb->version);
0322     out->csum = csum_set(out);
0323
0324     pr_debug("ver %llu, flags %llu, seq %llu\n",
0325          sb->version, sb->flags, sb->seq);
0326
0327     submit_bio(bio);
0328 }
0329
0330 static void bch_write_bdev_super_unlock(struct closure *cl)
0331 {
0332     struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
0333
0334     up(&dc->sb_write_mutex);
0335 }
0336
0337 void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
0338 {
0339     struct closure *cl = &dc->sb_write;
0340     struct bio *bio = &dc->sb_bio;
0341
0342     down(&dc->sb_write_mutex);
0343     closure_init(cl, parent);
0344
0345     bio_init(bio, dc->bdev, dc->sb_bv, 1, 0);
0346     bio->bi_end_io  = write_bdev_super_endio;
0347     bio->bi_private = dc;
0348
0349     closure_get(cl);
0350     /* I/O request sent to backing device */
0351     __write_super(&dc->sb, dc->sb_disk, bio);
0352
0353     closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
0354 }
0355
0356 static void write_super_endio(struct bio *bio)
0357 {
0358     struct cache *ca = bio->bi_private;
0359
0360     /* is_read = 0 */
0361     bch_count_io_errors(ca, bio->bi_status, 0,
0362                 "writing superblock");
0363     closure_put(&ca->set->sb_write);
0364 }
0365
0366 static void bcache_write_super_unlock(struct closure *cl)
0367 {
0368     struct cache_set *c = container_of(cl, struct cache_set, sb_write);
0369
0370     up(&c->sb_write_mutex);
0371 }
0372
0373 void bcache_write_super(struct cache_set *c)
0374 {
0375     struct closure *cl = &c->sb_write;
0376     struct cache *ca = c->cache;
0377     struct bio *bio = &ca->sb_bio;
0378     unsigned int version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
0379
0380     down(&c->sb_write_mutex);
0381     closure_init(cl, &c->cl);
0382
0383     ca->sb.seq++;
0384
0385     if (ca->sb.version < version)
0386         ca->sb.version = version;
0387
0388     bio_init(bio, ca->bdev, ca->sb_bv, 1, 0);
0389     bio->bi_end_io  = write_super_endio;
0390     bio->bi_private = ca;
0391
0392     closure_get(cl);
0393     __write_super(&ca->sb, ca->sb_disk, bio);
0394
0395     closure_return_with_destructor(cl, bcache_write_super_unlock);
0396 }
0397
0398 /* UUID io */
0399
0400 static void uuid_endio(struct bio *bio)
0401 {
0402     struct closure *cl = bio->bi_private;
0403     struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
0404
0405     cache_set_err_on(bio->bi_status, c, "accessing uuids");
0406     bch_bbio_free(bio, c);
0407     closure_put(cl);
0408 }
0409
0410 static void uuid_io_unlock(struct closure *cl)
0411 {
0412     struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
0413
0414     up(&c->uuid_write_mutex);
0415 }
0416
0417 static void uuid_io(struct cache_set *c, blk_opf_t opf, struct bkey *k,
0418             struct closure *parent)
0419 {
0420     struct closure *cl = &c->uuid_write;
0421     struct uuid_entry *u;
0422     unsigned int i;
0423     char buf[80];
0424
0425     BUG_ON(!parent);
0426     down(&c->uuid_write_mutex);
0427     closure_init(cl, parent);
0428
0429     for (i = 0; i < KEY_PTRS(k); i++) {
0430         struct bio *bio = bch_bbio_alloc(c);
0431
0432         bio->bi_opf = opf | REQ_SYNC | REQ_META;
0433         bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
0434
0435         bio->bi_end_io  = uuid_endio;
0436         bio->bi_private = cl;
0437         bch_bio_map(bio, c->uuids);
0438
0439         bch_submit_bbio(bio, c, k, i);
0440
0441         if ((opf & REQ_OP_MASK) != REQ_OP_WRITE)
0442             break;
0443     }
0444
0445     bch_extent_to_text(buf, sizeof(buf), k);
0446     pr_debug("%s UUIDs at %s\n", (opf & REQ_OP_MASK) == REQ_OP_WRITE ?
0447          "wrote" : "read", buf);
0448
0449     for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
0450         if (!bch_is_zero(u->uuid, 16))
0451             pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u\n",
0452                  u - c->uuids, u->uuid, u->label,
0453                  u->first_reg, u->last_reg, u->invalidated);
0454
0455     closure_return_with_destructor(cl, uuid_io_unlock);
0456 }
0457
0458 static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
0459 {
0460     struct bkey *k = &j->uuid_bucket;
0461
0462     if (__bch_btree_ptr_invalid(c, k))
0463         return "bad uuid pointer";
0464
0465     bkey_copy(&c->uuid_bucket, k);
0466     uuid_io(c, REQ_OP_READ, k, cl);
0467
0468     if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
0469         struct uuid_entry_v0    *u0 = (void *) c->uuids;
0470         struct uuid_entry   *u1 = (void *) c->uuids;
0471         int i;
0472
0473         closure_sync(cl);
0474
0475         /*
0476          * Since the new uuid entry is bigger than the old, we have to
0477          * convert starting at the highest memory address and work down
0478          * in order to do it in place
0479          */
0480
0481         for (i = c->nr_uuids - 1;
0482              i >= 0;
0483              --i) {
0484             memcpy(u1[i].uuid,  u0[i].uuid, 16);
0485             memcpy(u1[i].label, u0[i].label, 32);
0486
0487             u1[i].first_reg     = u0[i].first_reg;
0488             u1[i].last_reg      = u0[i].last_reg;
0489             u1[i].invalidated   = u0[i].invalidated;
0490
0491             u1[i].flags = 0;
0492             u1[i].sectors   = 0;
0493         }
0494     }
0495
0496     return NULL;
0497 }
0498
0499 static int __uuid_write(struct cache_set *c)
0500 {
0501     BKEY_PADDED(key) k;
0502     struct closure cl;
0503     struct cache *ca = c->cache;
0504     unsigned int size;
0505
0506     closure_init_stack(&cl);
0507     lockdep_assert_held(&bch_register_lock);
0508
0509     if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true))
0510         return 1;
0511
0512     size =  meta_bucket_pages(&ca->sb) * PAGE_SECTORS;
0513     SET_KEY_SIZE(&k.key, size);
0514     uuid_io(c, REQ_OP_WRITE, &k.key, &cl);
0515     closure_sync(&cl);
0516
0517     /* Only one bucket used for uuid write */
0518     atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written);
0519
0520     bkey_copy(&c->uuid_bucket, &k.key);
0521     bkey_put(c, &k.key);
0522     return 0;
0523 }
0524
0525 int bch_uuid_write(struct cache_set *c)
0526 {
0527     int ret = __uuid_write(c);
0528
0529     if (!ret)
0530         bch_journal_meta(c, NULL);
0531
0532     return ret;
0533 }
0534
0535 static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
0536 {
0537     struct uuid_entry *u;
0538
0539     for (u = c->uuids;
0540          u < c->uuids + c->nr_uuids; u++)
0541         if (!memcmp(u->uuid, uuid, 16))
0542             return u;
0543
0544     return NULL;
0545 }
0546
/*
 * uuid_find_empty() - find the first UUID slot whose uuid is all zeroes
 * @c: cache set to search
 *
 * Return: the first such entry, or NULL if there is none.
 */
static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
    static const char zero_uuid[16] = { 0 };

    return uuid_find(c, zero_uuid);
}
0553
0554 /*
0555  * Bucket priorities/gens:
0556  *
0557  * For each bucket, we store on disk its
0558  *   8 bit gen
0559  *  16 bit priority
0560  *
0561  * See alloc.c for an explanation of the gen. The priority is used to implement
0562  * lru (and in the future other) cache replacement policies; for most purposes
0563  * it's just an opaque integer.
0564  *
0565  * The gens and the priorities don't have a whole lot to do with each other, and
0566  * it's actually the gens that must be written out at specific times - it's no
0567  * big deal if the priorities don't get written, if we lose them we just reuse
0568  * buckets in suboptimal order.
0569  *
0570  * On disk they're stored in a packed array, and in as many buckets as are required
0571  * to fit them all. The buckets we use to store them form a list; the journal
0572  * header points to the first bucket, the first bucket points to the second
0573  * bucket, et cetera.
0574  *
0575  * This code is used by the allocation code; periodically (whenever it runs out
0576  * of buckets to allocate from) the allocation code will invalidate some
0577  * buckets, but it can't use those buckets until their new gens are safely on
0578  * disk.
0579  */
0580
0581 static void prio_endio(struct bio *bio)
0582 {
0583     struct cache *ca = bio->bi_private;
0584
0585     cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
0586     bch_bbio_free(bio, ca->set);
0587     closure_put(&ca->prio);
0588 }
0589
0590 static void prio_io(struct cache *ca, uint64_t bucket, blk_opf_t opf)
0591 {
0592     struct closure *cl = &ca->prio;
0593     struct bio *bio = bch_bbio_alloc(ca->set);
0594
0595     closure_init_stack(cl);
0596
0597     bio->bi_iter.bi_sector  = bucket * ca->sb.bucket_size;
0598     bio_set_dev(bio, ca->bdev);
0599     bio->bi_iter.bi_size    = meta_bucket_bytes(&ca->sb);
0600
0601     bio->bi_end_io  = prio_endio;
0602     bio->bi_private = ca;
0603     bio->bi_opf = opf | REQ_SYNC | REQ_META;
0604     bch_bio_map(bio, ca->disk_buckets);
0605
0606     closure_bio_submit(ca->set, bio, &ca->prio);
0607     closure_sync(cl);
0608 }
0609
0610 int bch_prio_write(struct cache *ca, bool wait)
0611 {
0612     int i;
0613     struct bucket *b;
0614     struct closure cl;
0615
0616     pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu\n",
0617          fifo_used(&ca->free[RESERVE_PRIO]),
0618          fifo_used(&ca->free[RESERVE_NONE]),
0619          fifo_used(&ca->free_inc));
0620
0621     /*
0622      * Pre-check if there are enough free buckets. In the non-blocking
0623      * scenario it's better to fail early rather than starting to allocate
0624      * buckets and do a cleanup later in case of failure.
0625      */
0626     if (!wait) {
0627         size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) +
0628                    fifo_used(&ca->free[RESERVE_NONE]);
0629         if (prio_buckets(ca) > avail)
0630             return -ENOMEM;
0631     }
0632
0633     closure_init_stack(&cl);
0634
0635     lockdep_assert_held(&ca->set->bucket_lock);
0636
0637     ca->disk_buckets->seq++;
0638
0639     atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
0640             &ca->meta_sectors_written);
0641
0642     for (i = prio_buckets(ca) - 1; i >= 0; --i) {
0643         long bucket;
0644         struct prio_set *p = ca->disk_buckets;
0645         struct bucket_disk *d = p->data;
0646         struct bucket_disk *end = d + prios_per_bucket(ca);
0647
0648         for (b = ca->buckets + i * prios_per_bucket(ca);
0649              b < ca->buckets + ca->sb.nbuckets && d < end;
0650              b++, d++) {
0651             d->prio = cpu_to_le16(b->prio);
0652             d->gen = b->gen;
0653         }
0654
0655         p->next_bucket  = ca->prio_buckets[i + 1];
0656         p->magic    = pset_magic(&ca->sb);
0657         p->csum     = bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8);
0658
0659         bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
0660         BUG_ON(bucket == -1);
0661
0662         mutex_unlock(&ca->set->bucket_lock);
0663         prio_io(ca, bucket, REQ_OP_WRITE);
0664         mutex_lock(&ca->set->bucket_lock);
0665
0666         ca->prio_buckets[i] = bucket;
0667         atomic_dec_bug(&ca->buckets[bucket].pin);
0668     }
0669
0670     mutex_unlock(&ca->set->bucket_lock);
0671
0672     bch_journal_meta(ca->set, &cl);
0673     closure_sync(&cl);
0674
0675     mutex_lock(&ca->set->bucket_lock);
0676
0677     /*
0678      * Don't want the old priorities to get garbage collected until after we
0679      * finish writing the new ones, and they're journalled
0680      */
0681     for (i = 0; i < prio_buckets(ca); i++) {
0682         if (ca->prio_last_buckets[i])
0683             __bch_bucket_free(ca,
0684                 &ca->buckets[ca->prio_last_buckets[i]]);
0685
0686         ca->prio_last_buckets[i] = ca->prio_buckets[i];
0687     }
0688     return 0;
0689 }
0690
0691 static int prio_read(struct cache *ca, uint64_t bucket)
0692 {
0693     struct prio_set *p = ca->disk_buckets;
0694     struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
0695     struct bucket *b;
0696     unsigned int bucket_nr = 0;
0697     int ret = -EIO;
0698
0699     for (b = ca->buckets;
0700          b < ca->buckets + ca->sb.nbuckets;
0701          b++, d++) {
0702         if (d == end) {
0703             ca->prio_buckets[bucket_nr] = bucket;
0704             ca->prio_last_buckets[bucket_nr] = bucket;
0705             bucket_nr++;
0706
0707             prio_io(ca, bucket, REQ_OP_READ);
0708
0709             if (p->csum !=
0710                 bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8)) {
0711                 pr_warn("bad csum reading priorities\n");
0712                 goto out;
0713             }
0714
0715             if (p->magic != pset_magic(&ca->sb)) {
0716                 pr_warn("bad magic reading priorities\n");
0717                 goto out;
0718             }
0719
0720             bucket = p->next_bucket;
0721             d = p->data;
0722         }
0723
0724         b->prio = le16_to_cpu(d->prio);
0725         b->gen = b->last_gc = d->gen;
0726     }
0727
0728     ret = 0;
0729 out:
0730     return ret;
0731 }
0732
0733 /* Bcache device */
0734
0735 static int open_dev(struct block_device *b, fmode_t mode)
0736 {
0737     struct bcache_device *d = b->bd_disk->private_data;
0738
0739     if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
0740         return -ENXIO;
0741
0742     closure_get(&d->cl);
0743     return 0;
0744 }
0745
0746 static void release_dev(struct gendisk *b, fmode_t mode)
0747 {
0748     struct bcache_device *d = b->private_data;
0749
0750     closure_put(&d->cl);
0751 }
0752
0753 static int ioctl_dev(struct block_device *b, fmode_t mode,
0754              unsigned int cmd, unsigned long arg)
0755 {
0756     struct bcache_device *d = b->bd_disk->private_data;
0757
0758     return d->ioctl(d, mode, cmd, arg);
0759 }
0760
0761 static const struct block_device_operations bcache_cached_ops = {
0762     .submit_bio = cached_dev_submit_bio,
0763     .open       = open_dev,
0764     .release    = release_dev,
0765     .ioctl      = ioctl_dev,
0766     .owner      = THIS_MODULE,
0767 };
0768
0769 static const struct block_device_operations bcache_flash_ops = {
0770     .submit_bio = flash_dev_submit_bio,
0771     .open       = open_dev,
0772     .release    = release_dev,
0773     .ioctl      = ioctl_dev,
0774     .owner      = THIS_MODULE,
0775 };
0776
0777 void bcache_device_stop(struct bcache_device *d)
0778 {
0779     if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
0780         /*
0781          * closure_fn set to
0782          * - cached device: cached_dev_flush()
0783          * - flash dev: flash_dev_flush()
0784          */
0785         closure_queue(&d->cl);
0786 }
0787
0788 static void bcache_device_unlink(struct bcache_device *d)
0789 {
0790     lockdep_assert_held(&bch_register_lock);
0791
0792     if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
0793         struct cache *ca = d->c->cache;
0794
0795         sysfs_remove_link(&d->c->kobj, d->name);
0796         sysfs_remove_link(&d->kobj, "cache");
0797
0798         bd_unlink_disk_holder(ca->bdev, d->disk);
0799     }
0800 }
0801
0802 static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
0803                    const char *name)
0804 {
0805     struct cache *ca = c->cache;
0806     int ret;
0807
0808     bd_link_disk_holder(ca->bdev, d->disk);
0809
0810     snprintf(d->name, BCACHEDEVNAME_SIZE,
0811          "%s%u", name, d->id);
0812
0813     ret = sysfs_create_link(&d->kobj, &c->kobj, "cache");
0814     if (ret < 0)
0815         pr_err("Couldn't create device -> cache set symlink\n");
0816
0817     ret = sysfs_create_link(&c->kobj, &d->kobj, d->name);
0818     if (ret < 0)
0819         pr_err("Couldn't create cache set -> device symlink\n");
0820
0821     clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
0822 }
0823
0824 static void bcache_device_detach(struct bcache_device *d)
0825 {
0826     lockdep_assert_held(&bch_register_lock);
0827
0828     atomic_dec(&d->c->attached_dev_nr);
0829
0830     if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
0831         struct uuid_entry *u = d->c->uuids + d->id;
0832
0833         SET_UUID_FLASH_ONLY(u, 0);
0834         memcpy(u->uuid, invalid_uuid, 16);
0835         u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
0836         bch_uuid_write(d->c);
0837     }
0838
0839     bcache_device_unlink(d);
0840
0841     d->c->devices[d->id] = NULL;
0842     closure_put(&d->c->caching);
0843     d->c = NULL;
0844 }
0845
0846 static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
0847                  unsigned int id)
0848 {
0849     d->id = id;
0850     d->c = c;
0851     c->devices[id] = d;
0852
0853     if (id >= c->devices_max_used)
0854         c->devices_max_used = id + 1;
0855
0856     closure_get(&c->caching);
0857 }
0858
0859 static inline int first_minor_to_idx(int first_minor)
0860 {
0861     return (first_minor/BCACHE_MINORS);
0862 }
0863
0864 static inline int idx_to_first_minor(int idx)
0865 {
0866     return (idx * BCACHE_MINORS);
0867 }
0868
0869 static void bcache_device_free(struct bcache_device *d)
0870 {
0871     struct gendisk *disk = d->disk;
0872
0873     lockdep_assert_held(&bch_register_lock);
0874
0875     if (disk)
0876         pr_info("%s stopped\n", disk->disk_name);
0877     else
0878         pr_err("bcache device (NULL gendisk) stopped\n");
0879
0880     if (d->c)
0881         bcache_device_detach(d);
0882
0883     if (disk) {
0884         ida_simple_remove(&bcache_device_idx,
0885                   first_minor_to_idx(disk->first_minor));
0886         put_disk(disk);
0887     }
0888
0889     bioset_exit(&d->bio_split);
0890     kvfree(d->full_dirty_stripes);
0891     kvfree(d->stripe_sectors_dirty);
0892
0893     closure_debug_destroy(&d->cl);
0894 }
0895
0896 static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
0897         sector_t sectors, struct block_device *cached_bdev,
0898         const struct block_device_operations *ops)
0899 {
0900     struct request_queue *q;
0901     const size_t max_stripes = min_t(size_t, INT_MAX,
0902                      SIZE_MAX / sizeof(atomic_t));
0903     uint64_t n;
0904     int idx;
0905
0906     if (!d->stripe_size)
0907         d->stripe_size = 1 << 31;
0908
0909     n = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
0910     if (!n || n > max_stripes) {
0911         pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n",
0912             n);
0913         return -ENOMEM;
0914     }
0915     d->nr_stripes = n;
0916
0917     n = d->nr_stripes * sizeof(atomic_t);
0918     d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
0919     if (!d->stripe_sectors_dirty)
0920         return -ENOMEM;
0921
0922     n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
0923     d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
0924     if (!d->full_dirty_stripes)
0925         goto out_free_stripe_sectors_dirty;
0926
0927     idx = ida_simple_get(&bcache_device_idx, 0,
0928                 BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
0929     if (idx < 0)
0930         goto out_free_full_dirty_stripes;
0931
0932     if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
0933             BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
0934         goto out_ida_remove;
0935
0936     d->disk = blk_alloc_disk(NUMA_NO_NODE);
0937     if (!d->disk)
0938         goto out_bioset_exit;
0939
0940     set_capacity(d->disk, sectors);
0941     snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
0942
0943     d->disk->major      = bcache_major;
0944     d->disk->first_minor    = idx_to_first_minor(idx);
0945     d->disk->minors     = BCACHE_MINORS;
0946     d->disk->fops       = ops;
0947     d->disk->private_data   = d;
0948
0949     q = d->disk->queue;
0950     q->limits.max_hw_sectors    = UINT_MAX;
0951     q->limits.max_sectors       = UINT_MAX;
0952     q->limits.max_segment_size  = UINT_MAX;
0953     q->limits.max_segments      = BIO_MAX_VECS;
0954     blk_queue_max_discard_sectors(q, UINT_MAX);
0955     q->limits.discard_granularity   = 512;
0956     q->limits.io_min        = block_size;
0957     q->limits.logical_block_size    = block_size;
0958     q->limits.physical_block_size   = block_size;
0959
0960     if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) {
0961         /*
0962          * This should only happen with BCACHE_SB_VERSION_BDEV.
0963          * Block/page size is checked for BCACHE_SB_VERSION_CDEV.
0964          */
0965         pr_info("%s: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n",
0966             d->disk->disk_name, q->limits.logical_block_size,
0967             PAGE_SIZE, bdev_logical_block_size(cached_bdev));
0968
0969         /* This also adjusts physical block size/min io size if needed */
0970         blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev));
0971     }
0972
0973     blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
0974     blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
0975
0976     blk_queue_write_cache(q, true, true);
0977
0978     return 0;
0979
0980 out_bioset_exit:
0981     bioset_exit(&d->bio_split);
0982 out_ida_remove:
0983     ida_simple_remove(&bcache_device_idx, idx);
0984 out_free_full_dirty_stripes:
0985     kvfree(d->full_dirty_stripes);
0986 out_free_stripe_sectors_dirty:
0987     kvfree(d->stripe_sectors_dirty);
0988     return -ENOMEM;
0989
0990 }
0991
0992 /* Cached device */
0993
0994 static void calc_cached_dev_sectors(struct cache_set *c)
0995 {
0996     uint64_t sectors = 0;
0997     struct cached_dev *dc;
0998
0999     list_for_each_entry(dc, &c->cached_devs, list)
1000         sectors += bdev_nr_sectors(dc->bdev);
1001
1002     c->cached_dev_sectors = sectors;
1003 }
1004
1005 #define BACKING_DEV_OFFLINE_TIMEOUT 5
1006 static int cached_dev_status_update(void *arg)
1007 {
1008     struct cached_dev *dc = arg;
1009     struct request_queue *q;
1010
1011     /*
1012      * If this delayed worker is stopping outside, directly quit here.
1013      * dc->io_disable might be set via sysfs interface, so check it
1014      * here too.
1015      */
1016     while (!kthread_should_stop() && !dc->io_disable) {
1017         q = bdev_get_queue(dc->bdev);
1018         if (blk_queue_dying(q))
1019             dc->offline_seconds++;
1020         else
1021             dc->offline_seconds = 0;
1022
1023         if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
1024             pr_err("%pg: device offline for %d seconds\n",
1025                    dc->bdev,
1026                    BACKING_DEV_OFFLINE_TIMEOUT);
1027             pr_err("%s: disable I/O request due to backing device offline\n",
1028                    dc->disk.name);
1029             dc->io_disable = true;
1030             /* let others know earlier that io_disable is true */
1031             smp_mb();
1032             bcache_device_stop(&dc->disk);
1033             break;
1034         }
1035         schedule_timeout_interruptible(HZ);
1036     }
1037
1038     wait_for_kthread_stop();
1039     return 0;
1040 }
1041
1042
1043 int bch_cached_dev_run(struct cached_dev *dc)
1044 {
1045     int ret = 0;
1046     struct bcache_device *d = &dc->disk;
1047     char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL);
1048     char *env[] = {
1049         "DRIVER=bcache",
1050         kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
1051         kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf ? : ""),
1052         NULL,
1053     };
1054
1055     if (dc->io_disable) {
1056         pr_err("I/O disabled on cached dev %pg\n", dc->bdev);
1057         ret = -EIO;
1058         goto out;
1059     }
1060
1061     if (atomic_xchg(&dc->running, 1)) {
1062         pr_info("cached dev %pg is running already\n", dc->bdev);
1063         ret = -EBUSY;
1064         goto out;
1065     }
1066
1067     if (!d->c &&
1068         BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
1069         struct closure cl;
1070
1071         closure_init_stack(&cl);
1072
1073         SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
1074         bch_write_bdev_super(dc, &cl);
1075         closure_sync(&cl);
1076     }
1077
1078     ret = add_disk(d->disk);
1079     if (ret)
1080         goto out;
1081     bd_link_disk_holder(dc->bdev, dc->disk.disk);
1082     /*
1083      * won't show up in the uevent file, use udevadm monitor -e instead
1084      * only class / kset properties are persistent
1085      */
1086     kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
1087
1088     if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
1089         sysfs_create_link(&disk_to_dev(d->disk)->kobj,
1090                   &d->kobj, "bcache")) {
1091         pr_err("Couldn't create bcache dev <-> disk sysfs symlinks\n");
1092         ret = -ENOMEM;
1093         goto out;
1094     }
1095
1096     dc->status_update_thread = kthread_run(cached_dev_status_update,
1097                            dc, "bcache_status_update");
1098     if (IS_ERR(dc->status_update_thread)) {
1099         pr_warn("failed to create bcache_status_update kthread, continue to run without monitoring backing device status\n");
1100     }
1101
1102 out:
1103     kfree(env[1]);
1104     kfree(env[2]);
1105     kfree(buf);
1106     return ret;
1107 }
1108
1109 /*
1110  * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed
1111  * work dc->writeback_rate_update is running. Wait until the routine
1112  * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to
1113  * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out
1114  * seconds, give up waiting here and continue to cancel it too.
1115  */
1116 static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
1117 {
1118     int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;
1119
1120     do {
1121         if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
1122                   &dc->disk.flags))
1123             break;
1124         time_out--;
1125         schedule_timeout_interruptible(1);
1126     } while (time_out > 0);
1127
1128     if (time_out == 0)
1129         pr_warn("give up waiting for dc->writeback_write_update to quit\n");
1130
1131     cancel_delayed_work_sync(&dc->writeback_rate_update);
1132 }
1133
1134 static void cached_dev_detach_finish(struct work_struct *w)
1135 {
1136     struct cached_dev *dc = container_of(w, struct cached_dev, detach);
1137     struct cache_set *c = dc->disk.c;
1138
1139     BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
1140     BUG_ON(refcount_read(&dc->count));
1141
1142
1143     if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
1144         cancel_writeback_rate_update_dwork(dc);
1145
1146     if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
1147         kthread_stop(dc->writeback_thread);
1148         dc->writeback_thread = NULL;
1149     }
1150
1151     mutex_lock(&bch_register_lock);
1152
1153     bcache_device_detach(&dc->disk);
1154     list_move(&dc->list, &uncached_devices);
1155     calc_cached_dev_sectors(c);
1156
1157     clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
1158     clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
1159
1160     mutex_unlock(&bch_register_lock);
1161
1162     pr_info("Caching disabled for %pg\n", dc->bdev);
1163
1164     /* Drop ref we took in cached_dev_detach() */
1165     closure_put(&dc->disk.cl);
1166 }
1167
1168 void bch_cached_dev_detach(struct cached_dev *dc)
1169 {
1170     lockdep_assert_held(&bch_register_lock);
1171
1172     if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
1173         return;
1174
1175     if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
1176         return;
1177
1178     /*
1179      * Block the device from being closed and freed until we're finished
1180      * detaching
1181      */
1182     closure_get(&dc->disk.cl);
1183
1184     bch_writeback_queue(dc);
1185
1186     cached_dev_put(dc);
1187 }
1188
/*
 * Attach backing device dc to cache set c.
 *
 * @dc:       cached (backing) device to attach
 * @c:        candidate cache set
 * @set_uuid: when non-NULL, the set UUID compared against c->set_uuid;
 *            when NULL, dc->sb.set_uuid is compared instead
 *
 * Returns 0 on success; -ENOENT when the set UUID does not match or a dirty
 * device has no UUID entry in the set; -EINVAL for the rejected-attach cases
 * logged below (already attached, set stopping, block size too small,
 * duplicate UUID, no free UUID slot); -ENOMEM when the writeback facilities
 * cannot be started; or the error from bch_cached_dev_run() (its -EBUSY is
 * not treated as a failure).
 */
int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
              uint8_t *set_uuid)
{
    uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
    struct uuid_entry *u;
    struct cached_dev *exist_dc, *t;
    int ret = 0;

    /* This cache set is not the one the device (or caller) asked for */
    if ((set_uuid && memcmp(set_uuid, c->set_uuid, 16)) ||
        (!set_uuid && memcmp(dc->sb.set_uuid, c->set_uuid, 16)))
        return -ENOENT;

    if (dc->disk.c) {
        pr_err("Can't attach %pg: already attached\n", dc->bdev);
        return -EINVAL;
    }

    if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
        pr_err("Can't attach %pg: shutting down\n", dc->bdev);
        return -EINVAL;
    }

    if (dc->sb.block_size < c->cache->sb.block_size) {
        /* Will die */
        pr_err("Couldn't attach %pg: block size less than set's block size\n",
               dc->bdev);
        return -EINVAL;
    }

    /* Check whether already attached */
    list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
        if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
            pr_err("Tried to attach %pg but duplicate UUID already attached\n",
                dc->bdev);

            return -EINVAL;
        }
    }

    u = uuid_find(c, dc->sb.uuid);

    /*
     * An existing entry for a stale or never-cached device is overwritten
     * with invalid_uuid and dropped, so a fresh slot is picked below.
     */
    if (u &&
        (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
         BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
        memcpy(u->uuid, invalid_uuid, 16);
        u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
        u = NULL;
    }

    if (!u) {
        /* A dirty device must already be known to the set */
        if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
            pr_err("Couldn't find uuid for %pg in set\n", dc->bdev);
            return -ENOENT;
        }

        u = uuid_find_empty(c);
        if (!u) {
            pr_err("Not caching %pg, no room for UUID\n", dc->bdev);
            return -EINVAL;
        }
    }

    /*
     * Deadlocks since we're called via sysfs...
     * sysfs_remove_file(&dc->kobj, &sysfs_attach);
     */

    if (bch_is_zero(u->uuid, 16)) {
        /*
         * Empty slot: fill it in, write the UUIDs, then mark the
         * backing device clean and owned by this set and wait for its
         * superblock write to finish.
         */
        struct closure cl;

        closure_init_stack(&cl);

        memcpy(u->uuid, dc->sb.uuid, 16);
        memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
        u->first_reg = u->last_reg = rtime;
        bch_uuid_write(c);

        memcpy(dc->sb.set_uuid, c->set_uuid, 16);
        SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);

        bch_write_bdev_super(dc, &cl);
        closure_sync(&cl);
    } else {
        /* Known device: only the last registration time changes */
        u->last_reg = rtime;
        bch_uuid_write(c);
    }

    /* The device id is the index of the UUID entry within c->uuids */
    bcache_device_attach(&dc->disk, c, u - c->uuids);
    list_move(&dc->list, &c->cached_devs);
    calc_cached_dev_sectors(c);

    /*
     * dc->c must be set before dc->count != 0 - paired with the mb in
     * cached_dev_get()
     */
    smp_wmb();
    refcount_set(&dc->count, 1);

    /* Block writeback thread, but spawn it */
    down_write(&dc->writeback_lock);
    if (bch_cached_dev_writeback_start(dc)) {
        up_write(&dc->writeback_lock);
        pr_err("Couldn't start writeback facilities for %s\n",
               dc->disk.disk->disk_name);
        return -ENOMEM;
    }

    if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
        atomic_set(&dc->has_dirty, 1);
        bch_writeback_queue(dc);
    }

    bch_sectors_dirty_init(&dc->disk);

    /* -EBUSY means the device was already running; carry on in that case */
    ret = bch_cached_dev_run(dc);
    if (ret && (ret != -EBUSY)) {
        up_write(&dc->writeback_lock);
        /*
         * bch_register_lock is held, bcache_device_stop() is not
         * able to be directly called. The kthread and kworker
         * created previously in bch_cached_dev_writeback_start()
         * have to be stopped manually here.
         */
        kthread_stop(dc->writeback_thread);
        cancel_writeback_rate_update_dwork(dc);
        pr_err("Couldn't run cached device %pg\n", dc->bdev);
        return ret;
    }

    bcache_device_link(&dc->disk, c, "bdev");
    atomic_inc(&c->attached_dev_nr);

    if (bch_has_feature_obso_large_bucket(&(c->cache->sb))) {
        pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n");
        pr_err("Please update to the latest bcache-tools to create the cache device\n");
        set_disk_ro(dc->disk.disk, 1);
    }

    /* Allow the writeback thread to proceed */
    up_write(&dc->writeback_lock);

    pr_info("Caching %pg as %s on set %pU\n",
        dc->bdev,
        dc->disk.disk->disk_name,
        dc->disk.c->set_uuid);
    return 0;
}
1336
1337 /* when dc->disk.kobj released */
1338 void bch_cached_dev_release(struct kobject *kobj)
1339 {
1340     struct cached_dev *dc = container_of(kobj, struct cached_dev,
1341                          disk.kobj);
1342     kfree(dc);
1343     module_put(THIS_MODULE);
1344 }
1345
1346 static void cached_dev_free(struct closure *cl)
1347 {
1348     struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1349
1350     if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
1351         cancel_writeback_rate_update_dwork(dc);
1352
1353     if (!IS_ERR_OR_NULL(dc->writeback_thread))
1354         kthread_stop(dc->writeback_thread);
1355     if (!IS_ERR_OR_NULL(dc->status_update_thread))
1356         kthread_stop(dc->status_update_thread);
1357
1358     mutex_lock(&bch_register_lock);
1359
1360     if (atomic_read(&dc->running)) {
1361         bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
1362         del_gendisk(dc->disk.disk);
1363     }
1364     bcache_device_free(&dc->disk);
1365     list_del(&dc->list);
1366
1367     mutex_unlock(&bch_register_lock);
1368
1369     if (dc->sb_disk)
1370         put_page(virt_to_page(dc->sb_disk));
1371
1372     if (!IS_ERR_OR_NULL(dc->bdev))
1373         blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1374
1375     wake_up(&unregister_wait);
1376
1377     kobject_put(&dc->disk.kobj);
1378 }
1379
1380 static void cached_dev_flush(struct closure *cl)
1381 {
1382     struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1383     struct bcache_device *d = &dc->disk;
1384
1385     mutex_lock(&bch_register_lock);
1386     bcache_device_unlink(d);
1387     mutex_unlock(&bch_register_lock);
1388
1389     bch_cache_accounting_destroy(&dc->accounting);
1390     kobject_del(&d->kobj);
1391
1392     continue_at(cl, cached_dev_free, system_wq);
1393 }
1394
1395 static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
1396 {
1397     int ret;
1398     struct io *io;
1399     struct request_queue *q = bdev_get_queue(dc->bdev);
1400
1401     __module_get(THIS_MODULE);
1402     INIT_LIST_HEAD(&dc->list);
1403     closure_init(&dc->disk.cl, NULL);
1404     set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
1405     kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
1406     INIT_WORK(&dc->detach, cached_dev_detach_finish);
1407     sema_init(&dc->sb_write_mutex, 1);
1408     INIT_LIST_HEAD(&dc->io_lru);
1409     spin_lock_init(&dc->io_lock);
1410     bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
1411
1412     dc->sequential_cutoff       = 4 << 20;
1413
1414     for (io = dc->io; io < dc->io + RECENT_IO; io++) {
1415         list_add(&io->lru, &dc->io_lru);
1416         hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1417     }
1418
1419     dc->disk.stripe_size = q->limits.io_opt >> 9;
1420
1421     if (dc->disk.stripe_size)
1422         dc->partial_stripes_expensive =
1423             q->limits.raid_partial_stripes_expensive;
1424
1425     ret = bcache_device_init(&dc->disk, block_size,
1426              bdev_nr_sectors(dc->bdev) - dc->sb.data_offset,
1427              dc->bdev, &bcache_cached_ops);
1428     if (ret)
1429         return ret;
1430
1431     blk_queue_io_opt(dc->disk.disk->queue,
1432         max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q)));
1433
1434     atomic_set(&dc->io_errors, 0);
1435     dc->io_disable = false;
1436     dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
1437     /* default to auto */
1438     dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO;
1439
1440     bch_cached_dev_request_init(dc);
1441     bch_cached_dev_writeback_init(dc);
1442     return 0;
1443 }
1444
1445 /* Cached device - bcache superblock */
1446
1447 static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
1448                  struct block_device *bdev,
1449                  struct cached_dev *dc)
1450 {
1451     const char *err = "cannot allocate memory";
1452     struct cache_set *c;
1453     int ret = -ENOMEM;
1454
1455     memcpy(&dc->sb, sb, sizeof(struct cache_sb));
1456     dc->bdev = bdev;
1457     dc->bdev->bd_holder = dc;
1458     dc->sb_disk = sb_disk;
1459
1460     if (cached_dev_init(dc, sb->block_size << 9))
1461         goto err;
1462
1463     err = "error creating kobject";
1464     if (kobject_add(&dc->disk.kobj, bdev_kobj(bdev), "bcache"))
1465         goto err;
1466     if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
1467         goto err;
1468
1469     pr_info("registered backing device %pg\n", dc->bdev);
1470
1471     list_add(&dc->list, &uncached_devices);
1472     /* attach to a matched cache set if it exists */
1473     list_for_each_entry(c, &bch_cache_sets, list)
1474         bch_cached_dev_attach(dc, c, NULL);
1475
1476     if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
1477         BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) {
1478         err = "failed to run cached device";
1479         ret = bch_cached_dev_run(dc);
1480         if (ret)
1481             goto err;
1482     }
1483
1484     return 0;
1485 err:
1486     pr_notice("error %pg: %s\n", dc->bdev, err);
1487     bcache_device_stop(&dc->disk);
1488     return ret;
1489 }
1490
1491 /* Flash only volumes */
1492
1493 /* When d->kobj released */
1494 void bch_flash_dev_release(struct kobject *kobj)
1495 {
1496     struct bcache_device *d = container_of(kobj, struct bcache_device,
1497                            kobj);
1498     kfree(d);
1499 }
1500
1501 static void flash_dev_free(struct closure *cl)
1502 {
1503     struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1504
1505     mutex_lock(&bch_register_lock);
1506     atomic_long_sub(bcache_dev_sectors_dirty(d),
1507             &d->c->flash_dev_dirty_sectors);
1508     del_gendisk(d->disk);
1509     bcache_device_free(d);
1510     mutex_unlock(&bch_register_lock);
1511     kobject_put(&d->kobj);
1512 }
1513
1514 static void flash_dev_flush(struct closure *cl)
1515 {
1516     struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1517
1518     mutex_lock(&bch_register_lock);
1519     bcache_device_unlink(d);
1520     mutex_unlock(&bch_register_lock);
1521     kobject_del(&d->kobj);
1522     continue_at(cl, flash_dev_free, system_wq);
1523 }
1524
1525 static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
1526 {
1527     int err = -ENOMEM;
1528     struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
1529                       GFP_KERNEL);
1530     if (!d)
1531         goto err_ret;
1532
1533     closure_init(&d->cl, NULL);
1534     set_closure_fn(&d->cl, flash_dev_flush, system_wq);
1535
1536     kobject_init(&d->kobj, &bch_flash_dev_ktype);
1537
1538     if (bcache_device_init(d, block_bytes(c->cache), u->sectors,
1539             NULL, &bcache_flash_ops))
1540         goto err;
1541
1542     bcache_device_attach(d, c, u - c->uuids);
1543     bch_sectors_dirty_init(d);
1544     bch_flash_dev_request_init(d);
1545     err = add_disk(d->disk);
1546     if (err)
1547         goto err;
1548
1549     err = kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache");
1550     if (err)
1551         goto err;
1552
1553     bcache_device_link(d, c, "volume");
1554
1555     if (bch_has_feature_obso_large_bucket(&c->cache->sb)) {
1556         pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n");
1557         pr_err("Please update to the latest bcache-tools to create the cache device\n");
1558         set_disk_ro(d->disk, 1);
1559     }
1560
1561     return 0;
1562 err:
1563     kobject_put(&d->kobj);
1564 err_ret:
1565     return err;
1566 }
1567
1568 static int flash_devs_run(struct cache_set *c)
1569 {
1570     int ret = 0;
1571     struct uuid_entry *u;
1572
1573     for (u = c->uuids;
1574          u < c->uuids + c->nr_uuids && !ret;
1575          u++)
1576         if (UUID_FLASH_ONLY(u))
1577             ret = flash_dev_run(c, u);
1578
1579     return ret;
1580 }
1581
1582 int bch_flash_dev_create(struct cache_set *c, uint64_t size)
1583 {
1584     struct uuid_entry *u;
1585
1586     if (test_bit(CACHE_SET_STOPPING, &c->flags))
1587         return -EINTR;
1588
1589     if (!test_bit(CACHE_SET_RUNNING, &c->flags))
1590         return -EPERM;
1591
1592     u = uuid_find_empty(c);
1593     if (!u) {
1594         pr_err("Can't create volume, no room for UUID\n");
1595         return -EINVAL;
1596     }
1597
1598     get_random_bytes(u->uuid, 16);
1599     memset(u->label, 0, 32);
1600     u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
1601
1602     SET_UUID_FLASH_ONLY(u, 1);
1603     u->sectors = size >> 9;
1604
1605     bch_uuid_write(c);
1606
1607     return flash_dev_run(c, u);
1608 }
1609
1610 bool bch_cached_dev_error(struct cached_dev *dc)
1611 {
1612     if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
1613         return false;
1614
1615     dc->io_disable = true;
1616     /* make others know io_disable is true earlier */
1617     smp_mb();
1618
1619     pr_err("stop %s: too many IO errors on backing device %pg\n",
1620            dc->disk.disk->disk_name, dc->bdev);
1621
1622     bcache_device_stop(&dc->disk);
1623     return true;
1624 }
1625
1626 /* Cache set */
1627
1628 __printf(2, 3)
1629 bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
1630 {
1631     struct va_format vaf;
1632     va_list args;
1633
1634     if (c->on_error != ON_ERROR_PANIC &&
1635         test_bit(CACHE_SET_STOPPING, &c->flags))
1636         return false;
1637
1638     if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
1639         pr_info("CACHE_SET_IO_DISABLE already set\n");
1640
1641     /*
1642      * XXX: we can be called from atomic context
1643      * acquire_console_sem();
1644      */
1645
1646     va_start(args, fmt);
1647
1648     vaf.fmt = fmt;
1649     vaf.va = &args;
1650
1651     pr_err("error on %pU: %pV, disabling caching\n",
1652            c->set_uuid, &vaf);
1653
1654     va_end(args);
1655
1656     if (c->on_error == ON_ERROR_PANIC)
1657         panic("panic forced after error\n");
1658
1659     bch_cache_set_unregister(c);
1660     return true;
1661 }
1662
1663 /* When c->kobj released */
1664 void bch_cache_set_release(struct kobject *kobj)
1665 {
1666     struct cache_set *c = container_of(kobj, struct cache_set, kobj);
1667
1668     kfree(c);
1669     module_put(THIS_MODULE);
1670 }
1671
/*
 * Closure function of c->cl (installed by bch_cache_set_alloc()): releases
 * the resources owned by a cache set, takes it off the list it is linked on
 * through c->list, wakes waiters on unregister_wait and drops the reference
 * on c->kobj.
 */
static void cache_set_free(struct closure *cl)
{
    struct cache_set *c = container_of(cl, struct cache_set, cl);
    struct cache *ca;

    debugfs_remove(c->debug);

    bch_open_buckets_free(c);
    bch_btree_cache_free(c);
    bch_journal_free(c);

    mutex_lock(&bch_register_lock);
    bch_bset_sort_state_free(&c->sort);
    /*
     * NOTE(review): c->cache is dereferenced here, before the NULL check
     * on it just below - confirm it cannot be NULL at this point.
     */
    free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb)));

    /* Break the cache <-> set link and drop the cache's kobject ref */
    ca = c->cache;
    if (ca) {
        ca->set = NULL;
        c->cache = NULL;
        kobject_put(&ca->kobj);
    }


    if (c->moving_gc_wq)
        destroy_workqueue(c->moving_gc_wq);
    bioset_exit(&c->bio_split);
    mempool_exit(&c->fill_iter);
    mempool_exit(&c->bio_meta);
    mempool_exit(&c->search);
    kfree(c->devices);

    list_del(&c->list);
    mutex_unlock(&bch_register_lock);

    pr_info("Cache set %pU unregistered\n", c->set_uuid);
    wake_up(&unregister_wait);

    closure_debug_destroy(&c->cl);
    kobject_put(&c->kobj);
}
1712
/*
 * Closure function continued from __cache_set_unregister() on c->caching:
 * removes the sysfs presence of the cache set, stops the gc and allocator
 * threads, writes out dirty btree nodes (unless I/O is disabled on the set)
 * and flushes the last journal entry before returning the closure.
 */
static void cache_set_flush(struct closure *cl)
{
    struct cache_set *c = container_of(cl, struct cache_set, caching);
    struct cache *ca = c->cache;
    struct btree *b;

    bch_cache_accounting_destroy(&c->accounting);

    kobject_put(&c->internal);
    kobject_del(&c->kobj);

    if (!IS_ERR_OR_NULL(c->gc_thread))
        kthread_stop(c->gc_thread);

    /* Put the root node on btree_cache so the loop below visits it too */
    if (!IS_ERR_OR_NULL(c->root))
        list_add(&c->root->list, &c->btree_cache);

    /*
     * Avoid flushing cached nodes if cache set is retiring
     * due to too many I/O errors detected.
     */
    if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags))
        list_for_each_entry(b, &c->btree_cache, list) {
            mutex_lock(&b->write_lock);
            if (btree_node_dirty(b))
                __bch_btree_node_write(b, NULL);
            mutex_unlock(&b->write_lock);
        }

    /* NOTE(review): ca is used without a NULL check - confirm c->cache is always set here */
    if (ca->alloc_thread)
        kthread_stop(ca->alloc_thread);

    if (c->journal.cur) {
        cancel_delayed_work_sync(&c->journal.work);
        /* flush last journal entry if needed */
        c->journal.work.work.func(&c->journal.work.work);
    }

    closure_return(cl);
}
1753
1754 /*
1755  * This function is only called when CACHE_SET_IO_DISABLE is set, which means
1756  * cache set is unregistering due to too many I/O errors. In this condition,
1757  * the bcache device might be stopped, it depends on stop_when_cache_set_failed
1758  * value and whether the broken cache has dirty data:
1759  *
1760  * dc->stop_when_cache_set_failed    dc->has_dirty   stop bcache device
1761  *  BCH_CACHED_STOP_AUTO               0               NO
1762  *  BCH_CACHED_STOP_AUTO               1               YES
1763  *  BCH_CACHED_DEV_STOP_ALWAYS         0               YES
1764  *  BCH_CACHED_DEV_STOP_ALWAYS         1               YES
1765  *
1766  * The expected behavior is, if stop_when_cache_set_failed is configured to
1767  * "auto" via sysfs interface, the bcache device will not be stopped if the
1768  * backing device is clean on the broken cache device.
1769  */
1770 static void conditional_stop_bcache_device(struct cache_set *c,
1771                        struct bcache_device *d,
1772                        struct cached_dev *dc)
1773 {
1774     if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
1775         pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.\n",
1776             d->disk->disk_name, c->set_uuid);
1777         bcache_device_stop(d);
1778     } else if (atomic_read(&dc->has_dirty)) {
1779         /*
1780          * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
1781          * and dc->has_dirty == 1
1782          */
1783         pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.\n",
1784             d->disk->disk_name);
1785         /*
1786          * There might be a small time gap that cache set is
1787          * released but bcache device is not. Inside this time
1788          * gap, regular I/O requests will directly go into
1789          * backing device as no cache set attached to. This
1790          * behavior may also introduce potential inconsistence
1791          * data in writeback mode while cache is dirty.
1792          * Therefore before calling bcache_device_stop() due
1793          * to a broken cache device, dc->io_disable should be
1794          * explicitly set to true.
1795          */
1796         dc->io_disable = true;
1797         /* make others know io_disable is true earlier */
1798         smp_mb();
1799         bcache_device_stop(d);
1800     } else {
1801         /*
1802          * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
1803          * and dc->has_dirty == 0
1804          */
1805         pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.\n",
1806             d->disk->disk_name);
1807     }
1808 }
1809
1810 static void __cache_set_unregister(struct closure *cl)
1811 {
1812     struct cache_set *c = container_of(cl, struct cache_set, caching);
1813     struct cached_dev *dc;
1814     struct bcache_device *d;
1815     size_t i;
1816
1817     mutex_lock(&bch_register_lock);
1818
1819     for (i = 0; i < c->devices_max_used; i++) {
1820         d = c->devices[i];
1821         if (!d)
1822             continue;
1823
1824         if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
1825             test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
1826             dc = container_of(d, struct cached_dev, disk);
1827             bch_cached_dev_detach(dc);
1828             if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
1829                 conditional_stop_bcache_device(c, d, dc);
1830         } else {
1831             bcache_device_stop(d);
1832         }
1833     }
1834
1835     mutex_unlock(&bch_register_lock);
1836
1837     continue_at(cl, cache_set_flush, system_wq);
1838 }
1839
1840 void bch_cache_set_stop(struct cache_set *c)
1841 {
1842     if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
1843         /* closure_fn set to __cache_set_unregister() */
1844         closure_queue(&c->caching);
1845 }
1846
1847 void bch_cache_set_unregister(struct cache_set *c)
1848 {
1849     set_bit(CACHE_SET_UNREGISTERING, &c->flags);
1850     bch_cache_set_stop(c);
1851 }
1852
/*
 * Allocate zeroed compound pages via __get_free_pages(): __GFP_ZERO and
 * __GFP_COMP are OR-ed into gfp, and the order is
 * ilog2(meta_bucket_pages(sb)). Evaluates to a void pointer.
 */
#define alloc_meta_bucket_pages(gfp, sb)        \
    ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(meta_bucket_pages(sb))))
1855
/*
 * Allocate and initialise a cache set for the cache whose superblock is sb.
 *
 * @sb: superblock embedded in a struct cache (the cache is recovered with
 *      container_of())
 *
 * Takes a module reference once the cache_set itself has been allocated.
 * Returns the new cache set, or NULL on failure. If the initial kzalloc()
 * fails nothing else is done; any later failure goes through
 * bch_cache_set_unregister() to tear the partially built set down.
 */
struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
    int iter_size;
    struct cache *ca = container_of(sb, struct cache, sb);
    struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);

    if (!c)
        return NULL;

    __module_get(THIS_MODULE);
    /* c->cl ends in cache_set_free(); c->caching is its child closure */
    closure_init(&c->cl, NULL);
    set_closure_fn(&c->cl, cache_set_free, system_wq);

    closure_init(&c->caching, &c->cl);
    set_closure_fn(&c->caching, __cache_set_unregister, system_wq);

    /* Maybe create continue_at_noreturn() and use it here? */
    closure_set_stopped(&c->cl);
    closure_put(&c->cl);

    kobject_init(&c->kobj, &bch_cache_set_ktype);
    kobject_init(&c->internal, &bch_cache_set_internal_ktype);

    bch_cache_accounting_init(&c->accounting, &c->cl);

    memcpy(c->set_uuid, sb->set_uuid, 16);

    /* Link cache and set to each other, then derive geometry from sb */
    c->cache        = ca;
    c->cache->set       = c;
    c->bucket_bits      = ilog2(sb->bucket_size);
    c->block_bits       = ilog2(sb->block_size);
    c->nr_uuids     = meta_bucket_bytes(sb) / sizeof(struct uuid_entry);
    c->devices_max_used = 0;
    atomic_set(&c->attached_dev_nr, 0);
    c->btree_pages      = meta_bucket_pages(sb);
    /* Above BTREE_MAX_PAGES: use a quarter, but never less than the max */
    if (c->btree_pages > BTREE_MAX_PAGES)
        c->btree_pages = max_t(int, c->btree_pages / 4,
                       BTREE_MAX_PAGES);

    sema_init(&c->sb_write_mutex, 1);
    mutex_init(&c->bucket_lock);
    init_waitqueue_head(&c->btree_cache_wait);
    spin_lock_init(&c->btree_cannibalize_lock);
    init_waitqueue_head(&c->bucket_wait);
    init_waitqueue_head(&c->gc_wait);
    sema_init(&c->uuid_write_mutex, 1);

    spin_lock_init(&c->btree_gc_time.lock);
    spin_lock_init(&c->btree_split_time.lock);
    spin_lock_init(&c->btree_read_time.lock);

    bch_moving_init_cache_set(c);

    INIT_LIST_HEAD(&c->list);
    INIT_LIST_HEAD(&c->cached_devs);
    INIT_LIST_HEAD(&c->btree_cache);
    INIT_LIST_HEAD(&c->btree_cache_freeable);
    INIT_LIST_HEAD(&c->btree_cache_freed);
    INIT_LIST_HEAD(&c->data_buckets);

    /* Element size of the c->fill_iter mempool initialised below */
    iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) *
        sizeof(struct btree_iter_set);

    /* One device pointer slot per UUID entry */
    c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL);
    if (!c->devices)
        goto err;

    if (mempool_init_slab_pool(&c->search, 32, bch_search_cache))
        goto err;

    if (mempool_init_kmalloc_pool(&c->bio_meta, 2,
            sizeof(struct bbio) +
            sizeof(struct bio_vec) * meta_bucket_pages(sb)))
        goto err;

    if (mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size))
        goto err;

    if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
            BIOSET_NEED_RESCUER))
        goto err;

    c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, sb);
    if (!c->uuids)
        goto err;

    c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0);
    if (!c->moving_gc_wq)
        goto err;

    if (bch_journal_alloc(c))
        goto err;

    if (bch_btree_cache_alloc(c))
        goto err;

    if (bch_open_buckets_alloc(c))
        goto err;

    if (bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
        goto err;

    /* Defaults for the remaining fields */
    c->congested_read_threshold_us  = 2000;
    c->congested_write_threshold_us = 20000;
    c->error_limit  = DEFAULT_IO_ERROR_LIMIT;
    c->idle_max_writeback_rate_enabled = 1;
    /* A freshly allocated set is not expected to have I/O disabled */
    WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));

    return c;
err:
    bch_cache_set_unregister(c);
    return NULL;
}
1969
1970 static int run_cache_set(struct cache_set *c)
1971 {
1972     const char *err = "cannot allocate memory";
1973     struct cached_dev *dc, *t;
1974     struct cache *ca = c->cache;
1975     struct closure cl;
1976     LIST_HEAD(journal);
1977     struct journal_replay *l;
1978
1979     closure_init_stack(&cl);
1980
1981     c->nbuckets = ca->sb.nbuckets;
1982     set_gc_sectors(c);
1983
1984     if (CACHE_SYNC(&c->cache->sb)) {
1985         struct bkey *k;
1986         struct jset *j;
1987
1988         err = "cannot allocate memory for journal";
1989         if (bch_journal_read(c, &journal))
1990             goto err;
1991
1992         pr_debug("btree_journal_read() done\n");
1993
1994         err = "no journal entries found";
1995         if (list_empty(&journal))
1996             goto err;
1997
1998         j = &list_entry(journal.prev, struct journal_replay, list)->j;
1999
2000         err = "IO error reading priorities";
2001         if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]))
2002             goto err;
2003
2004         /*
2005          * If prio_read() fails it'll call cache_set_error and we'll
2006          * tear everything down right away, but if we perhaps checked
2007          * sooner we could avoid journal replay.
2008          */
2009
2010         k = &j->btree_root;
2011
2012         err = "bad btree root";
2013         if (__bch_btree_ptr_invalid(c, k))
2014             goto err;
2015
2016         err = "error reading btree root";
2017         c->root = bch_btree_node_get(c, NULL, k,
2018                          j->btree_level,
2019                          true, NULL);
2020         if (IS_ERR_OR_NULL(c->root))
2021             goto err;
2022
2023         list_del_init(&c->root->list);
2024         rw_unlock(true, c->root);
2025
2026         err = uuid_read(c, j, &cl);
2027         if (err)
2028             goto err;
2029
2030         err = "error in recovery";
2031         if (bch_btree_check(c))
2032             goto err;
2033
2034         bch_journal_mark(c, &journal);
2035         bch_initial_gc_finish(c);
2036         pr_debug("btree_check() done\n");
2037
2038         /*
2039          * bcache_journal_next() can't happen sooner, or
2040          * btree_gc_finish() will give spurious errors about last_gc >
2041          * gc_gen - this is a hack but oh well.
2042          */
2043         bch_journal_next(&c->journal);
2044
2045         err = "error starting allocator thread";
2046         if (bch_cache_allocator_start(ca))
2047             goto err;
2048
2049         /*
2050          * First place it's safe to allocate: btree_check() and
2051          * btree_gc_finish() have to run before we have buckets to
2052          * allocate, and bch_bucket_alloc_set() might cause a journal
2053          * entry to be written so bcache_journal_next() has to be called
2054          * first.
2055          *
2056          * If the uuids were in the old format we have to rewrite them
2057          * before the next journal entry is written:
2058          */
2059         if (j->version < BCACHE_JSET_VERSION_UUID)
2060             __uuid_write(c);
2061
2062         err = "bcache: replay journal failed";
2063         if (bch_journal_replay(c, &journal))
2064             goto err;
2065     } else {
2066         unsigned int j;
2067
2068         pr_notice("invalidating existing data\n");
2069         ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
2070                     2, SB_JOURNAL_BUCKETS);
2071
2072         for (j = 0; j < ca->sb.keys; j++)
2073             ca->sb.d[j] = ca->sb.first_bucket + j;
2074
2075         bch_initial_gc_finish(c);
2076
2077         err = "error starting allocator thread";
2078         if (bch_cache_allocator_start(ca))
2079             goto err;
2080
2081         mutex_lock(&c->bucket_lock);
2082         bch_prio_write(ca, true);
2083         mutex_unlock(&c->bucket_lock);
2084
2085         err = "cannot allocate new UUID bucket";
2086         if (__uuid_write(c))
2087             goto err;
2088
2089         err = "cannot allocate new btree root";
2090         c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
2091         if (IS_ERR_OR_NULL(c->root))
2092             goto err;
2093
2094         mutex_lock(&c->root->write_lock);
2095         bkey_copy_key(&c->root->key, &MAX_KEY);
2096         bch_btree_node_write(c->root, &cl);
2097         mutex_unlock(&c->root->write_lock);
2098
2099         bch_btree_set_root(c->root);
2100         rw_unlock(true, c->root);
2101
2102         /*
2103          * We don't want to write the first journal entry until
2104          * everything is set up - fortunately journal entries won't be
2105          * written until the SET_CACHE_SYNC() here:
2106          */
2107         SET_CACHE_SYNC(&c->cache->sb, true);
2108
2109         bch_journal_next(&c->journal);
2110         bch_journal_meta(c, &cl);
2111     }
2112
2113     err = "error starting gc thread";
2114     if (bch_gc_thread_start(c))
2115         goto err;
2116
2117     closure_sync(&cl);
2118     c->cache->sb.last_mount = (u32)ktime_get_real_seconds();
2119     bcache_write_super(c);
2120
2121     if (bch_has_feature_obso_large_bucket(&c->cache->sb))
2122         pr_err("Detect obsoleted large bucket layout, all attached bcache device will be read-only\n");
2123
2124     list_for_each_entry_safe(dc, t, &uncached_devices, list)
2125         bch_cached_dev_attach(dc, c, NULL);
2126
2127     flash_devs_run(c);
2128
2129     bch_journal_space_reserve(&c->journal);
2130     set_bit(CACHE_SET_RUNNING, &c->flags);
2131     return 0;
2132 err:
2133     while (!list_empty(&journal)) {
2134         l = list_first_entry(&journal, struct journal_replay, list);
2135         list_del(&l->list);
2136         kfree(l);
2137     }
2138
2139     closure_sync(&cl);
2140
2141     bch_cache_set_error(c, "%s", err);
2142
2143     return -EIO;
2144 }
2145
2146 static const char *register_cache_set(struct cache *ca)
2147 {
2148     char buf[12];
2149     const char *err = "cannot allocate memory";
2150     struct cache_set *c;
2151
2152     list_for_each_entry(c, &bch_cache_sets, list)
2153         if (!memcmp(c->set_uuid, ca->sb.set_uuid, 16)) {
2154             if (c->cache)
2155                 return "duplicate cache set member";
2156
2157             goto found;
2158         }
2159
2160     c = bch_cache_set_alloc(&ca->sb);
2161     if (!c)
2162         return err;
2163
2164     err = "error creating kobject";
2165     if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->set_uuid) ||
2166         kobject_add(&c->internal, &c->kobj, "internal"))
2167         goto err;
2168
2169     if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
2170         goto err;
2171
2172     bch_debug_init_cache_set(c);
2173
2174     list_add(&c->list, &bch_cache_sets);
2175 found:
2176     sprintf(buf, "cache%i", ca->sb.nr_this_dev);
2177     if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
2178         sysfs_create_link(&c->kobj, &ca->kobj, buf))
2179         goto err;
2180
2181     kobject_get(&ca->kobj);
2182     ca->set = c;
2183     ca->set->cache = ca;
2184
2185     err = "failed to run cache set";
2186     if (run_cache_set(c) < 0)
2187         goto err;
2188
2189     return NULL;
2190 err:
2191     bch_cache_set_unregister(c);
2192     return err;
2193 }
2194
2195 /* Cache device */
2196
2197 /* When ca->kobj released */
2198 void bch_cache_release(struct kobject *kobj)
2199 {
2200     struct cache *ca = container_of(kobj, struct cache, kobj);
2201     unsigned int i;
2202
2203     if (ca->set) {
2204         BUG_ON(ca->set->cache != ca);
2205         ca->set->cache = NULL;
2206     }
2207
2208     free_pages((unsigned long) ca->disk_buckets, ilog2(meta_bucket_pages(&ca->sb)));
2209     kfree(ca->prio_buckets);
2210     vfree(ca->buckets);
2211
2212     free_heap(&ca->heap);
2213     free_fifo(&ca->free_inc);
2214
2215     for (i = 0; i < RESERVE_NR; i++)
2216         free_fifo(&ca->free[i]);
2217
2218     if (ca->sb_disk)
2219         put_page(virt_to_page(ca->sb_disk));
2220
2221     if (!IS_ERR_OR_NULL(ca->bdev))
2222         blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2223
2224     kfree(ca);
2225     module_put(THIS_MODULE);
2226 }
2227
2228 static int cache_alloc(struct cache *ca)
2229 {
2230     size_t free;
2231     size_t btree_buckets;
2232     struct bucket *b;
2233     int ret = -ENOMEM;
2234     const char *err = NULL;
2235
2236     __module_get(THIS_MODULE);
2237     kobject_init(&ca->kobj, &bch_cache_ktype);
2238
2239     bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0);
2240
2241     /*
2242      * when ca->sb.njournal_buckets is not zero, journal exists,
2243      * and in bch_journal_replay(), tree node may split,
2244      * so bucket of RESERVE_BTREE type is needed,
2245      * the worst situation is all journal buckets are valid journal,
2246      * and all the keys need to replay,
2247      * so the number of  RESERVE_BTREE type buckets should be as much
2248      * as journal buckets
2249      */
2250     btree_buckets = ca->sb.njournal_buckets ?: 8;
2251     free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
2252     if (!free) {
2253         ret = -EPERM;
2254         err = "ca->sb.nbuckets is too small";
2255         goto err_free;
2256     }
2257
2258     if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets,
2259                         GFP_KERNEL)) {
2260         err = "ca->free[RESERVE_BTREE] alloc failed";
2261         goto err_btree_alloc;
2262     }
2263
2264     if (!init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca),
2265                             GFP_KERNEL)) {
2266         err = "ca->free[RESERVE_PRIO] alloc failed";
2267         goto err_prio_alloc;
2268     }
2269
2270     if (!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL)) {
2271         err = "ca->free[RESERVE_MOVINGGC] alloc failed";
2272         goto err_movinggc_alloc;
2273     }
2274
2275     if (!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL)) {
2276         err = "ca->free[RESERVE_NONE] alloc failed";
2277         goto err_none_alloc;
2278     }
2279
2280     if (!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL)) {
2281         err = "ca->free_inc alloc failed";
2282         goto err_free_inc_alloc;
2283     }
2284
2285     if (!init_heap(&ca->heap, free << 3, GFP_KERNEL)) {
2286         err = "ca->heap alloc failed";
2287         goto err_heap_alloc;
2288     }
2289
2290     ca->buckets = vzalloc(array_size(sizeof(struct bucket),
2291                   ca->sb.nbuckets));
2292     if (!ca->buckets) {
2293         err = "ca->buckets alloc failed";
2294         goto err_buckets_alloc;
2295     }
2296
2297     ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
2298                    prio_buckets(ca), 2),
2299                    GFP_KERNEL);
2300     if (!ca->prio_buckets) {
2301         err = "ca->prio_buckets alloc failed";
2302         goto err_prio_buckets_alloc;
2303     }
2304
2305     ca->disk_buckets = alloc_meta_bucket_pages(GFP_KERNEL, &ca->sb);
2306     if (!ca->disk_buckets) {
2307         err = "ca->disk_buckets alloc failed";
2308         goto err_disk_buckets_alloc;
2309     }
2310
2311     ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
2312
2313     for_each_bucket(b, ca)
2314         atomic_set(&b->pin, 0);
2315     return 0;
2316
2317 err_disk_buckets_alloc:
2318     kfree(ca->prio_buckets);
2319 err_prio_buckets_alloc:
2320     vfree(ca->buckets);
2321 err_buckets_alloc:
2322     free_heap(&ca->heap);
2323 err_heap_alloc:
2324     free_fifo(&ca->free_inc);
2325 err_free_inc_alloc:
2326     free_fifo(&ca->free[RESERVE_NONE]);
2327 err_none_alloc:
2328     free_fifo(&ca->free[RESERVE_MOVINGGC]);
2329 err_movinggc_alloc:
2330     free_fifo(&ca->free[RESERVE_PRIO]);
2331 err_prio_alloc:
2332     free_fifo(&ca->free[RESERVE_BTREE]);
2333 err_btree_alloc:
2334 err_free:
2335     module_put(THIS_MODULE);
2336     if (err)
2337         pr_notice("error %pg: %s\n", ca->bdev, err);
2338     return ret;
2339 }
2340
2341 static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
2342                 struct block_device *bdev, struct cache *ca)
2343 {
2344     const char *err = NULL; /* must be set for any error case */
2345     int ret = 0;
2346
2347     memcpy(&ca->sb, sb, sizeof(struct cache_sb));
2348     ca->bdev = bdev;
2349     ca->bdev->bd_holder = ca;
2350     ca->sb_disk = sb_disk;
2351
2352     if (bdev_max_discard_sectors((bdev)))
2353         ca->discard = CACHE_DISCARD(&ca->sb);
2354
2355     ret = cache_alloc(ca);
2356     if (ret != 0) {
2357         /*
2358          * If we failed here, it means ca->kobj is not initialized yet,
2359          * kobject_put() won't be called and there is no chance to
2360          * call blkdev_put() to bdev in bch_cache_release(). So we
2361          * explicitly call blkdev_put() here.
2362          */
2363         blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2364         if (ret == -ENOMEM)
2365             err = "cache_alloc(): -ENOMEM";
2366         else if (ret == -EPERM)
2367             err = "cache_alloc(): cache device is too small";
2368         else
2369             err = "cache_alloc(): unknown error";
2370         goto err;
2371     }
2372
2373     if (kobject_add(&ca->kobj, bdev_kobj(bdev), "bcache")) {
2374         err = "error calling kobject_add";
2375         ret = -ENOMEM;
2376         goto out;
2377     }
2378
2379     mutex_lock(&bch_register_lock);
2380     err = register_cache_set(ca);
2381     mutex_unlock(&bch_register_lock);
2382
2383     if (err) {
2384         ret = -ENODEV;
2385         goto out;
2386     }
2387
2388     pr_info("registered cache device %pg\n", ca->bdev);
2389
2390 out:
2391     kobject_put(&ca->kobj);
2392
2393 err:
2394     if (err)
2395         pr_notice("error %pg: %s\n", ca->bdev, err);
2396
2397     return ret;
2398 }
2399
2400 /* Global interfaces/init */
2401
2402 static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2403                    const char *buffer, size_t size);
2404 static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
2405                      struct kobj_attribute *attr,
2406                      const char *buffer, size_t size);
2407
2408 kobj_attribute_write(register,      register_bcache);
2409 kobj_attribute_write(register_quiet,    register_bcache);
2410 kobj_attribute_write(pendings_cleanup,  bch_pending_bdevs_cleanup);
2411
2412 static bool bch_is_open_backing(dev_t dev)
2413 {
2414     struct cache_set *c, *tc;
2415     struct cached_dev *dc, *t;
2416
2417     list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2418         list_for_each_entry_safe(dc, t, &c->cached_devs, list)
2419             if (dc->bdev->bd_dev == dev)
2420                 return true;
2421     list_for_each_entry_safe(dc, t, &uncached_devices, list)
2422         if (dc->bdev->bd_dev == dev)
2423             return true;
2424     return false;
2425 }
2426
2427 static bool bch_is_open_cache(dev_t dev)
2428 {
2429     struct cache_set *c, *tc;
2430
2431     list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
2432         struct cache *ca = c->cache;
2433
2434         if (ca->bdev->bd_dev == dev)
2435             return true;
2436     }
2437
2438     return false;
2439 }
2440
2441 static bool bch_is_open(dev_t dev)
2442 {
2443     return bch_is_open_cache(dev) || bch_is_open_backing(dev);
2444 }
2445
2446 struct async_reg_args {
2447     struct delayed_work reg_work;
2448     char *path;
2449     struct cache_sb *sb;
2450     struct cache_sb_disk *sb_disk;
2451     struct block_device *bdev;
2452 };
2453
2454 static void register_bdev_worker(struct work_struct *work)
2455 {
2456     int fail = false;
2457     struct async_reg_args *args =
2458         container_of(work, struct async_reg_args, reg_work.work);
2459     struct cached_dev *dc;
2460
2461     dc = kzalloc(sizeof(*dc), GFP_KERNEL);
2462     if (!dc) {
2463         fail = true;
2464         put_page(virt_to_page(args->sb_disk));
2465         blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2466         goto out;
2467     }
2468
2469     mutex_lock(&bch_register_lock);
2470     if (register_bdev(args->sb, args->sb_disk, args->bdev, dc) < 0)
2471         fail = true;
2472     mutex_unlock(&bch_register_lock);
2473
2474 out:
2475     if (fail)
2476         pr_info("error %s: fail to register backing device\n",
2477             args->path);
2478     kfree(args->sb);
2479     kfree(args->path);
2480     kfree(args);
2481     module_put(THIS_MODULE);
2482 }
2483
2484 static void register_cache_worker(struct work_struct *work)
2485 {
2486     int fail = false;
2487     struct async_reg_args *args =
2488         container_of(work, struct async_reg_args, reg_work.work);
2489     struct cache *ca;
2490
2491     ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2492     if (!ca) {
2493         fail = true;
2494         put_page(virt_to_page(args->sb_disk));
2495         blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2496         goto out;
2497     }
2498
2499     /* blkdev_put() will be called in bch_cache_release() */
2500     if (register_cache(args->sb, args->sb_disk, args->bdev, ca) != 0)
2501         fail = true;
2502
2503 out:
2504     if (fail)
2505         pr_info("error %s: fail to register cache device\n",
2506             args->path);
2507     kfree(args->sb);
2508     kfree(args->path);
2509     kfree(args);
2510     module_put(THIS_MODULE);
2511 }
2512
2513 static void register_device_async(struct async_reg_args *args)
2514 {
2515     if (SB_IS_BDEV(args->sb))
2516         INIT_DELAYED_WORK(&args->reg_work, register_bdev_worker);
2517     else
2518         INIT_DELAYED_WORK(&args->reg_work, register_cache_worker);
2519
2520     /* 10 jiffies is enough for a delay */
2521     queue_delayed_work(system_wq, &args->reg_work, 10);
2522 }
2523
2524 static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2525                    const char *buffer, size_t size)
2526 {
2527     const char *err;
2528     char *path = NULL;
2529     struct cache_sb *sb;
2530     struct cache_sb_disk *sb_disk;
2531     struct block_device *bdev;
2532     ssize_t ret;
2533     bool async_registration = false;
2534
2535 #ifdef CONFIG_BCACHE_ASYNC_REGISTRATION
2536     async_registration = true;
2537 #endif
2538
2539     ret = -EBUSY;
2540     err = "failed to reference bcache module";
2541     if (!try_module_get(THIS_MODULE))
2542         goto out;
2543
2544     /* For latest state of bcache_is_reboot */
2545     smp_mb();
2546     err = "bcache is in reboot";
2547     if (bcache_is_reboot)
2548         goto out_module_put;
2549
2550     ret = -ENOMEM;
2551     err = "cannot allocate memory";
2552     path = kstrndup(buffer, size, GFP_KERNEL);
2553     if (!path)
2554         goto out_module_put;
2555
2556     sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
2557     if (!sb)
2558         goto out_free_path;
2559
2560     ret = -EINVAL;
2561     err = "failed to open device";
2562     bdev = blkdev_get_by_path(strim(path),
2563                   FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2564                   sb);
2565     if (IS_ERR(bdev)) {
2566         if (bdev == ERR_PTR(-EBUSY)) {
2567             dev_t dev;
2568
2569             mutex_lock(&bch_register_lock);
2570             if (lookup_bdev(strim(path), &dev) == 0 &&
2571                 bch_is_open(dev))
2572                 err = "device already registered";
2573             else
2574                 err = "device busy";
2575             mutex_unlock(&bch_register_lock);
2576             if (attr == &ksysfs_register_quiet)
2577                 goto done;
2578         }
2579         goto out_free_sb;
2580     }
2581
2582     err = "failed to set blocksize";
2583     if (set_blocksize(bdev, 4096))
2584         goto out_blkdev_put;
2585
2586     err = read_super(sb, bdev, &sb_disk);
2587     if (err)
2588         goto out_blkdev_put;
2589
2590     err = "failed to register device";
2591
2592     if (async_registration) {
2593         /* register in asynchronous way */
2594         struct async_reg_args *args =
2595             kzalloc(sizeof(struct async_reg_args), GFP_KERNEL);
2596
2597         if (!args) {
2598             ret = -ENOMEM;
2599             err = "cannot allocate memory";
2600             goto out_put_sb_page;
2601         }
2602
2603         args->path  = path;
2604         args->sb    = sb;
2605         args->sb_disk   = sb_disk;
2606         args->bdev  = bdev;
2607         register_device_async(args);
2608         /* No wait and returns to user space */
2609         goto async_done;
2610     }
2611
2612     if (SB_IS_BDEV(sb)) {
2613         struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
2614
2615         if (!dc) {
2616             ret = -ENOMEM;
2617             err = "cannot allocate memory";
2618             goto out_put_sb_page;
2619         }
2620
2621         mutex_lock(&bch_register_lock);
2622         ret = register_bdev(sb, sb_disk, bdev, dc);
2623         mutex_unlock(&bch_register_lock);
2624         /* blkdev_put() will be called in cached_dev_free() */
2625         if (ret < 0)
2626             goto out_free_sb;
2627     } else {
2628         struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2629
2630         if (!ca) {
2631             ret = -ENOMEM;
2632             err = "cannot allocate memory";
2633             goto out_put_sb_page;
2634         }
2635
2636         /* blkdev_put() will be called in bch_cache_release() */
2637         ret = register_cache(sb, sb_disk, bdev, ca);
2638         if (ret)
2639             goto out_free_sb;
2640     }
2641
2642 done:
2643     kfree(sb);
2644     kfree(path);
2645     module_put(THIS_MODULE);
2646 async_done:
2647     return size;
2648
2649 out_put_sb_page:
2650     put_page(virt_to_page(sb_disk));
2651 out_blkdev_put:
2652     blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2653 out_free_sb:
2654     kfree(sb);
2655 out_free_path:
2656     kfree(path);
2657     path = NULL;
2658 out_module_put:
2659     module_put(THIS_MODULE);
2660 out:
2661     pr_info("error %s: %s\n", path?path:"", err);
2662     return ret;
2663 }
2664
2665
2666 struct pdev {
2667     struct list_head list;
2668     struct cached_dev *dc;
2669 };
2670
2671 static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
2672                      struct kobj_attribute *attr,
2673                      const char *buffer,
2674                      size_t size)
2675 {
2676     LIST_HEAD(pending_devs);
2677     ssize_t ret = size;
2678     struct cached_dev *dc, *tdc;
2679     struct pdev *pdev, *tpdev;
2680     struct cache_set *c, *tc;
2681
2682     mutex_lock(&bch_register_lock);
2683     list_for_each_entry_safe(dc, tdc, &uncached_devices, list) {
2684         pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL);
2685         if (!pdev)
2686             break;
2687         pdev->dc = dc;
2688         list_add(&pdev->list, &pending_devs);
2689     }
2690
2691     list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
2692         char *pdev_set_uuid = pdev->dc->sb.set_uuid;
2693         list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
2694             char *set_uuid = c->set_uuid;
2695
2696             if (!memcmp(pdev_set_uuid, set_uuid, 16)) {
2697                 list_del(&pdev->list);
2698                 kfree(pdev);
2699                 break;
2700             }
2701         }
2702     }
2703     mutex_unlock(&bch_register_lock);
2704
2705     list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
2706         pr_info("delete pdev %p\n", pdev);
2707         list_del(&pdev->list);
2708         bcache_device_stop(&pdev->dc->disk);
2709         kfree(pdev);
2710     }
2711
2712     return ret;
2713 }
2714
2715 static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
2716 {
2717     if (bcache_is_reboot)
2718         return NOTIFY_DONE;
2719
2720     if (code == SYS_DOWN ||
2721         code == SYS_HALT ||
2722         code == SYS_POWER_OFF) {
2723         DEFINE_WAIT(wait);
2724         unsigned long start = jiffies;
2725         bool stopped = false;
2726
2727         struct cache_set *c, *tc;
2728         struct cached_dev *dc, *tdc;
2729
2730         mutex_lock(&bch_register_lock);
2731
2732         if (bcache_is_reboot)
2733             goto out;
2734
2735         /* New registration is rejected since now */
2736         bcache_is_reboot = true;
2737         /*
2738          * Make registering caller (if there is) on other CPU
2739          * core know bcache_is_reboot set to true earlier
2740          */
2741         smp_mb();
2742
2743         if (list_empty(&bch_cache_sets) &&
2744             list_empty(&uncached_devices))
2745             goto out;
2746
2747         mutex_unlock(&bch_register_lock);
2748
2749         pr_info("Stopping all devices:\n");
2750
2751         /*
2752          * The reason bch_register_lock is not held to call
2753          * bch_cache_set_stop() and bcache_device_stop() is to
2754          * avoid potential deadlock during reboot, because cache
2755          * set or bcache device stopping process will acquire
2756          * bch_register_lock too.
2757          *
2758          * We are safe here because bcache_is_reboot sets to
2759          * true already, register_bcache() will reject new
2760          * registration now. bcache_is_reboot also makes sure
2761          * bcache_reboot() won't be re-entered on by other thread,
2762          * so there is no race in following list iteration by
2763          * list_for_each_entry_safe().
2764          */
2765         list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2766             bch_cache_set_stop(c);
2767
2768         list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
2769             bcache_device_stop(&dc->disk);
2770
2771
2772         /*
2773          * Give an early chance for other kthreads and
2774          * kworkers to stop themselves
2775          */
2776         schedule();
2777
2778         /* What's a condition variable? */
2779         while (1) {
2780             long timeout = start + 10 * HZ - jiffies;
2781
2782             mutex_lock(&bch_register_lock);
2783             stopped = list_empty(&bch_cache_sets) &&
2784                 list_empty(&uncached_devices);
2785
2786             if (timeout < 0 || stopped)
2787                 break;
2788
2789             prepare_to_wait(&unregister_wait, &wait,
2790                     TASK_UNINTERRUPTIBLE);
2791
2792             mutex_unlock(&bch_register_lock);
2793             schedule_timeout(timeout);
2794         }
2795
2796         finish_wait(&unregister_wait, &wait);
2797
2798         if (stopped)
2799             pr_info("All devices stopped\n");
2800         else
2801             pr_notice("Timeout waiting for devices to be closed\n");
2802 out:
2803         mutex_unlock(&bch_register_lock);
2804     }
2805
2806     return NOTIFY_DONE;
2807 }
2808
2809 static struct notifier_block reboot = {
2810     .notifier_call  = bcache_reboot,
2811     .priority   = INT_MAX, /* before any real devices */
2812 };
2813
2814 static void bcache_exit(void)
2815 {
2816     bch_debug_exit();
2817     bch_request_exit();
2818     if (bcache_kobj)
2819         kobject_put(bcache_kobj);
2820     if (bcache_wq)
2821         destroy_workqueue(bcache_wq);
2822     if (bch_journal_wq)
2823         destroy_workqueue(bch_journal_wq);
2824     if (bch_flush_wq)
2825         destroy_workqueue(bch_flush_wq);
2826     bch_btree_exit();
2827
2828     if (bcache_major)
2829         unregister_blkdev(bcache_major, "bcache");
2830     unregister_reboot_notifier(&reboot);
2831     mutex_destroy(&bch_register_lock);
2832 }
2833
2834 /* Check and fixup module parameters */
2835 static void check_module_parameters(void)
2836 {
2837     if (bch_cutoff_writeback_sync == 0)
2838         bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
2839     else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
2840         pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u\n",
2841             bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
2842         bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
2843     }
2844
2845     if (bch_cutoff_writeback == 0)
2846         bch_cutoff_writeback = CUTOFF_WRITEBACK;
2847     else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
2848         pr_warn("set bch_cutoff_writeback (%u) to max value %u\n",
2849             bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
2850         bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
2851     }
2852
2853     if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
2854         pr_warn("set bch_cutoff_writeback (%u) to %u\n",
2855             bch_cutoff_writeback, bch_cutoff_writeback_sync);
2856         bch_cutoff_writeback = bch_cutoff_writeback_sync;
2857     }
2858 }
2859
2860 static int __init bcache_init(void)
2861 {
2862     static const struct attribute *files[] = {
2863         &ksysfs_register.attr,
2864         &ksysfs_register_quiet.attr,
2865         &ksysfs_pendings_cleanup.attr,
2866         NULL
2867     };
2868
2869     check_module_parameters();
2870
2871     mutex_init(&bch_register_lock);
2872     init_waitqueue_head(&unregister_wait);
2873     register_reboot_notifier(&reboot);
2874
2875     bcache_major = register_blkdev(0, "bcache");
2876     if (bcache_major < 0) {
2877         unregister_reboot_notifier(&reboot);
2878         mutex_destroy(&bch_register_lock);
2879         return bcache_major;
2880     }
2881
2882     if (bch_btree_init())
2883         goto err;
2884
2885     bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
2886     if (!bcache_wq)
2887         goto err;
2888
2889     /*
2890      * Let's not make this `WQ_MEM_RECLAIM` for the following reasons:
2891      *
2892      * 1. It used `system_wq` before which also does no memory reclaim.
2893      * 2. With `WQ_MEM_RECLAIM` desktop stalls, increased boot times, and
2894      *    reduced throughput can be observed.
2895      *
2896      * We still want to user our own queue to not congest the `system_wq`.
2897      */
2898     bch_flush_wq = alloc_workqueue("bch_flush", 0, 0);
2899     if (!bch_flush_wq)
2900         goto err;
2901
2902     bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
2903     if (!bch_journal_wq)
2904         goto err;
2905
2906     bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
2907     if (!bcache_kobj)
2908         goto err;
2909
2910     if (bch_request_init() ||
2911         sysfs_create_files(bcache_kobj, files))
2912         goto err;
2913
2914     bch_debug_init();
2915     closure_debug_init();
2916
2917     bcache_is_reboot = false;
2918
2919     return 0;
2920 err:
2921     bcache_exit();
2922     return -ENOMEM;
2923 }
2924
2925 /*
2926  * Module hooks
2927  */
2928 module_exit(bcache_exit);
2929 module_init(bcache_init);
2930
2931 module_param(bch_cutoff_writeback, uint, 0);
2932 MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");
2933
2934 module_param(bch_cutoff_writeback_sync, uint, 0);
2935 MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");
2936
2937 MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
2938 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
2939 MODULE_LICENSE("GPL");