// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the backing device.
 */
0024 #include "bcache.h"
0025 #include "btree.h"
0026 #include "debug.h"
0027 #include "extents.h"
0028
0029 #include <linux/slab.h>
0030 #include <linux/bitops.h>
0031 #include <linux/hash.h>
0032 #include <linux/kthread.h>
0033 #include <linux/prefetch.h>
0034 #include <linux/random.h>
0035 #include <linux/rcupdate.h>
0036 #include <linux/sched/clock.h>
0037 #include <linux/rculist.h>
0038 #include <linux/delay.h>
0039 #include <trace/events/bcache.h>
0040
0091 #define MAX_NEED_GC 64
0092 #define MAX_SAVE_PRIO 72
0093 #define MAX_GC_TIMES 100
0094 #define MIN_GC_NODES 100
0095 #define GC_SLEEP_MS 100
0096
0097 #define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
0098
0099 #define PTR_HASH(c, k) \
0100 (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
0101
0102 static struct workqueue_struct *btree_io_wq;
0103
0104 #define insert_lock(s, b) ((b)->level <= (s)->lock)
0105
0106
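/* The bset that would be written next: the first block not yet written to disk */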
0107 static inline struct bset *write_block(struct btree *b)
0108 {
0109 return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c->cache);
0110 }
0111
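/*
 * Finish off the current set of keys: sort what has accumulated (a full sort
 * for interior nodes, a lazy one for leaves) and, if there is room left in
 * the node, start a fresh bset at the next unwritten block.
 */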
0112 static void bch_btree_init_next(struct btree *b)
0113 {
0114
0115 if (b->level && b->keys.nsets)
0116 bch_btree_sort(&b->keys, &b->c->sort);
0117 else
0118 bch_btree_sort_lazy(&b->keys, &b->c->sort);
0119
0120 if (b->written < btree_blocks(b))
0121 bch_bset_init_next(&b->keys, write_block(b),
0122 bset_magic(&b->c->cache->sb));
0123
0124 }
0125
0126
0127
0128 void bkey_put(struct cache_set *c, struct bkey *k)
0129 {
0130 unsigned int i;
0131
0132 for (i = 0; i < KEY_PTRS(k); i++)
0133 if (ptr_available(c, k, i))
0134 atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
0135 }
0136
0137
0138
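/*
 * Checksum of a bset, seeded with the node's ptr[0] so identical data at a
 * different location checksums differently. Skips the csum field itself.
 */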
0139 static uint64_t btree_csum_set(struct btree *b, struct bset *i)
0140 {
0141 uint64_t crc = b->key.ptr[0];
0142 void *data = (void *) i + 8, *end = bset_bkey_last(i);
0143
0144 crc = crc64_be(crc, data, end - data);
0145 return crc ^ 0xffffffffffffffffULL;
0146 }
0147
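/*
 * Parse the bsets that were just read into the node's buffer: validate each
 * set's version, magic, checksum and size, push them onto an iterator, then
 * sort them into a single in-memory set and prepare the next write position.
 * On any inconsistency the node is flagged with an IO error and the cache
 * set is shut down via bch_cache_set_error().
 */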
0148 void bch_btree_node_read_done(struct btree *b)
0149 {
0150 const char *err = "bad btree header";
0151 struct bset *i = btree_bset_first(b);
0152 struct btree_iter *iter;
0153
/*
 * c->fill_iter is a mempool of iterators sized for the worst case of one
 * bset per block in a bucket, so this allocation cannot fail and may be
 * larger than a static MAX_BSETS iterator.
 */
0159 iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
0160 iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
0161 iter->used = 0;
0162
0163 #ifdef CONFIG_BCACHE_DEBUG
0164 iter->b = &b->keys;
0165 #endif
0166
0167 if (!i->seq)
0168 goto err;
0169
0170 for (;
0171 b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
0172 i = write_block(b)) {
0173 err = "unsupported bset version";
0174 if (i->version > BCACHE_BSET_VERSION)
0175 goto err;
0176
0177 err = "bad btree header";
0178 if (b->written + set_blocks(i, block_bytes(b->c->cache)) >
0179 btree_blocks(b))
0180 goto err;
0181
0182 err = "bad magic";
0183 if (i->magic != bset_magic(&b->c->cache->sb))
0184 goto err;
0185
0186 err = "bad checksum";
0187 switch (i->version) {
0188 case 0:
0189 if (i->csum != csum_set(i))
0190 goto err;
0191 break;
0192 case BCACHE_BSET_VERSION:
0193 if (i->csum != btree_csum_set(b, i))
0194 goto err;
0195 break;
0196 }
0197
0198 err = "empty set";
0199 if (i != b->keys.set[0].data && !i->keys)
0200 goto err;
0201
0202 bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
0203
0204 b->written += set_blocks(i, block_bytes(b->c->cache));
0205 }
0206
0207 err = "corrupted btree";
0208 for (i = write_block(b);
0209 bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
0210 i = ((void *) i) + block_bytes(b->c->cache))
0211 if (i->seq == b->keys.set[0].data->seq)
0212 goto err;
0213
0214 bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
0215
0216 i = b->keys.set[0].data;
0217 err = "short btree key";
0218 if (b->keys.set[0].size &&
0219 bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
0220 goto err;
0221
0222 if (b->written < btree_blocks(b))
0223 bch_bset_init_next(&b->keys, write_block(b),
0224 bset_magic(&b->c->cache->sb));
0225 out:
0226 mempool_free(iter, &b->c->fill_iter);
0227 return;
0228 err:
0229 set_btree_node_io_error(b);
0230 bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
0231 err, PTR_BUCKET_NR(b->c, &b->key, 0),
0232 bset_block_offset(b, i), i->keys);
0233 goto out;
0234 }
0235
0236 static void btree_node_read_endio(struct bio *bio)
0237 {
0238 struct closure *cl = bio->bi_private;
0239
0240 closure_put(cl);
0241 }
0242
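/* Synchronously read a btree node from disk and parse it with bch_btree_node_read_done() */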
0243 static void bch_btree_node_read(struct btree *b)
0244 {
0245 uint64_t start_time = local_clock();
0246 struct closure cl;
0247 struct bio *bio;
0248
0249 trace_bcache_btree_read(b);
0250
0251 closure_init_stack(&cl);
0252
0253 bio = bch_bbio_alloc(b->c);
0254 bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;
0255 bio->bi_end_io = btree_node_read_endio;
0256 bio->bi_private = &cl;
0257 bio->bi_opf = REQ_OP_READ | REQ_META;
0258
0259 bch_bio_map(bio, b->keys.set[0].data);
0260
0261 bch_submit_bbio(bio, b->c, &b->key, 0);
0262 closure_sync(&cl);
0263
0264 if (bio->bi_status)
0265 set_btree_node_io_error(b);
0266
0267 bch_bbio_free(bio, b->c);
0268
0269 if (btree_node_io_error(b))
0270 goto err;
0271
0272 bch_btree_node_read_done(b);
0273 bch_time_stats_update(&b->c->btree_read_time, start_time);
0274
0275 return;
0276 err:
0277 bch_cache_set_error(b->c, "io error reading bucket %zu",
0278 PTR_BUCKET_NR(b->c, &b->key, 0));
0279 }
0280
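/*
 * A write has finished: release the buckets it was pinning for the allocator
 * and drop its journal pin so the journal can be reclaimed.
 */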
0281 static void btree_complete_write(struct btree *b, struct btree_write *w)
0282 {
0283 if (w->prio_blocked &&
0284 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
0285 wake_up_allocators(b->c);
0286
0287 if (w->journal) {
0288 atomic_dec_bug(w->journal);
0289 __closure_wake_up(&b->c->journal.wait);
0290 }
0291
0292 w->prio_blocked = 0;
0293 w->journal = NULL;
0294 }
0295
0296 static void btree_node_write_unlock(struct closure *cl)
0297 {
0298 struct btree *b = container_of(cl, struct btree, io);
0299
0300 up(&b->io_mutex);
0301 }
0302
0303 static void __btree_node_write_done(struct closure *cl)
0304 {
0305 struct btree *b = container_of(cl, struct btree, io);
0306 struct btree_write *w = btree_prev_write(b);
0307
0308 bch_bbio_free(b->bio, b->c);
0309 b->bio = NULL;
0310 btree_complete_write(b, w);
0311
0312 if (btree_node_dirty(b))
0313 queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
0314
0315 closure_return_with_destructor(cl, btree_node_write_unlock);
0316 }
0317
0318 static void btree_node_write_done(struct closure *cl)
0319 {
0320 struct btree *b = container_of(cl, struct btree, io);
0321
0322 bio_free_pages(b->bio);
0323 __btree_node_write_done(cl);
0324 }
0325
0326 static void btree_node_write_endio(struct bio *bio)
0327 {
0328 struct closure *cl = bio->bi_private;
0329 struct btree *b = container_of(cl, struct btree, io);
0330
0331 if (bio->bi_status)
0332 set_btree_node_io_error(b);
0333
0334 bch_bbio_count_io_errors(b->c, bio, bio->bi_status, "writing btree");
0335 closure_put(cl);
0336 }
0337
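/*
 * Issue the actual write of the last bset: checksum it, point a copy of the
 * node's key at the sector offset of this set within the bucket, and submit
 * the bio. If bounce pages can't be allocated, the bset memory is mapped
 * directly and the write is waited on before returning.
 */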
0338 static void do_btree_node_write(struct btree *b)
0339 {
0340 struct closure *cl = &b->io;
0341 struct bset *i = btree_bset_last(b);
0342 BKEY_PADDED(key) k;
0343
0344 i->version = BCACHE_BSET_VERSION;
0345 i->csum = btree_csum_set(b, i);
0346
0347 BUG_ON(b->bio);
0348 b->bio = bch_bbio_alloc(b->c);
0349
0350 b->bio->bi_end_io = btree_node_write_endio;
0351 b->bio->bi_private = cl;
0352 b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c->cache));
0353 b->bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_FUA;
0354 bch_bio_map(b->bio, i);
0355
/*
 * If we're appending to a leaf node, we don't technically need FUA -
 * this write just needs to be persisted before the next journal write,
 * which will be marked FLUSH|FUA.
 *
 * Similarly if we're writing a new btree root - the pointer is going to
 * be in the next journal entry.
 *
 * But if we're writing a new btree node (that isn't a root) or
 * appending to a non leaf btree node, we need either FUA or a flush
 * when we write the parent with the new pointer. FUA is cheaper than a
 * flush, and writes appending to leaf nodes aren't blocking anything so
 * just make all btree node writes FUA to keep things sane.
 */
0371 bkey_copy(&k.key, &b->key);
0372 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
0373 bset_sector_offset(&b->keys, i));
0374
0375 if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
0376 struct bio_vec *bv;
0377 void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
0378 struct bvec_iter_all iter_all;
0379
0380 bio_for_each_segment_all(bv, b->bio, iter_all) {
0381 memcpy(page_address(bv->bv_page), addr, PAGE_SIZE);
0382 addr += PAGE_SIZE;
0383 }
0384
0385 bch_submit_bbio(b->bio, b->c, &k.key, 0);
0386
0387 continue_at(cl, btree_node_write_done, NULL);
0388 } else {
/*
 * Couldn't allocate bounce pages: map the bset memory directly into the
 * bio and wait for the write synchronously, since the bset must not be
 * modified while the write is in flight.
 */
0393 b->bio->bi_vcnt = 0;
0394 bch_bio_map(b->bio, i);
0395
0396 bch_submit_bbio(b->bio, b->c, &k.key, 0);
0397
0398 closure_sync(cl);
0399 continue_at_nobarrier(cl, __btree_node_write_done, NULL);
0400 }
0401 }
0402
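/*
 * Start writing out the current unwritten bset. Caller must hold
 * b->write_lock; completion is chained to @parent (or the cache set's
 * closure if parent is NULL) via the node's io closure.
 */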
0403 void __bch_btree_node_write(struct btree *b, struct closure *parent)
0404 {
0405 struct bset *i = btree_bset_last(b);
0406
0407 lockdep_assert_held(&b->write_lock);
0408
0409 trace_bcache_btree_write(b);
0410
0411 BUG_ON(current->bio_list);
0412 BUG_ON(b->written >= btree_blocks(b));
0413 BUG_ON(b->written && !i->keys);
0414 BUG_ON(btree_bset_first(b)->seq != i->seq);
0415 bch_check_keys(&b->keys, "writing");
0416
0417 cancel_delayed_work(&b->work);

/* If caller isn't waiting for write, parent refcount is cache set */
0420 down(&b->io_mutex);
0421 closure_init(&b->io, parent ?: &b->c->cl);
0422
0423 clear_bit(BTREE_NODE_dirty, &b->flags);
0424 change_bit(BTREE_NODE_write_idx, &b->flags);
0425
0426 do_btree_node_write(b);
0427
0428 atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size,
0429 &b->c->cache->btree_sectors_written);
0430
0431 b->written += set_blocks(i, block_bytes(b->c->cache));
0432 }
0433
0434 void bch_btree_node_write(struct btree *b, struct closure *parent)
0435 {
0436 unsigned int nsets = b->keys.nsets;
0437
0438 lockdep_assert_held(&b->lock);
0439
0440 __bch_btree_node_write(b, parent);
0441
/*
 * do verify if there was more than one set initially (i.e. we did a
 * sort) and we sorted down to a single set:
 */
0446 if (nsets && !b->keys.nsets)
0447 bch_btree_verify(b);
0448
0449 bch_btree_init_next(b);
0450 }
0451
0452 static void bch_btree_node_write_sync(struct btree *b)
0453 {
0454 struct closure cl;
0455
0456 closure_init_stack(&cl);
0457
0458 mutex_lock(&b->write_lock);
0459 bch_btree_node_write(b, &cl);
0460 mutex_unlock(&b->write_lock);
0461
0462 closure_sync(&cl);
0463 }
0464
0465 static void btree_node_write_work(struct work_struct *w)
0466 {
0467 struct btree *b = container_of(to_delayed_work(w), struct btree, work);
0468
0469 mutex_lock(&b->write_lock);
0470 if (btree_node_dirty(b))
0471 __bch_btree_node_write(b, NULL);
0472 mutex_unlock(&b->write_lock);
0473 }
0474
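/*
 * Mark a leaf node dirty after an insert: schedule a delayed writeback,
 * track the oldest journal entry that still references its keys, and force
 * an immediate write once the unwritten set grows close to a page.
 */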
0475 static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
0476 {
0477 struct bset *i = btree_bset_last(b);
0478 struct btree_write *w = btree_current_write(b);
0479
0480 lockdep_assert_held(&b->write_lock);
0481
0482 BUG_ON(!b->written);
0483 BUG_ON(!i->keys);
0484
0485 if (!btree_node_dirty(b))
0486 queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
0487
0488 set_btree_node_dirty(b);
0489
/*
 * w->journal is always the oldest journal pin of all bkeys
 * in the leaf node, to make sure the oldest jset seq won't
 * be increased before this btree node is flushed.
 */
0495 if (journal_ref) {
0496 if (w->journal &&
0497 journal_pin_cmp(b->c, w->journal, journal_ref)) {
0498 atomic_dec_bug(w->journal);
0499 w->journal = NULL;
0500 }
0501
0502 if (!w->journal) {
0503 w->journal = journal_ref;
0504 atomic_inc(w->journal);
0505 }
0506 }
0507
/* Force write if set is too big */
0509 if (set_bytes(i) > PAGE_SIZE - 48 &&
0510 !current->bio_list)
0511 bch_btree_node_write(b, NULL);
0512 }
0513
/*
 * Btree in memory cache - allocation/freeing
 * mca -> memory cache
 */
0519 #define mca_reserve(c) (((!IS_ERR_OR_NULL(c->root) && c->root->level) \
0520 ? c->root->level : 1) * 8 + 16)
0521 #define mca_can_free(c) \
0522 max_t(int, 0, c->btree_cache_used - mca_reserve(c))
0523
0524 static void mca_data_free(struct btree *b)
0525 {
0526 BUG_ON(b->io_mutex.count != 1);
0527
0528 bch_btree_keys_free(&b->keys);
0529
0530 b->c->btree_cache_used--;
0531 list_move(&b->list, &b->c->btree_cache_freed);
0532 }
0533
0534 static void mca_bucket_free(struct btree *b)
0535 {
0536 BUG_ON(btree_node_dirty(b));
0537
0538 b->key.ptr[0] = 0;
0539 hlist_del_init_rcu(&b->hash);
0540 list_move(&b->list, &b->c->btree_cache_freeable);
0541 }
0542
0543 static unsigned int btree_order(struct bkey *k)
0544 {
0545 return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
0546 }
0547
0548 static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
0549 {
0550 if (!bch_btree_keys_alloc(&b->keys,
0551 max_t(unsigned int,
0552 ilog2(b->c->btree_pages),
0553 btree_order(k)),
0554 gfp)) {
0555 b->c->btree_cache_used++;
0556 list_move(&b->list, &b->c->btree_cache);
0557 } else {
0558 list_move(&b->list, &b->c->btree_cache_freed);
0559 }
0560 }
0561
0562 static struct btree *mca_bucket_alloc(struct cache_set *c,
0563 struct bkey *k, gfp_t gfp)
0564 {
0565
0566
0567
0568
0569 struct btree *b = kzalloc(sizeof(struct btree), gfp);
0570
0571 if (!b)
0572 return NULL;
0573
0574 init_rwsem(&b->lock);
0575 lockdep_set_novalidate_class(&b->lock);
0576 mutex_init(&b->write_lock);
0577 lockdep_set_novalidate_class(&b->write_lock);
0578 INIT_LIST_HEAD(&b->list);
0579 INIT_DELAYED_WORK(&b->work, btree_node_write_work);
0580 b->c = c;
0581 sema_init(&b->io_mutex, 1);
0582
0583 mca_data_alloc(b, k, gfp);
0584 return b;
0585 }
0586
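/*
 * Try to take a node out of the btree cache for reuse: grab its lock
 * without blocking, make sure its data buffer is at least @min_order pages,
 * and either skip dirty or in-flight nodes (flush == false) or write them
 * out and wait (flush == true).
 */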
0587 static int mca_reap(struct btree *b, unsigned int min_order, bool flush)
0588 {
0589 struct closure cl;
0590
0591 closure_init_stack(&cl);
0592 lockdep_assert_held(&b->c->bucket_lock);
0593
0594 if (!down_write_trylock(&b->lock))
0595 return -ENOMEM;
0596
0597 BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);
0598
0599 if (b->keys.page_order < min_order)
0600 goto out_unlock;
0601
0602 if (!flush) {
0603 if (btree_node_dirty(b))
0604 goto out_unlock;
0605
0606 if (down_trylock(&b->io_mutex))
0607 goto out_unlock;
0608 up(&b->io_mutex);
0609 }
0610
0611 retry:
/*
 * BTREE_NODE_dirty might be cleared in btree_flush_write() by
 * __bch_btree_node_write(). To avoid an extra flush, acquire
 * b->write_lock before checking the BTREE_NODE_dirty bit.
 */
0617 mutex_lock(&b->write_lock);
0618
/*
 * If this btree node is selected in btree_flush_write() by journal
 * code, delay and retry until the node is flushed by journal code
 * and the BTREE_NODE_journal_flush bit is cleared.
 */
0623 if (btree_node_journal_flush(b)) {
0624 pr_debug("bnode %p is flushing by journal, retry\n", b);
0625 mutex_unlock(&b->write_lock);
0626 udelay(1);
0627 goto retry;
0628 }
0629
0630 if (btree_node_dirty(b))
0631 __bch_btree_node_write(b, &cl);
0632 mutex_unlock(&b->write_lock);
0633
0634 closure_sync(&cl);
0635
/* wait for any in flight btree write */
0637 down(&b->io_mutex);
0638 up(&b->io_mutex);
0639
0640 return 0;
0641 out_unlock:
0642 rw_unlock(true, b);
0643 return -ENOMEM;
0644 }
0645
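/*
 * Shrinker callback: free btree nodes, preferring ones on the freeable list
 * (already unhashed) before evicting clean nodes from the cache proper.
 * The shrinker counts in pages, so nr_to_scan is converted to a node count
 * and the result converted back.
 */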
0646 static unsigned long bch_mca_scan(struct shrinker *shrink,
0647 struct shrink_control *sc)
0648 {
0649 struct cache_set *c = container_of(shrink, struct cache_set, shrink);
0650 struct btree *b, *t;
0651 unsigned long i, nr = sc->nr_to_scan;
0652 unsigned long freed = 0;
0653 unsigned int btree_cache_used;
0654
0655 if (c->shrinker_disabled)
0656 return SHRINK_STOP;
0657
0658 if (c->btree_cache_alloc_lock)
0659 return SHRINK_STOP;
0660
0661
0662 if (sc->gfp_mask & __GFP_IO)
0663 mutex_lock(&c->bucket_lock);
0664 else if (!mutex_trylock(&c->bucket_lock))
0665 return -1;
0666
0667
0668
0669
0670
0671
0672
0673
0674 nr /= c->btree_pages;
0675 if (nr == 0)
0676 nr = 1;
0677 nr = min_t(unsigned long, nr, mca_can_free(c));
0678
0679 i = 0;
0680 btree_cache_used = c->btree_cache_used;
0681 list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) {
0682 if (nr <= 0)
0683 goto out;
0684
0685 if (!mca_reap(b, 0, false)) {
0686 mca_data_free(b);
0687 rw_unlock(true, b);
0688 freed++;
0689 }
0690 nr--;
0691 i++;
0692 }
0693
0694 list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
0695 if (nr <= 0 || i >= btree_cache_used)
0696 goto out;
0697
0698 if (!mca_reap(b, 0, false)) {
0699 mca_bucket_free(b);
0700 mca_data_free(b);
0701 rw_unlock(true, b);
0702 freed++;
0703 }
0704
0705 nr--;
0706 i++;
0707 }
0708 out:
0709 mutex_unlock(&c->bucket_lock);
0710 return freed * c->btree_pages;
0711 }
0712
0713 static unsigned long bch_mca_count(struct shrinker *shrink,
0714 struct shrink_control *sc)
0715 {
0716 struct cache_set *c = container_of(shrink, struct cache_set, shrink);
0717
0718 if (c->shrinker_disabled)
0719 return 0;
0720
0721 if (c->btree_cache_alloc_lock)
0722 return 0;
0723
0724 return mca_can_free(c) * c->btree_pages;
0725 }
0726
0727 void bch_btree_cache_free(struct cache_set *c)
0728 {
0729 struct btree *b;
0730 struct closure cl;
0731
0732 closure_init_stack(&cl);
0733
0734 if (c->shrink.list.next)
0735 unregister_shrinker(&c->shrink);
0736
0737 mutex_lock(&c->bucket_lock);
0738
0739 #ifdef CONFIG_BCACHE_DEBUG
0740 if (c->verify_data)
0741 list_move(&c->verify_data->list, &c->btree_cache);
0742
0743 free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->cache->sb)));
0744 #endif
0745
0746 list_splice(&c->btree_cache_freeable,
0747 &c->btree_cache);
0748
0749 while (!list_empty(&c->btree_cache)) {
0750 b = list_first_entry(&c->btree_cache, struct btree, list);
0751
0752
0753
0754
0755
0756
0757 if (btree_node_dirty(b)) {
0758 btree_complete_write(b, btree_current_write(b));
0759 clear_bit(BTREE_NODE_dirty, &b->flags);
0760 }
0761 mca_data_free(b);
0762 }
0763
0764 while (!list_empty(&c->btree_cache_freed)) {
0765 b = list_first_entry(&c->btree_cache_freed,
0766 struct btree, list);
0767 list_del(&b->list);
0768 cancel_delayed_work_sync(&b->work);
0769 kfree(b);
0770 }
0771
0772 mutex_unlock(&c->bucket_lock);
0773 }
0774
0775 int bch_btree_cache_alloc(struct cache_set *c)
0776 {
0777 unsigned int i;
0778
0779 for (i = 0; i < mca_reserve(c); i++)
0780 if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
0781 return -ENOMEM;
0782
0783 list_splice_init(&c->btree_cache,
0784 &c->btree_cache_freeable);
0785
0786 #ifdef CONFIG_BCACHE_DEBUG
0787 mutex_init(&c->verify_lock);
0788
0789 c->verify_ondisk = (void *)
0790 __get_free_pages(GFP_KERNEL|__GFP_COMP,
0791 ilog2(meta_bucket_pages(&c->cache->sb)));
0792 if (!c->verify_ondisk) {
0793
0794
0795
0796
0797
0798 return -ENOMEM;
0799 }
0800
0801 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
0802
0803 if (c->verify_data &&
0804 c->verify_data->keys.set->data)
0805 list_del_init(&c->verify_data->list);
0806 else
0807 c->verify_data = NULL;
0808 #endif
0809
0810 c->shrink.count_objects = bch_mca_count;
0811 c->shrink.scan_objects = bch_mca_scan;
0812 c->shrink.seeks = 4;
0813 c->shrink.batch = c->btree_pages * 2;
0814
0815 if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid))
0816 pr_warn("bcache: %s: could not register shrinker\n",
0817 __func__);
0818
0819 return 0;
0820 }
0821
/* Btree in memory cache - hash table */
0824 static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
0825 {
0826 return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
0827 }
0828
0829 static struct btree *mca_find(struct cache_set *c, struct bkey *k)
0830 {
0831 struct btree *b;
0832
0833 rcu_read_lock();
0834 hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
0835 if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
0836 goto out;
0837 b = NULL;
0838 out:
0839 rcu_read_unlock();
0840 return b;
0841 }
0842
0843 static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
0844 {
0845 spin_lock(&c->btree_cannibalize_lock);
0846 if (likely(c->btree_cache_alloc_lock == NULL)) {
0847 c->btree_cache_alloc_lock = current;
0848 } else if (c->btree_cache_alloc_lock != current) {
0849 if (op)
0850 prepare_to_wait(&c->btree_cache_wait, &op->wait,
0851 TASK_UNINTERRUPTIBLE);
0852 spin_unlock(&c->btree_cannibalize_lock);
0853 return -EINTR;
0854 }
0855 spin_unlock(&c->btree_cannibalize_lock);
0856
0857 return 0;
0858 }
0859
0860 static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
0861 struct bkey *k)
0862 {
0863 struct btree *b;
0864
0865 trace_bcache_btree_cache_cannibalize(c);
0866
0867 if (mca_cannibalize_lock(c, op))
0868 return ERR_PTR(-EINTR);
0869
0870 list_for_each_entry_reverse(b, &c->btree_cache, list)
0871 if (!mca_reap(b, btree_order(k), false))
0872 return b;
0873
0874 list_for_each_entry_reverse(b, &c->btree_cache, list)
0875 if (!mca_reap(b, btree_order(k), true))
0876 return b;
0877
0878 WARN(1, "btree cache cannibalize failed\n");
0879 return ERR_PTR(-ENOMEM);
0880 }
0881
/*
 * We can only have one thread cannibalizing other cached btree nodes at a
 * time, or we'll deadlock. We use an open coded mutex to ensure that, which
 * mca_cannibalize() takes; every time we unlock the root of the btree, we
 * need to release this lock if we have it held.
 */
0888 static void bch_cannibalize_unlock(struct cache_set *c)
0889 {
0890 spin_lock(&c->btree_cannibalize_lock);
0891 if (c->btree_cache_alloc_lock == current) {
0892 c->btree_cache_alloc_lock = NULL;
0893 wake_up(&c->btree_cache_wait);
0894 }
0895 spin_unlock(&c->btree_cannibalize_lock);
0896 }
0897
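/*
 * Allocate an in-memory node for the bucket @k points to: reuse a freeable
 * node, resurrect one whose data was freed, or allocate a new one, falling
 * back to cannibalizing another cached node. Returns NULL if the node is
 * already in the cache, otherwise a write-locked node hashed and
 * initialized for @level.
 */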
0898 static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
0899 struct bkey *k, int level)
0900 {
0901 struct btree *b;
0902
0903 BUG_ON(current->bio_list);
0904
0905 lockdep_assert_held(&c->bucket_lock);
0906
0907 if (mca_find(c, k))
0908 return NULL;
0909
0910
0911
0912
0913 list_for_each_entry(b, &c->btree_cache_freeable, list)
0914 if (!mca_reap(b, btree_order(k), false))
0915 goto out;
0916
0917
0918
0919
0920 list_for_each_entry(b, &c->btree_cache_freed, list)
0921 if (!mca_reap(b, 0, false)) {
0922 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
0923 if (!b->keys.set[0].data)
0924 goto err;
0925 else
0926 goto out;
0927 }
0928
0929 b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
0930 if (!b)
0931 goto err;
0932
0933 BUG_ON(!down_write_trylock(&b->lock));
0934 if (!b->keys.set->data)
0935 goto err;
0936 out:
0937 BUG_ON(b->io_mutex.count != 1);
0938
0939 bkey_copy(&b->key, k);
0940 list_move(&b->list, &c->btree_cache);
0941 hlist_del_init_rcu(&b->hash);
0942 hlist_add_head_rcu(&b->hash, mca_hash(c, k));
0943
0944 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
0945 b->parent = (void *) ~0UL;
0946 b->flags = 0;
0947 b->written = 0;
0948 b->level = level;
0949
0950 if (!b->level)
0951 bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
0952 &b->c->expensive_debug_checks);
0953 else
0954 bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
0955 &b->c->expensive_debug_checks);
0956
0957 return b;
0958 err:
0959 if (b)
0960 rw_unlock(true, b);
0961
0962 b = mca_cannibalize(c, op, k);
0963 if (!IS_ERR(b))
0964 goto out;
0965
0966 return b;
0967 }
0968
/*
 * bch_btree_node_get - find a btree node in the cache and lock it, reading
 * it in from disk if necessary.
 *
 * If IO would be necessary while running under the bio submission path
 * (current->bio_list is set), returns -EAGAIN instead of blocking.
 *
 * The returned node holds either a read or a write lock, depending on @write.
 */
0978 struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
0979 struct bkey *k, int level, bool write,
0980 struct btree *parent)
0981 {
0982 int i = 0;
0983 struct btree *b;
0984
0985 BUG_ON(level < 0);
0986 retry:
0987 b = mca_find(c, k);
0988
0989 if (!b) {
0990 if (current->bio_list)
0991 return ERR_PTR(-EAGAIN);
0992
0993 mutex_lock(&c->bucket_lock);
0994 b = mca_alloc(c, op, k, level);
0995 mutex_unlock(&c->bucket_lock);
0996
0997 if (!b)
0998 goto retry;
0999 if (IS_ERR(b))
1000 return b;
1001
1002 bch_btree_node_read(b);
1003
1004 if (!write)
1005 downgrade_write(&b->lock);
1006 } else {
1007 rw_lock(write, b, level);
1008 if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
1009 rw_unlock(write, b);
1010 goto retry;
1011 }
1012 BUG_ON(b->level != level);
1013 }
1014
1015 if (btree_node_io_error(b)) {
1016 rw_unlock(write, b);
1017 return ERR_PTR(-EIO);
1018 }
1019
1020 BUG_ON(!b->written);
1021
1022 b->parent = parent;
1023
1024 for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
1025 prefetch(b->keys.set[i].tree);
1026 prefetch(b->keys.set[i].data);
1027 }
1028
1029 for (; i <= b->keys.nsets; i++)
1030 prefetch(b->keys.set[i].data);
1031
1032 return b;
1033 }
1034
1035 static void btree_node_prefetch(struct btree *parent, struct bkey *k)
1036 {
1037 struct btree *b;
1038
1039 mutex_lock(&parent->c->bucket_lock);
1040 b = mca_alloc(parent->c, NULL, k, parent->level - 1);
1041 mutex_unlock(&parent->c->bucket_lock);
1042
1043 if (!IS_ERR_OR_NULL(b)) {
1044 b->parent = parent;
1045 bch_btree_node_read(b);
1046 rw_unlock(true, b);
1047 }
1048 }
1049
1050
1051
1052 static void btree_node_free(struct btree *b)
1053 {
1054 trace_bcache_btree_node_free(b);
1055
1056 BUG_ON(b == b->c->root);
1057
1058 retry:
1059 mutex_lock(&b->write_lock);
1060
1061
1062
1063
1064
1065
1066 if (btree_node_journal_flush(b)) {
1067 mutex_unlock(&b->write_lock);
1068 pr_debug("bnode %p journal_flush set, retry\n", b);
1069 udelay(1);
1070 goto retry;
1071 }
1072
1073 if (btree_node_dirty(b)) {
1074 btree_complete_write(b, btree_current_write(b));
1075 clear_bit(BTREE_NODE_dirty, &b->flags);
1076 }
1077
1078 mutex_unlock(&b->write_lock);
1079
1080 cancel_delayed_work(&b->work);
1081
1082 mutex_lock(&b->c->bucket_lock);
1083 bch_bucket_free(b->c, &b->key);
1084 mca_bucket_free(b);
1085 mutex_unlock(&b->c->bucket_lock);
1086 }
1087
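/*
 * Allocate a new on-disk btree node: grab a bucket from the BTREE reserve,
 * then an in-memory node for it, and initialize its first bset.
 */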
1088 struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
1089 int level, bool wait,
1090 struct btree *parent)
1091 {
1092 BKEY_PADDED(key) k;
1093 struct btree *b = ERR_PTR(-EAGAIN);
1094
1095 mutex_lock(&c->bucket_lock);
1096 retry:
1097 if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait))
1098 goto err;
1099
1100 bkey_put(c, &k.key);
1101 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
1102
1103 b = mca_alloc(c, op, &k.key, level);
1104 if (IS_ERR(b))
1105 goto err_free;
1106
1107 if (!b) {
1108 cache_bug(c,
1109 "Tried to allocate bucket that was in btree cache");
1110 goto retry;
1111 }
1112
1113 b->parent = parent;
1114 bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->cache->sb));
1115
1116 mutex_unlock(&c->bucket_lock);
1117
1118 trace_bcache_btree_node_alloc(b);
1119 return b;
1120 err_free:
1121 bch_bucket_free(c, &k.key);
1122 err:
1123 mutex_unlock(&c->bucket_lock);
1124
1125 trace_bcache_btree_node_alloc_fail(c);
1126 return b;
1127 }
1128
1129 static struct btree *bch_btree_node_alloc(struct cache_set *c,
1130 struct btree_op *op, int level,
1131 struct btree *parent)
1132 {
1133 return __bch_btree_node_alloc(c, op, level, op != NULL, parent);
1134 }
1135
1136 static struct btree *btree_node_alloc_replacement(struct btree *b,
1137 struct btree_op *op)
1138 {
1139 struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
1140
1141 if (!IS_ERR_OR_NULL(n)) {
1142 mutex_lock(&n->write_lock);
1143 bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
1144 bkey_copy_key(&n->key, &b->key);
1145 mutex_unlock(&n->write_lock);
1146 }
1147
1148 return n;
1149 }
1150
1151 static void make_btree_freeing_key(struct btree *b, struct bkey *k)
1152 {
1153 unsigned int i;
1154
1155 mutex_lock(&b->c->bucket_lock);
1156
1157 atomic_inc(&b->c->prio_blocked);
1158
1159 bkey_copy(k, &b->key);
1160 bkey_copy_key(k, &ZERO_KEY);
1161
1162 for (i = 0; i < KEY_PTRS(k); i++)
1163 SET_PTR_GEN(k, i,
1164 bch_inc_gen(b->c->cache,
1165 PTR_BUCKET(b->c, &b->key, i)));
1166
1167 mutex_unlock(&b->c->bucket_lock);
1168 }
1169
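/*
 * Make sure enough free buckets are reserved for a split to reach this
 * node's depth, and take the cannibalize lock; otherwise put the caller to
 * sleep on btree_cache_wait and return -EINTR so the operation is retried.
 */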
1170 static int btree_check_reserve(struct btree *b, struct btree_op *op)
1171 {
1172 struct cache_set *c = b->c;
1173 struct cache *ca = c->cache;
1174 unsigned int reserve = (c->root->level - b->level) * 2 + 1;
1175
1176 mutex_lock(&c->bucket_lock);
1177
1178 if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
1179 if (op)
1180 prepare_to_wait(&c->btree_cache_wait, &op->wait,
1181 TASK_UNINTERRUPTIBLE);
1182 mutex_unlock(&c->bucket_lock);
1183 return -EINTR;
1184 }
1185
1186 mutex_unlock(&c->bucket_lock);
1187
1188 return mca_cannibalize_lock(b->c, op);
1189 }
1190
/* Garbage collection */
1193 static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
1194 struct bkey *k)
1195 {
1196 uint8_t stale = 0;
1197 unsigned int i;
1198 struct bucket *g;
1199

/*
 * ptr_invalid() can't return true for the keys that mark btree nodes as
 * freed, but since ptr_bad() returns true we'll never actually use them
 * for anything and thus we don't want to mark their pointers here.
 */
1205 if (!bkey_cmp(k, &ZERO_KEY))
1206 return stale;
1207
1208 for (i = 0; i < KEY_PTRS(k); i++) {
1209 if (!ptr_available(c, k, i))
1210 continue;
1211
1212 g = PTR_BUCKET(c, k, i);
1213
1214 if (gen_after(g->last_gc, PTR_GEN(k, i)))
1215 g->last_gc = PTR_GEN(k, i);
1216
1217 if (ptr_stale(c, k, i)) {
1218 stale = max(stale, ptr_stale(c, k, i));
1219 continue;
1220 }
1221
1222 cache_bug_on(GC_MARK(g) &&
1223 (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
1224 c, "inconsistent ptrs: mark = %llu, level = %i",
1225 GC_MARK(g), level);
1226
1227 if (level)
1228 SET_GC_MARK(g, GC_MARK_METADATA);
1229 else if (KEY_DIRTY(k))
1230 SET_GC_MARK(g, GC_MARK_DIRTY);
1231 else if (!GC_MARK(g))
1232 SET_GC_MARK(g, GC_MARK_RECLAIMABLE);
1233
1234
1235 SET_GC_SECTORS_USED(g, min_t(unsigned int,
1236 GC_SECTORS_USED(g) + KEY_SIZE(k),
1237 MAX_GC_SECTORS_USED));
1238
1239 BUG_ON(!GC_SECTORS_USED(g));
1240 }
1241
1242 return stale;
1243 }
1244
1245 #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k)
1246
1247 void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
1248 {
1249 unsigned int i;
1250
1251 for (i = 0; i < KEY_PTRS(k); i++)
1252 if (ptr_available(c, k, i) &&
1253 !ptr_stale(c, k, i)) {
1254 struct bucket *b = PTR_BUCKET(c, k, i);
1255
1256 b->gen = PTR_GEN(k, i);
1257
1258 if (level && bkey_cmp(k, &ZERO_KEY))
1259 b->prio = BTREE_PRIO;
1260 else if (!level && b->prio == BTREE_PRIO)
1261 b->prio = INITIAL_PRIO;
1262 }
1263
1264 __bch_btree_mark_key(c, level, k);
1265 }
1266
1267 void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats)
1268 {
1269 stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets;
1270 }
1271
1272 static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1273 {
1274 uint8_t stale = 0;
1275 unsigned int keys = 0, good_keys = 0;
1276 struct bkey *k;
1277 struct btree_iter iter;
1278 struct bset_tree *t;
1279
1280 gc->nodes++;
1281
1282 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
1283 stale = max(stale, btree_mark_key(b, k));
1284 keys++;
1285
1286 if (bch_ptr_bad(&b->keys, k))
1287 continue;
1288
1289 gc->key_bytes += bkey_u64s(k);
1290 gc->nkeys++;
1291 good_keys++;
1292
1293 gc->data += KEY_SIZE(k);
1294 }
1295
1296 for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
1297 btree_bug_on(t->size &&
1298 bset_written(&b->keys, t) &&
1299 bkey_cmp(&b->key, &t->end) < 0,
1300 b, "found short btree key in gc");
1301
1302 if (b->c->gc_always_rewrite)
1303 return true;
1304
1305 if (stale > 10)
1306 return true;
1307
1308 if ((keys - good_keys) * 2 > keys)
1309 return true;
1310
1311 return false;
1312 }
1313
1314 #define GC_MERGE_NODES 4U
1315
1316 struct gc_merge_info {
1317 struct btree *b;
1318 unsigned int keys;
1319 };
1320
1321 static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
1322 struct keylist *insert_keys,
1323 atomic_t *journal_ref,
1324 struct bkey *replace_key);
1325
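/*
 * Try to merge up to GC_MERGE_NODES adjacent sibling nodes into fewer nodes
 * when their combined keys would fit into one fewer node at roughly
 * two-thirds fill. Keys are shifted forward, the emptied node is freed, and
 * the new nodes replace the old ones in the parent. Returns -EINTR when
 * nodes were coalesced (the caller's iterator is no longer valid), 0 if
 * nothing was done.
 */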
1326 static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1327 struct gc_stat *gc, struct gc_merge_info *r)
1328 {
1329 unsigned int i, nodes = 0, keys = 0, blocks;
1330 struct btree *new_nodes[GC_MERGE_NODES];
1331 struct keylist keylist;
1332 struct closure cl;
1333 struct bkey *k;
1334
1335 bch_keylist_init(&keylist);
1336
1337 if (btree_check_reserve(b, NULL))
1338 return 0;
1339
1340 memset(new_nodes, 0, sizeof(new_nodes));
1341 closure_init_stack(&cl);
1342
1343 while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
1344 keys += r[nodes++].keys;
1345
1346 blocks = btree_default_blocks(b->c) * 2 / 3;
1347
1348 if (nodes < 2 ||
1349 __set_blocks(b->keys.set[0].data, keys,
1350 block_bytes(b->c->cache)) > blocks * (nodes - 1))
1351 return 0;
1352
1353 for (i = 0; i < nodes; i++) {
1354 new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
1355 if (IS_ERR_OR_NULL(new_nodes[i]))
1356 goto out_nocoalesce;
1357 }
1358
1359
1360
1361
1362
1363
1364
1365 if (btree_check_reserve(b, NULL))
1366 goto out_nocoalesce;
1367
1368 for (i = 0; i < nodes; i++)
1369 mutex_lock(&new_nodes[i]->write_lock);
1370
1371 for (i = nodes - 1; i > 0; --i) {
1372 struct bset *n1 = btree_bset_first(new_nodes[i]);
1373 struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
1374 struct bkey *k, *last = NULL;
1375
1376 keys = 0;
1377
1378 if (i > 1) {
1379 for (k = n2->start;
1380 k < bset_bkey_last(n2);
1381 k = bkey_next(k)) {
1382 if (__set_blocks(n1, n1->keys + keys +
1383 bkey_u64s(k),
1384 block_bytes(b->c->cache)) > blocks)
1385 break;
1386
1387 last = k;
1388 keys += bkey_u64s(k);
1389 }
1390 } else {
1391
1392
1393
1394
1395
1396
1397
1398
1399 if (__set_blocks(n1, n1->keys + n2->keys,
1400 block_bytes(b->c->cache)) >
1401 btree_blocks(new_nodes[i]))
1402 goto out_unlock_nocoalesce;
1403
1404 keys = n2->keys;
1405
1406 last = &r->b->key;
1407 }
1408
1409 BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c->cache)) >
1410 btree_blocks(new_nodes[i]));
1411
1412 if (last)
1413 bkey_copy_key(&new_nodes[i]->key, last);
1414
1415 memcpy(bset_bkey_last(n1),
1416 n2->start,
1417 (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);
1418
1419 n1->keys += keys;
1420 r[i].keys = n1->keys;
1421
1422 memmove(n2->start,
1423 bset_bkey_idx(n2, keys),
1424 (void *) bset_bkey_last(n2) -
1425 (void *) bset_bkey_idx(n2, keys));
1426
1427 n2->keys -= keys;
1428
1429 if (__bch_keylist_realloc(&keylist,
1430 bkey_u64s(&new_nodes[i]->key)))
1431 goto out_unlock_nocoalesce;
1432
1433 bch_btree_node_write(new_nodes[i], &cl);
1434 bch_keylist_add(&keylist, &new_nodes[i]->key);
1435 }
1436
1437 for (i = 0; i < nodes; i++)
1438 mutex_unlock(&new_nodes[i]->write_lock);
1439
1440 closure_sync(&cl);
1441
/* We emptied out this node */
1443 BUG_ON(btree_bset_first(new_nodes[0])->keys);
1444 btree_node_free(new_nodes[0]);
1445 rw_unlock(true, new_nodes[0]);
1446 new_nodes[0] = NULL;
1447
1448 for (i = 0; i < nodes; i++) {
1449 if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
1450 goto out_nocoalesce;
1451
1452 make_btree_freeing_key(r[i].b, keylist.top);
1453 bch_keylist_push(&keylist);
1454 }
1455
1456 bch_btree_insert_node(b, op, &keylist, NULL, NULL);
1457 BUG_ON(!bch_keylist_empty(&keylist));
1458
1459 for (i = 0; i < nodes; i++) {
1460 btree_node_free(r[i].b);
1461 rw_unlock(true, r[i].b);
1462
1463 r[i].b = new_nodes[i];
1464 }
1465
1466 memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
1467 r[nodes - 1].b = ERR_PTR(-EINTR);
1468
1469 trace_bcache_btree_gc_coalesce(nodes);
1470 gc->nodes--;
1471
1472 bch_keylist_free(&keylist);
1473
1474
1475 return -EINTR;
1476
1477 out_unlock_nocoalesce:
1478 for (i = 0; i < nodes; i++)
1479 mutex_unlock(&new_nodes[i]->write_lock);
1480
1481 out_nocoalesce:
1482 closure_sync(&cl);
1483
1484 while ((k = bch_keylist_pop(&keylist)))
1485 if (!bkey_cmp(k, &ZERO_KEY))
1486 atomic_dec(&b->c->prio_blocked);
1487 bch_keylist_free(&keylist);
1488
1489 for (i = 0; i < nodes; i++)
1490 if (!IS_ERR_OR_NULL(new_nodes[i])) {
1491 btree_node_free(new_nodes[i]);
1492 rw_unlock(true, new_nodes[i]);
1493 }
1494 return 0;
1495 }
1496
1497 static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
1498 struct btree *replace)
1499 {
1500 struct keylist keys;
1501 struct btree *n;
1502
1503 if (btree_check_reserve(b, NULL))
1504 return 0;
1505
1506 n = btree_node_alloc_replacement(replace, NULL);
1507
1508
1509 if (btree_check_reserve(b, NULL)) {
1510 btree_node_free(n);
1511 rw_unlock(true, n);
1512 return 0;
1513 }
1514
1515 bch_btree_node_write_sync(n);
1516
1517 bch_keylist_init(&keys);
1518 bch_keylist_add(&keys, &n->key);
1519
1520 make_btree_freeing_key(replace, keys.top);
1521 bch_keylist_push(&keys);
1522
1523 bch_btree_insert_node(b, op, &keys, NULL, NULL);
1524 BUG_ON(!bch_keylist_empty(&keys));
1525
1526 btree_node_free(replace);
1527 rw_unlock(true, n);
1528
1529
1530 return -EINTR;
1531 }
1532
1533 static unsigned int btree_gc_count_keys(struct btree *b)
1534 {
1535 struct bkey *k;
1536 struct btree_iter iter;
1537 unsigned int ret = 0;
1538
1539 for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
1540 ret += bkey_u64s(k);
1541
1542 return ret;
1543 }
1544
1545 static size_t btree_gc_min_nodes(struct cache_set *c)
1546 {
1547 size_t min_nodes;
1548
/*
 * GC yields to foreground IO in chunks. If each chunk always processed a
 * fixed number of nodes, a large btree would make a full GC pass take far
 * too long, so the chunk size is scaled so the whole tree is covered in at
 * most MAX_GC_TIMES chunks, with MIN_GC_NODES as a floor for small trees.
 */
1563 min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
1564 if (min_nodes < MIN_GC_NODES)
1565 min_nodes = MIN_GC_NODES;
1566
1567 return min_nodes;
1568 }
1569
1570
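/*
 * Walk one level of the btree for GC: mark each child, rewrite or coalesce
 * nodes that need it, and recurse into interior children. Periodically
 * bails out with -EAGAIN so foreground IO isn't starved.
 */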
1571 static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1572 struct closure *writes, struct gc_stat *gc)
1573 {
1574 int ret = 0;
1575 bool should_rewrite;
1576 struct bkey *k;
1577 struct btree_iter iter;
1578 struct gc_merge_info r[GC_MERGE_NODES];
1579 struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
1580
1581 bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
1582
1583 for (i = r; i < r + ARRAY_SIZE(r); i++)
1584 i->b = ERR_PTR(-EINTR);
1585
1586 while (1) {
1587 k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
1588 if (k) {
1589 r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
1590 true, b);
1591 if (IS_ERR(r->b)) {
1592 ret = PTR_ERR(r->b);
1593 break;
1594 }
1595
1596 r->keys = btree_gc_count_keys(r->b);
1597
1598 ret = btree_gc_coalesce(b, op, gc, r);
1599 if (ret)
1600 break;
1601 }
1602
1603 if (!last->b)
1604 break;
1605
1606 if (!IS_ERR(last->b)) {
1607 should_rewrite = btree_gc_mark_node(last->b, gc);
1608 if (should_rewrite) {
1609 ret = btree_gc_rewrite_node(b, op, last->b);
1610 if (ret)
1611 break;
1612 }
1613
1614 if (last->b->level) {
1615 ret = btree_gc_recurse(last->b, op, writes, gc);
1616 if (ret)
1617 break;
1618 }
1619
1620 bkey_copy_key(&b->c->gc_done, &last->b->key);
1621
1622
1623
1624
1625
1626 mutex_lock(&last->b->write_lock);
1627 if (btree_node_dirty(last->b))
1628 bch_btree_node_write(last->b, writes);
1629 mutex_unlock(&last->b->write_lock);
1630 rw_unlock(true, last->b);
1631 }
1632
1633 memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
1634 r->b = NULL;
1635
1636 if (atomic_read(&b->c->search_inflight) &&
1637 gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
1638 gc->nodes_pre = gc->nodes;
1639 ret = -EAGAIN;
1640 break;
1641 }
1642
1643 if (need_resched()) {
1644 ret = -EAGAIN;
1645 break;
1646 }
1647 }
1648
1649 for (i = r; i < r + ARRAY_SIZE(r); i++)
1650 if (!IS_ERR_OR_NULL(i->b)) {
1651 mutex_lock(&i->b->write_lock);
1652 if (btree_node_dirty(i->b))
1653 bch_btree_node_write(i->b, writes);
1654 mutex_unlock(&i->b->write_lock);
1655 rw_unlock(true, i->b);
1656 }
1657
1658 return ret;
1659 }
1660
1661 static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1662 struct closure *writes, struct gc_stat *gc)
1663 {
1664 struct btree *n = NULL;
1665 int ret = 0;
1666 bool should_rewrite;
1667
1668 should_rewrite = btree_gc_mark_node(b, gc);
1669 if (should_rewrite) {
1670 n = btree_node_alloc_replacement(b, NULL);
1671
1672 if (!IS_ERR_OR_NULL(n)) {
1673 bch_btree_node_write_sync(n);
1674
1675 bch_btree_set_root(n);
1676 btree_node_free(b);
1677 rw_unlock(true, n);
1678
1679 return -EINTR;
1680 }
1681 }
1682
1683 __bch_btree_mark_key(b->c, b->level + 1, &b->key);
1684
1685 if (b->level) {
1686 ret = btree_gc_recurse(b, op, writes, gc);
1687 if (ret)
1688 return ret;
1689 }
1690
1691 bkey_copy_key(&b->c->gc_done, &b->key);
1692
1693 return ret;
1694 }
1695
1696 static void btree_gc_start(struct cache_set *c)
1697 {
1698 struct cache *ca;
1699 struct bucket *b;
1700
1701 if (!c->gc_mark_valid)
1702 return;
1703
1704 mutex_lock(&c->bucket_lock);
1705
1706 c->gc_mark_valid = 0;
1707 c->gc_done = ZERO_KEY;
1708
1709 ca = c->cache;
1710 for_each_bucket(b, ca) {
1711 b->last_gc = b->gen;
1712 if (!atomic_read(&b->pin)) {
1713 SET_GC_MARK(b, 0);
1714 SET_GC_SECTORS_USED(b, 0);
1715 }
1716 }
1717
1718 mutex_unlock(&c->bucket_lock);
1719 }
1720
1721 static void bch_btree_gc_finish(struct cache_set *c)
1722 {
1723 struct bucket *b;
1724 struct cache *ca;
1725 unsigned int i, j;
1726 uint64_t *k;
1727
1728 mutex_lock(&c->bucket_lock);
1729
1730 set_gc_sectors(c);
1731 c->gc_mark_valid = 1;
1732 c->need_gc = 0;
1733
1734 for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
1735 SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
1736 GC_MARK_METADATA);
1737
1738
1739 rcu_read_lock();
1740 for (i = 0; i < c->devices_max_used; i++) {
1741 struct bcache_device *d = c->devices[i];
1742 struct cached_dev *dc;
1743 struct keybuf_key *w, *n;
1744
1745 if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
1746 continue;
1747 dc = container_of(d, struct cached_dev, disk);
1748
1749 spin_lock(&dc->writeback_keys.lock);
1750 rbtree_postorder_for_each_entry_safe(w, n,
1751 &dc->writeback_keys.keys, node)
1752 for (j = 0; j < KEY_PTRS(&w->key); j++)
1753 SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
1754 GC_MARK_DIRTY);
1755 spin_unlock(&dc->writeback_keys.lock);
1756 }
1757 rcu_read_unlock();
1758
1759 c->avail_nbuckets = 0;
1760
1761 ca = c->cache;
1762 ca->invalidate_needs_gc = 0;
1763
1764 for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++)
1765 SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);
1766
1767 for (k = ca->prio_buckets;
1768 k < ca->prio_buckets + prio_buckets(ca) * 2; k++)
1769 SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);
1770
1771 for_each_bucket(b, ca) {
1772 c->need_gc = max(c->need_gc, bucket_gc_gen(b));
1773
1774 if (atomic_read(&b->pin))
1775 continue;
1776
1777 BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
1778
1779 if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
1780 c->avail_nbuckets++;
1781 }
1782
1783 mutex_unlock(&c->bucket_lock);
1784 }
1785
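/*
 * One full garbage collection pass: mark every bucket reachable from the
 * btree, then update bucket state so the allocator can reuse anything no
 * longer referenced, and kick off moving GC.
 */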
1786 static void bch_btree_gc(struct cache_set *c)
1787 {
1788 int ret;
1789 struct gc_stat stats;
1790 struct closure writes;
1791 struct btree_op op;
1792 uint64_t start_time = local_clock();
1793
1794 trace_bcache_gc_start(c);
1795
1796 memset(&stats, 0, sizeof(struct gc_stat));
1797 closure_init_stack(&writes);
1798 bch_btree_op_init(&op, SHRT_MAX);
1799
1800 btree_gc_start(c);
1801
1802
1803 do {
1804 ret = bcache_btree_root(gc_root, c, &op, &writes, &stats);
1805 closure_sync(&writes);
1806 cond_resched();
1807
1808 if (ret == -EAGAIN)
1809 schedule_timeout_interruptible(msecs_to_jiffies
1810 (GC_SLEEP_MS));
1811 else if (ret)
1812 pr_warn("gc failed!\n");
1813 } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
1814
1815 bch_btree_gc_finish(c);
1816 wake_up_allocators(c);
1817
1818 bch_time_stats_update(&c->btree_gc_time, start_time);
1819
1820 stats.key_bytes *= sizeof(uint64_t);
1821 stats.data <<= 9;
1822 bch_update_bucket_in_use(c, &stats);
1823 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1824
1825 trace_bcache_gc_end(c);
1826
1827 bch_moving_gc(c);
1828 }
1829
1830 static bool gc_should_run(struct cache_set *c)
1831 {
1832 struct cache *ca = c->cache;
1833
1834 if (ca->invalidate_needs_gc)
1835 return true;
1836
1837 if (atomic_read(&c->sectors_to_gc) < 0)
1838 return true;
1839
1840 return false;
1841 }
1842
1843 static int bch_gc_thread(void *arg)
1844 {
1845 struct cache_set *c = arg;
1846
1847 while (1) {
1848 wait_event_interruptible(c->gc_wait,
1849 kthread_should_stop() ||
1850 test_bit(CACHE_SET_IO_DISABLE, &c->flags) ||
1851 gc_should_run(c));
1852
1853 if (kthread_should_stop() ||
1854 test_bit(CACHE_SET_IO_DISABLE, &c->flags))
1855 break;
1856
1857 set_gc_sectors(c);
1858 bch_btree_gc(c);
1859 }
1860
1861 wait_for_kthread_stop();
1862 return 0;
1863 }
1864
1865 int bch_gc_thread_start(struct cache_set *c)
1866 {
1867 c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
1868 return PTR_ERR_OR_ZERO(c->gc_thread);
1869 }
1870
/* Initial partial gc */
1873 static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
1874 {
1875 int ret = 0;
1876 struct bkey *k, *p = NULL;
1877 struct btree_iter iter;
1878
1879 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
1880 bch_initial_mark_key(b->c, b->level, k);
1881
1882 bch_initial_mark_key(b->c, b->level + 1, &b->key);
1883
1884 if (b->level) {
1885 bch_btree_iter_init(&b->keys, &iter, NULL);
1886
1887 do {
1888 k = bch_btree_iter_next_filter(&iter, &b->keys,
1889 bch_ptr_bad);
1890 if (k) {
1891 btree_node_prefetch(b, k);
1892
1893
1894
1895
1896 b->c->gc_stats.nodes++;
1897 }
1898
1899 if (p)
1900 ret = bcache_btree(check_recurse, p, b, op);
1901
1902 p = k;
1903 } while (p && !ret);
1904 }
1905
1906 return ret;
1907 }
1908
1909
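/*
 * Worker for bch_btree_check(): threads share the root node's key index via
 * check_state->key_idx, each claiming the next top-level child and checking
 * that subtree, so the initial mark of all keys runs in parallel.
 */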
1910 static int bch_btree_check_thread(void *arg)
1911 {
1912 int ret;
1913 struct btree_check_info *info = arg;
1914 struct btree_check_state *check_state = info->state;
1915 struct cache_set *c = check_state->c;
1916 struct btree_iter iter;
1917 struct bkey *k, *p;
1918 int cur_idx, prev_idx, skip_nr;
1919
1920 k = p = NULL;
1921 cur_idx = prev_idx = 0;
1922 ret = 0;
1923
1924
1925 bch_btree_iter_init(&c->root->keys, &iter, NULL);
1926 k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
1927 BUG_ON(!k);
1928
1929 p = k;
1930 while (k) {
1931
1932
1933
1934
1935
1936 spin_lock(&check_state->idx_lock);
1937 cur_idx = check_state->key_idx;
1938 check_state->key_idx++;
1939 spin_unlock(&check_state->idx_lock);
1940
1941 skip_nr = cur_idx - prev_idx;
1942
1943 while (skip_nr) {
1944 k = bch_btree_iter_next_filter(&iter,
1945 &c->root->keys,
1946 bch_ptr_bad);
1947 if (k)
1948 p = k;
1949 else {
1950
1951
1952
1953
1954
1955 atomic_set(&check_state->enough, 1);
1956
1957 smp_mb__after_atomic();
1958 goto out;
1959 }
1960 skip_nr--;
1961 cond_resched();
1962 }
1963
1964 if (p) {
1965 struct btree_op op;
1966
1967 btree_node_prefetch(c->root, p);
1968 c->gc_stats.nodes++;
1969 bch_btree_op_init(&op, 0);
1970 ret = bcache_btree(check_recurse, p, c->root, &op);
1971 if (ret)
1972 goto out;
1973 }
1974 p = NULL;
1975 prev_idx = cur_idx;
1976 cond_resched();
1977 }
1978
1979 out:
1980 info->result = ret;
1981
1982 smp_mb__before_atomic();
1983 if (atomic_dec_and_test(&check_state->started))
1984 wake_up(&check_state->wait);
1985
1986 return ret;
1987 }
1988
1989
1990
1991 static int bch_btree_chkthread_nr(void)
1992 {
1993 int n = num_online_cpus()/2;
1994
1995 if (n == 0)
1996 n = 1;
1997 else if (n > BCH_BTR_CHKTHREAD_MAX)
1998 n = BCH_BTR_CHKTHREAD_MAX;
1999
2000 return n;
2001 }
2002
2003 int bch_btree_check(struct cache_set *c)
2004 {
2005 int ret = 0;
2006 int i;
2007 struct bkey *k = NULL;
2008 struct btree_iter iter;
2009 struct btree_check_state check_state;
2010
2011
2012 for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
2013 bch_initial_mark_key(c, c->root->level, k);
2014
2015 bch_initial_mark_key(c, c->root->level + 1, &c->root->key);
2016
2017 if (c->root->level == 0)
2018 return 0;
2019
2020 memset(&check_state, 0, sizeof(struct btree_check_state));
2021 check_state.c = c;
2022 check_state.total_threads = bch_btree_chkthread_nr();
2023 check_state.key_idx = 0;
2024 spin_lock_init(&check_state.idx_lock);
2025 atomic_set(&check_state.started, 0);
2026 atomic_set(&check_state.enough, 0);
2027 init_waitqueue_head(&check_state.wait);
2028
2029 rw_lock(0, c->root, c->root->level);
2030
2031
2032
2033
2034
2035
2036 for (i = 0; i < check_state.total_threads; i++) {
2037
2038 smp_mb__before_atomic();
2039 if (atomic_read(&check_state.enough))
2040 break;
2041
2042 check_state.infos[i].result = 0;
2043 check_state.infos[i].state = &check_state;
2044
2045 check_state.infos[i].thread =
2046 kthread_run(bch_btree_check_thread,
2047 &check_state.infos[i],
2048 "bch_btrchk[%d]", i);
2049 if (IS_ERR(check_state.infos[i].thread)) {
2050 pr_err("fails to run thread bch_btrchk[%d]\n", i);
2051 for (--i; i >= 0; i--)
2052 kthread_stop(check_state.infos[i].thread);
2053 ret = -ENOMEM;
2054 goto out;
2055 }
2056 atomic_inc(&check_state.started);
2057 }
2058
2059
2060
2061
2062 wait_event(check_state.wait, atomic_read(&check_state.started) == 0);
2063
2064 for (i = 0; i < check_state.total_threads; i++) {
2065 if (check_state.infos[i].result) {
2066 ret = check_state.infos[i].result;
2067 goto out;
2068 }
2069 }
2070
2071 out:
2072 rw_unlock(0, c->root);
2073 return ret;
2074 }
2075
2076 void bch_initial_gc_finish(struct cache_set *c)
2077 {
2078 struct cache *ca = c->cache;
2079 struct bucket *b;
2080
2081 bch_btree_gc_finish(c);
2082
2083 mutex_lock(&c->bucket_lock);
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094 for_each_bucket(b, ca) {
2095 if (fifo_full(&ca->free[RESERVE_PRIO]) &&
2096 fifo_full(&ca->free[RESERVE_BTREE]))
2097 break;
2098
2099 if (bch_can_invalidate_bucket(ca, b) &&
2100 !GC_MARK(b)) {
2101 __bch_invalidate_one_bucket(ca, b);
2102 if (!fifo_push(&ca->free[RESERVE_PRIO],
2103 b - ca->buckets))
2104 fifo_push(&ca->free[RESERVE_BTREE],
2105 b - ca->buckets);
2106 }
2107 }
2108
2109 mutex_unlock(&c->bucket_lock);
2110 }
2111
/* Btree insertion */
2114 static bool btree_insert_key(struct btree *b, struct bkey *k,
2115 struct bkey *replace_key)
2116 {
2117 unsigned int status;
2118
2119 BUG_ON(bkey_cmp(k, &b->key) > 0);
2120
2121 status = bch_btree_insert_key(&b->keys, k, replace_key);
2122 if (status != BTREE_INSERT_STATUS_NO_INSERT) {
2123 bch_check_keys(&b->keys, "%u for %s", status,
2124 replace_key ? "replace" : "insert");
2125
2126 trace_bcache_btree_insert_key(b, k, replace_key != NULL,
2127 status);
2128 return true;
2129 } else
2130 return false;
2131 }
2132
2133 static size_t insert_u64s_remaining(struct btree *b)
2134 {
2135 long ret = bch_btree_keys_u64s_remaining(&b->keys);
2136
/*
 * Might land in the middle of an existing extent and have to split it
 */
2140 if (b->keys.ops->is_extents)
2141 ret -= KEY_MAX_U64S;
2142
2143 return max(ret, 0L);
2144 }
2145
2146 static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
2147 struct keylist *insert_keys,
2148 struct bkey *replace_key)
2149 {
2150 bool ret = false;
2151 int oldsize = bch_count_data(&b->keys);
2152
2153 while (!bch_keylist_empty(insert_keys)) {
2154 struct bkey *k = insert_keys->keys;
2155
2156 if (bkey_u64s(k) > insert_u64s_remaining(b))
2157 break;
2158
2159 if (bkey_cmp(k, &b->key) <= 0) {
2160 if (!b->level)
2161 bkey_put(b->c, k);
2162
2163 ret |= btree_insert_key(b, k, replace_key);
2164 bch_keylist_pop_front(insert_keys);
2165 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
2166 BKEY_PADDED(key) temp;
2167 bkey_copy(&temp.key, insert_keys->keys);
2168
2169 bch_cut_back(&b->key, &temp.key);
2170 bch_cut_front(&b->key, insert_keys->keys);
2171
2172 ret |= btree_insert_key(b, &temp.key, replace_key);
2173 break;
2174 } else {
2175 break;
2176 }
2177 }
2178
2179 if (!ret)
2180 op->insert_collision = true;
2181
2182 BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
2183
2184 BUG_ON(bch_count_data(&b->keys) < oldsize);
2185 return ret;
2186 }
2187
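/*
 * Rewrite a node that has no room left for an insert. The existing keys
 * plus the new ones are written into one or two replacement nodes; if the
 * node being split is the root, a new root is allocated above them,
 * otherwise pointers to the new nodes are inserted into the parent.
 */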
2188 static int btree_split(struct btree *b, struct btree_op *op,
2189 struct keylist *insert_keys,
2190 struct bkey *replace_key)
2191 {
2192 bool split;
2193 struct btree *n1, *n2 = NULL, *n3 = NULL;
2194 uint64_t start_time = local_clock();
2195 struct closure cl;
2196 struct keylist parent_keys;
2197
2198 closure_init_stack(&cl);
2199 bch_keylist_init(&parent_keys);
2200
2201 if (btree_check_reserve(b, op)) {
2202 if (!b->level)
2203 return -EINTR;
2204 else
2205 WARN(1, "insufficient reserve for split\n");
2206 }
2207
2208 n1 = btree_node_alloc_replacement(b, op);
2209 if (IS_ERR(n1))
2210 goto err;
2211
2212 split = set_blocks(btree_bset_first(n1),
2213 block_bytes(n1->c->cache)) > (btree_blocks(b) * 4) / 5;
2214
2215 if (split) {
2216 unsigned int keys = 0;
2217
2218 trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
2219
2220 n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent);
2221 if (IS_ERR(n2))
2222 goto err_free1;
2223
2224 if (!b->parent) {
2225 n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL);
2226 if (IS_ERR(n3))
2227 goto err_free2;
2228 }
2229
2230 mutex_lock(&n1->write_lock);
2231 mutex_lock(&n2->write_lock);
2232
2233 bch_btree_insert_keys(n1, op, insert_keys, replace_key);
2234
/*
 * Has to be a linear search because we don't have an auxiliary
 * search tree yet
 */
2240 while (keys < (btree_bset_first(n1)->keys * 3) / 5)
2241 keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
2242 keys));
2243
2244 bkey_copy_key(&n1->key,
2245 bset_bkey_idx(btree_bset_first(n1), keys));
2246 keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));
2247
2248 btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
2249 btree_bset_first(n1)->keys = keys;
2250
2251 memcpy(btree_bset_first(n2)->start,
2252 bset_bkey_last(btree_bset_first(n1)),
2253 btree_bset_first(n2)->keys * sizeof(uint64_t));
2254
2255 bkey_copy_key(&n2->key, &b->key);
2256
2257 bch_keylist_add(&parent_keys, &n2->key);
2258 bch_btree_node_write(n2, &cl);
2259 mutex_unlock(&n2->write_lock);
2260 rw_unlock(true, n2);
2261 } else {
2262 trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
2263
2264 mutex_lock(&n1->write_lock);
2265 bch_btree_insert_keys(n1, op, insert_keys, replace_key);
2266 }
2267
2268 bch_keylist_add(&parent_keys, &n1->key);
2269 bch_btree_node_write(n1, &cl);
2270 mutex_unlock(&n1->write_lock);
2271
2272 if (n3) {
/* Depth increases, make a new root */
2274 mutex_lock(&n3->write_lock);
2275 bkey_copy_key(&n3->key, &MAX_KEY);
2276 bch_btree_insert_keys(n3, op, &parent_keys, NULL);
2277 bch_btree_node_write(n3, &cl);
2278 mutex_unlock(&n3->write_lock);
2279
2280 closure_sync(&cl);
2281 bch_btree_set_root(n3);
2282 rw_unlock(true, n3);
2283 } else if (!b->parent) {
/* Root filled up but didn't need to be split */
2285 closure_sync(&cl);
2286 bch_btree_set_root(n1);
2287 } else {
/* Split a non root node */
2289 closure_sync(&cl);
2290 make_btree_freeing_key(b, parent_keys.top);
2291 bch_keylist_push(&parent_keys);
2292
2293 bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
2294 BUG_ON(!bch_keylist_empty(&parent_keys));
2295 }
2296
2297 btree_node_free(b);
2298 rw_unlock(true, n1);
2299
2300 bch_time_stats_update(&b->c->btree_split_time, start_time);
2301
2302 return 0;
2303 err_free2:
2304 bkey_put(b->c, &n2->key);
2305 btree_node_free(n2);
2306 rw_unlock(true, n2);
2307 err_free1:
2308 bkey_put(b->c, &n1->key);
2309 btree_node_free(n1);
2310 rw_unlock(true, n1);
2311 err:
2312 WARN(1, "bcache: btree split failed (level %u)", b->level);
2313
2314 if (n3 == ERR_PTR(-EAGAIN) ||
2315 n2 == ERR_PTR(-EAGAIN) ||
2316 n1 == ERR_PTR(-EAGAIN))
2317 return -EAGAIN;
2318
2319 return -ENOMEM;
2320 }
2321
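/*
 * Insert keys into a locked node, splitting it if they don't fit. Leaf
 * inserts mark the node dirty (and take a journal pin); interior node
 * updates are written out immediately.
 */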
2322 static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2323 struct keylist *insert_keys,
2324 atomic_t *journal_ref,
2325 struct bkey *replace_key)
2326 {
2327 struct closure cl;
2328
2329 BUG_ON(b->level && replace_key);
2330
2331 closure_init_stack(&cl);
2332
2333 mutex_lock(&b->write_lock);
2334
2335 if (write_block(b) != btree_bset_last(b) &&
2336 b->keys.last_set_unwritten)
2337 bch_btree_init_next(b);
2338
2339 if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
2340 mutex_unlock(&b->write_lock);
2341 goto split;
2342 }
2343
2344 BUG_ON(write_block(b) != btree_bset_last(b));
2345
2346 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
2347 if (!b->level)
2348 bch_btree_leaf_dirty(b, journal_ref);
2349 else
2350 bch_btree_node_write(b, &cl);
2351 }
2352
2353 mutex_unlock(&b->write_lock);
2354
/* wait for btree node write if necessary, after unlock */
2356 closure_sync(&cl);
2357
2358 return 0;
2359 split:
2360 if (current->bio_list) {
2361 op->lock = b->c->root->level + 1;
2362 return -EAGAIN;
2363 } else if (op->lock <= b->c->root->level) {
2364 op->lock = b->c->root->level + 1;
2365 return -EINTR;
2366 } else {
/* Invalidated all iterators */
2368 int ret = btree_split(b, op, insert_keys, replace_key);
2369
2370 if (bch_keylist_empty(insert_keys))
2371 return 0;
2372 else if (!ret)
2373 return -EINTR;
2374 return ret;
2375 }
2376 }
2377
2378 int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
2379 struct bkey *check_key)
2380 {
2381 int ret = -EINTR;
2382 uint64_t btree_ptr = b->key.ptr[0];
2383 unsigned long seq = b->seq;
2384 struct keylist insert;
2385 bool upgrade = op->lock == -1;
2386
2387 bch_keylist_init(&insert);
2388
2389 if (upgrade) {
2390 rw_unlock(false, b);
2391 rw_lock(true, b, b->level);
2392
2393 if (b->key.ptr[0] != btree_ptr ||
2394 b->seq != seq + 1) {
2395 op->lock = b->level;
2396 goto out;
2397 }
2398 }
2399
2400 SET_KEY_PTRS(check_key, 1);
2401 get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));
2402
2403 SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);
2404
2405 bch_keylist_add(&insert, check_key);
2406
2407 ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);
2408
2409 BUG_ON(!ret && !bch_keylist_empty(&insert));
2410 out:
2411 if (upgrade)
2412 downgrade_write(&b->lock);
2413 return ret;
2414 }
2415
2416 struct btree_insert_op {
2417 struct btree_op op;
2418 struct keylist *keys;
2419 atomic_t *journal_ref;
2420 struct bkey *replace_key;
2421 };
2422
2423 static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
2424 {
2425 struct btree_insert_op *op = container_of(b_op,
2426 struct btree_insert_op, op);
2427
2428 int ret = bch_btree_insert_node(b, &op->op, op->keys,
2429 op->journal_ref, op->replace_key);
2430 if (ret && !bch_keylist_empty(op->keys))
2431 return ret;
2432 else
2433 return MAP_DONE;
2434 }
2435
2436 int bch_btree_insert(struct cache_set *c, struct keylist *keys,
2437 atomic_t *journal_ref, struct bkey *replace_key)
2438 {
2439 struct btree_insert_op op;
2440 int ret = 0;
2441
2442 BUG_ON(current->bio_list);
2443 BUG_ON(bch_keylist_empty(keys));
2444
2445 bch_btree_op_init(&op.op, 0);
2446 op.keys = keys;
2447 op.journal_ref = journal_ref;
2448 op.replace_key = replace_key;
2449
2450 while (!ret && !bch_keylist_empty(keys)) {
2451 op.op.lock = 0;
2452 ret = bch_btree_map_leaf_nodes(&op.op, c,
2453 &START_KEY(keys->keys),
2454 btree_insert_fn);
2455 }
2456
2457 if (ret) {
2458 struct bkey *k;
2459
2460 pr_err("error %i\n", ret);
2461
2462 while ((k = bch_keylist_pop(keys)))
2463 bkey_put(c, k);
2464 } else if (op.op.insert_collision)
2465 ret = -ESRCH;
2466
2467 return ret;
2468 }
2469
void bch_btree_set_root(struct btree *b)
{
	unsigned int i;
	struct closure cl;

	closure_init_stack(&cl);

	trace_bcache_btree_set_root(b);

	BUG_ON(!b->written);

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);

	mutex_lock(&b->c->bucket_lock);
	list_del_init(&b->list);
	mutex_unlock(&b->c->bucket_lock);

	b->c->root = b;

	bch_journal_meta(b->c, &cl);
	closure_sync(&cl);
}

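/* Map across nodes or keys */

/*
 * Recursively walk the subtree rooted at @b, calling @fn on leaf nodes and,
 * if @flags is MAP_ALL_NODES, on interior nodes as well (after their
 * children).  @from limits the walk to keys at or after it and is cleared
 * after the first child, so the rest of the subtree is walked from its
 * start.
 */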
static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
				       struct bkey *from,
				       btree_map_nodes_fn *fn, int flags)
{
	int ret = MAP_CONTINUE;

	if (b->level) {
		struct bkey *k;
		struct btree_iter iter;

		bch_btree_iter_init(&b->keys, &iter, from);

		while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
						       bch_ptr_bad))) {
			ret = bcache_btree(map_nodes_recurse, k, b,
					   op, from, fn, flags);
			from = NULL;

			if (ret != MAP_CONTINUE)
				return ret;
		}
	}

	if (!b->level || flags == MAP_ALL_NODES)
		ret = fn(op, b);

	return ret;
}

int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
{
	return bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags);
}

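/*
 * Like the node walk above, but @fn is called on individual keys in leaf
 * nodes.  With MAP_END_KEY set, @fn is additionally called with a zero-size
 * key at the end of each leaf (refill_keybuf_fn() below uses this to advance
 * last_scanned across regions with no matching keys).
 */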
int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
			       struct bkey *from, btree_map_keys_fn *fn,
			       int flags)
{
	int ret = MAP_CONTINUE;
	struct bkey *k;
	struct btree_iter iter;

	bch_btree_iter_init(&b->keys, &iter, from);

	while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
		ret = !b->level
			? fn(op, b, k)
			: bcache_btree(map_keys_recurse, k,
				       b, op, from, fn, flags);
		from = NULL;

		if (ret != MAP_CONTINUE)
			return ret;
	}

	if (!b->level && (flags & MAP_END_KEY))
		ret = fn(op, b, &KEY(KEY_INODE(&b->key),
				     KEY_OFFSET(&b->key), 0));

	return ret;
}

int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
		       struct bkey *from, btree_map_keys_fn *fn, int flags)
{
	return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags);
}

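/* Keybuf code */
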
static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
{
	/* Overlapping keys compare equal */
	if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
		return -1;
	if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
		return 1;
	return 0;
}

static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
					    struct keybuf_key *r)
{
	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
}

struct refill {
	struct btree_op	op;
	unsigned int	nr_found;
	struct keybuf	*buf;
	struct bkey	*end;
	keybuf_pred_fn	*pred;
};

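/*
 * Map-keys callback for bch_refill_keybuf(): copy keys that match the
 * caller's predicate into the keybuf's rbtree, stopping once we pass
 * refill->end or the keybuf's freelist runs dry.  buf->last_scanned is
 * advanced as keys are examined, matched or not.
 */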
static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
			    struct bkey *k)
{
	struct refill *refill = container_of(op, struct refill, op);
	struct keybuf *buf = refill->buf;
	int ret = MAP_CONTINUE;

	if (bkey_cmp(k, refill->end) > 0) {
		ret = MAP_DONE;
		goto out;
	}

	if (!KEY_SIZE(k))
		goto out;

	if (refill->pred(buf, k)) {
		struct keybuf_key *w;

		spin_lock(&buf->lock);

		w = array_alloc(&buf->freelist);
		if (!w) {
			spin_unlock(&buf->lock);
			return MAP_DONE;
		}

		w->private = NULL;
		bkey_copy(&w->key, k);

		if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
			array_free(&buf->freelist, w);
		else
			refill->nr_found++;

		if (array_freelist_empty(&buf->freelist))
			ret = MAP_DONE;

		spin_unlock(&buf->lock);
	}
out:
	buf->last_scanned = *k;
	return ret;
}

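/*
 * Scan the btree from buf->last_scanned towards @end, filling @buf with
 * keys that satisfy @pred, then record the span of keys currently held in
 * buf->start and buf->end (MAX_KEY for both if the buffer is empty).
 */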
void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
		       struct bkey *end, keybuf_pred_fn *pred)
{
	struct bkey start = buf->last_scanned;
	struct refill refill;

	cond_resched();

	bch_btree_op_init(&refill.op, -1);
	refill.nr_found	= 0;
	refill.buf	= buf;
	refill.end	= end;
	refill.pred	= pred;

	bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
			   refill_keybuf_fn, MAP_END_KEY);

	trace_bcache_keyscan(refill.nr_found,
			     KEY_INODE(&start), KEY_OFFSET(&start),
			     KEY_INODE(&buf->last_scanned),
			     KEY_OFFSET(&buf->last_scanned));

	spin_lock(&buf->lock);

	if (!RB_EMPTY_ROOT(&buf->keys)) {
		struct keybuf_key *w;

		w = RB_FIRST(&buf->keys, struct keybuf_key, node);
		buf->start	= START_KEY(&w->key);

		w = RB_LAST(&buf->keys, struct keybuf_key, node);
		buf->end	= w->key;
	} else {
		buf->start	= MAX_KEY;
		buf->end	= MAX_KEY;
	}

	spin_unlock(&buf->lock);
}

static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	rb_erase(&w->node, &buf->keys);
	array_free(&buf->freelist, w);
}

void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	spin_lock(&buf->lock);
	__bch_keybuf_del(buf, w);
	spin_unlock(&buf->lock);
}

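/*
 * Return true if any key in @buf overlapping [start, end) is currently in
 * use (w->private set); overlapping keys that are not in use are dropped
 * from the buffer as we go.
 */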
bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
				  struct bkey *end)
{
	bool ret = false;
	struct keybuf_key *p, *w, s;

	s.key = *start;

	if (bkey_cmp(end, &buf->start) <= 0 ||
	    bkey_cmp(start, &buf->end) >= 0)
		return false;

	spin_lock(&buf->lock);
	w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);

	while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
		p = w;
		w = RB_NEXT(w, node);

		if (p->private)
			ret = true;
		else
			__bch_keybuf_del(buf, p);
	}

	spin_unlock(&buf->lock);
	return ret;
}

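/*
 * Return the first key in @buf that nobody has claimed yet, marking it as
 * in use by setting ->private; NULL if every buffered key is already taken.
 */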
struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
{
	struct keybuf_key *w;

	spin_lock(&buf->lock);

	w = RB_FIRST(&buf->keys, struct keybuf_key, node);

	while (w && w->private)
		w = RB_NEXT(w, node);

	if (w)
		w->private = ERR_PTR(-EINTR);

	spin_unlock(&buf->lock);
	return w;
}

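/*
 * Like bch_keybuf_next(), but refill the keybuf from the btree whenever it
 * runs empty, until either a key is found or the scan has passed @end.
 */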
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
					  struct keybuf *buf,
					  struct bkey *end,
					  keybuf_pred_fn *pred)
{
	struct keybuf_key *ret;

	while (1) {
		ret = bch_keybuf_next(buf);
		if (ret)
			break;

		if (bkey_cmp(&buf->last_scanned, end) >= 0) {
			pr_debug("scan finished\n");
			break;
		}

		bch_refill_keybuf(c, buf, end, pred);
	}

	return ret;
}

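/* One-time setup: empty rbtree, initialised freelist, last_scanned at MAX_KEY */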
void bch_keybuf_init(struct keybuf *buf)
{
	buf->last_scanned	= MAX_KEY;
	buf->keys		= RB_ROOT;

	spin_lock_init(&buf->lock);
	array_allocator_init(&buf->freelist);
}

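/* Module setup/teardown for the btree I/O workqueue */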
void bch_btree_exit(void)
{
	if (btree_io_wq)
		destroy_workqueue(btree_io_wq);
}

int __init bch_btree_init(void)
{
	btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0);
	if (!btree_io_wq)
		return -ENOMEM;

	return 0;
}