0006
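/*
 * dm-thin.c - device-mapper thin provisioning target ("thin").
 *
 * Thin devices allocate their data blocks on demand from a shared pool;
 * the block mappings are kept on a separate metadata device managed via
 * dm-thin-metadata.h.
 */
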
0007 #include "dm-thin-metadata.h"
0008 #include "dm-bio-prison-v1.h"
0009 #include "dm.h"
0010
0011 #include <linux/device-mapper.h>
0012 #include <linux/dm-io.h>
0013 #include <linux/dm-kcopyd.h>
0014 #include <linux/jiffies.h>
0015 #include <linux/log2.h>
0016 #include <linux/list.h>
0017 #include <linux/rculist.h>
0018 #include <linux/init.h>
0019 #include <linux/module.h>
0020 #include <linux/slab.h>
0021 #include <linux/vmalloc.h>
0022 #include <linux/sort.h>
0023 #include <linux/rbtree.h>
0024
0025 #define DM_MSG_PREFIX "thin"
0026
0027
0028
0029
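/*
 * Tunable constants.
 */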
0030 #define ENDIO_HOOK_POOL_SIZE 1024
0031 #define MAPPING_POOL_SIZE 1024
0032 #define COMMIT_PERIOD HZ
0033 #define NO_SPACE_TIMEOUT_SECS 60
0034
0035 static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;
0036
0037 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
0038 "A percentage of time allocated for copy on write");
0039
0040
0041
0042
0043
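/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */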
0044 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
0045 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
0046
0047
0048
0049
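/*
 * Device id is restricted to 24 bits.
 */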
0050 #define MAX_DEV_ID ((1 << 24) - 1)
0051
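/*
 * A brief sketch of how the pool handles I/O (see the code below):
 *
 * Bios that need provisioning, copy-on-write or discard handling are
 * detained in a bio prison, keyed by either a virtual (thin device)
 * block range or a physical (data device) block range, so only one such
 * operation is in flight per block at a time.  New blocks are zeroed or
 * copied with kcopyd, and the resulting mapping is only inserted into
 * the pool metadata once the preparation steps (quiescing, copying,
 * zeroing) have completed; the held bios are then released and remapped.
 */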
0114
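/*
 * Key building.  A key identifies a range of blocks in either the
 * virtual (thin device) or physical (data device) address space and is
 * used to detain bios in the bio prison.
 */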
0115 enum lock_space {
0116 VIRTUAL,
0117 PHYSICAL
0118 };
0119
0120 static void build_key(struct dm_thin_device *td, enum lock_space ls,
0121 dm_block_t b, dm_block_t e, struct dm_cell_key *key)
0122 {
0123 key->virtual = (ls == VIRTUAL);
0124 key->dev = dm_thin_dev_id(td);
0125 key->block_begin = b;
0126 key->block_end = e;
0127 }
0128
0129 static void build_data_key(struct dm_thin_device *td, dm_block_t b,
0130 struct dm_cell_key *key)
0131 {
0132 build_key(td, PHYSICAL, b, b + 1llu, key);
0133 }
0134
0135 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
0136 struct dm_cell_key *key)
0137 {
0138 build_key(td, VIRTUAL, b, b + 1llu, key);
0139 }
0140
0141
0142
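/*
 * Worker throttling: once the worker has been busy for longer than
 * THROTTLE_THRESHOLD, throttle_work_update() takes the rw_semaphore for
 * write, blocking throttle_lock() readers until throttle_work_complete()
 * releases it.
 */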
0143 #define THROTTLE_THRESHOLD (1 * HZ)
0144
0145 struct throttle {
0146 struct rw_semaphore lock;
0147 unsigned long threshold;
0148 bool throttle_applied;
0149 };
0150
0151 static void throttle_init(struct throttle *t)
0152 {
0153 init_rwsem(&t->lock);
0154 t->throttle_applied = false;
0155 }
0156
0157 static void throttle_work_start(struct throttle *t)
0158 {
0159 t->threshold = jiffies + THROTTLE_THRESHOLD;
0160 }
0161
0162 static void throttle_work_update(struct throttle *t)
0163 {
0164 if (!t->throttle_applied && time_is_before_jiffies(t->threshold)) {
0165 down_write(&t->lock);
0166 t->throttle_applied = true;
0167 }
0168 }
0169
0170 static void throttle_work_complete(struct throttle *t)
0171 {
0172 if (t->throttle_applied) {
0173 t->throttle_applied = false;
0174 up_write(&t->lock);
0175 }
0176 }
0177
0178 static void throttle_lock(struct throttle *t)
0179 {
0180 down_read(&t->lock);
0181 }
0182
0183 static void throttle_unlock(struct throttle *t)
0184 {
0185 up_read(&t->lock);
0186 }
0187
0188
0189
0190
0191
0192
0193
0194
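/*
 * A pool device ties together a metadata device and a data device, and
 * is shared by the thin devices stacked on top of it.
 */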
0195 struct dm_thin_new_mapping;
0196
0197
0198
0199
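/*
 * The pool runs in a number of modes, ordered from least to most
 * degraded so that modes can be compared numerically (see commit()).
 */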
0200 enum pool_mode {
0201 PM_WRITE,
0202 PM_OUT_OF_DATA_SPACE,
0203
0204
0205
0206
0207 PM_OUT_OF_METADATA_SPACE,
0208 PM_READ_ONLY,
0209
0210 PM_FAIL,
0211 };
0212
0213 struct pool_features {
0214 enum pool_mode mode;
0215
0216 bool zero_new_blocks:1;
0217 bool discard_enabled:1;
0218 bool discard_passdown:1;
0219 bool error_if_no_space:1;
0220 };
0221
0222 struct thin_c;
0223 typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
0224 typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
0225 typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
0226
0227 #define CELL_SORT_ARRAY_SIZE 8192
0228
0229 struct pool {
0230 struct list_head list;
0231 struct dm_target *ti;
0232
0233 struct mapped_device *pool_md;
0234 struct block_device *data_dev;
0235 struct block_device *md_dev;
0236 struct dm_pool_metadata *pmd;
0237
0238 dm_block_t low_water_blocks;
0239 uint32_t sectors_per_block;
0240 int sectors_per_block_shift;
0241
0242 struct pool_features pf;
0243 bool low_water_triggered:1;
0244 bool suspended:1;
0245 bool out_of_data_space:1;
0246
0247 struct dm_bio_prison *prison;
0248 struct dm_kcopyd_client *copier;
0249
0250 struct work_struct worker;
0251 struct workqueue_struct *wq;
0252 struct throttle throttle;
0253 struct delayed_work waker;
0254 struct delayed_work no_space_timeout;
0255
0256 unsigned long last_commit_jiffies;
0257 unsigned ref_count;
0258
0259 spinlock_t lock;
0260 struct bio_list deferred_flush_bios;
0261 struct bio_list deferred_flush_completions;
0262 struct list_head prepared_mappings;
0263 struct list_head prepared_discards;
0264 struct list_head prepared_discards_pt2;
0265 struct list_head active_thins;
0266
0267 struct dm_deferred_set *shared_read_ds;
0268 struct dm_deferred_set *all_io_ds;
0269
0270 struct dm_thin_new_mapping *next_mapping;
0271
0272 process_bio_fn process_bio;
0273 process_bio_fn process_discard;
0274
0275 process_cell_fn process_cell;
0276 process_cell_fn process_discard_cell;
0277
0278 process_mapping_fn process_prepared_mapping;
0279 process_mapping_fn process_prepared_discard;
0280 process_mapping_fn process_prepared_discard_pt2;
0281
0282 struct dm_bio_prison_cell **cell_sort_array;
0283
0284 mempool_t mapping_pool;
0285 };
0286
0287 static void metadata_operation_failed(struct pool *pool, const char *op, int r);
0288
0289 static enum pool_mode get_pool_mode(struct pool *pool)
0290 {
0291 return pool->pf.mode;
0292 }
0293
0294 static void notify_of_pool_mode_change(struct pool *pool)
0295 {
0296 const char *descs[] = {
0297 "write",
0298 "out-of-data-space",
0299 "read-only",
0300 "read-only",
0301 "fail"
0302 };
0303 const char *extra_desc = NULL;
0304 enum pool_mode mode = get_pool_mode(pool);
0305
0306 if (mode == PM_OUT_OF_DATA_SPACE) {
0307 if (!pool->pf.error_if_no_space)
0308 extra_desc = " (queue IO)";
0309 else
0310 extra_desc = " (error IO)";
0311 }
0312
0313 dm_table_event(pool->ti->table);
0314 DMINFO("%s: switching pool to %s%s mode",
0315 dm_device_name(pool->pool_md),
0316 descs[(int)mode], extra_desc ? : "");
0317 }
0318
0319
0320
0321
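/*
 * Target context for a pool.
 */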
0322 struct pool_c {
0323 struct dm_target *ti;
0324 struct pool *pool;
0325 struct dm_dev *data_dev;
0326 struct dm_dev *metadata_dev;
0327
0328 dm_block_t low_water_blocks;
0329 struct pool_features requested_pf;
0330 struct pool_features adjusted_pf;
0331 };
0332
0333
0334
0335
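/*
 * Target context for a thin.
 */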
0336 struct thin_c {
0337 struct list_head list;
0338 struct dm_dev *pool_dev;
0339 struct dm_dev *origin_dev;
0340 sector_t origin_size;
0341 dm_thin_id dev_id;
0342
0343 struct pool *pool;
0344 struct dm_thin_device *td;
0345 struct mapped_device *thin_md;
0346
0347 bool requeue_mode:1;
0348 spinlock_t lock;
0349 struct list_head deferred_cells;
0350 struct bio_list deferred_bio_list;
0351 struct bio_list retry_on_resume_list;
0352 struct rb_root sort_bio_list;
0353
0354
0355
0356
0357
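/*
 * Ensures the thin device is not destroyed while the worker is still
 * iterating the pool's active_thins list (see get_first_thin() and
 * get_next_thin()).
 */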
0358 refcount_t refcount;
0359 struct completion can_destroy;
0360 };
0361
0362
0363
0364 static bool block_size_is_power_of_two(struct pool *pool)
0365 {
0366 return pool->sectors_per_block_shift >= 0;
0367 }
0368
0369 static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
0370 {
0371 return block_size_is_power_of_two(pool) ?
0372 (b << pool->sectors_per_block_shift) :
0373 (b * pool->sectors_per_block);
0374 }
0375
0376
0377
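/*
 * Helper for passing a discard down to the data device as one or more
 * bios chained to the parent (original) discard bio.
 */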
0378 struct discard_op {
0379 struct thin_c *tc;
0380 struct blk_plug plug;
0381 struct bio *parent_bio;
0382 struct bio *bio;
0383 };
0384
0385 static void begin_discard(struct discard_op *op, struct thin_c *tc, struct bio *parent)
0386 {
0387 BUG_ON(!parent);
0388
0389 op->tc = tc;
0390 blk_start_plug(&op->plug);
0391 op->parent_bio = parent;
0392 op->bio = NULL;
0393 }
0394
0395 static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e)
0396 {
0397 struct thin_c *tc = op->tc;
0398 sector_t s = block_to_sectors(tc->pool, data_b);
0399 sector_t len = block_to_sectors(tc->pool, data_e - data_b);
0400
0401 return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOWAIT,
0402 &op->bio);
0403 }
0404
0405 static void end_discard(struct discard_op *op, int r)
0406 {
0407 if (op->bio) {
0408
0409
0410
0411
0412 bio_chain(op->bio, op->parent_bio);
0413 bio_set_op_attrs(op->bio, REQ_OP_DISCARD, 0);
0414 submit_bio(op->bio);
0415 }
0416
0417 blk_finish_plug(&op->plug);
0418
0419
0420
0421
0422
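/*
 * Even if r is set there may be sub-discards still in flight, so record
 * any error on the parent bio and let bio_endio() complete it once the
 * chained bios have finished.
 */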
0423 if (r && !op->parent_bio->bi_status)
0424 op->parent_bio->bi_status = errno_to_blk_status(r);
0425 bio_endio(op->parent_bio);
0426 }
0427
0428
0429
0430
0431
0432
0433
0434 static void wake_worker(struct pool *pool)
0435 {
0436 queue_work(pool->wq, &pool->worker);
0437 }
0438
0439
0440
0441 static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
0442 struct dm_bio_prison_cell **cell_result)
0443 {
0444 int r;
0445 struct dm_bio_prison_cell *cell_prealloc;
0446
0447
0448
0449
0450
0451 cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
0452
0453 r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
0454 if (r)
0455
0456
0457
0458
0459 dm_bio_prison_free_cell(pool->prison, cell_prealloc);
0460
0461 return r;
0462 }
0463
0464 static void cell_release(struct pool *pool,
0465 struct dm_bio_prison_cell *cell,
0466 struct bio_list *bios)
0467 {
0468 dm_cell_release(pool->prison, cell, bios);
0469 dm_bio_prison_free_cell(pool->prison, cell);
0470 }
0471
0472 static void cell_visit_release(struct pool *pool,
0473 void (*fn)(void *, struct dm_bio_prison_cell *),
0474 void *context,
0475 struct dm_bio_prison_cell *cell)
0476 {
0477 dm_cell_visit_release(pool->prison, fn, context, cell);
0478 dm_bio_prison_free_cell(pool->prison, cell);
0479 }
0480
0481 static void cell_release_no_holder(struct pool *pool,
0482 struct dm_bio_prison_cell *cell,
0483 struct bio_list *bios)
0484 {
0485 dm_cell_release_no_holder(pool->prison, cell, bios);
0486 dm_bio_prison_free_cell(pool->prison, cell);
0487 }
0488
0489 static void cell_error_with_code(struct pool *pool,
0490 struct dm_bio_prison_cell *cell, blk_status_t error_code)
0491 {
0492 dm_cell_error(pool->prison, cell, error_code);
0493 dm_bio_prison_free_cell(pool->prison, cell);
0494 }
0495
0496 static blk_status_t get_pool_io_error_code(struct pool *pool)
0497 {
0498 return pool->out_of_data_space ? BLK_STS_NOSPC : BLK_STS_IOERR;
0499 }
0500
0501 static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
0502 {
0503 cell_error_with_code(pool, cell, get_pool_io_error_code(pool));
0504 }
0505
0506 static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
0507 {
0508 cell_error_with_code(pool, cell, 0);
0509 }
0510
0511 static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
0512 {
0513 cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE);
0514 }
0515
0516
0517
0518
0519
0520
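/*
 * A global list of pools that uses a struct mapped_device as a key.
 */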
0521 static struct dm_thin_pool_table {
0522 struct mutex mutex;
0523 struct list_head pools;
0524 } dm_thin_pool_table;
0525
0526 static void pool_table_init(void)
0527 {
0528 mutex_init(&dm_thin_pool_table.mutex);
0529 INIT_LIST_HEAD(&dm_thin_pool_table.pools);
0530 }
0531
0532 static void pool_table_exit(void)
0533 {
0534 mutex_destroy(&dm_thin_pool_table.mutex);
0535 }
0536
0537 static void __pool_table_insert(struct pool *pool)
0538 {
0539 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
0540 list_add(&pool->list, &dm_thin_pool_table.pools);
0541 }
0542
0543 static void __pool_table_remove(struct pool *pool)
0544 {
0545 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
0546 list_del(&pool->list);
0547 }
0548
0549 static struct pool *__pool_table_lookup(struct mapped_device *md)
0550 {
0551 struct pool *pool = NULL, *tmp;
0552
0553 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
0554
0555 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
0556 if (tmp->pool_md == md) {
0557 pool = tmp;
0558 break;
0559 }
0560 }
0561
0562 return pool;
0563 }
0564
0565 static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
0566 {
0567 struct pool *pool = NULL, *tmp;
0568
0569 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
0570
0571 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
0572 if (tmp->md_dev == md_dev) {
0573 pool = tmp;
0574 break;
0575 }
0576 }
0577
0578 return pool;
0579 }
0580
0581
0582
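/*
 * Per-bio context, kept in the bio's per-bio-data (see the
 * dm_per_bio_data() callers below).
 */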
0583 struct dm_thin_endio_hook {
0584 struct thin_c *tc;
0585 struct dm_deferred_entry *shared_read_entry;
0586 struct dm_deferred_entry *all_io_entry;
0587 struct dm_thin_new_mapping *overwrite_mapping;
0588 struct rb_node rb_node;
0589 struct dm_bio_prison_cell *cell;
0590 };
0591
0592 static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
0593 {
0594 bio_list_merge(bios, master);
0595 bio_list_init(master);
0596 }
0597
0598 static void error_bio_list(struct bio_list *bios, blk_status_t error)
0599 {
0600 struct bio *bio;
0601
0602 while ((bio = bio_list_pop(bios))) {
0603 bio->bi_status = error;
0604 bio_endio(bio);
0605 }
0606 }
0607
0608 static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
0609 blk_status_t error)
0610 {
0611 struct bio_list bios;
0612
0613 bio_list_init(&bios);
0614
0615 spin_lock_irq(&tc->lock);
0616 __merge_bio_list(&bios, master);
0617 spin_unlock_irq(&tc->lock);
0618
0619 error_bio_list(&bios, error);
0620 }
0621
0622 static void requeue_deferred_cells(struct thin_c *tc)
0623 {
0624 struct pool *pool = tc->pool;
0625 struct list_head cells;
0626 struct dm_bio_prison_cell *cell, *tmp;
0627
0628 INIT_LIST_HEAD(&cells);
0629
0630 spin_lock_irq(&tc->lock);
0631 list_splice_init(&tc->deferred_cells, &cells);
0632 spin_unlock_irq(&tc->lock);
0633
0634 list_for_each_entry_safe(cell, tmp, &cells, user_list)
0635 cell_requeue(pool, cell);
0636 }
0637
0638 static void requeue_io(struct thin_c *tc)
0639 {
0640 struct bio_list bios;
0641
0642 bio_list_init(&bios);
0643
0644 spin_lock_irq(&tc->lock);
0645 __merge_bio_list(&bios, &tc->deferred_bio_list);
0646 __merge_bio_list(&bios, &tc->retry_on_resume_list);
0647 spin_unlock_irq(&tc->lock);
0648
0649 error_bio_list(&bios, BLK_STS_DM_REQUEUE);
0650 requeue_deferred_cells(tc);
0651 }
0652
0653 static void error_retry_list_with_code(struct pool *pool, blk_status_t error)
0654 {
0655 struct thin_c *tc;
0656
0657 rcu_read_lock();
0658 list_for_each_entry_rcu(tc, &pool->active_thins, list)
0659 error_thin_bio_list(tc, &tc->retry_on_resume_list, error);
0660 rcu_read_unlock();
0661 }
0662
0663 static void error_retry_list(struct pool *pool)
0664 {
0665 error_retry_list_with_code(pool, get_pool_io_error_code(pool));
0666 }
0667
0668
0669
0670
0671
0672
0673
0674
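/*
 * This section contains the logic for processing a thin device's I/O:
 * queueing, remapping to the data device, breaking sharing, etc.
 *
 * get_bio_block() maps a bio's sector to a virtual block on the thin
 * device; e.g. (assuming 128 sectors, i.e. 64KiB, per block) a bio at
 * sector 1000 falls in virtual block 1000 >> 7 = 7.
 */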
0675 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
0676 {
0677 struct pool *pool = tc->pool;
0678 sector_t block_nr = bio->bi_iter.bi_sector;
0679
0680 if (block_size_is_power_of_two(pool))
0681 block_nr >>= pool->sectors_per_block_shift;
0682 else
0683 (void) sector_div(block_nr, pool->sectors_per_block);
0684
0685 return block_nr;
0686 }
0687
0688
0689
0690
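/*
 * Returns the range of _complete_ blocks covered by this bio: the start
 * is rounded up and the end rounded down.
 */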
0691 static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
0692 dm_block_t *begin, dm_block_t *end)
0693 {
0694 struct pool *pool = tc->pool;
0695 sector_t b = bio->bi_iter.bi_sector;
0696 sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
0697
0698 b += pool->sectors_per_block - 1ull;
0699
0700 if (block_size_is_power_of_two(pool)) {
0701 b >>= pool->sectors_per_block_shift;
0702 e >>= pool->sectors_per_block_shift;
0703 } else {
0704 (void) sector_div(b, pool->sectors_per_block);
0705 (void) sector_div(e, pool->sectors_per_block);
0706 }
0707
0708 if (e < b)
0709
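/* Can happen when the bio lies entirely within a single block. */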
0710 e = b;
0711
0712 *begin = b;
0713 *end = e;
0714 }
0715
0716 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
0717 {
0718 struct pool *pool = tc->pool;
0719 sector_t bi_sector = bio->bi_iter.bi_sector;
0720
0721 bio_set_dev(bio, tc->pool_dev->bdev);
0722 if (block_size_is_power_of_two(pool))
0723 bio->bi_iter.bi_sector =
0724 (block << pool->sectors_per_block_shift) |
0725 (bi_sector & (pool->sectors_per_block - 1));
0726 else
0727 bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
0728 sector_div(bi_sector, pool->sectors_per_block);
0729 }
0730
0731 static void remap_to_origin(struct thin_c *tc, struct bio *bio)
0732 {
0733 bio_set_dev(bio, tc->origin_dev->bdev);
0734 }
0735
0736 static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
0737 {
0738 return op_is_flush(bio->bi_opf) &&
0739 dm_thin_changed_this_transaction(tc->td);
0740 }
0741
0742 static void inc_all_io_entry(struct pool *pool, struct bio *bio)
0743 {
0744 struct dm_thin_endio_hook *h;
0745
0746 if (bio_op(bio) == REQ_OP_DISCARD)
0747 return;
0748
0749 h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
0750 h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
0751 }
0752
0753 static void issue(struct thin_c *tc, struct bio *bio)
0754 {
0755 struct pool *pool = tc->pool;
0756
0757 if (!bio_triggers_commit(tc, bio)) {
0758 dm_submit_bio_remap(bio, NULL);
0759 return;
0760 }
0761
0762
0763
0764
0765
0766
0767 if (dm_thin_aborted_changes(tc->td)) {
0768 bio_io_error(bio);
0769 return;
0770 }
0771
0772
0773
0774
0775
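/*
 * Batch together bios that trigger commits; a single commit is then
 * issued for all of them in process_deferred_bios().
 */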
0776 spin_lock_irq(&pool->lock);
0777 bio_list_add(&pool->deferred_flush_bios, bio);
0778 spin_unlock_irq(&pool->lock);
0779 }
0780
0781 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
0782 {
0783 remap_to_origin(tc, bio);
0784 issue(tc, bio);
0785 }
0786
0787 static void remap_and_issue(struct thin_c *tc, struct bio *bio,
0788 dm_block_t block)
0789 {
0790 remap(tc, bio, block);
0791 issue(tc, bio);
0792 }
0793
0794
0795
0796
0797
0798
0799 struct dm_thin_new_mapping {
0800 struct list_head list;
0801
0802 bool pass_discard:1;
0803 bool maybe_shared:1;
0804
0805
0806
0807
0808
0809
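/*
 * Counts the outstanding preparation steps (quiescing, copying,
 * zeroing).  When it drops to zero the mapping is moved onto the pool's
 * prepared_mappings list for the worker to commit.
 */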
0810 atomic_t prepare_actions;
0811
0812 blk_status_t status;
0813 struct thin_c *tc;
0814 dm_block_t virt_begin, virt_end;
0815 dm_block_t data_block;
0816 struct dm_bio_prison_cell *cell;
0817
0818
0819
0820
0821
0822
0823
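/*
 * If a bio covers the whole block we skip the copy/zero and simply hook
 * the bio's endio (overwrite_endio).  The bio remains the cell holder,
 * so care is taken not to issue it twice.
 */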
0824 struct bio *bio;
0825 bio_end_io_t *saved_bi_end_io;
0826 };
0827
0828 static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
0829 {
0830 struct pool *pool = m->tc->pool;
0831
0832 if (atomic_dec_and_test(&m->prepare_actions)) {
0833 list_add_tail(&m->list, &pool->prepared_mappings);
0834 wake_worker(pool);
0835 }
0836 }
0837
0838 static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
0839 {
0840 unsigned long flags;
0841 struct pool *pool = m->tc->pool;
0842
0843 spin_lock_irqsave(&pool->lock, flags);
0844 __complete_mapping_preparation(m);
0845 spin_unlock_irqrestore(&pool->lock, flags);
0846 }
0847
0848 static void copy_complete(int read_err, unsigned long write_err, void *context)
0849 {
0850 struct dm_thin_new_mapping *m = context;
0851
0852 m->status = read_err || write_err ? BLK_STS_IOERR : 0;
0853 complete_mapping_preparation(m);
0854 }
0855
0856 static void overwrite_endio(struct bio *bio)
0857 {
0858 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
0859 struct dm_thin_new_mapping *m = h->overwrite_mapping;
0860
0861 bio->bi_end_io = m->saved_bi_end_io;
0862
0863 m->status = bio->bi_status;
0864 complete_mapping_preparation(m);
0865 }
0866
0867
0880
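/*
 * Sends the bios in the cell, except the original holder, back to the
 * thin device's deferred list for the worker to process.
 */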
0881 static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
0882 {
0883 struct pool *pool = tc->pool;
0884 unsigned long flags;
0885 int has_work;
0886
0887 spin_lock_irqsave(&tc->lock, flags);
0888 cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
0889 has_work = !bio_list_empty(&tc->deferred_bio_list);
0890 spin_unlock_irqrestore(&tc->lock, flags);
0891
0892 if (has_work)
0893 wake_worker(pool);
0894 }
0895
0896 static void thin_defer_bio(struct thin_c *tc, struct bio *bio);
0897
0898 struct remap_info {
0899 struct thin_c *tc;
0900 struct bio_list defer_bios;
0901 struct bio_list issue_bios;
0902 };
0903
0904 static void __inc_remap_and_issue_cell(void *context,
0905 struct dm_bio_prison_cell *cell)
0906 {
0907 struct remap_info *info = context;
0908 struct bio *bio;
0909
0910 while ((bio = bio_list_pop(&cell->bios))) {
0911 if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD)
0912 bio_list_add(&info->defer_bios, bio);
0913 else {
0914 inc_all_io_entry(info->tc->pool, bio);
0915
0916
0917
0918
0919
0920
0921 bio_list_add(&info->issue_bios, bio);
0922 }
0923 }
0924 }
0925
0926 static void inc_remap_and_issue_cell(struct thin_c *tc,
0927 struct dm_bio_prison_cell *cell,
0928 dm_block_t block)
0929 {
0930 struct bio *bio;
0931 struct remap_info info;
0932
0933 info.tc = tc;
0934 bio_list_init(&info.defer_bios);
0935 bio_list_init(&info.issue_bios);
0936
0937
0938
0939
0940
0941
0942 cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
0943 &info, cell);
0944
0945 while ((bio = bio_list_pop(&info.defer_bios)))
0946 thin_defer_bio(tc, bio);
0947
0948 while ((bio = bio_list_pop(&info.issue_bios)))
0949 remap_and_issue(info.tc, bio, block);
0950 }
0951
0952 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
0953 {
0954 cell_error(m->tc->pool, m->cell);
0955 list_del(&m->list);
0956 mempool_free(m, &m->tc->pool->mapping_pool);
0957 }
0958
0959 static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
0960 {
0961 struct pool *pool = tc->pool;
0962
0963
0964
0965
0966
0967 if (!bio_triggers_commit(tc, bio)) {
0968 bio_endio(bio);
0969 return;
0970 }
0971
0972
0973
0974
0975
0976
0977 if (dm_thin_aborted_changes(tc->td)) {
0978 bio_io_error(bio);
0979 return;
0980 }
0981
0982
0983
0984
0985
0986 spin_lock_irq(&pool->lock);
0987 bio_list_add(&pool->deferred_flush_completions, bio);
0988 spin_unlock_irq(&pool->lock);
0989 }
0990
0991 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
0992 {
0993 struct thin_c *tc = m->tc;
0994 struct pool *pool = tc->pool;
0995 struct bio *bio = m->bio;
0996 int r;
0997
0998 if (m->status) {
0999 cell_error(pool, m->cell);
1000 goto out;
1001 }
1002
1003
1004
1005
1006
1007
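/*
 * Commit the prepared block into the mapping.  Any I/O for this virtual
 * block arriving after this point will be remapped to the new data
 * block directly.
 */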
1008 r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
1009 if (r) {
1010 metadata_operation_failed(pool, "dm_thin_insert_block", r);
1011 cell_error(pool, m->cell);
1012 goto out;
1013 }
1014
1015
1016
1017
1018
1019
1020
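/*
 * Release the bios held in the cell.  If an overwrite bio was used it
 * has already been issued, so only the remaining cell bios are remapped
 * here.
 */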
1021 if (bio) {
1022 inc_remap_and_issue_cell(tc, m->cell, m->data_block);
1023 complete_overwrite_bio(tc, bio);
1024 } else {
1025 inc_all_io_entry(tc->pool, m->cell->holder);
1026 remap_and_issue(tc, m->cell->holder, m->data_block);
1027 inc_remap_and_issue_cell(tc, m->cell, m->data_block);
1028 }
1029
1030 out:
1031 list_del(&m->list);
1032 mempool_free(m, &pool->mapping_pool);
1033 }
1034
1035
1036
1037 static void free_discard_mapping(struct dm_thin_new_mapping *m)
1038 {
1039 struct thin_c *tc = m->tc;
1040 if (m->cell)
1041 cell_defer_no_holder(tc, m->cell);
1042 mempool_free(m, &tc->pool->mapping_pool);
1043 }
1044
1045 static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
1046 {
1047 bio_io_error(m->bio);
1048 free_discard_mapping(m);
1049 }
1050
1051 static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
1052 {
1053 bio_endio(m->bio);
1054 free_discard_mapping(m);
1055 }
1056
1057 static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
1058 {
1059 int r;
1060 struct thin_c *tc = m->tc;
1061
1062 r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
1063 if (r) {
1064 metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
1065 bio_io_error(m->bio);
1066 } else
1067 bio_endio(m->bio);
1068
1069 cell_defer_no_holder(tc, m->cell);
1070 mempool_free(m, &tc->pool->mapping_pool);
1071 }
1072
1073
1074
1075 static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m,
1076 struct bio *discard_parent)
1077 {
1078
1079
1080
1081
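/*
 * The range has already been unmapped from this thin device, but before
 * passing the discard down we check which data blocks are now unshared
 * and only discard those.
 */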
1082 int r = 0;
1083 bool shared = true;
1084 struct thin_c *tc = m->tc;
1085 struct pool *pool = tc->pool;
1086 dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
1087 struct discard_op op;
1088
1089 begin_discard(&op, tc, discard_parent);
1090 while (b != end) {
1091
1092 for (; b < end; b++) {
1093 r = dm_pool_block_is_shared(pool->pmd, b, &shared);
1094 if (r)
1095 goto out;
1096
1097 if (!shared)
1098 break;
1099 }
1100
1101 if (b == end)
1102 break;
1103
1104
1105 for (e = b + 1; e != end; e++) {
1106 r = dm_pool_block_is_shared(pool->pmd, e, &shared);
1107 if (r)
1108 goto out;
1109
1110 if (shared)
1111 break;
1112 }
1113
1114 r = issue_discard(&op, b, e);
1115 if (r)
1116 goto out;
1117
1118 b = e;
1119 }
1120 out:
1121 end_discard(&op, r);
1122 }
1123
1124 static void queue_passdown_pt2(struct dm_thin_new_mapping *m)
1125 {
1126 unsigned long flags;
1127 struct pool *pool = m->tc->pool;
1128
1129 spin_lock_irqsave(&pool->lock, flags);
1130 list_add_tail(&m->list, &pool->prepared_discards_pt2);
1131 spin_unlock_irqrestore(&pool->lock, flags);
1132 wake_worker(pool);
1133 }
1134
1135 static void passdown_endio(struct bio *bio)
1136 {
1137
1138
1139
1140
1141 queue_passdown_pt2(bio->bi_private);
1142 bio_put(bio);
1143 }
1144
1145 static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
1146 {
1147 int r;
1148 struct thin_c *tc = m->tc;
1149 struct pool *pool = tc->pool;
1150 struct bio *discard_parent;
1151 dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);
1152
1153
1154
1155
1156
1157
1158 r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
1159 if (r) {
1160 metadata_operation_failed(pool, "dm_thin_remove_range", r);
1161 bio_io_error(m->bio);
1162 cell_defer_no_holder(tc, m->cell);
1163 mempool_free(m, &pool->mapping_pool);
1164 return;
1165 }
1166
1167
1168
1169
1170
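/*
 * Take an extra reference on the data blocks so they cannot be
 * reallocated while the passdown discard is in flight; the references
 * are dropped again in passdown part 2.
 */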
1171 r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
1172 if (r) {
1173 metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
1174 bio_io_error(m->bio);
1175 cell_defer_no_holder(tc, m->cell);
1176 mempool_free(m, &pool->mapping_pool);
1177 return;
1178 }
1179
1180 discard_parent = bio_alloc(NULL, 1, 0, GFP_NOIO);
1181 discard_parent->bi_end_io = passdown_endio;
1182 discard_parent->bi_private = m;
1183 if (m->maybe_shared)
1184 passdown_double_checking_shared_status(m, discard_parent);
1185 else {
1186 struct discard_op op;
1187
1188 begin_discard(&op, tc, discard_parent);
1189 r = issue_discard(&op, m->data_block, data_end);
1190 end_discard(&op, r);
1191 }
1192 }
1193
1194 static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
1195 {
1196 int r;
1197 struct thin_c *tc = m->tc;
1198 struct pool *pool = tc->pool;
1199
1200
1201
1202
1203
1204 r = dm_pool_dec_data_range(pool->pmd, m->data_block,
1205 m->data_block + (m->virt_end - m->virt_begin));
1206 if (r) {
1207 metadata_operation_failed(pool, "dm_pool_dec_data_range", r);
1208 bio_io_error(m->bio);
1209 } else
1210 bio_endio(m->bio);
1211
1212 cell_defer_no_holder(tc, m->cell);
1213 mempool_free(m, &pool->mapping_pool);
1214 }
1215
1216 static void process_prepared(struct pool *pool, struct list_head *head,
1217 process_mapping_fn *fn)
1218 {
1219 struct list_head maps;
1220 struct dm_thin_new_mapping *m, *tmp;
1221
1222 INIT_LIST_HEAD(&maps);
1223 spin_lock_irq(&pool->lock);
1224 list_splice_init(head, &maps);
1225 spin_unlock_irq(&pool->lock);
1226
1227 list_for_each_entry_safe(m, tmp, &maps, list)
1228 (*fn)(m);
1229 }
1230
1231
1232
1233
1234 static int io_overlaps_block(struct pool *pool, struct bio *bio)
1235 {
1236 return bio->bi_iter.bi_size ==
1237 (pool->sectors_per_block << SECTOR_SHIFT);
1238 }
1239
1240 static int io_overwrites_block(struct pool *pool, struct bio *bio)
1241 {
1242 return (bio_data_dir(bio) == WRITE) &&
1243 io_overlaps_block(pool, bio);
1244 }
1245
1246 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
1247 bio_end_io_t *fn)
1248 {
1249 *save = bio->bi_end_io;
1250 bio->bi_end_io = fn;
1251 }
1252
1253 static int ensure_next_mapping(struct pool *pool)
1254 {
1255 if (pool->next_mapping)
1256 return 0;
1257
1258 pool->next_mapping = mempool_alloc(&pool->mapping_pool, GFP_ATOMIC);
1259
1260 return pool->next_mapping ? 0 : -ENOMEM;
1261 }
1262
1263 static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
1264 {
1265 struct dm_thin_new_mapping *m = pool->next_mapping;
1266
1267 BUG_ON(!pool->next_mapping);
1268
1269 memset(m, 0, sizeof(struct dm_thin_new_mapping));
1270 INIT_LIST_HEAD(&m->list);
1271 m->bio = NULL;
1272
1273 pool->next_mapping = NULL;
1274
1275 return m;
1276 }
1277
1278 static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
1279 sector_t begin, sector_t end)
1280 {
1281 struct dm_io_region to;
1282
1283 to.bdev = tc->pool_dev->bdev;
1284 to.sector = begin;
1285 to.count = end - begin;
1286
1287 dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
1288 }
1289
1290 static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
1291 dm_block_t data_begin,
1292 struct dm_thin_new_mapping *m)
1293 {
1294 struct pool *pool = tc->pool;
1295 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1296
1297 h->overwrite_mapping = m;
1298 m->bio = bio;
1299 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
1300 inc_all_io_entry(pool, bio);
1301 remap_and_issue(tc, bio, data_begin);
1302 }
1303
1304
1305
1306
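/*
 * A partial copy also needs to zero the uncopied tail of the block
 * (when block zeroing is enabled).
 */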
1307 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
1308 struct dm_dev *origin, dm_block_t data_origin,
1309 dm_block_t data_dest,
1310 struct dm_bio_prison_cell *cell, struct bio *bio,
1311 sector_t len)
1312 {
1313 struct pool *pool = tc->pool;
1314 struct dm_thin_new_mapping *m = get_next_mapping(pool);
1315
1316 m->tc = tc;
1317 m->virt_begin = virt_block;
1318 m->virt_end = virt_block + 1u;
1319 m->data_block = data_dest;
1320 m->cell = cell;
1321
1322
1323
1324
1325
1326
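/*
 * Three preparation actions: quiescing, the copy (or overwrite) itself,
 * and an extra reference held for the duration of this function in case
 * a partial zero has to be added below.
 */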
1327 atomic_set(&m->prepare_actions, 3);
1328
1329 if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
1330 complete_mapping_preparation(m);
1331
1332
1333
1334
1335
1336
1337
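/*
 * If the whole block is being overwritten we can issue the bio
 * immediately (the data will be rewritten anyway); otherwise kcopyd
 * copies the existing data first.
 */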
1338 if (io_overwrites_block(pool, bio))
1339 remap_and_issue_overwrite(tc, bio, data_dest, m);
1340 else {
1341 struct dm_io_region from, to;
1342
1343 from.bdev = origin->bdev;
1344 from.sector = data_origin * pool->sectors_per_block;
1345 from.count = len;
1346
1347 to.bdev = tc->pool_dev->bdev;
1348 to.sector = data_dest * pool->sectors_per_block;
1349 to.count = len;
1350
1351 dm_kcopyd_copy(pool->copier, &from, 1, &to,
1352 0, copy_complete, m);
1353
1354
1355
1356
1357 if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
1358 atomic_inc(&m->prepare_actions);
1359 ll_zero(tc, m,
1360 data_dest * pool->sectors_per_block + len,
1361 (data_dest + 1) * pool->sectors_per_block);
1362 }
1363 }
1364
1365 complete_mapping_preparation(m);
1366 }
1367
1368 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
1369 dm_block_t data_origin, dm_block_t data_dest,
1370 struct dm_bio_prison_cell *cell, struct bio *bio)
1371 {
1372 schedule_copy(tc, virt_block, tc->pool_dev,
1373 data_origin, data_dest, cell, bio,
1374 tc->pool->sectors_per_block);
1375 }
1376
1377 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
1378 dm_block_t data_block, struct dm_bio_prison_cell *cell,
1379 struct bio *bio)
1380 {
1381 struct pool *pool = tc->pool;
1382 struct dm_thin_new_mapping *m = get_next_mapping(pool);
1383
1384 atomic_set(&m->prepare_actions, 1);
1385 m->tc = tc;
1386 m->virt_begin = virt_block;
1387 m->virt_end = virt_block + 1u;
1388 m->data_block = data_block;
1389 m->cell = cell;
1390
1391
1392
1393
1394
1395
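/*
 * If zeroing of new blocks is disabled there is nothing to prepare and
 * the mapping can be processed immediately.  Otherwise either let an
 * overwriting bio provide the data or zero the block with kcopyd.
 */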
1396 if (pool->pf.zero_new_blocks) {
1397 if (io_overwrites_block(pool, bio))
1398 remap_and_issue_overwrite(tc, bio, data_block, m);
1399 else
1400 ll_zero(tc, m, data_block * pool->sectors_per_block,
1401 (data_block + 1) * pool->sectors_per_block);
1402 } else
1403 process_prepared_mapping(m);
1404 }
1405
1406 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
1407 dm_block_t data_dest,
1408 struct dm_bio_prison_cell *cell, struct bio *bio)
1409 {
1410 struct pool *pool = tc->pool;
1411 sector_t virt_block_begin = virt_block * pool->sectors_per_block;
1412 sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;
1413
1414 if (virt_block_end <= tc->origin_size)
1415 schedule_copy(tc, virt_block, tc->origin_dev,
1416 virt_block, data_dest, cell, bio,
1417 pool->sectors_per_block);
1418
1419 else if (virt_block_begin < tc->origin_size)
1420 schedule_copy(tc, virt_block, tc->origin_dev,
1421 virt_block, data_dest, cell, bio,
1422 tc->origin_size - virt_block_begin);
1423
1424 else
1425 schedule_zero(tc, virt_block, data_dest, cell, bio);
1426 }
1427
1428 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
1429
1430 static void requeue_bios(struct pool *pool);
1431
1432 static bool is_read_only_pool_mode(enum pool_mode mode)
1433 {
1434 return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY);
1435 }
1436
1437 static bool is_read_only(struct pool *pool)
1438 {
1439 return is_read_only_pool_mode(get_pool_mode(pool));
1440 }
1441
1442 static void check_for_metadata_space(struct pool *pool)
1443 {
1444 int r;
1445 const char *ooms_reason = NULL;
1446 dm_block_t nr_free;
1447
1448 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free);
1449 if (r)
1450 ooms_reason = "Could not get free metadata blocks";
1451 else if (!nr_free)
1452 ooms_reason = "No free metadata blocks";
1453
1454 if (ooms_reason && !is_read_only(pool)) {
1455 DMERR("%s", ooms_reason);
1456 set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE);
1457 }
1458 }
1459
1460 static void check_for_data_space(struct pool *pool)
1461 {
1462 int r;
1463 dm_block_t nr_free;
1464
1465 if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE)
1466 return;
1467
1468 r = dm_pool_get_free_block_count(pool->pmd, &nr_free);
1469 if (r)
1470 return;
1471
1472 if (nr_free) {
1473 set_pool_mode(pool, PM_WRITE);
1474 requeue_bios(pool);
1475 }
1476 }
1477
1478
1479
1480
1481
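/*
 * A non-zero return indicates the pool is read-only or has failed, or
 * that the metadata commit itself failed.
 */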
1482 static int commit(struct pool *pool)
1483 {
1484 int r;
1485
1486 if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
1487 return -EINVAL;
1488
1489 r = dm_pool_commit_metadata(pool->pmd);
1490 if (r)
1491 metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
1492 else {
1493 check_for_metadata_space(pool);
1494 check_for_data_space(pool);
1495 }
1496
1497 return r;
1498 }
1499
1500 static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
1501 {
1502 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
1503 DMWARN("%s: reached low water mark for data device: sending event.",
1504 dm_device_name(pool->pool_md));
1505 spin_lock_irq(&pool->lock);
1506 pool->low_water_triggered = true;
1507 spin_unlock_irq(&pool->lock);
1508 dm_table_event(pool->ti->table);
1509 }
1510 }
1511
1512 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1513 {
1514 int r;
1515 dm_block_t free_blocks;
1516 struct pool *pool = tc->pool;
1517
1518 if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
1519 return -EINVAL;
1520
1521 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1522 if (r) {
1523 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
1524 return r;
1525 }
1526
1527 check_low_water_mark(pool, free_blocks);
1528
1529 if (!free_blocks) {
1530
1531
1532
1533
1534 r = commit(pool);
1535 if (r)
1536 return r;
1537
1538 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1539 if (r) {
1540 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
1541 return r;
1542 }
1543
1544 if (!free_blocks) {
1545 set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
1546 return -ENOSPC;
1547 }
1548 }
1549
1550 r = dm_pool_alloc_data_block(pool->pmd, result);
1551 if (r) {
1552 if (r == -ENOSPC)
1553 set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
1554 else
1555 metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
1556 return r;
1557 }
1558
1559 r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
1560 if (r) {
1561 metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
1562 return r;
1563 }
1564
1565 if (!free_blocks) {
1566
1567 r = commit(pool);
1568 if (r)
1569 return r;
1570 }
1571
1572 return 0;
1573 }
1574
1575
1576
1577
1578
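/*
 * When out of space, bios are queued on this list until the pool is
 * resumed, presumably after having been reloaded with more space.
 */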
1579 static void retry_on_resume(struct bio *bio)
1580 {
1581 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1582 struct thin_c *tc = h->tc;
1583
1584 spin_lock_irq(&tc->lock);
1585 bio_list_add(&tc->retry_on_resume_list, bio);
1586 spin_unlock_irq(&tc->lock);
1587 }
1588
1589 static blk_status_t should_error_unserviceable_bio(struct pool *pool)
1590 {
1591 enum pool_mode m = get_pool_mode(pool);
1592
1593 switch (m) {
1594 case PM_WRITE:
1595
1596 DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
1597 return BLK_STS_IOERR;
1598
1599 case PM_OUT_OF_DATA_SPACE:
1600 return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;
1601
1602 case PM_OUT_OF_METADATA_SPACE:
1603 case PM_READ_ONLY:
1604 case PM_FAIL:
1605 return BLK_STS_IOERR;
1606 default:
1607
1608 DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
1609 return BLK_STS_IOERR;
1610 }
1611 }
1612
1613 static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
1614 {
1615 blk_status_t error = should_error_unserviceable_bio(pool);
1616
1617 if (error) {
1618 bio->bi_status = error;
1619 bio_endio(bio);
1620 } else
1621 retry_on_resume(bio);
1622 }
1623
1624 static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
1625 {
1626 struct bio *bio;
1627 struct bio_list bios;
1628 blk_status_t error;
1629
1630 error = should_error_unserviceable_bio(pool);
1631 if (error) {
1632 cell_error_with_code(pool, cell, error);
1633 return;
1634 }
1635
1636 bio_list_init(&bios);
1637 cell_release(pool, cell, &bios);
1638
1639 while ((bio = bio_list_pop(&bios)))
1640 retry_on_resume(bio);
1641 }
1642
1643 static void process_discard_cell_no_passdown(struct thin_c *tc,
1644 struct dm_bio_prison_cell *virt_cell)
1645 {
1646 struct pool *pool = tc->pool;
1647 struct dm_thin_new_mapping *m = get_next_mapping(pool);
1648
1649
1650
1651
1652
1653 m->tc = tc;
1654 m->virt_begin = virt_cell->key.block_begin;
1655 m->virt_end = virt_cell->key.block_end;
1656 m->cell = virt_cell;
1657 m->bio = virt_cell->holder;
1658
1659 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1660 pool->process_prepared_discard(m);
1661 }
1662
1663 static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
1664 struct bio *bio)
1665 {
1666 struct pool *pool = tc->pool;
1667
1668 int r;
1669 bool maybe_shared;
1670 struct dm_cell_key data_key;
1671 struct dm_bio_prison_cell *data_cell;
1672 struct dm_thin_new_mapping *m;
1673 dm_block_t virt_begin, virt_end, data_begin;
1674
1675 while (begin != end) {
1676 r = ensure_next_mapping(pool);
1677 if (r)
1678
1679 return;
1680
1681 r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
1682 &data_begin, &maybe_shared);
1683 if (r)
1684
1685
1686
1687
1688 break;
1689
1690 build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
1691 if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
1692
1693 begin = virt_end;
1694 continue;
1695 }
1696
1697
1698
1699
1700
1701 m = get_next_mapping(pool);
1702 m->tc = tc;
1703 m->maybe_shared = maybe_shared;
1704 m->virt_begin = virt_begin;
1705 m->virt_end = virt_end;
1706 m->data_block = data_begin;
1707 m->cell = data_cell;
1708 m->bio = bio;
1709
1710
1711
1712
1713
1714
1715
1716
1717
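/*
 * Hold an extra reference on the parent bio so it cannot complete
 * before the sub-discards have been chained to it in end_discard().
 */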
1718 bio_inc_remaining(bio);
1719 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1720 pool->process_prepared_discard(m);
1721
1722 begin = virt_end;
1723 }
1724 }
1725
1726 static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
1727 {
1728 struct bio *bio = virt_cell->holder;
1729 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1730
1731
1732
1733
1734
1735
1736 h->cell = virt_cell;
1737 break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
1738
1739
1740
1741
1742
1743
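/*
 * Completing the bio here is safe: bio_inc_remaining() was called for
 * each sub-discard, so the bio will not actually complete until they
 * have all finished.
 */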
1744 bio_endio(bio);
1745 }
1746
1747 static void process_discard_bio(struct thin_c *tc, struct bio *bio)
1748 {
1749 dm_block_t begin, end;
1750 struct dm_cell_key virt_key;
1751 struct dm_bio_prison_cell *virt_cell;
1752
1753 get_bio_block_range(tc, bio, &begin, &end);
1754 if (begin == end) {
1755
1756
1757
1758 bio_endio(bio);
1759 return;
1760 }
1761
1762 build_key(tc->td, VIRTUAL, begin, end, &virt_key);
1763 if (bio_detain(tc->pool, &virt_key, bio, &virt_cell))
1764
1765
1766
1767
1768
1769
1770
1771 return;
1772
1773 tc->pool->process_discard_cell(tc, virt_cell);
1774 }
1775
1776 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1777 struct dm_cell_key *key,
1778 struct dm_thin_lookup_result *lookup_result,
1779 struct dm_bio_prison_cell *cell)
1780 {
1781 int r;
1782 dm_block_t data_block;
1783 struct pool *pool = tc->pool;
1784
1785 r = alloc_data_block(tc, &data_block);
1786 switch (r) {
1787 case 0:
1788 schedule_internal_copy(tc, block, lookup_result->block,
1789 data_block, cell, bio);
1790 break;
1791
1792 case -ENOSPC:
1793 retry_bios_on_resume(pool, cell);
1794 break;
1795
1796 default:
1797 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1798 __func__, r);
1799 cell_error(pool, cell);
1800 break;
1801 }
1802 }
1803
1804 static void __remap_and_issue_shared_cell(void *context,
1805 struct dm_bio_prison_cell *cell)
1806 {
1807 struct remap_info *info = context;
1808 struct bio *bio;
1809
1810 while ((bio = bio_list_pop(&cell->bios))) {
1811 if (bio_data_dir(bio) == WRITE || op_is_flush(bio->bi_opf) ||
1812 bio_op(bio) == REQ_OP_DISCARD)
1813 bio_list_add(&info->defer_bios, bio);
1814 else {
1815 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1816
1817 h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds);
1818 inc_all_io_entry(info->tc->pool, bio);
1819 bio_list_add(&info->issue_bios, bio);
1820 }
1821 }
1822 }
1823
1824 static void remap_and_issue_shared_cell(struct thin_c *tc,
1825 struct dm_bio_prison_cell *cell,
1826 dm_block_t block)
1827 {
1828 struct bio *bio;
1829 struct remap_info info;
1830
1831 info.tc = tc;
1832 bio_list_init(&info.defer_bios);
1833 bio_list_init(&info.issue_bios);
1834
1835 cell_visit_release(tc->pool, __remap_and_issue_shared_cell,
1836 &info, cell);
1837
1838 while ((bio = bio_list_pop(&info.defer_bios)))
1839 thin_defer_bio(tc, bio);
1840
1841 while ((bio = bio_list_pop(&info.issue_bios)))
1842 remap_and_issue(tc, bio, block);
1843 }
1844
1845 static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1846 dm_block_t block,
1847 struct dm_thin_lookup_result *lookup_result,
1848 struct dm_bio_prison_cell *virt_cell)
1849 {
1850 struct dm_bio_prison_cell *data_cell;
1851 struct pool *pool = tc->pool;
1852 struct dm_cell_key key;
1853
1854
1855
1856
1857
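/*
 * If the data cell is already occupied then sharing is already in the
 * process of being broken, so there is nothing further to do here.
 */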
1858 build_data_key(tc->td, lookup_result->block, &key);
1859 if (bio_detain(pool, &key, bio, &data_cell)) {
1860 cell_defer_no_holder(tc, virt_cell);
1861 return;
1862 }
1863
1864 if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
1865 break_sharing(tc, bio, block, &key, lookup_result, data_cell);
1866 cell_defer_no_holder(tc, virt_cell);
1867 } else {
1868 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1869
1870 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1871 inc_all_io_entry(pool, bio);
1872 remap_and_issue(tc, bio, lookup_result->block);
1873
1874 remap_and_issue_shared_cell(tc, data_cell, lookup_result->block);
1875 remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block);
1876 }
1877 }
1878
1879 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1880 struct dm_bio_prison_cell *cell)
1881 {
1882 int r;
1883 dm_block_t data_block;
1884 struct pool *pool = tc->pool;
1885
1886
1887
1888
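/*
 * Remap empty bios (flushes) immediately, without provisioning a block.
 */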
1889 if (!bio->bi_iter.bi_size) {
1890 inc_all_io_entry(pool, bio);
1891 cell_defer_no_holder(tc, cell);
1892
1893 remap_and_issue(tc, bio, 0);
1894 return;
1895 }
1896
1897
1898
1899
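/*
 * Fill read bios with zeroes and complete them immediately; there is no
 * data to read from an unprovisioned block.
 */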
1900 if (bio_data_dir(bio) == READ) {
1901 zero_fill_bio(bio);
1902 cell_defer_no_holder(tc, cell);
1903 bio_endio(bio);
1904 return;
1905 }
1906
1907 r = alloc_data_block(tc, &data_block);
1908 switch (r) {
1909 case 0:
1910 if (tc->origin_dev)
1911 schedule_external_copy(tc, block, data_block, cell, bio);
1912 else
1913 schedule_zero(tc, block, data_block, cell, bio);
1914 break;
1915
1916 case -ENOSPC:
1917 retry_bios_on_resume(pool, cell);
1918 break;
1919
1920 default:
1921 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1922 __func__, r);
1923 cell_error(pool, cell);
1924 break;
1925 }
1926 }
1927
1928 static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1929 {
1930 int r;
1931 struct pool *pool = tc->pool;
1932 struct bio *bio = cell->holder;
1933 dm_block_t block = get_bio_block(tc, bio);
1934 struct dm_thin_lookup_result lookup_result;
1935
1936 if (tc->requeue_mode) {
1937 cell_requeue(pool, cell);
1938 return;
1939 }
1940
1941 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1942 switch (r) {
1943 case 0:
1944 if (lookup_result.shared)
1945 process_shared_bio(tc, bio, block, &lookup_result, cell);
1946 else {
1947 inc_all_io_entry(pool, bio);
1948 remap_and_issue(tc, bio, lookup_result.block);
1949 inc_remap_and_issue_cell(tc, cell, lookup_result.block);
1950 }
1951 break;
1952
1953 case -ENODATA:
1954 if (bio_data_dir(bio) == READ && tc->origin_dev) {
1955 inc_all_io_entry(pool, bio);
1956 cell_defer_no_holder(tc, cell);
1957
1958 if (bio_end_sector(bio) <= tc->origin_size)
1959 remap_to_origin_and_issue(tc, bio);
1960
1961 else if (bio->bi_iter.bi_sector < tc->origin_size) {
1962 zero_fill_bio(bio);
1963 bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
1964 remap_to_origin_and_issue(tc, bio);
1965
1966 } else {
1967 zero_fill_bio(bio);
1968 bio_endio(bio);
1969 }
1970 } else
1971 provision_block(tc, bio, block, cell);
1972 break;
1973
1974 default:
1975 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1976 __func__, r);
1977 cell_defer_no_holder(tc, cell);
1978 bio_io_error(bio);
1979 break;
1980 }
1981 }
1982
1983 static void process_bio(struct thin_c *tc, struct bio *bio)
1984 {
1985 struct pool *pool = tc->pool;
1986 dm_block_t block = get_bio_block(tc, bio);
1987 struct dm_bio_prison_cell *cell;
1988 struct dm_cell_key key;
1989
1990
1991
1992
1993
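/*
 * If the cell is already occupied the block is already being
 * provisioned, so there is nothing further to do here.
 */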
1994 build_virtual_key(tc->td, block, &key);
1995 if (bio_detain(pool, &key, bio, &cell))
1996 return;
1997
1998 process_cell(tc, cell);
1999 }
2000
2001 static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
2002 struct dm_bio_prison_cell *cell)
2003 {
2004 int r;
2005 int rw = bio_data_dir(bio);
2006 dm_block_t block = get_bio_block(tc, bio);
2007 struct dm_thin_lookup_result lookup_result;
2008
2009 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
2010 switch (r) {
2011 case 0:
2012 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
2013 handle_unserviceable_bio(tc->pool, bio);
2014 if (cell)
2015 cell_defer_no_holder(tc, cell);
2016 } else {
2017 inc_all_io_entry(tc->pool, bio);
2018 remap_and_issue(tc, bio, lookup_result.block);
2019 if (cell)
2020 inc_remap_and_issue_cell(tc, cell, lookup_result.block);
2021 }
2022 break;
2023
2024 case -ENODATA:
2025 if (cell)
2026 cell_defer_no_holder(tc, cell);
2027 if (rw != READ) {
2028 handle_unserviceable_bio(tc->pool, bio);
2029 break;
2030 }
2031
2032 if (tc->origin_dev) {
2033 inc_all_io_entry(tc->pool, bio);
2034 remap_to_origin_and_issue(tc, bio);
2035 break;
2036 }
2037
2038 zero_fill_bio(bio);
2039 bio_endio(bio);
2040 break;
2041
2042 default:
2043 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
2044 __func__, r);
2045 if (cell)
2046 cell_defer_no_holder(tc, cell);
2047 bio_io_error(bio);
2048 break;
2049 }
2050 }
2051
2052 static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
2053 {
2054 __process_bio_read_only(tc, bio, NULL);
2055 }
2056
2057 static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2058 {
2059 __process_bio_read_only(tc, cell->holder, cell);
2060 }
2061
2062 static void process_bio_success(struct thin_c *tc, struct bio *bio)
2063 {
2064 bio_endio(bio);
2065 }
2066
2067 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
2068 {
2069 bio_io_error(bio);
2070 }
2071
2072 static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2073 {
2074 cell_success(tc->pool, cell);
2075 }
2076
2077 static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2078 {
2079 cell_error(tc->pool, cell);
2080 }
2081
2082
2083
2084
2085
2086 static int need_commit_due_to_time(struct pool *pool)
2087 {
2088 return !time_in_range(jiffies, pool->last_commit_jiffies,
2089 pool->last_commit_jiffies + COMMIT_PERIOD);
2090 }
2091
2092 #define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
2093 #define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
2094
2095 static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
2096 {
2097 struct rb_node **rbp, *parent;
2098 struct dm_thin_endio_hook *pbd;
2099 sector_t bi_sector = bio->bi_iter.bi_sector;
2100
2101 rbp = &tc->sort_bio_list.rb_node;
2102 parent = NULL;
2103 while (*rbp) {
2104 parent = *rbp;
2105 pbd = thin_pbd(parent);
2106
2107 if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
2108 rbp = &(*rbp)->rb_left;
2109 else
2110 rbp = &(*rbp)->rb_right;
2111 }
2112
2113 pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
2114 rb_link_node(&pbd->rb_node, parent, rbp);
2115 rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
2116 }
2117
2118 static void __extract_sorted_bios(struct thin_c *tc)
2119 {
2120 struct rb_node *node;
2121 struct dm_thin_endio_hook *pbd;
2122 struct bio *bio;
2123
2124 for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
2125 pbd = thin_pbd(node);
2126 bio = thin_bio(pbd);
2127
2128 bio_list_add(&tc->deferred_bio_list, bio);
2129 rb_erase(&pbd->rb_node, &tc->sort_bio_list);
2130 }
2131
2132 WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
2133 }
2134
2135 static void __sort_thin_deferred_bios(struct thin_c *tc)
2136 {
2137 struct bio *bio;
2138 struct bio_list bios;
2139
2140 bio_list_init(&bios);
2141 bio_list_merge(&bios, &tc->deferred_bio_list);
2142 bio_list_init(&tc->deferred_bio_list);
2143
2144
2145 while ((bio = bio_list_pop(&bios)))
2146 __thin_bio_rb_add(tc, bio);
2147
2148
2149
2150
2151
2152
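/*
 * Transfer the sorted bios from sort_bio_list back to
 * deferred_bio_list so they can be submitted without holding the lock.
 */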
2153 __extract_sorted_bios(tc);
2154 }
2155
2156 static void process_thin_deferred_bios(struct thin_c *tc)
2157 {
2158 struct pool *pool = tc->pool;
2159 struct bio *bio;
2160 struct bio_list bios;
2161 struct blk_plug plug;
2162 unsigned count = 0;
2163
2164 if (tc->requeue_mode) {
2165 error_thin_bio_list(tc, &tc->deferred_bio_list,
2166 BLK_STS_DM_REQUEUE);
2167 return;
2168 }
2169
2170 bio_list_init(&bios);
2171
2172 spin_lock_irq(&tc->lock);
2173
2174 if (bio_list_empty(&tc->deferred_bio_list)) {
2175 spin_unlock_irq(&tc->lock);
2176 return;
2177 }
2178
2179 __sort_thin_deferred_bios(tc);
2180
2181 bio_list_merge(&bios, &tc->deferred_bio_list);
2182 bio_list_init(&tc->deferred_bio_list);
2183
2184 spin_unlock_irq(&tc->lock);
2185
2186 blk_start_plug(&plug);
2187 while ((bio = bio_list_pop(&bios))) {
2188
2189
2190
2191
2192
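/*
 * If there are no free mapping structs, and processing this bio might
 * require one, pause until some prepared mappings have been processed.
 */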
2193 if (ensure_next_mapping(pool)) {
2194 spin_lock_irq(&tc->lock);
2195 bio_list_add(&tc->deferred_bio_list, bio);
2196 bio_list_merge(&tc->deferred_bio_list, &bios);
2197 spin_unlock_irq(&tc->lock);
2198 break;
2199 }
2200
2201 if (bio_op(bio) == REQ_OP_DISCARD)
2202 pool->process_discard(tc, bio);
2203 else
2204 pool->process_bio(tc, bio);
2205
2206 if ((count++ & 127) == 0) {
2207 throttle_work_update(&pool->throttle);
2208 dm_pool_issue_prefetches(pool->pmd);
2209 }
2210 }
2211 blk_finish_plug(&plug);
2212 }
2213
2214 static int cmp_cells(const void *lhs, const void *rhs)
2215 {
2216 struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs);
2217 struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs);
2218
2219 BUG_ON(!lhs_cell->holder);
2220 BUG_ON(!rhs_cell->holder);
2221
2222 if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector)
2223 return -1;
2224
2225 if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector)
2226 return 1;
2227
2228 return 0;
2229 }
2230
2231 static unsigned sort_cells(struct pool *pool, struct list_head *cells)
2232 {
2233 unsigned count = 0;
2234 struct dm_bio_prison_cell *cell, *tmp;
2235
2236 list_for_each_entry_safe(cell, tmp, cells, user_list) {
2237 if (count >= CELL_SORT_ARRAY_SIZE)
2238 break;
2239
2240 pool->cell_sort_array[count++] = cell;
2241 list_del(&cell->user_list);
2242 }
2243
2244 sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL);
2245
2246 return count;
2247 }
2248
2249 static void process_thin_deferred_cells(struct thin_c *tc)
2250 {
2251 struct pool *pool = tc->pool;
2252 struct list_head cells;
2253 struct dm_bio_prison_cell *cell;
2254 unsigned i, j, count;
2255
2256 INIT_LIST_HEAD(&cells);
2257
2258 spin_lock_irq(&tc->lock);
2259 list_splice_init(&tc->deferred_cells, &cells);
2260 spin_unlock_irq(&tc->lock);
2261
2262 if (list_empty(&cells))
2263 return;
2264
2265 do {
2266 count = sort_cells(tc->pool, &cells);
2267
2268 for (i = 0; i < count; i++) {
2269 cell = pool->cell_sort_array[i];
2270 BUG_ON(!cell->holder);
2271
2272
2273
2274
2275
2276
2277 if (ensure_next_mapping(pool)) {
2278 for (j = i; j < count; j++)
2279 list_add(&pool->cell_sort_array[j]->user_list, &cells);
2280
2281 spin_lock_irq(&tc->lock);
2282 list_splice(&cells, &tc->deferred_cells);
2283 spin_unlock_irq(&tc->lock);
2284 return;
2285 }
2286
2287 if (bio_op(cell->holder) == REQ_OP_DISCARD)
2288 pool->process_discard_cell(tc, cell);
2289 else
2290 pool->process_cell(tc, cell);
2291 }
2292 } while (!list_empty(&cells));
2293 }
2294
2295 static void thin_get(struct thin_c *tc);
2296 static void thin_put(struct thin_c *tc);
2297
2298
2299
2300
2301
2302
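/*
 * We can't hold rcu_read_lock() around code that can block, so we find
 * a thin with the rcu lock held, take a reference, then drop the lock.
 */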
2303 static struct thin_c *get_first_thin(struct pool *pool)
2304 {
2305 struct thin_c *tc = NULL;
2306
2307 rcu_read_lock();
2308 if (!list_empty(&pool->active_thins)) {
2309 tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
2310 thin_get(tc);
2311 }
2312 rcu_read_unlock();
2313
2314 return tc;
2315 }
2316
2317 static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
2318 {
2319 struct thin_c *old_tc = tc;
2320
2321 rcu_read_lock();
2322 list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
2323 thin_get(tc);
2324 thin_put(old_tc);
2325 rcu_read_unlock();
2326 return tc;
2327 }
2328 thin_put(old_tc);
2329 rcu_read_unlock();
2330
2331 return NULL;
2332 }
2333
2334 static void process_deferred_bios(struct pool *pool)
2335 {
2336 struct bio *bio;
2337 struct bio_list bios, bio_completions;
2338 struct thin_c *tc;
2339
2340 tc = get_first_thin(pool);
2341 while (tc) {
2342 process_thin_deferred_cells(tc);
2343 process_thin_deferred_bios(tc);
2344 tc = get_next_thin(pool, tc);
2345 }
2346
2347
2348
2349
2350
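/*
 * If there are any deferred flush bios, the metadata must be committed
 * before issuing them or signalling their completion.
 */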
2351 bio_list_init(&bios);
2352 bio_list_init(&bio_completions);
2353
2354 spin_lock_irq(&pool->lock);
2355 bio_list_merge(&bios, &pool->deferred_flush_bios);
2356 bio_list_init(&pool->deferred_flush_bios);
2357
2358 bio_list_merge(&bio_completions, &pool->deferred_flush_completions);
2359 bio_list_init(&pool->deferred_flush_completions);
2360 spin_unlock_irq(&pool->lock);
2361
2362 if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
2363 !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
2364 return;
2365
2366 if (commit(pool)) {
2367 bio_list_merge(&bios, &bio_completions);
2368
2369 while ((bio = bio_list_pop(&bios)))
2370 bio_io_error(bio);
2371 return;
2372 }
2373 pool->last_commit_jiffies = jiffies;
2374
2375 while ((bio = bio_list_pop(&bio_completions)))
2376 bio_endio(bio);
2377
2378 while ((bio = bio_list_pop(&bios))) {
2379
2380
2381
2382
2383 if (bio->bi_opf & REQ_PREFLUSH)
2384 bio_endio(bio);
2385 else
2386 dm_submit_bio_remap(bio, NULL);
2387 }
2388 }
2389
2390 static void do_worker(struct work_struct *ws)
2391 {
2392 struct pool *pool = container_of(ws, struct pool, worker);
2393
2394 throttle_work_start(&pool->throttle);
2395 dm_pool_issue_prefetches(pool->pmd);
2396 throttle_work_update(&pool->throttle);
2397 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
2398 throttle_work_update(&pool->throttle);
2399 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
2400 throttle_work_update(&pool->throttle);
2401 process_prepared(pool, &pool->prepared_discards_pt2, &pool->process_prepared_discard_pt2);
2402 throttle_work_update(&pool->throttle);
2403 process_deferred_bios(pool);
2404 throttle_work_complete(&pool->throttle);
2405 }
2406
2407 /*
2408 * We want to commit periodically so that not too much
2409 * unwritten data builds up.
2410 */
2411 static void do_waker(struct work_struct *ws)
2412 {
2413 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
2414 wake_worker(pool);
2415 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
2416 }
2417
2418 /*
2419 * We're holding onto IO to allow userland time to react.  After the
2420 * timeout either the pool will have been resized (and thus back in
2421 * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space.
2422 */
2423 static void do_no_space_timeout(struct work_struct *ws)
2424 {
2425 struct pool *pool = container_of(to_delayed_work(ws), struct pool,
2426 no_space_timeout);
2427
2428 if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
2429 pool->pf.error_if_no_space = true;
2430 notify_of_pool_mode_change(pool);
2431 error_retry_list_with_code(pool, BLK_STS_NOSPC);
2432 }
2433 }
2434
2435
2436
2437 struct pool_work {
2438 struct work_struct worker;
2439 struct completion complete;
2440 };
2441
2442 static struct pool_work *to_pool_work(struct work_struct *ws)
2443 {
2444 return container_of(ws, struct pool_work, worker);
2445 }
2446
2447 static void pool_work_complete(struct pool_work *pw)
2448 {
2449 complete(&pw->complete);
2450 }
2451
2452 static void pool_work_wait(struct pool_work *pw, struct pool *pool,
2453 void (*fn)(struct work_struct *))
2454 {
2455 INIT_WORK_ONSTACK(&pw->worker, fn);
2456 init_completion(&pw->complete);
2457 queue_work(pool->wq, &pw->worker);
2458 wait_for_completion(&pw->complete);
2459 }
2460
2461
2462
2463 struct noflush_work {
2464 struct pool_work pw;
2465 struct thin_c *tc;
2466 };
2467
2468 static struct noflush_work *to_noflush(struct work_struct *ws)
2469 {
2470 return container_of(to_pool_work(ws), struct noflush_work, pw);
2471 }
2472
2473 static void do_noflush_start(struct work_struct *ws)
2474 {
2475 struct noflush_work *w = to_noflush(ws);
2476 w->tc->requeue_mode = true;
2477 requeue_io(w->tc);
2478 pool_work_complete(&w->pw);
2479 }
2480
2481 static void do_noflush_stop(struct work_struct *ws)
2482 {
2483 struct noflush_work *w = to_noflush(ws);
2484 w->tc->requeue_mode = false;
2485 pool_work_complete(&w->pw);
2486 }
2487
2488 static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
2489 {
2490 struct noflush_work w;
2491
2492 w.tc = tc;
2493 pool_work_wait(&w.pw, tc->pool, fn);
2494 }
2495
2496
2497
2498 static bool passdown_enabled(struct pool_c *pt)
2499 {
2500 return pt->adjusted_pf.discard_passdown;
2501 }
2502
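/*
 * Pick the discard handlers according to whether discard passdown to
 * the data device is currently enabled.
 */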
2503 static void set_discard_callbacks(struct pool *pool)
2504 {
2505 struct pool_c *pt = pool->ti->private;
2506
2507 if (passdown_enabled(pt)) {
2508 pool->process_discard_cell = process_discard_cell_passdown;
2509 pool->process_prepared_discard = process_prepared_discard_passdown_pt1;
2510 pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2;
2511 } else {
2512 pool->process_discard_cell = process_discard_cell_no_passdown;
2513 pool->process_prepared_discard = process_prepared_discard_no_passdown;
2514 }
2515 }
2516
2517 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
2518 {
2519 struct pool_c *pt = pool->ti->private;
2520 bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
2521 enum pool_mode old_mode = get_pool_mode(pool);
2522 unsigned long no_space_timeout = READ_ONCE(no_space_timeout_secs) * HZ;
2523
2524 /*
2525 * Never allow the pool to transition to PM_WRITE mode if user
2526 * intervention is required to verify metadata and data consistency.
2527 */
2528 if (new_mode == PM_WRITE && needs_check) {
2529 DMERR("%s: unable to switch pool to write mode until repaired.",
2530 dm_device_name(pool->pool_md));
2531 if (old_mode != new_mode)
2532 new_mode = old_mode;
2533 else
2534 new_mode = PM_READ_ONLY;
2535 }
2536
2537 /*
2538 * If we were in PM_FAIL mode, rollback of metadata failed.  We won't
2539 * recover without a thin_repair, so never let the pool leave PM_FAIL.
2540 */
2541 if (old_mode == PM_FAIL)
2542 new_mode = old_mode;
2543
2544 switch (new_mode) {
2545 case PM_FAIL:
2546 dm_pool_metadata_read_only(pool->pmd);
2547 pool->process_bio = process_bio_fail;
2548 pool->process_discard = process_bio_fail;
2549 pool->process_cell = process_cell_fail;
2550 pool->process_discard_cell = process_cell_fail;
2551 pool->process_prepared_mapping = process_prepared_mapping_fail;
2552 pool->process_prepared_discard = process_prepared_discard_fail;
2553
2554 error_retry_list(pool);
2555 break;
2556
2557 case PM_OUT_OF_METADATA_SPACE:
2558 case PM_READ_ONLY:
2559 dm_pool_metadata_read_only(pool->pmd);
2560 pool->process_bio = process_bio_read_only;
2561 pool->process_discard = process_bio_success;
2562 pool->process_cell = process_cell_read_only;
2563 pool->process_discard_cell = process_cell_success;
2564 pool->process_prepared_mapping = process_prepared_mapping_fail;
2565 pool->process_prepared_discard = process_prepared_discard_success;
2566
2567 error_retry_list(pool);
2568 break;
2569
2570 case PM_OUT_OF_DATA_SPACE:
2571 /*
2572 * Ideally we'd never hit this state; the low water mark
2573 * would trigger userland to extend the pool before we
2574 * completely run out of data space.  However, many small
2575 * IOs to unprovisioned space can consume data space at an
2576 * alarming rate.  Adjust your low water mark if you're
2577 * frequently seeing this mode.
2578 */
2579 pool->out_of_data_space = true;
2580 pool->process_bio = process_bio_read_only;
2581 pool->process_discard = process_discard_bio;
2582 pool->process_cell = process_cell_read_only;
2583 pool->process_prepared_mapping = process_prepared_mapping;
2584 set_discard_callbacks(pool);
2585
2586 if (!pool->pf.error_if_no_space && no_space_timeout)
2587 queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
2588 break;
2589
2590 case PM_WRITE:
2591 if (old_mode == PM_OUT_OF_DATA_SPACE)
2592 cancel_delayed_work_sync(&pool->no_space_timeout);
2593 pool->out_of_data_space = false;
2594 pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
2595 dm_pool_metadata_read_write(pool->pmd);
2596 pool->process_bio = process_bio;
2597 pool->process_discard = process_discard_bio;
2598 pool->process_cell = process_cell;
2599 pool->process_prepared_mapping = process_prepared_mapping;
2600 set_discard_callbacks(pool);
2601 break;
2602 }
2603
2604 pool->pf.mode = new_mode;
2605 /*
2606 * The pool mode may have changed, sync it so bind_control_target()
2607 * doesn't cause an unexpected mode transition on resume.
2608 */
2609 pt->adjusted_pf.mode = new_mode;
2610
2611 if (old_mode != new_mode)
2612 notify_of_pool_mode_change(pool);
2613 }
2614
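/*
 * Abort the current metadata transaction and flag the metadata as
 * needing a check; if either step fails the pool is dropped to PM_FAIL.
 */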
2615 static void abort_transaction(struct pool *pool)
2616 {
2617 const char *dev_name = dm_device_name(pool->pool_md);
2618
2619 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
2620 if (dm_pool_abort_metadata(pool->pmd)) {
2621 DMERR("%s: failed to abort metadata transaction", dev_name);
2622 set_pool_mode(pool, PM_FAIL);
2623 }
2624
2625 if (dm_pool_metadata_set_needs_check(pool->pmd)) {
2626 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
2627 set_pool_mode(pool, PM_FAIL);
2628 }
2629 }
2630
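/*
 * Common error path for failed metadata operations: log, abort the
 * transaction and degrade the pool to read-only mode.
 */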
2631 static void metadata_operation_failed(struct pool *pool, const char *op, int r)
2632 {
2633 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
2634 dm_device_name(pool->pool_md), op, r);
2635
2636 abort_transaction(pool);
2637 set_pool_mode(pool, PM_READ_ONLY);
2638 }
2639
2640
2641 /*----------------------------------------------------------------
2642 * Mapping functions.
2643 *--------------------------------------------------------------*/
2644 
2645 /*
2646 * Called only while mapping a thin bio to hand it over to the
2647 * workqueue.
2648 */
2649 static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
2650 {
2651 struct pool *pool = tc->pool;
2652
2653 spin_lock_irq(&tc->lock);
2654 bio_list_add(&tc->deferred_bio_list, bio);
2655 spin_unlock_irq(&tc->lock);
2656
2657 wake_worker(pool);
2658 }
2659
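/*
 * As thin_defer_bio(), but takes the pool throttle so the worker can
 * hold back submitters when it has been running for too long.
 */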
2660 static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
2661 {
2662 struct pool *pool = tc->pool;
2663
2664 throttle_lock(&pool->throttle);
2665 thin_defer_bio(tc, bio);
2666 throttle_unlock(&pool->throttle);
2667 }
2668
2669 static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2670 {
2671 struct pool *pool = tc->pool;
2672
2673 throttle_lock(&pool->throttle);
2674 spin_lock_irq(&tc->lock);
2675 list_add_tail(&cell->user_list, &tc->deferred_cells);
2676 spin_unlock_irq(&tc->lock);
2677 throttle_unlock(&pool->throttle);
2678
2679 wake_worker(pool);
2680 }
2681
2682 static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
2683 {
2684 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
2685
2686 h->tc = tc;
2687 h->shared_read_entry = NULL;
2688 h->all_io_entry = NULL;
2689 h->overwrite_mapping = NULL;
2690 h->cell = NULL;
2691 }
2692
2693 /*
2694 * Non-blocking function called from the thin target's map function.
2695 */
2696 static int thin_bio_map(struct dm_target *ti, struct bio *bio)
2697 {
2698 int r;
2699 struct thin_c *tc = ti->private;
2700 dm_block_t block = get_bio_block(tc, bio);
2701 struct dm_thin_device *td = tc->td;
2702 struct dm_thin_lookup_result result;
2703 struct dm_bio_prison_cell *virt_cell, *data_cell;
2704 struct dm_cell_key key;
2705
2706 thin_hook_bio(tc, bio);
2707
2708 if (tc->requeue_mode) {
2709 bio->bi_status = BLK_STS_DM_REQUEUE;
2710 bio_endio(bio);
2711 return DM_MAPIO_SUBMITTED;
2712 }
2713
2714 if (get_pool_mode(tc->pool) == PM_FAIL) {
2715 bio_io_error(bio);
2716 return DM_MAPIO_SUBMITTED;
2717 }
2718
2719 if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) {
2720 thin_defer_bio_with_throttle(tc, bio);
2721 return DM_MAPIO_SUBMITTED;
2722 }
2723
2724 /*
2725 * We must hold the virtual cell before doing the lookup, otherwise
2726 * there's a race with discard.
2727 */
2728 build_virtual_key(tc->td, block, &key);
2729 if (bio_detain(tc->pool, &key, bio, &virt_cell))
2730 return DM_MAPIO_SUBMITTED;
2731
2732 r = dm_thin_find_block(td, block, 0, &result);
2733
2734 /*
2735 * The lookup must not block here; -EWOULDBLOCK is handled below by
2736 * deferring the cell to the worker.
2737 switch (r) {
2738 case 0:
2739 if (unlikely(result.shared)) {
2740 /*
2741 * We have a race condition here between the
2742 * result.shared value returned by the
2743 * lookup and snapshot creation, which may
2744 * cause new sharing.
2745 *
2746 * To avoid this always quiesce the origin
2747 * before taking the snap.  You want to do
2748 * this anyway to ensure a consistent
2749 * application view (i.e. lockfs).
2750 *
2751 * More distant ancestors are irrelevant. The
2752 * shared flag will be set in their case.
2753 */
2754 thin_defer_cell(tc, virt_cell);
2755 return DM_MAPIO_SUBMITTED;
2756 }
2757
2758 build_data_key(tc->td, result.block, &key);
2759 if (bio_detain(tc->pool, &key, bio, &data_cell)) {
2760 cell_defer_no_holder(tc, virt_cell);
2761 return DM_MAPIO_SUBMITTED;
2762 }
2763
2764 inc_all_io_entry(tc->pool, bio);
2765 cell_defer_no_holder(tc, data_cell);
2766 cell_defer_no_holder(tc, virt_cell);
2767
2768 remap(tc, bio, result.block);
2769 return DM_MAPIO_REMAPPED;
2770
2771 case -ENODATA:
2772 case -EWOULDBLOCK:
2773 thin_defer_cell(tc, virt_cell);
2774 return DM_MAPIO_SUBMITTED;
2775
2776 default:
2777 /*
2778 * Must always call bio_io_error on failure.
2779 * dm_thin_find_block can fail with -EINVAL if the
2780 * pool is switched to fail-io mode.
2781 */
2782 bio_io_error(bio);
2783 cell_defer_no_holder(tc, virt_cell);
2784 return DM_MAPIO_SUBMITTED;
2785 }
2786 }
2787
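/*
 * Move any bios parked on the per-thin retry_on_resume lists back onto
 * the deferred lists so the worker will reprocess them.
 */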
2788 static void requeue_bios(struct pool *pool)
2789 {
2790 struct thin_c *tc;
2791
2792 rcu_read_lock();
2793 list_for_each_entry_rcu(tc, &pool->active_thins, list) {
2794 spin_lock_irq(&tc->lock);
2795 bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
2796 bio_list_init(&tc->retry_on_resume_list);
2797 spin_unlock_irq(&tc->lock);
2798 }
2799 rcu_read_unlock();
2800 }
2801
2802
2803
2804
2805 static bool is_factor(sector_t block_size, uint32_t n)
2806 {
2807 return !sector_div(block_size, n);
2808 }
2809
2810 /*
2811 * If discard_passdown was enabled verify that the data device
2812 * supports discards.  Disable discard_passdown if not.
2813 */
2814 static void disable_passdown_if_not_supported(struct pool_c *pt)
2815 {
2816 struct pool *pool = pt->pool;
2817 struct block_device *data_bdev = pt->data_dev->bdev;
2818 struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
2819 const char *reason = NULL;
2820
2821 if (!pt->adjusted_pf.discard_passdown)
2822 return;
2823
2824 if (!bdev_max_discard_sectors(pt->data_dev->bdev))
2825 reason = "discard unsupported";
2826
2827 else if (data_limits->max_discard_sectors < pool->sectors_per_block)
2828 reason = "max discard sectors smaller than a block";
2829
2830 if (reason) {
2831 DMWARN("Data device (%pg) %s: Disabling discard passdown.", data_bdev, reason);
2832 pt->adjusted_pf.discard_passdown = false;
2833 }
2834 }
2835
2836 static int bind_control_target(struct pool *pool, struct dm_target *ti)
2837 {
2838 struct pool_c *pt = ti->private;
2839
2840
2841
2842 /* We want to make sure that a pool in PM_FAIL mode is never upgraded. */
2843 enum pool_mode old_mode = get_pool_mode(pool);
2844 enum pool_mode new_mode = pt->adjusted_pf.mode;
2845
2846 /*
2847 * Don't change the pool's mode until set_pool_mode() below.
2848 * Otherwise the pool's process_* function pointers may
2849 * not match the requested pool mode.
2850 */
2851 pt->adjusted_pf.mode = old_mode;
2852
2853 pool->ti = ti;
2854 pool->pf = pt->adjusted_pf;
2855 pool->low_water_blocks = pt->low_water_blocks;
2856
2857 set_pool_mode(pool, new_mode);
2858
2859 return 0;
2860 }
2861
2862 static void unbind_control_target(struct pool *pool, struct dm_target *ti)
2863 {
2864 if (pool->ti == ti)
2865 pool->ti = NULL;
2866 }
2867
2868
2869 /*----------------------------------------------------------------
2870 * Pool creation
2871 *--------------------------------------------------------------*/
2872 static void pool_features_init(struct pool_features *pf)
2873 {
2874 pf->mode = PM_WRITE;
2875 pf->zero_new_blocks = true;
2876 pf->discard_enabled = true;
2877 pf->discard_passdown = true;
2878 pf->error_if_no_space = false;
2879 }
2880
2881 static void __pool_destroy(struct pool *pool)
2882 {
2883 __pool_table_remove(pool);
2884
2885 vfree(pool->cell_sort_array);
2886 if (dm_pool_metadata_close(pool->pmd) < 0)
2887 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2888
2889 dm_bio_prison_destroy(pool->prison);
2890 dm_kcopyd_client_destroy(pool->copier);
2891
2892 if (pool->wq)
2893 destroy_workqueue(pool->wq);
2894
2895 if (pool->next_mapping)
2896 mempool_free(pool->next_mapping, &pool->mapping_pool);
2897 mempool_exit(&pool->mapping_pool);
2898 dm_deferred_set_destroy(pool->shared_read_ds);
2899 dm_deferred_set_destroy(pool->all_io_ds);
2900 kfree(pool);
2901 }
2902
2903 static struct kmem_cache *_new_mapping_cache;
2904
2905 static struct pool *pool_create(struct mapped_device *pool_md,
2906 struct block_device *metadata_dev,
2907 struct block_device *data_dev,
2908 unsigned long block_size,
2909 int read_only, char **error)
2910 {
2911 int r;
2912 void *err_p;
2913 struct pool *pool;
2914 struct dm_pool_metadata *pmd;
2915 bool format_device = read_only ? false : true;
2916
2917 pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
2918 if (IS_ERR(pmd)) {
2919 *error = "Error creating metadata object";
2920 return (struct pool *)pmd;
2921 }
2922
2923 pool = kzalloc(sizeof(*pool), GFP_KERNEL);
2924 if (!pool) {
2925 *error = "Error allocating memory for pool";
2926 err_p = ERR_PTR(-ENOMEM);
2927 goto bad_pool;
2928 }
2929
2930 pool->pmd = pmd;
2931 pool->sectors_per_block = block_size;
2932 if (block_size & (block_size - 1))
2933 pool->sectors_per_block_shift = -1;
2934 else
2935 pool->sectors_per_block_shift = __ffs(block_size);
2936 pool->low_water_blocks = 0;
2937 pool_features_init(&pool->pf);
2938 pool->prison = dm_bio_prison_create();
2939 if (!pool->prison) {
2940 *error = "Error creating pool's bio prison";
2941 err_p = ERR_PTR(-ENOMEM);
2942 goto bad_prison;
2943 }
2944
2945 pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2946 if (IS_ERR(pool->copier)) {
2947 r = PTR_ERR(pool->copier);
2948 *error = "Error creating pool's kcopyd client";
2949 err_p = ERR_PTR(r);
2950 goto bad_kcopyd_client;
2951 }
2952
2953 /*
2954 * Create singlethreaded workqueue that will service all devices
2955 * that use this metadata.
2956 */
2957 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2958 if (!pool->wq) {
2959 *error = "Error creating pool's workqueue";
2960 err_p = ERR_PTR(-ENOMEM);
2961 goto bad_wq;
2962 }
2963
2964 throttle_init(&pool->throttle);
2965 INIT_WORK(&pool->worker, do_worker);
2966 INIT_DELAYED_WORK(&pool->waker, do_waker);
2967 INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
2968 spin_lock_init(&pool->lock);
2969 bio_list_init(&pool->deferred_flush_bios);
2970 bio_list_init(&pool->deferred_flush_completions);
2971 INIT_LIST_HEAD(&pool->prepared_mappings);
2972 INIT_LIST_HEAD(&pool->prepared_discards);
2973 INIT_LIST_HEAD(&pool->prepared_discards_pt2);
2974 INIT_LIST_HEAD(&pool->active_thins);
2975 pool->low_water_triggered = false;
2976 pool->suspended = true;
2977 pool->out_of_data_space = false;
2978
2979 pool->shared_read_ds = dm_deferred_set_create();
2980 if (!pool->shared_read_ds) {
2981 *error = "Error creating pool's shared read deferred set";
2982 err_p = ERR_PTR(-ENOMEM);
2983 goto bad_shared_read_ds;
2984 }
2985
2986 pool->all_io_ds = dm_deferred_set_create();
2987 if (!pool->all_io_ds) {
2988 *error = "Error creating pool's all io deferred set";
2989 err_p = ERR_PTR(-ENOMEM);
2990 goto bad_all_io_ds;
2991 }
2992
2993 pool->next_mapping = NULL;
2994 r = mempool_init_slab_pool(&pool->mapping_pool, MAPPING_POOL_SIZE,
2995 _new_mapping_cache);
2996 if (r) {
2997 *error = "Error creating pool's mapping mempool";
2998 err_p = ERR_PTR(r);
2999 goto bad_mapping_pool;
3000 }
3001
3002 pool->cell_sort_array =
3003 vmalloc(array_size(CELL_SORT_ARRAY_SIZE,
3004 sizeof(*pool->cell_sort_array)));
3005 if (!pool->cell_sort_array) {
3006 *error = "Error allocating cell sort array";
3007 err_p = ERR_PTR(-ENOMEM);
3008 goto bad_sort_array;
3009 }
3010
3011 pool->ref_count = 1;
3012 pool->last_commit_jiffies = jiffies;
3013 pool->pool_md = pool_md;
3014 pool->md_dev = metadata_dev;
3015 pool->data_dev = data_dev;
3016 __pool_table_insert(pool);
3017
3018 return pool;
3019
3020 bad_sort_array:
3021 mempool_exit(&pool->mapping_pool);
3022 bad_mapping_pool:
3023 dm_deferred_set_destroy(pool->all_io_ds);
3024 bad_all_io_ds:
3025 dm_deferred_set_destroy(pool->shared_read_ds);
3026 bad_shared_read_ds:
3027 destroy_workqueue(pool->wq);
3028 bad_wq:
3029 dm_kcopyd_client_destroy(pool->copier);
3030 bad_kcopyd_client:
3031 dm_bio_prison_destroy(pool->prison);
3032 bad_prison:
3033 kfree(pool);
3034 bad_pool:
3035 if (dm_pool_metadata_close(pmd))
3036 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
3037
3038 return err_p;
3039 }
3040
3041 static void __pool_inc(struct pool *pool)
3042 {
3043 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
3044 pool->ref_count++;
3045 }
3046
3047 static void __pool_dec(struct pool *pool)
3048 {
3049 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
3050 BUG_ON(!pool->ref_count);
3051 if (!--pool->ref_count)
3052 __pool_destroy(pool);
3053 }
3054
3055 static struct pool *__pool_find(struct mapped_device *pool_md,
3056 struct block_device *metadata_dev,
3057 struct block_device *data_dev,
3058 unsigned long block_size, int read_only,
3059 char **error, int *created)
3060 {
3061 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
3062
3063 if (pool) {
3064 if (pool->pool_md != pool_md) {
3065 *error = "metadata device already in use by a pool";
3066 return ERR_PTR(-EBUSY);
3067 }
3068 if (pool->data_dev != data_dev) {
3069 *error = "data device already in use by a pool";
3070 return ERR_PTR(-EBUSY);
3071 }
3072 __pool_inc(pool);
3073
3074 } else {
3075 pool = __pool_table_lookup(pool_md);
3076 if (pool) {
3077 if (pool->md_dev != metadata_dev || pool->data_dev != data_dev) {
3078 *error = "different pool cannot replace a pool";
3079 return ERR_PTR(-EINVAL);
3080 }
3081 __pool_inc(pool);
3082
3083 } else {
3084 pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error);
3085 *created = 1;
3086 }
3087 }
3088
3089 return pool;
3090 }
3091
3092 /*----------------------------------------------------------------
3093 * Pool target methods
3094 *--------------------------------------------------------------*/
3095 static void pool_dtr(struct dm_target *ti)
3096 {
3097 struct pool_c *pt = ti->private;
3098
3099 mutex_lock(&dm_thin_pool_table.mutex);
3100
3101 unbind_control_target(pt->pool, ti);
3102 __pool_dec(pt->pool);
3103 dm_put_device(ti, pt->metadata_dev);
3104 dm_put_device(ti, pt->data_dev);
3105 kfree(pt);
3106
3107 mutex_unlock(&dm_thin_pool_table.mutex);
3108 }
3109
3110 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
3111 struct dm_target *ti)
3112 {
3113 int r;
3114 unsigned argc;
3115 const char *arg_name;
3116
3117 static const struct dm_arg _args[] = {
3118 {0, 4, "Invalid number of pool feature arguments"},
3119 };
3120
3121 /*
3122 * No feature arguments supplied.
3123 */
3124 if (!as->argc)
3125 return 0;
3126
3127 r = dm_read_arg_group(_args, as, &argc, &ti->error);
3128 if (r)
3129 return -EINVAL;
3130
3131 while (argc && !r) {
3132 arg_name = dm_shift_arg(as);
3133 argc--;
3134
3135 if (!strcasecmp(arg_name, "skip_block_zeroing"))
3136 pf->zero_new_blocks = false;
3137
3138 else if (!strcasecmp(arg_name, "ignore_discard"))
3139 pf->discard_enabled = false;
3140
3141 else if (!strcasecmp(arg_name, "no_discard_passdown"))
3142 pf->discard_passdown = false;
3143
3144 else if (!strcasecmp(arg_name, "read_only"))
3145 pf->mode = PM_READ_ONLY;
3146
3147 else if (!strcasecmp(arg_name, "error_if_no_space"))
3148 pf->error_if_no_space = true;
3149
3150 else {
3151 ti->error = "Unrecognised pool feature requested";
3152 r = -EINVAL;
3153 break;
3154 }
3155 }
3156
3157 return r;
3158 }
3159
3160 static void metadata_low_callback(void *context)
3161 {
3162 struct pool *pool = context;
3163
3164 DMWARN("%s: reached low water mark for metadata device: sending event.",
3165 dm_device_name(pool->pool_md));
3166
3167 dm_table_event(pool->ti->table);
3168 }
3169
3170 /*
3171 * We need to flush the data device **before** committing the metadata.
3172 *
3173 * This ensures that the data blocks of any newly inserted mappings are
3174 * properly written to non-volatile storage and won't be lost in case of
3175 * a crash.
3176 *
3177 * Failure to do so can result in data corruption in the case of internal
3178 * or external snapshots and in the case of newly provisioned blocks, when
3179 * block zeroing is enabled.
3180 */
3181 static int metadata_pre_commit_callback(void *context)
3182 {
3183 struct pool *pool = context;
3184
3185 return blkdev_issue_flush(pool->data_dev);
3186 }
3187
3188 static sector_t get_dev_size(struct block_device *bdev)
3189 {
3190 return bdev_nr_sectors(bdev);
3191 }
3192
3193 static void warn_if_metadata_device_too_big(struct block_device *bdev)
3194 {
3195 sector_t metadata_dev_size = get_dev_size(bdev);
3196
3197 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
3198 DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.",
3199 bdev, THIN_METADATA_MAX_SECTORS);
3200 }
3201
3202 static sector_t get_metadata_dev_size(struct block_device *bdev)
3203 {
3204 sector_t metadata_dev_size = get_dev_size(bdev);
3205
3206 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
3207 metadata_dev_size = THIN_METADATA_MAX_SECTORS;
3208
3209 return metadata_dev_size;
3210 }
3211
3212 static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
3213 {
3214 sector_t metadata_dev_size = get_metadata_dev_size(bdev);
3215
3216 sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
3217
3218 return metadata_dev_size;
3219 }
3220
3221 /*
3222 * When a metadata threshold is crossed a dm event is triggered, and
3223 * userland should respond by growing the metadata device.  We could let
3224 * userland set the threshold, like we do with the data threshold, but I'm
3225 * not sure they know enough to do this well.
3226 */
3227 static dm_block_t calc_metadata_threshold(struct pool_c *pt)
3228 {
3229 /*
3230 * 4M is ample for all ops with the possible exception of thin
3231 * device deletion which is harmless if it fails (just retry the
3232 * delete after you've grown the device).
3233 */
3234 dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
3235 return min((dm_block_t)1024ULL , quarter);
3236 }
3237
3238 /*
3239 * thin-pool <metadata dev> <data dev>
3240 *	     <data block size (sectors)>
3241 *	     <low water mark (blocks)>
3242 *	     [<#feature args> [<arg>]*]
3243 *
3244 * Optional feature arguments are:
3245 *	     skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
3246 *	     ignore_discard: disable discard
3247 *	     no_discard_passdown: don't pass discards down to the data device
3248 *	     read_only: Don't allow any changes to be made to the pool metadata.
3249 *	     error_if_no_space: error IOs, instead of queueing, if no space.
3250 */
3251 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
3252 {
3253 int r, pool_created = 0;
3254 struct pool_c *pt;
3255 struct pool *pool;
3256 struct pool_features pf;
3257 struct dm_arg_set as;
3258 struct dm_dev *data_dev;
3259 unsigned long block_size;
3260 dm_block_t low_water_blocks;
3261 struct dm_dev *metadata_dev;
3262 fmode_t metadata_mode;
3263
3264
3265
3266
3267 mutex_lock(&dm_thin_pool_table.mutex);
3268
3269 if (argc < 4) {
3270 ti->error = "Invalid argument count";
3271 r = -EINVAL;
3272 goto out_unlock;
3273 }
3274
3275 as.argc = argc;
3276 as.argv = argv;
3277
3278 /* make sure metadata and data are different devices */
3279 if (!strcmp(argv[0], argv[1])) {
3280 ti->error = "Error setting metadata or data device";
3281 r = -EINVAL;
3282 goto out_unlock;
3283 }
3284
3285 /*
3286 * Set default pool features.
3287 */
3288 pool_features_init(&pf);
3289
3290 dm_consume_args(&as, 4);
3291 r = parse_pool_features(&as, &pf, ti);
3292 if (r)
3293 goto out_unlock;
3294
3295 metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
3296 r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
3297 if (r) {
3298 ti->error = "Error opening metadata block device";
3299 goto out_unlock;
3300 }
3301 warn_if_metadata_device_too_big(metadata_dev->bdev);
3302
3303 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
3304 if (r) {
3305 ti->error = "Error getting data device";
3306 goto out_metadata;
3307 }
3308
3309 if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
3310 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
3311 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
3312 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
3313 ti->error = "Invalid block size";
3314 r = -EINVAL;
3315 goto out;
3316 }
3317
3318 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
3319 ti->error = "Invalid low water mark";
3320 r = -EINVAL;
3321 goto out;
3322 }
3323
3324 pt = kzalloc(sizeof(*pt), GFP_KERNEL);
3325 if (!pt) {
3326 r = -ENOMEM;
3327 goto out;
3328 }
3329
3330 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, data_dev->bdev,
3331 block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
3332 if (IS_ERR(pool)) {
3333 r = PTR_ERR(pool);
3334 goto out_free_pt;
3335 }
3336
3337 /*
3338 * 'pool_created' reflects whether this is the first table load.
3339 * Top level discard support is not allowed to be changed after
3340 * initial load.  This would require a pool reload to trigger thin
3341 * device changes.
3342 */
3343 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
3344 ti->error = "Discard support cannot be disabled once enabled";
3345 r = -EINVAL;
3346 goto out_flags_changed;
3347 }
3348
3349 pt->pool = pool;
3350 pt->ti = ti;
3351 pt->metadata_dev = metadata_dev;
3352 pt->data_dev = data_dev;
3353 pt->low_water_blocks = low_water_blocks;
3354 pt->adjusted_pf = pt->requested_pf = pf;
3355 ti->num_flush_bios = 1;
3356
3357 /*
3358 * Only need to enable discards if the pool should pass
3359 * them down to the data device.  The thin device's discard
3360 * processing will cause mappings to be removed from the btree.
3361 */
3362 if (pf.discard_enabled && pf.discard_passdown) {
3363 ti->num_discard_bios = 1;
3364
3365 /*
3366 * Setting 'discards_supported' circumvents the normal
3367 * stacking of discard limits (this keeps the pool and
3368 * thin devices' discard limits consistent).
3369 */
3370 ti->discards_supported = true;
3371 }
3372 ti->private = pt;
3373
3374 r = dm_pool_register_metadata_threshold(pt->pool->pmd,
3375 calc_metadata_threshold(pt),
3376 metadata_low_callback,
3377 pool);
3378 if (r) {
3379 ti->error = "Error registering metadata threshold";
3380 goto out_flags_changed;
3381 }
3382
3383 dm_pool_register_pre_commit_callback(pool->pmd,
3384 metadata_pre_commit_callback, pool);
3385
3386 mutex_unlock(&dm_thin_pool_table.mutex);
3387
3388 return 0;
3389
3390 out_flags_changed:
3391 __pool_dec(pool);
3392 out_free_pt:
3393 kfree(pt);
3394 out:
3395 dm_put_device(ti, data_dev);
3396 out_metadata:
3397 dm_put_device(ti, metadata_dev);
3398 out_unlock:
3399 mutex_unlock(&dm_thin_pool_table.mutex);
3400
3401 return r;
3402 }
3403
3404 static int pool_map(struct dm_target *ti, struct bio *bio)
3405 {
3406 int r;
3407 struct pool_c *pt = ti->private;
3408 struct pool *pool = pt->pool;
3409
3410 /*
3411 * As this is a singleton target, ti->begin is always zero.
3412 */
3413 spin_lock_irq(&pool->lock);
3414 bio_set_dev(bio, pt->data_dev->bdev);
3415 r = DM_MAPIO_REMAPPED;
3416 spin_unlock_irq(&pool->lock);
3417
3418 return r;
3419 }
3420
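/*
 * Compare the data device size recorded in the superblock with the
 * current target length and grow the recorded size if the device has
 * been extended; shrinking is rejected.
 */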
3421 static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
3422 {
3423 int r;
3424 struct pool_c *pt = ti->private;
3425 struct pool *pool = pt->pool;
3426 sector_t data_size = ti->len;
3427 dm_block_t sb_data_size;
3428
3429 *need_commit = false;
3430
3431 (void) sector_div(data_size, pool->sectors_per_block);
3432
3433 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
3434 if (r) {
3435 DMERR("%s: failed to retrieve data device size",
3436 dm_device_name(pool->pool_md));
3437 return r;
3438 }
3439
3440 if (data_size < sb_data_size) {
3441 DMERR("%s: pool target (%llu blocks) too small: expected %llu",
3442 dm_device_name(pool->pool_md),
3443 (unsigned long long)data_size, sb_data_size);
3444 return -EINVAL;
3445
3446 } else if (data_size > sb_data_size) {
3447 if (dm_pool_metadata_needs_check(pool->pmd)) {
3448 DMERR("%s: unable to grow the data device until repaired.",
3449 dm_device_name(pool->pool_md));
3450 return 0;
3451 }
3452
3453 if (sb_data_size)
3454 DMINFO("%s: growing the data device from %llu to %llu blocks",
3455 dm_device_name(pool->pool_md),
3456 sb_data_size, (unsigned long long)data_size);
3457 r = dm_pool_resize_data_dev(pool->pmd, data_size);
3458 if (r) {
3459 metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
3460 return r;
3461 }
3462
3463 *need_commit = true;
3464 }
3465
3466 return 0;
3467 }
3468
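/*
 * As maybe_resize_data_dev(), but for the metadata device; growing it
 * also lets the pool leave PM_OUT_OF_METADATA_SPACE mode.
 */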
3469 static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
3470 {
3471 int r;
3472 struct pool_c *pt = ti->private;
3473 struct pool *pool = pt->pool;
3474 dm_block_t metadata_dev_size, sb_metadata_dev_size;
3475
3476 *need_commit = false;
3477
3478 metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
3479
3480 r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
3481 if (r) {
3482 DMERR("%s: failed to retrieve metadata device size",
3483 dm_device_name(pool->pool_md));
3484 return r;
3485 }
3486
3487 if (metadata_dev_size < sb_metadata_dev_size) {
3488 DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
3489 dm_device_name(pool->pool_md),
3490 metadata_dev_size, sb_metadata_dev_size);
3491 return -EINVAL;
3492
3493 } else if (metadata_dev_size > sb_metadata_dev_size) {
3494 if (dm_pool_metadata_needs_check(pool->pmd)) {
3495 DMERR("%s: unable to grow the metadata device until repaired.",
3496 dm_device_name(pool->pool_md));
3497 return 0;
3498 }
3499
3500 warn_if_metadata_device_too_big(pool->md_dev);
3501 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
3502 dm_device_name(pool->pool_md),
3503 sb_metadata_dev_size, metadata_dev_size);
3504
3505 if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
3506 set_pool_mode(pool, PM_WRITE);
3507
3508 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
3509 if (r) {
3510 metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
3511 return r;
3512 }
3513
3514 *need_commit = true;
3515 }
3516
3517 return 0;
3518 }
3519
3520 /*
3521 * Retrieves the number of blocks of the data device from
3522 * the superblock and compares it to the actual device size,
3523 * thus resizing the data device in case it has grown.
3524 *
3525 * This both copes with opening preallocated data devices in the ctr
3526 * being followed by a resume
3527 * -and-
3528 * calling the resume method individually after userspace has
3529 * grown the data device in reaction to a table event.
3530 */
3531 static int pool_preresume(struct dm_target *ti)
3532 {
3533 int r;
3534 bool need_commit1, need_commit2;
3535 struct pool_c *pt = ti->private;
3536 struct pool *pool = pt->pool;
3537
3538 /*
3539 * Take control of the pool object.
3540 */
3541 r = bind_control_target(pool, ti);
3542 if (r)
3543 return r;
3544
3545 r = maybe_resize_data_dev(ti, &need_commit1);
3546 if (r)
3547 return r;
3548
3549 r = maybe_resize_metadata_dev(ti, &need_commit2);
3550 if (r)
3551 return r;
3552
3553 if (need_commit1 || need_commit2)
3554 (void) commit(pool);
3555
3556 return 0;
3557 }
3558
3559 static void pool_suspend_active_thins(struct pool *pool)
3560 {
3561 struct thin_c *tc;
3562
3563
3564 tc = get_first_thin(pool);
3565 while (tc) {
3566 dm_internal_suspend_noflush(tc->thin_md);
3567 tc = get_next_thin(pool, tc);
3568 }
3569 }
3570
3571 static void pool_resume_active_thins(struct pool *pool)
3572 {
3573 struct thin_c *tc;
3574
3575
3576 tc = get_first_thin(pool);
3577 while (tc) {
3578 dm_internal_resume(tc->thin_md);
3579 tc = get_next_thin(pool, tc);
3580 }
3581 }
3582
3583 static void pool_resume(struct dm_target *ti)
3584 {
3585 struct pool_c *pt = ti->private;
3586 struct pool *pool = pt->pool;
3587
3588 /*
3589 * Requeue bios parked on the retry_on_resume lists so the worker
3590 * reprocesses them now that more space may be available.
3591 */
3592 requeue_bios(pool);
3593 pool_resume_active_thins(pool);
3594
3595 spin_lock_irq(&pool->lock);
3596 pool->low_water_triggered = false;
3597 pool->suspended = false;
3598 spin_unlock_irq(&pool->lock);
3599
3600 do_waker(&pool->waker.work);
3601 }
3602
3603 static void pool_presuspend(struct dm_target *ti)
3604 {
3605 struct pool_c *pt = ti->private;
3606 struct pool *pool = pt->pool;
3607
3608 spin_lock_irq(&pool->lock);
3609 pool->suspended = true;
3610 spin_unlock_irq(&pool->lock);
3611
3612 pool_suspend_active_thins(pool);
3613 }
3614
3615 static void pool_presuspend_undo(struct dm_target *ti)
3616 {
3617 struct pool_c *pt = ti->private;
3618 struct pool *pool = pt->pool;
3619
3620 pool_resume_active_thins(pool);
3621
3622 spin_lock_irq(&pool->lock);
3623 pool->suspended = false;
3624 spin_unlock_irq(&pool->lock);
3625 }
3626
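/*
 * Stop the periodic waker and the no-space timeout, drain the worker
 * and commit any outstanding metadata changes.
 */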
3627 static void pool_postsuspend(struct dm_target *ti)
3628 {
3629 struct pool_c *pt = ti->private;
3630 struct pool *pool = pt->pool;
3631
3632 cancel_delayed_work_sync(&pool->waker);
3633 cancel_delayed_work_sync(&pool->no_space_timeout);
3634 flush_workqueue(pool->wq);
3635 (void) commit(pool);
3636 }
3637
3638 static int check_arg_count(unsigned argc, unsigned args_required)
3639 {
3640 if (argc != args_required) {
3641 DMWARN("Message received with %u arguments instead of %u.",
3642 argc, args_required);
3643 return -EINVAL;
3644 }
3645
3646 return 0;
3647 }
3648
3649 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
3650 {
3651 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
3652 *dev_id <= MAX_DEV_ID)
3653 return 0;
3654
3655 if (warning)
3656 DMWARN("Message received with invalid device id: %s", arg);
3657
3658 return -EINVAL;
3659 }
3660
3661 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
3662 {
3663 dm_thin_id dev_id;
3664 int r;
3665
3666 r = check_arg_count(argc, 2);
3667 if (r)
3668 return r;
3669
3670 r = read_dev_id(argv[1], &dev_id, 1);
3671 if (r)
3672 return r;
3673
3674 r = dm_pool_create_thin(pool->pmd, dev_id);
3675 if (r) {
3676 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
3677 argv[1]);
3678 return r;
3679 }
3680
3681 return 0;
3682 }
3683
3684 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
3685 {
3686 dm_thin_id dev_id;
3687 dm_thin_id origin_dev_id;
3688 int r;
3689
3690 r = check_arg_count(argc, 3);
3691 if (r)
3692 return r;
3693
3694 r = read_dev_id(argv[1], &dev_id, 1);
3695 if (r)
3696 return r;
3697
3698 r = read_dev_id(argv[2], &origin_dev_id, 1);
3699 if (r)
3700 return r;
3701
3702 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
3703 if (r) {
3704 DMWARN("Creation of new snapshot %s of device %s failed.",
3705 argv[1], argv[2]);
3706 return r;
3707 }
3708
3709 return 0;
3710 }
3711
3712 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
3713 {
3714 dm_thin_id dev_id;
3715 int r;
3716
3717 r = check_arg_count(argc, 2);
3718 if (r)
3719 return r;
3720
3721 r = read_dev_id(argv[1], &dev_id, 1);
3722 if (r)
3723 return r;
3724
3725 r = dm_pool_delete_thin_device(pool->pmd, dev_id);
3726 if (r)
3727 DMWARN("Deletion of thin device %s failed.", argv[1]);
3728
3729 return r;
3730 }
3731
3732 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
3733 {
3734 dm_thin_id old_id, new_id;
3735 int r;
3736
3737 r = check_arg_count(argc, 3);
3738 if (r)
3739 return r;
3740
3741 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
3742 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
3743 return -EINVAL;
3744 }
3745
3746 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
3747 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
3748 return -EINVAL;
3749 }
3750
3751 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
3752 if (r) {
3753 DMWARN("Failed to change transaction id from %s to %s.",
3754 argv[1], argv[2]);
3755 return r;
3756 }
3757
3758 return 0;
3759 }
3760
3761 static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
3762 {
3763 int r;
3764
3765 r = check_arg_count(argc, 1);
3766 if (r)
3767 return r;
3768
3769 (void) commit(pool);
3770
3771 r = dm_pool_reserve_metadata_snap(pool->pmd);
3772 if (r)
3773 DMWARN("reserve_metadata_snap message failed.");
3774
3775 return r;
3776 }
3777
3778 static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
3779 {
3780 int r;
3781
3782 r = check_arg_count(argc, 1);
3783 if (r)
3784 return r;
3785
3786 r = dm_pool_release_metadata_snap(pool->pmd);
3787 if (r)
3788 DMWARN("release_metadata_snap message failed.");
3789
3790 return r;
3791 }
3792
3793 /*
3794 * Messages supported:
3795 *   create_thin	<dev_id>
3796 *   create_snap	<dev_id> <origin_id>
3797 *   delete		<dev_id>
3798 *   set_transaction_id <current_trans_id> <new_trans_id>
3799 *   reserve_metadata_snap
3800 *   release_metadata_snap
3801 */
3802 static int pool_message(struct dm_target *ti, unsigned argc, char **argv,
3803 char *result, unsigned maxlen)
3804 {
3805 int r = -EINVAL;
3806 struct pool_c *pt = ti->private;
3807 struct pool *pool = pt->pool;
3808
3809 if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
3810 DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
3811 dm_device_name(pool->pool_md));
3812 return -EOPNOTSUPP;
3813 }
3814
3815 if (!strcasecmp(argv[0], "create_thin"))
3816 r = process_create_thin_mesg(argc, argv, pool);
3817
3818 else if (!strcasecmp(argv[0], "create_snap"))
3819 r = process_create_snap_mesg(argc, argv, pool);
3820
3821 else if (!strcasecmp(argv[0], "delete"))
3822 r = process_delete_mesg(argc, argv, pool);
3823
3824 else if (!strcasecmp(argv[0], "set_transaction_id"))
3825 r = process_set_transaction_id_mesg(argc, argv, pool);
3826
3827 else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
3828 r = process_reserve_metadata_snap_mesg(argc, argv, pool);
3829
3830 else if (!strcasecmp(argv[0], "release_metadata_snap"))
3831 r = process_release_metadata_snap_mesg(argc, argv, pool);
3832
3833 else
3834 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
3835
3836 if (!r)
3837 (void) commit(pool);
3838
3839 return r;
3840 }
3841
3842 static void emit_flags(struct pool_features *pf, char *result,
3843 unsigned sz, unsigned maxlen)
3844 {
3845 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
3846 !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
3847 pf->error_if_no_space;
3848 DMEMIT("%u ", count);
3849
3850 if (!pf->zero_new_blocks)
3851 DMEMIT("skip_block_zeroing ");
3852
3853 if (!pf->discard_enabled)
3854 DMEMIT("ignore_discard ");
3855
3856 if (!pf->discard_passdown)
3857 DMEMIT("no_discard_passdown ");
3858
3859 if (pf->mode == PM_READ_ONLY)
3860 DMEMIT("read_only ");
3861
3862 if (pf->error_if_no_space)
3863 DMEMIT("error_if_no_space ");
3864 }
3865
3866 /*
3867 * Status line is:
3868 *    <transaction id> <used metadata blocks>/<total metadata blocks>
3869 *    <used data blocks>/<total data blocks> <held metadata root>
3870 *    <pool mode> <discard config> <no space config> <needs_check> <metadata low watermark>
3871 */
3872 static void pool_status(struct dm_target *ti, status_type_t type,
3873 unsigned status_flags, char *result, unsigned maxlen)
3874 {
3875 int r;
3876 unsigned sz = 0;
3877 uint64_t transaction_id;
3878 dm_block_t nr_free_blocks_data;
3879 dm_block_t nr_free_blocks_metadata;
3880 dm_block_t nr_blocks_data;
3881 dm_block_t nr_blocks_metadata;
3882 dm_block_t held_root;
3883 enum pool_mode mode;
3884 char buf[BDEVNAME_SIZE];
3885 char buf2[BDEVNAME_SIZE];
3886 struct pool_c *pt = ti->private;
3887 struct pool *pool = pt->pool;
3888
3889 switch (type) {
3890 case STATUSTYPE_INFO:
3891 if (get_pool_mode(pool) == PM_FAIL) {
3892 DMEMIT("Fail");
3893 break;
3894 }
3895
3896 /* Commit to ensure statistics aren't out-of-date */
3897 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3898 (void) commit(pool);
3899
3900 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
3901 if (r) {
3902 DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
3903 dm_device_name(pool->pool_md), r);
3904 goto err;
3905 }
3906
3907 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
3908 if (r) {
3909 DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
3910 dm_device_name(pool->pool_md), r);
3911 goto err;
3912 }
3913
3914 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
3915 if (r) {
3916 DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
3917 dm_device_name(pool->pool_md), r);
3918 goto err;
3919 }
3920
3921 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
3922 if (r) {
3923 DMERR("%s: dm_pool_get_free_block_count returned %d",
3924 dm_device_name(pool->pool_md), r);
3925 goto err;
3926 }
3927
3928 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
3929 if (r) {
3930 DMERR("%s: dm_pool_get_data_dev_size returned %d",
3931 dm_device_name(pool->pool_md), r);
3932 goto err;
3933 }
3934
3935 r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
3936 if (r) {
3937 DMERR("%s: dm_pool_get_metadata_snap returned %d",
3938 dm_device_name(pool->pool_md), r);
3939 goto err;
3940 }
3941
3942 DMEMIT("%llu %llu/%llu %llu/%llu ",
3943 (unsigned long long)transaction_id,
3944 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3945 (unsigned long long)nr_blocks_metadata,
3946 (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
3947 (unsigned long long)nr_blocks_data);
3948
3949 if (held_root)
3950 DMEMIT("%llu ", held_root);
3951 else
3952 DMEMIT("- ");
3953
3954 mode = get_pool_mode(pool);
3955 if (mode == PM_OUT_OF_DATA_SPACE)
3956 DMEMIT("out_of_data_space ");
3957 else if (is_read_only_pool_mode(mode))
3958 DMEMIT("ro ");
3959 else
3960 DMEMIT("rw ");
3961
3962 if (!pool->pf.discard_enabled)
3963 DMEMIT("ignore_discard ");
3964 else if (pool->pf.discard_passdown)
3965 DMEMIT("discard_passdown ");
3966 else
3967 DMEMIT("no_discard_passdown ");
3968
3969 if (pool->pf.error_if_no_space)
3970 DMEMIT("error_if_no_space ");
3971 else
3972 DMEMIT("queue_if_no_space ");
3973
3974 if (dm_pool_metadata_needs_check(pool->pmd))
3975 DMEMIT("needs_check ");
3976 else
3977 DMEMIT("- ");
3978
3979 DMEMIT("%llu ", (unsigned long long)calc_metadata_threshold(pt));
3980
3981 break;
3982
3983 case STATUSTYPE_TABLE:
3984 DMEMIT("%s %s %lu %llu ",
3985 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
3986 format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
3987 (unsigned long)pool->sectors_per_block,
3988 (unsigned long long)pt->low_water_blocks);
3989 emit_flags(&pt->requested_pf, result, sz, maxlen);
3990 break;
3991
3992 case STATUSTYPE_IMA:
3993 *result = '\0';
3994 break;
3995 }
3996 return;
3997
3998 err:
3999 DMEMIT("Error");
4000 }
4001
4002 static int pool_iterate_devices(struct dm_target *ti,
4003 iterate_devices_callout_fn fn, void *data)
4004 {
4005 struct pool_c *pt = ti->private;
4006
4007 return fn(ti, pt->data_dev, 0, ti->len, data);
4008 }
4009
4010 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
4011 {
4012 struct pool_c *pt = ti->private;
4013 struct pool *pool = pt->pool;
4014 sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
4015
4016 /*
4017 * If max_sectors is smaller than pool->sectors_per_block adjust it
4018 * to the highest possible power-of-2 factor of pool->sectors_per_block.
4019 * This is especially beneficial when the pool's data device is a RAID
4020 * device that has a full stripe width that matches pool->sectors_per_block
4021 * -- because even though partial RAID stripe-sized IOs will be issued to a
4022 *    single RAID stripe; when aggregated they will end on a full RAID stripe
4023 *    boundary.
4024 */
4025 if (limits->max_sectors < pool->sectors_per_block) {
4026 while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
4027 if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
4028 limits->max_sectors--;
4029 limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
4030 }
4031 }
4032
4033 /*
4034 * If the system-determined stacked limits are compatible with the
4035 * pool's blocksize (io_opt is a factor) then use it.
4036 */
4037 if (io_opt_sectors < pool->sectors_per_block ||
4038 !is_factor(io_opt_sectors, pool->sectors_per_block)) {
4039 if (is_factor(pool->sectors_per_block, limits->max_sectors))
4040 blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
4041 else
4042 blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
4043 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
4044 }
4045
4046 /*
4047 * pt->adjusted_pf is a staging area for the actual features to use.
4048 * They get transferred to the live pool in bind_control_target()
4049 * called from pool_preresume().
4050 */
4051 if (!pt->adjusted_pf.discard_enabled) {
4052 /*
4053 * Must explicitly disallow stacking discard limits otherwise the
4054 * block layer will stack them if pool's data device has support.
4055 */
4056 limits->discard_granularity = 0;
4057 return;
4058 }
4059
4060 disable_passdown_if_not_supported(pt);
4061
4062 /*
4063 * The pool uses the same discard limits as the underlying data
4064 * device.  DM core has already set this up.
4065 */
4066 }
4067
4068 static struct target_type pool_target = {
4069 .name = "thin-pool",
4070 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
4071 DM_TARGET_IMMUTABLE,
4072 .version = {1, 22, 0},
4073 .module = THIS_MODULE,
4074 .ctr = pool_ctr,
4075 .dtr = pool_dtr,
4076 .map = pool_map,
4077 .presuspend = pool_presuspend,
4078 .presuspend_undo = pool_presuspend_undo,
4079 .postsuspend = pool_postsuspend,
4080 .preresume = pool_preresume,
4081 .resume = pool_resume,
4082 .message = pool_message,
4083 .status = pool_status,
4084 .iterate_devices = pool_iterate_devices,
4085 .io_hints = pool_io_hints,
4086 };
4087
4088 /*----------------------------------------------------------------
4089 * Thin target methods
4090 *--------------------------------------------------------------*/
4091 static void thin_get(struct thin_c *tc)
4092 {
4093 refcount_inc(&tc->refcount);
4094 }
4095
4096 static void thin_put(struct thin_c *tc)
4097 {
4098 if (refcount_dec_and_test(&tc->refcount))
4099 complete(&tc->can_destroy);
4100 }
4101
4102 static void thin_dtr(struct dm_target *ti)
4103 {
4104 struct thin_c *tc = ti->private;
4105
4106 spin_lock_irq(&tc->pool->lock);
4107 list_del_rcu(&tc->list);
4108 spin_unlock_irq(&tc->pool->lock);
4109 synchronize_rcu();
4110
4111 thin_put(tc);
4112 wait_for_completion(&tc->can_destroy);
4113
4114 mutex_lock(&dm_thin_pool_table.mutex);
4115
4116 __pool_dec(tc->pool);
4117 dm_pool_close_thin_device(tc->td);
4118 dm_put_device(ti, tc->pool_dev);
4119 if (tc->origin_dev)
4120 dm_put_device(ti, tc->origin_dev);
4121 kfree(tc);
4122
4123 mutex_unlock(&dm_thin_pool_table.mutex);
4124 }
4125
4126 /*
4127 * Thin target parameters:
4128 *
4129 * <pool_dev> <dev_id> [origin_dev]
4130 *
4131 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
4132 * dev_id: the internal device identifier
4133 * origin_dev: a device external to the pool that should act as the origin
4134 *
4135 * If the pool device has discards disabled, they get disabled for the thin
4136 * device as well.
4137 */
4138 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
4139 {
4140 int r;
4141 struct thin_c *tc;
4142 struct dm_dev *pool_dev, *origin_dev;
4143 struct mapped_device *pool_md;
4144
4145 mutex_lock(&dm_thin_pool_table.mutex);
4146
4147 if (argc != 2 && argc != 3) {
4148 ti->error = "Invalid argument count";
4149 r = -EINVAL;
4150 goto out_unlock;
4151 }
4152
4153 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
4154 if (!tc) {
4155 ti->error = "Out of memory";
4156 r = -ENOMEM;
4157 goto out_unlock;
4158 }
4159 tc->thin_md = dm_table_get_md(ti->table);
4160 spin_lock_init(&tc->lock);
4161 INIT_LIST_HEAD(&tc->deferred_cells);
4162 bio_list_init(&tc->deferred_bio_list);
4163 bio_list_init(&tc->retry_on_resume_list);
4164 tc->sort_bio_list = RB_ROOT;
4165
4166 if (argc == 3) {
4167 if (!strcmp(argv[0], argv[2])) {
4168 ti->error = "Error setting origin device";
4169 r = -EINVAL;
4170 goto bad_origin_dev;
4171 }
4172
4173 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
4174 if (r) {
4175 ti->error = "Error opening origin device";
4176 goto bad_origin_dev;
4177 }
4178 tc->origin_dev = origin_dev;
4179 }
4180
4181 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
4182 if (r) {
4183 ti->error = "Error opening pool device";
4184 goto bad_pool_dev;
4185 }
4186 tc->pool_dev = pool_dev;
4187
4188 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
4189 ti->error = "Invalid device id";
4190 r = -EINVAL;
4191 goto bad_common;
4192 }
4193
4194 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
4195 if (!pool_md) {
4196 ti->error = "Couldn't get pool mapped device";
4197 r = -EINVAL;
4198 goto bad_common;
4199 }
4200
4201 tc->pool = __pool_table_lookup(pool_md);
4202 if (!tc->pool) {
4203 ti->error = "Couldn't find pool object";
4204 r = -EINVAL;
4205 goto bad_pool_lookup;
4206 }
4207 __pool_inc(tc->pool);
4208
4209 if (get_pool_mode(tc->pool) == PM_FAIL) {
4210 ti->error = "Couldn't open thin device, Pool is in fail mode";
4211 r = -EINVAL;
4212 goto bad_pool;
4213 }
4214
4215 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
4216 if (r) {
4217 ti->error = "Couldn't open thin internal device";
4218 goto bad_pool;
4219 }
4220
4221 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
4222 if (r)
4223 goto bad;
4224
4225 ti->num_flush_bios = 1;
4226 ti->flush_supported = true;
4227 ti->accounts_remapped_io = true;
4228 ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
4229
4230 /* In case the pool supports discards, pass them on. */
4231 if (tc->pool->pf.discard_enabled) {
4232 ti->discards_supported = true;
4233 ti->num_discard_bios = 1;
4234 }
4235
4236 mutex_unlock(&dm_thin_pool_table.mutex);
4237
4238 spin_lock_irq(&tc->pool->lock);
4239 if (tc->pool->suspended) {
4240 spin_unlock_irq(&tc->pool->lock);
4241 mutex_lock(&dm_thin_pool_table.mutex);
4242 ti->error = "Unable to activate thin device while pool is suspended";
4243 r = -EINVAL;
4244 goto bad;
4245 }
4246 refcount_set(&tc->refcount, 1);
4247 init_completion(&tc->can_destroy);
4248 list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
4249 spin_unlock_irq(&tc->pool->lock);
4250
4251 /*
4252 * This synchronize_rcu() call is needed here otherwise we risk a
4253 * wake_worker() call finding no bios to process (because the newly
4254 * added tc isn't yet visible to the rcu walkers of active_thins).
4255 */
4256 synchronize_rcu();
4257
4258 dm_put(pool_md);
4259
4260 return 0;
4261
4262 bad:
4263 dm_pool_close_thin_device(tc->td);
4264 bad_pool:
4265 __pool_dec(tc->pool);
4266 bad_pool_lookup:
4267 dm_put(pool_md);
4268 bad_common:
4269 dm_put_device(ti, tc->pool_dev);
4270 bad_pool_dev:
4271 if (tc->origin_dev)
4272 dm_put_device(ti, tc->origin_dev);
4273 bad_origin_dev:
4274 kfree(tc);
4275 out_unlock:
4276 mutex_unlock(&dm_thin_pool_table.mutex);
4277
4278 return r;
4279 }
4280
4281 static int thin_map(struct dm_target *ti, struct bio *bio)
4282 {
4283 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
4284
4285 return thin_bio_map(ti, bio);
4286 }
4287
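/*
 * End-of-io hook: release the deferred set entries taken during
 * mapping and hand any mappings or discards they complete over to the
 * worker, then release the bio's cell if it still holds one.
 */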
4288 static int thin_endio(struct dm_target *ti, struct bio *bio,
4289 blk_status_t *err)
4290 {
4291 unsigned long flags;
4292 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
4293 struct list_head work;
4294 struct dm_thin_new_mapping *m, *tmp;
4295 struct pool *pool = h->tc->pool;
4296
4297 if (h->shared_read_entry) {
4298 INIT_LIST_HEAD(&work);
4299 dm_deferred_entry_dec(h->shared_read_entry, &work);
4300
4301 spin_lock_irqsave(&pool->lock, flags);
4302 list_for_each_entry_safe(m, tmp, &work, list) {
4303 list_del(&m->list);
4304 __complete_mapping_preparation(m);
4305 }
4306 spin_unlock_irqrestore(&pool->lock, flags);
4307 }
4308
4309 if (h->all_io_entry) {
4310 INIT_LIST_HEAD(&work);
4311 dm_deferred_entry_dec(h->all_io_entry, &work);
4312 if (!list_empty(&work)) {
4313 spin_lock_irqsave(&pool->lock, flags);
4314 list_for_each_entry_safe(m, tmp, &work, list)
4315 list_add_tail(&m->list, &pool->prepared_discards);
4316 spin_unlock_irqrestore(&pool->lock, flags);
4317 wake_worker(pool);
4318 }
4319 }
4320
4321 if (h->cell)
4322 cell_defer_no_holder(h->tc, h->cell);
4323
4324 return DM_ENDIO_DONE;
4325 }
4326
4327 static void thin_presuspend(struct dm_target *ti)
4328 {
4329 struct thin_c *tc = ti->private;
4330
4331 if (dm_noflush_suspending(ti))
4332 noflush_work(tc, do_noflush_start);
4333 }
4334
4335 static void thin_postsuspend(struct dm_target *ti)
4336 {
4337 struct thin_c *tc = ti->private;
4338
4339 /*
4340 * The dm_noflush_suspending flag has been cleared by now, so
4341 * unfortunately we must always run this.
4342 */
4343 noflush_work(tc, do_noflush_stop);
4344 }
4345
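/* Record the current size of the external origin (if any) before resume. */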
4346 static int thin_preresume(struct dm_target *ti)
4347 {
4348 struct thin_c *tc = ti->private;
4349
4350 if (tc->origin_dev)
4351 tc->origin_size = get_dev_size(tc->origin_dev->bdev);
4352
4353 return 0;
4354 }
4355
4356 /*
4357 * <nr mapped sectors> <highest mapped sector>
4358 */
4359 static void thin_status(struct dm_target *ti, status_type_t type,
4360 unsigned status_flags, char *result, unsigned maxlen)
4361 {
4362 int r;
4363 ssize_t sz = 0;
4364 dm_block_t mapped, highest;
4365 char buf[BDEVNAME_SIZE];
4366 struct thin_c *tc = ti->private;
4367
4368 if (get_pool_mode(tc->pool) == PM_FAIL) {
4369 DMEMIT("Fail");
4370 return;
4371 }
4372
4373 if (!tc->td)
4374 DMEMIT("-");
4375 else {
4376 switch (type) {
4377 case STATUSTYPE_INFO:
4378 r = dm_thin_get_mapped_count(tc->td, &mapped);
4379 if (r) {
4380 DMERR("dm_thin_get_mapped_count returned %d", r);
4381 goto err;
4382 }
4383
4384 r = dm_thin_get_highest_mapped_block(tc->td, &highest);
4385 if (r < 0) {
4386 DMERR("dm_thin_get_highest_mapped_block returned %d", r);
4387 goto err;
4388 }
4389
4390 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
4391 if (r)
4392 DMEMIT("%llu", ((highest + 1) *
4393 tc->pool->sectors_per_block) - 1);
4394 else
4395 DMEMIT("-");
4396 break;
4397
4398 case STATUSTYPE_TABLE:
4399 DMEMIT("%s %lu",
4400 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
4401 (unsigned long) tc->dev_id);
4402 if (tc->origin_dev)
4403 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
4404 break;
4405
4406 case STATUSTYPE_IMA:
4407 *result = '\0';
4408 break;
4409 }
4410 }
4411
4412 return;
4413
4414 err:
4415 DMEMIT("Error");
4416 }
4417
4418 static int thin_iterate_devices(struct dm_target *ti,
4419 iterate_devices_callout_fn fn, void *data)
4420 {
4421 sector_t blocks;
4422 struct thin_c *tc = ti->private;
4423 struct pool *pool = tc->pool;
4424
4425 /*
4426 * We can't call dm_pool_get_data_dev_size() since that blocks.  So
4427 * we follow a more convoluted path through to the pool's target.
4428 */
4429 if (!pool->ti)
4430 return 0;
4431
4432 blocks = pool->ti->len;
4433 (void) sector_div(blocks, pool->sectors_per_block);
4434 if (blocks)
4435 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
4436
4437 return 0;
4438 }
4439
4440 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
4441 {
4442 struct thin_c *tc = ti->private;
4443 struct pool *pool = tc->pool;
4444
4445 if (!pool->pf.discard_enabled)
4446 return;
4447
4448 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
4449 limits->max_discard_sectors = 2048 * 1024 * 16;
4450 }
4451
4452 static struct target_type thin_target = {
4453 .name = "thin",
4454 .version = {1, 22, 0},
4455 .module = THIS_MODULE,
4456 .ctr = thin_ctr,
4457 .dtr = thin_dtr,
4458 .map = thin_map,
4459 .end_io = thin_endio,
4460 .preresume = thin_preresume,
4461 .presuspend = thin_presuspend,
4462 .postsuspend = thin_postsuspend,
4463 .status = thin_status,
4464 .iterate_devices = thin_iterate_devices,
4465 .io_hints = thin_io_hints,
4466 };
4467
4468
4469
4470 static int __init dm_thin_init(void)
4471 {
4472 int r = -ENOMEM;
4473
4474 pool_table_init();
4475
4476 _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
4477 if (!_new_mapping_cache)
4478 return r;
4479
4480 r = dm_register_target(&thin_target);
4481 if (r)
4482 goto bad_new_mapping_cache;
4483
4484 r = dm_register_target(&pool_target);
4485 if (r)
4486 goto bad_thin_target;
4487
4488 return 0;
4489
4490 bad_thin_target:
4491 dm_unregister_target(&thin_target);
4492 bad_new_mapping_cache:
4493 kmem_cache_destroy(_new_mapping_cache);
4494
4495 return r;
4496 }
4497
4498 static void dm_thin_exit(void)
4499 {
4500 dm_unregister_target(&thin_target);
4501 dm_unregister_target(&pool_target);
4502
4503 kmem_cache_destroy(_new_mapping_cache);
4504
4505 pool_table_exit();
4506 }
4507
4508 module_init(dm_thin_init);
4509 module_exit(dm_thin_exit);
4510
4511 module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR);
4512 MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
4513
4514 MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
4515 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
4516 MODULE_LICENSE("GPL");