// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "discard.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "raid56.h"
#include "block-group.h"
#include "zoned.h"

/*
 * Scrub reads all extents and super blocks and verifies their checksums.
 * When a bad checksum or a read error is found, a good copy is searched for
 * on the other mirrors and, if one exists, written back over the bad copy.
 * The same machinery is reused by device replace to copy verified data to
 * the replacement target device.
 */

struct scrub_block;
struct scrub_ctx;

/*
 * The following values only influence performance: how many sectors are
 * packed into a single bio and how many bios one scrub context keeps in
 * flight (with 4K sectors this is 128K per bio and 8M per device).
 */
#define SCRUB_SECTORS_PER_BIO		32
#define SCRUB_BIOS_PER_SCTX		64

/*
 * The largest number of sectors a single scrub_block may need to hold, i.e.
 * enough to cover the largest supported metadata block size.
 */
#define SCRUB_MAX_SECTORS_PER_BLOCK	(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)

struct scrub_recover {
	refcount_t		refs;
	struct btrfs_io_context	*bioc;
	u64			map_length;
};

struct scrub_sector {
	struct scrub_block	*sblock;
	struct page		*page;
	struct btrfs_device	*dev;
	struct list_head	list;
	u64			flags;  /* extent flags */
	u64			generation;
	u64			logical;
	u64			physical;
	u64			physical_for_dev_replace;
	atomic_t		refs;
	u8			mirror_num;
	unsigned int		have_csum:1;
	unsigned int		io_error:1;
	u8			csum[BTRFS_CSUM_SIZE];

	struct scrub_recover	*recover;
};

struct scrub_bio {
	int			index;
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;
	struct bio		*bio;
	blk_status_t		status;
	u64			logical;
	u64			physical;
	struct scrub_sector	*sectors[SCRUB_SECTORS_PER_BIO];
	int			sector_count;
	int			next_free;
	struct work_struct	work;
};

struct scrub_block {
	struct scrub_sector	*sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
	int			sector_count;
	atomic_t		outstanding_sectors;
	refcount_t		refs; /* free mem on transition to zero */
	struct scrub_ctx	*sctx;
	struct scrub_parity	*sparity;
	struct {
		unsigned int	header_error:1;
		unsigned int	checksum_error:1;
		unsigned int	no_io_error_seen:1;
		unsigned int	generation_error:1; /* also sets header_error */

		/* Set when the bad copy was successfully repaired */
		unsigned int	data_corrected:1;
	};
	struct work_struct	work;
};
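
/*
 * A scrub_block groups the scrub_sectors of one unit that is verified as a
 * whole, e.g. a metadata block or a run of data sectors sharing the same
 * extent flags.  The block holds a reference on each of its sectors and is
 * itself refcounted; the final scrub_block_put() drops the sector references
 * and frees the block.
 */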

/* Used for the chunks with parity stripe such as RAID5/6 */
struct scrub_parity {
	struct scrub_ctx	*sctx;

	struct btrfs_device	*scrub_dev;

	u64			logic_start;

	u64			logic_end;

	int			nsectors;

	u32			stripe_len;

	refcount_t		refs;

	struct list_head	sectors_list;

	/* Work of parity check and repair */
	struct work_struct	work;

	/* Mark the parity blocks which have data */
	unsigned long		dbitmap;

	/*
	 * Mark the parity blocks which have data, but errors happened when
	 * reading or checking that data
	 */
	unsigned long		ebitmap;
};

struct scrub_ctx {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
	struct btrfs_fs_info	*fs_info;
	int			first_free;
	int			curr;
	atomic_t		bios_in_flight;
	atomic_t		workers_pending;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	struct list_head	csum_list;
	atomic_t		cancel_req;
	int			readonly;
	int			sectors_per_bio;

	/* State of IO submission throttling affecting the associated device */
	ktime_t			throttle_deadline;
	u64			throttle_sent;

	int			is_dev_replace;
	u64			write_pointer;

	struct scrub_bio	*wr_curr_bio;
	struct mutex		wr_lock;
	struct btrfs_device	*wr_tgtdev;
	bool			flush_all_writes;

	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
	 */
	refcount_t		refs;
};
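
/*
 * One scrub_ctx exists per running scrub or device replace operation.  The
 * pre-allocated scrub_bios are kept on a free list threaded through
 * scrub_bio::next_free, with first_free pointing at the head and curr naming
 * the bio that is currently being filled (-1 when none is).
 */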

struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	const char		*errstr;
	u64			physical;
	u64			logical;
	struct btrfs_device	*dev;
};

struct full_stripe_lock {
	struct rb_node		node;
	u64			logical;
	u64			refs;
	struct mutex		mutex;
};

static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
				     struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock,
				int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good);
static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
					      struct scrub_block *sblock_good,
					      int sector_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
					     int sector_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_sector_get(struct scrub_sector *sector);
static void scrub_sector_put(struct scrub_sector *sector);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
			 u64 physical, struct btrfs_device *dev, u64 flags,
			 u64 gen, int mirror_num, u8 *csum,
			 u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio);
static void scrub_bio_end_io_worker(struct work_struct *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
				 u64 extent_logical, u32 extent_len,
				 u64 *extent_physical,
				 struct btrfs_device **extent_dev,
				 int *extent_mirror_num);
static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
				      struct scrub_sector *sector);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio);
static void scrub_wr_bio_end_io_worker(struct work_struct *work);
static void scrub_put_ctx(struct scrub_ctx *sctx);
0245
0246 static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
0247 {
0248 return sector->recover &&
0249 (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
0250 }
0251
0252 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
0253 {
0254 refcount_inc(&sctx->refs);
0255 atomic_inc(&sctx->bios_in_flight);
0256 }
0257
0258 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
0259 {
0260 atomic_dec(&sctx->bios_in_flight);
0261 wake_up(&sctx->list_wait);
0262 scrub_put_ctx(sctx);
0263 }
0264
0265 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
0266 {
0267 while (atomic_read(&fs_info->scrub_pause_req)) {
0268 mutex_unlock(&fs_info->scrub_lock);
0269 wait_event(fs_info->scrub_pause_wait,
0270 atomic_read(&fs_info->scrub_pause_req) == 0);
0271 mutex_lock(&fs_info->scrub_lock);
0272 }
0273 }
0274
0275 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
0276 {
0277 atomic_inc(&fs_info->scrubs_paused);
0278 wake_up(&fs_info->scrub_pause_wait);
0279 }
0280
0281 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
0282 {
0283 mutex_lock(&fs_info->scrub_lock);
0284 __scrub_blocked_if_needed(fs_info);
0285 atomic_dec(&fs_info->scrubs_paused);
0286 mutex_unlock(&fs_info->scrub_lock);
0287
0288 wake_up(&fs_info->scrub_pause_wait);
0289 }
0290
0291 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
0292 {
0293 scrub_pause_on(fs_info);
0294 scrub_pause_off(fs_info);
0295 }
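
/*
 * scrub_pause_on() announces that this scrub is paused and wakes any waiter,
 * scrub_pause_off() blocks until fs_info->scrub_pause_req drops to zero
 * before declaring the scrub running again.  scrub_blocked_if_needed() is
 * the combination used at safe points in the scrubbing loops.
 */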
0296
/*
 * Insert a new full stripe lock into the full stripe locks tree.
 *
 * Return a pointer to the existing or newly inserted full_stripe_lock
 * structure if everything works well.
 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory.
 *
 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
 * function.
 */
0307 static struct full_stripe_lock *insert_full_stripe_lock(
0308 struct btrfs_full_stripe_locks_tree *locks_root,
0309 u64 fstripe_logical)
0310 {
0311 struct rb_node **p;
0312 struct rb_node *parent = NULL;
0313 struct full_stripe_lock *entry;
0314 struct full_stripe_lock *ret;
0315
0316 lockdep_assert_held(&locks_root->lock);
0317
0318 p = &locks_root->root.rb_node;
0319 while (*p) {
0320 parent = *p;
0321 entry = rb_entry(parent, struct full_stripe_lock, node);
0322 if (fstripe_logical < entry->logical) {
0323 p = &(*p)->rb_left;
0324 } else if (fstripe_logical > entry->logical) {
0325 p = &(*p)->rb_right;
0326 } else {
0327 entry->refs++;
0328 return entry;
0329 }
0330 }
0331
	/* Insert new lock */
0335 ret = kmalloc(sizeof(*ret), GFP_KERNEL);
0336 if (!ret)
0337 return ERR_PTR(-ENOMEM);
0338 ret->logical = fstripe_logical;
0339 ret->refs = 1;
0340 mutex_init(&ret->mutex);
0341
0342 rb_link_node(&ret->node, parent, p);
0343 rb_insert_color(&ret->node, &locks_root->root);
0344 return ret;
0345 }
0346
/*
 * Search for a full stripe lock of a block group.
 *
 * Return a pointer to the existing full stripe lock if found.
 * Return NULL if not found.
 */
0353 static struct full_stripe_lock *search_full_stripe_lock(
0354 struct btrfs_full_stripe_locks_tree *locks_root,
0355 u64 fstripe_logical)
0356 {
0357 struct rb_node *node;
0358 struct full_stripe_lock *entry;
0359
0360 lockdep_assert_held(&locks_root->lock);
0361
0362 node = locks_root->root.rb_node;
0363 while (node) {
0364 entry = rb_entry(node, struct full_stripe_lock, node);
0365 if (fstripe_logical < entry->logical)
0366 node = node->rb_left;
0367 else if (fstripe_logical > entry->logical)
0368 node = node->rb_right;
0369 else
0370 return entry;
0371 }
0372 return NULL;
0373 }
0374
/*
 * Helper to get the full stripe logical start from a normal bytenr.
 *
 * Caller must ensure @cache is a RAID56 block group.
 */
0380 static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
0381 {
0382 u64 ret;
	/*
	 * Due to the chunk item size limit, the full stripe length can not be
	 * bigger than U32_MAX.  Just a sanity check here.
	 */
	WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);

	/*
	 * round_down() can only handle powers of 2, while RAID56 full
	 * stripe length can be 64KiB * n, so we need to manually round down.
	 */
0394 ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
0395 cache->full_stripe_len + cache->start;
0396 return ret;
0397 }
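
/*
 * Example for get_full_stripe_logical() above: with a full stripe length of
 * 128K and a block group starting at S, a bytenr of S + 200K yields
 * div64_u64(200K, 128K) == 1, so the returned full stripe start is S + 128K.
 */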
/*
 * Lock a full stripe to avoid concurrency of recovery and read.
 *
 * It's only used for profiles with parities (RAID5/6), for other profiles it
 * does nothing.
 *
 * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
 * The caller must then call unlock_full_stripe() in the same context.
 *
 * Return <0 if we failed to wait on a full stripe lock.
 */
0410 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
0411 bool *locked_ret)
0412 {
0413 struct btrfs_block_group *bg_cache;
0414 struct btrfs_full_stripe_locks_tree *locks_root;
0415 struct full_stripe_lock *existing;
0416 u64 fstripe_start;
0417 int ret = 0;
0418
0419 *locked_ret = false;
0420 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
0421 if (!bg_cache) {
0422 ASSERT(0);
0423 return -ENOENT;
0424 }
	/* Profiles not based on parity don't need full stripe lock */
0427 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
0428 goto out;
0429 locks_root = &bg_cache->full_stripe_locks_root;
0430
0431 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
	/* Insert the full stripe lock and then take its mutex */
0434 mutex_lock(&locks_root->lock);
0435 existing = insert_full_stripe_lock(locks_root, fstripe_start);
0436 mutex_unlock(&locks_root->lock);
0437 if (IS_ERR(existing)) {
0438 ret = PTR_ERR(existing);
0439 goto out;
0440 }
0441 mutex_lock(&existing->mutex);
0442 *locked_ret = true;
0443 out:
0444 btrfs_put_block_group(bg_cache);
0445 return ret;
0446 }
0447
/*
 * Unlock a full stripe.
 *
 * NOTE: Caller must ensure it's the same context that called the
 * corresponding lock_full_stripe().
 *
 * Return 0 if we unlocked the full stripe without problem.
 * Return <0 for error.
 */
0457 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
0458 bool locked)
0459 {
0460 struct btrfs_block_group *bg_cache;
0461 struct btrfs_full_stripe_locks_tree *locks_root;
0462 struct full_stripe_lock *fstripe_lock;
0463 u64 fstripe_start;
0464 bool freeit = false;
0465 int ret = 0;
	/* If we didn't acquire the full stripe lock, no need to continue */
0468 if (!locked)
0469 return 0;
0470
0471 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
0472 if (!bg_cache) {
0473 ASSERT(0);
0474 return -ENOENT;
0475 }
0476 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
0477 goto out;
0478
0479 locks_root = &bg_cache->full_stripe_locks_root;
0480 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
0481
0482 mutex_lock(&locks_root->lock);
0483 fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
0484
0485 if (!fstripe_lock) {
0486 WARN_ON(1);
0487 ret = -ENOENT;
0488 mutex_unlock(&locks_root->lock);
0489 goto out;
0490 }
0491
0492 if (fstripe_lock->refs == 0) {
0493 WARN_ON(1);
0494 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
0495 fstripe_lock->logical);
0496 } else {
0497 fstripe_lock->refs--;
0498 }
0499
0500 if (fstripe_lock->refs == 0) {
0501 rb_erase(&fstripe_lock->node, &locks_root->root);
0502 freeit = true;
0503 }
0504 mutex_unlock(&locks_root->lock);
0505
0506 mutex_unlock(&fstripe_lock->mutex);
0507 if (freeit)
0508 kfree(fstripe_lock);
0509 out:
0510 btrfs_put_block_group(bg_cache);
0511 return ret;
0512 }
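
/*
 * lock_full_stripe() and unlock_full_stripe() must be paired in the same
 * context; the @locked_ret / @locked flag tells the unlock side whether a
 * lock was actually taken (non-RAID56 block groups take none).
 */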
0513
0514 static void scrub_free_csums(struct scrub_ctx *sctx)
0515 {
0516 while (!list_empty(&sctx->csum_list)) {
0517 struct btrfs_ordered_sum *sum;
0518 sum = list_first_entry(&sctx->csum_list,
0519 struct btrfs_ordered_sum, list);
0520 list_del(&sum->list);
0521 kfree(sum);
0522 }
0523 }
0524
0525 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
0526 {
0527 int i;
0528
0529 if (!sctx)
0530 return;
	/* this can happen when scrub is cancelled */
0533 if (sctx->curr != -1) {
0534 struct scrub_bio *sbio = sctx->bios[sctx->curr];
0535
0536 for (i = 0; i < sbio->sector_count; i++) {
0537 WARN_ON(!sbio->sectors[i]->page);
0538 scrub_block_put(sbio->sectors[i]->sblock);
0539 }
0540 bio_put(sbio->bio);
0541 }
0542
0543 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
0544 struct scrub_bio *sbio = sctx->bios[i];
0545
0546 if (!sbio)
0547 break;
0548 kfree(sbio);
0549 }
0550
0551 kfree(sctx->wr_curr_bio);
0552 scrub_free_csums(sctx);
0553 kfree(sctx);
0554 }
0555
0556 static void scrub_put_ctx(struct scrub_ctx *sctx)
0557 {
0558 if (refcount_dec_and_test(&sctx->refs))
0559 scrub_free_ctx(sctx);
0560 }
0561
0562 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
0563 struct btrfs_fs_info *fs_info, int is_dev_replace)
0564 {
0565 struct scrub_ctx *sctx;
0566 int i;
0567
0568 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
0569 if (!sctx)
0570 goto nomem;
0571 refcount_set(&sctx->refs, 1);
0572 sctx->is_dev_replace = is_dev_replace;
0573 sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
0574 sctx->curr = -1;
0575 sctx->fs_info = fs_info;
0576 INIT_LIST_HEAD(&sctx->csum_list);
0577 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
0578 struct scrub_bio *sbio;
0579
0580 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
0581 if (!sbio)
0582 goto nomem;
0583 sctx->bios[i] = sbio;
0584
0585 sbio->index = i;
0586 sbio->sctx = sctx;
0587 sbio->sector_count = 0;
0588 INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
0589
0590 if (i != SCRUB_BIOS_PER_SCTX - 1)
0591 sctx->bios[i]->next_free = i + 1;
0592 else
0593 sctx->bios[i]->next_free = -1;
0594 }
0595 sctx->first_free = 0;
0596 atomic_set(&sctx->bios_in_flight, 0);
0597 atomic_set(&sctx->workers_pending, 0);
0598 atomic_set(&sctx->cancel_req, 0);
0599
0600 spin_lock_init(&sctx->list_lock);
0601 spin_lock_init(&sctx->stat_lock);
0602 init_waitqueue_head(&sctx->list_wait);
0603 sctx->throttle_deadline = 0;
0604
0605 WARN_ON(sctx->wr_curr_bio != NULL);
0606 mutex_init(&sctx->wr_lock);
0607 sctx->wr_curr_bio = NULL;
0608 if (is_dev_replace) {
0609 WARN_ON(!fs_info->dev_replace.tgtdev);
0610 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
0611 sctx->flush_all_writes = false;
0612 }
0613
0614 return sctx;
0615
0616 nomem:
0617 scrub_free_ctx(sctx);
0618 return ERR_PTR(-ENOMEM);
0619 }
0620
0621 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
0622 void *warn_ctx)
0623 {
0624 u32 nlink;
0625 int ret;
0626 int i;
0627 unsigned nofs_flag;
0628 struct extent_buffer *eb;
0629 struct btrfs_inode_item *inode_item;
0630 struct scrub_warning *swarn = warn_ctx;
0631 struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
0632 struct inode_fs_paths *ipath = NULL;
0633 struct btrfs_root *local_root;
0634 struct btrfs_key key;
0635
0636 local_root = btrfs_get_fs_root(fs_info, root, true);
0637 if (IS_ERR(local_root)) {
0638 ret = PTR_ERR(local_root);
0639 goto err;
0640 }
0641
	/*
	 * This makes the path point to (inum INODE_ITEM ioff).
	 */
0645 key.objectid = inum;
0646 key.type = BTRFS_INODE_ITEM_KEY;
0647 key.offset = 0;
0648
0649 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
0650 if (ret) {
0651 btrfs_put_root(local_root);
0652 btrfs_release_path(swarn->path);
0653 goto err;
0654 }
0655
0656 eb = swarn->path->nodes[0];
0657 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
0658 struct btrfs_inode_item);
0659 nlink = btrfs_inode_nlink(eb, inode_item);
0660 btrfs_release_path(swarn->path);
	/*
	 * init_ipath() allocates memory which can trigger reclaim, so run it
	 * under a NOFS context to avoid recursing back into the filesystem.
	 */
0667 nofs_flag = memalloc_nofs_save();
0668 ipath = init_ipath(4096, local_root, swarn->path);
0669 memalloc_nofs_restore(nofs_flag);
0670 if (IS_ERR(ipath)) {
0671 btrfs_put_root(local_root);
0672 ret = PTR_ERR(ipath);
0673 ipath = NULL;
0674 goto err;
0675 }
0676 ret = paths_from_inode(inum, ipath);
0677
0678 if (ret < 0)
0679 goto err;
0680
	/*
	 * We deliberately ignore that ipath might have been too small to
	 * hold all of the paths here.
	 */
0685 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
0686 btrfs_warn_in_rcu(fs_info,
0687 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
0688 swarn->errstr, swarn->logical,
0689 rcu_str_deref(swarn->dev->name),
0690 swarn->physical,
0691 root, inum, offset,
0692 fs_info->sectorsize, nlink,
0693 (char *)(unsigned long)ipath->fspath->val[i]);
0694
0695 btrfs_put_root(local_root);
0696 free_ipath(ipath);
0697 return 0;
0698
0699 err:
0700 btrfs_warn_in_rcu(fs_info,
0701 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
0702 swarn->errstr, swarn->logical,
0703 rcu_str_deref(swarn->dev->name),
0704 swarn->physical,
0705 root, inum, offset, ret);
0706
0707 free_ipath(ipath);
0708 return 0;
0709 }
0710
0711 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
0712 {
0713 struct btrfs_device *dev;
0714 struct btrfs_fs_info *fs_info;
0715 struct btrfs_path *path;
0716 struct btrfs_key found_key;
0717 struct extent_buffer *eb;
0718 struct btrfs_extent_item *ei;
0719 struct scrub_warning swarn;
0720 unsigned long ptr = 0;
0721 u64 extent_item_pos;
0722 u64 flags = 0;
0723 u64 ref_root;
0724 u32 item_size;
0725 u8 ref_level = 0;
0726 int ret;
0727
0728 WARN_ON(sblock->sector_count < 1);
0729 dev = sblock->sectors[0]->dev;
0730 fs_info = sblock->sctx->fs_info;
0731
0732 path = btrfs_alloc_path();
0733 if (!path)
0734 return;
0735
0736 swarn.physical = sblock->sectors[0]->physical;
0737 swarn.logical = sblock->sectors[0]->logical;
0738 swarn.errstr = errstr;
0739 swarn.dev = NULL;
0740
0741 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
0742 &flags);
0743 if (ret < 0)
0744 goto out;
0745
0746 extent_item_pos = swarn.logical - found_key.objectid;
0747 swarn.extent_item_size = found_key.offset;
0748
0749 eb = path->nodes[0];
0750 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
0751 item_size = btrfs_item_size(eb, path->slots[0]);
0752
0753 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
0754 do {
0755 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
0756 item_size, &ref_root,
0757 &ref_level);
0758 btrfs_warn_in_rcu(fs_info,
0759 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
0760 errstr, swarn.logical,
0761 rcu_str_deref(dev->name),
0762 swarn.physical,
0763 ref_level ? "node" : "leaf",
0764 ret < 0 ? -1 : ref_level,
0765 ret < 0 ? -1 : ref_root);
0766 } while (ret != 1);
0767 btrfs_release_path(path);
0768 } else {
0769 btrfs_release_path(path);
0770 swarn.path = path;
0771 swarn.dev = dev;
0772 iterate_extent_inodes(fs_info, found_key.objectid,
0773 extent_item_pos, 1,
0774 scrub_print_warning_inode, &swarn, false);
0775 }
0776
0777 out:
0778 btrfs_free_path(path);
0779 }
0780
0781 static inline void scrub_get_recover(struct scrub_recover *recover)
0782 {
0783 refcount_inc(&recover->refs);
0784 }
0785
0786 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
0787 struct scrub_recover *recover)
0788 {
0789 if (refcount_dec_and_test(&recover->refs)) {
0790 btrfs_bio_counter_dec(fs_info);
0791 btrfs_put_bioc(recover->bioc);
0792 kfree(recover);
0793 }
0794 }
0795
/*
 * scrub_handle_errored_block gets called when either verification of the
 * sectors failed or the bio failed to read, e.g. with EIO. In the latter
 * case, this function handles all sectors in the bio, even though only one
 * may be bad.
 * The goal of this function is to repair the errored block by using the
 * contents of one of the other mirrors.
 */
0804 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
0805 {
0806 struct scrub_ctx *sctx = sblock_to_check->sctx;
0807 struct btrfs_device *dev;
0808 struct btrfs_fs_info *fs_info;
0809 u64 logical;
0810 unsigned int failed_mirror_index;
0811 unsigned int is_metadata;
0812 unsigned int have_csum;
0813 struct scrub_block *sblocks_for_recheck;
0814 struct scrub_block *sblock_bad;
0815 int ret;
0816 int mirror_index;
0817 int sector_num;
0818 int success;
0819 bool full_stripe_locked;
0820 unsigned int nofs_flag;
0821 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
0822 DEFAULT_RATELIMIT_BURST);
0823
0824 BUG_ON(sblock_to_check->sector_count < 1);
0825 fs_info = sctx->fs_info;
0826 if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
		/*
		 * If we find an error in a super block, we just report it.
		 * It will get written with the next transaction commit
		 * anyway.
		 */
0832 spin_lock(&sctx->stat_lock);
0833 ++sctx->stat.super_errors;
0834 spin_unlock(&sctx->stat_lock);
0835 return 0;
0836 }
0837 logical = sblock_to_check->sectors[0]->logical;
0838 BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
0839 failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
0840 is_metadata = !(sblock_to_check->sectors[0]->flags &
0841 BTRFS_EXTENT_FLAG_DATA);
0842 have_csum = sblock_to_check->sectors[0]->have_csum;
0843 dev = sblock_to_check->sectors[0]->dev;
0844
0845 if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
0846 return 0;
0847
	/*
	 * We must use GFP_NOFS because the scrub task might be waiting for a
	 * worker task executing this function and in turn a transaction commit
	 * might be waiting for the scrub task to pause (which needs to wait for
	 * all the worker tasks to complete before pausing).
	 * We do allocations in the workers through insert_full_stripe_lock()
	 * and scrub_add_sector_to_wr_bio(), which happen down the call chain of
	 * this function.
	 */
	nofs_flag = memalloc_nofs_save();
	/*
	 * For RAID5/6, a race can happen with a scrub thread of a different
	 * device: for data corruption, the parity and data threads will both
	 * try to recover the data.  The race can lead to a doubly counted
	 * csum error, or even an unrecoverable error.
	 */
0865 ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
0866 if (ret < 0) {
0867 memalloc_nofs_restore(nofs_flag);
0868 spin_lock(&sctx->stat_lock);
0869 if (ret == -ENOMEM)
0870 sctx->stat.malloc_errors++;
0871 sctx->stat.read_errors++;
0872 sctx->stat.uncorrectable_errors++;
0873 spin_unlock(&sctx->stat_lock);
0874 return ret;
0875 }
	/*
	 * Read all mirrors one after the other. This includes re-reading the
	 * extent or metadata block that failed (the cause that this fixup
	 * code is called at all) another time, sector by sector this time in
	 * order to know which sectors caused I/O errors and which ones are
	 * good (for all mirrors).
	 * The goal is to handle the situation when more than one mirror
	 * contains I/O errors, but the errors do not overlap, i.e. the data
	 * can be repaired by selecting the sectors from those mirrors without
	 * I/O error on the particular sectors. One example (with blocks >=
	 * 2 * sectorsize) would be that mirror #1 has an I/O error on the
	 * first sector, the second sector is good, and mirror #2 has an I/O
	 * error on the second sector, but the first sector is good. Then the
	 * first sector of the first mirror can be repaired by taking the
	 * first sector of the second mirror, and the second sector of the
	 * second mirror can be repaired by copying the contents of the second
	 * sector of the first mirror.
	 * One more note: if the sectors of one mirror contain I/O errors, the
	 * checksum cannot be verified. In order to get the best data for
	 * repairing, the first attempt is to find a mirror without I/O errors
	 * and with a validated checksum. Only if this is not possible, the
	 * sectors are picked from mirrors with I/O errors without considering
	 * the checksum. If the latter is the case, at the end the checksum of
	 * the repaired area is verified in order to correctly maintain the
	 * statistics.
	 */
0906 sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
0907 sizeof(*sblocks_for_recheck), GFP_KERNEL);
0908 if (!sblocks_for_recheck) {
0909 spin_lock(&sctx->stat_lock);
0910 sctx->stat.malloc_errors++;
0911 sctx->stat.read_errors++;
0912 sctx->stat.uncorrectable_errors++;
0913 spin_unlock(&sctx->stat_lock);
0914 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
0915 goto out;
0916 }
	/* Setup the contexts for the other mirrors and map their sectors */
0919 ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
0920 if (ret) {
0921 spin_lock(&sctx->stat_lock);
0922 sctx->stat.read_errors++;
0923 sctx->stat.uncorrectable_errors++;
0924 spin_unlock(&sctx->stat_lock);
0925 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
0926 goto out;
0927 }
0928 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
0929 sblock_bad = sblocks_for_recheck + failed_mirror_index;
	/* Build and submit the bios for the failed mirror, check checksums */
0932 scrub_recheck_block(fs_info, sblock_bad, 1);
0933
0934 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
0935 sblock_bad->no_io_error_seen) {
		/*
		 * The error disappeared after reading sector by sector, or
		 * the area was part of a huge bio and other parts of the
		 * bio caused I/O errors, or the block layer merged several
		 * read requests into one and the error is caused by a
		 * different bio (usually one of the two latter cases is
		 * the cause).
		 */
0944 spin_lock(&sctx->stat_lock);
0945 sctx->stat.unverified_errors++;
0946 sblock_to_check->data_corrected = 1;
0947 spin_unlock(&sctx->stat_lock);
0948
0949 if (sctx->is_dev_replace)
0950 scrub_write_block_to_dev_replace(sblock_bad);
0951 goto out;
0952 }
0953
0954 if (!sblock_bad->no_io_error_seen) {
0955 spin_lock(&sctx->stat_lock);
0956 sctx->stat.read_errors++;
0957 spin_unlock(&sctx->stat_lock);
0958 if (__ratelimit(&rs))
0959 scrub_print_warning("i/o error", sblock_to_check);
0960 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
0961 } else if (sblock_bad->checksum_error) {
0962 spin_lock(&sctx->stat_lock);
0963 sctx->stat.csum_errors++;
0964 spin_unlock(&sctx->stat_lock);
0965 if (__ratelimit(&rs))
0966 scrub_print_warning("checksum error", sblock_to_check);
0967 btrfs_dev_stat_inc_and_print(dev,
0968 BTRFS_DEV_STAT_CORRUPTION_ERRS);
0969 } else if (sblock_bad->header_error) {
0970 spin_lock(&sctx->stat_lock);
0971 sctx->stat.verify_errors++;
0972 spin_unlock(&sctx->stat_lock);
0973 if (__ratelimit(&rs))
0974 scrub_print_warning("checksum/header error",
0975 sblock_to_check);
0976 if (sblock_bad->generation_error)
0977 btrfs_dev_stat_inc_and_print(dev,
0978 BTRFS_DEV_STAT_GENERATION_ERRS);
0979 else
0980 btrfs_dev_stat_inc_and_print(dev,
0981 BTRFS_DEV_STAT_CORRUPTION_ERRS);
0982 }
0983
0984 if (sctx->readonly) {
0985 ASSERT(!sctx->is_dev_replace);
0986 goto out;
0987 }
	/*
	 * Now build and submit the bios for the other mirrors, check
	 * checksums.
	 * First try to pick the mirror which is completely without I/O
	 * errors and also does not have a checksum error. If one is found,
	 * and if a checksum is present, the full block that is known to
	 * contain an error is rewritten. If no such mirror exists, the
	 * repair falls back to the sector by sector loop further below,
	 * which picks good sectors from whichever mirrors have them.
	 */
1004 for (mirror_index = 0; ;mirror_index++) {
1005 struct scrub_block *sblock_other;
1006
1007 if (mirror_index == failed_mirror_index)
1008 continue;
1009
1010
1011 if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1012 if (mirror_index >= BTRFS_MAX_MIRRORS)
1013 break;
1014 if (!sblocks_for_recheck[mirror_index].sector_count)
1015 break;
1016
1017 sblock_other = sblocks_for_recheck + mirror_index;
1018 } else {
1019 struct scrub_recover *r = sblock_bad->sectors[0]->recover;
1020 int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
1021
1022 if (mirror_index >= max_allowed)
1023 break;
1024 if (!sblocks_for_recheck[1].sector_count)
1025 break;
1026
1027 ASSERT(failed_mirror_index == 0);
1028 sblock_other = sblocks_for_recheck + 1;
1029 sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
1030 }
1031
1032
1033 scrub_recheck_block(fs_info, sblock_other, 0);
1034
1035 if (!sblock_other->header_error &&
1036 !sblock_other->checksum_error &&
1037 sblock_other->no_io_error_seen) {
1038 if (sctx->is_dev_replace) {
1039 scrub_write_block_to_dev_replace(sblock_other);
1040 goto corrected_error;
1041 } else {
1042 ret = scrub_repair_block_from_good_copy(
1043 sblock_bad, sblock_other);
1044 if (!ret)
1045 goto corrected_error;
1046 }
1047 }
1048 }
1049
1050 if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1051 goto did_not_correct_error;
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077 success = 1;
1078 for (sector_num = 0; sector_num < sblock_bad->sector_count;
1079 sector_num++) {
1080 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1081 struct scrub_block *sblock_other = NULL;
1082
1083
1084 if (!sector_bad->io_error && !sctx->is_dev_replace)
1085 continue;
1086
1087 if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
			/*
			 * In case of dev replace, if the raid56 rebuild
			 * process didn't work out correct data, then copy the
			 * content of sblock_bad to make sure the target device
			 * is identical to the source device, instead of
			 * writing garbage data from the sblocks_for_recheck
			 * array to the target device.
			 */
1095 sblock_other = NULL;
1096 } else if (sector_bad->io_error) {
			/* Try to find a mirror without an I/O error on this sector */
1098 for (mirror_index = 0;
1099 mirror_index < BTRFS_MAX_MIRRORS &&
1100 sblocks_for_recheck[mirror_index].sector_count > 0;
1101 mirror_index++) {
1102 if (!sblocks_for_recheck[mirror_index].
1103 sectors[sector_num]->io_error) {
1104 sblock_other = sblocks_for_recheck +
1105 mirror_index;
1106 break;
1107 }
1108 }
1109 if (!sblock_other)
1110 success = 0;
1111 }
1112
1113 if (sctx->is_dev_replace) {
			/*
			 * Did not find a mirror to fetch the sector from.
			 * scrub_write_sector_to_dev_replace() handles this
			 * case (sector->io_error) by filling the block with
			 * zeros before submitting the write request.
			 */
1120 if (!sblock_other)
1121 sblock_other = sblock_bad;
1122
1123 if (scrub_write_sector_to_dev_replace(sblock_other,
1124 sector_num) != 0) {
1125 atomic64_inc(
1126 &fs_info->dev_replace.num_write_errors);
1127 success = 0;
1128 }
1129 } else if (sblock_other) {
1130 ret = scrub_repair_sector_from_good_copy(sblock_bad,
1131 sblock_other,
1132 sector_num, 0);
1133 if (0 == ret)
1134 sector_bad->io_error = 0;
1135 else
1136 success = 0;
1137 }
1138 }
1139
1140 if (success && !sctx->is_dev_replace) {
1141 if (is_metadata || have_csum) {
			/*
			 * Need to verify the checksum now that all sectors
			 * on disk are repaired (the write request for the
			 * data to be repaired is on its way). Just be lazy
			 * and use scrub_recheck_block() which re-reads the
			 * data before the checksum is verified, but most
			 * likely the data comes out of the page cache.
			 */
1151 scrub_recheck_block(fs_info, sblock_bad, 1);
1152 if (!sblock_bad->header_error &&
1153 !sblock_bad->checksum_error &&
1154 sblock_bad->no_io_error_seen)
1155 goto corrected_error;
1156 else
1157 goto did_not_correct_error;
1158 } else {
1159 corrected_error:
1160 spin_lock(&sctx->stat_lock);
1161 sctx->stat.corrected_errors++;
1162 sblock_to_check->data_corrected = 1;
1163 spin_unlock(&sctx->stat_lock);
1164 btrfs_err_rl_in_rcu(fs_info,
1165 "fixed up error at logical %llu on dev %s",
1166 logical, rcu_str_deref(dev->name));
1167 }
1168 } else {
1169 did_not_correct_error:
1170 spin_lock(&sctx->stat_lock);
1171 sctx->stat.uncorrectable_errors++;
1172 spin_unlock(&sctx->stat_lock);
1173 btrfs_err_rl_in_rcu(fs_info,
1174 "unable to fixup (regular) error at logical %llu on dev %s",
1175 logical, rcu_str_deref(dev->name));
1176 }
1177
1178 out:
1179 if (sblocks_for_recheck) {
1180 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1181 mirror_index++) {
1182 struct scrub_block *sblock = sblocks_for_recheck +
1183 mirror_index;
1184 struct scrub_recover *recover;
1185 int i;
1186
1187 for (i = 0; i < sblock->sector_count; i++) {
1188 sblock->sectors[i]->sblock = NULL;
1189 recover = sblock->sectors[i]->recover;
1190 if (recover) {
1191 scrub_put_recover(fs_info, recover);
1192 sblock->sectors[i]->recover = NULL;
1193 }
1194 scrub_sector_put(sblock->sectors[i]);
1195 }
1196 }
1197 kfree(sblocks_for_recheck);
1198 }
1199
1200 ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1201 memalloc_nofs_restore(nofs_flag);
1202 if (ret < 0)
1203 return ret;
1204 return 0;
1205 }
1206
1207 static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
1208 {
1209 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1210 return 2;
1211 else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1212 return 3;
1213 else
1214 return (int)bioc->num_stripes;
1215 }
1216
1217 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1218 u64 *raid_map,
1219 int nstripes, int mirror,
1220 int *stripe_index,
1221 u64 *stripe_offset)
1222 {
1223 int i;
1224
1225 if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1226
1227 for (i = 0; i < nstripes; i++) {
1228 if (raid_map[i] == RAID6_Q_STRIPE ||
1229 raid_map[i] == RAID5_P_STRIPE)
1230 continue;
1231
1232 if (logical >= raid_map[i] &&
1233 logical < raid_map[i] + BTRFS_STRIPE_LEN)
1234 break;
1235 }
1236
1237 *stripe_index = i;
1238 *stripe_offset = logical - raid_map[i];
1239 } else {
1240
1241 *stripe_index = mirror;
1242 *stripe_offset = 0;
1243 }
1244 }
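
/*
 * Example for scrub_stripe_index_and_offset() above, assuming a three device
 * RAID5 chunk: raid_map[] holds the logical start of each data stripe plus
 * RAID5_P_STRIPE for the parity slot.  A @logical that falls inside
 * [raid_map[i], raid_map[i] + BTRFS_STRIPE_LEN) selects stripe_index i and
 * stripe_offset (logical - raid_map[i]); for non-parity profiles the mirror
 * number is used as the stripe index directly.
 */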
1245
1246 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1247 struct scrub_block *sblocks_for_recheck)
1248 {
1249 struct scrub_ctx *sctx = original_sblock->sctx;
1250 struct btrfs_fs_info *fs_info = sctx->fs_info;
1251 u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
1252 u64 logical = original_sblock->sectors[0]->logical;
1253 u64 generation = original_sblock->sectors[0]->generation;
1254 u64 flags = original_sblock->sectors[0]->flags;
1255 u64 have_csum = original_sblock->sectors[0]->have_csum;
1256 struct scrub_recover *recover;
1257 struct btrfs_io_context *bioc;
1258 u64 sublen;
1259 u64 mapped_length;
1260 u64 stripe_offset;
1261 int stripe_index;
1262 int sector_index = 0;
1263 int mirror_index;
1264 int nmirrors;
1265 int ret;
1266
	/*
	 * Note: the two members refs and outstanding_sectors are not used (and
	 * not set) in the blocks that are used for the recheck procedure.
	 */
1272 while (length > 0) {
1273 sublen = min_t(u64, length, fs_info->sectorsize);
1274 mapped_length = sublen;
1275 bioc = NULL;
		/*
		 * With a length of sectorsize, each returned stripe represents
		 * one mirror.
		 */
1281 btrfs_bio_counter_inc_blocked(fs_info);
1282 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1283 logical, &mapped_length, &bioc);
1284 if (ret || !bioc || mapped_length < sublen) {
1285 btrfs_put_bioc(bioc);
1286 btrfs_bio_counter_dec(fs_info);
1287 return -EIO;
1288 }
1289
1290 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1291 if (!recover) {
1292 btrfs_put_bioc(bioc);
1293 btrfs_bio_counter_dec(fs_info);
1294 return -ENOMEM;
1295 }
1296
1297 refcount_set(&recover->refs, 1);
1298 recover->bioc = bioc;
1299 recover->map_length = mapped_length;
1300
1301 ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
1302
1303 nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
1304
1305 for (mirror_index = 0; mirror_index < nmirrors;
1306 mirror_index++) {
1307 struct scrub_block *sblock;
1308 struct scrub_sector *sector;
1309
1310 sblock = sblocks_for_recheck + mirror_index;
1311 sblock->sctx = sctx;
1312
1313 sector = kzalloc(sizeof(*sector), GFP_NOFS);
1314 if (!sector) {
1315 leave_nomem:
1316 spin_lock(&sctx->stat_lock);
1317 sctx->stat.malloc_errors++;
1318 spin_unlock(&sctx->stat_lock);
1319 scrub_put_recover(fs_info, recover);
1320 return -ENOMEM;
1321 }
1322 scrub_sector_get(sector);
1323 sblock->sectors[sector_index] = sector;
1324 sector->sblock = sblock;
1325 sector->flags = flags;
1326 sector->generation = generation;
1327 sector->logical = logical;
1328 sector->have_csum = have_csum;
1329 if (have_csum)
1330 memcpy(sector->csum,
1331 original_sblock->sectors[0]->csum,
1332 sctx->fs_info->csum_size);
1333
1334 scrub_stripe_index_and_offset(logical,
1335 bioc->map_type,
1336 bioc->raid_map,
1337 bioc->num_stripes -
1338 bioc->num_tgtdevs,
1339 mirror_index,
1340 &stripe_index,
1341 &stripe_offset);
1342 sector->physical = bioc->stripes[stripe_index].physical +
1343 stripe_offset;
1344 sector->dev = bioc->stripes[stripe_index].dev;
1345
1346 BUG_ON(sector_index >= original_sblock->sector_count);
1347 sector->physical_for_dev_replace =
1348 original_sblock->sectors[sector_index]->
1349 physical_for_dev_replace;
1350
1351 sector->mirror_num = mirror_index + 1;
1352 sblock->sector_count++;
1353 sector->page = alloc_page(GFP_NOFS);
1354 if (!sector->page)
1355 goto leave_nomem;
1356
1357 scrub_get_recover(recover);
1358 sector->recover = recover;
1359 }
1360 scrub_put_recover(fs_info, recover);
1361 length -= sublen;
1362 logical += sublen;
1363 sector_index++;
1364 }
1365
1366 return 0;
1367 }
1368
1369 static void scrub_bio_wait_endio(struct bio *bio)
1370 {
1371 complete(bio->bi_private);
1372 }
1373
1374 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1375 struct bio *bio,
1376 struct scrub_sector *sector)
1377 {
1378 DECLARE_COMPLETION_ONSTACK(done);
1379
1380 bio->bi_iter.bi_sector = sector->logical >> 9;
1381 bio->bi_private = &done;
1382 bio->bi_end_io = scrub_bio_wait_endio;
1383 raid56_parity_recover(bio, sector->recover->bioc,
1384 sector->sblock->sectors[0]->mirror_num, false);
1385
1386 wait_for_completion_io(&done);
1387 return blk_status_to_errno(bio->bi_status);
1388 }
1389
1390 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1391 struct scrub_block *sblock)
1392 {
1393 struct scrub_sector *first_sector = sblock->sectors[0];
1394 struct bio *bio;
1395 int i;
1396
1397
1398 ASSERT(first_sector->dev);
1399 if (!first_sector->dev->bdev)
1400 goto out;
1401
1402 bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
1403
1404 for (i = 0; i < sblock->sector_count; i++) {
1405 struct scrub_sector *sector = sblock->sectors[i];
1406
1407 WARN_ON(!sector->page);
1408 bio_add_page(bio, sector->page, PAGE_SIZE, 0);
1409 }
1410
1411 if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
1412 bio_put(bio);
1413 goto out;
1414 }
1415
1416 bio_put(bio);
1417
1418 scrub_recheck_block_checksum(sblock);
1419
1420 return;
1421 out:
1422 for (i = 0; i < sblock->sector_count; i++)
1423 sblock->sectors[i]->io_error = 1;
1424
1425 sblock->no_io_error_seen = 0;
1426 }
1427
/*
 * This function will check the on disk data for checksum errors, header
 * errors and read I/O errors. If any I/O error happens, the exact sectors
 * which are errored are marked as being bad. The goal is to enable scrub to
 * take those sectors that are not errored from all the mirrors so that the
 * sectors that are errored in the just handled mirror can be repaired.
 */
1435 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1436 struct scrub_block *sblock,
1437 int retry_failed_mirror)
1438 {
1439 int i;
1440
1441 sblock->no_io_error_seen = 1;
1442
1443
1444 if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
1445 return scrub_recheck_block_on_raid56(fs_info, sblock);
1446
1447 for (i = 0; i < sblock->sector_count; i++) {
1448 struct scrub_sector *sector = sblock->sectors[i];
1449 struct bio bio;
1450 struct bio_vec bvec;
1451
1452 if (sector->dev->bdev == NULL) {
1453 sector->io_error = 1;
1454 sblock->no_io_error_seen = 0;
1455 continue;
1456 }
1457
1458 WARN_ON(!sector->page);
1459 bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
1460 bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
1461 bio.bi_iter.bi_sector = sector->physical >> 9;
1462
1463 btrfsic_check_bio(&bio);
1464 if (submit_bio_wait(&bio)) {
1465 sector->io_error = 1;
1466 sblock->no_io_error_seen = 0;
1467 }
1468
1469 bio_uninit(&bio);
1470 }
1471
1472 if (sblock->no_io_error_seen)
1473 scrub_recheck_block_checksum(sblock);
1474 }
1475
1476 static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
1477 {
1478 struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
1479 int ret;
1480
1481 ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1482 return !ret;
1483 }
1484
1485 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1486 {
1487 sblock->header_error = 0;
1488 sblock->checksum_error = 0;
1489 sblock->generation_error = 0;
1490
1491 if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1492 scrub_checksum_data(sblock);
1493 else
1494 scrub_checksum_tree_block(sblock);
1495 }
1496
1497 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1498 struct scrub_block *sblock_good)
1499 {
1500 int i;
1501 int ret = 0;
1502
1503 for (i = 0; i < sblock_bad->sector_count; i++) {
1504 int ret_sub;
1505
1506 ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
1507 sblock_good, i, 1);
1508 if (ret_sub)
1509 ret = ret_sub;
1510 }
1511
1512 return ret;
1513 }
1514
1515 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
1516 struct scrub_block *sblock_good,
1517 int sector_num, int force_write)
1518 {
1519 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1520 struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
1521 struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1522 const u32 sectorsize = fs_info->sectorsize;
1523
1524 BUG_ON(sector_bad->page == NULL);
1525 BUG_ON(sector_good->page == NULL);
1526 if (force_write || sblock_bad->header_error ||
1527 sblock_bad->checksum_error || sector_bad->io_error) {
1528 struct bio bio;
1529 struct bio_vec bvec;
1530 int ret;
1531
1532 if (!sector_bad->dev->bdev) {
1533 btrfs_warn_rl(fs_info,
1534 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1535 return -EIO;
1536 }
1537
1538 bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
1539 bio.bi_iter.bi_sector = sector_bad->physical >> 9;
1540 __bio_add_page(&bio, sector_good->page, sectorsize, 0);
1541
1542 btrfsic_check_bio(&bio);
1543 ret = submit_bio_wait(&bio);
1544 bio_uninit(&bio);
1545
1546 if (ret) {
1547 btrfs_dev_stat_inc_and_print(sector_bad->dev,
1548 BTRFS_DEV_STAT_WRITE_ERRS);
1549 atomic64_inc(&fs_info->dev_replace.num_write_errors);
1550 return -EIO;
1551 }
1552 }
1553
1554 return 0;
1555 }
1556
1557 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1558 {
1559 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1560 int i;
1561
	/*
	 * This block is used for the check of the parity on the source device,
	 * so the data needn't be written into the destination device.
	 */
1566 if (sblock->sparity)
1567 return;
1568
1569 for (i = 0; i < sblock->sector_count; i++) {
1570 int ret;
1571
1572 ret = scrub_write_sector_to_dev_replace(sblock, i);
1573 if (ret)
1574 atomic64_inc(&fs_info->dev_replace.num_write_errors);
1575 }
1576 }
1577
1578 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
1579 {
1580 struct scrub_sector *sector = sblock->sectors[sector_num];
1581
1582 BUG_ON(sector->page == NULL);
1583 if (sector->io_error)
1584 clear_page(page_address(sector->page));
1585
1586 return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
1587 }
1588
1589 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1590 {
1591 int ret = 0;
1592 u64 length;
1593
1594 if (!btrfs_is_zoned(sctx->fs_info))
1595 return 0;
1596
1597 if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1598 return 0;
1599
1600 if (sctx->write_pointer < physical) {
1601 length = physical - sctx->write_pointer;
1602
1603 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1604 sctx->write_pointer, length);
1605 if (!ret)
1606 sctx->write_pointer = physical;
1607 }
1608 return ret;
1609 }
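
/*
 * On zoned devices the write pointer of the target zone must advance
 * sequentially, so fill_writer_pointer_gap() zeroes out the range between the
 * current write pointer and the next physical position before regular write
 * bios are issued there.
 */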
1610
1611 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
1612 struct scrub_sector *sector)
1613 {
1614 struct scrub_bio *sbio;
1615 int ret;
1616 const u32 sectorsize = sctx->fs_info->sectorsize;
1617
1618 mutex_lock(&sctx->wr_lock);
1619 again:
1620 if (!sctx->wr_curr_bio) {
1621 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1622 GFP_KERNEL);
1623 if (!sctx->wr_curr_bio) {
1624 mutex_unlock(&sctx->wr_lock);
1625 return -ENOMEM;
1626 }
1627 sctx->wr_curr_bio->sctx = sctx;
1628 sctx->wr_curr_bio->sector_count = 0;
1629 }
1630 sbio = sctx->wr_curr_bio;
1631 if (sbio->sector_count == 0) {
1632 ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
1633 if (ret) {
1634 mutex_unlock(&sctx->wr_lock);
1635 return ret;
1636 }
1637
1638 sbio->physical = sector->physical_for_dev_replace;
1639 sbio->logical = sector->logical;
1640 sbio->dev = sctx->wr_tgtdev;
1641 if (!sbio->bio) {
1642 sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
1643 REQ_OP_WRITE, GFP_NOFS);
1644 }
1645 sbio->bio->bi_private = sbio;
1646 sbio->bio->bi_end_io = scrub_wr_bio_end_io;
1647 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
1648 sbio->status = 0;
1649 } else if (sbio->physical + sbio->sector_count * sectorsize !=
1650 sector->physical_for_dev_replace ||
1651 sbio->logical + sbio->sector_count * sectorsize !=
1652 sector->logical) {
1653 scrub_wr_submit(sctx);
1654 goto again;
1655 }
1656
1657 ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
1658 if (ret != sectorsize) {
1659 if (sbio->sector_count < 1) {
1660 bio_put(sbio->bio);
1661 sbio->bio = NULL;
1662 mutex_unlock(&sctx->wr_lock);
1663 return -EIO;
1664 }
1665 scrub_wr_submit(sctx);
1666 goto again;
1667 }
1668
1669 sbio->sectors[sbio->sector_count] = sector;
1670 scrub_sector_get(sector);
1671 sbio->sector_count++;
1672 if (sbio->sector_count == sctx->sectors_per_bio)
1673 scrub_wr_submit(sctx);
1674 mutex_unlock(&sctx->wr_lock);
1675
1676 return 0;
1677 }
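
/*
 * Writes to the dev-replace target are batched like the read side: sectors
 * are appended to wr_curr_bio as long as they are physically and logically
 * contiguous, and the bio is submitted once it is full or a discontiguity is
 * seen.
 */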
1678
1679 static void scrub_wr_submit(struct scrub_ctx *sctx)
1680 {
1681 struct scrub_bio *sbio;
1682
1683 if (!sctx->wr_curr_bio)
1684 return;
1685
1686 sbio = sctx->wr_curr_bio;
1687 sctx->wr_curr_bio = NULL;
1688 scrub_pending_bio_inc(sctx);
1689
	/*
	 * Process all writes in a single worker thread. Then the block layer
	 * can order the requests before sending them to the driver.
	 */
1693 btrfsic_check_bio(sbio->bio);
1694 submit_bio(sbio->bio);
1695
1696 if (btrfs_is_zoned(sctx->fs_info))
1697 sctx->write_pointer = sbio->physical + sbio->sector_count *
1698 sctx->fs_info->sectorsize;
1699 }
1700
1701 static void scrub_wr_bio_end_io(struct bio *bio)
1702 {
1703 struct scrub_bio *sbio = bio->bi_private;
1704 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1705
1706 sbio->status = bio->bi_status;
1707 sbio->bio = bio;
1708
1709 INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
1710 queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1711 }
1712
1713 static void scrub_wr_bio_end_io_worker(struct work_struct *work)
1714 {
1715 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1716 struct scrub_ctx *sctx = sbio->sctx;
1717 int i;
1718
1719 ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
1720 if (sbio->status) {
1721 struct btrfs_dev_replace *dev_replace =
1722 &sbio->sctx->fs_info->dev_replace;
1723
1724 for (i = 0; i < sbio->sector_count; i++) {
1725 struct scrub_sector *sector = sbio->sectors[i];
1726
1727 sector->io_error = 1;
1728 atomic64_inc(&dev_replace->num_write_errors);
1729 }
1730 }
1731
1732 for (i = 0; i < sbio->sector_count; i++)
1733 scrub_sector_put(sbio->sectors[i]);
1734
1735 bio_put(sbio->bio);
1736 kfree(sbio);
1737 scrub_pending_bio_dec(sctx);
1738 }
1739
1740 static int scrub_checksum(struct scrub_block *sblock)
1741 {
1742 u64 flags;
1743 int ret;
1744
	/*
	 * Reset the error flags and dispatch to the checksum routine matching
	 * the extent type of this block.
	 */
1753 sblock->header_error = 0;
1754 sblock->generation_error = 0;
1755 sblock->checksum_error = 0;
1756
1757 WARN_ON(sblock->sector_count < 1);
1758 flags = sblock->sectors[0]->flags;
1759 ret = 0;
1760 if (flags & BTRFS_EXTENT_FLAG_DATA)
1761 ret = scrub_checksum_data(sblock);
1762 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1763 ret = scrub_checksum_tree_block(sblock);
1764 else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1765 (void)scrub_checksum_super(sblock);
1766 else
1767 WARN_ON(1);
1768 if (ret)
1769 scrub_handle_errored_block(sblock);
1770
1771 return ret;
1772 }
1773
1774 static int scrub_checksum_data(struct scrub_block *sblock)
1775 {
1776 struct scrub_ctx *sctx = sblock->sctx;
1777 struct btrfs_fs_info *fs_info = sctx->fs_info;
1778 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1779 u8 csum[BTRFS_CSUM_SIZE];
1780 struct scrub_sector *sector;
1781 char *kaddr;
1782
1783 BUG_ON(sblock->sector_count < 1);
1784 sector = sblock->sectors[0];
1785 if (!sector->have_csum)
1786 return 0;
1787
1788 kaddr = page_address(sector->page);
1789
1790 shash->tfm = fs_info->csum_shash;
1791 crypto_shash_init(shash);
	/*
	 * Each scrub_sector holds exactly one sector worth of data, so a
	 * single digest call over sectorsize bytes is enough here.
	 */
1797 crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1798
1799 if (memcmp(csum, sector->csum, fs_info->csum_size))
1800 sblock->checksum_error = 1;
1801 return sblock->checksum_error;
1802 }
1803
1804 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1805 {
1806 struct scrub_ctx *sctx = sblock->sctx;
1807 struct btrfs_header *h;
1808 struct btrfs_fs_info *fs_info = sctx->fs_info;
1809 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1810 u8 calculated_csum[BTRFS_CSUM_SIZE];
1811 u8 on_disk_csum[BTRFS_CSUM_SIZE];
	/*
	 * A metadata block spans multiple sectors, each backed by its own
	 * page, so the checksum is fed to the hash in sectorsize steps.
	 */
1817 const u32 sectorsize = sctx->fs_info->sectorsize;
1818 const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1819 int i;
1820 struct scrub_sector *sector;
1821 char *kaddr;
1822
1823 BUG_ON(sblock->sector_count < 1);
1824
1825
1826 ASSERT(sblock->sector_count == num_sectors);
1827
1828 sector = sblock->sectors[0];
1829 kaddr = page_address(sector->page);
1830 h = (struct btrfs_header *)kaddr;
1831 memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
	/*
	 * We don't use the getter functions here, as we
	 * a) don't have an extent buffer and
	 * b) the page is already kmapped.
	 */
1838 if (sector->logical != btrfs_stack_header_bytenr(h))
1839 sblock->header_error = 1;
1840
1841 if (sector->generation != btrfs_stack_header_generation(h)) {
1842 sblock->header_error = 1;
1843 sblock->generation_error = 1;
1844 }
1845
1846 if (!scrub_check_fsid(h->fsid, sector))
1847 sblock->header_error = 1;
1848
1849 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1850 BTRFS_UUID_SIZE))
1851 sblock->header_error = 1;
1852
1853 shash->tfm = fs_info->csum_shash;
1854 crypto_shash_init(shash);
1855 crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1856 sectorsize - BTRFS_CSUM_SIZE);
1857
1858 for (i = 1; i < num_sectors; i++) {
1859 kaddr = page_address(sblock->sectors[i]->page);
1860 crypto_shash_update(shash, kaddr, sectorsize);
1861 }
1862
1863 crypto_shash_final(shash, calculated_csum);
1864 if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1865 sblock->checksum_error = 1;
1866
1867 return sblock->header_error || sblock->checksum_error;
1868 }
1869
1870 static int scrub_checksum_super(struct scrub_block *sblock)
1871 {
1872 struct btrfs_super_block *s;
1873 struct scrub_ctx *sctx = sblock->sctx;
1874 struct btrfs_fs_info *fs_info = sctx->fs_info;
1875 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1876 u8 calculated_csum[BTRFS_CSUM_SIZE];
1877 struct scrub_sector *sector;
1878 char *kaddr;
1879 int fail_gen = 0;
1880 int fail_cor = 0;
1881
1882 BUG_ON(sblock->sector_count < 1);
1883 sector = sblock->sectors[0];
1884 kaddr = page_address(sector->page);
1885 s = (struct btrfs_super_block *)kaddr;
1886
1887 if (sector->logical != btrfs_super_bytenr(s))
1888 ++fail_cor;
1889
1890 if (sector->generation != btrfs_super_generation(s))
1891 ++fail_gen;
1892
1893 if (!scrub_check_fsid(s->fsid, sector))
1894 ++fail_cor;
1895
1896 shash->tfm = fs_info->csum_shash;
1897 crypto_shash_init(shash);
1898 crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1899 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1900
1901 if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1902 ++fail_cor;
1903
1904 if (fail_cor + fail_gen) {
		/*
		 * If we find an error in a super block, we just report it.
		 * It will get written with the next transaction commit
		 * anyway.
		 */
1910 spin_lock(&sctx->stat_lock);
1911 ++sctx->stat.super_errors;
1912 spin_unlock(&sctx->stat_lock);
1913 if (fail_cor)
1914 btrfs_dev_stat_inc_and_print(sector->dev,
1915 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1916 else
1917 btrfs_dev_stat_inc_and_print(sector->dev,
1918 BTRFS_DEV_STAT_GENERATION_ERRS);
1919 }
1920
1921 return fail_cor + fail_gen;
1922 }
1923
1924 static void scrub_block_get(struct scrub_block *sblock)
1925 {
1926 refcount_inc(&sblock->refs);
1927 }
1928
1929 static void scrub_block_put(struct scrub_block *sblock)
1930 {
1931 if (refcount_dec_and_test(&sblock->refs)) {
1932 int i;
1933
1934 if (sblock->sparity)
1935 scrub_parity_put(sblock->sparity);
1936
1937 for (i = 0; i < sblock->sector_count; i++)
1938 scrub_sector_put(sblock->sectors[i]);
1939 kfree(sblock);
1940 }
1941 }
1942
static void scrub_sector_get(struct scrub_sector *sector)
{
	atomic_inc(&sector->refs);
}

static void scrub_sector_put(struct scrub_sector *sector)
{
	if (atomic_dec_and_test(&sector->refs)) {
		if (sector->page)
			__free_page(sector->page);
		kfree(sector);
	}
}
1956
/*
 * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
 * second.  The limit can be set per device via the scrub_speed_max sysfs
 * knob.
 */
1961 static void scrub_throttle(struct scrub_ctx *sctx)
1962 {
1963 const int time_slice = 1000;
1964 struct scrub_bio *sbio;
1965 struct btrfs_device *device;
1966 s64 delta;
1967 ktime_t now;
1968 u32 div;
1969 u64 bwlimit;
1970
1971 sbio = sctx->bios[sctx->curr];
1972 device = sbio->dev;
1973 bwlimit = READ_ONCE(device->scrub_speed_max);
1974 if (bwlimit == 0)
1975 return;
	/*
	 * The slice is divided into intervals when the IO is submitted,
	 * adjusted by bwlimit and capped at 64 intervals.
	 */
1981 div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
1982 div = min_t(u32, 64, div);
1983
1984
1985 now = ktime_get();
1986 if (sctx->throttle_deadline == 0) {
1987 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
1988 sctx->throttle_sent = 0;
1989 }
1990
1991
1992 if (ktime_before(now, sctx->throttle_deadline)) {
1993
1994 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
1995 if (sctx->throttle_sent <= div_u64(bwlimit, div))
1996 return;
1997
1998
1999 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2000 } else {
2001
2002 delta = 0;
2003 }
2004
2005 if (delta) {
2006 long timeout;
2007
2008 timeout = div_u64(delta * HZ, 1000);
2009 schedule_timeout_interruptible(timeout);
2010 }
2011
2012
2013 sctx->throttle_deadline = 0;
2014 }
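
/*
 * Illustration for scrub_throttle() above (hypothetical numbers): with
 * scrub_speed_max set to 100MiB/s the slice is split into
 * div = min(64, max(1, 100MiB / 16MiB)) = 6 intervals of ~166ms, and roughly
 * 100MiB / 6 (about 17MiB) may be submitted per interval before the thread
 * sleeps for the remainder of that interval.
 */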
2015
2016 static void scrub_submit(struct scrub_ctx *sctx)
2017 {
2018 struct scrub_bio *sbio;
2019
2020 if (sctx->curr == -1)
2021 return;
2022
2023 scrub_throttle(sctx);
2024
2025 sbio = sctx->bios[sctx->curr];
2026 sctx->curr = -1;
2027 scrub_pending_bio_inc(sctx);
2028 btrfsic_check_bio(sbio->bio);
2029 submit_bio(sbio->bio);
2030 }
2031
2032 static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
2033 struct scrub_sector *sector)
2034 {
2035 struct scrub_block *sblock = sector->sblock;
2036 struct scrub_bio *sbio;
2037 const u32 sectorsize = sctx->fs_info->sectorsize;
2038 int ret;
2039
2040 again:
	/*
	 * Grab a fresh bio or wait for one to become available.
	 */
2044 while (sctx->curr == -1) {
2045 spin_lock(&sctx->list_lock);
2046 sctx->curr = sctx->first_free;
2047 if (sctx->curr != -1) {
2048 sctx->first_free = sctx->bios[sctx->curr]->next_free;
2049 sctx->bios[sctx->curr]->next_free = -1;
2050 sctx->bios[sctx->curr]->sector_count = 0;
2051 spin_unlock(&sctx->list_lock);
2052 } else {
2053 spin_unlock(&sctx->list_lock);
2054 wait_event(sctx->list_wait, sctx->first_free != -1);
2055 }
2056 }
2057 sbio = sctx->bios[sctx->curr];
2058 if (sbio->sector_count == 0) {
2059 sbio->physical = sector->physical;
2060 sbio->logical = sector->logical;
2061 sbio->dev = sector->dev;
2062 if (!sbio->bio) {
2063 sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
2064 REQ_OP_READ, GFP_NOFS);
2065 }
2066 sbio->bio->bi_private = sbio;
2067 sbio->bio->bi_end_io = scrub_bio_end_io;
2068 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
2069 sbio->status = 0;
2070 } else if (sbio->physical + sbio->sector_count * sectorsize !=
2071 sector->physical ||
2072 sbio->logical + sbio->sector_count * sectorsize !=
2073 sector->logical ||
2074 sbio->dev != sector->dev) {
2075 scrub_submit(sctx);
2076 goto again;
2077 }
2078
2079 sbio->sectors[sbio->sector_count] = sector;
2080 ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
2081 if (ret != sectorsize) {
2082 if (sbio->sector_count < 1) {
2083 bio_put(sbio->bio);
2084 sbio->bio = NULL;
2085 return -EIO;
2086 }
2087 scrub_submit(sctx);
2088 goto again;
2089 }
2090
2091 scrub_block_get(sblock);
2092 atomic_inc(&sblock->outstanding_sectors);
2093 sbio->sector_count++;
2094 if (sbio->sector_count == sctx->sectors_per_bio)
2095 scrub_submit(sctx);
2096
2097 return 0;
2098 }
2099
2100 static void scrub_missing_raid56_end_io(struct bio *bio)
2101 {
2102 struct scrub_block *sblock = bio->bi_private;
2103 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2104
2105 if (bio->bi_status)
2106 sblock->no_io_error_seen = 0;
2107
2108 bio_put(bio);
2109
2110 queue_work(fs_info->scrub_workers, &sblock->work);
2111 }
2112
2113 static void scrub_missing_raid56_worker(struct work_struct *work)
2114 {
2115 struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2116 struct scrub_ctx *sctx = sblock->sctx;
2117 struct btrfs_fs_info *fs_info = sctx->fs_info;
2118 u64 logical;
2119 struct btrfs_device *dev;
2120
2121 logical = sblock->sectors[0]->logical;
2122 dev = sblock->sectors[0]->dev;
2123
2124 if (sblock->no_io_error_seen)
2125 scrub_recheck_block_checksum(sblock);
2126
2127 if (!sblock->no_io_error_seen) {
2128 spin_lock(&sctx->stat_lock);
2129 sctx->stat.read_errors++;
2130 spin_unlock(&sctx->stat_lock);
2131 btrfs_err_rl_in_rcu(fs_info,
2132 "IO error rebuilding logical %llu for dev %s",
2133 logical, rcu_str_deref(dev->name));
2134 } else if (sblock->header_error || sblock->checksum_error) {
2135 spin_lock(&sctx->stat_lock);
2136 sctx->stat.uncorrectable_errors++;
2137 spin_unlock(&sctx->stat_lock);
2138 btrfs_err_rl_in_rcu(fs_info,
2139 "failed to rebuild valid logical %llu for dev %s",
2140 logical, rcu_str_deref(dev->name));
2141 } else {
2142 scrub_write_block_to_dev_replace(sblock);
2143 }
2144
2145 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2146 mutex_lock(&sctx->wr_lock);
2147 scrub_wr_submit(sctx);
2148 mutex_unlock(&sctx->wr_lock);
2149 }
2150
2151 scrub_block_put(sblock);
2152 scrub_pending_bio_dec(sctx);
2153 }
2154
2155 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2156 {
2157 struct scrub_ctx *sctx = sblock->sctx;
2158 struct btrfs_fs_info *fs_info = sctx->fs_info;
2159 u64 length = sblock->sector_count << fs_info->sectorsize_bits;
2160 u64 logical = sblock->sectors[0]->logical;
2161 struct btrfs_io_context *bioc = NULL;
2162 struct bio *bio;
2163 struct btrfs_raid_bio *rbio;
2164 int ret;
2165 int i;
2166
2167 btrfs_bio_counter_inc_blocked(fs_info);
2168 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2169 &length, &bioc);
2170 if (ret || !bioc || !bioc->raid_map)
2171 goto bioc_out;
2172
2173 if (WARN_ON(!sctx->is_dev_replace ||
2174 !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
		/*
		 * We shouldn't be scrubbing a missing device. Even for dev
		 * replace, we should only get here for RAID 5/6. We either
		 * managed to mount something with no mirrors remaining or
		 * there's a bug in scrub_find_good_copy()/btrfs_map_block().
		 */
2181 goto bioc_out;
2182 }
2183
2184 bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2185 bio->bi_iter.bi_sector = logical >> 9;
2186 bio->bi_private = sblock;
2187 bio->bi_end_io = scrub_missing_raid56_end_io;
2188
2189 rbio = raid56_alloc_missing_rbio(bio, bioc);
2190 if (!rbio)
2191 goto rbio_out;
2192
2193 for (i = 0; i < sblock->sector_count; i++) {
2194 struct scrub_sector *sector = sblock->sectors[i];
2195
		/*
		 * For now, our scrub is still one page per sector, so pgoff
		 * is always 0.
		 */
2200 raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
2201 }
2202
2203 INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
2204 scrub_block_get(sblock);
2205 scrub_pending_bio_inc(sctx);
2206 raid56_submit_missing_rbio(rbio);
2207 return;
2208
2209 rbio_out:
2210 bio_put(bio);
2211 bioc_out:
2212 btrfs_bio_counter_dec(fs_info);
2213 btrfs_put_bioc(bioc);
2214 spin_lock(&sctx->stat_lock);
2215 sctx->stat.malloc_errors++;
2216 spin_unlock(&sctx->stat_lock);
2217 }
2218
2219 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
2220 u64 physical, struct btrfs_device *dev, u64 flags,
2221 u64 gen, int mirror_num, u8 *csum,
2222 u64 physical_for_dev_replace)
2223 {
2224 struct scrub_block *sblock;
2225 const u32 sectorsize = sctx->fs_info->sectorsize;
2226 int index;
2227
2228 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2229 if (!sblock) {
2230 spin_lock(&sctx->stat_lock);
2231 sctx->stat.malloc_errors++;
2232 spin_unlock(&sctx->stat_lock);
2233 return -ENOMEM;
2234 }
	/*
	 * One ref inside this function, plus one for each sector added to
	 * a bio later on.
	 */
2238 refcount_set(&sblock->refs, 1);
2239 sblock->sctx = sctx;
2240 sblock->no_io_error_seen = 1;
2241
2242 for (index = 0; len > 0; index++) {
2243 struct scrub_sector *sector;
		/*
		 * Here we will allocate one page for one sector to scrub.
		 * This is fine if PAGE_SIZE == sectorsize, but will cost
		 * more memory for the PAGE_SIZE > sectorsize case.
		 */
2249 u32 l = min(sectorsize, len);
2250
2251 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2252 if (!sector) {
2253 leave_nomem:
2254 spin_lock(&sctx->stat_lock);
2255 sctx->stat.malloc_errors++;
2256 spin_unlock(&sctx->stat_lock);
2257 scrub_block_put(sblock);
2258 return -ENOMEM;
2259 }
2260 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2261 scrub_sector_get(sector);
2262 sblock->sectors[index] = sector;
2263 sector->sblock = sblock;
2264 sector->dev = dev;
2265 sector->flags = flags;
2266 sector->generation = gen;
2267 sector->logical = logical;
2268 sector->physical = physical;
2269 sector->physical_for_dev_replace = physical_for_dev_replace;
2270 sector->mirror_num = mirror_num;
2271 if (csum) {
2272 sector->have_csum = 1;
2273 memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2274 } else {
2275 sector->have_csum = 0;
2276 }
2277 sblock->sector_count++;
2278 sector->page = alloc_page(GFP_KERNEL);
2279 if (!sector->page)
2280 goto leave_nomem;
2281 len -= l;
2282 logical += l;
2283 physical += l;
2284 physical_for_dev_replace += l;
2285 }
2286
2287 WARN_ON(sblock->sector_count == 0);
2288 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2289
2290
2291
2292
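		/*
		 * The device is missing, so the sectors cannot be read from it
		 * directly; schedule a rebuild from the other RAID56 stripes
		 * instead.
		 */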
2293 scrub_missing_raid56_pages(sblock);
2294 } else {
2295 for (index = 0; index < sblock->sector_count; index++) {
2296 struct scrub_sector *sector = sblock->sectors[index];
2297 int ret;
2298
2299 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2300 if (ret) {
2301 scrub_block_put(sblock);
2302 return ret;
2303 }
2304 }
2305
2306 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2307 scrub_submit(sctx);
2308 }
2309
2310
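	/* Last one frees, either here or in the bio completion for the last sector. */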
2311 scrub_block_put(sblock);
2312 return 0;
2313 }
2314
2315 static void scrub_bio_end_io(struct bio *bio)
2316 {
2317 struct scrub_bio *sbio = bio->bi_private;
2318 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2319
2320 sbio->status = bio->bi_status;
2321 sbio->bio = bio;
2322
2323 queue_work(fs_info->scrub_workers, &sbio->work);
2324 }
2325
2326 static void scrub_bio_end_io_worker(struct work_struct *work)
2327 {
2328 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2329 struct scrub_ctx *sctx = sbio->sctx;
2330 int i;
2331
2332 ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
2333 if (sbio->status) {
2334 for (i = 0; i < sbio->sector_count; i++) {
2335 struct scrub_sector *sector = sbio->sectors[i];
2336
2337 sector->io_error = 1;
2338 sector->sblock->no_io_error_seen = 0;
2339 }
2340 }
2341
2342
2343 for (i = 0; i < sbio->sector_count; i++) {
2344 struct scrub_sector *sector = sbio->sectors[i];
2345 struct scrub_block *sblock = sector->sblock;
2346
2347 if (atomic_dec_and_test(&sblock->outstanding_sectors))
2348 scrub_block_complete(sblock);
2349 scrub_block_put(sblock);
2350 }
2351
2352 bio_put(sbio->bio);
2353 sbio->bio = NULL;
2354 spin_lock(&sctx->list_lock);
2355 sbio->next_free = sctx->first_free;
2356 sctx->first_free = sbio->index;
2357 spin_unlock(&sctx->list_lock);
2358
2359 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2360 mutex_lock(&sctx->wr_lock);
2361 scrub_wr_submit(sctx);
2362 mutex_unlock(&sctx->wr_lock);
2363 }
2364
2365 scrub_pending_bio_dec(sctx);
2366 }
2367
2368 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2369 unsigned long *bitmap,
2370 u64 start, u32 len)
2371 {
2372 u64 offset;
2373 u32 nsectors;
2374 u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2375
2376 if (len >= sparity->stripe_len) {
2377 bitmap_set(bitmap, 0, sparity->nsectors);
2378 return;
2379 }
2380
2381 start -= sparity->logic_start;
2382 start = div64_u64_rem(start, sparity->stripe_len, &offset);
2383 offset = offset >> sectorsize_bits;
2384 nsectors = len >> sectorsize_bits;
2385
2386 if (offset + nsectors <= sparity->nsectors) {
2387 bitmap_set(bitmap, offset, nsectors);
2388 return;
2389 }
2390
2391 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2392 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2393 }
2394
2395 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2396 u64 start, u32 len)
2397 {
2398 __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
2399 }
2400
2401 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2402 u64 start, u32 len)
2403 {
2404 __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
2405 }
2406
2407 static void scrub_block_complete(struct scrub_block *sblock)
2408 {
2409 int corrupted = 0;
2410
2411 if (!sblock->no_io_error_seen) {
2412 corrupted = 1;
2413 scrub_handle_errored_block(sblock);
2414 } else {
2415
2416
2417
2418
2419
2420 corrupted = scrub_checksum(sblock);
2421 if (!corrupted && sblock->sctx->is_dev_replace)
2422 scrub_write_block_to_dev_replace(sblock);
2423 }
2424
2425 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2426 u64 start = sblock->sectors[0]->logical;
2427 u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
2428 sblock->sctx->fs_info->sectorsize;
2429
2430 ASSERT(end - start <= U32_MAX);
2431 scrub_parity_mark_sectors_error(sblock->sparity,
2432 start, end - start);
2433 }
2434 }
2435
2436 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2437 {
2438 sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2439 list_del(&sum->list);
2440 kfree(sum);
2441 }
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
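/*
 * Find the checksum for the sector at @logical and copy it into @csum.
 *
 * The search source is sctx->csum_list, a list of csum ranges ordered by
 * bytenr.  Ranges that end before @logical are dropped on the way, and a
 * range is dropped once its last checksum has been consumed.
 *
 * Return 0 if there is no csum for the range, 1 if a csum was found and
 * copied into @csum.
 */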
2454 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2455 {
2456 bool found = false;
2457
2458 while (!list_empty(&sctx->csum_list)) {
2459 struct btrfs_ordered_sum *sum = NULL;
2460 unsigned long index;
2461 unsigned long num_sectors;
2462
2463 sum = list_first_entry(&sctx->csum_list,
2464 struct btrfs_ordered_sum, list);
2465
2466 if (sum->bytenr > logical)
2467 break;
2468
2469
2470
2471
2472
2473
2474
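		/*
		 * The current csum range ends before @logical and will never
		 * be needed again, drop it.
		 */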
2475 if (sum->bytenr + sum->len <= logical) {
2476 drop_csum_range(sctx, sum);
2477 continue;
2478 }
2479
2480
2481 found = true;
2482 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2483 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2484
2485 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2486 sctx->fs_info->csum_size);
2487
2488
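		/* We just consumed the last csum of this range, drop it. */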
2489 if (index == num_sectors - 1)
2490 drop_csum_range(sctx, sum);
2491 break;
2492 }
2493 if (!found)
2494 return 0;
2495 return 1;
2496 }
2497
2498
2499 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2500 u64 logical, u32 len,
2501 u64 physical, struct btrfs_device *dev, u64 flags,
2502 u64 gen, int mirror_num)
2503 {
2504 struct btrfs_device *src_dev = dev;
2505 u64 src_physical = physical;
2506 int src_mirror = mirror_num;
2507 int ret;
2508 u8 csum[BTRFS_CSUM_SIZE];
2509 u32 blocksize;
2510
2511 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2512 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2513 blocksize = map->stripe_len;
2514 else
2515 blocksize = sctx->fs_info->sectorsize;
2516 spin_lock(&sctx->stat_lock);
2517 sctx->stat.data_extents_scrubbed++;
2518 sctx->stat.data_bytes_scrubbed += len;
2519 spin_unlock(&sctx->stat_lock);
2520 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2521 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2522 blocksize = map->stripe_len;
2523 else
2524 blocksize = sctx->fs_info->nodesize;
2525 spin_lock(&sctx->stat_lock);
2526 sctx->stat.tree_extents_scrubbed++;
2527 sctx->stat.tree_bytes_scrubbed += len;
2528 spin_unlock(&sctx->stat_lock);
2529 } else {
2530 blocksize = sctx->fs_info->sectorsize;
2531 WARN_ON(1);
2532 }
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
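	/*
	 * For dev-replace the source device may be missing (no bdev).  In that
	 * case read from a good mirror instead, while still writing to the
	 * original physical offset on the replacement target.
	 */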
2543 if (sctx->is_dev_replace && !dev->bdev)
2544 scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
2545 &src_dev, &src_mirror);
2546 while (len) {
2547 u32 l = min(len, blocksize);
2548 int have_csum = 0;
2549
2550 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2551
2552 have_csum = scrub_find_csum(sctx, logical, csum);
2553 if (have_csum == 0)
2554 ++sctx->stat.no_csum;
2555 }
2556 ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
2557 flags, gen, src_mirror,
2558 have_csum ? csum : NULL, physical);
2559 if (ret)
2560 return ret;
2561 len -= l;
2562 logical += l;
2563 physical += l;
2564 src_physical += l;
2565 }
2566 return 0;
2567 }
2568
2569 static int scrub_sectors_for_parity(struct scrub_parity *sparity,
2570 u64 logical, u32 len,
2571 u64 physical, struct btrfs_device *dev,
2572 u64 flags, u64 gen, int mirror_num, u8 *csum)
2573 {
2574 struct scrub_ctx *sctx = sparity->sctx;
2575 struct scrub_block *sblock;
2576 const u32 sectorsize = sctx->fs_info->sectorsize;
2577 int index;
2578
2579 ASSERT(IS_ALIGNED(len, sectorsize));
2580
2581 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2582 if (!sblock) {
2583 spin_lock(&sctx->stat_lock);
2584 sctx->stat.malloc_errors++;
2585 spin_unlock(&sctx->stat_lock);
2586 return -ENOMEM;
2587 }
2588
2589
2590
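	/* One ref inside this function, plus one for each sector added to a read bio later on. */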
2591 refcount_set(&sblock->refs, 1);
2592 sblock->sctx = sctx;
2593 sblock->no_io_error_seen = 1;
2594 sblock->sparity = sparity;
2595 scrub_parity_get(sparity);
2596
2597 for (index = 0; len > 0; index++) {
2598 struct scrub_sector *sector;
2599
2600 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2601 if (!sector) {
2602 leave_nomem:
2603 spin_lock(&sctx->stat_lock);
2604 sctx->stat.malloc_errors++;
2605 spin_unlock(&sctx->stat_lock);
2606 scrub_block_put(sblock);
2607 return -ENOMEM;
2608 }
2609 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2610
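		/* One ref for sblock->sectors[]. */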
2611 scrub_sector_get(sector);
2612 sblock->sectors[index] = sector;
2613
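		/* One extra ref for the sparity->sectors_list. */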
2614 scrub_sector_get(sector);
2615 list_add_tail(&sector->list, &sparity->sectors_list);
2616 sector->sblock = sblock;
2617 sector->dev = dev;
2618 sector->flags = flags;
2619 sector->generation = gen;
2620 sector->logical = logical;
2621 sector->physical = physical;
2622 sector->mirror_num = mirror_num;
2623 if (csum) {
2624 sector->have_csum = 1;
2625 memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2626 } else {
2627 sector->have_csum = 0;
2628 }
2629 sblock->sector_count++;
2630 sector->page = alloc_page(GFP_KERNEL);
2631 if (!sector->page)
2632 goto leave_nomem;
2633
2634
2635
2636 len -= sectorsize;
2637 logical += sectorsize;
2638 physical += sectorsize;
2639 }
2640
2641 WARN_ON(sblock->sector_count == 0);
2642 for (index = 0; index < sblock->sector_count; index++) {
2643 struct scrub_sector *sector = sblock->sectors[index];
2644 int ret;
2645
2646 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2647 if (ret) {
2648 scrub_block_put(sblock);
2649 return ret;
2650 }
2651 }
2652
2653
2654 scrub_block_put(sblock);
2655 return 0;
2656 }
2657
2658 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2659 u64 logical, u32 len,
2660 u64 physical, struct btrfs_device *dev,
2661 u64 flags, u64 gen, int mirror_num)
2662 {
2663 struct scrub_ctx *sctx = sparity->sctx;
2664 int ret;
2665 u8 csum[BTRFS_CSUM_SIZE];
2666 u32 blocksize;
2667
2668 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2669 scrub_parity_mark_sectors_error(sparity, logical, len);
2670 return 0;
2671 }
2672
2673 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2674 blocksize = sparity->stripe_len;
2675 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2676 blocksize = sparity->stripe_len;
2677 } else {
2678 blocksize = sctx->fs_info->sectorsize;
2679 WARN_ON(1);
2680 }
2681
2682 while (len) {
2683 u32 l = min(len, blocksize);
2684 int have_csum = 0;
2685
2686 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2687
2688 have_csum = scrub_find_csum(sctx, logical, csum);
2689 if (have_csum == 0)
2690 goto skip;
2691 }
2692 ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
2693 flags, gen, mirror_num,
2694 have_csum ? csum : NULL);
2695 if (ret)
2696 return ret;
2697 skip:
2698 len -= l;
2699 logical += l;
2700 physical += l;
2701 }
2702 return 0;
2703 }
2704
2705
2706
2707
2708
2709
2710
2711
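/*
 * Given a physical address on the @num-th stripe, calculate the matching
 * logical offset inside the chunk.  If the physical address falls on a parity
 * stripe, the logical offset of the left-most data stripe of that full stripe
 * is returned instead.
 *
 * Return 0 for a data stripe, 1 for a parity stripe.
 */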
2712 static int get_raid56_logic_offset(u64 physical, int num,
2713 struct map_lookup *map, u64 *offset,
2714 u64 *stripe_start)
2715 {
2716 int i;
2717 int j = 0;
2718 u64 stripe_nr;
2719 u64 last_offset;
2720 u32 stripe_index;
2721 u32 rot;
2722 const int data_stripes = nr_data_stripes(map);
2723
2724 last_offset = (physical - map->stripes[num].physical) * data_stripes;
2725 if (stripe_start)
2726 *stripe_start = last_offset;
2727
2728 *offset = last_offset;
2729 for (i = 0; i < data_stripes; i++) {
2730 *offset = last_offset + i * map->stripe_len;
2731
2732 stripe_nr = div64_u64(*offset, map->stripe_len);
2733 stripe_nr = div_u64(stripe_nr, data_stripes);
2734
2735
2736 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2737
2738 rot += i;
2739 stripe_index = rot % map->num_stripes;
2740 if (stripe_index == num)
2741 return 0;
2742 if (stripe_index < num)
2743 j++;
2744 }
2745 *offset = last_offset + j * map->stripe_len;
2746 return 1;
2747 }
2748
2749 static void scrub_free_parity(struct scrub_parity *sparity)
2750 {
2751 struct scrub_ctx *sctx = sparity->sctx;
2752 struct scrub_sector *curr, *next;
2753 int nbits;
2754
2755 nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
2756 if (nbits) {
2757 spin_lock(&sctx->stat_lock);
2758 sctx->stat.read_errors += nbits;
2759 sctx->stat.uncorrectable_errors += nbits;
2760 spin_unlock(&sctx->stat_lock);
2761 }
2762
2763 list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
2764 list_del_init(&curr->list);
2765 scrub_sector_put(curr);
2766 }
2767
2768 kfree(sparity);
2769 }
2770
2771 static void scrub_parity_bio_endio_worker(struct work_struct *work)
2772 {
2773 struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2774 work);
2775 struct scrub_ctx *sctx = sparity->sctx;
2776
2777 scrub_free_parity(sparity);
2778 scrub_pending_bio_dec(sctx);
2779 }
2780
2781 static void scrub_parity_bio_endio(struct bio *bio)
2782 {
2783 struct scrub_parity *sparity = bio->bi_private;
2784 struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2785
2786 if (bio->bi_status)
2787 bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
2788 &sparity->dbitmap, sparity->nsectors);
2789
2790 bio_put(bio);
2791
2792 INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
2793 queue_work(fs_info->scrub_parity_workers, &sparity->work);
2794 }
2795
2796 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2797 {
2798 struct scrub_ctx *sctx = sparity->sctx;
2799 struct btrfs_fs_info *fs_info = sctx->fs_info;
2800 struct bio *bio;
2801 struct btrfs_raid_bio *rbio;
2802 struct btrfs_io_context *bioc = NULL;
2803 u64 length;
2804 int ret;
2805
2806 if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
2807 &sparity->ebitmap, sparity->nsectors))
2808 goto out;
2809
2810 length = sparity->logic_end - sparity->logic_start;
2811
2812 btrfs_bio_counter_inc_blocked(fs_info);
2813 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2814 &length, &bioc);
2815 if (ret || !bioc || !bioc->raid_map)
2816 goto bioc_out;
2817
2818 bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2819 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2820 bio->bi_private = sparity;
2821 bio->bi_end_io = scrub_parity_bio_endio;
2822
2823 rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
2824 sparity->scrub_dev,
2825 &sparity->dbitmap,
2826 sparity->nsectors);
2827 if (!rbio)
2828 goto rbio_out;
2829
2830 scrub_pending_bio_inc(sctx);
2831 raid56_parity_submit_scrub_rbio(rbio);
2832 return;
2833
2834 rbio_out:
2835 bio_put(bio);
2836 bioc_out:
2837 btrfs_bio_counter_dec(fs_info);
2838 btrfs_put_bioc(bioc);
2839 bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
2840 sparity->nsectors);
2841 spin_lock(&sctx->stat_lock);
2842 sctx->stat.malloc_errors++;
2843 spin_unlock(&sctx->stat_lock);
2844 out:
2845 scrub_free_parity(sparity);
2846 }
2847
2848 static void scrub_parity_get(struct scrub_parity *sparity)
2849 {
2850 refcount_inc(&sparity->refs);
2851 }
2852
2853 static void scrub_parity_put(struct scrub_parity *sparity)
2854 {
2855 if (!refcount_dec_and_test(&sparity->refs))
2856 return;
2857
2858 scrub_parity_check_and_repair(sparity);
2859 }
2860
2861
2862
2863
2864
2865
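/*
 * Compare the extent item at @path against [@search_start, @search_start + @search_len).
 *
 * Return 0 if the extent item covers any byte of the range, <0 if it ends
 * before the range, and >0 if it starts after the range.
 */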
2866 static int compare_extent_item_range(struct btrfs_path *path,
2867 u64 search_start, u64 search_len)
2868 {
2869 struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
2870 u64 len;
2871 struct btrfs_key key;
2872
2873 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2874 ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
2875 key.type == BTRFS_METADATA_ITEM_KEY);
2876 if (key.type == BTRFS_METADATA_ITEM_KEY)
2877 len = fs_info->nodesize;
2878 else
2879 len = key.offset;
2880
2881 if (key.objectid + len <= search_start)
2882 return -1;
2883 if (key.objectid >= search_start + search_len)
2884 return 1;
2885 return 0;
2886 }
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
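/*
 * Locate the first extent item covering any byte of the range
 * [@search_start, @search_start + @search_len).
 *
 * If @path is not initialized, start a new search; otherwise continue from
 * the current slot to avoid repeated btrfs_search_slot() calls.  An extent
 * item starting before @search_start is still returned, to handle data
 * extents crossing stripe boundaries.
 *
 * Return 0 if such an extent item is found (with @path pointing to it),
 * >0 if none exists, <0 on error.  In the latter two cases @path is released.
 */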
2904 static int find_first_extent_item(struct btrfs_root *extent_root,
2905 struct btrfs_path *path,
2906 u64 search_start, u64 search_len)
2907 {
2908 struct btrfs_fs_info *fs_info = extent_root->fs_info;
2909 struct btrfs_key key;
2910 int ret;
2911
2912
2913 if (path->nodes[0])
2914 goto search_forward;
2915
2916 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2917 key.type = BTRFS_METADATA_ITEM_KEY;
2918 else
2919 key.type = BTRFS_EXTENT_ITEM_KEY;
2920 key.objectid = search_start;
2921 key.offset = (u64)-1;
2922
2923 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2924 if (ret < 0)
2925 return ret;
2926
2927 ASSERT(ret > 0);
2928
2929
2930
2931
2932 ret = btrfs_previous_extent_item(extent_root, path, 0);
2933 if (ret < 0)
2934 return ret;
2935
2936
2937
2938
2939 search_forward:
2940 while (true) {
2941 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2942 if (key.objectid >= search_start + search_len)
2943 break;
2944 if (key.type != BTRFS_METADATA_ITEM_KEY &&
2945 key.type != BTRFS_EXTENT_ITEM_KEY)
2946 goto next;
2947
2948 ret = compare_extent_item_range(path, search_start, search_len);
2949 if (ret == 0)
2950 return ret;
2951 if (ret > 0)
2952 break;
2953 next:
2954 path->slots[0]++;
2955 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2956 ret = btrfs_next_leaf(extent_root, path);
2957 if (ret) {
2958
2959 btrfs_release_path(path);
2960 return ret;
2961 }
2962 }
2963 }
2964 btrfs_release_path(path);
2965 return 1;
2966 }
2967
2968 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
2969 u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
2970 {
2971 struct btrfs_key key;
2972 struct btrfs_extent_item *ei;
2973
2974 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2975 ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
2976 key.type == BTRFS_EXTENT_ITEM_KEY);
2977 *extent_start_ret = key.objectid;
2978 if (key.type == BTRFS_METADATA_ITEM_KEY)
2979 *size_ret = path->nodes[0]->fs_info->nodesize;
2980 else
2981 *size_ret = key.offset;
2982 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
2983 *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
2984 *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
2985 }
2986
2987 static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
2988 u64 boundary_start, u64 boundary_len)
2989 {
2990 return (extent_start < boundary_start &&
2991 extent_start + extent_len > boundary_start) ||
2992 (extent_start < boundary_start + boundary_len &&
2993 extent_start + extent_len > boundary_start + boundary_len);
2994 }
2995
2996 static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
2997 struct scrub_parity *sparity,
2998 struct map_lookup *map,
2999 struct btrfs_device *sdev,
3000 struct btrfs_path *path,
3001 u64 logical)
3002 {
3003 struct btrfs_fs_info *fs_info = sctx->fs_info;
3004 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
3005 struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
3006 u64 cur_logical = logical;
3007 int ret;
3008
3009 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3010
3011
3012 ASSERT(!path->nodes[0]);
3013
3014 while (cur_logical < logical + map->stripe_len) {
3015 struct btrfs_io_context *bioc = NULL;
3016 struct btrfs_device *extent_dev;
3017 u64 extent_start;
3018 u64 extent_size;
3019 u64 mapped_length;
3020 u64 extent_flags;
3021 u64 extent_gen;
3022 u64 extent_physical;
3023 u64 extent_mirror_num;
3024
3025 ret = find_first_extent_item(extent_root, path, cur_logical,
3026 logical + map->stripe_len - cur_logical);
3027
3028 if (ret > 0) {
3029 ret = 0;
3030 break;
3031 }
3032 if (ret < 0)
3033 break;
3034 get_extent_info(path, &extent_start, &extent_size, &extent_flags,
3035 &extent_gen);
3036
3037
3038 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3039 does_range_cross_boundary(extent_start, extent_size,
3040 logical, map->stripe_len)) {
3041 btrfs_err(fs_info,
3042 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3043 extent_start, logical);
3044 spin_lock(&sctx->stat_lock);
3045 sctx->stat.uncorrectable_errors++;
3046 spin_unlock(&sctx->stat_lock);
3047 cur_logical += extent_size;
3048 continue;
3049 }
3050
3051
3052 cur_logical = max(extent_start, cur_logical);
3053
3054
3055 extent_size = min(extent_start + extent_size,
3056 logical + map->stripe_len) - cur_logical;
3057 extent_start = cur_logical;
3058 ASSERT(extent_size <= U32_MAX);
3059
3060 scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
3061
3062 mapped_length = extent_size;
3063 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
3064 &mapped_length, &bioc, 0);
3065 if (!ret && (!bioc || mapped_length < extent_size))
3066 ret = -EIO;
3067 if (ret) {
3068 btrfs_put_bioc(bioc);
3069 scrub_parity_mark_sectors_error(sparity, extent_start,
3070 extent_size);
3071 break;
3072 }
3073 extent_physical = bioc->stripes[0].physical;
3074 extent_mirror_num = bioc->mirror_num;
3075 extent_dev = bioc->stripes[0].dev;
3076 btrfs_put_bioc(bioc);
3077
3078 ret = btrfs_lookup_csums_range(csum_root, extent_start,
3079 extent_start + extent_size - 1,
3080 &sctx->csum_list, 1);
3081 if (ret) {
3082 scrub_parity_mark_sectors_error(sparity, extent_start,
3083 extent_size);
3084 break;
3085 }
3086
3087 ret = scrub_extent_for_parity(sparity, extent_start,
3088 extent_size, extent_physical,
3089 extent_dev, extent_flags,
3090 extent_gen, extent_mirror_num);
3091 scrub_free_csums(sctx);
3092
3093 if (ret) {
3094 scrub_parity_mark_sectors_error(sparity, extent_start,
3095 extent_size);
3096 break;
3097 }
3098
3099 cond_resched();
3100 cur_logical += extent_size;
3101 }
3102 btrfs_release_path(path);
3103 return ret;
3104 }
3105
3106 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3107 struct map_lookup *map,
3108 struct btrfs_device *sdev,
3109 u64 logic_start,
3110 u64 logic_end)
3111 {
3112 struct btrfs_fs_info *fs_info = sctx->fs_info;
3113 struct btrfs_path *path;
3114 u64 cur_logical;
3115 int ret;
3116 struct scrub_parity *sparity;
3117 int nsectors;
3118
3119 path = btrfs_alloc_path();
3120 if (!path) {
3121 spin_lock(&sctx->stat_lock);
3122 sctx->stat.malloc_errors++;
3123 spin_unlock(&sctx->stat_lock);
3124 return -ENOMEM;
3125 }
3126 path->search_commit_root = 1;
3127 path->skip_locking = 1;
3128
3129 ASSERT(map->stripe_len <= U32_MAX);
3130 nsectors = map->stripe_len >> fs_info->sectorsize_bits;
3131 ASSERT(nsectors <= BITS_PER_LONG);
3132 sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
3133 if (!sparity) {
3134 spin_lock(&sctx->stat_lock);
3135 sctx->stat.malloc_errors++;
3136 spin_unlock(&sctx->stat_lock);
3137 btrfs_free_path(path);
3138 return -ENOMEM;
3139 }
3140
3141 ASSERT(map->stripe_len <= U32_MAX);
3142 sparity->stripe_len = map->stripe_len;
3143 sparity->nsectors = nsectors;
3144 sparity->sctx = sctx;
3145 sparity->scrub_dev = sdev;
3146 sparity->logic_start = logic_start;
3147 sparity->logic_end = logic_end;
3148 refcount_set(&sparity->refs, 1);
3149 INIT_LIST_HEAD(&sparity->sectors_list);
3150
3151 ret = 0;
3152 for (cur_logical = logic_start; cur_logical < logic_end;
3153 cur_logical += map->stripe_len) {
3154 ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
3155 sdev, path, cur_logical);
3156 if (ret < 0)
3157 break;
3158 }
3159
3160 scrub_parity_put(sparity);
3161 scrub_submit(sctx);
3162 mutex_lock(&sctx->wr_lock);
3163 scrub_wr_submit(sctx);
3164 mutex_unlock(&sctx->wr_lock);
3165
3166 btrfs_free_path(path);
3167 return ret < 0 ? ret : 0;
3168 }
3169
3170 static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3171 {
3172 if (!btrfs_is_zoned(sctx->fs_info))
3173 return;
3174
3175 sctx->flush_all_writes = true;
3176 scrub_submit(sctx);
3177 mutex_lock(&sctx->wr_lock);
3178 scrub_wr_submit(sctx);
3179 mutex_unlock(&sctx->wr_lock);
3180
3181 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3182 }
3183
3184 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3185 u64 physical, u64 physical_end)
3186 {
3187 struct btrfs_fs_info *fs_info = sctx->fs_info;
3188 int ret = 0;
3189
3190 if (!btrfs_is_zoned(fs_info))
3191 return 0;
3192
3193 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3194
3195 mutex_lock(&sctx->wr_lock);
3196 if (sctx->write_pointer < physical_end) {
3197 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3198 physical,
3199 sctx->write_pointer);
3200 if (ret)
3201 btrfs_err(fs_info,
3202 "zoned: failed to recover write pointer");
3203 }
3204 mutex_unlock(&sctx->wr_lock);
3205 btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3206
3207 return ret;
3208 }
3209
3210
3211
3212
3213
3214
3215
3216
3217
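/*
 * Scrub one range of a simple-mirror based profile (SINGLE/DUP/RAID1/RAID1C*,
 * or a single stripe of RAID0/RAID10), where logical and physical offsets
 * advance in lockstep and repair is purely mirror-number based.
 */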
3218 static int scrub_simple_mirror(struct scrub_ctx *sctx,
3219 struct btrfs_root *extent_root,
3220 struct btrfs_root *csum_root,
3221 struct btrfs_block_group *bg,
3222 struct map_lookup *map,
3223 u64 logical_start, u64 logical_length,
3224 struct btrfs_device *device,
3225 u64 physical, int mirror_num)
3226 {
3227 struct btrfs_fs_info *fs_info = sctx->fs_info;
3228 const u64 logical_end = logical_start + logical_length;
3229
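	/* Cap on how much is scrubbed per loop iteration. */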
3230 const u32 max_length = SZ_64K;
3231 struct btrfs_path path = { 0 };
3232 u64 cur_logical = logical_start;
3233 int ret;
3234
3235
3236 ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
3237
3238 path.search_commit_root = 1;
3239 path.skip_locking = 1;
3240
3241 while (cur_logical < logical_end) {
3242 u64 extent_start;
3243 u64 extent_len;
3244 u64 extent_flags;
3245 u64 extent_gen;
3246 u64 scrub_len;
3247
3248
3249 if (atomic_read(&fs_info->scrub_cancel_req) ||
3250 atomic_read(&sctx->cancel_req)) {
3251 ret = -ECANCELED;
3252 break;
3253 }
3254
3255 if (atomic_read(&fs_info->scrub_pause_req)) {
3256
3257 sctx->flush_all_writes = true;
3258 scrub_submit(sctx);
3259 mutex_lock(&sctx->wr_lock);
3260 scrub_wr_submit(sctx);
3261 mutex_unlock(&sctx->wr_lock);
3262 wait_event(sctx->list_wait,
3263 atomic_read(&sctx->bios_in_flight) == 0);
3264 sctx->flush_all_writes = false;
3265 scrub_blocked_if_needed(fs_info);
3266 }
3267
3268 spin_lock(&bg->lock);
3269 if (bg->removed) {
3270 spin_unlock(&bg->lock);
3271 ret = 0;
3272 break;
3273 }
3274 spin_unlock(&bg->lock);
3275
3276 ret = find_first_extent_item(extent_root, &path, cur_logical,
3277 logical_end - cur_logical);
3278 if (ret > 0) {
3279
3280 sctx->stat.last_physical = physical + logical_length;
3281 ret = 0;
3282 break;
3283 }
3284 if (ret < 0)
3285 break;
3286 get_extent_info(&path, &extent_start, &extent_len,
3287 &extent_flags, &extent_gen);
3288
3289 cur_logical = max(extent_start, cur_logical);
3290
3291
3292
3293
3294
3295
3296
3297
3298
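		/*
		 * The length to scrub this round is bounded by the extent end,
		 * the end of the requested range and the 64K max_length cap.
		 */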
3299 scrub_len = min(min(extent_start + extent_len,
3300 logical_end), cur_logical + max_length) -
3301 cur_logical;
3302
3303 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
3304 ret = btrfs_lookup_csums_range(csum_root, cur_logical,
3305 cur_logical + scrub_len - 1,
3306 &sctx->csum_list, 1);
3307 if (ret)
3308 break;
3309 }
3310 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3311 does_range_cross_boundary(extent_start, extent_len,
3312 logical_start, logical_length)) {
3313 btrfs_err(fs_info,
3314 "scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
3315 extent_start, logical_start, logical_end);
3316 spin_lock(&sctx->stat_lock);
3317 sctx->stat.uncorrectable_errors++;
3318 spin_unlock(&sctx->stat_lock);
3319 cur_logical += scrub_len;
3320 continue;
3321 }
3322 ret = scrub_extent(sctx, map, cur_logical, scrub_len,
3323 cur_logical - logical_start + physical,
3324 device, extent_flags, extent_gen,
3325 mirror_num);
3326 scrub_free_csums(sctx);
3327 if (ret)
3328 break;
3329 if (sctx->is_dev_replace)
3330 sync_replace_for_zoned(sctx);
3331 cur_logical += scrub_len;
3332
3333 cond_resched();
3334 }
3335 btrfs_release_path(&path);
3336 return ret;
3337 }
3338
3339
3340 static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
3341 {
3342 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3343 BTRFS_BLOCK_GROUP_RAID10));
3344
3345 return map->num_stripes / map->sub_stripes * map->stripe_len;
3346 }
3347
3348
3349 static u64 simple_stripe_get_logical(struct map_lookup *map,
3350 struct btrfs_block_group *bg,
3351 int stripe_index)
3352 {
3353 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3354 BTRFS_BLOCK_GROUP_RAID10));
3355 ASSERT(stripe_index < map->num_stripes);
3356
3357
3358
3359
3360
3361 return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
3362 }
3363
3364
3365 static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
3366 {
3367 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3368 BTRFS_BLOCK_GROUP_RAID10));
3369 ASSERT(stripe_index < map->num_stripes);
3370
3371
3372 return stripe_index % map->sub_stripes + 1;
3373 }
3374
3375 static int scrub_simple_stripe(struct scrub_ctx *sctx,
3376 struct btrfs_root *extent_root,
3377 struct btrfs_root *csum_root,
3378 struct btrfs_block_group *bg,
3379 struct map_lookup *map,
3380 struct btrfs_device *device,
3381 int stripe_index)
3382 {
3383 const u64 logical_increment = simple_stripe_full_stripe_len(map);
3384 const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
3385 const u64 orig_physical = map->stripes[stripe_index].physical;
3386 const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
3387 u64 cur_logical = orig_logical;
3388 u64 cur_physical = orig_physical;
3389 int ret = 0;
3390
3391 while (cur_logical < bg->start + bg->length) {
3392
3393
3394
3395
3396
3397 ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
3398 cur_logical, map->stripe_len, device,
3399 cur_physical, mirror_num);
3400 if (ret)
3401 return ret;
3402
3403 cur_logical += logical_increment;
3404
3405 cur_physical += map->stripe_len;
3406 }
3407 return ret;
3408 }
3409
3410 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3411 struct btrfs_block_group *bg,
3412 struct extent_map *em,
3413 struct btrfs_device *scrub_dev,
3414 int stripe_index)
3415 {
3416 struct btrfs_path *path;
3417 struct btrfs_fs_info *fs_info = sctx->fs_info;
3418 struct btrfs_root *root;
3419 struct btrfs_root *csum_root;
3420 struct blk_plug plug;
3421 struct map_lookup *map = em->map_lookup;
3422 const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3423 const u64 chunk_logical = bg->start;
3424 int ret;
3425 u64 physical = map->stripes[stripe_index].physical;
3426 const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
3427 const u64 physical_end = physical + dev_stripe_len;
3428 u64 logical;
3429 u64 logic_end;
3430
3431 u64 increment;
3432
3433 u64 offset;
3434 u64 stripe_logical;
3435 u64 stripe_end;
3436 int stop_loop = 0;
3437
3438 path = btrfs_alloc_path();
3439 if (!path)
3440 return -ENOMEM;
3441
3442
3443
3444
3445
3446
3447 path->search_commit_root = 1;
3448 path->skip_locking = 1;
3449 path->reada = READA_FORWARD;
3450
3451 wait_event(sctx->list_wait,
3452 atomic_read(&sctx->bios_in_flight) == 0);
3453 scrub_blocked_if_needed(fs_info);
3454
3455 root = btrfs_extent_root(fs_info, bg->start);
3456 csum_root = btrfs_csum_root(fs_info, bg->start);
3457
3458
3459
3460
3461
3462 blk_start_plug(&plug);
3463
3464 if (sctx->is_dev_replace &&
3465 btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3466 mutex_lock(&sctx->wr_lock);
3467 sctx->write_pointer = physical;
3468 mutex_unlock(&sctx->wr_lock);
3469 sctx->flush_all_writes = true;
3470 }
3471
3472
3473
3474
3475
3476
3477
3478
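	/*
	 * Each profile is handled by its own helper below, so simpler
	 * profiles get a simpler scrub path.
	 */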
3479 if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
3480 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
3481
3482
3483
3484
3485
3486
3487
3488
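		/*
		 * The check above rules out all striped profiles, leaving
		 * SINGLE/DUP/RAID1/RAID1C*: plain mirrored duplication where
		 * only @physical and @mirror_num depend on @stripe_index.
		 */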
3489 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3490 bg->start, bg->length, scrub_dev,
3491 map->stripes[stripe_index].physical,
3492 stripe_index + 1);
3493 offset = 0;
3494 goto out;
3495 }
3496 if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3497 ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
3498 scrub_dev, stripe_index);
3499 offset = map->stripe_len * (stripe_index / map->sub_stripes);
3500 goto out;
3501 }
3502
3503
3504 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3505 ret = 0;
3506
3507
3508 get_raid56_logic_offset(physical_end, stripe_index,
3509 map, &logic_end, NULL);
3510 logic_end += chunk_logical;
3511
3512
3513 get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
3514 increment = map->stripe_len * nr_data_stripes(map);
3515
3516
3517
3518
3519
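	/*
	 * For RAID56 iterate by physical offset: each device stripe is either
	 * a data stripe, scrubbed like a simple mirror, or a P/Q stripe,
	 * handled by scrub_raid56_parity().
	 */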
3520 while (physical < physical_end) {
3521 ret = get_raid56_logic_offset(physical, stripe_index, map,
3522 &logical, &stripe_logical);
3523 logical += chunk_logical;
3524 if (ret) {
3525
3526 stripe_logical += chunk_logical;
3527 stripe_end = stripe_logical + increment;
3528 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3529 stripe_logical,
3530 stripe_end);
3531 if (ret)
3532 goto out;
3533 goto next;
3534 }
3535
3536
3537
3538
3539
3540
3541
3542
3543
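		/*
		 * This is a data stripe.  Inside one data stripe the layout is
		 * no different from SINGLE, so reuse scrub_simple_mirror();
		 * repair still goes through the mirror based paths.
		 */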
3544 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3545 logical, map->stripe_len,
3546 scrub_dev, physical, 1);
3547 if (ret < 0)
3548 goto out;
3549 next:
3550 logical += increment;
3551 physical += map->stripe_len;
3552 spin_lock(&sctx->stat_lock);
3553 if (stop_loop)
3554 sctx->stat.last_physical =
3555 map->stripes[stripe_index].physical + dev_stripe_len;
3556 else
3557 sctx->stat.last_physical = physical;
3558 spin_unlock(&sctx->stat_lock);
3559 if (stop_loop)
3560 break;
3561 }
3562 out:
3563
3564 scrub_submit(sctx);
3565 mutex_lock(&sctx->wr_lock);
3566 scrub_wr_submit(sctx);
3567 mutex_unlock(&sctx->wr_lock);
3568
3569 blk_finish_plug(&plug);
3570 btrfs_free_path(path);
3571
3572 if (sctx->is_dev_replace && ret >= 0) {
3573 int ret2;
3574
3575 ret2 = sync_write_pointer_for_zoned(sctx,
3576 chunk_logical + offset,
3577 map->stripes[stripe_index].physical,
3578 physical_end);
3579 if (ret2)
3580 ret = ret2;
3581 }
3582
3583 return ret < 0 ? ret : 0;
3584 }
3585
3586 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3587 struct btrfs_block_group *bg,
3588 struct btrfs_device *scrub_dev,
3589 u64 dev_offset,
3590 u64 dev_extent_len)
3591 {
3592 struct btrfs_fs_info *fs_info = sctx->fs_info;
3593 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3594 struct map_lookup *map;
3595 struct extent_map *em;
3596 int i;
3597 int ret = 0;
3598
3599 read_lock(&map_tree->lock);
3600 em = lookup_extent_mapping(map_tree, bg->start, bg->length);
3601 read_unlock(&map_tree->lock);
3602
3603 if (!em) {
3604
3605
3606
3607
3608 spin_lock(&bg->lock);
3609 if (!bg->removed)
3610 ret = -EINVAL;
3611 spin_unlock(&bg->lock);
3612
3613 return ret;
3614 }
3615 if (em->start != bg->start)
3616 goto out;
3617 if (em->len < dev_extent_len)
3618 goto out;
3619
3620 map = em->map_lookup;
3621 for (i = 0; i < map->num_stripes; ++i) {
3622 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3623 map->stripes[i].physical == dev_offset) {
3624 ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
3625 if (ret)
3626 goto out;
3627 }
3628 }
3629 out:
3630 free_extent_map(em);
3631
3632 return ret;
3633 }
3634
3635 static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3636 struct btrfs_block_group *cache)
3637 {
3638 struct btrfs_fs_info *fs_info = cache->fs_info;
3639 struct btrfs_trans_handle *trans;
3640
3641 if (!btrfs_is_zoned(fs_info))
3642 return 0;
3643
3644 btrfs_wait_block_group_reservations(cache);
3645 btrfs_wait_nocow_writers(cache);
3646 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3647
3648 trans = btrfs_join_transaction(root);
3649 if (IS_ERR(trans))
3650 return PTR_ERR(trans);
3651 return btrfs_commit_transaction(trans);
3652 }
3653
3654 static noinline_for_stack
3655 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3656 struct btrfs_device *scrub_dev, u64 start, u64 end)
3657 {
3658 struct btrfs_dev_extent *dev_extent = NULL;
3659 struct btrfs_path *path;
3660 struct btrfs_fs_info *fs_info = sctx->fs_info;
3661 struct btrfs_root *root = fs_info->dev_root;
3662 u64 chunk_offset;
3663 int ret = 0;
3664 int ro_set;
3665 int slot;
3666 struct extent_buffer *l;
3667 struct btrfs_key key;
3668 struct btrfs_key found_key;
3669 struct btrfs_block_group *cache;
3670 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3671
3672 path = btrfs_alloc_path();
3673 if (!path)
3674 return -ENOMEM;
3675
3676 path->reada = READA_FORWARD;
3677 path->search_commit_root = 1;
3678 path->skip_locking = 1;
3679
3680 key.objectid = scrub_dev->devid;
3681 key.offset = 0ull;
3682 key.type = BTRFS_DEV_EXTENT_KEY;
3683
3684 while (1) {
3685 u64 dev_extent_len;
3686
3687 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3688 if (ret < 0)
3689 break;
3690 if (ret > 0) {
3691 if (path->slots[0] >=
3692 btrfs_header_nritems(path->nodes[0])) {
3693 ret = btrfs_next_leaf(root, path);
3694 if (ret < 0)
3695 break;
3696 if (ret > 0) {
3697 ret = 0;
3698 break;
3699 }
3700 } else {
3701 ret = 0;
3702 }
3703 }
3704
3705 l = path->nodes[0];
3706 slot = path->slots[0];
3707
3708 btrfs_item_key_to_cpu(l, &found_key, slot);
3709
3710 if (found_key.objectid != scrub_dev->devid)
3711 break;
3712
3713 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3714 break;
3715
3716 if (found_key.offset >= end)
3717 break;
3718
3719 if (found_key.offset < key.offset)
3720 break;
3721
3722 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3723 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
3724
3725 if (found_key.offset + dev_extent_len <= start)
3726 goto skip;
3727
3728 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3729
3730
3731
3732
3733
3734 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3735
3736
3737
3738 if (!cache)
3739 goto skip;
3740
3741 ASSERT(cache->start <= chunk_offset);
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
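		/*
		 * The device extent was found in the commit root, so the block
		 * group it belonged to may have been deleted and its logical
		 * range reused by a newer block group whose start is below
		 * @chunk_offset.  There is nothing to scrub in that case, and
		 * dev-replace already duplicates new writes, so skip it.
		 */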
3761 if (cache->start < chunk_offset) {
3762 btrfs_put_block_group(cache);
3763 goto skip;
3764 }
3765
3766 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3767 spin_lock(&cache->lock);
3768 if (!cache->to_copy) {
3769 spin_unlock(&cache->lock);
3770 btrfs_put_block_group(cache);
3771 goto skip;
3772 }
3773 spin_unlock(&cache->lock);
3774 }
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784 spin_lock(&cache->lock);
3785 if (cache->removed) {
3786 spin_unlock(&cache->lock);
3787 btrfs_put_block_group(cache);
3788 goto skip;
3789 }
3790 btrfs_freeze_block_group(cache);
3791 spin_unlock(&cache->lock);
3792
3793
3794
3795
3796
3797
3798
3799
3800
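		/*
		 * Mark the scrub as paused before calling
		 * btrfs_inc_block_group_ro(), which may wait for a transaction
		 * commit; a committing transaction in turn waits for scrub to
		 * pause, so doing it in the other order could deadlock.
		 */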
3801 scrub_pause_on(fs_info);
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3834 if (!ret && sctx->is_dev_replace) {
3835 ret = finish_extent_writes_for_zoned(root, cache);
3836 if (ret) {
3837 btrfs_dec_block_group_ro(cache);
3838 scrub_pause_off(fs_info);
3839 btrfs_put_block_group(cache);
3840 break;
3841 }
3842 }
3843
3844 if (ret == 0) {
3845 ro_set = 1;
3846 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3847
3848
3849
3850
3851
3852
3853
3854 ro_set = 0;
3855 } else if (ret == -ETXTBSY) {
3856 btrfs_warn(fs_info,
3857 "skipping scrub of block group %llu due to active swapfile",
3858 cache->start);
3859 scrub_pause_off(fs_info);
3860 ret = 0;
3861 goto skip_unfreeze;
3862 } else {
3863 btrfs_warn(fs_info,
3864 "failed setting block group ro: %d", ret);
3865 btrfs_unfreeze_block_group(cache);
3866 btrfs_put_block_group(cache);
3867 scrub_pause_off(fs_info);
3868 break;
3869 }
3870
3871
3872
3873
3874
3875
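		/*
		 * The block group is now RO; for dev-replace wait for nocow
		 * writes to finish, as those overwrite extents in place.  COW
		 * writes are fine since they never touch committed extents.
		 */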
3876 if (sctx->is_dev_replace) {
3877 btrfs_wait_nocow_writers(cache);
3878 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3879 cache->length);
3880 }
3881
3882 scrub_pause_off(fs_info);
3883 down_write(&dev_replace->rwsem);
3884 dev_replace->cursor_right = found_key.offset + dev_extent_len;
3885 dev_replace->cursor_left = found_key.offset;
3886 dev_replace->item_needs_writeback = 1;
3887 up_write(&dev_replace->rwsem);
3888
3889 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
3890 dev_extent_len);
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
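		/*
		 * Flush and wait for all pending read and write bios.  For
		 * dev-replace, reads trigger writes from the read completion
		 * worker, so all writes must be flushed before bios_in_flight
		 * can reach zero.
		 */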
3902 sctx->flush_all_writes = true;
3903 scrub_submit(sctx);
3904 mutex_lock(&sctx->wr_lock);
3905 scrub_wr_submit(sctx);
3906 mutex_unlock(&sctx->wr_lock);
3907
3908 wait_event(sctx->list_wait,
3909 atomic_read(&sctx->bios_in_flight) == 0);
3910
3911 scrub_pause_on(fs_info);
3912
3913
3914
3915
3916
3917
3918 wait_event(sctx->list_wait,
3919 atomic_read(&sctx->workers_pending) == 0);
3920 sctx->flush_all_writes = false;
3921
3922 scrub_pause_off(fs_info);
3923
3924 if (sctx->is_dev_replace &&
3925 !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3926 cache, found_key.offset))
3927 ro_set = 0;
3928
3929 down_write(&dev_replace->rwsem);
3930 dev_replace->cursor_left = dev_replace->cursor_right;
3931 dev_replace->item_needs_writeback = 1;
3932 up_write(&dev_replace->rwsem);
3933
3934 if (ro_set)
3935 btrfs_dec_block_group_ro(cache);
3936
3937
3938
3939
3940
3941
3942
3943
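		/*
		 * Setting the block group RO may have kept the cleaner from
		 * deleting it while it was unused.  If it is still empty, hand
		 * it back to discard or mark it unused so it can be reclaimed.
		 */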
3944 spin_lock(&cache->lock);
3945 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3946 cache->used == 0) {
3947 spin_unlock(&cache->lock);
3948 if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3949 btrfs_discard_queue_work(&fs_info->discard_ctl,
3950 cache);
3951 else
3952 btrfs_mark_bg_unused(cache);
3953 } else {
3954 spin_unlock(&cache->lock);
3955 }
3956 skip_unfreeze:
3957 btrfs_unfreeze_block_group(cache);
3958 btrfs_put_block_group(cache);
3959 if (ret)
3960 break;
3961 if (sctx->is_dev_replace &&
3962 atomic64_read(&dev_replace->num_write_errors) > 0) {
3963 ret = -EIO;
3964 break;
3965 }
3966 if (sctx->stat.malloc_errors > 0) {
3967 ret = -ENOMEM;
3968 break;
3969 }
3970 skip:
3971 key.offset = found_key.offset + dev_extent_len;
3972 btrfs_release_path(path);
3973 }
3974
3975 btrfs_free_path(path);
3976
3977 return ret;
3978 }
3979
3980 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3981 struct btrfs_device *scrub_dev)
3982 {
3983 int i;
3984 u64 bytenr;
3985 u64 gen;
3986 int ret;
3987 struct btrfs_fs_info *fs_info = sctx->fs_info;
3988
3989 if (BTRFS_FS_ERROR(fs_info))
3990 return -EROFS;
3991
3992
3993 if (scrub_dev->fs_devices != fs_info->fs_devices)
3994 gen = scrub_dev->generation;
3995 else
3996 gen = fs_info->last_trans_committed;
3997
3998 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3999 bytenr = btrfs_sb_offset(i);
4000 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4001 scrub_dev->commit_total_bytes)
4002 break;
4003 if (!btrfs_check_super_location(scrub_dev, bytenr))
4004 continue;
4005
4006 ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4007 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4008 NULL, bytenr);
4009 if (ret)
4010 return ret;
4011 }
4012 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4013
4014 return 0;
4015 }
4016
4017 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
4018 {
4019 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
4020 &fs_info->scrub_lock)) {
4021 struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
4022 struct workqueue_struct *scrub_wr_comp =
4023 fs_info->scrub_wr_completion_workers;
4024 struct workqueue_struct *scrub_parity =
4025 fs_info->scrub_parity_workers;
4026
4027 fs_info->scrub_workers = NULL;
4028 fs_info->scrub_wr_completion_workers = NULL;
4029 fs_info->scrub_parity_workers = NULL;
4030 mutex_unlock(&fs_info->scrub_lock);
4031
4032 if (scrub_workers)
4033 destroy_workqueue(scrub_workers);
4034 if (scrub_wr_comp)
4035 destroy_workqueue(scrub_wr_comp);
4036 if (scrub_parity)
4037 destroy_workqueue(scrub_parity);
4038 }
4039 }
4040
4041
4042
4043
4044 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4045 int is_dev_replace)
4046 {
4047 struct workqueue_struct *scrub_workers = NULL;
4048 struct workqueue_struct *scrub_wr_comp = NULL;
4049 struct workqueue_struct *scrub_parity = NULL;
4050 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4051 int max_active = fs_info->thread_pool_size;
4052 int ret = -ENOMEM;
4053
4054 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
4055 return 0;
4056
4057 scrub_workers = alloc_workqueue("btrfs-scrub", flags,
4058 is_dev_replace ? 1 : max_active);
4059 if (!scrub_workers)
4060 goto fail_scrub_workers;
4061
4062 scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
4063 if (!scrub_wr_comp)
4064 goto fail_scrub_wr_completion_workers;
4065
4066 scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
4067 if (!scrub_parity)
4068 goto fail_scrub_parity_workers;
4069
4070 mutex_lock(&fs_info->scrub_lock);
4071 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4072 ASSERT(fs_info->scrub_workers == NULL &&
4073 fs_info->scrub_wr_completion_workers == NULL &&
4074 fs_info->scrub_parity_workers == NULL);
4075 fs_info->scrub_workers = scrub_workers;
4076 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4077 fs_info->scrub_parity_workers = scrub_parity;
4078 refcount_set(&fs_info->scrub_workers_refcnt, 1);
4079 mutex_unlock(&fs_info->scrub_lock);
4080 return 0;
4081 }
4082
4083 refcount_inc(&fs_info->scrub_workers_refcnt);
4084 mutex_unlock(&fs_info->scrub_lock);
4085
4086 ret = 0;
4087 destroy_workqueue(scrub_parity);
4088 fail_scrub_parity_workers:
4089 destroy_workqueue(scrub_wr_comp);
4090 fail_scrub_wr_completion_workers:
4091 destroy_workqueue(scrub_workers);
4092 fail_scrub_workers:
4093 return ret;
4094 }
4095
4096 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4097 u64 end, struct btrfs_scrub_progress *progress,
4098 int readonly, int is_dev_replace)
4099 {
4100 struct btrfs_dev_lookup_args args = { .devid = devid };
4101 struct scrub_ctx *sctx;
4102 int ret;
4103 struct btrfs_device *dev;
4104 unsigned int nofs_flag;
4105
4106 if (btrfs_fs_closing(fs_info))
4107 return -EAGAIN;
4108
4109 if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4110
4111
4112
4113
4114
4115 btrfs_err(fs_info,
4116 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4117 fs_info->nodesize,
4118 BTRFS_STRIPE_LEN);
4119 return -EINVAL;
4120 }
4121
4122 if (fs_info->nodesize >
4123 SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
4124 fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
4125
4126
4127
4128
4129 btrfs_err(fs_info,
4130 "scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
4131 fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
4132 fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
4133 return -EINVAL;
4134 }
4135
4136
4137 sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4138 if (IS_ERR(sctx))
4139 return PTR_ERR(sctx);
4140
4141 ret = scrub_workers_get(fs_info, is_dev_replace);
4142 if (ret)
4143 goto out_free_ctx;
4144
4145 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4146 dev = btrfs_find_device(fs_info->fs_devices, &args);
4147 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4148 !is_dev_replace)) {
4149 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4150 ret = -ENODEV;
4151 goto out;
4152 }
4153
4154 if (!is_dev_replace && !readonly &&
4155 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4156 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4157 btrfs_err_in_rcu(fs_info,
4158 "scrub on devid %llu: filesystem on %s is not writable",
4159 devid, rcu_str_deref(dev->name));
4160 ret = -EROFS;
4161 goto out;
4162 }
4163
4164 mutex_lock(&fs_info->scrub_lock);
4165 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4166 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4167 mutex_unlock(&fs_info->scrub_lock);
4168 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4169 ret = -EIO;
4170 goto out;
4171 }
4172
4173 down_read(&fs_info->dev_replace.rwsem);
4174 if (dev->scrub_ctx ||
4175 (!is_dev_replace &&
4176 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4177 up_read(&fs_info->dev_replace.rwsem);
4178 mutex_unlock(&fs_info->scrub_lock);
4179 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4180 ret = -EINPROGRESS;
4181 goto out;
4182 }
4183 up_read(&fs_info->dev_replace.rwsem);
4184
4185 sctx->readonly = readonly;
4186 dev->scrub_ctx = sctx;
4187 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4188
4189
4190
4191
4192
4193 __scrub_blocked_if_needed(fs_info);
4194 atomic_inc(&fs_info->scrubs_running);
4195 mutex_unlock(&fs_info->scrub_lock);
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
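	/*
	 * Use GFP_NOFS for all allocations done while scrub is running to
	 * avoid a deadlock with reclaim: a transaction commit waiting for
	 * scrub to pause must not be triggered from inside scrub's own
	 * memory allocations.
	 */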
4206 nofs_flag = memalloc_nofs_save();
4207 if (!is_dev_replace) {
4208 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4209
4210
4211
4212
4213 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4214 ret = scrub_supers(sctx, dev);
4215 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4216 }
4217
4218 if (!ret)
4219 ret = scrub_enumerate_chunks(sctx, dev, start, end);
4220 memalloc_nofs_restore(nofs_flag);
4221
4222 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4223 atomic_dec(&fs_info->scrubs_running);
4224 wake_up(&fs_info->scrub_pause_wait);
4225
4226 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4227
4228 if (progress)
4229 memcpy(progress, &sctx->stat, sizeof(*progress));
4230
4231 if (!is_dev_replace)
4232 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4233 ret ? "not finished" : "finished", devid, ret);
4234
4235 mutex_lock(&fs_info->scrub_lock);
4236 dev->scrub_ctx = NULL;
4237 mutex_unlock(&fs_info->scrub_lock);
4238
4239 scrub_workers_put(fs_info);
4240 scrub_put_ctx(sctx);
4241
4242 return ret;
4243 out:
4244 scrub_workers_put(fs_info);
4245 out_free_ctx:
4246 scrub_free_ctx(sctx);
4247
4248 return ret;
4249 }
4250
4251 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4252 {
4253 mutex_lock(&fs_info->scrub_lock);
4254 atomic_inc(&fs_info->scrub_pause_req);
4255 while (atomic_read(&fs_info->scrubs_paused) !=
4256 atomic_read(&fs_info->scrubs_running)) {
4257 mutex_unlock(&fs_info->scrub_lock);
4258 wait_event(fs_info->scrub_pause_wait,
4259 atomic_read(&fs_info->scrubs_paused) ==
4260 atomic_read(&fs_info->scrubs_running));
4261 mutex_lock(&fs_info->scrub_lock);
4262 }
4263 mutex_unlock(&fs_info->scrub_lock);
4264 }
4265
4266 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4267 {
4268 atomic_dec(&fs_info->scrub_pause_req);
4269 wake_up(&fs_info->scrub_pause_wait);
4270 }
4271
4272 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4273 {
4274 mutex_lock(&fs_info->scrub_lock);
4275 if (!atomic_read(&fs_info->scrubs_running)) {
4276 mutex_unlock(&fs_info->scrub_lock);
4277 return -ENOTCONN;
4278 }
4279
4280 atomic_inc(&fs_info->scrub_cancel_req);
4281 while (atomic_read(&fs_info->scrubs_running)) {
4282 mutex_unlock(&fs_info->scrub_lock);
4283 wait_event(fs_info->scrub_pause_wait,
4284 atomic_read(&fs_info->scrubs_running) == 0);
4285 mutex_lock(&fs_info->scrub_lock);
4286 }
4287 atomic_dec(&fs_info->scrub_cancel_req);
4288 mutex_unlock(&fs_info->scrub_lock);
4289
4290 return 0;
4291 }
4292
4293 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4294 {
4295 struct btrfs_fs_info *fs_info = dev->fs_info;
4296 struct scrub_ctx *sctx;
4297
4298 mutex_lock(&fs_info->scrub_lock);
4299 sctx = dev->scrub_ctx;
4300 if (!sctx) {
4301 mutex_unlock(&fs_info->scrub_lock);
4302 return -ENOTCONN;
4303 }
4304 atomic_inc(&sctx->cancel_req);
4305 while (dev->scrub_ctx) {
4306 mutex_unlock(&fs_info->scrub_lock);
4307 wait_event(fs_info->scrub_pause_wait,
4308 dev->scrub_ctx == NULL);
4309 mutex_lock(&fs_info->scrub_lock);
4310 }
4311 mutex_unlock(&fs_info->scrub_lock);
4312
4313 return 0;
4314 }
4315
4316 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4317 struct btrfs_scrub_progress *progress)
4318 {
4319 struct btrfs_dev_lookup_args args = { .devid = devid };
4320 struct btrfs_device *dev;
4321 struct scrub_ctx *sctx = NULL;
4322
4323 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4324 dev = btrfs_find_device(fs_info->fs_devices, &args);
4325 if (dev)
4326 sctx = dev->scrub_ctx;
4327 if (sctx)
4328 memcpy(progress, &sctx->stat, sizeof(*progress));
4329 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4330
4331 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4332 }
4333
4334 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
4335 u64 extent_logical, u32 extent_len,
4336 u64 *extent_physical,
4337 struct btrfs_device **extent_dev,
4338 int *extent_mirror_num)
4339 {
4340 u64 mapped_length;
4341 struct btrfs_io_context *bioc = NULL;
4342 int ret;
4343
4344 mapped_length = extent_len;
4345 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4346 &mapped_length, &bioc, 0);
4347 if (ret || !bioc || mapped_length < extent_len ||
4348 !bioc->stripes[0].dev->bdev) {
4349 btrfs_put_bioc(bioc);
4350 return;
4351 }
4352
4353 *extent_physical = bioc->stripes[0].physical;
4354 *extent_mirror_num = bioc->mirror_num;
4355 *extent_dev = bioc->stripes[0].dev;
4356 btrfs_put_bioc(bioc);
4357 }