0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
0004  */
0005 
0006 #include <linux/blkdev.h>
0007 #include <linux/ratelimit.h>
0008 #include <linux/sched/mm.h>
0009 #include <crypto/hash.h>
0010 #include "ctree.h"
0011 #include "discard.h"
0012 #include "volumes.h"
0013 #include "disk-io.h"
0014 #include "ordered-data.h"
0015 #include "transaction.h"
0016 #include "backref.h"
0017 #include "extent_io.h"
0018 #include "dev-replace.h"
0019 #include "check-integrity.h"
0020 #include "rcu-string.h"
0021 #include "raid56.h"
0022 #include "block-group.h"
0023 #include "zoned.h"
0024 
0025 /*
0026  * This is only the first step towards a full-featured scrub. It reads all
0027  * extents and super blocks and verifies the checksums. In case a bad checksum
0028  * is found or the extent cannot be read, good data will be written back if
0029  * any can be found.
0030  *
0031  * Future enhancements:
0032  *  - In case an unrepairable extent is encountered, track which files are
0033  *    affected and report them
0034  *  - track and record media errors, throw out bad devices
0035  *  - add a mode to also read unallocated space
0036  */
0037 
0038 struct scrub_block;
0039 struct scrub_ctx;
0040 
0041 /*
0042  * The following two values only influence the performance.
0043  *
0044  * The first one configures an upper limit for the number of (dynamically
0045  * allocated) sectors that are added to a bio. The second one configures the
0046  * number of parallel and outstanding I/O operations per device.
0047  */
0048 #define SCRUB_SECTORS_PER_BIO   32  /* 128KiB per bio for 4KiB pages */
0049 #define SCRUB_BIOS_PER_SCTX 64  /* 8MiB per device in flight for 4KiB pages */
0050 
0051 /*
0052  * The following value times 4KiB needs to be large enough to match the
0053  * largest node/leaf/sector size that shall be supported.
0054  */
0055 #define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
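
/*
 * Illustrative arithmetic for the constants above (editor's sketch, not part
 * of the original source), assuming 4KiB sectors:
 *
 *	SCRUB_SECTORS_PER_BIO:       32 sectors * 4KiB = 128KiB per bio
 *	SCRUB_BIOS_PER_SCTX:         64 bios * 128KiB  = 8MiB in flight per device
 *	SCRUB_MAX_SECTORS_PER_BLOCK: 64KiB / 4KiB      = 16 sectors, assuming the
 *	                             usual 64KiB BTRFS_MAX_METADATA_BLOCKSIZE
 */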
0056 
0057 struct scrub_recover {
0058     refcount_t      refs;
0059     struct btrfs_io_context *bioc;
0060     u64         map_length;
0061 };
0062 
0063 struct scrub_sector {
0064     struct scrub_block  *sblock;
0065     struct page     *page;
0066     struct btrfs_device *dev;
0067     struct list_head    list;
0068     u64         flags;  /* extent flags */
0069     u64         generation;
0070     u64         logical;
0071     u64         physical;
0072     u64         physical_for_dev_replace;
0073     atomic_t        refs;
0074     u8          mirror_num;
0075     unsigned int        have_csum:1;
0076     unsigned int        io_error:1;
0077     u8          csum[BTRFS_CSUM_SIZE];
0078 
0079     struct scrub_recover    *recover;
0080 };
0081 
0082 struct scrub_bio {
0083     int         index;
0084     struct scrub_ctx    *sctx;
0085     struct btrfs_device *dev;
0086     struct bio      *bio;
0087     blk_status_t        status;
0088     u64         logical;
0089     u64         physical;
0090     struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO];
0091     int         sector_count;
0092     int         next_free;
0093     struct work_struct  work;
0094 };
0095 
0096 struct scrub_block {
0097     struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
0098     int         sector_count;
0099     atomic_t        outstanding_sectors;
0100     refcount_t      refs; /* free mem on transition to zero */
0101     struct scrub_ctx    *sctx;
0102     struct scrub_parity *sparity;
0103     struct {
0104         unsigned int    header_error:1;
0105         unsigned int    checksum_error:1;
0106         unsigned int    no_io_error_seen:1;
0107         unsigned int    generation_error:1; /* also sets header_error */
0108 
0109         /* The following is for the data used to check parity */
0110         /* It is for the data with checksum */
0111         unsigned int    data_corrected:1;
0112     };
0113     struct work_struct  work;
0114 };
0115 
0116 /* Used for the chunks with parity stripes, such as RAID5/6 */
0117 struct scrub_parity {
0118     struct scrub_ctx    *sctx;
0119 
0120     struct btrfs_device *scrub_dev;
0121 
0122     u64         logic_start;
0123 
0124     u64         logic_end;
0125 
0126     int         nsectors;
0127 
0128     u32         stripe_len;
0129 
0130     refcount_t      refs;
0131 
0132     struct list_head    sectors_list;
0133 
0134     /* Work of parity check and repair */
0135     struct work_struct  work;
0136 
0137     /* Mark the parity blocks which have data */
0138     unsigned long       dbitmap;
0139 
0140     /*
0141      * Mark the parity blocks which have data, but where errors happened
0142      * when reading or checking the data
0143      */
0144     unsigned long       ebitmap;
0145 };
0146 
0147 struct scrub_ctx {
0148     struct scrub_bio    *bios[SCRUB_BIOS_PER_SCTX];
0149     struct btrfs_fs_info    *fs_info;
0150     int         first_free;
0151     int         curr;
0152     atomic_t        bios_in_flight;
0153     atomic_t        workers_pending;
0154     spinlock_t      list_lock;
0155     wait_queue_head_t   list_wait;
0156     struct list_head    csum_list;
0157     atomic_t        cancel_req;
0158     int         readonly;
0159     int         sectors_per_bio;
0160 
0161     /* State of IO submission throttling affecting the associated device */
0162     ktime_t         throttle_deadline;
0163     u64         throttle_sent;
0164 
0165     int         is_dev_replace;
0166     u64         write_pointer;
0167 
0168     struct scrub_bio        *wr_curr_bio;
0169     struct mutex            wr_lock;
0170     struct btrfs_device     *wr_tgtdev;
0171     bool                    flush_all_writes;
0172 
0173     /*
0174      * statistics
0175      */
0176     struct btrfs_scrub_progress stat;
0177     spinlock_t      stat_lock;
0178 
0179     /*
0180      * Use a ref counter to avoid use-after-free issues. Scrub workers
0181      * decrement bios_in_flight and workers_pending and then do a wakeup
0182      * on the list_wait wait queue. We must ensure the main scrub task
0183      * doesn't free the scrub context before or while the workers are
0184      * doing the wakeup() call.
0185      */
0186     refcount_t              refs;
0187 };
0188 
0189 struct scrub_warning {
0190     struct btrfs_path   *path;
0191     u64         extent_item_size;
0192     const char      *errstr;
0193     u64         physical;
0194     u64         logical;
0195     struct btrfs_device *dev;
0196 };
0197 
0198 struct full_stripe_lock {
0199     struct rb_node node;
0200     u64 logical;
0201     u64 refs;
0202     struct mutex mutex;
0203 };
0204 
0205 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
0206                      struct scrub_block *sblocks_for_recheck);
0207 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
0208                 struct scrub_block *sblock,
0209                 int retry_failed_mirror);
0210 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
0211 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
0212                          struct scrub_block *sblock_good);
0213 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
0214                         struct scrub_block *sblock_good,
0215                         int sector_num, int force_write);
0216 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
0217 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
0218                          int sector_num);
0219 static int scrub_checksum_data(struct scrub_block *sblock);
0220 static int scrub_checksum_tree_block(struct scrub_block *sblock);
0221 static int scrub_checksum_super(struct scrub_block *sblock);
0222 static void scrub_block_put(struct scrub_block *sblock);
0223 static void scrub_sector_get(struct scrub_sector *sector);
0224 static void scrub_sector_put(struct scrub_sector *sector);
0225 static void scrub_parity_get(struct scrub_parity *sparity);
0226 static void scrub_parity_put(struct scrub_parity *sparity);
0227 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
0228              u64 physical, struct btrfs_device *dev, u64 flags,
0229              u64 gen, int mirror_num, u8 *csum,
0230              u64 physical_for_dev_replace);
0231 static void scrub_bio_end_io(struct bio *bio);
0232 static void scrub_bio_end_io_worker(struct work_struct *work);
0233 static void scrub_block_complete(struct scrub_block *sblock);
0234 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
0235                  u64 extent_logical, u32 extent_len,
0236                  u64 *extent_physical,
0237                  struct btrfs_device **extent_dev,
0238                  int *extent_mirror_num);
0239 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
0240                       struct scrub_sector *sector);
0241 static void scrub_wr_submit(struct scrub_ctx *sctx);
0242 static void scrub_wr_bio_end_io(struct bio *bio);
0243 static void scrub_wr_bio_end_io_worker(struct work_struct *work);
0244 static void scrub_put_ctx(struct scrub_ctx *sctx);
0245 
0246 static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
0247 {
0248     return sector->recover &&
0249            (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
0250 }
0251 
0252 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
0253 {
0254     refcount_inc(&sctx->refs);
0255     atomic_inc(&sctx->bios_in_flight);
0256 }
0257 
0258 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
0259 {
0260     atomic_dec(&sctx->bios_in_flight);
0261     wake_up(&sctx->list_wait);
0262     scrub_put_ctx(sctx);
0263 }
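
/*
 * Editor's sketch (not part of the original source) of how these two helpers
 * are paired later in this file: the submission path bumps the counter before
 * sending a bio and the completion worker drops it, e.g.
 *
 *	scrub_pending_bio_inc(sctx);
 *	submit_bio(sbio->bio);
 *	...
 *	scrub_bio_end_io_worker(work);      // ends with scrub_pending_bio_dec(sctx)
 *
 * which lets the main scrub task drain all outstanding I/O with
 * wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0).
 */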
0264 
0265 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
0266 {
0267     while (atomic_read(&fs_info->scrub_pause_req)) {
0268         mutex_unlock(&fs_info->scrub_lock);
0269         wait_event(fs_info->scrub_pause_wait,
0270            atomic_read(&fs_info->scrub_pause_req) == 0);
0271         mutex_lock(&fs_info->scrub_lock);
0272     }
0273 }
0274 
0275 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
0276 {
0277     atomic_inc(&fs_info->scrubs_paused);
0278     wake_up(&fs_info->scrub_pause_wait);
0279 }
0280 
0281 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
0282 {
0283     mutex_lock(&fs_info->scrub_lock);
0284     __scrub_blocked_if_needed(fs_info);
0285     atomic_dec(&fs_info->scrubs_paused);
0286     mutex_unlock(&fs_info->scrub_lock);
0287 
0288     wake_up(&fs_info->scrub_pause_wait);
0289 }
0290 
0291 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
0292 {
0293     scrub_pause_on(fs_info);
0294     scrub_pause_off(fs_info);
0295 }
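
/*
 * Editor's sketch (not part of the original source) of the pause handshake as
 * used elsewhere in btrfs, paraphrased:
 *
 *	// pausing side (e.g. transaction commit):
 *	atomic_inc(&fs_info->scrub_pause_req);
 *	// wait until scrubs_paused == scrubs_running
 *	...
 *	atomic_dec(&fs_info->scrub_pause_req);
 *	wake_up(&fs_info->scrub_pause_wait);
 *
 *	// scrub side, at a safe point between stripes/chunks:
 *	scrub_blocked_if_needed(fs_info);   // scrub_pause_on() + scrub_pause_off()
 */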
0296 
0297 /*
0298  * Insert new full stripe lock into full stripe locks tree
0299  *
0300  * Return pointer to existing or newly inserted full_stripe_lock structure if
0301  * everything works well.
0302  * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
0303  *
0304  * NOTE: caller must hold full_stripe_locks_root->lock before calling this
0305  * function
0306  */
0307 static struct full_stripe_lock *insert_full_stripe_lock(
0308         struct btrfs_full_stripe_locks_tree *locks_root,
0309         u64 fstripe_logical)
0310 {
0311     struct rb_node **p;
0312     struct rb_node *parent = NULL;
0313     struct full_stripe_lock *entry;
0314     struct full_stripe_lock *ret;
0315 
0316     lockdep_assert_held(&locks_root->lock);
0317 
0318     p = &locks_root->root.rb_node;
0319     while (*p) {
0320         parent = *p;
0321         entry = rb_entry(parent, struct full_stripe_lock, node);
0322         if (fstripe_logical < entry->logical) {
0323             p = &(*p)->rb_left;
0324         } else if (fstripe_logical > entry->logical) {
0325             p = &(*p)->rb_right;
0326         } else {
0327             entry->refs++;
0328             return entry;
0329         }
0330     }
0331 
0332     /*
0333      * Insert new lock.
0334      */
0335     ret = kmalloc(sizeof(*ret), GFP_KERNEL);
0336     if (!ret)
0337         return ERR_PTR(-ENOMEM);
0338     ret->logical = fstripe_logical;
0339     ret->refs = 1;
0340     mutex_init(&ret->mutex);
0341 
0342     rb_link_node(&ret->node, parent, p);
0343     rb_insert_color(&ret->node, &locks_root->root);
0344     return ret;
0345 }
0346 
0347 /*
0348  * Search for a full stripe lock of a block group
0349  *
0350  * Return pointer to existing full stripe lock if found
0351  * Return NULL if not found
0352  */
0353 static struct full_stripe_lock *search_full_stripe_lock(
0354         struct btrfs_full_stripe_locks_tree *locks_root,
0355         u64 fstripe_logical)
0356 {
0357     struct rb_node *node;
0358     struct full_stripe_lock *entry;
0359 
0360     lockdep_assert_held(&locks_root->lock);
0361 
0362     node = locks_root->root.rb_node;
0363     while (node) {
0364         entry = rb_entry(node, struct full_stripe_lock, node);
0365         if (fstripe_logical < entry->logical)
0366             node = node->rb_left;
0367         else if (fstripe_logical > entry->logical)
0368             node = node->rb_right;
0369         else
0370             return entry;
0371     }
0372     return NULL;
0373 }
0374 
0375 /*
0376  * Helper to get full stripe logical from a normal bytenr.
0377  *
0378  * Caller must ensure @cache is a RAID56 block group.
0379  */
0380 static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
0381 {
0382     u64 ret;
0383 
0384     /*
0385      * Due to chunk item size limit, full stripe length should not be
0386      * larger than U32_MAX. Just a sanity check here.
0387      */
0388     WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
0389 
0390     /*
0391      * round_down() can only handle power of 2, while RAID56 full
0392      * stripe length can be 64KiB * n, so we need to manually round down.
0393      */
0394     ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
0395             cache->full_stripe_len + cache->start;
0396     return ret;
0397 }
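
/*
 * Editor's worked example (not part of the original source), assuming a RAID5
 * block group starting at 1MiB with a 128KiB full stripe (two 64KiB data
 * stripes): for bytenr = 1MiB + 300KiB,
 *
 *	div64_u64(300KiB, 128KiB) = 2, so the result is
 *	2 * 128KiB + 1MiB = 1MiB + 256KiB,
 *
 * i.e. the start of the full stripe containing bytenr.
 */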
0398 
0399 /*
0400  * Lock a full stripe to avoid concurrency of recovery and read
0401  *
0402  * It's only used for profiles with parities (RAID5/6), for other profiles it
0403  * does nothing.
0404  *
0405  * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
0406  * The caller must then call unlock_full_stripe() in the same context.
0407  *
0408  * Return <0 if an error is encountered.
0409  */
0410 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
0411                 bool *locked_ret)
0412 {
0413     struct btrfs_block_group *bg_cache;
0414     struct btrfs_full_stripe_locks_tree *locks_root;
0415     struct full_stripe_lock *existing;
0416     u64 fstripe_start;
0417     int ret = 0;
0418 
0419     *locked_ret = false;
0420     bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
0421     if (!bg_cache) {
0422         ASSERT(0);
0423         return -ENOENT;
0424     }
0425 
0426     /* Profiles not based on parity don't need full stripe lock */
0427     if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
0428         goto out;
0429     locks_root = &bg_cache->full_stripe_locks_root;
0430 
0431     fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
0432 
0433     /* Now insert the full stripe lock */
0434     mutex_lock(&locks_root->lock);
0435     existing = insert_full_stripe_lock(locks_root, fstripe_start);
0436     mutex_unlock(&locks_root->lock);
0437     if (IS_ERR(existing)) {
0438         ret = PTR_ERR(existing);
0439         goto out;
0440     }
0441     mutex_lock(&existing->mutex);
0442     *locked_ret = true;
0443 out:
0444     btrfs_put_block_group(bg_cache);
0445     return ret;
0446 }
0447 
0448 /*
0449  * Unlock a full stripe.
0450  *
0451  * NOTE: The caller must ensure this is called in the same context as the
0452  * corresponding lock_full_stripe().
0453  *
0454  * Return 0 if we unlock the full stripe without problems.
0455  * Return <0 on error.
0456  */
0457 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
0458                   bool locked)
0459 {
0460     struct btrfs_block_group *bg_cache;
0461     struct btrfs_full_stripe_locks_tree *locks_root;
0462     struct full_stripe_lock *fstripe_lock;
0463     u64 fstripe_start;
0464     bool freeit = false;
0465     int ret = 0;
0466 
0467     /* If we didn't acquire full stripe lock, no need to continue */
0468     if (!locked)
0469         return 0;
0470 
0471     bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
0472     if (!bg_cache) {
0473         ASSERT(0);
0474         return -ENOENT;
0475     }
0476     if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
0477         goto out;
0478 
0479     locks_root = &bg_cache->full_stripe_locks_root;
0480     fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
0481 
0482     mutex_lock(&locks_root->lock);
0483     fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
0484     /* Unpaired unlock_full_stripe() detected */
0485     if (!fstripe_lock) {
0486         WARN_ON(1);
0487         ret = -ENOENT;
0488         mutex_unlock(&locks_root->lock);
0489         goto out;
0490     }
0491 
0492     if (fstripe_lock->refs == 0) {
0493         WARN_ON(1);
0494         btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
0495             fstripe_lock->logical);
0496     } else {
0497         fstripe_lock->refs--;
0498     }
0499 
0500     if (fstripe_lock->refs == 0) {
0501         rb_erase(&fstripe_lock->node, &locks_root->root);
0502         freeit = true;
0503     }
0504     mutex_unlock(&locks_root->lock);
0505 
0506     mutex_unlock(&fstripe_lock->mutex);
0507     if (freeit)
0508         kfree(fstripe_lock);
0509 out:
0510     btrfs_put_block_group(bg_cache);
0511     return ret;
0512 }
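
/*
 * Editor's usage sketch (not part of the original source), mirroring how
 * scrub_handle_errored_block() below pairs the two helpers:
 *
 *	bool locked = false;
 *	int ret;
 *
 *	ret = lock_full_stripe(fs_info, logical, &locked);
 *	if (ret < 0)
 *		return ret;
 *	// ... recheck/repair sectors inside this full stripe ...
 *	ret = unlock_full_stripe(fs_info, logical, locked);
 */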
0513 
0514 static void scrub_free_csums(struct scrub_ctx *sctx)
0515 {
0516     while (!list_empty(&sctx->csum_list)) {
0517         struct btrfs_ordered_sum *sum;
0518         sum = list_first_entry(&sctx->csum_list,
0519                        struct btrfs_ordered_sum, list);
0520         list_del(&sum->list);
0521         kfree(sum);
0522     }
0523 }
0524 
0525 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
0526 {
0527     int i;
0528 
0529     if (!sctx)
0530         return;
0531 
0532     /* this can happen when scrub is cancelled */
0533     if (sctx->curr != -1) {
0534         struct scrub_bio *sbio = sctx->bios[sctx->curr];
0535 
0536         for (i = 0; i < sbio->sector_count; i++) {
0537             WARN_ON(!sbio->sectors[i]->page);
0538             scrub_block_put(sbio->sectors[i]->sblock);
0539         }
0540         bio_put(sbio->bio);
0541     }
0542 
0543     for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
0544         struct scrub_bio *sbio = sctx->bios[i];
0545 
0546         if (!sbio)
0547             break;
0548         kfree(sbio);
0549     }
0550 
0551     kfree(sctx->wr_curr_bio);
0552     scrub_free_csums(sctx);
0553     kfree(sctx);
0554 }
0555 
0556 static void scrub_put_ctx(struct scrub_ctx *sctx)
0557 {
0558     if (refcount_dec_and_test(&sctx->refs))
0559         scrub_free_ctx(sctx);
0560 }
0561 
0562 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
0563         struct btrfs_fs_info *fs_info, int is_dev_replace)
0564 {
0565     struct scrub_ctx *sctx;
0566     int     i;
0567 
0568     sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
0569     if (!sctx)
0570         goto nomem;
0571     refcount_set(&sctx->refs, 1);
0572     sctx->is_dev_replace = is_dev_replace;
0573     sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
0574     sctx->curr = -1;
0575     sctx->fs_info = fs_info;
0576     INIT_LIST_HEAD(&sctx->csum_list);
0577     for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
0578         struct scrub_bio *sbio;
0579 
0580         sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
0581         if (!sbio)
0582             goto nomem;
0583         sctx->bios[i] = sbio;
0584 
0585         sbio->index = i;
0586         sbio->sctx = sctx;
0587         sbio->sector_count = 0;
0588         INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
0589 
0590         if (i != SCRUB_BIOS_PER_SCTX - 1)
0591             sctx->bios[i]->next_free = i + 1;
0592         else
0593             sctx->bios[i]->next_free = -1;
0594     }
0595     sctx->first_free = 0;
0596     atomic_set(&sctx->bios_in_flight, 0);
0597     atomic_set(&sctx->workers_pending, 0);
0598     atomic_set(&sctx->cancel_req, 0);
0599 
0600     spin_lock_init(&sctx->list_lock);
0601     spin_lock_init(&sctx->stat_lock);
0602     init_waitqueue_head(&sctx->list_wait);
0603     sctx->throttle_deadline = 0;
0604 
0605     WARN_ON(sctx->wr_curr_bio != NULL);
0606     mutex_init(&sctx->wr_lock);
0607     sctx->wr_curr_bio = NULL;
0608     if (is_dev_replace) {
0609         WARN_ON(!fs_info->dev_replace.tgtdev);
0610         sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
0611         sctx->flush_all_writes = false;
0612     }
0613 
0614     return sctx;
0615 
0616 nomem:
0617     scrub_free_ctx(sctx);
0618     return ERR_PTR(-ENOMEM);
0619 }
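
/*
 * Editor's usage sketch (not part of the original source): the context is
 * reference counted, so a caller such as btrfs_scrub_dev() later in this file
 * does roughly:
 *
 *	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
 *	if (IS_ERR(sctx))
 *		return PTR_ERR(sctx);
 *	// ... run the scrub; workers take extra refs via scrub_pending_bio_inc() ...
 *	scrub_put_ctx(sctx);    // freed once the last reference is dropped
 */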
0620 
0621 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
0622                      void *warn_ctx)
0623 {
0624     u32 nlink;
0625     int ret;
0626     int i;
0627     unsigned nofs_flag;
0628     struct extent_buffer *eb;
0629     struct btrfs_inode_item *inode_item;
0630     struct scrub_warning *swarn = warn_ctx;
0631     struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
0632     struct inode_fs_paths *ipath = NULL;
0633     struct btrfs_root *local_root;
0634     struct btrfs_key key;
0635 
0636     local_root = btrfs_get_fs_root(fs_info, root, true);
0637     if (IS_ERR(local_root)) {
0638         ret = PTR_ERR(local_root);
0639         goto err;
0640     }
0641 
0642     /*
0643      * this makes the path point to (inum INODE_ITEM ioff)
0644      */
0645     key.objectid = inum;
0646     key.type = BTRFS_INODE_ITEM_KEY;
0647     key.offset = 0;
0648 
0649     ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
0650     if (ret) {
0651         btrfs_put_root(local_root);
0652         btrfs_release_path(swarn->path);
0653         goto err;
0654     }
0655 
0656     eb = swarn->path->nodes[0];
0657     inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
0658                     struct btrfs_inode_item);
0659     nlink = btrfs_inode_nlink(eb, inode_item);
0660     btrfs_release_path(swarn->path);
0661 
0662     /*
0663      * init_ipath() might indirectly call vmalloc, or use GFP_KERNEL. Scrub
0664      * uses GFP_NOFS in this context, so we keep it consistent but it does
0665      * not seem to be strictly necessary.
0666      */
0667     nofs_flag = memalloc_nofs_save();
0668     ipath = init_ipath(4096, local_root, swarn->path);
0669     memalloc_nofs_restore(nofs_flag);
0670     if (IS_ERR(ipath)) {
0671         btrfs_put_root(local_root);
0672         ret = PTR_ERR(ipath);
0673         ipath = NULL;
0674         goto err;
0675     }
0676     ret = paths_from_inode(inum, ipath);
0677 
0678     if (ret < 0)
0679         goto err;
0680 
0681     /*
0682      * We deliberately ignore the fact that ipath might have been too small
0683      * to hold all of the paths here.
0684      */
0685     for (i = 0; i < ipath->fspath->elem_cnt; ++i)
0686         btrfs_warn_in_rcu(fs_info,
0687 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
0688                   swarn->errstr, swarn->logical,
0689                   rcu_str_deref(swarn->dev->name),
0690                   swarn->physical,
0691                   root, inum, offset,
0692                   fs_info->sectorsize, nlink,
0693                   (char *)(unsigned long)ipath->fspath->val[i]);
0694 
0695     btrfs_put_root(local_root);
0696     free_ipath(ipath);
0697     return 0;
0698 
0699 err:
0700     btrfs_warn_in_rcu(fs_info,
0701               "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
0702               swarn->errstr, swarn->logical,
0703               rcu_str_deref(swarn->dev->name),
0704               swarn->physical,
0705               root, inum, offset, ret);
0706 
0707     free_ipath(ipath);
0708     return 0;
0709 }
0710 
0711 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
0712 {
0713     struct btrfs_device *dev;
0714     struct btrfs_fs_info *fs_info;
0715     struct btrfs_path *path;
0716     struct btrfs_key found_key;
0717     struct extent_buffer *eb;
0718     struct btrfs_extent_item *ei;
0719     struct scrub_warning swarn;
0720     unsigned long ptr = 0;
0721     u64 extent_item_pos;
0722     u64 flags = 0;
0723     u64 ref_root;
0724     u32 item_size;
0725     u8 ref_level = 0;
0726     int ret;
0727 
0728     WARN_ON(sblock->sector_count < 1);
0729     dev = sblock->sectors[0]->dev;
0730     fs_info = sblock->sctx->fs_info;
0731 
0732     path = btrfs_alloc_path();
0733     if (!path)
0734         return;
0735 
0736     swarn.physical = sblock->sectors[0]->physical;
0737     swarn.logical = sblock->sectors[0]->logical;
0738     swarn.errstr = errstr;
0739     swarn.dev = NULL;
0740 
0741     ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
0742                   &flags);
0743     if (ret < 0)
0744         goto out;
0745 
0746     extent_item_pos = swarn.logical - found_key.objectid;
0747     swarn.extent_item_size = found_key.offset;
0748 
0749     eb = path->nodes[0];
0750     ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
0751     item_size = btrfs_item_size(eb, path->slots[0]);
0752 
0753     if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
0754         do {
0755             ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
0756                               item_size, &ref_root,
0757                               &ref_level);
0758             btrfs_warn_in_rcu(fs_info,
0759 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
0760                 errstr, swarn.logical,
0761                 rcu_str_deref(dev->name),
0762                 swarn.physical,
0763                 ref_level ? "node" : "leaf",
0764                 ret < 0 ? -1 : ref_level,
0765                 ret < 0 ? -1 : ref_root);
0766         } while (ret != 1);
0767         btrfs_release_path(path);
0768     } else {
0769         btrfs_release_path(path);
0770         swarn.path = path;
0771         swarn.dev = dev;
0772         iterate_extent_inodes(fs_info, found_key.objectid,
0773                     extent_item_pos, 1,
0774                     scrub_print_warning_inode, &swarn, false);
0775     }
0776 
0777 out:
0778     btrfs_free_path(path);
0779 }
0780 
0781 static inline void scrub_get_recover(struct scrub_recover *recover)
0782 {
0783     refcount_inc(&recover->refs);
0784 }
0785 
0786 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
0787                      struct scrub_recover *recover)
0788 {
0789     if (refcount_dec_and_test(&recover->refs)) {
0790         btrfs_bio_counter_dec(fs_info);
0791         btrfs_put_bioc(recover->bioc);
0792         kfree(recover);
0793     }
0794 }
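
/*
 * Editor's lifecycle sketch (not part of the original source), matching how
 * scrub_setup_recheck_block() below uses these helpers:
 *
 *	refcount_set(&recover->refs, 1);        // creator's reference
 *	scrub_get_recover(recover);             // one ref per sector that points at it
 *	sector->recover = recover;
 *	...
 *	scrub_put_recover(fs_info, recover);    // creator drops its reference
 *
 * The final put drops the blocked bio counter, puts the bioc and frees the
 * structure.
 */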
0795 
0796 /*
0797  * scrub_handle_errored_block gets called when either verification of the
0798  * sectors failed or the bio failed to read, e.g. with EIO. In the latter
0799  * case, this function handles all sectors in the bio, even though only one
0800  * may be bad.
0801  * The goal of this function is to repair the errored block by using the
0802  * contents of one of the mirrors.
0803  */
0804 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
0805 {
0806     struct scrub_ctx *sctx = sblock_to_check->sctx;
0807     struct btrfs_device *dev;
0808     struct btrfs_fs_info *fs_info;
0809     u64 logical;
0810     unsigned int failed_mirror_index;
0811     unsigned int is_metadata;
0812     unsigned int have_csum;
0813     struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
0814     struct scrub_block *sblock_bad;
0815     int ret;
0816     int mirror_index;
0817     int sector_num;
0818     int success;
0819     bool full_stripe_locked;
0820     unsigned int nofs_flag;
0821     static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
0822                       DEFAULT_RATELIMIT_BURST);
0823 
0824     BUG_ON(sblock_to_check->sector_count < 1);
0825     fs_info = sctx->fs_info;
0826     if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
0827         /*
0828          * If we find an error in a super block, we just report it.
0829          * Super blocks will get rewritten with the next transaction
0830          * commit anyway.
0831          */
0832         spin_lock(&sctx->stat_lock);
0833         ++sctx->stat.super_errors;
0834         spin_unlock(&sctx->stat_lock);
0835         return 0;
0836     }
0837     logical = sblock_to_check->sectors[0]->logical;
0838     BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
0839     failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
0840     is_metadata = !(sblock_to_check->sectors[0]->flags &
0841             BTRFS_EXTENT_FLAG_DATA);
0842     have_csum = sblock_to_check->sectors[0]->have_csum;
0843     dev = sblock_to_check->sectors[0]->dev;
0844 
0845     if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
0846         return 0;
0847 
0848     /*
0849      * We must use GFP_NOFS because the scrub task might be waiting for a
0850      * worker task executing this function and in turn a transaction commit
0851      * might be waiting for the scrub task to pause (which needs to wait for all
0852      * the worker tasks to complete before pausing).
0853      * We do allocations in the workers through insert_full_stripe_lock()
0854      * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
0855      * this function.
0856      */
0857     nofs_flag = memalloc_nofs_save();
0858     /*
0859      * For RAID5/6, a race can happen with a different device's scrub thread.
0860      * On data corruption, the parity and data threads will both try
0861      * to recover the data.
0862      * The race can lead to a doubly added csum error, or even an
0863      * unrecoverable error.
0864      */
0865     ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
0866     if (ret < 0) {
0867         memalloc_nofs_restore(nofs_flag);
0868         spin_lock(&sctx->stat_lock);
0869         if (ret == -ENOMEM)
0870             sctx->stat.malloc_errors++;
0871         sctx->stat.read_errors++;
0872         sctx->stat.uncorrectable_errors++;
0873         spin_unlock(&sctx->stat_lock);
0874         return ret;
0875     }
0876 
0877     /*
0878      * Read all mirrors one after the other. This includes
0879      * re-reading the extent or metadata block that failed (that was
0880      * the cause that this fixup code is called) another time,
0881      * sector by sector this time in order to know which sectors
0882      * caused I/O errors and which ones are good (for all mirrors).
0883      * It is the goal to handle the situation when more than one
0884      * mirror contains I/O errors, but the errors do not
0885      * overlap, i.e. the data can be repaired by selecting the
0886      * sectors from those mirrors without I/O error on the
0887      * particular sectors. One example (with blocks >= 2 * sectorsize)
0888      * would be that mirror #1 has an I/O error on the first sector,
0889      * the second sector is good, and mirror #2 has an I/O error on
0890      * the second sector, but the first sector is good.
0891      * Then the first sector of the first mirror can be repaired by
0892      * taking the first sector of the second mirror, and the
0893      * second sector of the second mirror can be repaired by
0894      * copying the contents of the 2nd sector of the 1st mirror.
0895      * One more note: if the sectors of one mirror contain I/O
0896      * errors, the checksum cannot be verified. In order to get
0897      * the best data for repairing, the first attempt is to find
0898      * a mirror without I/O errors and with a validated checksum.
0899      * Only if this is not possible, the sectors are picked from
0900      * mirrors with I/O errors without considering the checksum.
0901      * If the latter is the case, at the end, the checksum of the
0902      * repaired area is verified in order to correctly maintain
0903      * the statistics.
0904      */
0905 
0906     sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
0907                       sizeof(*sblocks_for_recheck), GFP_KERNEL);
0908     if (!sblocks_for_recheck) {
0909         spin_lock(&sctx->stat_lock);
0910         sctx->stat.malloc_errors++;
0911         sctx->stat.read_errors++;
0912         sctx->stat.uncorrectable_errors++;
0913         spin_unlock(&sctx->stat_lock);
0914         btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
0915         goto out;
0916     }
0917 
0918     /* Setup the context, map the logical blocks and alloc the sectors */
0919     ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
0920     if (ret) {
0921         spin_lock(&sctx->stat_lock);
0922         sctx->stat.read_errors++;
0923         sctx->stat.uncorrectable_errors++;
0924         spin_unlock(&sctx->stat_lock);
0925         btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
0926         goto out;
0927     }
0928     BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
0929     sblock_bad = sblocks_for_recheck + failed_mirror_index;
0930 
0931     /* build and submit the bios for the failed mirror, check checksums */
0932     scrub_recheck_block(fs_info, sblock_bad, 1);
0933 
0934     if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
0935         sblock_bad->no_io_error_seen) {
0936         /*
0937          * The error disappeared after reading sector by sector, or
0938          * the area was part of a huge bio and other parts of the
0939          * bio caused I/O errors, or the block layer merged several
0940          * read requests into one and the error is caused by a
0941          * different bio (usually one of the two latter cases is
0942          * the cause)
0943          */
0944         spin_lock(&sctx->stat_lock);
0945         sctx->stat.unverified_errors++;
0946         sblock_to_check->data_corrected = 1;
0947         spin_unlock(&sctx->stat_lock);
0948 
0949         if (sctx->is_dev_replace)
0950             scrub_write_block_to_dev_replace(sblock_bad);
0951         goto out;
0952     }
0953 
0954     if (!sblock_bad->no_io_error_seen) {
0955         spin_lock(&sctx->stat_lock);
0956         sctx->stat.read_errors++;
0957         spin_unlock(&sctx->stat_lock);
0958         if (__ratelimit(&rs))
0959             scrub_print_warning("i/o error", sblock_to_check);
0960         btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
0961     } else if (sblock_bad->checksum_error) {
0962         spin_lock(&sctx->stat_lock);
0963         sctx->stat.csum_errors++;
0964         spin_unlock(&sctx->stat_lock);
0965         if (__ratelimit(&rs))
0966             scrub_print_warning("checksum error", sblock_to_check);
0967         btrfs_dev_stat_inc_and_print(dev,
0968                          BTRFS_DEV_STAT_CORRUPTION_ERRS);
0969     } else if (sblock_bad->header_error) {
0970         spin_lock(&sctx->stat_lock);
0971         sctx->stat.verify_errors++;
0972         spin_unlock(&sctx->stat_lock);
0973         if (__ratelimit(&rs))
0974             scrub_print_warning("checksum/header error",
0975                         sblock_to_check);
0976         if (sblock_bad->generation_error)
0977             btrfs_dev_stat_inc_and_print(dev,
0978                 BTRFS_DEV_STAT_GENERATION_ERRS);
0979         else
0980             btrfs_dev_stat_inc_and_print(dev,
0981                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
0982     }
0983 
0984     if (sctx->readonly) {
0985         ASSERT(!sctx->is_dev_replace);
0986         goto out;
0987     }
0988 
0989     /*
0990      * now build and submit the bios for the other mirrors, check
0991      * checksums.
0992      * First try to pick the mirror which is completely without I/O
0993      * errors and also does not have a checksum error.
0994      * If one is found, and if a checksum is present, the full block
0995      * that is known to contain an error is rewritten. Afterwards
0996      * the block is known to be corrected.
0997      * If a mirror is found which is completely correct, and no
0998      * checksum is present, only those sectors are rewritten that had
0999      * an I/O error in the block to be repaired, since it cannot be
1000      * determined, which copy of the other sectors is better (and it
1001      * could happen otherwise that a correct sector would be
1002      * overwritten by a bad one).
1003      */
1004     for (mirror_index = 0; ;mirror_index++) {
1005         struct scrub_block *sblock_other;
1006 
1007         if (mirror_index == failed_mirror_index)
1008             continue;
1009 
1010         /* raid56's mirror count can be more than BTRFS_MAX_MIRRORS */
1011         if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1012             if (mirror_index >= BTRFS_MAX_MIRRORS)
1013                 break;
1014             if (!sblocks_for_recheck[mirror_index].sector_count)
1015                 break;
1016 
1017             sblock_other = sblocks_for_recheck + mirror_index;
1018         } else {
1019             struct scrub_recover *r = sblock_bad->sectors[0]->recover;
1020             int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
1021 
1022             if (mirror_index >= max_allowed)
1023                 break;
1024             if (!sblocks_for_recheck[1].sector_count)
1025                 break;
1026 
1027             ASSERT(failed_mirror_index == 0);
1028             sblock_other = sblocks_for_recheck + 1;
1029             sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
1030         }
1031 
1032         /* build and submit the bios, check checksums */
1033         scrub_recheck_block(fs_info, sblock_other, 0);
1034 
1035         if (!sblock_other->header_error &&
1036             !sblock_other->checksum_error &&
1037             sblock_other->no_io_error_seen) {
1038             if (sctx->is_dev_replace) {
1039                 scrub_write_block_to_dev_replace(sblock_other);
1040                 goto corrected_error;
1041             } else {
1042                 ret = scrub_repair_block_from_good_copy(
1043                         sblock_bad, sblock_other);
1044                 if (!ret)
1045                     goto corrected_error;
1046             }
1047         }
1048     }
1049 
1050     if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1051         goto did_not_correct_error;
1052 
1053     /*
1054      * In case of I/O errors in the area that is supposed to be
1055      * repaired, continue by picking good copies of those sectors.
1056      * Select the good sectors from mirrors to rewrite bad sectors from
1057      * the area to fix. Afterwards verify the checksum of the block
1058      * that is supposed to be repaired. This verification step is
1059      * only done for the purpose of statistic counting and for the
1060      * final scrub report on whether errors remain.
1061      * A perfect algorithm could make use of the checksum and try
1062      * all possible combinations of sectors from the different mirrors
1063      * until the checksum verification succeeds. For example, when
1064      * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
1065      * of mirror #2 is readable but the final checksum test fails,
1066      * then the 2nd sector of mirror #3 could be tried, to see whether
1067      * the final checksum then succeeds. But this would be a rare
1068      * exception and is therefore not implemented. At least it is
1069      * avoided that the good copy is overwritten.
1070      * A more useful improvement would be to pick the sectors
1071      * without I/O error based on sector sizes (512 bytes on legacy
1072      * disks) instead of on sectorsize. Then maybe 512 byte of one
1073      * mirror could be repaired by taking 512 byte of a different
1074      * mirror, even if other 512 byte sectors in the same sectorsize
1075      * area are unreadable.
1076      */
1077     success = 1;
1078     for (sector_num = 0; sector_num < sblock_bad->sector_count;
1079          sector_num++) {
1080         struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1081         struct scrub_block *sblock_other = NULL;
1082 
1083         /* Skip no-io-error sectors in scrub */
1084         if (!sector_bad->io_error && !sctx->is_dev_replace)
1085             continue;
1086 
1087         if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1088             /*
1089              * In case of dev replace, if the raid56 rebuild process
1090              * didn't produce correct data, then copy the content
1091              * in sblock_bad to make sure the target device is identical
1092              * to the source device, instead of writing garbage data in
1093              * sblock_for_recheck array to target device.
1094              */
1095             sblock_other = NULL;
1096         } else if (sector_bad->io_error) {
1097             /* Try to find no-io-error sector in mirrors */
1098             for (mirror_index = 0;
1099                  mirror_index < BTRFS_MAX_MIRRORS &&
1100                  sblocks_for_recheck[mirror_index].sector_count > 0;
1101                  mirror_index++) {
1102                 if (!sblocks_for_recheck[mirror_index].
1103                     sectors[sector_num]->io_error) {
1104                     sblock_other = sblocks_for_recheck +
1105                                mirror_index;
1106                     break;
1107                 }
1108             }
1109             if (!sblock_other)
1110                 success = 0;
1111         }
1112 
1113         if (sctx->is_dev_replace) {
1114             /*
1115              * Did not find a mirror to fetch the sector from.
1116              * scrub_write_sector_to_dev_replace() handles this
1117              * case (sector->io_error), by filling the block with
1118              * zeros before submitting the write request
1119              */
1120             if (!sblock_other)
1121                 sblock_other = sblock_bad;
1122 
1123             if (scrub_write_sector_to_dev_replace(sblock_other,
1124                                   sector_num) != 0) {
1125                 atomic64_inc(
1126                     &fs_info->dev_replace.num_write_errors);
1127                 success = 0;
1128             }
1129         } else if (sblock_other) {
1130             ret = scrub_repair_sector_from_good_copy(sblock_bad,
1131                                  sblock_other,
1132                                  sector_num, 0);
1133             if (0 == ret)
1134                 sector_bad->io_error = 0;
1135             else
1136                 success = 0;
1137         }
1138     }
1139 
1140     if (success && !sctx->is_dev_replace) {
1141         if (is_metadata || have_csum) {
1142             /*
1143              * need to verify the checksum now that all
1144              * sectors on disk are repaired (the write
1145              * request for data to be repaired is on its way).
1146              * Just be lazy and use scrub_recheck_block()
1147              * which re-reads the data before the checksum
1148              * is verified, but most likely the data comes out
1149              * of the page cache.
1150              */
1151             scrub_recheck_block(fs_info, sblock_bad, 1);
1152             if (!sblock_bad->header_error &&
1153                 !sblock_bad->checksum_error &&
1154                 sblock_bad->no_io_error_seen)
1155                 goto corrected_error;
1156             else
1157                 goto did_not_correct_error;
1158         } else {
1159 corrected_error:
1160             spin_lock(&sctx->stat_lock);
1161             sctx->stat.corrected_errors++;
1162             sblock_to_check->data_corrected = 1;
1163             spin_unlock(&sctx->stat_lock);
1164             btrfs_err_rl_in_rcu(fs_info,
1165                 "fixed up error at logical %llu on dev %s",
1166                 logical, rcu_str_deref(dev->name));
1167         }
1168     } else {
1169 did_not_correct_error:
1170         spin_lock(&sctx->stat_lock);
1171         sctx->stat.uncorrectable_errors++;
1172         spin_unlock(&sctx->stat_lock);
1173         btrfs_err_rl_in_rcu(fs_info,
1174             "unable to fixup (regular) error at logical %llu on dev %s",
1175             logical, rcu_str_deref(dev->name));
1176     }
1177 
1178 out:
1179     if (sblocks_for_recheck) {
1180         for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1181              mirror_index++) {
1182             struct scrub_block *sblock = sblocks_for_recheck +
1183                              mirror_index;
1184             struct scrub_recover *recover;
1185             int i;
1186 
1187             for (i = 0; i < sblock->sector_count; i++) {
1188                 sblock->sectors[i]->sblock = NULL;
1189                 recover = sblock->sectors[i]->recover;
1190                 if (recover) {
1191                     scrub_put_recover(fs_info, recover);
1192                     sblock->sectors[i]->recover = NULL;
1193                 }
1194                 scrub_sector_put(sblock->sectors[i]);
1195             }
1196         }
1197         kfree(sblocks_for_recheck);
1198     }
1199 
1200     ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1201     memalloc_nofs_restore(nofs_flag);
1202     if (ret < 0)
1203         return ret;
1204     return 0;
1205 }
1206 
1207 static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
1208 {
1209     if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1210         return 2;
1211     else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1212         return 3;
1213     else
1214         return (int)bioc->num_stripes;
1215 }
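
/*
 * Editor's note (not part of the original source): for RAID5 the data can be
 * obtained in two ways (the stripe itself, or a rebuild from the remaining
 * data stripes plus P), hence 2 "mirrors"; RAID6 adds the Q stripe for a third
 * way. For other profiles each returned stripe is a real copy.
 */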
1216 
1217 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1218                          u64 *raid_map,
1219                          int nstripes, int mirror,
1220                          int *stripe_index,
1221                          u64 *stripe_offset)
1222 {
1223     int i;
1224 
1225     if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1226         /* RAID5/6 */
1227         for (i = 0; i < nstripes; i++) {
1228             if (raid_map[i] == RAID6_Q_STRIPE ||
1229                 raid_map[i] == RAID5_P_STRIPE)
1230                 continue;
1231 
1232             if (logical >= raid_map[i] &&
1233                 logical < raid_map[i] + BTRFS_STRIPE_LEN)
1234                 break;
1235         }
1236 
1237         *stripe_index = i;
1238         *stripe_offset = logical - raid_map[i];
1239     } else {
1240         /* The other RAID type */
1241         *stripe_index = mirror;
1242         *stripe_offset = 0;
1243     }
1244 }
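
/*
 * Editor's worked example (not part of the original source), assuming RAID5
 * with BTRFS_STRIPE_LEN = 64KiB and raid_map = { 10MiB, 10MiB + 64KiB,
 * RAID5_P_STRIPE }: for logical = 10MiB + 80KiB the loop matches i = 1
 * (10MiB + 64KiB <= logical < 10MiB + 128KiB) and yields *stripe_index = 1,
 * *stripe_offset = 16KiB; P and Q slots are skipped by the RAID5_P_STRIPE /
 * RAID6_Q_STRIPE check. For non-RAID56 profiles the mirror number is used
 * directly and the offset is 0.
 */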
1245 
1246 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1247                      struct scrub_block *sblocks_for_recheck)
1248 {
1249     struct scrub_ctx *sctx = original_sblock->sctx;
1250     struct btrfs_fs_info *fs_info = sctx->fs_info;
1251     u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
1252     u64 logical = original_sblock->sectors[0]->logical;
1253     u64 generation = original_sblock->sectors[0]->generation;
1254     u64 flags = original_sblock->sectors[0]->flags;
1255     u64 have_csum = original_sblock->sectors[0]->have_csum;
1256     struct scrub_recover *recover;
1257     struct btrfs_io_context *bioc;
1258     u64 sublen;
1259     u64 mapped_length;
1260     u64 stripe_offset;
1261     int stripe_index;
1262     int sector_index = 0;
1263     int mirror_index;
1264     int nmirrors;
1265     int ret;
1266 
1267     /*
1268      * Note: the two members refs and outstanding_sectors are not used (and
1269      * not set) in the blocks that are used for the recheck procedure.
1270      */
1271 
1272     while (length > 0) {
1273         sublen = min_t(u64, length, fs_info->sectorsize);
1274         mapped_length = sublen;
1275         bioc = NULL;
1276 
1277         /*
1278          * With a length of sectorsize, each returned stripe represents
1279          * one mirror
1280          */
1281         btrfs_bio_counter_inc_blocked(fs_info);
1282         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1283                        logical, &mapped_length, &bioc);
1284         if (ret || !bioc || mapped_length < sublen) {
1285             btrfs_put_bioc(bioc);
1286             btrfs_bio_counter_dec(fs_info);
1287             return -EIO;
1288         }
1289 
1290         recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1291         if (!recover) {
1292             btrfs_put_bioc(bioc);
1293             btrfs_bio_counter_dec(fs_info);
1294             return -ENOMEM;
1295         }
1296 
1297         refcount_set(&recover->refs, 1);
1298         recover->bioc = bioc;
1299         recover->map_length = mapped_length;
1300 
1301         ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
1302 
1303         nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
1304 
1305         for (mirror_index = 0; mirror_index < nmirrors;
1306              mirror_index++) {
1307             struct scrub_block *sblock;
1308             struct scrub_sector *sector;
1309 
1310             sblock = sblocks_for_recheck + mirror_index;
1311             sblock->sctx = sctx;
1312 
1313             sector = kzalloc(sizeof(*sector), GFP_NOFS);
1314             if (!sector) {
1315 leave_nomem:
1316                 spin_lock(&sctx->stat_lock);
1317                 sctx->stat.malloc_errors++;
1318                 spin_unlock(&sctx->stat_lock);
1319                 scrub_put_recover(fs_info, recover);
1320                 return -ENOMEM;
1321             }
1322             scrub_sector_get(sector);
1323             sblock->sectors[sector_index] = sector;
1324             sector->sblock = sblock;
1325             sector->flags = flags;
1326             sector->generation = generation;
1327             sector->logical = logical;
1328             sector->have_csum = have_csum;
1329             if (have_csum)
1330                 memcpy(sector->csum,
1331                        original_sblock->sectors[0]->csum,
1332                        sctx->fs_info->csum_size);
1333 
1334             scrub_stripe_index_and_offset(logical,
1335                               bioc->map_type,
1336                               bioc->raid_map,
1337                               bioc->num_stripes -
1338                               bioc->num_tgtdevs,
1339                               mirror_index,
1340                               &stripe_index,
1341                               &stripe_offset);
1342             sector->physical = bioc->stripes[stripe_index].physical +
1343                      stripe_offset;
1344             sector->dev = bioc->stripes[stripe_index].dev;
1345 
1346             BUG_ON(sector_index >= original_sblock->sector_count);
1347             sector->physical_for_dev_replace =
1348                 original_sblock->sectors[sector_index]->
1349                 physical_for_dev_replace;
1350             /* For missing devices, dev->bdev is NULL */
1351             sector->mirror_num = mirror_index + 1;
1352             sblock->sector_count++;
1353             sector->page = alloc_page(GFP_NOFS);
1354             if (!sector->page)
1355                 goto leave_nomem;
1356 
1357             scrub_get_recover(recover);
1358             sector->recover = recover;
1359         }
1360         scrub_put_recover(fs_info, recover);
1361         length -= sublen;
1362         logical += sublen;
1363         sector_index++;
1364     }
1365 
1366     return 0;
1367 }
1368 
1369 static void scrub_bio_wait_endio(struct bio *bio)
1370 {
1371     complete(bio->bi_private);
1372 }
1373 
1374 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1375                     struct bio *bio,
1376                     struct scrub_sector *sector)
1377 {
1378     DECLARE_COMPLETION_ONSTACK(done);
1379 
1380     bio->bi_iter.bi_sector = sector->logical >> 9;
1381     bio->bi_private = &done;
1382     bio->bi_end_io = scrub_bio_wait_endio;
1383     raid56_parity_recover(bio, sector->recover->bioc,
1384                   sector->sblock->sectors[0]->mirror_num, false);
1385 
1386     wait_for_completion_io(&done);
1387     return blk_status_to_errno(bio->bi_status);
1388 }
1389 
1390 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1391                       struct scrub_block *sblock)
1392 {
1393     struct scrub_sector *first_sector = sblock->sectors[0];
1394     struct bio *bio;
1395     int i;
1396 
1397     /* All sectors in sblock belong to the same stripe on the same device. */
1398     ASSERT(first_sector->dev);
1399     if (!first_sector->dev->bdev)
1400         goto out;
1401 
1402     bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
1403 
1404     for (i = 0; i < sblock->sector_count; i++) {
1405         struct scrub_sector *sector = sblock->sectors[i];
1406 
1407         WARN_ON(!sector->page);
1408         bio_add_page(bio, sector->page, PAGE_SIZE, 0);
1409     }
1410 
1411     if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
1412         bio_put(bio);
1413         goto out;
1414     }
1415 
1416     bio_put(bio);
1417 
1418     scrub_recheck_block_checksum(sblock);
1419 
1420     return;
1421 out:
1422     for (i = 0; i < sblock->sector_count; i++)
1423         sblock->sectors[i]->io_error = 1;
1424 
1425     sblock->no_io_error_seen = 0;
1426 }
1427 
1428 /*
1429  * This function will check the on disk data for checksum errors, header errors
1430  * and read I/O errors. If any I/O errors happen, the exact sectors which are
1431  * errored are marked as being bad. The goal is to enable scrub to take those
1432  * sectors that are not errored from all the mirrors so that the sectors that
1433  * are errored in the just handled mirror can be repaired.
1434  */
1435 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1436                 struct scrub_block *sblock,
1437                 int retry_failed_mirror)
1438 {
1439     int i;
1440 
1441     sblock->no_io_error_seen = 1;
1442 
1443     /* short cut for raid56 */
1444     if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
1445         return scrub_recheck_block_on_raid56(fs_info, sblock);
1446 
1447     for (i = 0; i < sblock->sector_count; i++) {
1448         struct scrub_sector *sector = sblock->sectors[i];
1449         struct bio bio;
1450         struct bio_vec bvec;
1451 
1452         if (sector->dev->bdev == NULL) {
1453             sector->io_error = 1;
1454             sblock->no_io_error_seen = 0;
1455             continue;
1456         }
1457 
1458         WARN_ON(!sector->page);
1459         bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
1460         bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
1461         bio.bi_iter.bi_sector = sector->physical >> 9;
1462 
1463         btrfsic_check_bio(&bio);
1464         if (submit_bio_wait(&bio)) {
1465             sector->io_error = 1;
1466             sblock->no_io_error_seen = 0;
1467         }
1468 
1469         bio_uninit(&bio);
1470     }
1471 
1472     if (sblock->no_io_error_seen)
1473         scrub_recheck_block_checksum(sblock);
1474 }
1475 
1476 static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
1477 {
1478     struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
1479     int ret;
1480 
1481     ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1482     return !ret;
1483 }
1484 
1485 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1486 {
1487     sblock->header_error = 0;
1488     sblock->checksum_error = 0;
1489     sblock->generation_error = 0;
1490 
1491     if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1492         scrub_checksum_data(sblock);
1493     else
1494         scrub_checksum_tree_block(sblock);
1495 }
1496 
1497 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1498                          struct scrub_block *sblock_good)
1499 {
1500     int i;
1501     int ret = 0;
1502 
1503     for (i = 0; i < sblock_bad->sector_count; i++) {
1504         int ret_sub;
1505 
1506         ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
1507                                  sblock_good, i, 1);
1508         if (ret_sub)
1509             ret = ret_sub;
1510     }
1511 
1512     return ret;
1513 }
1514 
1515 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
1516                           struct scrub_block *sblock_good,
1517                           int sector_num, int force_write)
1518 {
1519     struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1520     struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
1521     struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1522     const u32 sectorsize = fs_info->sectorsize;
1523 
1524     BUG_ON(sector_bad->page == NULL);
1525     BUG_ON(sector_good->page == NULL);
1526     if (force_write || sblock_bad->header_error ||
1527         sblock_bad->checksum_error || sector_bad->io_error) {
1528         struct bio bio;
1529         struct bio_vec bvec;
1530         int ret;
1531 
1532         if (!sector_bad->dev->bdev) {
1533             btrfs_warn_rl(fs_info,
1534                 "scrub_repair_sector_from_good_copy(bdev == NULL) is unexpected");
1535             return -EIO;
1536         }
1537 
1538         bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
1539         bio.bi_iter.bi_sector = sector_bad->physical >> 9;
1540         __bio_add_page(&bio, sector_good->page, sectorsize, 0);
1541 
1542         btrfsic_check_bio(&bio);
1543         ret = submit_bio_wait(&bio);
1544         bio_uninit(&bio);
1545 
1546         if (ret) {
1547             btrfs_dev_stat_inc_and_print(sector_bad->dev,
1548                 BTRFS_DEV_STAT_WRITE_ERRS);
1549             atomic64_inc(&fs_info->dev_replace.num_write_errors);
1550             return -EIO;
1551         }
1552     }
1553 
1554     return 0;
1555 }
1556 
1557 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1558 {
1559     struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1560     int i;
1561 
1562     /*
1563      * This block is used for checking the parity on the source device,
1564      * so its data does not need to be written to the destination device.
1565      */
1566     if (sblock->sparity)
1567         return;
1568 
1569     for (i = 0; i < sblock->sector_count; i++) {
1570         int ret;
1571 
1572         ret = scrub_write_sector_to_dev_replace(sblock, i);
1573         if (ret)
1574             atomic64_inc(&fs_info->dev_replace.num_write_errors);
1575     }
1576 }
1577 
1578 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
1579 {
1580     struct scrub_sector *sector = sblock->sectors[sector_num];
1581 
1582     BUG_ON(sector->page == NULL);
1583     if (sector->io_error)
1584         clear_page(page_address(sector->page));
1585 
1586     return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
1587 }
1588 
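/*
 * On zoned filesystems the dev-replace target is written sequentially. If the
 * cached write pointer lags behind @physical, zero out the gap so that the
 * next write queued by scrub_add_sector_to_wr_bio() lands exactly at
 * @physical.
 */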
1589 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1590 {
1591     int ret = 0;
1592     u64 length;
1593 
1594     if (!btrfs_is_zoned(sctx->fs_info))
1595         return 0;
1596 
1597     if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1598         return 0;
1599 
1600     if (sctx->write_pointer < physical) {
1601         length = physical - sctx->write_pointer;
1602 
1603         ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1604                         sctx->write_pointer, length);
1605         if (!ret)
1606             sctx->write_pointer = physical;
1607     }
1608     return ret;
1609 }
1610 
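/*
 * Queue one sector for writing to the dev-replace target. Sectors are batched
 * into sctx->wr_curr_bio as long as they stay physically and logically
 * contiguous with what the bio already holds; otherwise, or once the bio
 * reaches sectors_per_bio sectors, the current bio is submitted and a new one
 * is started.
 */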
1611 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
1612                       struct scrub_sector *sector)
1613 {
1614     struct scrub_bio *sbio;
1615     int ret;
1616     const u32 sectorsize = sctx->fs_info->sectorsize;
1617 
1618     mutex_lock(&sctx->wr_lock);
1619 again:
1620     if (!sctx->wr_curr_bio) {
1621         sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1622                           GFP_KERNEL);
1623         if (!sctx->wr_curr_bio) {
1624             mutex_unlock(&sctx->wr_lock);
1625             return -ENOMEM;
1626         }
1627         sctx->wr_curr_bio->sctx = sctx;
1628         sctx->wr_curr_bio->sector_count = 0;
1629     }
1630     sbio = sctx->wr_curr_bio;
1631     if (sbio->sector_count == 0) {
1632         ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
1633         if (ret) {
1634             mutex_unlock(&sctx->wr_lock);
1635             return ret;
1636         }
1637 
1638         sbio->physical = sector->physical_for_dev_replace;
1639         sbio->logical = sector->logical;
1640         sbio->dev = sctx->wr_tgtdev;
1641         if (!sbio->bio) {
1642             sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
1643                           REQ_OP_WRITE, GFP_NOFS);
1644         }
1645         sbio->bio->bi_private = sbio;
1646         sbio->bio->bi_end_io = scrub_wr_bio_end_io;
1647         sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
1648         sbio->status = 0;
1649     } else if (sbio->physical + sbio->sector_count * sectorsize !=
1650            sector->physical_for_dev_replace ||
1651            sbio->logical + sbio->sector_count * sectorsize !=
1652            sector->logical) {
1653         scrub_wr_submit(sctx);
1654         goto again;
1655     }
1656 
1657     ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
1658     if (ret != sectorsize) {
1659         if (sbio->sector_count < 1) {
1660             bio_put(sbio->bio);
1661             sbio->bio = NULL;
1662             mutex_unlock(&sctx->wr_lock);
1663             return -EIO;
1664         }
1665         scrub_wr_submit(sctx);
1666         goto again;
1667     }
1668 
1669     sbio->sectors[sbio->sector_count] = sector;
1670     scrub_sector_get(sector);
1671     sbio->sector_count++;
1672     if (sbio->sector_count == sctx->sectors_per_bio)
1673         scrub_wr_submit(sctx);
1674     mutex_unlock(&sctx->wr_lock);
1675 
1676     return 0;
1677 }
1678 
1679 static void scrub_wr_submit(struct scrub_ctx *sctx)
1680 {
1681     struct scrub_bio *sbio;
1682 
1683     if (!sctx->wr_curr_bio)
1684         return;
1685 
1686     sbio = sctx->wr_curr_bio;
1687     sctx->wr_curr_bio = NULL;
1688     scrub_pending_bio_inc(sctx);
1689     /* Process all writes in a single worker thread. The block layer then
1690      * orders the requests before sending them to the driver, which
1691      * doubled the write performance on spinning disks when measured
1692      * with Linux 3.5. */
1693     btrfsic_check_bio(sbio->bio);
1694     submit_bio(sbio->bio);
1695 
1696     if (btrfs_is_zoned(sctx->fs_info))
1697         sctx->write_pointer = sbio->physical + sbio->sector_count *
1698             sctx->fs_info->sectorsize;
1699 }
1700 
1701 static void scrub_wr_bio_end_io(struct bio *bio)
1702 {
1703     struct scrub_bio *sbio = bio->bi_private;
1704     struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1705 
1706     sbio->status = bio->bi_status;
1707     sbio->bio = bio;
1708 
1709     INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
1710     queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1711 }
1712 
1713 static void scrub_wr_bio_end_io_worker(struct work_struct *work)
1714 {
1715     struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1716     struct scrub_ctx *sctx = sbio->sctx;
1717     int i;
1718 
1719     ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
1720     if (sbio->status) {
1721         struct btrfs_dev_replace *dev_replace =
1722             &sbio->sctx->fs_info->dev_replace;
1723 
1724         for (i = 0; i < sbio->sector_count; i++) {
1725             struct scrub_sector *sector = sbio->sectors[i];
1726 
1727             sector->io_error = 1;
1728             atomic64_inc(&dev_replace->num_write_errors);
1729         }
1730     }
1731 
1732     for (i = 0; i < sbio->sector_count; i++)
1733         scrub_sector_put(sbio->sectors[i]);
1734 
1735     bio_put(sbio->bio);
1736     kfree(sbio);
1737     scrub_pending_bio_dec(sctx);
1738 }
1739 
1740 static int scrub_checksum(struct scrub_block *sblock)
1741 {
1742     u64 flags;
1743     int ret;
1744 
1745     /*
1746      * No need to initialize these stats currently,
1747      * because this function only uses the return value
1748      * instead of these stat values.
1749      *
1750      * Todo:
1751      * always use stats
1752      */
1753     sblock->header_error = 0;
1754     sblock->generation_error = 0;
1755     sblock->checksum_error = 0;
1756 
1757     WARN_ON(sblock->sector_count < 1);
1758     flags = sblock->sectors[0]->flags;
1759     ret = 0;
1760     if (flags & BTRFS_EXTENT_FLAG_DATA)
1761         ret = scrub_checksum_data(sblock);
1762     else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1763         ret = scrub_checksum_tree_block(sblock);
1764     else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1765         (void)scrub_checksum_super(sblock);
1766     else
1767         WARN_ON(1);
1768     if (ret)
1769         scrub_handle_errored_block(sblock);
1770 
1771     return ret;
1772 }
1773 
1774 static int scrub_checksum_data(struct scrub_block *sblock)
1775 {
1776     struct scrub_ctx *sctx = sblock->sctx;
1777     struct btrfs_fs_info *fs_info = sctx->fs_info;
1778     SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1779     u8 csum[BTRFS_CSUM_SIZE];
1780     struct scrub_sector *sector;
1781     char *kaddr;
1782 
1783     BUG_ON(sblock->sector_count < 1);
1784     sector = sblock->sectors[0];
1785     if (!sector->have_csum)
1786         return 0;
1787 
1788     kaddr = page_address(sector->page);
1789 
1790     shash->tfm = fs_info->csum_shash;
1791     crypto_shash_init(shash);
1792 
1793     /*
1794      * In scrub_sectors() and scrub_sectors_for_parity() we ensure each
1795      * scrub_sector's page only contains one sector of data.
1796      */
1797     crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1798 
1799     if (memcmp(csum, sector->csum, fs_info->csum_size))
1800         sblock->checksum_error = 1;
1801     return sblock->checksum_error;
1802 }
1803 
1804 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1805 {
1806     struct scrub_ctx *sctx = sblock->sctx;
1807     struct btrfs_header *h;
1808     struct btrfs_fs_info *fs_info = sctx->fs_info;
1809     SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1810     u8 calculated_csum[BTRFS_CSUM_SIZE];
1811     u8 on_disk_csum[BTRFS_CSUM_SIZE];
1812     /*
1813      * This is done in sectorsize steps even for metadata as there's a
1814      * constraint for nodesize to be aligned to sectorsize. This will need
1815      * to change so we don't misuse data and metadata units like that.
1816      */
1817     const u32 sectorsize = sctx->fs_info->sectorsize;
1818     const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1819     int i;
1820     struct scrub_sector *sector;
1821     char *kaddr;
1822 
1823     BUG_ON(sblock->sector_count < 1);
1824 
1825     /* Each member in sectors is just one sector */
1826     ASSERT(sblock->sector_count == num_sectors);
1827 
1828     sector = sblock->sectors[0];
1829     kaddr = page_address(sector->page);
1830     h = (struct btrfs_header *)kaddr;
1831     memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1832 
1833     /*
1834      * we don't use the getter functions here, as we
1835      * a) don't have an extent buffer and
1836      * b) the page is already kmapped
1837      */
1838     if (sector->logical != btrfs_stack_header_bytenr(h))
1839         sblock->header_error = 1;
1840 
1841     if (sector->generation != btrfs_stack_header_generation(h)) {
1842         sblock->header_error = 1;
1843         sblock->generation_error = 1;
1844     }
1845 
1846     if (!scrub_check_fsid(h->fsid, sector))
1847         sblock->header_error = 1;
1848 
1849     if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1850            BTRFS_UUID_SIZE))
1851         sblock->header_error = 1;
1852 
1853     shash->tfm = fs_info->csum_shash;
1854     crypto_shash_init(shash);
1855     crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1856                 sectorsize - BTRFS_CSUM_SIZE);
1857 
1858     for (i = 1; i < num_sectors; i++) {
1859         kaddr = page_address(sblock->sectors[i]->page);
1860         crypto_shash_update(shash, kaddr, sectorsize);
1861     }
1862 
1863     crypto_shash_final(shash, calculated_csum);
1864     if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1865         sblock->checksum_error = 1;
1866 
1867     return sblock->header_error || sblock->checksum_error;
1868 }
1869 
1870 static int scrub_checksum_super(struct scrub_block *sblock)
1871 {
1872     struct btrfs_super_block *s;
1873     struct scrub_ctx *sctx = sblock->sctx;
1874     struct btrfs_fs_info *fs_info = sctx->fs_info;
1875     SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1876     u8 calculated_csum[BTRFS_CSUM_SIZE];
1877     struct scrub_sector *sector;
1878     char *kaddr;
1879     int fail_gen = 0;
1880     int fail_cor = 0;
1881 
1882     BUG_ON(sblock->sector_count < 1);
1883     sector = sblock->sectors[0];
1884     kaddr = page_address(sector->page);
1885     s = (struct btrfs_super_block *)kaddr;
1886 
1887     if (sector->logical != btrfs_super_bytenr(s))
1888         ++fail_cor;
1889 
1890     if (sector->generation != btrfs_super_generation(s))
1891         ++fail_gen;
1892 
1893     if (!scrub_check_fsid(s->fsid, sector))
1894         ++fail_cor;
1895 
1896     shash->tfm = fs_info->csum_shash;
1897     crypto_shash_init(shash);
1898     crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1899             BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1900 
1901     if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1902         ++fail_cor;
1903 
1904     if (fail_cor + fail_gen) {
1905         /*
1906          * If we find an error in a super block, we just report it.
1907          * The super blocks will get rewritten with the next
1908          * transaction commit anyway.
1909          */
1910         spin_lock(&sctx->stat_lock);
1911         ++sctx->stat.super_errors;
1912         spin_unlock(&sctx->stat_lock);
1913         if (fail_cor)
1914             btrfs_dev_stat_inc_and_print(sector->dev,
1915                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1916         else
1917             btrfs_dev_stat_inc_and_print(sector->dev,
1918                 BTRFS_DEV_STAT_GENERATION_ERRS);
1919     }
1920 
1921     return fail_cor + fail_gen;
1922 }
1923 
1924 static void scrub_block_get(struct scrub_block *sblock)
1925 {
1926     refcount_inc(&sblock->refs);
1927 }
1928 
1929 static void scrub_block_put(struct scrub_block *sblock)
1930 {
1931     if (refcount_dec_and_test(&sblock->refs)) {
1932         int i;
1933 
1934         if (sblock->sparity)
1935             scrub_parity_put(sblock->sparity);
1936 
1937         for (i = 0; i < sblock->sector_count; i++)
1938             scrub_sector_put(sblock->sectors[i]);
1939         kfree(sblock);
1940     }
1941 }
1942 
1943 static void scrub_sector_get(struct scrub_sector *sector)
1944 {
1945     atomic_inc(&sector->refs);
1946 }
1947 
1948 static void scrub_sector_put(struct scrub_sector *sector)
1949 {
1950     if (atomic_dec_and_test(&sector->refs)) {
1951         if (sector->page)
1952             __free_page(sector->page);
1953         kfree(sector);
1954     }
1955 }
1956 
1957 /*
1958  * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
1959  * second.  Limit can be set via /sys/fs/btrfs/UUID/devinfo/devid/scrub_speed_max.
1960  */
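/*
 * For example (rough arithmetic, not enforced below): with scrub_speed_max set
 * to 100MiB/s, div = min(64, 100MiB / 16MiB) = 6, so each interval lasts about
 * 1000ms / 6 and permits roughly 100MiB / 6 worth of IO before the submitter
 * sleeps until the interval's deadline.
 */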
1961 static void scrub_throttle(struct scrub_ctx *sctx)
1962 {
1963     const int time_slice = 1000;
1964     struct scrub_bio *sbio;
1965     struct btrfs_device *device;
1966     s64 delta;
1967     ktime_t now;
1968     u32 div;
1969     u64 bwlimit;
1970 
1971     sbio = sctx->bios[sctx->curr];
1972     device = sbio->dev;
1973     bwlimit = READ_ONCE(device->scrub_speed_max);
1974     if (bwlimit == 0)
1975         return;
1976 
1977     /*
1978      * The time slice is divided into intervals as the IO is submitted; the
1979      * number of intervals scales with bwlimit and is capped at 64.
1980      */
1981     div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
1982     div = min_t(u32, 64, div);
1983 
1984     /* Start new epoch, set deadline */
1985     now = ktime_get();
1986     if (sctx->throttle_deadline == 0) {
1987         sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
1988         sctx->throttle_sent = 0;
1989     }
1990 
1991     /* Still in the time to send? */
1992     if (ktime_before(now, sctx->throttle_deadline)) {
1993         /* If current bio is within the limit, send it */
1994         sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
1995         if (sctx->throttle_sent <= div_u64(bwlimit, div))
1996             return;
1997 
1998         /* We're over the limit, sleep until the rest of the slice */
1999         delta = ktime_ms_delta(sctx->throttle_deadline, now);
2000     } else {
2001         /* New request after deadline, start new epoch */
2002         delta = 0;
2003     }
2004 
2005     if (delta) {
2006         long timeout;
2007 
2008         timeout = div_u64(delta * HZ, 1000);
2009         schedule_timeout_interruptible(timeout);
2010     }
2011 
2012     /* Next call will start the deadline period */
2013     sctx->throttle_deadline = 0;
2014 }
2015 
2016 static void scrub_submit(struct scrub_ctx *sctx)
2017 {
2018     struct scrub_bio *sbio;
2019 
2020     if (sctx->curr == -1)
2021         return;
2022 
2023     scrub_throttle(sctx);
2024 
2025     sbio = sctx->bios[sctx->curr];
2026     sctx->curr = -1;
2027     scrub_pending_bio_inc(sctx);
2028     btrfsic_check_bio(sbio->bio);
2029     submit_bio(sbio->bio);
2030 }
2031 
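/*
 * Read-side counterpart of scrub_add_sector_to_wr_bio(): grab (or wait for) a
 * free scrub_bio and append @sector to it as long as the device, physical and
 * logical addresses stay contiguous; otherwise submit the bio and start over.
 */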
2032 static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
2033                       struct scrub_sector *sector)
2034 {
2035     struct scrub_block *sblock = sector->sblock;
2036     struct scrub_bio *sbio;
2037     const u32 sectorsize = sctx->fs_info->sectorsize;
2038     int ret;
2039 
2040 again:
2041     /*
2042      * grab a fresh bio or wait for one to become available
2043      */
2044     while (sctx->curr == -1) {
2045         spin_lock(&sctx->list_lock);
2046         sctx->curr = sctx->first_free;
2047         if (sctx->curr != -1) {
2048             sctx->first_free = sctx->bios[sctx->curr]->next_free;
2049             sctx->bios[sctx->curr]->next_free = -1;
2050             sctx->bios[sctx->curr]->sector_count = 0;
2051             spin_unlock(&sctx->list_lock);
2052         } else {
2053             spin_unlock(&sctx->list_lock);
2054             wait_event(sctx->list_wait, sctx->first_free != -1);
2055         }
2056     }
2057     sbio = sctx->bios[sctx->curr];
2058     if (sbio->sector_count == 0) {
2059         sbio->physical = sector->physical;
2060         sbio->logical = sector->logical;
2061         sbio->dev = sector->dev;
2062         if (!sbio->bio) {
2063             sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
2064                           REQ_OP_READ, GFP_NOFS);
2065         }
2066         sbio->bio->bi_private = sbio;
2067         sbio->bio->bi_end_io = scrub_bio_end_io;
2068         sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
2069         sbio->status = 0;
2070     } else if (sbio->physical + sbio->sector_count * sectorsize !=
2071            sector->physical ||
2072            sbio->logical + sbio->sector_count * sectorsize !=
2073            sector->logical ||
2074            sbio->dev != sector->dev) {
2075         scrub_submit(sctx);
2076         goto again;
2077     }
2078 
2079     sbio->sectors[sbio->sector_count] = sector;
2080     ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
2081     if (ret != sectorsize) {
2082         if (sbio->sector_count < 1) {
2083             bio_put(sbio->bio);
2084             sbio->bio = NULL;
2085             return -EIO;
2086         }
2087         scrub_submit(sctx);
2088         goto again;
2089     }
2090 
2091     scrub_block_get(sblock); /* one for the page added to the bio */
2092     atomic_inc(&sblock->outstanding_sectors);
2093     sbio->sector_count++;
2094     if (sbio->sector_count == sctx->sectors_per_bio)
2095         scrub_submit(sctx);
2096 
2097     return 0;
2098 }
2099 
2100 static void scrub_missing_raid56_end_io(struct bio *bio)
2101 {
2102     struct scrub_block *sblock = bio->bi_private;
2103     struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2104 
2105     if (bio->bi_status)
2106         sblock->no_io_error_seen = 0;
2107 
2108     bio_put(bio);
2109 
2110     queue_work(fs_info->scrub_workers, &sblock->work);
2111 }
2112 
2113 static void scrub_missing_raid56_worker(struct work_struct *work)
2114 {
2115     struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2116     struct scrub_ctx *sctx = sblock->sctx;
2117     struct btrfs_fs_info *fs_info = sctx->fs_info;
2118     u64 logical;
2119     struct btrfs_device *dev;
2120 
2121     logical = sblock->sectors[0]->logical;
2122     dev = sblock->sectors[0]->dev;
2123 
2124     if (sblock->no_io_error_seen)
2125         scrub_recheck_block_checksum(sblock);
2126 
2127     if (!sblock->no_io_error_seen) {
2128         spin_lock(&sctx->stat_lock);
2129         sctx->stat.read_errors++;
2130         spin_unlock(&sctx->stat_lock);
2131         btrfs_err_rl_in_rcu(fs_info,
2132             "IO error rebuilding logical %llu for dev %s",
2133             logical, rcu_str_deref(dev->name));
2134     } else if (sblock->header_error || sblock->checksum_error) {
2135         spin_lock(&sctx->stat_lock);
2136         sctx->stat.uncorrectable_errors++;
2137         spin_unlock(&sctx->stat_lock);
2138         btrfs_err_rl_in_rcu(fs_info,
2139             "failed to rebuild valid logical %llu for dev %s",
2140             logical, rcu_str_deref(dev->name));
2141     } else {
2142         scrub_write_block_to_dev_replace(sblock);
2143     }
2144 
2145     if (sctx->is_dev_replace && sctx->flush_all_writes) {
2146         mutex_lock(&sctx->wr_lock);
2147         scrub_wr_submit(sctx);
2148         mutex_unlock(&sctx->wr_lock);
2149     }
2150 
2151     scrub_block_put(sblock);
2152     scrub_pending_bio_dec(sctx);
2153 }
2154 
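/*
 * The sectors of this block live on a missing device, so read them back by
 * asking the RAID56 layer to rebuild the data from the remaining stripes.
 * Completion is handled asynchronously in scrub_missing_raid56_worker().
 */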
2155 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2156 {
2157     struct scrub_ctx *sctx = sblock->sctx;
2158     struct btrfs_fs_info *fs_info = sctx->fs_info;
2159     u64 length = sblock->sector_count << fs_info->sectorsize_bits;
2160     u64 logical = sblock->sectors[0]->logical;
2161     struct btrfs_io_context *bioc = NULL;
2162     struct bio *bio;
2163     struct btrfs_raid_bio *rbio;
2164     int ret;
2165     int i;
2166 
2167     btrfs_bio_counter_inc_blocked(fs_info);
2168     ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2169                    &length, &bioc);
2170     if (ret || !bioc || !bioc->raid_map)
2171         goto bioc_out;
2172 
2173     if (WARN_ON(!sctx->is_dev_replace ||
2174             !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2175         /*
2176          * We shouldn't be scrubbing a missing device. Even for dev
2177          * replace, we should only get here for RAID 5/6. We either
2178          * managed to mount something with no mirrors remaining or
2179          * there's a bug in scrub_find_good_copy()/btrfs_map_block().
2180          */
2181         goto bioc_out;
2182     }
2183 
2184     bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2185     bio->bi_iter.bi_sector = logical >> 9;
2186     bio->bi_private = sblock;
2187     bio->bi_end_io = scrub_missing_raid56_end_io;
2188 
2189     rbio = raid56_alloc_missing_rbio(bio, bioc);
2190     if (!rbio)
2191         goto rbio_out;
2192 
2193     for (i = 0; i < sblock->sector_count; i++) {
2194         struct scrub_sector *sector = sblock->sectors[i];
2195 
2196         /*
2197          * For now, our scrub is still one page per sector, so pgoff
2198          * is always 0.
2199          */
2200         raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
2201     }
2202 
2203     INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
2204     scrub_block_get(sblock);
2205     scrub_pending_bio_inc(sctx);
2206     raid56_submit_missing_rbio(rbio);
2207     return;
2208 
2209 rbio_out:
2210     bio_put(bio);
2211 bioc_out:
2212     btrfs_bio_counter_dec(fs_info);
2213     btrfs_put_bioc(bioc);
2214     spin_lock(&sctx->stat_lock);
2215     sctx->stat.malloc_errors++;
2216     spin_unlock(&sctx->stat_lock);
2217 }
2218 
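/*
 * Split the range [logical, logical + len) into sectorsize'd scrub_sectors
 * backed by freshly allocated pages, attach them to a new scrub_block and
 * queue them for reading (or hand the block to the RAID56 rebuild path when
 * the device is missing).
 */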
2219 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
2220                u64 physical, struct btrfs_device *dev, u64 flags,
2221                u64 gen, int mirror_num, u8 *csum,
2222                u64 physical_for_dev_replace)
2223 {
2224     struct scrub_block *sblock;
2225     const u32 sectorsize = sctx->fs_info->sectorsize;
2226     int index;
2227 
2228     sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2229     if (!sblock) {
2230         spin_lock(&sctx->stat_lock);
2231         sctx->stat.malloc_errors++;
2232         spin_unlock(&sctx->stat_lock);
2233         return -ENOMEM;
2234     }
2235 
2236     /* one ref inside this function, plus one for each page added to
2237      * a bio later on */
2238     refcount_set(&sblock->refs, 1);
2239     sblock->sctx = sctx;
2240     sblock->no_io_error_seen = 1;
2241 
2242     for (index = 0; len > 0; index++) {
2243         struct scrub_sector *sector;
2244         /*
2245          * Here we will allocate one page for one sector to scrub.
2246          * This is fine if PAGE_SIZE == sectorsize, but will cost
2247          * more memory for PAGE_SIZE > sectorsize case.
2248          */
2249         u32 l = min(sectorsize, len);
2250 
2251         sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2252         if (!sector) {
2253 leave_nomem:
2254             spin_lock(&sctx->stat_lock);
2255             sctx->stat.malloc_errors++;
2256             spin_unlock(&sctx->stat_lock);
2257             scrub_block_put(sblock);
2258             return -ENOMEM;
2259         }
2260         ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2261         scrub_sector_get(sector);
2262         sblock->sectors[index] = sector;
2263         sector->sblock = sblock;
2264         sector->dev = dev;
2265         sector->flags = flags;
2266         sector->generation = gen;
2267         sector->logical = logical;
2268         sector->physical = physical;
2269         sector->physical_for_dev_replace = physical_for_dev_replace;
2270         sector->mirror_num = mirror_num;
2271         if (csum) {
2272             sector->have_csum = 1;
2273             memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2274         } else {
2275             sector->have_csum = 0;
2276         }
2277         sblock->sector_count++;
2278         sector->page = alloc_page(GFP_KERNEL);
2279         if (!sector->page)
2280             goto leave_nomem;
2281         len -= l;
2282         logical += l;
2283         physical += l;
2284         physical_for_dev_replace += l;
2285     }
2286 
2287     WARN_ON(sblock->sector_count == 0);
2288     if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2289         /*
2290          * This case should only be hit for RAID 5/6 device replace. See
2291          * the comment in scrub_missing_raid56_pages() for details.
2292          */
2293         scrub_missing_raid56_pages(sblock);
2294     } else {
2295         for (index = 0; index < sblock->sector_count; index++) {
2296             struct scrub_sector *sector = sblock->sectors[index];
2297             int ret;
2298 
2299             ret = scrub_add_sector_to_rd_bio(sctx, sector);
2300             if (ret) {
2301                 scrub_block_put(sblock);
2302                 return ret;
2303             }
2304         }
2305 
2306         if (flags & BTRFS_EXTENT_FLAG_SUPER)
2307             scrub_submit(sctx);
2308     }
2309 
2310     /* last one frees, either here or in bio completion for last page */
2311     scrub_block_put(sblock);
2312     return 0;
2313 }
2314 
2315 static void scrub_bio_end_io(struct bio *bio)
2316 {
2317     struct scrub_bio *sbio = bio->bi_private;
2318     struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2319 
2320     sbio->status = bio->bi_status;
2321     sbio->bio = bio;
2322 
2323     queue_work(fs_info->scrub_workers, &sbio->work);
2324 }
2325 
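/*
 * Completion work for a read bio: propagate any IO error to the affected
 * sectors, complete the scrub_blocks whose last outstanding sector just
 * finished, and put the scrub_bio slot back on the free list.
 */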
2326 static void scrub_bio_end_io_worker(struct work_struct *work)
2327 {
2328     struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2329     struct scrub_ctx *sctx = sbio->sctx;
2330     int i;
2331 
2332     ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
2333     if (sbio->status) {
2334         for (i = 0; i < sbio->sector_count; i++) {
2335             struct scrub_sector *sector = sbio->sectors[i];
2336 
2337             sector->io_error = 1;
2338             sector->sblock->no_io_error_seen = 0;
2339         }
2340     }
2341 
2342     /* Now complete the scrub_block items that have all pages completed */
2343     for (i = 0; i < sbio->sector_count; i++) {
2344         struct scrub_sector *sector = sbio->sectors[i];
2345         struct scrub_block *sblock = sector->sblock;
2346 
2347         if (atomic_dec_and_test(&sblock->outstanding_sectors))
2348             scrub_block_complete(sblock);
2349         scrub_block_put(sblock);
2350     }
2351 
2352     bio_put(sbio->bio);
2353     sbio->bio = NULL;
2354     spin_lock(&sctx->list_lock);
2355     sbio->next_free = sctx->first_free;
2356     sctx->first_free = sbio->index;
2357     spin_unlock(&sctx->list_lock);
2358 
2359     if (sctx->is_dev_replace && sctx->flush_all_writes) {
2360         mutex_lock(&sctx->wr_lock);
2361         scrub_wr_submit(sctx);
2362         mutex_unlock(&sctx->wr_lock);
2363     }
2364 
2365     scrub_pending_bio_dec(sctx);
2366 }
2367 
2368 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2369                        unsigned long *bitmap,
2370                        u64 start, u32 len)
2371 {
2372     u64 offset;
2373     u32 nsectors;
2374     u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2375 
2376     if (len >= sparity->stripe_len) {
2377         bitmap_set(bitmap, 0, sparity->nsectors);
2378         return;
2379     }
2380 
2381     start -= sparity->logic_start;
2382     start = div64_u64_rem(start, sparity->stripe_len, &offset);
2383     offset = offset >> sectorsize_bits;
2384     nsectors = len >> sectorsize_bits;
2385 
2386     if (offset + nsectors <= sparity->nsectors) {
2387         bitmap_set(bitmap, offset, nsectors);
2388         return;
2389     }
2390 
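    /*
     * The range runs past the end of the stripe: set the tail bits and wrap
     * the remainder around to the start of the bitmap.
     */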
2391     bitmap_set(bitmap, offset, sparity->nsectors - offset);
2392     bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2393 }
2394 
2395 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2396                            u64 start, u32 len)
2397 {
2398     __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
2399 }
2400 
2401 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2402                           u64 start, u32 len)
2403 {
2404     __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
2405 }
2406 
2407 static void scrub_block_complete(struct scrub_block *sblock)
2408 {
2409     int corrupted = 0;
2410 
2411     if (!sblock->no_io_error_seen) {
2412         corrupted = 1;
2413         scrub_handle_errored_block(sblock);
2414     } else {
2415         /*
2416          * If the block has a checksum error, it is written via the
2417          * repair mechanism in the dev-replace case; otherwise it is
2418          * written here directly in the dev-replace case.
2419          */
2420         corrupted = scrub_checksum(sblock);
2421         if (!corrupted && sblock->sctx->is_dev_replace)
2422             scrub_write_block_to_dev_replace(sblock);
2423     }
2424 
2425     if (sblock->sparity && corrupted && !sblock->data_corrected) {
2426         u64 start = sblock->sectors[0]->logical;
2427         u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
2428               sblock->sctx->fs_info->sectorsize;
2429 
2430         ASSERT(end - start <= U32_MAX);
2431         scrub_parity_mark_sectors_error(sblock->sparity,
2432                         start, end - start);
2433     }
2434 }
2435 
2436 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2437 {
2438     sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2439     list_del(&sum->list);
2440     kfree(sum);
2441 }
2442 
2443 /*
2444  * Find the desired csum for range [logical, logical + sectorsize), and store
2445  * the csum into @csum.
2446  *
2447  * The search source is sctx->csum_list, which is a pre-populated list
2448  * storing bytenr-ordered csum ranges.  We're responsible for cleaning up
2449  * any range that is before @logical.
2450  *
2451  * Return 0 if there is no csum for the range.
2452  * Return 1 if there is csum for the range and copied to @csum.
2453  */
2454 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2455 {
2456     bool found = false;
2457 
2458     while (!list_empty(&sctx->csum_list)) {
2459         struct btrfs_ordered_sum *sum = NULL;
2460         unsigned long index;
2461         unsigned long num_sectors;
2462 
2463         sum = list_first_entry(&sctx->csum_list,
2464                        struct btrfs_ordered_sum, list);
2465         /* The current csum range is beyond our range, no csum found */
2466         if (sum->bytenr > logical)
2467             break;
2468 
2469         /*
2470          * The current sum is before our bytenr. Since scrub is always
2471          * done in bytenr order, the csum will never be used anymore;
2472          * clean it up so that later calls won't bother with the range,
2473          * and continue searching the next range.
2474          */
2475         if (sum->bytenr + sum->len <= logical) {
2476             drop_csum_range(sctx, sum);
2477             continue;
2478         }
2479 
2480         /* Now the csum range covers our bytenr, copy the csum */
2481         found = true;
2482         index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2483         num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2484 
2485         memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2486                sctx->fs_info->csum_size);
2487 
2488         /* Cleanup the range if we're at the end of the csum range */
2489         if (index == num_sectors - 1)
2490             drop_csum_range(sctx, sum);
2491         break;
2492     }
2493     if (!found)
2494         return 0;
2495     return 1;
2496 }
2497 
2498 /* scrub extent tries to collect up to 64 kB for each bio */
2499 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2500             u64 logical, u32 len,
2501             u64 physical, struct btrfs_device *dev, u64 flags,
2502             u64 gen, int mirror_num)
2503 {
2504     struct btrfs_device *src_dev = dev;
2505     u64 src_physical = physical;
2506     int src_mirror = mirror_num;
2507     int ret;
2508     u8 csum[BTRFS_CSUM_SIZE];
2509     u32 blocksize;
2510 
2511     if (flags & BTRFS_EXTENT_FLAG_DATA) {
2512         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2513             blocksize = map->stripe_len;
2514         else
2515             blocksize = sctx->fs_info->sectorsize;
2516         spin_lock(&sctx->stat_lock);
2517         sctx->stat.data_extents_scrubbed++;
2518         sctx->stat.data_bytes_scrubbed += len;
2519         spin_unlock(&sctx->stat_lock);
2520     } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2521         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2522             blocksize = map->stripe_len;
2523         else
2524             blocksize = sctx->fs_info->nodesize;
2525         spin_lock(&sctx->stat_lock);
2526         sctx->stat.tree_extents_scrubbed++;
2527         sctx->stat.tree_bytes_scrubbed += len;
2528         spin_unlock(&sctx->stat_lock);
2529     } else {
2530         blocksize = sctx->fs_info->sectorsize;
2531         WARN_ON(1);
2532     }
2533 
2534     /*
2535      * In the dev-replace case, @dev can be a missing device.
2536      * Regular scrub avoids running on a missing device entirely,
2537      * as that would trigger tons of read errors.
2538      *
2539      * Reading from a missing device would only cause the read error
2540      * counts to increase unnecessarily, so here we change the read
2541      * source to a good mirror.
2542      */
2543     if (sctx->is_dev_replace && !dev->bdev)
2544         scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
2545                      &src_dev, &src_mirror);
2546     while (len) {
2547         u32 l = min(len, blocksize);
2548         int have_csum = 0;
2549 
2550         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2551             /* push csums to sbio */
2552             have_csum = scrub_find_csum(sctx, logical, csum);
2553             if (have_csum == 0)
2554                 ++sctx->stat.no_csum;
2555         }
2556         ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
2557                     flags, gen, src_mirror,
2558                     have_csum ? csum : NULL, physical);
2559         if (ret)
2560             return ret;
2561         len -= l;
2562         logical += l;
2563         physical += l;
2564         src_physical += l;
2565     }
2566     return 0;
2567 }
2568 
2569 static int scrub_sectors_for_parity(struct scrub_parity *sparity,
2570                   u64 logical, u32 len,
2571                   u64 physical, struct btrfs_device *dev,
2572                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2573 {
2574     struct scrub_ctx *sctx = sparity->sctx;
2575     struct scrub_block *sblock;
2576     const u32 sectorsize = sctx->fs_info->sectorsize;
2577     int index;
2578 
2579     ASSERT(IS_ALIGNED(len, sectorsize));
2580 
2581     sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2582     if (!sblock) {
2583         spin_lock(&sctx->stat_lock);
2584         sctx->stat.malloc_errors++;
2585         spin_unlock(&sctx->stat_lock);
2586         return -ENOMEM;
2587     }
2588 
2589     /* one ref inside this function, plus one for each page added to
2590      * a bio later on */
2591     refcount_set(&sblock->refs, 1);
2592     sblock->sctx = sctx;
2593     sblock->no_io_error_seen = 1;
2594     sblock->sparity = sparity;
2595     scrub_parity_get(sparity);
2596 
2597     for (index = 0; len > 0; index++) {
2598         struct scrub_sector *sector;
2599 
2600         sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2601         if (!sector) {
2602 leave_nomem:
2603             spin_lock(&sctx->stat_lock);
2604             sctx->stat.malloc_errors++;
2605             spin_unlock(&sctx->stat_lock);
2606             scrub_block_put(sblock);
2607             return -ENOMEM;
2608         }
2609         ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2610         /* For scrub block */
2611         scrub_sector_get(sector);
2612         sblock->sectors[index] = sector;
2613         /* For scrub parity */
2614         scrub_sector_get(sector);
2615         list_add_tail(&sector->list, &sparity->sectors_list);
2616         sector->sblock = sblock;
2617         sector->dev = dev;
2618         sector->flags = flags;
2619         sector->generation = gen;
2620         sector->logical = logical;
2621         sector->physical = physical;
2622         sector->mirror_num = mirror_num;
2623         if (csum) {
2624             sector->have_csum = 1;
2625             memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2626         } else {
2627             sector->have_csum = 0;
2628         }
2629         sblock->sector_count++;
2630         sector->page = alloc_page(GFP_KERNEL);
2631         if (!sector->page)
2632             goto leave_nomem;
2633 
2634 
2635         /* Iterate over the stripe range in sectorsize steps */
2636         len -= sectorsize;
2637         logical += sectorsize;
2638         physical += sectorsize;
2639     }
2640 
2641     WARN_ON(sblock->sector_count == 0);
2642     for (index = 0; index < sblock->sector_count; index++) {
2643         struct scrub_sector *sector = sblock->sectors[index];
2644         int ret;
2645 
2646         ret = scrub_add_sector_to_rd_bio(sctx, sector);
2647         if (ret) {
2648             scrub_block_put(sblock);
2649             return ret;
2650         }
2651     }
2652 
2653     /* Last one frees, either here or in bio completion for last sector */
2654     scrub_block_put(sblock);
2655     return 0;
2656 }
2657 
2658 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2659                    u64 logical, u32 len,
2660                    u64 physical, struct btrfs_device *dev,
2661                    u64 flags, u64 gen, int mirror_num)
2662 {
2663     struct scrub_ctx *sctx = sparity->sctx;
2664     int ret;
2665     u8 csum[BTRFS_CSUM_SIZE];
2666     u32 blocksize;
2667 
2668     if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2669         scrub_parity_mark_sectors_error(sparity, logical, len);
2670         return 0;
2671     }
2672 
2673     if (flags & BTRFS_EXTENT_FLAG_DATA) {
2674         blocksize = sparity->stripe_len;
2675     } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2676         blocksize = sparity->stripe_len;
2677     } else {
2678         blocksize = sctx->fs_info->sectorsize;
2679         WARN_ON(1);
2680     }
2681 
2682     while (len) {
2683         u32 l = min(len, blocksize);
2684         int have_csum = 0;
2685 
2686         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2687             /* push csums to sbio */
2688             have_csum = scrub_find_csum(sctx, logical, csum);
2689             if (have_csum == 0)
2690                 goto skip;
2691         }
2692         ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
2693                          flags, gen, mirror_num,
2694                          have_csum ? csum : NULL);
2695         if (ret)
2696             return ret;
2697 skip:
2698         len -= l;
2699         logical += l;
2700         physical += l;
2701     }
2702     return 0;
2703 }
2704 
2705 /*
2706  * Given a physical address, this will calculate its
2707  * logical offset. If this is a parity stripe, it will return
2708  * the leftmost data stripe's logical offset.
2709  *
2710  * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2711  */
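/*
 * Tracing the arithmetic below for a hypothetical 3-device RAID5 chunk with
 * 64KiB stripes: a physical address 64KiB past the chunk start on device
 * index 0 maps to full stripe number 1, none of whose data stripes rotate
 * onto that device, so the function returns 1 (parity) and sets *offset to
 * 128KiB, the logical start of that full stripe's data.
 */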
2712 static int get_raid56_logic_offset(u64 physical, int num,
2713                    struct map_lookup *map, u64 *offset,
2714                    u64 *stripe_start)
2715 {
2716     int i;
2717     int j = 0;
2718     u64 stripe_nr;
2719     u64 last_offset;
2720     u32 stripe_index;
2721     u32 rot;
2722     const int data_stripes = nr_data_stripes(map);
2723 
2724     last_offset = (physical - map->stripes[num].physical) * data_stripes;
2725     if (stripe_start)
2726         *stripe_start = last_offset;
2727 
2728     *offset = last_offset;
2729     for (i = 0; i < data_stripes; i++) {
2730         *offset = last_offset + i * map->stripe_len;
2731 
2732         stripe_nr = div64_u64(*offset, map->stripe_len);
2733         stripe_nr = div_u64(stripe_nr, data_stripes);
2734 
2735         /* Work out the disk rotation on this stripe-set */
2736         stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2737         /* Calculate which stripe this data is located on */
2738         rot += i;
2739         stripe_index = rot % map->num_stripes;
2740         if (stripe_index == num)
2741             return 0;
2742         if (stripe_index < num)
2743             j++;
2744     }
2745     *offset = last_offset + j * map->stripe_len;
2746     return 1;
2747 }
2748 
2749 static void scrub_free_parity(struct scrub_parity *sparity)
2750 {
2751     struct scrub_ctx *sctx = sparity->sctx;
2752     struct scrub_sector *curr, *next;
2753     int nbits;
2754 
2755     nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
2756     if (nbits) {
2757         spin_lock(&sctx->stat_lock);
2758         sctx->stat.read_errors += nbits;
2759         sctx->stat.uncorrectable_errors += nbits;
2760         spin_unlock(&sctx->stat_lock);
2761     }
2762 
2763     list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
2764         list_del_init(&curr->list);
2765         scrub_sector_put(curr);
2766     }
2767 
2768     kfree(sparity);
2769 }
2770 
2771 static void scrub_parity_bio_endio_worker(struct work_struct *work)
2772 {
2773     struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2774                             work);
2775     struct scrub_ctx *sctx = sparity->sctx;
2776 
2777     scrub_free_parity(sparity);
2778     scrub_pending_bio_dec(sctx);
2779 }
2780 
2781 static void scrub_parity_bio_endio(struct bio *bio)
2782 {
2783     struct scrub_parity *sparity = bio->bi_private;
2784     struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2785 
2786     if (bio->bi_status)
2787         bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
2788               &sparity->dbitmap, sparity->nsectors);
2789 
2790     bio_put(bio);
2791 
2792     INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
2793     queue_work(fs_info->scrub_parity_workers, &sparity->work);
2794 }
2795 
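/*
 * Kick off the RAID56 parity check for this stripe range: dbitmap is reduced
 * to the sectors that were read without error (ebitmap bits removed) and a
 * scrub rbio is submitted so the RAID56 layer can verify and, if needed,
 * rewrite the parity. If nothing is left to check, the parity is freed here.
 */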
2796 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2797 {
2798     struct scrub_ctx *sctx = sparity->sctx;
2799     struct btrfs_fs_info *fs_info = sctx->fs_info;
2800     struct bio *bio;
2801     struct btrfs_raid_bio *rbio;
2802     struct btrfs_io_context *bioc = NULL;
2803     u64 length;
2804     int ret;
2805 
2806     if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
2807                &sparity->ebitmap, sparity->nsectors))
2808         goto out;
2809 
2810     length = sparity->logic_end - sparity->logic_start;
2811 
2812     btrfs_bio_counter_inc_blocked(fs_info);
2813     ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2814                    &length, &bioc);
2815     if (ret || !bioc || !bioc->raid_map)
2816         goto bioc_out;
2817 
2818     bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2819     bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2820     bio->bi_private = sparity;
2821     bio->bi_end_io = scrub_parity_bio_endio;
2822 
2823     rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
2824                           sparity->scrub_dev,
2825                           &sparity->dbitmap,
2826                           sparity->nsectors);
2827     if (!rbio)
2828         goto rbio_out;
2829 
2830     scrub_pending_bio_inc(sctx);
2831     raid56_parity_submit_scrub_rbio(rbio);
2832     return;
2833 
2834 rbio_out:
2835     bio_put(bio);
2836 bioc_out:
2837     btrfs_bio_counter_dec(fs_info);
2838     btrfs_put_bioc(bioc);
2839     bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
2840           sparity->nsectors);
2841     spin_lock(&sctx->stat_lock);
2842     sctx->stat.malloc_errors++;
2843     spin_unlock(&sctx->stat_lock);
2844 out:
2845     scrub_free_parity(sparity);
2846 }
2847 
2848 static void scrub_parity_get(struct scrub_parity *sparity)
2849 {
2850     refcount_inc(&sparity->refs);
2851 }
2852 
2853 static void scrub_parity_put(struct scrub_parity *sparity)
2854 {
2855     if (!refcount_dec_and_test(&sparity->refs))
2856         return;
2857 
2858     scrub_parity_check_and_repair(sparity);
2859 }
2860 
2861 /*
2862  * Return 0 if the extent item range covers any byte of the range.
2863  * Return <0 if the extent item is before @search_start.
2864  * Return >0 if the extent item is after @search_start + @search_len.
2865  */
2866 static int compare_extent_item_range(struct btrfs_path *path,
2867                      u64 search_start, u64 search_len)
2868 {
2869     struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
2870     u64 len;
2871     struct btrfs_key key;
2872 
2873     btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2874     ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
2875            key.type == BTRFS_METADATA_ITEM_KEY);
2876     if (key.type == BTRFS_METADATA_ITEM_KEY)
2877         len = fs_info->nodesize;
2878     else
2879         len = key.offset;
2880 
2881     if (key.objectid + len <= search_start)
2882         return -1;
2883     if (key.objectid >= search_start + search_len)
2884         return 1;
2885     return 0;
2886 }
2887 
2888 /*
2889  * Locate one extent item which covers any byte in range
2890  * [@search_start, @search_start + @search_length)
2891  *
2892  * If the path is not initialized, we will initialize the search by doing
2893  * a btrfs_search_slot().
2894  * If the path is already initialized, we will use the path as the initial
2895  * slot, to avoid duplicated btrfs_search_slot() calls.
2896  *
2897  * NOTE: If an extent item starts before @search_start, we will still
2898  * return the extent item. This is for data extents crossing the stripe boundary.
2899  *
2900  * Return 0 if we found such extent item, and @path will point to the extent item.
2901  * Return >0 if no such extent item can be found, and @path will be released.
2902  * Return <0 if we hit a fatal error, and @path will be released.
2903  */
2904 static int find_first_extent_item(struct btrfs_root *extent_root,
2905                   struct btrfs_path *path,
2906                   u64 search_start, u64 search_len)
2907 {
2908     struct btrfs_fs_info *fs_info = extent_root->fs_info;
2909     struct btrfs_key key;
2910     int ret;
2911 
2912     /* Continue using the existing path */
2913     if (path->nodes[0])
2914         goto search_forward;
2915 
2916     if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2917         key.type = BTRFS_METADATA_ITEM_KEY;
2918     else
2919         key.type = BTRFS_EXTENT_ITEM_KEY;
2920     key.objectid = search_start;
2921     key.offset = (u64)-1;
2922 
2923     ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2924     if (ret < 0)
2925         return ret;
2926 
2927     ASSERT(ret > 0);
2928     /*
2929      * Here we intentionally pass 0 as @min_objectid, as there could be
2930      * an extent item starting before @search_start.
2931      */
2932     ret = btrfs_previous_extent_item(extent_root, path, 0);
2933     if (ret < 0)
2934         return ret;
2935     /*
2936      * Whether or not we have found an extent item, the loop below will
2937      * properly do every check on the key.
2938      */
2939 search_forward:
2940     while (true) {
2941         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2942         if (key.objectid >= search_start + search_len)
2943             break;
2944         if (key.type != BTRFS_METADATA_ITEM_KEY &&
2945             key.type != BTRFS_EXTENT_ITEM_KEY)
2946             goto next;
2947 
2948         ret = compare_extent_item_range(path, search_start, search_len);
2949         if (ret == 0)
2950             return ret;
2951         if (ret > 0)
2952             break;
2953 next:
2954         path->slots[0]++;
2955         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2956             ret = btrfs_next_leaf(extent_root, path);
2957             if (ret) {
2958                 /* Either no more item or fatal error */
2959                 btrfs_release_path(path);
2960                 return ret;
2961             }
2962         }
2963     }
2964     btrfs_release_path(path);
2965     return 1;
2966 }
2967 
2968 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
2969                 u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
2970 {
2971     struct btrfs_key key;
2972     struct btrfs_extent_item *ei;
2973 
2974     btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2975     ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
2976            key.type == BTRFS_EXTENT_ITEM_KEY);
2977     *extent_start_ret = key.objectid;
2978     if (key.type == BTRFS_METADATA_ITEM_KEY)
2979         *size_ret = path->nodes[0]->fs_info->nodesize;
2980     else
2981         *size_ret = key.offset;
2982     ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
2983     *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
2984     *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
2985 }
2986 
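/*
 * For example, an extent [48K, 80K) crosses the boundary range [0, 64K)
 * because it starts inside it and ends beyond it, while an extent [64K, 96K)
 * that merely starts at the boundary end does not.
 */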
2987 static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
2988                       u64 boundary_start, u64 boundary_len)
2989 {
2990     return (extent_start < boundary_start &&
2991         extent_start + extent_len > boundary_start) ||
2992            (extent_start < boundary_start + boundary_len &&
2993         extent_start + extent_len > boundary_start + boundary_len);
2994 }
2995 
2996 static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
2997                            struct scrub_parity *sparity,
2998                            struct map_lookup *map,
2999                            struct btrfs_device *sdev,
3000                            struct btrfs_path *path,
3001                            u64 logical)
3002 {
3003     struct btrfs_fs_info *fs_info = sctx->fs_info;
3004     struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
3005     struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
3006     u64 cur_logical = logical;
3007     int ret;
3008 
3009     ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3010 
3011     /* Path must not be populated */
3012     ASSERT(!path->nodes[0]);
3013 
3014     while (cur_logical < logical + map->stripe_len) {
3015         struct btrfs_io_context *bioc = NULL;
3016         struct btrfs_device *extent_dev;
3017         u64 extent_start;
3018         u64 extent_size;
3019         u64 mapped_length;
3020         u64 extent_flags;
3021         u64 extent_gen;
3022         u64 extent_physical;
3023         u64 extent_mirror_num;
3024 
3025         ret = find_first_extent_item(extent_root, path, cur_logical,
3026                          logical + map->stripe_len - cur_logical);
3027         /* No more extent item in this data stripe */
3028         if (ret > 0) {
3029             ret = 0;
3030             break;
3031         }
3032         if (ret < 0)
3033             break;
3034         get_extent_info(path, &extent_start, &extent_size, &extent_flags,
3035                 &extent_gen);
3036 
3037         /* Metadata should not cross stripe boundaries */
3038         if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3039             does_range_cross_boundary(extent_start, extent_size,
3040                           logical, map->stripe_len)) {
3041             btrfs_err(fs_info,
3042     "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3043                   extent_start, logical);
3044             spin_lock(&sctx->stat_lock);
3045             sctx->stat.uncorrectable_errors++;
3046             spin_unlock(&sctx->stat_lock);
3047             cur_logical += extent_size;
3048             continue;
3049         }
3050 
3051         /* Skip hole range which doesn't have any extent */
3052         cur_logical = max(extent_start, cur_logical);
3053 
3054         /* Truncate the range inside this data stripe */
3055         extent_size = min(extent_start + extent_size,
3056                   logical + map->stripe_len) - cur_logical;
3057         extent_start = cur_logical;
3058         ASSERT(extent_size <= U32_MAX);
3059 
3060         scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
3061 
3062         mapped_length = extent_size;
3063         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
3064                       &mapped_length, &bioc, 0);
3065         if (!ret && (!bioc || mapped_length < extent_size))
3066             ret = -EIO;
3067         if (ret) {
3068             btrfs_put_bioc(bioc);
3069             scrub_parity_mark_sectors_error(sparity, extent_start,
3070                             extent_size);
3071             break;
3072         }
3073         extent_physical = bioc->stripes[0].physical;
3074         extent_mirror_num = bioc->mirror_num;
3075         extent_dev = bioc->stripes[0].dev;
3076         btrfs_put_bioc(bioc);
3077 
3078         ret = btrfs_lookup_csums_range(csum_root, extent_start,
3079                            extent_start + extent_size - 1,
3080                            &sctx->csum_list, 1);
3081         if (ret) {
3082             scrub_parity_mark_sectors_error(sparity, extent_start,
3083                             extent_size);
3084             break;
3085         }
3086 
3087         ret = scrub_extent_for_parity(sparity, extent_start,
3088                           extent_size, extent_physical,
3089                           extent_dev, extent_flags,
3090                           extent_gen, extent_mirror_num);
3091         scrub_free_csums(sctx);
3092 
3093         if (ret) {
3094             scrub_parity_mark_sectors_error(sparity, extent_start,
3095                             extent_size);
3096             break;
3097         }
3098 
3099         cond_resched();
3100         cur_logical += extent_size;
3101     }
3102     btrfs_release_path(path);
3103     return ret;
3104 }
3105 
3106 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3107                           struct map_lookup *map,
3108                           struct btrfs_device *sdev,
3109                           u64 logic_start,
3110                           u64 logic_end)
3111 {
3112     struct btrfs_fs_info *fs_info = sctx->fs_info;
3113     struct btrfs_path *path;
3114     u64 cur_logical;
3115     int ret;
3116     struct scrub_parity *sparity;
3117     int nsectors;
3118 
3119     path = btrfs_alloc_path();
3120     if (!path) {
3121         spin_lock(&sctx->stat_lock);
3122         sctx->stat.malloc_errors++;
3123         spin_unlock(&sctx->stat_lock);
3124         return -ENOMEM;
3125     }
3126     path->search_commit_root = 1;
3127     path->skip_locking = 1;
3128 
3129     ASSERT(map->stripe_len <= U32_MAX);
3130     nsectors = map->stripe_len >> fs_info->sectorsize_bits;
3131     ASSERT(nsectors <= BITS_PER_LONG);
3132     sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
3133     if (!sparity) {
3134         spin_lock(&sctx->stat_lock);
3135         sctx->stat.malloc_errors++;
3136         spin_unlock(&sctx->stat_lock);
3137         btrfs_free_path(path);
3138         return -ENOMEM;
3139     }
3140 
3141     ASSERT(map->stripe_len <= U32_MAX);
3142     sparity->stripe_len = map->stripe_len;
3143     sparity->nsectors = nsectors;
3144     sparity->sctx = sctx;
3145     sparity->scrub_dev = sdev;
3146     sparity->logic_start = logic_start;
3147     sparity->logic_end = logic_end;
3148     refcount_set(&sparity->refs, 1);
3149     INIT_LIST_HEAD(&sparity->sectors_list);
3150 
3151     ret = 0;
3152     for (cur_logical = logic_start; cur_logical < logic_end;
3153          cur_logical += map->stripe_len) {
3154         ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
3155                               sdev, path, cur_logical);
3156         if (ret < 0)
3157             break;
3158     }
3159 
3160     scrub_parity_put(sparity);
3161     scrub_submit(sctx);
3162     mutex_lock(&sctx->wr_lock);
3163     scrub_wr_submit(sctx);
3164     mutex_unlock(&sctx->wr_lock);
3165 
3166     btrfs_free_path(path);
3167     return ret < 0 ? ret : 0;
3168 }
3169 
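/*
 * On zoned filesystems the dev-replace target has to be written sequentially,
 * so push out everything queued for writing and wait until no bios are in
 * flight before scrubbing the next extent.
 */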
3170 static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3171 {
3172     if (!btrfs_is_zoned(sctx->fs_info))
3173         return;
3174 
3175     sctx->flush_all_writes = true;
3176     scrub_submit(sctx);
3177     mutex_lock(&sctx->wr_lock);
3178     scrub_wr_submit(sctx);
3179     mutex_unlock(&sctx->wr_lock);
3180 
3181     wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3182 }
3183 
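/*
 * If scrub stopped writing before the end of the device extent on a zoned
 * target (e.g. the tail of the range had no extents to copy), sync the
 * target zone's write pointer so it does not lag behind the source.
 */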
3184 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3185                     u64 physical, u64 physical_end)
3186 {
3187     struct btrfs_fs_info *fs_info = sctx->fs_info;
3188     int ret = 0;
3189 
3190     if (!btrfs_is_zoned(fs_info))
3191         return 0;
3192 
3193     wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3194 
3195     mutex_lock(&sctx->wr_lock);
3196     if (sctx->write_pointer < physical_end) {
3197         ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3198                             physical,
3199                             sctx->write_pointer);
3200         if (ret)
3201             btrfs_err(fs_info,
3202                   "zoned: failed to recover write pointer");
3203     }
3204     mutex_unlock(&sctx->wr_lock);
3205     btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3206 
3207     return ret;
3208 }
3209 
3210 /*
3211  * Scrub one range which can only have a simple mirror based profile.
3212  * (Including all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
3213  *  RAID0/RAID10).
3214  *
3215  * Since we may need to handle a subset of a block group, we need the
3216  * @logical_start and @logical_length parameters.
3217  */
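/*
 * Both scrub_simple_stripe() and the RAID56 data stripe path below reuse this
 * helper, feeding it one device stripe (@map->stripe_len bytes) at a time.
 */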
3218 static int scrub_simple_mirror(struct scrub_ctx *sctx,
3219                    struct btrfs_root *extent_root,
3220                    struct btrfs_root *csum_root,
3221                    struct btrfs_block_group *bg,
3222                    struct map_lookup *map,
3223                    u64 logical_start, u64 logical_length,
3224                    struct btrfs_device *device,
3225                    u64 physical, int mirror_num)
3226 {
3227     struct btrfs_fs_info *fs_info = sctx->fs_info;
3228     const u64 logical_end = logical_start + logical_length;
3229     /* An artificial limit, inherited from old scrub behavior */
3230     const u32 max_length = SZ_64K;
3231     struct btrfs_path path = { 0 };
3232     u64 cur_logical = logical_start;
3233     int ret;
3234 
3235     /* The range must be inside the bg */
3236     ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
3237 
3238     path.search_commit_root = 1;
3239     path.skip_locking = 1;
3240     /* Go through each extent item inside the logical range */
3241     while (cur_logical < logical_end) {
3242         u64 extent_start;
3243         u64 extent_len;
3244         u64 extent_flags;
3245         u64 extent_gen;
3246         u64 scrub_len;
3247 
3248         /* Canceled? */
3249         if (atomic_read(&fs_info->scrub_cancel_req) ||
3250             atomic_read(&sctx->cancel_req)) {
3251             ret = -ECANCELED;
3252             break;
3253         }
3254         /* Paused? */
3255         if (atomic_read(&fs_info->scrub_pause_req)) {
3256             /* Push queued extents */
3257             sctx->flush_all_writes = true;
3258             scrub_submit(sctx);
3259             mutex_lock(&sctx->wr_lock);
3260             scrub_wr_submit(sctx);
3261             mutex_unlock(&sctx->wr_lock);
3262             wait_event(sctx->list_wait,
3263                    atomic_read(&sctx->bios_in_flight) == 0);
3264             sctx->flush_all_writes = false;
3265             scrub_blocked_if_needed(fs_info);
3266         }
3267         /* Block group removed? */
3268         spin_lock(&bg->lock);
3269         if (bg->removed) {
3270             spin_unlock(&bg->lock);
3271             ret = 0;
3272             break;
3273         }
3274         spin_unlock(&bg->lock);
3275 
3276         ret = find_first_extent_item(extent_root, &path, cur_logical,
3277                          logical_end - cur_logical);
3278         if (ret > 0) {
3279             /* No more extents, just update the accounting */
3280             sctx->stat.last_physical = physical + logical_length;
3281             ret = 0;
3282             break;
3283         }
3284         if (ret < 0)
3285             break;
3286         get_extent_info(&path, &extent_start, &extent_len,
3287                 &extent_flags, &extent_gen);
3288         /* Skip hole range which doesn't have any extent */
3289         cur_logical = max(extent_start, cur_logical);
3290 
3291         /*
3292          * Scrub len has three limits:
3293          * - Extent size limit
3294          * - Scrub range limit
3295          *   This is especially important for RAID0/RAID10 to reuse
3296          *   this function
3297          * - Max scrub size limit
3298          */
3299         scrub_len = min(min(extent_start + extent_len,
3300                     logical_end), cur_logical + max_length) -
3301                 cur_logical;
3302 
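        /*
         * Data extents are verified against checksum items, so load them
         * into sctx->csum_list before issuing the reads; metadata blocks
         * carry their checksum in the tree block header and need no lookup.
         */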
3303         if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
3304             ret = btrfs_lookup_csums_range(csum_root, cur_logical,
3305                     cur_logical + scrub_len - 1,
3306                     &sctx->csum_list, 1);
3307             if (ret)
3308                 break;
3309         }
3310         if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3311             does_range_cross_boundary(extent_start, extent_len,
3312                           logical_start, logical_length)) {
3313             btrfs_err(fs_info,
3314 "scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
3315                   extent_start, logical_start, logical_end);
3316             spin_lock(&sctx->stat_lock);
3317             sctx->stat.uncorrectable_errors++;
3318             spin_unlock(&sctx->stat_lock);
3319             cur_logical += scrub_len;
3320             continue;
3321         }
3322         ret = scrub_extent(sctx, map, cur_logical, scrub_len,
3323                    cur_logical - logical_start + physical,
3324                    device, extent_flags, extent_gen,
3325                    mirror_num);
3326         scrub_free_csums(sctx);
3327         if (ret)
3328             break;
3329         if (sctx->is_dev_replace)
3330             sync_replace_for_zoned(sctx);
3331         cur_logical += scrub_len;
3332         /* Don't hold the CPU for too long */
3333         cond_resched();
3334     }
3335     btrfs_release_path(&path);
3336     return ret;
3337 }
3338 
3339 /* Calculate the full stripe length for simple stripe based profiles */
3340 static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
3341 {
3342     ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3343                 BTRFS_BLOCK_GROUP_RAID10));
3344 
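    /*
     * E.g. a 4 stripe RAID10 chunk has sub_stripes == 2, so a full stripe
     * spans 4 / 2 * stripe_len == 2 * stripe_len of logical address space;
     * for RAID0, sub_stripes is 1 and the full stripe is simply
     * num_stripes * stripe_len.
     */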
3345     return map->num_stripes / map->sub_stripes * map->stripe_len;
3346 }
3347 
3348 /* Get the logical bytenr for the stripe */
3349 static u64 simple_stripe_get_logical(struct map_lookup *map,
3350                      struct btrfs_block_group *bg,
3351                      int stripe_index)
3352 {
3353     ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3354                 BTRFS_BLOCK_GROUP_RAID10));
3355     ASSERT(stripe_index < map->num_stripes);
3356 
3357     /*
3358      * (stripe_index / sub_stripes) gives how many data stripes we need to
3359      * skip.
3360      */
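    /*
     * E.g. for RAID10 (sub_stripes == 2), stripes 0 and 1 both start at
     * bg->start, stripes 2 and 3 at bg->start + stripe_len, and so on.
     */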
3361     return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
3362 }
3363 
3364 /* Get the mirror number for the stripe */
3365 static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
3366 {
3367     ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3368                 BTRFS_BLOCK_GROUP_RAID10));
3369     ASSERT(stripe_index < map->num_stripes);
3370 
3371     /* For RAID0, it's fixed to 1, for RAID10 it's 1,2,1,2... */
3372     return stripe_index % map->sub_stripes + 1;
3373 }
3374 
3375 static int scrub_simple_stripe(struct scrub_ctx *sctx,
3376                    struct btrfs_root *extent_root,
3377                    struct btrfs_root *csum_root,
3378                    struct btrfs_block_group *bg,
3379                    struct map_lookup *map,
3380                    struct btrfs_device *device,
3381                    int stripe_index)
3382 {
3383     const u64 logical_increment = simple_stripe_full_stripe_len(map);
3384     const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
3385     const u64 orig_physical = map->stripes[stripe_index].physical;
3386     const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
3387     u64 cur_logical = orig_logical;
3388     u64 cur_physical = orig_physical;
3389     int ret = 0;
3390 
3391     while (cur_logical < bg->start + bg->length) {
3392         /*
3393          * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
3394          * just RAID1, so we can reuse scrub_simple_mirror() to scrub
3395          * this stripe.
3396          */
3397         ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
3398                       cur_logical, map->stripe_len, device,
3399                       cur_physical, mirror_num);
3400         if (ret)
3401             return ret;
3402         /* Skip to next stripe which belongs to the target device */
3403         cur_logical += logical_increment;
3404         /* For physical offset, we just go to next stripe */
3405         cur_physical += map->stripe_len;
3406     }
3407     return ret;
3408 }
3409 
3410 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3411                        struct btrfs_block_group *bg,
3412                        struct extent_map *em,
3413                        struct btrfs_device *scrub_dev,
3414                        int stripe_index)
3415 {
3416     struct btrfs_path *path;
3417     struct btrfs_fs_info *fs_info = sctx->fs_info;
3418     struct btrfs_root *root;
3419     struct btrfs_root *csum_root;
3420     struct blk_plug plug;
3421     struct map_lookup *map = em->map_lookup;
3422     const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3423     const u64 chunk_logical = bg->start;
3424     int ret;
3425     u64 physical = map->stripes[stripe_index].physical;
3426     const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
3427     const u64 physical_end = physical + dev_stripe_len;
3428     u64 logical;
3429     u64 logic_end;
3430     /* The logical increment after finishing one stripe */
3431     u64 increment;
3432     /* Offset inside the chunk */
3433     u64 offset;
3434     u64 stripe_logical;
3435     u64 stripe_end;
3436     int stop_loop = 0;
3437 
3438     path = btrfs_alloc_path();
3439     if (!path)
3440         return -ENOMEM;
3441 
3442     /*
3443      * Work on the commit root. The related disk blocks are static as
3444      * long as COW is applied. This means it is safe to rewrite
3445      * them to repair disk errors without any race conditions.
3446      */
3447     path->search_commit_root = 1;
3448     path->skip_locking = 1;
3449     path->reada = READA_FORWARD;
3450 
3451     wait_event(sctx->list_wait,
3452            atomic_read(&sctx->bios_in_flight) == 0);
3453     scrub_blocked_if_needed(fs_info);
3454 
3455     root = btrfs_extent_root(fs_info, bg->start);
3456     csum_root = btrfs_csum_root(fs_info, bg->start);
3457 
3458     /*
3459      * Collect all data csums for the stripe to avoid seeking during
3460      * the scrub. This might currently (crc32) end up being about 1MB.
3461      */
3462     blk_start_plug(&plug);
3463 
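    /*
     * A sequential (zoned) dev-replace target must be written in order, so
     * start tracking the write pointer at the beginning of this device
     * stripe and flush queued writes eagerly.
     */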
3464     if (sctx->is_dev_replace &&
3465         btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3466         mutex_lock(&sctx->wr_lock);
3467         sctx->write_pointer = physical;
3468         mutex_unlock(&sctx->wr_lock);
3469         sctx->flush_all_writes = true;
3470     }
3471 
3472     /*
3473      * There used to be a big double loop to handle all profiles using the
3474      * same routine, which grew larger and more gross over time.
3475      *
3476      * So here we handle each profile differently, so that simpler profiles
3477      * have a simpler scrubbing function.
3478      */
3479     if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
3480              BTRFS_BLOCK_GROUP_RAID56_MASK))) {
3481         /*
3482          * The above check rules out all complex profiles, the remaining
3483          * profiles are SINGLE|DUP|RAID1|RAID1C*, which are simple
3484          * mirrored duplication without striping.
3485          *
3486          * Only @physical and @mirror_num need to be calculated using
3487          * @stripe_index.
3488          */
3489         ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3490                 bg->start, bg->length, scrub_dev,
3491                 map->stripes[stripe_index].physical,
3492                 stripe_index + 1);
3493         offset = 0;
3494         goto out;
3495     }
3496     if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3497         ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
3498                       scrub_dev, stripe_index);
3499         offset = map->stripe_len * (stripe_index / map->sub_stripes);
3500         goto out;
3501     }
3502 
3503     /* Only RAID56 goes through the old code */
3504     ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3505     ret = 0;
3506 
3507     /* Calculate the logical end of the stripe */
3508     get_raid56_logic_offset(physical_end, stripe_index,
3509                 map, &logic_end, NULL);
3510     logic_end += chunk_logical;
3511 
3512     /* Initialize @offset in case we need to jump to the out: label */
3513     get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
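    /*
     * Each full RAID56 stripe maps nr_data_stripes * stripe_len bytes of
     * logical address space (the parity stripes hold no logical data), so
     * after scrubbing one device stripe, @logical advances by that amount.
     */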
3514     increment = map->stripe_len * nr_data_stripes(map);
3515 
3516     /*
3517      * Due to the rotation, for RAID56 it's better to iterate each stripe
3518      * using its physical offset.
3519      */
3520     while (physical < physical_end) {
3521         ret = get_raid56_logic_offset(physical, stripe_index, map,
3522                           &logical, &stripe_logical);
3523         logical += chunk_logical;
3524         if (ret) {
3525             /* It is a parity stripe */
3526             stripe_logical += chunk_logical;
3527             stripe_end = stripe_logical + increment;
3528             ret = scrub_raid56_parity(sctx, map, scrub_dev,
3529                           stripe_logical,
3530                           stripe_end);
3531             if (ret)
3532                 goto out;
3533             goto next;
3534         }
3535 
3536         /*
3537          * Now we're at a data stripe, scrub each extent in the range.
3538          *
3539          * At this stage, if we ignore the repair part, inside each data
3540          * stripe it is no different than SINGLE profile.
3541          * We can reuse scrub_simple_mirror() here, as the repair part
3542          * is still based on @mirror_num.
3543          */
3544         ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3545                       logical, map->stripe_len,
3546                       scrub_dev, physical, 1);
3547         if (ret < 0)
3548             goto out;
3549 next:
3550         logical += increment;
3551         physical += map->stripe_len;
3552         spin_lock(&sctx->stat_lock);
3553         if (stop_loop)
3554             sctx->stat.last_physical =
3555                 map->stripes[stripe_index].physical + dev_stripe_len;
3556         else
3557             sctx->stat.last_physical = physical;
3558         spin_unlock(&sctx->stat_lock);
3559         if (stop_loop)
3560             break;
3561     }
3562 out:
3563     /* push queued extents */
3564     scrub_submit(sctx);
3565     mutex_lock(&sctx->wr_lock);
3566     scrub_wr_submit(sctx);
3567     mutex_unlock(&sctx->wr_lock);
3568 
3569     blk_finish_plug(&plug);
3570     btrfs_free_path(path);
3571 
3572     if (sctx->is_dev_replace && ret >= 0) {
3573         int ret2;
3574 
3575         ret2 = sync_write_pointer_for_zoned(sctx,
3576                 chunk_logical + offset,
3577                 map->stripes[stripe_index].physical,
3578                 physical_end);
3579         if (ret2)
3580             ret = ret2;
3581     }
3582 
3583     return ret < 0 ? ret : 0;
3584 }
3585 
3586 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3587                       struct btrfs_block_group *bg,
3588                       struct btrfs_device *scrub_dev,
3589                       u64 dev_offset,
3590                       u64 dev_extent_len)
3591 {
3592     struct btrfs_fs_info *fs_info = sctx->fs_info;
3593     struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3594     struct map_lookup *map;
3595     struct extent_map *em;
3596     int i;
3597     int ret = 0;
3598 
3599     read_lock(&map_tree->lock);
3600     em = lookup_extent_mapping(map_tree, bg->start, bg->length);
3601     read_unlock(&map_tree->lock);
3602 
3603     if (!em) {
3604         /*
3605          * Might have been an unused block group deleted by the cleaner
3606          * kthread or relocation.
3607          */
3608         spin_lock(&bg->lock);
3609         if (!bg->removed)
3610             ret = -EINVAL;
3611         spin_unlock(&bg->lock);
3612 
3613         return ret;
3614     }
3615     if (em->start != bg->start)
3616         goto out;
3617     if (em->len < dev_extent_len)
3618         goto out;
3619 
3620     map = em->map_lookup;
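    /*
     * Only scrub the stripe(s) of this chunk that live on @scrub_dev at the
     * given @dev_offset; the remaining stripes are covered when their own
     * device (or device extent) gets scrubbed.
     */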
3621     for (i = 0; i < map->num_stripes; ++i) {
3622         if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3623             map->stripes[i].physical == dev_offset) {
3624             ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
3625             if (ret)
3626                 goto out;
3627         }
3628     }
3629 out:
3630     free_extent_map(em);
3631 
3632     return ret;
3633 }
3634 
3635 static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3636                       struct btrfs_block_group *cache)
3637 {
3638     struct btrfs_fs_info *fs_info = cache->fs_info;
3639     struct btrfs_trans_handle *trans;
3640 
3641     if (!btrfs_is_zoned(fs_info))
3642         return 0;
3643 
3644     btrfs_wait_block_group_reservations(cache);
3645     btrfs_wait_nocow_writers(cache);
3646     btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3647 
3648     trans = btrfs_join_transaction(root);
3649     if (IS_ERR(trans))
3650         return PTR_ERR(trans);
3651     return btrfs_commit_transaction(trans);
3652 }
3653 
3654 static noinline_for_stack
3655 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3656                struct btrfs_device *scrub_dev, u64 start, u64 end)
3657 {
3658     struct btrfs_dev_extent *dev_extent = NULL;
3659     struct btrfs_path *path;
3660     struct btrfs_fs_info *fs_info = sctx->fs_info;
3661     struct btrfs_root *root = fs_info->dev_root;
3662     u64 chunk_offset;
3663     int ret = 0;
3664     int ro_set;
3665     int slot;
3666     struct extent_buffer *l;
3667     struct btrfs_key key;
3668     struct btrfs_key found_key;
3669     struct btrfs_block_group *cache;
3670     struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3671 
3672     path = btrfs_alloc_path();
3673     if (!path)
3674         return -ENOMEM;
3675 
3676     path->reada = READA_FORWARD;
3677     path->search_commit_root = 1;
3678     path->skip_locking = 1;
3679 
3680     key.objectid = scrub_dev->devid;
3681     key.offset = 0ull;
3682     key.type = BTRFS_DEV_EXTENT_KEY;
3683 
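    /*
     * Walk all DEV_EXTENT items of @scrub_dev in increasing physical offset
     * order; each item points back to the chunk (block group) that owns the
     * extent, which is what actually gets scrubbed.
     */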
3684     while (1) {
3685         u64 dev_extent_len;
3686 
3687         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3688         if (ret < 0)
3689             break;
3690         if (ret > 0) {
3691             if (path->slots[0] >=
3692                 btrfs_header_nritems(path->nodes[0])) {
3693                 ret = btrfs_next_leaf(root, path);
3694                 if (ret < 0)
3695                     break;
3696                 if (ret > 0) {
3697                     ret = 0;
3698                     break;
3699                 }
3700             } else {
3701                 ret = 0;
3702             }
3703         }
3704 
3705         l = path->nodes[0];
3706         slot = path->slots[0];
3707 
3708         btrfs_item_key_to_cpu(l, &found_key, slot);
3709 
3710         if (found_key.objectid != scrub_dev->devid)
3711             break;
3712 
3713         if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3714             break;
3715 
3716         if (found_key.offset >= end)
3717             break;
3718 
3719         if (found_key.offset < key.offset)
3720             break;
3721 
3722         dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3723         dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
3724 
3725         if (found_key.offset + dev_extent_len <= start)
3726             goto skip;
3727 
3728         chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3729 
3730         /*
3731          * get a reference on the corresponding block group to prevent
3732          * the chunk from going away while we scrub it
3733          */
3734         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3735 
3736         /* some chunks are removed but not committed to disk yet,
3737          * continue scrubbing */
3738         if (!cache)
3739             goto skip;
3740 
3741         ASSERT(cache->start <= chunk_offset);
3742         /*
3743          * We are using the commit root to search for device extents, so
3744          * that means we could have found a device extent item from a
3745          * block group that was deleted in the current transaction. The
3746          * logical start offset of the deleted block group, stored at
3747          * @chunk_offset, might be part of the logical address range of
3748          * a new block group (which uses different physical extents).
3749          * In this case btrfs_lookup_block_group() has returned the new
3750          * block group, and its start address is less than @chunk_offset.
3751          *
3752          * We skip such new block groups, because it's pointless to
3753          * process them, as we won't find their extents because we search
3754          * for them using the commit root of the extent tree. For a device
3755          * replace it's also fine to skip it, we won't miss copying them
3756          * to the target device because we have the write duplication
3757          * setup through the regular write path (by btrfs_map_block()),
3758          * and we have committed a transaction when we started the device
3759          * replace, right after setting up the device replace state.
3760          */
3761         if (cache->start < chunk_offset) {
3762             btrfs_put_block_group(cache);
3763             goto skip;
3764         }
3765 
3766         if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3767             spin_lock(&cache->lock);
3768             if (!cache->to_copy) {
3769                 spin_unlock(&cache->lock);
3770                 btrfs_put_block_group(cache);
3771                 goto skip;
3772             }
3773             spin_unlock(&cache->lock);
3774         }
3775 
3776         /*
3777          * Make sure that while we are scrubbing the corresponding block
3778          * group doesn't get its logical address and its device extents
3779          * reused for another block group, which can possibly be of a
3780          * different type and different profile. We do this to prevent
3781          * false error detections and crashes due to bogus attempts to
3782          * repair extents.
3783          */
3784         spin_lock(&cache->lock);
3785         if (cache->removed) {
3786             spin_unlock(&cache->lock);
3787             btrfs_put_block_group(cache);
3788             goto skip;
3789         }
3790         btrfs_freeze_block_group(cache);
3791         spin_unlock(&cache->lock);
3792 
3793         /*
3794          * We need to call btrfs_inc_block_group_ro() with scrubs_paused,
3795          * to avoid deadlock caused by:
3796          * btrfs_inc_block_group_ro()
3797          * -> btrfs_wait_for_commit()
3798          * -> btrfs_commit_transaction()
3799          * -> btrfs_scrub_pause()
3800          */
3801         scrub_pause_on(fs_info);
3802 
3803         /*
3804          * Don't do chunk preallocation for scrub.
3805          *
3806          * This is especially important for SYSTEM bgs, or we can hit
3807          * -EFBIG from btrfs_finish_chunk_alloc() like:
3808          * 1. The only SYSTEM bg is marked RO.
3809          *    Since SYSTEM bg is small, that's pretty common.
3810          * 2. A new SYSTEM bg will be allocated
3811          *    because the regular version will allocate a new chunk.
3812          * 3. New SYSTEM bg is empty and will get cleaned up
3813          *    Before cleanup really happens, it's marked RO again.
3814          * 4. The empty SYSTEM bg gets scrubbed
3815          *    We go back to 2.
3816          *
3817          * This can easily boost the number of SYSTEM chunks if the cleaner
3818          * thread can't be triggered fast enough, and use up all the space
3819          * of btrfs_super_block::sys_chunk_array.
3820          *
3821          * While for dev replace, we need to try our best to mark block
3822          * group RO, to prevent a race between:
3823          * - Write duplication
3824          *   Contains latest data
3825          * - Scrub copy
3826          *   Contains data from commit tree
3827          *
3828          * If the target block group is not marked RO, nocow writes can
3829          * be overwritten by scrub copy, causing data corruption.
3830          * So for dev-replace, it's not allowed to continue if a block
3831          * group is not RO.
3832          */
3833         ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3834         if (!ret && sctx->is_dev_replace) {
3835             ret = finish_extent_writes_for_zoned(root, cache);
3836             if (ret) {
3837                 btrfs_dec_block_group_ro(cache);
3838                 scrub_pause_off(fs_info);
3839                 btrfs_put_block_group(cache);
3840                 break;
3841             }
3842         }
3843 
3844         if (ret == 0) {
3845             ro_set = 1;
3846         } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3847             /*
3848              * btrfs_inc_block_group_ro() returns -ENOSPC when it
3849              * fails to create a new chunk for metadata.
3850              * It is not a problem for scrub, because
3851              * metadata is always COWed, and our scrub pauses
3852              * transaction commits.
3853              */
3854             ro_set = 0;
3855         } else if (ret == -ETXTBSY) {
3856             btrfs_warn(fs_info,
3857            "skipping scrub of block group %llu due to active swapfile",
3858                    cache->start);
3859             scrub_pause_off(fs_info);
3860             ret = 0;
3861             goto skip_unfreeze;
3862         } else {
3863             btrfs_warn(fs_info,
3864                    "failed setting block group ro: %d", ret);
3865             btrfs_unfreeze_block_group(cache);
3866             btrfs_put_block_group(cache);
3867             scrub_pause_off(fs_info);
3868             break;
3869         }
3870 
3871         /*
3872          * Now the target block group is marked RO, wait for nocow writes
3873          * to finish before dev-replace.
3874          * COW is fine, as COW never overwrites extents in the commit tree.
3875          */
3876         if (sctx->is_dev_replace) {
3877             btrfs_wait_nocow_writers(cache);
3878             btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3879                     cache->length);
3880         }
3881 
3882         scrub_pause_off(fs_info);
3883         down_write(&dev_replace->rwsem);
3884         dev_replace->cursor_right = found_key.offset + dev_extent_len;
3885         dev_replace->cursor_left = found_key.offset;
3886         dev_replace->item_needs_writeback = 1;
3887         up_write(&dev_replace->rwsem);
3888 
3889         ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
3890                   dev_extent_len);
3891 
3892         /*
3893          * Flush and submit all pending read and write bios, and afterwards
3894          * wait for them.
3895          * Note that in the dev replace case, a read request causes
3896          * write requests that are submitted in the read completion
3897          * worker. Therefore in the current situation, it is required
3898          * that all write requests are flushed, so that all read and
3899          * write requests are really completed when bios_in_flight
3900          * changes to 0.
3901          */
3902         sctx->flush_all_writes = true;
3903         scrub_submit(sctx);
3904         mutex_lock(&sctx->wr_lock);
3905         scrub_wr_submit(sctx);
3906         mutex_unlock(&sctx->wr_lock);
3907 
3908         wait_event(sctx->list_wait,
3909                atomic_read(&sctx->bios_in_flight) == 0);
3910 
3911         scrub_pause_on(fs_info);
3912 
3913         /*
3914          * Must be called before we decrease @scrub_paused.
3915          * Make sure we don't block transaction commit while
3916          * we are waiting for pending workers to finish.
3917          */
3918         wait_event(sctx->list_wait,
3919                atomic_read(&sctx->workers_pending) == 0);
3920         sctx->flush_all_writes = false;
3921 
3922         scrub_pause_off(fs_info);
3923 
3924         if (sctx->is_dev_replace &&
3925             !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3926                               cache, found_key.offset))
3927             ro_set = 0;
3928 
3929         down_write(&dev_replace->rwsem);
3930         dev_replace->cursor_left = dev_replace->cursor_right;
3931         dev_replace->item_needs_writeback = 1;
3932         up_write(&dev_replace->rwsem);
3933 
3934         if (ro_set)
3935             btrfs_dec_block_group_ro(cache);
3936 
3937         /*
3938          * We might have prevented the cleaner kthread from deleting
3939          * this block group if it was already unused because we raced
3940          * and set it to RO mode first. So add it back to the unused
3941          * list, otherwise it might not ever be deleted unless a manual
3942          * balance is triggered or it becomes used and unused again.
3943          */
3944         spin_lock(&cache->lock);
3945         if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3946             cache->used == 0) {
3947             spin_unlock(&cache->lock);
3948             if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3949                 btrfs_discard_queue_work(&fs_info->discard_ctl,
3950                              cache);
3951             else
3952                 btrfs_mark_bg_unused(cache);
3953         } else {
3954             spin_unlock(&cache->lock);
3955         }
3956 skip_unfreeze:
3957         btrfs_unfreeze_block_group(cache);
3958         btrfs_put_block_group(cache);
3959         if (ret)
3960             break;
3961         if (sctx->is_dev_replace &&
3962             atomic64_read(&dev_replace->num_write_errors) > 0) {
3963             ret = -EIO;
3964             break;
3965         }
3966         if (sctx->stat.malloc_errors > 0) {
3967             ret = -ENOMEM;
3968             break;
3969         }
3970 skip:
3971         key.offset = found_key.offset + dev_extent_len;
3972         btrfs_release_path(path);
3973     }
3974 
3975     btrfs_free_path(path);
3976 
3977     return ret;
3978 }
3979 
3980 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3981                        struct btrfs_device *scrub_dev)
3982 {
3983     int i;
3984     u64 bytenr;
3985     u64 gen;
3986     int ret;
3987     struct btrfs_fs_info *fs_info = sctx->fs_info;
3988 
3989     if (BTRFS_FS_ERROR(fs_info))
3990         return -EROFS;
3991 
3992     /* Seed devices of a new filesystem have their own generation. */
3993     if (scrub_dev->fs_devices != fs_info->fs_devices)
3994         gen = scrub_dev->generation;
3995     else
3996         gen = fs_info->last_trans_committed;
3997 
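    /*
     * Check each of the (up to) BTRFS_SUPER_MIRROR_MAX super block copies;
     * they sit at fixed offsets (64KiB, 64MiB, 256GiB), and copies beyond
     * the end of the device are skipped.
     */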
3998     for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3999         bytenr = btrfs_sb_offset(i);
4000         if (bytenr + BTRFS_SUPER_INFO_SIZE >
4001             scrub_dev->commit_total_bytes)
4002             break;
4003         if (!btrfs_check_super_location(scrub_dev, bytenr))
4004             continue;
4005 
4006         ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4007                     scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4008                     NULL, bytenr);
4009         if (ret)
4010             return ret;
4011     }
4012     wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4013 
4014     return 0;
4015 }
4016 
4017 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
4018 {
4019     if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
4020                     &fs_info->scrub_lock)) {
4021         struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
4022         struct workqueue_struct *scrub_wr_comp =
4023                         fs_info->scrub_wr_completion_workers;
4024         struct workqueue_struct *scrub_parity =
4025                         fs_info->scrub_parity_workers;
4026 
4027         fs_info->scrub_workers = NULL;
4028         fs_info->scrub_wr_completion_workers = NULL;
4029         fs_info->scrub_parity_workers = NULL;
4030         mutex_unlock(&fs_info->scrub_lock);
4031 
4032         if (scrub_workers)
4033             destroy_workqueue(scrub_workers);
4034         if (scrub_wr_comp)
4035             destroy_workqueue(scrub_wr_comp);
4036         if (scrub_parity)
4037             destroy_workqueue(scrub_parity);
4038     }
4039 }
4040 
4041 /*
4042  * Get a reference count on fs_info->scrub_workers. Start workers if necessary.
4043  */
4044 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4045                         int is_dev_replace)
4046 {
4047     struct workqueue_struct *scrub_workers = NULL;
4048     struct workqueue_struct *scrub_wr_comp = NULL;
4049     struct workqueue_struct *scrub_parity = NULL;
4050     unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4051     int max_active = fs_info->thread_pool_size;
4052     int ret = -ENOMEM;
4053 
4054     if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
4055         return 0;
4056 
4057     scrub_workers = alloc_workqueue("btrfs-scrub", flags,
4058                     is_dev_replace ? 1 : max_active);
4059     if (!scrub_workers)
4060         goto fail_scrub_workers;
4061 
4062     scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
4063     if (!scrub_wr_comp)
4064         goto fail_scrub_wr_completion_workers;
4065 
4066     scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
4067     if (!scrub_parity)
4068         goto fail_scrub_parity_workers;
4069 
4070     mutex_lock(&fs_info->scrub_lock);
4071     if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4072         ASSERT(fs_info->scrub_workers == NULL &&
4073                fs_info->scrub_wr_completion_workers == NULL &&
4074                fs_info->scrub_parity_workers == NULL);
4075         fs_info->scrub_workers = scrub_workers;
4076         fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4077         fs_info->scrub_parity_workers = scrub_parity;
4078         refcount_set(&fs_info->scrub_workers_refcnt, 1);
4079         mutex_unlock(&fs_info->scrub_lock);
4080         return 0;
4081     }
4082     /* Another thread raced in and created the workers for us */
4083     refcount_inc(&fs_info->scrub_workers_refcnt);
4084     mutex_unlock(&fs_info->scrub_lock);
4085 
4086     ret = 0;
4087     destroy_workqueue(scrub_parity);
4088 fail_scrub_parity_workers:
4089     destroy_workqueue(scrub_wr_comp);
4090 fail_scrub_wr_completion_workers:
4091     destroy_workqueue(scrub_workers);
4092 fail_scrub_workers:
4093     return ret;
4094 }
4095 
4096 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4097             u64 end, struct btrfs_scrub_progress *progress,
4098             int readonly, int is_dev_replace)
4099 {
4100     struct btrfs_dev_lookup_args args = { .devid = devid };
4101     struct scrub_ctx *sctx;
4102     int ret;
4103     struct btrfs_device *dev;
4104     unsigned int nofs_flag;
4105 
4106     if (btrfs_fs_closing(fs_info))
4107         return -EAGAIN;
4108 
4109     if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4110         /*
4111          * in this case scrub is unable to calculate the checksum
4112          * the way scrub is implemented. Do not handle this
4113          * situation at all because it won't ever happen.
4114          */
4115         btrfs_err(fs_info,
4116                "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4117                fs_info->nodesize,
4118                BTRFS_STRIPE_LEN);
4119         return -EINVAL;
4120     }
4121 
4122     if (fs_info->nodesize >
4123         SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
4124         fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
4125         /*
4126          * Would exhaust the array bounds of the sectors member in
4127          * struct scrub_block.
4128          */
4129         btrfs_err(fs_info,
4130 "scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
4131                fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
4132                fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
4133         return -EINVAL;
4134     }
4135 
4136     /* Allocate outside of device_list_mutex */
4137     sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4138     if (IS_ERR(sctx))
4139         return PTR_ERR(sctx);
4140 
4141     ret = scrub_workers_get(fs_info, is_dev_replace);
4142     if (ret)
4143         goto out_free_ctx;
4144 
4145     mutex_lock(&fs_info->fs_devices->device_list_mutex);
4146     dev = btrfs_find_device(fs_info->fs_devices, &args);
4147     if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4148              !is_dev_replace)) {
4149         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4150         ret = -ENODEV;
4151         goto out;
4152     }
4153 
4154     if (!is_dev_replace && !readonly &&
4155         !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4156         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4157         btrfs_err_in_rcu(fs_info,
4158             "scrub on devid %llu: filesystem on %s is not writable",
4159                  devid, rcu_str_deref(dev->name));
4160         ret = -EROFS;
4161         goto out;
4162     }
4163 
4164     mutex_lock(&fs_info->scrub_lock);
4165     if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4166         test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4167         mutex_unlock(&fs_info->scrub_lock);
4168         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4169         ret = -EIO;
4170         goto out;
4171     }
4172 
4173     down_read(&fs_info->dev_replace.rwsem);
4174     if (dev->scrub_ctx ||
4175         (!is_dev_replace &&
4176          btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4177         up_read(&fs_info->dev_replace.rwsem);
4178         mutex_unlock(&fs_info->scrub_lock);
4179         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4180         ret = -EINPROGRESS;
4181         goto out;
4182     }
4183     up_read(&fs_info->dev_replace.rwsem);
4184 
4185     sctx->readonly = readonly;
4186     dev->scrub_ctx = sctx;
4187     mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4188 
4189     /*
4190      * By checking @scrub_pause_req here, we can avoid a
4191      * race between committing a transaction and scrubbing.
4192      */
4193     __scrub_blocked_if_needed(fs_info);
4194     atomic_inc(&fs_info->scrubs_running);
4195     mutex_unlock(&fs_info->scrub_lock);
4196 
4197     /*
4198      * In order to avoid deadlock with reclaim when there is a transaction
4199      * trying to pause scrub, make sure we use GFP_NOFS for all the
4200      * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
4201      * invoked by our callees. The pausing request is done when the
4202      * transaction commit starts, and it blocks the transaction until scrub
4203      * is paused (done at specific points at scrub_stripe() or right above
4204      * before incrementing fs_info->scrubs_running).
4205      */
4206     nofs_flag = memalloc_nofs_save();
4207     if (!is_dev_replace) {
4208         btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4209         /*
4210          * By holding the device list mutex, we can
4211          * kick off writing super in log tree sync.
4212          */
4213         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4214         ret = scrub_supers(sctx, dev);
4215         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4216     }
4217 
4218     if (!ret)
4219         ret = scrub_enumerate_chunks(sctx, dev, start, end);
4220     memalloc_nofs_restore(nofs_flag);
4221 
4222     wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4223     atomic_dec(&fs_info->scrubs_running);
4224     wake_up(&fs_info->scrub_pause_wait);
4225 
4226     wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4227 
4228     if (progress)
4229         memcpy(progress, &sctx->stat, sizeof(*progress));
4230 
4231     if (!is_dev_replace)
4232         btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4233             ret ? "not finished" : "finished", devid, ret);
4234 
4235     mutex_lock(&fs_info->scrub_lock);
4236     dev->scrub_ctx = NULL;
4237     mutex_unlock(&fs_info->scrub_lock);
4238 
4239     scrub_workers_put(fs_info);
4240     scrub_put_ctx(sctx);
4241 
4242     return ret;
4243 out:
4244     scrub_workers_put(fs_info);
4245 out_free_ctx:
4246     scrub_free_ctx(sctx);
4247 
4248     return ret;
4249 }
4250 
4251 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4252 {
4253     mutex_lock(&fs_info->scrub_lock);
4254     atomic_inc(&fs_info->scrub_pause_req);
4255     while (atomic_read(&fs_info->scrubs_paused) !=
4256            atomic_read(&fs_info->scrubs_running)) {
4257         mutex_unlock(&fs_info->scrub_lock);
4258         wait_event(fs_info->scrub_pause_wait,
4259                atomic_read(&fs_info->scrubs_paused) ==
4260                atomic_read(&fs_info->scrubs_running));
4261         mutex_lock(&fs_info->scrub_lock);
4262     }
4263     mutex_unlock(&fs_info->scrub_lock);
4264 }
4265 
4266 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4267 {
4268     atomic_dec(&fs_info->scrub_pause_req);
4269     wake_up(&fs_info->scrub_pause_wait);
4270 }
4271 
4272 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4273 {
4274     mutex_lock(&fs_info->scrub_lock);
4275     if (!atomic_read(&fs_info->scrubs_running)) {
4276         mutex_unlock(&fs_info->scrub_lock);
4277         return -ENOTCONN;
4278     }
4279 
4280     atomic_inc(&fs_info->scrub_cancel_req);
4281     while (atomic_read(&fs_info->scrubs_running)) {
4282         mutex_unlock(&fs_info->scrub_lock);
4283         wait_event(fs_info->scrub_pause_wait,
4284                atomic_read(&fs_info->scrubs_running) == 0);
4285         mutex_lock(&fs_info->scrub_lock);
4286     }
4287     atomic_dec(&fs_info->scrub_cancel_req);
4288     mutex_unlock(&fs_info->scrub_lock);
4289 
4290     return 0;
4291 }
4292 
4293 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4294 {
4295     struct btrfs_fs_info *fs_info = dev->fs_info;
4296     struct scrub_ctx *sctx;
4297 
4298     mutex_lock(&fs_info->scrub_lock);
4299     sctx = dev->scrub_ctx;
4300     if (!sctx) {
4301         mutex_unlock(&fs_info->scrub_lock);
4302         return -ENOTCONN;
4303     }
4304     atomic_inc(&sctx->cancel_req);
4305     while (dev->scrub_ctx) {
4306         mutex_unlock(&fs_info->scrub_lock);
4307         wait_event(fs_info->scrub_pause_wait,
4308                dev->scrub_ctx == NULL);
4309         mutex_lock(&fs_info->scrub_lock);
4310     }
4311     mutex_unlock(&fs_info->scrub_lock);
4312 
4313     return 0;
4314 }
4315 
4316 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4317              struct btrfs_scrub_progress *progress)
4318 {
4319     struct btrfs_dev_lookup_args args = { .devid = devid };
4320     struct btrfs_device *dev;
4321     struct scrub_ctx *sctx = NULL;
4322 
4323     mutex_lock(&fs_info->fs_devices->device_list_mutex);
4324     dev = btrfs_find_device(fs_info->fs_devices, &args);
4325     if (dev)
4326         sctx = dev->scrub_ctx;
4327     if (sctx)
4328         memcpy(progress, &sctx->stat, sizeof(*progress));
4329     mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4330 
4331     return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4332 }
4333 
4334 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
4335                  u64 extent_logical, u32 extent_len,
4336                  u64 *extent_physical,
4337                  struct btrfs_device **extent_dev,
4338                  int *extent_mirror_num)
4339 {
4340     u64 mapped_length;
4341     struct btrfs_io_context *bioc = NULL;
4342     int ret;
4343 
4344     mapped_length = extent_len;
4345     ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4346                   &mapped_length, &bioc, 0);
4347     if (ret || !bioc || mapped_length < extent_len ||
4348         !bioc->stripes[0].dev->bdev) {
4349         btrfs_put_bioc(bioc);
4350         return;
4351     }
4352 
4353     *extent_physical = bioc->stripes[0].physical;
4354     *extent_mirror_num = bioc->mirror_num;
4355     *extent_dev = bioc->stripes[0].dev;
4356     btrfs_put_bioc(bioc);
4357 }