0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Copyright (C) 2012 Fusion-io  All rights reserved.
0004  * Copyright (C) 2012 Intel Corp. All rights reserved.
0005  */
0006 
0007 #include <linux/sched.h>
0008 #include <linux/bio.h>
0009 #include <linux/slab.h>
0010 #include <linux/blkdev.h>
0011 #include <linux/raid/pq.h>
0012 #include <linux/hash.h>
0013 #include <linux/list_sort.h>
0014 #include <linux/raid/xor.h>
0015 #include <linux/mm.h>
0016 #include "misc.h"
0017 #include "ctree.h"
0018 #include "disk-io.h"
0019 #include "volumes.h"
0020 #include "raid56.h"
0021 #include "async-thread.h"
0022 
0023 /* set when additional merges to this rbio are not allowed */
0024 #define RBIO_RMW_LOCKED_BIT 1
0025 
0026 /*
0027  * set when this rbio is sitting in the hash, but it is just a cache
0028  * of past RMW
0029  */
0030 #define RBIO_CACHE_BIT      2
0031 
0032 /*
0033  * set when it is safe to trust the stripe_pages for caching
0034  */
0035 #define RBIO_CACHE_READY_BIT    3
0036 
0037 #define RBIO_CACHE_SIZE 1024
0038 
0039 #define BTRFS_STRIPE_HASH_TABLE_BITS                11
0040 
0041 /* Used by the raid56 code to lock stripes for read/modify/write */
0042 struct btrfs_stripe_hash {
0043     struct list_head hash_list;
0044     spinlock_t lock;
0045 };
0046 
0047 /* Used by the raid56 code to lock stripes for read/modify/write */
0048 struct btrfs_stripe_hash_table {
0049     struct list_head stripe_cache;
0050     spinlock_t cache_lock;
0051     int cache_size;
0052     struct btrfs_stripe_hash table[];
0053 };
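/*
 * The table has 1 << BTRFS_STRIPE_HASH_TABLE_BITS (2048) buckets, each with
 * its own lock and list of in-flight rbios, hashed by the first logical
 * address of their full stripe.  The stripe_cache list, protected by
 * cache_lock, is a separate LRU of up to RBIO_CACHE_SIZE cached rbios.
 */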
0054 
0055 /*
0056  * A bvec like structure to present a sector inside a page.
0057  *
0058  * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
0059  */
0060 struct sector_ptr {
0061     struct page *page;
0062     unsigned int pgoff:24;
0063     unsigned int uptodate:8;
0064 };
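/*
 * pgoff is 24 bits so it can address any offset inside a page even on
 * arches with 64K pages; uptodate only needs a single bit, the remaining
 * bits just pad the bitfield out to a full 32 bits.
 */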
0065 
0066 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
0067 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
0068 static void rmw_work(struct work_struct *work);
0069 static void read_rebuild_work(struct work_struct *work);
0070 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
0071 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
0072 static void __free_raid_bio(struct btrfs_raid_bio *rbio);
0073 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
0074 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
0075 
0076 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
0077                      int need_check);
0078 static void scrub_parity_work(struct work_struct *work);
0079 
0080 static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
0081 {
0082     INIT_WORK(&rbio->work, work_func);
0083     queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
0084 }
0085 
0086 /*
0087  * the stripe hash table is used for locking, and to collect
0088  * bios in hopes of making a full stripe
0089  */
0090 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
0091 {
0092     struct btrfs_stripe_hash_table *table;
0093     struct btrfs_stripe_hash_table *x;
0094     struct btrfs_stripe_hash *cur;
0095     struct btrfs_stripe_hash *h;
0096     int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
0097     int i;
0098 
0099     if (info->stripe_hash_table)
0100         return 0;
0101 
0102     /*
0103      * The table is large, starting with order 4 and can go as high as
0104      * order 7 in case lock debugging is turned on.
0105      *
0106      * Try harder to allocate and fallback to vmalloc to lower the chance
0107      * of a failing mount.
0108      */
0109     table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
0110     if (!table)
0111         return -ENOMEM;
0112 
0113     spin_lock_init(&table->cache_lock);
0114     INIT_LIST_HEAD(&table->stripe_cache);
0115 
0116     h = table->table;
0117 
0118     for (i = 0; i < num_entries; i++) {
0119         cur = h + i;
0120         INIT_LIST_HEAD(&cur->hash_list);
0121         spin_lock_init(&cur->lock);
0122     }
0123 
0124     x = cmpxchg(&info->stripe_hash_table, NULL, table);
0125     kvfree(x);
0126     return 0;
0127 }
0128 
0129 /*
0130  * caching an rbio means to copy anything from the
0131  * bio_sectors array into the stripe_pages array.  We
0132  * use the page uptodate bit in the stripe cache array
0133  * to indicate if it has valid data
0134  *
0135  * once the caching is done, we set the cache ready
0136  * bit.
0137  */
0138 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
0139 {
0140     int i;
0141     int ret;
0142 
0143     ret = alloc_rbio_pages(rbio);
0144     if (ret)
0145         return;
0146 
0147     for (i = 0; i < rbio->nr_sectors; i++) {
0148         /* Some range not covered by bio (partial write), skip it */
0149         if (!rbio->bio_sectors[i].page)
0150             continue;
0151 
0152         ASSERT(rbio->stripe_sectors[i].page);
0153         memcpy_page(rbio->stripe_sectors[i].page,
0154                 rbio->stripe_sectors[i].pgoff,
0155                 rbio->bio_sectors[i].page,
0156                 rbio->bio_sectors[i].pgoff,
0157                 rbio->bioc->fs_info->sectorsize);
0158         rbio->stripe_sectors[i].uptodate = 1;
0159     }
0160     set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
0161 }
0162 
0163 /*
0164  * we hash on the first logical address of the stripe
0165  */
0166 static int rbio_bucket(struct btrfs_raid_bio *rbio)
0167 {
0168     u64 num = rbio->bioc->raid_map[0];
0169 
0170     /*
0171      * we shift down quite a bit.  We're using byte
0172      * addressing, and most of the lower bits are zeros.
0173      * This tends to upset hash_64, and it consistently
0174      * returns just one or two different values.
0175      *
0176      * shifting off the lower bits fixes things.
0177      */
0178     return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
0179 }
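/*
 * For example, with 64K stripes the low 16 bits of raid_map[0] are almost
 * always zero, so hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS) is fed
 * only the distinguishing bits and returns a bucket index in [0, 2047].
 */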
0180 
0181 static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
0182                        unsigned int page_nr)
0183 {
0184     const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
0185     const u32 sectors_per_page = PAGE_SIZE / sectorsize;
0186     int i;
0187 
0188     ASSERT(page_nr < rbio->nr_pages);
0189 
0190     for (i = sectors_per_page * page_nr;
0191          i < sectors_per_page * page_nr + sectors_per_page;
0192          i++) {
0193         if (!rbio->stripe_sectors[i].uptodate)
0194             return false;
0195     }
0196     return true;
0197 }
0198 
0199 /*
0200  * Update the stripe_sectors[] array to use correct page and pgoff
0201  *
0202  * Should be called every time any page pointer in stripe_pages[] is modified.
0203  */
0204 static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
0205 {
0206     const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
0207     u32 offset;
0208     int i;
0209 
0210     for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
0211         int page_index = offset >> PAGE_SHIFT;
0212 
0213         ASSERT(page_index < rbio->nr_pages);
0214         rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
0215         rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
0216     }
0217 }
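/*
 * Sector i lives at byte offset i * sectorsize from the start of the
 * stripe_pages area, e.g. with 4K sectors and 4K pages sector i maps to
 * page i at pgoff 0, while with 64K pages sectors 0-15 all share page 0.
 */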
0218 
0219 static void steal_rbio_page(struct btrfs_raid_bio *src,
0220                 struct btrfs_raid_bio *dest, int page_nr)
0221 {
0222     const u32 sectorsize = src->bioc->fs_info->sectorsize;
0223     const u32 sectors_per_page = PAGE_SIZE / sectorsize;
0224     int i;
0225 
0226     if (dest->stripe_pages[page_nr])
0227         __free_page(dest->stripe_pages[page_nr]);
0228     dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
0229     src->stripe_pages[page_nr] = NULL;
0230 
0231     /* Also update the sector->uptodate bits. */
0232     for (i = sectors_per_page * page_nr;
0233          i < sectors_per_page * page_nr + sectors_per_page; i++)
0234         dest->stripe_sectors[i].uptodate = true;
0235 }
0236 
0237 /*
0238  * Stealing an rbio means taking all the uptodate pages from the stripe array
0239  * in the source rbio and putting them into the destination rbio.
0240  *
0241  * This will also update the involved stripe_sectors[] which are referring to
0242  * the old pages.
0243  */
0244 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
0245 {
0246     int i;
0247     struct page *s;
0248 
0249     if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
0250         return;
0251 
0252     for (i = 0; i < dest->nr_pages; i++) {
0253         s = src->stripe_pages[i];
0254         if (!s || !full_page_sectors_uptodate(src, i))
0255             continue;
0256 
0257         steal_rbio_page(src, dest, i);
0258     }
0259     index_stripe_sectors(dest);
0260     index_stripe_sectors(src);
0261 }
0262 
0263 /*
0264  * merging means we take the bio_list from the victim and
0265  * splice it into the destination.  The victim should
0266  * be discarded afterwards.
0267  *
0268  * must be called with dest->bio_list_lock held
0269  */
0270 static void merge_rbio(struct btrfs_raid_bio *dest,
0271                struct btrfs_raid_bio *victim)
0272 {
0273     bio_list_merge(&dest->bio_list, &victim->bio_list);
0274     dest->bio_list_bytes += victim->bio_list_bytes;
0275     /* Also inherit the bitmaps from @victim. */
0276     bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
0277           dest->stripe_nsectors);
0278     dest->generic_bio_cnt += victim->generic_bio_cnt;
0279     bio_list_init(&victim->bio_list);
0280 }
0281 
0282 /*
0283  * used to prune items that are in the cache.  The caller
0284  * must hold the hash table lock.
0285  */
0286 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
0287 {
0288     int bucket = rbio_bucket(rbio);
0289     struct btrfs_stripe_hash_table *table;
0290     struct btrfs_stripe_hash *h;
0291     int freeit = 0;
0292 
0293     /*
0294      * check the bit again under the hash table lock.
0295      */
0296     if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
0297         return;
0298 
0299     table = rbio->bioc->fs_info->stripe_hash_table;
0300     h = table->table + bucket;
0301 
0302     /* hold the lock for the bucket because we may be
0303      * removing it from the hash table
0304      */
0305     spin_lock(&h->lock);
0306 
0307     /*
0308      * hold the lock for the bio list because we need
0309      * to make sure the bio list is empty
0310      */
0311     spin_lock(&rbio->bio_list_lock);
0312 
0313     if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
0314         list_del_init(&rbio->stripe_cache);
0315         table->cache_size -= 1;
0316         freeit = 1;
0317 
0318         /* if the bio list isn't empty, this rbio is
0319          * still involved in an IO.  We take it out
0320          * of the cache list, and drop the ref that
0321          * was held for the list.
0322          *
0323          * If the bio_list was empty, we also remove
0324          * the rbio from the hash_table, and drop
0325          * the corresponding ref
0326          */
0327         if (bio_list_empty(&rbio->bio_list)) {
0328             if (!list_empty(&rbio->hash_list)) {
0329                 list_del_init(&rbio->hash_list);
0330                 refcount_dec(&rbio->refs);
0331                 BUG_ON(!list_empty(&rbio->plug_list));
0332             }
0333         }
0334     }
0335 
0336     spin_unlock(&rbio->bio_list_lock);
0337     spin_unlock(&h->lock);
0338 
0339     if (freeit)
0340         __free_raid_bio(rbio);
0341 }
0342 
0343 /*
0344  * prune a given rbio from the cache
0345  */
0346 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
0347 {
0348     struct btrfs_stripe_hash_table *table;
0349     unsigned long flags;
0350 
0351     if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
0352         return;
0353 
0354     table = rbio->bioc->fs_info->stripe_hash_table;
0355 
0356     spin_lock_irqsave(&table->cache_lock, flags);
0357     __remove_rbio_from_cache(rbio);
0358     spin_unlock_irqrestore(&table->cache_lock, flags);
0359 }
0360 
0361 /*
0362  * remove everything in the cache
0363  */
0364 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
0365 {
0366     struct btrfs_stripe_hash_table *table;
0367     unsigned long flags;
0368     struct btrfs_raid_bio *rbio;
0369 
0370     table = info->stripe_hash_table;
0371 
0372     spin_lock_irqsave(&table->cache_lock, flags);
0373     while (!list_empty(&table->stripe_cache)) {
0374         rbio = list_entry(table->stripe_cache.next,
0375                   struct btrfs_raid_bio,
0376                   stripe_cache);
0377         __remove_rbio_from_cache(rbio);
0378     }
0379     spin_unlock_irqrestore(&table->cache_lock, flags);
0380 }
0381 
0382 /*
0383  * remove all cached entries and free the hash table
0384  * used by unmount
0385  */
0386 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
0387 {
0388     if (!info->stripe_hash_table)
0389         return;
0390     btrfs_clear_rbio_cache(info);
0391     kvfree(info->stripe_hash_table);
0392     info->stripe_hash_table = NULL;
0393 }
0394 
0395 /*
0396  * insert an rbio into the stripe cache.  It
0397  * must have already been prepared by calling
0398  * cache_rbio_pages
0399  *
0400  * If this rbio was already cached, it gets
0401  * moved to the front of the lru.
0402  *
0403  * If the size of the rbio cache is too big, we
0404  * prune an item.
0405  */
0406 static void cache_rbio(struct btrfs_raid_bio *rbio)
0407 {
0408     struct btrfs_stripe_hash_table *table;
0409     unsigned long flags;
0410 
0411     if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
0412         return;
0413 
0414     table = rbio->bioc->fs_info->stripe_hash_table;
0415 
0416     spin_lock_irqsave(&table->cache_lock, flags);
0417     spin_lock(&rbio->bio_list_lock);
0418 
0419     /* bump our ref if we were not in the list before */
0420     if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
0421         refcount_inc(&rbio->refs);
0422 
0423     if (!list_empty(&rbio->stripe_cache)) {
0424         list_move(&rbio->stripe_cache, &table->stripe_cache);
0425     } else {
0426         list_add(&rbio->stripe_cache, &table->stripe_cache);
0427         table->cache_size += 1;
0428     }
0429 
0430     spin_unlock(&rbio->bio_list_lock);
0431 
0432     if (table->cache_size > RBIO_CACHE_SIZE) {
0433         struct btrfs_raid_bio *found;
0434 
0435         found = list_entry(table->stripe_cache.prev,
0436                   struct btrfs_raid_bio,
0437                   stripe_cache);
0438 
0439         if (found != rbio)
0440             __remove_rbio_from_cache(found);
0441     }
0442 
0443     spin_unlock_irqrestore(&table->cache_lock, flags);
0444 }
0445 
0446 /*
0447  * helper function to run the xor_blocks api.  It is only
0448  * able to do MAX_XOR_BLOCKS at a time, so we need to
0449  * loop through.
0450  */
0451 static void run_xor(void **pages, int src_cnt, ssize_t len)
0452 {
0453     int src_off = 0;
0454     int xor_src_cnt = 0;
0455     void *dest = pages[src_cnt];
0456 
0457     while (src_cnt > 0) {
0458         xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
0459         xor_blocks(xor_src_cnt, len, dest, pages + src_off);
0460 
0461         src_cnt -= xor_src_cnt;
0462         src_off += xor_src_cnt;
0463     }
0464 }
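/*
 * Note the calling convention: pages[0..src_cnt-1] are the sources and
 * pages[src_cnt] is the destination, which the caller has already seeded
 * with the first data block (see the memcpy in finish_rmw()).
 */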
0465 
0466 /*
0467  * Returns true if the bio list inside this rbio covers an entire stripe (no
0468  * rmw required).
0469  */
0470 static int rbio_is_full(struct btrfs_raid_bio *rbio)
0471 {
0472     unsigned long flags;
0473     unsigned long size = rbio->bio_list_bytes;
0474     int ret = 1;
0475 
0476     spin_lock_irqsave(&rbio->bio_list_lock, flags);
0477     if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
0478         ret = 0;
0479     BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
0480     spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
0481 
0482     return ret;
0483 }
0484 
0485 /*
0486  * returns 1 if it is safe to merge two rbios together.
0487  * The merging is safe if the two rbios correspond to
0488  * the same stripe and if they are both going in the same
0489  * direction (read vs write), and if neither one is
0490  * locked for final IO
0491  *
0492  * The caller is responsible for locking such that
0493  * rmw_locked is safe to test
0494  */
0495 static int rbio_can_merge(struct btrfs_raid_bio *last,
0496               struct btrfs_raid_bio *cur)
0497 {
0498     if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
0499         test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
0500         return 0;
0501 
0502     /*
0503      * we can't merge with cached rbios, since the
0504      * idea is that when we merge the destination
0505      * rbio is going to run our IO for us.  We can
0506      * steal from cached rbios though, other functions
0507      * handle that.
0508      */
0509     if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
0510         test_bit(RBIO_CACHE_BIT, &cur->flags))
0511         return 0;
0512 
0513     if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
0514         return 0;
0515 
0516     /* we can't merge with different operations */
0517     if (last->operation != cur->operation)
0518         return 0;
0519     /*
0520      * A parity scrub has already read the full stripe from the drive
0521      * and will check and repair the parity, then write the new results.
0522      *
0523      * We're not allowed to add any new bios to the
0524      * bio list here, anyone else that wants to
0525      * change this stripe needs to do their own rmw.
0526      */
0527     if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
0528         return 0;
0529 
0530     if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
0531         return 0;
0532 
0533     if (last->operation == BTRFS_RBIO_READ_REBUILD) {
0534         int fa = last->faila;
0535         int fb = last->failb;
0536         int cur_fa = cur->faila;
0537         int cur_fb = cur->failb;
0538 
0539         if (last->faila >= last->failb) {
0540             fa = last->failb;
0541             fb = last->faila;
0542         }
0543 
0544         if (cur->faila >= cur->failb) {
0545             cur_fa = cur->failb;
0546             cur_fb = cur->faila;
0547         }
0548 
0549         if (fa != cur_fa || fb != cur_fb)
0550             return 0;
0551     }
0552     return 1;
0553 }
0554 
0555 static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
0556                          unsigned int stripe_nr,
0557                          unsigned int sector_nr)
0558 {
0559     ASSERT(stripe_nr < rbio->real_stripes);
0560     ASSERT(sector_nr < rbio->stripe_nsectors);
0561 
0562     return stripe_nr * rbio->stripe_nsectors + sector_nr;
0563 }
0564 
0565 /* Return a sector from rbio->stripe_sectors, not from the bio list */
0566 static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
0567                          unsigned int stripe_nr,
0568                          unsigned int sector_nr)
0569 {
0570     return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
0571                                   sector_nr)];
0572 }
0573 
0574 /* Grab a sector inside P stripe */
0575 static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
0576                           unsigned int sector_nr)
0577 {
0578     return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
0579 }
0580 
0581 /* Grab a sector inside Q stripe, return NULL if not RAID6 */
0582 static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
0583                           unsigned int sector_nr)
0584 {
0585     if (rbio->nr_data + 1 == rbio->real_stripes)
0586         return NULL;
0587     return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
0588 }
0589 
0590 /*
0591  * The first stripe in the table for a logical address
0592  * has the lock.  rbios are added in one of three ways:
0593  *
0594  * 1) Nobody has the stripe locked yet.  The rbio is given
0595  * the lock and 0 is returned.  The caller must start the IO
0596  * themselves.
0597  *
0598  * 2) Someone has the stripe locked, but we're able to merge
0599  * with the lock owner.  The rbio is freed and the IO will
0600  * start automatically along with the existing rbio.  1 is returned.
0601  *
0602  * 3) Someone has the stripe locked, but we're not able to merge.
0603  * The rbio is added to the lock owner's plug list, or merged into
0604  * an rbio already on the plug list.  When the lock owner unlocks,
0605  * the next rbio on the list is run and the IO is started automatically.
0606  * 1 is returned
0607  *
0608  * If we return 0, the caller still owns the rbio and must continue with
0609  * IO submission.  If we return 1, the caller must assume the rbio has
0610  * already been freed.
0611  */
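/*
 * Typical caller pattern (see full_stripe_write() and partial_stripe_write()
 * below):
 *
 *     if (lock_stripe_add(rbio) == 0)
 *             finish_rmw(rbio);   (or queue rmw_work)
 *
 * and the caller must not touch the rbio again when 1 is returned.
 */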
0612 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
0613 {
0614     struct btrfs_stripe_hash *h;
0615     struct btrfs_raid_bio *cur;
0616     struct btrfs_raid_bio *pending;
0617     unsigned long flags;
0618     struct btrfs_raid_bio *freeit = NULL;
0619     struct btrfs_raid_bio *cache_drop = NULL;
0620     int ret = 0;
0621 
0622     h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
0623 
0624     spin_lock_irqsave(&h->lock, flags);
0625     list_for_each_entry(cur, &h->hash_list, hash_list) {
0626         if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
0627             continue;
0628 
0629         spin_lock(&cur->bio_list_lock);
0630 
0631         /* Can we steal this cached rbio's pages? */
0632         if (bio_list_empty(&cur->bio_list) &&
0633             list_empty(&cur->plug_list) &&
0634             test_bit(RBIO_CACHE_BIT, &cur->flags) &&
0635             !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
0636             list_del_init(&cur->hash_list);
0637             refcount_dec(&cur->refs);
0638 
0639             steal_rbio(cur, rbio);
0640             cache_drop = cur;
0641             spin_unlock(&cur->bio_list_lock);
0642 
0643             goto lockit;
0644         }
0645 
0646         /* Can we merge into the lock owner? */
0647         if (rbio_can_merge(cur, rbio)) {
0648             merge_rbio(cur, rbio);
0649             spin_unlock(&cur->bio_list_lock);
0650             freeit = rbio;
0651             ret = 1;
0652             goto out;
0653         }
0654 
0655 
0656         /*
0657          * We couldn't merge with the running rbio, see if we can merge
0658          * with the pending ones.  We don't have to check for rmw_locked
0659          * because there is no way they are inside finish_rmw right now
0660          */
0661         list_for_each_entry(pending, &cur->plug_list, plug_list) {
0662             if (rbio_can_merge(pending, rbio)) {
0663                 merge_rbio(pending, rbio);
0664                 spin_unlock(&cur->bio_list_lock);
0665                 freeit = rbio;
0666                 ret = 1;
0667                 goto out;
0668             }
0669         }
0670 
0671         /*
0672          * No merging, put us on the tail of the plug list, our rbio
0673          * will be started when the currently running rbio unlocks
0674          */
0675         list_add_tail(&rbio->plug_list, &cur->plug_list);
0676         spin_unlock(&cur->bio_list_lock);
0677         ret = 1;
0678         goto out;
0679     }
0680 lockit:
0681     refcount_inc(&rbio->refs);
0682     list_add(&rbio->hash_list, &h->hash_list);
0683 out:
0684     spin_unlock_irqrestore(&h->lock, flags);
0685     if (cache_drop)
0686         remove_rbio_from_cache(cache_drop);
0687     if (freeit)
0688         __free_raid_bio(freeit);
0689     return ret;
0690 }
0691 
0692 /*
0693  * called as rmw or parity rebuild is completed.  If the plug list has more
0694  * rbios waiting for this stripe, the next one on the list will be started
0695  */
0696 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
0697 {
0698     int bucket;
0699     struct btrfs_stripe_hash *h;
0700     unsigned long flags;
0701     int keep_cache = 0;
0702 
0703     bucket = rbio_bucket(rbio);
0704     h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
0705 
0706     if (list_empty(&rbio->plug_list))
0707         cache_rbio(rbio);
0708 
0709     spin_lock_irqsave(&h->lock, flags);
0710     spin_lock(&rbio->bio_list_lock);
0711 
0712     if (!list_empty(&rbio->hash_list)) {
0713         /*
0714          * if we're still cached and there is no other IO
0715          * to perform, just leave this rbio here for others
0716          * to steal from later
0717          */
0718         if (list_empty(&rbio->plug_list) &&
0719             test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
0720             keep_cache = 1;
0721             clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
0722             BUG_ON(!bio_list_empty(&rbio->bio_list));
0723             goto done;
0724         }
0725 
0726         list_del_init(&rbio->hash_list);
0727         refcount_dec(&rbio->refs);
0728 
0729         /*
0730          * we use the plug list to hold all the rbios
0731          * waiting for the chance to lock this stripe.
0732          * hand the lock over to one of them.
0733          */
0734         if (!list_empty(&rbio->plug_list)) {
0735             struct btrfs_raid_bio *next;
0736             struct list_head *head = rbio->plug_list.next;
0737 
0738             next = list_entry(head, struct btrfs_raid_bio,
0739                       plug_list);
0740 
0741             list_del_init(&rbio->plug_list);
0742 
0743             list_add(&next->hash_list, &h->hash_list);
0744             refcount_inc(&next->refs);
0745             spin_unlock(&rbio->bio_list_lock);
0746             spin_unlock_irqrestore(&h->lock, flags);
0747 
0748             if (next->operation == BTRFS_RBIO_READ_REBUILD)
0749                 start_async_work(next, read_rebuild_work);
0750             else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
0751                 steal_rbio(rbio, next);
0752                 start_async_work(next, read_rebuild_work);
0753             } else if (next->operation == BTRFS_RBIO_WRITE) {
0754                 steal_rbio(rbio, next);
0755                 start_async_work(next, rmw_work);
0756             } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
0757                 steal_rbio(rbio, next);
0758                 start_async_work(next, scrub_parity_work);
0759             }
0760 
0761             goto done_nolock;
0762         }
0763     }
0764 done:
0765     spin_unlock(&rbio->bio_list_lock);
0766     spin_unlock_irqrestore(&h->lock, flags);
0767 
0768 done_nolock:
0769     if (!keep_cache)
0770         remove_rbio_from_cache(rbio);
0771 }
0772 
0773 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
0774 {
0775     int i;
0776 
0777     if (!refcount_dec_and_test(&rbio->refs))
0778         return;
0779 
0780     WARN_ON(!list_empty(&rbio->stripe_cache));
0781     WARN_ON(!list_empty(&rbio->hash_list));
0782     WARN_ON(!bio_list_empty(&rbio->bio_list));
0783 
0784     for (i = 0; i < rbio->nr_pages; i++) {
0785         if (rbio->stripe_pages[i]) {
0786             __free_page(rbio->stripe_pages[i]);
0787             rbio->stripe_pages[i] = NULL;
0788         }
0789     }
0790 
0791     btrfs_put_bioc(rbio->bioc);
0792     kfree(rbio);
0793 }
0794 
0795 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
0796 {
0797     struct bio *next;
0798 
0799     while (cur) {
0800         next = cur->bi_next;
0801         cur->bi_next = NULL;
0802         cur->bi_status = err;
0803         bio_endio(cur);
0804         cur = next;
0805     }
0806 }
0807 
0808 /*
0809  * this frees the rbio and runs through all the bios in the
0810  * bio_list and calls end_io on them
0811  */
0812 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
0813 {
0814     struct bio *cur = bio_list_get(&rbio->bio_list);
0815     struct bio *extra;
0816 
0817     if (rbio->generic_bio_cnt)
0818         btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
0819     /*
0820      * Clear the data bitmap, as the rbio may be cached for later usage.
0821      * Do this before unlock_stripe() so there will be no new bio
0822      * for this rbio.
0823      */
0824     bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
0825 
0826     /*
0827      * At this moment, rbio->bio_list is empty, however since rbio does not
0828      * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
0829      * hash list, rbio may be merged with others so that rbio->bio_list
0830      * becomes non-empty.
0831      * Once unlock_stripe() is done, rbio->bio_list will not be updated any
0832      * more and we can call bio_endio() on all queued bios.
0833      */
0834     unlock_stripe(rbio);
0835     extra = bio_list_get(&rbio->bio_list);
0836     __free_raid_bio(rbio);
0837 
0838     rbio_endio_bio_list(cur, err);
0839     if (extra)
0840         rbio_endio_bio_list(extra, err);
0841 }
0842 
0843 /*
0844  * end io function used by finish_rmw.  When we finally
0845  * get here, we've written a full stripe
0846  */
0847 static void raid_write_end_io(struct bio *bio)
0848 {
0849     struct btrfs_raid_bio *rbio = bio->bi_private;
0850     blk_status_t err = bio->bi_status;
0851     int max_errors;
0852 
0853     if (err)
0854         fail_bio_stripe(rbio, bio);
0855 
0856     bio_put(bio);
0857 
0858     if (!atomic_dec_and_test(&rbio->stripes_pending))
0859         return;
0860 
0861     err = BLK_STS_OK;
0862 
0863     /* OK, we have written all the stripes we need to. */
0864     max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
0865              0 : rbio->bioc->max_errors;
0866     if (atomic_read(&rbio->error) > max_errors)
0867         err = BLK_STS_IOERR;
0868 
0869     rbio_orig_end_io(rbio, err);
0870 }
0871 
0872 /**
0873  * Get a sector pointer specified by its @stripe_nr and @sector_nr
0874  *
0875  * @rbio:               The raid bio
0876  * @stripe_nr:          Stripe number, valid range [0, real_stripes)
0877  * @sector_nr:          Sector number inside the stripe,
0878  *                      valid range [0, stripe_nsectors)
0879  * @bio_list_only:      Whether to use sectors inside the bio list only.
0880  *
0881  * The read/modify/write code wants to reuse the original bio page as much
0882  * as possible, and only use stripe_sectors as fallback.
0883  */
0884 static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
0885                      int stripe_nr, int sector_nr,
0886                      bool bio_list_only)
0887 {
0888     struct sector_ptr *sector;
0889     int index;
0890 
0891     ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
0892     ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
0893 
0894     index = stripe_nr * rbio->stripe_nsectors + sector_nr;
0895     ASSERT(index >= 0 && index < rbio->nr_sectors);
0896 
0897     spin_lock_irq(&rbio->bio_list_lock);
0898     sector = &rbio->bio_sectors[index];
0899     if (sector->page || bio_list_only) {
0900         /* Don't return sector without a valid page pointer */
0901         if (!sector->page)
0902             sector = NULL;
0903         spin_unlock_irq(&rbio->bio_list_lock);
0904         return sector;
0905     }
0906     spin_unlock_irq(&rbio->bio_list_lock);
0907 
0908     return &rbio->stripe_sectors[index];
0909 }
0910 
0911 /*
0912  * allocation and initial setup for the btrfs_raid_bio.  Note that
0913  * this does not allocate any pages for rbio->stripe_pages.
0914  */
0915 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
0916                      struct btrfs_io_context *bioc)
0917 {
0918     const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
0919     const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
0920     const unsigned int num_pages = stripe_npages * real_stripes;
0921     const unsigned int stripe_nsectors =
0922         BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
0923     const unsigned int num_sectors = stripe_nsectors * real_stripes;
0924     struct btrfs_raid_bio *rbio;
0925     void *p;
0926 
0927     /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
0928     ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
0929     /*
0930      * Our current stripe len should be fixed to 64k thus stripe_nsectors
0931      * (at most 16) should be no larger than BITS_PER_LONG.
0932      */
0933     ASSERT(stripe_nsectors <= BITS_PER_LONG);
0934 
0935     rbio = kzalloc(sizeof(*rbio) +
0936                sizeof(*rbio->stripe_pages) * num_pages +
0937                sizeof(*rbio->bio_sectors) * num_sectors +
0938                sizeof(*rbio->stripe_sectors) * num_sectors +
0939                sizeof(*rbio->finish_pointers) * real_stripes,
0940                GFP_NOFS);
0941     if (!rbio)
0942         return ERR_PTR(-ENOMEM);
0943 
0944     bio_list_init(&rbio->bio_list);
0945     INIT_LIST_HEAD(&rbio->plug_list);
0946     spin_lock_init(&rbio->bio_list_lock);
0947     INIT_LIST_HEAD(&rbio->stripe_cache);
0948     INIT_LIST_HEAD(&rbio->hash_list);
0949     rbio->bioc = bioc;
0950     rbio->nr_pages = num_pages;
0951     rbio->nr_sectors = num_sectors;
0952     rbio->real_stripes = real_stripes;
0953     rbio->stripe_npages = stripe_npages;
0954     rbio->stripe_nsectors = stripe_nsectors;
0955     rbio->faila = -1;
0956     rbio->failb = -1;
0957     refcount_set(&rbio->refs, 1);
0958     atomic_set(&rbio->error, 0);
0959     atomic_set(&rbio->stripes_pending, 0);
0960 
0961     /*
0962      * The stripe_pages, bio_sectors, etc arrays point to the extra memory
0963      * we allocated past the end of the rbio.
0964      */
0965     p = rbio + 1;
0966 #define CONSUME_ALLOC(ptr, count)   do {                \
0967         ptr = p;                        \
0968         p = (unsigned char *)p + sizeof(*(ptr)) * (count);  \
0969     } while (0)
0970     CONSUME_ALLOC(rbio->stripe_pages, num_pages);
0971     CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
0972     CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
0973     CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
0974 #undef  CONSUME_ALLOC
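    /*
     * Resulting layout: the rbio struct is immediately followed by the
     * stripe_pages pointer array, then bio_sectors, stripe_sectors and
     * finally finish_pointers, all carved out of the single kzalloc above.
     */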
0975 
0976     ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
0977     rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
0978 
0979     return rbio;
0980 }
0981 
0982 /* allocate pages for all the stripes in the bio, including parity */
0983 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
0984 {
0985     int ret;
0986 
0987     ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
0988     if (ret < 0)
0989         return ret;
0990     /* Mapping all sectors */
0991     index_stripe_sectors(rbio);
0992     return 0;
0993 }
0994 
0995 /* only allocate pages for p/q stripes */
0996 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
0997 {
0998     const int data_pages = rbio->nr_data * rbio->stripe_npages;
0999     int ret;
1000 
1001     ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
1002                      rbio->stripe_pages + data_pages);
1003     if (ret < 0)
1004         return ret;
1005 
1006     index_stripe_sectors(rbio);
1007     return 0;
1008 }
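/*
 * For a full stripe write the data pages come straight from the bios on
 * rbio->bio_list, so only the parity stripes at the tail of stripe_pages
 * need backing pages of their own (see full_stripe_write() below).
 */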
1009 
1010 /*
1011  * Add a single sector @sector into our list of bios for IO.
1012  *
1013  * Return 0 if everything went well.
1014  * Return <0 for error.
1015  */
1016 static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
1017                   struct bio_list *bio_list,
1018                   struct sector_ptr *sector,
1019                   unsigned int stripe_nr,
1020                   unsigned int sector_nr,
1021                   enum req_op op)
1022 {
1023     const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1024     struct bio *last = bio_list->tail;
1025     int ret;
1026     struct bio *bio;
1027     struct btrfs_io_stripe *stripe;
1028     u64 disk_start;
1029 
1030     /*
1031      * Note: here stripe_nr has taken device replace into consideration,
1032      * thus it can be larger than rbio->real_stripes.
1033      * So here we check against bioc->num_stripes, not rbio->real_stripes.
1034      */
1035     ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
1036     ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
1037     ASSERT(sector->page);
1038 
1039     stripe = &rbio->bioc->stripes[stripe_nr];
1040     disk_start = stripe->physical + sector_nr * sectorsize;
1041 
1042     /* if the device is missing, just fail this stripe */
1043     if (!stripe->dev->bdev)
1044         return fail_rbio_index(rbio, stripe_nr);
1045 
1046     /* see if we can add this page onto our existing bio */
1047     if (last) {
1048         u64 last_end = last->bi_iter.bi_sector << 9;
1049         last_end += last->bi_iter.bi_size;
1050 
1051         /*
1052          * we can't merge these if they are from different
1053          * devices or if they are not contiguous
1054          */
1055         if (last_end == disk_start && !last->bi_status &&
1056             last->bi_bdev == stripe->dev->bdev) {
1057             ret = bio_add_page(last, sector->page, sectorsize,
1058                        sector->pgoff);
1059             if (ret == sectorsize)
1060                 return 0;
1061         }
1062     }
1063 
1064     /* put a new bio on the list */
1065     bio = bio_alloc(stripe->dev->bdev,
1066             max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
1067             op, GFP_NOFS);
1068     bio->bi_iter.bi_sector = disk_start >> 9;
1069     bio->bi_private = rbio;
1070 
1071     bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
1072     bio_list_add(bio_list, bio);
1073     return 0;
1074 }
1075 
1076 /*
1077  * while we're doing the read/modify/write cycle, we could
1078  * have errors in reading pages off the disk.  This checks
1079  * for errors and if we're not able to read the page it'll
1080  * trigger parity reconstruction.  The rmw will be finished
1081  * after we've reconstructed the failed stripes
1082  */
1083 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1084 {
1085     if (rbio->faila >= 0 || rbio->failb >= 0) {
1086         BUG_ON(rbio->faila == rbio->real_stripes - 1);
1087         __raid56_parity_recover(rbio);
1088     } else {
1089         finish_rmw(rbio);
1090     }
1091 }
1092 
1093 static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
1094 {
1095     const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1096     struct bio_vec bvec;
1097     struct bvec_iter iter;
1098     u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1099              rbio->bioc->raid_map[0];
1100 
1101     bio_for_each_segment(bvec, bio, iter) {
1102         u32 bvec_offset;
1103 
1104         for (bvec_offset = 0; bvec_offset < bvec.bv_len;
1105              bvec_offset += sectorsize, offset += sectorsize) {
1106             int index = offset / sectorsize;
1107             struct sector_ptr *sector = &rbio->bio_sectors[index];
1108 
1109             sector->page = bvec.bv_page;
1110             sector->pgoff = bvec.bv_offset + bvec_offset;
1111             ASSERT(sector->pgoff < PAGE_SIZE);
1112         }
1113     }
1114 }
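/*
 * The bio_sectors[] index used above is the sector offset from raid_map[0],
 * i.e. the position of the data within the full stripe, so for data stripes
 * it lines up with the stripe_nr * stripe_nsectors + sector_nr indexing used
 * by sector_in_rbio().
 */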
1115 
1116 /*
1117  * helper function to walk our bio list and populate the bio_pages array with
1118  * the result.  This seems expensive, but it is faster than constantly
1119  * searching through the bio list as we set up the IO in finish_rmw or stripe
1120  * reconstruction.
1121  *
1122  * This must be called before you trust the answers from page_in_rbio
1123  */
1124 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1125 {
1126     struct bio *bio;
1127 
1128     spin_lock_irq(&rbio->bio_list_lock);
1129     bio_list_for_each(bio, &rbio->bio_list)
1130         index_one_bio(rbio, bio);
1131 
1132     spin_unlock_irq(&rbio->bio_list_lock);
1133 }
1134 
1135 static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
1136                    struct raid56_bio_trace_info *trace_info)
1137 {
1138     const struct btrfs_io_context *bioc = rbio->bioc;
1139     int i;
1140 
1141     ASSERT(bioc);
1142 
1143     /* We rely on bio->bi_bdev to find the stripe number. */
1144     if (!bio->bi_bdev)
1145         goto not_found;
1146 
1147     for (i = 0; i < bioc->num_stripes; i++) {
1148         if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
1149             continue;
1150         trace_info->stripe_nr = i;
1151         trace_info->devid = bioc->stripes[i].dev->devid;
1152         trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1153                      bioc->stripes[i].physical;
1154         return;
1155     }
1156 
1157 not_found:
1158     trace_info->devid = -1;
1159     trace_info->offset = -1;
1160     trace_info->stripe_nr = -1;
1161 }
1162 
1163 /*
1164  * this is called from one of two situations.  We either
1165  * have a full stripe from the higher layers, or we've read all
1166  * the missing bits off disk.
1167  *
1168  * This will calculate the parity and then send down any
1169  * changed blocks.
1170  */
1171 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1172 {
1173     struct btrfs_io_context *bioc = rbio->bioc;
1174     const u32 sectorsize = bioc->fs_info->sectorsize;
1175     void **pointers = rbio->finish_pointers;
1176     int nr_data = rbio->nr_data;
1177     /* The total sector number inside the full stripe. */
1178     int total_sector_nr;
1179     int stripe;
1180     /* Sector number inside a stripe. */
1181     int sectornr;
1182     bool has_qstripe;
1183     struct bio_list bio_list;
1184     struct bio *bio;
1185     int ret;
1186 
1187     bio_list_init(&bio_list);
1188 
1189     if (rbio->real_stripes - rbio->nr_data == 1)
1190         has_qstripe = false;
1191     else if (rbio->real_stripes - rbio->nr_data == 2)
1192         has_qstripe = true;
1193     else
1194         BUG();
1195 
1196     /* We should have at least one data sector. */
1197     ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
1198 
1199     /* at this point we either have a full stripe,
1200      * or we've read the full stripe from the drive.
1201      * recalculate the parity and write the new results.
1202      *
1203      * We're not allowed to add any new bios to the
1204      * bio list here, anyone else that wants to
1205      * change this stripe needs to do their own rmw.
1206      */
1207     spin_lock_irq(&rbio->bio_list_lock);
1208     set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1209     spin_unlock_irq(&rbio->bio_list_lock);
1210 
1211     atomic_set(&rbio->error, 0);
1212 
1213     /*
1214      * now that we've set rmw_locked, run through the
1215      * bio list one last time and map the page pointers
1216      *
1217      * We don't cache full rbios because we're assuming
1218      * the higher layers are unlikely to use this area of
1219      * the disk again soon.  If they do use it again,
1220      * hopefully they will send another full bio.
1221      */
1222     index_rbio_pages(rbio);
1223     if (!rbio_is_full(rbio))
1224         cache_rbio_pages(rbio);
1225     else
1226         clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1227 
1228     for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1229         struct sector_ptr *sector;
1230 
1231         /* First collect one sector from each data stripe */
1232         for (stripe = 0; stripe < nr_data; stripe++) {
1233             sector = sector_in_rbio(rbio, stripe, sectornr, 0);
1234             pointers[stripe] = kmap_local_page(sector->page) +
1235                        sector->pgoff;
1236         }
1237 
1238         /* Then add the parity stripe */
1239         sector = rbio_pstripe_sector(rbio, sectornr);
1240         sector->uptodate = 1;
1241         pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
1242 
1243         if (has_qstripe) {
1244             /*
1245              * RAID6, add the qstripe and call the library function
1246              * to fill in our p/q
1247              */
1248             sector = rbio_qstripe_sector(rbio, sectornr);
1249             sector->uptodate = 1;
1250             pointers[stripe++] = kmap_local_page(sector->page) +
1251                          sector->pgoff;
1252 
1253             raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
1254                         pointers);
1255         } else {
1256             /* raid5 */
1257             memcpy(pointers[nr_data], pointers[0], sectorsize);
1258             run_xor(pointers + 1, nr_data - 1, sectorsize);
1259         }
1260         for (stripe = stripe - 1; stripe >= 0; stripe--)
1261             kunmap_local(pointers[stripe]);
1262     }
1263 
1264     /*
1265      * Start writing.  Make bios for everything from the higher layers (the
1266      * bio_list in our rbio) and our P/Q.  Ignore everything else.
1267      */
1268     for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1269          total_sector_nr++) {
1270         struct sector_ptr *sector;
1271 
1272         stripe = total_sector_nr / rbio->stripe_nsectors;
1273         sectornr = total_sector_nr % rbio->stripe_nsectors;
1274 
1275         /* This vertical stripe has no data, skip it. */
1276         if (!test_bit(sectornr, &rbio->dbitmap))
1277             continue;
1278 
1279         if (stripe < rbio->nr_data) {
1280             sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1281             if (!sector)
1282                 continue;
1283         } else {
1284             sector = rbio_stripe_sector(rbio, stripe, sectornr);
1285         }
1286 
1287         ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
1288                      sectornr, REQ_OP_WRITE);
1289         if (ret)
1290             goto cleanup;
1291     }
1292 
1293     if (likely(!bioc->num_tgtdevs))
1294         goto write_data;
1295 
1296     for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1297          total_sector_nr++) {
1298         struct sector_ptr *sector;
1299 
1300         stripe = total_sector_nr / rbio->stripe_nsectors;
1301         sectornr = total_sector_nr % rbio->stripe_nsectors;
1302 
1303         if (!bioc->tgtdev_map[stripe]) {
1304             /*
1305              * We can skip the whole stripe completely, note
1306              * total_sector_nr will be increased by one anyway.
1307              */
1308             ASSERT(sectornr == 0);
1309             total_sector_nr += rbio->stripe_nsectors - 1;
1310             continue;
1311         }
1312 
1313         /* This vertical stripe has no data, skip it. */
1314         if (!test_bit(sectornr, &rbio->dbitmap))
1315             continue;
1316 
1317         if (stripe < rbio->nr_data) {
1318             sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1319             if (!sector)
1320                 continue;
1321         } else {
1322             sector = rbio_stripe_sector(rbio, stripe, sectornr);
1323         }
1324 
1325         ret = rbio_add_io_sector(rbio, &bio_list, sector,
1326                      rbio->bioc->tgtdev_map[stripe],
1327                      sectornr, REQ_OP_WRITE);
1328         if (ret)
1329             goto cleanup;
1330     }
1331 
1332 write_data:
1333     atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1334     BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1335 
1336     while ((bio = bio_list_pop(&bio_list))) {
1337         bio->bi_end_io = raid_write_end_io;
1338 
1339         if (trace_raid56_write_stripe_enabled()) {
1340             struct raid56_bio_trace_info trace_info = { 0 };
1341 
1342             bio_get_trace_info(rbio, bio, &trace_info);
1343             trace_raid56_write_stripe(rbio, bio, &trace_info);
1344         }
1345         submit_bio(bio);
1346     }
1347     return;
1348 
1349 cleanup:
1350     rbio_orig_end_io(rbio, BLK_STS_IOERR);
1351 
1352     while ((bio = bio_list_pop(&bio_list)))
1353         bio_put(bio);
1354 }
1355 
1356 /*
1357  * helper to find the stripe number for a given bio.  Used to figure out which
1358  * stripe has failed.  This expects the bio to correspond to a physical disk,
1359  * so it looks up based on physical sector numbers.
1360  */
1361 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1362                struct bio *bio)
1363 {
1364     u64 physical = bio->bi_iter.bi_sector;
1365     int i;
1366     struct btrfs_io_stripe *stripe;
1367 
1368     physical <<= 9;
1369 
1370     for (i = 0; i < rbio->bioc->num_stripes; i++) {
1371         stripe = &rbio->bioc->stripes[i];
1372         if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) &&
1373             stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
1374             return i;
1375         }
1376     }
1377     return -1;
1378 }
1379 
1380 /*
1381  * helper to find the stripe number for a given
1382  * bio (before mapping).  Used to figure out which stripe has
1383  * failed.  This looks up based on logical block numbers.
1384  */
1385 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1386                    struct bio *bio)
1387 {
1388     u64 logical = bio->bi_iter.bi_sector << 9;
1389     int i;
1390 
1391     for (i = 0; i < rbio->nr_data; i++) {
1392         u64 stripe_start = rbio->bioc->raid_map[i];
1393 
1394         if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN))
1395             return i;
1396     }
1397     return -1;
1398 }
1399 
1400 /*
1401  * returns -EIO if we had too many failures
1402  */
1403 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1404 {
1405     unsigned long flags;
1406     int ret = 0;
1407 
1408     spin_lock_irqsave(&rbio->bio_list_lock, flags);
1409 
1410     /* we already know this stripe is bad, move on */
1411     if (rbio->faila == failed || rbio->failb == failed)
1412         goto out;
1413 
1414     if (rbio->faila == -1) {
1415         /* first failure on this rbio */
1416         rbio->faila = failed;
1417         atomic_inc(&rbio->error);
1418     } else if (rbio->failb == -1) {
1419         /* second failure on this rbio */
1420         rbio->failb = failed;
1421         atomic_inc(&rbio->error);
1422     } else {
1423         ret = -EIO;
1424     }
1425 out:
1426     spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1427 
1428     return ret;
1429 }
1430 
1431 /*
1432  * helper to fail a stripe based on a physical disk
1433  * bio.
1434  */
1435 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1436                struct bio *bio)
1437 {
1438     int failed = find_bio_stripe(rbio, bio);
1439 
1440     if (failed < 0)
1441         return -EIO;
1442 
1443     return fail_rbio_index(rbio, failed);
1444 }
1445 
1446 /*
1447  * For the subpage case, we can no longer set page Uptodate directly for
1448  * stripe_pages[], thus we need to locate the sector.
1449  */
1450 static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
1451                          struct page *page,
1452                          unsigned int pgoff)
1453 {
1454     int i;
1455 
1456     for (i = 0; i < rbio->nr_sectors; i++) {
1457         struct sector_ptr *sector = &rbio->stripe_sectors[i];
1458 
1459         if (sector->page == page && sector->pgoff == pgoff)
1460             return sector;
1461     }
1462     return NULL;
1463 }
1464 
1465 /*
1466  * this sets each page in the bio uptodate.  It should only be used on private
1467  * rbio pages, nothing that comes in from the higher layers
1468  */
1469 static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
1470 {
1471     const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1472     struct bio_vec *bvec;
1473     struct bvec_iter_all iter_all;
1474 
1475     ASSERT(!bio_flagged(bio, BIO_CLONED));
1476 
1477     bio_for_each_segment_all(bvec, bio, iter_all) {
1478         struct sector_ptr *sector;
1479         int pgoff;
1480 
1481         for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
1482              pgoff += sectorsize) {
1483             sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
1484             ASSERT(sector);
1485             if (sector)
1486                 sector->uptodate = 1;
1487         }
1488     }
1489 }
1490 
1491 static void raid56_bio_end_io(struct bio *bio)
1492 {
1493     struct btrfs_raid_bio *rbio = bio->bi_private;
1494 
1495     if (bio->bi_status)
1496         fail_bio_stripe(rbio, bio);
1497     else
1498         set_bio_pages_uptodate(rbio, bio);
1499 
1500     bio_put(bio);
1501 
1502     if (atomic_dec_and_test(&rbio->stripes_pending))
1503         queue_work(rbio->bioc->fs_info->endio_raid56_workers,
1504                &rbio->end_io_work);
1505 }
1506 
1507 /*
1508  * End io handler for the read phase of the RMW cycle.  All the bios here are
1509  * physical stripe bios we've read from the disk so we can recalculate the
1510  * parity of the stripe.
1511  *
1512  * This will usually kick off finish_rmw once all the bios are read in, but it
1513  * may trigger parity reconstruction if we had any errors along the way
1514  */
1515 static void raid56_rmw_end_io_work(struct work_struct *work)
1516 {
1517     struct btrfs_raid_bio *rbio =
1518         container_of(work, struct btrfs_raid_bio, end_io_work);
1519 
1520     if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
1521         rbio_orig_end_io(rbio, BLK_STS_IOERR);
1522         return;
1523     }
1524 
1525     /*
1526      * This will normally call finish_rmw to start our write but if there
1527      * are any failed stripes we'll reconstruct from parity first.
1528      */
1529     validate_rbio_for_rmw(rbio);
1530 }
1531 
1532 /*
1533  * the stripe must be locked by the caller.  It will
1534  * unlock after all the writes are done
1535  */
1536 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1537 {
1538     int bios_to_read = 0;
1539     struct bio_list bio_list;
1540     const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data;
1541     int ret;
1542     int total_sector_nr;
1543     struct bio *bio;
1544 
1545     bio_list_init(&bio_list);
1546 
1547     ret = alloc_rbio_pages(rbio);
1548     if (ret)
1549         goto cleanup;
1550 
1551     index_rbio_pages(rbio);
1552 
1553     atomic_set(&rbio->error, 0);
1554     /* Build a list of bios to read all the missing data sectors. */
1555     for (total_sector_nr = 0; total_sector_nr < nr_data_sectors;
1556          total_sector_nr++) {
1557         struct sector_ptr *sector;
1558         int stripe = total_sector_nr / rbio->stripe_nsectors;
1559         int sectornr = total_sector_nr % rbio->stripe_nsectors;
1560 
1561         /*
1562          * We want to find all the sectors missing from the rbio and
1563          * read them from the disk.  If sector_in_rbio() finds a page
1564          * in the bio list we don't need to read it off the stripe.
1565          */
1566         sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1567         if (sector)
1568             continue;
1569 
1570         sector = rbio_stripe_sector(rbio, stripe, sectornr);
1571         /*
1572          * The bio cache may have handed us an uptodate page.  If so,
1573          * use it.
1574          */
1575         if (sector->uptodate)
1576             continue;
1577 
1578         ret = rbio_add_io_sector(rbio, &bio_list, sector,
1579                    stripe, sectornr, REQ_OP_READ);
1580         if (ret)
1581             goto cleanup;
1582     }
1583 
1584     bios_to_read = bio_list_size(&bio_list);
1585     if (!bios_to_read) {
1586         /*
1587          * this can happen if others have merged with
1588          * us; it means there is nothing left to read.
1589          * But if there are missing devices it may not be
1590          * safe to do the full stripe write yet.
1591          */
1592         goto finish;
1593     }
1594 
1595     /*
1596      * The bioc may be freed once we submit the last bio. Make sure not to
1597      * touch it after that.
1598      */
1599     atomic_set(&rbio->stripes_pending, bios_to_read);
1600     INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
1601     while ((bio = bio_list_pop(&bio_list))) {
1602         bio->bi_end_io = raid56_bio_end_io;
1603 
1604         if (trace_raid56_read_partial_enabled()) {
1605             struct raid56_bio_trace_info trace_info = { 0 };
1606 
1607             bio_get_trace_info(rbio, bio, &trace_info);
1608             trace_raid56_read_partial(rbio, bio, &trace_info);
1609         }
1610         submit_bio(bio);
1611     }
1612     /* the actual write will happen once the reads are done */
1613     return 0;
1614 
1615 cleanup:
1616     rbio_orig_end_io(rbio, BLK_STS_IOERR);
1617 
1618     while ((bio = bio_list_pop(&bio_list)))
1619         bio_put(bio);
1620 
1621     return -EIO;
1622 
1623 finish:
1624     validate_rbio_for_rmw(rbio);
1625     return 0;
1626 }
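
/*
 * Illustrative sketch, not part of the original file: how the flat
 * total_sector_nr loop above splits into (stripe, sectornr) pairs.  The
 * stripe_nsectors value below is an assumption for the example (16 sectors
 * per stripe, e.g. 64K stripe length with 4K sectors); the real value
 * comes from the rbio.
 */
static void demo_sector_index_split(void)
{
	const int stripe_nsectors = 16;	/* assumed, illustration only */
	int total_sector_nr;

	for (total_sector_nr = 0; total_sector_nr < 3 * stripe_nsectors;
	     total_sector_nr++) {
		int stripe = total_sector_nr / stripe_nsectors;
		int sectornr = total_sector_nr % stripe_nsectors;

		/*
		 * total_sector_nr  0..15 -> stripe 0, sectornr 0..15
		 * total_sector_nr 16..31 -> stripe 1, sectornr 0..15
		 * total_sector_nr 32..47 -> stripe 2, sectornr 0..15
		 */
		(void)stripe;
		(void)sectornr;
	}
}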
1627 
1628 /*
1629  * if the upper layers pass in a full stripe, we thank them by only allocating
1630  * enough pages to hold the parity, and sending it all down quickly.
1631  */
1632 static int full_stripe_write(struct btrfs_raid_bio *rbio)
1633 {
1634     int ret;
1635 
1636     ret = alloc_rbio_parity_pages(rbio);
1637     if (ret) {
1638         __free_raid_bio(rbio);
1639         return ret;
1640     }
1641 
1642     ret = lock_stripe_add(rbio);
1643     if (ret == 0)
1644         finish_rmw(rbio);
1645     return 0;
1646 }
1647 
1648 /*
1649  * partial stripe writes get handed over to async helpers.
1650  * We're really hoping to merge a few more writes into this
1651  * rbio before calculating new parity
1652  */
1653 static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1654 {
1655     int ret;
1656 
1657     ret = lock_stripe_add(rbio);
1658     if (ret == 0)
1659         start_async_work(rbio, rmw_work);
1660     return 0;
1661 }
1662 
1663 /*
1664  * Sometimes while we were reading from the drive to
1665  * recalculate parity, enough new bios come in to create
1666  * a full stripe.  So we do a check here to see if we can
1667  * go directly to finish_rmw.
1668  */
1669 static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1670 {
1671     /* head off into rmw land if we don't have a full stripe */
1672     if (!rbio_is_full(rbio))
1673         return partial_stripe_write(rbio);
1674     return full_stripe_write(rbio);
1675 }
1676 
1677 /*
1678  * We use plugging callbacks to collect full stripes.
1679  * Any time we get a partial stripe write while plugged
1680  * we collect it into a list.  When the unplug comes down,
1681  * we sort the list by logical block number and merge
1682  * everything we can into the same rbios.
1683  */
1684 struct btrfs_plug_cb {
1685     struct blk_plug_cb cb;
1686     struct btrfs_fs_info *info;
1687     struct list_head rbio_list;
1688     struct work_struct work;
1689 };
1690 
1691 /*
1692  * rbios on the plug list are sorted for easier merging.
1693  */
1694 static int plug_cmp(void *priv, const struct list_head *a,
1695             const struct list_head *b)
1696 {
1697     const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1698                                plug_list);
1699     const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1700                                plug_list);
1701     u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1702     u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1703 
1704     if (a_sector < b_sector)
1705         return -1;
1706     if (a_sector > b_sector)
1707         return 1;
1708     return 0;
1709 }
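
/*
 * Illustrative sketch, not part of the original file: the contract
 * list_sort() expects from plug_cmp() above -- return a negative value to
 * keep @a before @b, a positive value to move it after, and 0 for equal
 * keys.  The helper below applies the same rule to plain start sectors;
 * its name is made up for the example.
 */
static int demo_sector_cmp(unsigned long long a_sector,
			   unsigned long long b_sector)
{
	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}

/*
 * demo_sector_cmp(8, 16) < 0, so an rbio starting at sector 8 sorts ahead
 * of one starting at sector 16.  That ordering is what lets run_plug()
 * below merge adjacent rbios with a single forward pass over the list.
 */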
1710 
1711 static void run_plug(struct btrfs_plug_cb *plug)
1712 {
1713     struct btrfs_raid_bio *cur;
1714     struct btrfs_raid_bio *last = NULL;
1715 
1716     /*
1717      * sort our plug list then try to merge
1718      * everything we can in hopes of creating full
1719      * stripes.
1720      */
1721     list_sort(NULL, &plug->rbio_list, plug_cmp);
1722     while (!list_empty(&plug->rbio_list)) {
1723         cur = list_entry(plug->rbio_list.next,
1724                  struct btrfs_raid_bio, plug_list);
1725         list_del_init(&cur->plug_list);
1726 
1727         if (rbio_is_full(cur)) {
1728             int ret;
1729 
1730             /* we have a full stripe, send it down */
1731             ret = full_stripe_write(cur);
1732             BUG_ON(ret);
1733             continue;
1734         }
1735         if (last) {
1736             if (rbio_can_merge(last, cur)) {
1737                 merge_rbio(last, cur);
1738                 __free_raid_bio(cur);
1739                 continue;
1740 
1741             }
1742             __raid56_parity_write(last);
1743         }
1744         last = cur;
1745     }
1746     if (last) {
1747         __raid56_parity_write(last);
1748     }
1749     kfree(plug);
1750 }
1751 
1752 /*
1753  * if the unplug comes from schedule, we have to push the
1754  * work off to a helper thread
1755  */
1756 static void unplug_work(struct work_struct *work)
1757 {
1758     struct btrfs_plug_cb *plug;
1759     plug = container_of(work, struct btrfs_plug_cb, work);
1760     run_plug(plug);
1761 }
1762 
1763 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1764 {
1765     struct btrfs_plug_cb *plug;
1766     plug = container_of(cb, struct btrfs_plug_cb, cb);
1767 
1768     if (from_schedule) {
1769         INIT_WORK(&plug->work, unplug_work);
1770         queue_work(plug->info->rmw_workers, &plug->work);
1771         return;
1772     }
1773     run_plug(plug);
1774 }
1775 
1776 /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
1777 static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
1778 {
1779     const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1780     const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
1781     const u64 full_stripe_start = rbio->bioc->raid_map[0];
1782     const u32 orig_len = orig_bio->bi_iter.bi_size;
1783     const u32 sectorsize = fs_info->sectorsize;
1784     u64 cur_logical;
1785 
1786     ASSERT(orig_logical >= full_stripe_start &&
1787            orig_logical + orig_len <= full_stripe_start +
1788            rbio->nr_data * BTRFS_STRIPE_LEN);
1789 
1790     bio_list_add(&rbio->bio_list, orig_bio);
1791     rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
1792 
1793     /* Update the dbitmap. */
1794     for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
1795          cur_logical += sectorsize) {
1796         int bit = ((u32)(cur_logical - full_stripe_start) >>
1797                fs_info->sectorsize_bits) % rbio->stripe_nsectors;
1798 
1799         set_bit(bit, &rbio->dbitmap);
1800     }
1801 }
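
/*
 * Illustrative sketch, not part of the original file: the dbitmap bit
 * computed above.  The constants are assumptions for the example (4K
 * sectors, so sectorsize_bits = 12, and 16 sectors per stripe); the real
 * values come from the fs_info and the rbio.
 */
static unsigned int demo_dbitmap_bit(unsigned long long cur_logical,
				     unsigned long long full_stripe_start)
{
	const unsigned int sectorsize_bits = 12;	/* assumed 4K sectors */
	const unsigned int stripe_nsectors = 16;	/* assumed 64K / 4K */

	return (unsigned int)((cur_logical - full_stripe_start) >>
			      sectorsize_bits) % stripe_nsectors;
}

/*
 * With full_stripe_start at 1M:
 *   logical 1M +  4K -> bit 1 (second sector of the first data stripe)
 *   logical 1M + 68K -> bit 1 (same vertical position, second data stripe)
 * so the bitmap tracks vertical positions within the full stripe, not
 * absolute sectors.
 */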
1802 
1803 /*
1804  * our main entry point for writes from the rest of the FS.
1805  */
1806 void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
1807 {
1808     struct btrfs_fs_info *fs_info = bioc->fs_info;
1809     struct btrfs_raid_bio *rbio;
1810     struct btrfs_plug_cb *plug = NULL;
1811     struct blk_plug_cb *cb;
1812     int ret = 0;
1813 
1814     rbio = alloc_rbio(fs_info, bioc);
1815     if (IS_ERR(rbio)) {
1816         btrfs_put_bioc(bioc);
1817         ret = PTR_ERR(rbio);
1818         goto out_dec_counter;
1819     }
1820     rbio->operation = BTRFS_RBIO_WRITE;
1821     rbio_add_bio(rbio, bio);
1822 
1823     rbio->generic_bio_cnt = 1;
1824 
1825     /*
1826      * don't plug on full rbios, just get them out the door
1827      * as quickly as we can
1828      */
1829     if (rbio_is_full(rbio)) {
1830         ret = full_stripe_write(rbio);
1831         if (ret)
1832             goto out_dec_counter;
1833         return;
1834     }
1835 
1836     cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
1837     if (cb) {
1838         plug = container_of(cb, struct btrfs_plug_cb, cb);
1839         if (!plug->info) {
1840             plug->info = fs_info;
1841             INIT_LIST_HEAD(&plug->rbio_list);
1842         }
1843         list_add_tail(&rbio->plug_list, &plug->rbio_list);
1844     } else {
1845         ret = __raid56_parity_write(rbio);
1846         if (ret)
1847             goto out_dec_counter;
1848     }
1849 
1850     return;
1851 
1852 out_dec_counter:
1853     btrfs_bio_counter_dec(fs_info);
1854     bio->bi_status = errno_to_blk_status(ret);
1855     bio_endio(bio);
1856 }
1857 
1858 /*
1859  * all parity reconstruction happens here.  We've read in everything
1860  * we can find from the drives and this does the heavy lifting of
1861  * sorting the good from the bad.
1862  */
1863 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1864 {
1865     const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1866     int sectornr, stripe;
1867     void **pointers;
1868     void **unmap_array;
1869     int faila = -1, failb = -1;
1870     blk_status_t err;
1871     int i;
1872 
1873     /*
1874      * This array stores a pointer for each sector; each pointer already
1875      * has the sector's pgoff added to it.
1876      */
1877     pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1878     if (!pointers) {
1879         err = BLK_STS_RESOURCE;
1880         goto cleanup_io;
1881     }
1882 
1883     /*
1884      * Store copy of pointers that does not get reordered during
1885      * reconstruction so that kunmap_local works.
1886      */
1887     unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1888     if (!unmap_array) {
1889         err = BLK_STS_RESOURCE;
1890         goto cleanup_pointers;
1891     }
1892 
1893     faila = rbio->faila;
1894     failb = rbio->failb;
1895 
1896     if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1897         rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1898         spin_lock_irq(&rbio->bio_list_lock);
1899         set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1900         spin_unlock_irq(&rbio->bio_list_lock);
1901     }
1902 
1903     index_rbio_pages(rbio);
1904 
1905     for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1906         struct sector_ptr *sector;
1907 
1908         /*
1909          * Now we just use the dbitmap to mark the horizontal stripes in
1910          * which we have data when doing a parity scrub.
1911          */
1912         if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1913             !test_bit(sectornr, &rbio->dbitmap))
1914             continue;
1915 
1916         /*
1917          * Set up our array of pointers with sectors from each stripe.
1918          *
1919          * NOTE: store a duplicate array of pointers to preserve the
1920          * pointer order.
1921          */
1922         for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1923             /*
1924              * If we're rebuilding a read, we have to use
1925              * pages from the bio list
1926              */
1927             if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1928                  rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
1929                 (stripe == faila || stripe == failb)) {
1930                 sector = sector_in_rbio(rbio, stripe, sectornr, 0);
1931             } else {
1932                 sector = rbio_stripe_sector(rbio, stripe, sectornr);
1933             }
1934             ASSERT(sector->page);
1935             pointers[stripe] = kmap_local_page(sector->page) +
1936                        sector->pgoff;
1937             unmap_array[stripe] = pointers[stripe];
1938         }
1939 
1940         /* All raid6 handling here */
1941         if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1942             /* Single failure, rebuild from parity raid5 style */
1943             if (failb < 0) {
1944                 if (faila == rbio->nr_data) {
1945                     /*
1946                      * Just the P stripe has failed, without
1947                      * a bad data or Q stripe.
1948                      * TODO, we should redo the xor here.
1949                      */
1950                     err = BLK_STS_IOERR;
1951                     goto cleanup;
1952                 }
1953                 /*
1954                  * a single failure in raid6 is rebuilt
1955                  * in the pstripe code below
1956                  */
1957                 goto pstripe;
1958             }
1959 
1960             /* make sure our ps and qs are in order */
1961             if (faila > failb)
1962                 swap(faila, failb);
1963 
1964             /* If the Q stripe has failed, do a pstripe reconstruction
1965              * from the xors.
1966              * If both the Q stripe and the P stripe have failed, we're
1967              * here due to a crc mismatch and we can't give them the
1968              * data they want.
1969              */
1970             if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
1971                 if (rbio->bioc->raid_map[faila] ==
1972                     RAID5_P_STRIPE) {
1973                     err = BLK_STS_IOERR;
1974                     goto cleanup;
1975                 }
1976                 /*
1977                  * otherwise we have one bad data stripe and
1978                  * a good P stripe.  raid5!
1979                  */
1980                 goto pstripe;
1981             }
1982 
1983             if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
1984                 raid6_datap_recov(rbio->real_stripes,
1985                           sectorsize, faila, pointers);
1986             } else {
1987                 raid6_2data_recov(rbio->real_stripes,
1988                           sectorsize, faila, failb,
1989                           pointers);
1990             }
1991         } else {
1992             void *p;
1993 
1994             /* rebuild from P stripe here (raid5 or raid6) */
1995             BUG_ON(failb != -1);
1996 pstripe:
1997             /* Copy parity block into failed block to start with */
1998             memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
1999 
2000             /* rearrange the pointer array */
2001             p = pointers[faila];
2002             for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
2003                 pointers[stripe] = pointers[stripe + 1];
2004             pointers[rbio->nr_data - 1] = p;
2005 
2006             /* xor in the rest */
2007             run_xor(pointers, rbio->nr_data - 1, sectorsize);
2008         }
2009         /* if we're doing this rebuild as part of an rmw, go through
2010          * and set all of our private rbio pages in the
2011          * failed stripes as uptodate.  This way finish_rmw will
2012          * know they can be trusted.  If this was a read reconstruction,
2013          * other endio functions will fiddle the uptodate bits
2014          */
2015         if (rbio->operation == BTRFS_RBIO_WRITE) {
2016             for (i = 0;  i < rbio->stripe_nsectors; i++) {
2017                 if (faila != -1) {
2018                     sector = rbio_stripe_sector(rbio, faila, i);
2019                     sector->uptodate = 1;
2020                 }
2021                 if (failb != -1) {
2022                     sector = rbio_stripe_sector(rbio, failb, i);
2023                     sector->uptodate = 1;
2024                 }
2025             }
2026         }
2027         for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
2028             kunmap_local(unmap_array[stripe]);
2029     }
2030 
2031     err = BLK_STS_OK;
2032 cleanup:
2033     kfree(unmap_array);
2034 cleanup_pointers:
2035     kfree(pointers);
2036 
2037 cleanup_io:
2038     /*
2039      * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
2040      * valid rbio which is consistent with ondisk content, thus such a
2041      * valid rbio can be cached to avoid further disk reads.
2042      */
2043     if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2044         rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
2045         /*
2046          * - In case of two failures, where rbio->failb != -1:
2047          *
2048          *   Do not cache this rbio since the above read reconstruction
2049          *   (raid6_datap_recov() or raid6_2data_recov()) may have
2050          *   changed some content of stripes which are not identical to
2051          *   on-disk content any more, otherwise, a later write/recover
2052          *   may steal stripe_pages from this rbio and end up with
2053          *   corruptions or rebuild failures.
2054          *
2055          * - In case of single failure, where rbio->failb == -1:
2056          *
2057          *   Cache this rbio iff the above read reconstruction is
2058          *   executed without problems.
2059          */
2060         if (err == BLK_STS_OK && rbio->failb < 0)
2061             cache_rbio_pages(rbio);
2062         else
2063             clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2064 
2065         rbio_orig_end_io(rbio, err);
2066     } else if (err == BLK_STS_OK) {
2067         rbio->faila = -1;
2068         rbio->failb = -1;
2069 
2070         if (rbio->operation == BTRFS_RBIO_WRITE)
2071             finish_rmw(rbio);
2072         else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
2073             finish_parity_scrub(rbio, 0);
2074         else
2075             BUG();
2076     } else {
2077         rbio_orig_end_io(rbio, err);
2078     }
2079 }
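
/*
 * Illustrative sketch, not part of the original file: the pstripe path
 * above in miniature.  With RAID5-style parity P = D0 ^ D1 ^ ... ^ Dn-1,
 * a single missing data block is the XOR of the parity with the surviving
 * data blocks.  The interface here is made up for the example; the real
 * code works on sectorsize chunks through the pointers[] array.
 */
static void demo_raid5_rebuild(unsigned char *blocks[], int nr_data,
			       const unsigned char *parity, int faila,
			       int len)
{
	int i, stripe;

	for (i = 0; i < len; i++) {
		unsigned char v = parity[i];

		for (stripe = 0; stripe < nr_data; stripe++) {
			if (stripe == faila)
				continue;
			v ^= blocks[stripe][i];
		}
		blocks[faila][i] = v;	/* the rebuilt data byte */
	}
}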
2080 
2081 /*
2082  * This is called only for stripes we've read from disk to reconstruct the
2083  * parity.
2084  */
2085 static void raid_recover_end_io_work(struct work_struct *work)
2086 {
2087     struct btrfs_raid_bio *rbio =
2088         container_of(work, struct btrfs_raid_bio, end_io_work);
2089 
2090     if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
2091         rbio_orig_end_io(rbio, BLK_STS_IOERR);
2092     else
2093         __raid_recover_end_io(rbio);
2094 }
2095 
2096 /*
2097  * reads everything we need off the disk to reconstruct
2098  * the parity. endio handlers trigger final reconstruction
2099  * when the IO is done.
2100  *
2101  * This is used both for reads from the higher layers and for
2102  * parity construction required to finish a rmw cycle.
2103  */
2104 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2105 {
2106     int bios_to_read = 0;
2107     struct bio_list bio_list;
2108     int ret;
2109     int total_sector_nr;
2110     struct bio *bio;
2111 
2112     bio_list_init(&bio_list);
2113 
2114     ret = alloc_rbio_pages(rbio);
2115     if (ret)
2116         goto cleanup;
2117 
2118     atomic_set(&rbio->error, 0);
2119 
2120     /*
2121      * Read everything that hasn't failed. However, this time we will
2122      * not trust any cached sector: the cache may hold stale data for
2123      * parts of the stripe that the higher layer is not reading, and a
2124      * stale sector would corrupt the reconstruction.
2125      *
2126      * So in the recovery path we always re-read everything.
2127      */
2128     for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2129          total_sector_nr++) {
2130         int stripe = total_sector_nr / rbio->stripe_nsectors;
2131         int sectornr = total_sector_nr % rbio->stripe_nsectors;
2132         struct sector_ptr *sector;
2133 
2134         if (rbio->faila == stripe || rbio->failb == stripe) {
2135             atomic_inc(&rbio->error);
2136             /* Skip the current stripe. */
2137             ASSERT(sectornr == 0);
2138             total_sector_nr += rbio->stripe_nsectors - 1;
2139             continue;
2140         }
2141         sector = rbio_stripe_sector(rbio, stripe, sectornr);
2142         ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
2143                      sectornr, REQ_OP_READ);
2144         if (ret < 0)
2145             goto cleanup;
2146     }
2147 
2148     bios_to_read = bio_list_size(&bio_list);
2149     if (!bios_to_read) {
2150         /*
2151          * we might have no bios to read just because the pages
2152          * were up to date, or we might have no bios to read because
2153          * the devices were gone.
2154          */
2155         if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
2156             __raid_recover_end_io(rbio);
2157             return 0;
2158         } else {
2159             goto cleanup;
2160         }
2161     }
2162 
2163     /*
2164      * The bioc may be freed once we submit the last bio. Make sure not to
2165      * touch it after that.
2166      */
2167     atomic_set(&rbio->stripes_pending, bios_to_read);
2168     INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
2169     while ((bio = bio_list_pop(&bio_list))) {
2170         bio->bi_end_io = raid56_bio_end_io;
2171 
2172         if (trace_raid56_scrub_read_recover_enabled()) {
2173             struct raid56_bio_trace_info trace_info = { 0 };
2174 
2175             bio_get_trace_info(rbio, bio, &trace_info);
2176             trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
2177         }
2178         submit_bio(bio);
2179     }
2180 
2181     return 0;
2182 
2183 cleanup:
2184     if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2185         rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
2186         rbio_orig_end_io(rbio, BLK_STS_IOERR);
2187 
2188     while ((bio = bio_list_pop(&bio_list)))
2189         bio_put(bio);
2190 
2191     return -EIO;
2192 }
2193 
2194 /*
2195  * the main entry point for reads from the higher layers.  This
2196  * is really only called when the normal read path had a failure,
2197  * so we assume the bio they send down corresponds to a failed part
2198  * of the drive.
2199  */
2200 void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2201                int mirror_num, bool generic_io)
2202 {
2203     struct btrfs_fs_info *fs_info = bioc->fs_info;
2204     struct btrfs_raid_bio *rbio;
2205 
2206     if (generic_io) {
2207         ASSERT(bioc->mirror_num == mirror_num);
2208         btrfs_bio(bio)->mirror_num = mirror_num;
2209     } else {
2210         btrfs_get_bioc(bioc);
2211     }
2212 
2213     rbio = alloc_rbio(fs_info, bioc);
2214     if (IS_ERR(rbio)) {
2215         bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
2216         goto out_end_bio;
2217     }
2218 
2219     rbio->operation = BTRFS_RBIO_READ_REBUILD;
2220     rbio_add_bio(rbio, bio);
2221 
2222     rbio->faila = find_logical_bio_stripe(rbio, bio);
2223     if (rbio->faila == -1) {
2224         btrfs_warn(fs_info,
2225 "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
2226                __func__, bio->bi_iter.bi_sector << 9,
2227                (u64)bio->bi_iter.bi_size, bioc->map_type);
2228         kfree(rbio);
2229         bio->bi_status = BLK_STS_IOERR;
2230         goto out_end_bio;
2231     }
2232 
2233     if (generic_io)
2234         rbio->generic_bio_cnt = 1;
2235 
2236     /*
2237      * Loop retry:
2238      * for 'mirror_num == 2', reconstruct from all other stripes.
2239      * for 'mirror_num > 2', select a stripe to fail on every retry.
2240      */
2241     if (mirror_num > 2) {
2242         /*
2243          * 'mirror_num == 3' is to fail the P stripe and
2244          * reconstruct from the Q stripe.  'mirror_num > 3' is to
2245          * fail a data stripe and reconstruct from the P+Q stripes.
2246          */
2247         rbio->failb = rbio->real_stripes - (mirror_num - 1);
2248         ASSERT(rbio->failb > 0);
2249         if (rbio->failb <= rbio->faila)
2250             rbio->failb--;
2251     }
2252 
2253     if (lock_stripe_add(rbio))
2254         return;
2255 
2256     /*
2257      * We got the stripe lock (otherwise lock_stripe_add() queued the rbio
2258      * behind the current lock owner), so start the recovery reads now.
2259      */
2260     __raid56_parity_recover(rbio);
2261     return;
2262 
2263 out_end_bio:
2264     btrfs_bio_counter_dec(fs_info);
2265     btrfs_put_bioc(bioc);
2266     bio_endio(bio);
2267 }
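
/*
 * Illustrative sketch, not part of the original file: the mirror_num ->
 * failb mapping used above, for a hypothetical RAID6 layout with
 * real_stripes = 4 (data stripes 0 and 1, P at index 2, Q at index 3).
 */
static int demo_extra_failed_stripe(int mirror_num, int real_stripes,
				    int faila)
{
	int failb;

	if (mirror_num <= 2)
		return -1;	/* no stripe is failed on purpose */

	failb = real_stripes - (mirror_num - 1);
	if (failb <= faila)	/* skip over the already-failed stripe */
		failb--;
	return failb;
}

/*
 * With real_stripes = 4 and faila = 0:
 *   mirror_num 3 -> failb 2 (the P stripe), so we rebuild from Q
 *   mirror_num 4 -> failb 1 (the other data stripe), rebuild from P+Q
 */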
2268 
2269 static void rmw_work(struct work_struct *work)
2270 {
2271     struct btrfs_raid_bio *rbio;
2272 
2273     rbio = container_of(work, struct btrfs_raid_bio, work);
2274     raid56_rmw_stripe(rbio);
2275 }
2276 
2277 static void read_rebuild_work(struct work_struct *work)
2278 {
2279     struct btrfs_raid_bio *rbio;
2280 
2281     rbio = container_of(work, struct btrfs_raid_bio, work);
2282     __raid56_parity_recover(rbio);
2283 }
2284 
2285 /*
2286  * The following code is used to scrub/replace the parity stripe.
2287  *
2288  * The caller must have already increased bio_counter when getting @bioc.
2289  *
2290  * Note: we need to make sure all the pages added to the scrub/replace
2291  * raid bio are correct and do not change during the scrub/replace; that
2292  * is, those pages only hold metadata or file data with checksums.
2293  */
2294 
2295 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2296                 struct btrfs_io_context *bioc,
2297                 struct btrfs_device *scrub_dev,
2298                 unsigned long *dbitmap, int stripe_nsectors)
2299 {
2300     struct btrfs_fs_info *fs_info = bioc->fs_info;
2301     struct btrfs_raid_bio *rbio;
2302     int i;
2303 
2304     rbio = alloc_rbio(fs_info, bioc);
2305     if (IS_ERR(rbio))
2306         return NULL;
2307     bio_list_add(&rbio->bio_list, bio);
2308     /*
2309      * This is a special bio which is used to hold the completion handler
2310      * and make the scrub rbio similar to the other types.
2311      */
2312     ASSERT(!bio->bi_iter.bi_size);
2313     rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2314 
2315     /*
2316      * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
2317      * to the end position, so this search can start from the first parity
2318      * stripe.
2319      */
2320     for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2321         if (bioc->stripes[i].dev == scrub_dev) {
2322             rbio->scrubp = i;
2323             break;
2324         }
2325     }
2326     ASSERT(i < rbio->real_stripes);
2327 
2328     bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
2329 
2330     /*
2331      * We have already increased bio_counter when getting bioc, record it
2332      * so we can free it at rbio_orig_end_io().
2333      */
2334     rbio->generic_bio_cnt = 1;
2335 
2336     return rbio;
2337 }
2338 
2339 /* Used for both parity scrub and missing device rebuild. */
2340 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2341                 unsigned int pgoff, u64 logical)
2342 {
2343     const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2344     int stripe_offset;
2345     int index;
2346 
2347     ASSERT(logical >= rbio->bioc->raid_map[0]);
2348     ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
2349                        BTRFS_STRIPE_LEN * rbio->nr_data);
2350     stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
2351     index = stripe_offset / sectorsize;
2352     rbio->bio_sectors[index].page = page;
2353     rbio->bio_sectors[index].pgoff = pgoff;
2354 }
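
/*
 * Illustrative sketch, not part of the original file: the bio_sectors[]
 * index computed above.  Each sectorsize chunk of the full stripe's data
 * area maps to one slot; the sectorsize is an assumption for the example.
 */
static int demo_scrub_sector_index(unsigned long long logical,
				   unsigned long long full_stripe_start)
{
	const unsigned int sectorsize = 4096;	/* assumed 4K sectors */

	return (int)((logical - full_stripe_start) / sectorsize);
}

/*
 * With full_stripe_start at 1M, logical 1M + 12K gives index 3, i.e. the
 * fourth sector slot of the full stripe.
 */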
2355 
2356 /*
2357  * We only scrub the parity for horizontal stripes where we have correct
2358  * data, so we needn't allocate pages for all the stripes.
2359  */
2360 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2361 {
2362     const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2363     int total_sector_nr;
2364 
2365     for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2366          total_sector_nr++) {
2367         struct page *page;
2368         int sectornr = total_sector_nr % rbio->stripe_nsectors;
2369         int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
2370 
2371         if (!test_bit(sectornr, &rbio->dbitmap))
2372             continue;
2373         if (rbio->stripe_pages[index])
2374             continue;
2375         page = alloc_page(GFP_NOFS);
2376         if (!page)
2377             return -ENOMEM;
2378         rbio->stripe_pages[index] = page;
2379     }
2380     index_stripe_sectors(rbio);
2381     return 0;
2382 }
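
/*
 * Illustrative sketch, not part of the original file: the stripe_pages[]
 * index computed above.  Several sectors share one page when the page size
 * exceeds the sector size; the constants are assumptions for the example
 * (4K sectors on a system with 64K pages, i.e. PAGE_SHIFT = 16).
 */
static int demo_stripe_page_index(int total_sector_nr)
{
	const unsigned long long sectorsize = 4096;	/* assumed */
	const unsigned int page_shift = 16;		/* assumed 64K pages */

	return (int)((total_sector_nr * sectorsize) >> page_shift);
}

/*
 * Sectors 0..15 all land in stripe_pages[0] and sectors 16..31 in
 * stripe_pages[1], which is why the loop above skips allocation when the
 * page already exists.
 */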
2383 
2384 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2385                      int need_check)
2386 {
2387     struct btrfs_io_context *bioc = rbio->bioc;
2388     const u32 sectorsize = bioc->fs_info->sectorsize;
2389     void **pointers = rbio->finish_pointers;
2390     unsigned long *pbitmap = &rbio->finish_pbitmap;
2391     int nr_data = rbio->nr_data;
2392     int stripe;
2393     int sectornr;
2394     bool has_qstripe;
2395     struct sector_ptr p_sector = { 0 };
2396     struct sector_ptr q_sector = { 0 };
2397     struct bio_list bio_list;
2398     struct bio *bio;
2399     int is_replace = 0;
2400     int ret;
2401 
2402     bio_list_init(&bio_list);
2403 
2404     if (rbio->real_stripes - rbio->nr_data == 1)
2405         has_qstripe = false;
2406     else if (rbio->real_stripes - rbio->nr_data == 2)
2407         has_qstripe = true;
2408     else
2409         BUG();
2410 
2411     if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
2412         is_replace = 1;
2413         bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
2414     }
2415 
2416     /*
2417      * The higher layers (the scrubber) are unlikely to
2418      * use this area of the disk again soon, so don't
2419      * cache it.
2420      */
2421     clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2422 
2423     if (!need_check)
2424         goto writeback;
2425 
2426     p_sector.page = alloc_page(GFP_NOFS);
2427     if (!p_sector.page)
2428         goto cleanup;
2429     p_sector.pgoff = 0;
2430     p_sector.uptodate = 1;
2431 
2432     if (has_qstripe) {
2433         /* RAID6, allocate and map temp space for the Q stripe */
2434         q_sector.page = alloc_page(GFP_NOFS);
2435         if (!q_sector.page) {
2436             __free_page(p_sector.page);
2437             p_sector.page = NULL;
2438             goto cleanup;
2439         }
2440         q_sector.pgoff = 0;
2441         q_sector.uptodate = 1;
2442         pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
2443     }
2444 
2445     atomic_set(&rbio->error, 0);
2446 
2447     /* Map the parity stripe just once */
2448     pointers[nr_data] = kmap_local_page(p_sector.page);
2449 
2450     for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2451         struct sector_ptr *sector;
2452         void *parity;
2453 
2454         /* First collect one sector from each data stripe */
2455         for (stripe = 0; stripe < nr_data; stripe++) {
2456             sector = sector_in_rbio(rbio, stripe, sectornr, 0);
2457             pointers[stripe] = kmap_local_page(sector->page) +
2458                        sector->pgoff;
2459         }
2460 
2461         if (has_qstripe) {
2462             /* RAID6, call the library function to fill in our P/Q */
2463             raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
2464                         pointers);
2465         } else {
2466             /* raid5 */
2467             memcpy(pointers[nr_data], pointers[0], sectorsize);
2468             run_xor(pointers + 1, nr_data - 1, sectorsize);
2469         }
2470 
2471         /* Check scrubbing parity and repair it */
2472         sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2473         parity = kmap_local_page(sector->page) + sector->pgoff;
2474         if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
2475             memcpy(parity, pointers[rbio->scrubp], sectorsize);
2476         else
2477             /* Parity is right, no need to write it back */
2478             bitmap_clear(&rbio->dbitmap, sectornr, 1);
2479         kunmap_local(parity);
2480 
2481         for (stripe = nr_data - 1; stripe >= 0; stripe--)
2482             kunmap_local(pointers[stripe]);
2483     }
2484 
2485     kunmap_local(pointers[nr_data]);
2486     __free_page(p_sector.page);
2487     p_sector.page = NULL;
2488     if (q_sector.page) {
2489         kunmap_local(pointers[rbio->real_stripes - 1]);
2490         __free_page(q_sector.page);
2491         q_sector.page = NULL;
2492     }
2493 
2494 writeback:
2495     /*
2496      * Time to start writing.  Make bios only for the scrub target's
2497      * parity sectors that still need writing (tracked in the dbitmap),
2498      * and for the replace target if this is a dev-replace.
2499      */
2500     for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2501         struct sector_ptr *sector;
2502 
2503         sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2504         ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
2505                      sectornr, REQ_OP_WRITE);
2506         if (ret)
2507             goto cleanup;
2508     }
2509 
2510     if (!is_replace)
2511         goto submit_write;
2512 
2513     for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2514         struct sector_ptr *sector;
2515 
2516         sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2517         ret = rbio_add_io_sector(rbio, &bio_list, sector,
2518                        bioc->tgtdev_map[rbio->scrubp],
2519                        sectornr, REQ_OP_WRITE);
2520         if (ret)
2521             goto cleanup;
2522     }
2523 
2524 submit_write:
2525     nr_data = bio_list_size(&bio_list);
2526     if (!nr_data) {
2527         /* Every parity is right */
2528         rbio_orig_end_io(rbio, BLK_STS_OK);
2529         return;
2530     }
2531 
2532     atomic_set(&rbio->stripes_pending, nr_data);
2533 
2534     while ((bio = bio_list_pop(&bio_list))) {
2535         bio->bi_end_io = raid_write_end_io;
2536 
2537         if (trace_raid56_scrub_write_stripe_enabled()) {
2538             struct raid56_bio_trace_info trace_info = { 0 };
2539 
2540             bio_get_trace_info(rbio, bio, &trace_info);
2541             trace_raid56_scrub_write_stripe(rbio, bio, &trace_info);
2542         }
2543         submit_bio(bio);
2544     }
2545     return;
2546 
2547 cleanup:
2548     rbio_orig_end_io(rbio, BLK_STS_IOERR);
2549 
2550     while ((bio = bio_list_pop(&bio_list)))
2551         bio_put(bio);
2552 }
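
/*
 * Illustrative sketch, not part of the original file: the check-and-repair
 * step above in miniature, for the RAID5 (single parity) case.  The
 * expected parity is recomputed from the data, compared with what is on
 * disk, and only rewritten when they differ -- which is why sectors whose
 * parity was already correct are cleared from the dbitmap and generate no
 * write bios.  The interface is made up for the example.
 */
static int demo_scrub_one_parity(unsigned char * const data[], int nr_data,
				 unsigned char *parity, int len)
{
	int i, stripe, mismatch = 0;

	for (i = 0; i < len; i++) {
		unsigned char expected = 0;

		for (stripe = 0; stripe < nr_data; stripe++)
			expected ^= data[stripe][i];

		if (parity[i] != expected) {
			parity[i] = expected;	/* repair in place */
			mismatch = 1;
		}
	}
	return mismatch;	/* 1: this parity sector must be written back */
}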
2553 
2554 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2555 {
2556     if (stripe >= 0 && stripe < rbio->nr_data)
2557         return 1;
2558     return 0;
2559 }
2560 
2561 /*
2562  * While we're doing the parity check and repair, we could have errors
2563  * in reading pages off the disk.  This checks for errors and if we're
2564  * not able to read the page it'll trigger parity reconstruction.  The
2565  * parity scrub will be finished after we've reconstructed the failed
2566  * stripes
2567  */
2568 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2569 {
2570     if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
2571         goto cleanup;
2572 
2573     if (rbio->faila >= 0 || rbio->failb >= 0) {
2574         int dfail = 0, failp = -1;
2575 
2576         if (is_data_stripe(rbio, rbio->faila))
2577             dfail++;
2578         else if (is_parity_stripe(rbio->faila))
2579             failp = rbio->faila;
2580 
2581         if (is_data_stripe(rbio, rbio->failb))
2582             dfail++;
2583         else if (is_parity_stripe(rbio->failb))
2584             failp = rbio->failb;
2585 
2586         /*
2587          * We cannot use the parity that is being scrubbed to
2588          * repair the data, so our repair capability is reduced.
2589          * (In the case of RAID5, we cannot repair anything.)
2590          */
2591         if (dfail > rbio->bioc->max_errors - 1)
2592             goto cleanup;
2593 
2594         /*
2595          * If all the data is good and only the parity is bad,
2596          * just repair the parity.
2597          */
2598         if (dfail == 0) {
2599             finish_parity_scrub(rbio, 0);
2600             return;
2601         }
2602 
2603         /*
2604          * At this point we have one corrupted data stripe and one
2605          * corrupted parity on RAID6.  If the corrupted parity is the
2606          * one being scrubbed, we can luckily use the other parity to
2607          * repair the data; otherwise we cannot repair the data stripe.
2608          */
2609         if (failp != rbio->scrubp)
2610             goto cleanup;
2611 
2612         __raid_recover_end_io(rbio);
2613     } else {
2614         finish_parity_scrub(rbio, 1);
2615     }
2616     return;
2617 
2618 cleanup:
2619     rbio_orig_end_io(rbio, BLK_STS_IOERR);
2620 }
2621 
2622 /*
2623  * End io for the read phase of the parity scrub cycle.  All the bios here are
2624  * physical stripe bios we've read from the disk so we can recalculate the
2625  * parity of the stripe.
2626  *
2627  * This will usually kick off finish_parity_scrub once all the bios are read,
2628  * but it may trigger parity reconstruction if we had any errors along the way.
2629  */
2630 static void raid56_parity_scrub_end_io_work(struct work_struct *work)
2631 {
2632     struct btrfs_raid_bio *rbio =
2633         container_of(work, struct btrfs_raid_bio, end_io_work);
2634 
2635     /*
2636      * This will normally call finish_parity_scrub to start our write, but
2637      * if there are any failed stripes we'll reconstruct from parity first.
2638      */
2639     validate_rbio_for_parity_scrub(rbio);
2640 }
2641 
2642 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2643 {
2644     int bios_to_read = 0;
2645     struct bio_list bio_list;
2646     int ret;
2647     int total_sector_nr;
2648     struct bio *bio;
2649 
2650     bio_list_init(&bio_list);
2651 
2652     ret = alloc_rbio_essential_pages(rbio);
2653     if (ret)
2654         goto cleanup;
2655 
2656     atomic_set(&rbio->error, 0);
2657     /* Build a list of bios to read all the missing parts. */
2658     for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2659          total_sector_nr++) {
2660         int sectornr = total_sector_nr % rbio->stripe_nsectors;
2661         int stripe = total_sector_nr / rbio->stripe_nsectors;
2662         struct sector_ptr *sector;
2663 
2664         /* No data in the vertical stripe, no need to read. */
2665         if (!test_bit(sectornr, &rbio->dbitmap))
2666             continue;
2667 
2668         /*
2669          * We want to find all the sectors missing from the rbio and
2670          * read them from the disk. If sector_in_rbio() finds a sector
2671          * in the bio list we don't need to read it off the stripe.
2672          */
2673         sector = sector_in_rbio(rbio, stripe, sectornr, 1);
2674         if (sector)
2675             continue;
2676 
2677         sector = rbio_stripe_sector(rbio, stripe, sectornr);
2678         /*
2679          * The bio cache may have handed us an uptodate sector.  If so,
2680          * use it.
2681          */
2682         if (sector->uptodate)
2683             continue;
2684 
2685         ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
2686                      sectornr, REQ_OP_READ);
2687         if (ret)
2688             goto cleanup;
2689     }
2690 
2691     bios_to_read = bio_list_size(&bio_list);
2692     if (!bios_to_read) {
2693         /*
2694          * This can happen if others have merged with
2695          * us; it means there is nothing left to read.
2696          * But if there are missing devices it may not be
2697          * safe to do the full stripe write yet.
2698          */
2699         goto finish;
2700     }
2701 
2702     /*
2703      * The bioc may be freed once we submit the last bio. Make sure not to
2704      * touch it after that.
2705      */
2706     atomic_set(&rbio->stripes_pending, bios_to_read);
2707     INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
2708     while ((bio = bio_list_pop(&bio_list))) {
2709         bio->bi_end_io = raid56_bio_end_io;
2710 
2711         if (trace_raid56_scrub_read_enabled()) {
2712             struct raid56_bio_trace_info trace_info = { 0 };
2713 
2714             bio_get_trace_info(rbio, bio, &trace_info);
2715             trace_raid56_scrub_read(rbio, bio, &trace_info);
2716         }
2717         submit_bio(bio);
2718     }
2719     /* the actual write will happen once the reads are done */
2720     return;
2721 
2722 cleanup:
2723     rbio_orig_end_io(rbio, BLK_STS_IOERR);
2724 
2725     while ((bio = bio_list_pop(&bio_list)))
2726         bio_put(bio);
2727 
2728     return;
2729 
2730 finish:
2731     validate_rbio_for_parity_scrub(rbio);
2732 }
2733 
2734 static void scrub_parity_work(struct work_struct *work)
2735 {
2736     struct btrfs_raid_bio *rbio;
2737 
2738     rbio = container_of(work, struct btrfs_raid_bio, work);
2739     raid56_parity_scrub_stripe(rbio);
2740 }
2741 
2742 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2743 {
2744     if (!lock_stripe_add(rbio))
2745         start_async_work(rbio, scrub_parity_work);
2746 }
2747 
2748 /* The following code is used for dev replace of a missing RAID 5/6 device. */
2749 
2750 struct btrfs_raid_bio *
2751 raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
2752 {
2753     struct btrfs_fs_info *fs_info = bioc->fs_info;
2754     struct btrfs_raid_bio *rbio;
2755 
2756     rbio = alloc_rbio(fs_info, bioc);
2757     if (IS_ERR(rbio))
2758         return NULL;
2759 
2760     rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2761     bio_list_add(&rbio->bio_list, bio);
2762     /*
2763      * This is a special bio which is used to hold the completion handler
2764      * and make this rbio similar to the other types.
2765      */
2766     ASSERT(!bio->bi_iter.bi_size);
2767 
2768     rbio->faila = find_logical_bio_stripe(rbio, bio);
2769     if (rbio->faila == -1) {
2770         BUG();
2771         kfree(rbio);
2772         return NULL;
2773     }
2774 
2775     /*
2776      * When we got the bioc, we had already increased bio_counter; record
2777      * it so we can free it at rbio_orig_end_io().
2778      */
2779     rbio->generic_bio_cnt = 1;
2780 
2781     return rbio;
2782 }
2783 
2784 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2785 {
2786     if (!lock_stripe_add(rbio))
2787         start_async_work(rbio, read_rebuild_work);
2788 }