fs/btrfs/disk-io.c

0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Copyright (C) 2007 Oracle.  All rights reserved.
0004  */
0005
0006 #include <linux/fs.h>
0007 #include <linux/blkdev.h>
0008 #include <linux/radix-tree.h>
0009 #include <linux/writeback.h>
0010 #include <linux/workqueue.h>
0011 #include <linux/kthread.h>
0012 #include <linux/slab.h>
0013 #include <linux/migrate.h>
0014 #include <linux/ratelimit.h>
0015 #include <linux/uuid.h>
0016 #include <linux/semaphore.h>
0017 #include <linux/error-injection.h>
0018 #include <linux/crc32c.h>
0019 #include <linux/sched/mm.h>
0020 #include <asm/unaligned.h>
0021 #include <crypto/hash.h>
0022 #include "ctree.h"
0023 #include "disk-io.h"
0024 #include "transaction.h"
0025 #include "btrfs_inode.h"
0026 #include "volumes.h"
0027 #include "print-tree.h"
0028 #include "locking.h"
0029 #include "tree-log.h"
0030 #include "free-space-cache.h"
0031 #include "free-space-tree.h"
0032 #include "check-integrity.h"
0033 #include "rcu-string.h"
0034 #include "dev-replace.h"
0035 #include "raid56.h"
0036 #include "sysfs.h"
0037 #include "qgroup.h"
0038 #include "compression.h"
0039 #include "tree-checker.h"
0040 #include "ref-verify.h"
0041 #include "block-group.h"
0042 #include "discard.h"
0043 #include "space-info.h"
0044 #include "zoned.h"
0045 #include "subpage.h"
0046
/*
 * Mask built by OR-ing together the header and super block flag bits listed
 * below.
 * NOTE(review): presumably the set of super block flags this code accepts;
 * the users of this mask are outside this chunk - confirm.
 */
#define BTRFS_SUPER_FLAG_SUPP   (BTRFS_HEADER_FLAG_WRITTEN |\
                 BTRFS_HEADER_FLAG_RELOC |\
                 BTRFS_SUPER_FLAG_ERROR |\
                 BTRFS_SUPER_FLAG_SEEDING |\
                 BTRFS_SUPER_FLAG_METADUMP |\
                 BTRFS_SUPER_FLAG_METADUMP_V2)
0053
/*
 * Prototypes for file-local helpers that are referenced before their
 * definitions.
 * NOTE(review): the definitions are not part of this chunk; presumably they
 * follow later in the file - confirm.
 */
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                      struct btrfs_fs_info *fs_info);
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
                    struct extent_io_tree *dirty_pages,
                    int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
                       struct extent_io_tree *pinned_extents);
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
0065
0066 static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
0067 {
0068     if (fs_info->csum_shash)
0069         crypto_free_shash(fs_info->csum_shash);
0070 }
0071
/*
 * async submit bios are used to offload expensive checksumming
 * onto the worker threads.  They checksum file and metadata bios
 * just before they are sent down the IO stack.
 *
 * One instance is allocated per queued bio by btrfs_wq_submit_bio() and
 * released by run_one_async_free().
 */
struct async_submit_bio {
    /* Inode handed to ->submit_bio_start; also used to look up the fs_info */
    struct inode *inode;
    /* The bio to checksum and then submit */
    struct bio *bio;
    /* Callback run from run_one_async_start() */
    extent_submit_bio_start_t *submit_bio_start;
    /* Mirror number forwarded to btrfs_submit_bio() */
    int mirror_num;

    /* Optional parameter for submit_bio_start used by direct io */
    u64 dio_file_offset;
    /* Work item queued on one of the fs_info work queues */
    struct btrfs_work work;
    /* Non-zero once ->submit_bio_start has failed; the bio is then ended */
    blk_status_t status;
};
0088
0089 /*
0090  * Compute the csum of a btree block and store the result to provided buffer.
0091  */
0092 static void csum_tree_block(struct extent_buffer *buf, u8 *result)
0093 {
0094     struct btrfs_fs_info *fs_info = buf->fs_info;
0095     const int num_pages = num_extent_pages(buf);
0096     const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
0097     SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
0098     char *kaddr;
0099     int i;
0100
0101     shash->tfm = fs_info->csum_shash;
0102     crypto_shash_init(shash);
0103     kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
0104     crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
0105                 first_page_part - BTRFS_CSUM_SIZE);
0106
0107     for (i = 1; i < num_pages; i++) {
0108         kaddr = page_address(buf->pages[i]);
0109         crypto_shash_update(shash, kaddr, PAGE_SIZE);
0110     }
0111     memset(result, 0, BTRFS_CSUM_SIZE);
0112     crypto_shash_final(shash, result);
0113 }
0114
0115 /*
0116  * we can't consider a given block up to date unless the transid of the
0117  * block matches the transid in the parent node's pointer.  This is how we
0118  * detect blocks that either didn't get written at all or got written
0119  * in the wrong place.
0120  */
0121 static int verify_parent_transid(struct extent_io_tree *io_tree,
0122                  struct extent_buffer *eb, u64 parent_transid,
0123                  int atomic)
0124 {
0125     struct extent_state *cached_state = NULL;
0126     int ret;
0127
0128     if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
0129         return 0;
0130
0131     if (atomic)
0132         return -EAGAIN;
0133
0134     lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
0135              &cached_state);
0136     if (extent_buffer_uptodate(eb) &&
0137         btrfs_header_generation(eb) == parent_transid) {
0138         ret = 0;
0139         goto out;
0140     }
0141     btrfs_err_rl(eb->fs_info,
0142 "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
0143             eb->start, eb->read_mirror,
0144             parent_transid, btrfs_header_generation(eb));
0145     ret = 1;
0146     clear_extent_buffer_uptodate(eb);
0147 out:
0148     unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
0149                  &cached_state);
0150     return ret;
0151 }
0152
0153 static bool btrfs_supported_super_csum(u16 csum_type)
0154 {
0155     switch (csum_type) {
0156     case BTRFS_CSUM_TYPE_CRC32:
0157     case BTRFS_CSUM_TYPE_XXHASH:
0158     case BTRFS_CSUM_TYPE_SHA256:
0159     case BTRFS_CSUM_TYPE_BLAKE2:
0160         return true;
0161     default:
0162         return false;
0163     }
0164 }
0165
0166 /*
0167  * Return 0 if the superblock checksum type matches the checksum value of that
0168  * algorithm. Pass the raw disk superblock data.
0169  */
0170 static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
0171                   char *raw_disk_sb)
0172 {
0173     struct btrfs_super_block *disk_sb =
0174         (struct btrfs_super_block *)raw_disk_sb;
0175     char result[BTRFS_CSUM_SIZE];
0176     SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
0177
0178     shash->tfm = fs_info->csum_shash;
0179
0180     /*
0181      * The super_block structure does not span the whole
0182      * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
0183      * filled with zeros and is included in the checksum.
0184      */
0185     crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
0186                 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
0187
0188     if (memcmp(disk_sb->csum, result, fs_info->csum_size))
0189         return 1;
0190
0191     return 0;
0192 }
0193
0194 int btrfs_verify_level_key(struct extent_buffer *eb, int level,
0195                struct btrfs_key *first_key, u64 parent_transid)
0196 {
0197     struct btrfs_fs_info *fs_info = eb->fs_info;
0198     int found_level;
0199     struct btrfs_key found_key;
0200     int ret;
0201
0202     found_level = btrfs_header_level(eb);
0203     if (found_level != level) {
0204         WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
0205              KERN_ERR "BTRFS: tree level check failed\n");
0206         btrfs_err(fs_info,
0207 "tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
0208               eb->start, level, found_level);
0209         return -EIO;
0210     }
0211
0212     if (!first_key)
0213         return 0;
0214
0215     /*
0216      * For live tree block (new tree blocks in current transaction),
0217      * we need proper lock context to avoid race, which is impossible here.
0218      * So we only checks tree blocks which is read from disk, whose
0219      * generation <= fs_info->last_trans_committed.
0220      */
0221     if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
0222         return 0;
0223
0224     /* We have @first_key, so this @eb must have at least one item */
0225     if (btrfs_header_nritems(eb) == 0) {
0226         btrfs_err(fs_info,
0227         "invalid tree nritems, bytenr=%llu nritems=0 expect >0",
0228               eb->start);
0229         WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
0230         return -EUCLEAN;
0231     }
0232
0233     if (found_level)
0234         btrfs_node_key_to_cpu(eb, &found_key, 0);
0235     else
0236         btrfs_item_key_to_cpu(eb, &found_key, 0);
0237     ret = btrfs_comp_cpu_keys(first_key, &found_key);
0238
0239     if (ret) {
0240         WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
0241              KERN_ERR "BTRFS: tree first key check failed\n");
0242         btrfs_err(fs_info,
0243 "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
0244               eb->start, parent_transid, first_key->objectid,
0245               first_key->type, first_key->offset,
0246               found_key.objectid, found_key.type,
0247               found_key.offset);
0248     }
0249     return ret;
0250 }
0251
0252 /*
0253  * helper to read a given tree block, doing retries as required when
0254  * the checksums don't match and we have alternate mirrors to try.
0255  *
0256  * @parent_transid: expected transid, skip check if 0
0257  * @level:      expected level, mandatory check
0258  * @first_key:      expected key of first slot, skip check if NULL
0259  */
0260 int btrfs_read_extent_buffer(struct extent_buffer *eb,
0261                  u64 parent_transid, int level,
0262                  struct btrfs_key *first_key)
0263 {
0264     struct btrfs_fs_info *fs_info = eb->fs_info;
0265     struct extent_io_tree *io_tree;
0266     int failed = 0;
0267     int ret;
0268     int num_copies = 0;
0269     int mirror_num = 0;
0270     int failed_mirror = 0;
0271
0272     io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
0273     while (1) {
0274         clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
0275         ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
0276         if (!ret) {
0277             if (verify_parent_transid(io_tree, eb,
0278                            parent_transid, 0))
0279                 ret = -EIO;
0280             else if (btrfs_verify_level_key(eb, level,
0281                         first_key, parent_transid))
0282                 ret = -EUCLEAN;
0283             else
0284                 break;
0285         }
0286
0287         num_copies = btrfs_num_copies(fs_info,
0288                           eb->start, eb->len);
0289         if (num_copies == 1)
0290             break;
0291
0292         if (!failed_mirror) {
0293             failed = 1;
0294             failed_mirror = eb->read_mirror;
0295         }
0296
0297         mirror_num++;
0298         if (mirror_num == failed_mirror)
0299             mirror_num++;
0300
0301         if (mirror_num > num_copies)
0302             break;
0303     }
0304
0305     if (failed && !ret && failed_mirror)
0306         btrfs_repair_eb_io_failure(eb, failed_mirror);
0307
0308     return ret;
0309 }
0310
0311 static int csum_one_extent_buffer(struct extent_buffer *eb)
0312 {
0313     struct btrfs_fs_info *fs_info = eb->fs_info;
0314     u8 result[BTRFS_CSUM_SIZE];
0315     int ret;
0316
0317     ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
0318                     offsetof(struct btrfs_header, fsid),
0319                     BTRFS_FSID_SIZE) == 0);
0320     csum_tree_block(eb, result);
0321
0322     if (btrfs_header_level(eb))
0323         ret = btrfs_check_node(eb);
0324     else
0325         ret = btrfs_check_leaf_full(eb);
0326
0327     if (ret < 0)
0328         goto error;
0329
0330     /*
0331      * Also check the generation, the eb reached here must be newer than
0332      * last committed. Or something seriously wrong happened.
0333      */
0334     if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
0335         ret = -EUCLEAN;
0336         btrfs_err(fs_info,
0337             "block=%llu bad generation, have %llu expect > %llu",
0338               eb->start, btrfs_header_generation(eb),
0339               fs_info->last_trans_committed);
0340         goto error;
0341     }
0342     write_extent_buffer(eb, result, 0, fs_info->csum_size);
0343
0344     return 0;
0345
0346 error:
0347     btrfs_print_tree(eb, 0);
0348     btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
0349           eb->start);
0350     WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
0351     return ret;
0352 }
0353
0354 /* Checksum all dirty extent buffers in one bio_vec */
0355 static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
0356                       struct bio_vec *bvec)
0357 {
0358     struct page *page = bvec->bv_page;
0359     u64 bvec_start = page_offset(page) + bvec->bv_offset;
0360     u64 cur;
0361     int ret = 0;
0362
0363     for (cur = bvec_start; cur < bvec_start + bvec->bv_len;
0364          cur += fs_info->nodesize) {
0365         struct extent_buffer *eb;
0366         bool uptodate;
0367
0368         eb = find_extent_buffer(fs_info, cur);
0369         uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
0370                                fs_info->nodesize);
0371
0372         /* A dirty eb shouldn't disappear from buffer_radix */
0373         if (WARN_ON(!eb))
0374             return -EUCLEAN;
0375
0376         if (WARN_ON(cur != btrfs_header_bytenr(eb))) {
0377             free_extent_buffer(eb);
0378             return -EUCLEAN;
0379         }
0380         if (WARN_ON(!uptodate)) {
0381             free_extent_buffer(eb);
0382             return -EUCLEAN;
0383         }
0384
0385         ret = csum_one_extent_buffer(eb);
0386         free_extent_buffer(eb);
0387         if (ret < 0)
0388             return ret;
0389     }
0390     return ret;
0391 }
0392
0393 /*
0394  * Checksum a dirty tree block before IO.  This has extra checks to make sure
0395  * we only fill in the checksum field in the first page of a multi-page block.
0396  * For subpage extent buffers we need bvec to also read the offset in the page.
0397  */
0398 static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec)
0399 {
0400     struct page *page = bvec->bv_page;
0401     u64 start = page_offset(page);
0402     u64 found_start;
0403     struct extent_buffer *eb;
0404
0405     if (fs_info->nodesize < PAGE_SIZE)
0406         return csum_dirty_subpage_buffers(fs_info, bvec);
0407
0408     eb = (struct extent_buffer *)page->private;
0409     if (page != eb->pages[0])
0410         return 0;
0411
0412     found_start = btrfs_header_bytenr(eb);
0413
0414     if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
0415         WARN_ON(found_start != 0);
0416         return 0;
0417     }
0418
0419     /*
0420      * Please do not consolidate these warnings into a single if.
0421      * It is useful to know what went wrong.
0422      */
0423     if (WARN_ON(found_start != start))
0424         return -EUCLEAN;
0425     if (WARN_ON(!PageUptodate(page)))
0426         return -EUCLEAN;
0427
0428     return csum_one_extent_buffer(eb);
0429 }
0430
0431 static int check_tree_block_fsid(struct extent_buffer *eb)
0432 {
0433     struct btrfs_fs_info *fs_info = eb->fs_info;
0434     struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
0435     u8 fsid[BTRFS_FSID_SIZE];
0436     u8 *metadata_uuid;
0437
0438     read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
0439                BTRFS_FSID_SIZE);
0440     /*
0441      * Checking the incompat flag is only valid for the current fs. For
0442      * seed devices it's forbidden to have their uuid changed so reading
0443      * ->fsid in this case is fine
0444      */
0445     if (btrfs_fs_incompat(fs_info, METADATA_UUID))
0446         metadata_uuid = fs_devices->metadata_uuid;
0447     else
0448         metadata_uuid = fs_devices->fsid;
0449
0450     if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
0451         return 0;
0452
0453     list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
0454         if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
0455             return 0;
0456
0457     return 1;
0458 }
0459
0460 /* Do basic extent buffer checks at read time */
0461 static int validate_extent_buffer(struct extent_buffer *eb)
0462 {
0463     struct btrfs_fs_info *fs_info = eb->fs_info;
0464     u64 found_start;
0465     const u32 csum_size = fs_info->csum_size;
0466     u8 found_level;
0467     u8 result[BTRFS_CSUM_SIZE];
0468     const u8 *header_csum;
0469     int ret = 0;
0470
0471     found_start = btrfs_header_bytenr(eb);
0472     if (found_start != eb->start) {
0473         btrfs_err_rl(fs_info,
0474             "bad tree block start, mirror %u want %llu have %llu",
0475                  eb->read_mirror, eb->start, found_start);
0476         ret = -EIO;
0477         goto out;
0478     }
0479     if (check_tree_block_fsid(eb)) {
0480         btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
0481                  eb->start, eb->read_mirror);
0482         ret = -EIO;
0483         goto out;
0484     }
0485     found_level = btrfs_header_level(eb);
0486     if (found_level >= BTRFS_MAX_LEVEL) {
0487         btrfs_err(fs_info,
0488             "bad tree block level, mirror %u level %d on logical %llu",
0489             eb->read_mirror, btrfs_header_level(eb), eb->start);
0490         ret = -EIO;
0491         goto out;
0492     }
0493
0494     csum_tree_block(eb, result);
0495     header_csum = page_address(eb->pages[0]) +
0496         get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
0497
0498     if (memcmp(result, header_csum, csum_size) != 0) {
0499         btrfs_warn_rl(fs_info,
0500 "checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
0501                   eb->start, eb->read_mirror,
0502                   CSUM_FMT_VALUE(csum_size, header_csum),
0503                   CSUM_FMT_VALUE(csum_size, result),
0504                   btrfs_header_level(eb));
0505         ret = -EUCLEAN;
0506         goto out;
0507     }
0508
0509     /*
0510      * If this is a leaf block and it is corrupt, set the corrupt bit so
0511      * that we don't try and read the other copies of this block, just
0512      * return -EIO.
0513      */
0514     if (found_level == 0 && btrfs_check_leaf_full(eb)) {
0515         set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
0516         ret = -EIO;
0517     }
0518
0519     if (found_level > 0 && btrfs_check_node(eb))
0520         ret = -EIO;
0521
0522     if (!ret)
0523         set_extent_buffer_uptodate(eb);
0524     else
0525         btrfs_err(fs_info,
0526         "read time tree block corruption detected on logical %llu mirror %u",
0527               eb->start, eb->read_mirror);
0528 out:
0529     return ret;
0530 }
0531
0532 static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
0533                    int mirror)
0534 {
0535     struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
0536     struct extent_buffer *eb;
0537     bool reads_done;
0538     int ret = 0;
0539
0540     /*
0541      * We don't allow bio merge for subpage metadata read, so we should
0542      * only get one eb for each endio hook.
0543      */
0544     ASSERT(end == start + fs_info->nodesize - 1);
0545     ASSERT(PagePrivate(page));
0546
0547     eb = find_extent_buffer(fs_info, start);
0548     /*
0549      * When we are reading one tree block, eb must have been inserted into
0550      * the radix tree. If not, something is wrong.
0551      */
0552     ASSERT(eb);
0553
0554     reads_done = atomic_dec_and_test(&eb->io_pages);
0555     /* Subpage read must finish in page read */
0556     ASSERT(reads_done);
0557
0558     eb->read_mirror = mirror;
0559     if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
0560         ret = -EIO;
0561         goto err;
0562     }
0563     ret = validate_extent_buffer(eb);
0564     if (ret < 0)
0565         goto err;
0566
0567     set_extent_buffer_uptodate(eb);
0568
0569     free_extent_buffer(eb);
0570     return ret;
0571 err:
0572     /*
0573      * end_bio_extent_readpage decrements io_pages in case of error,
0574      * make sure it has something to decrement.
0575      */
0576     atomic_inc(&eb->io_pages);
0577     clear_extent_buffer_uptodate(eb);
0578     free_extent_buffer(eb);
0579     return ret;
0580 }
0581
0582 int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
0583                    struct page *page, u64 start, u64 end,
0584                    int mirror)
0585 {
0586     struct extent_buffer *eb;
0587     int ret = 0;
0588     int reads_done;
0589
0590     ASSERT(page->private);
0591
0592     if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
0593         return validate_subpage_buffer(page, start, end, mirror);
0594
0595     eb = (struct extent_buffer *)page->private;
0596
0597     /*
0598      * The pending IO might have been the only thing that kept this buffer
0599      * in memory.  Make sure we have a ref for all this other checks
0600      */
0601     atomic_inc(&eb->refs);
0602
0603     reads_done = atomic_dec_and_test(&eb->io_pages);
0604     if (!reads_done)
0605         goto err;
0606
0607     eb->read_mirror = mirror;
0608     if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
0609         ret = -EIO;
0610         goto err;
0611     }
0612     ret = validate_extent_buffer(eb);
0613 err:
0614     if (ret) {
0615         /*
0616          * our io error hook is going to dec the io pages
0617          * again, we have to make sure it has something
0618          * to decrement
0619          */
0620         atomic_inc(&eb->io_pages);
0621         clear_extent_buffer_uptodate(eb);
0622     }
0623     free_extent_buffer(eb);
0624
0625     return ret;
0626 }
0627
0628 static void run_one_async_start(struct btrfs_work *work)
0629 {
0630     struct async_submit_bio *async;
0631     blk_status_t ret;
0632
0633     async = container_of(work, struct  async_submit_bio, work);
0634     ret = async->submit_bio_start(async->inode, async->bio,
0635                       async->dio_file_offset);
0636     if (ret)
0637         async->status = ret;
0638 }
0639
0640 /*
0641  * In order to insert checksums into the metadata in large chunks, we wait
0642  * until bio submission time.   All the pages in the bio are checksummed and
0643  * sums are attached onto the ordered extent record.
0644  *
0645  * At IO completion time the csums attached on the ordered extent record are
0646  * inserted into the tree.
0647  */
0648 static void run_one_async_done(struct btrfs_work *work)
0649 {
0650     struct async_submit_bio *async;
0651     struct inode *inode;
0652
0653     async = container_of(work, struct  async_submit_bio, work);
0654     inode = async->inode;
0655
0656     /* If an error occurred we just want to clean up the bio and move on */
0657     if (async->status) {
0658         async->bio->bi_status = async->status;
0659         bio_endio(async->bio);
0660         return;
0661     }
0662
0663     /*
0664      * All of the bios that pass through here are from async helpers.
0665      * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
0666      * This changes nothing when cgroups aren't in use.
0667      */
0668     async->bio->bi_opf |= REQ_CGROUP_PUNT;
0669     btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
0670 }
0671
0672 static void run_one_async_free(struct btrfs_work *work)
0673 {
0674     struct async_submit_bio *async;
0675
0676     async = container_of(work, struct  async_submit_bio, work);
0677     kfree(async);
0678 }
0679
0680 /*
0681  * Submit bio to an async queue.
0682  *
0683  * Retrun:
0684  * - true if the work has been succesfuly submitted
0685  * - false in case of error
0686  */
0687 bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
0688              u64 dio_file_offset,
0689              extent_submit_bio_start_t *submit_bio_start)
0690 {
0691     struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
0692     struct async_submit_bio *async;
0693
0694     async = kmalloc(sizeof(*async), GFP_NOFS);
0695     if (!async)
0696         return false;
0697
0698     async->inode = inode;
0699     async->bio = bio;
0700     async->mirror_num = mirror_num;
0701     async->submit_bio_start = submit_bio_start;
0702
0703     btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
0704             run_one_async_free);
0705
0706     async->dio_file_offset = dio_file_offset;
0707
0708     async->status = 0;
0709
0710     if (op_is_sync(bio->bi_opf))
0711         btrfs_queue_work(fs_info->hipri_workers, &async->work);
0712     else
0713         btrfs_queue_work(fs_info->workers, &async->work);
0714     return true;
0715 }
0716
0717 static blk_status_t btree_csum_one_bio(struct bio *bio)
0718 {
0719     struct bio_vec *bvec;
0720     struct btrfs_root *root;
0721     int ret = 0;
0722     struct bvec_iter_all iter_all;
0723
0724     ASSERT(!bio_flagged(bio, BIO_CLONED));
0725     bio_for_each_segment_all(bvec, bio, iter_all) {
0726         root = BTRFS_I(bvec->bv_page->mapping->host)->root;
0727         ret = csum_dirty_buffer(root->fs_info, bvec);
0728         if (ret)
0729             break;
0730     }
0731
0732     return errno_to_blk_status(ret);
0733 }
0734
/*
 * submit_bio_start callback passed to btrfs_wq_submit_bio() for metadata
 * writes.  @inode and @dio_file_offset are not used.
 */
static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
                       u64 dio_file_offset)
{
    /*
     * When we're called for a write, we're already in the async
     * submission context.  Only the checksumming happens here; the bio
     * itself is submitted afterwards by run_one_async_done().
     */
    return btree_csum_one_bio(bio);
}
0744
0745 static bool should_async_write(struct btrfs_fs_info *fs_info,
0746                  struct btrfs_inode *bi)
0747 {
0748     if (btrfs_is_zoned(fs_info))
0749         return false;
0750     if (atomic_read(&bi->sync_writers))
0751         return false;
0752     if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
0753         return false;
0754     return true;
0755 }
0756
0757 void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
0758 {
0759     struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
0760     blk_status_t ret;
0761
0762     bio->bi_opf |= REQ_META;
0763
0764     if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
0765         btrfs_submit_bio(fs_info, bio, mirror_num);
0766         return;
0767     }
0768
0769     /*
0770      * Kthread helpers are used to submit writes so that checksumming can
0771      * happen in parallel across all CPUs.
0772      */
0773     if (should_async_write(fs_info, BTRFS_I(inode)) &&
0774         btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start))
0775         return;
0776
0777     ret = btree_csum_one_bio(bio);
0778     if (ret) {
0779         bio->bi_status = ret;
0780         bio_endio(bio);
0781         return;
0782     }
0783
0784     btrfs_submit_bio(fs_info, bio, mirror_num);
0785 }
0786
#ifdef CONFIG_MIGRATION
/*
 * migrate_folio callback for the btree address space.
 *
 * Return -EAGAIN for a dirty folio, or when private data is attached and
 * cannot be released.  Otherwise return the result of migrate_folio().
 */
static int btree_migrate_folio(struct address_space *mapping,
        struct folio *dst, struct folio *src, enum migrate_mode mode)
{
    /*
     * we can't safely write a btree page from here,
     * we haven't done the locking hook
     */
    if (folio_test_dirty(src))
        return -EAGAIN;

    /*
     * Buffers may be managed in a filesystem specific way.
     * We must have no buffers or drop them.
     */
    if (folio_get_private(src)) {
        if (!filemap_release_folio(src, GFP_KERNEL))
            return -EAGAIN;
    }

    return migrate_folio(mapping, dst, src, mode);
}
#else
#define btree_migrate_folio NULL
#endif
0809
0810 static int btree_writepages(struct address_space *mapping,
0811                 struct writeback_control *wbc)
0812 {
0813     struct btrfs_fs_info *fs_info;
0814     int ret;
0815
0816     if (wbc->sync_mode == WB_SYNC_NONE) {
0817
0818         if (wbc->for_kupdate)
0819             return 0;
0820
0821         fs_info = BTRFS_I(mapping->host)->root->fs_info;
0822         /* this is a bit racy, but that's ok */
0823         ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
0824                          BTRFS_DIRTY_METADATA_THRESH,
0825                          fs_info->dirty_metadata_batch);
0826         if (ret < 0)
0827             return 0;
0828     }
0829     return btree_write_cache_pages(mapping, wbc);
0830 }
0831
0832 static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
0833 {
0834     if (folio_test_writeback(folio) || folio_test_dirty(folio))
0835         return false;
0836
0837     return try_release_extent_buffer(&folio->page);
0838 }
0839
0840 static void btree_invalidate_folio(struct folio *folio, size_t offset,
0841                  size_t length)
0842 {
0843     struct extent_io_tree *tree;
0844     tree = &BTRFS_I(folio->mapping->host)->io_tree;
0845     extent_invalidate_folio(tree, folio, offset);
0846     btree_release_folio(folio, GFP_NOFS);
0847     if (folio_get_private(folio)) {
0848         btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info,
0849                "folio private not zero on folio %llu",
0850                (unsigned long long)folio_pos(folio));
0851         folio_detach_private(folio);
0852     }
0853 }
0854
#ifdef DEBUG
/*
 * Debug-only dirty_folio callback.
 *
 * Before calling filemap_dirty_folio(), check that every dirty extent
 * buffer backed by @folio has its dirty flag set, is referenced and is
 * write locked.  Without DEBUG, filemap_dirty_folio() is used directly.
 */
static bool btree_dirty_folio(struct address_space *mapping,
        struct folio *folio)
{
    struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
    const u64 folio_start = folio_pos(folio);
    struct btrfs_subpage *subpage;
    struct extent_buffer *eb;
    int bit = 0;

    /* Regular case: the folio private is the extent buffer itself. */
    if (fs_info->sectorsize == PAGE_SIZE) {
        eb = folio_get_private(folio);
        BUG_ON(!eb);
        BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
        BUG_ON(!atomic_read(&eb->refs));
        btrfs_assert_tree_write_locked(eb);
        return filemap_dirty_folio(mapping, folio);
    }

    /* Subpage case: walk the dirty bitmap one sector at a time. */
    subpage = folio_get_private(folio);
    ASSERT(subpage->dirty_bitmap);

    while (bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
        const u16 mask = (1 << bit);
        unsigned long flags;
        bool is_dirty;

        spin_lock_irqsave(&subpage->lock, flags);
        is_dirty = (mask & subpage->dirty_bitmap) != 0;
        spin_unlock_irqrestore(&subpage->lock, flags);

        if (!is_dirty) {
            bit++;
            continue;
        }

        eb = find_extent_buffer(fs_info,
                    folio_start + bit * fs_info->sectorsize);
        ASSERT(eb);
        ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
        ASSERT(atomic_read(&eb->refs));
        btrfs_assert_tree_write_locked(eb);
        free_extent_buffer(eb);

        /* Skip the remaining sectors of this tree block. */
        bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
    }
    return filemap_dirty_folio(mapping, folio);
}
#else
#define btree_dirty_folio filemap_dirty_folio
#endif
0904
/*
 * Address space operations table built from the btree callbacks above.
 *
 * btree_migrate_folio is defined as NULL without CONFIG_MIGRATION, and
 * btree_dirty_folio as plain filemap_dirty_folio without DEBUG.
 */
static const struct address_space_operations btree_aops = {
    .writepages = btree_writepages,
    .release_folio  = btree_release_folio,
    .invalidate_folio = btree_invalidate_folio,
    .migrate_folio  = btree_migrate_folio,
    .dirty_folio    = btree_dirty_folio,
};
0912
0913 struct extent_buffer *btrfs_find_create_tree_block(
0914                         struct btrfs_fs_info *fs_info,
0915                         u64 bytenr, u64 owner_root,
0916                         int level)
0917 {
0918     if (btrfs_is_testing(fs_info))
0919         return alloc_test_extent_buffer(fs_info, bytenr);
0920     return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
0921 }
0922
0923 /*
0924  * Read tree block at logical address @bytenr and do variant basic but critical
0925  * verification.
0926  *
0927  * @owner_root:     the objectid of the root owner for this block.
0928  * @parent_transid: expected transid of this tree block, skip check if 0
0929  * @level:      expected level, mandatory check
0930  * @first_key:      expected key in slot 0, skip check if NULL
0931  */
0932 struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
0933                       u64 owner_root, u64 parent_transid,
0934                       int level, struct btrfs_key *first_key)
0935 {
0936     struct extent_buffer *buf = NULL;
0937     int ret;
0938
0939     buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
0940     if (IS_ERR(buf))
0941         return buf;
0942
0943     ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key);
0944     if (ret) {
0945         free_extent_buffer_stale(buf);
0946         return ERR_PTR(ret);
0947     }
0948     if (btrfs_check_eb_owner(buf, owner_root)) {
0949         free_extent_buffer_stale(buf);
0950         return ERR_PTR(-EUCLEAN);
0951     }
0952     return buf;
0953
0954 }
0955
0956 void btrfs_clean_tree_block(struct extent_buffer *buf)
0957 {
0958     struct btrfs_fs_info *fs_info = buf->fs_info;
0959     if (btrfs_header_generation(buf) ==
0960         fs_info->running_transaction->transid) {
0961         btrfs_assert_tree_write_locked(buf);
0962
0963         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
0964             percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
0965                          -buf->len,
0966                          fs_info->dirty_metadata_batch);
0967             clear_extent_buffer_dirty(buf);
0968         }
0969     }
0970 }
0971
0972 static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
0973              u64 objectid)
0974 {
0975     bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
0976
0977     memset(&root->root_key, 0, sizeof(root->root_key));
0978     memset(&root->root_item, 0, sizeof(root->root_item));
0979     memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
0980     root->fs_info = fs_info;
0981     root->root_key.objectid = objectid;
0982     root->node = NULL;
0983     root->commit_root = NULL;
0984     root->state = 0;
0985     RB_CLEAR_NODE(&root->rb_node);
0986
0987     root->last_trans = 0;
0988     root->free_objectid = 0;
0989     root->nr_delalloc_inodes = 0;
0990     root->nr_ordered_extents = 0;
0991     root->inode_tree = RB_ROOT;
0992     INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
0993
0994     btrfs_init_root_block_rsv(root);
0995
0996     INIT_LIST_HEAD(&root->dirty_list);
0997     INIT_LIST_HEAD(&root->root_list);
0998     INIT_LIST_HEAD(&root->delalloc_inodes);
0999     INIT_LIST_HEAD(&root->delalloc_root);
1000     INIT_LIST_HEAD(&root->ordered_extents);
1001     INIT_LIST_HEAD(&root->ordered_root);
1002     INIT_LIST_HEAD(&root->reloc_dirty_list);
1003     INIT_LIST_HEAD(&root->logged_list[0]);
1004     INIT_LIST_HEAD(&root->logged_list[1]);
1005     spin_lock_init(&root->inode_lock);
1006     spin_lock_init(&root->delalloc_lock);
1007     spin_lock_init(&root->ordered_extent_lock);
1008     spin_lock_init(&root->accounting_lock);
1009     spin_lock_init(&root->log_extents_lock[0]);
1010     spin_lock_init(&root->log_extents_lock[1]);
1011     spin_lock_init(&root->qgroup_meta_rsv_lock);
1012     mutex_init(&root->objectid_mutex);
1013     mutex_init(&root->log_mutex);
1014     mutex_init(&root->ordered_extent_mutex);
1015     mutex_init(&root->delalloc_mutex);
1016     init_waitqueue_head(&root->qgroup_flush_wait);
1017     init_waitqueue_head(&root->log_writer_wait);
1018     init_waitqueue_head(&root->log_commit_wait[0]);
1019     init_waitqueue_head(&root->log_commit_wait[1]);
1020     INIT_LIST_HEAD(&root->log_ctxs[0]);
1021     INIT_LIST_HEAD(&root->log_ctxs[1]);
1022     atomic_set(&root->log_commit[0], 0);
1023     atomic_set(&root->log_commit[1], 0);
1024     atomic_set(&root->log_writers, 0);
1025     atomic_set(&root->log_batch, 0);
1026     refcount_set(&root->refs, 1);
1027     atomic_set(&root->snapshot_force_cow, 0);
1028     atomic_set(&root->nr_swapfiles, 0);
1029     root->log_transid = 0;
1030     root->log_transid_committed = -1;
1031     root->last_log_commit = 0;
1032     root->anon_dev = 0;
1033     if (!dummy) {
1034         extent_io_tree_init(fs_info, &root->dirty_log_pages,
1035                     IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
1036         extent_io_tree_init(fs_info, &root->log_csum_range,
1037                     IO_TREE_LOG_CSUM_RANGE, NULL);
1038     }
1039
1040     spin_lock_init(&root->root_item_lock);
1041     btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
1042 #ifdef CONFIG_BTRFS_DEBUG
1043     INIT_LIST_HEAD(&root->leak_list);
1044     spin_lock(&fs_info->fs_roots_radix_lock);
1045     list_add_tail(&root->leak_list, &fs_info->allocated_roots);
1046     spin_unlock(&fs_info->fs_roots_radix_lock);
1047 #endif
1048 }
1049
1050 static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
1051                        u64 objectid, gfp_t flags)
1052 {
1053     struct btrfs_root *root = kzalloc(sizeof(*root), flags);
1054     if (root)
1055         __setup_root(root, fs_info, objectid);
1056     return root;
1057 }
1058
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Allocate a root for the sanity tests; should only be used by the testing
 * infrastructure.
 *
 * The root is created with BTRFS_ROOT_TREE_OBJECTID. Returns the root, or
 * ERR_PTR(-EINVAL) when @fs_info is NULL, or ERR_PTR(-ENOMEM) when the
 * allocation fails.
 */
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
{
    struct btrfs_root *dummy;

    if (!fs_info)
        return ERR_PTR(-EINVAL);

    dummy = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
    if (!dummy)
        return ERR_PTR(-ENOMEM);

    /* The dummy root's alloc_bytenr starts out at 0. */
    dummy->alloc_bytenr = 0;

    return dummy;
}
#endif
1078
1079 static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
1080 {
1081     const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
1082     const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);
1083
1084     return btrfs_comp_cpu_keys(&a->root_key, &b->root_key);
1085 }
1086
1087 static int global_root_key_cmp(const void *k, const struct rb_node *node)
1088 {
1089     const struct btrfs_key *key = k;
1090     const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);
1091
1092     return btrfs_comp_cpu_keys(key, &root->root_key);
1093 }
1094
1095 int btrfs_global_root_insert(struct btrfs_root *root)
1096 {
1097     struct btrfs_fs_info *fs_info = root->fs_info;
1098     struct rb_node *tmp;
1099
1100     write_lock(&fs_info->global_root_lock);
1101     tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
1102     write_unlock(&fs_info->global_root_lock);
1103     ASSERT(!tmp);
1104
1105     return tmp ? -EEXIST : 0;
1106 }
1107
1108 void btrfs_global_root_delete(struct btrfs_root *root)
1109 {
1110     struct btrfs_fs_info *fs_info = root->fs_info;
1111
1112     write_lock(&fs_info->global_root_lock);
1113     rb_erase(&root->rb_node, &fs_info->global_root_tree);
1114     write_unlock(&fs_info->global_root_lock);
1115 }
1116
1117 struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
1118                      struct btrfs_key *key)
1119 {
1120     struct rb_node *node;
1121     struct btrfs_root *root = NULL;
1122
1123     read_lock(&fs_info->global_root_lock);
1124     node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
1125     if (node)
1126         root = container_of(node, struct btrfs_root, rb_node);
1127     read_unlock(&fs_info->global_root_lock);
1128
1129     return root;
1130 }
1131
1132 static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
1133 {
1134     struct btrfs_block_group *block_group;
1135     u64 ret;
1136
1137     if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
1138         return 0;
1139
1140     if (bytenr)
1141         block_group = btrfs_lookup_block_group(fs_info, bytenr);
1142     else
1143         block_group = btrfs_lookup_first_block_group(fs_info, bytenr);
1144     ASSERT(block_group);
1145     if (!block_group)
1146         return 0;
1147     ret = block_group->global_root_id;
1148     btrfs_put_block_group(block_group);
1149
1150     return ret;
1151 }
1152
1153 struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
1154 {
1155     struct btrfs_key key = {
1156         .objectid = BTRFS_CSUM_TREE_OBJECTID,
1157         .type = BTRFS_ROOT_ITEM_KEY,
1158         .offset = btrfs_global_root_id(fs_info, bytenr),
1159     };
1160
1161     return btrfs_global_root(fs_info, &key);
1162 }
1163
1164 struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
1165 {
1166     struct btrfs_key key = {
1167         .objectid = BTRFS_EXTENT_TREE_OBJECTID,
1168         .type = BTRFS_ROOT_ITEM_KEY,
1169         .offset = btrfs_global_root_id(fs_info, bytenr),
1170     };
1171
1172     return btrfs_global_root(fs_info, &key);
1173 }
1174
1175 struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1176                      u64 objectid)
1177 {
1178     struct btrfs_fs_info *fs_info = trans->fs_info;
1179     struct extent_buffer *leaf;
1180     struct btrfs_root *tree_root = fs_info->tree_root;
1181     struct btrfs_root *root;
1182     struct btrfs_key key;
1183     unsigned int nofs_flag;
1184     int ret = 0;
1185
1186     /*
1187      * We're holding a transaction handle, so use a NOFS memory allocation
1188      * context to avoid deadlock if reclaim happens.
1189      */
1190     nofs_flag = memalloc_nofs_save();
1191     root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
1192     memalloc_nofs_restore(nofs_flag);
1193     if (!root)
1194         return ERR_PTR(-ENOMEM);
1195
1196     root->root_key.objectid = objectid;
1197     root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1198     root->root_key.offset = 0;
1199
1200     leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
1201                       BTRFS_NESTING_NORMAL);
1202     if (IS_ERR(leaf)) {
1203         ret = PTR_ERR(leaf);
1204         leaf = NULL;
1205         goto fail_unlock;
1206     }
1207
1208     root->node = leaf;
1209     btrfs_mark_buffer_dirty(leaf);
1210
1211     root->commit_root = btrfs_root_node(root);
1212     set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
1213
1214     btrfs_set_root_flags(&root->root_item, 0);
1215     btrfs_set_root_limit(&root->root_item, 0);
1216     btrfs_set_root_bytenr(&root->root_item, leaf->start);
1217     btrfs_set_root_generation(&root->root_item, trans->transid);
1218     btrfs_set_root_level(&root->root_item, 0);
1219     btrfs_set_root_refs(&root->root_item, 1);
1220     btrfs_set_root_used(&root->root_item, leaf->len);
1221     btrfs_set_root_last_snapshot(&root->root_item, 0);
1222     btrfs_set_root_dirid(&root->root_item, 0);
1223     if (is_fstree(objectid))
1224         generate_random_guid(root->root_item.uuid);
1225     else
1226         export_guid(root->root_item.uuid, &guid_null);
1227     btrfs_set_root_drop_level(&root->root_item, 0);
1228
1229     btrfs_tree_unlock(leaf);
1230
1231     key.objectid = objectid;
1232     key.type = BTRFS_ROOT_ITEM_KEY;
1233     key.offset = 0;
1234     ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
1235     if (ret)
1236         goto fail;
1237
1238     return root;
1239
1240 fail_unlock:
1241     if (leaf)
1242         btrfs_tree_unlock(leaf);
1243 fail:
1244     btrfs_put_root(root);
1245
1246     return ERR_PTR(ret);
1247 }
1248
1249 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1250                      struct btrfs_fs_info *fs_info)
1251 {
1252     struct btrfs_root *root;
1253
1254     root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
1255     if (!root)
1256         return ERR_PTR(-ENOMEM);
1257
1258     root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
1259     root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1260     root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
1261
1262     return root;
1263 }
1264
1265 int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
1266                   struct btrfs_root *root)
1267 {
1268     struct extent_buffer *leaf;
1269
1270     /*
1271      * DON'T set SHAREABLE bit for log trees.
1272      *
1273      * Log trees are not exposed to user space thus can't be snapshotted,
1274      * and they go away before a real commit is actually done.
1275      *
1276      * They do store pointers to file data extents, and those reference
1277      * counts still get updated (along with back refs to the log tree).
1278      */
1279
1280     leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
1281             NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
1282     if (IS_ERR(leaf))
1283         return PTR_ERR(leaf);
1284
1285     root->node = leaf;
1286
1287     btrfs_mark_buffer_dirty(root->node);
1288     btrfs_tree_unlock(root->node);
1289
1290     return 0;
1291 }
1292
1293 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
1294                  struct btrfs_fs_info *fs_info)
1295 {
1296     struct btrfs_root *log_root;
1297
1298     log_root = alloc_log_tree(trans, fs_info);
1299     if (IS_ERR(log_root))
1300         return PTR_ERR(log_root);
1301
1302     if (!btrfs_is_zoned(fs_info)) {
1303         int ret = btrfs_alloc_log_tree_node(trans, log_root);
1304
1305         if (ret) {
1306             btrfs_put_root(log_root);
1307             return ret;
1308         }
1309     }
1310
1311     WARN_ON(fs_info->log_root_tree);
1312     fs_info->log_root_tree = log_root;
1313     return 0;
1314 }
1315
1316 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1317                struct btrfs_root *root)
1318 {
1319     struct btrfs_fs_info *fs_info = root->fs_info;
1320     struct btrfs_root *log_root;
1321     struct btrfs_inode_item *inode_item;
1322     int ret;
1323
1324     log_root = alloc_log_tree(trans, fs_info);
1325     if (IS_ERR(log_root))
1326         return PTR_ERR(log_root);
1327
1328     ret = btrfs_alloc_log_tree_node(trans, log_root);
1329     if (ret) {
1330         btrfs_put_root(log_root);
1331         return ret;
1332     }
1333
1334     log_root->last_trans = trans->transid;
1335     log_root->root_key.offset = root->root_key.objectid;
1336
1337     inode_item = &log_root->root_item.inode;
1338     btrfs_set_stack_inode_generation(inode_item, 1);
1339     btrfs_set_stack_inode_size(inode_item, 3);
1340     btrfs_set_stack_inode_nlink(inode_item, 1);
1341     btrfs_set_stack_inode_nbytes(inode_item,
1342                      fs_info->nodesize);
1343     btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
1344
1345     btrfs_set_root_node(&log_root->root_item, log_root->node);
1346
1347     WARN_ON(root->log_root);
1348     root->log_root = log_root;
1349     root->log_transid = 0;
1350     root->log_transid_committed = -1;
1351     root->last_log_commit = 0;
1352     return 0;
1353 }
1354
1355 static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
1356                           struct btrfs_path *path,
1357                           struct btrfs_key *key)
1358 {
1359     struct btrfs_root *root;
1360     struct btrfs_fs_info *fs_info = tree_root->fs_info;
1361     u64 generation;
1362     int ret;
1363     int level;
1364
1365     root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
1366     if (!root)
1367         return ERR_PTR(-ENOMEM);
1368
1369     ret = btrfs_find_root(tree_root, key, path,
1370                   &root->root_item, &root->root_key);
1371     if (ret) {
1372         if (ret > 0)
1373             ret = -ENOENT;
1374         goto fail;
1375     }
1376
1377     generation = btrfs_root_generation(&root->root_item);
1378     level = btrfs_root_level(&root->root_item);
1379     root->node = read_tree_block(fs_info,
1380                      btrfs_root_bytenr(&root->root_item),
1381                      key->objectid, generation, level, NULL);
1382     if (IS_ERR(root->node)) {
1383         ret = PTR_ERR(root->node);
1384         root->node = NULL;
1385         goto fail;
1386     }
1387     if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
1388         ret = -EIO;
1389         goto fail;
1390     }
1391
1392     /*
1393      * For real fs, and not log/reloc trees, root owner must
1394      * match its root node owner
1395      */
1396     if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
1397         root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
1398         root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1399         root->root_key.objectid != btrfs_header_owner(root->node)) {
1400         btrfs_crit(fs_info,
1401 "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
1402                root->root_key.objectid, root->node->start,
1403                btrfs_header_owner(root->node),
1404                root->root_key.objectid);
1405         ret = -EUCLEAN;
1406         goto fail;
1407     }
1408     root->commit_root = btrfs_root_node(root);
1409     return root;
1410 fail:
1411     btrfs_put_root(root);
1412     return ERR_PTR(ret);
1413 }
1414
1415 struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1416                     struct btrfs_key *key)
1417 {
1418     struct btrfs_root *root;
1419     struct btrfs_path *path;
1420
1421     path = btrfs_alloc_path();
1422     if (!path)
1423         return ERR_PTR(-ENOMEM);
1424     root = read_tree_root_path(tree_root, path, key);
1425     btrfs_free_path(path);
1426
1427     return root;
1428 }
1429
1430 /*
1431  * Initialize subvolume root in-memory structure
1432  *
1433  * @anon_dev:   anonymous device to attach to the root, if zero, allocate new
1434  */
1435 static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
1436 {
1437     int ret;
1438     unsigned int nofs_flag;
1439
1440     /*
1441      * We might be called under a transaction (e.g. indirect backref
1442      * resolution) which could deadlock if it triggers memory reclaim
1443      */
1444     nofs_flag = memalloc_nofs_save();
1445     ret = btrfs_drew_lock_init(&root->snapshot_lock);
1446     memalloc_nofs_restore(nofs_flag);
1447     if (ret)
1448         goto fail;
1449
1450     if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
1451         !btrfs_is_data_reloc_root(root)) {
1452         set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
1453         btrfs_check_and_init_root_item(&root->root_item);
1454     }
1455
1456     /*
1457      * Don't assign anonymous block device to roots that are not exposed to
1458      * userspace, the id pool is limited to 1M
1459      */
1460     if (is_fstree(root->root_key.objectid) &&
1461         btrfs_root_refs(&root->root_item) > 0) {
1462         if (!anon_dev) {
1463             ret = get_anon_bdev(&root->anon_dev);
1464             if (ret)
1465                 goto fail;
1466         } else {
1467             root->anon_dev = anon_dev;
1468         }
1469     }
1470
1471     mutex_lock(&root->objectid_mutex);
1472     ret = btrfs_init_root_free_objectid(root);
1473     if (ret) {
1474         mutex_unlock(&root->objectid_mutex);
1475         goto fail;
1476     }
1477
1478     ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
1479
1480     mutex_unlock(&root->objectid_mutex);
1481
1482     return 0;
1483 fail:
1484     /* The caller is responsible to call btrfs_free_fs_root */
1485     return ret;
1486 }
1487
1488 static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1489                            u64 root_id)
1490 {
1491     struct btrfs_root *root;
1492
1493     spin_lock(&fs_info->fs_roots_radix_lock);
1494     root = radix_tree_lookup(&fs_info->fs_roots_radix,
1495                  (unsigned long)root_id);
1496     if (root)
1497         root = btrfs_grab_root(root);
1498     spin_unlock(&fs_info->fs_roots_radix_lock);
1499     return root;
1500 }
1501
1502 static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
1503                         u64 objectid)
1504 {
1505     struct btrfs_key key = {
1506         .objectid = objectid,
1507         .type = BTRFS_ROOT_ITEM_KEY,
1508         .offset = 0,
1509     };
1510
1511     if (objectid == BTRFS_ROOT_TREE_OBJECTID)
1512         return btrfs_grab_root(fs_info->tree_root);
1513     if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
1514         return btrfs_grab_root(btrfs_global_root(fs_info, &key));
1515     if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
1516         return btrfs_grab_root(fs_info->chunk_root);
1517     if (objectid == BTRFS_DEV_TREE_OBJECTID)
1518         return btrfs_grab_root(fs_info->dev_root);
1519     if (objectid == BTRFS_CSUM_TREE_OBJECTID)
1520         return btrfs_grab_root(btrfs_global_root(fs_info, &key));
1521     if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
1522         return btrfs_grab_root(fs_info->quota_root) ?
1523             fs_info->quota_root : ERR_PTR(-ENOENT);
1524     if (objectid == BTRFS_UUID_TREE_OBJECTID)
1525         return btrfs_grab_root(fs_info->uuid_root) ?
1526             fs_info->uuid_root : ERR_PTR(-ENOENT);
1527     if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
1528         struct btrfs_root *root = btrfs_global_root(fs_info, &key);
1529
1530         return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
1531     }
1532     return NULL;
1533 }
1534
1535 int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1536              struct btrfs_root *root)
1537 {
1538     int ret;
1539
1540     ret = radix_tree_preload(GFP_NOFS);
1541     if (ret)
1542         return ret;
1543
1544     spin_lock(&fs_info->fs_roots_radix_lock);
1545     ret = radix_tree_insert(&fs_info->fs_roots_radix,
1546                 (unsigned long)root->root_key.objectid,
1547                 root);
1548     if (ret == 0) {
1549         btrfs_grab_root(root);
1550         set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
1551     }
1552     spin_unlock(&fs_info->fs_roots_radix_lock);
1553     radix_tree_preload_end();
1554
1555     return ret;
1556 }
1557
/*
 * Report and release every root still on fs_info->allocated_roots.
 *
 * Only does anything with CONFIG_BTRFS_DEBUG. Each remaining root is logged
 * with its name and refcount, then put repeatedly until its count is down to
 * one, then put one final time.
 */
void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
{
#ifdef CONFIG_BTRFS_DEBUG
    struct btrfs_root *leaked;

    for (; !list_empty(&fs_info->allocated_roots); ) {
        char name[BTRFS_ROOT_NAME_BUF_LEN];

        leaked = list_first_entry(&fs_info->allocated_roots,
                      struct btrfs_root, leak_list);
        btrfs_err(fs_info, "leaked root %s refcount %d",
              btrfs_root_name(&leaked->root_key, name),
              refcount_read(&leaked->refs));

        /* Drop the extra references first, then the last one. */
        while (refcount_read(&leaked->refs) > 1)
            btrfs_put_root(leaked);
        btrfs_put_root(leaked);
    }
#endif
}
1577
1578 static void free_global_roots(struct btrfs_fs_info *fs_info)
1579 {
1580     struct btrfs_root *root;
1581     struct rb_node *node;
1582
1583     while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
1584         root = rb_entry(node, struct btrfs_root, rb_node);
1585         rb_erase(&root->rb_node, &fs_info->global_root_tree);
1586         btrfs_put_root(root);
1587     }
1588 }
1589
1590 void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
1591 {
1592     percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
1593     percpu_counter_destroy(&fs_info->delalloc_bytes);
1594     percpu_counter_destroy(&fs_info->ordered_bytes);
1595     percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
1596     btrfs_free_csum_hash(fs_info);
1597     btrfs_free_stripe_hash_table(fs_info);
1598     btrfs_free_ref_cache(fs_info);
1599     kfree(fs_info->balance_ctl);
1600     kfree(fs_info->delayed_root);
1601     free_global_roots(fs_info);
1602     btrfs_put_root(fs_info->tree_root);
1603     btrfs_put_root(fs_info->chunk_root);
1604     btrfs_put_root(fs_info->dev_root);
1605     btrfs_put_root(fs_info->quota_root);
1606     btrfs_put_root(fs_info->uuid_root);
1607     btrfs_put_root(fs_info->fs_root);
1608     btrfs_put_root(fs_info->data_reloc_root);
1609     btrfs_put_root(fs_info->block_group_root);
1610     btrfs_check_leaked_roots(fs_info);
1611     btrfs_extent_buffer_leak_debug_check(fs_info);
1612     kfree(fs_info->super_copy);
1613     kfree(fs_info->super_for_commit);
1614     kfree(fs_info->subpage_info);
1615     kvfree(fs_info);
1616 }
1617
1618
1619 /*
1620  * Get an in-memory reference of a root structure.
1621  *
1622  * For essential trees like root/extent tree, we grab it from fs_info directly.
1623  * For subvolume trees, we check the cached filesystem roots first. If not
1624  * found, then read it from disk and add it to cached fs roots.
1625  *
1626  * Caller should release the root by calling btrfs_put_root() after the usage.
1627  *
1628  * NOTE: Reloc and log trees can't be read by this function as they share the
1629  *   same root objectid.
1630  *
1631  * @objectid:   root id
1632  * @anon_dev:   preallocated anonymous block device number for new roots,
1633  *      pass 0 for new allocation.
1634  * @check_ref:  whether to check root item references, If true, return -ENOENT
1635  *      for orphan roots
1636  */
1637 static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
1638                          u64 objectid, dev_t anon_dev,
1639                          bool check_ref)
1640 {
1641     struct btrfs_root *root;
1642     struct btrfs_path *path;
1643     struct btrfs_key key;
1644     int ret;
1645
1646     root = btrfs_get_global_root(fs_info, objectid);
1647     if (root)
1648         return root;
1649 again:
1650     root = btrfs_lookup_fs_root(fs_info, objectid);
1651     if (root) {
1652         /* Shouldn't get preallocated anon_dev for cached roots */
1653         ASSERT(!anon_dev);
1654         if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1655             btrfs_put_root(root);
1656             return ERR_PTR(-ENOENT);
1657         }
1658         return root;
1659     }
1660
1661     key.objectid = objectid;
1662     key.type = BTRFS_ROOT_ITEM_KEY;
1663     key.offset = (u64)-1;
1664     root = btrfs_read_tree_root(fs_info->tree_root, &key);
1665     if (IS_ERR(root))
1666         return root;
1667
1668     if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1669         ret = -ENOENT;
1670         goto fail;
1671     }
1672
1673     ret = btrfs_init_fs_root(root, anon_dev);
1674     if (ret)
1675         goto fail;
1676
1677     path = btrfs_alloc_path();
1678     if (!path) {
1679         ret = -ENOMEM;
1680         goto fail;
1681     }
1682     key.objectid = BTRFS_ORPHAN_OBJECTID;
1683     key.type = BTRFS_ORPHAN_ITEM_KEY;
1684     key.offset = objectid;
1685
1686     ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
1687     btrfs_free_path(path);
1688     if (ret < 0)
1689         goto fail;
1690     if (ret == 0)
1691         set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
1692
1693     ret = btrfs_insert_fs_root(fs_info, root);
1694     if (ret) {
1695         if (ret == -EEXIST) {
1696             btrfs_put_root(root);
1697             goto again;
1698         }
1699         goto fail;
1700     }
1701     return root;
1702 fail:
1703     /*
1704      * If our caller provided us an anonymous device, then it's his
1705      * responsibility to free it in case we fail. So we have to set our
1706      * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
1707      * and once again by our caller.
1708      */
1709     if (anon_dev)
1710         root->anon_dev = 0;
1711     btrfs_put_root(root);
1712     return ERR_PTR(ret);
1713 }
1714
1715 /*
1716  * Get in-memory reference of a root structure
1717  *
1718  * @objectid:   tree objectid
1719  * @check_ref:  if set, verify that the tree exists and the item has at least
1720  *      one reference
1721  */
1722 struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
1723                      u64 objectid, bool check_ref)
1724 {
1725     return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
1726 }
1727
1728 /*
1729  * Get in-memory reference of a root structure, created as new, optionally pass
1730  * the anonymous block device id
1731  *
1732  * @objectid:   tree objectid
1733  * @anon_dev:   if zero, allocate a new anonymous block device or use the
1734  *      parameter value
1735  */
1736 struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
1737                      u64 objectid, dev_t anon_dev)
1738 {
1739     return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
1740 }
1741
1742 /*
1743  * btrfs_get_fs_root_commit_root - return a root for the given objectid
1744  * @fs_info:    the fs_info
1745  * @objectid:   the objectid we need to lookup
1746  *
1747  * This is exclusively used for backref walking, and exists specifically because
1748  * of how qgroups does lookups.  Qgroups will do a backref lookup at delayed ref
1749  * creation time, which means we may have to read the tree_root in order to look
1750  * up a fs root that is not in memory.  If the root is not in memory we will
1751  * read the tree root commit root and look up the fs root from there.  This is a
1752  * temporary root, it will not be inserted into the radix tree as it doesn't
1753  * have the most uptodate information, it'll simply be discarded once the
1754  * backref code is finished using the root.
1755  */
1756 struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
1757                          struct btrfs_path *path,
1758                          u64 objectid)
1759 {
1760     struct btrfs_root *root;
1761     struct btrfs_key key;
1762
1763     ASSERT(path->search_commit_root && path->skip_locking);
1764
1765     /*
1766      * This can return -ENOENT if we ask for a root that doesn't exist, but
1767      * since this is called via the backref walking code we won't be looking
1768      * up a root that doesn't exist, unless there's corruption.  So if root
1769      * != NULL just return it.
1770      */
1771     root = btrfs_get_global_root(fs_info, objectid);
1772     if (root)
1773         return root;
1774
1775     root = btrfs_lookup_fs_root(fs_info, objectid);
1776     if (root)
1777         return root;
1778
1779     key.objectid = objectid;
1780     key.type = BTRFS_ROOT_ITEM_KEY;
1781     key.offset = (u64)-1;
1782     root = read_tree_root_path(fs_info->tree_root, path, &key);
1783     btrfs_release_path(path);
1784
1785     return root;
1786 }
1787
/*
 * Main loop of the cleaner kernel thread.
 *
 * @arg: the struct btrfs_fs_info of the filesystem this thread serves.
 *
 * Each pass runs delayed iputs, cleans at most one deleted snapshot, runs
 * inode defragmentation, deletes unused block groups and reclaims block
 * groups.  BTRFS_FS_CLEANER_RUNNING is set in fs_info->flags for the duration
 * of a pass and cleared (with a wake-up of bit waiters) at the end of it.
 *
 * Returns 0 once kthread_should_stop() reports true.
 */
static int cleaner_kthread(void *arg)
{
    struct btrfs_fs_info *fs_info = arg;
    /* Non-zero means: skip the sleep and start another pass right away. */
    int again;

    while (1) {
        again = 0;

        set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);

        /* Make the cleaner go to sleep early. */
        if (btrfs_need_cleaner_sleep(fs_info))
            goto sleep;

        /*
         * Do not do anything if we might cause open_ctree() to block
         * before we have finished mounting the filesystem.
         */
        if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
            goto sleep;

        /* Never block on the mutex; if it is contended, skip this pass. */
        if (!mutex_trylock(&fs_info->cleaner_mutex))
            goto sleep;

        /*
         * Avoid the problem that we change the status of the fs
         * during the above check and trylock.
         */
        if (btrfs_need_cleaner_sleep(fs_info)) {
            mutex_unlock(&fs_info->cleaner_mutex);
            goto sleep;
        }

        btrfs_run_delayed_iputs(fs_info);

        /* The result decides whether we loop again without sleeping. */
        again = btrfs_clean_one_deleted_snapshot(fs_info);
        mutex_unlock(&fs_info->cleaner_mutex);

        /*
         * The defragger has dealt with the R/O remount and umount,
         * needn't do anything special here.
         */
        btrfs_run_defrag_inodes(fs_info);

        /*
         * Acquires fs_info->reclaim_bgs_lock to avoid racing
         * with relocation (btrfs_relocate_chunk) and relocation
         * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
         * after acquiring fs_info->reclaim_bgs_lock. So we
         * can't hold, nor need to, fs_info->cleaner_mutex when deleting
         * unused block groups.
         */
        btrfs_delete_unused_bgs(fs_info);

        /*
         * Reclaim block groups in the reclaim_bgs list after we deleted
         * all unused block_groups. This possibly gives us some more free
         * space.
         */
        btrfs_reclaim_bgs(fs_info);
sleep:
        /* End of the pass: drop the running bit and wake anyone waiting on it. */
        clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
        if (kthread_should_park())
            kthread_parkme();
        if (kthread_should_stop())
            return 0;
        if (!again) {
            /*
             * Nothing more to do right now: sleep until something
             * wakes this task (transaction_kthread() does so via
             * wake_up_process() at the end of each of its passes).
             */
            set_current_state(TASK_INTERRUPTIBLE);
            schedule();
            __set_current_state(TASK_RUNNING);
        }
    }
}
1861
/*
 * Main loop of the periodic transaction commit kernel thread.
 *
 * @arg: a struct btrfs_root; its fs_info identifies the filesystem.
 *
 * On every pass the thread looks at fs_info->running_transaction and commits
 * it when a commit was explicitly requested (BTRFS_FS_COMMIT_TRANS), when the
 * transaction has already reached TRANS_STATE_COMMIT_START, or when it is at
 * least fs_info->commit_interval seconds old.  At the end of each pass the
 * cleaner thread is woken up.
 *
 * Returns 0 once kthread_should_stop() reports true.
 */
static int transaction_kthread(void *arg)
{
    struct btrfs_root *root = arg;
    struct btrfs_fs_info *fs_info = root->fs_info;
    struct btrfs_trans_handle *trans;
    struct btrfs_transaction *cur;
    u64 transid;
    time64_t delta;
    unsigned long delay;
    bool cannot_commit;

    do {
        cannot_commit = false;
        /* Default sleep time: one full commit interval (seconds -> jiffies). */
        delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
        mutex_lock(&fs_info->transaction_kthread_mutex);

        spin_lock(&fs_info->trans_lock);
        cur = fs_info->running_transaction;
        if (!cur) {
            spin_unlock(&fs_info->trans_lock);
            goto sleep;
        }

        /* Age of the running transaction, in seconds. */
        delta = ktime_get_seconds() - cur->start_time;
        if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
            cur->state < TRANS_STATE_COMMIT_START &&
            delta < fs_info->commit_interval) {
            spin_unlock(&fs_info->trans_lock);
            /*
             * Too young to commit: shorten the sleep by the time the
             * transaction has already been running.  (delta - 1) is
             * negative when delta is 0; the min() below bounds the
             * result to one commit interval.
             * NOTE(review): the delta == 0 case appears to rely on
             * how msecs_to_jiffies() treats the converted negative
             * value and on unsigned wraparound of delay - confirm.
             */
            delay -= msecs_to_jiffies((delta - 1) * 1000);
            delay = min(delay,
                    msecs_to_jiffies(fs_info->commit_interval * 1000));
            goto sleep;
        }
        /* Remember which transaction we decided to commit. */
        transid = cur->transid;
        spin_unlock(&fs_info->trans_lock);

        /* If the file system is aborted, this will always fail. */
        trans = btrfs_attach_transaction(root);
        if (IS_ERR(trans)) {
            /* Any error other than -ENOENT marks the commit as not possible. */
            if (PTR_ERR(trans) != -ENOENT)
                cannot_commit = true;
            goto sleep;
        }
        /*
         * Only commit if we attached to the transaction inspected above;
         * otherwise just drop the handle.  The return value of the commit
         * is not checked here.
         */
        if (transid == trans->transid) {
            btrfs_commit_transaction(trans);
        } else {
            btrfs_end_transaction(trans);
        }
sleep:
        wake_up_process(fs_info->cleaner_kthread);
        mutex_unlock(&fs_info->transaction_kthread_mutex);

        if (BTRFS_FS_ERROR(fs_info))
            btrfs_cleanup_transaction(fs_info);
        /*
         * Sleep for 'delay' jiffies unless we were asked to stop.  When
         * the transaction is blocked and committing is possible, the
         * sleep is skipped and the loop runs again immediately.
         */
        if (!kthread_should_stop() &&
                (!btrfs_transaction_blocked(fs_info) ||
                 cannot_commit))
            schedule_timeout_interruptible(delay);
    } while (!kthread_should_stop());
    return 0;
}
1923
1924 /*
1925  * This will find the highest generation in the array of root backups.  The
1926  * index of the highest array is returned, or -EINVAL if we can't find
1927  * anything.
1928  *
1929  * We check to make sure the array is valid by comparing the
1930  * generation of the latest  root in the array with the generation
1931  * in the super block.  If they don't match we pitch it.
1932  */
1933 static int find_newest_super_backup(struct btrfs_fs_info *info)
1934 {
1935     const u64 newest_gen = btrfs_super_generation(info->super_copy);
1936     u64 cur;
1937     struct btrfs_root_backup *root_backup;
1938     int i;
1939
1940     for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1941         root_backup = info->super_copy->super_roots + i;
1942         cur = btrfs_backup_tree_root_gen(root_backup);
1943         if (cur == newest_gen)
1944             return i;
1945     }
1946
1947     return -EINVAL;
1948 }
1949
1950 /*
1951  * copy all the root pointers into the super backup array.
1952  * this will bump the backup pointer by one when it is
1953  * done
1954  */
1955 static void backup_super_roots(struct btrfs_fs_info *info)
1956 {
1957     const int next_backup = info->backup_root_index;
1958     struct btrfs_root_backup *root_backup;
1959
1960     root_backup = info->super_for_commit->super_roots + next_backup;
1961
1962     /*
1963      * make sure all of our padding and empty slots get zero filled
1964      * regardless of which ones we use today
1965      */
1966     memset(root_backup, 0, sizeof(*root_backup));
1967
1968     info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
1969
1970     btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
1971     btrfs_set_backup_tree_root_gen(root_backup,
1972                    btrfs_header_generation(info->tree_root->node));
1973
1974     btrfs_set_backup_tree_root_level(root_backup,
1975                    btrfs_header_level(info->tree_root->node));
1976
1977     btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
1978     btrfs_set_backup_chunk_root_gen(root_backup,
1979                    btrfs_header_generation(info->chunk_root->node));
1980     btrfs_set_backup_chunk_root_level(root_backup,
1981                    btrfs_header_level(info->chunk_root->node));
1982
1983     if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) {
1984         btrfs_set_backup_block_group_root(root_backup,
1985                     info->block_group_root->node->start);
1986         btrfs_set_backup_block_group_root_gen(root_backup,
1987             btrfs_header_generation(info->block_group_root->node));
1988         btrfs_set_backup_block_group_root_level(root_backup,
1989             btrfs_header_level(info->block_group_root->node));
1990     } else {
1991         struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
1992         struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
1993
1994         btrfs_set_backup_extent_root(root_backup,
1995                          extent_root->node->start);
1996         btrfs_set_backup_extent_root_gen(root_backup,
1997                 btrfs_header_generation(extent_root->node));
1998         btrfs_set_backup_extent_root_level(root_backup,
1999                     btrfs_header_level(extent_root->node));
2000
2001         btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
2002         btrfs_set_backup_csum_root_gen(root_backup,
2003                            btrfs_header_generation(csum_root->node));
2004         btrfs_set_backup_csum_root_level(root_backup,
2005                          btrfs_header_level(csum_root->node));
2006     }
2007
2008     /*
2009      * we might commit during log recovery, which happens before we set
2010      * the fs_root.  Make sure it is valid before we fill it in.
2011      */
2012     if (info->fs_root && info->fs_root->node) {
2013         btrfs_set_backup_fs_root(root_backup,
2014                      info->fs_root->node->start);
2015         btrfs_set_backup_fs_root_gen(root_backup,
2016                    btrfs_header_generation(info->fs_root->node));
2017         btrfs_set_backup_fs_root_level(root_backup,
2018                    btrfs_header_level(info->fs_root->node));
2019     }
2020
2021     btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
2022     btrfs_set_backup_dev_root_gen(root_backup,
2023                    btrfs_header_generation(info->dev_root->node));
2024     btrfs_set_backup_dev_root_level(root_backup,
2025                        btrfs_header_level(info->dev_root->node));
2026
2027     btrfs_set_backup_total_bytes(root_backup,
2028                  btrfs_super_total_bytes(info->super_copy));
2029     btrfs_set_backup_bytes_used(root_backup,
2030                  btrfs_super_bytes_used(info->super_copy));
2031     btrfs_set_backup_num_devices(root_backup,
2032                  btrfs_super_num_devices(info->super_copy));
2033
2034     /*
2035      * if we don't copy this out to the super_copy, it won't get remembered
2036      * for the next commit
2037      */
2038     memcpy(&info->super_copy->super_roots,
2039            &info->super_for_commit->super_roots,
2040            sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
2041 }
2042
2043 /*
2044  * read_backup_root - Reads a backup root based on the passed priority. Prio 0
2045  * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
2046  *
2047  * fs_info - filesystem whose backup roots need to be read
2048  * priority - priority of backup root required
2049  *
2050  * Returns backup root index on success and -EINVAL otherwise.
2051  */
2052 static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
2053 {
2054     int backup_index = find_newest_super_backup(fs_info);
2055     struct btrfs_super_block *super = fs_info->super_copy;
2056     struct btrfs_root_backup *root_backup;
2057
2058     if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
2059         if (priority == 0)
2060             return backup_index;
2061
2062         backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
2063         backup_index %= BTRFS_NUM_BACKUP_ROOTS;
2064     } else {
2065         return -EINVAL;
2066     }
2067
2068     root_backup = super->super_roots + backup_index;
2069
2070     btrfs_set_super_generation(super,
2071                    btrfs_backup_tree_root_gen(root_backup));
2072     btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
2073     btrfs_set_super_root_level(super,
2074                    btrfs_backup_tree_root_level(root_backup));
2075     btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
2076
2077     /*
2078      * Fixme: the total bytes and num_devices need to match or we should
2079      * need a fsck
2080      */
2081     btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
2082     btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
2083
2084     return backup_index;
2085 }
2086
2087 /* helper to cleanup workers */
2088 static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
2089 {
2090     btrfs_destroy_workqueue(fs_info->fixup_workers);
2091     btrfs_destroy_workqueue(fs_info->delalloc_workers);
2092     btrfs_destroy_workqueue(fs_info->hipri_workers);
2093     btrfs_destroy_workqueue(fs_info->workers);
2094     if (fs_info->endio_workers)
2095         destroy_workqueue(fs_info->endio_workers);
2096     if (fs_info->endio_raid56_workers)
2097         destroy_workqueue(fs_info->endio_raid56_workers);
2098     if (fs_info->rmw_workers)
2099         destroy_workqueue(fs_info->rmw_workers);
2100     if (fs_info->compressed_write_workers)
2101         destroy_workqueue(fs_info->compressed_write_workers);
2102     btrfs_destroy_workqueue(fs_info->endio_write_workers);
2103     btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
2104     btrfs_destroy_workqueue(fs_info->delayed_workers);
2105     btrfs_destroy_workqueue(fs_info->caching_workers);
2106     btrfs_destroy_workqueue(fs_info->flush_workers);
2107     btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
2108     if (fs_info->discard_ctl.discard_workers)
2109         destroy_workqueue(fs_info->discard_ctl.discard_workers);
2110     /*
2111      * Now that all other work queues are destroyed, we can safely destroy
2112      * the queues used for metadata I/O, since tasks from those other work
2113      * queues can do metadata I/O operations.
2114      */
2115     if (fs_info->endio_meta_workers)
2116         destroy_workqueue(fs_info->endio_meta_workers);
2117 }
2118
2119 static void free_root_extent_buffers(struct btrfs_root *root)
2120 {
2121     if (root) {
2122         free_extent_buffer(root->node);
2123         free_extent_buffer(root->commit_root);
2124         root->node = NULL;
2125         root->commit_root = NULL;
2126     }
2127 }
2128
2129 static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
2130 {
2131     struct btrfs_root *root, *tmp;
2132
2133     rbtree_postorder_for_each_entry_safe(root, tmp,
2134                          &fs_info->global_root_tree,
2135                          rb_node)
2136         free_root_extent_buffers(root);
2137 }
2138
2139 /* helper to cleanup tree roots */
2140 static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
2141 {
2142     free_root_extent_buffers(info->tree_root);
2143
2144     free_global_root_pointers(info);
2145     free_root_extent_buffers(info->dev_root);
2146     free_root_extent_buffers(info->quota_root);
2147     free_root_extent_buffers(info->uuid_root);
2148     free_root_extent_buffers(info->fs_root);
2149     free_root_extent_buffers(info->data_reloc_root);
2150     free_root_extent_buffers(info->block_group_root);
2151     if (free_chunk_root)
2152         free_root_extent_buffers(info->chunk_root);
2153 }
2154
2155 void btrfs_put_root(struct btrfs_root *root)
2156 {
2157     if (!root)
2158         return;
2159
2160     if (refcount_dec_and_test(&root->refs)) {
2161         WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
2162         WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
2163         if (root->anon_dev)
2164             free_anon_bdev(root->anon_dev);
2165         btrfs_drew_lock_destroy(&root->snapshot_lock);
2166         free_root_extent_buffers(root);
2167 #ifdef CONFIG_BTRFS_DEBUG
2168         spin_lock(&root->fs_info->fs_roots_radix_lock);
2169         list_del_init(&root->leak_list);
2170         spin_unlock(&root->fs_info->fs_roots_radix_lock);
2171 #endif
2172         kfree(root);
2173     }
2174 }
2175
2176 void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
2177 {
2178     int ret;
2179     struct btrfs_root *gang[8];
2180     int i;
2181
2182     while (!list_empty(&fs_info->dead_roots)) {
2183         gang[0] = list_entry(fs_info->dead_roots.next,
2184                      struct btrfs_root, root_list);
2185         list_del(&gang[0]->root_list);
2186
2187         if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
2188             btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2189         btrfs_put_root(gang[0]);
2190     }
2191
2192     while (1) {
2193         ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2194                          (void **)gang, 0,
2195                          ARRAY_SIZE(gang));
2196         if (!ret)
2197             break;
2198         for (i = 0; i < ret; i++)
2199             btrfs_drop_and_free_fs_root(fs_info, gang[i]);
2200     }
2201 }
2202
2203 static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
2204 {
2205     mutex_init(&fs_info->scrub_lock);
2206     atomic_set(&fs_info->scrubs_running, 0);
2207     atomic_set(&fs_info->scrub_pause_req, 0);
2208     atomic_set(&fs_info->scrubs_paused, 0);
2209     atomic_set(&fs_info->scrub_cancel_req, 0);
2210     init_waitqueue_head(&fs_info->scrub_pause_wait);
2211     refcount_set(&fs_info->scrub_workers_refcnt, 0);
2212 }
2213
2214 static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
2215 {
2216     spin_lock_init(&fs_info->balance_lock);
2217     mutex_init(&fs_info->balance_mutex);
2218     atomic_set(&fs_info->balance_pause_req, 0);
2219     atomic_set(&fs_info->balance_cancel_req, 0);
2220     fs_info->balance_ctl = NULL;
2221     init_waitqueue_head(&fs_info->balance_wait_q);
2222     atomic_set(&fs_info->reloc_cancel_req, 0);
2223 }
2224
2225 static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
2226 {
2227     struct inode *inode = fs_info->btree_inode;
2228
2229     inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
2230     set_nlink(inode, 1);
2231     /*
2232      * we set the i_size on the btree inode to the max possible int.
2233      * the real end of the address space is determined by all of
2234      * the devices in the system
2235      */
2236     inode->i_size = OFFSET_MAX;
2237     inode->i_mapping->a_ops = &btree_aops;
2238
2239     RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
2240     extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
2241                 IO_TREE_BTREE_INODE_IO, inode);
2242     BTRFS_I(inode)->io_tree.track_uptodate = false;
2243     extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
2244
2245     BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
2246     BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID;
2247     BTRFS_I(inode)->location.type = 0;
2248     BTRFS_I(inode)->location.offset = 0;
2249     set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
2250     btrfs_insert_inode_hash(inode);
2251 }
2252
2253 static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
2254 {
2255     mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2256     init_rwsem(&fs_info->dev_replace.rwsem);
2257     init_waitqueue_head(&fs_info->dev_replace.replace_wait);
2258 }
2259
2260 static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
2261 {
2262     spin_lock_init(&fs_info->qgroup_lock);
2263     mutex_init(&fs_info->qgroup_ioctl_lock);
2264     fs_info->qgroup_tree = RB_ROOT;
2265     INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2266     fs_info->qgroup_seq = 1;
2267     fs_info->qgroup_ulist = NULL;
2268     fs_info->qgroup_rescan_running = false;
2269     mutex_init(&fs_info->qgroup_rescan_lock);
2270 }
2271
2272 static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
2273 {
2274     u32 max_active = fs_info->thread_pool_size;
2275     unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2276
2277     fs_info->workers =
2278         btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
2279     fs_info->hipri_workers =
2280         btrfs_alloc_workqueue(fs_info, "worker-high",
2281                       flags | WQ_HIGHPRI, max_active, 16);
2282
2283     fs_info->delalloc_workers =
2284         btrfs_alloc_workqueue(fs_info, "delalloc",
2285                       flags, max_active, 2);
2286
2287     fs_info->flush_workers =
2288         btrfs_alloc_workqueue(fs_info, "flush_delalloc",
2289                       flags, max_active, 0);
2290
2291     fs_info->caching_workers =
2292         btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
2293
2294     fs_info->fixup_workers =
2295         btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
2296
2297     fs_info->endio_workers =
2298         alloc_workqueue("btrfs-endio", flags, max_active);
2299     fs_info->endio_meta_workers =
2300         alloc_workqueue("btrfs-endio-meta", flags, max_active);
2301     fs_info->endio_raid56_workers =
2302         alloc_workqueue("btrfs-endio-raid56", flags, max_active);
2303     fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
2304     fs_info->endio_write_workers =
2305         btrfs_alloc_workqueue(fs_info, "endio-write", flags,
2306                       max_active, 2);
2307     fs_info->compressed_write_workers =
2308         alloc_workqueue("btrfs-compressed-write", flags, max_active);
2309     fs_info->endio_freespace_worker =
2310         btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
2311                       max_active, 0);
2312     fs_info->delayed_workers =
2313         btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
2314                       max_active, 0);
2315     fs_info->qgroup_rescan_workers =
2316         btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
2317     fs_info->discard_ctl.discard_workers =
2318         alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
2319
2320     if (!(fs_info->workers && fs_info->hipri_workers &&
2321           fs_info->delalloc_workers && fs_info->flush_workers &&
2322           fs_info->endio_workers && fs_info->endio_meta_workers &&
2323           fs_info->compressed_write_workers &&
2324           fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
2325           fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2326           fs_info->caching_workers && fs_info->fixup_workers &&
2327           fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
2328           fs_info->discard_ctl.discard_workers)) {
2329         return -ENOMEM;
2330     }
2331
2332     return 0;
2333 }
2334
2335 static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
2336 {
2337     struct crypto_shash *csum_shash;
2338     const char *csum_driver = btrfs_super_csum_driver(csum_type);
2339
2340     csum_shash = crypto_alloc_shash(csum_driver, 0, 0);
2341
2342     if (IS_ERR(csum_shash)) {
2343         btrfs_err(fs_info, "error allocating %s hash for checksum",
2344               csum_driver);
2345         return PTR_ERR(csum_shash);
2346     }
2347
2348     fs_info->csum_shash = csum_shash;
2349
2350     btrfs_info(fs_info, "using %s (%s) checksum algorithm",
2351             btrfs_super_csum_name(csum_type),
2352             crypto_shash_driver_name(csum_shash));
2353     return 0;
2354 }
2355
2356 static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2357                 struct btrfs_fs_devices *fs_devices)
2358 {
2359     int ret;
2360     struct btrfs_root *log_tree_root;
2361     struct btrfs_super_block *disk_super = fs_info->super_copy;
2362     u64 bytenr = btrfs_super_log_root(disk_super);
2363     int level = btrfs_super_log_root_level(disk_super);
2364
2365     if (fs_devices->rw_devices == 0) {
2366         btrfs_warn(fs_info, "log replay required on RO media");
2367         return -EIO;
2368     }
2369
2370     log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
2371                      GFP_KERNEL);
2372     if (!log_tree_root)
2373         return -ENOMEM;
2374
2375     log_tree_root->node = read_tree_block(fs_info, bytenr,
2376                           BTRFS_TREE_LOG_OBJECTID,
2377                           fs_info->generation + 1, level,
2378                           NULL);
2379     if (IS_ERR(log_tree_root->node)) {
2380         btrfs_warn(fs_info, "failed to read log tree");
2381         ret = PTR_ERR(log_tree_root->node);
2382         log_tree_root->node = NULL;
2383         btrfs_put_root(log_tree_root);
2384         return ret;
2385     }
2386     if (!extent_buffer_uptodate(log_tree_root->node)) {
2387         btrfs_err(fs_info, "failed to read log tree");
2388         btrfs_put_root(log_tree_root);
2389         return -EIO;
2390     }
2391
2392     /* returns with log_tree_root freed on success */
2393     ret = btrfs_recover_log_trees(log_tree_root);
2394     if (ret) {
2395         btrfs_handle_fs_error(fs_info, ret,
2396                       "Failed to recover log tree");
2397         btrfs_put_root(log_tree_root);
2398         return ret;
2399     }
2400
2401     if (sb_rdonly(fs_info->sb)) {
2402         ret = btrfs_commit_super(fs_info);
2403         if (ret)
2404             return ret;
2405     }
2406
2407     return 0;
2408 }
2409
/*
 * Load every root item with the given objectid from the tree root and insert
 * the resulting roots into the global root tree.
 *
 * @tree_root: root to search for the root items
 * @path:      caller provided path; released before returning
 * @objectid:  objectid of the global roots to load
 * @name:      human readable name, only used in the error message
 *
 * For BTRFS_EXTENT_TREE_OBJECTID this also sets fs_info->nr_global_roots to
 * the highest key offset seen plus one.  For BTRFS_CSUM_TREE_OBJECTID,
 * BTRFS_FS_STATE_NO_CSUMS is set when the IGNOREDATACSUMS option is enabled
 * (nothing is loaded then) or when loading fails or finds nothing.
 *
 * Returns 0 on success.  When no root was found or an error occurred, the
 * error (or -ENOENT if there was none) is returned, unless the
 * IGNOREBADROOTS option is enabled, in which case 0 is returned.  The
 * failure is logged in both cases.
 */
static int load_global_roots_objectid(struct btrfs_root *tree_root,
                      struct btrfs_path *path, u64 objectid,
                      const char *name)
{
    struct btrfs_fs_info *fs_info = tree_root->fs_info;
    struct btrfs_root *root;
    u64 max_global_id = 0;
    int ret;
    /* Search cursor; its offset is advanced past every root that was loaded. */
    struct btrfs_key key = {
        .objectid = objectid,
        .type = BTRFS_ROOT_ITEM_KEY,
        .offset = 0,
    };
    bool found = false;

    /* If we have IGNOREDATACSUMS skip loading these roots. */
    if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
        btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
        set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
        return 0;
    }

    while (1) {
        ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
        if (ret < 0)
            break;

        /* The slot is past the last item of this leaf: move to the next leaf. */
        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
            ret = btrfs_next_leaf(tree_root, path);
            if (ret) {
                /* A positive return ends the scan without being an error. */
                if (ret > 0)
                    ret = 0;
                break;
            }
        }
        /* A positive (no exact match) search result is not an error either. */
        ret = 0;

        /* Stop as soon as the item belongs to a different objectid. */
        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
        if (key.objectid != objectid)
            break;
        btrfs_release_path(path);

        /*
         * Just worry about this for extent tree, it'll be the same for
         * everybody.
         */
        if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
            max_global_id = max(max_global_id, key.offset);

        found = true;
        root = read_tree_root_path(tree_root, path, &key);
        if (IS_ERR(root)) {
            /* With IGNOREBADROOTS the error is dropped (ret stays 0). */
            if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
                ret = PTR_ERR(root);
            break;
        }
        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
        ret = btrfs_global_root_insert(root);
        if (ret) {
            /* Insertion failed: drop our reference on the root. */
            btrfs_put_root(root);
            break;
        }
        /* Continue the search right after the root just loaded. */
        key.offset++;
    }
    btrfs_release_path(path);

    /* Set even when no extent root was found (the result is then 1). */
    if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
        fs_info->nr_global_roots = max_global_id + 1;

    if (!found || ret) {
        if (objectid == BTRFS_CSUM_TREE_OBJECTID)
            set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);

        /* "Nothing found" is reported as -ENOENT unless bad roots are ignored. */
        if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
            ret = ret ? ret : -ENOENT;
        else
            ret = 0;
        btrfs_err(fs_info, "failed to load root %s", name);
    }
    return ret;
}
2491
2492 static int load_global_roots(struct btrfs_root *tree_root)
2493 {
2494     struct btrfs_path *path;
2495     int ret = 0;
2496
2497     path = btrfs_alloc_path();
2498     if (!path)
2499         return -ENOMEM;
2500
2501     ret = load_global_roots_objectid(tree_root, path,
2502                      BTRFS_EXTENT_TREE_OBJECTID, "extent");
2503     if (ret)
2504         goto out;
2505     ret = load_global_roots_objectid(tree_root, path,
2506                      BTRFS_CSUM_TREE_OBJECTID, "csum");
2507     if (ret)
2508         goto out;
2509     if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
2510         goto out;
2511     ret = load_global_roots_objectid(tree_root, path,
2512                      BTRFS_FREE_SPACE_TREE_OBJECTID,
2513                      "free space");
2514 out:
2515     btrfs_free_path(path);
2516     return ret;
2517 }
2518
2519 static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
2520 {
2521     struct btrfs_root *tree_root = fs_info->tree_root;
2522     struct btrfs_root *root;
2523     struct btrfs_key location;
2524     int ret;
2525
2526     BUG_ON(!fs_info->tree_root);
2527
2528     ret = load_global_roots(tree_root);
2529     if (ret)
2530         return ret;
2531
2532     location.objectid = BTRFS_DEV_TREE_OBJECTID;
2533     location.type = BTRFS_ROOT_ITEM_KEY;
2534     location.offset = 0;
2535
2536     root = btrfs_read_tree_root(tree_root, &location);
2537     if (IS_ERR(root)) {
2538         if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2539             ret = PTR_ERR(root);
2540             goto out;
2541         }
2542     } else {
2543         set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2544         fs_info->dev_root = root;
2545     }
2546     /* Initialize fs_info for all devices in any case */
2547     btrfs_init_devices_late(fs_info);
2548
2549     /*
2550      * This tree can share blocks with some other fs tree during relocation
2551      * and we need a proper setup by btrfs_get_fs_root
2552      */
2553     root = btrfs_get_fs_root(tree_root->fs_info,
2554                  BTRFS_DATA_RELOC_TREE_OBJECTID, true);
2555     if (IS_ERR(root)) {
2556         if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2557             ret = PTR_ERR(root);
2558             goto out;
2559         }
2560     } else {
2561         set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2562         fs_info->data_reloc_root = root;
2563     }
2564
2565     location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2566     root = btrfs_read_tree_root(tree_root, &location);
2567     if (!IS_ERR(root)) {
2568         set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2569         set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
2570         fs_info->quota_root = root;
2571     }
2572
2573     location.objectid = BTRFS_UUID_TREE_OBJECTID;
2574     root = btrfs_read_tree_root(tree_root, &location);
2575     if (IS_ERR(root)) {
2576         if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2577             ret = PTR_ERR(root);
2578             if (ret != -ENOENT)
2579                 goto out;
2580         }
2581     } else {
2582         set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2583         fs_info->uuid_root = root;
2584     }
2585
2586     return 0;
2587 out:
2588     btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
2589            location.objectid, ret);
2590     return ret;
2591 }
2592
2593 /*
2594  * Real super block validation
2595  * NOTE: super csum type and incompat features will not be checked here.
2596  *
2597  * @sb:     super block to check
2598  * @mirror_num: the super block number to check its bytenr:
2599  *      0   the primary (1st) sb
2600  *      1, 2    2nd and 3rd backup copy
2601  *         -1   skip bytenr check
2602  */
2603 static int validate_super(struct btrfs_fs_info *fs_info,
2604                 struct btrfs_super_block *sb, int mirror_num)
2605 {
2606     u64 nodesize = btrfs_super_nodesize(sb);
2607     u64 sectorsize = btrfs_super_sectorsize(sb);
2608     int ret = 0;
2609
2610     if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
2611         btrfs_err(fs_info, "no valid FS found");
2612         ret = -EINVAL;
2613     }
2614     if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
2615         btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
2616                 btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
2617         ret = -EINVAL;
2618     }
2619     if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
2620         btrfs_err(fs_info, "tree_root level too big: %d >= %d",
2621                 btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
2622         ret = -EINVAL;
2623     }
2624     if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
2625         btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
2626                 btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
2627         ret = -EINVAL;
2628     }
2629     if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
2630         btrfs_err(fs_info, "log_root level too big: %d >= %d",
2631                 btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
2632         ret = -EINVAL;
2633     }
2634
2635     /*
2636      * Check sectorsize and nodesize first, other check will need it.
2637      * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
2638      */
2639     if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
2640         sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2641         btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
2642         ret = -EINVAL;
2643     }
2644
2645     /*
2646      * We only support at most two sectorsizes: 4K and PAGE_SIZE.
2647      *
2648      * We can support 16K sectorsize with 64K page size without problem,
2649      * but such sectorsize/pagesize combination doesn't make much sense.
2650      * 4K will be our future standard, PAGE_SIZE is supported from the very
2651      * beginning.
2652      */
2653     if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
2654         btrfs_err(fs_info,
2655             "sectorsize %llu not yet supported for page size %lu",
2656             sectorsize, PAGE_SIZE);
2657         ret = -EINVAL;
2658     }
2659
2660     if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
2661         nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2662         btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
2663         ret = -EINVAL;
2664     }
2665     if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
2666         btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
2667               le32_to_cpu(sb->__unused_leafsize), nodesize);
2668         ret = -EINVAL;
2669     }
2670
2671     /* Root alignment check */
2672     if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
2673         btrfs_warn(fs_info, "tree_root block unaligned: %llu",
2674                btrfs_super_root(sb));
2675         ret = -EINVAL;
2676     }
2677     if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
2678         btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
2679                btrfs_super_chunk_root(sb));
2680         ret = -EINVAL;
2681     }
2682     if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
2683         btrfs_warn(fs_info, "log_root block unaligned: %llu",
2684                btrfs_super_log_root(sb));
2685         ret = -EINVAL;
2686     }
2687
2688     if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
2689            BTRFS_FSID_SIZE)) {
2690         btrfs_err(fs_info,
2691         "superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
2692             fs_info->super_copy->fsid, fs_info->fs_devices->fsid);
2693         ret = -EINVAL;
2694     }
2695
2696     if (btrfs_fs_incompat(fs_info, METADATA_UUID) &&
2697         memcmp(fs_info->fs_devices->metadata_uuid,
2698            fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) {
2699         btrfs_err(fs_info,
2700 "superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
2701             fs_info->super_copy->metadata_uuid,
2702             fs_info->fs_devices->metadata_uuid);
2703         ret = -EINVAL;
2704     }
2705
2706     if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
2707            BTRFS_FSID_SIZE) != 0) {
2708         btrfs_err(fs_info,
2709             "dev_item UUID does not match metadata fsid: %pU != %pU",
2710             fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
2711         ret = -EINVAL;
2712     }
2713
2714     /*
2715      * Hint to catch really bogus numbers, bitflips or so, more exact checks are
2716      * done later
2717      */
2718     if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
2719         btrfs_err(fs_info, "bytes_used is too small %llu",
2720               btrfs_super_bytes_used(sb));
2721         ret = -EINVAL;
2722     }
2723     if (!is_power_of_2(btrfs_super_stripesize(sb))) {
2724         btrfs_err(fs_info, "invalid stripesize %u",
2725               btrfs_super_stripesize(sb));
2726         ret = -EINVAL;
2727     }
2728     if (btrfs_super_num_devices(sb) > (1UL << 31))
2729         btrfs_warn(fs_info, "suspicious number of devices: %llu",
2730                btrfs_super_num_devices(sb));
2731     if (btrfs_super_num_devices(sb) == 0) {
2732         btrfs_err(fs_info, "number of devices is 0");
2733         ret = -EINVAL;
2734     }
2735
2736     if (mirror_num >= 0 &&
2737         btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
2738         btrfs_err(fs_info, "super offset mismatch %llu != %u",
2739               btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
2740         ret = -EINVAL;
2741     }
2742
2743     /*
2744      * Obvious sys_chunk_array corruptions, it must hold at least one key
2745      * and one chunk
2746      */
2747     if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
2748         btrfs_err(fs_info, "system chunk array too big %u > %u",
2749               btrfs_super_sys_array_size(sb),
2750               BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
2751         ret = -EINVAL;
2752     }
2753     if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
2754             + sizeof(struct btrfs_chunk)) {
2755         btrfs_err(fs_info, "system chunk array too small %u < %zu",
2756               btrfs_super_sys_array_size(sb),
2757               sizeof(struct btrfs_disk_key)
2758               + sizeof(struct btrfs_chunk));
2759         ret = -EINVAL;
2760     }
2761
2762     /*
2763      * The generation is a global counter, we'll trust it more than the others
2764      * but it's still possible that it's the one that's wrong.
2765      */
2766     if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
2767         btrfs_warn(fs_info,
2768             "suspicious: generation < chunk_root_generation: %llu < %llu",
2769             btrfs_super_generation(sb),
2770             btrfs_super_chunk_root_generation(sb));
2771     if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
2772         && btrfs_super_cache_generation(sb) != (u64)-1)
2773         btrfs_warn(fs_info,
2774             "suspicious: generation < cache_generation: %llu < %llu",
2775             btrfs_super_generation(sb),
2776             btrfs_super_cache_generation(sb));
2777
2778     return ret;
2779 }
2780
2781 /*
2782  * Validation of super block at mount time.
2783  * Some checks already done early at mount time, like csum type and incompat
2784  * flags will be skipped.
2785  */
2786 static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
2787 {
2788     return validate_super(fs_info, fs_info->super_copy, 0);
2789 }
2790
2791 /*
2792  * Validation of super block at write time.
2793  * Some checks like bytenr check will be skipped as their values will be
2794  * overwritten soon.
2795  * Extra checks like csum type and incompat flags will be done here.
2796  */
2797 static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
2798                       struct btrfs_super_block *sb)
2799 {
2800     int ret;
2801
2802     ret = validate_super(fs_info, sb, -1);
2803     if (ret < 0)
2804         goto out;
2805     if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
2806         ret = -EUCLEAN;
2807         btrfs_err(fs_info, "invalid csum type, has %u want %u",
2808               btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
2809         goto out;
2810     }
2811     if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
2812         ret = -EUCLEAN;
2813         btrfs_err(fs_info,
2814         "invalid incompat flags, has 0x%llx valid mask 0x%llx",
2815               btrfs_super_incompat_flags(sb),
2816               (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
2817         goto out;
2818     }
2819 out:
2820     if (ret < 0)
2821         btrfs_err(fs_info,
2822         "super block corruption detected before writing it to disk");
2823     return ret;
2824 }
2825
2826 static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
2827 {
2828     int ret = 0;
2829
2830     root->node = read_tree_block(root->fs_info, bytenr,
2831                      root->root_key.objectid, gen, level, NULL);
2832     if (IS_ERR(root->node)) {
2833         ret = PTR_ERR(root->node);
2834         root->node = NULL;
2835         return ret;
2836     }
2837     if (!extent_buffer_uptodate(root->node)) {
2838         free_extent_buffer(root->node);
2839         root->node = NULL;
2840         return -EIO;
2841     }
2842
2843     btrfs_set_root_node(&root->root_item, root->node);
2844     root->commit_root = btrfs_root_node(root);
2845     btrfs_set_root_refs(&root->root_item, 1);
2846     return ret;
2847 }
2848
2849 static int load_important_roots(struct btrfs_fs_info *fs_info)
2850 {
2851     struct btrfs_super_block *sb = fs_info->super_copy;
2852     u64 gen, bytenr;
2853     int level, ret;
2854
2855     bytenr = btrfs_super_root(sb);
2856     gen = btrfs_super_generation(sb);
2857     level = btrfs_super_root_level(sb);
2858     ret = load_super_root(fs_info->tree_root, bytenr, gen, level);
2859     if (ret) {
2860         btrfs_warn(fs_info, "couldn't read tree root");
2861         return ret;
2862     }
2863
2864     if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
2865         return 0;
2866
2867     bytenr = btrfs_super_block_group_root(sb);
2868     gen = btrfs_super_block_group_root_generation(sb);
2869     level = btrfs_super_block_group_root_level(sb);
2870     ret = load_super_root(fs_info->block_group_root, bytenr, gen, level);
2871     if (ret)
2872         btrfs_warn(fs_info, "couldn't read block group root");
2873     return ret;
2874 }
2875
2876 static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
2877 {
2878     int backup_index = find_newest_super_backup(fs_info);
2879     struct btrfs_super_block *sb = fs_info->super_copy;
2880     struct btrfs_root *tree_root = fs_info->tree_root;
2881     bool handle_error = false;
2882     int ret = 0;
2883     int i;
2884
2885     if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
2886         struct btrfs_root *root;
2887
2888         root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID,
2889                     GFP_KERNEL);
2890         if (!root)
2891             return -ENOMEM;
2892         fs_info->block_group_root = root;
2893     }
2894
2895     for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
2896         if (handle_error) {
2897             if (!IS_ERR(tree_root->node))
2898                 free_extent_buffer(tree_root->node);
2899             tree_root->node = NULL;
2900
2901             if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
2902                 break;
2903
2904             free_root_pointers(fs_info, 0);
2905
2906             /*
2907              * Don't use the log in recovery mode, it won't be
2908              * valid
2909              */
2910             btrfs_set_super_log_root(sb, 0);
2911
2912             /* We can't trust the free space cache either */
2913             btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2914
2915             ret = read_backup_root(fs_info, i);
2916             backup_index = ret;
2917             if (ret < 0)
2918                 return ret;
2919         }
2920
2921         ret = load_important_roots(fs_info);
2922         if (ret) {
2923             handle_error = true;
2924             continue;
2925         }
2926
2927         /*
2928          * No need to hold btrfs_root::objectid_mutex since the fs
2929          * hasn't been fully initialised and we are the only user
2930          */
2931         ret = btrfs_init_root_free_objectid(tree_root);
2932         if (ret < 0) {
2933             handle_error = true;
2934             continue;
2935         }
2936
2937         ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
2938
2939         ret = btrfs_read_roots(fs_info);
2940         if (ret < 0) {
2941             handle_error = true;
2942             continue;
2943         }
2944
2945         /* All successful */
2946         fs_info->generation = btrfs_header_generation(tree_root->node);
2947         fs_info->last_trans_committed = fs_info->generation;
2948         fs_info->last_reloc_trans = 0;
2949
2950         /* Always begin writing backup roots after the one being used */
2951         if (backup_index < 0) {
2952             fs_info->backup_root_index = 0;
2953         } else {
2954             fs_info->backup_root_index = backup_index + 1;
2955             fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
2956         }
2957         break;
2958     }
2959
2960     return ret;
2961 }
2962
/*
 * Set up the lists, locks, trees, counters, wait queues and default values
 * embedded in @fs_info, and call the init helpers of several subsystems
 * (block reserves, ref-verify, scrub, balance, async reclaim, dev-replace
 * locks, qgroup, discard, free clusters).
 *
 * Nothing here reads the on-disk super block; the size fields near the end
 * are placeholders until the real values are known.
 */
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
	/* Radix trees, lists and the locks protecting core fs_info state. */
	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
	INIT_LIST_HEAD(&fs_info->trans_list);
	INIT_LIST_HEAD(&fs_info->dead_roots);
	INIT_LIST_HEAD(&fs_info->delayed_iputs);
	INIT_LIST_HEAD(&fs_info->delalloc_roots);
	INIT_LIST_HEAD(&fs_info->caching_block_groups);
	spin_lock_init(&fs_info->delalloc_root_lock);
	spin_lock_init(&fs_info->trans_lock);
	spin_lock_init(&fs_info->fs_roots_radix_lock);
	spin_lock_init(&fs_info->delayed_iput_lock);
	spin_lock_init(&fs_info->defrag_inodes_lock);
	spin_lock_init(&fs_info->super_lock);
	spin_lock_init(&fs_info->buffer_lock);
	spin_lock_init(&fs_info->unused_bgs_lock);
	spin_lock_init(&fs_info->treelog_bg_lock);
	spin_lock_init(&fs_info->zone_active_bgs_lock);
	spin_lock_init(&fs_info->relocation_bg_lock);
	rwlock_init(&fs_info->tree_mod_log_lock);
	rwlock_init(&fs_info->global_root_lock);
	mutex_init(&fs_info->unused_bg_unpin_mutex);
	mutex_init(&fs_info->reclaim_bgs_lock);
	mutex_init(&fs_info->reloc_mutex);
	mutex_init(&fs_info->delalloc_root_mutex);
	mutex_init(&fs_info->zoned_meta_io_lock);
	mutex_init(&fs_info->zoned_data_reloc_io_lock);
	seqlock_init(&fs_info->profiles_lock);

	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
	INIT_LIST_HEAD(&fs_info->space_info);
	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
	INIT_LIST_HEAD(&fs_info->unused_bgs);
	INIT_LIST_HEAD(&fs_info->reclaim_bgs);
	INIT_LIST_HEAD(&fs_info->zone_active_bgs);
#ifdef CONFIG_BTRFS_DEBUG
	/* Tracking lists that only exist in debug builds. */
	INIT_LIST_HEAD(&fs_info->allocated_roots);
	INIT_LIST_HEAD(&fs_info->allocated_ebs);
	spin_lock_init(&fs_info->eb_leak_lock);
#endif
	extent_map_tree_init(&fs_info->mapping_tree);
	/* One block reserve per reservation type. */
	btrfs_init_block_rsv(&fs_info->global_block_rsv,
			     BTRFS_BLOCK_RSV_GLOBAL);
	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
			     BTRFS_BLOCK_RSV_DELOPS);
	btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
			     BTRFS_BLOCK_RSV_DELREFS);

	/* Counters start at zero, tunables at their compile-time defaults. */
	atomic_set(&fs_info->async_delalloc_pages, 0);
	atomic_set(&fs_info->defrag_running, 0);
	atomic_set(&fs_info->nr_delayed_iputs, 0);
	atomic64_set(&fs_info->tree_mod_seq, 0);
	fs_info->global_root_tree = RB_ROOT;
	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
	fs_info->metadata_ratio = 0;
	fs_info->defrag_inodes = RB_ROOT;
	atomic64_set(&fs_info->free_chunk_space, 0);
	fs_info->tree_mod_log = RB_ROOT;
	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
	fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
	btrfs_init_ref_verify(fs_info);

	/* Online CPUs plus two, capped at 8. */
	fs_info->thread_pool_size = min_t(unsigned long,
					  num_online_cpus() + 2, 8);

	INIT_LIST_HEAD(&fs_info->ordered_roots);
	spin_lock_init(&fs_info->ordered_root_lock);

	btrfs_init_scrub(fs_info);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	fs_info->check_integrity_print_mask = 0;
#endif
	btrfs_init_balance(fs_info);
	btrfs_init_async_reclaim_work(fs_info);

	rwlock_init(&fs_info->block_group_cache_lock);
	fs_info->block_group_cache_tree = RB_ROOT_CACHED;

	extent_io_tree_init(fs_info, &fs_info->excluded_extents,
			    IO_TREE_FS_EXCLUDED_EXTENTS, NULL);

	mutex_init(&fs_info->ordered_operations_mutex);
	mutex_init(&fs_info->tree_log_mutex);
	mutex_init(&fs_info->chunk_mutex);
	mutex_init(&fs_info->transaction_kthread_mutex);
	mutex_init(&fs_info->cleaner_mutex);
	mutex_init(&fs_info->ro_block_group_mutex);
	init_rwsem(&fs_info->commit_root_sem);
	init_rwsem(&fs_info->cleanup_work_sem);
	init_rwsem(&fs_info->subvol_sem);
	/* Initial count of 1: the semaphore starts out available. */
	sema_init(&fs_info->uuid_tree_rescan_sem, 1);

	btrfs_init_dev_replace_locks(fs_info);
	btrfs_init_qgroup(fs_info);
	btrfs_discard_init(fs_info);

	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);

	init_waitqueue_head(&fs_info->transaction_throttle);
	init_waitqueue_head(&fs_info->transaction_wait);
	init_waitqueue_head(&fs_info->transaction_blocked_wait);
	init_waitqueue_head(&fs_info->async_submit_wait);
	init_waitqueue_head(&fs_info->delayed_iputs_wait);

	/* Usable values until the real ones are cached from the superblock */
	fs_info->nodesize = 4096;
	fs_info->sectorsize = 4096;
	fs_info->sectorsize_bits = ilog2(4096);
	fs_info->stripesize = 4096;

	fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;

	spin_lock_init(&fs_info->swapfile_pins_lock);
	fs_info->swapfile_pins = RB_ROOT;

	/* Block group reclaim: default threshold and its work item. */
	fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
	INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
}
3086
3087 static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
3088 {
3089     int ret;
3090
3091     fs_info->sb = sb;
3092     sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
3093     sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
3094
3095     ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
3096     if (ret)
3097         return ret;
3098
3099     ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
3100     if (ret)
3101         return ret;
3102
3103     fs_info->dirty_metadata_batch = PAGE_SIZE *
3104                     (1 + ilog2(nr_cpu_ids));
3105
3106     ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
3107     if (ret)
3108         return ret;
3109
3110     ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
3111             GFP_KERNEL);
3112     if (ret)
3113         return ret;
3114
3115     fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
3116                     GFP_KERNEL);
3117     if (!fs_info->delayed_root)
3118         return -ENOMEM;
3119     btrfs_init_delayed_root(fs_info->delayed_root);
3120
3121     if (sb_rdonly(sb))
3122         set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
3123
3124     return btrfs_alloc_stripe_hash_table(fs_info);
3125 }
3126
3127 static int btrfs_uuid_rescan_kthread(void *data)
3128 {
3129     struct btrfs_fs_info *fs_info = data;
3130     int ret;
3131
3132     /*
3133      * 1st step is to iterate through the existing UUID tree and
3134      * to delete all entries that contain outdated data.
3135      * 2nd step is to add all missing entries to the UUID tree.
3136      */
3137     ret = btrfs_uuid_tree_iterate(fs_info);
3138     if (ret < 0) {
3139         if (ret != -EINTR)
3140             btrfs_warn(fs_info, "iterating uuid_tree failed %d",
3141                    ret);
3142         up(&fs_info->uuid_tree_rescan_sem);
3143         return ret;
3144     }
3145     return btrfs_uuid_scan_kthread(data);
3146 }
3147
3148 static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
3149 {
3150     struct task_struct *task;
3151
3152     down(&fs_info->uuid_tree_rescan_sem);
3153     task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
3154     if (IS_ERR(task)) {
3155         /* fs_info->update_uuid_tree_gen remains 0 in all error case */
3156         btrfs_warn(fs_info, "failed to start uuid_rescan task");
3157         up(&fs_info->uuid_tree_rescan_sem);
3158         return PTR_ERR(task);
3159     }
3160
3161     return 0;
3162 }
3163
3164 /*
3165  * Some options only have meaning at mount time and shouldn't persist across
3166  * remounts, or be displayed. Clear these at the end of mount and remount
3167  * code paths.
3168  */
3169 void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
3170 {
3171     btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
3172     btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
3173 }
3174
3175 /*
3176  * Mounting logic specific to read-write file systems. Shared by open_ctree
3177  * and btrfs_remount when remounting from read-only to read-write.
3178  */
3179 int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
3180 {
3181     int ret;
3182     const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
3183     bool clear_free_space_tree = false;
3184
3185     if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
3186         btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3187         clear_free_space_tree = true;
3188     } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
3189            !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
3190         btrfs_warn(fs_info, "free space tree is invalid");
3191         clear_free_space_tree = true;
3192     }
3193
3194     if (clear_free_space_tree) {
3195         btrfs_info(fs_info, "clearing free space tree");
3196         ret = btrfs_clear_free_space_tree(fs_info);
3197         if (ret) {
3198             btrfs_warn(fs_info,
3199                    "failed to clear free space tree: %d", ret);
3200             goto out;
3201         }
3202     }
3203
3204     /*
3205      * btrfs_find_orphan_roots() is responsible for finding all the dead
3206      * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
3207      * them into the fs_info->fs_roots_radix tree. This must be done before
3208      * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
3209      * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
3210      * item before the root's tree is deleted - this means that if we unmount
3211      * or crash before the deletion completes, on the next mount we will not
3212      * delete what remains of the tree because the orphan item does not
3213      * exists anymore, which is what tells us we have a pending deletion.
3214      */
3215     ret = btrfs_find_orphan_roots(fs_info);
3216     if (ret)
3217         goto out;
3218
3219     ret = btrfs_cleanup_fs_roots(fs_info);
3220     if (ret)
3221         goto out;
3222
3223     down_read(&fs_info->cleanup_work_sem);
3224     if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
3225         (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
3226         up_read(&fs_info->cleanup_work_sem);
3227         goto out;
3228     }
3229     up_read(&fs_info->cleanup_work_sem);
3230
3231     mutex_lock(&fs_info->cleaner_mutex);
3232     ret = btrfs_recover_relocation(fs_info);
3233     mutex_unlock(&fs_info->cleaner_mutex);
3234     if (ret < 0) {
3235         btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
3236         goto out;
3237     }
3238
3239     if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
3240         !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3241         btrfs_info(fs_info, "creating free space tree");
3242         ret = btrfs_create_free_space_tree(fs_info);
3243         if (ret) {
3244             btrfs_warn(fs_info,
3245                 "failed to create free space tree: %d", ret);
3246             goto out;
3247         }
3248     }
3249
3250     if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
3251         ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
3252         if (ret)
3253             goto out;
3254     }
3255
3256     ret = btrfs_resume_balance_async(fs_info);
3257     if (ret)
3258         goto out;
3259
3260     ret = btrfs_resume_dev_replace_async(fs_info);
3261     if (ret) {
3262         btrfs_warn(fs_info, "failed to resume dev_replace");
3263         goto out;
3264     }
3265
3266     btrfs_qgroup_rescan_resume(fs_info);
3267
3268     if (!fs_info->uuid_root) {
3269         btrfs_info(fs_info, "creating UUID tree");
3270         ret = btrfs_create_uuid_tree(fs_info);
3271         if (ret) {
3272             btrfs_warn(fs_info,
3273                    "failed to create the UUID tree %d", ret);
3274             goto out;
3275         }
3276     }
3277
3278 out:
3279     return ret;
3280 }
3281
3282 int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
3283               char *options)
3284 {
3285     u32 sectorsize;
3286     u32 nodesize;
3287     u32 stripesize;
3288     u64 generation;
3289     u64 features;
3290     u16 csum_type;
3291     struct btrfs_super_block *disk_super;
3292     struct btrfs_fs_info *fs_info = btrfs_sb(sb);
3293     struct btrfs_root *tree_root;
3294     struct btrfs_root *chunk_root;
3295     int ret;
3296     int err = -EINVAL;
3297     int level;
3298
3299     ret = init_mount_fs_info(fs_info, sb);
3300     if (ret) {
3301         err = ret;
3302         goto fail;
3303     }
3304
3305     /* These need to be init'ed before we start creating inodes and such. */
3306     tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
3307                      GFP_KERNEL);
3308     fs_info->tree_root = tree_root;
3309     chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
3310                       GFP_KERNEL);
3311     fs_info->chunk_root = chunk_root;
3312     if (!tree_root || !chunk_root) {
3313         err = -ENOMEM;
3314         goto fail;
3315     }
3316
3317     fs_info->btree_inode = new_inode(sb);
3318     if (!fs_info->btree_inode) {
3319         err = -ENOMEM;
3320         goto fail;
3321     }
3322     mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
3323     btrfs_init_btree_inode(fs_info);
3324
3325     invalidate_bdev(fs_devices->latest_dev->bdev);
3326
3327     /*
3328      * Read super block and check the signature bytes only
3329      */
3330     disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
3331     if (IS_ERR(disk_super)) {
3332         err = PTR_ERR(disk_super);
3333         goto fail_alloc;
3334     }
3335
3336     /*
3337      * Verify the type first, if that or the checksum value are
3338      * corrupted, we'll find out
3339      */
3340     csum_type = btrfs_super_csum_type(disk_super);
3341     if (!btrfs_supported_super_csum(csum_type)) {
3342         btrfs_err(fs_info, "unsupported checksum algorithm: %u",
3343               csum_type);
3344         err = -EINVAL;
3345         btrfs_release_disk_super(disk_super);
3346         goto fail_alloc;
3347     }
3348
3349     fs_info->csum_size = btrfs_super_csum_size(disk_super);
3350
3351     ret = btrfs_init_csum_hash(fs_info, csum_type);
3352     if (ret) {
3353         err = ret;
3354         btrfs_release_disk_super(disk_super);
3355         goto fail_alloc;
3356     }
3357
3358     /*
3359      * We want to check superblock checksum, the type is stored inside.
3360      * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
3361      */
3362     if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) {
3363         btrfs_err(fs_info, "superblock checksum mismatch");
3364         err = -EINVAL;
3365         btrfs_release_disk_super(disk_super);
3366         goto fail_alloc;
3367     }
3368
3369     /*
3370      * super_copy is zeroed at allocation time and we never touch the
3371      * following bytes up to INFO_SIZE, the checksum is calculated from
3372      * the whole block of INFO_SIZE
3373      */
3374     memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
3375     btrfs_release_disk_super(disk_super);
3376
3377     disk_super = fs_info->super_copy;
3378
3379
3380     features = btrfs_super_flags(disk_super);
3381     if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
3382         features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
3383         btrfs_set_super_flags(disk_super, features);
3384         btrfs_info(fs_info,
3385             "found metadata UUID change in progress flag, clearing");
3386     }
3387
3388     memcpy(fs_info->super_for_commit, fs_info->super_copy,
3389            sizeof(*fs_info->super_for_commit));
3390
3391     ret = btrfs_validate_mount_super(fs_info);
3392     if (ret) {
3393         btrfs_err(fs_info, "superblock contains fatal errors");
3394         err = -EINVAL;
3395         goto fail_alloc;
3396     }
3397
3398     if (!btrfs_super_root(disk_super))
3399         goto fail_alloc;
3400
3401     /* check FS state, whether FS is broken. */
3402     if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
3403         set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
3404
3405     /*
3406      * In the long term, we'll store the compression type in the super
3407      * block, and it'll be used for per file compression control.
3408      */
3409     fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
3410
3411
3412     /* Set up fs_info before parsing mount options */
3413     nodesize = btrfs_super_nodesize(disk_super);
3414     sectorsize = btrfs_super_sectorsize(disk_super);
3415     stripesize = sectorsize;
3416     fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
3417     fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
3418
3419     fs_info->nodesize = nodesize;
3420     fs_info->sectorsize = sectorsize;
3421     fs_info->sectorsize_bits = ilog2(sectorsize);
3422     fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
3423     fs_info->stripesize = stripesize;
3424
3425     ret = btrfs_parse_options(fs_info, options, sb->s_flags);
3426     if (ret) {
3427         err = ret;
3428         goto fail_alloc;
3429     }
3430
3431     features = btrfs_super_incompat_flags(disk_super) &
3432         ~BTRFS_FEATURE_INCOMPAT_SUPP;
3433     if (features) {
3434         btrfs_err(fs_info,
3435             "cannot mount because of unsupported optional features (0x%llx)",
3436             features);
3437         err = -EINVAL;
3438         goto fail_alloc;
3439     }
3440
3441     features = btrfs_super_incompat_flags(disk_super);
3442     features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
3443     if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
3444         features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
3445     else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
3446         features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
3447
3448     /*
3449      * Flag our filesystem as having big metadata blocks if they are bigger
3450      * than the page size.
3451      */
3452     if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
3453         features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
3454
3455     /*
3456      * mixed block groups end up with duplicate but slightly offset
3457      * extent buffers for the same range.  It leads to corruptions
3458      */
3459     if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
3460         (sectorsize != nodesize)) {
3461         btrfs_err(fs_info,
3462 "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
3463             nodesize, sectorsize);
3464         goto fail_alloc;
3465     }
3466
3467     /*
3468      * Needn't use the lock because there is no other task which will
3469      * update the flag.
3470      */
3471     btrfs_set_super_incompat_flags(disk_super, features);
3472
3473     features = btrfs_super_compat_ro_flags(disk_super) &
3474         ~BTRFS_FEATURE_COMPAT_RO_SUPP;
3475     if (!sb_rdonly(sb) && features) {
3476         btrfs_err(fs_info,
3477     "cannot mount read-write because of unsupported optional features (0x%llx)",
3478                features);
3479         err = -EINVAL;
3480         goto fail_alloc;
3481     }
3482     /*
3483      * We have unsupported RO compat features, although RO mounted, we
3484      * should not cause any metadata write, including log replay.
3485      * Or we could screw up whatever the new feature requires.
3486      */
3487     if (unlikely(features && btrfs_super_log_root(disk_super) &&
3488              !btrfs_test_opt(fs_info, NOLOGREPLAY))) {
3489         btrfs_err(fs_info,
3490 "cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
3491               features);
3492         err = -EINVAL;
3493         goto fail_alloc;
3494     }
3495
3496
3497     if (sectorsize < PAGE_SIZE) {
3498         struct btrfs_subpage_info *subpage_info;
3499
3500         /*
3501          * V1 space cache has some hardcoded PAGE_SIZE usage, and is
3502          * going to be deprecated.
3503          *
3504          * Force to use v2 cache for subpage case.
3505          */
3506         btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
3507         btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
3508             "forcing free space tree for sector size %u with page size %lu",
3509             sectorsize, PAGE_SIZE);
3510
3511         btrfs_warn(fs_info,
3512         "read-write for sector size %u with page size %lu is experimental",
3513                sectorsize, PAGE_SIZE);
3514         subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
3515         if (!subpage_info)
3516             goto fail_alloc;
3517         btrfs_init_subpage_info(subpage_info, sectorsize);
3518         fs_info->subpage_info = subpage_info;
3519     }
3520
3521     ret = btrfs_init_workqueues(fs_info);
3522     if (ret) {
3523         err = ret;
3524         goto fail_sb_buffer;
3525     }
3526
3527     sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
3528     sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
3529
3530     sb->s_blocksize = sectorsize;
3531     sb->s_blocksize_bits = blksize_bits(sectorsize);
3532     memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
3533
3534     mutex_lock(&fs_info->chunk_mutex);
3535     ret = btrfs_read_sys_array(fs_info);
3536     mutex_unlock(&fs_info->chunk_mutex);
3537     if (ret) {
3538         btrfs_err(fs_info, "failed to read the system array: %d", ret);
3539         goto fail_sb_buffer;
3540     }
3541
3542     generation = btrfs_super_chunk_root_generation(disk_super);
3543     level = btrfs_super_chunk_root_level(disk_super);
3544     ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super),
3545                   generation, level);
3546     if (ret) {
3547         btrfs_err(fs_info, "failed to read chunk root");
3548         goto fail_tree_roots;
3549     }
3550
3551     read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
3552                offsetof(struct btrfs_header, chunk_tree_uuid),
3553                BTRFS_UUID_SIZE);
3554
3555     ret = btrfs_read_chunk_tree(fs_info);
3556     if (ret) {
3557         btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
3558         goto fail_tree_roots;
3559     }
3560
3561     /*
3562      * At this point we know all the devices that make this filesystem,
3563      * including the seed devices but we don't know yet if the replace
3564      * target is required. So free devices that are not part of this
3565      * filesystem but skip the replace target device which is checked
3566      * below in btrfs_init_dev_replace().
3567      */
3568     btrfs_free_extra_devids(fs_devices);
3569     if (!fs_devices->latest_dev->bdev) {
3570         btrfs_err(fs_info, "failed to read devices");
3571         goto fail_tree_roots;
3572     }
3573
3574     ret = init_tree_roots(fs_info);
3575     if (ret)
3576         goto fail_tree_roots;
3577
3578     /*
3579      * Get zone type information of zoned block devices. This will also
3580      * handle emulation of a zoned filesystem if a regular device has the
3581      * zoned incompat feature flag set.
3582      */
3583     ret = btrfs_get_dev_zone_info_all_devices(fs_info);
3584     if (ret) {
3585         btrfs_err(fs_info,
3586               "zoned: failed to read device zone info: %d",
3587               ret);
3588         goto fail_block_groups;
3589     }
3590
3591     /*
3592      * If we have a uuid root and we're not being told to rescan we need to
3593      * check the generation here so we can set the
3594      * BTRFS_FS_UPDATE_UUID_TREE_GEN bit.  Otherwise we could commit the
3595      * transaction during a balance or the log replay without updating the
3596      * uuid generation, and then if we crash we would rescan the uuid tree,
3597      * even though it was perfectly fine.
3598      */
3599     if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
3600         fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
3601         set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
3602
3603     ret = btrfs_verify_dev_extents(fs_info);
3604     if (ret) {
3605         btrfs_err(fs_info,
3606               "failed to verify dev extents against chunks: %d",
3607               ret);
3608         goto fail_block_groups;
3609     }
3610     ret = btrfs_recover_balance(fs_info);
3611     if (ret) {
3612         btrfs_err(fs_info, "failed to recover balance: %d", ret);
3613         goto fail_block_groups;
3614     }
3615
3616     ret = btrfs_init_dev_stats(fs_info);
3617     if (ret) {
3618         btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
3619         goto fail_block_groups;
3620     }
3621
3622     ret = btrfs_init_dev_replace(fs_info);
3623     if (ret) {
3624         btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
3625         goto fail_block_groups;
3626     }
3627
3628     ret = btrfs_check_zoned_mode(fs_info);
3629     if (ret) {
3630         btrfs_err(fs_info, "failed to initialize zoned mode: %d",
3631               ret);
3632         goto fail_block_groups;
3633     }
3634
3635     ret = btrfs_sysfs_add_fsid(fs_devices);
3636     if (ret) {
3637         btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
3638                 ret);
3639         goto fail_block_groups;
3640     }
3641
3642     ret = btrfs_sysfs_add_mounted(fs_info);
3643     if (ret) {
3644         btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
3645         goto fail_fsdev_sysfs;
3646     }
3647
3648     ret = btrfs_init_space_info(fs_info);
3649     if (ret) {
3650         btrfs_err(fs_info, "failed to initialize space info: %d", ret);
3651         goto fail_sysfs;
3652     }
3653
3654     ret = btrfs_read_block_groups(fs_info);
3655     if (ret) {
3656         btrfs_err(fs_info, "failed to read block groups: %d", ret);
3657         goto fail_sysfs;
3658     }
3659
3660     btrfs_free_zone_cache(fs_info);
3661
3662     if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
3663         !btrfs_check_rw_degradable(fs_info, NULL)) {
3664         btrfs_warn(fs_info,
3665         "writable mount is not allowed due to too many missing devices");
3666         goto fail_sysfs;
3667     }
3668
3669     fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
3670                            "btrfs-cleaner");
3671     if (IS_ERR(fs_info->cleaner_kthread))
3672         goto fail_sysfs;
3673
3674     fs_info->transaction_kthread = kthread_run(transaction_kthread,
3675                            tree_root,
3676                            "btrfs-transaction");
3677     if (IS_ERR(fs_info->transaction_kthread))
3678         goto fail_cleaner;
3679
3680     if (!btrfs_test_opt(fs_info, NOSSD) &&
3681         !fs_info->fs_devices->rotating) {
3682         btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
3683     }
3684
3685     /*
3686      * Mount does not set all options immediately, we can do it now and do
3687      * not have to wait for transaction commit
3688      */
3689     btrfs_apply_pending_changes(fs_info);
3690
3691 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3692     if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
3693         ret = btrfsic_mount(fs_info, fs_devices,
3694                     btrfs_test_opt(fs_info,
3695                     CHECK_INTEGRITY_DATA) ? 1 : 0,
3696                     fs_info->check_integrity_print_mask);
3697         if (ret)
3698             btrfs_warn(fs_info,
3699                 "failed to initialize integrity check module: %d",
3700                 ret);
3701     }
3702 #endif
3703     ret = btrfs_read_qgroup_config(fs_info);
3704     if (ret)
3705         goto fail_trans_kthread;
3706
3707     if (btrfs_build_ref_tree(fs_info))
3708         btrfs_err(fs_info, "couldn't build ref tree");
3709
3710     /* do not make disk changes in broken FS or nologreplay is given */
3711     if (btrfs_super_log_root(disk_super) != 0 &&
3712         !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
3713         btrfs_info(fs_info, "start tree-log replay");
3714         ret = btrfs_replay_log(fs_info, fs_devices);
3715         if (ret) {
3716             err = ret;
3717             goto fail_qgroup;
3718         }
3719     }
3720
3721     fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
3722     if (IS_ERR(fs_info->fs_root)) {
3723         err = PTR_ERR(fs_info->fs_root);
3724         btrfs_warn(fs_info, "failed to read fs tree: %d", err);
3725         fs_info->fs_root = NULL;
3726         goto fail_qgroup;
3727     }
3728
3729     if (sb_rdonly(sb))
3730         goto clear_oneshot;
3731
3732     ret = btrfs_start_pre_rw_mount(fs_info);
3733     if (ret) {
3734         close_ctree(fs_info);
3735         return ret;
3736     }
3737     btrfs_discard_resume(fs_info);
3738
3739     if (fs_info->uuid_root &&
3740         (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
3741          fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
3742         btrfs_info(fs_info, "checking UUID tree");
3743         ret = btrfs_check_uuid_tree(fs_info);
3744         if (ret) {
3745             btrfs_warn(fs_info,
3746                 "failed to check the UUID tree: %d", ret);
3747             close_ctree(fs_info);
3748             return ret;
3749         }
3750     }
3751
3752     set_bit(BTRFS_FS_OPEN, &fs_info->flags);
3753
3754     /* Kick the cleaner thread so it'll start deleting snapshots. */
3755     if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
3756         wake_up_process(fs_info->cleaner_kthread);
3757
3758 clear_oneshot:
3759     btrfs_clear_oneshot_options(fs_info);
3760     return 0;
3761
3762 fail_qgroup:
3763     btrfs_free_qgroup_config(fs_info);
3764 fail_trans_kthread:
3765     kthread_stop(fs_info->transaction_kthread);
3766     btrfs_cleanup_transaction(fs_info);
3767     btrfs_free_fs_roots(fs_info);
3768 fail_cleaner:
3769     kthread_stop(fs_info->cleaner_kthread);
3770
3771     /*
3772      * make sure we're done with the btree inode before we stop our
3773      * kthreads
3774      */
3775     filemap_write_and_wait(fs_info->btree_inode->i_mapping);
3776
3777 fail_sysfs:
3778     btrfs_sysfs_remove_mounted(fs_info);
3779
3780 fail_fsdev_sysfs:
3781     btrfs_sysfs_remove_fsid(fs_info->fs_devices);
3782
3783 fail_block_groups:
3784     btrfs_put_block_group_cache(fs_info);
3785
3786 fail_tree_roots:
3787     if (fs_info->data_reloc_root)
3788         btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
3789     free_root_pointers(fs_info, true);
3790     invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
3791
3792 fail_sb_buffer:
3793     btrfs_stop_all_workers(fs_info);
3794     btrfs_free_block_groups(fs_info);
3795 fail_alloc:
3796     btrfs_mapping_tree_free(&fs_info->mapping_tree);
3797
3798     iput(fs_info->btree_inode);
3799 fail:
3800     btrfs_close_devices(fs_info->fs_devices);
3801     return err;
3802 }
3803 ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
3804
3805 static void btrfs_end_super_write(struct bio *bio)
3806 {
3807     struct btrfs_device *device = bio->bi_private;
3808     struct bio_vec *bvec;
3809     struct bvec_iter_all iter_all;
3810     struct page *page;
3811
3812     bio_for_each_segment_all(bvec, bio, iter_all) {
3813         page = bvec->bv_page;
3814
3815         if (bio->bi_status) {
3816             btrfs_warn_rl_in_rcu(device->fs_info,
3817                 "lost page write due to IO error on %s (%d)",
3818                 rcu_str_deref(device->name),
3819                 blk_status_to_errno(bio->bi_status));
3820             ClearPageUptodate(page);
3821             SetPageError(page);
3822             btrfs_dev_stat_inc_and_print(device,
3823                              BTRFS_DEV_STAT_WRITE_ERRS);
3824         } else {
3825             SetPageUptodate(page);
3826         }
3827
3828         put_page(page);
3829         unlock_page(page);
3830     }
3831
3832     bio_put(bio);
3833 }
3834
3835 struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
3836                            int copy_num)
3837 {
3838     struct btrfs_super_block *super;
3839     struct page *page;
3840     u64 bytenr, bytenr_orig;
3841     struct address_space *mapping = bdev->bd_inode->i_mapping;
3842     int ret;
3843
3844     bytenr_orig = btrfs_sb_offset(copy_num);
3845     ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
3846     if (ret == -ENOENT)
3847         return ERR_PTR(-EINVAL);
3848     else if (ret)
3849         return ERR_PTR(ret);
3850
3851     if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
3852         return ERR_PTR(-EINVAL);
3853
3854     page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
3855     if (IS_ERR(page))
3856         return ERR_CAST(page);
3857
3858     super = page_address(page);
3859     if (btrfs_super_magic(super) != BTRFS_MAGIC) {
3860         btrfs_release_disk_super(super);
3861         return ERR_PTR(-ENODATA);
3862     }
3863
3864     if (btrfs_super_bytenr(super) != bytenr_orig) {
3865         btrfs_release_disk_super(super);
3866         return ERR_PTR(-EINVAL);
3867     }
3868
3869     return super;
3870 }
3871
3872
3873 struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
3874 {
3875     struct btrfs_super_block *super, *latest = NULL;
3876     int i;
3877     u64 transid = 0;
3878
3879     /* we would like to check all the supers, but that would make
3880      * a btrfs mount succeed after a mkfs from a different FS.
3881      * So, we need to add a special mount option to scan for
3882      * later supers, using BTRFS_SUPER_MIRROR_MAX instead
3883      */
3884     for (i = 0; i < 1; i++) {
3885         super = btrfs_read_dev_one_super(bdev, i);
3886         if (IS_ERR(super))
3887             continue;
3888
3889         if (!latest || btrfs_super_generation(super) > transid) {
3890             if (latest)
3891                 btrfs_release_disk_super(super);
3892
3893             latest = super;
3894             transid = btrfs_super_generation(super);
3895         }
3896     }
3897
3898     return super;
3899 }
3900
3901 /*
3902  * Write superblock @sb to the @device. Do not wait for completion, all the
3903  * pages we use for writing are locked.
3904  *
3905  * Write @max_mirrors copies of the superblock, where 0 means default that fit
3906  * the expected device size at commit time. Note that max_mirrors must be
3907  * same for write and wait phases.
3908  *
3909  * Return number of errors when page is not found or submission fails.
3910  */
3911 static int write_dev_supers(struct btrfs_device *device,
3912                 struct btrfs_super_block *sb, int max_mirrors)
3913 {
3914     struct btrfs_fs_info *fs_info = device->fs_info;
3915     struct address_space *mapping = device->bdev->bd_inode->i_mapping;
3916     SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3917     int i;
3918     int errors = 0;
3919     int ret;
3920     u64 bytenr, bytenr_orig;
3921
3922     if (max_mirrors == 0)
3923         max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3924
3925     shash->tfm = fs_info->csum_shash;
3926
3927     for (i = 0; i < max_mirrors; i++) {
3928         struct page *page;
3929         struct bio *bio;
3930         struct btrfs_super_block *disk_super;
3931
3932         bytenr_orig = btrfs_sb_offset(i);
3933         ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
3934         if (ret == -ENOENT) {
3935             continue;
3936         } else if (ret < 0) {
3937             btrfs_err(device->fs_info,
3938                 "couldn't get super block location for mirror %d",
3939                 i);
3940             errors++;
3941             continue;
3942         }
3943         if (bytenr + BTRFS_SUPER_INFO_SIZE >=
3944             device->commit_total_bytes)
3945             break;
3946
3947         btrfs_set_super_bytenr(sb, bytenr_orig);
3948
3949         crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
3950                     BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
3951                     sb->csum);
3952
3953         page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
3954                        GFP_NOFS);
3955         if (!page) {
3956             btrfs_err(device->fs_info,
3957                 "couldn't get super block page for bytenr %llu",
3958                 bytenr);
3959             errors++;
3960             continue;
3961         }
3962
3963         /* Bump the refcount for wait_dev_supers() */
3964         get_page(page);
3965
3966         disk_super = page_address(page);
3967         memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
3968
3969         /*
3970          * Directly use bios here instead of relying on the page cache
3971          * to do I/O, so we don't lose the ability to do integrity
3972          * checking.
3973          */
3974         bio = bio_alloc(device->bdev, 1,
3975                 REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
3976                 GFP_NOFS);
3977         bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
3978         bio->bi_private = device;
3979         bio->bi_end_io = btrfs_end_super_write;
3980         __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
3981                    offset_in_page(bytenr));
3982
3983         /*
3984          * We FUA only the first super block.  The others we allow to
3985          * go down lazy and there's a short window where the on-disk
3986          * copies might still contain the older version.
3987          */
3988         if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
3989             bio->bi_opf |= REQ_FUA;
3990
3991         btrfsic_check_bio(bio);
3992         submit_bio(bio);
3993
3994         if (btrfs_advance_sb_log(device, i))
3995             errors++;
3996     }
3997     return errors < i ? 0 : -1;
3998 }
3999
4000 /*
4001  * Wait for write completion of superblocks done by write_dev_supers,
4002  * @max_mirrors same for write and wait phases.
4003  *
4004  * Return number of errors when page is not found or not marked up to
4005  * date.
4006  */
4007 static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
4008 {
4009     int i;
4010     int errors = 0;
4011     bool primary_failed = false;
4012     int ret;
4013     u64 bytenr;
4014
4015     if (max_mirrors == 0)
4016         max_mirrors = BTRFS_SUPER_MIRROR_MAX;
4017
4018     for (i = 0; i < max_mirrors; i++) {
4019         struct page *page;
4020
4021         ret = btrfs_sb_log_location(device, i, READ, &bytenr);
4022         if (ret == -ENOENT) {
4023             break;
4024         } else if (ret < 0) {
4025             errors++;
4026             if (i == 0)
4027                 primary_failed = true;
4028             continue;
4029         }
4030         if (bytenr + BTRFS_SUPER_INFO_SIZE >=
4031             device->commit_total_bytes)
4032             break;
4033
4034         page = find_get_page(device->bdev->bd_inode->i_mapping,
4035                      bytenr >> PAGE_SHIFT);
4036         if (!page) {
4037             errors++;
4038             if (i == 0)
4039                 primary_failed = true;
4040             continue;
4041         }
4042         /* Page is submitted locked and unlocked once the IO completes */
4043         wait_on_page_locked(page);
4044         if (PageError(page)) {
4045             errors++;
4046             if (i == 0)
4047                 primary_failed = true;
4048         }
4049
4050         /* Drop our reference */
4051         put_page(page);
4052
4053         /* Drop the reference from the writing run */
4054         put_page(page);
4055     }
4056
4057     /* log error, force error return */
4058     if (primary_failed) {
4059         btrfs_err(device->fs_info, "error writing primary super block to device %llu",
4060               device->devid);
4061         return -1;
4062     }
4063
4064     return errors < i ? 0 : -1;
4065 }
4066
4067 /*
4068  * endio for the write_dev_flush, this will wake anyone waiting
4069  * for the barrier when it is done
4070  */
4071 static void btrfs_end_empty_barrier(struct bio *bio)
4072 {
4073     bio_uninit(bio);
4074     complete(bio->bi_private);
4075 }
4076
4077 /*
4078  * Submit a flush request to the device if it supports it. Error handling is
4079  * done in the waiting counterpart.
4080  */
4081 static void write_dev_flush(struct btrfs_device *device)
4082 {
4083     struct bio *bio = &device->flush_bio;
4084
4085 #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4086     /*
4087      * When a disk has write caching disabled, we skip submission of a bio
4088      * with flush and sync requests before writing the superblock, since
4089      * it's not needed. However when the integrity checker is enabled, this
4090      * results in reports that there are metadata blocks referred by a
4091      * superblock that were not properly flushed. So don't skip the bio
4092      * submission only when the integrity checker is enabled for the sake
4093      * of simplicity, since this is a debug tool and not meant for use in
4094      * non-debug builds.
4095      */
4096     if (!bdev_write_cache(device->bdev))
4097         return;
4098 #endif
4099
4100     bio_init(bio, device->bdev, NULL, 0,
4101          REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
4102     bio->bi_end_io = btrfs_end_empty_barrier;
4103     init_completion(&device->flush_wait);
4104     bio->bi_private = &device->flush_wait;
4105
4106     btrfsic_check_bio(bio);
4107     submit_bio(bio);
4108     set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
4109 }
4110
4111 /*
4112  * If the flush bio has been submitted by write_dev_flush, wait for it.
4113  */
4114 static blk_status_t wait_dev_flush(struct btrfs_device *device)
4115 {
4116     struct bio *bio = &device->flush_bio;
4117
4118     if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
4119         return BLK_STS_OK;
4120
4121     clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
4122     wait_for_completion_io(&device->flush_wait);
4123
4124     return bio->bi_status;
4125 }
4126
4127 static int check_barrier_error(struct btrfs_fs_info *fs_info)
4128 {
4129     if (!btrfs_check_rw_degradable(fs_info, NULL))
4130         return -EIO;
4131     return 0;
4132 }
4133
4134 /*
4135  * send an empty flush down to each device in parallel,
4136  * then wait for them
4137  */
4138 static int barrier_all_devices(struct btrfs_fs_info *info)
4139 {
4140     struct list_head *head;
4141     struct btrfs_device *dev;
4142     int errors_wait = 0;
4143     blk_status_t ret;
4144
4145     lockdep_assert_held(&info->fs_devices->device_list_mutex);
4146     /* send down all the barriers */
4147     head = &info->fs_devices->devices;
4148     list_for_each_entry(dev, head, dev_list) {
4149         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
4150             continue;
4151         if (!dev->bdev)
4152             continue;
4153         if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4154             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4155             continue;
4156
4157         write_dev_flush(dev);
4158         dev->last_flush_error = BLK_STS_OK;
4159     }
4160
4161     /* wait for all the barriers */
4162     list_for_each_entry(dev, head, dev_list) {
4163         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
4164             continue;
4165         if (!dev->bdev) {
4166             errors_wait++;
4167             continue;
4168         }
4169         if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4170             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4171             continue;
4172
4173         ret = wait_dev_flush(dev);
4174         if (ret) {
4175             dev->last_flush_error = ret;
4176             btrfs_dev_stat_inc_and_print(dev,
4177                     BTRFS_DEV_STAT_FLUSH_ERRS);
4178             errors_wait++;
4179         }
4180     }
4181
4182     if (errors_wait) {
4183         /*
4184          * At some point we need the status of all disks
4185          * to arrive at the volume status. So error checking
4186          * is being pushed to a separate loop.
4187          */
4188         return check_barrier_error(info);
4189     }
4190     return 0;
4191 }
4192
4193 int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
4194 {
4195     int raid_type;
4196     int min_tolerated = INT_MAX;
4197
4198     if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
4199         (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
4200         min_tolerated = min_t(int, min_tolerated,
4201                     btrfs_raid_array[BTRFS_RAID_SINGLE].
4202                     tolerated_failures);
4203
4204     for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4205         if (raid_type == BTRFS_RAID_SINGLE)
4206             continue;
4207         if (!(flags & btrfs_raid_array[raid_type].bg_flag))
4208             continue;
4209         min_tolerated = min_t(int, min_tolerated,
4210                     btrfs_raid_array[raid_type].
4211                     tolerated_failures);
4212     }
4213
4214     if (min_tolerated == INT_MAX) {
4215         pr_warn("BTRFS: unknown raid flag: %llu", flags);
4216         min_tolerated = 0;
4217     }
4218
4219     return min_tolerated;
4220 }
4221
4222 int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
4223 {
4224     struct list_head *head;
4225     struct btrfs_device *dev;
4226     struct btrfs_super_block *sb;
4227     struct btrfs_dev_item *dev_item;
4228     int ret;
4229     int do_barriers;
4230     int max_errors;
4231     int total_errors = 0;
4232     u64 flags;
4233
4234     do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
4235
4236     /*
4237      * max_mirrors == 0 indicates we're from commit_transaction,
4238      * not from fsync where the tree roots in fs_info have not
4239      * been consistent on disk.
4240      */
4241     if (max_mirrors == 0)
4242         backup_super_roots(fs_info);
4243
4244     sb = fs_info->super_for_commit;
4245     dev_item = &sb->dev_item;
4246
4247     mutex_lock(&fs_info->fs_devices->device_list_mutex);
4248     head = &fs_info->fs_devices->devices;
4249     max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;
4250
4251     if (do_barriers) {
4252         ret = barrier_all_devices(fs_info);
4253         if (ret) {
4254             mutex_unlock(
4255                 &fs_info->fs_devices->device_list_mutex);
4256             btrfs_handle_fs_error(fs_info, ret,
4257                           "errors while submitting device barriers.");
4258             return ret;
4259         }
4260     }
4261
4262     list_for_each_entry(dev, head, dev_list) {
4263         if (!dev->bdev) {
4264             total_errors++;
4265             continue;
4266         }
4267         if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4268             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4269             continue;
4270
4271         btrfs_set_stack_device_generation(dev_item, 0);
4272         btrfs_set_stack_device_type(dev_item, dev->type);
4273         btrfs_set_stack_device_id(dev_item, dev->devid);
4274         btrfs_set_stack_device_total_bytes(dev_item,
4275                            dev->commit_total_bytes);
4276         btrfs_set_stack_device_bytes_used(dev_item,
4277                           dev->commit_bytes_used);
4278         btrfs_set_stack_device_io_align(dev_item, dev->io_align);
4279         btrfs_set_stack_device_io_width(dev_item, dev->io_width);
4280         btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
4281         memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
4282         memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
4283                BTRFS_FSID_SIZE);
4284
4285         flags = btrfs_super_flags(sb);
4286         btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
4287
4288         ret = btrfs_validate_write_super(fs_info, sb);
4289         if (ret < 0) {
4290             mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4291             btrfs_handle_fs_error(fs_info, -EUCLEAN,
4292                 "unexpected superblock corruption detected");
4293             return -EUCLEAN;
4294         }
4295
4296         ret = write_dev_supers(dev, sb, max_mirrors);
4297         if (ret)
4298             total_errors++;
4299     }
4300     if (total_errors > max_errors) {
4301         btrfs_err(fs_info, "%d errors while writing supers",
4302               total_errors);
4303         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4304
4305         /* FUA is masked off if unsupported and can't be the reason */
4306         btrfs_handle_fs_error(fs_info, -EIO,
4307                       "%d errors while writing supers",
4308                       total_errors);
4309         return -EIO;
4310     }
4311
4312     total_errors = 0;
4313     list_for_each_entry(dev, head, dev_list) {
4314         if (!dev->bdev)
4315             continue;
4316         if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4317             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4318             continue;
4319
4320         ret = wait_dev_supers(dev, max_mirrors);
4321         if (ret)
4322             total_errors++;
4323     }
4324     mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4325     if (total_errors > max_errors) {
4326         btrfs_handle_fs_error(fs_info, -EIO,
4327                       "%d errors while writing supers",
4328                       total_errors);
4329         return -EIO;
4330     }
4331     return 0;
4332 }
4333
4334 /* Drop a fs root from the radix tree and free it. */
4335 void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
4336                   struct btrfs_root *root)
4337 {
4338     bool drop_ref = false;
4339
4340     spin_lock(&fs_info->fs_roots_radix_lock);
4341     radix_tree_delete(&fs_info->fs_roots_radix,
4342               (unsigned long)root->root_key.objectid);
4343     if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
4344         drop_ref = true;
4345     spin_unlock(&fs_info->fs_roots_radix_lock);
4346
4347     if (BTRFS_FS_ERROR(fs_info)) {
4348         ASSERT(root->log_root == NULL);
4349         if (root->reloc_root) {
4350             btrfs_put_root(root->reloc_root);
4351             root->reloc_root = NULL;
4352         }
4353     }
4354
4355     if (drop_ref)
4356         btrfs_put_root(root);
4357 }
4358
4359 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
4360 {
4361     u64 root_objectid = 0;
4362     struct btrfs_root *gang[8];
4363     int i = 0;
4364     int err = 0;
4365     unsigned int ret = 0;
4366
4367     while (1) {
4368         spin_lock(&fs_info->fs_roots_radix_lock);
4369         ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
4370                          (void **)gang, root_objectid,
4371                          ARRAY_SIZE(gang));
4372         if (!ret) {
4373             spin_unlock(&fs_info->fs_roots_radix_lock);
4374             break;
4375         }
4376         root_objectid = gang[ret - 1]->root_key.objectid + 1;
4377
4378         for (i = 0; i < ret; i++) {
4379             /* Avoid to grab roots in dead_roots */
4380             if (btrfs_root_refs(&gang[i]->root_item) == 0) {
4381                 gang[i] = NULL;
4382                 continue;
4383             }
4384             /* grab all the search result for later use */
4385             gang[i] = btrfs_grab_root(gang[i]);
4386         }
4387         spin_unlock(&fs_info->fs_roots_radix_lock);
4388
4389         for (i = 0; i < ret; i++) {
4390             if (!gang[i])
4391                 continue;
4392             root_objectid = gang[i]->root_key.objectid;
4393             err = btrfs_orphan_cleanup(gang[i]);
4394             if (err)
4395                 break;
4396             btrfs_put_root(gang[i]);
4397         }
4398         root_objectid++;
4399     }
4400
4401     /* release the uncleaned roots due to error */
4402     for (; i < ret; i++) {
4403         if (gang[i])
4404             btrfs_put_root(gang[i]);
4405     }
4406     return err;
4407 }
4408
4409 int btrfs_commit_super(struct btrfs_fs_info *fs_info)
4410 {
4411     struct btrfs_root *root = fs_info->tree_root;
4412     struct btrfs_trans_handle *trans;
4413
4414     mutex_lock(&fs_info->cleaner_mutex);
4415     btrfs_run_delayed_iputs(fs_info);
4416     mutex_unlock(&fs_info->cleaner_mutex);
4417     wake_up_process(fs_info->cleaner_kthread);
4418
4419     /* wait until ongoing cleanup work done */
4420     down_write(&fs_info->cleanup_work_sem);
4421     up_write(&fs_info->cleanup_work_sem);
4422
4423     trans = btrfs_join_transaction(root);
4424     if (IS_ERR(trans))
4425         return PTR_ERR(trans);
4426     return btrfs_commit_transaction(trans);
4427 }
4428
4429 static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
4430 {
4431     struct btrfs_transaction *trans;
4432     struct btrfs_transaction *tmp;
4433     bool found = false;
4434
4435     if (list_empty(&fs_info->trans_list))
4436         return;
4437
4438     /*
4439      * This function is only called at the very end of close_ctree(),
4440      * thus no other running transaction, no need to take trans_lock.
4441      */
4442     ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags));
4443     list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) {
4444         struct extent_state *cached = NULL;
4445         u64 dirty_bytes = 0;
4446         u64 cur = 0;
4447         u64 found_start;
4448         u64 found_end;
4449
4450         found = true;
4451         while (!find_first_extent_bit(&trans->dirty_pages, cur,
4452             &found_start, &found_end, EXTENT_DIRTY, &cached)) {
4453             dirty_bytes += found_end + 1 - found_start;
4454             cur = found_end + 1;
4455         }
4456         btrfs_warn(fs_info,
4457     "transaction %llu (with %llu dirty metadata bytes) is not committed",
4458                trans->transid, dirty_bytes);
4459         btrfs_cleanup_one_transaction(trans, fs_info);
4460
4461         if (trans == fs_info->running_transaction)
4462             fs_info->running_transaction = NULL;
4463         list_del_init(&trans->list);
4464
4465         btrfs_put_transaction(trans);
4466         trace_btrfs_transaction_commit(fs_info);
4467     }
4468     ASSERT(!found);
4469 }
4470
4471 void __cold close_ctree(struct btrfs_fs_info *fs_info)
4472 {
4473     int ret;
4474
4475     set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
4476
4477     /*
4478      * If we had UNFINISHED_DROPS we could still be processing them, so
4479      * clear that bit and wake up relocation so it can stop.
4480      * We must do this before stopping the block group reclaim task, because
4481      * at btrfs_relocate_block_group() we wait for this bit, and after the
4482      * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we
4483      * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will
4484      * return 1.
4485      */
4486     btrfs_wake_unfinished_drop(fs_info);
4487
4488     /*
4489      * We may have the reclaim task running and relocating a data block group,
4490      * in which case it may create delayed iputs. So stop it before we park
4491      * the cleaner kthread otherwise we can get new delayed iputs after
4492      * parking the cleaner, and that can make the async reclaim task to hang
4493      * if it's waiting for delayed iputs to complete, since the cleaner is
4494      * parked and can not run delayed iputs - this will make us hang when
4495      * trying to stop the async reclaim task.
4496      */
4497     cancel_work_sync(&fs_info->reclaim_bgs_work);
4498     /*
4499      * We don't want the cleaner to start new transactions, add more delayed
4500      * iputs, etc. while we're closing. We can't use kthread_stop() yet
4501      * because that frees the task_struct, and the transaction kthread might
4502      * still try to wake up the cleaner.
4503      */
4504     kthread_park(fs_info->cleaner_kthread);
4505
4506     /* wait for the qgroup rescan worker to stop */
4507     btrfs_qgroup_wait_for_completion(fs_info, false);
4508
4509     /* wait for the uuid_scan task to finish */
4510     down(&fs_info->uuid_tree_rescan_sem);
4511     /* avoid complains from lockdep et al., set sem back to initial state */
4512     up(&fs_info->uuid_tree_rescan_sem);
4513
4514     /* pause restriper - we want to resume on mount */
4515     btrfs_pause_balance(fs_info);
4516
4517     btrfs_dev_replace_suspend_for_unmount(fs_info);
4518
4519     btrfs_scrub_cancel(fs_info);
4520
4521     /* wait for any defraggers to finish */
4522     wait_event(fs_info->transaction_wait,
4523            (atomic_read(&fs_info->defrag_running) == 0));
4524
4525     /* clear out the rbtree of defraggable inodes */
4526     btrfs_cleanup_defrag_inodes(fs_info);
4527
4528     /*
4529      * After we parked the cleaner kthread, ordered extents may have
4530      * completed and created new delayed iputs. If one of the async reclaim
4531      * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
4532      * can hang forever trying to stop it, because if a delayed iput is
4533      * added after it ran btrfs_run_delayed_iputs() and before it called
4534      * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
4535      * no one else to run iputs.
4536      *
4537      * So wait for all ongoing ordered extents to complete and then run
4538      * delayed iputs. This works because once we reach this point no one
4539      * can either create new ordered extents nor create delayed iputs
4540      * through some other means.
4541      *
4542      * Also note that btrfs_wait_ordered_roots() is not safe here, because
4543      * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
4544      * but the delayed iput for the respective inode is made only when doing
4545      * the final btrfs_put_ordered_extent() (which must happen at
4546      * btrfs_finish_ordered_io() when we are unmounting).
4547      */
4548     btrfs_flush_workqueue(fs_info->endio_write_workers);
4549     /* Ordered extents for free space inodes. */
4550     btrfs_flush_workqueue(fs_info->endio_freespace_worker);
4551     btrfs_run_delayed_iputs(fs_info);
4552
4553     cancel_work_sync(&fs_info->async_reclaim_work);
4554     cancel_work_sync(&fs_info->async_data_reclaim_work);
4555     cancel_work_sync(&fs_info->preempt_reclaim_work);
4556
4557     /* Cancel or finish ongoing discard work */
4558     btrfs_discard_cleanup(fs_info);
4559
4560     if (!sb_rdonly(fs_info->sb)) {
4561         /*
4562          * The cleaner kthread is stopped, so do one final pass over
4563          * unused block groups.
4564          */
4565         btrfs_delete_unused_bgs(fs_info);
4566
4567         /*
4568          * There might be existing delayed inode workers still running
4569          * and holding an empty delayed inode item. We must wait for
4570          * them to complete first because they can create a transaction.
4571          * This happens when someone calls btrfs_balance_delayed_items()
4572          * and then a transaction commit runs the same delayed nodes
4573          * before any delayed worker has done something with the nodes.
4574          * We must wait for any worker here and not at transaction
4575          * commit time since that could cause a deadlock.
4576          * This is a very rare case.
4577          */
4578         btrfs_flush_workqueue(fs_info->delayed_workers);
4579
4580         ret = btrfs_commit_super(fs_info);
4581         if (ret)
4582             btrfs_err(fs_info, "commit super ret %d", ret);
4583     }
4584
4585     if (BTRFS_FS_ERROR(fs_info))
4586         btrfs_error_commit_super(fs_info);
4587
4588     kthread_stop(fs_info->transaction_kthread);
4589     kthread_stop(fs_info->cleaner_kthread);
4590
4591     ASSERT(list_empty(&fs_info->delayed_iputs));
4592     set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
4593
4594     if (btrfs_check_quota_leak(fs_info)) {
4595         WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
4596         btrfs_err(fs_info, "qgroup reserved space leaked");
4597     }
4598
4599     btrfs_free_qgroup_config(fs_info);
4600     ASSERT(list_empty(&fs_info->delalloc_roots));
4601
4602     if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
4603         btrfs_info(fs_info, "at unmount delalloc count %lld",
4604                percpu_counter_sum(&fs_info->delalloc_bytes));
4605     }
4606
4607     if (percpu_counter_sum(&fs_info->ordered_bytes))
4608         btrfs_info(fs_info, "at unmount dio bytes count %lld",
4609                percpu_counter_sum(&fs_info->ordered_bytes));
4610
4611     btrfs_sysfs_remove_mounted(fs_info);
4612     btrfs_sysfs_remove_fsid(fs_info->fs_devices);
4613
4614     btrfs_put_block_group_cache(fs_info);
4615
4616     /*
4617      * we must make sure there is not any read request to
4618      * submit after we stopping all workers.
4619      */
4620     invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
4621     btrfs_stop_all_workers(fs_info);
4622
4623     /* We shouldn't have any transaction open at this point */
4624     warn_about_uncommitted_trans(fs_info);
4625
4626     clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
4627     free_root_pointers(fs_info, true);
4628     btrfs_free_fs_roots(fs_info);
4629
4630     /*
4631      * We must free the block groups after dropping the fs_roots as we could
4632      * have had an IO error and have left over tree log blocks that aren't
4633      * cleaned up until the fs roots are freed.  This makes the block group
4634      * accounting appear to be wrong because there's pending reserved bytes,
4635      * so make sure we do the block group cleanup afterwards.
4636      */
4637     btrfs_free_block_groups(fs_info);
4638
4639     iput(fs_info->btree_inode);
4640
4641 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4642     if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
4643         btrfsic_unmount(fs_info->fs_devices);
4644 #endif
4645
4646     btrfs_mapping_tree_free(&fs_info->mapping_tree);
4647     btrfs_close_devices(fs_info->fs_devices);
4648 }
4649
4650 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
4651               int atomic)
4652 {
4653     int ret;
4654     struct inode *btree_inode = buf->pages[0]->mapping->host;
4655
4656     ret = extent_buffer_uptodate(buf);
4657     if (!ret)
4658         return ret;
4659
4660     ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
4661                     parent_transid, atomic);
4662     if (ret == -EAGAIN)
4663         return ret;
4664     return !ret;
4665 }
4666
4667 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
4668 {
4669     struct btrfs_fs_info *fs_info = buf->fs_info;
4670     u64 transid = btrfs_header_generation(buf);
4671     int was_dirty;
4672
4673 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4674     /*
4675      * This is a fast path so only do this check if we have sanity tests
4676      * enabled.  Normal people shouldn't be using unmapped buffers as dirty
4677      * outside of the sanity tests.
4678      */
4679     if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
4680         return;
4681 #endif
4682     btrfs_assert_tree_write_locked(buf);
4683     if (transid != fs_info->generation)
4684         WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
4685             buf->start, transid, fs_info->generation);
4686     was_dirty = set_extent_buffer_dirty(buf);
4687     if (!was_dirty)
4688         percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4689                      buf->len,
4690                      fs_info->dirty_metadata_batch);
4691 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4692     /*
4693      * Since btrfs_mark_buffer_dirty() can be called with item pointer set
4694      * but item data not updated.
4695      * So here we should only check item pointers, not item data.
4696      */
4697     if (btrfs_header_level(buf) == 0 &&
4698         btrfs_check_leaf_relaxed(buf)) {
4699         btrfs_print_leaf(buf);
4700         ASSERT(0);
4701     }
4702 #endif
4703 }
4704
4705 static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
4706                     int flush_delayed)
4707 {
4708     /*
4709      * looks as though older kernels can get into trouble with
4710      * this code, they end up stuck in balance_dirty_pages forever
4711      */
4712     int ret;
4713
4714     if (current->flags & PF_MEMALLOC)
4715         return;
4716
4717     if (flush_delayed)
4718         btrfs_balance_delayed_items(fs_info);
4719
4720     ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
4721                      BTRFS_DIRTY_METADATA_THRESH,
4722                      fs_info->dirty_metadata_batch);
4723     if (ret > 0) {
4724         balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
4725     }
4726 }
4727
/* Balance dirty btree metadata, asking the helper to flush delayed items. */
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
{
    const int flush_delayed = 1;

    __btrfs_btree_balance_dirty(fs_info, flush_delayed);
}
4732
/* Balance dirty btree metadata without flushing delayed items. */
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
{
    const int flush_delayed = 0;

    __btrfs_btree_balance_dirty(fs_info, flush_delayed);
}
4737
4738 static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
4739 {
4740     /* cleanup FS via transaction */
4741     btrfs_cleanup_transaction(fs_info);
4742
4743     mutex_lock(&fs_info->cleaner_mutex);
4744     btrfs_run_delayed_iputs(fs_info);
4745     mutex_unlock(&fs_info->cleaner_mutex);
4746
4747     down_write(&fs_info->cleanup_work_sem);
4748     up_write(&fs_info->cleanup_work_sem);
4749 }
4750
4751 static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
4752 {
4753     struct btrfs_root *gang[8];
4754     u64 root_objectid = 0;
4755     int ret;
4756
4757     spin_lock(&fs_info->fs_roots_radix_lock);
4758     while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
4759                          (void **)gang, root_objectid,
4760                          ARRAY_SIZE(gang))) != 0) {
4761         int i;
4762
4763         for (i = 0; i < ret; i++)
4764             gang[i] = btrfs_grab_root(gang[i]);
4765         spin_unlock(&fs_info->fs_roots_radix_lock);
4766
4767         for (i = 0; i < ret; i++) {
4768             if (!gang[i])
4769                 continue;
4770             root_objectid = gang[i]->root_key.objectid;
4771             btrfs_free_log(NULL, gang[i]);
4772             btrfs_put_root(gang[i]);
4773         }
4774         root_objectid++;
4775         spin_lock(&fs_info->fs_roots_radix_lock);
4776     }
4777     spin_unlock(&fs_info->fs_roots_radix_lock);
4778     btrfs_free_log_root_tree(NULL, fs_info);
4779 }
4780
4781 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
4782 {
4783     struct btrfs_ordered_extent *ordered;
4784
4785     spin_lock(&root->ordered_extent_lock);
4786     /*
4787      * This will just short circuit the ordered completion stuff which will
4788      * make sure the ordered extent gets properly cleaned up.
4789      */
4790     list_for_each_entry(ordered, &root->ordered_extents,
4791                 root_extent_list)
4792         set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
4793     spin_unlock(&root->ordered_extent_lock);
4794 }
4795
4796 static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
4797 {
4798     struct btrfs_root *root;
4799     struct list_head splice;
4800
4801     INIT_LIST_HEAD(&splice);
4802
4803     spin_lock(&fs_info->ordered_root_lock);
4804     list_splice_init(&fs_info->ordered_roots, &splice);
4805     while (!list_empty(&splice)) {
4806         root = list_first_entry(&splice, struct btrfs_root,
4807                     ordered_root);
4808         list_move_tail(&root->ordered_root,
4809                    &fs_info->ordered_roots);
4810
4811         spin_unlock(&fs_info->ordered_root_lock);
4812         btrfs_destroy_ordered_extents(root);
4813
4814         cond_resched();
4815         spin_lock(&fs_info->ordered_root_lock);
4816     }
4817     spin_unlock(&fs_info->ordered_root_lock);
4818
4819     /*
4820      * We need this here because if we've been flipped read-only we won't
4821      * get sync() from the umount, so we need to make sure any ordered
4822      * extents that haven't had their dirty pages IO start writeout yet
4823      * actually get run and error out properly.
4824      */
4825     btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
4826 }
4827
4828 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
4829                       struct btrfs_fs_info *fs_info)
4830 {
4831     struct rb_node *node;
4832     struct btrfs_delayed_ref_root *delayed_refs;
4833     struct btrfs_delayed_ref_node *ref;
4834     int ret = 0;
4835
4836     delayed_refs = &trans->delayed_refs;
4837
4838     spin_lock(&delayed_refs->lock);
4839     if (atomic_read(&delayed_refs->num_entries) == 0) {
4840         spin_unlock(&delayed_refs->lock);
4841         btrfs_debug(fs_info, "delayed_refs has NO entry");
4842         return ret;
4843     }
4844
4845     while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
4846         struct btrfs_delayed_ref_head *head;
4847         struct rb_node *n;
4848         bool pin_bytes = false;
4849
4850         head = rb_entry(node, struct btrfs_delayed_ref_head,
4851                 href_node);
4852         if (btrfs_delayed_ref_lock(delayed_refs, head))
4853             continue;
4854
4855         spin_lock(&head->lock);
4856         while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
4857             ref = rb_entry(n, struct btrfs_delayed_ref_node,
4858                        ref_node);
4859             ref->in_tree = 0;
4860             rb_erase_cached(&ref->ref_node, &head->ref_tree);
4861             RB_CLEAR_NODE(&ref->ref_node);
4862             if (!list_empty(&ref->add_list))
4863                 list_del(&ref->add_list);
4864             atomic_dec(&delayed_refs->num_entries);
4865             btrfs_put_delayed_ref(ref);
4866         }
4867         if (head->must_insert_reserved)
4868             pin_bytes = true;
4869         btrfs_free_delayed_extent_op(head->extent_op);
4870         btrfs_delete_ref_head(delayed_refs, head);
4871         spin_unlock(&head->lock);
4872         spin_unlock(&delayed_refs->lock);
4873         mutex_unlock(&head->mutex);
4874
4875         if (pin_bytes) {
4876             struct btrfs_block_group *cache;
4877
4878             cache = btrfs_lookup_block_group(fs_info, head->bytenr);
4879             BUG_ON(!cache);
4880
4881             spin_lock(&cache->space_info->lock);
4882             spin_lock(&cache->lock);
4883             cache->pinned += head->num_bytes;
4884             btrfs_space_info_update_bytes_pinned(fs_info,
4885                 cache->space_info, head->num_bytes);
4886             cache->reserved -= head->num_bytes;
4887             cache->space_info->bytes_reserved -= head->num_bytes;
4888             spin_unlock(&cache->lock);
4889             spin_unlock(&cache->space_info->lock);
4890
4891             btrfs_put_block_group(cache);
4892
4893             btrfs_error_unpin_extent_range(fs_info, head->bytenr,
4894                 head->bytenr + head->num_bytes - 1);
4895         }
4896         btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
4897         btrfs_put_delayed_ref_head(head);
4898         cond_resched();
4899         spin_lock(&delayed_refs->lock);
4900     }
4901     btrfs_qgroup_destroy_extent_records(trans);
4902
4903     spin_unlock(&delayed_refs->lock);
4904
4905     return ret;
4906 }
4907
4908 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
4909 {
4910     struct btrfs_inode *btrfs_inode;
4911     struct list_head splice;
4912
4913     INIT_LIST_HEAD(&splice);
4914
4915     spin_lock(&root->delalloc_lock);
4916     list_splice_init(&root->delalloc_inodes, &splice);
4917
4918     while (!list_empty(&splice)) {
4919         struct inode *inode = NULL;
4920         btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
4921                            delalloc_inodes);
4922         __btrfs_del_delalloc_inode(root, btrfs_inode);
4923         spin_unlock(&root->delalloc_lock);
4924
4925         /*
4926          * Make sure we get a live inode and that it'll not disappear
4927          * meanwhile.
4928          */
4929         inode = igrab(&btrfs_inode->vfs_inode);
4930         if (inode) {
4931             invalidate_inode_pages2(inode->i_mapping);
4932             iput(inode);
4933         }
4934         spin_lock(&root->delalloc_lock);
4935     }
4936     spin_unlock(&root->delalloc_lock);
4937 }
4938
4939 static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
4940 {
4941     struct btrfs_root *root;
4942     struct list_head splice;
4943
4944     INIT_LIST_HEAD(&splice);
4945
4946     spin_lock(&fs_info->delalloc_root_lock);
4947     list_splice_init(&fs_info->delalloc_roots, &splice);
4948     while (!list_empty(&splice)) {
4949         root = list_first_entry(&splice, struct btrfs_root,
4950                      delalloc_root);
4951         root = btrfs_grab_root(root);
4952         BUG_ON(!root);
4953         spin_unlock(&fs_info->delalloc_root_lock);
4954
4955         btrfs_destroy_delalloc_inodes(root);
4956         btrfs_put_root(root);
4957
4958         spin_lock(&fs_info->delalloc_root_lock);
4959     }
4960     spin_unlock(&fs_info->delalloc_root_lock);
4961 }
4962
4963 static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
4964                     struct extent_io_tree *dirty_pages,
4965                     int mark)
4966 {
4967     int ret;
4968     struct extent_buffer *eb;
4969     u64 start = 0;
4970     u64 end;
4971
4972     while (1) {
4973         ret = find_first_extent_bit(dirty_pages, start, &start, &end,
4974                         mark, NULL);
4975         if (ret)
4976             break;
4977
4978         clear_extent_bits(dirty_pages, start, end, mark);
4979         while (start <= end) {
4980             eb = find_extent_buffer(fs_info, start);
4981             start += fs_info->nodesize;
4982             if (!eb)
4983                 continue;
4984             wait_on_extent_buffer_writeback(eb);
4985
4986             if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
4987                            &eb->bflags))
4988                 clear_extent_buffer_dirty(eb);
4989             free_extent_buffer_stale(eb);
4990         }
4991     }
4992
4993     return ret;
4994 }
4995
4996 static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
4997                        struct extent_io_tree *unpin)
4998 {
4999     u64 start;
5000     u64 end;
5001     int ret;
5002
5003     while (1) {
5004         struct extent_state *cached_state = NULL;
5005
5006         /*
5007          * The btrfs_finish_extent_commit() may get the same range as
5008          * ours between find_first_extent_bit and clear_extent_dirty.
5009          * Hence, hold the unused_bg_unpin_mutex to avoid double unpin
5010          * the same extent range.
5011          */
5012         mutex_lock(&fs_info->unused_bg_unpin_mutex);
5013         ret = find_first_extent_bit(unpin, 0, &start, &end,
5014                         EXTENT_DIRTY, &cached_state);
5015         if (ret) {
5016             mutex_unlock(&fs_info->unused_bg_unpin_mutex);
5017             break;
5018         }
5019
5020         clear_extent_dirty(unpin, start, end, &cached_state);
5021         free_extent_state(cached_state);
5022         btrfs_error_unpin_extent_range(fs_info, start, end);
5023         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
5024         cond_resched();
5025     }
5026
5027     return 0;
5028 }
5029
5030 static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
5031 {
5032     struct inode *inode;
5033
5034     inode = cache->io_ctl.inode;
5035     if (inode) {
5036         invalidate_inode_pages2(inode->i_mapping);
5037         BTRFS_I(inode)->generation = 0;
5038         cache->io_ctl.inode = NULL;
5039         iput(inode);
5040     }
5041     ASSERT(cache->io_ctl.pages == NULL);
5042     btrfs_put_block_group(cache);
5043 }
5044
5045 void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
5046                  struct btrfs_fs_info *fs_info)
5047 {
5048     struct btrfs_block_group *cache;
5049
5050     spin_lock(&cur_trans->dirty_bgs_lock);
5051     while (!list_empty(&cur_trans->dirty_bgs)) {
5052         cache = list_first_entry(&cur_trans->dirty_bgs,
5053                      struct btrfs_block_group,
5054                      dirty_list);
5055
5056         if (!list_empty(&cache->io_list)) {
5057             spin_unlock(&cur_trans->dirty_bgs_lock);
5058             list_del_init(&cache->io_list);
5059             btrfs_cleanup_bg_io(cache);
5060             spin_lock(&cur_trans->dirty_bgs_lock);
5061         }
5062
5063         list_del_init(&cache->dirty_list);
5064         spin_lock(&cache->lock);
5065         cache->disk_cache_state = BTRFS_DC_ERROR;
5066         spin_unlock(&cache->lock);
5067
5068         spin_unlock(&cur_trans->dirty_bgs_lock);
5069         btrfs_put_block_group(cache);
5070         btrfs_delayed_refs_rsv_release(fs_info, 1);
5071         spin_lock(&cur_trans->dirty_bgs_lock);
5072     }
5073     spin_unlock(&cur_trans->dirty_bgs_lock);
5074
5075     /*
5076      * Refer to the definition of io_bgs member for details why it's safe
5077      * to use it without any locking
5078      */
5079     while (!list_empty(&cur_trans->io_bgs)) {
5080         cache = list_first_entry(&cur_trans->io_bgs,
5081                      struct btrfs_block_group,
5082                      io_list);
5083
5084         list_del_init(&cache->io_list);
5085         spin_lock(&cache->lock);
5086         cache->disk_cache_state = BTRFS_DC_ERROR;
5087         spin_unlock(&cache->lock);
5088         btrfs_cleanup_bg_io(cache);
5089     }
5090 }
5091
5092 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
5093                    struct btrfs_fs_info *fs_info)
5094 {
5095     struct btrfs_device *dev, *tmp;
5096
5097     btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
5098     ASSERT(list_empty(&cur_trans->dirty_bgs));
5099     ASSERT(list_empty(&cur_trans->io_bgs));
5100
5101     list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
5102                  post_commit_list) {
5103         list_del_init(&dev->post_commit_list);
5104     }
5105
5106     btrfs_destroy_delayed_refs(cur_trans, fs_info);
5107
5108     cur_trans->state = TRANS_STATE_COMMIT_START;
5109     wake_up(&fs_info->transaction_blocked_wait);
5110
5111     cur_trans->state = TRANS_STATE_UNBLOCKED;
5112     wake_up(&fs_info->transaction_wait);
5113
5114     btrfs_destroy_delayed_inodes(fs_info);
5115
5116     btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
5117                      EXTENT_DIRTY);
5118     btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
5119
5120     btrfs_free_redirty_list(cur_trans);
5121
5122     cur_trans->state =TRANS_STATE_COMPLETED;
5123     wake_up(&cur_trans->commit_wait);
5124 }
5125
5126 static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
5127 {
5128     struct btrfs_transaction *t;
5129
5130     mutex_lock(&fs_info->transaction_kthread_mutex);
5131
5132     spin_lock(&fs_info->trans_lock);
5133     while (!list_empty(&fs_info->trans_list)) {
5134         t = list_first_entry(&fs_info->trans_list,
5135                      struct btrfs_transaction, list);
5136         if (t->state >= TRANS_STATE_COMMIT_START) {
5137             refcount_inc(&t->use_count);
5138             spin_unlock(&fs_info->trans_lock);
5139             btrfs_wait_for_commit(fs_info, t->transid);
5140             btrfs_put_transaction(t);
5141             spin_lock(&fs_info->trans_lock);
5142             continue;
5143         }
5144         if (t == fs_info->running_transaction) {
5145             t->state = TRANS_STATE_COMMIT_DOING;
5146             spin_unlock(&fs_info->trans_lock);
5147             /*
5148              * We wait for 0 num_writers since we don't hold a trans
5149              * handle open currently for this transaction.
5150              */
5151             wait_event(t->writer_wait,
5152                    atomic_read(&t->num_writers) == 0);
5153         } else {
5154             spin_unlock(&fs_info->trans_lock);
5155         }
5156         btrfs_cleanup_one_transaction(t, fs_info);
5157
5158         spin_lock(&fs_info->trans_lock);
5159         if (t == fs_info->running_transaction)
5160             fs_info->running_transaction = NULL;
5161         list_del_init(&t->list);
5162         spin_unlock(&fs_info->trans_lock);
5163
5164         btrfs_put_transaction(t);
5165         trace_btrfs_transaction_commit(fs_info);
5166         spin_lock(&fs_info->trans_lock);
5167     }
5168     spin_unlock(&fs_info->trans_lock);
5169     btrfs_destroy_all_ordered_extents(fs_info);
5170     btrfs_destroy_delayed_inodes(fs_info);
5171     btrfs_assert_delayed_root_empty(fs_info);
5172     btrfs_destroy_all_delalloc_inodes(fs_info);
5173     btrfs_drop_all_logs(fs_info);
5174     mutex_unlock(&fs_info->transaction_kthread_mutex);
5175
5176     return 0;
5177 }
5178
5179 int btrfs_init_root_free_objectid(struct btrfs_root *root)
5180 {
5181     struct btrfs_path *path;
5182     int ret;
5183     struct extent_buffer *l;
5184     struct btrfs_key search_key;
5185     struct btrfs_key found_key;
5186     int slot;
5187
5188     path = btrfs_alloc_path();
5189     if (!path)
5190         return -ENOMEM;
5191
5192     search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
5193     search_key.type = -1;
5194     search_key.offset = (u64)-1;
5195     ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5196     if (ret < 0)
5197         goto error;
5198     BUG_ON(ret == 0); /* Corruption */
5199     if (path->slots[0] > 0) {
5200         slot = path->slots[0] - 1;
5201         l = path->nodes[0];
5202         btrfs_item_key_to_cpu(l, &found_key, slot);
5203         root->free_objectid = max_t(u64, found_key.objectid + 1,
5204                         BTRFS_FIRST_FREE_OBJECTID);
5205     } else {
5206         root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
5207     }
5208     ret = 0;
5209 error:
5210     btrfs_free_path(path);
5211     return ret;
5212 }
5213
5214 int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
5215 {
5216     int ret;
5217     mutex_lock(&root->objectid_mutex);
5218
5219     if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
5220         btrfs_warn(root->fs_info,
5221                "the objectid of root %llu reaches its highest value",
5222                root->root_key.objectid);
5223         ret = -ENOSPC;
5224         goto out;
5225     }
5226
5227     *objectid = root->free_objectid++;
5228     ret = 0;
5229 out:
5230     mutex_unlock(&root->objectid_mutex);
5231     return ret;
5232 }