0001 // SPDX-License-Identifier: GPL-2.0
0002 
0003 #include <linux/list_sort.h>
0004 #include "misc.h"
0005 #include "ctree.h"
0006 #include "block-group.h"
0007 #include "space-info.h"
0008 #include "disk-io.h"
0009 #include "free-space-cache.h"
0010 #include "free-space-tree.h"
0011 #include "volumes.h"
0012 #include "transaction.h"
0013 #include "ref-verify.h"
0014 #include "sysfs.h"
0015 #include "tree-log.h"
0016 #include "delalloc-space.h"
0017 #include "discard.h"
0018 #include "raid56.h"
0019 #include "zoned.h"
0020 
0021 /*
0022  * Return target flags in extended format or 0 if restripe for this chunk_type
0023  * is not in progress
0024  *
0025  * Should be called with balance_lock held
0026  */
0027 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
0028 {
0029     struct btrfs_balance_control *bctl = fs_info->balance_ctl;
0030     u64 target = 0;
0031 
0032     if (!bctl)
0033         return 0;
0034 
0035     if (flags & BTRFS_BLOCK_GROUP_DATA &&
0036         bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
0037         target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
0038     } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
0039            bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
0040         target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
0041     } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
0042            bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
0043         target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
0044     }
0045 
0046     return target;
0047 }
0048 
0049 /*
0050  * @flags: available profiles in extended format (see ctree.h)
0051  *
0052  * Return reduced profile in chunk format.  If profile changing is in progress
0053  * (either running or paused) picks the target profile (if it's already
0054  * available), otherwise falls back to plain reducing.
0055  */
0056 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
0057 {
0058     u64 num_devices = fs_info->fs_devices->rw_devices;
0059     u64 target;
0060     u64 raid_type;
0061     u64 allowed = 0;
0062 
0063     /*
0064      * See if restripe for this chunk_type is in progress, if so try to
0065      * reduce to the target profile
0066      */
0067     spin_lock(&fs_info->balance_lock);
0068     target = get_restripe_target(fs_info, flags);
0069     if (target) {
0070         spin_unlock(&fs_info->balance_lock);
0071         return extended_to_chunk(target);
0072     }
0073     spin_unlock(&fs_info->balance_lock);
0074 
0075     /* First, mask out the RAID levels which aren't possible */
0076     for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
0077         if (num_devices >= btrfs_raid_array[raid_type].devs_min)
0078             allowed |= btrfs_raid_array[raid_type].bg_flag;
0079     }
0080     allowed &= flags;
0081 
0082     if (allowed & BTRFS_BLOCK_GROUP_RAID6)
0083         allowed = BTRFS_BLOCK_GROUP_RAID6;
0084     else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
0085         allowed = BTRFS_BLOCK_GROUP_RAID5;
0086     else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
0087         allowed = BTRFS_BLOCK_GROUP_RAID10;
0088     else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
0089         allowed = BTRFS_BLOCK_GROUP_RAID1;
0090     else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
0091         allowed = BTRFS_BLOCK_GROUP_RAID0;
0092 
0093     flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
0094 
0095     return extended_to_chunk(flags | allowed);
0096 }
0097 
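/*
 * Take the original allocation flags and OR in the currently available
 * profile bits for the matching type (data, system or metadata), sampled
 * under the profiles seqlock, then reduce the result to a single profile
 * in chunk format via btrfs_reduce_alloc_profile().
 */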
0098 u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
0099 {
0100     unsigned seq;
0101     u64 flags;
0102 
0103     do {
0104         flags = orig_flags;
0105         seq = read_seqbegin(&fs_info->profiles_lock);
0106 
0107         if (flags & BTRFS_BLOCK_GROUP_DATA)
0108             flags |= fs_info->avail_data_alloc_bits;
0109         else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
0110             flags |= fs_info->avail_system_alloc_bits;
0111         else if (flags & BTRFS_BLOCK_GROUP_METADATA)
0112             flags |= fs_info->avail_metadata_alloc_bits;
0113     } while (read_seqretry(&fs_info->profiles_lock, seq));
0114 
0115     return btrfs_reduce_alloc_profile(fs_info, flags);
0116 }
0117 
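/*
 * Take an extra reference on a block group. Must be paired with a later call
 * to btrfs_put_block_group().
 */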
0118 void btrfs_get_block_group(struct btrfs_block_group *cache)
0119 {
0120     refcount_inc(&cache->refs);
0121 }
0122 
0123 void btrfs_put_block_group(struct btrfs_block_group *cache)
0124 {
0125     if (refcount_dec_and_test(&cache->refs)) {
0126         WARN_ON(cache->pinned > 0);
0127         /*
0128          * If there was a failure to cleanup a log tree, very likely due
0129          * to an IO failure on a writeback attempt of one or more of its
0130          * extent buffers, we could not do proper (and cheap) unaccounting
0131          * of their reserved space, so don't warn on reserved > 0 in that
0132          * case.
0133          */
0134         if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
0135             !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
0136             WARN_ON(cache->reserved > 0);
0137 
0138         /*
0139          * A block_group shouldn't be on the discard_list anymore.
0140          * Remove the block_group from the discard_list to prevent us
0141          * from causing a panic due to NULL pointer dereference.
0142          */
0143         if (WARN_ON(!list_empty(&cache->discard_list)))
0144             btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
0145                           cache);
0146 
0147         /*
0148          * If the tree is not empty, some task is still holding a full
0149          * stripe lock, which can only be released by that task. Freeing
0150          * this block group now would lead to a use-after-free when that
0151          * task releases the lock.
0152          *
0153          * There is no better way to resolve this than to warn.
0154          */
0155         WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
0156         kfree(cache->free_space_ctl);
0157         kfree(cache->physical_map);
0158         kfree(cache);
0159     }
0160 }
0161 
0162 /*
0163  * This adds the block group to the fs_info rb tree for the block group cache
0164  */
0165 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
0166                        struct btrfs_block_group *block_group)
0167 {
0168     struct rb_node **p;
0169     struct rb_node *parent = NULL;
0170     struct btrfs_block_group *cache;
0171     bool leftmost = true;
0172 
0173     ASSERT(block_group->length != 0);
0174 
0175     write_lock(&info->block_group_cache_lock);
0176     p = &info->block_group_cache_tree.rb_root.rb_node;
0177 
0178     while (*p) {
0179         parent = *p;
0180         cache = rb_entry(parent, struct btrfs_block_group, cache_node);
0181         if (block_group->start < cache->start) {
0182             p = &(*p)->rb_left;
0183         } else if (block_group->start > cache->start) {
0184             p = &(*p)->rb_right;
0185             leftmost = false;
0186         } else {
0187             write_unlock(&info->block_group_cache_lock);
0188             return -EEXIST;
0189         }
0190     }
0191 
0192     rb_link_node(&block_group->cache_node, parent, p);
0193     rb_insert_color_cached(&block_group->cache_node,
0194                    &info->block_group_cache_tree, leftmost);
0195 
0196     write_unlock(&info->block_group_cache_lock);
0197 
0198     return 0;
0199 }
0200 
0201 /*
0202  * This will return the block group at or after bytenr if contains is 0, else
0203  * it will return the block group that contains the bytenr
0204  */
0205 static struct btrfs_block_group *block_group_cache_tree_search(
0206         struct btrfs_fs_info *info, u64 bytenr, int contains)
0207 {
0208     struct btrfs_block_group *cache, *ret = NULL;
0209     struct rb_node *n;
0210     u64 end, start;
0211 
0212     read_lock(&info->block_group_cache_lock);
0213     n = info->block_group_cache_tree.rb_root.rb_node;
0214 
0215     while (n) {
0216         cache = rb_entry(n, struct btrfs_block_group, cache_node);
0217         end = cache->start + cache->length - 1;
0218         start = cache->start;
0219 
0220         if (bytenr < start) {
0221             if (!contains && (!ret || start < ret->start))
0222                 ret = cache;
0223             n = n->rb_left;
0224         } else if (bytenr > start) {
0225             if (contains && bytenr <= end) {
0226                 ret = cache;
0227                 break;
0228             }
0229             n = n->rb_right;
0230         } else {
0231             ret = cache;
0232             break;
0233         }
0234     }
0235     if (ret)
0236         btrfs_get_block_group(ret);
0237     read_unlock(&info->block_group_cache_lock);
0238 
0239     return ret;
0240 }
0241 
0242 /*
0243  * Return the block group that starts at or after bytenr
0244  */
0245 struct btrfs_block_group *btrfs_lookup_first_block_group(
0246         struct btrfs_fs_info *info, u64 bytenr)
0247 {
0248     return block_group_cache_tree_search(info, bytenr, 0);
0249 }
0250 
0251 /*
0252  * Return the block group that contains the given bytenr
0253  */
0254 struct btrfs_block_group *btrfs_lookup_block_group(
0255         struct btrfs_fs_info *info, u64 bytenr)
0256 {
0257     return block_group_cache_tree_search(info, bytenr, 1);
0258 }
0259 
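/*
 * Return the block group following @cache in the block group cache rbtree,
 * dropping our reference on @cache. If @cache was removed from the rbtree in
 * the meantime, fall back to a full lookup starting right after its range.
 * Returns NULL if there is no next block group.
 */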
0260 struct btrfs_block_group *btrfs_next_block_group(
0261         struct btrfs_block_group *cache)
0262 {
0263     struct btrfs_fs_info *fs_info = cache->fs_info;
0264     struct rb_node *node;
0265 
0266     read_lock(&fs_info->block_group_cache_lock);
0267 
0268     /* If our block group was removed, we need a full search. */
0269     if (RB_EMPTY_NODE(&cache->cache_node)) {
0270         const u64 next_bytenr = cache->start + cache->length;
0271 
0272         read_unlock(&fs_info->block_group_cache_lock);
0273         btrfs_put_block_group(cache);
0274         return btrfs_lookup_first_block_group(fs_info, next_bytenr);
0275     }
0276     node = rb_next(&cache->cache_node);
0277     btrfs_put_block_group(cache);
0278     if (node) {
0279         cache = rb_entry(node, struct btrfs_block_group, cache_node);
0280         btrfs_get_block_group(cache);
0281     } else
0282         cache = NULL;
0283     read_unlock(&fs_info->block_group_cache_lock);
0284     return cache;
0285 }
0286 
0287 /**
0288  * Check if we can do a NOCOW write for a given extent.
0289  *
0290  * @fs_info:       The filesystem information object.
0291  * @bytenr:        Logical start address of the extent.
0292  *
0293  * Check if we can do a NOCOW write for the given extent and, if so, increment
0294  * the number of NOCOW writers in the block group that contains the extent, as
0295  * long as the block group exists and is not currently in read-only mode.
0296  *
0297  * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
0298  *          is responsible for calling btrfs_dec_nocow_writers() later.
0299  *
0300  *          Or NULL if we cannot do a NOCOW write.
0301  */
0302 struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
0303                           u64 bytenr)
0304 {
0305     struct btrfs_block_group *bg;
0306     bool can_nocow = true;
0307 
0308     bg = btrfs_lookup_block_group(fs_info, bytenr);
0309     if (!bg)
0310         return NULL;
0311 
0312     spin_lock(&bg->lock);
0313     if (bg->ro)
0314         can_nocow = false;
0315     else
0316         atomic_inc(&bg->nocow_writers);
0317     spin_unlock(&bg->lock);
0318 
0319     if (!can_nocow) {
0320         btrfs_put_block_group(bg);
0321         return NULL;
0322     }
0323 
0324     /* No put on block group, done by btrfs_dec_nocow_writers(). */
0325     return bg;
0326 }
0327 
0328 /**
0329  * Decrement the number of NOCOW writers in a block group.
0330  *
0331  * @bg:       The block group.
0332  *
0333  * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
0334  * and on the block group returned by that call. Typically this is called after
0335  * creating an ordered extent for a NOCOW write, to prevent races with scrub and
0336  * relocation.
0337  *
0338  * After this call, the caller should not use the block group anymore. If it wants
0339  * to use it, then it should get a reference on it before calling this function.
0340  */
0341 void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
0342 {
0343     if (atomic_dec_and_test(&bg->nocow_writers))
0344         wake_up_var(&bg->nocow_writers);
0345 
0346     /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
0347     btrfs_put_block_group(bg);
0348 }
0349 
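/*
 * Wait until the block group has no more NOCOW writers, i.e. until the
 * counter incremented by btrfs_inc_nocow_writers() drops back to zero.
 */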
0350 void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
0351 {
0352     wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
0353 }
0354 
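/*
 * Drop one reservation from the block group containing @start and wake up
 * anyone waiting in btrfs_wait_block_group_reservations() once the counter
 * reaches zero.
 */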
0355 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
0356                     const u64 start)
0357 {
0358     struct btrfs_block_group *bg;
0359 
0360     bg = btrfs_lookup_block_group(fs_info, start);
0361     ASSERT(bg);
0362     if (atomic_dec_and_test(&bg->reservations))
0363         wake_up_var(&bg->reservations);
0364     btrfs_put_block_group(bg);
0365 }
0366 
0367 void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
0368 {
0369     struct btrfs_space_info *space_info = bg->space_info;
0370 
0371     ASSERT(bg->ro);
0372 
0373     if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
0374         return;
0375 
0376     /*
0377      * Our block group is read only but before we set it to read only,
0378      * some task might have already allocated an extent from it, but it
0379      * has not yet created a respective ordered extent (and added it to a
0380      * root's list of ordered extents).
0381      * Therefore wait for any task currently allocating extents, since the
0382      * block group's reservations counter is incremented while a read lock
0383      * on the groups' semaphore is held and decremented after releasing
0384      * the read access on that semaphore and creating the ordered extent.
0385      */
0386     down_write(&space_info->groups_sem);
0387     up_write(&space_info->groups_sem);
0388 
0389     wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
0390 }
0391 
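/*
 * Return the caching control of a block group with an extra reference held,
 * or NULL if the block group is not being cached. The reference must be
 * dropped with btrfs_put_caching_control().
 */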
0392 struct btrfs_caching_control *btrfs_get_caching_control(
0393         struct btrfs_block_group *cache)
0394 {
0395     struct btrfs_caching_control *ctl;
0396 
0397     spin_lock(&cache->lock);
0398     if (!cache->caching_ctl) {
0399         spin_unlock(&cache->lock);
0400         return NULL;
0401     }
0402 
0403     ctl = cache->caching_ctl;
0404     refcount_inc(&ctl->count);
0405     spin_unlock(&cache->lock);
0406     return ctl;
0407 }
0408 
0409 void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
0410 {
0411     if (refcount_dec_and_test(&ctl->count))
0412         kfree(ctl);
0413 }
0414 
0415 /*
0416  * When we wait for progress in the block group caching, it's because our
0417  * allocation attempt failed at least once.  So, we must sleep and let some
0418  * progress happen before we try again.
0419  *
0420  * This function will sleep at least once waiting for new free space to show
0421  * up, and then it will check the block group free space numbers for our min
0422  * num_bytes.  Another option is to have it go ahead and look in the rbtree for
0423  * a free extent of a given size, but this is a good start.
0424  *
0425  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
0426  * any of the information in this block group.
0427  */
0428 void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
0429                        u64 num_bytes)
0430 {
0431     struct btrfs_caching_control *caching_ctl;
0432 
0433     caching_ctl = btrfs_get_caching_control(cache);
0434     if (!caching_ctl)
0435         return;
0436 
0437     wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
0438            (cache->free_space_ctl->free_space >= num_bytes));
0439 
0440     btrfs_put_caching_control(caching_ctl);
0441 }
0442 
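/*
 * Wait until caching of the block group has finished and return -EIO if it
 * ended in the BTRFS_CACHE_ERROR state, 0 otherwise.
 */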
0443 static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
0444                        struct btrfs_caching_control *caching_ctl)
0445 {
0446     wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
0447     return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
0448 }
0449 
0450 static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
0451 {
0452     struct btrfs_caching_control *caching_ctl;
0453     int ret;
0454 
0455     caching_ctl = btrfs_get_caching_control(cache);
0456     if (!caching_ctl)
0457         return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
0458     ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
0459     btrfs_put_caching_control(caching_ctl);
0460     return ret;
0461 }
0462 
0463 #ifdef CONFIG_BTRFS_DEBUG
0464 static void fragment_free_space(struct btrfs_block_group *block_group)
0465 {
0466     struct btrfs_fs_info *fs_info = block_group->fs_info;
0467     u64 start = block_group->start;
0468     u64 len = block_group->length;
0469     u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
0470         fs_info->nodesize : fs_info->sectorsize;
0471     u64 step = chunk << 1;
0472 
0473     while (len > chunk) {
0474         btrfs_remove_free_space(block_group, start, chunk);
0475         start += step;
0476         if (len < step)
0477             len = 0;
0478         else
0479             len -= step;
0480     }
0481 }
0482 #endif
0483 
0484 /*
0485  * This is only called by btrfs_cache_block_group. Since we could have freed
0486  * extents, we need to check the pinned_extents for any extents that can't be
0487  * used yet, because their free space will only be released once the transaction
0488  * commits.
0489  */
0490 u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
0491 {
0492     struct btrfs_fs_info *info = block_group->fs_info;
0493     u64 extent_start, extent_end, size, total_added = 0;
0494     int ret;
0495 
0496     while (start < end) {
0497         ret = find_first_extent_bit(&info->excluded_extents, start,
0498                         &extent_start, &extent_end,
0499                         EXTENT_DIRTY | EXTENT_UPTODATE,
0500                         NULL);
0501         if (ret)
0502             break;
0503 
0504         if (extent_start <= start) {
0505             start = extent_end + 1;
0506         } else if (extent_start > start && extent_start < end) {
0507             size = extent_start - start;
0508             total_added += size;
0509             ret = btrfs_add_free_space_async_trimmed(block_group,
0510                                  start, size);
0511             BUG_ON(ret); /* -ENOMEM or logic error */
0512             start = extent_end + 1;
0513         } else {
0514             break;
0515         }
0516     }
0517 
0518     if (start < end) {
0519         size = end - start;
0520         total_added += size;
0521         ret = btrfs_add_free_space_async_trimmed(block_group, start,
0522                              size);
0523         BUG_ON(ret); /* -ENOMEM or logic error */
0524     }
0525 
0526     return total_added;
0527 }
0528 
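/*
 * Slow caching path: walk the extent tree through the commit root (without
 * locking) and record the gaps between allocated extents as free space for
 * the block group. The commit_root_sem and the caching mutex are dropped
 * periodically so that transaction commits are not blocked for too long.
 */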
0529 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
0530 {
0531     struct btrfs_block_group *block_group = caching_ctl->block_group;
0532     struct btrfs_fs_info *fs_info = block_group->fs_info;
0533     struct btrfs_root *extent_root;
0534     struct btrfs_path *path;
0535     struct extent_buffer *leaf;
0536     struct btrfs_key key;
0537     u64 total_found = 0;
0538     u64 last = 0;
0539     u32 nritems;
0540     int ret;
0541     bool wakeup = true;
0542 
0543     path = btrfs_alloc_path();
0544     if (!path)
0545         return -ENOMEM;
0546 
0547     last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
0548     extent_root = btrfs_extent_root(fs_info, last);
0549 
0550 #ifdef CONFIG_BTRFS_DEBUG
0551     /*
0552      * If we're fragmenting we don't want to make anybody think we can
0553      * allocate from this block group until we've had a chance to fragment
0554      * the free space.
0555      */
0556     if (btrfs_should_fragment_free_space(block_group))
0557         wakeup = false;
0558 #endif
0559     /*
0560      * We don't want to deadlock with somebody trying to allocate a new
0561      * extent for the extent root while also trying to search the extent
0562      * root to add free space.  So we skip locking and search the commit
0563      * root, since it's read-only.
0564      */
0565     path->skip_locking = 1;
0566     path->search_commit_root = 1;
0567     path->reada = READA_FORWARD;
0568 
0569     key.objectid = last;
0570     key.offset = 0;
0571     key.type = BTRFS_EXTENT_ITEM_KEY;
0572 
0573 next:
0574     ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
0575     if (ret < 0)
0576         goto out;
0577 
0578     leaf = path->nodes[0];
0579     nritems = btrfs_header_nritems(leaf);
0580 
0581     while (1) {
0582         if (btrfs_fs_closing(fs_info) > 1) {
0583             last = (u64)-1;
0584             break;
0585         }
0586 
0587         if (path->slots[0] < nritems) {
0588             btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
0589         } else {
0590             ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
0591             if (ret)
0592                 break;
0593 
0594             if (need_resched() ||
0595                 rwsem_is_contended(&fs_info->commit_root_sem)) {
0596                 if (wakeup)
0597                     caching_ctl->progress = last;
0598                 btrfs_release_path(path);
0599                 up_read(&fs_info->commit_root_sem);
0600                 mutex_unlock(&caching_ctl->mutex);
0601                 cond_resched();
0602                 mutex_lock(&caching_ctl->mutex);
0603                 down_read(&fs_info->commit_root_sem);
0604                 goto next;
0605             }
0606 
0607             ret = btrfs_next_leaf(extent_root, path);
0608             if (ret < 0)
0609                 goto out;
0610             if (ret)
0611                 break;
0612             leaf = path->nodes[0];
0613             nritems = btrfs_header_nritems(leaf);
0614             continue;
0615         }
0616 
0617         if (key.objectid < last) {
0618             key.objectid = last;
0619             key.offset = 0;
0620             key.type = BTRFS_EXTENT_ITEM_KEY;
0621 
0622             if (wakeup)
0623                 caching_ctl->progress = last;
0624             btrfs_release_path(path);
0625             goto next;
0626         }
0627 
0628         if (key.objectid < block_group->start) {
0629             path->slots[0]++;
0630             continue;
0631         }
0632 
0633         if (key.objectid >= block_group->start + block_group->length)
0634             break;
0635 
0636         if (key.type == BTRFS_EXTENT_ITEM_KEY ||
0637             key.type == BTRFS_METADATA_ITEM_KEY) {
0638             total_found += add_new_free_space(block_group, last,
0639                               key.objectid);
0640             if (key.type == BTRFS_METADATA_ITEM_KEY)
0641                 last = key.objectid +
0642                     fs_info->nodesize;
0643             else
0644                 last = key.objectid + key.offset;
0645 
0646             if (total_found > CACHING_CTL_WAKE_UP) {
0647                 total_found = 0;
0648                 if (wakeup)
0649                     wake_up(&caching_ctl->wait);
0650             }
0651         }
0652         path->slots[0]++;
0653     }
0654     ret = 0;
0655 
0656     total_found += add_new_free_space(block_group, last,
0657                 block_group->start + block_group->length);
0658     caching_ctl->progress = (u64)-1;
0659 
0660 out:
0661     btrfs_free_path(path);
0662     return ret;
0663 }
0664 
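/*
 * Worker that populates the free space of a block group. It first tries the
 * on-disk free space cache (with the space_cache mount option), then either
 * the free space tree or the slow extent tree walk, and finally marks the
 * block group as BTRFS_CACHE_FINISHED or BTRFS_CACHE_ERROR and wakes up any
 * waiters.
 */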
0665 static noinline void caching_thread(struct btrfs_work *work)
0666 {
0667     struct btrfs_block_group *block_group;
0668     struct btrfs_fs_info *fs_info;
0669     struct btrfs_caching_control *caching_ctl;
0670     int ret;
0671 
0672     caching_ctl = container_of(work, struct btrfs_caching_control, work);
0673     block_group = caching_ctl->block_group;
0674     fs_info = block_group->fs_info;
0675 
0676     mutex_lock(&caching_ctl->mutex);
0677     down_read(&fs_info->commit_root_sem);
0678 
0679     if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
0680         ret = load_free_space_cache(block_group);
0681         if (ret == 1) {
0682             ret = 0;
0683             goto done;
0684         }
0685 
0686         /*
0687          * We failed to load the space cache, set ourselves to
0688          * CACHE_STARTED and carry on.
0689          */
0690         spin_lock(&block_group->lock);
0691         block_group->cached = BTRFS_CACHE_STARTED;
0692         spin_unlock(&block_group->lock);
0693         wake_up(&caching_ctl->wait);
0694     }
0695 
0696     /*
0697      * If we are in the transaction that populated the free space tree we
0698      * can't actually cache from the free space tree as our commit root and
0699      * real root are the same, so we could change the contents of the blocks
0700      * while caching.  Instead do the slow caching in this case, and after
0701      * the transaction has committed we will be safe.
0702      */
0703     if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
0704         !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
0705         ret = load_free_space_tree(caching_ctl);
0706     else
0707         ret = load_extent_tree_free(caching_ctl);
0708 done:
0709     spin_lock(&block_group->lock);
0710     block_group->caching_ctl = NULL;
0711     block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
0712     spin_unlock(&block_group->lock);
0713 
0714 #ifdef CONFIG_BTRFS_DEBUG
0715     if (btrfs_should_fragment_free_space(block_group)) {
0716         u64 bytes_used;
0717 
0718         spin_lock(&block_group->space_info->lock);
0719         spin_lock(&block_group->lock);
0720         bytes_used = block_group->length - block_group->used;
0721         block_group->space_info->bytes_used += bytes_used >> 1;
0722         spin_unlock(&block_group->lock);
0723         spin_unlock(&block_group->space_info->lock);
0724         fragment_free_space(block_group);
0725     }
0726 #endif
0727 
0728     caching_ctl->progress = (u64)-1;
0729 
0730     up_read(&fs_info->commit_root_sem);
0731     btrfs_free_excluded_extents(block_group);
0732     mutex_unlock(&caching_ctl->mutex);
0733 
0734     wake_up(&caching_ctl->wait);
0735 
0736     btrfs_put_caching_control(caching_ctl);
0737     btrfs_put_block_group(block_group);
0738 }
0739 
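/*
 * Kick off caching of a block group's free space by queueing caching_thread()
 * on the caching workers, or reuse the caching control of an already started
 * caching run. If @wait is true, also wait for caching to finish. Zoned
 * filesystems do not use the cache and return immediately.
 */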
0740 int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
0741 {
0742     struct btrfs_fs_info *fs_info = cache->fs_info;
0743     struct btrfs_caching_control *caching_ctl = NULL;
0744     int ret = 0;
0745 
0746     /* Allocator for zoned filesystems does not use the cache at all */
0747     if (btrfs_is_zoned(fs_info))
0748         return 0;
0749 
0750     caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
0751     if (!caching_ctl)
0752         return -ENOMEM;
0753 
0754     INIT_LIST_HEAD(&caching_ctl->list);
0755     mutex_init(&caching_ctl->mutex);
0756     init_waitqueue_head(&caching_ctl->wait);
0757     caching_ctl->block_group = cache;
0758     caching_ctl->progress = cache->start;
0759     refcount_set(&caching_ctl->count, 2);
0760     btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
0761 
0762     spin_lock(&cache->lock);
0763     if (cache->cached != BTRFS_CACHE_NO) {
0764         kfree(caching_ctl);
0765 
0766         caching_ctl = cache->caching_ctl;
0767         if (caching_ctl)
0768             refcount_inc(&caching_ctl->count);
0769         spin_unlock(&cache->lock);
0770         goto out;
0771     }
0772     WARN_ON(cache->caching_ctl);
0773     cache->caching_ctl = caching_ctl;
0774     cache->cached = BTRFS_CACHE_STARTED;
0775     cache->has_caching_ctl = 1;
0776     spin_unlock(&cache->lock);
0777 
0778     write_lock(&fs_info->block_group_cache_lock);
0779     refcount_inc(&caching_ctl->count);
0780     list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
0781     write_unlock(&fs_info->block_group_cache_lock);
0782 
0783     btrfs_get_block_group(cache);
0784 
0785     btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
0786 out:
0787     if (wait && caching_ctl)
0788         ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
0789     if (caching_ctl)
0790         btrfs_put_caching_control(caching_ctl);
0791 
0792     return ret;
0793 }
0794 
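/*
 * Clear the extended profile bits of @flags from the per-type (data, metadata
 * and system) available allocation bits, under the profiles seqlock.
 */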
0795 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
0796 {
0797     u64 extra_flags = chunk_to_extended(flags) &
0798                 BTRFS_EXTENDED_PROFILE_MASK;
0799 
0800     write_seqlock(&fs_info->profiles_lock);
0801     if (flags & BTRFS_BLOCK_GROUP_DATA)
0802         fs_info->avail_data_alloc_bits &= ~extra_flags;
0803     if (flags & BTRFS_BLOCK_GROUP_METADATA)
0804         fs_info->avail_metadata_alloc_bits &= ~extra_flags;
0805     if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
0806         fs_info->avail_system_alloc_bits &= ~extra_flags;
0807     write_sequnlock(&fs_info->profiles_lock);
0808 }
0809 
0810 /*
0811  * Clear incompat bits for the following feature(s):
0812  *
0813  * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
0814  *            in the whole filesystem
0815  *
0816  * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
0817  */
0818 static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
0819 {
0820     bool found_raid56 = false;
0821     bool found_raid1c34 = false;
0822 
0823     if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
0824         (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
0825         (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
0826         struct list_head *head = &fs_info->space_info;
0827         struct btrfs_space_info *sinfo;
0828 
0829         list_for_each_entry_rcu(sinfo, head, list) {
0830             down_read(&sinfo->groups_sem);
0831             if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
0832                 found_raid56 = true;
0833             if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
0834                 found_raid56 = true;
0835             if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
0836                 found_raid1c34 = true;
0837             if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
0838                 found_raid1c34 = true;
0839             up_read(&sinfo->groups_sem);
0840         }
0841         if (!found_raid56)
0842             btrfs_clear_fs_incompat(fs_info, RAID56);
0843         if (!found_raid1c34)
0844             btrfs_clear_fs_incompat(fs_info, RAID1C34);
0845     }
0846 }
0847 
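/*
 * Delete the on-disk item describing @block_group, keyed by its start offset
 * and length, from the tree returned by btrfs_block_group_root().
 */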
0848 static int remove_block_group_item(struct btrfs_trans_handle *trans,
0849                    struct btrfs_path *path,
0850                    struct btrfs_block_group *block_group)
0851 {
0852     struct btrfs_fs_info *fs_info = trans->fs_info;
0853     struct btrfs_root *root;
0854     struct btrfs_key key;
0855     int ret;
0856 
0857     root = btrfs_block_group_root(fs_info);
0858     key.objectid = block_group->start;
0859     key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
0860     key.offset = block_group->length;
0861 
0862     ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
0863     if (ret > 0)
0864         ret = -ENOENT;
0865     if (ret < 0)
0866         return ret;
0867 
0868     ret = btrfs_del_item(trans, root, path);
0869     return ret;
0870 }
0871 
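/*
 * Remove a read-only block group: detach it from the allocation clusters,
 * the free space cache and the block group rbtree, update the space_info
 * counters, delete its free space tree entries and its on-disk item and,
 * unless the block group is still frozen, drop its extent map from the
 * mapping tree.
 */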
0872 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
0873                  u64 group_start, struct extent_map *em)
0874 {
0875     struct btrfs_fs_info *fs_info = trans->fs_info;
0876     struct btrfs_path *path;
0877     struct btrfs_block_group *block_group;
0878     struct btrfs_free_cluster *cluster;
0879     struct inode *inode;
0880     struct kobject *kobj = NULL;
0881     int ret;
0882     int index;
0883     int factor;
0884     struct btrfs_caching_control *caching_ctl = NULL;
0885     bool remove_em;
0886     bool remove_rsv = false;
0887 
0888     block_group = btrfs_lookup_block_group(fs_info, group_start);
0889     BUG_ON(!block_group);
0890     BUG_ON(!block_group->ro);
0891 
0892     trace_btrfs_remove_block_group(block_group);
0893     /*
0894      * Free the reserved super bytes from this block group before
0895      * removing it.
0896      */
0897     btrfs_free_excluded_extents(block_group);
0898     btrfs_free_ref_tree_range(fs_info, block_group->start,
0899                   block_group->length);
0900 
0901     index = btrfs_bg_flags_to_raid_index(block_group->flags);
0902     factor = btrfs_bg_type_to_factor(block_group->flags);
0903 
0904     /* make sure this block group isn't part of an allocation cluster */
0905     cluster = &fs_info->data_alloc_cluster;
0906     spin_lock(&cluster->refill_lock);
0907     btrfs_return_cluster_to_free_space(block_group, cluster);
0908     spin_unlock(&cluster->refill_lock);
0909 
0910     /*
0911      * make sure this block group isn't part of a metadata
0912      * allocation cluster
0913      */
0914     cluster = &fs_info->meta_alloc_cluster;
0915     spin_lock(&cluster->refill_lock);
0916     btrfs_return_cluster_to_free_space(block_group, cluster);
0917     spin_unlock(&cluster->refill_lock);
0918 
0919     btrfs_clear_treelog_bg(block_group);
0920     btrfs_clear_data_reloc_bg(block_group);
0921 
0922     path = btrfs_alloc_path();
0923     if (!path) {
0924         ret = -ENOMEM;
0925         goto out;
0926     }
0927 
0928     /*
0929      * get the inode first so any iput calls done for the io_list
0930      * aren't the final iput (no unlinks allowed now)
0931      */
0932     inode = lookup_free_space_inode(block_group, path);
0933 
0934     mutex_lock(&trans->transaction->cache_write_mutex);
0935     /*
0936      * Make sure our free space cache IO is done before removing the
0937      * free space inode
0938      */
0939     spin_lock(&trans->transaction->dirty_bgs_lock);
0940     if (!list_empty(&block_group->io_list)) {
0941         list_del_init(&block_group->io_list);
0942 
0943         WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
0944 
0945         spin_unlock(&trans->transaction->dirty_bgs_lock);
0946         btrfs_wait_cache_io(trans, block_group, path);
0947         btrfs_put_block_group(block_group);
0948         spin_lock(&trans->transaction->dirty_bgs_lock);
0949     }
0950 
0951     if (!list_empty(&block_group->dirty_list)) {
0952         list_del_init(&block_group->dirty_list);
0953         remove_rsv = true;
0954         btrfs_put_block_group(block_group);
0955     }
0956     spin_unlock(&trans->transaction->dirty_bgs_lock);
0957     mutex_unlock(&trans->transaction->cache_write_mutex);
0958 
0959     ret = btrfs_remove_free_space_inode(trans, inode, block_group);
0960     if (ret)
0961         goto out;
0962 
0963     write_lock(&fs_info->block_group_cache_lock);
0964     rb_erase_cached(&block_group->cache_node,
0965             &fs_info->block_group_cache_tree);
0966     RB_CLEAR_NODE(&block_group->cache_node);
0967 
0968     /* Once for the block groups rbtree */
0969     btrfs_put_block_group(block_group);
0970 
0971     write_unlock(&fs_info->block_group_cache_lock);
0972 
0973     down_write(&block_group->space_info->groups_sem);
0974     /*
0975      * we must use list_del_init so people can check to see if they
0976      * are still on the list after taking the semaphore
0977      */
0978     list_del_init(&block_group->list);
0979     if (list_empty(&block_group->space_info->block_groups[index])) {
0980         kobj = block_group->space_info->block_group_kobjs[index];
0981         block_group->space_info->block_group_kobjs[index] = NULL;
0982         clear_avail_alloc_bits(fs_info, block_group->flags);
0983     }
0984     up_write(&block_group->space_info->groups_sem);
0985     clear_incompat_bg_bits(fs_info, block_group->flags);
0986     if (kobj) {
0987         kobject_del(kobj);
0988         kobject_put(kobj);
0989     }
0990 
0991     if (block_group->has_caching_ctl)
0992         caching_ctl = btrfs_get_caching_control(block_group);
0993     if (block_group->cached == BTRFS_CACHE_STARTED)
0994         btrfs_wait_block_group_cache_done(block_group);
0995     if (block_group->has_caching_ctl) {
0996         write_lock(&fs_info->block_group_cache_lock);
0997         if (!caching_ctl) {
0998             struct btrfs_caching_control *ctl;
0999 
1000             list_for_each_entry(ctl,
1001                     &fs_info->caching_block_groups, list)
1002                 if (ctl->block_group == block_group) {
1003                     caching_ctl = ctl;
1004                     refcount_inc(&caching_ctl->count);
1005                     break;
1006                 }
1007         }
1008         if (caching_ctl)
1009             list_del_init(&caching_ctl->list);
1010         write_unlock(&fs_info->block_group_cache_lock);
1011         if (caching_ctl) {
1012             /* Once for the caching bgs list and once for us. */
1013             btrfs_put_caching_control(caching_ctl);
1014             btrfs_put_caching_control(caching_ctl);
1015         }
1016     }
1017 
1018     spin_lock(&trans->transaction->dirty_bgs_lock);
1019     WARN_ON(!list_empty(&block_group->dirty_list));
1020     WARN_ON(!list_empty(&block_group->io_list));
1021     spin_unlock(&trans->transaction->dirty_bgs_lock);
1022 
1023     btrfs_remove_free_space_cache(block_group);
1024 
1025     spin_lock(&block_group->space_info->lock);
1026     list_del_init(&block_group->ro_list);
1027 
1028     if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
1029         WARN_ON(block_group->space_info->total_bytes
1030             < block_group->length);
1031         WARN_ON(block_group->space_info->bytes_readonly
1032             < block_group->length - block_group->zone_unusable);
1033         WARN_ON(block_group->space_info->bytes_zone_unusable
1034             < block_group->zone_unusable);
1035         WARN_ON(block_group->space_info->disk_total
1036             < block_group->length * factor);
1037         WARN_ON(block_group->zone_is_active &&
1038             block_group->space_info->active_total_bytes
1039             < block_group->length);
1040     }
1041     block_group->space_info->total_bytes -= block_group->length;
1042     if (block_group->zone_is_active)
1043         block_group->space_info->active_total_bytes -= block_group->length;
1044     block_group->space_info->bytes_readonly -=
1045         (block_group->length - block_group->zone_unusable);
1046     block_group->space_info->bytes_zone_unusable -=
1047         block_group->zone_unusable;
1048     block_group->space_info->disk_total -= block_group->length * factor;
1049 
1050     spin_unlock(&block_group->space_info->lock);
1051 
1052     /*
1053      * Remove the free space for the block group from the free space tree
1054      * and the block group's item from the extent tree before marking the
1055      * block group as removed. This is to prevent races with tasks that
1056      * freeze and unfreeze a block group, this task and another task
1057      * allocating a new block group - the unfreeze task ends up removing
1058      * the block group's extent map before the task calling this function
1059      * deletes the block group item from the extent tree, allowing for
1060      * another task to attempt to create another block group with the same
1061      * item key (and failing with -EEXIST and a transaction abort).
1062      */
1063     ret = remove_block_group_free_space(trans, block_group);
1064     if (ret)
1065         goto out;
1066 
1067     ret = remove_block_group_item(trans, path, block_group);
1068     if (ret < 0)
1069         goto out;
1070 
1071     spin_lock(&block_group->lock);
1072     block_group->removed = 1;
1073     /*
1074      * At this point trimming or scrub can't start on this block group,
1075      * because we removed the block group from the rbtree
1076      * fs_info->block_group_cache_tree, so no one can find it anymore and
1077      * even if someone already got this block group before we removed it
1078      * from the rbtree, they have already incremented block_group->frozen -
1079      * if they didn't, for the trimming case they won't find any free space
1080      * entries because we already removed them all when we called
1081      * btrfs_remove_free_space_cache().
1082      *
1083      * And we must not remove the extent map from the fs_info->mapping_tree
1084      * to prevent the same logical address range and physical device space
1085      * ranges from being reused for a new block group. This is needed to
1086      * avoid races with trimming and scrub.
1087      *
1088      * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
1089      * completely transactionless, so while it is trimming a range the
1090      * currently running transaction might finish and a new one start,
1091      * allowing for new block groups to be created that can reuse the same
1092      * physical device locations unless we take this special care.
1093      *
1094      * There may also be an implicit trim operation if the file system
1095      * is mounted with -odiscard. The same protections must remain
1096      * in place until the extents have been discarded completely when
1097      * the transaction commit has completed.
1098      */
1099     remove_em = (atomic_read(&block_group->frozen) == 0);
1100     spin_unlock(&block_group->lock);
1101 
1102     if (remove_em) {
1103         struct extent_map_tree *em_tree;
1104 
1105         em_tree = &fs_info->mapping_tree;
1106         write_lock(&em_tree->lock);
1107         remove_extent_mapping(em_tree, em);
1108         write_unlock(&em_tree->lock);
1109         /* once for the tree */
1110         free_extent_map(em);
1111     }
1112 
1113 out:
1114     /* Once for the lookup reference */
1115     btrfs_put_block_group(block_group);
1116     if (remove_rsv)
1117         btrfs_delayed_refs_rsv_release(fs_info, 1);
1118     btrfs_free_path(path);
1119     return ret;
1120 }
1121 
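/*
 * Start a transaction with enough metadata units reserved to remove the
 * block group/chunk at @chunk_offset; see the comment below for the
 * breakdown of the 3 + num_stripes items that may be modified.
 */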
1122 struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
1123         struct btrfs_fs_info *fs_info, const u64 chunk_offset)
1124 {
1125     struct btrfs_root *root = btrfs_block_group_root(fs_info);
1126     struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1127     struct extent_map *em;
1128     struct map_lookup *map;
1129     unsigned int num_items;
1130 
1131     read_lock(&em_tree->lock);
1132     em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1133     read_unlock(&em_tree->lock);
1134     ASSERT(em && em->start == chunk_offset);
1135 
1136     /*
1137      * We need to reserve 3 + N units from the metadata space info in order
1138      * to remove a block group (done at btrfs_remove_chunk() and at
1139      * btrfs_remove_block_group()), which are used for:
1140      *
1141      * 1 unit for adding the free space inode's orphan (located in the tree
1142      * of tree roots).
1143      * 1 unit for deleting the block group item (located in the extent
1144      * tree).
1145      * 1 unit for deleting the free space item (located in tree of tree
1146      * roots).
1147      * N units for deleting N device extent items corresponding to each
1148      * stripe (located in the device tree).
1149      *
1150      * In order to remove a block group we also need to reserve units in the
1151      * system space info in order to update the chunk tree (update one or
1152      * more device items and remove one chunk item), but this is done at
1153      * btrfs_remove_chunk() through a call to check_system_chunk().
1154      */
1155     map = em->map_lookup;
1156     num_items = 3 + map->num_stripes;
1157     free_extent_map(em);
1158 
1159     return btrfs_start_transaction_fallback_global_rsv(root, num_items);
1160 }
1161 
1162 /*
1163  * Mark block group @cache read-only, so later writes won't happen to block
1164  * group @cache.
1165  *
1166  * If @force is not set, this function will only mark the block group readonly
1167  * if we have enough free space (1M) in other metadata/system block groups.
1168  * If @force is set, this function will mark the block group readonly
1169  * without checking free space.
1170  *
1171  * NOTE: This function doesn't care if other block groups can contain all the
1172  * data in this block group. That check should be done by relocation routine,
1173  * not this function.
1174  */
1175 static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
1176 {
1177     struct btrfs_space_info *sinfo = cache->space_info;
1178     u64 num_bytes;
1179     int ret = -ENOSPC;
1180 
1181     spin_lock(&sinfo->lock);
1182     spin_lock(&cache->lock);
1183 
1184     if (cache->swap_extents) {
1185         ret = -ETXTBSY;
1186         goto out;
1187     }
1188 
1189     if (cache->ro) {
1190         cache->ro++;
1191         ret = 0;
1192         goto out;
1193     }
1194 
1195     num_bytes = cache->length - cache->reserved - cache->pinned -
1196             cache->bytes_super - cache->zone_unusable - cache->used;
1197 
1198     /*
1199      * Data never overcommits, even in mixed mode, so do just the straight
1200      * check of left over space in how much we have allocated.
1201      */
1202     if (force) {
1203         ret = 0;
1204     } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
1205         u64 sinfo_used = btrfs_space_info_used(sinfo, true);
1206 
1207         /*
1208          * Here we make sure if we mark this bg RO, we still have enough
1209          * free space as buffer.
1210          */
1211         if (sinfo_used + num_bytes <= sinfo->total_bytes)
1212             ret = 0;
1213     } else {
1214         /*
1215          * We overcommit metadata, so we need to do the
1216          * btrfs_can_overcommit check here, and we need to pass in
1217          * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
1218          * leeway to allow us to mark this block group as read only.
1219          */
1220         if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
1221                      BTRFS_RESERVE_NO_FLUSH))
1222             ret = 0;
1223     }
1224 
1225     if (!ret) {
1226         sinfo->bytes_readonly += num_bytes;
1227         if (btrfs_is_zoned(cache->fs_info)) {
1228             /* Migrate zone_unusable bytes to readonly */
1229             sinfo->bytes_readonly += cache->zone_unusable;
1230             sinfo->bytes_zone_unusable -= cache->zone_unusable;
1231             cache->zone_unusable = 0;
1232         }
1233         cache->ro++;
1234         list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
1235     }
1236 out:
1237     spin_unlock(&cache->lock);
1238     spin_unlock(&sinfo->lock);
1239     if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
1240         btrfs_info(cache->fs_info,
1241             "unable to make block group %llu ro", cache->start);
1242         btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
1243     }
1244     return ret;
1245 }
1246 
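/*
 * Clear any pinned extent ranges of @bg from the current and, if it still
 * exists, the previous transaction, so the block group can be removed.
 * Returns true on success, false if clearing the extent bits failed.
 */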
1247 static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
1248                  struct btrfs_block_group *bg)
1249 {
1250     struct btrfs_fs_info *fs_info = bg->fs_info;
1251     struct btrfs_transaction *prev_trans = NULL;
1252     const u64 start = bg->start;
1253     const u64 end = start + bg->length - 1;
1254     int ret;
1255 
1256     spin_lock(&fs_info->trans_lock);
1257     if (trans->transaction->list.prev != &fs_info->trans_list) {
1258         prev_trans = list_last_entry(&trans->transaction->list,
1259                          struct btrfs_transaction, list);
1260         refcount_inc(&prev_trans->use_count);
1261     }
1262     spin_unlock(&fs_info->trans_lock);
1263 
1264     /*
1265      * Hold the unused_bg_unpin_mutex lock to avoid racing with
1266      * btrfs_finish_extent_commit(). If we are at transaction N, another
1267      * task might be running finish_extent_commit() for the previous
1268      * transaction N - 1, and have seen a range belonging to the block
1269      * group in pinned_extents before we were able to clear the whole block
1270      * group range from pinned_extents. This means that task can lookup for
1271      * the block group after we unpinned it from pinned_extents and removed
1272      * it, leading to a BUG_ON() at unpin_extent_range().
1273      */
1274     mutex_lock(&fs_info->unused_bg_unpin_mutex);
1275     if (prev_trans) {
1276         ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
1277                     EXTENT_DIRTY);
1278         if (ret)
1279             goto out;
1280     }
1281 
1282     ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
1283                 EXTENT_DIRTY);
1284 out:
1285     mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1286     if (prev_trans)
1287         btrfs_put_transaction(prev_trans);
1288 
1289     return ret == 0;
1290 }
1291 
1292 /*
1293  * Process the unused_bgs list and remove any that don't have any allocated
1294  * space inside of them.
1295  */
1296 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
1297 {
1298     struct btrfs_block_group *block_group;
1299     struct btrfs_space_info *space_info;
1300     struct btrfs_trans_handle *trans;
1301     const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
1302     int ret = 0;
1303 
1304     if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1305         return;
1306 
1307     /*
1308      * Long running balances can keep us blocked here for eternity, so
1309      * simply skip deletion if we're unable to get the mutex.
1310      */
1311     if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
1312         return;
1313 
1314     spin_lock(&fs_info->unused_bgs_lock);
1315     while (!list_empty(&fs_info->unused_bgs)) {
1316         int trimming;
1317 
1318         block_group = list_first_entry(&fs_info->unused_bgs,
1319                            struct btrfs_block_group,
1320                            bg_list);
1321         list_del_init(&block_group->bg_list);
1322 
1323         space_info = block_group->space_info;
1324 
1325         if (ret || btrfs_mixed_space_info(space_info)) {
1326             btrfs_put_block_group(block_group);
1327             continue;
1328         }
1329         spin_unlock(&fs_info->unused_bgs_lock);
1330 
1331         btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
1332 
1333         /* Don't want to race with allocators so take the groups_sem */
1334         down_write(&space_info->groups_sem);
1335 
1336         /*
1337          * Async discard moves the final block group discard to be prior
1338          * to the unused_bgs code path.  Therefore, if it's not fully
1339          * trimmed, punt it back to the async discard lists.
1340          */
1341         if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
1342             !btrfs_is_free_space_trimmed(block_group)) {
1343             trace_btrfs_skip_unused_block_group(block_group);
1344             up_write(&space_info->groups_sem);
1345             /* Requeue if we failed because of async discard */
1346             btrfs_discard_queue_work(&fs_info->discard_ctl,
1347                          block_group);
1348             goto next;
1349         }
1350 
1351         spin_lock(&block_group->lock);
1352         if (block_group->reserved || block_group->pinned ||
1353             block_group->used || block_group->ro ||
1354             list_is_singular(&block_group->list)) {
1355             /*
1356              * We want to bail if we made new allocations or have
1357              * outstanding allocations in this block group.  We do
1358              * the ro check in case balance is currently acting on
1359              * this block group.
1360              */
1361             trace_btrfs_skip_unused_block_group(block_group);
1362             spin_unlock(&block_group->lock);
1363             up_write(&space_info->groups_sem);
1364             goto next;
1365         }
1366         spin_unlock(&block_group->lock);
1367 
1368         /* We don't want to force the issue, only flip if it's ok. */
1369         ret = inc_block_group_ro(block_group, 0);
1370         up_write(&space_info->groups_sem);
1371         if (ret < 0) {
1372             ret = 0;
1373             goto next;
1374         }
1375 
1376         ret = btrfs_zone_finish(block_group);
1377         if (ret < 0) {
1378             btrfs_dec_block_group_ro(block_group);
1379             if (ret == -EAGAIN)
1380                 ret = 0;
1381             goto next;
1382         }
1383 
1384         /*
1385          * Want to do this before we do anything else so we can recover
1386          * properly if we fail to join the transaction.
1387          */
1388         trans = btrfs_start_trans_remove_block_group(fs_info,
1389                              block_group->start);
1390         if (IS_ERR(trans)) {
1391             btrfs_dec_block_group_ro(block_group);
1392             ret = PTR_ERR(trans);
1393             goto next;
1394         }
1395 
1396         /*
1397          * We could have pending pinned extents for this block group,
1398          * just delete them, we don't care about them anymore.
1399          */
1400         if (!clean_pinned_extents(trans, block_group)) {
1401             btrfs_dec_block_group_ro(block_group);
1402             goto end_trans;
1403         }
1404 
1405         /*
1406          * At this point, the block_group is read only and should fail
1407          * new allocations.  However, btrfs_finish_extent_commit() can
1408          * cause this block_group to be placed back on the discard
1409          * lists because now the block_group isn't fully discarded.
1410          * Bail here and try again later after discarding everything.
1411          */
1412         spin_lock(&fs_info->discard_ctl.lock);
1413         if (!list_empty(&block_group->discard_list)) {
1414             spin_unlock(&fs_info->discard_ctl.lock);
1415             btrfs_dec_block_group_ro(block_group);
1416             btrfs_discard_queue_work(&fs_info->discard_ctl,
1417                          block_group);
1418             goto end_trans;
1419         }
1420         spin_unlock(&fs_info->discard_ctl.lock);
1421 
1422         /* Reset pinned so btrfs_put_block_group doesn't complain */
1423         spin_lock(&space_info->lock);
1424         spin_lock(&block_group->lock);
1425 
1426         btrfs_space_info_update_bytes_pinned(fs_info, space_info,
1427                              -block_group->pinned);
1428         space_info->bytes_readonly += block_group->pinned;
1429         block_group->pinned = 0;
1430 
1431         spin_unlock(&block_group->lock);
1432         spin_unlock(&space_info->lock);
1433 
1434         /*
1435          * Normally, an unused block group is passed in here and trimming
1436          * is then handled in the transaction commit path.
1437          * Async discard interposes before this to do the trimming
1438          * before coming down the unused block group path as trimming
1439          * will no longer be done later in the transaction commit path.
1440          */
1441         if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
1442             goto flip_async;
1443 
1444         /*
1445          * DISCARD can flip during remount. On zoned filesystems, we
1446          * need to reset sequential-required zones.
1447          */
1448         trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
1449                 btrfs_is_zoned(fs_info);
1450 
1451         /* Implicit trim during transaction commit. */
1452         if (trimming)
1453             btrfs_freeze_block_group(block_group);
1454 
1455         /*
1456          * btrfs_remove_chunk() will abort the transaction if things go
1457          * horribly wrong.
1458          */
1459         ret = btrfs_remove_chunk(trans, block_group->start);
1460 
1461         if (ret) {
1462             if (trimming)
1463                 btrfs_unfreeze_block_group(block_group);
1464             goto end_trans;
1465         }
1466 
1467         /*
1468          * If we're not mounted with -odiscard, we can just forget
1469          * about this block group. Otherwise we'll need to wait
1470          * until transaction commit to do the actual discard.
1471          */
1472         if (trimming) {
1473             spin_lock(&fs_info->unused_bgs_lock);
1474             /*
1475              * A concurrent scrub might have added us to the list
1476              * fs_info->unused_bgs, so use a list_move operation
1477              * to add the block group to the deleted_bgs list.
1478              */
1479             list_move(&block_group->bg_list,
1480                   &trans->transaction->deleted_bgs);
1481             spin_unlock(&fs_info->unused_bgs_lock);
1482             btrfs_get_block_group(block_group);
1483         }
1484 end_trans:
1485         btrfs_end_transaction(trans);
1486 next:
1487         btrfs_put_block_group(block_group);
1488         spin_lock(&fs_info->unused_bgs_lock);
1489     }
1490     spin_unlock(&fs_info->unused_bgs_lock);
1491     mutex_unlock(&fs_info->reclaim_bgs_lock);
1492     return;
1493 
1494 flip_async:
1495     btrfs_end_transaction(trans);
1496     mutex_unlock(&fs_info->reclaim_bgs_lock);
1497     btrfs_put_block_group(block_group);
1498     btrfs_discard_punt_unused_bgs_list(fs_info);
1499 }
1500 
1501 void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
1502 {
1503     struct btrfs_fs_info *fs_info = bg->fs_info;
1504 
1505     spin_lock(&fs_info->unused_bgs_lock);
1506     if (list_empty(&bg->bg_list)) {
1507         btrfs_get_block_group(bg);
1508         trace_btrfs_add_unused_block_group(bg);
1509         list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
1510     }
1511     spin_unlock(&fs_info->unused_bgs_lock);
1512 }
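
/*
 * Illustrative sketch (annotation, not part of the kernel source): a typical
 * caller marks a block group unused once its last used byte has been freed,
 * and fs_info->unused_bgs is later drained by btrfs_delete_unused_bgs()
 * above.  Roughly, assuming a @bg whose ->used has just dropped to zero:
 *
 *	bool now_unused;
 *
 *	spin_lock(&bg->lock);
 *	now_unused = (bg->used == 0 && !bg->ro);
 *	spin_unlock(&bg->lock);
 *	if (now_unused)
 *		btrfs_mark_bg_unused(bg);
 *
 * The list_empty() check inside btrfs_mark_bg_unused() keeps the block group
 * from being queued twice, and the extra reference taken here is dropped
 * later when the unused list is processed.
 */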
1513 
1514 /*
1515  * We want block groups with a low number of used bytes to be at the beginning
1516  * of the list, so they will get reclaimed first.
1517  */
1518 static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
1519                const struct list_head *b)
1520 {
1521     const struct btrfs_block_group *bg1, *bg2;
1522 
1523     bg1 = list_entry(a, struct btrfs_block_group, bg_list);
1524     bg2 = list_entry(b, struct btrfs_block_group, bg_list);
1525 
1526     return bg1->used > bg2->used;
1527 }
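
/*
 * Illustrative note (annotation, not part of the kernel source): with
 * list_sort(), a comparator returning a value greater than zero places @a
 * after @b, so the comparison above yields ascending order by ->used.  For
 * example, block groups with used = 700M, 50M and 300M end up ordered
 * 50M, 300M, 700M and are reclaimed in that order:
 *
 *	struct btrfs_block_group *bg;
 *
 *	list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
 *	list_for_each_entry(bg, &fs_info->reclaim_bgs, bg_list)
 *		pr_debug("reclaim candidate %llu used %llu\n",
 *			 bg->start, bg->used);
 */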
1528 
1529 static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
1530 {
1531     if (btrfs_is_zoned(fs_info))
1532         return btrfs_zoned_should_reclaim(fs_info);
1533     return true;
1534 }
1535 
1536 void btrfs_reclaim_bgs_work(struct work_struct *work)
1537 {
1538     struct btrfs_fs_info *fs_info =
1539         container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
1540     struct btrfs_block_group *bg;
1541     struct btrfs_space_info *space_info;
1542 
1543     if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1544         return;
1545 
1546     if (!btrfs_should_reclaim(fs_info))
1547         return;
1548 
1549     sb_start_write(fs_info->sb);
1550 
1551     if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
1552         sb_end_write(fs_info->sb);
1553         return;
1554     }
1555 
1556     /*
1557      * Long running balances can keep us blocked here for eternity, so
1558      * simply skip reclaim if we're unable to get the mutex.
1559      */
1560     if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
1561         btrfs_exclop_finish(fs_info);
1562         sb_end_write(fs_info->sb);
1563         return;
1564     }
1565 
1566     spin_lock(&fs_info->unused_bgs_lock);
1567     /*
1568      * Sort happens under lock because we can't simply splice it and sort.
1569      * The block groups might still be in use and reachable via bg_list,
1570      * and their presence in the reclaim_bgs list must be preserved.
1571      */
1572     list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
1573     while (!list_empty(&fs_info->reclaim_bgs)) {
1574         u64 zone_unusable;
1575         int ret = 0;
1576 
1577         bg = list_first_entry(&fs_info->reclaim_bgs,
1578                       struct btrfs_block_group,
1579                       bg_list);
1580         list_del_init(&bg->bg_list);
1581 
1582         space_info = bg->space_info;
1583         spin_unlock(&fs_info->unused_bgs_lock);
1584 
1585         /* Don't race with allocators so take the groups_sem */
1586         down_write(&space_info->groups_sem);
1587 
1588         spin_lock(&bg->lock);
1589         if (bg->reserved || bg->pinned || bg->ro) {
1590             /*
1591              * We want to bail if we made new allocations or have
1592              * outstanding allocations in this block group.  We do
1593              * the ro check in case balance is currently acting on
1594              * this block group.
1595              */
1596             spin_unlock(&bg->lock);
1597             up_write(&space_info->groups_sem);
1598             goto next;
1599         }
1600         spin_unlock(&bg->lock);
1601 
1602         /* Get out fast, in case we're unmounting the filesystem */
1603         if (btrfs_fs_closing(fs_info)) {
1604             up_write(&space_info->groups_sem);
1605             goto next;
1606         }
1607 
1608         /*
1609          * Cache the zone_unusable value before turning the block group
1610          * to read only. As soon as the block group is read only its
1611          * zone_unusable value gets moved to the block group's read-only
1612          * bytes and isn't available for calculations anymore.
1613          */
1614         zone_unusable = bg->zone_unusable;
1615         ret = inc_block_group_ro(bg, 0);
1616         up_write(&space_info->groups_sem);
1617         if (ret < 0)
1618             goto next;
1619 
1620         btrfs_info(fs_info,
1621             "reclaiming chunk %llu with %llu%% used %llu%% unusable",
1622                 bg->start, div_u64(bg->used * 100, bg->length),
1623                 div64_u64(zone_unusable * 100, bg->length));
1624         trace_btrfs_reclaim_block_group(bg);
1625         ret = btrfs_relocate_chunk(fs_info, bg->start);
1626         if (ret) {
1627             btrfs_dec_block_group_ro(bg);
1628             btrfs_err(fs_info, "error relocating chunk %llu",
1629                   bg->start);
1630         }
1631 
1632 next:
1633         btrfs_put_block_group(bg);
1634         spin_lock(&fs_info->unused_bgs_lock);
1635     }
1636     spin_unlock(&fs_info->unused_bgs_lock);
1637     mutex_unlock(&fs_info->reclaim_bgs_lock);
1638     btrfs_exclop_finish(fs_info);
1639     sb_end_write(fs_info->sb);
1640 }
1641 
1642 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
1643 {
1644     spin_lock(&fs_info->unused_bgs_lock);
1645     if (!list_empty(&fs_info->reclaim_bgs))
1646         queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
1647     spin_unlock(&fs_info->unused_bgs_lock);
1648 }
1649 
1650 void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
1651 {
1652     struct btrfs_fs_info *fs_info = bg->fs_info;
1653 
1654     spin_lock(&fs_info->unused_bgs_lock);
1655     if (list_empty(&bg->bg_list)) {
1656         btrfs_get_block_group(bg);
1657         trace_btrfs_add_reclaim_block_group(bg);
1658         list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
1659     }
1660     spin_unlock(&fs_info->unused_bgs_lock);
1661 }
1662 
1663 static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
1664                struct btrfs_path *path)
1665 {
1666     struct extent_map_tree *em_tree;
1667     struct extent_map *em;
1668     struct btrfs_block_group_item bg;
1669     struct extent_buffer *leaf;
1670     int slot;
1671     u64 flags;
1672     int ret = 0;
1673 
1674     slot = path->slots[0];
1675     leaf = path->nodes[0];
1676 
1677     em_tree = &fs_info->mapping_tree;
1678     read_lock(&em_tree->lock);
1679     em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
1680     read_unlock(&em_tree->lock);
1681     if (!em) {
1682         btrfs_err(fs_info,
1683               "logical %llu len %llu found bg but no related chunk",
1684               key->objectid, key->offset);
1685         return -ENOENT;
1686     }
1687 
1688     if (em->start != key->objectid || em->len != key->offset) {
1689         btrfs_err(fs_info,
1690             "block group %llu len %llu mismatch with chunk %llu len %llu",
1691             key->objectid, key->offset, em->start, em->len);
1692         ret = -EUCLEAN;
1693         goto out_free_em;
1694     }
1695 
1696     read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
1697                sizeof(bg));
1698     flags = btrfs_stack_block_group_flags(&bg) &
1699         BTRFS_BLOCK_GROUP_TYPE_MASK;
1700 
1701     if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1702         btrfs_err(fs_info,
1703 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
1704               key->objectid, key->offset, flags,
1705               (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
1706         ret = -EUCLEAN;
1707     }
1708 
1709 out_free_em:
1710     free_extent_map(em);
1711     return ret;
1712 }
1713 
1714 static int find_first_block_group(struct btrfs_fs_info *fs_info,
1715                   struct btrfs_path *path,
1716                   struct btrfs_key *key)
1717 {
1718     struct btrfs_root *root = btrfs_block_group_root(fs_info);
1719     int ret;
1720     struct btrfs_key found_key;
1721 
1722     btrfs_for_each_slot(root, key, &found_key, path, ret) {
1723         if (found_key.objectid >= key->objectid &&
1724             found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
1725             return read_bg_from_eb(fs_info, &found_key, path);
1726         }
1727     }
1728     return ret;
1729 }
1730 
1731 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1732 {
1733     u64 extra_flags = chunk_to_extended(flags) &
1734                 BTRFS_EXTENDED_PROFILE_MASK;
1735 
1736     write_seqlock(&fs_info->profiles_lock);
1737     if (flags & BTRFS_BLOCK_GROUP_DATA)
1738         fs_info->avail_data_alloc_bits |= extra_flags;
1739     if (flags & BTRFS_BLOCK_GROUP_METADATA)
1740         fs_info->avail_metadata_alloc_bits |= extra_flags;
1741     if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1742         fs_info->avail_system_alloc_bits |= extra_flags;
1743     write_sequnlock(&fs_info->profiles_lock);
1744 }
1745 
1746 /**
1747  * Map a physical disk address to a list of logical addresses
1748  *
1749  * @fs_info:       the filesystem
1750  * @chunk_start:   logical address of block group
1751  * @bdev:          physical device to resolve, can be NULL to indicate any device
1752  * @physical:      physical address to map to logical addresses
1753  * @logical:       return array of logical addresses which map to @physical
1754  * @naddrs:        length of @logical
1755  * @stripe_len:    size of IO stripe for the given block group
1756  *
1757  * Maps a particular @physical disk address to a list of @logical addresses.
1758  * Used primarily to exclude those portions of a block group that contain super
1759  * block copies.
1760  */
1761 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
1762              struct block_device *bdev, u64 physical, u64 **logical,
1763              int *naddrs, int *stripe_len)
1764 {
1765     struct extent_map *em;
1766     struct map_lookup *map;
1767     u64 *buf;
1768     u64 bytenr;
1769     u64 data_stripe_length;
1770     u64 io_stripe_size;
1771     int i, nr = 0;
1772     int ret = 0;
1773 
1774     em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
1775     if (IS_ERR(em))
1776         return -EIO;
1777 
1778     map = em->map_lookup;
1779     data_stripe_length = em->orig_block_len;
1780     io_stripe_size = map->stripe_len;
1781     chunk_start = em->start;
1782 
1783     /* For RAID5/6 adjust to a full IO stripe length */
1784     if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
1785         io_stripe_size = map->stripe_len * nr_data_stripes(map);
1786 
1787     buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
1788     if (!buf) {
1789         ret = -ENOMEM;
1790         goto out;
1791     }
1792 
1793     for (i = 0; i < map->num_stripes; i++) {
1794         bool already_inserted = false;
1795         u64 stripe_nr;
1796         u64 offset;
1797         int j;
1798 
1799         if (!in_range(physical, map->stripes[i].physical,
1800                   data_stripe_length))
1801             continue;
1802 
1803         if (bdev && map->stripes[i].dev->bdev != bdev)
1804             continue;
1805 
1806         stripe_nr = physical - map->stripes[i].physical;
1807         stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
1808 
1809         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
1810                  BTRFS_BLOCK_GROUP_RAID10)) {
1811             stripe_nr = stripe_nr * map->num_stripes + i;
1812             stripe_nr = div_u64(stripe_nr, map->sub_stripes);
1813         }
1814         /*
1815          * The remaining case would be for RAID56, multiply by
1816          * nr_data_stripes().  Alternatively, just use io_stripe_size below
1817          * instead of map->stripe_len.
1818          */
1819 
1820         bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
1821 
1822         /* Ensure we don't add duplicate addresses */
1823         for (j = 0; j < nr; j++) {
1824             if (buf[j] == bytenr) {
1825                 already_inserted = true;
1826                 break;
1827             }
1828         }
1829 
1830         if (!already_inserted)
1831             buf[nr++] = bytenr;
1832     }
1833 
1834     *logical = buf;
1835     *naddrs = nr;
1836     *stripe_len = io_stripe_size;
1837 out:
1838     free_extent_map(em);
1839     return ret;
1840 }
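
/*
 * Illustrative sketch (annotation, not part of the kernel source), mirroring
 * the real caller below: resolve which logical addresses of a block group map
 * to a super block mirror and then release the returned array.
 * handle_logical_addr() is a hypothetical stand-in for whatever the caller
 * does with each address:
 *
 *	u64 *logical;
 *	int nr, stripe_len;
 *	int ret;
 *
 *	ret = btrfs_rmap_block(fs_info, cache->start, NULL,
 *			       btrfs_sb_offset(0), &logical, &nr, &stripe_len);
 *	if (!ret) {
 *		while (nr--)
 *			handle_logical_addr(cache, logical[nr], stripe_len);
 *		kfree(logical);
 *	}
 *
 * See exclude_super_stripes() below for the real usage.
 */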
1841 
1842 static int exclude_super_stripes(struct btrfs_block_group *cache)
1843 {
1844     struct btrfs_fs_info *fs_info = cache->fs_info;
1845     const bool zoned = btrfs_is_zoned(fs_info);
1846     u64 bytenr;
1847     u64 *logical;
1848     int stripe_len;
1849     int i, nr, ret;
1850 
1851     if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
1852         stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
1853         cache->bytes_super += stripe_len;
1854         ret = btrfs_add_excluded_extent(fs_info, cache->start,
1855                         stripe_len);
1856         if (ret)
1857             return ret;
1858     }
1859 
1860     for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1861         bytenr = btrfs_sb_offset(i);
1862         ret = btrfs_rmap_block(fs_info, cache->start, NULL,
1863                        bytenr, &logical, &nr, &stripe_len);
1864         if (ret)
1865             return ret;
1866 
1867         /* Shouldn't have super stripes in sequential zones */
1868         if (zoned && nr) {
1869             btrfs_err(fs_info,
1870             "zoned: block group %llu must not contain super block",
1871                   cache->start);
1872             return -EUCLEAN;
1873         }
1874 
1875         while (nr--) {
1876             u64 len = min_t(u64, stripe_len,
1877                 cache->start + cache->length - logical[nr]);
1878 
1879             cache->bytes_super += len;
1880             ret = btrfs_add_excluded_extent(fs_info, logical[nr],
1881                             len);
1882             if (ret) {
1883                 kfree(logical);
1884                 return ret;
1885             }
1886         }
1887 
1888         kfree(logical);
1889     }
1890     return 0;
1891 }
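
/*
 * Worked example (annotation, not part of the kernel source): with the
 * default layout the primary super block sits at 64KiB and the mirror copies
 * at 64MiB and 256GiB (see btrfs_sb_offset()).  A block group whose chunk
 * stripes cover one of those physical offsets gets the matching logical
 * range excluded from its free space and accounted in ->bytes_super, e.g. a
 * block group with a stripe covering physical 64MiB excludes up to one
 * stripe_len worth of space around the logical address returned by
 * btrfs_rmap_block().
 */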
1892 
1893 static void link_block_group(struct btrfs_block_group *cache)
1894 {
1895     struct btrfs_space_info *space_info = cache->space_info;
1896     int index = btrfs_bg_flags_to_raid_index(cache->flags);
1897 
1898     down_write(&space_info->groups_sem);
1899     list_add_tail(&cache->list, &space_info->block_groups[index]);
1900     up_write(&space_info->groups_sem);
1901 }
1902 
1903 static struct btrfs_block_group *btrfs_create_block_group_cache(
1904         struct btrfs_fs_info *fs_info, u64 start)
1905 {
1906     struct btrfs_block_group *cache;
1907 
1908     cache = kzalloc(sizeof(*cache), GFP_NOFS);
1909     if (!cache)
1910         return NULL;
1911 
1912     cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
1913                     GFP_NOFS);
1914     if (!cache->free_space_ctl) {
1915         kfree(cache);
1916         return NULL;
1917     }
1918 
1919     cache->start = start;
1920 
1921     cache->fs_info = fs_info;
1922     cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
1923 
1924     cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
1925 
1926     refcount_set(&cache->refs, 1);
1927     spin_lock_init(&cache->lock);
1928     init_rwsem(&cache->data_rwsem);
1929     INIT_LIST_HEAD(&cache->list);
1930     INIT_LIST_HEAD(&cache->cluster_list);
1931     INIT_LIST_HEAD(&cache->bg_list);
1932     INIT_LIST_HEAD(&cache->ro_list);
1933     INIT_LIST_HEAD(&cache->discard_list);
1934     INIT_LIST_HEAD(&cache->dirty_list);
1935     INIT_LIST_HEAD(&cache->io_list);
1936     INIT_LIST_HEAD(&cache->active_bg_list);
1937     btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
1938     atomic_set(&cache->frozen, 0);
1939     mutex_init(&cache->free_space_lock);
1940     btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
1941 
1942     return cache;
1943 }
1944 
1945 /*
1946  * Iterate all chunks and verify that each of them has the corresponding block
1947  * group
1948  */
1949 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
1950 {
1951     struct extent_map_tree *map_tree = &fs_info->mapping_tree;
1952     struct extent_map *em;
1953     struct btrfs_block_group *bg;
1954     u64 start = 0;
1955     int ret = 0;
1956 
1957     while (1) {
1958         read_lock(&map_tree->lock);
1959         /*
1960          * lookup_extent_mapping will return the first extent map
1961          * intersecting the range, so setting @len to 1 is enough to
1962          * get the first chunk.
1963          */
1964         em = lookup_extent_mapping(map_tree, start, 1);
1965         read_unlock(&map_tree->lock);
1966         if (!em)
1967             break;
1968 
1969         bg = btrfs_lookup_block_group(fs_info, em->start);
1970         if (!bg) {
1971             btrfs_err(fs_info,
1972     "chunk start=%llu len=%llu doesn't have corresponding block group",
1973                      em->start, em->len);
1974             ret = -EUCLEAN;
1975             free_extent_map(em);
1976             break;
1977         }
1978         if (bg->start != em->start || bg->length != em->len ||
1979             (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
1980             (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1981             btrfs_err(fs_info,
1982 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
1983                 em->start, em->len,
1984                 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
1985                 bg->start, bg->length,
1986                 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
1987             ret = -EUCLEAN;
1988             free_extent_map(em);
1989             btrfs_put_block_group(bg);
1990             break;
1991         }
1992         start = em->start + em->len;
1993         free_extent_map(em);
1994         btrfs_put_block_group(bg);
1995     }
1996     return ret;
1997 }
1998 
1999 static int read_one_block_group(struct btrfs_fs_info *info,
2000                 struct btrfs_block_group_item *bgi,
2001                 const struct btrfs_key *key,
2002                 int need_clear)
2003 {
2004     struct btrfs_block_group *cache;
2005     struct btrfs_space_info *space_info;
2006     const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
2007     int ret;
2008 
2009     ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
2010 
2011     cache = btrfs_create_block_group_cache(info, key->objectid);
2012     if (!cache)
2013         return -ENOMEM;
2014 
2015     cache->length = key->offset;
2016     cache->used = btrfs_stack_block_group_used(bgi);
2017     cache->flags = btrfs_stack_block_group_flags(bgi);
2018     cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
2019 
2020     set_free_space_tree_thresholds(cache);
2021 
2022     if (need_clear) {
2023         /*
2024          * When we mount with old space cache, we need to
2025          * set BTRFS_DC_CLEAR and set dirty flag.
2026          *
2027          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
2028          *    truncate the old free space cache inode and
2029          *    set up a new one.
2030          * b) Setting 'dirty flag' makes sure that we flush
2031          *    the new space cache info onto disk.
2032          */
2033         if (btrfs_test_opt(info, SPACE_CACHE))
2034             cache->disk_cache_state = BTRFS_DC_CLEAR;
2035     }
2036     if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
2037         (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
2038             btrfs_err(info,
2039 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
2040                   cache->start);
2041             ret = -EINVAL;
2042             goto error;
2043     }
2044 
2045     ret = btrfs_load_block_group_zone_info(cache, false);
2046     if (ret) {
2047         btrfs_err(info, "zoned: failed to load zone info of bg %llu",
2048               cache->start);
2049         goto error;
2050     }
2051 
2052     /*
2053      * We need to exclude the super stripes now so that the space info has
2054      * super bytes accounted for, otherwise we'll think we have more space
2055      * than we actually do.
2056      */
2057     ret = exclude_super_stripes(cache);
2058     if (ret) {
2059         /* We may have excluded something, so call this just in case. */
2060         btrfs_free_excluded_extents(cache);
2061         goto error;
2062     }
2063 
2064     /*
2065      * For a zoned filesystem, space after the allocation offset is the only
2066      * free space for a block group. So, we don't need any caching work.
2067      * btrfs_calc_zone_unusable() will set the amount of free space and
2068      * zone_unusable space.
2069      *
2070      * For a regular filesystem, check for two cases: either we are full, and
2071      * therefore don't need to bother with the caching work since we won't
2072      * find any space, or we are empty, and we can just add all the space
2073      * in and be done with it.  This saves us _a_lot_ of time, particularly
2074      * in the full case.
2075      */
2076     if (btrfs_is_zoned(info)) {
2077         btrfs_calc_zone_unusable(cache);
2078         /* Should not have any excluded extents. Just in case, though. */
2079         btrfs_free_excluded_extents(cache);
2080     } else if (cache->length == cache->used) {
2081         cache->last_byte_to_unpin = (u64)-1;
2082         cache->cached = BTRFS_CACHE_FINISHED;
2083         btrfs_free_excluded_extents(cache);
2084     } else if (cache->used == 0) {
2085         cache->last_byte_to_unpin = (u64)-1;
2086         cache->cached = BTRFS_CACHE_FINISHED;
2087         add_new_free_space(cache, cache->start,
2088                    cache->start + cache->length);
2089         btrfs_free_excluded_extents(cache);
2090     }
2091 
2092     ret = btrfs_add_block_group_cache(info, cache);
2093     if (ret) {
2094         btrfs_remove_free_space_cache(cache);
2095         goto error;
2096     }
2097     trace_btrfs_add_block_group(info, cache, 0);
2098     btrfs_update_space_info(info, cache->flags, cache->length,
2099                 cache->used, cache->bytes_super,
2100                 cache->zone_unusable, cache->zone_is_active,
2101                 &space_info);
2102 
2103     cache->space_info = space_info;
2104 
2105     link_block_group(cache);
2106 
2107     set_avail_alloc_bits(info, cache->flags);
2108     if (btrfs_chunk_writeable(info, cache->start)) {
2109         if (cache->used == 0) {
2110             ASSERT(list_empty(&cache->bg_list));
2111             if (btrfs_test_opt(info, DISCARD_ASYNC))
2112                 btrfs_discard_queue_work(&info->discard_ctl, cache);
2113             else
2114                 btrfs_mark_bg_unused(cache);
2115         }
2116     } else {
2117         inc_block_group_ro(cache, 1);
2118     }
2119 
2120     return 0;
2121 error:
2122     btrfs_put_block_group(cache);
2123     return ret;
2124 }
2125 
2126 static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
2127 {
2128     struct extent_map_tree *em_tree = &fs_info->mapping_tree;
2129     struct btrfs_space_info *space_info;
2130     struct rb_node *node;
2131     int ret = 0;
2132 
2133     for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
2134         struct extent_map *em;
2135         struct map_lookup *map;
2136         struct btrfs_block_group *bg;
2137 
2138         em = rb_entry(node, struct extent_map, rb_node);
2139         map = em->map_lookup;
2140         bg = btrfs_create_block_group_cache(fs_info, em->start);
2141         if (!bg) {
2142             ret = -ENOMEM;
2143             break;
2144         }
2145 
2146         /* Fill dummy cache as FULL */
2147         bg->length = em->len;
2148         bg->flags = map->type;
2149         bg->last_byte_to_unpin = (u64)-1;
2150         bg->cached = BTRFS_CACHE_FINISHED;
2151         bg->used = em->len;
2152         bg->flags = map->type;
2153         ret = btrfs_add_block_group_cache(fs_info, bg);
2154         /*
2155          * We may have some valid block group cache added already, in
2156          * that case we skip to the next one.
2157          */
2158         if (ret == -EEXIST) {
2159             ret = 0;
2160             btrfs_put_block_group(bg);
2161             continue;
2162         }
2163 
2164         if (ret) {
2165             btrfs_remove_free_space_cache(bg);
2166             btrfs_put_block_group(bg);
2167             break;
2168         }
2169 
2170         btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
2171                     0, 0, false, &space_info);
2172         bg->space_info = space_info;
2173         link_block_group(bg);
2174 
2175         set_avail_alloc_bits(fs_info, bg->flags);
2176     }
2177     if (!ret)
2178         btrfs_init_global_block_rsv(fs_info);
2179     return ret;
2180 }
2181 
2182 int btrfs_read_block_groups(struct btrfs_fs_info *info)
2183 {
2184     struct btrfs_root *root = btrfs_block_group_root(info);
2185     struct btrfs_path *path;
2186     int ret;
2187     struct btrfs_block_group *cache;
2188     struct btrfs_space_info *space_info;
2189     struct btrfs_key key;
2190     int need_clear = 0;
2191     u64 cache_gen;
2192 
2193     if (!root)
2194         return fill_dummy_bgs(info);
2195 
2196     key.objectid = 0;
2197     key.offset = 0;
2198     key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2199     path = btrfs_alloc_path();
2200     if (!path)
2201         return -ENOMEM;
2202 
2203     cache_gen = btrfs_super_cache_generation(info->super_copy);
2204     if (btrfs_test_opt(info, SPACE_CACHE) &&
2205         btrfs_super_generation(info->super_copy) != cache_gen)
2206         need_clear = 1;
2207     if (btrfs_test_opt(info, CLEAR_CACHE))
2208         need_clear = 1;
2209 
2210     while (1) {
2211         struct btrfs_block_group_item bgi;
2212         struct extent_buffer *leaf;
2213         int slot;
2214 
2215         ret = find_first_block_group(info, path, &key);
2216         if (ret > 0)
2217             break;
2218         if (ret != 0)
2219             goto error;
2220 
2221         leaf = path->nodes[0];
2222         slot = path->slots[0];
2223 
2224         read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
2225                    sizeof(bgi));
2226 
2227         btrfs_item_key_to_cpu(leaf, &key, slot);
2228         btrfs_release_path(path);
2229         ret = read_one_block_group(info, &bgi, &key, need_clear);
2230         if (ret < 0)
2231             goto error;
2232         key.objectid += key.offset;
2233         key.offset = 0;
2234     }
2235     btrfs_release_path(path);
2236 
2237     list_for_each_entry(space_info, &info->space_info, list) {
2238         int i;
2239 
2240         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2241             if (list_empty(&space_info->block_groups[i]))
2242                 continue;
2243             cache = list_first_entry(&space_info->block_groups[i],
2244                          struct btrfs_block_group,
2245                          list);
2246             btrfs_sysfs_add_block_group_type(cache);
2247         }
2248 
2249         if (!(btrfs_get_alloc_profile(info, space_info->flags) &
2250               (BTRFS_BLOCK_GROUP_RAID10 |
2251                BTRFS_BLOCK_GROUP_RAID1_MASK |
2252                BTRFS_BLOCK_GROUP_RAID56_MASK |
2253                BTRFS_BLOCK_GROUP_DUP)))
2254             continue;
2255         /*
2256          * Avoid allocating from un-mirrored block group if there are
2257          * mirrored block groups.
2258          */
2259         list_for_each_entry(cache,
2260                 &space_info->block_groups[BTRFS_RAID_RAID0],
2261                 list)
2262             inc_block_group_ro(cache, 1);
2263         list_for_each_entry(cache,
2264                 &space_info->block_groups[BTRFS_RAID_SINGLE],
2265                 list)
2266             inc_block_group_ro(cache, 1);
2267     }
2268 
2269     btrfs_init_global_block_rsv(info);
2270     ret = check_chunk_block_group_mappings(info);
2271 error:
2272     btrfs_free_path(path);
2273     /*
2274      * If we've hit an error while reading the extent tree and have the
2275      * rescue=ibadroots mount option set, try to fill the tree using
2276      * dummy block groups so that the user can continue to mount and
2277      * grab their data.
2278      */
2279     if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
2280         ret = fill_dummy_bgs(info);
2281     return ret;
2282 }
2283 
2284 /*
2285  * This function, insert_block_group_item(), belongs to phase 2 of chunk
2286  * allocation.
2287  *
2288  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2289  * phases.
2290  */
2291 static int insert_block_group_item(struct btrfs_trans_handle *trans,
2292                    struct btrfs_block_group *block_group)
2293 {
2294     struct btrfs_fs_info *fs_info = trans->fs_info;
2295     struct btrfs_block_group_item bgi;
2296     struct btrfs_root *root = btrfs_block_group_root(fs_info);
2297     struct btrfs_key key;
2298 
2299     spin_lock(&block_group->lock);
2300     btrfs_set_stack_block_group_used(&bgi, block_group->used);
2301     btrfs_set_stack_block_group_chunk_objectid(&bgi,
2302                            block_group->global_root_id);
2303     btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
2304     key.objectid = block_group->start;
2305     key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2306     key.offset = block_group->length;
2307     spin_unlock(&block_group->lock);
2308 
2309     return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
2310 }
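
/*
 * Illustrative note (annotation, not part of the kernel source): the block
 * group item is keyed by (start, BLOCK_GROUP_ITEM, length).  For example, a
 * block group starting at logical 1GiB with a length of 256MiB is inserted
 * as:
 *
 *	key.objectid = SZ_1G;                      (start  == 0x40000000)
 *	key.type     = BTRFS_BLOCK_GROUP_ITEM_KEY;
 *	key.offset   = SZ_256M;                    (length == 0x10000000)
 *
 * which is also why btrfs_read_block_groups() above advances its search key
 * with "key.objectid += key.offset".
 */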
2311 
2312 static int insert_dev_extent(struct btrfs_trans_handle *trans,
2313                 struct btrfs_device *device, u64 chunk_offset,
2314                 u64 start, u64 num_bytes)
2315 {
2316     struct btrfs_fs_info *fs_info = device->fs_info;
2317     struct btrfs_root *root = fs_info->dev_root;
2318     struct btrfs_path *path;
2319     struct btrfs_dev_extent *extent;
2320     struct extent_buffer *leaf;
2321     struct btrfs_key key;
2322     int ret;
2323 
2324     WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
2325     WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
2326     path = btrfs_alloc_path();
2327     if (!path)
2328         return -ENOMEM;
2329 
2330     key.objectid = device->devid;
2331     key.type = BTRFS_DEV_EXTENT_KEY;
2332     key.offset = start;
2333     ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
2334     if (ret)
2335         goto out;
2336 
2337     leaf = path->nodes[0];
2338     extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
2339     btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
2340     btrfs_set_dev_extent_chunk_objectid(leaf, extent,
2341                         BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2342     btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
2343 
2344     btrfs_set_dev_extent_length(leaf, extent, num_bytes);
2345     btrfs_mark_buffer_dirty(leaf);
2346 out:
2347     btrfs_free_path(path);
2348     return ret;
2349 }
2350 
2351 /*
2352  * This function belongs to phase 2.
2353  *
2354  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2355  * phases.
2356  */
2357 static int insert_dev_extents(struct btrfs_trans_handle *trans,
2358                    u64 chunk_offset, u64 chunk_size)
2359 {
2360     struct btrfs_fs_info *fs_info = trans->fs_info;
2361     struct btrfs_device *device;
2362     struct extent_map *em;
2363     struct map_lookup *map;
2364     u64 dev_offset;
2365     u64 stripe_size;
2366     int i;
2367     int ret = 0;
2368 
2369     em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
2370     if (IS_ERR(em))
2371         return PTR_ERR(em);
2372 
2373     map = em->map_lookup;
2374     stripe_size = em->orig_block_len;
2375 
2376     /*
2377      * Take the device list mutex to prevent races with the final phase of
2378      * a device replace operation that replaces the device object associated
2379      * with the map's stripes, because the device object's id can change
2380      * at any time during that final phase of the device replace operation
2381      * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
2382      * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
2383      * resulting in persisting a device extent item with such ID.
2384      */
2385     mutex_lock(&fs_info->fs_devices->device_list_mutex);
2386     for (i = 0; i < map->num_stripes; i++) {
2387         device = map->stripes[i].dev;
2388         dev_offset = map->stripes[i].physical;
2389 
2390         ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
2391                        stripe_size);
2392         if (ret)
2393             break;
2394     }
2395     mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2396 
2397     free_extent_map(em);
2398     return ret;
2399 }
2400 
2401 /*
2402  * This function, btrfs_create_pending_block_groups(), belongs to phase 2 of
2403  * chunk allocation.
2404  *
2405  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2406  * phases.
2407  */
2408 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
2409 {
2410     struct btrfs_fs_info *fs_info = trans->fs_info;
2411     struct btrfs_block_group *block_group;
2412     int ret = 0;
2413 
2414     while (!list_empty(&trans->new_bgs)) {
2415         int index;
2416 
2417         block_group = list_first_entry(&trans->new_bgs,
2418                            struct btrfs_block_group,
2419                            bg_list);
2420         if (ret)
2421             goto next;
2422 
2423         index = btrfs_bg_flags_to_raid_index(block_group->flags);
2424 
2425         ret = insert_block_group_item(trans, block_group);
2426         if (ret)
2427             btrfs_abort_transaction(trans, ret);
2428         if (!block_group->chunk_item_inserted) {
2429             mutex_lock(&fs_info->chunk_mutex);
2430             ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
2431             mutex_unlock(&fs_info->chunk_mutex);
2432             if (ret)
2433                 btrfs_abort_transaction(trans, ret);
2434         }
2435         ret = insert_dev_extents(trans, block_group->start,
2436                      block_group->length);
2437         if (ret)
2438             btrfs_abort_transaction(trans, ret);
2439         add_block_group_free_space(trans, block_group);
2440 
2441         /*
2442          * If we restriped during balance, we may have added a new raid
2443          * type, so now add the sysfs entries when it is safe to do so.
2444          * We don't have to worry about locking here as it's handled in
2445          * btrfs_sysfs_add_block_group_type.
2446          */
2447         if (block_group->space_info->block_group_kobjs[index] == NULL)
2448             btrfs_sysfs_add_block_group_type(block_group);
2449 
2450         /* Already aborted the transaction if it failed. */
2451 next:
2452         btrfs_delayed_refs_rsv_release(fs_info, 1);
2453         list_del_init(&block_group->bg_list);
2454     }
2455     btrfs_trans_release_chunk_metadata(trans);
2456 }
2457 
2458 /*
2459  * For extent tree v2 we use the block_group_item->chunk_offset to point at our
2460  * global root id.  For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
2461  */
2462 static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
2463 {
2464     u64 div = SZ_1G;
2465     u64 index;
2466 
2467     if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
2468         return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2469 
2470     /* For filesystems of 10GiB or less, index based on 128MiB instead. */
2471     if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
2472         div = SZ_128M;
2473 
2474     offset = div64_u64(offset, div);
2475     div64_u64_rem(offset, fs_info->nr_global_roots, &index);
2476     return index;
2477 }
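
/*
 * Worked example (annotation, not part of the kernel source), assuming an
 * extent-tree-v2 filesystem with nr_global_roots == 4:
 *
 *	- total_bytes > 10GiB:  div = 1GiB, so a block group at offset 5GiB
 *	  maps to (5GiB / 1GiB) % 4 = 5 % 4 = 1.
 *	- total_bytes <= 10GiB: div = 128MiB, so the same 5GiB offset maps
 *	  to (5GiB / 128MiB) % 4 = 40 % 4 = 0.
 *
 * Without EXTENT_TREE_V2 the value is always
 * BTRFS_FIRST_CHUNK_TREE_OBJECTID.
 */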
2478 
2479 struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
2480                          u64 bytes_used, u64 type,
2481                          u64 chunk_offset, u64 size)
2482 {
2483     struct btrfs_fs_info *fs_info = trans->fs_info;
2484     struct btrfs_block_group *cache;
2485     int ret;
2486 
2487     btrfs_set_log_full_commit(trans);
2488 
2489     cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
2490     if (!cache)
2491         return ERR_PTR(-ENOMEM);
2492 
2493     cache->length = size;
2494     set_free_space_tree_thresholds(cache);
2495     cache->used = bytes_used;
2496     cache->flags = type;
2497     cache->last_byte_to_unpin = (u64)-1;
2498     cache->cached = BTRFS_CACHE_FINISHED;
2499     cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
2500 
2501     if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
2502         cache->needs_free_space = 1;
2503 
2504     ret = btrfs_load_block_group_zone_info(cache, true);
2505     if (ret) {
2506         btrfs_put_block_group(cache);
2507         return ERR_PTR(ret);
2508     }
2509 
2510     ret = exclude_super_stripes(cache);
2511     if (ret) {
2512         /* We may have excluded something, so call this just in case */
2513         btrfs_free_excluded_extents(cache);
2514         btrfs_put_block_group(cache);
2515         return ERR_PTR(ret);
2516     }
2517 
2518     add_new_free_space(cache, chunk_offset, chunk_offset + size);
2519 
2520     btrfs_free_excluded_extents(cache);
2521 
2522 #ifdef CONFIG_BTRFS_DEBUG
2523     if (btrfs_should_fragment_free_space(cache)) {
2524         u64 new_bytes_used = size - bytes_used;
2525 
2526         bytes_used += new_bytes_used >> 1;
2527         fragment_free_space(cache);
2528     }
2529 #endif
2530     /*
2531      * Ensure the corresponding space_info object is created and
2532      * assigned to our block group. We want our bg to be added to the rbtree
2533      * with its ->space_info set.
2534      */
2535     cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
2536     ASSERT(cache->space_info);
2537 
2538     ret = btrfs_add_block_group_cache(fs_info, cache);
2539     if (ret) {
2540         btrfs_remove_free_space_cache(cache);
2541         btrfs_put_block_group(cache);
2542         return ERR_PTR(ret);
2543     }
2544 
2545     /*
2546      * Now that our block group has its ->space_info set and is inserted in
2547      * the rbtree, update the space info's counters.
2548      */
2549     trace_btrfs_add_block_group(fs_info, cache, 1);
2550     btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
2551                 cache->bytes_super, cache->zone_unusable,
2552                 cache->zone_is_active, &cache->space_info);
2553     btrfs_update_global_block_rsv(fs_info);
2554 
2555     link_block_group(cache);
2556 
2557     list_add_tail(&cache->bg_list, &trans->new_bgs);
2558     trans->delayed_ref_updates++;
2559     btrfs_update_delayed_refs_rsv(trans);
2560 
2561     set_avail_alloc_bits(fs_info, type);
2562     return cache;
2563 }
2564 
2565 /*
2566  * Mark one block group RO, can be called several times for the same block
2567  * group.
2568  *
2569  * @cache:      the destination block group
2570  * @do_chunk_alloc: whether we need to do chunk pre-allocation; this is to
2571  *          ensure we still have some free space after marking this
2572  *          block group RO.
2573  */
2574 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
2575                  bool do_chunk_alloc)
2576 {
2577     struct btrfs_fs_info *fs_info = cache->fs_info;
2578     struct btrfs_trans_handle *trans;
2579     struct btrfs_root *root = btrfs_block_group_root(fs_info);
2580     u64 alloc_flags;
2581     int ret;
2582     bool dirty_bg_running;
2583 
2584     /*
2585      * This can only happen when we are doing a read-only scrub on a
2586      * read-only mount.
2587      * In that case we should not start a new transaction on a read-only
2588      * fs, so we skip all chunk allocations here.
2589      */
2590     if (sb_rdonly(fs_info->sb)) {
2591         mutex_lock(&fs_info->ro_block_group_mutex);
2592         ret = inc_block_group_ro(cache, 0);
2593         mutex_unlock(&fs_info->ro_block_group_mutex);
2594         return ret;
2595     }
2596 
2597     do {
2598         trans = btrfs_join_transaction(root);
2599         if (IS_ERR(trans))
2600             return PTR_ERR(trans);
2601 
2602         dirty_bg_running = false;
2603 
2604         /*
2605          * We're not allowed to set block groups readonly after the dirty
2606          * block group cache has started writing.  If it already started,
2607          * back off and let this transaction commit.
2608          */
2609         mutex_lock(&fs_info->ro_block_group_mutex);
2610         if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
2611             u64 transid = trans->transid;
2612 
2613             mutex_unlock(&fs_info->ro_block_group_mutex);
2614             btrfs_end_transaction(trans);
2615 
2616             ret = btrfs_wait_for_commit(fs_info, transid);
2617             if (ret)
2618                 return ret;
2619             dirty_bg_running = true;
2620         }
2621     } while (dirty_bg_running);
2622 
2623     if (do_chunk_alloc) {
2624         /*
2625          * If we are changing raid levels, try to allocate a
2626          * corresponding block group with the new raid level.
2627          */
2628         alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
2629         if (alloc_flags != cache->flags) {
2630             ret = btrfs_chunk_alloc(trans, alloc_flags,
2631                         CHUNK_ALLOC_FORCE);
2632             /*
2633              * ENOSPC is allowed here, we may have enough space
2634              * already allocated at the new raid level to carry on
2635              */
2636             if (ret == -ENOSPC)
2637                 ret = 0;
2638             if (ret < 0)
2639                 goto out;
2640         }
2641     }
2642 
2643     ret = inc_block_group_ro(cache, 0);
2644     if (!do_chunk_alloc || ret == -ETXTBSY)
2645         goto unlock_out;
2646     if (!ret)
2647         goto out;
2648     alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
2649     ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
2650     if (ret < 0)
2651         goto out;
2652     /*
2653      * We have allocated a new chunk. We also need to activate that chunk to
2654      * grant metadata tickets for zoned filesystem.
2655      */
2656     ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
2657     if (ret < 0)
2658         goto out;
2659 
2660     ret = inc_block_group_ro(cache, 0);
2661     if (ret == -ETXTBSY)
2662         goto unlock_out;
2663 out:
2664     if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
2665         alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
2666         mutex_lock(&fs_info->chunk_mutex);
2667         check_system_chunk(trans, alloc_flags);
2668         mutex_unlock(&fs_info->chunk_mutex);
2669     }
2670 unlock_out:
2671     mutex_unlock(&fs_info->ro_block_group_mutex);
2672 
2673     btrfs_end_transaction(trans);
2674     return ret;
2675 }
2676 
2677 void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
2678 {
2679     struct btrfs_space_info *sinfo = cache->space_info;
2680     u64 num_bytes;
2681 
2682     BUG_ON(!cache->ro);
2683 
2684     spin_lock(&sinfo->lock);
2685     spin_lock(&cache->lock);
2686     if (!--cache->ro) {
2687         if (btrfs_is_zoned(cache->fs_info)) {
2688             /* Migrate zone_unusable bytes back */
2689             cache->zone_unusable =
2690                 (cache->alloc_offset - cache->used) +
2691                 (cache->length - cache->zone_capacity);
2692             sinfo->bytes_zone_unusable += cache->zone_unusable;
2693             sinfo->bytes_readonly -= cache->zone_unusable;
2694         }
2695         num_bytes = cache->length - cache->reserved -
2696                 cache->pinned - cache->bytes_super -
2697                 cache->zone_unusable - cache->used;
2698         sinfo->bytes_readonly -= num_bytes;
2699         list_del_init(&cache->ro_list);
2700     }
2701     spin_unlock(&cache->lock);
2702     spin_unlock(&sinfo->lock);
2703 }
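
/*
 * Illustrative pairing sketch (annotation, not part of the kernel source):
 * callers such as relocation mark a block group read-only for the duration
 * of their work and drop the RO count again when done.
 * do_work_on_block_group() is a hypothetical stand-in for the caller's real
 * work (e.g. relocating extents), and @cache/@ret are assumed to exist:
 *
 *	ret = btrfs_inc_block_group_ro(cache, true);
 *	if (ret)
 *		return ret;
 *	ret = do_work_on_block_group(cache);
 *	btrfs_dec_block_group_ro(cache);
 *
 * Because ->ro is a counter, several such users can overlap and the group
 * only becomes writable again after the last one drops its count.
 */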
2704 
2705 static int update_block_group_item(struct btrfs_trans_handle *trans,
2706                    struct btrfs_path *path,
2707                    struct btrfs_block_group *cache)
2708 {
2709     struct btrfs_fs_info *fs_info = trans->fs_info;
2710     int ret;
2711     struct btrfs_root *root = btrfs_block_group_root(fs_info);
2712     unsigned long bi;
2713     struct extent_buffer *leaf;
2714     struct btrfs_block_group_item bgi;
2715     struct btrfs_key key;
2716 
2717     key.objectid = cache->start;
2718     key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2719     key.offset = cache->length;
2720 
2721     ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2722     if (ret) {
2723         if (ret > 0)
2724             ret = -ENOENT;
2725         goto fail;
2726     }
2727 
2728     leaf = path->nodes[0];
2729     bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2730     btrfs_set_stack_block_group_used(&bgi, cache->used);
2731     btrfs_set_stack_block_group_chunk_objectid(&bgi,
2732                            cache->global_root_id);
2733     btrfs_set_stack_block_group_flags(&bgi, cache->flags);
2734     write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
2735     btrfs_mark_buffer_dirty(leaf);
2736 fail:
2737     btrfs_release_path(path);
2738     return ret;
2739 
2740 }
2741 
2742 static int cache_save_setup(struct btrfs_block_group *block_group,
2743                 struct btrfs_trans_handle *trans,
2744                 struct btrfs_path *path)
2745 {
2746     struct btrfs_fs_info *fs_info = block_group->fs_info;
2747     struct btrfs_root *root = fs_info->tree_root;
2748     struct inode *inode = NULL;
2749     struct extent_changeset *data_reserved = NULL;
2750     u64 alloc_hint = 0;
2751     int dcs = BTRFS_DC_ERROR;
2752     u64 cache_size = 0;
2753     int retries = 0;
2754     int ret = 0;
2755 
2756     if (!btrfs_test_opt(fs_info, SPACE_CACHE))
2757         return 0;
2758 
2759     /*
2760      * If this block group is smaller than 100 megs don't bother caching the
2761      * block group.
2762      */
2763     if (block_group->length < (100 * SZ_1M)) {
2764         spin_lock(&block_group->lock);
2765         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2766         spin_unlock(&block_group->lock);
2767         return 0;
2768     }
2769 
2770     if (TRANS_ABORTED(trans))
2771         return 0;
2772 again:
2773     inode = lookup_free_space_inode(block_group, path);
2774     if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2775         ret = PTR_ERR(inode);
2776         btrfs_release_path(path);
2777         goto out;
2778     }
2779 
2780     if (IS_ERR(inode)) {
2781         BUG_ON(retries);
2782         retries++;
2783 
2784         if (block_group->ro)
2785             goto out_free;
2786 
2787         ret = create_free_space_inode(trans, block_group, path);
2788         if (ret)
2789             goto out_free;
2790         goto again;
2791     }
2792 
2793     /*
2794      * We want to set the generation to 0, that way if anything goes wrong
2795      * from here on out we know not to trust this cache when we load up next
2796      * time.
2797      */
2798     BTRFS_I(inode)->generation = 0;
2799     ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
2800     if (ret) {
2801         /*
2802          * So theoretically we could recover from this, simply set the
2803          * super cache generation to 0 so we know to invalidate the
2804          * cache, but then we'd have to keep track of the block groups
2805          * that fail this way so we know we _have_ to reset this cache
2806          * before the next commit or risk reading stale cache.  So to
2807          * limit our exposure to horrible edge cases, let's just abort the
2808          * transaction, this only happens in really bad situations
2809          * anyway.
2810          */
2811         btrfs_abort_transaction(trans, ret);
2812         goto out_put;
2813     }
2814     WARN_ON(ret);
2815 
2816     /* We've already set up this transaction, go ahead and exit */
2817     if (block_group->cache_generation == trans->transid &&
2818         i_size_read(inode)) {
2819         dcs = BTRFS_DC_SETUP;
2820         goto out_put;
2821     }
2822 
2823     if (i_size_read(inode) > 0) {
2824         ret = btrfs_check_trunc_cache_free_space(fs_info,
2825                     &fs_info->global_block_rsv);
2826         if (ret)
2827             goto out_put;
2828 
2829         ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
2830         if (ret)
2831             goto out_put;
2832     }
2833 
2834     spin_lock(&block_group->lock);
2835     if (block_group->cached != BTRFS_CACHE_FINISHED ||
2836         !btrfs_test_opt(fs_info, SPACE_CACHE)) {
2837         /*
2838          * Don't bother trying to write stuff out _if_
2839          * a) we're not cached,
2840          * b) we're mounted with the nospace_cache option,
2841          * c) we're using v2 space_cache (FREE_SPACE_TREE).
2842          */
2843         dcs = BTRFS_DC_WRITTEN;
2844         spin_unlock(&block_group->lock);
2845         goto out_put;
2846     }
2847     spin_unlock(&block_group->lock);
2848 
2849     /*
2850      * We hit an ENOSPC when setting up the cache in this transaction, just
2851      * skip doing the setup, we've already cleared the cache so we're safe.
2852      */
2853     if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
2854         ret = -ENOSPC;
2855         goto out_put;
2856     }
2857 
2858     /*
2859      * Try to preallocate enough space based on how big the block group is.
2860      * Keep in mind this has to include any pinned space which could end up
2861      * taking up quite a bit since it's not folded into the other space
2862      * cache.
2863      */
2864     cache_size = div_u64(block_group->length, SZ_256M);
2865     if (!cache_size)
2866         cache_size = 1;
2867 
2868     cache_size *= 16;
2869     cache_size *= fs_info->sectorsize;
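
    /*
     * Worked example (annotation, not part of the kernel source): for a
     * 1GiB block group with a 4KiB sectorsize, div_u64(SZ_1G, SZ_256M) is
     * 4, so we preallocate 4 * 16 * 4096 bytes = 256KiB for the free space
     * cache file.
     */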
2870 
2871     ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
2872                       cache_size);
2873     if (ret)
2874         goto out_put;
2875 
2876     ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
2877                           cache_size, cache_size,
2878                           &alloc_hint);
2879     /*
2880      * Our cache requires contiguous chunks so that we don't modify a bunch
2881      * of metadata or split extents when writing the cache out, which means
2882      * we can enospc if we are heavily fragmented in addition to just normal
2883      * out of space conditions.  So if we hit this just skip setting up any
2884      * other block groups for this transaction, maybe we'll unpin enough
2885      * space the next time around.
2886      */
2887     if (!ret)
2888         dcs = BTRFS_DC_SETUP;
2889     else if (ret == -ENOSPC)
2890         set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
2891 
2892 out_put:
2893     iput(inode);
2894 out_free:
2895     btrfs_release_path(path);
2896 out:
2897     spin_lock(&block_group->lock);
2898     if (!ret && dcs == BTRFS_DC_SETUP)
2899         block_group->cache_generation = trans->transid;
2900     block_group->disk_cache_state = dcs;
2901     spin_unlock(&block_group->lock);
2902 
2903     extent_changeset_free(data_reserved);
2904     return ret;
2905 }
2906 
2907 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
2908 {
2909     struct btrfs_fs_info *fs_info = trans->fs_info;
2910     struct btrfs_block_group *cache, *tmp;
2911     struct btrfs_transaction *cur_trans = trans->transaction;
2912     struct btrfs_path *path;
2913 
2914     if (list_empty(&cur_trans->dirty_bgs) ||
2915         !btrfs_test_opt(fs_info, SPACE_CACHE))
2916         return 0;
2917 
2918     path = btrfs_alloc_path();
2919     if (!path)
2920         return -ENOMEM;
2921 
2922     /* Could add new block groups, use _safe just in case */
2923     list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
2924                  dirty_list) {
2925         if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2926             cache_save_setup(cache, trans, path);
2927     }
2928 
2929     btrfs_free_path(path);
2930     return 0;
2931 }
2932 
2933 /*
2934  * Transaction commit does final block group cache writeback during a critical
2935  * section where nothing is allowed to change the FS.  This is required in
2936  * order for the cache to actually match the block group, but can introduce a
2937  * lot of latency into the commit.
2938  *
2939  * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
2940  * There's a chance we'll have to redo some of it if the block group changes
2941  * again during the commit, but it greatly reduces the commit latency by
2942  * getting rid of the easy block groups while we're still allowing others to
2943  * join the commit.
2944  */
2945 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
2946 {
2947     struct btrfs_fs_info *fs_info = trans->fs_info;
2948     struct btrfs_block_group *cache;
2949     struct btrfs_transaction *cur_trans = trans->transaction;
2950     int ret = 0;
2951     int should_put;
2952     struct btrfs_path *path = NULL;
2953     LIST_HEAD(dirty);
2954     struct list_head *io = &cur_trans->io_bgs;
2955     int loops = 0;
2956 
2957     spin_lock(&cur_trans->dirty_bgs_lock);
2958     if (list_empty(&cur_trans->dirty_bgs)) {
2959         spin_unlock(&cur_trans->dirty_bgs_lock);
2960         return 0;
2961     }
2962     list_splice_init(&cur_trans->dirty_bgs, &dirty);
2963     spin_unlock(&cur_trans->dirty_bgs_lock);
2964 
2965 again:
2966     /* Make sure all the block groups on our dirty list actually exist */
2967     btrfs_create_pending_block_groups(trans);
2968 
2969     if (!path) {
2970         path = btrfs_alloc_path();
2971         if (!path) {
2972             ret = -ENOMEM;
2973             goto out;
2974         }
2975     }
2976 
2977     /*
2978      * cache_write_mutex is here only to save us from balance or automatic
2979      * removal of empty block groups deleting this block group while we are
2980      * writing out the cache
2981      */
2982     mutex_lock(&trans->transaction->cache_write_mutex);
2983     while (!list_empty(&dirty)) {
2984         bool drop_reserve = true;
2985 
2986         cache = list_first_entry(&dirty, struct btrfs_block_group,
2987                      dirty_list);
2988         /*
2989          * This can happen if something re-dirties a block group that
2990          * is already under IO.  Just wait for it to finish and then do
2991          * it all again
2992          */
2993         if (!list_empty(&cache->io_list)) {
2994             list_del_init(&cache->io_list);
2995             btrfs_wait_cache_io(trans, cache, path);
2996             btrfs_put_block_group(cache);
2997         }
2998 
2999 
3000         /*
3001          * btrfs_wait_cache_io uses the cache->dirty_list to decide if
3002          * it should update the cache_state.  Don't delete until after
3003          * we wait.
3004          *
3005          * Since we're not running in the commit critical section
3006          * we need the dirty_bgs_lock to protect from update_block_group
3007          */
3008         spin_lock(&cur_trans->dirty_bgs_lock);
3009         list_del_init(&cache->dirty_list);
3010         spin_unlock(&cur_trans->dirty_bgs_lock);
3011 
3012         should_put = 1;
3013 
3014         cache_save_setup(cache, trans, path);
3015 
3016         if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3017             cache->io_ctl.inode = NULL;
3018             ret = btrfs_write_out_cache(trans, cache, path);
3019             if (ret == 0 && cache->io_ctl.inode) {
3020                 should_put = 0;
3021 
3022                 /*
3023                  * The cache_write_mutex is protecting the
3024                  * io_list; also refer to the definition of
3025                  * btrfs_transaction::io_bgs for more details
3026                  */
3027                 list_add_tail(&cache->io_list, io);
3028             } else {
3029                 /*
3030                  * If we failed to write the cache, the
3031                  * generation will be bad and life goes on
3032                  */
3033                 ret = 0;
3034             }
3035         }
3036         if (!ret) {
3037             ret = update_block_group_item(trans, path, cache);
3038             /*
3039              * Our block group might still be attached to the list
3040              * of new block groups in the transaction handle of some
3041              * other task (struct btrfs_trans_handle->new_bgs). This
3042              * means its block group item isn't yet in the extent
3043          * tree. If this happens, ignore the error, as we will
3044              * try again later in the critical section of the
3045              * transaction commit.
3046              */
3047             if (ret == -ENOENT) {
3048                 ret = 0;
3049                 spin_lock(&cur_trans->dirty_bgs_lock);
3050                 if (list_empty(&cache->dirty_list)) {
3051                     list_add_tail(&cache->dirty_list,
3052                               &cur_trans->dirty_bgs);
3053                     btrfs_get_block_group(cache);
3054                     drop_reserve = false;
3055                 }
3056                 spin_unlock(&cur_trans->dirty_bgs_lock);
3057             } else if (ret) {
3058                 btrfs_abort_transaction(trans, ret);
3059             }
3060         }
3061 
3062         /* If it's not on the io list, we need to put the block group */
3063         if (should_put)
3064             btrfs_put_block_group(cache);
3065         if (drop_reserve)
3066             btrfs_delayed_refs_rsv_release(fs_info, 1);
3067         /*
3068          * Avoid blocking other tasks for too long. It might even save
3069          * us from writing caches for block groups that are going to be
3070          * removed.
3071          */
3072         mutex_unlock(&trans->transaction->cache_write_mutex);
3073         if (ret)
3074             goto out;
3075         mutex_lock(&trans->transaction->cache_write_mutex);
3076     }
3077     mutex_unlock(&trans->transaction->cache_write_mutex);
3078 
3079     /*
3080      * Go through delayed refs for all the stuff we've just kicked off
3081      * and then loop back (just once)
3082      */
3083     if (!ret)
3084         ret = btrfs_run_delayed_refs(trans, 0);
3085     if (!ret && loops == 0) {
3086         loops++;
3087         spin_lock(&cur_trans->dirty_bgs_lock);
3088         list_splice_init(&cur_trans->dirty_bgs, &dirty);
3089         /*
3090          * dirty_bgs_lock protects us from concurrent block group
3091          * deletes too (not just cache_write_mutex).
3092          */
3093         if (!list_empty(&dirty)) {
3094             spin_unlock(&cur_trans->dirty_bgs_lock);
3095             goto again;
3096         }
3097         spin_unlock(&cur_trans->dirty_bgs_lock);
3098     }
3099 out:
3100     if (ret < 0) {
3101         spin_lock(&cur_trans->dirty_bgs_lock);
3102         list_splice_init(&dirty, &cur_trans->dirty_bgs);
3103         spin_unlock(&cur_trans->dirty_bgs_lock);
3104         btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3105     }
3106 
3107     btrfs_free_path(path);
3108     return ret;
3109 }
3110 
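/*
 * Counterpart of btrfs_start_dirty_block_groups() that runs in the critical
 * section of the transaction commit: write out the space caches and block
 * group items for everything still on the transaction's dirty_bgs list, then
 * wait for any cache IO left on the io_bgs list.
 */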
3111 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
3112 {
3113     struct btrfs_fs_info *fs_info = trans->fs_info;
3114     struct btrfs_block_group *cache;
3115     struct btrfs_transaction *cur_trans = trans->transaction;
3116     int ret = 0;
3117     int should_put;
3118     struct btrfs_path *path;
3119     struct list_head *io = &cur_trans->io_bgs;
3120 
3121     path = btrfs_alloc_path();
3122     if (!path)
3123         return -ENOMEM;
3124 
3125     /*
3126      * Even though we are in the critical section of the transaction commit,
3127      * we can still have concurrent tasks adding elements to this
3128      * transaction's list of dirty block groups. These tasks correspond to
3129      * endio free space workers started when writeback finishes for a
3130      * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3131      * allocate new block groups as a result of COWing nodes of the root
3132      * tree when updating the free space inode. The writeback for the space
3133      * caches is triggered by an earlier call to
3134      * btrfs_start_dirty_block_groups() and iterations of the following
3135      * loop.
3136      * Also we want to do the cache_save_setup first and then run the
3137      * delayed refs to make sure we have the best chance at doing this all
3138      * in one shot.
3139      */
3140     spin_lock(&cur_trans->dirty_bgs_lock);
3141     while (!list_empty(&cur_trans->dirty_bgs)) {
3142         cache = list_first_entry(&cur_trans->dirty_bgs,
3143                      struct btrfs_block_group,
3144                      dirty_list);
3145 
3146         /*
3147          * This can happen if cache_save_setup re-dirties a block group
3148          * that is already under IO.  Just wait for it to finish and
3149          * then do it all again
3150          */
3151         if (!list_empty(&cache->io_list)) {
3152             spin_unlock(&cur_trans->dirty_bgs_lock);
3153             list_del_init(&cache->io_list);
3154             btrfs_wait_cache_io(trans, cache, path);
3155             btrfs_put_block_group(cache);
3156             spin_lock(&cur_trans->dirty_bgs_lock);
3157         }
3158 
3159         /*
3160          * Don't remove from the dirty list until after we've waited on
3161          * any pending IO
3162          */
3163         list_del_init(&cache->dirty_list);
3164         spin_unlock(&cur_trans->dirty_bgs_lock);
3165         should_put = 1;
3166 
3167         cache_save_setup(cache, trans, path);
3168 
3169         if (!ret)
3170             ret = btrfs_run_delayed_refs(trans,
3171                              (unsigned long) -1);
3172 
3173         if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3174             cache->io_ctl.inode = NULL;
3175             ret = btrfs_write_out_cache(trans, cache, path);
3176             if (ret == 0 && cache->io_ctl.inode) {
3177                 should_put = 0;
3178                 list_add_tail(&cache->io_list, io);
3179             } else {
3180                 /*
3181                  * If we failed to write the cache, the
3182                  * generation will be bad and life goes on
3183                  */
3184                 ret = 0;
3185             }
3186         }
3187         if (!ret) {
3188             ret = update_block_group_item(trans, path, cache);
3189             /*
3190              * One of the free space endio workers might have
3191              * created a new block group while updating a free space
3192              * cache's inode (at inode.c:btrfs_finish_ordered_io())
3193              * and hasn't released its transaction handle yet, in
3194              * which case the new block group is still attached to
3195              * its transaction handle and its creation has not
3196              * finished yet (no block group item in the extent tree
3197              * yet, etc). If this is the case, wait for all free
3198              * space endio workers to finish and retry. This is a
3199              * very rare case so no need for a more efficient and
3200              * complex approach.
3201              */
3202             if (ret == -ENOENT) {
3203                 wait_event(cur_trans->writer_wait,
3204                    atomic_read(&cur_trans->num_writers) == 1);
3205                 ret = update_block_group_item(trans, path, cache);
3206             }
3207             if (ret)
3208                 btrfs_abort_transaction(trans, ret);
3209         }
3210 
3211         /* If it's not on the io list, we need to put the block group */
3212         if (should_put)
3213             btrfs_put_block_group(cache);
3214         btrfs_delayed_refs_rsv_release(fs_info, 1);
3215         spin_lock(&cur_trans->dirty_bgs_lock);
3216     }
3217     spin_unlock(&cur_trans->dirty_bgs_lock);
3218 
3219     /*
3220      * Refer to the definition of the io_bgs member for details on why it's safe
3221      * to use it without any locking
3222      */
3223     while (!list_empty(io)) {
3224         cache = list_first_entry(io, struct btrfs_block_group,
3225                      io_list);
3226         list_del_init(&cache->io_list);
3227         btrfs_wait_cache_io(trans, cache, path);
3228         btrfs_put_block_group(cache);
3229     }
3230 
3231     btrfs_free_path(path);
3232     return ret;
3233 }
3234 
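/*
 * Decide whether freeing @bytes_freed from @bg should queue it for reclaim.
 * The threshold is a percentage of the block group's length
 * (space_info->bg_reclaim_threshold) and reclaim is only suggested when this
 * particular free crosses it: usage was at or above the threshold before and
 * is below it now, so freshly created (still mostly empty) block groups are
 * never queued.
 *
 * A worked example with hypothetical numbers: for a 1GiB block group and a
 * 75% threshold, thresh is 768MiB. Dropping from 800MiB used to 700MiB used
 * returns true, while dropping from 700MiB to 600MiB returns false because we
 * were already below the threshold.
 */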
3235 static inline bool should_reclaim_block_group(struct btrfs_block_group *bg,
3236                           u64 bytes_freed)
3237 {
3238     const struct btrfs_space_info *space_info = bg->space_info;
3239     const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
3240     const u64 new_val = bg->used;
3241     const u64 old_val = new_val + bytes_freed;
3242     u64 thresh;
3243 
3244     if (reclaim_thresh == 0)
3245         return false;
3246 
3247     thresh = div_factor_fine(bg->length, reclaim_thresh);
3248 
3249     /*
3250      * If we were below the threshold before, don't reclaim: we are likely a
3251      * brand new block group and we don't want to relocate new block groups.
3252      */
3253     if (old_val < thresh)
3254         return false;
3255     if (new_val >= thresh)
3256         return false;
3257     return true;
3258 }
3259 
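/*
 * Account an extent allocation (@alloc == true) or free (@alloc == false) of
 * [@bytenr, @bytenr + @num_bytes) against the block group(s) covering that
 * range: adjust the per block group and space_info counters and the
 * superblock's bytes_used, and put the affected block groups on the
 * transaction's dirty list. Freed space is accounted as pinned until the
 * transaction commits; a block group that becomes empty is queued as unused
 * (left to the async discard code when that is enabled) and one that crosses
 * the reclaim threshold is queued for reclaim.
 */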
3260 int btrfs_update_block_group(struct btrfs_trans_handle *trans,
3261                  u64 bytenr, u64 num_bytes, bool alloc)
3262 {
3263     struct btrfs_fs_info *info = trans->fs_info;
3264     struct btrfs_block_group *cache = NULL;
3265     u64 total = num_bytes;
3266     u64 old_val;
3267     u64 byte_in_group;
3268     int factor;
3269     int ret = 0;
3270 
3271     /* Block accounting for super block */
3272     spin_lock(&info->delalloc_root_lock);
3273     old_val = btrfs_super_bytes_used(info->super_copy);
3274     if (alloc)
3275         old_val += num_bytes;
3276     else
3277         old_val -= num_bytes;
3278     btrfs_set_super_bytes_used(info->super_copy, old_val);
3279     spin_unlock(&info->delalloc_root_lock);
3280 
3281     while (total) {
3282         bool reclaim;
3283 
3284         cache = btrfs_lookup_block_group(info, bytenr);
3285         if (!cache) {
3286             ret = -ENOENT;
3287             break;
3288         }
3289         factor = btrfs_bg_type_to_factor(cache->flags);
3290 
3291         /*
3292          * If this block group has free space cache written out, we
3293          * need to make sure to load it if we are removing space.  This
3294          * is because we need the unpinning stage to actually add the
3295          * space back to the block group, otherwise we will leak space.
3296          */
3297         if (!alloc && !btrfs_block_group_done(cache))
3298             btrfs_cache_block_group(cache, true);
3299 
3300         byte_in_group = bytenr - cache->start;
3301         WARN_ON(byte_in_group > cache->length);
3302 
3303         spin_lock(&cache->space_info->lock);
3304         spin_lock(&cache->lock);
3305 
3306         if (btrfs_test_opt(info, SPACE_CACHE) &&
3307             cache->disk_cache_state < BTRFS_DC_CLEAR)
3308             cache->disk_cache_state = BTRFS_DC_CLEAR;
3309 
3310         old_val = cache->used;
3311         num_bytes = min(total, cache->length - byte_in_group);
3312         if (alloc) {
3313             old_val += num_bytes;
3314             cache->used = old_val;
3315             cache->reserved -= num_bytes;
3316             cache->space_info->bytes_reserved -= num_bytes;
3317             cache->space_info->bytes_used += num_bytes;
3318             cache->space_info->disk_used += num_bytes * factor;
3319             spin_unlock(&cache->lock);
3320             spin_unlock(&cache->space_info->lock);
3321         } else {
3322             old_val -= num_bytes;
3323             cache->used = old_val;
3324             cache->pinned += num_bytes;
3325             btrfs_space_info_update_bytes_pinned(info,
3326                     cache->space_info, num_bytes);
3327             cache->space_info->bytes_used -= num_bytes;
3328             cache->space_info->disk_used -= num_bytes * factor;
3329 
3330             reclaim = should_reclaim_block_group(cache, num_bytes);
3331             spin_unlock(&cache->lock);
3332             spin_unlock(&cache->space_info->lock);
3333 
3334             set_extent_dirty(&trans->transaction->pinned_extents,
3335                      bytenr, bytenr + num_bytes - 1,
3336                      GFP_NOFS | __GFP_NOFAIL);
3337         }
3338 
3339         spin_lock(&trans->transaction->dirty_bgs_lock);
3340         if (list_empty(&cache->dirty_list)) {
3341             list_add_tail(&cache->dirty_list,
3342                       &trans->transaction->dirty_bgs);
3343             trans->delayed_ref_updates++;
3344             btrfs_get_block_group(cache);
3345         }
3346         spin_unlock(&trans->transaction->dirty_bgs_lock);
3347 
3348         /*
3349          * No longer have used bytes in this block group, queue it for
3350          * deletion. We do this after adding the block group to the
3351          * dirty list to avoid races between cleaner kthread and space
3352          * cache writeout.
3353          */
3354         if (!alloc && old_val == 0) {
3355             if (!btrfs_test_opt(info, DISCARD_ASYNC))
3356                 btrfs_mark_bg_unused(cache);
3357         } else if (!alloc && reclaim) {
3358             btrfs_mark_bg_to_reclaim(cache);
3359         }
3360 
3361         btrfs_put_block_group(cache);
3362         total -= num_bytes;
3363         bytenr += num_bytes;
3364     }
3365 
3366     /* Modified block groups are accounted for in the delayed_refs_rsv. */
3367     btrfs_update_delayed_refs_rsv(trans);
3368     return ret;
3369 }
3370 
3371 /**
3372  * btrfs_add_reserved_bytes - update the block_group and space info counters
3373  * @cache:  The cache we are manipulating
3374  * @ram_bytes:  The number of bytes of file content; the same as @num_bytes
3375  *              except on the compression path.
3376  * @num_bytes:  The number of bytes in question
3377  * @delalloc:   The blocks are allocated for the delalloc write
3378  *
3379  * This is called by the allocator when it reserves space. If the block group
3380  * has become read-only, we cannot make the reservation and -EAGAIN is
3381  * returned; otherwise this function always succeeds.
3382  */
3383 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
3384                  u64 ram_bytes, u64 num_bytes, int delalloc)
3385 {
3386     struct btrfs_space_info *space_info = cache->space_info;
3387     int ret = 0;
3388 
3389     spin_lock(&space_info->lock);
3390     spin_lock(&cache->lock);
3391     if (cache->ro) {
3392         ret = -EAGAIN;
3393     } else {
3394         cache->reserved += num_bytes;
3395         space_info->bytes_reserved += num_bytes;
3396         trace_btrfs_space_reservation(cache->fs_info, "space_info",
3397                           space_info->flags, num_bytes, 1);
3398         btrfs_space_info_update_bytes_may_use(cache->fs_info,
3399                               space_info, -ram_bytes);
3400         if (delalloc)
3401             cache->delalloc_bytes += num_bytes;
3402 
3403         /*
3404          * Compression can use less space than we reserved, so wake
3405          * tickets if that happens
3406          */
3407         if (num_bytes < ram_bytes)
3408             btrfs_try_granting_tickets(cache->fs_info, space_info);
3409     }
3410     spin_unlock(&cache->lock);
3411     spin_unlock(&space_info->lock);
3412     return ret;
3413 }
3414 
3415 /**
3416  * btrfs_free_reserved_bytes - update the block_group and space info counters
3417  * @cache:      The cache we are manipulating
3418  * @num_bytes:  The number of bytes in question
3419  * @delalloc:   The blocks are allocated for the delalloc write
3420  *
3421  * This is called by somebody who is freeing space that was never actually used
3422  * on disk.  For example if you reserve some space for a new leaf in transaction
3423  * A and, before transaction A commits, you free that leaf, you call this to
3424  * clear the reservation.
3425  */
3426 void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
3427                    u64 num_bytes, int delalloc)
3428 {
3429     struct btrfs_space_info *space_info = cache->space_info;
3430 
3431     spin_lock(&space_info->lock);
3432     spin_lock(&cache->lock);
3433     if (cache->ro)
3434         space_info->bytes_readonly += num_bytes;
3435     cache->reserved -= num_bytes;
3436     space_info->bytes_reserved -= num_bytes;
3437     space_info->max_extent_size = 0;
3438 
3439     if (delalloc)
3440         cache->delalloc_bytes -= num_bytes;
3441     spin_unlock(&cache->lock);
3442 
3443     btrfs_try_granting_tickets(cache->fs_info, space_info);
3444     spin_unlock(&space_info->lock);
3445 }
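
/*
 * A minimal sketch (hypothetical caller, error handling elided) of how the
 * two helpers above pair up: space is reserved when an extent is handed out
 * and released again if the extent ends up not being used:
 *
 *	ret = btrfs_add_reserved_bytes(cache, ram_bytes, num_bytes, 0);
 *	if (ret)
 *		return ret;
 *	...
 *	if (extent_was_not_used)
 *		btrfs_free_reserved_bytes(cache, num_bytes, 0);
 */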
3446 
3447 static void force_metadata_allocation(struct btrfs_fs_info *info)
3448 {
3449     struct list_head *head = &info->space_info;
3450     struct btrfs_space_info *found;
3451 
3452     list_for_each_entry(found, head, list) {
3453         if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3454             found->force_alloc = CHUNK_ALLOC_FORCE;
3455     }
3456 }
3457 
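/*
 * Decide if a new chunk should be allocated for @sinfo:
 *
 * - CHUNK_ALLOC_FORCE: always allocate.
 * - CHUNK_ALLOC_LIMITED: allocate if the free space left in @sinfo dropped
 *   below max(64M, 1% of the filesystem size), so a small cushion is kept.
 * - Otherwise only allocate once about 80% of @sinfo's total bytes are used
 *   (the 2M of slack avoids flapping right at the boundary).
 */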
3458 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
3459                   struct btrfs_space_info *sinfo, int force)
3460 {
3461     u64 bytes_used = btrfs_space_info_used(sinfo, false);
3462     u64 thresh;
3463 
3464     if (force == CHUNK_ALLOC_FORCE)
3465         return 1;
3466 
3467     /*
3468      * in limited mode, we want to have some free space up to
3469      * about 1% of the FS size.
3470      */
3471     if (force == CHUNK_ALLOC_LIMITED) {
3472         thresh = btrfs_super_total_bytes(fs_info->super_copy);
3473         thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
3474 
3475         if (sinfo->total_bytes - bytes_used < thresh)
3476             return 1;
3477     }
3478 
3479     if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
3480         return 0;
3481     return 1;
3482 }
3483 
3484 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
3485 {
3486     u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
3487 
3488     return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
3489 }
3490 
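/*
 * Phase 1 of chunk allocation for a data or metadata chunk: make room in the
 * system space_info via check_system_chunk(), create the chunk and insert its
 * items into the chunk btree (retrying once with a fresh system chunk if that
 * hits -ENOSPC, as explained in the comment below). Returns the new block
 * group with an extra reference held, or an ERR_PTR on failure.
 */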
3491 static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
3492 {
3493     struct btrfs_block_group *bg;
3494     int ret;
3495 
3496     /*
3497      * Check if we have enough space in the system space info because we
3498      * will need to update device items in the chunk btree and insert a new
3499      * chunk item in the chunk btree as well. This will allocate a new
3500      * system block group if needed.
3501      */
3502     check_system_chunk(trans, flags);
3503 
3504     bg = btrfs_create_chunk(trans, flags);
3505     if (IS_ERR(bg)) {
3506         ret = PTR_ERR(bg);
3507         goto out;
3508     }
3509 
3510     ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3511     /*
3512      * Normally we are not expected to fail with -ENOSPC here, since we have
3513      * previously reserved space in the system space_info and allocated one
3514      * new system chunk if necessary. However there are three exceptions:
3515      *
3516      * 1) We may have enough free space in the system space_info but all the
3517      *    existing system block groups have a profile which cannot be used
3518      *    for extent allocation.
3519      *
3520      *    This happens when mounting in degraded mode. For example we have a
3521      *    RAID1 filesystem with 2 devices, lose one device and mount the fs
3522      *    using the other device in degraded mode. If we then allocate a chunk,
3523      *    we may have enough free space in the existing system space_info, but
3524      *    none of the block groups can be used for extent allocation since they
3525      *    have a RAID1 profile, and because we are in degraded mode with a
3526      *    single device, we are forced to allocate a new system chunk with a
3527      *    SINGLE profile. Making check_system_chunk() iterate over all system
3528      *    block groups and check if they have a usable profile and enough space
3529      *    can be slow on very large filesystems, so we tolerate the -ENOSPC and
3530      *    try again after forcing allocation of a new system chunk. Like this
3531      *    we avoid paying the cost of that search in normal circumstances, when
3532      *    we were not mounted in degraded mode;
3533      *
3534      * 2) We had enough free space in the system space_info, and one suitable
3535      *    block group to allocate from when we called check_system_chunk()
3536      *    above. However right after we called it, the only system block group
3537      *    with enough free space got turned into RO mode by a running scrub,
3538      *    and in this case we have to allocate a new one and retry. We only
3539      *    need to do this allocation and retry once, since we have a transaction
3540      *    handle and scrub uses the commit root to search for block groups;
3541      *
3542      * 3) We had one system block group with enough free space when we called
3543      *    check_system_chunk(), but after that, right before we tried to
3544      *    allocate the last extent buffer we needed, a discard operation came
3545      *    in and it temporarily removed the last free space entry from the
3546      *    block group (discard removes a free space entry, discards it, and
3547      *    then adds back the entry to the block group cache).
3548      */
3549     if (ret == -ENOSPC) {
3550         const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
3551         struct btrfs_block_group *sys_bg;
3552 
3553         sys_bg = btrfs_create_chunk(trans, sys_flags);
3554         if (IS_ERR(sys_bg)) {
3555             ret = PTR_ERR(sys_bg);
3556             btrfs_abort_transaction(trans, ret);
3557             goto out;
3558         }
3559 
3560         ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
3561         if (ret) {
3562             btrfs_abort_transaction(trans, ret);
3563             goto out;
3564         }
3565 
3566         ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3567         if (ret) {
3568             btrfs_abort_transaction(trans, ret);
3569             goto out;
3570         }
3571     } else if (ret) {
3572         btrfs_abort_transaction(trans, ret);
3573         goto out;
3574     }
3575 out:
3576     btrfs_trans_release_chunk_metadata(trans);
3577 
3578     if (ret)
3579         return ERR_PTR(ret);
3580 
3581     btrfs_get_block_group(bg);
3582     return bg;
3583 }
3584 
3585 /*
3586  * Chunk allocation is done in 2 phases:
3587  *
3588  * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
3589  *    the chunk, the chunk mapping, create its block group and add the items
3590  *    that belong in the chunk btree to it - more specifically, we need to
3591  *    update device items in the chunk btree and add a new chunk item to it.
3592  *
3593  * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
3594  *    group item to the extent btree and the device extent items to the devices
3595  *    btree.
3596  *
3597  * This is done to prevent deadlocks. For example when COWing a node from the
3598  * extent btree we are holding a write lock on the node's parent and if we
3599  * trigger chunk allocation and attempt to insert the new block group item
3600  * in the extent btree right away, we could deadlock because the path for the
3601  * insertion can include that parent node. At first glance it seems impossible
3602  * to trigger chunk allocation after starting a transaction since tasks should
3603  * reserve enough transaction units (metadata space). However, while that is true
3604  * most of the time, chunk allocation may still be triggered for several reasons:
3605  *
3606  * 1) When reserving metadata, we check if there is enough free space in the
3607  *    metadata space_info and therefore don't trigger allocation of a new chunk.
3608  *    However later when the task actually tries to COW an extent buffer from
3609  *    the extent btree or from the device btree for example, it is forced to
3610  *    allocate a new block group (chunk) because the only one that had enough
3611  *    free space was just turned to RO mode by a running scrub for example (or
3612  *    device replace, block group reclaim thread, etc), so we can not use it
3613  *    for allocating an extent and end up being forced to allocate a new one;
3614  *
3615  * 2) Because we only check that the metadata space_info has enough free bytes,
3616  *    we end up not allocating a new metadata chunk in that case. However if
3617  *    the filesystem was mounted in degraded mode, none of the existing block
3618  *    groups might be suitable for extent allocation due to their incompatible
3619  *    profile (e.g. mounting a 2-device filesystem, where all block groups
3620  *    use a RAID1 profile, in degraded mode using a single device). In this case
3621  *    when the task attempts to COW some extent buffer of the extent btree for
3622  *    example, it will trigger allocation of a new metadata block group with a
3623  *    suitable profile (SINGLE profile in the example of the degraded mount of
3624  *    the RAID1 filesystem);
3625  *
3626  * 3) The task has reserved enough transaction units / metadata space, but when
3627  *    it attempts to COW an extent buffer from the extent or device btree for
3628  *    example, it does not find any free extent in any metadata block group,
3629  *    therefore forced to try to allocate a new metadata block group.
3630  *    and is therefore forced to try to allocate a new metadata block group.
3631  *    This is because some other task allocated all available extents in the
3632  *    meantime - this typically happens with tasks that don't reserve space
3633  *    done intentionally is fsync, as it does not reserve any transaction units
3634  *    and ends up allocating a variable number of metadata extents for log
3635  *    tree extent buffers;
3636  *
3637  * 4) The task has reserved enough transaction units / metadata space, but right
3638  *    before it tries to allocate the last extent buffer it needs, a discard
3639  *    operation comes in and, temporarily, removes the last free space entry from
3640  *    the only metadata block group that had free space (discard starts by
3641  *    removing a free space entry from a block group, then does the discard
3642  *    operation and, once it's done, it adds back the free space entry to the
3643  *    block group).
3644  *
3645  * We also need this 2-phase setup when adding a device to a filesystem with
3646  * a seed device - we must create new metadata and system chunks without adding
3647  * any of the block group items to the chunk, extent and device btrees. If we
3648  * did not do it this way, we would get ENOSPC when attempting to update those
3649  * btrees, since all the chunks from the seed device are read-only.
3650  *
3651  * Phase 1 does the updates and insertions to the chunk btree because if we had
3652  * it done in phase 2 and have a thundering herd of tasks allocating chunks in
3653  * parallel, we risk having too many system chunks allocated by many tasks if
3654  * many tasks reach phase 1 without the previous ones completing phase 2. In the
3655  * extreme case this leads to exhaustion of the system chunk array in the
3656  * superblock. This is easier to trigger if using a btree node/leaf size of 64K
3657  * and with RAID filesystems (so we have more device items in the chunk btree).
3658  * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
3659  * the system chunk array due to concurrent allocations") provides more details.
3660  *
3661  * Allocation of system chunks does not happen through this function. A task that
3662  * needs to update the chunk btree (the only btree that uses system chunks), must
3663  * preallocate chunk space by calling either check_system_chunk() or
3664  * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
3665  * metadata chunk or when removing a chunk, while the latter is used before doing
3666  * a modification to the chunk btree - use cases for the latter are adding,
3667  * removing and resizing a device as well as relocation of a system chunk.
3668  * See the comment below for more details.
3669  *
3670  * The reservation of system space, done through check_system_chunk(), as well
3671  * as all the updates and insertions into the chunk btree must be done while
3672  * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
3673  * an extent buffer from the chunk btree we never trigger allocation of a new
3674  * system chunk, which would result in a deadlock (trying to lock twice an
3675  * extent buffer of the chunk btree, first time before triggering the chunk
3676  * allocation and the second time during chunk allocation while attempting to
3677  * update the chunk btree). The system chunk array is also updated while holding
3678  * that mutex. The same logic applies to removing chunks - we must reserve system
3679  * space, update the chunk btree and the system chunk array in the superblock
3680  * while holding fs_info->chunk_mutex.
3681  *
3682  * This function, btrfs_chunk_alloc(), belongs to phase 1.
3683  *
3684  * If @force is CHUNK_ALLOC_FORCE:
3685  *    - return 1 if it successfully allocates a chunk,
3686  *    - return errors including -ENOSPC otherwise.
3687  * If @force is NOT CHUNK_ALLOC_FORCE:
3688  *    - return 0 if it doesn't need to allocate a new chunk,
3689  *    - return 1 if it successfully allocates a chunk,
3690  *    - return errors including -ENOSPC otherwise.
3691  */
3692 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
3693               enum btrfs_chunk_alloc_enum force)
3694 {
3695     struct btrfs_fs_info *fs_info = trans->fs_info;
3696     struct btrfs_space_info *space_info;
3697     struct btrfs_block_group *ret_bg;
3698     bool wait_for_alloc = false;
3699     bool should_alloc = false;
3700     bool from_extent_allocation = false;
3701     int ret = 0;
3702 
3703     if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
3704         from_extent_allocation = true;
3705         force = CHUNK_ALLOC_FORCE;
3706     }
3707 
3708     /* Don't re-enter if we're already allocating a chunk */
3709     if (trans->allocating_chunk)
3710         return -ENOSPC;
3711     /*
3712      * Allocation of system chunks cannot happen through this path, as we
3713      * could end up in a deadlock if we are allocating a data or metadata
3714      * chunk and there is another task modifying the chunk btree.
3715      *
3716      * This is because while we are holding the chunk mutex, we will attempt
3717      * to add the new chunk item to the chunk btree or update an existing
3718      * device item in the chunk btree, while the other task that is modifying
3719      * the chunk btree is attempting to COW an extent buffer while holding a
3720      * lock on it and on its parent - if the COW operation triggers a system
3721      * chunk allocation, then we can deadlock because we are holding the
3722      * chunk mutex and we may need to access that extent buffer or its parent
3723      * in order to add the chunk item or update a device item.
3724      *
3725      * Tasks that want to modify the chunk tree should reserve system space
3726      * before updating the chunk btree, by calling either
3727      * btrfs_reserve_chunk_metadata() or check_system_chunk().
3728      * It's possible that after a task reserves the space, it still ends up
3729      * here - this happens in the cases described above at do_chunk_alloc().
3730      * The task will have to either retry or fail.
3731      */
3732     if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3733         return -ENOSPC;
3734 
3735     space_info = btrfs_find_space_info(fs_info, flags);
3736     ASSERT(space_info);
3737 
3738     do {
3739         spin_lock(&space_info->lock);
3740         if (force < space_info->force_alloc)
3741             force = space_info->force_alloc;
3742         should_alloc = should_alloc_chunk(fs_info, space_info, force);
3743         if (space_info->full) {
3744             /* No more free physical space */
3745             if (should_alloc)
3746                 ret = -ENOSPC;
3747             else
3748                 ret = 0;
3749             spin_unlock(&space_info->lock);
3750             return ret;
3751         } else if (!should_alloc) {
3752             spin_unlock(&space_info->lock);
3753             return 0;
3754         } else if (space_info->chunk_alloc) {
3755             /*
3756              * Someone is already allocating, so we need to block
3757              * until this someone is finished and then loop to
3758              * recheck if we should continue with our allocation
3759              * attempt.
3760              */
3761             wait_for_alloc = true;
3762             force = CHUNK_ALLOC_NO_FORCE;
3763             spin_unlock(&space_info->lock);
3764             mutex_lock(&fs_info->chunk_mutex);
3765             mutex_unlock(&fs_info->chunk_mutex);
3766         } else {
3767             /* Proceed with allocation */
3768             space_info->chunk_alloc = 1;
3769             wait_for_alloc = false;
3770             spin_unlock(&space_info->lock);
3771         }
3772 
3773         cond_resched();
3774     } while (wait_for_alloc);
3775 
3776     mutex_lock(&fs_info->chunk_mutex);
3777     trans->allocating_chunk = true;
3778 
3779     /*
3780      * If we have mixed data/metadata chunks we want to make sure we keep
3781      * allocating mixed chunks instead of individual chunks.
3782      */
3783     if (btrfs_mixed_space_info(space_info))
3784         flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3785 
3786     /*
3787      * if we're doing a data chunk, go ahead and make sure that
3788      * we keep a reasonable number of metadata chunks allocated in the
3789      * FS as well.
3790      */
3791     if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3792         fs_info->data_chunk_allocations++;
3793         if (!(fs_info->data_chunk_allocations %
3794               fs_info->metadata_ratio))
3795             force_metadata_allocation(fs_info);
3796     }
3797 
3798     ret_bg = do_chunk_alloc(trans, flags);
3799     trans->allocating_chunk = false;
3800 
3801     if (IS_ERR(ret_bg)) {
3802         ret = PTR_ERR(ret_bg);
3803     } else if (from_extent_allocation) {
3804         /*
3805          * New block group is likely to be used soon. Try to activate
3806          * it now. Failure is OK for now.
3807          */
3808         btrfs_zone_activate(ret_bg);
3809     }
3810 
3811     if (!ret)
3812         btrfs_put_block_group(ret_bg);
3813 
3814     spin_lock(&space_info->lock);
3815     if (ret < 0) {
3816         if (ret == -ENOSPC)
3817             space_info->full = 1;
3818         else
3819             goto out;
3820     } else {
3821         ret = 1;
3822         space_info->max_extent_size = 0;
3823     }
3824 
3825     space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3826 out:
3827     space_info->chunk_alloc = 0;
3828     spin_unlock(&space_info->lock);
3829     mutex_unlock(&fs_info->chunk_mutex);
3830 
3831     return ret;
3832 }
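
/*
 * A minimal usage sketch for btrfs_chunk_alloc() above (hypothetical caller,
 * error handling elided): pick the current allocation profile for the space
 * being grown and let the function decide whether a chunk is really needed:
 *
 *	u64 flags = btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 *	int ret = btrfs_chunk_alloc(trans, flags, CHUNK_ALLOC_NO_FORCE);
 *
 * ret is 0 when no new chunk was needed, 1 when a chunk was allocated and
 * negative on error, as described in the comment above the function.
 */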
3833 
3834 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
3835 {
3836     u64 num_dev;
3837 
3838     num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
3839     if (!num_dev)
3840         num_dev = fs_info->fs_devices->rw_devices;
3841 
3842     return num_dev;
3843 }
3844 
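/*
 * Make sure the system space_info can cover @bytes of chunk btree
 * modifications on behalf of a chunk of @type: allocate a new system chunk
 * first if there is not enough free space left, then reserve @bytes in
 * fs_info->chunk_block_rsv. Must be called with fs_info->chunk_mutex held.
 */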
3845 static void reserve_chunk_space(struct btrfs_trans_handle *trans,
3846                 u64 bytes,
3847                 u64 type)
3848 {
3849     struct btrfs_fs_info *fs_info = trans->fs_info;
3850     struct btrfs_space_info *info;
3851     u64 left;
3852     int ret = 0;
3853 
3854     /*
3855      * Needed because we can end up allocating a system chunk, and because the
3856      * space reservation in the chunk block reserve must be atomic and race free.
3857      */
3858     lockdep_assert_held(&fs_info->chunk_mutex);
3859 
3860     info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3861     spin_lock(&info->lock);
3862     left = info->total_bytes - btrfs_space_info_used(info, true);
3863     spin_unlock(&info->lock);
3864 
3865     if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
3866         btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
3867                left, bytes, type);
3868         btrfs_dump_space_info(fs_info, info, 0, 0);
3869     }
3870 
3871     if (left < bytes) {
3872         u64 flags = btrfs_system_alloc_profile(fs_info);
3873         struct btrfs_block_group *bg;
3874 
3875         /*
3876          * Ignore failure to create system chunk. We might end up not
3877      * needing it, as we might not need to COW all nodes/leaves from
3878          * the paths we visit in the chunk tree (they were already COWed
3879          * or created in the current transaction for example).
3880          */
3881         bg = btrfs_create_chunk(trans, flags);
3882         if (IS_ERR(bg)) {
3883             ret = PTR_ERR(bg);
3884         } else {
3885             /*
3886              * We have a new chunk. We also need to activate it if this
3887              * is a zoned filesystem.
3888              */
3889             ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
3890             if (ret < 0)
3891                 return;
3892 
3893             /*
3894              * If we fail to add the chunk item here, we end up
3895              * trying again at phase 2 of chunk allocation, at
3896              * btrfs_create_pending_block_groups(). So ignore
3897              * any error here. An ENOSPC here could happen, due to
3898              * the cases described at do_chunk_alloc() - the system
3899              * block group we just created was just turned into RO
3900              * mode by a scrub for example, or a running discard
3901              * temporarily removed its free space entries, etc.
3902              */
3903             btrfs_chunk_alloc_add_chunk_item(trans, bg);
3904         }
3905     }
3906 
3907     if (!ret) {
3908         ret = btrfs_block_rsv_add(fs_info,
3909                       &fs_info->chunk_block_rsv,
3910                       bytes, BTRFS_RESERVE_NO_FLUSH);
3911         if (!ret)
3912             trans->chunk_bytes_reserved += bytes;
3913     }
3914 }
3915 
3916 /*
3917  * Reserve space in the system space for allocating or removing a chunk.
3918  * The caller must be holding fs_info->chunk_mutex.
3919  */
3920 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
3921 {
3922     struct btrfs_fs_info *fs_info = trans->fs_info;
3923     const u64 num_devs = get_profile_num_devs(fs_info, type);
3924     u64 bytes;
3925 
3926     /* num_devs device items to update and 1 chunk item to add or remove. */
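    /*
     * As a rough illustration, assuming the usual helper definitions
     * (btrfs_calc_metadata_size() being nodesize * BTRFS_MAX_LEVEL per item
     * and btrfs_calc_insert_metadata_size() twice that): with a 16K nodesize
     * this reserves num_devs * 128K plus 256K for the chunk item.
     */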
3927     bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
3928         btrfs_calc_insert_metadata_size(fs_info, 1);
3929 
3930     reserve_chunk_space(trans, bytes, type);
3931 }
3932 
3933 /*
3934  * Reserve space in the system space, if needed, for doing a modification to the
3935  * chunk btree.
3936  *
3937  * @trans:      A transaction handle.
3938  * @is_item_insertion:  Indicate if the modification is for inserting a new item
3939  *          in the chunk btree or if it's for the deletion or update
3940  *          of an existing item.
3941  *
3942  * This is used in a context where we need to update the chunk btree outside
3943  * block group allocation and removal, to avoid a deadlock with a concurrent
3944  * task that is allocating a metadata or data block group and therefore needs to
3945  * update the chunk btree while holding the chunk mutex. After the update to the
3946  * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
3947  *
3948  */
3949 void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
3950                   bool is_item_insertion)
3951 {
3952     struct btrfs_fs_info *fs_info = trans->fs_info;
3953     u64 bytes;
3954 
3955     if (is_item_insertion)
3956         bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
3957     else
3958         bytes = btrfs_calc_metadata_size(fs_info, 1);
3959 
3960     mutex_lock(&fs_info->chunk_mutex);
3961     reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
3962     mutex_unlock(&fs_info->chunk_mutex);
3963 }
3964 
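/*
 * Drop the free space cache inode reference (block_group->inode) of every
 * block group, waiting for caching to finish first. Typically used during
 * filesystem teardown so the space cache inodes can be released.
 */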
3965 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
3966 {
3967     struct btrfs_block_group *block_group;
3968     u64 last = 0;
3969 
3970     while (1) {
3971         struct inode *inode;
3972 
3973         block_group = btrfs_lookup_first_block_group(info, last);
3974         while (block_group) {
3975             btrfs_wait_block_group_cache_done(block_group);
3976             spin_lock(&block_group->lock);
3977             if (block_group->iref)
3978                 break;
3979             spin_unlock(&block_group->lock);
3980             block_group = btrfs_next_block_group(block_group);
3981         }
3982         if (!block_group) {
3983             if (last == 0)
3984                 break;
3985             last = 0;
3986             continue;
3987         }
3988 
3989         inode = block_group->inode;
3990         block_group->iref = 0;
3991         block_group->inode = NULL;
3992         spin_unlock(&block_group->lock);
3993         ASSERT(block_group->io_ctl.inode == NULL);
3994         iput(inode);
3995         last = block_group->start + block_group->length;
3996         btrfs_put_block_group(block_group);
3997     }
3998 }
3999 
4000 /*
4001  * Must be called only after stopping all workers, since we could have block
4002  * group caching kthreads running, and therefore they could race with us if we
4003  * freed the block groups before stopping them.
4004  */
4005 int btrfs_free_block_groups(struct btrfs_fs_info *info)
4006 {
4007     struct btrfs_block_group *block_group;
4008     struct btrfs_space_info *space_info;
4009     struct btrfs_caching_control *caching_ctl;
4010     struct rb_node *n;
4011 
4012     write_lock(&info->block_group_cache_lock);
4013     while (!list_empty(&info->caching_block_groups)) {
4014         caching_ctl = list_entry(info->caching_block_groups.next,
4015                      struct btrfs_caching_control, list);
4016         list_del(&caching_ctl->list);
4017         btrfs_put_caching_control(caching_ctl);
4018     }
4019     write_unlock(&info->block_group_cache_lock);
4020 
4021     spin_lock(&info->unused_bgs_lock);
4022     while (!list_empty(&info->unused_bgs)) {
4023         block_group = list_first_entry(&info->unused_bgs,
4024                            struct btrfs_block_group,
4025                            bg_list);
4026         list_del_init(&block_group->bg_list);
4027         btrfs_put_block_group(block_group);
4028     }
4029 
4030     while (!list_empty(&info->reclaim_bgs)) {
4031         block_group = list_first_entry(&info->reclaim_bgs,
4032                            struct btrfs_block_group,
4033                            bg_list);
4034         list_del_init(&block_group->bg_list);
4035         btrfs_put_block_group(block_group);
4036     }
4037     spin_unlock(&info->unused_bgs_lock);
4038 
4039     spin_lock(&info->zone_active_bgs_lock);
4040     while (!list_empty(&info->zone_active_bgs)) {
4041         block_group = list_first_entry(&info->zone_active_bgs,
4042                            struct btrfs_block_group,
4043                            active_bg_list);
4044         list_del_init(&block_group->active_bg_list);
4045         btrfs_put_block_group(block_group);
4046     }
4047     spin_unlock(&info->zone_active_bgs_lock);
4048 
4049     write_lock(&info->block_group_cache_lock);
4050     while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
4051         block_group = rb_entry(n, struct btrfs_block_group,
4052                        cache_node);
4053         rb_erase_cached(&block_group->cache_node,
4054                 &info->block_group_cache_tree);
4055         RB_CLEAR_NODE(&block_group->cache_node);
4056         write_unlock(&info->block_group_cache_lock);
4057 
4058         down_write(&block_group->space_info->groups_sem);
4059         list_del(&block_group->list);
4060         up_write(&block_group->space_info->groups_sem);
4061 
4062         /*
4063          * We haven't cached this block group, which means we could
4064          * possibly have excluded extents on this block group.
4065          */
4066         if (block_group->cached == BTRFS_CACHE_NO ||
4067             block_group->cached == BTRFS_CACHE_ERROR)
4068             btrfs_free_excluded_extents(block_group);
4069 
4070         btrfs_remove_free_space_cache(block_group);
4071         ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
4072         ASSERT(list_empty(&block_group->dirty_list));
4073         ASSERT(list_empty(&block_group->io_list));
4074         ASSERT(list_empty(&block_group->bg_list));
4075         ASSERT(refcount_read(&block_group->refs) == 1);
4076         ASSERT(block_group->swap_extents == 0);
4077         btrfs_put_block_group(block_group);
4078 
4079         write_lock(&info->block_group_cache_lock);
4080     }
4081     write_unlock(&info->block_group_cache_lock);
4082 
4083     btrfs_release_global_block_rsv(info);
4084 
4085     while (!list_empty(&info->space_info)) {
4086         space_info = list_entry(info->space_info.next,
4087                     struct btrfs_space_info,
4088                     list);
4089 
4090         /*
4091          * Do not hide this behind enospc_debug; this is actually
4092          * important and indicates a real bug if it happens.
4093          */
4094         if (WARN_ON(space_info->bytes_pinned > 0 ||
4095                 space_info->bytes_may_use > 0))
4096             btrfs_dump_space_info(info, space_info, 0, 0);
4097 
4098         /*
4099          * If there was a failure to cleanup a log tree, very likely due
4100          * to an IO failure on a writeback attempt of one or more of its
4101          * extent buffers, we could not do proper (and cheap) unaccounting
4102          * of their reserved space, so don't warn on bytes_reserved > 0 in
4103          * that case.
4104          */
4105         if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
4106             !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
4107             if (WARN_ON(space_info->bytes_reserved > 0))
4108                 btrfs_dump_space_info(info, space_info, 0, 0);
4109         }
4110 
4111         WARN_ON(space_info->reclaim_size > 0);
4112         list_del(&space_info->list);
4113         btrfs_sysfs_remove_space_info(space_info);
4114     }
4115     return 0;
4116 }
4117 
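/*
 * Freezing a block group keeps its extent mapping (and therefore the chunk)
 * alive while another task, e.g. one trimming the block group, is still using
 * it. The matching btrfs_unfreeze_block_group() drops the count and, if the
 * block group was removed in the meantime and this was the last reference,
 * performs the deferred cleanup of the extent map and free space entries.
 */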
4118 void btrfs_freeze_block_group(struct btrfs_block_group *cache)
4119 {
4120     atomic_inc(&cache->frozen);
4121 }
4122 
4123 void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
4124 {
4125     struct btrfs_fs_info *fs_info = block_group->fs_info;
4126     struct extent_map_tree *em_tree;
4127     struct extent_map *em;
4128     bool cleanup;
4129 
4130     spin_lock(&block_group->lock);
4131     cleanup = (atomic_dec_and_test(&block_group->frozen) &&
4132            block_group->removed);
4133     spin_unlock(&block_group->lock);
4134 
4135     if (cleanup) {
4136         em_tree = &fs_info->mapping_tree;
4137         write_lock(&em_tree->lock);
4138         em = lookup_extent_mapping(em_tree, block_group->start,
4139                        1);
4140         BUG_ON(!em); /* logic error, can't happen */
4141         remove_extent_mapping(em_tree, em);
4142         write_unlock(&em_tree->lock);
4143 
4144         /* once for us and once for the tree */
4145         free_extent_map(em);
4146         free_extent_map(em);
4147 
4148         /*
4149          * We may have left one free space entry, and other tasks that were
4150          * trimming this block group may have left one entry each.
4151          * Free them if any.
4152          */
4153         __btrfs_remove_free_space_cache(block_group->free_space_ctl);
4154     }
4155 }
4156 
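/*
 * The swap_extents counter tracks how many active swapfile extents are backed
 * by this block group. Taking a reference fails if the block group is already
 * read-only, and a non-zero count is meant to keep the block group from being
 * turned read-only (and relocated) while those extents are in use.
 */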
4157 bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
4158 {
4159     bool ret = true;
4160 
4161     spin_lock(&bg->lock);
4162     if (bg->ro)
4163         ret = false;
4164     else
4165         bg->swap_extents++;
4166     spin_unlock(&bg->lock);
4167 
4168     return ret;
4169 }
4170 
4171 void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
4172 {
4173     spin_lock(&bg->lock);
4174     ASSERT(!bg->ro);
4175     ASSERT(bg->swap_extents >= amount);
4176     bg->swap_extents -= amount;
4177     spin_unlock(&bg->lock);
4178 }