0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Copyright (C) 2008 Red Hat.  All rights reserved.
0004  */
0005 
0006 #include <linux/pagemap.h>
0007 #include <linux/sched.h>
0008 #include <linux/sched/signal.h>
0009 #include <linux/slab.h>
0010 #include <linux/math64.h>
0011 #include <linux/ratelimit.h>
0012 #include <linux/error-injection.h>
0013 #include <linux/sched/mm.h>
0014 #include "misc.h"
0015 #include "ctree.h"
0016 #include "free-space-cache.h"
0017 #include "transaction.h"
0018 #include "disk-io.h"
0019 #include "extent_io.h"
0020 #include "volumes.h"
0021 #include "space-info.h"
0022 #include "delalloc-space.h"
0023 #include "block-group.h"
0024 #include "discard.h"
0025 #include "subpage.h"
0026 #include "inode-item.h"
0027 
0028 #define BITS_PER_BITMAP     (PAGE_SIZE * 8UL)
0029 #define MAX_CACHE_BYTES_PER_GIG SZ_64K
0030 #define FORCE_EXTENT_THRESHOLD  SZ_1M
0031 
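/*
 * As a rough guide (assuming 4KiB pages and a 4KiB sectorsize, one common
 * configuration): BITS_PER_BITMAP = 4096 * 8 = 32768 bits, and each bit
 * covers one ctl->unit (sectorsize) worth of space, so a single bitmap
 * entry describes 32768 * 4KiB = 128MiB of a block group.
 */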
0032 struct btrfs_trim_range {
0033     u64 start;
0034     u64 bytes;
0035     struct list_head list;
0036 };
0037 
0038 static int link_free_space(struct btrfs_free_space_ctl *ctl,
0039                struct btrfs_free_space *info);
0040 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
0041                   struct btrfs_free_space *info, bool update_stat);
0042 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
0043              struct btrfs_free_space *bitmap_info, u64 *offset,
0044              u64 *bytes, bool for_alloc);
0045 static void free_bitmap(struct btrfs_free_space_ctl *ctl,
0046             struct btrfs_free_space *bitmap_info);
0047 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
0048                   struct btrfs_free_space *info, u64 offset,
0049                   u64 bytes, bool update_stats);
0050 
0051 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
0052                            struct btrfs_path *path,
0053                            u64 offset)
0054 {
0055     struct btrfs_fs_info *fs_info = root->fs_info;
0056     struct btrfs_key key;
0057     struct btrfs_key location;
0058     struct btrfs_disk_key disk_key;
0059     struct btrfs_free_space_header *header;
0060     struct extent_buffer *leaf;
0061     struct inode *inode = NULL;
0062     unsigned nofs_flag;
0063     int ret;
0064 
0065     key.objectid = BTRFS_FREE_SPACE_OBJECTID;
0066     key.offset = offset;
0067     key.type = 0;
0068 
0069     ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
0070     if (ret < 0)
0071         return ERR_PTR(ret);
0072     if (ret > 0) {
0073         btrfs_release_path(path);
0074         return ERR_PTR(-ENOENT);
0075     }
0076 
0077     leaf = path->nodes[0];
0078     header = btrfs_item_ptr(leaf, path->slots[0],
0079                 struct btrfs_free_space_header);
0080     btrfs_free_space_key(leaf, header, &disk_key);
0081     btrfs_disk_key_to_cpu(&location, &disk_key);
0082     btrfs_release_path(path);
0083 
0084     /*
0085      * We are often under a trans handle at this point, so we need to make
0086      * sure NOFS is set to keep us from deadlocking.
0087      */
0088     nofs_flag = memalloc_nofs_save();
0089     inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
0090     btrfs_release_path(path);
0091     memalloc_nofs_restore(nofs_flag);
0092     if (IS_ERR(inode))
0093         return inode;
0094 
0095     mapping_set_gfp_mask(inode->i_mapping,
0096             mapping_gfp_constraint(inode->i_mapping,
0097             ~(__GFP_FS | __GFP_HIGHMEM)));
0098 
0099     return inode;
0100 }
0101 
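/*
 * lookup_free_space_inode() first tries to reuse the inode reference
 * cached on the block group; otherwise it looks the inode up in the tree
 * root by the block group's start offset.  Old style inodes that lack
 * NODATASUM/NODATACOW are converted on the fly and the on-disk cache is
 * marked BTRFS_DC_CLEAR so it gets rewritten.
 */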
0102 struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
0103         struct btrfs_path *path)
0104 {
0105     struct btrfs_fs_info *fs_info = block_group->fs_info;
0106     struct inode *inode = NULL;
0107     u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
0108 
0109     spin_lock(&block_group->lock);
0110     if (block_group->inode)
0111         inode = igrab(block_group->inode);
0112     spin_unlock(&block_group->lock);
0113     if (inode)
0114         return inode;
0115 
0116     inode = __lookup_free_space_inode(fs_info->tree_root, path,
0117                       block_group->start);
0118     if (IS_ERR(inode))
0119         return inode;
0120 
0121     spin_lock(&block_group->lock);
0122     if (!((BTRFS_I(inode)->flags & flags) == flags)) {
0123         btrfs_info(fs_info, "Old style space inode found, converting.");
0124         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
0125             BTRFS_INODE_NODATACOW;
0126         block_group->disk_cache_state = BTRFS_DC_CLEAR;
0127     }
0128 
0129     if (!block_group->iref) {
0130         block_group->inode = igrab(inode);
0131         block_group->iref = 1;
0132     }
0133     spin_unlock(&block_group->lock);
0134 
0135     return inode;
0136 }
0137 
0138 static int __create_free_space_inode(struct btrfs_root *root,
0139                      struct btrfs_trans_handle *trans,
0140                      struct btrfs_path *path,
0141                      u64 ino, u64 offset)
0142 {
0143     struct btrfs_key key;
0144     struct btrfs_disk_key disk_key;
0145     struct btrfs_free_space_header *header;
0146     struct btrfs_inode_item *inode_item;
0147     struct extent_buffer *leaf;
0148     /* We inline CRCs for the free disk space cache */
0149     const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
0150               BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
0151     int ret;
0152 
0153     ret = btrfs_insert_empty_inode(trans, root, path, ino);
0154     if (ret)
0155         return ret;
0156 
0157     leaf = path->nodes[0];
0158     inode_item = btrfs_item_ptr(leaf, path->slots[0],
0159                     struct btrfs_inode_item);
0160     btrfs_item_key(leaf, &disk_key, path->slots[0]);
0161     memzero_extent_buffer(leaf, (unsigned long)inode_item,
0162                  sizeof(*inode_item));
0163     btrfs_set_inode_generation(leaf, inode_item, trans->transid);
0164     btrfs_set_inode_size(leaf, inode_item, 0);
0165     btrfs_set_inode_nbytes(leaf, inode_item, 0);
0166     btrfs_set_inode_uid(leaf, inode_item, 0);
0167     btrfs_set_inode_gid(leaf, inode_item, 0);
0168     btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
0169     btrfs_set_inode_flags(leaf, inode_item, flags);
0170     btrfs_set_inode_nlink(leaf, inode_item, 1);
0171     btrfs_set_inode_transid(leaf, inode_item, trans->transid);
0172     btrfs_set_inode_block_group(leaf, inode_item, offset);
0173     btrfs_mark_buffer_dirty(leaf);
0174     btrfs_release_path(path);
0175 
0176     key.objectid = BTRFS_FREE_SPACE_OBJECTID;
0177     key.offset = offset;
0178     key.type = 0;
0179     ret = btrfs_insert_empty_item(trans, root, path, &key,
0180                       sizeof(struct btrfs_free_space_header));
0181     if (ret < 0) {
0182         btrfs_release_path(path);
0183         return ret;
0184     }
0185 
0186     leaf = path->nodes[0];
0187     header = btrfs_item_ptr(leaf, path->slots[0],
0188                 struct btrfs_free_space_header);
0189     memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
0190     btrfs_set_free_space_key(leaf, header, &disk_key);
0191     btrfs_mark_buffer_dirty(leaf);
0192     btrfs_release_path(path);
0193 
0194     return 0;
0195 }
0196 
0197 int create_free_space_inode(struct btrfs_trans_handle *trans,
0198                 struct btrfs_block_group *block_group,
0199                 struct btrfs_path *path)
0200 {
0201     int ret;
0202     u64 ino;
0203 
0204     ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
0205     if (ret < 0)
0206         return ret;
0207 
0208     return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
0209                      ino, block_group->start);
0210 }
0211 
0212 /*
0213  * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
0214  * handles lookup, otherwise it takes ownership and iputs the inode.
0215  * Don't reuse an inode pointer after passing it into this function.
0216  */
0217 int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
0218                   struct inode *inode,
0219                   struct btrfs_block_group *block_group)
0220 {
0221     struct btrfs_path *path;
0222     struct btrfs_key key;
0223     int ret = 0;
0224 
0225     path = btrfs_alloc_path();
0226     if (!path)
0227         return -ENOMEM;
0228 
0229     if (!inode)
0230         inode = lookup_free_space_inode(block_group, path);
0231     if (IS_ERR(inode)) {
0232         if (PTR_ERR(inode) != -ENOENT)
0233             ret = PTR_ERR(inode);
0234         goto out;
0235     }
0236     ret = btrfs_orphan_add(trans, BTRFS_I(inode));
0237     if (ret) {
0238         btrfs_add_delayed_iput(inode);
0239         goto out;
0240     }
0241     clear_nlink(inode);
0242     /* One for the block group's ref */
0243     spin_lock(&block_group->lock);
0244     if (block_group->iref) {
0245         block_group->iref = 0;
0246         block_group->inode = NULL;
0247         spin_unlock(&block_group->lock);
0248         iput(inode);
0249     } else {
0250         spin_unlock(&block_group->lock);
0251     }
0252     /* One for the lookup ref */
0253     btrfs_add_delayed_iput(inode);
0254 
0255     key.objectid = BTRFS_FREE_SPACE_OBJECTID;
0256     key.type = 0;
0257     key.offset = block_group->start;
0258     ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
0259                 -1, 1);
0260     if (ret) {
0261         if (ret > 0)
0262             ret = 0;
0263         goto out;
0264     }
0265     ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
0266 out:
0267     btrfs_free_path(path);
0268     return ret;
0269 }
0270 
0271 int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
0272                        struct btrfs_block_rsv *rsv)
0273 {
0274     u64 needed_bytes;
0275     int ret;
0276 
0277     /* 1 for slack space, 1 for updating the inode */
0278     needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
0279         btrfs_calc_metadata_size(fs_info, 1);
0280 
0281     spin_lock(&rsv->lock);
0282     if (rsv->reserved < needed_bytes)
0283         ret = -ENOSPC;
0284     else
0285         ret = 0;
0286     spin_unlock(&rsv->lock);
0287     return ret;
0288 }
0289 
0290 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
0291                     struct btrfs_block_group *block_group,
0292                     struct inode *vfs_inode)
0293 {
0294     struct btrfs_truncate_control control = {
0295         .inode = BTRFS_I(vfs_inode),
0296         .new_size = 0,
0297         .ino = btrfs_ino(BTRFS_I(vfs_inode)),
0298         .min_type = BTRFS_EXTENT_DATA_KEY,
0299         .clear_extent_range = true,
0300     };
0301     struct btrfs_inode *inode = BTRFS_I(vfs_inode);
0302     struct btrfs_root *root = inode->root;
0303     struct extent_state *cached_state = NULL;
0304     int ret = 0;
0305     bool locked = false;
0306 
0307     if (block_group) {
0308         struct btrfs_path *path = btrfs_alloc_path();
0309 
0310         if (!path) {
0311             ret = -ENOMEM;
0312             goto fail;
0313         }
0314         locked = true;
0315         mutex_lock(&trans->transaction->cache_write_mutex);
0316         if (!list_empty(&block_group->io_list)) {
0317             list_del_init(&block_group->io_list);
0318 
0319             btrfs_wait_cache_io(trans, block_group, path);
0320             btrfs_put_block_group(block_group);
0321         }
0322 
0323         /*
0324          * Now that we've truncated the cache away, it's no longer
0325          * set up or written.
0326          */
0327         spin_lock(&block_group->lock);
0328         block_group->disk_cache_state = BTRFS_DC_CLEAR;
0329         spin_unlock(&block_group->lock);
0330         btrfs_free_path(path);
0331     }
0332 
0333     btrfs_i_size_write(inode, 0);
0334     truncate_pagecache(vfs_inode, 0);
0335 
0336     lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state);
0337     btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
0338 
0339     /*
0340      * We skip the throttling logic for free space cache inodes, so we don't
0341      * need to check for -EAGAIN.
0342      */
0343     ret = btrfs_truncate_inode_items(trans, root, &control);
0344 
0345     inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
0346     btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
0347 
0348     unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state);
0349     if (ret)
0350         goto fail;
0351 
0352     ret = btrfs_update_inode(trans, root, inode);
0353 
0354 fail:
0355     if (locked)
0356         mutex_unlock(&trans->transaction->cache_write_mutex);
0357     if (ret)
0358         btrfs_abort_transaction(trans, ret);
0359 
0360     return ret;
0361 }
0362 
0363 static void readahead_cache(struct inode *inode)
0364 {
0365     struct file_ra_state ra;
0366     unsigned long last_index;
0367 
0368     file_ra_state_init(&ra, inode->i_mapping);
0369     last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
0370 
0371     page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
0372 }
0373 
0374 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
0375                int write)
0376 {
0377     int num_pages;
0378 
0379     num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
0380 
0381     /* Make sure we can fit our crcs and generation into the first page */
0382     if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
0383         return -ENOSPC;
0384 
0385     memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
0386 
0387     io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
0388     if (!io_ctl->pages)
0389         return -ENOMEM;
0390 
0391     io_ctl->num_pages = num_pages;
0392     io_ctl->fs_info = btrfs_sb(inode->i_sb);
0393     io_ctl->inode = inode;
0394 
0395     return 0;
0396 }
0397 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
0398 
0399 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
0400 {
0401     kfree(io_ctl->pages);
0402     io_ctl->pages = NULL;
0403 }
0404 
0405 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
0406 {
0407     if (io_ctl->cur) {
0408         io_ctl->cur = NULL;
0409         io_ctl->orig = NULL;
0410     }
0411 }
0412 
0413 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
0414 {
0415     ASSERT(io_ctl->index < io_ctl->num_pages);
0416     io_ctl->page = io_ctl->pages[io_ctl->index++];
0417     io_ctl->cur = page_address(io_ctl->page);
0418     io_ctl->orig = io_ctl->cur;
0419     io_ctl->size = PAGE_SIZE;
0420     if (clear)
0421         clear_page(io_ctl->cur);
0422 }
0423 
0424 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
0425 {
0426     int i;
0427 
0428     io_ctl_unmap_page(io_ctl);
0429 
0430     for (i = 0; i < io_ctl->num_pages; i++) {
0431         if (io_ctl->pages[i]) {
0432             btrfs_page_clear_checked(io_ctl->fs_info,
0433                     io_ctl->pages[i],
0434                     page_offset(io_ctl->pages[i]),
0435                     PAGE_SIZE);
0436             unlock_page(io_ctl->pages[i]);
0437             put_page(io_ctl->pages[i]);
0438         }
0439     }
0440 }
0441 
0442 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
0443 {
0444     struct page *page;
0445     struct inode *inode = io_ctl->inode;
0446     gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
0447     int i;
0448 
0449     for (i = 0; i < io_ctl->num_pages; i++) {
0450         int ret;
0451 
0452         page = find_or_create_page(inode->i_mapping, i, mask);
0453         if (!page) {
0454             io_ctl_drop_pages(io_ctl);
0455             return -ENOMEM;
0456         }
0457 
0458         ret = set_page_extent_mapped(page);
0459         if (ret < 0) {
0460             unlock_page(page);
0461             put_page(page);
0462             io_ctl_drop_pages(io_ctl);
0463             return ret;
0464         }
0465 
0466         io_ctl->pages[i] = page;
0467         if (uptodate && !PageUptodate(page)) {
0468             btrfs_read_folio(NULL, page_folio(page));
0469             lock_page(page);
0470             if (page->mapping != inode->i_mapping) {
0471                 btrfs_err(BTRFS_I(inode)->root->fs_info,
0472                       "free space cache page truncated");
0473                 io_ctl_drop_pages(io_ctl);
0474                 return -EIO;
0475             }
0476             if (!PageUptodate(page)) {
0477                 btrfs_err(BTRFS_I(inode)->root->fs_info,
0478                        "error reading free space cache");
0479                 io_ctl_drop_pages(io_ctl);
0480                 return -EIO;
0481             }
0482         }
0483     }
0484 
0485     for (i = 0; i < io_ctl->num_pages; i++)
0486         clear_page_dirty_for_io(io_ctl->pages[i]);
0487 
0488     return 0;
0489 }
0490 
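/*
 * On-disk layout of the cache file, as implied by the helpers below: the
 * first page begins with one u32 crc32c per page followed by a u64
 * generation, then packed btrfs_free_space_entry records; each remaining
 * page holds either more packed entries or one full page-sized bitmap.
 */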
0491 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
0492 {
0493     io_ctl_map_page(io_ctl, 1);
0494 
0495     /*
0496      * Skip the csum areas.  If we don't check crcs then we just have a
0497      * 64bit chunk at the front of the first page.
0498      */
0499     io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
0500     io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
0501 
0502     put_unaligned_le64(generation, io_ctl->cur);
0503     io_ctl->cur += sizeof(u64);
0504 }
0505 
0506 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
0507 {
0508     u64 cache_gen;
0509 
0510     /*
0511      * Skip the crc area.  If we don't check crcs then we just have a 64bit
0512      * chunk at the front of the first page.
0513      */
0514     io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
0515     io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
0516 
0517     cache_gen = get_unaligned_le64(io_ctl->cur);
0518     if (cache_gen != generation) {
0519         btrfs_err_rl(io_ctl->fs_info,
0520             "space cache generation (%llu) does not match inode (%llu)",
0521                 cache_gen, generation);
0522         io_ctl_unmap_page(io_ctl);
0523         return -EIO;
0524     }
0525     io_ctl->cur += sizeof(u64);
0526     return 0;
0527 }
0528 
0529 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
0530 {
0531     u32 *tmp;
0532     u32 crc = ~(u32)0;
0533     unsigned offset = 0;
0534 
0535     if (index == 0)
0536         offset = sizeof(u32) * io_ctl->num_pages;
0537 
0538     crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
0539     btrfs_crc32c_final(crc, (u8 *)&crc);
0540     io_ctl_unmap_page(io_ctl);
0541     tmp = page_address(io_ctl->pages[0]);
0542     tmp += index;
0543     *tmp = crc;
0544 }
0545 
0546 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
0547 {
0548     u32 *tmp, val;
0549     u32 crc = ~(u32)0;
0550     unsigned offset = 0;
0551 
0552     if (index == 0)
0553         offset = sizeof(u32) * io_ctl->num_pages;
0554 
0555     tmp = page_address(io_ctl->pages[0]);
0556     tmp += index;
0557     val = *tmp;
0558 
0559     io_ctl_map_page(io_ctl, 0);
0560     crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
0561     btrfs_crc32c_final(crc, (u8 *)&crc);
0562     if (val != crc) {
0563         btrfs_err_rl(io_ctl->fs_info,
0564             "csum mismatch on free space cache");
0565         io_ctl_unmap_page(io_ctl);
0566         return -EIO;
0567     }
0568 
0569     return 0;
0570 }
0571 
0572 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
0573                 void *bitmap)
0574 {
0575     struct btrfs_free_space_entry *entry;
0576 
0577     if (!io_ctl->cur)
0578         return -ENOSPC;
0579 
0580     entry = io_ctl->cur;
0581     put_unaligned_le64(offset, &entry->offset);
0582     put_unaligned_le64(bytes, &entry->bytes);
0583     entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
0584         BTRFS_FREE_SPACE_EXTENT;
0585     io_ctl->cur += sizeof(struct btrfs_free_space_entry);
0586     io_ctl->size -= sizeof(struct btrfs_free_space_entry);
0587 
0588     if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
0589         return 0;
0590 
0591     io_ctl_set_crc(io_ctl, io_ctl->index - 1);
0592 
0593     /* No more pages to map */
0594     if (io_ctl->index >= io_ctl->num_pages)
0595         return 0;
0596 
0597     /* Map the next page */
0598     io_ctl_map_page(io_ctl, 1);
0599     return 0;
0600 }
0601 
0602 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
0603 {
0604     if (!io_ctl->cur)
0605         return -ENOSPC;
0606 
0607     /*
0608      * If we aren't at the start of the current page, unmap this one and
0609      * map the next one if there is any left.
0610      */
0611     if (io_ctl->cur != io_ctl->orig) {
0612         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
0613         if (io_ctl->index >= io_ctl->num_pages)
0614             return -ENOSPC;
0615         io_ctl_map_page(io_ctl, 0);
0616     }
0617 
0618     copy_page(io_ctl->cur, bitmap);
0619     io_ctl_set_crc(io_ctl, io_ctl->index - 1);
0620     if (io_ctl->index < io_ctl->num_pages)
0621         io_ctl_map_page(io_ctl, 0);
0622     return 0;
0623 }
0624 
0625 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
0626 {
0627     /*
0628      * If we're not on the boundary we know we've modified the page and we
0629      * need to crc the page.
0630      */
0631     if (io_ctl->cur != io_ctl->orig)
0632         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
0633     else
0634         io_ctl_unmap_page(io_ctl);
0635 
0636     while (io_ctl->index < io_ctl->num_pages) {
0637         io_ctl_map_page(io_ctl, 1);
0638         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
0639     }
0640 }
0641 
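/*
 * The read side mirrors the writers above: io_ctl_read_entry() verifies a
 * page's crc the first time the page is needed and then unpacks one
 * on-disk entry, while io_ctl_read_bitmap() verifies and copies an entire
 * bitmap page.
 */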
0642 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
0643                 struct btrfs_free_space *entry, u8 *type)
0644 {
0645     struct btrfs_free_space_entry *e;
0646     int ret;
0647 
0648     if (!io_ctl->cur) {
0649         ret = io_ctl_check_crc(io_ctl, io_ctl->index);
0650         if (ret)
0651             return ret;
0652     }
0653 
0654     e = io_ctl->cur;
0655     entry->offset = get_unaligned_le64(&e->offset);
0656     entry->bytes = get_unaligned_le64(&e->bytes);
0657     *type = e->type;
0658     io_ctl->cur += sizeof(struct btrfs_free_space_entry);
0659     io_ctl->size -= sizeof(struct btrfs_free_space_entry);
0660 
0661     if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
0662         return 0;
0663 
0664     io_ctl_unmap_page(io_ctl);
0665 
0666     return 0;
0667 }
0668 
0669 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
0670                   struct btrfs_free_space *entry)
0671 {
0672     int ret;
0673 
0674     ret = io_ctl_check_crc(io_ctl, io_ctl->index);
0675     if (ret)
0676         return ret;
0677 
0678     copy_page(entry->bitmap, io_ctl->cur);
0679     io_ctl_unmap_page(io_ctl);
0680 
0681     return 0;
0682 }
0683 
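/*
 * Worked example for the threshold math below (assuming a 1GiB block
 * group, a 4KiB unit and no bitmaps in use yet): max_bytes ends up as
 * MAX_CACHE_BYTES_PER_GIG = 64KiB, bitmap_bytes = 0, extent_bytes =
 * min(64KiB, 64KiB / 2) = 32KiB, and extents_thresh becomes
 * 32KiB / sizeof(struct btrfs_free_space).
 */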
0684 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
0685 {
0686     struct btrfs_block_group *block_group = ctl->block_group;
0687     u64 max_bytes;
0688     u64 bitmap_bytes;
0689     u64 extent_bytes;
0690     u64 size = block_group->length;
0691     u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
0692     u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
0693 
0694     max_bitmaps = max_t(u64, max_bitmaps, 1);
0695 
0696     ASSERT(ctl->total_bitmaps <= max_bitmaps);
0697 
0698     /*
0699      * We are trying to keep the total amount of memory used per 1GiB of
0700      * space at MAX_CACHE_BYTES_PER_GIG.  However, with a reclamation
0701      * mechanism that pulls extents >= FORCE_EXTENT_THRESHOLD out of
0702      * bitmaps, we may end up using more memory than this.
0703      */
0704     if (size < SZ_1G)
0705         max_bytes = MAX_CACHE_BYTES_PER_GIG;
0706     else
0707         max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
0708 
0709     bitmap_bytes = ctl->total_bitmaps * ctl->unit;
0710 
0711     /*
0712      * Give the extent entries whatever is left over after the bitmaps,
0713      * capped at half of max_bytes.
0714      */
0715     extent_bytes = max_bytes - bitmap_bytes;
0716     extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
0717 
0718     ctl->extents_thresh =
0719         div_u64(extent_bytes, sizeof(struct btrfs_free_space));
0720 }
0721 
0722 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
0723                    struct btrfs_free_space_ctl *ctl,
0724                    struct btrfs_path *path, u64 offset)
0725 {
0726     struct btrfs_fs_info *fs_info = root->fs_info;
0727     struct btrfs_free_space_header *header;
0728     struct extent_buffer *leaf;
0729     struct btrfs_io_ctl io_ctl;
0730     struct btrfs_key key;
0731     struct btrfs_free_space *e, *n;
0732     LIST_HEAD(bitmaps);
0733     u64 num_entries;
0734     u64 num_bitmaps;
0735     u64 generation;
0736     u8 type;
0737     int ret = 0;
0738 
0739     /* Nothing in the space cache, goodbye */
0740     if (!i_size_read(inode))
0741         return 0;
0742 
0743     key.objectid = BTRFS_FREE_SPACE_OBJECTID;
0744     key.offset = offset;
0745     key.type = 0;
0746 
0747     ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
0748     if (ret < 0)
0749         return 0;
0750     else if (ret > 0) {
0751         btrfs_release_path(path);
0752         return 0;
0753     }
0754 
0755     ret = -1;
0756 
0757     leaf = path->nodes[0];
0758     header = btrfs_item_ptr(leaf, path->slots[0],
0759                 struct btrfs_free_space_header);
0760     num_entries = btrfs_free_space_entries(leaf, header);
0761     num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
0762     generation = btrfs_free_space_generation(leaf, header);
0763     btrfs_release_path(path);
0764 
0765     if (!BTRFS_I(inode)->generation) {
0766         btrfs_info(fs_info,
0767                "the free space cache file (%llu) is invalid, skip it",
0768                offset);
0769         return 0;
0770     }
0771 
0772     if (BTRFS_I(inode)->generation != generation) {
0773         btrfs_err(fs_info,
0774               "free space inode generation (%llu) did not match free space cache generation (%llu)",
0775               BTRFS_I(inode)->generation, generation);
0776         return 0;
0777     }
0778 
0779     if (!num_entries)
0780         return 0;
0781 
0782     ret = io_ctl_init(&io_ctl, inode, 0);
0783     if (ret)
0784         return ret;
0785 
0786     readahead_cache(inode);
0787 
0788     ret = io_ctl_prepare_pages(&io_ctl, true);
0789     if (ret)
0790         goto out;
0791 
0792     ret = io_ctl_check_crc(&io_ctl, 0);
0793     if (ret)
0794         goto free_cache;
0795 
0796     ret = io_ctl_check_generation(&io_ctl, generation);
0797     if (ret)
0798         goto free_cache;
0799 
0800     while (num_entries) {
0801         e = kmem_cache_zalloc(btrfs_free_space_cachep,
0802                       GFP_NOFS);
0803         if (!e) {
0804             ret = -ENOMEM;
0805             goto free_cache;
0806         }
0807 
0808         ret = io_ctl_read_entry(&io_ctl, e, &type);
0809         if (ret) {
0810             kmem_cache_free(btrfs_free_space_cachep, e);
0811             goto free_cache;
0812         }
0813 
0814         if (!e->bytes) {
0815             ret = -1;
0816             kmem_cache_free(btrfs_free_space_cachep, e);
0817             goto free_cache;
0818         }
0819 
0820         if (type == BTRFS_FREE_SPACE_EXTENT) {
0821             spin_lock(&ctl->tree_lock);
0822             ret = link_free_space(ctl, e);
0823             spin_unlock(&ctl->tree_lock);
0824             if (ret) {
0825                 btrfs_err(fs_info,
0826                     "Duplicate entries in free space cache, dumping");
0827                 kmem_cache_free(btrfs_free_space_cachep, e);
0828                 goto free_cache;
0829             }
0830         } else {
0831             ASSERT(num_bitmaps);
0832             num_bitmaps--;
0833             e->bitmap = kmem_cache_zalloc(
0834                     btrfs_free_space_bitmap_cachep, GFP_NOFS);
0835             if (!e->bitmap) {
0836                 ret = -ENOMEM;
0837                 kmem_cache_free(
0838                     btrfs_free_space_cachep, e);
0839                 goto free_cache;
0840             }
0841             spin_lock(&ctl->tree_lock);
0842             ret = link_free_space(ctl, e);
0843             ctl->total_bitmaps++;
0844             recalculate_thresholds(ctl);
0845             spin_unlock(&ctl->tree_lock);
0846             if (ret) {
0847                 btrfs_err(fs_info,
0848                     "Duplicate entries in free space cache, dumping");
0849                 kmem_cache_free(btrfs_free_space_cachep, e);
0850                 goto free_cache;
0851             }
0852             list_add_tail(&e->list, &bitmaps);
0853         }
0854 
0855         num_entries--;
0856     }
0857 
0858     io_ctl_unmap_page(&io_ctl);
0859 
0860     /*
0861      * The bitmap payloads are stored after all of the entries in the
0862      * cache file, so read them in now that every entry has been added.
0863      */
0864     list_for_each_entry_safe(e, n, &bitmaps, list) {
0865         list_del_init(&e->list);
0866         ret = io_ctl_read_bitmap(&io_ctl, e);
0867         if (ret)
0868             goto free_cache;
0869     }
0870 
0871     io_ctl_drop_pages(&io_ctl);
0872     ret = 1;
0873 out:
0874     io_ctl_free(&io_ctl);
0875     return ret;
0876 free_cache:
0877     io_ctl_drop_pages(&io_ctl);
0878     __btrfs_remove_free_space_cache(ctl);
0879     goto out;
0880 }
0881 
0882 static int copy_free_space_cache(struct btrfs_block_group *block_group,
0883                  struct btrfs_free_space_ctl *ctl)
0884 {
0885     struct btrfs_free_space *info;
0886     struct rb_node *n;
0887     int ret = 0;
0888 
0889     while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
0890         info = rb_entry(n, struct btrfs_free_space, offset_index);
0891         if (!info->bitmap) {
0892             unlink_free_space(ctl, info, true);
0893             ret = btrfs_add_free_space(block_group, info->offset,
0894                            info->bytes);
0895             kmem_cache_free(btrfs_free_space_cachep, info);
0896         } else {
0897             u64 offset = info->offset;
0898             u64 bytes = ctl->unit;
0899 
0900             while (search_bitmap(ctl, info, &offset, &bytes,
0901                          false) == 0) {
0902                 ret = btrfs_add_free_space(block_group, offset,
0903                                bytes);
0904                 if (ret)
0905                     break;
0906                 bitmap_clear_bits(ctl, info, offset, bytes, true);
0907                 offset = info->offset;
0908                 bytes = ctl->unit;
0909             }
0910             free_bitmap(ctl, info);
0911         }
0912         cond_resched();
0913     }
0914     return ret;
0915 }
0916 
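/*
 * Load sequence used below: read the on-disk cache into a temporary ctl,
 * check that its free space total matches length - used - bytes_super,
 * and only then copy it into the block group's real ctl.  A mismatch or
 * a load error marks the disk cache BTRFS_DC_CLEAR so it gets rebuilt.
 */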
0917 int load_free_space_cache(struct btrfs_block_group *block_group)
0918 {
0919     struct btrfs_fs_info *fs_info = block_group->fs_info;
0920     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
0921     struct btrfs_free_space_ctl tmp_ctl = {};
0922     struct inode *inode;
0923     struct btrfs_path *path;
0924     int ret = 0;
0925     bool matched;
0926     u64 used = block_group->used;
0927 
0928     /*
0929      * Because we could potentially discard our loaded free space, we want
0930      * to load everything into a temporary structure first, and then if it's
0931      * valid copy it all into the actual free space ctl.
0932      */
0933     btrfs_init_free_space_ctl(block_group, &tmp_ctl);
0934 
0935     /*
0936      * If this block group has been marked to be cleared for one reason or
0937      * another then we can't trust the on disk cache, so just return.
0938      */
0939     spin_lock(&block_group->lock);
0940     if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
0941         spin_unlock(&block_group->lock);
0942         return 0;
0943     }
0944     spin_unlock(&block_group->lock);
0945 
0946     path = btrfs_alloc_path();
0947     if (!path)
0948         return 0;
0949     path->search_commit_root = 1;
0950     path->skip_locking = 1;
0951 
0952     /*
0953      * We must pass a path with search_commit_root set to btrfs_iget in
0954      * order to avoid a deadlock when allocating extents for the tree root.
0955      *
0956      * When we are COWing an extent buffer from the tree root, when looking
0957      * for a free extent, at extent-tree.c:find_free_extent(), we can find a
0958      * block group without its free space cache loaded. When we find one,
0959      * we must load its space cache, which requires reading its free space
0960      * cache's inode item from the root tree. If this inode item is located
0961      * in the same leaf that we started COWing before, then we end up in a
0962      * deadlock on the extent buffer (trying to read lock it when we
0963      * previously write locked it).
0964      *
0965      * It's safe to read the inode item using the commit root because
0966      * block groups, once loaded, stay in memory forever (until they are
0967      * removed) as well as their space caches once loaded. New block groups
0968      * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
0969      * we will never try to read their inode item while the fs is mounted.
0970      */
0971     inode = lookup_free_space_inode(block_group, path);
0972     if (IS_ERR(inode)) {
0973         btrfs_free_path(path);
0974         return 0;
0975     }
0976 
0977     /* We may have converted the inode and made the cache invalid. */
0978     spin_lock(&block_group->lock);
0979     if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
0980         spin_unlock(&block_group->lock);
0981         btrfs_free_path(path);
0982         goto out;
0983     }
0984     spin_unlock(&block_group->lock);
0985 
0986     ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
0987                       path, block_group->start);
0988     btrfs_free_path(path);
0989     if (ret <= 0)
0990         goto out;
0991 
0992     matched = (tmp_ctl.free_space == (block_group->length - used -
0993                       block_group->bytes_super));
0994 
0995     if (matched) {
0996         ret = copy_free_space_cache(block_group, &tmp_ctl);
0997         /*
0998          * copy_free_space_cache() returns 0 on success, but ret == 1 is
0999          * what signals a successfully loaded cache, so re-set it here.
1000          */
1001         if (ret == 0)
1002             ret = 1;
1003     } else {
1004         __btrfs_remove_free_space_cache(&tmp_ctl);
1005         btrfs_warn(fs_info,
1006                "block group %llu has wrong amount of free space",
1007                block_group->start);
1008         ret = -1;
1009     }
1010 out:
1011     if (ret < 0) {
1012         /* This cache is bogus, make sure it gets cleared */
1013         spin_lock(&block_group->lock);
1014         block_group->disk_cache_state = BTRFS_DC_CLEAR;
1015         spin_unlock(&block_group->lock);
1016         ret = 0;
1017 
1018         btrfs_warn(fs_info,
1019                "failed to load free space cache for block group %llu, rebuilding it now",
1020                block_group->start);
1021     }
1022 
1023     spin_lock(&ctl->tree_lock);
1024     btrfs_discard_update_discardable(block_group);
1025     spin_unlock(&ctl->tree_lock);
1026     iput(inode);
1027     return ret;
1028 }
1029 
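/*
 * The writer below emits extent entries (including entries that live in
 * the block group's allocation cluster) and the ranges currently being
 * trimmed.  For bitmaps it writes only the entry header here and queues
 * the entry on bitmap_list; the bitmap payloads are written later by
 * write_bitmap_entries().
 */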
1030 static noinline_for_stack
1031 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
1032                   struct btrfs_free_space_ctl *ctl,
1033                   struct btrfs_block_group *block_group,
1034                   int *entries, int *bitmaps,
1035                   struct list_head *bitmap_list)
1036 {
1037     int ret;
1038     struct btrfs_free_cluster *cluster = NULL;
1039     struct btrfs_free_cluster *cluster_locked = NULL;
1040     struct rb_node *node = rb_first(&ctl->free_space_offset);
1041     struct btrfs_trim_range *trim_entry;
1042 
1043     /* Get the cluster for this block_group if it exists */
1044     if (block_group && !list_empty(&block_group->cluster_list)) {
1045         cluster = list_entry(block_group->cluster_list.next,
1046                      struct btrfs_free_cluster,
1047                      block_group_list);
1048     }
1049 
1050     if (!node && cluster) {
1051         cluster_locked = cluster;
1052         spin_lock(&cluster_locked->lock);
1053         node = rb_first(&cluster->root);
1054         cluster = NULL;
1055     }
1056 
1057     /* Write out the extent entries */
1058     while (node) {
1059         struct btrfs_free_space *e;
1060 
1061         e = rb_entry(node, struct btrfs_free_space, offset_index);
1062         *entries += 1;
1063 
1064         ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
1065                        e->bitmap);
1066         if (ret)
1067             goto fail;
1068 
1069         if (e->bitmap) {
1070             list_add_tail(&e->list, bitmap_list);
1071             *bitmaps += 1;
1072         }
1073         node = rb_next(node);
1074         if (!node && cluster) {
1075             node = rb_first(&cluster->root);
1076             cluster_locked = cluster;
1077             spin_lock(&cluster_locked->lock);
1078             cluster = NULL;
1079         }
1080     }
1081     if (cluster_locked) {
1082         spin_unlock(&cluster_locked->lock);
1083         cluster_locked = NULL;
1084     }
1085 
1086     /*
1087      * Make sure we don't miss any range that was removed from our rbtree
1088      * because trimming is running. Otherwise after a umount+mount (or crash
1089      * after committing the transaction) we would leak free space and get
1090      * an inconsistent free space cache report from fsck.
1091      */
1092     list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
1093         ret = io_ctl_add_entry(io_ctl, trim_entry->start,
1094                        trim_entry->bytes, NULL);
1095         if (ret)
1096             goto fail;
1097         *entries += 1;
1098     }
1099 
1100     return 0;
1101 fail:
1102     if (cluster_locked)
1103         spin_unlock(&cluster_locked->lock);
1104     return -ENOSPC;
1105 }
1106 
1107 static noinline_for_stack int
1108 update_cache_item(struct btrfs_trans_handle *trans,
1109           struct btrfs_root *root,
1110           struct inode *inode,
1111           struct btrfs_path *path, u64 offset,
1112           int entries, int bitmaps)
1113 {
1114     struct btrfs_key key;
1115     struct btrfs_free_space_header *header;
1116     struct extent_buffer *leaf;
1117     int ret;
1118 
1119     key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1120     key.offset = offset;
1121     key.type = 0;
1122 
1123     ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1124     if (ret < 0) {
1125         clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1126                  EXTENT_DELALLOC, 0, 0, NULL);
1127         goto fail;
1128     }
1129     leaf = path->nodes[0];
1130     if (ret > 0) {
1131         struct btrfs_key found_key;
1132         ASSERT(path->slots[0]);
1133         path->slots[0]--;
1134         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1135         if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1136             found_key.offset != offset) {
1137             clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1138                      inode->i_size - 1, EXTENT_DELALLOC, 0,
1139                      0, NULL);
1140             btrfs_release_path(path);
1141             goto fail;
1142         }
1143     }
1144 
1145     BTRFS_I(inode)->generation = trans->transid;
1146     header = btrfs_item_ptr(leaf, path->slots[0],
1147                 struct btrfs_free_space_header);
1148     btrfs_set_free_space_entries(leaf, header, entries);
1149     btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1150     btrfs_set_free_space_generation(leaf, header, trans->transid);
1151     btrfs_mark_buffer_dirty(leaf);
1152     btrfs_release_path(path);
1153 
1154     return 0;
1155 
1156 fail:
1157     return -1;
1158 }
1159 
1160 static noinline_for_stack int write_pinned_extent_entries(
1161                 struct btrfs_trans_handle *trans,
1162                 struct btrfs_block_group *block_group,
1163                 struct btrfs_io_ctl *io_ctl,
1164                 int *entries)
1165 {
1166     u64 start, extent_start, extent_end, len;
1167     struct extent_io_tree *unpin = NULL;
1168     int ret;
1169 
1170     if (!block_group)
1171         return 0;
1172 
1173     /*
1174      * We want to add any pinned extents to our free space cache
1175      * so we don't leak the space.
1176      *
1177      * We shouldn't have switched the pinned extents yet, so this is the
1178      * right one.
1179      */
1180     unpin = &trans->transaction->pinned_extents;
1181 
1182     start = block_group->start;
1183 
1184     while (start < block_group->start + block_group->length) {
1185         ret = find_first_extent_bit(unpin, start,
1186                         &extent_start, &extent_end,
1187                         EXTENT_DIRTY, NULL);
1188         if (ret)
1189             return 0;
1190 
1191         /* This pinned extent is out of our range */
1192         if (extent_start >= block_group->start + block_group->length)
1193             return 0;
1194 
1195         extent_start = max(extent_start, start);
1196         extent_end = min(block_group->start + block_group->length,
1197                  extent_end + 1);
1198         len = extent_end - extent_start;
1199 
1200         *entries += 1;
1201         ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1202         if (ret)
1203             return -ENOSPC;
1204 
1205         start = extent_end;
1206     }
1207 
1208     return 0;
1209 }
1210 
1211 static noinline_for_stack int
1212 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1213 {
1214     struct btrfs_free_space *entry, *next;
1215     int ret;
1216 
1217     /* Write out the bitmaps */
1218     list_for_each_entry_safe(entry, next, bitmap_list, list) {
1219         ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1220         if (ret)
1221             return -ENOSPC;
1222         list_del_init(&entry->list);
1223     }
1224 
1225     return 0;
1226 }
1227 
1228 static int flush_dirty_cache(struct inode *inode)
1229 {
1230     int ret;
1231 
1232     ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1233     if (ret)
1234         clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1235                  EXTENT_DELALLOC, 0, 0, NULL);
1236 
1237     return ret;
1238 }
1239 
1240 static void noinline_for_stack
1241 cleanup_bitmap_list(struct list_head *bitmap_list)
1242 {
1243     struct btrfs_free_space *entry, *next;
1244 
1245     list_for_each_entry_safe(entry, next, bitmap_list, list)
1246         list_del_init(&entry->list);
1247 }
1248 
1249 static void noinline_for_stack
1250 cleanup_write_cache_enospc(struct inode *inode,
1251                struct btrfs_io_ctl *io_ctl,
1252                struct extent_state **cached_state)
1253 {
1254     io_ctl_drop_pages(io_ctl);
1255     unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1256                  i_size_read(inode) - 1, cached_state);
1257 }
1258 
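/*
 * Completion side of the cache write: wait for the ordered range to
 * finish, then update the free space header item so the cache is treated
 * as valid.  The block group is only marked BTRFS_DC_WRITTEN if it did
 * not land back on the dirty list while we were waiting for the IO.
 */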
1259 static int __btrfs_wait_cache_io(struct btrfs_root *root,
1260                  struct btrfs_trans_handle *trans,
1261                  struct btrfs_block_group *block_group,
1262                  struct btrfs_io_ctl *io_ctl,
1263                  struct btrfs_path *path, u64 offset)
1264 {
1265     int ret;
1266     struct inode *inode = io_ctl->inode;
1267 
1268     if (!inode)
1269         return 0;
1270 
1271     /* Flush the dirty pages in the cache file. */
1272     ret = flush_dirty_cache(inode);
1273     if (ret)
1274         goto out;
1275 
1276     /* Update the cache item to tell everyone this cache file is valid. */
1277     ret = update_cache_item(trans, root, inode, path, offset,
1278                 io_ctl->entries, io_ctl->bitmaps);
1279 out:
1280     if (ret) {
1281         invalidate_inode_pages2(inode->i_mapping);
1282         BTRFS_I(inode)->generation = 0;
1283         if (block_group)
1284             btrfs_debug(root->fs_info,
1285       "failed to write free space cache for block group %llu error %d",
1286                   block_group->start, ret);
1287     }
1288     btrfs_update_inode(trans, root, BTRFS_I(inode));
1289 
1290     if (block_group) {
1291         /* the dirty list is protected by the dirty_bgs_lock */
1292         spin_lock(&trans->transaction->dirty_bgs_lock);
1293 
1294         /* the disk_cache_state is protected by the block group lock */
1295         spin_lock(&block_group->lock);
1296 
1297         /*
1298          * Only mark this as written if we didn't get put back on
1299          * the dirty list while waiting for IO. Otherwise our
1300          * cache state won't be right, and we won't get written again.
1301          */
1302         if (!ret && list_empty(&block_group->dirty_list))
1303             block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1304         else if (ret)
1305             block_group->disk_cache_state = BTRFS_DC_ERROR;
1306 
1307         spin_unlock(&block_group->lock);
1308         spin_unlock(&trans->transaction->dirty_bgs_lock);
1309         io_ctl->inode = NULL;
1310         iput(inode);
1311     }
1312 
1313     return ret;
1314 
1315 }
1316 
1317 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
1318             struct btrfs_block_group *block_group,
1319             struct btrfs_path *path)
1320 {
1321     return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
1322                      block_group, &block_group->io_ctl,
1323                      path, block_group->start);
1324 }
1325 
1326 /**
1327  * __btrfs_write_out_cache - write out cached info to an inode
1328  *
1329  * @root:        root the inode belongs to
1330  * @inode:       freespace inode we are writing out
1331  * @ctl:         free space cache we are going to write out
1332  * @block_group: block_group for this cache if it belongs to a block_group
1333  * @io_ctl:      holds context for the io
1334  * @trans:       the trans handle
1335  *
1336  * This function writes out a free space cache struct to disk for quick recovery
1337  * on mount.  This will return 0 if it was successful in writing the cache out,
1338  * or an errno if it was not.
1339  */
1340 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1341                    struct btrfs_free_space_ctl *ctl,
1342                    struct btrfs_block_group *block_group,
1343                    struct btrfs_io_ctl *io_ctl,
1344                    struct btrfs_trans_handle *trans)
1345 {
1346     struct extent_state *cached_state = NULL;
1347     LIST_HEAD(bitmap_list);
1348     int entries = 0;
1349     int bitmaps = 0;
1350     int ret;
1351     int must_iput = 0;
1352 
1353     if (!i_size_read(inode))
1354         return -EIO;
1355 
1356     WARN_ON(io_ctl->pages);
1357     ret = io_ctl_init(io_ctl, inode, 1);
1358     if (ret)
1359         return ret;
1360 
1361     if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1362         down_write(&block_group->data_rwsem);
1363         spin_lock(&block_group->lock);
1364         if (block_group->delalloc_bytes) {
1365             block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1366             spin_unlock(&block_group->lock);
1367             up_write(&block_group->data_rwsem);
1368             BTRFS_I(inode)->generation = 0;
1369             ret = 0;
1370             must_iput = 1;
1371             goto out;
1372         }
1373         spin_unlock(&block_group->lock);
1374     }
1375 
1376     /* Lock all pages first so we can lock the extent safely. */
1377     ret = io_ctl_prepare_pages(io_ctl, false);
1378     if (ret)
1379         goto out_unlock;
1380 
1381     lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1382              &cached_state);
1383 
1384     io_ctl_set_generation(io_ctl, trans->transid);
1385 
1386     mutex_lock(&ctl->cache_writeout_mutex);
1387     /* Write out the extent entries in the free space cache */
1388     spin_lock(&ctl->tree_lock);
1389     ret = write_cache_extent_entries(io_ctl, ctl,
1390                      block_group, &entries, &bitmaps,
1391                      &bitmap_list);
1392     if (ret)
1393         goto out_nospc_locked;
1394 
1395     /*
1396      * Some space that is freed in the current transaction is pinned;
1397      * it will be added to the free space cache after the transaction is
1398      * committed, so we must not lose it.
1399      *
1400      * If this changes while we are working, we'll get added back to
1401      * the dirty list and redo it. No locking needed.
1402      */
1403     ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
1404     if (ret)
1405         goto out_nospc_locked;
1406 
1407     /*
1408      * At last, we write out all the bitmaps and keep cache_writeout_mutex
1409      * locked while doing it because a concurrent trim can be manipulating
1410      * or freeing the bitmap.
1411      */
1412     ret = write_bitmap_entries(io_ctl, &bitmap_list);
1413     spin_unlock(&ctl->tree_lock);
1414     mutex_unlock(&ctl->cache_writeout_mutex);
1415     if (ret)
1416         goto out_nospc;
1417 
1418     /* Zero out the rest of the pages just to make sure */
1419     io_ctl_zero_remaining_pages(io_ctl);
1420 
1421     /* Everything is written out, now we dirty the pages in the file. */
1422     ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
1423                 io_ctl->num_pages, 0, i_size_read(inode),
1424                 &cached_state, false);
1425     if (ret)
1426         goto out_nospc;
1427 
1428     if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1429         up_write(&block_group->data_rwsem);
1430     /*
1431      * Release the pages and unlock the extent; we will flush
1432      * them out later.
1433      */
1434     io_ctl_drop_pages(io_ctl);
1435     io_ctl_free(io_ctl);
1436 
1437     unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1438                  i_size_read(inode) - 1, &cached_state);
1439 
1440     /*
1441      * At this point the pages are under IO and we're happy.
1442      * The caller is responsible for waiting on them and updating
1443      * the cache and the inode.
1444      */
1445     io_ctl->entries = entries;
1446     io_ctl->bitmaps = bitmaps;
1447 
1448     ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1449     if (ret)
1450         goto out;
1451 
1452     return 0;
1453 
1454 out_nospc_locked:
1455     cleanup_bitmap_list(&bitmap_list);
1456     spin_unlock(&ctl->tree_lock);
1457     mutex_unlock(&ctl->cache_writeout_mutex);
1458 
1459 out_nospc:
1460     cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
1461 
1462 out_unlock:
1463     if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1464         up_write(&block_group->data_rwsem);
1465 
1466 out:
1467     io_ctl->inode = NULL;
1468     io_ctl_free(io_ctl);
1469     if (ret) {
1470         invalidate_inode_pages2(inode->i_mapping);
1471         BTRFS_I(inode)->generation = 0;
1472     }
1473     btrfs_update_inode(trans, root, BTRFS_I(inode));
1474     if (must_iput)
1475         iput(inode);
1476     return ret;
1477 }
1478 
1479 int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
1480               struct btrfs_block_group *block_group,
1481               struct btrfs_path *path)
1482 {
1483     struct btrfs_fs_info *fs_info = trans->fs_info;
1484     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1485     struct inode *inode;
1486     int ret = 0;
1487 
1488     spin_lock(&block_group->lock);
1489     if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1490         spin_unlock(&block_group->lock);
1491         return 0;
1492     }
1493     spin_unlock(&block_group->lock);
1494 
1495     inode = lookup_free_space_inode(block_group, path);
1496     if (IS_ERR(inode))
1497         return 0;
1498 
1499     ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
1500                 block_group, &block_group->io_ctl, trans);
1501     if (ret) {
1502         btrfs_debug(fs_info,
1503       "failed to write free space cache for block group %llu error %d",
1504               block_group->start, ret);
1505         spin_lock(&block_group->lock);
1506         block_group->disk_cache_state = BTRFS_DC_ERROR;
1507         spin_unlock(&block_group->lock);
1508 
1509         block_group->io_ctl.inode = NULL;
1510         iput(inode);
1511     }
1512 
1513     /*
1514      * If ret == 0, the caller is expected to call btrfs_wait_cache_io()
1515      * to wait for the IO and put the inode.
1516      */
1517 
1518     return ret;
1519 }
1520 
1521 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1522                       u64 offset)
1523 {
1524     ASSERT(offset >= bitmap_start);
1525     offset -= bitmap_start;
1526     return (unsigned long)(div_u64(offset, unit));
1527 }
1528 
1529 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1530 {
1531     return (unsigned long)(div_u64(bytes, unit));
1532 }
1533 
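/*
 * offset_to_bitmap() below rounds an offset down to the start of the
 * bitmap that covers it.  For example (assuming ctl->start = 1GiB and a
 * 4KiB unit, i.e. bytes_per_bitmap = 128MiB), an offset of 1GiB + 200MiB
 * maps to a bitmap_start of 1GiB + 128MiB.
 */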
1534 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1535                    u64 offset)
1536 {
1537     u64 bitmap_start;
1538     u64 bytes_per_bitmap;
1539 
1540     bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1541     bitmap_start = offset - ctl->start;
1542     bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1543     bitmap_start *= bytes_per_bitmap;
1544     bitmap_start += ctl->start;
1545 
1546     return bitmap_start;
1547 }
1548 
1549 static int tree_insert_offset(struct rb_root *root, u64 offset,
1550                   struct rb_node *node, int bitmap)
1551 {
1552     struct rb_node **p = &root->rb_node;
1553     struct rb_node *parent = NULL;
1554     struct btrfs_free_space *info;
1555 
1556     while (*p) {
1557         parent = *p;
1558         info = rb_entry(parent, struct btrfs_free_space, offset_index);
1559 
1560         if (offset < info->offset) {
1561             p = &(*p)->rb_left;
1562         } else if (offset > info->offset) {
1563             p = &(*p)->rb_right;
1564         } else {
1565             /*
1566              * we could have a bitmap entry and an extent entry
1567              * share the same offset.  If this is the case, we want
1568              * the extent entry to always be found first if we do a
1569              * linear search through the tree, since we want to have
1570              * the quickest allocation time, and allocating from an
1571              * extent is faster than allocating from a bitmap.  So
1572              * if we're inserting a bitmap and we find an entry at
1573              * this offset, we want to go right, or after this entry
1574              * logically.  If we are inserting an extent and we've
1575              * found a bitmap, we want to go left, or before
1576              * logically.
1577              */
1578             if (bitmap) {
1579                 if (info->bitmap) {
1580                     WARN_ON_ONCE(1);
1581                     return -EEXIST;
1582                 }
1583                 p = &(*p)->rb_right;
1584             } else {
1585                 if (!info->bitmap) {
1586                     WARN_ON_ONCE(1);
1587                     return -EEXIST;
1588                 }
1589                 p = &(*p)->rb_left;
1590             }
1591         }
1592     }
1593 
1594     rb_link_node(node, parent, p);
1595     rb_insert_color(node, root);
1596 
1597     return 0;
1598 }
1599 
1600 /*
1601  * This is a little subtle.  We *only* have ->max_extent_size set if we actually
1602  * searched through the bitmap and figured out the largest ->max_extent_size,
1603  * otherwise it's 0.  In the case that it's 0 we don't want to tell the
1604  * allocator the wrong thing, we want to use the actual real max_extent_size
1605  * we've found already if it's larger, or we want to use ->bytes.
1606  *
1607  * This matters because find_free_space() will skip entries whose ->bytes is
1608  * less than the required bytes.  So if we didn't search down this bitmap, we
1609  * may pick some previous entry that has a smaller ->max_extent_size than we
1610  * have.  For example, assume we have two entries, one that has
1611  * ->max_extent_size set to 4K and ->bytes set to 1M.  A second entry hasn't set
1612  * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous.  We will
1613  * call into find_free_space(), and return with max_extent_size == 4K, because
1614  * that first bitmap entry had ->max_extent_size set, but the second one did
1615  * not.  If instead we returned 8K we'd come in searching for 8K, and find the
1616  * 8K contiguous range.
1617  *
1618  * Consider the other case: we have two 8K chunks in that second entry and still
1619  * don't have ->max_extent_size set.  We'll return 16K, and the next time the
1620  * allocator comes in it'll fully search our second bitmap, and this time it'll
1621  * get an up-to-date value of 8K as the maximum chunk size.  Then we'll get the
1622  * right allocation the next loop through.
1623  */
1624 static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
1625 {
1626     if (entry->bitmap && entry->max_extent_size)
1627         return entry->max_extent_size;
1628     return entry->bytes;
1629 }
1630 
1631 /*
1632  * We want the largest entry to be leftmost, so this is inverted from what you'd
1633  * normally expect.
1634  */
1635 static bool entry_less(struct rb_node *node, const struct rb_node *parent)
1636 {
1637     const struct btrfs_free_space *entry, *exist;
1638 
1639     entry = rb_entry(node, struct btrfs_free_space, bytes_index);
1640     exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
1641     return get_max_extent_size(exist) < get_max_extent_size(entry);
1642 }
1643 
1644 /*
1645  * searches the tree for the given offset.
1646  *
1647  * fuzzy - If this is set, then we are trying to make an allocation, and we just
1648  * want a section that is at least 'bytes' in size and comes at or after the given
1649  * offset.
1650  */
1651 static struct btrfs_free_space *
1652 tree_search_offset(struct btrfs_free_space_ctl *ctl,
1653            u64 offset, int bitmap_only, int fuzzy)
1654 {
1655     struct rb_node *n = ctl->free_space_offset.rb_node;
1656     struct btrfs_free_space *entry = NULL, *prev = NULL;
1657 
1658     /* find entry that is closest to the 'offset' */
1659     while (n) {
1660         entry = rb_entry(n, struct btrfs_free_space, offset_index);
1661         prev = entry;
1662 
1663         if (offset < entry->offset)
1664             n = n->rb_left;
1665         else if (offset > entry->offset)
1666             n = n->rb_right;
1667         else
1668             break;
1669 
1670         entry = NULL;
1671     }
1672 
1673     if (bitmap_only) {
1674         if (!entry)
1675             return NULL;
1676         if (entry->bitmap)
1677             return entry;
1678 
1679         /*
1680          * A bitmap entry and an extent entry may share the same offset;
1681          * in that case, the bitmap entry comes after the extent entry.
1682          */
1683         n = rb_next(n);
1684         if (!n)
1685             return NULL;
1686         entry = rb_entry(n, struct btrfs_free_space, offset_index);
1687         if (entry->offset != offset)
1688             return NULL;
1689 
1690         WARN_ON(!entry->bitmap);
1691         return entry;
1692     } else if (entry) {
1693         if (entry->bitmap) {
1694             /*
1695              * if previous extent entry covers the offset,
1696              * we should return it instead of the bitmap entry
1697              */
1698             n = rb_prev(&entry->offset_index);
1699             if (n) {
1700                 prev = rb_entry(n, struct btrfs_free_space,
1701                         offset_index);
1702                 if (!prev->bitmap &&
1703                     prev->offset + prev->bytes > offset)
1704                     entry = prev;
1705             }
1706         }
1707         return entry;
1708     }
1709 
1710     if (!prev)
1711         return NULL;
1712 
1713     /* find last entry before the 'offset' */
1714     entry = prev;
1715     if (entry->offset > offset) {
1716         n = rb_prev(&entry->offset_index);
1717         if (n) {
1718             entry = rb_entry(n, struct btrfs_free_space,
1719                     offset_index);
1720             ASSERT(entry->offset <= offset);
1721         } else {
1722             if (fuzzy)
1723                 return entry;
1724             else
1725                 return NULL;
1726         }
1727     }
1728 
1729     if (entry->bitmap) {
1730         n = rb_prev(&entry->offset_index);
1731         if (n) {
1732             prev = rb_entry(n, struct btrfs_free_space,
1733                     offset_index);
1734             if (!prev->bitmap &&
1735                 prev->offset + prev->bytes > offset)
1736                 return prev;
1737         }
1738         if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1739             return entry;
1740     } else if (entry->offset + entry->bytes > offset)
1741         return entry;
1742 
1743     if (!fuzzy)
1744         return NULL;
1745 
1746     while (1) {
1747         n = rb_next(&entry->offset_index);
1748         if (!n)
1749             return NULL;
1750         entry = rb_entry(n, struct btrfs_free_space, offset_index);
1751         if (entry->bitmap) {
1752             if (entry->offset + BITS_PER_BITMAP *
1753                 ctl->unit > offset)
1754                 break;
1755         } else {
1756             if (entry->offset + entry->bytes > offset)
1757                 break;
1758         }
1759     }
1760     return entry;
1761 }
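/*
 * Editor's summary (illustrative, assuming a 4K unit and ctl->start == 0):
 * for a tree holding an extent entry [0, 64K) and a bitmap entry based at
 * 128M, the lookup modes above behave as follows:
 *
 *   tree_search_offset(ctl, 4K,   0, 0) -> the extent entry (it covers 4K)
 *   tree_search_offset(ctl, 128M, 1, 0) -> the bitmap entry (bitmap_only)
 *   tree_search_offset(ctl, 70K,  0, 0) -> NULL (nothing covers 70K)
 *   tree_search_offset(ctl, 70K,  0, 1) -> the bitmap entry (fuzzy: first
 *                                          entry at or after the offset)
 */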
1762 
1763 static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1764                      struct btrfs_free_space *info,
1765                      bool update_stat)
1766 {
1767     rb_erase(&info->offset_index, &ctl->free_space_offset);
1768     rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
1769     ctl->free_extents--;
1770 
1771     if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1772         ctl->discardable_extents[BTRFS_STAT_CURR]--;
1773         ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
1774     }
1775 
1776     if (update_stat)
1777         ctl->free_space -= info->bytes;
1778 }
1779 
1780 static int link_free_space(struct btrfs_free_space_ctl *ctl,
1781                struct btrfs_free_space *info)
1782 {
1783     int ret = 0;
1784 
1785     ASSERT(info->bytes || info->bitmap);
1786     ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1787                  &info->offset_index, (info->bitmap != NULL));
1788     if (ret)
1789         return ret;
1790 
1791     rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
1792 
1793     if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1794         ctl->discardable_extents[BTRFS_STAT_CURR]++;
1795         ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
1796     }
1797 
1798     ctl->free_space += info->bytes;
1799     ctl->free_extents++;
1800     return ret;
1801 }
1802 
1803 static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
1804                 struct btrfs_free_space *info)
1805 {
1806     ASSERT(info->bitmap);
1807 
1808     /*
1809      * If our entry is empty it's because we're on a cluster and we don't
1810      * want to re-link it into our ctl bytes index.
1811      */
1812     if (RB_EMPTY_NODE(&info->bytes_index))
1813         return;
1814 
1815     rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
1816     rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
1817 }
1818 
1819 static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1820                      struct btrfs_free_space *info,
1821                      u64 offset, u64 bytes, bool update_stat)
1822 {
1823     unsigned long start, count, end;
1824     int extent_delta = -1;
1825 
1826     start = offset_to_bit(info->offset, ctl->unit, offset);
1827     count = bytes_to_bits(bytes, ctl->unit);
1828     end = start + count;
1829     ASSERT(end <= BITS_PER_BITMAP);
1830 
1831     bitmap_clear(info->bitmap, start, count);
1832 
1833     info->bytes -= bytes;
1834     if (info->max_extent_size > ctl->unit)
1835         info->max_extent_size = 0;
1836 
1837     relink_bitmap_entry(ctl, info);
1838 
1839     if (start && test_bit(start - 1, info->bitmap))
1840         extent_delta++;
1841 
1842     if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1843         extent_delta++;
1844 
1845     info->bitmap_extents += extent_delta;
1846     if (!btrfs_free_space_trimmed(info)) {
1847         ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1848         ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
1849     }
1850 
1851     if (update_stat)
1852         ctl->free_space -= bytes;
1853 }
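/*
 * Editor's note (illustrative): the extent_delta bookkeeping above tracks how
 * many contiguous runs ("bitmap extents") a clear removes or creates,
 * starting from -1 because the cleared run itself goes away:
 *
 *   bits 111[111]111, clear the bracketed middle -> both neighbours still
 *                     set, delta = -1 + 1 + 1 = +1 (one run split into two)
 *   bits 000[111]000, clear an isolated run      -> delta = -1
 *   bits 111[111]000, clear the tail of a run    -> delta = -1 + 1 = 0
 *
 * bitmap_set_bits() below does the mirror-image accounting, starting from +1.
 */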
1854 
1855 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1856                 struct btrfs_free_space *info, u64 offset,
1857                 u64 bytes)
1858 {
1859     unsigned long start, count, end;
1860     int extent_delta = 1;
1861 
1862     start = offset_to_bit(info->offset, ctl->unit, offset);
1863     count = bytes_to_bits(bytes, ctl->unit);
1864     end = start + count;
1865     ASSERT(end <= BITS_PER_BITMAP);
1866 
1867     bitmap_set(info->bitmap, start, count);
1868 
1869     /*
1870      * We set some bytes, we have no idea what the max extent size is
1871      * anymore.
1872      */
1873     info->max_extent_size = 0;
1874     info->bytes += bytes;
1875     ctl->free_space += bytes;
1876 
1877     relink_bitmap_entry(ctl, info);
1878 
1879     if (start && test_bit(start - 1, info->bitmap))
1880         extent_delta--;
1881 
1882     if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1883         extent_delta--;
1884 
1885     info->bitmap_extents += extent_delta;
1886     if (!btrfs_free_space_trimmed(info)) {
1887         ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1888         ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
1889     }
1890 }
1891 
1892 /*
1893  * If we cannot find a suitable extent, we use *bytes to record
1894  * the size of the largest free extent found.
1895  */
1896 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1897              struct btrfs_free_space *bitmap_info, u64 *offset,
1898              u64 *bytes, bool for_alloc)
1899 {
1900     unsigned long found_bits = 0;
1901     unsigned long max_bits = 0;
1902     unsigned long bits, i;
1903     unsigned long next_zero;
1904     unsigned long extent_bits;
1905 
1906     /*
1907      * Skip searching the bitmap if we don't have a contiguous section that
1908      * is large enough for this allocation.
1909      */
1910     if (for_alloc &&
1911         bitmap_info->max_extent_size &&
1912         bitmap_info->max_extent_size < *bytes) {
1913         *bytes = bitmap_info->max_extent_size;
1914         return -1;
1915     }
1916 
1917     i = offset_to_bit(bitmap_info->offset, ctl->unit,
1918               max_t(u64, *offset, bitmap_info->offset));
1919     bits = bytes_to_bits(*bytes, ctl->unit);
1920 
1921     for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1922         if (for_alloc && bits == 1) {
1923             found_bits = 1;
1924             break;
1925         }
1926         next_zero = find_next_zero_bit(bitmap_info->bitmap,
1927                            BITS_PER_BITMAP, i);
1928         extent_bits = next_zero - i;
1929         if (extent_bits >= bits) {
1930             found_bits = extent_bits;
1931             break;
1932         } else if (extent_bits > max_bits) {
1933             max_bits = extent_bits;
1934         }
1935         i = next_zero;
1936     }
1937 
1938     if (found_bits) {
1939         *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1940         *bytes = (u64)(found_bits) * ctl->unit;
1941         return 0;
1942     }
1943 
1944     *bytes = (u64)(max_bits) * ctl->unit;
1945     bitmap_info->max_extent_size = *bytes;
1946     relink_bitmap_entry(ctl, bitmap_info);
1947     return -1;
1948 }
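/*
 * Editor's sketch (hypothetical caller context, not compiled, not part of the
 * original source): how search_bitmap() communicates through its pointer
 * arguments.  On success *offset / *bytes are rewritten to describe the run
 * that was found; on failure *bytes reports the largest run seen, which is
 * also cached in ->max_extent_size.
 */
#if 0
	u64 offset = bitmap_info->offset;	/* start scanning at the bitmap base */
	u64 bytes = SZ_1M;			/* we want a 1M contiguous run */

	if (search_bitmap(ctl, bitmap_info, &offset, &bytes, true) == 0) {
		/* [offset, offset + bytes) is free and bytes >= 1M */
	} else {
		/* no run was big enough; bytes holds the largest run found */
	}
#endif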
1949 
1950 /* Cache the size of the max extent in bytes */
1951 static struct btrfs_free_space *
1952 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1953         unsigned long align, u64 *max_extent_size, bool use_bytes_index)
1954 {
1955     struct btrfs_free_space *entry;
1956     struct rb_node *node;
1957     u64 tmp;
1958     u64 align_off;
1959     int ret;
1960 
1961     if (!ctl->free_space_offset.rb_node)
1962         goto out;
1963 again:
1964     if (use_bytes_index) {
1965         node = rb_first_cached(&ctl->free_space_bytes);
1966     } else {
1967         entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset),
1968                        0, 1);
1969         if (!entry)
1970             goto out;
1971         node = &entry->offset_index;
1972     }
1973 
1974     for (; node; node = rb_next(node)) {
1975         if (use_bytes_index)
1976             entry = rb_entry(node, struct btrfs_free_space,
1977                      bytes_index);
1978         else
1979             entry = rb_entry(node, struct btrfs_free_space,
1980                      offset_index);
1981 
1982         /*
1983          * If we are using the bytes index then all subsequent entries
1984          * in this tree are going to be < bytes, so simply set the max
1985          * extent size and exit the loop.
1986          *
1987          * If we're using the offset index then we need to keep going
1988          * through the rest of the tree.
1989          */
1990         if (entry->bytes < *bytes) {
1991             *max_extent_size = max(get_max_extent_size(entry),
1992                            *max_extent_size);
1993             if (use_bytes_index)
1994                 break;
1995             continue;
1996         }
1997 
1998         /* make sure the space returned is big enough
1999          * to match our requested alignment
2000          */
2001         if (*bytes >= align) {
2002             tmp = entry->offset - ctl->start + align - 1;
2003             tmp = div64_u64(tmp, align);
2004             tmp = tmp * align + ctl->start;
2005             align_off = tmp - entry->offset;
2006         } else {
2007             align_off = 0;
2008             tmp = entry->offset;
2009         }
2010 
2011         /*
2012          * We don't break here if we're using the bytes index because we
2013          * may have another entry that has the correct alignment that is
2014          * the right size, so we don't want to miss that possibility.
2015          * At worst this adds another loop through the logic, but if we
2016          * broke here we could prematurely ENOSPC.
2017          */
2018         if (entry->bytes < *bytes + align_off) {
2019             *max_extent_size = max(get_max_extent_size(entry),
2020                            *max_extent_size);
2021             continue;
2022         }
2023 
2024         if (entry->bitmap) {
2025             struct rb_node *old_next = rb_next(node);
2026             u64 size = *bytes;
2027 
2028             ret = search_bitmap(ctl, entry, &tmp, &size, true);
2029             if (!ret) {
2030                 *offset = tmp;
2031                 *bytes = size;
2032                 return entry;
2033             } else {
2034                 *max_extent_size =
2035                     max(get_max_extent_size(entry),
2036                         *max_extent_size);
2037             }
2038 
2039             /*
2040              * The bitmap may have been re-arranged in the bytes
2041              * index here because its max_extent_size may have been
2042              * updated.  Start from the beginning again if this
2043              * happened.
2044              */
2045             if (use_bytes_index && old_next != rb_next(node))
2046                 goto again;
2047             continue;
2048         }
2049 
2050         *offset = tmp;
2051         *bytes = entry->bytes - align_off;
2052         return entry;
2053     }
2054 out:
2055     return NULL;
2056 }
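/*
 * Editor's note (illustrative): the alignment math above rounds an entry's
 * offset up to the next 'align' boundary relative to ctl->start.  For
 * example, with ctl->start = 0, align = 64K and entry->offset = 100000:
 *
 *   tmp = (100000 + 65535) / 65536 = 2;  tmp * 65536 = 131072
 *   align_off = 131072 - 100000 = 31072
 *
 * so the entry only satisfies the request if it still has *bytes free after
 * skipping the 31072 byte alignment gap.
 */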
2057 
2058 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
2059                struct btrfs_free_space *info, u64 offset)
2060 {
2061     info->offset = offset_to_bitmap(ctl, offset);
2062     info->bytes = 0;
2063     info->bitmap_extents = 0;
2064     INIT_LIST_HEAD(&info->list);
2065     link_free_space(ctl, info);
2066     ctl->total_bitmaps++;
2067     recalculate_thresholds(ctl);
2068 }
2069 
2070 static void free_bitmap(struct btrfs_free_space_ctl *ctl,
2071             struct btrfs_free_space *bitmap_info)
2072 {
2073     /*
2074      * Normally when this is called, the bitmap is completely empty. However,
2075      * if we are blowing up the free space cache for one reason or another
2076      * via __btrfs_remove_free_space_cache(), then it may not be freed and
2077      * we may leave stats on the table.
2078      */
2079     if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
2080         ctl->discardable_extents[BTRFS_STAT_CURR] -=
2081             bitmap_info->bitmap_extents;
2082         ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
2083 
2084     }
2085     unlink_free_space(ctl, bitmap_info, true);
2086     kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
2087     kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
2088     ctl->total_bitmaps--;
2089     recalculate_thresholds(ctl);
2090 }
2091 
2092 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
2093                   struct btrfs_free_space *bitmap_info,
2094                   u64 *offset, u64 *bytes)
2095 {
2096     u64 end;
2097     u64 search_start, search_bytes;
2098     int ret;
2099 
2100 again:
2101     end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
2102 
2103     /*
2104      * We need to search for bits in this bitmap.  This bitmap may only cover
2105      * part of the extent, thanks to how we add space, so we need
2106      * to search for as much of it as we can and clear that amount, and then
2107      * go searching for the next bit.
2108      */
2109     search_start = *offset;
2110     search_bytes = ctl->unit;
2111     search_bytes = min(search_bytes, end - search_start + 1);
2112     ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
2113                 false);
2114     if (ret < 0 || search_start != *offset)
2115         return -EINVAL;
2116 
2117     /* We may have found more bits than what we need */
2118     search_bytes = min(search_bytes, *bytes);
2119 
2120     /* Cannot clear past the end of the bitmap */
2121     search_bytes = min(search_bytes, end - search_start + 1);
2122 
2123     bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true);
2124     *offset += search_bytes;
2125     *bytes -= search_bytes;
2126 
2127     if (*bytes) {
2128         struct rb_node *next = rb_next(&bitmap_info->offset_index);
2129         if (!bitmap_info->bytes)
2130             free_bitmap(ctl, bitmap_info);
2131 
2132         /*
2133          * no entry after this bitmap, but we still have bytes to
2134          * remove, so something has gone wrong.
2135          */
2136         if (!next)
2137             return -EINVAL;
2138 
2139         bitmap_info = rb_entry(next, struct btrfs_free_space,
2140                        offset_index);
2141 
2142         /*
2143          * if the next entry isn't a bitmap we need to return to let the
2144          * extent stuff do its work.
2145          */
2146         if (!bitmap_info->bitmap)
2147             return -EAGAIN;
2148 
2149         /*
2150          * OK, the next item is a bitmap, but it may not actually cover
2151          * the rest of the free space we are removing, so
2152          * look for it, and if we don't find it return so we can try
2153          * everything over again.
2154          */
2155         search_start = *offset;
2156         search_bytes = ctl->unit;
2157         ret = search_bitmap(ctl, bitmap_info, &search_start,
2158                     &search_bytes, false);
2159         if (ret < 0 || search_start != *offset)
2160             return -EAGAIN;
2161 
2162         goto again;
2163     } else if (!bitmap_info->bytes)
2164         free_bitmap(ctl, bitmap_info);
2165 
2166     return 0;
2167 }
2168 
2169 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
2170                    struct btrfs_free_space *info, u64 offset,
2171                    u64 bytes, enum btrfs_trim_state trim_state)
2172 {
2173     u64 bytes_to_set = 0;
2174     u64 end;
2175 
2176     /*
2177      * This is a tradeoff to make bitmap trim state minimal.  We mark the
2178      * whole bitmap untrimmed if at any point we add untrimmed regions.
2179      */
2180     if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
2181         if (btrfs_free_space_trimmed(info)) {
2182             ctl->discardable_extents[BTRFS_STAT_CURR] +=
2183                 info->bitmap_extents;
2184             ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
2185         }
2186         info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2187     }
2188 
2189     end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
2190 
2191     bytes_to_set = min(end - offset, bytes);
2192 
2193     bitmap_set_bits(ctl, info, offset, bytes_to_set);
2194 
2195     return bytes_to_set;
2196 
2197 }
2198 
2199 static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
2200               struct btrfs_free_space *info)
2201 {
2202     struct btrfs_block_group *block_group = ctl->block_group;
2203     struct btrfs_fs_info *fs_info = block_group->fs_info;
2204     bool forced = false;
2205 
2206 #ifdef CONFIG_BTRFS_DEBUG
2207     if (btrfs_should_fragment_free_space(block_group))
2208         forced = true;
2209 #endif
2210 
2211     /* This is a way to reclaim large regions from the bitmaps. */
2212     if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
2213         return false;
2214 
2215     /*
2216      * If we are below the extents threshold then we can add this as an
2217      * extent, and don't have to deal with the bitmap
2218      */
2219     if (!forced && ctl->free_extents < ctl->extents_thresh) {
2220         /*
2221          * If this block group has some small extents we don't want to
2222          * use up all of our free slots in the cache with them; we want
2223          * to reserve those slots for larger extents.  However, if we have
2224          * plenty of cache left then go ahead and add them, no sense in adding
2225          * the overhead of a bitmap if we don't have to.
2226          */
2227         if (info->bytes <= fs_info->sectorsize * 8) {
2228             if (ctl->free_extents * 3 <= ctl->extents_thresh)
2229                 return false;
2230         } else {
2231             return false;
2232         }
2233     }
2234 
2235     /*
2236      * The original block groups from mkfs can be really small, like 8
2237      * megabytes, so don't bother with a bitmap for those entries.  However,
2238      * some block groups can be smaller than what a bitmap would cover but
2239      * are still large enough that they could overflow the 32k memory limit,
2240      * so still allow those block groups to have a bitmap
2241      * entry.
2242      */
2243     if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
2244         return false;
2245 
2246     return true;
2247 }
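/*
 * Editor's note (illustrative, assuming a 4K sectorsize): with the thresholds
 * set up in btrfs_init_free_space_ctl(), the decision above works out to:
 *
 *   - extents of FORCE_EXTENT_THRESHOLD (1M) or more always stay extent
 *     entries;
 *   - while below extents_thresh, extents bigger than 8 * 4K = 32K stay
 *     extent entries, and smaller ones do too unless extent entries already
 *     consume more than a third of the threshold;
 *   - block groups smaller than half a bitmap's coverage (128M / 2 = 64M
 *     with 4K units) never get bitmap entries.
 */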
2248 
2249 static const struct btrfs_free_space_op free_space_op = {
2250     .use_bitmap     = use_bitmap,
2251 };
2252 
2253 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2254                   struct btrfs_free_space *info)
2255 {
2256     struct btrfs_free_space *bitmap_info;
2257     struct btrfs_block_group *block_group = NULL;
2258     int added = 0;
2259     u64 bytes, offset, bytes_added;
2260     enum btrfs_trim_state trim_state;
2261     int ret;
2262 
2263     bytes = info->bytes;
2264     offset = info->offset;
2265     trim_state = info->trim_state;
2266 
2267     if (!ctl->op->use_bitmap(ctl, info))
2268         return 0;
2269 
2270     if (ctl->op == &free_space_op)
2271         block_group = ctl->block_group;
2272 again:
2273     /*
2274      * Since we link bitmaps right into the cluster we need to see if we
2275      * have a cluster here, and if so and it has our bitmap we need to add
2276      * the free space to that bitmap.
2277      */
2278     if (block_group && !list_empty(&block_group->cluster_list)) {
2279         struct btrfs_free_cluster *cluster;
2280         struct rb_node *node;
2281         struct btrfs_free_space *entry;
2282 
2283         cluster = list_entry(block_group->cluster_list.next,
2284                      struct btrfs_free_cluster,
2285                      block_group_list);
2286         spin_lock(&cluster->lock);
2287         node = rb_first(&cluster->root);
2288         if (!node) {
2289             spin_unlock(&cluster->lock);
2290             goto no_cluster_bitmap;
2291         }
2292 
2293         entry = rb_entry(node, struct btrfs_free_space, offset_index);
2294         if (!entry->bitmap) {
2295             spin_unlock(&cluster->lock);
2296             goto no_cluster_bitmap;
2297         }
2298 
2299         if (entry->offset == offset_to_bitmap(ctl, offset)) {
2300             bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
2301                               bytes, trim_state);
2302             bytes -= bytes_added;
2303             offset += bytes_added;
2304         }
2305         spin_unlock(&cluster->lock);
2306         if (!bytes) {
2307             ret = 1;
2308             goto out;
2309         }
2310     }
2311 
2312 no_cluster_bitmap:
2313     bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2314                      1, 0);
2315     if (!bitmap_info) {
2316         ASSERT(added == 0);
2317         goto new_bitmap;
2318     }
2319 
2320     bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
2321                       trim_state);
2322     bytes -= bytes_added;
2323     offset += bytes_added;
2324     added = 0;
2325 
2326     if (!bytes) {
2327         ret = 1;
2328         goto out;
2329     } else
2330         goto again;
2331 
2332 new_bitmap:
2333     if (info && info->bitmap) {
2334         add_new_bitmap(ctl, info, offset);
2335         added = 1;
2336         info = NULL;
2337         goto again;
2338     } else {
2339         spin_unlock(&ctl->tree_lock);
2340 
2341         /* no pre-allocated info, allocate a new one */
2342         if (!info) {
2343             info = kmem_cache_zalloc(btrfs_free_space_cachep,
2344                          GFP_NOFS);
2345             if (!info) {
2346                 spin_lock(&ctl->tree_lock);
2347                 ret = -ENOMEM;
2348                 goto out;
2349             }
2350         }
2351 
2352         /* allocate the bitmap */
2353         info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
2354                          GFP_NOFS);
2355         info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
2356         spin_lock(&ctl->tree_lock);
2357         if (!info->bitmap) {
2358             ret = -ENOMEM;
2359             goto out;
2360         }
2361         goto again;
2362     }
2363 
2364 out:
2365     if (info) {
2366         if (info->bitmap)
2367             kmem_cache_free(btrfs_free_space_bitmap_cachep,
2368                     info->bitmap);
2369         kmem_cache_free(btrfs_free_space_cachep, info);
2370     }
2371 
2372     return ret;
2373 }
2374 
2375 /*
2376  * Free space merging rules:
2377  *  1) Merge trimmed areas together
2378  *  2) Let untrimmed areas coalesce with trimmed areas
2379  *  3) Always pull neighboring regions from bitmaps
2380  *
2381  * The above rules are for when we merge free space based on btrfs_trim_state.
2382  * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
2383  * same reason: to promote larger extent regions which makes life easier for
2384  * find_free_extent().  Rule 2 enables coalescing based on the common path
2385  * being the return of free space from btrfs_finish_extent_commit().  So when
2386  * free space is trimmed, it prevents aggregating that newly trimmed region
2387  * with untrimmed regions in the rb_tree.  Rule 3 is purely to obtain larger extents
2388  * and provide find_free_extent() with the largest extents possible, hoping for
2389  * the reuse path.
2390  */
2391 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2392               struct btrfs_free_space *info, bool update_stat)
2393 {
2394     struct btrfs_free_space *left_info = NULL;
2395     struct btrfs_free_space *right_info;
2396     bool merged = false;
2397     u64 offset = info->offset;
2398     u64 bytes = info->bytes;
2399     const bool is_trimmed = btrfs_free_space_trimmed(info);
2400 
2401     /*
2402      * first we want to see if there is free space adjacent to the range we
2403      * are adding, if there is remove that struct and add a new one to
2404      * cover the entire range
2405      */
2406     right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2407     if (right_info && rb_prev(&right_info->offset_index))
2408         left_info = rb_entry(rb_prev(&right_info->offset_index),
2409                      struct btrfs_free_space, offset_index);
2410     else if (!right_info)
2411         left_info = tree_search_offset(ctl, offset - 1, 0, 0);
2412 
2413     /* See try_merge_free_space() comment. */
2414     if (right_info && !right_info->bitmap &&
2415         (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
2416         unlink_free_space(ctl, right_info, update_stat);
2417         info->bytes += right_info->bytes;
2418         kmem_cache_free(btrfs_free_space_cachep, right_info);
2419         merged = true;
2420     }
2421 
2422     /* See try_merge_free_space() comment. */
2423     if (left_info && !left_info->bitmap &&
2424         left_info->offset + left_info->bytes == offset &&
2425         (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
2426         unlink_free_space(ctl, left_info, update_stat);
2427         info->offset = left_info->offset;
2428         info->bytes += left_info->bytes;
2429         kmem_cache_free(btrfs_free_space_cachep, left_info);
2430         merged = true;
2431     }
2432 
2433     return merged;
2434 }
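/*
 * Editor's note (illustrative): suppose the tree holds a trimmed extent
 * [0, 64K) and we add an untrimmed extent [64K, 128K).  Rule 2 lets the new
 * (untrimmed) info absorb its trimmed left neighbour, producing one untrimmed
 * [0, 128K) entry.  Adding a trimmed [64K, 128K) next to an untrimmed
 * [0, 64K), however, does not merge, so the newly trimmed region is not
 * folded into the untrimmed one and does not lose its trim state.
 */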
2435 
2436 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2437                      struct btrfs_free_space *info,
2438                      bool update_stat)
2439 {
2440     struct btrfs_free_space *bitmap;
2441     unsigned long i;
2442     unsigned long j;
2443     const u64 end = info->offset + info->bytes;
2444     const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2445     u64 bytes;
2446 
2447     bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2448     if (!bitmap)
2449         return false;
2450 
2451     i = offset_to_bit(bitmap->offset, ctl->unit, end);
2452     j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2453     if (j == i)
2454         return false;
2455     bytes = (j - i) * ctl->unit;
2456     info->bytes += bytes;
2457 
2458     /* See try_merge_free_space() comment. */
2459     if (!btrfs_free_space_trimmed(bitmap))
2460         info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2461 
2462     bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat);
2463 
2464     if (!bitmap->bytes)
2465         free_bitmap(ctl, bitmap);
2466 
2467     return true;
2468 }
2469 
2470 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2471                        struct btrfs_free_space *info,
2472                        bool update_stat)
2473 {
2474     struct btrfs_free_space *bitmap;
2475     u64 bitmap_offset;
2476     unsigned long i;
2477     unsigned long j;
2478     unsigned long prev_j;
2479     u64 bytes;
2480 
2481     bitmap_offset = offset_to_bitmap(ctl, info->offset);
2482     /* If we're on a boundary, try the previous logical bitmap. */
2483     if (bitmap_offset == info->offset) {
2484         if (info->offset == 0)
2485             return false;
2486         bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2487     }
2488 
2489     bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2490     if (!bitmap)
2491         return false;
2492 
2493     i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2494     j = 0;
2495     prev_j = (unsigned long)-1;
2496     for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2497         if (j > i)
2498             break;
2499         prev_j = j;
2500     }
2501     if (prev_j == i)
2502         return false;
2503 
2504     if (prev_j == (unsigned long)-1)
2505         bytes = (i + 1) * ctl->unit;
2506     else
2507         bytes = (i - prev_j) * ctl->unit;
2508 
2509     info->offset -= bytes;
2510     info->bytes += bytes;
2511 
2512     /* See try_merge_free_space() comment. */
2513     if (!btrfs_free_space_trimmed(bitmap))
2514         info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2515 
2516     bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat);
2517 
2518     if (!bitmap->bytes)
2519         free_bitmap(ctl, bitmap);
2520 
2521     return true;
2522 }
2523 
2524 /*
2525  * We prefer always to allocate from extent entries, both for clustered and
2526  * non-clustered allocation requests. So when attempting to add a new extent
2527  * entry, try to see if there's adjacent free space in bitmap entries, and if
2528  * there is, migrate that space from the bitmaps to the extent.
2529  * Like this we get better chances of satisfying space allocation requests
2530  * because we attempt to satisfy them based on a single cache entry, and never
2531  * on 2 or more entries - even if the entries represent a contiguous free space
2532  * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2533  * ends).
2534  */
2535 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2536                   struct btrfs_free_space *info,
2537                   bool update_stat)
2538 {
2539     /*
2540      * Only work with disconnected entries, as we can change their offset,
2541      * and must be extent entries.
2542      */
2543     ASSERT(!info->bitmap);
2544     ASSERT(RB_EMPTY_NODE(&info->offset_index));
2545 
2546     if (ctl->total_bitmaps > 0) {
2547         bool stole_end;
2548         bool stole_front = false;
2549 
2550         stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2551         if (ctl->total_bitmaps > 0)
2552             stole_front = steal_from_bitmap_to_front(ctl, info,
2553                                  update_stat);
2554 
2555         if (stole_end || stole_front)
2556             try_merge_free_space(ctl, info, update_stat);
2557     }
2558 }
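/*
 * Editor's note (illustrative): if a new extent entry covers [256K, 320K) and
 * the surrounding bitmap has the bits for [192K, 256K) and [320K, 384K) set,
 * the two helpers above clear those bits and grow the entry to [192K, 384K)
 * before it is linked, so a later 192K allocation can be satisfied from a
 * single extent entry instead of an extent entry plus bitmap bits.
 */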
2559 
2560 int __btrfs_add_free_space(struct btrfs_block_group *block_group,
2561                u64 offset, u64 bytes,
2562                enum btrfs_trim_state trim_state)
2563 {
2564     struct btrfs_fs_info *fs_info = block_group->fs_info;
2565     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2566     struct btrfs_free_space *info;
2567     int ret = 0;
2568     u64 filter_bytes = bytes;
2569 
2570     ASSERT(!btrfs_is_zoned(fs_info));
2571 
2572     info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2573     if (!info)
2574         return -ENOMEM;
2575 
2576     info->offset = offset;
2577     info->bytes = bytes;
2578     info->trim_state = trim_state;
2579     RB_CLEAR_NODE(&info->offset_index);
2580     RB_CLEAR_NODE(&info->bytes_index);
2581 
2582     spin_lock(&ctl->tree_lock);
2583 
2584     if (try_merge_free_space(ctl, info, true))
2585         goto link;
2586 
2587     /*
2588      * There was no extent directly to the left or right of this new
2589      * extent then we know we're going to have to allocate a new extent, so
2590      * before we do that see if we need to drop this into a bitmap
2591      */
2592     ret = insert_into_bitmap(ctl, info);
2593     if (ret < 0) {
2594         goto out;
2595     } else if (ret) {
2596         ret = 0;
2597         goto out;
2598     }
2599 link:
2600     /*
2601      * Only steal free space from adjacent bitmaps if we're sure we're not
2602      * going to add the new free space to existing bitmap entries - because
2603      * that would mean unnecessary work that would be reverted. Therefore
2604      * attempt to steal space from bitmaps if we're adding an extent entry.
2605      */
2606     steal_from_bitmap(ctl, info, true);
2607 
2608     filter_bytes = max(filter_bytes, info->bytes);
2609 
2610     ret = link_free_space(ctl, info);
2611     if (ret)
2612         kmem_cache_free(btrfs_free_space_cachep, info);
2613 out:
2614     btrfs_discard_update_discardable(block_group);
2615     spin_unlock(&ctl->tree_lock);
2616 
2617     if (ret) {
2618         btrfs_crit(fs_info, "unable to add free space :%d", ret);
2619         ASSERT(ret != -EEXIST);
2620     }
2621 
2622     if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
2623         btrfs_discard_check_filter(block_group, filter_bytes);
2624         btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
2625     }
2626 
2627     return ret;
2628 }
2629 
2630 static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
2631                     u64 bytenr, u64 size, bool used)
2632 {
2633     struct btrfs_space_info *sinfo = block_group->space_info;
2634     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2635     u64 offset = bytenr - block_group->start;
2636     u64 to_free, to_unusable;
2637     int bg_reclaim_threshold = 0;
2638     bool initial = (size == block_group->length);
2639     u64 reclaimable_unusable;
2640 
2641     WARN_ON(!initial && offset + size > block_group->zone_capacity);
2642 
2643     if (!initial)
2644         bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
2645 
2646     spin_lock(&ctl->tree_lock);
2647     if (!used)
2648         to_free = size;
2649     else if (initial)
2650         to_free = block_group->zone_capacity;
2651     else if (offset >= block_group->alloc_offset)
2652         to_free = size;
2653     else if (offset + size <= block_group->alloc_offset)
2654         to_free = 0;
2655     else
2656         to_free = offset + size - block_group->alloc_offset;
2657     to_unusable = size - to_free;
2658 
2659     ctl->free_space += to_free;
2660     /*
2661      * If the block group is read-only, we should account freed space into
2662      * bytes_readonly.
2663      */
2664     if (!block_group->ro)
2665         block_group->zone_unusable += to_unusable;
2666     spin_unlock(&ctl->tree_lock);
2667     if (!used) {
2668         spin_lock(&block_group->lock);
2669         block_group->alloc_offset -= size;
2670         spin_unlock(&block_group->lock);
2671     }
2672 
2673     reclaimable_unusable = block_group->zone_unusable -
2674                    (block_group->length - block_group->zone_capacity);
2675     /* All the region is now unusable. Mark it as unused and reclaim */
2676     if (block_group->zone_unusable == block_group->length) {
2677         btrfs_mark_bg_unused(block_group);
2678     } else if (bg_reclaim_threshold &&
2679            reclaimable_unusable >=
2680            div_factor_fine(block_group->zone_capacity,
2681                    bg_reclaim_threshold)) {
2682         btrfs_mark_bg_to_reclaim(block_group);
2683     }
2684 
2685     return 0;
2686 }
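/*
 * Editor's note (illustrative): for a zoned block group with
 * alloc_offset = 10M, freeing a previously used 4M range at offset 8M
 * straddles the write pointer, so to_free = 8M + 4M - 10M = 2M and
 * to_unusable = 4M - 2M = 2M: only the part at or beyond the write pointer
 * is reusable, the rest is accounted as zone_unusable until the zone is
 * reset.
 */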
2687 
2688 int btrfs_add_free_space(struct btrfs_block_group *block_group,
2689              u64 bytenr, u64 size)
2690 {
2691     enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2692 
2693     if (btrfs_is_zoned(block_group->fs_info))
2694         return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2695                             true);
2696 
2697     if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
2698         trim_state = BTRFS_TRIM_STATE_TRIMMED;
2699 
2700     return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
2701 }
2702 
2703 int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
2704                 u64 bytenr, u64 size)
2705 {
2706     if (btrfs_is_zoned(block_group->fs_info))
2707         return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2708                             false);
2709 
2710     return btrfs_add_free_space(block_group, bytenr, size);
2711 }
2712 
2713 /*
2714  * This is a subtle distinction because when adding free space back in general,
2715  * we want it to be added as untrimmed for async discard. But in the case where
2716  * we add it on loading of a block group, we want to consider it trimmed.
2717  */
2718 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
2719                        u64 bytenr, u64 size)
2720 {
2721     enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2722 
2723     if (btrfs_is_zoned(block_group->fs_info))
2724         return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2725                             true);
2726 
2727     if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
2728         btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
2729         trim_state = BTRFS_TRIM_STATE_TRIMMED;
2730 
2731     return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
2732 }
2733 
2734 int btrfs_remove_free_space(struct btrfs_block_group *block_group,
2735                 u64 offset, u64 bytes)
2736 {
2737     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2738     struct btrfs_free_space *info;
2739     int ret;
2740     bool re_search = false;
2741 
2742     if (btrfs_is_zoned(block_group->fs_info)) {
2743         /*
2744          * This can happen with conventional zones when replaying the log.
2745          * Since the allocation info of tree-log nodes is not recorded
2746          * to the extent tree, calculate_alloc_pointer() fails to
2747          * advance the allocation pointer past the last allocated tree-log
2748          * node blocks.
2749          *
2750          * This function is called from
2751          * btrfs_pin_extent_for_log_replay() when replaying the log.
2752          * Advance the pointer not to overwrite the tree-log nodes.
2753          */
2754         if (block_group->start + block_group->alloc_offset <
2755             offset + bytes) {
2756             block_group->alloc_offset =
2757                 offset + bytes - block_group->start;
2758         }
2759         return 0;
2760     }
2761 
2762     spin_lock(&ctl->tree_lock);
2763 
2764 again:
2765     ret = 0;
2766     if (!bytes)
2767         goto out_lock;
2768 
2769     info = tree_search_offset(ctl, offset, 0, 0);
2770     if (!info) {
2771         /*
2772          * oops didn't find an extent that matched the space we wanted
2773          * to remove, look for a bitmap instead
2774          */
2775         info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2776                       1, 0);
2777         if (!info) {
2778             /*
2779              * If we found a partial bit of our free space in a
2780              * bitmap but then couldn't find the other part this may
2781              * be a problem, so WARN about it.
2782              */
2783             WARN_ON(re_search);
2784             goto out_lock;
2785         }
2786     }
2787 
2788     re_search = false;
2789     if (!info->bitmap) {
2790         unlink_free_space(ctl, info, true);
2791         if (offset == info->offset) {
2792             u64 to_free = min(bytes, info->bytes);
2793 
2794             info->bytes -= to_free;
2795             info->offset += to_free;
2796             if (info->bytes) {
2797                 ret = link_free_space(ctl, info);
2798                 WARN_ON(ret);
2799             } else {
2800                 kmem_cache_free(btrfs_free_space_cachep, info);
2801             }
2802 
2803             offset += to_free;
2804             bytes -= to_free;
2805             goto again;
2806         } else {
2807             u64 old_end = info->bytes + info->offset;
2808 
2809             info->bytes = offset - info->offset;
2810             ret = link_free_space(ctl, info);
2811             WARN_ON(ret);
2812             if (ret)
2813                 goto out_lock;
2814 
2815             /* Not enough bytes in this entry to satisfy us */
2816             if (old_end < offset + bytes) {
2817                 bytes -= old_end - offset;
2818                 offset = old_end;
2819                 goto again;
2820             } else if (old_end == offset + bytes) {
2821                 /* all done */
2822                 goto out_lock;
2823             }
2824             spin_unlock(&ctl->tree_lock);
2825 
2826             ret = __btrfs_add_free_space(block_group,
2827                              offset + bytes,
2828                              old_end - (offset + bytes),
2829                              info->trim_state);
2830             WARN_ON(ret);
2831             goto out;
2832         }
2833     }
2834 
2835     ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2836     if (ret == -EAGAIN) {
2837         re_search = true;
2838         goto again;
2839     }
2840 out_lock:
2841     btrfs_discard_update_discardable(block_group);
2842     spin_unlock(&ctl->tree_lock);
2843 out:
2844     return ret;
2845 }
2846 
2847 void btrfs_dump_free_space(struct btrfs_block_group *block_group,
2848                u64 bytes)
2849 {
2850     struct btrfs_fs_info *fs_info = block_group->fs_info;
2851     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2852     struct btrfs_free_space *info;
2853     struct rb_node *n;
2854     int count = 0;
2855 
2856     /*
2857      * Zoned btrfs does not use the in-memory free space cache or clusters.
2858      * Just print out the free space after the allocation offset.
2859      */
2860     if (btrfs_is_zoned(fs_info)) {
2861         btrfs_info(fs_info, "free space %llu active %d",
2862                block_group->zone_capacity - block_group->alloc_offset,
2863                block_group->zone_is_active);
2864         return;
2865     }
2866 
2867     spin_lock(&ctl->tree_lock);
2868     for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2869         info = rb_entry(n, struct btrfs_free_space, offset_index);
2870         if (info->bytes >= bytes && !block_group->ro)
2871             count++;
2872         btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
2873                info->offset, info->bytes,
2874                (info->bitmap) ? "yes" : "no");
2875     }
2876     spin_unlock(&ctl->tree_lock);
2877     btrfs_info(fs_info, "block group has cluster?: %s",
2878            list_empty(&block_group->cluster_list) ? "no" : "yes");
2879     btrfs_info(fs_info,
2880            "%d blocks of free space at or bigger than bytes is", count);
2881 }
2882 
2883 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
2884                    struct btrfs_free_space_ctl *ctl)
2885 {
2886     struct btrfs_fs_info *fs_info = block_group->fs_info;
2887 
2888     spin_lock_init(&ctl->tree_lock);
2889     ctl->unit = fs_info->sectorsize;
2890     ctl->start = block_group->start;
2891     ctl->block_group = block_group;
2892     ctl->op = &free_space_op;
2893     ctl->free_space_bytes = RB_ROOT_CACHED;
2894     INIT_LIST_HEAD(&ctl->trimming_ranges);
2895     mutex_init(&ctl->cache_writeout_mutex);
2896 
2897     /*
2898      * we only want to have 32k of ram per block group for keeping
2899      * track of free space, and if we pass 1/2 of that we want to
2900      * start converting things over to using bitmaps
2901      */
2902     ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
2903 }
2904 
2905 /*
2906  * for a given cluster, put all of its extents back into the free
2907  * space cache.  If the block group passed doesn't match the block group
2908  * pointed to by the cluster, someone else raced in and freed the
2909  * cluster already.  In that case, we just return without changing anything
2910  */
2911 static void __btrfs_return_cluster_to_free_space(
2912                  struct btrfs_block_group *block_group,
2913                  struct btrfs_free_cluster *cluster)
2914 {
2915     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2916     struct btrfs_free_space *entry;
2917     struct rb_node *node;
2918 
2919     spin_lock(&cluster->lock);
2920     if (cluster->block_group != block_group) {
2921         spin_unlock(&cluster->lock);
2922         return;
2923     }
2924 
2925     cluster->block_group = NULL;
2926     cluster->window_start = 0;
2927     list_del_init(&cluster->block_group_list);
2928 
2929     node = rb_first(&cluster->root);
2930     while (node) {
2931         bool bitmap;
2932 
2933         entry = rb_entry(node, struct btrfs_free_space, offset_index);
2934         node = rb_next(&entry->offset_index);
2935         rb_erase(&entry->offset_index, &cluster->root);
2936         RB_CLEAR_NODE(&entry->offset_index);
2937 
2938         bitmap = (entry->bitmap != NULL);
2939         if (!bitmap) {
2940             /* Merging treats extents as if they were new */
2941             if (!btrfs_free_space_trimmed(entry)) {
2942                 ctl->discardable_extents[BTRFS_STAT_CURR]--;
2943                 ctl->discardable_bytes[BTRFS_STAT_CURR] -=
2944                     entry->bytes;
2945             }
2946 
2947             try_merge_free_space(ctl, entry, false);
2948             steal_from_bitmap(ctl, entry, false);
2949 
2950             /* As we insert directly, update these statistics */
2951             if (!btrfs_free_space_trimmed(entry)) {
2952                 ctl->discardable_extents[BTRFS_STAT_CURR]++;
2953                 ctl->discardable_bytes[BTRFS_STAT_CURR] +=
2954                     entry->bytes;
2955             }
2956         }
2957         tree_insert_offset(&ctl->free_space_offset,
2958                    entry->offset, &entry->offset_index, bitmap);
2959         rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes,
2960                   entry_less);
2961     }
2962     cluster->root = RB_ROOT;
2963     spin_unlock(&cluster->lock);
2964     btrfs_put_block_group(block_group);
2965 }
2966 
2967 static void __btrfs_remove_free_space_cache_locked(
2968                 struct btrfs_free_space_ctl *ctl)
2969 {
2970     struct btrfs_free_space *info;
2971     struct rb_node *node;
2972 
2973     while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2974         info = rb_entry(node, struct btrfs_free_space, offset_index);
2975         if (!info->bitmap) {
2976             unlink_free_space(ctl, info, true);
2977             kmem_cache_free(btrfs_free_space_cachep, info);
2978         } else {
2979             free_bitmap(ctl, info);
2980         }
2981 
2982         cond_resched_lock(&ctl->tree_lock);
2983     }
2984 }
2985 
2986 void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2987 {
2988     spin_lock(&ctl->tree_lock);
2989     __btrfs_remove_free_space_cache_locked(ctl);
2990     if (ctl->block_group)
2991         btrfs_discard_update_discardable(ctl->block_group);
2992     spin_unlock(&ctl->tree_lock);
2993 }
2994 
2995 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
2996 {
2997     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2998     struct btrfs_free_cluster *cluster;
2999     struct list_head *head;
3000 
3001     spin_lock(&ctl->tree_lock);
3002     while ((head = block_group->cluster_list.next) !=
3003            &block_group->cluster_list) {
3004         cluster = list_entry(head, struct btrfs_free_cluster,
3005                      block_group_list);
3006 
3007         WARN_ON(cluster->block_group != block_group);
3008         __btrfs_return_cluster_to_free_space(block_group, cluster);
3009 
3010         cond_resched_lock(&ctl->tree_lock);
3011     }
3012     __btrfs_remove_free_space_cache_locked(ctl);
3013     btrfs_discard_update_discardable(block_group);
3014     spin_unlock(&ctl->tree_lock);
3015 
3016 }
3017 
3018 /**
3019  * btrfs_is_free_space_trimmed - see if everything is trimmed
3020  * @block_group: block_group of interest
3021  *
3022  * Walk @block_group's free space rb_tree to determine if everything is trimmed.
3023  */
3024 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
3025 {
3026     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3027     struct btrfs_free_space *info;
3028     struct rb_node *node;
3029     bool ret = true;
3030 
3031     spin_lock(&ctl->tree_lock);
3032     node = rb_first(&ctl->free_space_offset);
3033 
3034     while (node) {
3035         info = rb_entry(node, struct btrfs_free_space, offset_index);
3036 
3037         if (!btrfs_free_space_trimmed(info)) {
3038             ret = false;
3039             break;
3040         }
3041 
3042         node = rb_next(node);
3043     }
3044 
3045     spin_unlock(&ctl->tree_lock);
3046     return ret;
3047 }
3048 
3049 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
3050                    u64 offset, u64 bytes, u64 empty_size,
3051                    u64 *max_extent_size)
3052 {
3053     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3054     struct btrfs_discard_ctl *discard_ctl =
3055                     &block_group->fs_info->discard_ctl;
3056     struct btrfs_free_space *entry = NULL;
3057     u64 bytes_search = bytes + empty_size;
3058     u64 ret = 0;
3059     u64 align_gap = 0;
3060     u64 align_gap_len = 0;
3061     enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3062     bool use_bytes_index = (offset == block_group->start);
3063 
3064     ASSERT(!btrfs_is_zoned(block_group->fs_info));
3065 
3066     spin_lock(&ctl->tree_lock);
3067     entry = find_free_space(ctl, &offset, &bytes_search,
3068                 block_group->full_stripe_len, max_extent_size,
3069                 use_bytes_index);
3070     if (!entry)
3071         goto out;
3072 
3073     ret = offset;
3074     if (entry->bitmap) {
3075         bitmap_clear_bits(ctl, entry, offset, bytes, true);
3076 
3077         if (!btrfs_free_space_trimmed(entry))
3078             atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3079 
3080         if (!entry->bytes)
3081             free_bitmap(ctl, entry);
3082     } else {
3083         unlink_free_space(ctl, entry, true);
3084         align_gap_len = offset - entry->offset;
3085         align_gap = entry->offset;
3086         align_gap_trim_state = entry->trim_state;
3087 
3088         if (!btrfs_free_space_trimmed(entry))
3089             atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3090 
3091         entry->offset = offset + bytes;
3092         WARN_ON(entry->bytes < bytes + align_gap_len);
3093 
3094         entry->bytes -= bytes + align_gap_len;
3095         if (!entry->bytes)
3096             kmem_cache_free(btrfs_free_space_cachep, entry);
3097         else
3098             link_free_space(ctl, entry);
3099     }
3100 out:
3101     btrfs_discard_update_discardable(block_group);
3102     spin_unlock(&ctl->tree_lock);
3103 
3104     if (align_gap_len)
3105         __btrfs_add_free_space(block_group, align_gap, align_gap_len,
3106                        align_gap_trim_state);
3107     return ret;
3108 }
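/*
 * Editor's note (illustrative): when the chosen extent entry is not aligned
 * to full_stripe_len, the allocation is carved out of its middle.  E.g. an
 * entry at offset 100000 with 64K stripe alignment returns offset 131072; the
 * 31072 byte prefix is remembered as align_gap / align_gap_len and re-added
 * via __btrfs_add_free_space() after the tree lock is dropped, while the
 * entry itself is re-linked to describe the space left after the allocation.
 */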
3109 
3110 /*
3111  * given a cluster, put all of its extents back into the free space
3112  * cache.  If a block group is passed, this function will only free
3113  * a cluster that belongs to the passed block group.
3114  *
3115  * Otherwise, it'll get a reference on the block group pointed to by the
3116  * cluster and remove the cluster from it.
3117  */
3118 void btrfs_return_cluster_to_free_space(
3119                    struct btrfs_block_group *block_group,
3120                    struct btrfs_free_cluster *cluster)
3121 {
3122     struct btrfs_free_space_ctl *ctl;
3123 
3124     /* first, get a safe pointer to the block group */
3125     spin_lock(&cluster->lock);
3126     if (!block_group) {
3127         block_group = cluster->block_group;
3128         if (!block_group) {
3129             spin_unlock(&cluster->lock);
3130             return;
3131         }
3132     } else if (cluster->block_group != block_group) {
3133         /* someone else has already freed it don't redo their work */
3134         spin_unlock(&cluster->lock);
3135         return;
3136     }
3137     btrfs_get_block_group(block_group);
3138     spin_unlock(&cluster->lock);
3139 
3140     ctl = block_group->free_space_ctl;
3141 
3142     /* now return any extents the cluster had on it */
3143     spin_lock(&ctl->tree_lock);
3144     __btrfs_return_cluster_to_free_space(block_group, cluster);
3145     spin_unlock(&ctl->tree_lock);
3146 
3147     btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
3148 
3149     /* finally drop our ref */
3150     btrfs_put_block_group(block_group);
3151 }
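
/*
 * Hypothetical usage sketch: a teardown path could drain both per-fs
 * allocation clusters back into this block group's free space cache.
 * example_drain_clusters() is illustrative only; the cluster fields are
 * assumed to be the usual fs_info ones.
 */
static void example_drain_clusters(struct btrfs_fs_info *fs_info,
                                   struct btrfs_block_group *block_group)
{
    btrfs_return_cluster_to_free_space(block_group,
                                       &fs_info->data_alloc_cluster);
    btrfs_return_cluster_to_free_space(block_group,
                                       &fs_info->meta_alloc_cluster);
}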
3152 
3153 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
3154                    struct btrfs_free_cluster *cluster,
3155                    struct btrfs_free_space *entry,
3156                    u64 bytes, u64 min_start,
3157                    u64 *max_extent_size)
3158 {
3159     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3160     int err;
3161     u64 search_start = min_start;
3162     u64 search_bytes = bytes;
3163     u64 ret = 0;
3167 
3168     err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
3169     if (err) {
3170         *max_extent_size = max(get_max_extent_size(entry),
3171                        *max_extent_size);
3172         return 0;
3173     }
3174 
3175     ret = search_start;
3176     bitmap_clear_bits(ctl, entry, ret, bytes, false);
3177 
3178     return ret;
3179 }
3180 
3181 /*
3182  * Given a cluster, try to allocate 'bytes' from it.  Returns 0 if
3183  * nothing suitably large could be found, or the logical disk offset of
3184  * the allocation if things worked out.
3185  */
3186 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
3187                  struct btrfs_free_cluster *cluster, u64 bytes,
3188                  u64 min_start, u64 *max_extent_size)
3189 {
3190     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3191     struct btrfs_discard_ctl *discard_ctl =
3192                     &block_group->fs_info->discard_ctl;
3193     struct btrfs_free_space *entry = NULL;
3194     struct rb_node *node;
3195     u64 ret = 0;
3196 
3197     ASSERT(!btrfs_is_zoned(block_group->fs_info));
3198 
3199     spin_lock(&cluster->lock);
3200     if (bytes > cluster->max_size)
3201         goto out;
3202 
3203     if (cluster->block_group != block_group)
3204         goto out;
3205 
3206     node = rb_first(&cluster->root);
3207     if (!node)
3208         goto out;
3209 
3210     entry = rb_entry(node, struct btrfs_free_space, offset_index);
3211     while (1) {
3212         if (entry->bytes < bytes)
3213             *max_extent_size = max(get_max_extent_size(entry),
3214                            *max_extent_size);
3215 
3216         if (entry->bytes < bytes ||
3217             (!entry->bitmap && entry->offset < min_start)) {
3218             node = rb_next(&entry->offset_index);
3219             if (!node)
3220                 break;
3221             entry = rb_entry(node, struct btrfs_free_space,
3222                      offset_index);
3223             continue;
3224         }
3225 
3226         if (entry->bitmap) {
3227             ret = btrfs_alloc_from_bitmap(block_group,
3228                               cluster, entry, bytes,
3229                               cluster->window_start,
3230                               max_extent_size);
3231             if (ret == 0) {
3232                 node = rb_next(&entry->offset_index);
3233                 if (!node)
3234                     break;
3235                 entry = rb_entry(node, struct btrfs_free_space,
3236                          offset_index);
3237                 continue;
3238             }
3239             cluster->window_start += bytes;
3240         } else {
3241             ret = entry->offset;
3242 
3243             entry->offset += bytes;
3244             entry->bytes -= bytes;
3245         }
3246 
3247         break;
3248     }
3249 out:
3250     spin_unlock(&cluster->lock);
3251 
3252     if (!ret)
3253         return 0;
3254 
3255     spin_lock(&ctl->tree_lock);
3256 
3257     if (!btrfs_free_space_trimmed(entry))
3258         atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3259 
3260     ctl->free_space -= bytes;
3261     if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
3262         ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
3263 
3264     spin_lock(&cluster->lock);
3265     if (entry->bytes == 0) {
3266         rb_erase(&entry->offset_index, &cluster->root);
3267         ctl->free_extents--;
3268         if (entry->bitmap) {
3269             kmem_cache_free(btrfs_free_space_bitmap_cachep,
3270                     entry->bitmap);
3271             ctl->total_bitmaps--;
3272             recalculate_thresholds(ctl);
3273         } else if (!btrfs_free_space_trimmed(entry)) {
3274             ctl->discardable_extents[BTRFS_STAT_CURR]--;
3275         }
3276         kmem_cache_free(btrfs_free_space_cachep, entry);
3277     }
3278 
3279     spin_unlock(&cluster->lock);
3280     spin_unlock(&ctl->tree_lock);
3281 
3282     return ret;
3283 }
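
/*
 * Hypothetical sketch of a caller: try the cluster first, then fall back
 * to the per-block-group free space cache.  example_alloc() is
 * illustrative only and simplifies the real extent allocator (no cluster
 * refill, no cluster->refill_lock handling).
 */
static u64 example_alloc(struct btrfs_block_group *block_group,
                         struct btrfs_free_cluster *cluster,
                         u64 num_bytes, u64 empty_size)
{
    u64 max_extent_size = 0;
    u64 offset;

    offset = btrfs_alloc_from_cluster(block_group, cluster, num_bytes,
                                      block_group->start, &max_extent_size);
    if (offset)
        return offset;

    /* The cluster had nothing large enough, use the regular cache. */
    return btrfs_find_space_for_alloc(block_group, block_group->start,
                                      num_bytes, empty_size,
                                      &max_extent_size);
}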
3284 
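/*
 * Scan a single bitmap entry for runs of set bits that can seed a cluster:
 * we need want_bits total, with at least one run of cont1_bytes and every
 * counted run at least min_bytes.  On success the entry is moved out of
 * the free space trees and into the cluster's private rbtree.
 */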
3285 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
3286                 struct btrfs_free_space *entry,
3287                 struct btrfs_free_cluster *cluster,
3288                 u64 offset, u64 bytes,
3289                 u64 cont1_bytes, u64 min_bytes)
3290 {
3291     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3292     unsigned long next_zero;
3293     unsigned long i;
3294     unsigned long want_bits;
3295     unsigned long min_bits;
3296     unsigned long found_bits;
3297     unsigned long max_bits = 0;
3298     unsigned long start = 0;
3299     unsigned long total_found = 0;
3300     int ret;
3301 
3302     i = offset_to_bit(entry->offset, ctl->unit,
3303               max_t(u64, offset, entry->offset));
3304     want_bits = bytes_to_bits(bytes, ctl->unit);
3305     min_bits = bytes_to_bits(min_bytes, ctl->unit);
3306 
3307     /*
3308      * Don't bother looking for a cluster in this bitmap if it's heavily
3309      * fragmented.
3310      */
3311     if (entry->max_extent_size &&
3312         entry->max_extent_size < cont1_bytes)
3313         return -ENOSPC;
3314 again:
3315     found_bits = 0;
3316     for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
3317         next_zero = find_next_zero_bit(entry->bitmap,
3318                            BITS_PER_BITMAP, i);
3319         if (next_zero - i >= min_bits) {
3320             found_bits = next_zero - i;
3321             if (found_bits > max_bits)
3322                 max_bits = found_bits;
3323             break;
3324         }
3325         if (next_zero - i > max_bits)
3326             max_bits = next_zero - i;
3327         i = next_zero;
3328     }
3329 
3330     if (!found_bits) {
3331         entry->max_extent_size = (u64)max_bits * ctl->unit;
3332         return -ENOSPC;
3333     }
3334 
3335     if (!total_found) {
3336         start = i;
3337         cluster->max_size = 0;
3338     }
3339 
3340     total_found += found_bits;
3341 
3342     if (cluster->max_size < found_bits * ctl->unit)
3343         cluster->max_size = found_bits * ctl->unit;
3344 
3345     if (total_found < want_bits || cluster->max_size < cont1_bytes) {
3346         i = next_zero + 1;
3347         goto again;
3348     }
3349 
3350     cluster->window_start = start * ctl->unit + entry->offset;
3351     rb_erase(&entry->offset_index, &ctl->free_space_offset);
3352     rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
3353 
3354     /*
3355      * We need to know if we're currently on the normal space index when we
3356      * manipulate the bitmap so that we know we need to remove and re-insert
3357      * it into the space_index tree.  Clear the bytes_index node here so the
3358      * bitmap manipulation helpers know not to mess with the space_index
3359      * until this bitmap entry is added back into the normal cache.
3360      */
3361     RB_CLEAR_NODE(&entry->bytes_index);
3362 
3363     ret = tree_insert_offset(&cluster->root, entry->offset,
3364                  &entry->offset_index, 1);
3365     ASSERT(!ret); /* -EEXIST; Logic error */
3366 
3367     trace_btrfs_setup_cluster(block_group, cluster,
3368                   total_found * ctl->unit, 1);
3369     return 0;
3370 }
3371 
3372 /*
3373  * This searches the block group for just extents to fill the cluster with.
3374  * Try to find a cluster with at least bytes total bytes, at least one
3375  * extent of cont1_bytes, and other extents of at least min_bytes.
3376  */
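/*
 * Worked example (illustrative numbers): with bytes = 12M, cont1_bytes = 8M
 * and min_bytes = 4K, free extents of 9M, 2M and 1M give window_free = 12M
 * and max_extent = 9M, so the window is accepted.  Extents of 6M, 4M and 2M
 * also sum to 12M, but max_extent = 6M < cont1_bytes, so we return -ENOSPC
 * and the caller falls back to setup_cluster_bitmap().
 */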
3377 static noinline int
3378 setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
3379             struct btrfs_free_cluster *cluster,
3380             struct list_head *bitmaps, u64 offset, u64 bytes,
3381             u64 cont1_bytes, u64 min_bytes)
3382 {
3383     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3384     struct btrfs_free_space *first = NULL;
3385     struct btrfs_free_space *entry = NULL;
3386     struct btrfs_free_space *last;
3387     struct rb_node *node;
3388     u64 window_free;
3389     u64 max_extent;
3390     u64 total_size = 0;
3391 
3392     entry = tree_search_offset(ctl, offset, 0, 1);
3393     if (!entry)
3394         return -ENOSPC;
3395 
3396     /*
3397      * We don't want bitmaps, so just move along until we find a normal
3398      * extent entry.
3399      */
3400     while (entry->bitmap || entry->bytes < min_bytes) {
3401         if (entry->bitmap && list_empty(&entry->list))
3402             list_add_tail(&entry->list, bitmaps);
3403         node = rb_next(&entry->offset_index);
3404         if (!node)
3405             return -ENOSPC;
3406         entry = rb_entry(node, struct btrfs_free_space, offset_index);
3407     }
3408 
3409     window_free = entry->bytes;
3410     max_extent = entry->bytes;
3411     first = entry;
3412     last = entry;
3413 
3414     for (node = rb_next(&entry->offset_index); node;
3415          node = rb_next(&entry->offset_index)) {
3416         entry = rb_entry(node, struct btrfs_free_space, offset_index);
3417 
3418         if (entry->bitmap) {
3419             if (list_empty(&entry->list))
3420                 list_add_tail(&entry->list, bitmaps);
3421             continue;
3422         }
3423 
3424         if (entry->bytes < min_bytes)
3425             continue;
3426 
3427         last = entry;
3428         window_free += entry->bytes;
3429         if (entry->bytes > max_extent)
3430             max_extent = entry->bytes;
3431     }
3432 
3433     if (window_free < bytes || max_extent < cont1_bytes)
3434         return -ENOSPC;
3435 
3436     cluster->window_start = first->offset;
3437 
3438     node = &first->offset_index;
3439 
3440     /*
3441      * now we've found our entries, pull them out of the free space
3442      * cache and put them into the cluster rbtree
3443      */
3444     do {
3445         int ret;
3446 
3447         entry = rb_entry(node, struct btrfs_free_space, offset_index);
3448         node = rb_next(&entry->offset_index);
3449         if (entry->bitmap || entry->bytes < min_bytes)
3450             continue;
3451 
3452         rb_erase(&entry->offset_index, &ctl->free_space_offset);
3453         rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
3454         ret = tree_insert_offset(&cluster->root, entry->offset,
3455                      &entry->offset_index, 0);
3456         total_size += entry->bytes;
3457         ASSERT(!ret); /* -EEXIST; Logic error */
3458     } while (node && entry != last);
3459 
3460     cluster->max_size = max_extent;
3461     trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
3462     return 0;
3463 }
3464 
3465 /*
3466  * This specifically looks for bitmaps that may work in the cluster, we assume
3467  * that we have already failed to find extents that will work.
3468  */
3469 static noinline int
3470 setup_cluster_bitmap(struct btrfs_block_group *block_group,
3471              struct btrfs_free_cluster *cluster,
3472              struct list_head *bitmaps, u64 offset, u64 bytes,
3473              u64 cont1_bytes, u64 min_bytes)
3474 {
3475     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3476     struct btrfs_free_space *entry = NULL;
3477     int ret = -ENOSPC;
3478     u64 bitmap_offset = offset_to_bitmap(ctl, offset);
3479 
3480     if (ctl->total_bitmaps == 0)
3481         return -ENOSPC;
3482 
3483     /*
3484      * The bitmap that covers offset won't be in the list unless offset
3485      * is just its start offset.
3486      */
3487     if (!list_empty(bitmaps))
3488         entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3489 
3490     if (!entry || entry->offset != bitmap_offset) {
3491         entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3492         if (entry && list_empty(&entry->list))
3493             list_add(&entry->list, bitmaps);
3494     }
3495 
3496     list_for_each_entry(entry, bitmaps, list) {
3497         if (entry->bytes < bytes)
3498             continue;
3499         ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3500                        bytes, cont1_bytes, min_bytes);
3501         if (!ret)
3502             return 0;
3503     }
3504 
3505     /*
3506      * The bitmaps list has all the bitmaps that record free space
3507      * starting after offset, so no more search is required.
3508      */
3509     return -ENOSPC;
3510 }
3511 
3512 /*
3513  * Here we try to find a cluster of blocks in a block group.  The goal
3514  * is to find at least bytes + empty_size.
3515  * We might not find them all in one contiguous area.
3516  *
3517  * Returns zero and sets up the cluster if things worked out, otherwise
3518  * it returns -ENOSPC.
3519  */
3520 int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
3521                  struct btrfs_free_cluster *cluster,
3522                  u64 offset, u64 bytes, u64 empty_size)
3523 {
3524     struct btrfs_fs_info *fs_info = block_group->fs_info;
3525     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3526     struct btrfs_free_space *entry, *tmp;
3527     LIST_HEAD(bitmaps);
3528     u64 min_bytes;
3529     u64 cont1_bytes;
3530     int ret;
3531 
3532     /*
3533      * Choose the minimum extent size we'll require for this
3534      * cluster.  For SSD_SPREAD, don't allow any fragmentation.
3535      * For metadata, allow allocations with smaller extents.  For
3536      * data, keep it dense.
3537      */
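    /*
     * Illustrative numbers: a data block group asking for bytes = 8M with
     * empty_size = 4M gets cont1_bytes = max(8M, 12M >> 2) = 8M and
     * min_bytes = sectorsize, i.e. the cluster window must contain one 8M
     * contiguous run and may be padded out to 12M with sector-sized
     * extents.
     */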
3538     if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3539         cont1_bytes = bytes + empty_size;
3540         min_bytes = cont1_bytes;
3541     } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3542         cont1_bytes = bytes;
3543         min_bytes = fs_info->sectorsize;
3544     } else {
3545         cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3546         min_bytes = fs_info->sectorsize;
3547     }
3548 
3549     spin_lock(&ctl->tree_lock);
3550 
3551     /*
3552      * If we know we don't have enough space to make a cluster don't even
3553      * bother doing all the work to try and find one.
3554      */
3555     if (ctl->free_space < bytes) {
3556         spin_unlock(&ctl->tree_lock);
3557         return -ENOSPC;
3558     }
3559 
3560     spin_lock(&cluster->lock);
3561 
3562     /* someone already found a cluster, hooray */
3563     if (cluster->block_group) {
3564         ret = 0;
3565         goto out;
3566     }
3567 
3568     trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3569                  min_bytes);
3570 
3571     ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3572                       bytes + empty_size,
3573                       cont1_bytes, min_bytes);
3574     if (ret)
3575         ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3576                        offset, bytes + empty_size,
3577                        cont1_bytes, min_bytes);
3578 
3579     /* Clear our temporary list */
3580     list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3581         list_del_init(&entry->list);
3582 
3583     if (!ret) {
3584         btrfs_get_block_group(block_group);
3585         list_add_tail(&cluster->block_group_list,
3586                   &block_group->cluster_list);
3587         cluster->block_group = block_group;
3588     } else {
3589         trace_btrfs_failed_cluster_setup(block_group);
3590     }
3591 out:
3592     spin_unlock(&cluster->lock);
3593     spin_unlock(&ctl->tree_lock);
3594 
3595     return ret;
3596 }
3597 
3598 /*
3599  * simple code to zero out a cluster
3600  */
3601 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3602 {
3603     spin_lock_init(&cluster->lock);
3604     spin_lock_init(&cluster->refill_lock);
3605     cluster->root = RB_ROOT;
3606     cluster->max_size = 0;
3607     cluster->fragmented = false;
3608     INIT_LIST_HEAD(&cluster->block_group_list);
3609     cluster->block_group = NULL;
3610 }
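
/*
 * Hypothetical sketch: mirrors how the per-fs allocation clusters are
 * zeroed at mount time.  example_init_clusters() is illustrative only.
 */
static void example_init_clusters(struct btrfs_fs_info *fs_info)
{
    btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
    btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
}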
3611 
3612 static int do_trimming(struct btrfs_block_group *block_group,
3613                u64 *total_trimmed, u64 start, u64 bytes,
3614                u64 reserved_start, u64 reserved_bytes,
3615                enum btrfs_trim_state reserved_trim_state,
3616                struct btrfs_trim_range *trim_entry)
3617 {
3618     struct btrfs_space_info *space_info = block_group->space_info;
3619     struct btrfs_fs_info *fs_info = block_group->fs_info;
3620     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3621     int ret;
3622     int update = 0;
3623     const u64 end = start + bytes;
3624     const u64 reserved_end = reserved_start + reserved_bytes;
3625     enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3626     u64 trimmed = 0;
3627 
3628     spin_lock(&space_info->lock);
3629     spin_lock(&block_group->lock);
3630     if (!block_group->ro) {
3631         block_group->reserved += reserved_bytes;
3632         space_info->bytes_reserved += reserved_bytes;
3633         update = 1;
3634     }
3635     spin_unlock(&block_group->lock);
3636     spin_unlock(&space_info->lock);
3637 
3638     ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
3639     if (!ret) {
3640         *total_trimmed += trimmed;
3641         trim_state = BTRFS_TRIM_STATE_TRIMMED;
3642     }
3643 
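    /*
     * The reserved range can be wider than the range that was actually
     * trimmed: any leftover in front ([reserved_start, start)) or behind
     * ([end, reserved_end)) is re-added with its previous trim state,
     * while the middle goes back as trimmed (or untrimmed on error).
     */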
3644     mutex_lock(&ctl->cache_writeout_mutex);
3645     if (reserved_start < start)
3646         __btrfs_add_free_space(block_group, reserved_start,
3647                        start - reserved_start,
3648                        reserved_trim_state);
3649     if (end < reserved_end)
3650         __btrfs_add_free_space(block_group, end, reserved_end - end,
3651                        reserved_trim_state);
3652     __btrfs_add_free_space(block_group, start, bytes, trim_state);
3653     list_del(&trim_entry->list);
3654     mutex_unlock(&ctl->cache_writeout_mutex);
3655 
3656     if (update) {
3657         spin_lock(&space_info->lock);
3658         spin_lock(&block_group->lock);
3659         if (block_group->ro)
3660             space_info->bytes_readonly += reserved_bytes;
3661         block_group->reserved -= reserved_bytes;
3662         space_info->bytes_reserved -= reserved_bytes;
3663         spin_unlock(&block_group->lock);
3664         spin_unlock(&space_info->lock);
3665     }
3666 
3667     return ret;
3668 }
3669 
3670 /*
3671  * If @async is set, then we will trim 1 region and return.
3672  */
3673 static int trim_no_bitmap(struct btrfs_block_group *block_group,
3674               u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3675               bool async)
3676 {
3677     struct btrfs_discard_ctl *discard_ctl =
3678                     &block_group->fs_info->discard_ctl;
3679     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3680     struct btrfs_free_space *entry;
3681     struct rb_node *node;
3682     int ret = 0;
3683     u64 extent_start;
3684     u64 extent_bytes;
3685     enum btrfs_trim_state extent_trim_state;
3686     u64 bytes;
3687     const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3688 
3689     while (start < end) {
3690         struct btrfs_trim_range trim_entry;
3691 
3692         mutex_lock(&ctl->cache_writeout_mutex);
3693         spin_lock(&ctl->tree_lock);
3694 
3695         if (ctl->free_space < minlen)
3696             goto out_unlock;
3697 
3698         entry = tree_search_offset(ctl, start, 0, 1);
3699         if (!entry)
3700             goto out_unlock;
3701 
3702         /* Skip bitmaps and, if async, already trimmed entries */
3703         while (entry->bitmap ||
3704                (async && btrfs_free_space_trimmed(entry))) {
3705             node = rb_next(&entry->offset_index);
3706             if (!node)
3707                 goto out_unlock;
3708             entry = rb_entry(node, struct btrfs_free_space,
3709                      offset_index);
3710         }
3711 
3712         if (entry->offset >= end)
3713             goto out_unlock;
3714 
3715         extent_start = entry->offset;
3716         extent_bytes = entry->bytes;
3717         extent_trim_state = entry->trim_state;
3718         if (async) {
3719             start = entry->offset;
3720             bytes = entry->bytes;
3721             if (bytes < minlen) {
3722                 spin_unlock(&ctl->tree_lock);
3723                 mutex_unlock(&ctl->cache_writeout_mutex);
3724                 goto next;
3725             }
3726             unlink_free_space(ctl, entry, true);
3727             /*
3728              * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3729              * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
3730              * X when we come back around.  So trim it now.
3731              */
3732             if (max_discard_size &&
3733                 bytes >= (max_discard_size +
3734                       BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
3735                 bytes = max_discard_size;
3736                 extent_bytes = max_discard_size;
3737                 entry->offset += max_discard_size;
3738                 entry->bytes -= max_discard_size;
3739                 link_free_space(ctl, entry);
3740             } else {
3741                 kmem_cache_free(btrfs_free_space_cachep, entry);
3742             }
3743         } else {
3744             start = max(start, extent_start);
3745             bytes = min(extent_start + extent_bytes, end) - start;
3746             if (bytes < minlen) {
3747                 spin_unlock(&ctl->tree_lock);
3748                 mutex_unlock(&ctl->cache_writeout_mutex);
3749                 goto next;
3750             }
3751 
3752             unlink_free_space(ctl, entry, true);
3753             kmem_cache_free(btrfs_free_space_cachep, entry);
3754         }
3755 
3756         spin_unlock(&ctl->tree_lock);
3757         trim_entry.start = extent_start;
3758         trim_entry.bytes = extent_bytes;
3759         list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3760         mutex_unlock(&ctl->cache_writeout_mutex);
3761 
3762         ret = do_trimming(block_group, total_trimmed, start, bytes,
3763                   extent_start, extent_bytes, extent_trim_state,
3764                   &trim_entry);
3765         if (ret) {
3766             block_group->discard_cursor = start + bytes;
3767             break;
3768         }
3769 next:
3770         start += bytes;
3771         block_group->discard_cursor = start;
3772         if (async && *total_trimmed)
3773             break;
3774 
3775         if (fatal_signal_pending(current)) {
3776             ret = -ERESTARTSYS;
3777             break;
3778         }
3779 
3780         cond_resched();
3781     }
3782 
3783     return ret;
3784 
3785 out_unlock:
3786     block_group->discard_cursor = btrfs_block_group_end(block_group);
3787     spin_unlock(&ctl->tree_lock);
3788     mutex_unlock(&ctl->cache_writeout_mutex);
3789 
3790     return ret;
3791 }
3792 
3793 /*
3794  * If we break out of trimming a bitmap prematurely, we should reset the
3795  * trimming bit.  In a rather contrived case, it's possible to race here so
3796  * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
3797  *
3798  * start = start of bitmap
3799  * end = near end of bitmap
3800  *
3801  * Thread 1:            Thread 2:
3802  * trim_bitmaps(start)
3803  *              trim_bitmaps(end)
3804  *              end_trimming_bitmap()
3805  * reset_trimming_bitmap()
3806  */
3807 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
3808 {
3809     struct btrfs_free_space *entry;
3810 
3811     spin_lock(&ctl->tree_lock);
3812     entry = tree_search_offset(ctl, offset, 1, 0);
3813     if (entry) {
3814         if (btrfs_free_space_trimmed(entry)) {
3815             ctl->discardable_extents[BTRFS_STAT_CURR] +=
3816                 entry->bitmap_extents;
3817             ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
3818         }
3819         entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3820     }
3821 
3822     spin_unlock(&ctl->tree_lock);
3823 }
3824 
3825 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
3826                 struct btrfs_free_space *entry)
3827 {
3828     if (btrfs_free_space_trimming_bitmap(entry)) {
3829         entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
3830         ctl->discardable_extents[BTRFS_STAT_CURR] -=
3831             entry->bitmap_extents;
3832         ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
3833     }
3834 }
3835 
3836 /*
3837  * If @async is set, then we will trim 1 region and return.
3838  */
3839 static int trim_bitmaps(struct btrfs_block_group *block_group,
3840             u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3841             u64 maxlen, bool async)
3842 {
3843     struct btrfs_discard_ctl *discard_ctl =
3844                     &block_group->fs_info->discard_ctl;
3845     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3846     struct btrfs_free_space *entry;
3847     int ret = 0;
3848     int ret2;
3849     u64 bytes;
3850     u64 offset = offset_to_bitmap(ctl, start);
3851     const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3852 
3853     while (offset < end) {
3854         bool next_bitmap = false;
3855         struct btrfs_trim_range trim_entry;
3856 
3857         mutex_lock(&ctl->cache_writeout_mutex);
3858         spin_lock(&ctl->tree_lock);
3859 
3860         if (ctl->free_space < minlen) {
3861             block_group->discard_cursor =
3862                 btrfs_block_group_end(block_group);
3863             spin_unlock(&ctl->tree_lock);
3864             mutex_unlock(&ctl->cache_writeout_mutex);
3865             break;
3866         }
3867 
3868         entry = tree_search_offset(ctl, offset, 1, 0);
3869         /*
3870          * Bitmaps are marked trimmed lossily now to prevent constant
3871          * discarding of the same bitmap (the reason why we are bound
3872          * by the filters).  So, retrim the block group bitmaps when we
3873          * are preparing to punt to the unused_bgs list.  This uses
3874          * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED,
3875          * the only discard index that sets minlen to 0.
3876          */
3877         if (!entry || (async && minlen && start == offset &&
3878                    btrfs_free_space_trimmed(entry))) {
3879             spin_unlock(&ctl->tree_lock);
3880             mutex_unlock(&ctl->cache_writeout_mutex);
3881             next_bitmap = true;
3882             goto next;
3883         }
3884 
3885         /*
3886          * Async discard bitmap trimming begins by setting the start to
3887          * key.objectid, and offset_to_bitmap() aligns that to the start
3888          * of the bitmap.  This lets us know we are fully scanning the
3889          * bitmap rather than only some portion of it.
3890          */
3891         if (start == offset)
3892             entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
3893 
3894         bytes = minlen;
3895         ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
3896         if (ret2 || start >= end) {
3897             /*
3898              * We lossily consider a bitmap trimmed if we only skip
3899              * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
3900              */
3901             if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
3902                 end_trimming_bitmap(ctl, entry);
3903             else
3904                 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3905             spin_unlock(&ctl->tree_lock);
3906             mutex_unlock(&ctl->cache_writeout_mutex);
3907             next_bitmap = true;
3908             goto next;
3909         }
3910 
3911         /*
3912          * We already trimmed a region, but are using the locking above
3913          * to reset the trim_state.
3914          */
3915         if (async && *total_trimmed) {
3916             spin_unlock(&ctl->tree_lock);
3917             mutex_unlock(&ctl->cache_writeout_mutex);
3918             goto out;
3919         }
3920 
3921         bytes = min(bytes, end - start);
3922         if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
3923             spin_unlock(&ctl->tree_lock);
3924             mutex_unlock(&ctl->cache_writeout_mutex);
3925             goto next;
3926         }
3927 
3928         /*
3929          * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3930          * If X < @minlen, we won't trim X when we come back around.
3931          * So trim it now.  We differ here from trimming extents as we
3932          * don't keep individual state per bit.
3933          */
3934         if (async &&
3935             max_discard_size &&
3936             bytes > (max_discard_size + minlen))
3937             bytes = max_discard_size;
3938 
3939         bitmap_clear_bits(ctl, entry, start, bytes, true);
3940         if (entry->bytes == 0)
3941             free_bitmap(ctl, entry);
3942 
3943         spin_unlock(&ctl->tree_lock);
3944         trim_entry.start = start;
3945         trim_entry.bytes = bytes;
3946         list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3947         mutex_unlock(&ctl->cache_writeout_mutex);
3948 
3949         ret = do_trimming(block_group, total_trimmed, start, bytes,
3950                   start, bytes, 0, &trim_entry);
3951         if (ret) {
3952             reset_trimming_bitmap(ctl, offset);
3953             block_group->discard_cursor =
3954                 btrfs_block_group_end(block_group);
3955             break;
3956         }
3957 next:
3958         if (next_bitmap) {
3959             offset += BITS_PER_BITMAP * ctl->unit;
3960             start = offset;
3961         } else {
3962             start += bytes;
3963         }
3964         block_group->discard_cursor = start;
3965 
3966         if (fatal_signal_pending(current)) {
3967             if (start != offset)
3968                 reset_trimming_bitmap(ctl, offset);
3969             ret = -ERESTARTSYS;
3970             break;
3971         }
3972 
3973         cond_resched();
3974     }
3975 
3976     if (offset >= end)
3977         block_group->discard_cursor = end;
3978 
3979 out:
3980     return ret;
3981 }
3982 
3983 int btrfs_trim_block_group(struct btrfs_block_group *block_group,
3984                u64 *trimmed, u64 start, u64 end, u64 minlen)
3985 {
3986     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3987     int ret;
3988     u64 rem = 0;
3989 
3990     ASSERT(!btrfs_is_zoned(block_group->fs_info));
3991 
3992     *trimmed = 0;
3993 
3994     spin_lock(&block_group->lock);
3995     if (block_group->removed) {
3996         spin_unlock(&block_group->lock);
3997         return 0;
3998     }
3999     btrfs_freeze_block_group(block_group);
4000     spin_unlock(&block_group->lock);
4001 
4002     ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
4003     if (ret)
4004         goto out;
4005 
4006     ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
4007     div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
4008     /* If we ended in the middle of a bitmap, reset the trimming flag */
4009     if (rem)
4010         reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
4011 out:
4012     btrfs_unfreeze_block_group(block_group);
4013     return ret;
4014 }
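
/*
 * Hypothetical sketch of an FITRIM-style caller (simplified): clamp the
 * user's range to one block group and accumulate the trimmed byte count.
 * example_trim_one() is illustrative, not the real btrfs_trim_fs() logic.
 */
static int example_trim_one(struct btrfs_block_group *block_group,
                            const struct fstrim_range *range, u64 *total)
{
    u64 group_trimmed = 0;
    u64 start = max(range->start, block_group->start);
    u64 end = min(range->start + range->len,
                  block_group->start + block_group->length);
    int ret;

    ret = btrfs_trim_block_group(block_group, &group_trimmed, start, end,
                                 range->minlen);
    *total += group_trimmed;
    return ret;
}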
4015 
4016 int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
4017                    u64 *trimmed, u64 start, u64 end, u64 minlen,
4018                    bool async)
4019 {
4020     int ret;
4021 
4022     *trimmed = 0;
4023 
4024     spin_lock(&block_group->lock);
4025     if (block_group->removed) {
4026         spin_unlock(&block_group->lock);
4027         return 0;
4028     }
4029     btrfs_freeze_block_group(block_group);
4030     spin_unlock(&block_group->lock);
4031 
4032     ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
4033     btrfs_unfreeze_block_group(block_group);
4034 
4035     return ret;
4036 }
4037 
4038 int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
4039                    u64 *trimmed, u64 start, u64 end, u64 minlen,
4040                    u64 maxlen, bool async)
4041 {
4042     int ret;
4043 
4044     *trimmed = 0;
4045 
4046     spin_lock(&block_group->lock);
4047     if (block_group->removed) {
4048         spin_unlock(&block_group->lock);
4049         return 0;
4050     }
4051     btrfs_freeze_block_group(block_group);
4052     spin_unlock(&block_group->lock);
4053 
4054     ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
4055                async);
4056 
4057     btrfs_unfreeze_block_group(block_group);
4058 
4059     return ret;
4060 }
4061 
4062 bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
4063 {
4064     return btrfs_super_cache_generation(fs_info->super_copy);
4065 }
4066 
4067 static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
4068                        struct btrfs_trans_handle *trans)
4069 {
4070     struct btrfs_block_group *block_group;
4071     struct rb_node *node;
4072     int ret = 0;
4073 
4074     btrfs_info(fs_info, "cleaning free space cache v1");
4075 
4076     node = rb_first_cached(&fs_info->block_group_cache_tree);
4077     while (node) {
4078         block_group = rb_entry(node, struct btrfs_block_group, cache_node);
4079         ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
4080         if (ret)
4081             goto out;
4082         node = rb_next(node);
4083     }
4084 out:
4085     return ret;
4086 }
4087 
4088 int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
4089 {
4090     struct btrfs_trans_handle *trans;
4091     int ret;
4092 
4093     /*
4094      * update_super_roots will appropriately set or unset
4095      * super_copy->cache_generation based on SPACE_CACHE and
4096      * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
4097      * transaction commit whether we are enabling space cache v1 and don't
4098      * have any other work to do, or are disabling it and removing free
4099      * space inodes.
4100      */
4101     trans = btrfs_start_transaction(fs_info->tree_root, 0);
4102     if (IS_ERR(trans))
4103         return PTR_ERR(trans);
4104 
4105     if (!active) {
4106         set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
4107         ret = cleanup_free_space_cache_v1(fs_info, trans);
4108         if (ret) {
4109             btrfs_abort_transaction(trans, ret);
4110             btrfs_end_transaction(trans);
4111             goto out;
4112         }
4113     }
4114 
4115     ret = btrfs_commit_transaction(trans);
4116 out:
4117     clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
4118 
4119     return ret;
4120 }
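
/*
 * Hypothetical sketch: reconcile the SPACE_CACHE mount option with what
 * the super block records.  example_sync_cache_v1_state() is illustrative
 * only and glosses over where exactly this happens during mount/remount.
 */
static int example_sync_cache_v1_state(struct btrfs_fs_info *fs_info)
{
    bool want_v1 = btrfs_test_opt(fs_info, SPACE_CACHE);

    if (want_v1 == btrfs_free_space_cache_v1_active(fs_info))
        return 0;

    return btrfs_set_free_space_cache_v1_active(fs_info, want_v1);
}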
4121 
4122 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4123 /*
4124  * Use this if you need to make a bitmap or extent entry specifically.  It
4125  * doesn't do any of the merging that add_free_space does; it acts a lot like
4126  * how the free space cache loading code works, so you can create really weird
4127  * configurations.
4128  */
4129 int test_add_free_space_entry(struct btrfs_block_group *cache,
4130                   u64 offset, u64 bytes, bool bitmap)
4131 {
4132     struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4133     struct btrfs_free_space *info = NULL, *bitmap_info;
4134     void *map = NULL;
4135     enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
4136     u64 bytes_added;
4137     int ret;
4138 
4139 again:
4140     if (!info) {
4141         info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
4142         if (!info)
4143             return -ENOMEM;
4144     }
4145 
4146     if (!bitmap) {
4147         spin_lock(&ctl->tree_lock);
4148         info->offset = offset;
4149         info->bytes = bytes;
4150         info->max_extent_size = 0;
4151         ret = link_free_space(ctl, info);
4152         spin_unlock(&ctl->tree_lock);
4153         if (ret)
4154             kmem_cache_free(btrfs_free_space_cachep, info);
4155         return ret;
4156     }
4157 
4158     if (!map) {
4159         map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
4160         if (!map) {
4161             kmem_cache_free(btrfs_free_space_cachep, info);
4162             return -ENOMEM;
4163         }
4164     }
4165 
4166     spin_lock(&ctl->tree_lock);
4167     bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4168                      1, 0);
4169     if (!bitmap_info) {
4170         info->bitmap = map;
4171         map = NULL;
4172         add_new_bitmap(ctl, info, offset);
4173         bitmap_info = info;
4174         info = NULL;
4175     }
4176 
4177     bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
4178                       trim_state);
4179 
4180     bytes -= bytes_added;
4181     offset += bytes_added;
4182     spin_unlock(&ctl->tree_lock);
4183 
4184     if (bytes)
4185         goto again;
4186 
4187     if (info)
4188         kmem_cache_free(btrfs_free_space_cachep, info);
4189     if (map)
4190         kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
4191     return 0;
4192 }
4193 
4194 /*
4195  * Checks to see if the given range is in the free space cache.  This is really
4196  * just used to check the absence of space, so if there is free space in the
4197  * range at all we will return 1.
4198  */
4199 int test_check_exists(struct btrfs_block_group *cache,
4200               u64 offset, u64 bytes)
4201 {
4202     struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4203     struct btrfs_free_space *info;
4204     int ret = 0;
4205 
4206     spin_lock(&ctl->tree_lock);
4207     info = tree_search_offset(ctl, offset, 0, 0);
4208     if (!info) {
4209         info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4210                       1, 0);
4211         if (!info)
4212             goto out;
4213     }
4214 
4215 have_info:
4216     if (info->bitmap) {
4217         u64 bit_off, bit_bytes;
4218         struct rb_node *n;
4219         struct btrfs_free_space *tmp;
4220 
4221         bit_off = offset;
4222         bit_bytes = ctl->unit;
4223         ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
4224         if (!ret) {
4225             if (bit_off == offset) {
4226                 ret = 1;
4227                 goto out;
4228             } else if (bit_off > offset &&
4229                    offset + bytes > bit_off) {
4230                 ret = 1;
4231                 goto out;
4232             }
4233         }
4234 
4235         n = rb_prev(&info->offset_index);
4236         while (n) {
4237             tmp = rb_entry(n, struct btrfs_free_space,
4238                        offset_index);
4239             if (tmp->offset + tmp->bytes < offset)
4240                 break;
4241             if (offset + bytes < tmp->offset) {
4242                 n = rb_prev(&tmp->offset_index);
4243                 continue;
4244             }
4245             info = tmp;
4246             goto have_info;
4247         }
4248 
4249         n = rb_next(&info->offset_index);
4250         while (n) {
4251             tmp = rb_entry(n, struct btrfs_free_space,
4252                        offset_index);
4253             if (offset + bytes < tmp->offset)
4254                 break;
4255             if (tmp->offset + tmp->bytes < offset) {
4256                 n = rb_next(&tmp->offset_index);
4257                 continue;
4258             }
4259             info = tmp;
4260             goto have_info;
4261         }
4262 
4263         ret = 0;
4264         goto out;
4265     }
4266 
4267     if (info->offset == offset) {
4268         ret = 1;
4269         goto out;
4270     }
4271 
4272     if (offset > info->offset && offset < info->offset + info->bytes)
4273         ret = 1;
4274 out:
4275     spin_unlock(&ctl->tree_lock);
4276     return ret;
4277 }
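
/*
 * Hypothetical sketch of how the sanity tests can drive the two helpers
 * above: carve one extent entry and one bitmap entry into a dummy block
 * group, then verify the bitmap range is visible.  Values are arbitrary.
 */
static int example_populate_test_cache(struct btrfs_block_group *cache)
{
    int ret;

    /* A 4K extent entry at offset 0. */
    ret = test_add_free_space_entry(cache, 0, SZ_4K, false);
    if (ret)
        return ret;

    /* 1M of free space tracked by a bitmap starting at 128M. */
    ret = test_add_free_space_entry(cache, SZ_128M, SZ_1M, true);
    if (ret)
        return ret;

    return test_check_exists(cache, SZ_128M, SZ_1M) ? 0 : -ENOENT;
}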
4278 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */