0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  *  fs/ext4/extents_status.c
0004  *
0005  * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
0006  * Modified by
0007  *  Allison Henderson <achender@linux.vnet.ibm.com>
0008  *  Hugh Dickins <hughd@google.com>
0009  *  Zheng Liu <wenqing.lz@taobao.com>
0010  *
0011  * Ext4 extents status tree core functions.
0012  */
0013 #include <linux/list_sort.h>
0014 #include <linux/proc_fs.h>
0015 #include <linux/seq_file.h>
0016 #include "ext4.h"
0017 
0018 #include <trace/events/ext4.h>
0019 
0020 /*
0021  * As previously discussed at the Ext4 Developer Workshop, we will
0022  * introduce a new structure called the io tree to track all extent
0023  * status, in order to solve some problems we have met (e.g. reserved
0024  * space warnings) and to provide extent-level locking.  The delay
0025  * extent tree is the first step toward this goal.  It was originally
0026  * built by Yongqiang Yang.  At that time it was called the delay
0027  * extent tree; its only goal was to track delayed extents in memory
0028  * to simplify the implementation of fiemap and bigalloc and to
0029  * introduce lseek SEEK_DATA/SEEK_HOLE support.  That is why it was
0030  * still called the delay extent tree in the first commit.  To better
0031  * reflect what it does, it has since been renamed the extent status tree.
0032  *
0033  * Step1:
0034  * The first step has been completed.  All delayed extents are tracked
0035  * in the tree.  The tree records a delayed extent when a delayed
0036  * allocation is issued and drops it when the extent is written out or
0037  * invalidated.  Therefore the implementations of fiemap and bigalloc
0038  * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
0039  *
0040  * The following comment describes the implementation of the extent
0041  * status tree and future work.
0042  *
0043  * Step2:
0044  * In this step all extent status is tracked by the extent status tree.
0045  * Thus, we can first try to look up a block mapping in this tree before
0046  * searching the extent tree.  Hence, the single extent cache can be
0047  * removed because the extent status tree can do a better job.  Extents
0048  * in the status tree are loaded on demand, so the extent status tree
0049  * may not contain all of the extents in a file.  Meanwhile we define a
0050  * shrinker to reclaim memory from the extent status tree because a
0051  * fragmented extent tree will make the status tree cost too much
0052  * memory.  Written/unwritten/hole extents in the tree will be
0053  * reclaimed by this shrinker when we are under high memory pressure.
0054  * Delayed extents will not be reclaimed because fiemap, bigalloc, and
0055  * seek_data/hole need them.
0056 
0057 /*
0058  * Extent status tree implementation for ext4.
0059  *
0060  *
0061  * ==========================================================================
0062  * Extent status tree tracks all extent status.
0063  *
0064  * 1. Why do we need to implement an extent status tree?
0065  *
0066  * Without an extent status tree, ext4 identifies a delayed extent by
0067  * looking up the page cache, which has several deficiencies: the code
0068  * is complicated, buggy, and inefficient.
0069  *
0070  * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
0071  * block or a range of blocks belongs to a delayed extent.
0072  *
0073  * Let us have a look at how they work without an extent status tree.
0074  *   -- FIEMAP
0075  *  FIEMAP looks up page cache to identify delayed allocations from holes.
0076  *
0077  *   -- SEEK_HOLE/DATA
0078  *  SEEK_HOLE/DATA has the same problem as FIEMAP.
0079  *
0080  *   -- bigalloc
0081  *  bigalloc looks up page cache to figure out if a block is
0082  *  already under delayed allocation or not to determine whether
0083  *  quota reserving is needed for the cluster.
0084  *
0085  *   -- writeout
0086  *  Writeout looks up the whole page cache to see if a buffer is
0087  *  mapped.  If there are not very many delayed buffers, this is
0088  *  time consuming.
0089  *
0090  * With the extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
0091  * bigalloc and writeout can figure out if a block or a range of
0092  * blocks is under delayed allocation (i.e. belongs to a delayed
0093  * extent) or not by searching the extent status tree.
0094  *
0095  *
0096  * ==========================================================================
0097  * 2. Ext4 extent status tree implementation
0098  *
0099  *   -- extent
0100  *  An extent is a range of blocks which are logically and physically
0101  *  contiguous.  Unlike an extent in the extent tree, this extent is
0102  *  an in-memory struct with no corresponding on-disk data.  There is
0103  *  no limit on the length of an extent, so an extent can contain as
0104  *  many blocks as are logically and physically contiguous.
0105  *
0106  *   -- extent status tree
0107  *  Every inode has an extent status tree and all allocated blocks
0108  *  are added to the tree with their status.  The extents in the
0109  *  tree are ordered by logical block number.
0110  *
0111  *   -- operations on an extent status tree
0112  *  There are three important operations on an extent status tree: find
0113  *  the next extent, add an extent (a range of blocks), remove an extent.
0114  *
0115  *   -- race on an extent status tree
0116  *  The extent status tree is protected by inode->i_es_lock.
0117  *
0118  *   -- memory consumption
0119  *      A fragmented extent tree will make the extent status tree cost
0120  *      too much memory.  Hence, we will reclaim written/unwritten/hole
0121  *      extents from the tree under heavy memory pressure.
0122  *
0123  *
0124  * ==========================================================================
0125  * 3. Performance analysis
0126  *
0127  *   -- overhead
0128  *  1. There is a cached extent for write access, so if writes are
0129  *  not very random, adding space operations are done in O(1) time.
0130  *
0131  *   -- gain
0132  *  2. Code is much simpler, more readable, more maintainable and
0133  *  more efficient.
0134  *
0135  *
0136  * ==========================================================================
0137  * 4. TODO list
0138  *
0139  *   -- Refactor delayed space reservation
0140  *
0141  *   -- Extent-level locking
0142  */
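
/*
 * For reference, the in-memory extent being tracked (defined in
 * extents_status.h) looks roughly like this:
 *
 *	struct extent_status {
 *		struct rb_node rb_node;
 *		ext4_lblk_t es_lblk;	// first logical block covered
 *		ext4_lblk_t es_len;	// length of the extent in blocks
 *		ext4_fsblk_t es_pblk;	// first physical block + status bits
 *	};
 *
 * The status bits (written/unwritten/delayed/hole/referenced) are packed
 * into the high bits of es_pblk, which is why the code below always goes
 * through helpers such as ext4_es_pblock(), ext4_es_status() and
 * ext4_es_store_pblock_status() instead of touching es_pblk directly.
 */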
0143 
0144 static struct kmem_cache *ext4_es_cachep;
0145 static struct kmem_cache *ext4_pending_cachep;
0146 
0147 static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
0148 static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
0149                   ext4_lblk_t end, int *reserved);
0150 static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
0151 static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
0152                struct ext4_inode_info *locked_ei);
0153 static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
0154                  ext4_lblk_t len);
0155 
0156 int __init ext4_init_es(void)
0157 {
0158     ext4_es_cachep = kmem_cache_create("ext4_extent_status",
0159                        sizeof(struct extent_status),
0160                        0, (SLAB_RECLAIM_ACCOUNT), NULL);
0161     if (ext4_es_cachep == NULL)
0162         return -ENOMEM;
0163     return 0;
0164 }
0165 
0166 void ext4_exit_es(void)
0167 {
0168     kmem_cache_destroy(ext4_es_cachep);
0169 }
0170 
0171 void ext4_es_init_tree(struct ext4_es_tree *tree)
0172 {
0173     tree->root = RB_ROOT;
0174     tree->cache_es = NULL;
0175 }
0176 
0177 #ifdef ES_DEBUG__
0178 static void ext4_es_print_tree(struct inode *inode)
0179 {
0180     struct ext4_es_tree *tree;
0181     struct rb_node *node;
0182 
0183     printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
0184     tree = &EXT4_I(inode)->i_es_tree;
0185     node = rb_first(&tree->root);
0186     while (node) {
0187         struct extent_status *es;
0188         es = rb_entry(node, struct extent_status, rb_node);
0189         printk(KERN_DEBUG " [%u/%u) %llu %x",
0190                es->es_lblk, es->es_len,
0191                ext4_es_pblock(es), ext4_es_status(es));
0192         node = rb_next(node);
0193     }
0194     printk(KERN_DEBUG "\n");
0195 }
0196 #else
0197 #define ext4_es_print_tree(inode)
0198 #endif
0199 
0200 static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
0201 {
0202     BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
0203     return es->es_lblk + es->es_len - 1;
0204 }
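/*
 * For example, an extent with es_lblk == 100 and es_len == 8 covers
 * logical blocks 100..107, so ext4_es_end() returns 107.
 */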
0205 
0206 /*
0207  * Search through the tree for an extent containing a given offset.  If
0208  * it can't be found, try to find the next extent.
0209  */
0210 static struct extent_status *__es_tree_search(struct rb_root *root,
0211                           ext4_lblk_t lblk)
0212 {
0213     struct rb_node *node = root->rb_node;
0214     struct extent_status *es = NULL;
0215 
0216     while (node) {
0217         es = rb_entry(node, struct extent_status, rb_node);
0218         if (lblk < es->es_lblk)
0219             node = node->rb_left;
0220         else if (lblk > ext4_es_end(es))
0221             node = node->rb_right;
0222         else
0223             return es;
0224     }
0225 
0226     if (es && lblk < es->es_lblk)
0227         return es;
0228 
0229     if (es && lblk > ext4_es_end(es)) {
0230         node = rb_next(&es->rb_node);
0231         return node ? rb_entry(node, struct extent_status, rb_node) :
0232                   NULL;
0233     }
0234 
0235     return NULL;
0236 }
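/*
 * A worked example with hypothetical tree contents: if the tree holds
 * the extents [0/10) and [20/5), __es_tree_search(root, 5) returns the
 * first extent because block 5 falls inside it, while
 * __es_tree_search(root, 12) falls into the gap and returns the next
 * extent, [20/5).  Searching past the last extent returns NULL.
 */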
0237 
0238 /*
0239  * __es_find_extent_range - find extent with specified status within block
0240  *                             range or next extent following block range in
0241  *                             extents status tree
0242  *
0243  * @inode - file containing the range
0244  * @matching_fn - pointer to function that matches extents with desired status
0245  * @lblk - logical block defining start of range
0246  * @end - logical block defining end of range
0247  * @es - extent found, if any
0248  *
0249  * Find the first extent within the block range specified by @lblk and @end
0250  * in the extents status tree that satisfies @matching_fn.  If a match
0251  * is found, it's returned in @es.  If not, and a matching extent is found
0252  * beyond the block range, it's returned in @es.  If no match is found, an
0253  * extent is returned in @es whose es_lblk, es_len, and es_pblk components
0254  * are 0.
0255  */
0256 static void __es_find_extent_range(struct inode *inode,
0257                    int (*matching_fn)(struct extent_status *es),
0258                    ext4_lblk_t lblk, ext4_lblk_t end,
0259                    struct extent_status *es)
0260 {
0261     struct ext4_es_tree *tree = NULL;
0262     struct extent_status *es1 = NULL;
0263     struct rb_node *node;
0264 
0265     WARN_ON(es == NULL);
0266     WARN_ON(end < lblk);
0267 
0268     tree = &EXT4_I(inode)->i_es_tree;
0269 
0270     /* see if the extent has been cached */
0271     es->es_lblk = es->es_len = es->es_pblk = 0;
0272     if (tree->cache_es) {
0273         es1 = tree->cache_es;
0274         if (in_range(lblk, es1->es_lblk, es1->es_len)) {
0275             es_debug("%u cached by [%u/%u) %llu %x\n",
0276                  lblk, es1->es_lblk, es1->es_len,
0277                  ext4_es_pblock(es1), ext4_es_status(es1));
0278             goto out;
0279         }
0280     }
0281 
0282     es1 = __es_tree_search(&tree->root, lblk);
0283 
0284 out:
0285     if (es1 && !matching_fn(es1)) {
0286         while ((node = rb_next(&es1->rb_node)) != NULL) {
0287             es1 = rb_entry(node, struct extent_status, rb_node);
0288             if (es1->es_lblk > end) {
0289                 es1 = NULL;
0290                 break;
0291             }
0292             if (matching_fn(es1))
0293                 break;
0294         }
0295     }
0296 
0297     if (es1 && matching_fn(es1)) {
0298         tree->cache_es = es1;
0299         es->es_lblk = es1->es_lblk;
0300         es->es_len = es1->es_len;
0301         es->es_pblk = es1->es_pblk;
0302     }
0303 
0304 }
0305 
0306 /*
0307  * Locking for __es_find_extent_range() for external use
0308  */
0309 void ext4_es_find_extent_range(struct inode *inode,
0310                    int (*matching_fn)(struct extent_status *es),
0311                    ext4_lblk_t lblk, ext4_lblk_t end,
0312                    struct extent_status *es)
0313 {
0314     if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
0315         return;
0316 
0317     trace_ext4_es_find_extent_range_enter(inode, lblk);
0318 
0319     read_lock(&EXT4_I(inode)->i_es_lock);
0320     __es_find_extent_range(inode, matching_fn, lblk, end, es);
0321     read_unlock(&EXT4_I(inode)->i_es_lock);
0322 
0323     trace_ext4_es_find_extent_range_exit(inode, es);
0324 }
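/*
 * A minimal usage sketch (illustrative; the real call sites live in
 * other files): a SEEK_DATA-style caller can locate the first delayed
 * extent at or after a block with the ext4_es_is_delayed() helper from
 * extents_status.h:
 *
 *	struct extent_status es;
 *
 *	ext4_es_find_extent_range(inode, &ext4_es_is_delayed, lblk,
 *				  end, &es);
 *	if (es.es_len == 0)
 *		return -ENXIO;	// no delayed extent found in the range
 */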
0325 
0326 /*
0327  * __es_scan_range - search block range for block with specified status
0328  *                   in extents status tree
0329  *
0330  * @inode - file containing the range
0331  * @matching_fn - pointer to function that matches extents with desired status
0332  * @lblk - logical block defining start of range
0333  * @end - logical block defining end of range
0334  *
0335  * Returns true if at least one block in the specified block range satisfies
0336  * the criterion specified by @matching_fn, and false if not.  If at least
0337  * one extent has the specified status, then there is at least one block
0338  * in the range with that status.  Should only be called by code that has
0339  * taken i_es_lock.
0340  */
0341 static bool __es_scan_range(struct inode *inode,
0342                 int (*matching_fn)(struct extent_status *es),
0343                 ext4_lblk_t start, ext4_lblk_t end)
0344 {
0345     struct extent_status es;
0346 
0347     __es_find_extent_range(inode, matching_fn, start, end, &es);
0348     if (es.es_len == 0)
0349         return false;   /* no matching extent in the tree */
0350     else if (es.es_lblk <= start &&
0351          start < es.es_lblk + es.es_len)
0352         return true;
0353     else if (start <= es.es_lblk && es.es_lblk <= end)
0354         return true;
0355     else
0356         return false;
0357 }
0358 /*
0359  * Locking for __es_scan_range() for external use
0360  */
0361 bool ext4_es_scan_range(struct inode *inode,
0362             int (*matching_fn)(struct extent_status *es),
0363             ext4_lblk_t lblk, ext4_lblk_t end)
0364 {
0365     bool ret;
0366 
0367     if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
0368         return false;
0369 
0370     read_lock(&EXT4_I(inode)->i_es_lock);
0371     ret = __es_scan_range(inode, matching_fn, lblk, end);
0372     read_unlock(&EXT4_I(inode)->i_es_lock);
0373 
0374     return ret;
0375 }
0376 
0377 /*
0378  * __es_scan_clu - search cluster for block with specified status in
0379  *                 extents status tree
0380  *
0381  * @inode - file containing the cluster
0382  * @matching_fn - pointer to function that matches extents with desired status
0383  * @lblk - logical block in cluster to be searched
0384  *
0385  * Returns true if at least one extent in the cluster containing @lblk
0386  * satisfies the criterion specified by @matching_fn, and false if not.  If at
0387  * least one extent has the specified status, then there is at least one block
0388  * in the cluster with that status.  Should only be called by code that has
0389  * taken i_es_lock.
0390  */
0391 static bool __es_scan_clu(struct inode *inode,
0392               int (*matching_fn)(struct extent_status *es),
0393               ext4_lblk_t lblk)
0394 {
0395     struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
0396     ext4_lblk_t lblk_start, lblk_end;
0397 
0398     lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
0399     lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
0400 
0401     return __es_scan_range(inode, matching_fn, lblk_start, lblk_end);
0402 }
0403 
0404 /*
0405  * Locking for __es_scan_clu() for external use
0406  */
0407 bool ext4_es_scan_clu(struct inode *inode,
0408               int (*matching_fn)(struct extent_status *es),
0409               ext4_lblk_t lblk)
0410 {
0411     bool ret;
0412 
0413     if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
0414         return false;
0415 
0416     read_lock(&EXT4_I(inode)->i_es_lock);
0417     ret = __es_scan_clu(inode, matching_fn, lblk);
0418     read_unlock(&EXT4_I(inode)->i_es_lock);
0419 
0420     return ret;
0421 }
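/*
 * A minimal usage sketch (illustrative): the bigalloc delayed
 * allocation path can ask whether the cluster containing @lblk already
 * has a delayed-and-not-unwritten block, and hence an existing
 * reservation, with the ext4_es_is_delonly() helper:
 *
 *	if (ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk))
 *		return 0;	// cluster already reserved, nothing to do
 */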
0422 
0423 static void ext4_es_list_add(struct inode *inode)
0424 {
0425     struct ext4_inode_info *ei = EXT4_I(inode);
0426     struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
0427 
0428     if (!list_empty(&ei->i_es_list))
0429         return;
0430 
0431     spin_lock(&sbi->s_es_lock);
0432     if (list_empty(&ei->i_es_list)) {
0433         list_add_tail(&ei->i_es_list, &sbi->s_es_list);
0434         sbi->s_es_nr_inode++;
0435     }
0436     spin_unlock(&sbi->s_es_lock);
0437 }
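/*
 * Note that the unlocked list_empty() check above is only a fast-path
 * optimization; the test is repeated under s_es_lock so that two racing
 * callers cannot link the inode onto s_es_list twice.
 */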
0438 
0439 static void ext4_es_list_del(struct inode *inode)
0440 {
0441     struct ext4_inode_info *ei = EXT4_I(inode);
0442     struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
0443 
0444     spin_lock(&sbi->s_es_lock);
0445     if (!list_empty(&ei->i_es_list)) {
0446         list_del_init(&ei->i_es_list);
0447         sbi->s_es_nr_inode--;
0448         WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
0449     }
0450     spin_unlock(&sbi->s_es_lock);
0451 }
0452 
0453 static struct extent_status *
0454 ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
0455              ext4_fsblk_t pblk)
0456 {
0457     struct extent_status *es;
0458     es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
0459     if (es == NULL)
0460         return NULL;
0461     es->es_lblk = lblk;
0462     es->es_len = len;
0463     es->es_pblk = pblk;
0464 
0465     /*
0466      * We don't count delayed extents because we never try to reclaim them
0467      */
0468     if (!ext4_es_is_delayed(es)) {
0469         if (!EXT4_I(inode)->i_es_shk_nr++)
0470             ext4_es_list_add(inode);
0471         percpu_counter_inc(&EXT4_SB(inode->i_sb)->
0472                     s_es_stats.es_stats_shk_cnt);
0473     }
0474 
0475     EXT4_I(inode)->i_es_all_nr++;
0476     percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
0477 
0478     return es;
0479 }
0480 
0481 static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
0482 {
0483     EXT4_I(inode)->i_es_all_nr--;
0484     percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
0485 
0486     /* Decrease the shrink counter when this es is not delayed */
0487     if (!ext4_es_is_delayed(es)) {
0488         BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
0489         if (!--EXT4_I(inode)->i_es_shk_nr)
0490             ext4_es_list_del(inode);
0491         percpu_counter_dec(&EXT4_SB(inode->i_sb)->
0492                     s_es_stats.es_stats_shk_cnt);
0493     }
0494 
0495     kmem_cache_free(ext4_es_cachep, es);
0496 }
0497 
0498 /*
0499  * Check whether or not two extents can be merged.
0500  * Conditions:
0501  *  - logical block numbers are contiguous
0502  *  - physical block numbers are contiguous
0503  *  - status is equal
0504  */
0505 static int ext4_es_can_be_merged(struct extent_status *es1,
0506                  struct extent_status *es2)
0507 {
0508     if (ext4_es_type(es1) != ext4_es_type(es2))
0509         return 0;
0510 
0511     if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
0512         pr_warn("ES assertion failed when merging extents. "
0513             "The sum of lengths of es1 (%d) and es2 (%d) "
0514             "is bigger than allowed file size (%d)\n",
0515             es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
0516         WARN_ON(1);
0517         return 0;
0518     }
0519 
0520     if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
0521         return 0;
0522 
0523     if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
0524         (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
0525         return 1;
0526 
0527     if (ext4_es_is_hole(es1))
0528         return 1;
0529 
0530     /* a delayed extent can only be merged if it is not unwritten */
0531     if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
0532         return 1;
0533 
0534     return 0;
0535 }
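/*
 * A worked example with hypothetical values: a written extent [0/5) at
 * physical block 100 and a written extent [5/3) at physical block 105
 * satisfy all three conditions, so they can be merged into a single
 * written extent [0/8) at physical block 100.  If the second extent
 * started at physical block 106 instead, the physical contiguity check
 * would fail and no merge would happen.
 */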
0536 
0537 static struct extent_status *
0538 ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
0539 {
0540     struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
0541     struct extent_status *es1;
0542     struct rb_node *node;
0543 
0544     node = rb_prev(&es->rb_node);
0545     if (!node)
0546         return es;
0547 
0548     es1 = rb_entry(node, struct extent_status, rb_node);
0549     if (ext4_es_can_be_merged(es1, es)) {
0550         es1->es_len += es->es_len;
0551         if (ext4_es_is_referenced(es))
0552             ext4_es_set_referenced(es1);
0553         rb_erase(&es->rb_node, &tree->root);
0554         ext4_es_free_extent(inode, es);
0555         es = es1;
0556     }
0557 
0558     return es;
0559 }
0560 
0561 static struct extent_status *
0562 ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
0563 {
0564     struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
0565     struct extent_status *es1;
0566     struct rb_node *node;
0567 
0568     node = rb_next(&es->rb_node);
0569     if (!node)
0570         return es;
0571 
0572     es1 = rb_entry(node, struct extent_status, rb_node);
0573     if (ext4_es_can_be_merged(es, es1)) {
0574         es->es_len += es1->es_len;
0575         if (ext4_es_is_referenced(es1))
0576             ext4_es_set_referenced(es);
0577         rb_erase(node, &tree->root);
0578         ext4_es_free_extent(inode, es1);
0579     }
0580 
0581     return es;
0582 }
0583 
0584 #ifdef ES_AGGRESSIVE_TEST
0585 #include "ext4_extents.h"   /* Needed when ES_AGGRESSIVE_TEST is defined */
0586 
0587 static void ext4_es_insert_extent_ext_check(struct inode *inode,
0588                         struct extent_status *es)
0589 {
0590     struct ext4_ext_path *path = NULL;
0591     struct ext4_extent *ex;
0592     ext4_lblk_t ee_block;
0593     ext4_fsblk_t ee_start;
0594     unsigned short ee_len;
0595     int depth, ee_status, es_status;
0596 
0597     path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
0598     if (IS_ERR(path))
0599         return;
0600 
0601     depth = ext_depth(inode);
0602     ex = path[depth].p_ext;
0603 
0604     if (ex) {
0605 
0606         ee_block = le32_to_cpu(ex->ee_block);
0607         ee_start = ext4_ext_pblock(ex);
0608         ee_len = ext4_ext_get_actual_len(ex);
0609 
0610         ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
0611         es_status = ext4_es_is_unwritten(es) ? 1 : 0;
0612 
0613         /*
0614          * Make sure ex and es do not overlap when we try to insert
0615          * a delayed/hole extent.
0616          */
0617         if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
0618             if (in_range(es->es_lblk, ee_block, ee_len)) {
0619                 pr_warn("ES insert assertion failed for "
0620                     "inode: %lu we can find an extent "
0621                     "at block [%d/%d/%llu/%c], but we "
0622                     "want to add a delayed/hole extent "
0623                     "[%d/%d/%llu/%x]\n",
0624                     inode->i_ino, ee_block, ee_len,
0625                     ee_start, ee_status ? 'u' : 'w',
0626                     es->es_lblk, es->es_len,
0627                     ext4_es_pblock(es), ext4_es_status(es));
0628             }
0629             goto out;
0630         }
0631 
0632         /*
0633          * We don't check ee_block == es->es_lblk, etc. because es
0634          * might be part of a whole extent, and vice versa.
0635          */
0636         if (es->es_lblk < ee_block ||
0637             ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
0638             pr_warn("ES insert assertion failed for inode: %lu "
0639                 "ex_status [%d/%d/%llu/%c] != "
0640                 "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
0641                 ee_block, ee_len, ee_start,
0642                 ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
0643                 ext4_es_pblock(es), es_status ? 'u' : 'w');
0644             goto out;
0645         }
0646 
0647         if (ee_status ^ es_status) {
0648             pr_warn("ES insert assertion failed for inode: %lu "
0649                 "ex_status [%d/%d/%llu/%c] != "
0650                 "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
0651                 ee_block, ee_len, ee_start,
0652                 ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
0653                 ext4_es_pblock(es), es_status ? 'u' : 'w');
0654         }
0655     } else {
0656         /*
0657          * We can't find an extent on disk.  So we need to make sure
0658          * that we are not trying to add a written/unwritten extent.
0659          */
0660         if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
0661             pr_warn("ES insert assertion failed for inode: %lu "
0662                 "can't find an extent at block %d but we want "
0663                 "to add a written/unwritten extent "
0664                 "[%d/%d/%llu/%x]\n", inode->i_ino,
0665                 es->es_lblk, es->es_lblk, es->es_len,
0666                 ext4_es_pblock(es), ext4_es_status(es));
0667         }
0668     }
0669 out:
0670     ext4_ext_drop_refs(path);
0671     kfree(path);
0672 }
0673 
0674 static void ext4_es_insert_extent_ind_check(struct inode *inode,
0675                         struct extent_status *es)
0676 {
0677     struct ext4_map_blocks map;
0678     int retval;
0679 
0680     /*
0681      * Here we call ext4_ind_map_blocks to look up a block mapping
0682      * because the 'Indirect' structure is defined in indirect.c.  So
0683      * we can't access the direct/indirect tree from outside it, and it
0684      * would be too ugly to define this function in indirect.c.
0685      */
0686 
0687     map.m_lblk = es->es_lblk;
0688     map.m_len = es->es_len;
0689 
0690     retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
0691     if (retval > 0) {
0692         if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
0693             /*
0694              * We want to add a delayed/hole extent but this
0695              * block has been allocated.
0696              */
0697             pr_warn("ES insert assertion failed for inode: %lu "
0698                 "We can find blocks but we want to add a "
0699                 "delayed/hole extent [%d/%d/%llu/%x]\n",
0700                 inode->i_ino, es->es_lblk, es->es_len,
0701                 ext4_es_pblock(es), ext4_es_status(es));
0702             return;
0703         } else if (ext4_es_is_written(es)) {
0704             if (retval != es->es_len) {
0705                 pr_warn("ES insert assertion failed for "
0706                     "inode: %lu retval %d != es_len %d\n",
0707                     inode->i_ino, retval, es->es_len);
0708                 return;
0709             }
0710             if (map.m_pblk != ext4_es_pblock(es)) {
0711                 pr_warn("ES insert assertion failed for "
0712                     "inode: %lu m_pblk %llu != "
0713                     "es_pblk %llu\n",
0714                     inode->i_ino, map.m_pblk,
0715                     ext4_es_pblock(es));
0716                 return;
0717             }
0718         } else {
0719             /*
0720              * We don't need to check for unwritten extents because
0721              * indirect-based files don't have them.
0722              */
0723             BUG();
0724         }
0725     } else if (retval == 0) {
0726         if (ext4_es_is_written(es)) {
0727             pr_warn("ES insert assertion failed for inode: %lu "
0728                 "We can't find the block but we want to add "
0729                 "a written extent [%d/%d/%llu/%x]\n",
0730                 inode->i_ino, es->es_lblk, es->es_len,
0731                 ext4_es_pblock(es), ext4_es_status(es));
0732             return;
0733         }
0734     }
0735 }
0736 
0737 static inline void ext4_es_insert_extent_check(struct inode *inode,
0738                            struct extent_status *es)
0739 {
0740     /*
0741      * We don't need to worry about races because the caller
0742      * holds i_data_sem.
0743      */
0744     BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
0745     if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
0746         ext4_es_insert_extent_ext_check(inode, es);
0747     else
0748         ext4_es_insert_extent_ind_check(inode, es);
0749 }
0750 #else
0751 static inline void ext4_es_insert_extent_check(struct inode *inode,
0752                            struct extent_status *es)
0753 {
0754 }
0755 #endif
0756 
0757 static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
0758 {
0759     struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
0760     struct rb_node **p = &tree->root.rb_node;
0761     struct rb_node *parent = NULL;
0762     struct extent_status *es;
0763 
0764     while (*p) {
0765         parent = *p;
0766         es = rb_entry(parent, struct extent_status, rb_node);
0767 
0768         if (newes->es_lblk < es->es_lblk) {
0769             if (ext4_es_can_be_merged(newes, es)) {
0770                 /*
0771                  * Here we can modify es_lblk directly
0772                  * because the extents don't overlap.
0773                  */
0774                 es->es_lblk = newes->es_lblk;
0775                 es->es_len += newes->es_len;
0776                 if (ext4_es_is_written(es) ||
0777                     ext4_es_is_unwritten(es))
0778                     ext4_es_store_pblock(es,
0779                                  newes->es_pblk);
0780                 es = ext4_es_try_to_merge_left(inode, es);
0781                 goto out;
0782             }
0783             p = &(*p)->rb_left;
0784         } else if (newes->es_lblk > ext4_es_end(es)) {
0785             if (ext4_es_can_be_merged(es, newes)) {
0786                 es->es_len += newes->es_len;
0787                 es = ext4_es_try_to_merge_right(inode, es);
0788                 goto out;
0789             }
0790             p = &(*p)->rb_right;
0791         } else {
0792             BUG();
0793             return -EINVAL;
0794         }
0795     }
0796 
0797     es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
0798                   newes->es_pblk);
0799     if (!es)
0800         return -ENOMEM;
0801     rb_link_node(&es->rb_node, parent, p);
0802     rb_insert_color(&es->rb_node, &tree->root);
0803 
0804 out:
0805     tree->cache_es = es;
0806     return 0;
0807 }
0808 
0809 /*
0810  * ext4_es_insert_extent() adds information to an inode's extent
0811  * status tree.
0812  *
0813  * Return 0 on success, error code on failure.
0814  */
0815 int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
0816               ext4_lblk_t len, ext4_fsblk_t pblk,
0817               unsigned int status)
0818 {
0819     struct extent_status newes;
0820     ext4_lblk_t end = lblk + len - 1;
0821     int err = 0;
0822     struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
0823 
0824     if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
0825         return 0;
0826 
0827     es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
0828          lblk, len, pblk, status, inode->i_ino);
0829 
0830     if (!len)
0831         return 0;
0832 
0833     BUG_ON(end < lblk);
0834 
0835     if ((status & EXTENT_STATUS_DELAYED) &&
0836         (status & EXTENT_STATUS_WRITTEN)) {
0837         ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
0838                 "delayed and written which can potentially "
0839                 "cause data loss.", lblk, len);
0840         WARN_ON(1);
0841     }
0842 
0843     newes.es_lblk = lblk;
0844     newes.es_len = len;
0845     ext4_es_store_pblock_status(&newes, pblk, status);
0846     trace_ext4_es_insert_extent(inode, &newes);
0847 
0848     ext4_es_insert_extent_check(inode, &newes);
0849 
0850     write_lock(&EXT4_I(inode)->i_es_lock);
0851     err = __es_remove_extent(inode, lblk, end, NULL);
0852     if (err != 0)
0853         goto error;
0854 retry:
0855     err = __es_insert_extent(inode, &newes);
0856     if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
0857                       128, EXT4_I(inode)))
0858         goto retry;
0859     if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
0860         err = 0;
0861 
0862     if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
0863         (status & EXTENT_STATUS_WRITTEN ||
0864          status & EXTENT_STATUS_UNWRITTEN))
0865         __revise_pending(inode, lblk, len);
0866 
0867 error:
0868     write_unlock(&EXT4_I(inode)->i_es_lock);
0869 
0870     ext4_es_print_tree(inode);
0871 
0872     return err;
0873 }
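/*
 * A minimal usage sketch (illustrative; the real call sites live in the
 * block mapping paths): once the allocator has mapped a range, the
 * result is recorded in the status tree roughly like this:
 *
 *	err = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
 *				    map->m_pblk, EXTENT_STATUS_WRITTEN);
 *
 * Because __es_remove_extent() is called first, inserting over an
 * existing range also acts as an update of its status.
 */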
0874 
0875 /*
0876  * ext4_es_cache_extent() inserts information into the extent status
0877  * tree if and only if there isn't information about the range in
0878  * question already.
0879  */
0880 void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
0881               ext4_lblk_t len, ext4_fsblk_t pblk,
0882               unsigned int status)
0883 {
0884     struct extent_status *es;
0885     struct extent_status newes;
0886     ext4_lblk_t end = lblk + len - 1;
0887 
0888     if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
0889         return;
0890 
0891     newes.es_lblk = lblk;
0892     newes.es_len = len;
0893     ext4_es_store_pblock_status(&newes, pblk, status);
0894     trace_ext4_es_cache_extent(inode, &newes);
0895 
0896     if (!len)
0897         return;
0898 
0899     BUG_ON(end < lblk);
0900 
0901     write_lock(&EXT4_I(inode)->i_es_lock);
0902 
0903     es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
0904     if (!es || es->es_lblk > end)
0905         __es_insert_extent(inode, &newes);
0906     write_unlock(&EXT4_I(inode)->i_es_lock);
0907 }
0908 
0909 /*
0910  * ext4_es_lookup_extent() looks up an extent in the extent status tree.
0911  *
0912  * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
0913  *
0914  * Return: 1 if found, 0 if not
0915  */
0916 int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
0917               ext4_lblk_t *next_lblk,
0918               struct extent_status *es)
0919 {
0920     struct ext4_es_tree *tree;
0921     struct ext4_es_stats *stats;
0922     struct extent_status *es1 = NULL;
0923     struct rb_node *node;
0924     int found = 0;
0925 
0926     if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
0927         return 0;
0928 
0929     trace_ext4_es_lookup_extent_enter(inode, lblk);
0930     es_debug("lookup extent in block %u\n", lblk);
0931 
0932     tree = &EXT4_I(inode)->i_es_tree;
0933     read_lock(&EXT4_I(inode)->i_es_lock);
0934 
0935     /* first, check the cached extent */
0936     es->es_lblk = es->es_len = es->es_pblk = 0;
0937     if (tree->cache_es) {
0938         es1 = tree->cache_es;
0939         if (in_range(lblk, es1->es_lblk, es1->es_len)) {
0940             es_debug("%u cached by [%u/%u)\n",
0941                  lblk, es1->es_lblk, es1->es_len);
0942             found = 1;
0943             goto out;
0944         }
0945     }
0946 
0947     node = tree->root.rb_node;
0948     while (node) {
0949         es1 = rb_entry(node, struct extent_status, rb_node);
0950         if (lblk < es1->es_lblk)
0951             node = node->rb_left;
0952         else if (lblk > ext4_es_end(es1))
0953             node = node->rb_right;
0954         else {
0955             found = 1;
0956             break;
0957         }
0958     }
0959 
0960 out:
0961     stats = &EXT4_SB(inode->i_sb)->s_es_stats;
0962     if (found) {
0963         BUG_ON(!es1);
0964         es->es_lblk = es1->es_lblk;
0965         es->es_len = es1->es_len;
0966         es->es_pblk = es1->es_pblk;
0967         if (!ext4_es_is_referenced(es1))
0968             ext4_es_set_referenced(es1);
0969         percpu_counter_inc(&stats->es_stats_cache_hits);
0970         if (next_lblk) {
0971             node = rb_next(&es1->rb_node);
0972             if (node) {
0973                 es1 = rb_entry(node, struct extent_status,
0974                            rb_node);
0975                 *next_lblk = es1->es_lblk;
0976             } else
0977                 *next_lblk = 0;
0978         }
0979     } else {
0980         percpu_counter_inc(&stats->es_stats_cache_misses);
0981     }
0982 
0983     read_unlock(&EXT4_I(inode)->i_es_lock);
0984 
0985     trace_ext4_es_lookup_extent_exit(inode, es, found);
0986     return found;
0987 }
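/*
 * A minimal usage sketch (illustrative): a mapping path can answer a
 * lookup from the status tree without touching the on-disk extent
 * tree at all:
 *
 *	struct extent_status es;
 *
 *	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es) &&
 *	    ext4_es_is_written(&es))
 *		map->m_pblk = ext4_es_pblock(&es) +
 *			      map->m_lblk - es.es_lblk;
 */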
0988 
0989 struct rsvd_count {
0990     int ndelonly;
0991     bool first_do_lblk_found;
0992     ext4_lblk_t first_do_lblk;
0993     ext4_lblk_t last_do_lblk;
0994     struct extent_status *left_es;
0995     bool partial;
0996     ext4_lblk_t lclu;
0997 };
0998 
0999 /*
1000  * init_rsvd - initialize reserved count data before removing block range
1001  *         in file from extent status tree
1002  *
1003  * @inode - file containing range
1004  * @lblk - first block in range
1005  * @es - pointer to first extent in range
1006  * @rc - pointer to reserved count data
1007  *
1008  * Assumes es is not NULL
1009  */
1010 static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
1011               struct extent_status *es, struct rsvd_count *rc)
1012 {
1013     struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1014     struct rb_node *node;
1015 
1016     rc->ndelonly = 0;
1017 
1018     /*
1019      * for bigalloc, note that the first delonly block in the range has
1020      * not yet been found, record the extent containing the block to the
1021      * left of the region to be removed, if any, and note that there's no
1022      * partial cluster to track
1023      */
1024     if (sbi->s_cluster_ratio > 1) {
1025         rc->first_do_lblk_found = false;
1026         if (lblk > es->es_lblk) {
1027             rc->left_es = es;
1028         } else {
1029             node = rb_prev(&es->rb_node);
1030             rc->left_es = node ? rb_entry(node,
1031                               struct extent_status,
1032                               rb_node) : NULL;
1033         }
1034         rc->partial = false;
1035     }
1036 }
1037 
1038 /*
1039  * count_rsvd - count the clusters containing delayed and not unwritten
1040  *      (delonly) blocks in a range within an extent and add to
1041  *          the running tally in rsvd_count
1042  *
1043  * @inode - file containing extent
1044  * @lblk - first block in range
1045  * @len - length of range in blocks
1046  * @es - pointer to extent containing clusters to be counted
1047  * @rc - pointer to reserved count data
1048  *
1049  * Tracks partial clusters found at the beginning and end of extents so
1050  * they aren't overcounted when they span adjacent extents
1051  */
1052 static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
1053                struct extent_status *es, struct rsvd_count *rc)
1054 {
1055     struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1056     ext4_lblk_t i, end, nclu;
1057 
1058     if (!ext4_es_is_delonly(es))
1059         return;
1060 
1061     WARN_ON(len <= 0);
1062 
1063     if (sbi->s_cluster_ratio == 1) {
1064         rc->ndelonly += (int) len;
1065         return;
1066     }
1067 
1068     /* bigalloc */
1069 
1070     i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
1071     end = lblk + (ext4_lblk_t) len - 1;
1072     end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
1073 
1074     /* record the first block of the first delonly extent seen */
1075     if (!rc->first_do_lblk_found) {
1076         rc->first_do_lblk = i;
1077         rc->first_do_lblk_found = true;
1078     }
1079 
1080     /* update the last lblk in the region seen so far */
1081     rc->last_do_lblk = end;
1082 
1083     /*
1084      * if we're tracking a partial cluster and the current extent
1085      * doesn't start with it, count it and stop tracking
1086      */
1087     if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
1088         rc->ndelonly++;
1089         rc->partial = false;
1090     }
1091 
1092     /*
1093      * if the range doesn't start on a cluster boundary but its first
1094      * partial cluster runs through to a cluster boundary, count it
1095      */
1096     if (EXT4_LBLK_COFF(sbi, i) != 0) {
1097         if (end >= EXT4_LBLK_CFILL(sbi, i)) {
1098             rc->ndelonly++;
1099             rc->partial = false;
1100             i = EXT4_LBLK_CFILL(sbi, i) + 1;
1101         }
1102     }
1103 
1104     /*
1105      * if the remaining range starts on a cluster boundary, count the
1106      * number of whole delonly clusters in the extent
1107      */
1108     if ((i + sbi->s_cluster_ratio - 1) <= end) {
1109         nclu = (end - i + 1) >> sbi->s_cluster_bits;
1110         rc->ndelonly += nclu;
1111         i += nclu << sbi->s_cluster_bits;
1112     }
1113 
1114     /*
1115      * start tracking a partial cluster if there's a partial at the end
1116      * of the current extent and we're not already tracking one
1117      */
1118     if (!rc->partial && i <= end) {
1119         rc->partial = true;
1120         rc->lclu = EXT4_B2C(sbi, i);
1121     }
1122 }
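/*
 * A worked example with hypothetical values and a cluster ratio of 4:
 * counting a delonly extent covering blocks 2..13 proceeds as follows.
 * Blocks 2..3 run to the end of cluster 0, so that partial cluster is
 * counted right away; blocks 4..11 are two whole clusters; blocks
 * 12..13 are a trailing partial of cluster 3, which is only recorded
 * in rc->lclu so that an adjacent extent in the same cluster is not
 * counted twice.  Running total: ndelonly += 3, one partial pending.
 */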
1123 
1124 /*
1125  * __pr_tree_search - search for a pending cluster reservation
1126  *
1127  * @root - root of pending reservation tree
1128  * @lclu - logical cluster to search for
1129  *
1130  * Returns the pending reservation for the cluster identified by @lclu
1131  * if found.  If not, returns a reservation for the next cluster if any,
1132  * and if not, returns NULL.
1133  */
1134 static struct pending_reservation *__pr_tree_search(struct rb_root *root,
1135                             ext4_lblk_t lclu)
1136 {
1137     struct rb_node *node = root->rb_node;
1138     struct pending_reservation *pr = NULL;
1139 
1140     while (node) {
1141         pr = rb_entry(node, struct pending_reservation, rb_node);
1142         if (lclu < pr->lclu)
1143             node = node->rb_left;
1144         else if (lclu > pr->lclu)
1145             node = node->rb_right;
1146         else
1147             return pr;
1148     }
1149     if (pr && lclu < pr->lclu)
1150         return pr;
1151     if (pr && lclu > pr->lclu) {
1152         node = rb_next(&pr->rb_node);
1153         return node ? rb_entry(node, struct pending_reservation,
1154                        rb_node) : NULL;
1155     }
1156     return NULL;
1157 }
1158 
1159 /*
1160  * get_rsvd - calculates and returns the number of cluster reservations to be
1161  *        released when removing a block range from the extent status tree
1162  *        and releases any pending reservations within the range
1163  *
1164  * @inode - file containing block range
1165  * @end - last block in range
1166  * @right_es - pointer to extent containing next block beyond end or NULL
1167  * @rc - pointer to reserved count data
1168  *
1169  * The number of reservations to be released is equal to the number of
1170  * clusters containing delayed and not unwritten (delonly) blocks within
1171  * the range, minus the number of clusters still containing delonly blocks
1172  * at the ends of the range, and minus the number of pending reservations
1173  * within the range.
1174  */
1175 static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
1176                  struct extent_status *right_es,
1177                  struct rsvd_count *rc)
1178 {
1179     struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1180     struct pending_reservation *pr;
1181     struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
1182     struct rb_node *node;
1183     ext4_lblk_t first_lclu, last_lclu;
1184     bool left_delonly, right_delonly, count_pending;
1185     struct extent_status *es;
1186 
1187     if (sbi->s_cluster_ratio > 1) {
1188         /* count any remaining partial cluster */
1189         if (rc->partial)
1190             rc->ndelonly++;
1191 
1192         if (rc->ndelonly == 0)
1193             return 0;
1194 
1195         first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
1196         last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
1197 
1198         /*
1199          * decrease the delonly count by the number of clusters at the
1200          * ends of the range that still contain delonly blocks -
1201          * these clusters still need to be reserved
1202          */
1203         left_delonly = right_delonly = false;
1204 
1205         es = rc->left_es;
1206         while (es && ext4_es_end(es) >=
1207                EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
1208             if (ext4_es_is_delonly(es)) {
1209                 rc->ndelonly--;
1210                 left_delonly = true;
1211                 break;
1212             }
1213             node = rb_prev(&es->rb_node);
1214             if (!node)
1215                 break;
1216             es = rb_entry(node, struct extent_status, rb_node);
1217         }
1218         if (right_es && (!left_delonly || first_lclu != last_lclu)) {
1219             if (end < ext4_es_end(right_es)) {
1220                 es = right_es;
1221             } else {
1222                 node = rb_next(&right_es->rb_node);
1223                 es = node ? rb_entry(node, struct extent_status,
1224                              rb_node) : NULL;
1225             }
1226             while (es && es->es_lblk <=
1227                    EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
1228                 if (ext4_es_is_delonly(es)) {
1229                     rc->ndelonly--;
1230                     right_delonly = true;
1231                     break;
1232                 }
1233                 node = rb_next(&es->rb_node);
1234                 if (!node)
1235                     break;
1236                 es = rb_entry(node, struct extent_status,
1237                           rb_node);
1238             }
1239         }
1240 
1241         /*
1242          * Determine the block range that should be searched for
1243          * pending reservations, if any.  Clusters on the ends of the
1244          * original removed range containing delonly blocks are
1245          * excluded.  They've already been accounted for and it's not
1246          * possible to determine if an associated pending reservation
1247          * should be released with the information available in the
1248          * extents status tree.
1249          */
1250         if (first_lclu == last_lclu) {
1251             if (left_delonly | right_delonly)
1252                 count_pending = false;
1253             else
1254                 count_pending = true;
1255         } else {
1256             if (left_delonly)
1257                 first_lclu++;
1258             if (right_delonly)
1259                 last_lclu--;
1260             if (first_lclu <= last_lclu)
1261                 count_pending = true;
1262             else
1263                 count_pending = false;
1264         }
1265 
1266         /*
1267          * a pending reservation found between first_lclu and last_lclu
1268          * represents an allocated cluster that contained at least one
1269          * delonly block, so the delonly total must be reduced by one
1270          * for each pending reservation found and released
1271          */
1272         if (count_pending) {
1273             pr = __pr_tree_search(&tree->root, first_lclu);
1274             while (pr && pr->lclu <= last_lclu) {
1275                 rc->ndelonly--;
1276                 node = rb_next(&pr->rb_node);
1277                 rb_erase(&pr->rb_node, &tree->root);
1278                 kmem_cache_free(ext4_pending_cachep, pr);
1279                 if (!node)
1280                     break;
1281                 pr = rb_entry(node, struct pending_reservation,
1282                           rb_node);
1283             }
1284         }
1285     }
1286     return rc->ndelonly;
1287 }
1288 
1289 
1290 /*
1291  * __es_remove_extent - removes block range from extent status tree
1292  *
1293  * @inode - file containing range
1294  * @lblk - first block in range
1295  * @end - last block in range
1296  * @reserved - number of cluster reservations released
1297  *
1298  * If @reserved is not NULL and delayed allocation is enabled, counts
1299  * block/cluster reservations freed by removing range and if bigalloc
1300  * enabled cancels pending reservations as needed. Returns 0 on success,
1301  * error code on failure.
1302  */
1303 static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
1304                   ext4_lblk_t end, int *reserved)
1305 {
1306     struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
1307     struct rb_node *node;
1308     struct extent_status *es;
1309     struct extent_status orig_es;
1310     ext4_lblk_t len1, len2;
1311     ext4_fsblk_t block;
1312     int err;
1313     bool count_reserved = true;
1314     struct rsvd_count rc;
1315 
1316     if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
1317         count_reserved = false;
1318 retry:
1319     err = 0;
1320 
1321     es = __es_tree_search(&tree->root, lblk);
1322     if (!es)
1323         goto out;
1324     if (es->es_lblk > end)
1325         goto out;
1326 
1327     /* Simply invalidate cache_es. */
1328     tree->cache_es = NULL;
1329     if (count_reserved)
1330         init_rsvd(inode, lblk, es, &rc);
1331 
1332     orig_es.es_lblk = es->es_lblk;
1333     orig_es.es_len = es->es_len;
1334     orig_es.es_pblk = es->es_pblk;
1335 
1336     len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
1337     len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
1338     if (len1 > 0)
1339         es->es_len = len1;
1340     if (len2 > 0) {
1341         if (len1 > 0) {
1342             struct extent_status newes;
1343 
1344             newes.es_lblk = end + 1;
1345             newes.es_len = len2;
1346             block = 0x7FDEADBEEFULL;
1347             if (ext4_es_is_written(&orig_es) ||
1348                 ext4_es_is_unwritten(&orig_es))
1349                 block = ext4_es_pblock(&orig_es) +
1350                     orig_es.es_len - len2;
1351             ext4_es_store_pblock_status(&newes, block,
1352                             ext4_es_status(&orig_es));
1353             err = __es_insert_extent(inode, &newes);
1354             if (err) {
1355                 es->es_lblk = orig_es.es_lblk;
1356                 es->es_len = orig_es.es_len;
1357                 if ((err == -ENOMEM) &&
1358                     __es_shrink(EXT4_SB(inode->i_sb),
1359                             128, EXT4_I(inode)))
1360                     goto retry;
1361                 goto out;
1362             }
1363         } else {
1364             es->es_lblk = end + 1;
1365             es->es_len = len2;
1366             if (ext4_es_is_written(es) ||
1367                 ext4_es_is_unwritten(es)) {
1368                 block = orig_es.es_pblk + orig_es.es_len - len2;
1369                 ext4_es_store_pblock(es, block);
1370             }
1371         }
1372         if (count_reserved)
1373             count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
1374                    &orig_es, &rc);
1375         goto out;
1376     }
1377 
1378     if (len1 > 0) {
1379         if (count_reserved)
1380             count_rsvd(inode, lblk, orig_es.es_len - len1,
1381                    &orig_es, &rc);
1382         node = rb_next(&es->rb_node);
1383         if (node)
1384             es = rb_entry(node, struct extent_status, rb_node);
1385         else
1386             es = NULL;
1387     }
1388 
1389     while (es && ext4_es_end(es) <= end) {
1390         if (count_reserved)
1391             count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
1392         node = rb_next(&es->rb_node);
1393         rb_erase(&es->rb_node, &tree->root);
1394         ext4_es_free_extent(inode, es);
1395         if (!node) {
1396             es = NULL;
1397             break;
1398         }
1399         es = rb_entry(node, struct extent_status, rb_node);
1400     }
1401 
1402     if (es && es->es_lblk < end + 1) {
1403         ext4_lblk_t orig_len = es->es_len;
1404 
1405         len1 = ext4_es_end(es) - end;
1406         if (count_reserved)
1407             count_rsvd(inode, es->es_lblk, orig_len - len1,
1408                    es, &rc);
1409         es->es_lblk = end + 1;
1410         es->es_len = len1;
1411         if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
1412             block = es->es_pblk + orig_len - len1;
1413             ext4_es_store_pblock(es, block);
1414         }
1415     }
1416 
1417     if (count_reserved)
1418         *reserved = get_rsvd(inode, end, es, &rc);
1419 out:
1420     return err;
1421 }
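/*
 * A worked example with hypothetical values: if the tree holds a
 * written extent [100/50) at physical block 1000 and blocks 110..119
 * are removed, the len1/len2 logic above splits it into a head
 * [100/10) still at block 1000 and a tail [120/30) at block 1020,
 * i.e. the original physical start plus the tail's 20-block offset
 * into the original extent.
 */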
1422 
1423 /*
1424  * ext4_es_remove_extent - removes block range from extent status tree
1425  *
1426  * @inode - file containing range
1427  * @lblk - first block in range
1428  * @len - number of blocks to remove
1429  *
1430  * Reduces block/cluster reservation count and for bigalloc cancels pending
1431  * reservations as needed. Returns 0 on success, error code on failure.
1432  */
1433 int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
1434               ext4_lblk_t len)
1435 {
1436     ext4_lblk_t end;
1437     int err = 0;
1438     int reserved = 0;
1439 
1440     if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
1441         return 0;
1442 
1443     trace_ext4_es_remove_extent(inode, lblk, len);
1444     es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
1445          lblk, len, inode->i_ino);
1446 
1447     if (!len)
1448         return err;
1449 
1450     end = lblk + len - 1;
1451     BUG_ON(end < lblk);
1452 
1453     /*
1454      * ext4_clear_inode() depends on us taking i_es_lock unconditionally
1455      * so that we are sure __es_shrink() is done with the inode before it
1456      * is reclaimed.
1457      */
1458     write_lock(&EXT4_I(inode)->i_es_lock);
1459     err = __es_remove_extent(inode, lblk, end, &reserved);
1460     write_unlock(&EXT4_I(inode)->i_es_lock);
1461     ext4_es_print_tree(inode);
1462     ext4_da_release_space(inode, reserved);
1463     return err;
1464 }
1465 
1466 static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
1467                struct ext4_inode_info *locked_ei)
1468 {
1469     struct ext4_inode_info *ei;
1470     struct ext4_es_stats *es_stats;
1471     ktime_t start_time;
1472     u64 scan_time;
1473     int nr_to_walk;
1474     int nr_shrunk = 0;
1475     int retried = 0, nr_skipped = 0;
1476 
1477     es_stats = &sbi->s_es_stats;
1478     start_time = ktime_get();
1479 
1480 retry:
1481     spin_lock(&sbi->s_es_lock);
1482     nr_to_walk = sbi->s_es_nr_inode;
1483     while (nr_to_walk-- > 0) {
1484         if (list_empty(&sbi->s_es_list)) {
1485             spin_unlock(&sbi->s_es_lock);
1486             goto out;
1487         }
1488         ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
1489                       i_es_list);
1490         /* Move the inode to the tail */
1491         list_move_tail(&ei->i_es_list, &sbi->s_es_list);
1492 
1493         /*
1494          * Normally we try hard to avoid shrinking precached inodes,
1495          * but we will as a last resort.
1496          */
1497         if (!retried && ext4_test_inode_state(&ei->vfs_inode,
1498                         EXT4_STATE_EXT_PRECACHED)) {
1499             nr_skipped++;
1500             continue;
1501         }
1502 
1503         if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
1504             nr_skipped++;
1505             continue;
1506         }
1507         /*
1508          * Now we hold i_es_lock, which protects us from inode reclaim
1509          * freeing the inode under us
1510          */
1511         spin_unlock(&sbi->s_es_lock);
1512 
1513         nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
1514         write_unlock(&ei->i_es_lock);
1515 
1516         if (nr_to_scan <= 0)
1517             goto out;
1518         spin_lock(&sbi->s_es_lock);
1519     }
1520     spin_unlock(&sbi->s_es_lock);
1521 
1522     /*
1523      * If we skipped any inodes, and we weren't able to make any
1524      * forward progress, try again to scan precached inodes.
1525      */
1526     if ((nr_shrunk == 0) && nr_skipped && !retried) {
1527         retried++;
1528         goto retry;
1529     }
1530 
1531     if (locked_ei && nr_shrunk == 0)
1532         nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
1533 
1534 out:
1535     scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1536     if (likely(es_stats->es_stats_scan_time))
1537         es_stats->es_stats_scan_time = (scan_time +
1538                 es_stats->es_stats_scan_time*3) / 4;
1539     else
1540         es_stats->es_stats_scan_time = scan_time;
1541     if (scan_time > es_stats->es_stats_max_scan_time)
1542         es_stats->es_stats_max_scan_time = scan_time;
1543     if (likely(es_stats->es_stats_shrunk))
1544         es_stats->es_stats_shrunk = (nr_shrunk +
1545                 es_stats->es_stats_shrunk*3) / 4;
1546     else
1547         es_stats->es_stats_shrunk = nr_shrunk;
1548 
1549     trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
1550                  nr_skipped, retried);
1551     return nr_shrunk;
1552 }
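/*
 * Note on the statistics above: scan_time and shrunk are smoothed with a
 * 1/4-weight exponential moving average, new = (sample + 3 * old) / 4.
 * For example, an old average scan time of 1000 ns combined with a fresh
 * sample of 2000 ns yields (2000 + 3 * 1000) / 4 = 1250 ns.
 */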
1553 
1554 static unsigned long ext4_es_count(struct shrinker *shrink,
1555                    struct shrink_control *sc)
1556 {
1557     unsigned long nr;
1558     struct ext4_sb_info *sbi;
1559 
1560     sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
1561     nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1562     trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
1563     return nr;
1564 }
1565 
1566 static unsigned long ext4_es_scan(struct shrinker *shrink,
1567                   struct shrink_control *sc)
1568 {
1569     struct ext4_sb_info *sbi = container_of(shrink,
1570                     struct ext4_sb_info, s_es_shrinker);
1571     int nr_to_scan = sc->nr_to_scan;
1572     int ret, nr_shrunk;
1573 
1574     ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1575     trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
1576 
1577     nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
1578 
1579     ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1580     trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
1581     return nr_shrunk;
1582 }
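/*
 * ext4_es_count() and ext4_es_scan() are the two halves of the generic
 * shrinker contract: the MM core first calls ->count_objects() to learn
 * how many reclaimable entries exist (es_stats_shk_cnt counts only the
 * written/unwritten/hole extents, since delayed extents are never
 * reclaimed), then calls ->scan_objects() with sc->nr_to_scan set to
 * the batch size it wants freed.
 */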
1583 
1584 int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
1585 {
1586     struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private);
1587     struct ext4_es_stats *es_stats = &sbi->s_es_stats;
1588     struct ext4_inode_info *ei, *max = NULL;
1589     unsigned int inode_cnt = 0;
1590 
1591     if (v != SEQ_START_TOKEN)
1592         return 0;
1593 
1594     /* here we just find an inode that has the max nr. of objects */
1595     spin_lock(&sbi->s_es_lock);
1596     list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
1597         inode_cnt++;
1598         if (max && max->i_es_all_nr < ei->i_es_all_nr)
1599             max = ei;
1600         else if (!max)
1601             max = ei;
1602     }
1603     spin_unlock(&sbi->s_es_lock);
1604 
1605     seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
1606            percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
1607            percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
1608     seq_printf(seq, "  %lld/%lld cache hits/misses\n",
1609            percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
1610            percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
1611     if (inode_cnt)
1612         seq_printf(seq, "  %d inodes on list\n", inode_cnt);
1613 
1614     seq_printf(seq, "average:\n  %llu us scan time\n",
1615         div_u64(es_stats->es_stats_scan_time, 1000));
1616     seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
1617     if (inode_cnt)
1618         seq_printf(seq,
1619             "maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
1620             "  %llu us max scan time\n",
1621             max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
1622             div_u64(es_stats->es_stats_max_scan_time, 1000));
1623 
1624     return 0;
1625 }
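/*
 * The routine above backs the es_shrinker_info file under /proc/fs/ext4.
 * Given the seq_printf() format strings used here, its output looks like
 * the following (all numbers are illustrative only):
 *
 *	stats:
 *	  1024 objects
 *	  512 reclaimable objects
 *	  9000/100 cache hits/misses
 *	  12 inodes on list
 *	average:
 *	  40 us scan time
 *	  128 shrunk objects
 *	maximum:
 *	  131 inode (256 objects, 128 reclaimable)
 *	  900 us max scan time
 */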
1626 
1627 int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1628 {
1629     int err;
1630 
1631     /* Make sure we have enough bits for physical block number */
1632     BUILD_BUG_ON(ES_SHIFT < 48);
1633     INIT_LIST_HEAD(&sbi->s_es_list);
1634     sbi->s_es_nr_inode = 0;
1635     spin_lock_init(&sbi->s_es_lock);
1636     sbi->s_es_stats.es_stats_shrunk = 0;
1637     err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0,
1638                   GFP_KERNEL);
1639     if (err)
1640         return err;
1641     err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0,
1642                   GFP_KERNEL);
1643     if (err)
1644         goto err1;
1645     sbi->s_es_stats.es_stats_scan_time = 0;
1646     sbi->s_es_stats.es_stats_max_scan_time = 0;
1647     err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
1648     if (err)
1649         goto err2;
1650     err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
1651     if (err)
1652         goto err3;
1653 
1654     sbi->s_es_shrinker.scan_objects = ext4_es_scan;
1655     sbi->s_es_shrinker.count_objects = ext4_es_count;
1656     sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
1657     err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s",
1658                 sbi->s_sb->s_id);
1659     if (err)
1660         goto err4;
1661 
1662     return 0;
1663 err4:
1664     percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1665 err3:
1666     percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1667 err2:
1668     percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
1669 err1:
1670     percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
1671     return err;
1672 }
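/*
 * The err1..err4 labels above form the usual kernel unwind ladder: each
 * label undoes, in reverse order, exactly the allocations that succeeded
 * before the failure, so no percpu counter leaks on any path.  A caller
 * in the mount path only checks the single return value, e.g.:
 *
 *	err = ext4_es_register_shrinker(sbi);
 *	if (err)
 *		goto failed_mount;	// hypothetical error label
 */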
1673 
1674 void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
1675 {
1676     percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
1677     percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
1678     percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1679     percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1680     unregister_shrinker(&sbi->s_es_shrinker);
1681 }
1682 
1683 /*
1684  * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at
1685  * most *nr_to_scan extents, update *nr_to_scan accordingly.
1686  *
1687  * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
1688  * Increment *nr_shrunk by the number of reclaimed extents. Also update
1689  * ei->i_es_shrink_lblk to where we should continue scanning.
1690  */
1691 static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
1692                  int *nr_to_scan, int *nr_shrunk)
1693 {
1694     struct inode *inode = &ei->vfs_inode;
1695     struct ext4_es_tree *tree = &ei->i_es_tree;
1696     struct extent_status *es;
1697     struct rb_node *node;
1698 
1699     es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
1700     if (!es)
1701         goto out_wrap;
1702 
1703     while (*nr_to_scan > 0) {
1704         if (es->es_lblk > end) {
1705             ei->i_es_shrink_lblk = end + 1;
1706             return 0;
1707         }
1708 
1709         (*nr_to_scan)--;
1710         node = rb_next(&es->rb_node);
1711         /*
1712          * We can't reclaim delayed extents from the status tree because
1713          * fiemap, bigalloc, and seek_data/hole need to use them.
1714          */
1715         if (ext4_es_is_delayed(es))
1716             goto next;
1717         if (ext4_es_is_referenced(es)) {
1718             ext4_es_clear_referenced(es);
1719             goto next;
1720         }
1721 
1722         rb_erase(&es->rb_node, &tree->root);
1723         ext4_es_free_extent(inode, es);
1724         (*nr_shrunk)++;
1725 next:
1726         if (!node)
1727             goto out_wrap;
1728         es = rb_entry(node, struct extent_status, rb_node);
1729     }
1730     ei->i_es_shrink_lblk = es->es_lblk;
1731     return 1;
1732 out_wrap:
1733     ei->i_es_shrink_lblk = 0;
1734     return 0;
1735 }
1736 
1737 static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
1738 {
1739     struct inode *inode = &ei->vfs_inode;
1740     int nr_shrunk = 0;
1741     ext4_lblk_t start = ei->i_es_shrink_lblk;
1742     static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1743                       DEFAULT_RATELIMIT_BURST);
1744 
1745     if (ei->i_es_shk_nr == 0)
1746         return 0;
1747 
1748     if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
1749         __ratelimit(&_rs))
1750         ext4_warning(inode->i_sb, "forced shrink of precached extents");
1751 
1752     if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
1753         start != 0)
1754         es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
1755 
1756     ei->i_es_tree.cache_es = NULL;
1757     return nr_shrunk;
1758 }
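/*
 * The two calls above implement a round-robin sweep over the tree: the
 * first pass runs from the saved i_es_shrink_lblk up to EXT_MAX_BLOCKS,
 * and if it reaches the end of the tree before exhausting *nr_to_scan,
 * a second pass covers [0, start - 1].  For example, with start = 100
 * and only a few extents left past block 100, the scan wraps around to
 * logical block 0, so repeated shrinker invocations eventually visit
 * every reclaimable extent instead of restarting at the same offset.
 */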
1759 
1760 /*
1761  * Called to support EXT4_IOC_CLEAR_ES_CACHE.  We can only remove
1762  * discretionary entries from the extent status cache.  (Some entries
1763  * must be present for proper operations.)
1764  */
1765 void ext4_clear_inode_es(struct inode *inode)
1766 {
1767     struct ext4_inode_info *ei = EXT4_I(inode);
1768     struct extent_status *es;
1769     struct ext4_es_tree *tree;
1770     struct rb_node *node;
1771 
1772     write_lock(&ei->i_es_lock);
1773     tree = &EXT4_I(inode)->i_es_tree;
1774     tree->cache_es = NULL;
1775     node = rb_first(&tree->root);
1776     while (node) {
1777         es = rb_entry(node, struct extent_status, rb_node);
1778         node = rb_next(node);
1779         if (!ext4_es_is_delayed(es)) {
1780             rb_erase(&es->rb_node, &tree->root);
1781             ext4_es_free_extent(inode, es);
1782         }
1783     }
1784     ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
1785     write_unlock(&ei->i_es_lock);
1786 }
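/*
 * Userspace reaches ext4_clear_inode_es() through the
 * EXT4_IOC_CLEAR_ES_CACHE ioctl (used mainly by e2fsprogs tests).  A
 * minimal sketch, assuming an fd open on an ext4 file and the ioctl
 * number taken from the ext4 headers:
 *
 *	#include <sys/ioctl.h>
 *
 *	if (ioctl(fd, EXT4_IOC_CLEAR_ES_CACHE) < 0)
 *		perror("EXT4_IOC_CLEAR_ES_CACHE");
 *
 * As the loop above shows, only non-delayed entries are dropped.
 */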
1787 
1788 #ifdef ES_DEBUG__
1789 static void ext4_print_pending_tree(struct inode *inode)
1790 {
1791     struct ext4_pending_tree *tree;
1792     struct rb_node *node;
1793     struct pending_reservation *pr;
1794 
1795     printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino);
1796     tree = &EXT4_I(inode)->i_pending_tree;
1797     node = rb_first(&tree->root);
1798     while (node) {
1799         pr = rb_entry(node, struct pending_reservation, rb_node);
1800         printk(KERN_DEBUG " %u", pr->lclu);
1801         node = rb_next(node);
1802     }
1803     printk(KERN_DEBUG "\n");
1804 }
1805 #else
1806 #define ext4_print_pending_tree(inode)
1807 #endif
1808 
1809 int __init ext4_init_pending(void)
1810 {
1811     ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation",
1812                        sizeof(struct pending_reservation),
1813                        0, (SLAB_RECLAIM_ACCOUNT), NULL);
1814     if (ext4_pending_cachep == NULL)
1815         return -ENOMEM;
1816     return 0;
1817 }
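/*
 * SLAB_RECLAIM_ACCOUNT marks the cache as reclaimable for slab
 * accounting purposes.  Note that allocations from this cache in
 * __insert_pending() below must use GFP_ATOMIC, because they happen
 * while holding the i_es_lock write lock.
 */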
1818 
1819 void ext4_exit_pending(void)
1820 {
1821     kmem_cache_destroy(ext4_pending_cachep);
1822 }
1823 
1824 void ext4_init_pending_tree(struct ext4_pending_tree *tree)
1825 {
1826     tree->root = RB_ROOT;
1827 }
1828 
1829 /*
1830  * __get_pending - retrieve a pointer to a pending reservation
1831  *
1832  * @inode - file containing the pending cluster reservation
1833  * @lclu - logical cluster of interest
1834  *
1835  * Returns a pointer to a pending reservation if it's a member of
1836  * the set, and NULL if not.  Must be called holding i_es_lock.
1837  */
1838 static struct pending_reservation *__get_pending(struct inode *inode,
1839                          ext4_lblk_t lclu)
1840 {
1841     struct ext4_pending_tree *tree;
1842     struct rb_node *node;
1843     struct pending_reservation *pr = NULL;
1844 
1845     tree = &EXT4_I(inode)->i_pending_tree;
1846     node = (&tree->root)->rb_node;
1847 
1848     while (node) {
1849         pr = rb_entry(node, struct pending_reservation, rb_node);
1850         if (lclu < pr->lclu)
1851             node = node->rb_left;
1852         else if (lclu > pr->lclu)
1853             node = node->rb_right;
1854         else if (lclu == pr->lclu)
1855             return pr;
1856     }
1857     return NULL;
1858 }
1859 
1860 /*
1861  * __insert_pending - adds a pending cluster reservation to the set of
1862  *                    pending reservations
1863  *
1864  * @inode - file containing the cluster
1865  * @lblk - logical block in the cluster to be added
1866  *
1867  * Returns 0 on successful insertion and -ENOMEM on failure.  If the
1868  * pending reservation is already in the set, returns successfully.
1869  */
1870 static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
1871 {
1872     struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1873     struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
1874     struct rb_node **p = &tree->root.rb_node;
1875     struct rb_node *parent = NULL;
1876     struct pending_reservation *pr;
1877     ext4_lblk_t lclu;
1878     int ret = 0;
1879 
1880     lclu = EXT4_B2C(sbi, lblk);
1881     /* search to find parent for insertion */
1882     while (*p) {
1883         parent = *p;
1884         pr = rb_entry(parent, struct pending_reservation, rb_node);
1885 
1886         if (lclu < pr->lclu) {
1887             p = &(*p)->rb_left;
1888         } else if (lclu > pr->lclu) {
1889             p = &(*p)->rb_right;
1890         } else {
1891             /* pending reservation already inserted */
1892             goto out;
1893         }
1894     }
1895 
1896     pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
1897     if (pr == NULL) {
1898         ret = -ENOMEM;
1899         goto out;
1900     }
1901     pr->lclu = lclu;
1902 
1903     rb_link_node(&pr->rb_node, parent, p);
1904     rb_insert_color(&pr->rb_node, &tree->root);
1905 
1906 out:
1907     return ret;
1908 }
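/*
 * Worked example: on a bigalloc file system with s_cluster_ratio = 16
 * (sixteen blocks per cluster), EXT4_B2C() maps logical blocks 32..47
 * to logical cluster 2.  __insert_pending(inode, 35) therefore records
 * a pending reservation for cluster 2, and a later call for block 40
 * finds the node already present and returns 0 without allocating.
 */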
1909 
1910 /*
1911  * __remove_pending - removes a pending cluster reservation from the set
1912  *                    of pending reservations
1913  *
1914  * @inode - file containing the cluster
1915  * @lblk - logical block in the pending cluster reservation to be removed
1916  *
1917  * Does nothing if the pending reservation is not a member of the set.
1918  */
1919 static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
1920 {
1921     struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1922     struct pending_reservation *pr;
1923     struct ext4_pending_tree *tree;
1924 
1925     pr = __get_pending(inode, EXT4_B2C(sbi, lblk));
1926     if (pr != NULL) {
1927         tree = &EXT4_I(inode)->i_pending_tree;
1928         rb_erase(&pr->rb_node, &tree->root);
1929         kmem_cache_free(ext4_pending_cachep, pr);
1930     }
1931 }
1932 
1933 /*
1934  * ext4_remove_pending - removes a pending cluster reservation from the set
1935  *                       of pending reservations
1936  *
1937  * @inode - file containing the cluster
1938  * @lblk - logical block in the pending cluster reservation to be removed
1939  *
1940  * Locking for external use of __remove_pending.
1941  */
1942 void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
1943 {
1944     struct ext4_inode_info *ei = EXT4_I(inode);
1945 
1946     write_lock(&ei->i_es_lock);
1947     __remove_pending(inode, lblk);
1948     write_unlock(&ei->i_es_lock);
1949 }
1950 
1951 /*
1952  * ext4_is_pending - determine whether a cluster has a pending reservation
1953  *                   on it
1954  *
1955  * @inode - file containing the cluster
1956  * @lblk - logical block in the cluster
1957  *
1958  * Returns true if there's a pending reservation for the cluster in the
1959  * set of pending reservations, and false if not.
1960  */
1961 bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
1962 {
1963     struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1964     struct ext4_inode_info *ei = EXT4_I(inode);
1965     bool ret;
1966 
1967     read_lock(&ei->i_es_lock);
1968     ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL);
1969     read_unlock(&ei->i_es_lock);
1970 
1971     return ret;
1972 }
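/*
 * Illustrative use by a hypothetical caller in the delayed-allocation
 * path, deciding whether a cluster still needs a reservation (the
 * reserve_cluster() helper is invented for the example):
 *
 *	if (!ext4_is_pending(inode, lblk))
 *		reserve_cluster(inode, lblk);	// hypothetical helper
 *
 * The read lock above makes the lookup safe against concurrent inserts
 * and removals, but the answer is only a snapshot once the lock drops.
 */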
1973 
1974 /*
1975  * ext4_es_insert_delayed_block - adds a delayed block to the extents status
1976  *                                tree, adding a pending reservation where
1977  *                                needed
1978  *
1979  * @inode - file containing the newly added block
1980  * @lblk - logical block to be added
1981  * @allocated - indicates whether a physical cluster has been allocated for
1982  *              the logical cluster that contains the block
1983  *
1984  * Returns 0 on success, negative error code on failure.
1985  */
1986 int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
1987                  bool allocated)
1988 {
1989     struct extent_status newes;
1990     int err = 0;
1991 
1992     if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
1993         return 0;
1994 
1995     es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
1996          lblk, inode->i_ino);
1997 
1998     newes.es_lblk = lblk;
1999     newes.es_len = 1;
2000     ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
2001     trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
2002 
2003     ext4_es_insert_extent_check(inode, &newes);
2004 
2005     write_lock(&EXT4_I(inode)->i_es_lock);
2006 
2007     err = __es_remove_extent(inode, lblk, lblk, NULL);
2008     if (err != 0)
2009         goto error;
2010 retry:
2011     err = __es_insert_extent(inode, &newes);
2012     if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
2013                       128, EXT4_I(inode)))
2014         goto retry;
2015     if (err != 0)
2016         goto error;
2017 
2018     if (allocated)
2019         __insert_pending(inode, lblk);
2020 
2021 error:
2022     write_unlock(&EXT4_I(inode)->i_es_lock);
2023 
2024     ext4_es_print_tree(inode);
2025     ext4_print_pending_tree(inode);
2026 
2027     return err;
2028 }
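/*
 * The retry loop above is this file's standard response to allocation
 * failure: when __es_insert_extent() returns -ENOMEM, __es_shrink() is
 * invoked directly with a 128-object budget, passing this inode as
 * locked_ei because its i_es_lock is already held, and the insert is
 * retried for as long as the shrinker reports progress.
 */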
2029 
2030 /*
2031  * __es_delayed_clu - count number of clusters containing blocks that
2032  *                    are delayed only
2033  *
2034  * @inode - file containing block range
2035  * @start - logical block defining start of range
2036  * @end - logical block defining end of range
2037  *
2038  * Returns the number of clusters containing only delayed (not delayed
2039  * and unwritten) blocks in the range specified by @start and @end.  Any
2040  * cluster, or part of a cluster, that lies within the range and contains
2041  * at least one delayed and not unwritten block is counted as a whole cluster.
2042  */
2043 static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
2044                      ext4_lblk_t end)
2045 {
2046     struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
2047     struct extent_status *es;
2048     struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2049     struct rb_node *node;
2050     ext4_lblk_t first_lclu, last_lclu;
2051     unsigned long long last_counted_lclu;
2052     unsigned int n = 0;
2053 
2054     /* guaranteed to be unequal to any ext4_lblk_t value */
2055     last_counted_lclu = ~0ULL;
2056 
2057     es = __es_tree_search(&tree->root, start);
2058 
2059     while (es && (es->es_lblk <= end)) {
2060         if (ext4_es_is_delonly(es)) {
2061             if (es->es_lblk <= start)
2062                 first_lclu = EXT4_B2C(sbi, start);
2063             else
2064                 first_lclu = EXT4_B2C(sbi, es->es_lblk);
2065 
2066             if (ext4_es_end(es) >= end)
2067                 last_lclu = EXT4_B2C(sbi, end);
2068             else
2069                 last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
2070 
2071             if (first_lclu == last_counted_lclu)
2072                 n += last_lclu - first_lclu;
2073             else
2074                 n += last_lclu - first_lclu + 1;
2075             last_counted_lclu = last_lclu;
2076         }
2077         node = rb_next(&es->rb_node);
2078         if (!node)
2079             break;
2080         es = rb_entry(node, struct extent_status, rb_node);
2081     }
2082 
2083     return n;
2084 }
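/*
 * Worked example: with eight blocks per cluster, a delayed-only extent
 * covering blocks 6..9 spans clusters 0 and 1 and contributes 2 to the
 * count.  If the next delayed-only extent starts at block 12 (still in
 * cluster 1), last_counted_lclu == 1 keeps cluster 1 from being counted
 * twice: n += last_lclu - first_lclu then adds only the clusters beyond
 * the one already counted.
 */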
2085 
2086 /*
2087  * ext4_es_delayed_clu - count number of clusters containing blocks that
2088  *                       are delayed and not unwritten
2089  *
2090  * @inode - file containing block range
2091  * @lblk - logical block defining start of range
2092  * @len - number of blocks in range
2093  *
2094  * Locking for external use of __es_delayed_clu().
2095  */
2096 unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
2097                  ext4_lblk_t len)
2098 {
2099     struct ext4_inode_info *ei = EXT4_I(inode);
2100     ext4_lblk_t end;
2101     unsigned int n;
2102 
2103     if (len == 0)
2104         return 0;
2105 
2106     end = lblk + len - 1;
2107     WARN_ON(end < lblk);
2108 
2109     read_lock(&ei->i_es_lock);
2110 
2111     n = __es_delayed_clu(inode, lblk, end);
2112 
2113     read_unlock(&ei->i_es_lock);
2114 
2115     return n;
2116 }
2117 
2118 /*
2119  * __revise_pending - makes, cancels, or leaves unchanged pending cluster
2120  *                    reservations for a specified block range depending
2121  *                    upon the presence or absence of delayed blocks
2122  *                    outside the range within clusters at the ends of the
2123  *                    range
2124  *
2125  * @inode - file containing the range
2126  * @lblk - logical block defining the start of range
2127  * @len  - length of range in blocks
2128  *
2129  * Used after a newly allocated extent is added to the extents status tree.
2130  * Requires that the extents in the range have either written or unwritten
2131  * status.  Must be called while holding i_es_lock.
2132  */
2133 static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
2134                  ext4_lblk_t len)
2135 {
2136     struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2137     ext4_lblk_t end = lblk + len - 1;
2138     ext4_lblk_t first, last;
2139     bool f_del = false, l_del = false;
2140 
2141     if (len == 0)
2142         return;
2143 
2144     /*
2145      * Two cases - block range within single cluster and block range
2146      * spanning two or more clusters.  Note that a cluster belonging
2147      * to a range starting and/or ending on a cluster boundary is treated
2148      * as if it does not contain a delayed extent.  The new range may
2149      * have allocated space for previously delayed blocks out to the
2150      * cluster boundary, requiring that any pre-existing pending
2151      * reservation be canceled.  Because this code only looks at blocks
2152      * outside the range, it should revise pending reservations
2153      * correctly even if the extent represented by the range can't be
2154      * inserted in the extents status tree due to ENOSPC.
2155      */
2156 
2157     if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
2158         first = EXT4_LBLK_CMASK(sbi, lblk);
2159         if (first != lblk)
2160             f_del = __es_scan_range(inode, &ext4_es_is_delonly,
2161                         first, lblk - 1);
2162         if (f_del) {
2163             __insert_pending(inode, first);
2164         } else {
2165             last = EXT4_LBLK_CMASK(sbi, end) +
2166                    sbi->s_cluster_ratio - 1;
2167             if (last != end)
2168                 l_del = __es_scan_range(inode,
2169                             &ext4_es_is_delonly,
2170                             end + 1, last);
2171             if (l_del)
2172                 __insert_pending(inode, last);
2173             else
2174                 __remove_pending(inode, last);
2175         }
2176     } else {
2177         first = EXT4_LBLK_CMASK(sbi, lblk);
2178         if (first != lblk)
2179             f_del = __es_scan_range(inode, &ext4_es_is_delonly,
2180                         first, lblk - 1);
2181         if (f_del)
2182             __insert_pending(inode, first);
2183         else
2184             __remove_pending(inode, first);
2185 
2186         last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
2187         if (last != end)
2188             l_del = __es_scan_range(inode, &ext4_es_is_delonly,
2189                         end + 1, last);
2190         if (l_del)
2191             __insert_pending(inode, last);
2192         else
2193             __remove_pending(inode, last);
2194     }
2195 }
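/*
 * Worked example for the single-cluster case above: with sixteen blocks
 * per cluster, mapping blocks 20..23 (inside cluster 1, blocks 16..31)
 * gives first = 16 and last = 31.  If blocks 16..19 still hold
 * delayed-only data, the cluster keeps (or gains) a pending reservation;
 * otherwise blocks 24..31 are scanned, and the reservation is inserted
 * or removed depending on what is found there.
 */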