0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Copyright (C) 2007 Oracle.  All rights reserved.
0004  */
0005 
0006 #include <linux/fs.h>
0007 #include <linux/pagemap.h>
0008 #include <linux/time.h>
0009 #include <linux/init.h>
0010 #include <linux/string.h>
0011 #include <linux/backing-dev.h>
0012 #include <linux/falloc.h>
0013 #include <linux/writeback.h>
0014 #include <linux/compat.h>
0015 #include <linux/slab.h>
0016 #include <linux/btrfs.h>
0017 #include <linux/uio.h>
0018 #include <linux/iversion.h>
0019 #include <linux/fsverity.h>
0020 #include "ctree.h"
0021 #include "disk-io.h"
0022 #include "transaction.h"
0023 #include "btrfs_inode.h"
0024 #include "print-tree.h"
0025 #include "tree-log.h"
0026 #include "locking.h"
0027 #include "volumes.h"
0028 #include "qgroup.h"
0029 #include "compression.h"
0030 #include "delalloc-space.h"
0031 #include "reflink.h"
0032 #include "subpage.h"
0033 
0034 static struct kmem_cache *btrfs_inode_defrag_cachep;
0035 /*
0036  * When auto defrag is enabled we queue up these defrag
0037  * structs to remember which inodes need defragging
0038  * passes.
0039  */
0040 struct inode_defrag {
0041     struct rb_node rb_node;
0042     /* objectid */
0043     u64 ino;
0044     /*
0045      * transid where the defrag was added, we search for
0046      * extents newer than this
0047      */
0048     u64 transid;
0049 
0050     /* root objectid */
0051     u64 root;
0052 
0053     /*
0054      * The extent size threshold for autodefrag.
0055      *
0056      * This value is different for compressed/non-compressed extents,
0057      * thus needs to be passed from higher layer.
0058      * (aka, inode_should_defrag())
0059      */
0060     u32 extent_thresh;
0061 };
0062 
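/* Compare two defrag records: order by root objectid first, then by inode number. */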
0063 static int __compare_inode_defrag(struct inode_defrag *defrag1,
0064                   struct inode_defrag *defrag2)
0065 {
0066     if (defrag1->root > defrag2->root)
0067         return 1;
0068     else if (defrag1->root < defrag2->root)
0069         return -1;
0070     else if (defrag1->ino > defrag2->ino)
0071         return 1;
0072     else if (defrag1->ino < defrag2->ino)
0073         return -1;
0074     else
0075         return 0;
0076 }
0077 
0078 /* Insert a record for an inode into the defrag tree.  The lock
0079  * must be held already.
0080  *
0081  * If you're inserting a record for an older transid than an
0082  * existing record, the transid already in the tree is lowered.
0083  *
0084  * If an existing record is found, the defrag item you
0085  * pass in is freed.
0086  */
0087 static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
0088                     struct inode_defrag *defrag)
0089 {
0090     struct btrfs_fs_info *fs_info = inode->root->fs_info;
0091     struct inode_defrag *entry;
0092     struct rb_node **p;
0093     struct rb_node *parent = NULL;
0094     int ret;
0095 
0096     p = &fs_info->defrag_inodes.rb_node;
0097     while (*p) {
0098         parent = *p;
0099         entry = rb_entry(parent, struct inode_defrag, rb_node);
0100 
0101         ret = __compare_inode_defrag(defrag, entry);
0102         if (ret < 0)
0103             p = &parent->rb_left;
0104         else if (ret > 0)
0105             p = &parent->rb_right;
0106         else {
0107             /* if we're reinserting an entry for
0108              * an old defrag run, make sure to
0109              * lower the transid of our existing record
0110              */
0111             if (defrag->transid < entry->transid)
0112                 entry->transid = defrag->transid;
0113             entry->extent_thresh = min(defrag->extent_thresh,
0114                            entry->extent_thresh);
0115             return -EEXIST;
0116         }
0117     }
0118     set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
0119     rb_link_node(&defrag->rb_node, parent, p);
0120     rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
0121     return 0;
0122 }
0123 
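/*
 * Auto defrag is only worth doing while the autodefrag mount option is set
 * and the filesystem is not in the middle of being unmounted.
 */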
0124 static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
0125 {
0126     if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
0127         return 0;
0128 
0129     if (btrfs_fs_closing(fs_info))
0130         return 0;
0131 
0132     return 1;
0133 }
0134 
0135 /*
0136  * Insert a defrag record for this inode if auto defrag is
0137  * enabled.
0138  */
0139 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
0140                struct btrfs_inode *inode, u32 extent_thresh)
0141 {
0142     struct btrfs_root *root = inode->root;
0143     struct btrfs_fs_info *fs_info = root->fs_info;
0144     struct inode_defrag *defrag;
0145     u64 transid;
0146     int ret;
0147 
0148     if (!__need_auto_defrag(fs_info))
0149         return 0;
0150 
0151     if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
0152         return 0;
0153 
0154     if (trans)
0155         transid = trans->transid;
0156     else
0157         transid = inode->root->last_trans;
0158 
0159     defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
0160     if (!defrag)
0161         return -ENOMEM;
0162 
0163     defrag->ino = btrfs_ino(inode);
0164     defrag->transid = transid;
0165     defrag->root = root->root_key.objectid;
0166     defrag->extent_thresh = extent_thresh;
0167 
0168     spin_lock(&fs_info->defrag_inodes_lock);
0169     if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
0170         /*
0171          * If we set the IN_DEFRAG flag and the inode is evicted and then
0172          * read back in, the new in-memory inode won't have IN_DEFRAG set.
0173          * In that case we may find an existing defrag record in the tree.
0174          */
0175         ret = __btrfs_add_inode_defrag(inode, defrag);
0176         if (ret)
0177             kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
0178     } else {
0179         kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
0180     }
0181     spin_unlock(&fs_info->defrag_inodes_lock);
0182     return 0;
0183 }
0184 
0185 /*
0186  * Pick the defraggable inode that we want; if it doesn't exist, we will
0187  * get the next one.
0188  */
0189 static struct inode_defrag *
0190 btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
0191 {
0192     struct inode_defrag *entry = NULL;
0193     struct inode_defrag tmp;
0194     struct rb_node *p;
0195     struct rb_node *parent = NULL;
0196     int ret;
0197 
0198     tmp.ino = ino;
0199     tmp.root = root;
0200 
0201     spin_lock(&fs_info->defrag_inodes_lock);
0202     p = fs_info->defrag_inodes.rb_node;
0203     while (p) {
0204         parent = p;
0205         entry = rb_entry(parent, struct inode_defrag, rb_node);
0206 
0207         ret = __compare_inode_defrag(&tmp, entry);
0208         if (ret < 0)
0209             p = parent->rb_left;
0210         else if (ret > 0)
0211             p = parent->rb_right;
0212         else
0213             goto out;
0214     }
0215 
0216     if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
0217         parent = rb_next(parent);
0218         if (parent)
0219             entry = rb_entry(parent, struct inode_defrag, rb_node);
0220         else
0221             entry = NULL;
0222     }
0223 out:
0224     if (entry)
0225         rb_erase(parent, &fs_info->defrag_inodes);
0226     spin_unlock(&fs_info->defrag_inodes_lock);
0227     return entry;
0228 }
0229 
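/* Free every queued defrag record, e.g. when the filesystem is being torn down. */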
0230 void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
0231 {
0232     struct inode_defrag *defrag;
0233     struct rb_node *node;
0234 
0235     spin_lock(&fs_info->defrag_inodes_lock);
0236     node = rb_first(&fs_info->defrag_inodes);
0237     while (node) {
0238         rb_erase(node, &fs_info->defrag_inodes);
0239         defrag = rb_entry(node, struct inode_defrag, rb_node);
0240         kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
0241 
0242         cond_resched_lock(&fs_info->defrag_inodes_lock);
0243 
0244         node = rb_first(&fs_info->defrag_inodes);
0245     }
0246     spin_unlock(&fs_info->defrag_inodes_lock);
0247 }
0248 
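/*
 * Upper bound on the amount of work a single btrfs_defrag_file() call below
 * may do before we loop back and recheck whether auto defrag should still run.
 */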
0249 #define BTRFS_DEFRAG_BATCH  1024
0250 
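/*
 * Defrag one queued inode: look it up in its root and feed it to
 * btrfs_defrag_file() in BTRFS_DEFRAG_BATCH sized chunks until the whole file
 * has been processed or auto defrag is no longer wanted.
 */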
0251 static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
0252                     struct inode_defrag *defrag)
0253 {
0254     struct btrfs_root *inode_root;
0255     struct inode *inode;
0256     struct btrfs_ioctl_defrag_range_args range;
0257     int ret = 0;
0258     u64 cur = 0;
0259 
0260 again:
0261     if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
0262         goto cleanup;
0263     if (!__need_auto_defrag(fs_info))
0264         goto cleanup;
0265 
0266     /* get the inode */
0267     inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
0268     if (IS_ERR(inode_root)) {
0269         ret = PTR_ERR(inode_root);
0270         goto cleanup;
0271     }
0272 
0273     inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
0274     btrfs_put_root(inode_root);
0275     if (IS_ERR(inode)) {
0276         ret = PTR_ERR(inode);
0277         goto cleanup;
0278     }
0279 
0280     if (cur >= i_size_read(inode)) {
0281         iput(inode);
0282         goto cleanup;
0283     }
0284 
0285     /* do a chunk of defrag */
0286     clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
0287     memset(&range, 0, sizeof(range));
0288     range.len = (u64)-1;
0289     range.start = cur;
0290     range.extent_thresh = defrag->extent_thresh;
0291 
0292     sb_start_write(fs_info->sb);
0293     ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
0294                        BTRFS_DEFRAG_BATCH);
0295     sb_end_write(fs_info->sb);
0296     iput(inode);
0297 
0298     if (ret < 0)
0299         goto cleanup;
0300 
0301     cur = max(cur + fs_info->sectorsize, range.start);
0302     goto again;
0303 
0304 cleanup:
0305     kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
0306     return ret;
0307 }
0308 
0309 /*
0310  * run through the list of inodes in the FS that need
0311  * defragging
0312  */
0313 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
0314 {
0315     struct inode_defrag *defrag;
0316     u64 first_ino = 0;
0317     u64 root_objectid = 0;
0318 
0319     atomic_inc(&fs_info->defrag_running);
0320     while (1) {
0321         /* Pause the auto defragger. */
0322         if (test_bit(BTRFS_FS_STATE_REMOUNTING,
0323                  &fs_info->fs_state))
0324             break;
0325 
0326         if (!__need_auto_defrag(fs_info))
0327             break;
0328 
0329         /* find an inode to defrag */
0330         defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
0331                          first_ino);
0332         if (!defrag) {
0333             if (root_objectid || first_ino) {
0334                 root_objectid = 0;
0335                 first_ino = 0;
0336                 continue;
0337             } else {
0338                 break;
0339             }
0340         }
0341 
0342         first_ino = defrag->ino + 1;
0343         root_objectid = defrag->root;
0344 
0345         __btrfs_run_defrag_inode(fs_info, defrag);
0346     }
0347     atomic_dec(&fs_info->defrag_running);
0348 
0349     /*
0350      * during unmount, we use the transaction_wait queue to
0351      * wait for the defragger to stop
0352      */
0353     wake_up(&fs_info->transaction_wait);
0354     return 0;
0355 }
0356 
0357 /* simple helper to fault in pages and copy.  This should go away
0358  * and be replaced with calls into generic code.
0359  */
0360 static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
0361                      struct page **prepared_pages,
0362                      struct iov_iter *i)
0363 {
0364     size_t copied = 0;
0365     size_t total_copied = 0;
0366     int pg = 0;
0367     int offset = offset_in_page(pos);
0368 
0369     while (write_bytes > 0) {
0370         size_t count = min_t(size_t,
0371                      PAGE_SIZE - offset, write_bytes);
0372         struct page *page = prepared_pages[pg];
0373         /*
0374          * Copy data from userspace to the current page
0375          */
0376         copied = copy_page_from_iter_atomic(page, offset, count, i);
0377 
0378         /* Flush processor's dcache for this page */
0379         flush_dcache_page(page);
0380 
0381         /*
0382          * if we get a partial write, we can end up with
0383          * partially up to date pages.  These add
0384          * a lot of complexity, so make sure they don't
0385          * happen by forcing this copy to be retried.
0386          *
0387          * The rest of the btrfs_file_write code will fall
0388          * back to page at a time copies after we return 0.
0389          */
0390         if (unlikely(copied < count)) {
0391             if (!PageUptodate(page)) {
0392                 iov_iter_revert(i, copied);
0393                 copied = 0;
0394             }
0395             if (!copied)
0396                 break;
0397         }
0398 
0399         write_bytes -= copied;
0400         total_copied += copied;
0401         offset += copied;
0402         if (offset == PAGE_SIZE) {
0403             pg++;
0404             offset = 0;
0405         }
0406     }
0407     return total_copied;
0408 }
0409 
0410 /*
0411  * unlocks pages after btrfs_file_write is done with them
0412  */
0413 static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
0414                  struct page **pages, size_t num_pages,
0415                  u64 pos, u64 copied)
0416 {
0417     size_t i;
0418     u64 block_start = round_down(pos, fs_info->sectorsize);
0419     u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
0420 
0421     ASSERT(block_len <= U32_MAX);
0422     for (i = 0; i < num_pages; i++) {
0423         /* The "checked" page flag is some magic around finding pages
0424          * that have been modified without going through
0425          * btrfs_set_page_dirty; clear it here. There should be no
0426          * need to mark the pages accessed, as prepare_pages() should
0427          * have already marked them accessed via find_or_create_page().
0428          */
0429         btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
0430                            block_len);
0431         unlock_page(pages[i]);
0432         put_page(pages[i]);
0433     }
0434 }
0435 
0436 /*
0437  * After btrfs_copy_from_user(), update the following things for delalloc:
0438  * - Mark newly dirtied pages as DELALLOC in the io tree.
0439  *   Used to advise which range is to be written back.
0440  * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
0441  * - Update inode size for past EOF write
0442  */
0443 int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
0444               size_t num_pages, loff_t pos, size_t write_bytes,
0445               struct extent_state **cached, bool noreserve)
0446 {
0447     struct btrfs_fs_info *fs_info = inode->root->fs_info;
0448     int err = 0;
0449     int i;
0450     u64 num_bytes;
0451     u64 start_pos;
0452     u64 end_of_last_block;
0453     u64 end_pos = pos + write_bytes;
0454     loff_t isize = i_size_read(&inode->vfs_inode);
0455     unsigned int extra_bits = 0;
0456 
0457     if (write_bytes == 0)
0458         return 0;
0459 
0460     if (noreserve)
0461         extra_bits |= EXTENT_NORESERVE;
0462 
0463     start_pos = round_down(pos, fs_info->sectorsize);
0464     num_bytes = round_up(write_bytes + pos - start_pos,
0465                  fs_info->sectorsize);
0466     ASSERT(num_bytes <= U32_MAX);
0467 
0468     end_of_last_block = start_pos + num_bytes - 1;
0469 
0470     /*
0471      * The pages may have already been dirty, clear out old accounting so
0472      * we can set things up properly
0473      */
0474     clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
0475              EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0476              0, 0, cached);
0477 
0478     err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
0479                     extra_bits, cached);
0480     if (err)
0481         return err;
0482 
0483     for (i = 0; i < num_pages; i++) {
0484         struct page *p = pages[i];
0485 
0486         btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
0487         btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
0488         btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
0489     }
0490 
0491     /*
0492      * we've only changed i_size in ram, and we haven't updated
0493      * the disk i_size.  There is no need to log the inode
0494      * at this time.
0495      */
0496     if (end_pos > isize)
0497         i_size_write(&inode->vfs_inode, end_pos);
0498     return 0;
0499 }
0500 
0501 /*
0502  * this drops all the extents in the cache that intersect the range
0503  * [start, end].  Existing extents are split as required.
0504  */
0505 void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
0506                  int skip_pinned)
0507 {
0508     struct extent_map *em;
0509     struct extent_map *split = NULL;
0510     struct extent_map *split2 = NULL;
0511     struct extent_map_tree *em_tree = &inode->extent_tree;
0512     u64 len = end - start + 1;
0513     u64 gen;
0514     int ret;
0515     int testend = 1;
0516     unsigned long flags;
0517     int compressed = 0;
0518     bool modified;
0519 
0520     WARN_ON(end < start);
0521     if (end == (u64)-1) {
0522         len = (u64)-1;
0523         testend = 0;
0524     }
0525     while (1) {
0526         int no_splits = 0;
0527 
0528         modified = false;
0529         if (!split)
0530             split = alloc_extent_map();
0531         if (!split2)
0532             split2 = alloc_extent_map();
0533         if (!split || !split2)
0534             no_splits = 1;
0535 
0536         write_lock(&em_tree->lock);
0537         em = lookup_extent_mapping(em_tree, start, len);
0538         if (!em) {
0539             write_unlock(&em_tree->lock);
0540             break;
0541         }
0542         flags = em->flags;
0543         gen = em->generation;
0544         if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
0545             if (testend && em->start + em->len >= start + len) {
0546                 free_extent_map(em);
0547                 write_unlock(&em_tree->lock);
0548                 break;
0549             }
0550             start = em->start + em->len;
0551             if (testend)
0552                 len = start + len - (em->start + em->len);
0553             free_extent_map(em);
0554             write_unlock(&em_tree->lock);
0555             continue;
0556         }
0557         compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
0558         clear_bit(EXTENT_FLAG_PINNED, &em->flags);
0559         clear_bit(EXTENT_FLAG_LOGGING, &flags);
0560         modified = !list_empty(&em->list);
0561         if (no_splits)
0562             goto next;
0563 
0564         if (em->start < start) {
0565             split->start = em->start;
0566             split->len = start - em->start;
0567 
0568             if (em->block_start < EXTENT_MAP_LAST_BYTE) {
0569                 split->orig_start = em->orig_start;
0570                 split->block_start = em->block_start;
0571 
0572                 if (compressed)
0573                     split->block_len = em->block_len;
0574                 else
0575                     split->block_len = split->len;
0576                 split->orig_block_len = max(split->block_len,
0577                         em->orig_block_len);
0578                 split->ram_bytes = em->ram_bytes;
0579             } else {
0580                 split->orig_start = split->start;
0581                 split->block_len = 0;
0582                 split->block_start = em->block_start;
0583                 split->orig_block_len = 0;
0584                 split->ram_bytes = split->len;
0585             }
0586 
0587             split->generation = gen;
0588             split->flags = flags;
0589             split->compress_type = em->compress_type;
0590             replace_extent_mapping(em_tree, em, split, modified);
0591             free_extent_map(split);
0592             split = split2;
0593             split2 = NULL;
0594         }
0595         if (testend && em->start + em->len > start + len) {
0596             u64 diff = start + len - em->start;
0597 
0598             split->start = start + len;
0599             split->len = em->start + em->len - (start + len);
0600             split->flags = flags;
0601             split->compress_type = em->compress_type;
0602             split->generation = gen;
0603 
0604             if (em->block_start < EXTENT_MAP_LAST_BYTE) {
0605                 split->orig_block_len = max(em->block_len,
0606                             em->orig_block_len);
0607 
0608                 split->ram_bytes = em->ram_bytes;
0609                 if (compressed) {
0610                     split->block_len = em->block_len;
0611                     split->block_start = em->block_start;
0612                     split->orig_start = em->orig_start;
0613                 } else {
0614                     split->block_len = split->len;
0615                     split->block_start = em->block_start
0616                         + diff;
0617                     split->orig_start = em->orig_start;
0618                 }
0619             } else {
0620                 split->ram_bytes = split->len;
0621                 split->orig_start = split->start;
0622                 split->block_len = 0;
0623                 split->block_start = em->block_start;
0624                 split->orig_block_len = 0;
0625             }
0626 
0627             if (extent_map_in_tree(em)) {
0628                 replace_extent_mapping(em_tree, em, split,
0629                                modified);
0630             } else {
0631                 ret = add_extent_mapping(em_tree, split,
0632                              modified);
0633                 ASSERT(ret == 0); /* Logic error */
0634             }
0635             free_extent_map(split);
0636             split = NULL;
0637         }
0638 next:
0639         if (extent_map_in_tree(em))
0640             remove_extent_mapping(em_tree, em);
0641         write_unlock(&em_tree->lock);
0642 
0643         /* once for us */
0644         free_extent_map(em);
0645         /* once for the tree */
0646         free_extent_map(em);
0647     }
0648     if (split)
0649         free_extent_map(split);
0650     if (split2)
0651         free_extent_map(split2);
0652 }
0653 
0654 /*
0655  * this is very complex, but the basic idea is to drop all extents
0656  * in the range start - end.  hint_block is filled in with a block number
0657  * that would be a good hint to the block allocator for this file.
0658  *
0659  * If an extent intersects the range but is not entirely inside the range
0660  * it is either truncated or split.  Anything entirely inside the range
0661  * is deleted from the tree.
0662  *
0663  * Note: the VFS' inode number of bytes is not updated, it's up to the caller
0664  * to deal with that. We set the field 'bytes_found' of the arguments structure
0665  * with the number of allocated bytes found in the target range, so that the
0666  * caller can update the inode's number of bytes in an atomic way when
0667  * replacing extents in a range to avoid races with stat(2).
0668  */
0669 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
0670                struct btrfs_root *root, struct btrfs_inode *inode,
0671                struct btrfs_drop_extents_args *args)
0672 {
0673     struct btrfs_fs_info *fs_info = root->fs_info;
0674     struct extent_buffer *leaf;
0675     struct btrfs_file_extent_item *fi;
0676     struct btrfs_ref ref = { 0 };
0677     struct btrfs_key key;
0678     struct btrfs_key new_key;
0679     u64 ino = btrfs_ino(inode);
0680     u64 search_start = args->start;
0681     u64 disk_bytenr = 0;
0682     u64 num_bytes = 0;
0683     u64 extent_offset = 0;
0684     u64 extent_end = 0;
0685     u64 last_end = args->start;
0686     int del_nr = 0;
0687     int del_slot = 0;
0688     int extent_type;
0689     int recow;
0690     int ret;
0691     int modify_tree = -1;
0692     int update_refs;
0693     int found = 0;
0694     struct btrfs_path *path = args->path;
0695 
0696     args->bytes_found = 0;
0697     args->extent_inserted = false;
0698 
0699     /* Must always have a path if ->replace_extent is true */
0700     ASSERT(!(args->replace_extent && !args->path));
0701 
0702     if (!path) {
0703         path = btrfs_alloc_path();
0704         if (!path) {
0705             ret = -ENOMEM;
0706             goto out;
0707         }
0708     }
0709 
0710     if (args->drop_cache)
0711         btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0);
0712 
0713     if (args->start >= inode->disk_i_size && !args->replace_extent)
0714         modify_tree = 0;
0715 
0716     update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
0717     while (1) {
0718         recow = 0;
0719         ret = btrfs_lookup_file_extent(trans, root, path, ino,
0720                            search_start, modify_tree);
0721         if (ret < 0)
0722             break;
0723         if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
0724             leaf = path->nodes[0];
0725             btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
0726             if (key.objectid == ino &&
0727                 key.type == BTRFS_EXTENT_DATA_KEY)
0728                 path->slots[0]--;
0729         }
0730         ret = 0;
0731 next_slot:
0732         leaf = path->nodes[0];
0733         if (path->slots[0] >= btrfs_header_nritems(leaf)) {
0734             BUG_ON(del_nr > 0);
0735             ret = btrfs_next_leaf(root, path);
0736             if (ret < 0)
0737                 break;
0738             if (ret > 0) {
0739                 ret = 0;
0740                 break;
0741             }
0742             leaf = path->nodes[0];
0743             recow = 1;
0744         }
0745 
0746         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
0747 
0748         if (key.objectid > ino)
0749             break;
0750         if (WARN_ON_ONCE(key.objectid < ino) ||
0751             key.type < BTRFS_EXTENT_DATA_KEY) {
0752             ASSERT(del_nr == 0);
0753             path->slots[0]++;
0754             goto next_slot;
0755         }
0756         if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
0757             break;
0758 
0759         fi = btrfs_item_ptr(leaf, path->slots[0],
0760                     struct btrfs_file_extent_item);
0761         extent_type = btrfs_file_extent_type(leaf, fi);
0762 
0763         if (extent_type == BTRFS_FILE_EXTENT_REG ||
0764             extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
0765             disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
0766             num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
0767             extent_offset = btrfs_file_extent_offset(leaf, fi);
0768             extent_end = key.offset +
0769                 btrfs_file_extent_num_bytes(leaf, fi);
0770         } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
0771             extent_end = key.offset +
0772                 btrfs_file_extent_ram_bytes(leaf, fi);
0773         } else {
0774             /* can't happen */
0775             BUG();
0776         }
0777 
0778         /*
0779          * Don't skip extent items representing 0 byte lengths. They
0780          * used to be created (due to a bug) when we hit an -ENOSPC
0781          * condition while punching holes. So if we find one here, just
0782          * make sure we delete it, otherwise we would insert a new file
0783          * extent item with the same key (offset) as that 0 byte length
0784          * file extent item in the call to setup_items_for_insert()
0785          * later in this function.
0786          */
0787         if (extent_end == key.offset && extent_end >= search_start) {
0788             last_end = extent_end;
0789             goto delete_extent_item;
0790         }
0791 
0792         if (extent_end <= search_start) {
0793             path->slots[0]++;
0794             goto next_slot;
0795         }
0796 
0797         found = 1;
0798         search_start = max(key.offset, args->start);
0799         if (recow || !modify_tree) {
0800             modify_tree = -1;
0801             btrfs_release_path(path);
0802             continue;
0803         }
0804 
0805         /*
0806          *     | - range to drop - |
0807          *  | -------- extent -------- |
0808          */
0809         if (args->start > key.offset && args->end < extent_end) {
0810             BUG_ON(del_nr > 0);
0811             if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
0812                 ret = -EOPNOTSUPP;
0813                 break;
0814             }
0815 
0816             memcpy(&new_key, &key, sizeof(new_key));
0817             new_key.offset = args->start;
0818             ret = btrfs_duplicate_item(trans, root, path,
0819                            &new_key);
0820             if (ret == -EAGAIN) {
0821                 btrfs_release_path(path);
0822                 continue;
0823             }
0824             if (ret < 0)
0825                 break;
0826 
0827             leaf = path->nodes[0];
0828             fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
0829                         struct btrfs_file_extent_item);
0830             btrfs_set_file_extent_num_bytes(leaf, fi,
0831                             args->start - key.offset);
0832 
0833             fi = btrfs_item_ptr(leaf, path->slots[0],
0834                         struct btrfs_file_extent_item);
0835 
0836             extent_offset += args->start - key.offset;
0837             btrfs_set_file_extent_offset(leaf, fi, extent_offset);
0838             btrfs_set_file_extent_num_bytes(leaf, fi,
0839                             extent_end - args->start);
0840             btrfs_mark_buffer_dirty(leaf);
0841 
0842             if (update_refs && disk_bytenr > 0) {
0843                 btrfs_init_generic_ref(&ref,
0844                         BTRFS_ADD_DELAYED_REF,
0845                         disk_bytenr, num_bytes, 0);
0846                 btrfs_init_data_ref(&ref,
0847                         root->root_key.objectid,
0848                         new_key.objectid,
0849                         args->start - extent_offset,
0850                         0, false);
0851                 ret = btrfs_inc_extent_ref(trans, &ref);
0852                 BUG_ON(ret); /* -ENOMEM */
0853             }
0854             key.offset = args->start;
0855         }
0856         /*
0857          * From here on out we will have actually dropped something, so
0858          * last_end can be updated.
0859          */
0860         last_end = extent_end;
0861 
0862         /*
0863          *  | ---- range to drop ----- |
0864          *      | -------- extent -------- |
0865          */
0866         if (args->start <= key.offset && args->end < extent_end) {
0867             if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
0868                 ret = -EOPNOTSUPP;
0869                 break;
0870             }
0871 
0872             memcpy(&new_key, &key, sizeof(new_key));
0873             new_key.offset = args->end;
0874             btrfs_set_item_key_safe(fs_info, path, &new_key);
0875 
0876             extent_offset += args->end - key.offset;
0877             btrfs_set_file_extent_offset(leaf, fi, extent_offset);
0878             btrfs_set_file_extent_num_bytes(leaf, fi,
0879                             extent_end - args->end);
0880             btrfs_mark_buffer_dirty(leaf);
0881             if (update_refs && disk_bytenr > 0)
0882                 args->bytes_found += args->end - key.offset;
0883             break;
0884         }
0885 
0886         search_start = extent_end;
0887         /*
0888          *       | ---- range to drop ----- |
0889          *  | -------- extent -------- |
0890          */
0891         if (args->start > key.offset && args->end >= extent_end) {
0892             BUG_ON(del_nr > 0);
0893             if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
0894                 ret = -EOPNOTSUPP;
0895                 break;
0896             }
0897 
0898             btrfs_set_file_extent_num_bytes(leaf, fi,
0899                             args->start - key.offset);
0900             btrfs_mark_buffer_dirty(leaf);
0901             if (update_refs && disk_bytenr > 0)
0902                 args->bytes_found += extent_end - args->start;
0903             if (args->end == extent_end)
0904                 break;
0905 
0906             path->slots[0]++;
0907             goto next_slot;
0908         }
0909 
0910         /*
0911          *  | ---- range to drop ----- |
0912          *    | ------ extent ------ |
0913          */
0914         if (args->start <= key.offset && args->end >= extent_end) {
0915 delete_extent_item:
0916             if (del_nr == 0) {
0917                 del_slot = path->slots[0];
0918                 del_nr = 1;
0919             } else {
0920                 BUG_ON(del_slot + del_nr != path->slots[0]);
0921                 del_nr++;
0922             }
0923 
0924             if (update_refs &&
0925                 extent_type == BTRFS_FILE_EXTENT_INLINE) {
0926                 args->bytes_found += extent_end - key.offset;
0927                 extent_end = ALIGN(extent_end,
0928                            fs_info->sectorsize);
0929             } else if (update_refs && disk_bytenr > 0) {
0930                 btrfs_init_generic_ref(&ref,
0931                         BTRFS_DROP_DELAYED_REF,
0932                         disk_bytenr, num_bytes, 0);
0933                 btrfs_init_data_ref(&ref,
0934                         root->root_key.objectid,
0935                         key.objectid,
0936                         key.offset - extent_offset, 0,
0937                         false);
0938                 ret = btrfs_free_extent(trans, &ref);
0939                 BUG_ON(ret); /* -ENOMEM */
0940                 args->bytes_found += extent_end - key.offset;
0941             }
0942 
0943             if (args->end == extent_end)
0944                 break;
0945 
0946             if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
0947                 path->slots[0]++;
0948                 goto next_slot;
0949             }
0950 
0951             ret = btrfs_del_items(trans, root, path, del_slot,
0952                           del_nr);
0953             if (ret) {
0954                 btrfs_abort_transaction(trans, ret);
0955                 break;
0956             }
0957 
0958             del_nr = 0;
0959             del_slot = 0;
0960 
0961             btrfs_release_path(path);
0962             continue;
0963         }
0964 
0965         BUG();
0966     }
0967 
0968     if (!ret && del_nr > 0) {
0969         /*
0970          * Set path->slots[0] to the first slot, so that after the delete,
0971          * if items are moved off from our leaf to its immediate left or
0972          * right neighbor leaves, we end up with a correct and adjusted
0973          * path->slots[0] for our insertion (if args->replace_extent).
0974          */
0975         path->slots[0] = del_slot;
0976         ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
0977         if (ret)
0978             btrfs_abort_transaction(trans, ret);
0979     }
0980 
0981     leaf = path->nodes[0];
0982     /*
0983      * If btrfs_del_items() was called, it might have deleted a leaf, in
0984      * which case it unlocked our path, so check path->locks[0] matches a
0985      * write lock.
0986      */
0987     if (!ret && args->replace_extent &&
0988         path->locks[0] == BTRFS_WRITE_LOCK &&
0989         btrfs_leaf_free_space(leaf) >=
0990         sizeof(struct btrfs_item) + args->extent_item_size) {
0991 
0992         key.objectid = ino;
0993         key.type = BTRFS_EXTENT_DATA_KEY;
0994         key.offset = args->start;
0995         if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
0996             struct btrfs_key slot_key;
0997 
0998             btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
0999             if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
1000                 path->slots[0]++;
1001         }
1002         btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
1003         args->extent_inserted = true;
1004     }
1005 
1006     if (!args->path)
1007         btrfs_free_path(path);
1008     else if (!args->extent_inserted)
1009         btrfs_release_path(path);
1010 out:
1011     args->drop_end = found ? min(args->end, last_end) : args->end;
1012 
1013     return ret;
1014 }
1015 
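/*
 * Check whether the file extent item at @slot points to the same unencoded
 * extent (@bytenr with @orig_offset) and can therefore be merged with the
 * extent being written. On success its file offset range is returned via
 * @start and @end.
 */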
1016 static int extent_mergeable(struct extent_buffer *leaf, int slot,
1017                 u64 objectid, u64 bytenr, u64 orig_offset,
1018                 u64 *start, u64 *end)
1019 {
1020     struct btrfs_file_extent_item *fi;
1021     struct btrfs_key key;
1022     u64 extent_end;
1023 
1024     if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1025         return 0;
1026 
1027     btrfs_item_key_to_cpu(leaf, &key, slot);
1028     if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
1029         return 0;
1030 
1031     fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1032     if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
1033         btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
1034         btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
1035         btrfs_file_extent_compression(leaf, fi) ||
1036         btrfs_file_extent_encryption(leaf, fi) ||
1037         btrfs_file_extent_other_encoding(leaf, fi))
1038         return 0;
1039 
1040     extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1041     if ((*start && *start != key.offset) || (*end && *end != extent_end))
1042         return 0;
1043 
1044     *start = key.offset;
1045     *end = extent_end;
1046     return 1;
1047 }
1048 
1049 /*
1050  * Mark the extent in the range start - end as written.
1051  *
1052  * This changes the extent type from 'pre-allocated' to 'regular'. If only
1053  * part of the extent is marked as written, the extent will be split into
1054  * two or three.
1055  */
1056 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
1057                   struct btrfs_inode *inode, u64 start, u64 end)
1058 {
1059     struct btrfs_fs_info *fs_info = trans->fs_info;
1060     struct btrfs_root *root = inode->root;
1061     struct extent_buffer *leaf;
1062     struct btrfs_path *path;
1063     struct btrfs_file_extent_item *fi;
1064     struct btrfs_ref ref = { 0 };
1065     struct btrfs_key key;
1066     struct btrfs_key new_key;
1067     u64 bytenr;
1068     u64 num_bytes;
1069     u64 extent_end;
1070     u64 orig_offset;
1071     u64 other_start;
1072     u64 other_end;
1073     u64 split;
1074     int del_nr = 0;
1075     int del_slot = 0;
1076     int recow;
1077     int ret = 0;
1078     u64 ino = btrfs_ino(inode);
1079 
1080     path = btrfs_alloc_path();
1081     if (!path)
1082         return -ENOMEM;
1083 again:
1084     recow = 0;
1085     split = start;
1086     key.objectid = ino;
1087     key.type = BTRFS_EXTENT_DATA_KEY;
1088     key.offset = split;
1089 
1090     ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1091     if (ret < 0)
1092         goto out;
1093     if (ret > 0 && path->slots[0] > 0)
1094         path->slots[0]--;
1095 
1096     leaf = path->nodes[0];
1097     btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1098     if (key.objectid != ino ||
1099         key.type != BTRFS_EXTENT_DATA_KEY) {
1100         ret = -EINVAL;
1101         btrfs_abort_transaction(trans, ret);
1102         goto out;
1103     }
1104     fi = btrfs_item_ptr(leaf, path->slots[0],
1105                 struct btrfs_file_extent_item);
1106     if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
1107         ret = -EINVAL;
1108         btrfs_abort_transaction(trans, ret);
1109         goto out;
1110     }
1111     extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1112     if (key.offset > start || extent_end < end) {
1113         ret = -EINVAL;
1114         btrfs_abort_transaction(trans, ret);
1115         goto out;
1116     }
1117 
1118     bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1119     num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1120     orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
1121     memcpy(&new_key, &key, sizeof(new_key));
1122 
1123     if (start == key.offset && end < extent_end) {
1124         other_start = 0;
1125         other_end = start;
1126         if (extent_mergeable(leaf, path->slots[0] - 1,
1127                      ino, bytenr, orig_offset,
1128                      &other_start, &other_end)) {
1129             new_key.offset = end;
1130             btrfs_set_item_key_safe(fs_info, path, &new_key);
1131             fi = btrfs_item_ptr(leaf, path->slots[0],
1132                         struct btrfs_file_extent_item);
1133             btrfs_set_file_extent_generation(leaf, fi,
1134                              trans->transid);
1135             btrfs_set_file_extent_num_bytes(leaf, fi,
1136                             extent_end - end);
1137             btrfs_set_file_extent_offset(leaf, fi,
1138                              end - orig_offset);
1139             fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1140                         struct btrfs_file_extent_item);
1141             btrfs_set_file_extent_generation(leaf, fi,
1142                              trans->transid);
1143             btrfs_set_file_extent_num_bytes(leaf, fi,
1144                             end - other_start);
1145             btrfs_mark_buffer_dirty(leaf);
1146             goto out;
1147         }
1148     }
1149 
1150     if (start > key.offset && end == extent_end) {
1151         other_start = end;
1152         other_end = 0;
1153         if (extent_mergeable(leaf, path->slots[0] + 1,
1154                      ino, bytenr, orig_offset,
1155                      &other_start, &other_end)) {
1156             fi = btrfs_item_ptr(leaf, path->slots[0],
1157                         struct btrfs_file_extent_item);
1158             btrfs_set_file_extent_num_bytes(leaf, fi,
1159                             start - key.offset);
1160             btrfs_set_file_extent_generation(leaf, fi,
1161                              trans->transid);
1162             path->slots[0]++;
1163             new_key.offset = start;
1164             btrfs_set_item_key_safe(fs_info, path, &new_key);
1165 
1166             fi = btrfs_item_ptr(leaf, path->slots[0],
1167                         struct btrfs_file_extent_item);
1168             btrfs_set_file_extent_generation(leaf, fi,
1169                              trans->transid);
1170             btrfs_set_file_extent_num_bytes(leaf, fi,
1171                             other_end - start);
1172             btrfs_set_file_extent_offset(leaf, fi,
1173                              start - orig_offset);
1174             btrfs_mark_buffer_dirty(leaf);
1175             goto out;
1176         }
1177     }
1178 
1179     while (start > key.offset || end < extent_end) {
1180         if (key.offset == start)
1181             split = end;
1182 
1183         new_key.offset = split;
1184         ret = btrfs_duplicate_item(trans, root, path, &new_key);
1185         if (ret == -EAGAIN) {
1186             btrfs_release_path(path);
1187             goto again;
1188         }
1189         if (ret < 0) {
1190             btrfs_abort_transaction(trans, ret);
1191             goto out;
1192         }
1193 
1194         leaf = path->nodes[0];
1195         fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1196                     struct btrfs_file_extent_item);
1197         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1198         btrfs_set_file_extent_num_bytes(leaf, fi,
1199                         split - key.offset);
1200 
1201         fi = btrfs_item_ptr(leaf, path->slots[0],
1202                     struct btrfs_file_extent_item);
1203 
1204         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1205         btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1206         btrfs_set_file_extent_num_bytes(leaf, fi,
1207                         extent_end - split);
1208         btrfs_mark_buffer_dirty(leaf);
1209 
1210         btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
1211                        num_bytes, 0);
1212         btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
1213                     orig_offset, 0, false);
1214         ret = btrfs_inc_extent_ref(trans, &ref);
1215         if (ret) {
1216             btrfs_abort_transaction(trans, ret);
1217             goto out;
1218         }
1219 
1220         if (split == start) {
1221             key.offset = start;
1222         } else {
1223             if (start != key.offset) {
1224                 ret = -EINVAL;
1225                 btrfs_abort_transaction(trans, ret);
1226                 goto out;
1227             }
1228             path->slots[0]--;
1229             extent_end = end;
1230         }
1231         recow = 1;
1232     }
1233 
1234     other_start = end;
1235     other_end = 0;
1236     btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1237                    num_bytes, 0);
1238     btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
1239                 0, false);
1240     if (extent_mergeable(leaf, path->slots[0] + 1,
1241                  ino, bytenr, orig_offset,
1242                  &other_start, &other_end)) {
1243         if (recow) {
1244             btrfs_release_path(path);
1245             goto again;
1246         }
1247         extent_end = other_end;
1248         del_slot = path->slots[0] + 1;
1249         del_nr++;
1250         ret = btrfs_free_extent(trans, &ref);
1251         if (ret) {
1252             btrfs_abort_transaction(trans, ret);
1253             goto out;
1254         }
1255     }
1256     other_start = 0;
1257     other_end = start;
1258     if (extent_mergeable(leaf, path->slots[0] - 1,
1259                  ino, bytenr, orig_offset,
1260                  &other_start, &other_end)) {
1261         if (recow) {
1262             btrfs_release_path(path);
1263             goto again;
1264         }
1265         key.offset = other_start;
1266         del_slot = path->slots[0];
1267         del_nr++;
1268         ret = btrfs_free_extent(trans, &ref);
1269         if (ret) {
1270             btrfs_abort_transaction(trans, ret);
1271             goto out;
1272         }
1273     }
1274     if (del_nr == 0) {
1275         fi = btrfs_item_ptr(leaf, path->slots[0],
1276                struct btrfs_file_extent_item);
1277         btrfs_set_file_extent_type(leaf, fi,
1278                        BTRFS_FILE_EXTENT_REG);
1279         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1280         btrfs_mark_buffer_dirty(leaf);
1281     } else {
1282         fi = btrfs_item_ptr(leaf, del_slot - 1,
1283                struct btrfs_file_extent_item);
1284         btrfs_set_file_extent_type(leaf, fi,
1285                        BTRFS_FILE_EXTENT_REG);
1286         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1287         btrfs_set_file_extent_num_bytes(leaf, fi,
1288                         extent_end - key.offset);
1289         btrfs_mark_buffer_dirty(leaf);
1290 
1291         ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1292         if (ret < 0) {
1293             btrfs_abort_transaction(trans, ret);
1294             goto out;
1295         }
1296     }
1297 out:
1298     btrfs_free_path(path);
1299     return ret;
1300 }
1301 
1302 /*
1303  * On error we return an unlocked page and the error value;
1304  * on success we return a locked page and 0.
1305  */
1306 static int prepare_uptodate_page(struct inode *inode,
1307                  struct page *page, u64 pos,
1308                  bool force_uptodate)
1309 {
1310     struct folio *folio = page_folio(page);
1311     int ret = 0;
1312 
1313     if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
1314         !PageUptodate(page)) {
1315         ret = btrfs_read_folio(NULL, folio);
1316         if (ret)
1317             return ret;
1318         lock_page(page);
1319         if (!PageUptodate(page)) {
1320             unlock_page(page);
1321             return -EIO;
1322         }
1323 
1324         /*
1325          * Since btrfs_read_folio() will unlock the folio before it
1326          * returns, there is a window where btrfs_release_folio() can be
1327          * called to release the page.  Here we check both inode
1328          * mapping and PagePrivate() to make sure the page was not
1329          * released.
1330          *
1331          * The private flag check is essential for subpage as we need
1332          * to store extra bitmap using page->private.
1333          */
1334         if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
1335             unlock_page(page);
1336             return -EAGAIN;
1337         }
1338     }
1339     return 0;
1340 }
1341 
1342 /*
1343  * this just gets pages into the page cache and locks them down.
1344  */
1345 static noinline int prepare_pages(struct inode *inode, struct page **pages,
1346                   size_t num_pages, loff_t pos,
1347                   size_t write_bytes, bool force_uptodate)
1348 {
1349     int i;
1350     unsigned long index = pos >> PAGE_SHIFT;
1351     gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1352     int err = 0;
1353     int faili;
1354 
1355     for (i = 0; i < num_pages; i++) {
1356 again:
1357         pages[i] = find_or_create_page(inode->i_mapping, index + i,
1358                            mask | __GFP_WRITE);
1359         if (!pages[i]) {
1360             faili = i - 1;
1361             err = -ENOMEM;
1362             goto fail;
1363         }
1364 
1365         err = set_page_extent_mapped(pages[i]);
1366         if (err < 0) {
1367             faili = i;
1368             goto fail;
1369         }
1370 
1371         if (i == 0)
1372             err = prepare_uptodate_page(inode, pages[i], pos,
1373                             force_uptodate);
1374         if (!err && i == num_pages - 1)
1375             err = prepare_uptodate_page(inode, pages[i],
1376                             pos + write_bytes, false);
1377         if (err) {
1378             put_page(pages[i]);
1379             if (err == -EAGAIN) {
1380                 err = 0;
1381                 goto again;
1382             }
1383             faili = i - 1;
1384             goto fail;
1385         }
1386         wait_on_page_writeback(pages[i]);
1387     }
1388 
1389     return 0;
1390 fail:
1391     while (faili >= 0) {
1392         unlock_page(pages[faili]);
1393         put_page(pages[faili]);
1394         faili--;
1395     }
1396     return err;
1397 
1398 }
1399 
1400 /*
1401  * This function locks the extent and properly waits for data=ordered extents
1402  * to finish before allowing the pages to be modified if needed.
1403  *
1404  * The return value:
1405  * 1 - the extent is locked
1406  * 0 - the extent is not locked, and everything is OK
1407  * -EAGAIN - we need to re-prepare the pages
1408  * any other negative value - something went wrong
1409  */
1410 static noinline int
1411 lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
1412                 size_t num_pages, loff_t pos,
1413                 size_t write_bytes,
1414                 u64 *lockstart, u64 *lockend,
1415                 struct extent_state **cached_state)
1416 {
1417     struct btrfs_fs_info *fs_info = inode->root->fs_info;
1418     u64 start_pos;
1419     u64 last_pos;
1420     int i;
1421     int ret = 0;
1422 
1423     start_pos = round_down(pos, fs_info->sectorsize);
1424     last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
1425 
1426     if (start_pos < inode->vfs_inode.i_size) {
1427         struct btrfs_ordered_extent *ordered;
1428 
1429         lock_extent_bits(&inode->io_tree, start_pos, last_pos,
1430                 cached_state);
1431         ordered = btrfs_lookup_ordered_range(inode, start_pos,
1432                              last_pos - start_pos + 1);
1433         if (ordered &&
1434             ordered->file_offset + ordered->num_bytes > start_pos &&
1435             ordered->file_offset <= last_pos) {
1436             unlock_extent_cached(&inode->io_tree, start_pos,
1437                     last_pos, cached_state);
1438             for (i = 0; i < num_pages; i++) {
1439                 unlock_page(pages[i]);
1440                 put_page(pages[i]);
1441             }
1442             btrfs_start_ordered_extent(ordered, 1);
1443             btrfs_put_ordered_extent(ordered);
1444             return -EAGAIN;
1445         }
1446         if (ordered)
1447             btrfs_put_ordered_extent(ordered);
1448 
1449         *lockstart = start_pos;
1450         *lockend = last_pos;
1451         ret = 1;
1452     }
1453 
1454     /*
1455      * We should be called after prepare_pages() which should have locked
1456      * all pages in the range.
1457      */
1458     for (i = 0; i < num_pages; i++)
1459         WARN_ON(!PageLocked(pages[i]));
1460 
1461     return ret;
1462 }
1463 
1464 /*
1465  * Check if we can do a nocow write into the range [@pos, @pos + @write_bytes)
1466  *
1467  * @pos:         File offset.
1468  * @write_bytes: The length to write, will be updated to the nocow writeable
1469  *               range.
1470  *
1471  * This function will flush ordered extents in the range to ensure proper
1472  * nocow checks.
1473  *
1474  * Return:
1475  * > 0          If we can nocow, and updates @write_bytes.
1476  *  0           If we can't do a nocow write.
1477  * -EAGAIN      If we can't do a nocow write because snapshotting of the inode's
1478  *              root is in progress.
1479  * < 0          If an error happened.
1480  *
1481  * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
1482  */
1483 int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
1484                size_t *write_bytes)
1485 {
1486     struct btrfs_fs_info *fs_info = inode->root->fs_info;
1487     struct btrfs_root *root = inode->root;
1488     u64 lockstart, lockend;
1489     u64 num_bytes;
1490     int ret;
1491 
1492     if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1493         return 0;
1494 
1495     if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
1496         return -EAGAIN;
1497 
1498     lockstart = round_down(pos, fs_info->sectorsize);
1499     lockend = round_up(pos + *write_bytes,
1500                fs_info->sectorsize) - 1;
1501     num_bytes = lockend - lockstart + 1;
1502 
1503     btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL);
1504     ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
1505             NULL, NULL, NULL, false);
1506     if (ret <= 0) {
1507         ret = 0;
1508         btrfs_drew_write_unlock(&root->snapshot_lock);
1509     } else {
1510         *write_bytes = min_t(size_t, *write_bytes,
1511                      num_bytes - pos + lockstart);
1512     }
1513     unlock_extent(&inode->io_tree, lockstart, lockend);
1514 
1515     return ret;
1516 }
1517 
1518 void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1519 {
1520     btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1521 }
1522 
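/*
 * Update the in-memory mtime/ctime and bump the inode version for a write.
 * The inode item itself is updated later, once the write has finished.
 */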
1523 static void update_time_for_write(struct inode *inode)
1524 {
1525     struct timespec64 now;
1526 
1527     if (IS_NOCMTIME(inode))
1528         return;
1529 
1530     now = current_time(inode);
1531     if (!timespec64_equal(&inode->i_mtime, &now))
1532         inode->i_mtime = now;
1533 
1534     if (!timespec64_equal(&inode->i_ctime, &now))
1535         inode->i_ctime = now;
1536 
1537     if (IS_I_VERSION(inode))
1538         inode_inc_iversion(inode);
1539 }
1540 
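/*
 * Checks done before starting a write: bail out early for NOWAIT writes that
 * would always have to COW, strip privileges, update timestamps and expand
 * any hole between the old i_size and the start of the write.
 */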
1541 static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
1542                  size_t count)
1543 {
1544     struct file *file = iocb->ki_filp;
1545     struct inode *inode = file_inode(file);
1546     struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1547     loff_t pos = iocb->ki_pos;
1548     int ret;
1549     loff_t oldsize;
1550     loff_t start_pos;
1551 
1552     /*
1553      * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
1554      * prealloc flags, as without those flags we always have to COW. We will
1555      * later check if we can really NOCOW into the target range (using
1556      * can_nocow_extent() at btrfs_get_blocks_direct_write()).
1557      */
1558     if ((iocb->ki_flags & IOCB_NOWAIT) &&
1559         !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1560         return -EAGAIN;
1561 
1562     current->backing_dev_info = inode_to_bdi(inode);
1563     ret = file_remove_privs(file);
1564     if (ret)
1565         return ret;
1566 
1567     /*
1568      * We reserve space for updating the inode when we reserve space for the
1569      * extent we are going to write, so we will enospc out there.  We don't
1570      * need to start yet another transaction to update the inode as we will
1571      * update the inode when we finish writing whatever data we write.
1572      */
1573     update_time_for_write(inode);
1574 
1575     start_pos = round_down(pos, fs_info->sectorsize);
1576     oldsize = i_size_read(inode);
1577     if (start_pos > oldsize) {
1578         /* Expand hole size to cover write data, preventing empty gap */
1579         loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
1580 
1581         ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
1582         if (ret) {
1583             current->backing_dev_info = NULL;
1584             return ret;
1585         }
1586     }
1587 
1588     return 0;
1589 }
1590 
1591 static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
1592                            struct iov_iter *i)
1593 {
1594     struct file *file = iocb->ki_filp;
1595     loff_t pos;
1596     struct inode *inode = file_inode(file);
1597     struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1598     struct page **pages = NULL;
1599     struct extent_changeset *data_reserved = NULL;
1600     u64 release_bytes = 0;
1601     u64 lockstart;
1602     u64 lockend;
1603     size_t num_written = 0;
1604     int nrptrs;
1605     ssize_t ret;
1606     bool only_release_metadata = false;
1607     bool force_page_uptodate = false;
1608     loff_t old_isize = i_size_read(inode);
1609     unsigned int ilock_flags = 0;
1610 
1611     if (iocb->ki_flags & IOCB_NOWAIT)
1612         ilock_flags |= BTRFS_ILOCK_TRY;
1613 
1614     ret = btrfs_inode_lock(inode, ilock_flags);
1615     if (ret < 0)
1616         return ret;
1617 
1618     ret = generic_write_checks(iocb, i);
1619     if (ret <= 0)
1620         goto out;
1621 
1622     ret = btrfs_write_check(iocb, i, ret);
1623     if (ret < 0)
1624         goto out;
1625 
1626     pos = iocb->ki_pos;
1627     nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
1628             PAGE_SIZE / (sizeof(struct page *)));
1629     nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1630     nrptrs = max(nrptrs, 8);
1631     pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
1632     if (!pages) {
1633         ret = -ENOMEM;
1634         goto out;
1635     }
1636 
1637     while (iov_iter_count(i) > 0) {
1638         struct extent_state *cached_state = NULL;
1639         size_t offset = offset_in_page(pos);
1640         size_t sector_offset;
1641         size_t write_bytes = min(iov_iter_count(i),
1642                      nrptrs * (size_t)PAGE_SIZE -
1643                      offset);
1644         size_t num_pages;
1645         size_t reserve_bytes;
1646         size_t dirty_pages;
1647         size_t copied;
1648         size_t dirty_sectors;
1649         size_t num_sectors;
1650         int extents_locked;
1651 
1652         /*
1653          * Fault pages before locking them in prepare_pages
1654          * to avoid a recursive lock.
1655          */
1656         if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
1657             ret = -EFAULT;
1658             break;
1659         }
1660 
1661         only_release_metadata = false;
1662         sector_offset = pos & (fs_info->sectorsize - 1);
1663 
1664         extent_changeset_release(data_reserved);
1665         ret = btrfs_check_data_free_space(BTRFS_I(inode),
1666                           &data_reserved, pos,
1667                           write_bytes);
1668         if (ret < 0) {
1669             /*
1670              * If we don't have to COW at the offset, reserve
1671              * metadata only. write_bytes may get smaller than
1672              * requested here.
1673              */
1674             if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
1675                            &write_bytes) > 0)
1676                 only_release_metadata = true;
1677             else
1678                 break;
1679         }
1680 
1681         num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
1682         WARN_ON(num_pages > nrptrs);
1683         reserve_bytes = round_up(write_bytes + sector_offset,
1684                      fs_info->sectorsize);
1685         WARN_ON(reserve_bytes == 0);
1686         ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
1687                               reserve_bytes,
1688                               reserve_bytes, false);
1689         if (ret) {
1690             if (!only_release_metadata)
1691                 btrfs_free_reserved_data_space(BTRFS_I(inode),
1692                         data_reserved, pos,
1693                         write_bytes);
1694             else
1695                 btrfs_check_nocow_unlock(BTRFS_I(inode));
1696             break;
1697         }
1698 
1699         release_bytes = reserve_bytes;
1700 again:
1701         /*
1702          * This is going to set up the pages array with the number of
1703          * pages we want, so we don't really need to worry about the
1704          * contents of pages from loop to loop
1705          */
1706         ret = prepare_pages(inode, pages, num_pages,
1707                     pos, write_bytes,
1708                     force_page_uptodate);
1709         if (ret) {
1710             btrfs_delalloc_release_extents(BTRFS_I(inode),
1711                                reserve_bytes);
1712             break;
1713         }
1714 
1715         extents_locked = lock_and_cleanup_extent_if_need(
1716                 BTRFS_I(inode), pages,
1717                 num_pages, pos, write_bytes, &lockstart,
1718                 &lockend, &cached_state);
1719         if (extents_locked < 0) {
1720             if (extents_locked == -EAGAIN)
1721                 goto again;
1722             btrfs_delalloc_release_extents(BTRFS_I(inode),
1723                                reserve_bytes);
1724             ret = extents_locked;
1725             break;
1726         }
1727 
1728         copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
1729 
1730         num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1731         dirty_sectors = round_up(copied + sector_offset,
1732                     fs_info->sectorsize);
1733         dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1734 
1735         /*
1736          * if we have trouble faulting in the pages, fall
1737          * back to one page at a time
1738          */
1739         if (copied < write_bytes)
1740             nrptrs = 1;
1741 
1742         if (copied == 0) {
1743             force_page_uptodate = true;
1744             dirty_sectors = 0;
1745             dirty_pages = 0;
1746         } else {
1747             force_page_uptodate = false;
1748             dirty_pages = DIV_ROUND_UP(copied + offset,
1749                            PAGE_SIZE);
1750         }
1751 
1752         if (num_sectors > dirty_sectors) {
1753             /* release everything except the sectors we dirtied */
1754             release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
1755             if (only_release_metadata) {
1756                 btrfs_delalloc_release_metadata(BTRFS_I(inode),
1757                             release_bytes, true);
1758             } else {
1759                 u64 __pos;
1760 
1761                 __pos = round_down(pos,
1762                            fs_info->sectorsize) +
1763                     (dirty_pages << PAGE_SHIFT);
1764                 btrfs_delalloc_release_space(BTRFS_I(inode),
1765                         data_reserved, __pos,
1766                         release_bytes, true);
1767             }
1768         }
1769 
1770         release_bytes = round_up(copied + sector_offset,
1771                     fs_info->sectorsize);
1772 
1773         ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
1774                     dirty_pages, pos, copied,
1775                     &cached_state, only_release_metadata);
1776 
1777         /*
1778          * If we have not locked the extent range, because the range's
1779          * start offset is >= i_size, we might still have a non-NULL
1780          * cached extent state, acquired while marking the extent range
1781          * as delalloc through btrfs_dirty_pages(). Therefore free any
1782          * possible cached extent state to avoid a memory leak.
1783          */
1784         if (extents_locked)
1785             unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1786                          lockstart, lockend, &cached_state);
1787         else
1788             free_extent_state(cached_state);
1789 
1790         btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1791         if (ret) {
1792             btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
1793             break;
1794         }
1795 
1796         release_bytes = 0;
1797         if (only_release_metadata)
1798             btrfs_check_nocow_unlock(BTRFS_I(inode));
1799 
1800         btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
1801 
1802         cond_resched();
1803 
1804         balance_dirty_pages_ratelimited(inode->i_mapping);
1805 
1806         pos += copied;
1807         num_written += copied;
1808     }
1809 
1810     kfree(pages);
1811 
1812     if (release_bytes) {
1813         if (only_release_metadata) {
1814             btrfs_check_nocow_unlock(BTRFS_I(inode));
1815             btrfs_delalloc_release_metadata(BTRFS_I(inode),
1816                     release_bytes, true);
1817         } else {
1818             btrfs_delalloc_release_space(BTRFS_I(inode),
1819                     data_reserved,
1820                     round_down(pos, fs_info->sectorsize),
1821                     release_bytes, true);
1822         }
1823     }
1824 
1825     extent_changeset_free(data_reserved);
1826     if (num_written > 0) {
1827         pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1828         iocb->ki_pos += num_written;
1829     }
1830 out:
1831     btrfs_inode_unlock(inode, ilock_flags);
1832     return num_written ? num_written : ret;
1833 }
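
/*
 * Summary of the copy loop above (added for clarity, not part of the
 * original file): each iteration reserves data space (or takes the nocow
 * lock and reserves only metadata when COW is not required), prepares and
 * locks the pages plus the extent range, copies from the iov_iter, gives
 * back the reservation for any sectors that were not dirtied, marks the
 * dirtied range as delalloc via btrfs_dirty_pages() and finally drops the
 * page references before moving on to the next chunk.
 */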
1834 
1835 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
1836                    const struct iov_iter *iter, loff_t offset)
1837 {
1838     const u32 blocksize_mask = fs_info->sectorsize - 1;
1839 
1840     if (offset & blocksize_mask)
1841         return -EINVAL;
1842 
1843     if (iov_iter_alignment(iter) & blocksize_mask)
1844         return -EINVAL;
1845 
1846     return 0;
1847 }
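
/*
 * Example (illustrative, assuming a 4KiB sectorsize): blocksize_mask is
 * 0xfff, so a direct write at offset 8192 with 4096-byte aligned iovecs
 * passes, while offset 8195 or a misaligned iovec makes check_direct_IO()
 * return -EINVAL and btrfs_direct_write() falls back to buffered IO.
 */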
1848 
1849 static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
1850 {
1851     struct file *file = iocb->ki_filp;
1852     struct inode *inode = file_inode(file);
1853     struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1854     loff_t pos;
1855     ssize_t written = 0;
1856     ssize_t written_buffered;
1857     size_t prev_left = 0;
1858     loff_t endbyte;
1859     ssize_t err;
1860     unsigned int ilock_flags = 0;
1861 
1862     if (iocb->ki_flags & IOCB_NOWAIT)
1863         ilock_flags |= BTRFS_ILOCK_TRY;
1864 
1865     /* If the write DIO is within EOF, use a shared lock */
1866     if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode))
1867         ilock_flags |= BTRFS_ILOCK_SHARED;
1868 
1869 relock:
1870     err = btrfs_inode_lock(inode, ilock_flags);
1871     if (err < 0)
1872         return err;
1873 
1874     err = generic_write_checks(iocb, from);
1875     if (err <= 0) {
1876         btrfs_inode_unlock(inode, ilock_flags);
1877         return err;
1878     }
1879 
1880     err = btrfs_write_check(iocb, from, err);
1881     if (err < 0) {
1882         btrfs_inode_unlock(inode, ilock_flags);
1883         goto out;
1884     }
1885 
1886     pos = iocb->ki_pos;
1887     /*
1888      * Re-check since file size may have changed just before taking the
1889      * lock or pos may have changed because of O_APPEND in generic_write_checks()
1890      */
1891     if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
1892         pos + iov_iter_count(from) > i_size_read(inode)) {
1893         btrfs_inode_unlock(inode, ilock_flags);
1894         ilock_flags &= ~BTRFS_ILOCK_SHARED;
1895         goto relock;
1896     }
1897 
1898     if (check_direct_IO(fs_info, from, pos)) {
1899         btrfs_inode_unlock(inode, ilock_flags);
1900         goto buffered;
1901     }
1902 
1903     /*
1904      * The iov_iter can be mapped to the same file range we are writing to.
1905      * If that's the case, then we will deadlock in the iomap code, because
1906      * it first calls our callback btrfs_dio_iomap_begin(), which will create
1907      * an ordered extent, and after that it will fault in the pages that the
1908      * iov_iter refers to. During the fault in we end up in the readahead
1909      * pages code (starting at btrfs_readahead()), which will lock the range,
1910      * find that ordered extent and then wait for it to complete (at
1911      * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
1912      * obviously the ordered extent can never complete, as we haven't yet
1913      * submitted the respective bio(s). This always happens when the buffer is
1914      * memory mapped to the same file range, since the iomap DIO code always
1915      * invalidates pages in the target file range (after starting and waiting
1916      * for any writeback).
1917      *
1918      * So here we disable page faults in the iov_iter and then retry if we
1919      * got -EFAULT, faulting in the pages before the retry.
1920      */
1921 again:
1922     from->nofault = true;
1923     err = btrfs_dio_rw(iocb, from, written);
1924     from->nofault = false;
1925 
1926     /* No increment (+=) because iomap returns a cumulative value. */
1927     if (err > 0)
1928         written = err;
1929 
1930     if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
1931         const size_t left = iov_iter_count(from);
1932         /*
1933          * We have more data left to write. Try to fault in as many of the
1934          * remaining pages as possible and retry. We do this without
1935          * releasing and locking the inode again, to prevent races with
1936          * truncate.
1937          *
1938          * Also, in case the iov refers to pages in the file range of the
1939          * file we want to write to (due to a mmap), we could enter an
1940          * infinite loop if we retry after faulting the pages in, since
1941          * iomap will invalidate any pages in the range early on, before
1942          * it tries to fault in the pages of the iov. So we keep track of
1943          * how much of the iov was left after the previous EFAULT and fall
1944          * back to buffered IO in case we haven't made any progress.
1945          */
1946         if (left == prev_left) {
1947             err = -ENOTBLK;
1948         } else {
1949             fault_in_iov_iter_readable(from, left);
1950             prev_left = left;
1951             goto again;
1952         }
1953     }
1954 
1955     btrfs_inode_unlock(inode, ilock_flags);
1956 
1957     /*
1958      * If 'err' is -ENOTBLK or we have not written all data, then it means
1959      * we must fall back to buffered IO.
1960      */
1961     if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
1962         goto out;
1963 
1964 buffered:
1965     /*
1966      * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
1967      * it must retry the operation in a context where blocking is acceptable,
1968      * since we currently don't have NOWAIT semantics support for buffered IO
1969      * and may block there for many reasons (reserving space for example).
1970      */
1971     if (iocb->ki_flags & IOCB_NOWAIT) {
1972         err = -EAGAIN;
1973         goto out;
1974     }
1975 
1976     pos = iocb->ki_pos;
1977     written_buffered = btrfs_buffered_write(iocb, from);
1978     if (written_buffered < 0) {
1979         err = written_buffered;
1980         goto out;
1981     }
1982     /*
1983      * Ensure all data is persisted. We want the next direct IO read to be
1984      * able to read what was just written.
1985      */
1986     endbyte = pos + written_buffered - 1;
1987     err = btrfs_fdatawrite_range(inode, pos, endbyte);
1988     if (err)
1989         goto out;
1990     err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1991     if (err)
1992         goto out;
1993     written += written_buffered;
1994     iocb->ki_pos = pos + written_buffered;
1995     invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
1996                  endbyte >> PAGE_SHIFT);
1997 out:
1998     return err < 0 ? err : written;
1999 }
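
/*
 * Illustrative sketch of the retry/fallback logic above (added for
 * clarity, not part of the original file):
 *
 *	issue the DIO with page faults disabled;
 *	if we got -EFAULT and made progress, fault the pages in and retry;
 *	if we got -ENOTBLK or data is still left, fall back to buffered IO,
 *	then write back and invalidate the affected page cache range so a
 *	subsequent direct IO read sees the buffered data.
 */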
2000 
2001 static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
2002             const struct btrfs_ioctl_encoded_io_args *encoded)
2003 {
2004     struct file *file = iocb->ki_filp;
2005     struct inode *inode = file_inode(file);
2006     loff_t count;
2007     ssize_t ret;
2008 
2009     btrfs_inode_lock(inode, 0);
2010     count = encoded->len;
2011     ret = generic_write_checks_count(iocb, &count);
2012     if (ret == 0 && count != encoded->len) {
2013         /*
2014          * The write got truncated by generic_write_checks_count(). We
2015          * can't do a partial encoded write.
2016          */
2017         ret = -EFBIG;
2018     }
2019     if (ret || encoded->len == 0)
2020         goto out;
2021 
2022     ret = btrfs_write_check(iocb, from, encoded->len);
2023     if (ret < 0)
2024         goto out;
2025 
2026     ret = btrfs_do_encoded_write(iocb, from, encoded);
2027 out:
2028     btrfs_inode_unlock(inode, 0);
2029     return ret;
2030 }
2031 
2032 ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
2033                 const struct btrfs_ioctl_encoded_io_args *encoded)
2034 {
2035     struct file *file = iocb->ki_filp;
2036     struct btrfs_inode *inode = BTRFS_I(file_inode(file));
2037     ssize_t num_written, num_sync;
2038     const bool sync = iocb_is_dsync(iocb);
2039 
2040     /*
2041      * If the fs flips readonly due to some impossible error, even though we
2042      * have opened a file as writable, we have to stop this write operation
2043      * to ensure consistency.
2044      */
2045     if (BTRFS_FS_ERROR(inode->root->fs_info))
2046         return -EROFS;
2047 
2048     if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
2049         return -EOPNOTSUPP;
2050 
2051     if (sync)
2052         atomic_inc(&inode->sync_writers);
2053 
2054     if (encoded) {
2055         num_written = btrfs_encoded_write(iocb, from, encoded);
2056         num_sync = encoded->len;
2057     } else if (iocb->ki_flags & IOCB_DIRECT) {
2058         num_written = btrfs_direct_write(iocb, from);
2059         num_sync = num_written;
2060     } else {
2061         num_written = btrfs_buffered_write(iocb, from);
2062         num_sync = num_written;
2063     }
2064 
2065     btrfs_set_inode_last_sub_trans(inode);
2066 
2067     if (num_sync > 0) {
2068         num_sync = generic_write_sync(iocb, num_sync);
2069         if (num_sync < 0)
2070             num_written = num_sync;
2071     }
2072 
2073     if (sync)
2074         atomic_dec(&inode->sync_writers);
2075 
2076     current->backing_dev_info = NULL;
2077     return num_written;
2078 }
2079 
2080 static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2081 {
2082     return btrfs_do_write_iter(iocb, from, NULL);
2083 }
2084 
2085 int btrfs_release_file(struct inode *inode, struct file *filp)
2086 {
2087     struct btrfs_file_private *private = filp->private_data;
2088 
2089     if (private && private->filldir_buf)
2090         kfree(private->filldir_buf);
2091     kfree(private);
2092     filp->private_data = NULL;
2093 
2094     /*
2095      * Set by setattr when we are about to truncate a file from a non-zero
2096      * size to a zero size.  This tries to flush down new bytes that may
2097      * have been written if the application were using truncate to replace
2098      * a file in place.
2099      */
2100     if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
2101                    &BTRFS_I(inode)->runtime_flags))
2102             filemap_flush(inode->i_mapping);
2103     return 0;
2104 }
2105 
2106 static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
2107 {
2108     int ret;
2109     struct blk_plug plug;
2110 
2111     /*
2112      * This is only called in fsync, which would do synchronous writes, so
2113      * a plug can merge adjacent IOs as much as possible.  Especially in the
2114      * case of multiple disks using a raid profile, a large IO can be split
2115      * into several segments of stripe length (currently 64K).
2116      */
2117     blk_start_plug(&plug);
2118     atomic_inc(&BTRFS_I(inode)->sync_writers);
2119     ret = btrfs_fdatawrite_range(inode, start, end);
2120     atomic_dec(&BTRFS_I(inode)->sync_writers);
2121     blk_finish_plug(&plug);
2122 
2123     return ret;
2124 }
2125 
2126 static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
2127 {
2128     struct btrfs_inode *inode = BTRFS_I(ctx->inode);
2129     struct btrfs_fs_info *fs_info = inode->root->fs_info;
2130 
2131     if (btrfs_inode_in_log(inode, fs_info->generation) &&
2132         list_empty(&ctx->ordered_extents))
2133         return true;
2134 
2135     /*
2136      * If we are doing a fast fsync we cannot bail out if the inode's
2137      * last_trans is <= the last committed transaction, because we only
2138      * update the last_trans of the inode during ordered extent completion,
2139      * and for a fast fsync we don't wait for that, we only wait for the
2140      * writeback to complete.
2141      */
2142     if (inode->last_trans <= fs_info->last_trans_committed &&
2143         (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
2144          list_empty(&ctx->ordered_extents)))
2145         return true;
2146 
2147     return false;
2148 }
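
/*
 * Note (added for clarity): the fsync below can therefore be skipped either
 * when the inode is already fully present in the current log tree and has
 * no new ordered extents, or when nothing changed since the last committed
 * transaction, subject to the fast fsync caveat described above.
 */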
2149 
2150 /*
2151  * fsync call for both files and directories.  This logs the inode into
2152  * the tree log instead of forcing full commits whenever possible.
2153  *
2154  * It needs to call filemap_fdatawait so that all ordered extent updates in
2155  * the metadata btree are up to date for copying to the log.
2156  *
2157  * It drops the inode mutex before doing the tree log commit.  This is an
2158  * important optimization for directories because holding the mutex prevents
2159  * new operations on the dir while we write to disk.
2160  */
2161 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
2162 {
2163     struct dentry *dentry = file_dentry(file);
2164     struct inode *inode = d_inode(dentry);
2165     struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2166     struct btrfs_root *root = BTRFS_I(inode)->root;
2167     struct btrfs_trans_handle *trans;
2168     struct btrfs_log_ctx ctx;
2169     int ret = 0, err;
2170     u64 len;
2171     bool full_sync;
2172 
2173     trace_btrfs_sync_file(file, datasync);
2174 
2175     btrfs_init_log_ctx(&ctx, inode);
2176 
2177     /*
2178      * Always set the range to a full range, otherwise we can get into
2179      * several problems, from missing file extent items to represent holes
2180      * when not using the NO_HOLES feature, to log tree corruption due to
2181      * races between hole detection during logging and completion of ordered
2182      * extents outside the range, to missing checksums due to ordered extents
2183      * for which we flushed only a subset of their pages.
2184      */
2185     start = 0;
2186     end = LLONG_MAX;
2187     len = (u64)LLONG_MAX + 1;
2188 
2189     /*
2190      * We write the dirty pages in the range and wait until they complete
2191      * outside of the ->i_mutex. That way multiple tasks can flush dirty
2192      * pages concurrently, which improves performance.  See
2193      * btrfs_wait_ordered_range for an explanation of the ASYNC check.
2194      */
2195     ret = start_ordered_ops(inode, start, end);
2196     if (ret)
2197         goto out;
2198 
2199     btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
2200 
2201     atomic_inc(&root->log_batch);
2202 
2203     /*
2204      * Always check for the full sync flag while holding the inode's lock,
2205      * to avoid races with other tasks. The flag must be either set all the
2206      * time during logging or off all the time while logging.
2207      */
2208     full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2209                  &BTRFS_I(inode)->runtime_flags);
2210 
2211     /*
2212      * Before we acquired the inode's lock and the mmap lock, someone may
2213      * have dirtied more pages in the target range. We need to make sure
2214      * that writeback for any such pages does not start while we are logging
2215      * the inode, because if it does, any of the following might happen when
2216      * we are not doing a full inode sync:
2217      *
2218      * 1) We log an extent after its writeback finishes but before its
2219      *    checksums are added to the csum tree, leading to -EIO errors
2220      *    when attempting to read the extent after a log replay.
2221      *
2222      * 2) We can end up logging an extent before its writeback finishes.
2223      *    Therefore after the log replay we will have a file extent item
2224      *    pointing to an unwritten extent (and no data checksums as well).
2225      *
2226      * So trigger writeback for any eventual new dirty pages and then we
2227      * wait for all ordered extents to complete below.
2228      */
2229     ret = start_ordered_ops(inode, start, end);
2230     if (ret) {
2231         btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2232         goto out;
2233     }
2234 
2235     /*
2236      * We have to do this here to avoid the priority inversion of waiting on
2237      * IO of a lower priority task while holding a transaction open.
2238      *
2239      * For a full fsync we wait for the ordered extents to complete while
2240      * for a fast fsync we wait just for writeback to complete, and then
2241      * attach the ordered extents to the transaction so that a transaction
2242      * commit waits for their completion, to avoid data loss if we fsync,
2243      * the current transaction commits before the ordered extents complete
2244      * and a power failure happens right after that.
2245      *
2246      * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
2247      * logical address recorded in the ordered extent may change. We need
2248      * to wait for the IO to stabilize the logical address.
2249      */
2250     if (full_sync || btrfs_is_zoned(fs_info)) {
2251         ret = btrfs_wait_ordered_range(inode, start, len);
2252     } else {
2253         /*
2254          * Get our ordered extents as soon as possible to avoid doing
2255          * checksum lookups in the csum tree, and use instead the
2256          * checksums attached to the ordered extents.
2257          */
2258         btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
2259                               &ctx.ordered_extents);
2260         ret = filemap_fdatawait_range(inode->i_mapping, start, end);
2261     }
2262 
2263     if (ret)
2264         goto out_release_extents;
2265 
2266     atomic_inc(&root->log_batch);
2267 
2268     smp_mb();
2269     if (skip_inode_logging(&ctx)) {
2270         /*
2271          * We've had everything committed since the last time we were
2272          * modified so clear this flag in case it was set for whatever
2273          * reason, it's no longer relevant.
2274          */
2275         clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2276               &BTRFS_I(inode)->runtime_flags);
2277         /*
2278          * An ordered extent might have started before and completed
2279          * already with io errors, in which case the inode was not
2280          * updated and we end up here. So check the inode's mapping
2281          * for any errors that might have happened since we last
2282          * for any errors that might have happened since fsync was
2283          * last called.
2284         ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
2285         goto out_release_extents;
2286     }
2287 
2288     /*
2289      * We use start here because we will need to wait on the IO to complete
2290      * in btrfs_sync_log, which could require joining a transaction (for
2291      * example checking cross references in the nocow path).  If we use join
2292      * here we could get into a situation where we're waiting on IO to
2293      * happen that is blocked on a transaction trying to commit.  With start
2294      * we inc the extwriter counter, so we wait for all extwriters to exit
2295      * before we start blocking joiners.  This comment is to keep somebody
2296      * from thinking they are super smart and changing this to
2297      * btrfs_join_transaction *cough*Josef*cough*.
2298      */
2299     trans = btrfs_start_transaction(root, 0);
2300     if (IS_ERR(trans)) {
2301         ret = PTR_ERR(trans);
2302         goto out_release_extents;
2303     }
2304     trans->in_fsync = true;
2305 
2306     ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
2307     btrfs_release_log_ctx_extents(&ctx);
2308     if (ret < 0) {
2309         /* Fallthrough and commit/free transaction. */
2310         ret = BTRFS_LOG_FORCE_COMMIT;
2311     }
2312 
2313     /* we've logged all the items and now have a consistent
2314      * version of the file in the log.  It is possible that
2315      * someone will come in and modify the file, but that's
2316      * fine because the log is consistent on disk, and we
2317      * have references to all of the file's extents
2318      *
2319      * It is possible that someone will come in and log the
2320      * file again, but that will end up using the synchronization
2321      * inside btrfs_sync_log to keep things safe.
2322      */
2323     btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2324 
2325     if (ret == BTRFS_NO_LOG_SYNC) {
2326         ret = btrfs_end_transaction(trans);
2327         goto out;
2328     }
2329 
2330     /* We successfully logged the inode, attempt to sync the log. */
2331     if (!ret) {
2332         ret = btrfs_sync_log(trans, root, &ctx);
2333         if (!ret) {
2334             ret = btrfs_end_transaction(trans);
2335             goto out;
2336         }
2337     }
2338 
2339     /*
2340      * At this point we need to commit the transaction because we had
2341      * btrfs_need_log_full_commit() or some other error.
2342      *
2343      * If we didn't do a full sync we have to stop the trans handle, wait on
2344      * the ordered extents, start it again and commit the transaction.  If
2345      * we attempt to wait on the ordered extents here we could deadlock with
2346      * something like fallocate() that is holding the extent lock trying to
2347      * start a transaction while some other thread is trying to commit the
2348      * transaction while we (fsync) are currently holding the transaction
2349      * open.
2350      */
2351     if (!full_sync) {
2352         ret = btrfs_end_transaction(trans);
2353         if (ret)
2354             goto out;
2355         ret = btrfs_wait_ordered_range(inode, start, len);
2356         if (ret)
2357             goto out;
2358 
2359         /*
2360          * This is safe to use here because we're only interested in
2361          * making sure the transaction that had the ordered extents is
2362          * committed.  We aren't waiting on anything past this point,
2363          * we're purely getting the transaction and committing it.
2364          */
2365         trans = btrfs_attach_transaction_barrier(root);
2366         if (IS_ERR(trans)) {
2367             ret = PTR_ERR(trans);
2368 
2369             /*
2370              * We committed the transaction and there's no currently
2371              * running transaction, this means everything we care
2372              * about made it to disk and we are done.
2373              */
2374             if (ret == -ENOENT)
2375                 ret = 0;
2376             goto out;
2377         }
2378     }
2379 
2380     ret = btrfs_commit_transaction(trans);
2381 out:
2382     ASSERT(list_empty(&ctx.list));
2383     err = file_check_and_advance_wb_err(file);
2384     if (!ret)
2385         ret = err;
2386     return ret > 0 ? -EIO : ret;
2387 
2388 out_release_extents:
2389     btrfs_release_log_ctx_extents(&ctx);
2390     btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2391     goto out;
2392 }
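
/*
 * High-level summary of btrfs_sync_file() above (added for clarity, not
 * part of the original file): start writeback, take the inode and mmap
 * locks, wait for writeback (full syncs and zoned filesystems also wait
 * for ordered extents), log the inode into the tree log under an open
 * transaction, and then either sync the log or fall back to committing
 * the whole transaction.
 */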
2393 
2394 static const struct vm_operations_struct btrfs_file_vm_ops = {
2395     .fault      = filemap_fault,
2396     .map_pages  = filemap_map_pages,
2397     .page_mkwrite   = btrfs_page_mkwrite,
2398 };
2399 
2400 static int btrfs_file_mmap(struct file  *filp, struct vm_area_struct *vma)
2401 {
2402     struct address_space *mapping = filp->f_mapping;
2403 
2404     if (!mapping->a_ops->read_folio)
2405         return -ENOEXEC;
2406 
2407     file_accessed(filp);
2408     vma->vm_ops = &btrfs_file_vm_ops;
2409 
2410     return 0;
2411 }
2412 
2413 static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
2414               int slot, u64 start, u64 end)
2415 {
2416     struct btrfs_file_extent_item *fi;
2417     struct btrfs_key key;
2418 
2419     if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2420         return 0;
2421 
2422     btrfs_item_key_to_cpu(leaf, &key, slot);
2423     if (key.objectid != btrfs_ino(inode) ||
2424         key.type != BTRFS_EXTENT_DATA_KEY)
2425         return 0;
2426 
2427     fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2428 
2429     if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2430         return 0;
2431 
2432     if (btrfs_file_extent_disk_bytenr(leaf, fi))
2433         return 0;
2434 
2435     if (key.offset == end)
2436         return 1;
2437     if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2438         return 1;
2439     return 0;
2440 }
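
/*
 * Note (added for clarity): hole_mergeable() returns 1 only for a regular
 * file extent item with a zero disk bytenr (i.e. a hole) that either starts
 * at @end or ends at @start, so fill_holes() below can simply extend that
 * adjacent item instead of inserting a new one.
 */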
2441 
2442 static int fill_holes(struct btrfs_trans_handle *trans,
2443         struct btrfs_inode *inode,
2444         struct btrfs_path *path, u64 offset, u64 end)
2445 {
2446     struct btrfs_fs_info *fs_info = trans->fs_info;
2447     struct btrfs_root *root = inode->root;
2448     struct extent_buffer *leaf;
2449     struct btrfs_file_extent_item *fi;
2450     struct extent_map *hole_em;
2451     struct extent_map_tree *em_tree = &inode->extent_tree;
2452     struct btrfs_key key;
2453     int ret;
2454 
2455     if (btrfs_fs_incompat(fs_info, NO_HOLES))
2456         goto out;
2457 
2458     key.objectid = btrfs_ino(inode);
2459     key.type = BTRFS_EXTENT_DATA_KEY;
2460     key.offset = offset;
2461 
2462     ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2463     if (ret <= 0) {
2464         /*
2465          * We should have dropped this offset, so if we find it then
2466          * something has gone horribly wrong.
2467          */
2468         if (ret == 0)
2469             ret = -EINVAL;
2470         return ret;
2471     }
2472 
2473     leaf = path->nodes[0];
2474     if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2475         u64 num_bytes;
2476 
2477         path->slots[0]--;
2478         fi = btrfs_item_ptr(leaf, path->slots[0],
2479                     struct btrfs_file_extent_item);
2480         num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2481             end - offset;
2482         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2483         btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2484         btrfs_set_file_extent_offset(leaf, fi, 0);
2485         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2486         btrfs_mark_buffer_dirty(leaf);
2487         goto out;
2488     }
2489 
2490     if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2491         u64 num_bytes;
2492 
2493         key.offset = offset;
2494         btrfs_set_item_key_safe(fs_info, path, &key);
2495         fi = btrfs_item_ptr(leaf, path->slots[0],
2496                     struct btrfs_file_extent_item);
2497         num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2498             offset;
2499         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2500         btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2501         btrfs_set_file_extent_offset(leaf, fi, 0);
2502         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2503         btrfs_mark_buffer_dirty(leaf);
2504         goto out;
2505     }
2506     btrfs_release_path(path);
2507 
2508     ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
2509             offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0);
2510     if (ret)
2511         return ret;
2512 
2513 out:
2514     btrfs_release_path(path);
2515 
2516     hole_em = alloc_extent_map();
2517     if (!hole_em) {
2518         btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2519         btrfs_set_inode_full_sync(inode);
2520     } else {
2521         hole_em->start = offset;
2522         hole_em->len = end - offset;
2523         hole_em->ram_bytes = hole_em->len;
2524         hole_em->orig_start = offset;
2525 
2526         hole_em->block_start = EXTENT_MAP_HOLE;
2527         hole_em->block_len = 0;
2528         hole_em->orig_block_len = 0;
2529         hole_em->compress_type = BTRFS_COMPRESS_NONE;
2530         hole_em->generation = trans->transid;
2531 
2532         do {
2533             btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2534             write_lock(&em_tree->lock);
2535             ret = add_extent_mapping(em_tree, hole_em, 1);
2536             write_unlock(&em_tree->lock);
2537         } while (ret == -EEXIST);
2538         free_extent_map(hole_em);
2539         if (ret)
2540             btrfs_set_inode_full_sync(inode);
2541     }
2542 
2543     return 0;
2544 }
2545 
2546 /*
2547  * Find a hole extent on the given inode and change start/len to the end of
2548  * the hole extent (a hole/vacuum extent whose em->start <= start &&
2549  *     em->start + em->len > start).
2550  * When a hole extent is found, return 1 and modify start/len.
2551  */
2552 static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
2553 {
2554     struct btrfs_fs_info *fs_info = inode->root->fs_info;
2555     struct extent_map *em;
2556     int ret = 0;
2557 
2558     em = btrfs_get_extent(inode, NULL, 0,
2559                   round_down(*start, fs_info->sectorsize),
2560                   round_up(*len, fs_info->sectorsize));
2561     if (IS_ERR(em))
2562         return PTR_ERR(em);
2563 
2564     /* Hole or vacuum extent (only exists in no-holes mode) */
2565     if (em->block_start == EXTENT_MAP_HOLE) {
2566         ret = 1;
2567         *len = em->start + em->len > *start + *len ?
2568                0 : *start + *len - em->start - em->len;
2569         *start = em->start + em->len;
2570     }
2571     free_extent_map(em);
2572     return ret;
2573 }
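
/*
 * Worked example (illustrative): with a hole extent map covering [0, 64K)
 * and a caller passing *start = 16K, *len = 128K, the function returns 1
 * and updates *start to 64K and *len to 80K; if the hole covers the whole
 * requested range, *len becomes 0 instead.
 */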
2574 
2575 static void btrfs_punch_hole_lock_range(struct inode *inode,
2576                     const u64 lockstart,
2577                     const u64 lockend,
2578                     struct extent_state **cached_state)
2579 {
2580     /*
2581      * For the subpage case, if the range is not at a page boundary, we could
2582      * have pages at the leading/trailing part of the range.
2583      * This could lead to an infinite loop since filemap_range_has_page()
2584      * will always return true.
2585      * So here we need to do extra page alignment for
2586      * filemap_range_has_page().
2587      */
2588     const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
2589     const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
2590 
2591     while (1) {
2592         truncate_pagecache_range(inode, lockstart, lockend);
2593 
2594         lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2595                  cached_state);
2596         /*
2597          * We can't have ordered extents in the range, nor dirty/writeback
2598          * pages, because we have locked the inode's VFS lock in exclusive
2599          * mode, we have locked the inode's i_mmap_lock in exclusive mode,
2600          * we have flushed all delalloc in the range and we have waited
2601          * for any ordered extents in the range to complete.
2602          * We can race with anyone reading pages from this range, so after
2603          * locking the range check if we have pages in the range, and if
2604          * we do, unlock the range and retry.
2605          */
2606         if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
2607                         page_lockend))
2608             break;
2609 
2610         unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
2611                      lockend, cached_state);
2612     }
2613 
2614     btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
2615 }
2616 
2617 static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2618                      struct btrfs_inode *inode,
2619                      struct btrfs_path *path,
2620                      struct btrfs_replace_extent_info *extent_info,
2621                      const u64 replace_len,
2622                      const u64 bytes_to_drop)
2623 {
2624     struct btrfs_fs_info *fs_info = trans->fs_info;
2625     struct btrfs_root *root = inode->root;
2626     struct btrfs_file_extent_item *extent;
2627     struct extent_buffer *leaf;
2628     struct btrfs_key key;
2629     int slot;
2630     struct btrfs_ref ref = { 0 };
2631     int ret;
2632 
2633     if (replace_len == 0)
2634         return 0;
2635 
2636     if (extent_info->disk_offset == 0 &&
2637         btrfs_fs_incompat(fs_info, NO_HOLES)) {
2638         btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2639         return 0;
2640     }
2641 
2642     key.objectid = btrfs_ino(inode);
2643     key.type = BTRFS_EXTENT_DATA_KEY;
2644     key.offset = extent_info->file_offset;
2645     ret = btrfs_insert_empty_item(trans, root, path, &key,
2646                       sizeof(struct btrfs_file_extent_item));
2647     if (ret)
2648         return ret;
2649     leaf = path->nodes[0];
2650     slot = path->slots[0];
2651     write_extent_buffer(leaf, extent_info->extent_buf,
2652                 btrfs_item_ptr_offset(leaf, slot),
2653                 sizeof(struct btrfs_file_extent_item));
2654     extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2655     ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2656     btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2657     btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2658     if (extent_info->is_new_extent)
2659         btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2660     btrfs_mark_buffer_dirty(leaf);
2661     btrfs_release_path(path);
2662 
2663     ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
2664                         replace_len);
2665     if (ret)
2666         return ret;
2667 
2668     /* If it's a hole, nothing more needs to be done. */
2669     if (extent_info->disk_offset == 0) {
2670         btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2671         return 0;
2672     }
2673 
2674     btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
2675 
2676     if (extent_info->is_new_extent && extent_info->insertions == 0) {
2677         key.objectid = extent_info->disk_offset;
2678         key.type = BTRFS_EXTENT_ITEM_KEY;
2679         key.offset = extent_info->disk_len;
2680         ret = btrfs_alloc_reserved_file_extent(trans, root,
2681                                btrfs_ino(inode),
2682                                extent_info->file_offset,
2683                                extent_info->qgroup_reserved,
2684                                &key);
2685     } else {
2686         u64 ref_offset;
2687 
2688         btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2689                        extent_info->disk_offset,
2690                        extent_info->disk_len, 0);
2691         ref_offset = extent_info->file_offset - extent_info->data_offset;
2692         btrfs_init_data_ref(&ref, root->root_key.objectid,
2693                     btrfs_ino(inode), ref_offset, 0, false);
2694         ret = btrfs_inc_extent_ref(trans, &ref);
2695     }
2696 
2697     extent_info->insertions++;
2698 
2699     return ret;
2700 }
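
/*
 * Note (added for clarity): for a freshly allocated extent the first
 * insertion goes through btrfs_alloc_reserved_file_extent(), consuming the
 * qgroup reservation, while references to an existing extent (e.g. a clone
 * source) are added with a delayed data ref via btrfs_inc_extent_ref().
 */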
2701 
2702 /*
2703  * The respective range must have been previously locked, as well as the inode.
2704  * The end offset is inclusive (last byte of the range).
2705  * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2706  * the file range with an extent.
2707  * When not punching a hole, we don't want to end up in a state where we dropped
2708  * extents without inserting a new one, so we must abort the transaction to avoid
2709  * a corruption.
2710  */
2711 int btrfs_replace_file_extents(struct btrfs_inode *inode,
2712                    struct btrfs_path *path, const u64 start,
2713                    const u64 end,
2714                    struct btrfs_replace_extent_info *extent_info,
2715                    struct btrfs_trans_handle **trans_out)
2716 {
2717     struct btrfs_drop_extents_args drop_args = { 0 };
2718     struct btrfs_root *root = inode->root;
2719     struct btrfs_fs_info *fs_info = root->fs_info;
2720     u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2721     u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2722     struct btrfs_trans_handle *trans = NULL;
2723     struct btrfs_block_rsv *rsv;
2724     unsigned int rsv_count;
2725     u64 cur_offset;
2726     u64 len = end - start;
2727     int ret = 0;
2728 
2729     if (end <= start)
2730         return -EINVAL;
2731 
2732     rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
2733     if (!rsv) {
2734         ret = -ENOMEM;
2735         goto out;
2736     }
2737     rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
2738     rsv->failfast = true;
2739 
2740     /*
2741      * 1 - update the inode
2742      * 1 - removing the extents in the range
2743      * 1 - adding the hole extent if no_holes isn't set or if we are
2744      *     replacing the range with a new extent
2745      */
2746     if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2747         rsv_count = 3;
2748     else
2749         rsv_count = 2;
2750 
2751     trans = btrfs_start_transaction(root, rsv_count);
2752     if (IS_ERR(trans)) {
2753         ret = PTR_ERR(trans);
2754         trans = NULL;
2755         goto out_free;
2756     }
2757 
2758     ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2759                       min_size, false);
2760     if (WARN_ON(ret))
2761         goto out_trans;
2762     trans->block_rsv = rsv;
2763 
2764     cur_offset = start;
2765     drop_args.path = path;
2766     drop_args.end = end + 1;
2767     drop_args.drop_cache = true;
2768     while (cur_offset < end) {
2769         drop_args.start = cur_offset;
2770         ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2771         /* If we are punching a hole decrement the inode's byte count */
2772         if (!extent_info)
2773             btrfs_update_inode_bytes(inode, 0,
2774                          drop_args.bytes_found);
2775         if (ret != -ENOSPC) {
2776             /*
2777              * The only time we don't want to abort is if we are
2778              * attempting to clone a partial inline extent, in which
2779              * case we'll get EOPNOTSUPP.  However if we aren't
2780              * cloning we need to abort no matter what, because if we
2781              * got EOPNOTSUPP via prealloc then we messed up and
2782              * need to abort.
2783              */
2784             if (ret &&
2785                 (ret != -EOPNOTSUPP ||
2786                  (extent_info && extent_info->is_new_extent)))
2787                 btrfs_abort_transaction(trans, ret);
2788             break;
2789         }
2790 
2791         trans->block_rsv = &fs_info->trans_block_rsv;
2792 
2793         if (!extent_info && cur_offset < drop_args.drop_end &&
2794             cur_offset < ino_size) {
2795             ret = fill_holes(trans, inode, path, cur_offset,
2796                      drop_args.drop_end);
2797             if (ret) {
2798                 /*
2799                  * If we failed then we didn't insert our hole
2800                  * entries for the area we dropped, so now the
2801                  * fs is corrupted, so we must abort the
2802                  * transaction.
2803                  */
2804                 btrfs_abort_transaction(trans, ret);
2805                 break;
2806             }
2807         } else if (!extent_info && cur_offset < drop_args.drop_end) {
2808             /*
2809              * We are past the i_size here, but since we didn't
2810              * insert holes we need to clear the mapped area so we
2811              * know to not set disk_i_size in this area until a new
2812              * file extent is inserted here.
2813              */
2814             ret = btrfs_inode_clear_file_extent_range(inode,
2815                     cur_offset,
2816                     drop_args.drop_end - cur_offset);
2817             if (ret) {
2818                 /*
2819                  * We couldn't clear our area, so we could
2820                  * presumably adjust up and corrupt the fs, so
2821                  * we need to abort.
2822                  */
2823                 btrfs_abort_transaction(trans, ret);
2824                 break;
2825             }
2826         }
2827 
2828         if (extent_info &&
2829             drop_args.drop_end > extent_info->file_offset) {
2830             u64 replace_len = drop_args.drop_end -
2831                       extent_info->file_offset;
2832 
2833             ret = btrfs_insert_replace_extent(trans, inode, path,
2834                     extent_info, replace_len,
2835                     drop_args.bytes_found);
2836             if (ret) {
2837                 btrfs_abort_transaction(trans, ret);
2838                 break;
2839             }
2840             extent_info->data_len -= replace_len;
2841             extent_info->data_offset += replace_len;
2842             extent_info->file_offset += replace_len;
2843         }
2844 
2845         /*
2846          * We are releasing our handle on the transaction, balance the
2847          * dirty pages of the btree inode and flush delayed items, and
2848          * then get a new transaction handle, which may now point to a
2849          * new transaction in case someone else may have committed the
2850          * transaction we used to replace/drop file extent items. So
2851          * bump the inode's iversion and update mtime and ctime except
2852          * if we are called from a dedupe context. This is because a
2853          * power failure/crash may happen after the transaction is
2854          * committed and before we finish replacing/dropping all the
2855          * file extent items we need.
2856          */
2857         inode_inc_iversion(&inode->vfs_inode);
2858 
2859         if (!extent_info || extent_info->update_times) {
2860             inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode);
2861             inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime;
2862         }
2863 
2864         ret = btrfs_update_inode(trans, root, inode);
2865         if (ret)
2866             break;
2867 
2868         btrfs_end_transaction(trans);
2869         btrfs_btree_balance_dirty(fs_info);
2870 
2871         trans = btrfs_start_transaction(root, rsv_count);
2872         if (IS_ERR(trans)) {
2873             ret = PTR_ERR(trans);
2874             trans = NULL;
2875             break;
2876         }
2877 
2878         ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2879                           rsv, min_size, false);
2880         if (WARN_ON(ret))
2881             break;
2882         trans->block_rsv = rsv;
2883 
2884         cur_offset = drop_args.drop_end;
2885         len = end - cur_offset;
2886         if (!extent_info && len) {
2887             ret = find_first_non_hole(inode, &cur_offset, &len);
2888             if (unlikely(ret < 0))
2889                 break;
2890             if (ret && !len) {
2891                 ret = 0;
2892                 break;
2893             }
2894         }
2895     }
2896 
2897     /*
2898      * If we were cloning, force the next fsync to be a full one since we
2899      * replaced (or just dropped in the case of cloning holes when
2900      * NO_HOLES is enabled) file extent items and did not set up new extent
2901      * maps for the replacement extents (or holes).
2902      */
2903     if (extent_info && !extent_info->is_new_extent)
2904         btrfs_set_inode_full_sync(inode);
2905 
2906     if (ret)
2907         goto out_trans;
2908 
2909     trans->block_rsv = &fs_info->trans_block_rsv;
2910     /*
2911      * If we are using the NO_HOLES feature we might already have had a
2912      * hole that overlaps a part of the region [lockstart, lockend] and
2913      * ends at (or beyond) lockend. Since we have no file extent items to
2914      * represent holes, drop_end can be less than lockend and so we must
2915      * make sure we have an extent map representing the existing hole (the
2916      * call to __btrfs_drop_extents() might have dropped the existing extent
2917      * map representing the existing hole), otherwise the fast fsync path
2918      * will not record the existence of the hole region
2919      * [existing_hole_start, lockend].
2920      */
2921     if (drop_args.drop_end <= end)
2922         drop_args.drop_end = end + 1;
2923     /*
2924      * Don't insert file hole extent item if it's for a range beyond eof
2925      * (because it's useless) or if it represents a 0 bytes range (when
2926      * cur_offset == drop_end).
2927      */
2928     if (!extent_info && cur_offset < ino_size &&
2929         cur_offset < drop_args.drop_end) {
2930         ret = fill_holes(trans, inode, path, cur_offset,
2931                  drop_args.drop_end);
2932         if (ret) {
2933             /* Same comment as above. */
2934             btrfs_abort_transaction(trans, ret);
2935             goto out_trans;
2936         }
2937     } else if (!extent_info && cur_offset < drop_args.drop_end) {
2938         /* See the comment in the loop above for the reasoning here. */
2939         ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
2940                     drop_args.drop_end - cur_offset);
2941         if (ret) {
2942             btrfs_abort_transaction(trans, ret);
2943             goto out_trans;
2944         }
2945 
2946     }
2947     if (extent_info) {
2948         ret = btrfs_insert_replace_extent(trans, inode, path,
2949                 extent_info, extent_info->data_len,
2950                 drop_args.bytes_found);
2951         if (ret) {
2952             btrfs_abort_transaction(trans, ret);
2953             goto out_trans;
2954         }
2955     }
2956 
2957 out_trans:
2958     if (!trans)
2959         goto out_free;
2960 
2961     trans->block_rsv = &fs_info->trans_block_rsv;
2962     if (ret)
2963         btrfs_end_transaction(trans);
2964     else
2965         *trans_out = trans;
2966 out_free:
2967     btrfs_free_block_rsv(fs_info, rsv);
2968 out:
2969     return ret;
2970 }
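
/*
 * Illustrative summary of btrfs_replace_file_extents() above (added for
 * clarity, not part of the original file): it repeatedly drops file extent
 * items in [start, end], filling holes or inserting replacement extents as
 * it goes, and between iterations it ends the transaction, balances btree
 * dirty pages and starts a fresh one, migrating the reserved metadata into
 * the new handle each time so the operation can make progress even when
 * space is tight.
 */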
2971 
2972 static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
2973 {
2974     struct inode *inode = file_inode(file);
2975     struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2976     struct btrfs_root *root = BTRFS_I(inode)->root;
2977     struct extent_state *cached_state = NULL;
2978     struct btrfs_path *path;
2979     struct btrfs_trans_handle *trans = NULL;
2980     u64 lockstart;
2981     u64 lockend;
2982     u64 tail_start;
2983     u64 tail_len;
2984     u64 orig_start = offset;
2985     int ret = 0;
2986     bool same_block;
2987     u64 ino_size;
2988     bool truncated_block = false;
2989     bool updated_inode = false;
2990 
2991     btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
2992 
2993     ret = btrfs_wait_ordered_range(inode, offset, len);
2994     if (ret)
2995         goto out_only_mutex;
2996 
2997     ino_size = round_up(inode->i_size, fs_info->sectorsize);
2998     ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2999     if (ret < 0)
3000         goto out_only_mutex;
3001     if (ret && !len) {
3002         /* Already in a large hole */
3003         ret = 0;
3004         goto out_only_mutex;
3005     }
3006 
3007     ret = file_modified(file);
3008     if (ret)
3009         goto out_only_mutex;
3010 
3011     lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
3012     lockend = round_down(offset + len,
3013                  btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
3014     same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
3015         == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
3016     /*
3017      * We needn't truncate any block which is beyond the end of the file
3018      * because we are sure there is no data there.
3019      */
3020     /*
3021      * Only do this if we are in the same block and we aren't doing the
3022      * entire block.
3023      */
3024     if (same_block && len < fs_info->sectorsize) {
3025         if (offset < ino_size) {
3026             truncated_block = true;
3027             ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
3028                            0);
3029         } else {
3030             ret = 0;
3031         }
3032         goto out_only_mutex;
3033     }
3034 
3035     /* zero back part of the first block */
3036     if (offset < ino_size) {
3037         truncated_block = true;
3038         ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
3039         if (ret) {
3040             btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
3041             return ret;
3042         }
3043     }
3044 
3045     /* Check the aligned pages after the first unaligned page: if
3046      * offset != orig_start, the first unaligned page and several
3047      * following pages are already in holes, so the extra check can
3048      * be skipped. */
3049     if (offset == orig_start) {
3050         /* After zeroing the partial first block, check for a hole again. */
3051         len = offset + len - lockstart;
3052         offset = lockstart;
3053         ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
3054         if (ret < 0)
3055             goto out_only_mutex;
3056         if (ret && !len) {
3057             ret = 0;
3058             goto out_only_mutex;
3059         }
3060         lockstart = offset;
3061     }
3062 
3063     /* Check the tail unaligned part is in a hole */
3064     tail_start = lockend + 1;
3065     tail_len = offset + len - tail_start;
3066     if (tail_len) {
3067         ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
3068         if (unlikely(ret < 0))
3069             goto out_only_mutex;
3070         if (!ret) {
3071             /* zero the front end of the last page */
3072             if (tail_start + tail_len < ino_size) {
3073                 truncated_block = true;
3074                 ret = btrfs_truncate_block(BTRFS_I(inode),
3075                             tail_start + tail_len,
3076                             0, 1);
3077                 if (ret)
3078                     goto out_only_mutex;
3079             }
3080         }
3081     }
3082 
3083     if (lockend < lockstart) {
3084         ret = 0;
3085         goto out_only_mutex;
3086     }
3087 
3088     btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
3089 
3090     path = btrfs_alloc_path();
3091     if (!path) {
3092         ret = -ENOMEM;
3093         goto out;
3094     }
3095 
3096     ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
3097                      lockend, NULL, &trans);
3098     btrfs_free_path(path);
3099     if (ret)
3100         goto out;
3101 
3102     ASSERT(trans != NULL);
3103     inode_inc_iversion(inode);
3104     inode->i_mtime = current_time(inode);
3105     inode->i_ctime = inode->i_mtime;
3106     ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
3107     updated_inode = true;
3108     btrfs_end_transaction(trans);
3109     btrfs_btree_balance_dirty(fs_info);
3110 out:
3111     unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3112                  &cached_state);
3113 out_only_mutex:
3114     if (!updated_inode && truncated_block && !ret) {
3115         /*
3116          * If we only end up zeroing part of a page, we still need to
3117          * update the inode item, so that all the time fields are
3118          * updated as well as the necessary in-memory btrfs inode fields
3119          * used to detect, at fsync time, whether the inode is not yet in
3120          * the log tree or is there but not up to date.
3121          */
3122         struct timespec64 now = current_time(inode);
3123 
3124         inode_inc_iversion(inode);
3125         inode->i_mtime = now;
3126         inode->i_ctime = now;
3127         trans = btrfs_start_transaction(root, 1);
3128         if (IS_ERR(trans)) {
3129             ret = PTR_ERR(trans);
3130         } else {
3131             int ret2;
3132 
3133             ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
3134             ret2 = btrfs_end_transaction(trans);
3135             if (!ret)
3136                 ret = ret2;
3137         }
3138     }
3139     btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
3140     return ret;
3141 }
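
The function above backs the FALLOC_FL_PUNCH_HOLE branch of btrfs_fallocate()
further below: unaligned head and tail parts of the range are zeroed in place
with btrfs_truncate_block(), while the block-aligned middle is dropped and
replaced by a hole. A minimal userspace sketch of driving this path (the
offsets, lengths and helper name are hypothetical; per fallocate(2),
PUNCH_HOLE must be combined with KEEP_SIZE):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

/* Punch a 1 MiB hole at offset 4 KiB; the file size is left unchanged. */
static int punch_hole_example(int fd)
{
	return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			 4096, 1 << 20);
}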
3142 
3143 /* Helper structure to record which range is already reserved */
3144 struct falloc_range {
3145     struct list_head list;
3146     u64 start;
3147     u64 len;
3148 };
3149 
3150 /*
3151  * Helper function to add falloc range
3152  *
3153  * Caller should have locked the larger extent range containing
3154  * [start, len)
3155  */
3156 static int add_falloc_range(struct list_head *head, u64 start, u64 len)
3157 {
3158     struct falloc_range *range = NULL;
3159 
3160     if (!list_empty(head)) {
3161         /*
3162          * As fallocate iterates by bytenr order, we only need to check
3163          * the last range.
3164          */
3165         range = list_last_entry(head, struct falloc_range, list);
3166         if (range->start + range->len == start) {
3167             range->len += len;
3168             return 0;
3169         }
3170     }
3171 
3172     range = kmalloc(sizeof(*range), GFP_KERNEL);
3173     if (!range)
3174         return -ENOMEM;
3175     range->start = start;
3176     range->len = len;
3177     list_add_tail(&range->list, head);
3178     return 0;
3179 }
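
A minimal sketch of the coalescing behaviour described above, assuming the
in-file context where add_falloc_range() is visible (illustration only, the
helper name falloc_range_merge_example is hypothetical and return values are
ignored): contiguous ranges extend the last entry on the list, while a
disjoint range starts a new one.

static void falloc_range_merge_example(void)
{
	LIST_HEAD(reserve_list);

	add_falloc_range(&reserve_list, 0, 4096);	/* new entry [0, 4096) */
	add_falloc_range(&reserve_list, 4096, 4096);	/* merged into [0, 8192) */
	add_falloc_range(&reserve_list, 16384, 4096);	/* new entry [16384, 20480) */
}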
3180 
3181 static int btrfs_fallocate_update_isize(struct inode *inode,
3182                     const u64 end,
3183                     const int mode)
3184 {
3185     struct btrfs_trans_handle *trans;
3186     struct btrfs_root *root = BTRFS_I(inode)->root;
3187     int ret;
3188     int ret2;
3189 
3190     if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
3191         return 0;
3192 
3193     trans = btrfs_start_transaction(root, 1);
3194     if (IS_ERR(trans))
3195         return PTR_ERR(trans);
3196 
3197     inode->i_ctime = current_time(inode);
3198     i_size_write(inode, end);
3199     btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
3200     ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
3201     ret2 = btrfs_end_transaction(trans);
3202 
3203     return ret ? ret : ret2;
3204 }
3205 
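/*
 * Classification of the block that an unaligned zero-range boundary falls
 * into, used by btrfs_zero_range() below: a hole means the allocation range
 * is widened to cover the boundary block, a written extent means the
 * unaligned part is zeroed in place with btrfs_truncate_block(), and a
 * prealloc extent already reads back as zeroes so nothing needs to be done.
 */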
3206 enum {
3207     RANGE_BOUNDARY_WRITTEN_EXTENT,
3208     RANGE_BOUNDARY_PREALLOC_EXTENT,
3209     RANGE_BOUNDARY_HOLE,
3210 };
3211 
3212 static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
3213                          u64 offset)
3214 {
3215     const u64 sectorsize = btrfs_inode_sectorsize(inode);
3216     struct extent_map *em;
3217     int ret;
3218 
3219     offset = round_down(offset, sectorsize);
3220     em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
3221     if (IS_ERR(em))
3222         return PTR_ERR(em);
3223 
3224     if (em->block_start == EXTENT_MAP_HOLE)
3225         ret = RANGE_BOUNDARY_HOLE;
3226     else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3227         ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
3228     else
3229         ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
3230 
3231     free_extent_map(em);
3232     return ret;
3233 }
3234 
3235 static int btrfs_zero_range(struct inode *inode,
3236                 loff_t offset,
3237                 loff_t len,
3238                 const int mode)
3239 {
3240     struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
3241     struct extent_map *em;
3242     struct extent_changeset *data_reserved = NULL;
3243     int ret;
3244     u64 alloc_hint = 0;
3245     const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
3246     u64 alloc_start = round_down(offset, sectorsize);
3247     u64 alloc_end = round_up(offset + len, sectorsize);
3248     u64 bytes_to_reserve = 0;
3249     bool space_reserved = false;
3250 
3251     em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
3252                   alloc_end - alloc_start);
3253     if (IS_ERR(em)) {
3254         ret = PTR_ERR(em);
3255         goto out;
3256     }
3257 
3258     /*
3259      * Avoid hole punching and extent allocation for some cases. More cases
3260      * could be considered, but these are unlikely to be common and we keep things
3261      * as simple as possible for now. Also, intentionally, if the target
3262      * range contains one or more prealloc extents together with regular
3263      * extents and holes, we drop all the existing extents and allocate a
3264      * new prealloc extent, so that we get a larger contiguous disk extent.
3265      */
3266     if (em->start <= alloc_start &&
3267         test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3268         const u64 em_end = em->start + em->len;
3269 
3270         if (em_end >= offset + len) {
3271             /*
3272              * The whole range is already a prealloc extent,
3273              * do nothing except updating the inode's i_size if
3274              * needed.
3275              */
3276             free_extent_map(em);
3277             ret = btrfs_fallocate_update_isize(inode, offset + len,
3278                                mode);
3279             goto out;
3280         }
3281         /*
3282          * Part of the range is already a prealloc extent, so operate
3283          * only on the remaining part of the range.
3284          */
3285         alloc_start = em_end;
3286         ASSERT(IS_ALIGNED(alloc_start, sectorsize));
3287         len = offset + len - alloc_start;
3288         offset = alloc_start;
3289         alloc_hint = em->block_start + em->len;
3290     }
3291     free_extent_map(em);
3292 
3293     if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
3294         BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
3295         em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
3296                       sectorsize);
3297         if (IS_ERR(em)) {
3298             ret = PTR_ERR(em);
3299             goto out;
3300         }
3301 
3302         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3303             free_extent_map(em);
3304             ret = btrfs_fallocate_update_isize(inode, offset + len,
3305                                mode);
3306             goto out;
3307         }
3308         if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
3309             free_extent_map(em);
3310             ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
3311                            0);
3312             if (!ret)
3313                 ret = btrfs_fallocate_update_isize(inode,
3314                                    offset + len,
3315                                    mode);
3316             return ret;
3317         }
3318         free_extent_map(em);
3319         alloc_start = round_down(offset, sectorsize);
3320         alloc_end = alloc_start + sectorsize;
3321         goto reserve_space;
3322     }
3323 
3324     alloc_start = round_up(offset, sectorsize);
3325     alloc_end = round_down(offset + len, sectorsize);
3326 
3327     /*
3328      * For unaligned ranges, check the pages at the boundaries, they might
3329      * map to an extent, in which case we need to partially zero them, or
3330      * they might map to a hole, in which case we need our allocation range
3331      * to cover them.
3332      */
3333     if (!IS_ALIGNED(offset, sectorsize)) {
3334         ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3335                                 offset);
3336         if (ret < 0)
3337             goto out;
3338         if (ret == RANGE_BOUNDARY_HOLE) {
3339             alloc_start = round_down(offset, sectorsize);
3340             ret = 0;
3341         } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3342             ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
3343             if (ret)
3344                 goto out;
3345         } else {
3346             ret = 0;
3347         }
3348     }
3349 
3350     if (!IS_ALIGNED(offset + len, sectorsize)) {
3351         ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3352                                 offset + len);
3353         if (ret < 0)
3354             goto out;
3355         if (ret == RANGE_BOUNDARY_HOLE) {
3356             alloc_end = round_up(offset + len, sectorsize);
3357             ret = 0;
3358         } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3359             ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
3360                            0, 1);
3361             if (ret)
3362                 goto out;
3363         } else {
3364             ret = 0;
3365         }
3366     }
3367 
3368 reserve_space:
3369     if (alloc_start < alloc_end) {
3370         struct extent_state *cached_state = NULL;
3371         const u64 lockstart = alloc_start;
3372         const u64 lockend = alloc_end - 1;
3373 
3374         bytes_to_reserve = alloc_end - alloc_start;
3375         ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3376                               bytes_to_reserve);
3377         if (ret < 0)
3378             goto out;
3379         space_reserved = true;
3380         btrfs_punch_hole_lock_range(inode, lockstart, lockend,
3381                         &cached_state);
3382         ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
3383                         alloc_start, bytes_to_reserve);
3384         if (ret) {
3385             unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
3386                          lockend, &cached_state);
3387             goto out;
3388         }
3389         ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
3390                         alloc_end - alloc_start,
3391                         i_blocksize(inode),
3392                         offset + len, &alloc_hint);
3393         unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
3394                      lockend, &cached_state);
3395         /* btrfs_prealloc_file_range releases reserved space on error */
3396         if (ret) {
3397             space_reserved = false;
3398             goto out;
3399         }
3400     }
3401     ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
3402  out:
3403     if (ret && space_reserved)
3404         btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3405                            alloc_start, bytes_to_reserve);
3406     extent_changeset_free(data_reserved);
3407 
3408     return ret;
3409 }
3410 
3411 static long btrfs_fallocate(struct file *file, int mode,
3412                 loff_t offset, loff_t len)
3413 {
3414     struct inode *inode = file_inode(file);
3415     struct extent_state *cached_state = NULL;
3416     struct extent_changeset *data_reserved = NULL;
3417     struct falloc_range *range;
3418     struct falloc_range *tmp;
3419     struct list_head reserve_list;
3420     u64 cur_offset;
3421     u64 last_byte;
3422     u64 alloc_start;
3423     u64 alloc_end;
3424     u64 alloc_hint = 0;
3425     u64 locked_end;
3426     u64 actual_end = 0;
3427     u64 data_space_needed = 0;
3428     u64 data_space_reserved = 0;
3429     u64 qgroup_reserved = 0;
3430     struct extent_map *em;
3431     int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
3432     int ret;
3433 
3434     /* Do not allow fallocate in ZONED mode */
3435     if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
3436         return -EOPNOTSUPP;
3437 
3438     alloc_start = round_down(offset, blocksize);
3439     alloc_end = round_up(offset + len, blocksize);
3440     cur_offset = alloc_start;
3441 
3442     /* Make sure we aren't being given some invalid mode. */
3443     if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
3444              FALLOC_FL_ZERO_RANGE))
3445         return -EOPNOTSUPP;
3446 
3447     if (mode & FALLOC_FL_PUNCH_HOLE)
3448         return btrfs_punch_hole(file, offset, len);
3449 
3450     btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
3451 
3452     if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
3453         ret = inode_newsize_ok(inode, offset + len);
3454         if (ret)
3455             goto out;
3456     }
3457 
3458     ret = file_modified(file);
3459     if (ret)
3460         goto out;
3461 
3462     /*
3463      * TODO: Move these two operations after we have checked
3464      * accurate reserved space, or fallocate can still fail but
3465      * leave the page truncated or the size expanded.
3466      *
3467      * But that's a minor problem and won't do much harm.
3468      */
3469     if (alloc_start > inode->i_size) {
3470         ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
3471                     alloc_start);
3472         if (ret)
3473             goto out;
3474     } else if (offset + len > inode->i_size) {
3475         /*
3476          * If we are fallocating from the end of the file onward we
3477          * need to zero out the end of the block if i_size lands in the
3478          * middle of a block.
3479          */
3480         ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
3481         if (ret)
3482             goto out;
3483     }
3484 
3485     /*
3486      * We have locked the inode at the VFS level (in exclusive mode) and we
3487      * have locked the i_mmap_lock lock (in exclusive mode). Now before
3488      * locking the file range, flush all delalloc in the range and wait for
3489      * all ordered extents in the range to complete. After this we can lock
3490      * the file range and, due to the previous locking we did, we know there
3491      * can't be more delalloc or ordered extents in the range.
3492      */
3493     ret = btrfs_wait_ordered_range(inode, alloc_start,
3494                        alloc_end - alloc_start);
3495     if (ret)
3496         goto out;
3497 
3498     if (mode & FALLOC_FL_ZERO_RANGE) {
3499         ret = btrfs_zero_range(inode, offset, len, mode);
3500         btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
3501         return ret;
3502     }
3503 
3504     locked_end = alloc_end - 1;
3505     lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3506              &cached_state);
3507 
3508     btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
3509 
3510     /* First, check if we exceed the qgroup limit */
3511     INIT_LIST_HEAD(&reserve_list);
3512     while (cur_offset < alloc_end) {
3513         em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
3514                       alloc_end - cur_offset);
3515         if (IS_ERR(em)) {
3516             ret = PTR_ERR(em);
3517             break;
3518         }
3519         last_byte = min(extent_map_end(em), alloc_end);
3520         actual_end = min_t(u64, extent_map_end(em), offset + len);
3521         last_byte = ALIGN(last_byte, blocksize);
3522         if (em->block_start == EXTENT_MAP_HOLE ||
3523             (cur_offset >= inode->i_size &&
3524              !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
3525             const u64 range_len = last_byte - cur_offset;
3526 
3527             ret = add_falloc_range(&reserve_list, cur_offset, range_len);
3528             if (ret < 0) {
3529                 free_extent_map(em);
3530                 break;
3531             }
3532             ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
3533                     &data_reserved, cur_offset, range_len);
3534             if (ret < 0) {
3535                 free_extent_map(em);
3536                 break;
3537             }
3538             qgroup_reserved += range_len;
3539             data_space_needed += range_len;
3540         }
3541         free_extent_map(em);
3542         cur_offset = last_byte;
3543     }
3544 
3545     if (!ret && data_space_needed > 0) {
3546         /*
3547          * We are safe to reserve space here as we can't have delalloc
3548          * in the range, see above.
3549          */
3550         ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3551                               data_space_needed);
3552         if (!ret)
3553             data_space_reserved = data_space_needed;
3554     }
3555 
3556     /*
3557      * If ret is still 0, it means we're OK to fallocate.
3558      * Otherwise, just clean up the list and exit.
3559      */
3560     list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3561         if (!ret) {
3562             ret = btrfs_prealloc_file_range(inode, mode,
3563                     range->start,
3564                     range->len, i_blocksize(inode),
3565                     offset + len, &alloc_hint);
3566             /*
3567              * btrfs_prealloc_file_range() releases space even
3568              * if it returns an error.
3569              */
3570             data_space_reserved -= range->len;
3571             qgroup_reserved -= range->len;
3572         } else if (data_space_reserved > 0) {
3573             btrfs_free_reserved_data_space(BTRFS_I(inode),
3574                            data_reserved, range->start,
3575                            range->len);
3576             data_space_reserved -= range->len;
3577             qgroup_reserved -= range->len;
3578         } else if (qgroup_reserved > 0) {
3579             btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
3580                            range->start, range->len);
3581             qgroup_reserved -= range->len;
3582         }
3583         list_del(&range->list);
3584         kfree(range);
3585     }
3586     if (ret < 0)
3587         goto out_unlock;
3588 
3589     /*
3590      * We didn't need to allocate any more space, but we still extended the
3591      * size of the file so we need to update i_size and the inode item.
3592      */
3593     ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
3594 out_unlock:
3595     unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3596                  &cached_state);
3597 out:
3598     btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
3599     extent_changeset_free(data_reserved);
3600     return ret;
3601 }
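
A short userspace sketch of the remaining fallocate modes handled above
(offsets, lengths and the helper name are hypothetical): a plain
preallocation extends i_size unless FALLOC_FL_KEEP_SIZE is given,
FALLOC_FL_ZERO_RANGE is dispatched to btrfs_zero_range(), and on zoned
filesystems every mode fails with -EOPNOTSUPP.

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

static int fallocate_modes_example(int fd)
{
	int ret;

	/* Preallocate 8 MiB at offset 0, extending i_size if needed. */
	ret = fallocate(fd, 0, 0, 8 << 20);
	if (ret)
		return ret;

	/* Zero a 64 KiB range without changing the file size. */
	return fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
			 1 << 20, 64 << 10);
}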
3602 
3603 static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
3604                   int whence)
3605 {
3606     struct btrfs_fs_info *fs_info = inode->root->fs_info;
3607     struct extent_map *em = NULL;
3608     struct extent_state *cached_state = NULL;
3609     loff_t i_size = inode->vfs_inode.i_size;
3610     u64 lockstart;
3611     u64 lockend;
3612     u64 start;
3613     u64 len;
3614     int ret = 0;
3615 
3616     if (i_size == 0 || offset >= i_size)
3617         return -ENXIO;
3618 
3619     /*
3620      * offset can be negative; in this case we start finding DATA/HOLE from
3621      * the very start of the file.
3622      */
3623     start = max_t(loff_t, 0, offset);
3624 
3625     lockstart = round_down(start, fs_info->sectorsize);
3626     lockend = round_up(i_size, fs_info->sectorsize);
3627     if (lockend <= lockstart)
3628         lockend = lockstart + fs_info->sectorsize;
3629     lockend--;
3630     len = lockend - lockstart + 1;
3631 
3632     lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state);
3633 
3634     while (start < i_size) {
3635         em = btrfs_get_extent_fiemap(inode, start, len);
3636         if (IS_ERR(em)) {
3637             ret = PTR_ERR(em);
3638             em = NULL;
3639             break;
3640         }
3641 
3642         if (whence == SEEK_HOLE &&
3643             (em->block_start == EXTENT_MAP_HOLE ||
3644              test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
3645             break;
3646         else if (whence == SEEK_DATA &&
3647                (em->block_start != EXTENT_MAP_HOLE &&
3648                 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
3649             break;
3650 
3651         start = em->start + em->len;
3652         free_extent_map(em);
3653         em = NULL;
3654         cond_resched();
3655     }
3656     free_extent_map(em);
3657     unlock_extent_cached(&inode->io_tree, lockstart, lockend,
3658                  &cached_state);
3659     if (ret) {
3660         offset = ret;
3661     } else {
3662         if (whence == SEEK_DATA && start >= i_size)
3663             offset = -ENXIO;
3664         else
3665             offset = min_t(loff_t, start, i_size);
3666     }
3667 
3668     return offset;
3669 }
3670 
3671 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3672 {
3673     struct inode *inode = file->f_mapping->host;
3674 
3675     switch (whence) {
3676     default:
3677         return generic_file_llseek(file, offset, whence);
3678     case SEEK_DATA:
3679     case SEEK_HOLE:
3680         btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
3681         offset = find_desired_extent(BTRFS_I(inode), offset, whence);
3682         btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
3683         break;
3684     }
3685 
3686     if (offset < 0)
3687         return offset;
3688 
3689     return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3690 }
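
Userspace view of the SEEK_DATA/SEEK_HOLE path above (an illustrative sketch;
the helper name is hypothetical): lseek() returns the next offset with the
requested property, or fails with ENXIO once the requested offset is at or
beyond i_size, mirroring find_desired_extent(). As in the loop above,
prealloc extents are treated like holes.

#define _GNU_SOURCE
#include <unistd.h>

/* Walk a sparse file; each iteration finds one [data, hole) data region. */
static int walk_data_regions_example(int fd)
{
	off_t data = 0, hole;

	while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
		hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			return -1;
		/* [data, hole) is a data region; process it here. */
		data = hole;
	}
	return 0;	/* the loop ends with errno == ENXIO at EOF */
}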
3691 
3692 static int btrfs_file_open(struct inode *inode, struct file *filp)
3693 {
3694     int ret;
3695 
3696     filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
3697 
3698     ret = fsverity_file_open(inode, filp);
3699     if (ret)
3700         return ret;
3701     return generic_file_open(inode, filp);
3702 }
3703 
3704 static int check_direct_read(struct btrfs_fs_info *fs_info,
3705                  const struct iov_iter *iter, loff_t offset)
3706 {
3707     int ret;
3708     int i, seg;
3709 
3710     ret = check_direct_IO(fs_info, iter, offset);
3711     if (ret < 0)
3712         return ret;
3713 
3714     if (!iter_is_iovec(iter))
3715         return 0;
3716 
3717     for (seg = 0; seg < iter->nr_segs; seg++)
3718         for (i = seg + 1; i < iter->nr_segs; i++)
3719             if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
3720                 return -EINVAL;
3721     return 0;
3722 }
3723 
3724 static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
3725 {
3726     struct inode *inode = file_inode(iocb->ki_filp);
3727     size_t prev_left = 0;
3728     ssize_t read = 0;
3729     ssize_t ret;
3730 
3731     if (fsverity_active(inode))
3732         return 0;
3733 
3734     if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
3735         return 0;
3736 
3737     btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
3738 again:
3739     /*
3740      * This is similar to what we do for direct IO writes, see the comment
3741      * at btrfs_direct_write(), but here we also disable page faults in
3742      * addition to disabling them at the iov_iter level. This is because when
3743      * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
3744      * which can still trigger page fault-ins despite having set ->nofault
3745      * to true on our 'to' iov_iter.
3746      *
3747      * The difference from direct IO writes is that we deadlock when trying
3748      * to lock the extent range in the inode's tree during the page reads
3749      * triggered by the fault in (while for writes it is due to waiting for
3750      * our own ordered extent). This is because for direct IO reads,
3751      * btrfs_dio_iomap_begin() returns with the extent range locked, which
3752      * is only unlocked in the endio callback (end_bio_extent_readpage()).
3753      */
3754     pagefault_disable();
3755     to->nofault = true;
3756     ret = btrfs_dio_rw(iocb, to, read);
3757     to->nofault = false;
3758     pagefault_enable();
3759 
3760     /* No increment (+=) because iomap returns a cumulative value. */
3761     if (ret > 0)
3762         read = ret;
3763 
3764     if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
3765         const size_t left = iov_iter_count(to);
3766 
3767         if (left == prev_left) {
3768             /*
3769              * We didn't make any progress since the last attempt,
3770              * fall back to a buffered read for the remainder of the
3771              * range. This is just to avoid any possibility of looping
3772              * for too long.
3773              */
3774             ret = read;
3775         } else {
3776             /*
3777              * We made some progress since the last retry or this is
3778              * the first time we are retrying. Fault in as many pages
3779              * as possible and retry.
3780              */
3781             fault_in_iov_iter_writeable(to, left);
3782             prev_left = left;
3783             goto again;
3784         }
3785     }
3786     btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
3787     return ret < 0 ? ret : read;
3788 }
3789 
3790 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3791 {
3792     ssize_t ret = 0;
3793 
3794     if (iocb->ki_flags & IOCB_DIRECT) {
3795         ret = btrfs_direct_read(iocb, to);
3796         if (ret < 0 || !iov_iter_count(to) ||
3797             iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3798             return ret;
3799     }
3800 
3801     return filemap_read(iocb, to, ret);
3802 }
3803 
3804 const struct file_operations btrfs_file_operations = {
3805     .llseek     = btrfs_file_llseek,
3806     .read_iter      = btrfs_file_read_iter,
3807     .splice_read    = generic_file_splice_read,
3808     .write_iter = btrfs_file_write_iter,
3809     .splice_write   = iter_file_splice_write,
3810     .mmap       = btrfs_file_mmap,
3811     .open       = btrfs_file_open,
3812     .release    = btrfs_release_file,
3813     .fsync      = btrfs_sync_file,
3814     .fallocate  = btrfs_fallocate,
3815     .unlocked_ioctl = btrfs_ioctl,
3816 #ifdef CONFIG_COMPAT
3817     .compat_ioctl   = btrfs_compat_ioctl,
3818 #endif
3819     .remap_file_range = btrfs_remap_file_range,
3820 };
3821 
3822 void __cold btrfs_auto_defrag_exit(void)
3823 {
3824     kmem_cache_destroy(btrfs_inode_defrag_cachep);
3825 }
3826 
3827 int __init btrfs_auto_defrag_init(void)
3828 {
3829     btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
3830                     sizeof(struct inode_defrag), 0,
3831                     SLAB_MEM_SPREAD,
3832                     NULL);
3833     if (!btrfs_inode_defrag_cachep)
3834         return -ENOMEM;
3835 
3836     return 0;
3837 }
3838 
3839 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
3840 {
3841     int ret;
3842 
3843     /*
3844      * So with compression we will find and lock a dirty page and clear the
3845      * first one as dirty, set up an async extent, and immediately return
3846      * with the entire range locked but with nobody actually marked with
3847      * writeback.  So we can't just filemap_write_and_wait_range() and
3848      * expect it to work since it will just kick off a thread to do the
3849      * actual work.  So we need to call filemap_fdatawrite_range _again_
3850      * since it will wait on the page lock, which won't be unlocked until
3851      * after the pages have been marked as writeback and so we're good to go
3852      * from there.  We have to do this otherwise we'll miss the ordered
3853      * extents and that results in badness.  Please Josef, do not think you
3854      * know better and pull this out at some point in the future, it is
3855      * right and you are wrong.
3856      */
3857     ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3858     if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
3859                  &BTRFS_I(inode)->runtime_flags))
3860         ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3861 
3862     return ret;
3863 }