/* fs/xfs/xfs_reflink.c — source listing (LXR navigation header removed). */
0001 // SPDX-License-Identifier: GPL-2.0+
0002 /*
0003  * Copyright (C) 2016 Oracle.  All Rights Reserved.
0004  * Author: Darrick J. Wong <darrick.wong@oracle.com>
0005  */
0006 #include "xfs.h"
0007 #include "xfs_fs.h"
0008 #include "xfs_shared.h"
0009 #include "xfs_format.h"
0010 #include "xfs_log_format.h"
0011 #include "xfs_trans_resv.h"
0012 #include "xfs_mount.h"
0013 #include "xfs_defer.h"
0014 #include "xfs_inode.h"
0015 #include "xfs_trans.h"
0016 #include "xfs_bmap.h"
0017 #include "xfs_bmap_util.h"
0018 #include "xfs_trace.h"
0019 #include "xfs_icache.h"
0020 #include "xfs_btree.h"
0021 #include "xfs_refcount_btree.h"
0022 #include "xfs_refcount.h"
0023 #include "xfs_bmap_btree.h"
0024 #include "xfs_trans_space.h"
0025 #include "xfs_bit.h"
0026 #include "xfs_alloc.h"
0027 #include "xfs_quota.h"
0028 #include "xfs_reflink.h"
0029 #include "xfs_iomap.h"
0030 #include "xfs_ag.h"
0031 #include "xfs_ag_resv.h"
0032 
0033 /*
0034  * Copy on Write of Shared Blocks
0035  *
0036  * XFS must preserve "the usual" file semantics even when two files share
0037  * the same physical blocks.  This means that a write to one file must not
0038  * alter the blocks in a different file; the way that we'll do that is
0039  * through the use of a copy-on-write mechanism.  At a high level, that
0040  * means that when we want to write to a shared block, we allocate a new
0041  * block, write the data to the new block, and if that succeeds we map the
0042  * new block into the file.
0043  *
0044  * XFS provides a "delayed allocation" mechanism that defers the allocation
0045  * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
0046  * possible.  This reduces fragmentation by enabling the filesystem to ask
0047  * for bigger chunks less often, which is exactly what we want for CoW.
0048  *
0049  * The delalloc mechanism begins when the kernel wants to make a block
0050  * writable (write_begin or page_mkwrite).  If the offset is not mapped, we
0051  * create a delalloc mapping, which is a regular in-core extent, but without
0052  * a real startblock.  (For delalloc mappings, the startblock encodes both
0053  * a flag that this is a delalloc mapping, and a worst-case estimate of how
0054  * many blocks might be required to put the mapping into the BMBT.)  delalloc
0055  * mappings are a reservation against the free space in the filesystem;
0056  * adjacent mappings can also be combined into fewer larger mappings.
0057  *
0058  * As an optimization, the CoW extent size hint (cowextsz) creates
0059  * outsized aligned delalloc reservations in the hope of landing out of
0060  * order nearby CoW writes in a single extent on disk, thereby reducing
0061  * fragmentation and improving future performance.
0062  *
0063  * D: --RRRRRRSSSRRRRRRRR--- (data fork)
0064  * C: ------DDDDDDD--------- (CoW fork)
0065  *
0066  * When dirty pages are being written out (typically in writepage), the
0067  * delalloc reservations are converted into unwritten mappings by
0068  * allocating blocks and replacing the delalloc mapping with real ones.
0069  * A delalloc mapping can be replaced by several unwritten ones if the
0070  * free space is fragmented.
0071  *
0072  * D: --RRRRRRSSSRRRRRRRR---
0073  * C: ------UUUUUUU---------
0074  *
0075  * We want to adapt the delalloc mechanism for copy-on-write, since the
0076  * write paths are similar.  The first two steps (creating the reservation
0077  * and allocating the blocks) are exactly the same as delalloc except that
0078  * the mappings must be stored in a separate CoW fork because we do not want
0079  * to disturb the mapping in the data fork until we're sure that the write
0080  * succeeded.  IO completion in this case is the process of removing the old
0081  * mapping from the data fork and moving the new mapping from the CoW fork to
0082  * the data fork.  This will be discussed shortly.
0083  *
0084  * For now, unaligned directio writes will be bounced back to the page cache.
0085  * Block-aligned directio writes will use the same mechanism as buffered
0086  * writes.
0087  *
0088  * Just prior to submitting the actual disk write requests, we convert
0089  * the extents representing the range of the file actually being written
0090  * (as opposed to extra pieces created for the cowextsize hint) to real
0091  * extents.  This will become important in the next step:
0092  *
0093  * D: --RRRRRRSSSRRRRRRRR---
0094  * C: ------UUrrUUU---------
0095  *
0096  * CoW remapping must be done after the data block write completes,
0097  * because we don't want to destroy the old data fork map until we're sure
0098  * the new block has been written.  Since the new mappings are kept in a
0099  * separate fork, we can simply iterate these mappings to find the ones
0100  * that cover the file blocks that we just CoW'd.  For each extent, simply
0101  * unmap the corresponding range in the data fork, map the new range into
0102  * the data fork, and remove the extent from the CoW fork.  Because of
0103  * the presence of the cowextsize hint, however, we must be careful
0104  * only to remap the blocks that we've actually written out --  we must
0105  * never remap delalloc reservations nor CoW staging blocks that have
0106  * yet to be written.  This corresponds exactly to the real extents in
0107  * the CoW fork:
0108  *
0109  * D: --RRRRRRrrSRRRRRRRR---
0110  * C: ------UU--UUU---------
0111  *
0112  * Since the remapping operation can be applied to an arbitrary file
0113  * range, we record the need for the remap step as a flag in the ioend
0114  * instead of declaring a new IO type.  This is required for direct io
0115  * because we only have ioend for the whole dio, and we have to be able to
0116  * remember the presence of unwritten blocks and CoW blocks with a single
0117  * ioend structure.  Better yet, the more ground we can cover with one
0118  * ioend, the better.
0119  */
0120 
0121 /*
0122  * Given an AG extent, find the lowest-numbered run of shared blocks
0123  * within that range and return the range in fbno/flen.  If
0124  * find_end_of_shared is true, return the longest contiguous extent of
0125  * shared blocks.  If there are no shared extents, fbno and flen will
0126  * be set to NULLAGBLOCK and 0, respectively.
0127  */
0128 static int
0129 xfs_reflink_find_shared(
0130     struct xfs_perag    *pag,
0131     struct xfs_trans    *tp,
0132     xfs_agblock_t       agbno,
0133     xfs_extlen_t        aglen,
0134     xfs_agblock_t       *fbno,
0135     xfs_extlen_t        *flen,
0136     bool            find_end_of_shared)
0137 {
0138     struct xfs_buf      *agbp;
0139     struct xfs_btree_cur    *cur;
0140     int         error;
0141 
0142     error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
0143     if (error)
0144         return error;
0145 
0146     cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag);
0147 
0148     error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
0149             find_end_of_shared);
0150 
0151     xfs_btree_del_cursor(cur, error);
0152 
0153     xfs_trans_brelse(tp, agbp);
0154     return error;
0155 }
0156 
/*
 * Trim the mapping to the next block where there's a change in the
 * shared/unshared status.  More specifically, this means that we
 * find the lowest-numbered extent of shared blocks that coincides with
 * the given block mapping.  If the shared extent overlaps the start of
 * the mapping, trim the mapping to the end of the shared extent.  If
 * the shared region intersects the mapping, trim the mapping to the
 * start of the shared extent.  If there are no shared regions that
 * overlap, just return the original extent.
 *
 * On return, *shared is true iff the (possibly trimmed) mapping is
 * entirely shared.
 */
int
xfs_reflink_trim_around_shared(
    struct xfs_inode    *ip,
    struct xfs_bmbt_irec    *irec,
    bool            *shared)
{
    struct xfs_mount    *mp = ip->i_mount;
    struct xfs_perag    *pag;
    xfs_agblock_t       agbno;
    xfs_extlen_t        aglen;
    xfs_agblock_t       fbno;
    xfs_extlen_t        flen;
    int         error = 0;

    /* Holes, unwritten, and delalloc extents cannot be shared */
    if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
        *shared = false;
        return 0;
    }

    trace_xfs_reflink_trim_around_shared(ip, irec);

    /* Convert the mapping to AG coordinates for the refcountbt lookup. */
    pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock));
    agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
    aglen = irec->br_blockcount;

    /* No transaction here: an untracked (NULL tp) AGF read is fine. */
    error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen,
            true);
    xfs_perag_put(pag);
    if (error)
        return error;

    *shared = false;
    if (fbno == NULLAGBLOCK) {
        /* No shared blocks at all. */
        return 0;
    } else if (fbno == agbno) {
        /*
         * The start of this extent is shared.  Truncate the
         * mapping at the end of the shared region so that a
         * subsequent iteration starts at the start of the
         * unshared region.
         */
        irec->br_blockcount = flen;
        *shared = true;
        return 0;
    } else {
        /*
         * There's a shared extent midway through this extent.
         * Truncate the mapping at the start of the shared
         * extent so that a subsequent iteration starts at the
         * start of the shared region.
         */
        irec->br_blockcount = fbno - agbno;
        return 0;
    }
}
0224 
0225 int
0226 xfs_bmap_trim_cow(
0227     struct xfs_inode    *ip,
0228     struct xfs_bmbt_irec    *imap,
0229     bool            *shared)
0230 {
0231     /* We can't update any real extents in always COW mode. */
0232     if (xfs_is_always_cow_inode(ip) &&
0233         !isnullstartblock(imap->br_startblock)) {
0234         *shared = true;
0235         return 0;
0236     }
0237 
0238     /* Trim the mapping to the nearest shared extent boundary. */
0239     return xfs_reflink_trim_around_shared(ip, imap, shared);
0240 }
0241 
/*
 * Convert unwritten COW fork extents within the given range of blocks to
 * normal (written) extents.  Returns 0 if there was nothing to convert,
 * otherwise the first error from the bmap update.  Caller must hold the
 * ILOCK.
 */
static int
xfs_reflink_convert_cow_locked(
    struct xfs_inode    *ip,
    xfs_fileoff_t       offset_fsb,
    xfs_filblks_t       count_fsb)
{
    struct xfs_iext_cursor  icur;
    struct xfs_bmbt_irec    got;
    struct xfs_btree_cur    *dummy_cur = NULL;
    int         dummy_logflags;
    int         error = 0;

    /* No COW fork extent at or after offset_fsb means nothing to do. */
    if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
        return 0;

    do {
        /* Stop once we walk past the end of the requested range. */
        if (got.br_startoff >= offset_fsb + count_fsb)
            break;
        /* Already written; advance to the next extent. */
        if (got.br_state == XFS_EXT_NORM)
            continue;
        /* A delalloc extent here is unexpected; warn and bail. */
        if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
            return -EIO;

        /* Clip the extent to the conversion range. */
        xfs_trim_extent(&got, offset_fsb, count_fsb);
        if (!got.br_blockcount)
            continue;

        got.br_state = XFS_EXT_NORM;
        /* No transaction or btree cursor: this is an in-core update. */
        error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
                XFS_COW_FORK, &icur, &dummy_cur, &got,
                &dummy_logflags);
        if (error)
            return error;
    } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));

    return error;
}
0279 
0280 /* Convert all of the unwritten CoW extents in a file's range to real ones. */
0281 int
0282 xfs_reflink_convert_cow(
0283     struct xfs_inode    *ip,
0284     xfs_off_t       offset,
0285     xfs_off_t       count)
0286 {
0287     struct xfs_mount    *mp = ip->i_mount;
0288     xfs_fileoff_t       offset_fsb = XFS_B_TO_FSBT(mp, offset);
0289     xfs_fileoff_t       end_fsb = XFS_B_TO_FSB(mp, offset + count);
0290     xfs_filblks_t       count_fsb = end_fsb - offset_fsb;
0291     int         error;
0292 
0293     ASSERT(count != 0);
0294 
0295     xfs_ilock(ip, XFS_ILOCK_EXCL);
0296     error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
0297     xfs_iunlock(ip, XFS_ILOCK_EXCL);
0298     return error;
0299 }
0300 
/*
 * Find the extent that maps the given range in the COW fork. Even if the extent
 * is not shared we might have a preallocation for it in the COW fork. If so we
 * use that rather than trigger a new allocation.
 *
 * On return:
 *  - *found true: a real COW fork extent covers the start of the range;
 *    cmap is trimmed to the overlap and no allocation is needed.
 *  - *found false: imap has been trimmed to the leading part that must be
 *    handled next, and *shared reports whether it needs COW treatment
 *    (a COW fork delalloc reservation also counts as shared).
 */
static int
xfs_find_trim_cow_extent(
    struct xfs_inode    *ip,
    struct xfs_bmbt_irec    *imap,
    struct xfs_bmbt_irec    *cmap,
    bool            *shared,
    bool            *found)
{
    xfs_fileoff_t       offset_fsb = imap->br_startoff;
    xfs_filblks_t       count_fsb = imap->br_blockcount;
    struct xfs_iext_cursor  icur;

    *found = false;

    /*
     * If we don't find an overlapping extent, trim the range we need to
     * allocate to fit the hole we found.
     */
    if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, cmap))
        cmap->br_startoff = offset_fsb + count_fsb;
    if (cmap->br_startoff > offset_fsb) {
        /* COW fork hole at the start: clip imap to the hole and ask the
         * refcount btree whether that part of the data fork is shared. */
        xfs_trim_extent(imap, imap->br_startoff,
                cmap->br_startoff - imap->br_startoff);
        return xfs_bmap_trim_cow(ip, imap, shared);
    }

    *shared = true;
    if (isnullstartblock(cmap->br_startblock)) {
        /* Delalloc reservation covers the start; clip imap to it. */
        xfs_trim_extent(imap, cmap->br_startoff, cmap->br_blockcount);
        return 0;
    }

    /* real extent found - no need to allocate */
    xfs_trim_extent(cmap, offset_fsb, count_fsb);
    *found = true;
    return 0;
}
0343 
0344 static int
0345 xfs_reflink_convert_unwritten(
0346     struct xfs_inode    *ip,
0347     struct xfs_bmbt_irec    *imap,
0348     struct xfs_bmbt_irec    *cmap,
0349     bool            convert_now)
0350 {
0351     xfs_fileoff_t       offset_fsb = imap->br_startoff;
0352     xfs_filblks_t       count_fsb = imap->br_blockcount;
0353     int         error;
0354 
0355     /*
0356      * cmap might larger than imap due to cowextsize hint.
0357      */
0358     xfs_trim_extent(cmap, offset_fsb, count_fsb);
0359 
0360     /*
0361      * COW fork extents are supposed to remain unwritten until we're ready
0362      * to initiate a disk write.  For direct I/O we are going to write the
0363      * data and need the conversion, but for buffered writes we're done.
0364      */
0365     if (!convert_now || cmap->br_state == XFS_EXT_NORM)
0366         return 0;
0367 
0368     trace_xfs_reflink_convert_cow(ip, cmap);
0369 
0370     error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
0371     if (!error)
0372         cmap->br_state = XFS_EXT_NORM;
0373 
0374     return error;
0375 }
0376 
/*
 * Allocate unwritten COW staging blocks for a hole in the COW fork that
 * covers the shared data extent described by imap.  On success, cmap
 * describes the staging mapping (converted to written state if
 * convert_now is set).
 *
 * On entry the inode is locked per *lockmode; the lock is cycled around
 * the transaction allocation, so the COW fork must be re-checked after
 * relocking (done via xfs_find_trim_cow_extent below).  *lockmode
 * reflects the lock actually held on return.
 */
static int
xfs_reflink_fill_cow_hole(
    struct xfs_inode    *ip,
    struct xfs_bmbt_irec    *imap,
    struct xfs_bmbt_irec    *cmap,
    bool            *shared,
    uint            *lockmode,
    bool            convert_now)
{
    struct xfs_mount    *mp = ip->i_mount;
    struct xfs_trans    *tp;
    xfs_filblks_t       resaligned;
    xfs_extlen_t        resblks;
    int         nimaps;
    int         error;
    bool            found;

    /* Size the block reservation for a cowextsz-aligned allocation. */
    resaligned = xfs_aligned_fsb_count(imap->br_startoff,
        imap->br_blockcount, xfs_get_cowextsz_hint(ip));
    resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);

    /* Drop the ILOCK before allocating the transaction. */
    xfs_iunlock(ip, *lockmode);
    *lockmode = 0;

    error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
            false, &tp);
    if (error)
        return error;

    /* xfs_trans_alloc_inode relocked the inode for us. */
    *lockmode = XFS_ILOCK_EXCL;

    /* Re-check the COW fork: it may have changed while unlocked. */
    error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
    if (error || !*shared)
        goto out_trans_cancel;

    if (found) {
        /* Someone else filled the hole; no allocation needed. */
        xfs_trans_cancel(tp);
        goto convert;
    }

    ASSERT(cmap->br_startoff > imap->br_startoff);

    /* Allocate the entire reservation as unwritten blocks. */
    nimaps = 1;
    error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
            XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
            &nimaps);
    if (error)
        goto out_trans_cancel;

    /* Mark the inode as holding COW blocks for the background reaper. */
    xfs_inode_set_cowblocks_tag(ip);
    error = xfs_trans_commit(tp);
    if (error)
        return error;

    /*
     * Allocation succeeded but the requested range was not even partially
     * satisfied?  Bail out!
     */
    if (nimaps == 0)
        return -ENOSPC;

convert:
    return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);

out_trans_cancel:
    xfs_trans_cancel(tp);
    return error;
}
0446 
/*
 * Replace a delalloc reservation in the COW fork with real unwritten
 * staging blocks.  Each xfs_bmapi_write call may convert only part of
 * the reservation, so loop until the converted extent reaches the start
 * of imap.  On success, cmap describes the staging mapping (converted to
 * written state if convert_now is set).
 *
 * Like xfs_reflink_fill_cow_hole, the ILOCK is cycled around each
 * transaction allocation and the COW fork is re-checked after relocking;
 * *lockmode reflects the lock actually held on return.
 */
static int
xfs_reflink_fill_delalloc(
    struct xfs_inode    *ip,
    struct xfs_bmbt_irec    *imap,
    struct xfs_bmbt_irec    *cmap,
    bool            *shared,
    uint            *lockmode,
    bool            convert_now)
{
    struct xfs_mount    *mp = ip->i_mount;
    struct xfs_trans    *tp;
    int         nimaps;
    int         error;
    bool            found;

    do {
        /* Drop the ILOCK before allocating the transaction. */
        xfs_iunlock(ip, *lockmode);
        *lockmode = 0;

        /* No block reservation: the delalloc extent already has one. */
        error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0,
                false, &tp);
        if (error)
            return error;

        *lockmode = XFS_ILOCK_EXCL;

        /* Re-check the COW fork: it may have changed while unlocked. */
        error = xfs_find_trim_cow_extent(ip, imap, cmap, shared,
                &found);
        if (error || !*shared)
            goto out_trans_cancel;

        if (found) {
            /* A real extent now covers the range; we're done. */
            xfs_trans_cancel(tp);
            break;
        }

        ASSERT(isnullstartblock(cmap->br_startblock) ||
               cmap->br_startblock == DELAYSTARTBLOCK);

        /*
         * Replace delalloc reservation with an unwritten extent.
         */
        nimaps = 1;
        error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
                cmap->br_blockcount,
                XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
                cmap, &nimaps);
        if (error)
            goto out_trans_cancel;

        /* Tag the inode for the background COW block reaper. */
        xfs_inode_set_cowblocks_tag(ip);
        error = xfs_trans_commit(tp);
        if (error)
            return error;

        /*
         * Allocation succeeded but the requested range was not even
         * partially satisfied?  Bail out!
         */
        if (nimaps == 0)
            return -ENOSPC;
    } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);

    return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);

out_trans_cancel:
    xfs_trans_cancel(tp);
    return error;
}
0516 
0517 /* Allocate all CoW reservations covering a range of blocks in a file. */
0518 int
0519 xfs_reflink_allocate_cow(
0520     struct xfs_inode    *ip,
0521     struct xfs_bmbt_irec    *imap,
0522     struct xfs_bmbt_irec    *cmap,
0523     bool            *shared,
0524     uint            *lockmode,
0525     bool            convert_now)
0526 {
0527     int         error;
0528     bool            found;
0529 
0530     ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
0531     if (!ip->i_cowfp) {
0532         ASSERT(!xfs_is_reflink_inode(ip));
0533         xfs_ifork_init_cow(ip);
0534     }
0535 
0536     error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
0537     if (error || !*shared)
0538         return error;
0539 
0540     /* CoW fork has a real extent */
0541     if (found)
0542         return xfs_reflink_convert_unwritten(ip, imap, cmap,
0543                 convert_now);
0544 
0545     /*
0546      * CoW fork does not have an extent and data extent is shared.
0547      * Allocate a real extent in the CoW fork.
0548      */
0549     if (cmap->br_startoff > imap->br_startoff)
0550         return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared,
0551                 lockmode, convert_now);
0552 
0553     /*
0554      * CoW fork has a delalloc reservation. Replace it with a real extent.
0555      * There may or may not be a data fork mapping.
0556      */
0557     if (isnullstartblock(cmap->br_startblock) ||
0558         cmap->br_startblock == DELAYSTARTBLOCK)
0559         return xfs_reflink_fill_delalloc(ip, imap, cmap, shared,
0560                 lockmode, convert_now);
0561 
0562     /* Shouldn't get here. */
0563     ASSERT(0);
0564     return -EFSCORRUPTED;
0565 }
0566 
/*
 * Cancel CoW reservations for some block range of an inode.
 *
 * If cancel_real is true this function cancels all COW fork extents for the
 * inode; if cancel_real is false, real extents are not cleared.
 *
 * Caller must have already joined the inode to the current transaction. The
 * inode will be joined to the transaction returned to the caller.
 */
int
xfs_reflink_cancel_cow_blocks(
    struct xfs_inode        *ip,
    struct xfs_trans        **tpp,
    xfs_fileoff_t           offset_fsb,
    xfs_fileoff_t           end_fsb,
    bool                cancel_real)
{
    struct xfs_ifork        *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
    struct xfs_bmbt_irec        got, del;
    struct xfs_iext_cursor      icur;
    int             error = 0;

    /* Nothing staged in the COW fork?  Nothing to cancel. */
    if (!xfs_inode_has_cow_data(ip))
        return 0;
    if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
        return 0;

    /* Walk backwards until we're out of the I/O range... */
    while (got.br_startoff + got.br_blockcount > offset_fsb) {
        del = got;
        xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);

        /* Extent delete may have bumped ext forward */
        if (!del.br_blockcount) {
            xfs_iext_prev(ifp, &icur);
            goto next_extent;
        }

        trace_xfs_reflink_cancel_cow(ip, &del);

        if (isnullstartblock(del.br_startblock)) {
            /* Delalloc: just drop the in-core reservation. */
            error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
                    &icur, &got, &del);
            if (error)
                break;
        } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
            ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);

            /* Free the CoW orphan record. */
            xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
                    del.br_blockcount);

            /* Defer freeing the staging blocks themselves. */
            xfs_free_extent_later(*tpp, del.br_startblock,
                      del.br_blockcount, NULL);

            /* Roll the transaction */
            error = xfs_defer_finish(tpp);
            if (error)
                break;

            /* Remove the mapping from the CoW fork. */
            xfs_bmap_del_extent_cow(ip, &icur, &got, &del);

            /* Remove the quota reservation */
            error = xfs_quota_unreserve_blkres(ip,
                    del.br_blockcount);
            if (error)
                break;
        } else {
            /* Didn't do anything, push cursor back. */
            xfs_iext_prev(ifp, &icur);
        }
next_extent:
        if (!xfs_iext_get_extent(ifp, &icur, &got))
            break;
    }

    /* clear tag if cow fork is emptied */
    if (!ifp->if_bytes)
        xfs_inode_clear_cowblocks_tag(ip);
    return error;
}
0649 
/*
 * Cancel CoW reservations for some byte range of an inode.
 *
 * If cancel_real is true this function cancels all COW fork extents for the
 * inode; if cancel_real is false, real extents are not cleared.
 *
 * A count of NULLFILEOFF means "to the end of the file".  Returns 0 or a
 * negative errno; errors are traced before returning.
 */
int
xfs_reflink_cancel_cow_range(
    struct xfs_inode    *ip,
    xfs_off_t       offset,
    xfs_off_t       count,
    bool            cancel_real)
{
    struct xfs_trans    *tp;
    xfs_fileoff_t       offset_fsb;
    xfs_fileoff_t       end_fsb;
    int         error;

    trace_xfs_reflink_cancel_cow_range(ip, offset, count);
    ASSERT(ip->i_cowfp);

    /* Convert the byte range to filesystem blocks. */
    offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
    if (count == NULLFILEOFF)
        end_fsb = NULLFILEOFF;
    else
        end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);

    /* Start a rolling transaction to remove the mappings */
    error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
            0, 0, 0, &tp);
    if (error)
        goto out;

    xfs_ilock(ip, XFS_ILOCK_EXCL);
    xfs_trans_ijoin(tp, ip, 0);

    /* Scrape out the old CoW reservations */
    error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
            cancel_real);
    if (error)
        goto out_cancel;

    error = xfs_trans_commit(tp);

    xfs_iunlock(ip, XFS_ILOCK_EXCL);
    return error;

out_cancel:
    xfs_trans_cancel(tp);
    xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
    trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
    return error;
}
0704 
/*
 * Remap part of the CoW fork into the data fork.
 *
 * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
 * into the data fork; this function will remap what it can (at the end of the
 * range) and update @end_fsb appropriately.  Each remap gets its own
 * transaction because we can end up merging and splitting bmbt blocks for
 * every remap operation and we'd like to keep the block reservation
 * requirements as low as possible.
 */
STATIC int
xfs_reflink_end_cow_extent(
    struct xfs_inode    *ip,
    xfs_fileoff_t       *offset_fsb,
    xfs_fileoff_t       end_fsb)
{
    struct xfs_iext_cursor  icur;
    struct xfs_bmbt_irec    got, del, data;
    struct xfs_mount    *mp = ip->i_mount;
    struct xfs_trans    *tp;
    struct xfs_ifork    *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
    unsigned int        resblks;
    int         nmaps;
    int         error;

    /* No COW extents?  That's easy! */
    if (ifp->if_bytes == 0) {
        *offset_fsb = end_fsb;
        return 0;
    }

    /* Reserve space for one bmbt extent add in the data fork. */
    resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
    error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
            XFS_TRANS_RESERVE, &tp);
    if (error)
        return error;

    /*
     * Lock the inode.  We have to ijoin without automatic unlock because
     * the lead transaction is the refcountbt record deletion; the data
     * fork update follows as a deferred log item.
     */
    xfs_ilock(ip, XFS_ILOCK_EXCL);
    xfs_trans_ijoin(tp, ip, 0);

    /* Make sure the data fork can absorb the worst-case extent count. */
    error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
            XFS_IEXT_REFLINK_END_COW_CNT);
    if (error == -EFBIG)
        error = xfs_iext_count_upgrade(tp, ip,
                XFS_IEXT_REFLINK_END_COW_CNT);
    if (error)
        goto out_cancel;

    /*
     * In case of racing, overlapping AIO writes no COW extents might be
     * left by the time I/O completes for the loser of the race.  In that
     * case we are done.
     */
    if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
        got.br_startoff >= end_fsb) {
        *offset_fsb = end_fsb;
        goto out_cancel;
    }

    /*
     * Only remap real extents that contain data.  With AIO, speculative
     * preallocations can leak into the range we are called upon, and we
     * need to skip them.  Preserve @got for the eventual CoW fork
     * deletion; from now on @del represents the mapping that we're
     * actually remapping.
     */
    while (!xfs_bmap_is_written_extent(&got)) {
        if (!xfs_iext_next_extent(ifp, &icur, &got) ||
            got.br_startoff >= end_fsb) {
            *offset_fsb = end_fsb;
            goto out_cancel;
        }
    }
    del = got;

    /* Grab the corresponding mapping in the data fork. */
    nmaps = 1;
    error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
            &nmaps, 0);
    if (error)
        goto out_cancel;

    /* We can only remap the smaller of the two extent sizes. */
    data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
    del.br_blockcount = data.br_blockcount;

    trace_xfs_reflink_cow_remap_from(ip, &del);
    trace_xfs_reflink_cow_remap_to(ip, &data);

    if (xfs_bmap_is_real_extent(&data)) {
        /*
         * If the extent we're remapping is backed by storage (written
         * or not), unmap the extent and drop its refcount.
         */
        xfs_bmap_unmap_extent(tp, ip, &data);
        xfs_refcount_decrease_extent(tp, &data);
        xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
                -data.br_blockcount);
    } else if (data.br_startblock == DELAYSTARTBLOCK) {
        int     done;

        /*
         * If the extent we're remapping is a delalloc reservation,
         * we can use the regular bunmapi function to release the
         * incore state.  Dropping the delalloc reservation takes care
         * of the quota reservation for us.
         */
        error = xfs_bunmapi(NULL, ip, data.br_startoff,
                data.br_blockcount, 0, 1, &done);
        if (error)
            goto out_cancel;
        ASSERT(done);
    }

    /* Free the CoW orphan record. */
    xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);

    /* Map the new blocks into the data fork. */
    xfs_bmap_map_extent(tp, ip, &del);

    /* Charge this new data fork mapping to the on-disk quota. */
    xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
            (long)del.br_blockcount);

    /* Remove the mapping from the CoW fork. */
    xfs_bmap_del_extent_cow(ip, &icur, &got, &del);

    error = xfs_trans_commit(tp);
    xfs_iunlock(ip, XFS_ILOCK_EXCL);
    if (error)
        return error;

    /* Update the caller about how much progress we made. */
    *offset_fsb = del.br_startoff + del.br_blockcount;
    return 0;

out_cancel:
    xfs_trans_cancel(tp);
    xfs_iunlock(ip, XFS_ILOCK_EXCL);
    return error;
}
0851 
0852 /*
0853  * Remap parts of a file's data fork after a successful CoW.
0854  */
0855 int
0856 xfs_reflink_end_cow(
0857     struct xfs_inode        *ip,
0858     xfs_off_t           offset,
0859     xfs_off_t           count)
0860 {
0861     xfs_fileoff_t           offset_fsb;
0862     xfs_fileoff_t           end_fsb;
0863     int             error = 0;
0864 
0865     trace_xfs_reflink_end_cow(ip, offset, count);
0866 
0867     offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
0868     end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
0869 
0870     /*
0871      * Walk forwards until we've remapped the I/O range.  The loop function
0872      * repeatedly cycles the ILOCK to allocate one transaction per remapped
0873      * extent.
0874      *
0875      * If we're being called by writeback then the pages will still
0876      * have PageWriteback set, which prevents races with reflink remapping
0877      * and truncate.  Reflink remapping prevents races with writeback by
0878      * taking the iolock and mmaplock before flushing the pages and
0879      * remapping, which means there won't be any further writeback or page
0880      * cache dirtying until the reflink completes.
0881      *
0882      * We should never have two threads issuing writeback for the same file
0883      * region.  There are also have post-eof checks in the writeback
0884      * preparation code so that we don't bother writing out pages that are
0885      * about to be truncated.
0886      *
0887      * If we're being called as part of directio write completion, the dio
0888      * count is still elevated, which reflink and truncate will wait for.
0889      * Reflink remapping takes the iolock and mmaplock and waits for
0890      * pending dio to finish, which should prevent any directio until the
0891      * remap completes.  Multiple concurrent directio writes to the same
0892      * region are handled by end_cow processing only occurring for the
0893      * threads which succeed; the outcome of multiple overlapping direct
0894      * writes is not well defined anyway.
0895      *
0896      * It's possible that a buffered write and a direct write could collide
0897      * here (the buffered write stumbles in after the dio flushes and
0898      * invalidates the page cache and immediately queues writeback), but we
0899      * have never supported this 100%.  If either disk write succeeds the
0900      * blocks will be remapped.
0901      */
0902     while (end_fsb > offset_fsb && !error)
0903         error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
0904 
0905     if (error)
0906         trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
0907     return error;
0908 }
0909 
0910 /*
0911  * Free all CoW staging blocks that are still referenced by the ondisk refcount
0912  * metadata.  The ondisk metadata does not track which inode created the
0913  * staging extent, so callers must ensure that there are no cached inodes with
0914  * live CoW staging extents.
0915  */
0916 int
0917 xfs_reflink_recover_cow(
0918     struct xfs_mount    *mp)
0919 {
0920     struct xfs_perag    *pag;
0921     xfs_agnumber_t      agno;
0922     int         error = 0;
0923 
0924     if (!xfs_has_reflink(mp))
0925         return 0;
0926 
0927     for_each_perag(mp, agno, pag) {
0928         error = xfs_refcount_recover_cow_leftovers(mp, pag);
0929         if (error) {
0930             xfs_perag_put(pag);
0931             break;
0932         }
0933     }
0934 
0935     return error;
0936 }
0937 
0938 /*
0939  * Reflinking (Block) Ranges of Two Files Together
0940  *
0941  * First, ensure that the reflink flag is set on both inodes.  The flag is an
0942  * optimization to avoid unnecessary refcount btree lookups in the write path.
0943  *
0944  * Now we can iteratively remap the range of extents (and holes) in src to the
0945  * corresponding ranges in dest.  Let drange and srange denote the ranges of
0946  * logical blocks in dest and src touched by the reflink operation.
0947  *
0948  * While the length of drange is greater than zero,
0949  *    - Read src's bmbt at the start of srange ("imap")
0950  *    - If imap doesn't exist, make imap appear to start at the end of srange
0951  *      with zero length.
0952  *    - If imap starts before srange, advance imap to start at srange.
0953  *    - If imap goes beyond srange, truncate imap to end at the end of srange.
0954  *    - Punch (imap start - srange start + imap len) blocks from dest at
0955  *      offset (drange start).
0956  *    - If imap points to a real range of pblks,
0957  *         > Increase the refcount of the imap's pblks
0958  *         > Map imap's pblks into dest at the offset
0959  *           (drange start + imap start - srange start)
0960  *    - Advance drange and srange by (imap start - srange start + imap len)
0961  *
0962  * Finally, if the reflink made dest longer, update both the in-core and
0963  * on-disk file sizes.
0964  *
0965  * ASCII Art Demonstration:
0966  *
0967  * Let's say we want to reflink this source file:
0968  *
0969  * ----SSSSSSS-SSSSS----SSSSSS (src file)
0970  *   <-------------------->
0971  *
0972  * into this destination file:
0973  *
0974  * --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
0975  *        <-------------------->
0976  * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
0977  * Observe that the range has different logical offsets in either file.
0978  *
0979  * Consider that the first extent in the source file doesn't line up with our
0980  * reflink range.  Unmapping  and remapping are separate operations, so we can
0981  * unmap more blocks from the destination file than we remap.
0982  *
0983  * ----SSSSSSS-SSSSS----SSSSSS
0984  *   <------->
0985  * --DDDDD---------DDDDD--DDD
0986  *        <------->
0987  *
0988  * Now remap the source extent into the destination file:
0989  *
0990  * ----SSSSSSS-SSSSS----SSSSSS
0991  *   <------->
0992  * --DDDDD--SSSSSSSDDDDD--DDD
0993  *        <------->
0994  *
0995  * Do likewise with the second hole and extent in our range.  Holes in the
0996  * unmap range don't affect our operation.
0997  *
0998  * ----SSSSSSS-SSSSS----SSSSSS
0999  *            <---->
1000  * --DDDDD--SSSSSSS-SSSSS-DDD
1001  *                 <---->
1002  *
1003  * Finally, unmap and remap part of the third extent.  This will increase the
1004  * size of the destination file.
1005  *
1006  * ----SSSSSSS-SSSSS----SSSSSS
1007  *                  <----->
1008  * --DDDDD--SSSSSSS-SSSSS----SSS
1009  *                       <----->
1010  *
1011  * Once we update the destination file's i_size, we're done.
1012  */
1013 
1014 /*
1015  * Ensure the reflink bit is set in both inodes.
1016  */
1017 STATIC int
1018 xfs_reflink_set_inode_flag(
1019     struct xfs_inode    *src,
1020     struct xfs_inode    *dest)
1021 {
1022     struct xfs_mount    *mp = src->i_mount;
1023     int         error;
1024     struct xfs_trans    *tp;
1025 
1026     if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
1027         return 0;
1028 
1029     error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
1030     if (error)
1031         goto out_error;
1032 
1033     /* Lock both files against IO */
1034     if (src->i_ino == dest->i_ino)
1035         xfs_ilock(src, XFS_ILOCK_EXCL);
1036     else
1037         xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL);
1038 
1039     if (!xfs_is_reflink_inode(src)) {
1040         trace_xfs_reflink_set_inode_flag(src);
1041         xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
1042         src->i_diflags2 |= XFS_DIFLAG2_REFLINK;
1043         xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
1044         xfs_ifork_init_cow(src);
1045     } else
1046         xfs_iunlock(src, XFS_ILOCK_EXCL);
1047 
1048     if (src->i_ino == dest->i_ino)
1049         goto commit_flags;
1050 
1051     if (!xfs_is_reflink_inode(dest)) {
1052         trace_xfs_reflink_set_inode_flag(dest);
1053         xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
1054         dest->i_diflags2 |= XFS_DIFLAG2_REFLINK;
1055         xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
1056         xfs_ifork_init_cow(dest);
1057     } else
1058         xfs_iunlock(dest, XFS_ILOCK_EXCL);
1059 
1060 commit_flags:
1061     error = xfs_trans_commit(tp);
1062     if (error)
1063         goto out_error;
1064     return error;
1065 
1066 out_error:
1067     trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
1068     return error;
1069 }
1070 
1071 /*
1072  * Update destination inode size & cowextsize hint, if necessary.
1073  */
1074 int
1075 xfs_reflink_update_dest(
1076     struct xfs_inode    *dest,
1077     xfs_off_t       newlen,
1078     xfs_extlen_t        cowextsize,
1079     unsigned int        remap_flags)
1080 {
1081     struct xfs_mount    *mp = dest->i_mount;
1082     struct xfs_trans    *tp;
1083     int         error;
1084 
1085     if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
1086         return 0;
1087 
1088     error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
1089     if (error)
1090         goto out_error;
1091 
1092     xfs_ilock(dest, XFS_ILOCK_EXCL);
1093     xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
1094 
1095     if (newlen > i_size_read(VFS_I(dest))) {
1096         trace_xfs_reflink_update_inode_size(dest, newlen);
1097         i_size_write(VFS_I(dest), newlen);
1098         dest->i_disk_size = newlen;
1099     }
1100 
1101     if (cowextsize) {
1102         dest->i_cowextsize = cowextsize;
1103         dest->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
1104     }
1105 
1106     xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
1107 
1108     error = xfs_trans_commit(tp);
1109     if (error)
1110         goto out_error;
1111     return error;
1112 
1113 out_error:
1114     trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
1115     return error;
1116 }
1117 
1118 /*
1119  * Do we have enough reserve in this AG to handle a reflink?  The refcount
1120  * btree already reserved all the space it needs, but the rmap btree can grow
1121  * infinitely, so we won't allow more reflinks when the AG is down to the
1122  * btree reserves.
1123  */
1124 static int
1125 xfs_reflink_ag_has_free_space(
1126     struct xfs_mount    *mp,
1127     xfs_agnumber_t      agno)
1128 {
1129     struct xfs_perag    *pag;
1130     int         error = 0;
1131 
1132     if (!xfs_has_rmapbt(mp))
1133         return 0;
1134 
1135     pag = xfs_perag_get(mp, agno);
1136     if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) ||
1137         xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
1138         error = -ENOSPC;
1139     xfs_perag_put(pag);
1140     return error;
1141 }
1142 
1143 /*
1144  * Remap the given extent into the file.  The dmap blockcount will be set to
1145  * the number of blocks that were actually remapped.
1146  */
1147 STATIC int
1148 xfs_reflink_remap_extent(
1149     struct xfs_inode    *ip,
1150     struct xfs_bmbt_irec    *dmap,
1151     xfs_off_t       new_isize)
1152 {
1153     struct xfs_bmbt_irec    smap;
1154     struct xfs_mount    *mp = ip->i_mount;
1155     struct xfs_trans    *tp;
1156     xfs_off_t       newlen;
1157     int64_t         qdelta = 0;
1158     unsigned int        resblks;
1159     bool            quota_reserved = true;
1160     bool            smap_real;
1161     bool            dmap_written = xfs_bmap_is_written_extent(dmap);
1162     int         iext_delta = 0;
1163     int         nimaps;
1164     int         error;
1165 
1166     /*
1167      * Start a rolling transaction to switch the mappings.
1168      *
1169      * Adding a written extent to the extent map can cause a bmbt split,
1170      * and removing a mapped extent from the extent can cause a bmbt split.
1171      * The two operations cannot both cause a split since they operate on
1172      * the same index in the bmap btree, so we only need a reservation for
1173      * one bmbt split if either thing is happening.  However, we haven't
1174      * locked the inode yet, so we reserve assuming this is the case.
1175      *
1176      * The first allocation call tries to reserve enough space to handle
1177      * mapping dmap into a sparse part of the file plus the bmbt split.  We
1178      * haven't locked the inode or read the existing mapping yet, so we do
1179      * not know for sure that we need the space.  This should succeed most
1180      * of the time.
1181      *
1182      * If the first attempt fails, try again but reserving only enough
1183      * space to handle a bmbt split.  This is the hard minimum requirement,
1184      * and we revisit quota reservations later when we know more about what
1185      * we're remapping.
1186      */
1187     resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
1188     error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
1189             resblks + dmap->br_blockcount, 0, false, &tp);
1190     if (error == -EDQUOT || error == -ENOSPC) {
1191         quota_reserved = false;
1192         error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
1193                 resblks, 0, false, &tp);
1194     }
1195     if (error)
1196         goto out;
1197 
1198     /*
1199      * Read what's currently mapped in the destination file into smap.
1200      * If smap isn't a hole, we will have to remove it before we can add
1201      * dmap to the destination file.
1202      */
1203     nimaps = 1;
1204     error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
1205             &smap, &nimaps, 0);
1206     if (error)
1207         goto out_cancel;
1208     ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
1209     smap_real = xfs_bmap_is_real_extent(&smap);
1210 
1211     /*
1212      * We can only remap as many blocks as the smaller of the two extent
1213      * maps, because we can only remap one extent at a time.
1214      */
1215     dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
1216     ASSERT(dmap->br_blockcount == smap.br_blockcount);
1217 
1218     trace_xfs_reflink_remap_extent_dest(ip, &smap);
1219 
1220     /*
1221      * Two extents mapped to the same physical block must not have
1222      * different states; that's filesystem corruption.  Move on to the next
1223      * extent if they're both holes or both the same physical extent.
1224      */
1225     if (dmap->br_startblock == smap.br_startblock) {
1226         if (dmap->br_state != smap.br_state)
1227             error = -EFSCORRUPTED;
1228         goto out_cancel;
1229     }
1230 
1231     /* If both extents are unwritten, leave them alone. */
1232     if (dmap->br_state == XFS_EXT_UNWRITTEN &&
1233         smap.br_state == XFS_EXT_UNWRITTEN)
1234         goto out_cancel;
1235 
1236     /* No reflinking if the AG of the dest mapping is low on space. */
1237     if (dmap_written) {
1238         error = xfs_reflink_ag_has_free_space(mp,
1239                 XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
1240         if (error)
1241             goto out_cancel;
1242     }
1243 
1244     /*
1245      * Increase quota reservation if we think the quota block counter for
1246      * this file could increase.
1247      *
1248      * If we are mapping a written extent into the file, we need to have
1249      * enough quota block count reservation to handle the blocks in that
1250      * extent.  We log only the delta to the quota block counts, so if the
1251      * extent we're unmapping also has blocks allocated to it, we don't
1252      * need a quota reservation for the extent itself.
1253      *
1254      * Note that if we're replacing a delalloc reservation with a written
1255      * extent, we have to take the full quota reservation because removing
1256      * the delalloc reservation gives the block count back to the quota
1257      * count.  This is suboptimal, but the VFS flushed the dest range
1258      * before we started.  That should have removed all the delalloc
1259      * reservations, but we code defensively.
1260      *
1261      * xfs_trans_alloc_inode above already tried to grab an even larger
1262      * quota reservation, and kicked off a blockgc scan if it couldn't.
1263      * If we can't get a potentially smaller quota reservation now, we're
1264      * done.
1265      */
1266     if (!quota_reserved && !smap_real && dmap_written) {
1267         error = xfs_trans_reserve_quota_nblks(tp, ip,
1268                 dmap->br_blockcount, 0, false);
1269         if (error)
1270             goto out_cancel;
1271     }
1272 
1273     if (smap_real)
1274         ++iext_delta;
1275 
1276     if (dmap_written)
1277         ++iext_delta;
1278 
1279     error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
1280     if (error == -EFBIG)
1281         error = xfs_iext_count_upgrade(tp, ip, iext_delta);
1282     if (error)
1283         goto out_cancel;
1284 
1285     if (smap_real) {
1286         /*
1287          * If the extent we're unmapping is backed by storage (written
1288          * or not), unmap the extent and drop its refcount.
1289          */
1290         xfs_bmap_unmap_extent(tp, ip, &smap);
1291         xfs_refcount_decrease_extent(tp, &smap);
1292         qdelta -= smap.br_blockcount;
1293     } else if (smap.br_startblock == DELAYSTARTBLOCK) {
1294         int     done;
1295 
1296         /*
1297          * If the extent we're unmapping is a delalloc reservation,
1298          * we can use the regular bunmapi function to release the
1299          * incore state.  Dropping the delalloc reservation takes care
1300          * of the quota reservation for us.
1301          */
1302         error = xfs_bunmapi(NULL, ip, smap.br_startoff,
1303                 smap.br_blockcount, 0, 1, &done);
1304         if (error)
1305             goto out_cancel;
1306         ASSERT(done);
1307     }
1308 
1309     /*
1310      * If the extent we're sharing is backed by written storage, increase
1311      * its refcount and map it into the file.
1312      */
1313     if (dmap_written) {
1314         xfs_refcount_increase_extent(tp, dmap);
1315         xfs_bmap_map_extent(tp, ip, dmap);
1316         qdelta += dmap->br_blockcount;
1317     }
1318 
1319     xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
1320 
1321     /* Update dest isize if needed. */
1322     newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
1323     newlen = min_t(xfs_off_t, newlen, new_isize);
1324     if (newlen > i_size_read(VFS_I(ip))) {
1325         trace_xfs_reflink_update_inode_size(ip, newlen);
1326         i_size_write(VFS_I(ip), newlen);
1327         ip->i_disk_size = newlen;
1328         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1329     }
1330 
1331     /* Commit everything and unlock. */
1332     error = xfs_trans_commit(tp);
1333     goto out_unlock;
1334 
1335 out_cancel:
1336     xfs_trans_cancel(tp);
1337 out_unlock:
1338     xfs_iunlock(ip, XFS_ILOCK_EXCL);
1339 out:
1340     if (error)
1341         trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
1342     return error;
1343 }
1344 
/*
 * Remap a range of one file to the other.
 *
 * @src/@pos_in:   source inode and byte offset to read mappings from
 * @dest/@pos_out: destination inode and byte offset to remap into
 * @remap_len:     number of bytes to remap
 * @remapped:      out: number of bytes actually remapped, even on error
 *
 * Returns 0 or a negative errno; partial progress is reported via @remapped
 * in either case.
 */
int
xfs_reflink_remap_blocks(
    struct xfs_inode    *src,
    loff_t          pos_in,
    struct xfs_inode    *dest,
    loff_t          pos_out,
    loff_t          remap_len,
    loff_t          *remapped)
{
    struct xfs_bmbt_irec    imap;
    struct xfs_mount    *mp = src->i_mount;
    xfs_fileoff_t       srcoff = XFS_B_TO_FSBT(mp, pos_in);
    xfs_fileoff_t       destoff = XFS_B_TO_FSBT(mp, pos_out);
    xfs_filblks_t       len;
    xfs_filblks_t       remapped_len = 0;
    xfs_off_t       new_isize = pos_out + remap_len;
    int         nimaps;
    int         error = 0;

    /* Clamp the request so the block count cannot pass the last fileoff. */
    len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
            XFS_MAX_FILEOFF);

    trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);

    while (len > 0) {
        unsigned int    lock_mode;

        /* Read extent from the source file */
        nimaps = 1;
        lock_mode = xfs_ilock_data_map_shared(src);
        error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
        xfs_iunlock(src, lock_mode);
        if (error)
            break;
        /*
         * The caller supposedly flushed all dirty pages in the source
         * file range, which means that writeback should have allocated
         * or deleted all delalloc reservations in that range.  If we
         * find one, that's a good sign that something is seriously
         * wrong here.
         */
        ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
        if (imap.br_startblock == DELAYSTARTBLOCK) {
            /* Always-false ASSERT: trip debug kernels loudly. */
            ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
            error = -EFSCORRUPTED;
            break;
        }

        trace_xfs_reflink_remap_extent_src(src, &imap);

        /* Remap into the destination file at the given offset. */
        imap.br_startoff = destoff;
        error = xfs_reflink_remap_extent(dest, &imap, new_isize);
        if (error)
            break;

        /*
         * Bail on a fatal signal between extents.  Note this happens
         * before the counters advance, so the extent just remapped is
         * not reflected in *remapped.
         */
        if (fatal_signal_pending(current)) {
            error = -EINTR;
            break;
        }

        /* Advance drange/srange */
        srcoff += imap.br_blockcount;
        destoff += imap.br_blockcount;
        len -= imap.br_blockcount;
        remapped_len += imap.br_blockcount;
    }

    if (error)
        trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
    *remapped = min_t(loff_t, remap_len,
              XFS_FSB_TO_B(src->i_mount, remapped_len));
    return error;
}
1420 
1421 /*
1422  * If we're reflinking to a point past the destination file's EOF, we must
1423  * zero any speculative post-EOF preallocations that sit between the old EOF
1424  * and the destination file offset.
1425  */
1426 static int
1427 xfs_reflink_zero_posteof(
1428     struct xfs_inode    *ip,
1429     loff_t          pos)
1430 {
1431     loff_t          isize = i_size_read(VFS_I(ip));
1432 
1433     if (pos <= isize)
1434         return 0;
1435 
1436     trace_xfs_zero_eof(ip, isize, pos - isize);
1437     return xfs_zero_range(ip, isize, pos - isize, NULL);
1438 }
1439 
1440 /*
1441  * Prepare two files for range cloning.  Upon a successful return both inodes
1442  * will have the iolock and mmaplock held, the page cache of the out file will
1443  * be truncated, and any leases on the out file will have been broken.  This
1444  * function borrows heavily from xfs_file_aio_write_checks.
1445  *
1446  * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
1447  * checked that the bytes beyond EOF physically match. Hence we cannot use the
1448  * EOF block in the source dedupe range because it's not a complete block match,
1449  * hence can introduce a corruption into the file that has it's block replaced.
1450  *
1451  * In similar fashion, the VFS file cloning also allows partial EOF blocks to be
1452  * "block aligned" for the purposes of cloning entire files.  However, if the
1453  * source file range includes the EOF block and it lands within the existing EOF
1454  * of the destination file, then we can expose stale data from beyond the source
1455  * file EOF in the destination file.
1456  *
1457  * XFS doesn't support partial block sharing, so in both cases we have check
1458  * these cases ourselves. For dedupe, we can simply round the length to dedupe
1459  * down to the previous whole block and ignore the partial EOF block. While this
1460  * means we can't dedupe the last block of a file, this is an acceptible
1461  * tradeoff for simplicity on implementation.
1462  *
1463  * For cloning, we want to share the partial EOF block if it is also the new EOF
1464  * block of the destination file. If the partial EOF block lies inside the
1465  * existing destination EOF, then we have to abort the clone to avoid exposing
1466  * stale data in the destination file. Hence we reject these clone attempts with
1467  * -EINVAL in this case.
1468  */
1469 int
1470 xfs_reflink_remap_prep(
1471     struct file     *file_in,
1472     loff_t          pos_in,
1473     struct file     *file_out,
1474     loff_t          pos_out,
1475     loff_t          *len,
1476     unsigned int        remap_flags)
1477 {
1478     struct inode        *inode_in = file_inode(file_in);
1479     struct xfs_inode    *src = XFS_I(inode_in);
1480     struct inode        *inode_out = file_inode(file_out);
1481     struct xfs_inode    *dest = XFS_I(inode_out);
1482     int         ret;
1483 
1484     /* Lock both files against IO */
1485     ret = xfs_ilock2_io_mmap(src, dest);
1486     if (ret)
1487         return ret;
1488 
1489     /* Check file eligibility and prepare for block sharing. */
1490     ret = -EINVAL;
1491     /* Don't reflink realtime inodes */
1492     if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
1493         goto out_unlock;
1494 
1495     /* Don't share DAX file data with non-DAX file. */
1496     if (IS_DAX(inode_in) != IS_DAX(inode_out))
1497         goto out_unlock;
1498 
1499     if (!IS_DAX(inode_in))
1500         ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
1501                 pos_out, len, remap_flags);
1502     else
1503         ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
1504                 pos_out, len, remap_flags, &xfs_read_iomap_ops);
1505     if (ret || *len == 0)
1506         goto out_unlock;
1507 
1508     /* Attach dquots to dest inode before changing block map */
1509     ret = xfs_qm_dqattach(dest);
1510     if (ret)
1511         goto out_unlock;
1512 
1513     /*
1514      * Zero existing post-eof speculative preallocations in the destination
1515      * file.
1516      */
1517     ret = xfs_reflink_zero_posteof(dest, pos_out);
1518     if (ret)
1519         goto out_unlock;
1520 
1521     /* Set flags and remap blocks. */
1522     ret = xfs_reflink_set_inode_flag(src, dest);
1523     if (ret)
1524         goto out_unlock;
1525 
1526     /*
1527      * If pos_out > EOF, we may have dirtied blocks between EOF and
1528      * pos_out. In that case, we need to extend the flush and unmap to cover
1529      * from EOF to the end of the copy length.
1530      */
1531     if (pos_out > XFS_ISIZE(dest)) {
1532         loff_t  flen = *len + (pos_out - XFS_ISIZE(dest));
1533         ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
1534     } else {
1535         ret = xfs_flush_unmap_range(dest, pos_out, *len);
1536     }
1537     if (ret)
1538         goto out_unlock;
1539 
1540     return 0;
1541 out_unlock:
1542     xfs_iunlock2_io_mmap(src, dest);
1543     return ret;
1544 }
1545 
/*
 * Does this inode need the reflink flag?
 *
 * Walks the data fork and probes the refcount btree for each real, written
 * extent.  Sets *has_shared to true and stops at the first extent that is
 * still shared; otherwise *has_shared stays false.  Returns 0 or a negative
 * errno.
 */
int
xfs_reflink_inode_has_shared_extents(
    struct xfs_trans        *tp,
    struct xfs_inode        *ip,
    bool                *has_shared)
{
    struct xfs_bmbt_irec        got;
    struct xfs_mount        *mp = ip->i_mount;
    struct xfs_ifork        *ifp;
    struct xfs_iext_cursor      icur;
    bool                found;
    int             error;

    ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
    error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
    if (error)
        return error;

    *has_shared = false;
    found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
    while (found) {
        struct xfs_perag    *pag;
        xfs_agblock_t       agbno;
        xfs_extlen_t        aglen;
        xfs_agblock_t       rbno;
        xfs_extlen_t        rlen;

        /* Delalloc and unwritten extents cannot be shared; skip. */
        if (isnullstartblock(got.br_startblock) ||
            got.br_state != XFS_EXT_NORM)
            goto next;

        /* Ask the refcount btree whether any part is still shared. */
        pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock));
        agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
        aglen = got.br_blockcount;
        error = xfs_reflink_find_shared(pag, tp, agbno, aglen,
                &rbno, &rlen, false);
        xfs_perag_put(pag);
        if (error)
            return error;

        /* Is there still a shared block here? */
        if (rbno != NULLAGBLOCK) {
            *has_shared = true;
            return 0;
        }
next:
        found = xfs_iext_next_extent(ifp, &icur, &got);
    }

    return 0;
}
1598 
1599 /*
1600  * Clear the inode reflink flag if there are no shared extents.
1601  *
1602  * The caller is responsible for joining the inode to the transaction passed in.
1603  * The inode will be joined to the transaction that is returned to the caller.
1604  */
1605 int
1606 xfs_reflink_clear_inode_flag(
1607     struct xfs_inode    *ip,
1608     struct xfs_trans    **tpp)
1609 {
1610     bool            needs_flag;
1611     int         error = 0;
1612 
1613     ASSERT(xfs_is_reflink_inode(ip));
1614 
1615     error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag);
1616     if (error || needs_flag)
1617         return error;
1618 
1619     /*
1620      * We didn't find any shared blocks so turn off the reflink flag.
1621      * First, get rid of any leftover CoW mappings.
1622      */
1623     error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF,
1624             true);
1625     if (error)
1626         return error;
1627 
1628     /* Clear the inode flag. */
1629     trace_xfs_reflink_unset_inode_flag(ip);
1630     ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1631     xfs_inode_clear_cowblocks_tag(ip);
1632     xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1633 
1634     return error;
1635 }
1636 
1637 /*
1638  * Clear the inode reflink flag if there are no shared extents and the size
1639  * hasn't changed.
1640  */
1641 STATIC int
1642 xfs_reflink_try_clear_inode_flag(
1643     struct xfs_inode    *ip)
1644 {
1645     struct xfs_mount    *mp = ip->i_mount;
1646     struct xfs_trans    *tp;
1647     int         error = 0;
1648 
1649     /* Start a rolling transaction to remove the mappings */
1650     error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
1651     if (error)
1652         return error;
1653 
1654     xfs_ilock(ip, XFS_ILOCK_EXCL);
1655     xfs_trans_ijoin(tp, ip, 0);
1656 
1657     error = xfs_reflink_clear_inode_flag(ip, &tp);
1658     if (error)
1659         goto cancel;
1660 
1661     error = xfs_trans_commit(tp);
1662     if (error)
1663         goto out;
1664 
1665     xfs_iunlock(ip, XFS_ILOCK_EXCL);
1666     return 0;
1667 cancel:
1668     xfs_trans_cancel(tp);
1669 out:
1670     xfs_iunlock(ip, XFS_ILOCK_EXCL);
1671     return error;
1672 }
1673 
1674 /*
1675  * Pre-COW all shared blocks within a given byte range of a file and turn off
1676  * the reflink flag if we unshare all of the file's blocks.
1677  */
1678 int
1679 xfs_reflink_unshare(
1680     struct xfs_inode    *ip,
1681     xfs_off_t       offset,
1682     xfs_off_t       len)
1683 {
1684     struct inode        *inode = VFS_I(ip);
1685     int         error;
1686 
1687     if (!xfs_is_reflink_inode(ip))
1688         return 0;
1689 
1690     trace_xfs_reflink_unshare(ip, offset, len);
1691 
1692     inode_dio_wait(inode);
1693 
1694     error = iomap_file_unshare(inode, offset, len,
1695             &xfs_buffered_write_iomap_ops);
1696     if (error)
1697         goto out;
1698 
1699     error = filemap_write_and_wait_range(inode->i_mapping, offset,
1700             offset + len - 1);
1701     if (error)
1702         goto out;
1703 
1704     /* Turn off the reflink flag if possible. */
1705     error = xfs_reflink_try_clear_inode_flag(ip);
1706     if (error)
1707         goto out;
1708     return 0;
1709 
1710 out:
1711     trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
1712     return error;
1713 }