0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
0004  * Copyright (c) 2012 Red Hat, Inc.
0005  * All Rights Reserved.
0006  */
0007 #include "xfs.h"
0008 #include "xfs_fs.h"
0009 #include "xfs_shared.h"
0010 #include "xfs_format.h"
0011 #include "xfs_log_format.h"
0012 #include "xfs_trans_resv.h"
0013 #include "xfs_bit.h"
0014 #include "xfs_mount.h"
0015 #include "xfs_defer.h"
0016 #include "xfs_inode.h"
0017 #include "xfs_btree.h"
0018 #include "xfs_trans.h"
0019 #include "xfs_alloc.h"
0020 #include "xfs_bmap.h"
0021 #include "xfs_bmap_util.h"
0022 #include "xfs_bmap_btree.h"
0023 #include "xfs_rtalloc.h"
0024 #include "xfs_error.h"
0025 #include "xfs_quota.h"
0026 #include "xfs_trans_space.h"
0027 #include "xfs_trace.h"
0028 #include "xfs_icache.h"
0029 #include "xfs_iomap.h"
0030 #include "xfs_reflink.h"
0031 
0032 /* Kernel-only BMAP-related definitions and functions */
0033 
0034 /*
0035  * Convert the given file system block to a disk block.  We have to treat it
0036  * differently based on whether the file is a realtime file or not, because the
0037  * bmap code does.
0038  */
0039 xfs_daddr_t
0040 xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
0041 {
0042     if (XFS_IS_REALTIME_INODE(ip))
0043         return XFS_FSB_TO_BB(ip->i_mount, fsb);
0044     return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
0045 }
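/*
 * Worked example (annotation, not part of the original file): with 4 KiB
 * filesystem blocks (sb_blocklog == 12), XFS_FSB_TO_BB() is a plain shift
 * of the linear realtime block number:
 *
 *	daddr = fsb << (12 - BBSHIFT);	// BBSHIFT == 9, so 8 sectors/block
 *
 * whereas XFS_FSB_TO_DADDR() must first decode the data-device fsb into
 * its AG number and AG-relative block before converting to basic blocks.
 */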
0046 
0047 /*
0048  * Routine to zero an extent on disk allocated to the specific inode.
0049  *
0050  * The VFS functions take a linearised filesystem block offset, so we have to
0051  * convert the sparse xfs fsb to the right format first.
0052  * VFS types are real funky, too.
0053  */
0054 int
0055 xfs_zero_extent(
0056     struct xfs_inode    *ip,
0057     xfs_fsblock_t       start_fsb,
0058     xfs_off_t       count_fsb)
0059 {
0060     struct xfs_mount    *mp = ip->i_mount;
0061     struct xfs_buftarg  *target = xfs_inode_buftarg(ip);
0062     xfs_daddr_t     sector = xfs_fsb_to_db(ip, start_fsb);
0063     sector_t        block = XFS_BB_TO_FSBT(mp, sector);
0064 
0065     return blkdev_issue_zeroout(target->bt_bdev,
0066         block << (mp->m_super->s_blocksize_bits - 9),
0067         count_fsb << (mp->m_super->s_blocksize_bits - 9),
0068         GFP_NOFS, 0);
0069 }
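/*
 * Worked example (annotation, not part of the original file):
 * blkdev_issue_zeroout() takes 512-byte sectors, so the shifts above
 * convert filesystem blocks to sectors:
 *
 *	sectors = fsblocks << (s_blocksize_bits - 9);
 *
 * e.g. with 4 KiB blocks, zeroing 16 blocks issues a 128-sector
 * (64 KiB) zeroout request.
 */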
0070 
0071 #ifdef CONFIG_XFS_RT
0072 int
0073 xfs_bmap_rtalloc(
0074     struct xfs_bmalloca *ap)
0075 {
0076     struct xfs_mount    *mp = ap->ip->i_mount;
0077     xfs_fileoff_t       orig_offset = ap->offset;
0078     xfs_rtblock_t       rtb;
0079     xfs_extlen_t        prod = 0;  /* product factor for allocators */
0080     xfs_extlen_t        mod = 0;   /* alignment remainder */
0081     xfs_extlen_t        ralen = 0; /* realtime allocation length */
0082     xfs_extlen_t        align;     /* minimum allocation alignment */
0083     xfs_extlen_t        orig_length = ap->length;
0084     xfs_extlen_t        minlen = mp->m_sb.sb_rextsize;
0085     xfs_extlen_t        raminlen;
0086     bool            rtlocked = false;
0087     bool            ignore_locality = false;
0088     int         error;
0089 
0090     align = xfs_get_extsz_hint(ap->ip);
0091 retry:
0092     prod = align / mp->m_sb.sb_rextsize;
0093     error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
0094                     align, 1, ap->eof, 0,
0095                     ap->conv, &ap->offset, &ap->length);
0096     if (error)
0097         return error;
0098     ASSERT(ap->length);
0099     ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
0100 
0101     /*
0102      * If we shifted the file offset downward to satisfy an extent size
0103      * hint, increase minlen by that amount so that the allocator won't
0104      * give us an allocation that's too short to cover at least one of the
0105      * blocks that the caller asked for.
0106      */
0107     if (ap->offset != orig_offset)
0108         minlen += orig_offset - ap->offset;
0109 
0110     /*
0111      * If the offset & length are not perfectly aligned
0112      * then kill prod, it will just get us in trouble.
0113      */
0114     div_u64_rem(ap->offset, align, &mod);
0115     if (mod || ap->length % align)
0116         prod = 1;
0117     /*
0118      * Set ralen to be the actual requested length in rtextents.
0119      */
0120     ralen = ap->length / mp->m_sb.sb_rextsize;
0121     /*
0122      * If the old value was close enough to XFS_MAX_BMBT_EXTLEN that
0123      * we rounded up to it, cut it back so it's valid again.
0124      * Note that if it's a really large request (bigger than
0125      * XFS_MAX_BMBT_EXTLEN), we don't hear about that number, and can't
0126      * adjust the starting point to match it.
0127      */
0128     if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN)
0129         ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize;
0130 
0131     /*
0132      * Lock out modifications to both the RT bitmap and summary inodes
0133      */
0134     if (!rtlocked) {
0135         xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
0136         xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
0137         xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
0138         xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
0139         rtlocked = true;
0140     }
0141 
0142     /*
0143      * If it's an allocation to an empty file at offset 0,
0144      * pick an extent that will space things out in the rt area.
0145      */
0146     if (ap->eof && ap->offset == 0) {
0147         xfs_rtblock_t rtx; /* realtime extent no */
0148 
0149         error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
0150         if (error)
0151             return error;
0152         ap->blkno = rtx * mp->m_sb.sb_rextsize;
0153     } else {
0154         ap->blkno = 0;
0155     }
0156 
0157     xfs_bmap_adjacent(ap);
0158 
0159     /*
0160      * Realtime allocation, done through xfs_rtallocate_extent.
0161      */
0162     if (ignore_locality)
0163         ap->blkno = 0;
0164     else
0165         do_div(ap->blkno, mp->m_sb.sb_rextsize);
0166     rtb = ap->blkno;
0167     ap->length = ralen;
0168     raminlen = max_t(xfs_extlen_t, 1, minlen / mp->m_sb.sb_rextsize);
0169     error = xfs_rtallocate_extent(ap->tp, ap->blkno, raminlen, ap->length,
0170             &ralen, ap->wasdel, prod, &rtb);
0171     if (error)
0172         return error;
0173 
0174     if (rtb != NULLRTBLOCK) {
0175         ap->blkno = rtb * mp->m_sb.sb_rextsize;
0176         ap->length = ralen * mp->m_sb.sb_rextsize;
0177         ap->ip->i_nblocks += ap->length;
0178         xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
0179         if (ap->wasdel)
0180             ap->ip->i_delayed_blks -= ap->length;
0181         /*
0182          * Adjust the disk quota also. This was reserved
0183          * earlier.
0184          */
0185         xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
0186             ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
0187                     XFS_TRANS_DQ_RTBCOUNT, ap->length);
0188         return 0;
0189     }
0190 
0191     if (align > mp->m_sb.sb_rextsize) {
0192         /*
0193          * We previously enlarged the request length to try to satisfy
0194          * an extent size hint.  The allocator didn't return anything,
0195          * so reset the parameters to the original values and try again
0196          * without alignment criteria.
0197          */
0198         ap->offset = orig_offset;
0199         ap->length = orig_length;
0200         minlen = align = mp->m_sb.sb_rextsize;
0201         goto retry;
0202     }
0203 
0204     if (!ignore_locality && ap->blkno != 0) {
0205         /*
0206          * If we can't allocate near a specific rt extent, try again
0207          * without locality criteria.
0208          */
0209         ignore_locality = true;
0210         goto retry;
0211     }
0212 
0213     ap->blkno = NULLFSBLOCK;
0214     ap->length = 0;
0215     return 0;
0216 }
0217 #endif /* CONFIG_XFS_RT */
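/*
 * Worked example for the alignment logic in xfs_bmap_rtalloc() above
 * (annotation, not part of the original file): with sb_rextsize = 4
 * blocks and an extent size hint of align = 16, prod = 16 / 4 = 4, so
 * the allocator is asked for extents that are multiples of four rt
 * extents.  If ap->offset or ap->length is not a multiple of align,
 * prod drops back to 1 and only rt-extent granularity is enforced.
 */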
0218 
0219 /*
0220  * Extent tree block counting routines.
0221  */
0222 
0223 /*
0224  * Count leaf blocks given a range of extent records.  Delayed allocation
0225  * extents are not counted towards the totals.
0226  */
0227 xfs_extnum_t
0228 xfs_bmap_count_leaves(
0229     struct xfs_ifork    *ifp,
0230     xfs_filblks_t       *count)
0231 {
0232     struct xfs_iext_cursor  icur;
0233     struct xfs_bmbt_irec    got;
0234     xfs_extnum_t        numrecs = 0;
0235 
0236     for_each_xfs_iext(ifp, &icur, &got) {
0237         if (!isnullstartblock(got.br_startblock)) {
0238             *count += got.br_blockcount;
0239             numrecs++;
0240         }
0241     }
0242 
0243     return numrecs;
0244 }
0245 
0246 /*
0247  * Count fsblocks of the given fork.  Delayed allocation extents are
0248  * not counted towards the totals.
0249  */
0250 int
0251 xfs_bmap_count_blocks(
0252     struct xfs_trans    *tp,
0253     struct xfs_inode    *ip,
0254     int         whichfork,
0255     xfs_extnum_t        *nextents,
0256     xfs_filblks_t       *count)
0257 {
0258     struct xfs_mount    *mp = ip->i_mount;
0259     struct xfs_ifork    *ifp = xfs_ifork_ptr(ip, whichfork);
0260     struct xfs_btree_cur    *cur;
0261     xfs_extlen_t        btblocks = 0;
0262     int         error;
0263 
0264     *nextents = 0;
0265     *count = 0;
0266 
0267     if (!ifp)
0268         return 0;
0269 
0270     switch (ifp->if_format) {
0271     case XFS_DINODE_FMT_BTREE:
0272         error = xfs_iread_extents(tp, ip, whichfork);
0273         if (error)
0274             return error;
0275 
0276         cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
0277         error = xfs_btree_count_blocks(cur, &btblocks);
0278         xfs_btree_del_cursor(cur, error);
0279         if (error)
0280             return error;
0281 
0282         /*
0283          * xfs_btree_count_blocks includes the root block contained in
0284          * the inode fork in @btblocks, so subtract one because we're
0285          * only interested in allocated disk blocks.
0286          */
0287         *count += btblocks - 1;
0288 
0289         fallthrough;
0290     case XFS_DINODE_FMT_EXTENTS:
0291         *nextents = xfs_bmap_count_leaves(ifp, count);
0292         break;
0293     }
0294 
0295     return 0;
0296 }
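/*
 * Usage sketch (annotation, not part of the original file; variable
 * names are hypothetical):
 *
 *	xfs_extnum_t	nextents;
 *	xfs_filblks_t	count;
 *	int		error;
 *
 *	error = xfs_bmap_count_blocks(tp, ip, XFS_DATA_FORK,
 *			&nextents, &count);
 *
 * On success, count holds the allocated data-fork blocks (bmbt blocks
 * included for btree-format forks) and nextents the number of
 * non-delalloc extent records.
 */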
0297 
0298 static int
0299 xfs_getbmap_report_one(
0300     struct xfs_inode    *ip,
0301     struct getbmapx     *bmv,
0302     struct kgetbmap     *out,
0303     int64_t         bmv_end,
0304     struct xfs_bmbt_irec    *got)
0305 {
0306     struct kgetbmap     *p = out + bmv->bmv_entries;
0307     bool            shared = false;
0308     int         error;
0309 
0310     error = xfs_reflink_trim_around_shared(ip, got, &shared);
0311     if (error)
0312         return error;
0313 
0314     if (isnullstartblock(got->br_startblock) ||
0315         got->br_startblock == DELAYSTARTBLOCK) {
0316         /*
0317          * Delalloc extents that start beyond EOF can occur due to
0318          * speculative EOF allocation when the delalloc extent is larger
0319          * than the largest freespace extent at conversion time.  These
0320          * extents cannot be converted by data writeback, so can exist
0321          * here even if we are not supposed to be finding delalloc
0322          * extents.
0323          */
0324         if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip)))
0325             ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0);
0326 
0327         p->bmv_oflags |= BMV_OF_DELALLOC;
0328         p->bmv_block = -2;
0329     } else {
0330         p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock);
0331     }
0332 
0333     if (got->br_state == XFS_EXT_UNWRITTEN &&
0334         (bmv->bmv_iflags & BMV_IF_PREALLOC))
0335         p->bmv_oflags |= BMV_OF_PREALLOC;
0336 
0337     if (shared)
0338         p->bmv_oflags |= BMV_OF_SHARED;
0339 
0340     p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff);
0341     p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount);
0342 
0343     bmv->bmv_offset = p->bmv_offset + p->bmv_length;
0344     bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
0345     bmv->bmv_entries++;
0346     return 0;
0347 }
0348 
0349 static void
0350 xfs_getbmap_report_hole(
0351     struct xfs_inode    *ip,
0352     struct getbmapx     *bmv,
0353     struct kgetbmap     *out,
0354     int64_t         bmv_end,
0355     xfs_fileoff_t       bno,
0356     xfs_fileoff_t       end)
0357 {
0358     struct kgetbmap     *p = out + bmv->bmv_entries;
0359 
0360     if (bmv->bmv_iflags & BMV_IF_NO_HOLES)
0361         return;
0362 
0363     p->bmv_block = -1;
0364     p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno);
0365     p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno);
0366 
0367     bmv->bmv_offset = p->bmv_offset + p->bmv_length;
0368     bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
0369     bmv->bmv_entries++;
0370 }
0371 
0372 static inline bool
0373 xfs_getbmap_full(
0374     struct getbmapx     *bmv)
0375 {
0376     return bmv->bmv_length == 0 || bmv->bmv_entries >= bmv->bmv_count - 1;
0377 }
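/*
 * Note (annotation, not part of the original file): bmv_count is the
 * size of the caller's getbmapx array including the header element, so
 * at most bmv_count - 1 output records fit; e.g. bmv_count == 33 leaves
 * room for 32 extent records.
 */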
0378 
0379 static bool
0380 xfs_getbmap_next_rec(
0381     struct xfs_bmbt_irec    *rec,
0382     xfs_fileoff_t       total_end)
0383 {
0384     xfs_fileoff_t       end = rec->br_startoff + rec->br_blockcount;
0385 
0386     if (end == total_end)
0387         return false;
0388 
0389     rec->br_startoff += rec->br_blockcount;
0390     if (!isnullstartblock(rec->br_startblock) &&
0391         rec->br_startblock != DELAYSTARTBLOCK)
0392         rec->br_startblock += rec->br_blockcount;
0393     rec->br_blockcount = total_end - end;
0394     return true;
0395 }
0396 
0397 /*
0398  * Get inode's extents as described in bmv, and format for output.
0399  * Fills the out array until all extents in the requested range
0400  * have been reported, or until the passed-in bmv->bmv_count
0401  * slots (less the one reserved for the header record) have
0402  * been filled.
0403  */
0404 int                     /* error code */
0405 xfs_getbmap(
0406     struct xfs_inode    *ip,
0407     struct getbmapx     *bmv,       /* user bmap structure */
0408     struct kgetbmap     *out)
0409 {
0410     struct xfs_mount    *mp = ip->i_mount;
0411     int         iflags = bmv->bmv_iflags;
0412     int         whichfork, lock, error = 0;
0413     int64_t         bmv_end, max_len;
0414     xfs_fileoff_t       bno, first_bno;
0415     struct xfs_ifork    *ifp;
0416     struct xfs_bmbt_irec    got, rec;
0417     xfs_filblks_t       len;
0418     struct xfs_iext_cursor  icur;
0419 
0420     if (bmv->bmv_iflags & ~BMV_IF_VALID)
0421         return -EINVAL;
0422 #ifndef DEBUG
0423     /* Only allow CoW fork queries if we're debugging. */
0424     if (iflags & BMV_IF_COWFORK)
0425         return -EINVAL;
0426 #endif
0427     if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
0428         return -EINVAL;
0429 
0430     if (bmv->bmv_length < -1)
0431         return -EINVAL;
0432     bmv->bmv_entries = 0;
0433     if (bmv->bmv_length == 0)
0434         return 0;
0435 
0436     if (iflags & BMV_IF_ATTRFORK)
0437         whichfork = XFS_ATTR_FORK;
0438     else if (iflags & BMV_IF_COWFORK)
0439         whichfork = XFS_COW_FORK;
0440     else
0441         whichfork = XFS_DATA_FORK;
0442 
0443     xfs_ilock(ip, XFS_IOLOCK_SHARED);
0444     switch (whichfork) {
0445     case XFS_ATTR_FORK:
0446         lock = xfs_ilock_attr_map_shared(ip);
0447         if (!xfs_inode_has_attr_fork(ip))
0448             goto out_unlock_ilock;
0449 
0450         max_len = 1LL << 32;
0451         break;
0452     case XFS_COW_FORK:
0453         lock = XFS_ILOCK_SHARED;
0454         xfs_ilock(ip, lock);
0455 
0456         /* No CoW fork? Just return */
0457         if (!xfs_ifork_ptr(ip, whichfork))
0458             goto out_unlock_ilock;
0459 
0460         if (xfs_get_cowextsz_hint(ip))
0461             max_len = mp->m_super->s_maxbytes;
0462         else
0463             max_len = XFS_ISIZE(ip);
0464         break;
0465     case XFS_DATA_FORK:
0466         if (!(iflags & BMV_IF_DELALLOC) &&
0467             (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_disk_size)) {
0468             error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
0469             if (error)
0470                 goto out_unlock_iolock;
0471 
0472             /*
0473              * Even after flushing the inode, there can still be
0474              * delalloc blocks on the inode beyond EOF due to
0475              * speculative preallocation.  These are not removed
0476              * until the release function is called or the inode
0477              * is inactivated.  Hence we cannot assert here that
0478              * ip->i_delayed_blks == 0.
0479              */
0480         }
0481 
0482         if (xfs_get_extsz_hint(ip) ||
0483             (ip->i_diflags &
0484              (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))
0485             max_len = mp->m_super->s_maxbytes;
0486         else
0487             max_len = XFS_ISIZE(ip);
0488 
0489         lock = xfs_ilock_data_map_shared(ip);
0490         break;
0491     }
0492 
0493     ifp = xfs_ifork_ptr(ip, whichfork);
0494 
0495     switch (ifp->if_format) {
0496     case XFS_DINODE_FMT_EXTENTS:
0497     case XFS_DINODE_FMT_BTREE:
0498         break;
0499     case XFS_DINODE_FMT_LOCAL:
0500         /* Local format inode forks report no extents. */
0501         goto out_unlock_ilock;
0502     default:
0503         error = -EINVAL;
0504         goto out_unlock_ilock;
0505     }
0506 
0507     if (bmv->bmv_length == -1) {
0508         max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len));
0509         bmv->bmv_length = max(0LL, max_len - bmv->bmv_offset);
0510     }
0511 
0512     bmv_end = bmv->bmv_offset + bmv->bmv_length;
0513 
0514     first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset);
0515     len = XFS_BB_TO_FSB(mp, bmv->bmv_length);
0516 
0517     error = xfs_iread_extents(NULL, ip, whichfork);
0518     if (error)
0519         goto out_unlock_ilock;
0520 
0521     if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) {
0522         /*
0523          * Report a whole-file hole if the delalloc flag is set to
0524          * stay compatible with the old implementation.
0525          */
0526         if (iflags & BMV_IF_DELALLOC)
0527             xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
0528                     XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
0529         goto out_unlock_ilock;
0530     }
0531 
0532     while (!xfs_getbmap_full(bmv)) {
0533         xfs_trim_extent(&got, first_bno, len);
0534 
0535         /*
0536          * Report an entry for a hole if this extent doesn't directly
0537          * follow the previous one.
0538          */
0539         if (got.br_startoff > bno) {
0540             xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
0541                     got.br_startoff);
0542             if (xfs_getbmap_full(bmv))
0543                 break;
0544         }
0545 
0546         /*
0547          * In order to report shared extents accurately, we report each
0548          * distinct shared / unshared part of a single bmbt record with
0549          * an individual getbmapx record.
0550          */
0551         bno = got.br_startoff + got.br_blockcount;
0552         rec = got;
0553         do {
0554             error = xfs_getbmap_report_one(ip, bmv, out, bmv_end,
0555                     &rec);
0556             if (error || xfs_getbmap_full(bmv))
0557                 goto out_unlock_ilock;
0558         } while (xfs_getbmap_next_rec(&rec, bno));
0559 
0560         if (!xfs_iext_next_extent(ifp, &icur, &got)) {
0561             xfs_fileoff_t   end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
0562 
0563             out[bmv->bmv_entries - 1].bmv_oflags |= BMV_OF_LAST;
0564 
0565             if (whichfork != XFS_ATTR_FORK && bno < end &&
0566                 !xfs_getbmap_full(bmv)) {
0567                 xfs_getbmap_report_hole(ip, bmv, out, bmv_end,
0568                         bno, end);
0569             }
0570             break;
0571         }
0572 
0573         if (bno >= first_bno + len)
0574             break;
0575     }
0576 
0577 out_unlock_ilock:
0578     xfs_iunlock(ip, lock);
0579 out_unlock_iolock:
0580     xfs_iunlock(ip, XFS_IOLOCK_SHARED);
0581     return error;
0582 }
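/*
 * Userspace usage sketch (annotation, not part of the original file;
 * the header comes from xfsprogs and is assumed here).  Element 0 of
 * the array is the request header, and the kernel fills records after
 * it, as checked by xfs_getbmap_full() above:
 *
 *	#include <xfs/xfs.h>
 *	#include <sys/ioctl.h>
 *	#include <string.h>
 *	#include <stdio.h>
 *
 *	struct getbmapx map[33];	// header + up to 32 records
 *
 *	memset(map, 0, sizeof(map));
 *	map[0].bmv_length = -1;		// -1 means "to end of file"
 *	map[0].bmv_count = 33;		// slots, including the header
 *	map[0].bmv_iflags = BMV_IF_PREALLOC;
 *	if (ioctl(fd, XFS_IOC_GETBMAPX, map) == 0)
 *		for (int i = 1; i <= map[0].bmv_entries; i++)
 *			printf("%lld: %lld (512-byte units)\n",
 *				(long long)map[i].bmv_offset,
 *				(long long)map[i].bmv_length);
 */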
0583 
0584 /*
0585  * Dead simple method of punching delayed allocation blocks from a range in
0586  * the inode.  This will always punch out both the start and end blocks, even
0587  * if the range only partially overlaps them, so it is up to the caller to
0588  * ensure that partial blocks are not passed in.
0589  */
0590 int
0591 xfs_bmap_punch_delalloc_range(
0592     struct xfs_inode    *ip,
0593     xfs_fileoff_t       start_fsb,
0594     xfs_fileoff_t       length)
0595 {
0596     struct xfs_ifork    *ifp = &ip->i_df;
0597     xfs_fileoff_t       end_fsb = start_fsb + length;
0598     struct xfs_bmbt_irec    got, del;
0599     struct xfs_iext_cursor  icur;
0600     int         error = 0;
0601 
0602     ASSERT(!xfs_need_iread_extents(ifp));
0603 
0604     xfs_ilock(ip, XFS_ILOCK_EXCL);
0605     if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
0606         goto out_unlock;
0607 
0608     while (got.br_startoff + got.br_blockcount > start_fsb) {
0609         del = got;
0610         xfs_trim_extent(&del, start_fsb, length);
0611 
0612         /*
0613          * A delete can push the cursor forward. Step back to the
0614          * previous extent on non-delalloc or extents outside the
0615          * target range.
0616          */
0617         if (!del.br_blockcount ||
0618             !isnullstartblock(del.br_startblock)) {
0619             if (!xfs_iext_prev_extent(ifp, &icur, &got))
0620                 break;
0621             continue;
0622         }
0623 
0624         error = xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur,
0625                           &got, &del);
0626         if (error || !xfs_iext_get_extent(ifp, &icur, &got))
0627             break;
0628     }
0629 
0630 out_unlock:
0631     xfs_iunlock(ip, XFS_ILOCK_EXCL);
0632     return error;
0633 }
0634 
0635 /*
0636  * Test whether it is appropriate to check an inode for and free post EOF
0637  * blocks. The 'force' parameter determines whether we should also consider
0638  * regular files that are marked preallocated or append-only.
0639  */
0640 bool
0641 xfs_can_free_eofblocks(
0642     struct xfs_inode    *ip,
0643     bool            force)
0644 {
0645     struct xfs_bmbt_irec    imap;
0646     struct xfs_mount    *mp = ip->i_mount;
0647     xfs_fileoff_t       end_fsb;
0648     xfs_fileoff_t       last_fsb;
0649     int         nimaps = 1;
0650     int         error;
0651 
0652     /*
0653      * Caller must either hold the exclusive IO lock or be inactivating
0654      * the inode, which guarantees there are no other users of the inode.
0655      */
0656     ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL) ||
0657            (VFS_I(ip)->i_state & I_FREEING));
0658 
0659     /* prealloc/delalloc exists only on regular files */
0660     if (!S_ISREG(VFS_I(ip)->i_mode))
0661         return false;
0662 
0663     /*
0664      * Zero-sized files with no cached pages and no delalloc blocks will not
0665      * have speculative prealloc/delalloc blocks to remove.
0666      */
0667     if (VFS_I(ip)->i_size == 0 &&
0668         VFS_I(ip)->i_mapping->nrpages == 0 &&
0669         ip->i_delayed_blks == 0)
0670         return false;
0671 
0672     /* If we haven't read in the extent list, then don't do it now. */
0673     if (xfs_need_iread_extents(&ip->i_df))
0674         return false;
0675 
0676     /*
0677      * Do not free real preallocated or append-only files unless the file
0678      * has delalloc blocks and we are forced to remove them.
0679      */
0680     if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
0681         if (!force || ip->i_delayed_blks == 0)
0682             return false;
0683 
0684     /*
0685      * Do not try to free post-EOF blocks if EOF is beyond the end of the
0686      * range supported by the page cache, because the truncation will loop
0687      * forever.
0688      */
0689     end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
0690     if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1)
0691         end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize);
0692     last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
0693     if (last_fsb <= end_fsb)
0694         return false;
0695 
0696     /*
0697      * Look up the mapping for the first block past EOF.  If we can't find
0698      * it, there's nothing to free.
0699      */
0700     xfs_ilock(ip, XFS_ILOCK_SHARED);
0701     error = xfs_bmapi_read(ip, end_fsb, last_fsb - end_fsb, &imap, &nimaps,
0702             0);
0703     xfs_iunlock(ip, XFS_ILOCK_SHARED);
0704     if (error || nimaps == 0)
0705         return false;
0706 
0707     /*
0708      * If there's a real mapping there or there are delayed allocation
0709      * reservations, then we have post-EOF blocks to try to free.
0710      */
0711     return imap.br_startblock != HOLESTARTBLOCK || ip->i_delayed_blks;
0712 }
0713 
0714 /*
0715  * This is called to free any blocks beyond eof. The caller must hold
0716  * IOLOCK_EXCL unless we are in the inode reclaim path and have the only
0717  * reference to the inode.
0718  */
0719 int
0720 xfs_free_eofblocks(
0721     struct xfs_inode    *ip)
0722 {
0723     struct xfs_trans    *tp;
0724     struct xfs_mount    *mp = ip->i_mount;
0725     int         error;
0726 
0727     /* Attach the dquots to the inode up front. */
0728     error = xfs_qm_dqattach(ip);
0729     if (error)
0730         return error;
0731 
0732     /* Wait on dio to ensure i_size has settled. */
0733     inode_dio_wait(VFS_I(ip));
0734 
0735     error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
0736     if (error) {
0737         ASSERT(xfs_is_shutdown(mp));
0738         return error;
0739     }
0740 
0741     xfs_ilock(ip, XFS_ILOCK_EXCL);
0742     xfs_trans_ijoin(tp, ip, 0);
0743 
0744     /*
0745      * Do not update the on-disk file size.  If we update the on-disk file
0746      * size and then the system crashes before the contents of the file are
0747      * flushed to disk then the files may be full of holes (ie NULL files
0748      * bug).
0749      */
0750     error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK,
0751                 XFS_ISIZE(ip), XFS_BMAPI_NODISCARD);
0752     if (error)
0753         goto err_cancel;
0754 
0755     error = xfs_trans_commit(tp);
0756     if (error)
0757         goto out_unlock;
0758 
0759     xfs_inode_clear_eofblocks_tag(ip);
0760     goto out_unlock;
0761 
0762 err_cancel:
0763     /*
0764      * If we get an error at this point we simply don't
0765      * bother truncating the file.
0766      */
0767     xfs_trans_cancel(tp);
0768 out_unlock:
0769     xfs_iunlock(ip, XFS_ILOCK_EXCL);
0770     return error;
0771 }
0772 
0773 int
0774 xfs_alloc_file_space(
0775     struct xfs_inode    *ip,
0776     xfs_off_t       offset,
0777     xfs_off_t       len)
0778 {
0779     xfs_mount_t     *mp = ip->i_mount;
0780     xfs_off_t       count;
0781     xfs_filblks_t       allocated_fsb;
0782     xfs_filblks_t       allocatesize_fsb;
0783     xfs_extlen_t        extsz, temp;
0784     xfs_fileoff_t       startoffset_fsb;
0785     xfs_fileoff_t       endoffset_fsb;
0786     int         nimaps;
0787     int         rt;
0788     xfs_trans_t     *tp;
0789     xfs_bmbt_irec_t     imaps[1], *imapp;
0790     int         error;
0791 
0792     trace_xfs_alloc_file_space(ip);
0793 
0794     if (xfs_is_shutdown(mp))
0795         return -EIO;
0796 
0797     error = xfs_qm_dqattach(ip);
0798     if (error)
0799         return error;
0800 
0801     if (len <= 0)
0802         return -EINVAL;
0803 
0804     rt = XFS_IS_REALTIME_INODE(ip);
0805     extsz = xfs_get_extsz_hint(ip);
0806 
0807     count = len;
0808     imapp = &imaps[0];
0809     nimaps = 1;
0810     startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
0811     endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
0812     allocatesize_fsb = endoffset_fsb - startoffset_fsb;
0813 
0814     /*
0815      * Allocate file space until done or until there is an error
0816      */
0817     while (allocatesize_fsb && !error) {
0818         xfs_fileoff_t   s, e;
0819         unsigned int    dblocks, rblocks, resblks;
0820 
0821         /*
0822          * Determine space reservations for data/realtime.
0823          */
0824         if (unlikely(extsz)) {
0825             s = startoffset_fsb;
0826             do_div(s, extsz);
0827             s *= extsz;
0828             e = startoffset_fsb + allocatesize_fsb;
0829             div_u64_rem(startoffset_fsb, extsz, &temp);
0830             if (temp)
0831                 e += temp;
0832             div_u64_rem(e, extsz, &temp);
0833             if (temp)
0834                 e += extsz - temp;
0835         } else {
0836             s = 0;
0837             e = allocatesize_fsb;
0838         }
0839 
0840         /*
0841          * The transaction reservation is limited to a 32-bit block
0842          * count, hence we need to limit the number of blocks we are
0843          * trying to reserve to avoid an overflow. We can't allocate
0844          * more than @nimaps extents, and an extent is limited on disk
0845      * to XFS_MAX_BMBT_EXTLEN (21 bits), so use that to enforce the
0846          * limit.
0847          */
0848         resblks = min_t(xfs_fileoff_t, (e - s),
0849                 (XFS_MAX_BMBT_EXTLEN * nimaps));
0850         if (unlikely(rt)) {
0851             dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
0852             rblocks = resblks;
0853         } else {
0854             dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
0855             rblocks = 0;
0856         }
0857 
0858         error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
0859                 dblocks, rblocks, false, &tp);
0860         if (error)
0861             break;
0862 
0863         error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
0864                 XFS_IEXT_ADD_NOSPLIT_CNT);
0865         if (error == -EFBIG)
0866             error = xfs_iext_count_upgrade(tp, ip,
0867                     XFS_IEXT_ADD_NOSPLIT_CNT);
0868         if (error)
0869             goto error;
0870 
0871         error = xfs_bmapi_write(tp, ip, startoffset_fsb,
0872                 allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
0873                 &nimaps);
0874         if (error)
0875             goto error;
0876 
0877         ip->i_diflags |= XFS_DIFLAG_PREALLOC;
0878         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
0879 
0880         error = xfs_trans_commit(tp);
0881         xfs_iunlock(ip, XFS_ILOCK_EXCL);
0882         if (error)
0883             break;
0884 
0885         allocated_fsb = imapp->br_blockcount;
0886 
0887         if (nimaps == 0) {
0888             error = -ENOSPC;
0889             break;
0890         }
0891 
0892         startoffset_fsb += allocated_fsb;
0893         allocatesize_fsb -= allocated_fsb;
0894     }
0895 
0896     return error;
0897 
0898 error:
0899     xfs_trans_cancel(tp);
0900     xfs_iunlock(ip, XFS_ILOCK_EXCL);
0901     return error;
0902 }
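/*
 * Worked example for the extent size hint rounding in
 * xfs_alloc_file_space() above (annotation, not part of the original
 * file): with extsz = 16, startoffset_fsb = 21 and allocatesize_fsb =
 * 10, s is rounded down to 16; e starts at 31, grows by the start
 * remainder 21 % 16 = 5 to 36, then rounds up to the next multiple of
 * extsz, 48.  The reservation therefore covers the hint-aligned range
 * [16, 48), i.e. two full 16-block chunks.
 */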
0903 
0904 static int
0905 xfs_unmap_extent(
0906     struct xfs_inode    *ip,
0907     xfs_fileoff_t       startoffset_fsb,
0908     xfs_filblks_t       len_fsb,
0909     int         *done)
0910 {
0911     struct xfs_mount    *mp = ip->i_mount;
0912     struct xfs_trans    *tp;
0913     uint            resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
0914     int         error;
0915 
0916     error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
0917             false, &tp);
0918     if (error)
0919         return error;
0920 
0921     error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
0922             XFS_IEXT_PUNCH_HOLE_CNT);
0923     if (error == -EFBIG)
0924         error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
0925     if (error)
0926         goto out_trans_cancel;
0927 
0928     error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, done);
0929     if (error)
0930         goto out_trans_cancel;
0931 
0932     error = xfs_trans_commit(tp);
0933 out_unlock:
0934     xfs_iunlock(ip, XFS_ILOCK_EXCL);
0935     return error;
0936 
0937 out_trans_cancel:
0938     xfs_trans_cancel(tp);
0939     goto out_unlock;
0940 }
0941 
0942 /* Caller must first wait for the completion of any pending DIOs if required. */
0943 int
0944 xfs_flush_unmap_range(
0945     struct xfs_inode    *ip,
0946     xfs_off_t       offset,
0947     xfs_off_t       len)
0948 {
0949     struct xfs_mount    *mp = ip->i_mount;
0950     struct inode        *inode = VFS_I(ip);
0951     xfs_off_t       rounding, start, end;
0952     int         error;
0953 
0954     rounding = max_t(xfs_off_t, mp->m_sb.sb_blocksize, PAGE_SIZE);
0955     start = round_down(offset, rounding);
0956     end = round_up(offset + len, rounding) - 1;
0957 
0958     error = filemap_write_and_wait_range(inode->i_mapping, start, end);
0959     if (error)
0960         return error;
0961     truncate_pagecache_range(inode, start, end);
0962     return 0;
0963 }
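/*
 * Worked example (annotation, not part of the original file): with
 * 1 KiB filesystem blocks on a 4 KiB-page machine, rounding =
 * max(1024, PAGE_SIZE) = 4096, so flushing bytes [5000, 6000) writes
 * back and invalidates the page cache over the page-aligned span
 * [4096, 8191].
 */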
0964 
0965 int
0966 xfs_free_file_space(
0967     struct xfs_inode    *ip,
0968     xfs_off_t       offset,
0969     xfs_off_t       len)
0970 {
0971     struct xfs_mount    *mp = ip->i_mount;
0972     xfs_fileoff_t       startoffset_fsb;
0973     xfs_fileoff_t       endoffset_fsb;
0974     int         done = 0, error;
0975 
0976     trace_xfs_free_file_space(ip);
0977 
0978     error = xfs_qm_dqattach(ip);
0979     if (error)
0980         return error;
0981 
0982     if (len <= 0)   /* if nothing being freed */
0983         return 0;
0984 
0985     startoffset_fsb = XFS_B_TO_FSB(mp, offset);
0986     endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
0987 
0988     /* We can only free complete realtime extents. */
0989     if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) {
0990         startoffset_fsb = roundup_64(startoffset_fsb,
0991                          mp->m_sb.sb_rextsize);
0992         endoffset_fsb = rounddown_64(endoffset_fsb,
0993                          mp->m_sb.sb_rextsize);
0994     }
0995 
0996     /*
0997      * Need to zero the stuff we're not freeing, on disk.
0998      */
0999     if (endoffset_fsb > startoffset_fsb) {
1000         while (!done) {
1001             error = xfs_unmap_extent(ip, startoffset_fsb,
1002                     endoffset_fsb - startoffset_fsb, &done);
1003             if (error)
1004                 return error;
1005         }
1006     }
1007 
1008     /*
1009      * Now that we've unmapped all full blocks we'll have to zero out any
1010      * partial block at the beginning and/or end.  xfs_zero_range is smart
1011      * enough to skip any holes, including those we just created, but we
1012      * must take care not to zero beyond EOF and enlarge i_size.
1013      */
1014     if (offset >= XFS_ISIZE(ip))
1015         return 0;
1016     if (offset + len > XFS_ISIZE(ip))
1017         len = XFS_ISIZE(ip) - offset;
1018     error = xfs_zero_range(ip, offset, len, NULL);
1019     if (error)
1020         return error;
1021 
1022     /*
1023      * If we zeroed right up to EOF and EOF straddles a page boundary we
1024      * must make sure that the post-EOF area is also zeroed because the
1025      * page could be mmap'd and xfs_zero_range doesn't do that for us.
1026      * Writeback of the eof page will do this, albeit clumsily.
1027      */
1028     if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) {
1029         error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1030                 round_down(offset + len, PAGE_SIZE), LLONG_MAX);
1031     }
1032 
1033     return error;
1034 }
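/*
 * Worked example (annotation, not part of the original file): on a
 * 4 KiB block filesystem, punching offset = 1000, len = 10000 gives
 * startoffset_fsb = 1 (rounded up) and endoffset_fsb = 2 (rounded
 * down), so only block 1 is unmapped; the partial byte ranges
 * [1000, 4096) and [8192, 11000) are then zeroed by the
 * xfs_zero_range() call above.
 */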
1035 
1036 static int
1037 xfs_prepare_shift(
1038     struct xfs_inode    *ip,
1039     loff_t          offset)
1040 {
1041     struct xfs_mount    *mp = ip->i_mount;
1042     int         error;
1043 
1044     /*
1045      * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
1046      * into the accessible region of the file.
1047      */
1048     if (xfs_can_free_eofblocks(ip, true)) {
1049         error = xfs_free_eofblocks(ip);
1050         if (error)
1051             return error;
1052     }
1053 
1054     /*
1055      * Shift operations must stabilize the start block offset boundary along
1056      * with the full range of the operation. If we don't, a COW writeback
1057      * completion could race with an insert, front merge with the start
1058      * extent (after split) during the shift and corrupt the file. Start
1059      * with the block just prior to the start to stabilize the boundary.
1060      */
1061     offset = round_down(offset, mp->m_sb.sb_blocksize);
1062     if (offset)
1063         offset -= mp->m_sb.sb_blocksize;
1064 
1065     /*
1066      * Writeback and invalidate cache for the remainder of the file as we're
1067      * about to shift down every extent from offset to EOF.
1068      */
1069     error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip));
1070     if (error)
1071         return error;
1072 
1073     /*
1074      * Clean out anything hanging around in the cow fork now that
1075      * we've flushed all the dirty data out to disk to avoid having
1076      * CoW extents at the wrong offsets.
1077      */
1078     if (xfs_inode_has_cow_data(ip)) {
1079         error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF,
1080                 true);
1081         if (error)
1082             return error;
1083     }
1084 
1085     return 0;
1086 }
1087 
1088 /*
1089  * xfs_collapse_file_space()
1090  *  This routine frees disk space and shifts extents for the given file.
1091  *  The first thing we do is free the data blocks in the specified range
1092  *  by calling xfs_free_file_space(), which also syncs dirty data and
1093  *  invalidates the page cache over the region the collapse range is
1094  *  working on. Then extent records are shifted left to cover the hole.
1095  * RETURNS:
1096  *  0 on success
1097  *  errno on error
1098  *
1099  */
1100 int
1101 xfs_collapse_file_space(
1102     struct xfs_inode    *ip,
1103     xfs_off_t       offset,
1104     xfs_off_t       len)
1105 {
1106     struct xfs_mount    *mp = ip->i_mount;
1107     struct xfs_trans    *tp;
1108     int         error;
1109     xfs_fileoff_t       next_fsb = XFS_B_TO_FSB(mp, offset + len);
1110     xfs_fileoff_t       shift_fsb = XFS_B_TO_FSB(mp, len);
1111     bool            done = false;
1112 
1113     ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1114     ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
1115 
1116     trace_xfs_collapse_file_space(ip);
1117 
1118     error = xfs_free_file_space(ip, offset, len);
1119     if (error)
1120         return error;
1121 
1122     error = xfs_prepare_shift(ip, offset);
1123     if (error)
1124         return error;
1125 
1126     error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
1127     if (error)
1128         return error;
1129 
1130     xfs_ilock(ip, XFS_ILOCK_EXCL);
1131     xfs_trans_ijoin(tp, ip, 0);
1132 
1133     while (!done) {
1134         error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb,
1135                 &done);
1136         if (error)
1137             goto out_trans_cancel;
1138         if (done)
1139             break;
1140 
1141         /* finish any deferred frees and roll the transaction */
1142         error = xfs_defer_finish(&tp);
1143         if (error)
1144             goto out_trans_cancel;
1145     }
1146 
1147     error = xfs_trans_commit(tp);
1148     xfs_iunlock(ip, XFS_ILOCK_EXCL);
1149     return error;
1150 
1151 out_trans_cancel:
1152     xfs_trans_cancel(tp);
1153     xfs_iunlock(ip, XFS_ILOCK_EXCL);
1154     return error;
1155 }
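/*
 * Userspace usage sketch (annotation, not part of the original file):
 * a collapse request must be aligned to the filesystem block size,
 * otherwise the kernel returns EINVAL before this function is reached:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *
 *	// drop bytes [1 MiB, 2 MiB) and shift the tail down by 1 MiB
 *	int err = fallocate(fd, FALLOC_FL_COLLAPSE_RANGE,
 *			1024 * 1024, 1024 * 1024);
 */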
1156 
1157 /*
1158  * xfs_insert_file_space()
1159  *  This routine creates hole space by shifting extents for the given file.
1160  *  The first thing we do is sync dirty data and invalidate the page cache
1161  *  over the region the insert range is working on. Then we split an extent
1162  *  in two at the given offset by calling xfs_bmap_split_extent, and shift
1163  *  all extent records lying between [offset, last allocated extent] to
1164  *  the right to make room for the hole.
1165  * RETURNS:
1166  *  0 on success
1167  *  errno on error
1168  */
1169 int
1170 xfs_insert_file_space(
1171     struct xfs_inode    *ip,
1172     loff_t          offset,
1173     loff_t          len)
1174 {
1175     struct xfs_mount    *mp = ip->i_mount;
1176     struct xfs_trans    *tp;
1177     int         error;
1178     xfs_fileoff_t       stop_fsb = XFS_B_TO_FSB(mp, offset);
1179     xfs_fileoff_t       next_fsb = NULLFSBLOCK;
1180     xfs_fileoff_t       shift_fsb = XFS_B_TO_FSB(mp, len);
1181     bool            done = false;
1182 
1183     ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1184     ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
1185 
1186     trace_xfs_insert_file_space(ip);
1187 
1188     error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb);
1189     if (error)
1190         return error;
1191 
1192     error = xfs_prepare_shift(ip, offset);
1193     if (error)
1194         return error;
1195 
1196     error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
1197             XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
1198     if (error)
1199         return error;
1200 
1201     xfs_ilock(ip, XFS_ILOCK_EXCL);
1202     xfs_trans_ijoin(tp, ip, 0);
1203 
1204     error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
1205             XFS_IEXT_PUNCH_HOLE_CNT);
1206     if (error == -EFBIG)
1207         error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
1208     if (error)
1209         goto out_trans_cancel;
1210 
1211     /*
1212      * The extent shifting code works on extent granularity. So, if stop_fsb
1213      * is not the starting block of extent, we need to split the extent at
1214      * stop_fsb.
1215      */
1216     error = xfs_bmap_split_extent(tp, ip, stop_fsb);
1217     if (error)
1218         goto out_trans_cancel;
1219 
1220     do {
1221         error = xfs_defer_finish(&tp);
1222         if (error)
1223             goto out_trans_cancel;
1224 
1225         error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb,
1226                 &done, stop_fsb);
1227         if (error)
1228             goto out_trans_cancel;
1229     } while (!done);
1230 
1231     error = xfs_trans_commit(tp);
1232     xfs_iunlock(ip, XFS_ILOCK_EXCL);
1233     return error;
1234 
1235 out_trans_cancel:
1236     xfs_trans_cancel(tp);
1237     xfs_iunlock(ip, XFS_ILOCK_EXCL);
1238     return error;
1239 }
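/*
 * Userspace usage sketch (annotation, not part of the original file):
 * the inverse of collapse, opening a block-aligned hole at the given
 * offset and shifting the existing extents right:
 *
 *	int err = fallocate(fd, FALLOC_FL_INSERT_RANGE,
 *			1024 * 1024, 1024 * 1024);
 */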
1240 
1241 /*
1242  * We need to check that the format of the data fork in the temporary inode is
1243  * valid for the target inode before doing the swap. This is not a problem with
1244  * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
1245  * data fork depending on the space the attribute fork is taking so we can get
1246  * invalid formats on the target inode.
1247  *
1248  * E.g. target has space for 7 extents in extent format, temp inode only has
1249  * space for 6.  If we defragment down to 7 extents, then the tmp format is a
1250  * btree, but when swapped it needs to be in extent format. Hence we can't just
1251  * blindly swap data forks on attr2 filesystems.
1252  *
1253  * Note that we check the swap in both directions so that we don't end up with
1254  * a corrupt temporary inode, either.
1255  *
1256  * Note that fixing the way xfs_fsr sets up the attribute fork in the source
1257  * inode will prevent this situation from occurring, so all we do here is
1258  * reject and log the attempt. Basically we are putting the responsibility on
1259  * userspace to get this right.
1260  */
1261 static int
1262 xfs_swap_extents_check_format(
1263     struct xfs_inode    *ip,    /* target inode */
1264     struct xfs_inode    *tip)   /* tmp inode */
1265 {
1266     struct xfs_ifork    *ifp = &ip->i_df;
1267     struct xfs_ifork    *tifp = &tip->i_df;
1268 
1269     /* User/group/project quota ids must match if quotas are enforced. */
1270     if (XFS_IS_QUOTA_ON(ip->i_mount) &&
1271         (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) ||
1272          !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) ||
1273          ip->i_projid != tip->i_projid))
1274         return -EINVAL;
1275 
1276     /* Should never get a local format */
1277     if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
1278         tifp->if_format == XFS_DINODE_FMT_LOCAL)
1279         return -EINVAL;
1280 
1281     /*
1282      * If the target inode has fewer extents than the temporary inode,
1283      * why did userspace call us?
1284      */
1285     if (ifp->if_nextents < tifp->if_nextents)
1286         return -EINVAL;
1287 
1288     /*
1289      * If we have to use the (expensive) rmap swap method, we can
1290      * handle any number of extents and any format.
1291      */
1292     if (xfs_has_rmapbt(ip->i_mount))
1293         return 0;
1294 
1295     /*
1296      * If the target inode is in extent form and the temp inode is in btree
1297      * form then we will end up with the target inode in the wrong format,
1298      * as we already know there are fewer extents in the temp inode.
1299      */
1300     if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
1301         tifp->if_format == XFS_DINODE_FMT_BTREE)
1302         return -EINVAL;
1303 
1304     /* Check temp in extent form to max in target */
1305     if (tifp->if_format == XFS_DINODE_FMT_EXTENTS &&
1306         tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1307         return -EINVAL;
1308 
1309     /* Check target in extent form to max in temp */
1310     if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
1311         ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1312         return -EINVAL;
1313 
1314     /*
1315      * If we are in a btree format, check that the temp root block will fit
1316      * in the target and that it has enough extents to be in btree format
1317      * in the target.
1318      *
1319      * Note that we have to be careful to allow btree->extent conversions
1320      * (a common defrag case) which will occur when the temp inode is in
1321      * extent format...
1322      */
1323     if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
1324         if (xfs_inode_has_attr_fork(ip) &&
1325             XFS_BMAP_BMDR_SPACE(tifp->if_broot) > xfs_inode_fork_boff(ip))
1326             return -EINVAL;
1327         if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1328             return -EINVAL;
1329     }
1330 
1331     /* Reciprocal target->temp btree format checks */
1332     if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
1333         if (xfs_inode_has_attr_fork(tip) &&
1334             XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
1335             return -EINVAL;
1336         if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1337             return -EINVAL;
1338     }
1339 
1340     return 0;
1341 }
1342 
1343 static int
1344 xfs_swap_extent_flush(
1345     struct xfs_inode    *ip)
1346 {
1347     int error;
1348 
1349     error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
1350     if (error)
1351         return error;
1352     truncate_pagecache_range(VFS_I(ip), 0, -1);
1353 
1354     /* Verify O_DIRECT for ftmp */
1355     if (VFS_I(ip)->i_mapping->nrpages)
1356         return -EINVAL;
1357     return 0;
1358 }
1359 
1360 /*
1361  * Move extents from one file to another, when rmap is enabled.
1362  */
1363 STATIC int
1364 xfs_swap_extent_rmap(
1365     struct xfs_trans        **tpp,
1366     struct xfs_inode        *ip,
1367     struct xfs_inode        *tip)
1368 {
1369     struct xfs_trans        *tp = *tpp;
1370     struct xfs_bmbt_irec        irec;
1371     struct xfs_bmbt_irec        uirec;
1372     struct xfs_bmbt_irec        tirec;
1373     xfs_fileoff_t           offset_fsb;
1374     xfs_fileoff_t           end_fsb;
1375     xfs_filblks_t           count_fsb;
1376     int             error;
1377     xfs_filblks_t           ilen;
1378     xfs_filblks_t           rlen;
1379     int             nimaps;
1380     uint64_t            tip_flags2;
1381 
1382     /*
1383      * If the source file has shared blocks, we must flag the donor
1384      * file as having shared blocks so that we get the shared-block
1385      * rmap functions when we go to fix up the rmaps.  The flags
1386      * will be switched for real later.
1387      */
1388     tip_flags2 = tip->i_diflags2;
1389     if (ip->i_diflags2 & XFS_DIFLAG2_REFLINK)
1390         tip->i_diflags2 |= XFS_DIFLAG2_REFLINK;
1391 
1392     offset_fsb = 0;
1393     end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip)));
1394     count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
1395 
1396     while (count_fsb) {
1397         /* Read extent from the donor file */
1398         nimaps = 1;
1399         error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec,
1400                 &nimaps, 0);
1401         if (error)
1402             goto out;
1403         ASSERT(nimaps == 1);
1404         ASSERT(tirec.br_startblock != DELAYSTARTBLOCK);
1405 
1406         trace_xfs_swap_extent_rmap_remap(tip, &tirec);
1407         ilen = tirec.br_blockcount;
1408 
1409         /* Unmap the old blocks in the source file. */
1410         while (tirec.br_blockcount) {
1411             ASSERT(tp->t_firstblock == NULLFSBLOCK);
1412             trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec);
1413 
1414             /* Read extent from the source file */
1415             nimaps = 1;
1416             error = xfs_bmapi_read(ip, tirec.br_startoff,
1417                     tirec.br_blockcount, &irec,
1418                     &nimaps, 0);
1419             if (error)
1420                 goto out;
1421             ASSERT(nimaps == 1);
1422             ASSERT(tirec.br_startoff == irec.br_startoff);
1423             trace_xfs_swap_extent_rmap_remap_piece(ip, &irec);
1424 
1425             /* Trim the extent. */
1426             uirec = tirec;
1427             uirec.br_blockcount = rlen = min_t(xfs_filblks_t,
1428                     tirec.br_blockcount,
1429                     irec.br_blockcount);
1430             trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
1431 
1432             if (xfs_bmap_is_real_extent(&uirec)) {
1433                 error = xfs_iext_count_may_overflow(ip,
1434                         XFS_DATA_FORK,
1435                         XFS_IEXT_SWAP_RMAP_CNT);
1436                 if (error == -EFBIG)
1437                     error = xfs_iext_count_upgrade(tp, ip,
1438                             XFS_IEXT_SWAP_RMAP_CNT);
1439                 if (error)
1440                     goto out;
1441             }
1442 
1443             if (xfs_bmap_is_real_extent(&irec)) {
1444                 error = xfs_iext_count_may_overflow(tip,
1445                         XFS_DATA_FORK,
1446                         XFS_IEXT_SWAP_RMAP_CNT);
1447                 if (error == -EFBIG)
1448                     error = xfs_iext_count_upgrade(tp, ip,
1449                             XFS_IEXT_SWAP_RMAP_CNT);
1450                 if (error)
1451                     goto out;
1452             }
1453 
1454             /* Remove the mapping from the donor file. */
1455             xfs_bmap_unmap_extent(tp, tip, &uirec);
1456 
1457             /* Remove the mapping from the source file. */
1458             xfs_bmap_unmap_extent(tp, ip, &irec);
1459 
1460             /* Map the donor file's blocks into the source file. */
1461             xfs_bmap_map_extent(tp, ip, &uirec);
1462 
1463             /* Map the source file's blocks into the donor file. */
1464             xfs_bmap_map_extent(tp, tip, &irec);
1465 
1466             error = xfs_defer_finish(tpp);
1467             tp = *tpp;
1468             if (error)
1469                 goto out;
1470 
1471             tirec.br_startoff += rlen;
1472             if (tirec.br_startblock != HOLESTARTBLOCK &&
1473                 tirec.br_startblock != DELAYSTARTBLOCK)
1474                 tirec.br_startblock += rlen;
1475             tirec.br_blockcount -= rlen;
1476         }
1477 
1478         /* Roll on... */
1479         count_fsb -= ilen;
1480         offset_fsb += ilen;
1481     }
1482 
1483     tip->i_diflags2 = tip_flags2;
1484     return 0;
1485 
1486 out:
1487     trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_);
1488     tip->i_diflags2 = tip_flags2;
1489     return error;
1490 }
1491 
1492 /* Swap the extents of two files by swapping data forks. */
1493 STATIC int
1494 xfs_swap_extent_forks(
1495     struct xfs_trans    *tp,
1496     struct xfs_inode    *ip,
1497     struct xfs_inode    *tip,
1498     int         *src_log_flags,
1499     int         *target_log_flags)
1500 {
1501     xfs_filblks_t       aforkblks = 0;
1502     xfs_filblks_t       taforkblks = 0;
1503     xfs_extnum_t        junk;
1504     uint64_t        tmp;
1505     int         error;
1506 
1507     /*
1508      * Count the number of extended attribute blocks
1509      */
1510     if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 &&
1511         ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
1512         error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk,
1513                 &aforkblks);
1514         if (error)
1515             return error;
1516     }
1517     if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 &&
1518         tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
1519         error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk,
1520                 &taforkblks);
1521         if (error)
1522             return error;
1523     }
1524 
1525     /*
1526      * Btree format (v3) inodes have the inode number stamped in the bmbt
1527      * block headers. We can't start changing the bmbt blocks until the
1528      * inode owner change is logged so recovery does the right thing in the
1529      * event of a crash. Set the owner change log flags now and leave the
1530      * bmbt scan as the last step.
1531      */
1532     if (xfs_has_v3inodes(ip->i_mount)) {
1533         if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
1534             (*target_log_flags) |= XFS_ILOG_DOWNER;
1535         if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE)
1536             (*src_log_flags) |= XFS_ILOG_DOWNER;
1537     }
1538 
1539     /*
1540      * Swap the data forks of the inodes
1541      */
1542     swap(ip->i_df, tip->i_df);
1543 
1544     /*
1545      * Fix the on-disk inode values
1546      */
1547     tmp = (uint64_t)ip->i_nblocks;
1548     ip->i_nblocks = tip->i_nblocks - taforkblks + aforkblks;
1549     tip->i_nblocks = tmp + taforkblks - aforkblks;
1550 
1551     /*
1552      * The extents in the source inode could still contain speculative
1553      * preallocation beyond EOF (e.g. the file is open but not modified
1554      * while defrag is in progress). In that case, we need to copy over the
1555      * number of delalloc blocks the data fork in the source inode is
1556      * tracking beyond EOF so that when the fork is truncated away when the
1557      * temporary inode is unlinked we don't underrun the i_delayed_blks
1558      * counter on that inode.
1559      */
1560     ASSERT(tip->i_delayed_blks == 0);
1561     tip->i_delayed_blks = ip->i_delayed_blks;
1562     ip->i_delayed_blks = 0;
1563 
1564     switch (ip->i_df.if_format) {
1565     case XFS_DINODE_FMT_EXTENTS:
1566         (*src_log_flags) |= XFS_ILOG_DEXT;
1567         break;
1568     case XFS_DINODE_FMT_BTREE:
1569         ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
1570                (*src_log_flags & XFS_ILOG_DOWNER));
1571         (*src_log_flags) |= XFS_ILOG_DBROOT;
1572         break;
1573     }
1574 
1575     switch (tip->i_df.if_format) {
1576     case XFS_DINODE_FMT_EXTENTS:
1577         (*target_log_flags) |= XFS_ILOG_DEXT;
1578         break;
1579     case XFS_DINODE_FMT_BTREE:
1580         (*target_log_flags) |= XFS_ILOG_DBROOT;
1581         ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
1582                (*target_log_flags & XFS_ILOG_DOWNER));
1583         break;
1584     }
1585 
1586     return 0;
1587 }
1588 
1589 /*
1590  * Fix up the owners of the bmbt blocks to refer to the current inode. The
1591  * change owner scan attempts to order all modified buffers in the current
1592  * transaction. In the event of ordered buffer failure, the offending buffer is
1593  * physically logged as a fallback and the scan returns -EAGAIN. We must roll
1594  * the transaction in this case to replenish the fallback log reservation and
1595  * restart the scan. This process repeats until the scan completes.
1596  */
1597 static int
1598 xfs_swap_change_owner(
1599     struct xfs_trans    **tpp,
1600     struct xfs_inode    *ip,
1601     struct xfs_inode    *tmpip)
1602 {
1603     int         error;
1604     struct xfs_trans    *tp = *tpp;
1605 
1606     do {
1607         error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
1608                           NULL);
1609         /* success or fatal error */
1610         if (error != -EAGAIN)
1611             break;
1612 
1613         error = xfs_trans_roll(tpp);
1614         if (error)
1615             break;
1616         tp = *tpp;
1617 
1618         /*
1619          * Redirty both inodes so they can relog and keep the log tail
1620          * moving forward.
1621          */
1622         xfs_trans_ijoin(tp, ip, 0);
1623         xfs_trans_ijoin(tp, tmpip, 0);
1624         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1625         xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
1626     } while (true);
1627 
1628     return error;
1629 }
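
/*
 * Note on the call sites below: xfs_swap_extents() invokes this helper once
 * per fork that changed hands, first as (&tp, ip, tip) to stamp ip->i_ino
 * into the bmbt that ip received, then as (&tp, tip, ip) for the reverse.
 * The third argument exists only so the other inode stays joined to the
 * transaction and relogged across rolls.
 */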
1630 
1631 int
1632 xfs_swap_extents(
1633     struct xfs_inode    *ip,    /* target inode */
1634     struct xfs_inode    *tip,   /* tmp inode */
1635     struct xfs_swapext  *sxp)
1636 {
1637     struct xfs_mount    *mp = ip->i_mount;
1638     struct xfs_trans    *tp;
1639     struct xfs_bstat    *sbp = &sxp->sx_stat;
1640     int         src_log_flags, target_log_flags;
1641     int         error = 0;
1642     uint64_t        f;
1643     int         resblks = 0;
1644     unsigned int        flags = 0;
1645 
1646     /*
1647      * Lock the inodes against other IO, page faults and truncate to
1648      * begin with.  Then we can safely ensure the inodes are flushed and
1649      * have no page cache.  Once we have done this we can take the ilocks
1650      * and do the rest of the checks.
1651      */
1652     lock_two_nondirectories(VFS_I(ip), VFS_I(tip));
1653     filemap_invalidate_lock_two(VFS_I(ip)->i_mapping,
1654                     VFS_I(tip)->i_mapping);
1655 
1656     /* Verify that both files have the same format */
1657     if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
1658         error = -EINVAL;
1659         goto out_unlock;
1660     }
1661 
1662     /* Verify both files are either real-time or non-realtime */
1663     if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
1664         error = -EINVAL;
1665         goto out_unlock;
1666     }
1667 
1668     error = xfs_qm_dqattach(ip);
1669     if (error)
1670         goto out_unlock;
1671 
1672     error = xfs_qm_dqattach(tip);
1673     if (error)
1674         goto out_unlock;
1675 
1676     error = xfs_swap_extent_flush(ip);
1677     if (error)
1678         goto out_unlock;
1679     error = xfs_swap_extent_flush(tip);
1680     if (error)
1681         goto out_unlock;
1682 
1683     if (xfs_inode_has_cow_data(tip)) {
1684         error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true);
1685         if (error)
1686             goto out_unlock;
1687     }
1688 
1689     /*
1690      * Extent "swapping" with rmap requires a permanent reservation and
1691      * a block reservation because it's really just a remap operation
1692      * performed with log redo items!
1693      */
1694     if (xfs_has_rmapbt(mp)) {
1695         int     w = XFS_DATA_FORK;
1696         uint32_t    ipnext = ip->i_df.if_nextents;
1697         uint32_t    tipnext = tip->i_df.if_nextents;
1698 
1699         /*
1700          * Conceptually this shouldn't affect the shape of either bmbt,
1701          * but since we atomically move extents one by one, we reserve
1702          * enough space to rebuild both trees.
1703          */
1704         resblks = XFS_SWAP_RMAP_SPACE_RES(mp, ipnext, w);
1705         resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w);
1706 
1707         /*
1708          * If either inode straddles a bmapbt block allocation boundary,
1709          * the rmapbt algorithm triggers repeated allocs and frees as
1710          * extents are remapped. This can exhaust the block reservation
1711          * prematurely and cause shutdown. Return freed blocks to the
1712          * transaction reservation to counter this behavior.
1713          */
1714         flags |= XFS_TRANS_RES_FDBLKS;
1715     }
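
    /*
     * For a sense of scale (illustrative, not from the source): the inode
     * with the larger extent count dominates resblks here, since each side
     * is reserved as if its bmbt had to be rebuilt record by record while
     * xfs_swap_extent_rmap() moves extents one at a time.
     */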
1716     error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags,
1717                 &tp);
1718     if (error)
1719         goto out_unlock;
1720 
1721     /*
1722      * Lock and join the inodes to the transaction so that transaction commit
1723      * or cancel will unlock the inodes from this point onwards.
1724      */
1725     xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL);
1726     xfs_trans_ijoin(tp, ip, 0);
1727     xfs_trans_ijoin(tp, tip, 0);
1728 
1729 
1730     /* Verify all data are being swapped */
1731     if (sxp->sx_offset != 0 ||
1732         sxp->sx_length != ip->i_disk_size ||
1733         sxp->sx_length != tip->i_disk_size) {
1734         error = -EFAULT;
1735         goto out_trans_cancel;
1736     }
1737 
1738     trace_xfs_swap_extent_before(ip, 0);
1739     trace_xfs_swap_extent_before(tip, 1);
1740 
1741     /* check inode formats now that data is flushed */
1742     error = xfs_swap_extents_check_format(ip, tip);
1743     if (error) {
1744         xfs_notice(mp,
1745             "%s: inode 0x%llx format is incompatible for exchanging.",
1746                 __func__, ip->i_ino);
1747         goto out_trans_cancel;
1748     }
1749 
1750     /*
1751      * Compare the current change & modify times with those
1752      * passed in.  If they differ, we abort this swap.
1753      * This is the mechanism used to assure the calling
1754      * process that the file was not changed out from
1755      * under it.
1756      */
1757     if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
1758         (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
1759         (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
1760         (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
1761         error = -EBUSY;
1762         goto out_trans_cancel;
1763     }
1764 
1765     /*
1766      * Note the trickiness in setting the log flags - we set the owner log
1767      * flag on the opposite inode (i.e. the inode we are setting the new
1768      * owner to be) because once we swap the forks and log that, log
1769      * recovery is going to see the fork as owned by the swapped inode,
1770      * not the pre-swapped inodes.
1771      */
1772     src_log_flags = XFS_ILOG_CORE;
1773     target_log_flags = XFS_ILOG_CORE;
1774 
1775     if (xfs_has_rmapbt(mp))
1776         error = xfs_swap_extent_rmap(&tp, ip, tip);
1777     else
1778         error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags,
1779                 &target_log_flags);
1780     if (error)
1781         goto out_trans_cancel;
1782 
1783     /* Do we have to swap reflink flags? */
1784     if ((ip->i_diflags2 & XFS_DIFLAG2_REFLINK) ^
1785         (tip->i_diflags2 & XFS_DIFLAG2_REFLINK)) {
1786         f = ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
1787         ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1788         ip->i_diflags2 |= tip->i_diflags2 & XFS_DIFLAG2_REFLINK;
1789         tip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1790         tip->i_diflags2 |= f & XFS_DIFLAG2_REFLINK;
1791     }
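
    /*
     * Worked through for the case where only ip has the flag set: f saves
     * ip's set bit, ip then takes tip's clear bit, and tip takes the saved
     * set bit from f, so the two inodes end up with each other's original
     * flag.  When the flags already match, the XOR test above skips the
     * whole exchange.
     */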
1792 
1793     /* Swap the cow forks. */
1794     if (xfs_has_reflink(mp)) {
1795         ASSERT(!ip->i_cowfp ||
1796                ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
1797         ASSERT(!tip->i_cowfp ||
1798                tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
1799 
1800         swap(ip->i_cowfp, tip->i_cowfp);
1801 
1802         if (ip->i_cowfp && ip->i_cowfp->if_bytes)
1803             xfs_inode_set_cowblocks_tag(ip);
1804         else
1805             xfs_inode_clear_cowblocks_tag(ip);
1806         if (tip->i_cowfp && tip->i_cowfp->if_bytes)
1807             xfs_inode_set_cowblocks_tag(tip);
1808         else
1809             xfs_inode_clear_cowblocks_tag(tip);
1810     }
1811 
1812     xfs_trans_log_inode(tp, ip,  src_log_flags);
1813     xfs_trans_log_inode(tp, tip, target_log_flags);
1814 
1815     /*
1816      * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
1817      * have inode number owner values in the bmbt blocks that still refer to
1818      * the old inode. Scan each bmbt to fix up the owner values with the
1819      * inode number of the current inode.
1820      */
1821     if (src_log_flags & XFS_ILOG_DOWNER) {
1822         error = xfs_swap_change_owner(&tp, ip, tip);
1823         if (error)
1824             goto out_trans_cancel;
1825     }
1826     if (target_log_flags & XFS_ILOG_DOWNER) {
1827         error = xfs_swap_change_owner(&tp, tip, ip);
1828         if (error)
1829             goto out_trans_cancel;
1830     }
1831 
1832     /*
1833      * If this is a synchronous mount, make sure that the
1834      * transaction goes to disk before returning to the user.
1835      */
1836     if (xfs_has_wsync(mp))
1837         xfs_trans_set_sync(tp);
1838 
1839     error = xfs_trans_commit(tp);
1840 
1841     trace_xfs_swap_extent_after(ip, 0);
1842     trace_xfs_swap_extent_after(tip, 1);
1843 
1844 out_unlock_ilock:
1845     xfs_iunlock(ip, XFS_ILOCK_EXCL);
1846     xfs_iunlock(tip, XFS_ILOCK_EXCL);
1847 out_unlock:
1848     filemap_invalidate_unlock_two(VFS_I(ip)->i_mapping,
1849                       VFS_I(tip)->i_mapping);
1850     unlock_two_nondirectories(VFS_I(ip), VFS_I(tip));
1851     return error;
1852 
1853 out_trans_cancel:
1854     xfs_trans_cancel(tp);
1855     goto out_unlock_ilock;
1856 }
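
/*
 * Illustrative userspace sketch (not part of this file): one plausible way
 * to drive xfs_swap_extents() through the XFS_IOC_SWAPEXT ioctl, loosely
 * modelled on what the xfs_fsr defragmenter does.  Assumes the xfsprogs
 * headers; swap_whole_files() and both path parameters are hypothetical
 * names, and error handling is minimal.  Note the constraints enforced
 * above: sx_offset must be 0, sx_length must equal both files' on-disk
 * sizes, and sx_stat must hold a pre-swap bulkstat of the target or the
 * ioctl fails with EBUSY because the file changed underneath the caller.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <xfs/xfs.h>	/* xfsprogs: XFS_IOC_SWAPEXT, XFS_IOC_FSBULKSTAT_SINGLE */

int
swap_whole_files(
	const char	*target,	/* hypothetical: file being defragged */
	const char	*tmp)		/* hypothetical: donor file of equal size */
{
	struct xfs_swapext	sx;
	struct xfs_fsop_bulkreq	breq;
	struct stat		st;
	__u64			ino;
	int			tfd, dfd;

	tfd = open(target, O_RDWR);
	dfd = open(tmp, O_RDWR);
	if (tfd < 0 || dfd < 0 || fstat(tfd, &st) < 0)
		return -1;

	memset(&sx, 0, sizeof(sx));
	sx.sx_version = XFS_SX_VERSION;
	sx.sx_fdtarget = tfd;
	sx.sx_fdtmp = dfd;
	sx.sx_offset = 0;		/* the kernel only allows whole-file swaps */
	sx.sx_length = st.st_size;	/* must match both files' sizes */

	/* Snapshot the target's bstat so the kernel can detect races. */
	ino = st.st_ino;
	memset(&breq, 0, sizeof(breq));
	breq.lastip = &ino;
	breq.icount = 1;
	breq.ubuffer = &sx.sx_stat;
	if (ioctl(tfd, XFS_IOC_FSBULKSTAT_SINGLE, &breq) < 0)
		return -1;

	/* Atomically exchange the two data forks. */
	if (ioctl(tfd, XFS_IOC_SWAPEXT, &sx) < 0) {
		perror("XFS_IOC_SWAPEXT");
		return -1;
	}
	return 0;
}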