0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
0004  * Copyright (c) 2016-2018 Christoph Hellwig.
0005  * All Rights Reserved.
0006  */
0007 #include "xfs.h"
0008 #include "xfs_fs.h"
0009 #include "xfs_shared.h"
0010 #include "xfs_format.h"
0011 #include "xfs_log_format.h"
0012 #include "xfs_trans_resv.h"
0013 #include "xfs_mount.h"
0014 #include "xfs_inode.h"
0015 #include "xfs_btree.h"
0016 #include "xfs_bmap_btree.h"
0017 #include "xfs_bmap.h"
0018 #include "xfs_bmap_util.h"
0019 #include "xfs_errortag.h"
0020 #include "xfs_error.h"
0021 #include "xfs_trans.h"
0022 #include "xfs_trans_space.h"
0023 #include "xfs_inode_item.h"
0024 #include "xfs_iomap.h"
0025 #include "xfs_trace.h"
0026 #include "xfs_quota.h"
0027 #include "xfs_dquot_item.h"
0028 #include "xfs_dquot.h"
0029 #include "xfs_reflink.h"
0030 
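/*
 * Round an offset down to the allocation size granularity used for
 * speculative preallocation (1 << m_allocsize_log units).
 */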
0031 #define XFS_ALLOC_ALIGN(mp, off) \
0032     (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
0033 
0034 static int
0035 xfs_alert_fsblock_zero(
0036     xfs_inode_t *ip,
0037     xfs_bmbt_irec_t *imap)
0038 {
0039     xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
0040             "Access to block zero in inode %llu "
0041             "start_block: %llx start_off: %llx "
0042             "blkcnt: %llx extent-state: %x",
0043         (unsigned long long)ip->i_ino,
0044         (unsigned long long)imap->br_startblock,
0045         (unsigned long long)imap->br_startoff,
0046         (unsigned long long)imap->br_blockcount,
0047         imap->br_state);
0048     return -EFSCORRUPTED;
0049 }
0050 
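/*
 * Translate an xfs_bmbt_irec block mapping into the struct iomap consumed by
 * the generic iomap code: holes and delalloc reservations are reported with
 * IOMAP_NULL_ADDR, real extents with a disk (or DAX) byte address, and
 * unwritten extents as IOMAP_UNWRITTEN.  IOMAP_F_DIRTY is set if the pinned
 * inode still has non-timestamp changes that an O_DSYNC write would need to
 * flush.
 */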
0051 int
0052 xfs_bmbt_to_iomap(
0053     struct xfs_inode    *ip,
0054     struct iomap        *iomap,
0055     struct xfs_bmbt_irec    *imap,
0056     unsigned int        mapping_flags,
0057     u16         iomap_flags)
0058 {
0059     struct xfs_mount    *mp = ip->i_mount;
0060     struct xfs_buftarg  *target = xfs_inode_buftarg(ip);
0061 
0062     if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
0063         return xfs_alert_fsblock_zero(ip, imap);
0064 
0065     if (imap->br_startblock == HOLESTARTBLOCK) {
0066         iomap->addr = IOMAP_NULL_ADDR;
0067         iomap->type = IOMAP_HOLE;
0068     } else if (imap->br_startblock == DELAYSTARTBLOCK ||
0069            isnullstartblock(imap->br_startblock)) {
0070         iomap->addr = IOMAP_NULL_ADDR;
0071         iomap->type = IOMAP_DELALLOC;
0072     } else {
0073         iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock));
0074         if (mapping_flags & IOMAP_DAX)
0075             iomap->addr += target->bt_dax_part_off;
0076 
0077         if (imap->br_state == XFS_EXT_UNWRITTEN)
0078             iomap->type = IOMAP_UNWRITTEN;
0079         else
0080             iomap->type = IOMAP_MAPPED;
0081 
0082     }
0083     iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
0084     iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
0085     if (mapping_flags & IOMAP_DAX)
0086         iomap->dax_dev = target->bt_daxdev;
0087     else
0088         iomap->bdev = target->bt_bdev;
0089     iomap->flags = iomap_flags;
0090 
0091     if (xfs_ipincount(ip) &&
0092         (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
0093         iomap->flags |= IOMAP_F_DIRTY;
0094     return 0;
0095 }
0096 
0097 static void
0098 xfs_hole_to_iomap(
0099     struct xfs_inode    *ip,
0100     struct iomap        *iomap,
0101     xfs_fileoff_t       offset_fsb,
0102     xfs_fileoff_t       end_fsb)
0103 {
0104     struct xfs_buftarg  *target = xfs_inode_buftarg(ip);
0105 
0106     iomap->addr = IOMAP_NULL_ADDR;
0107     iomap->type = IOMAP_HOLE;
0108     iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb);
0109     iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb);
0110     iomap->bdev = target->bt_bdev;
0111     iomap->dax_dev = target->bt_daxdev;
0112 }
0113 
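/*
 * Convert a byte offset and count into the (exclusive) end block of the
 * mapping, clamped to the largest file size the superblock supports.
 */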
0114 static inline xfs_fileoff_t
0115 xfs_iomap_end_fsb(
0116     struct xfs_mount    *mp,
0117     loff_t          offset,
0118     loff_t          count)
0119 {
0120     ASSERT(offset <= mp->m_super->s_maxbytes);
0121     return min(XFS_B_TO_FSB(mp, offset + count),
0122            XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
0123 }
0124 
0125 static xfs_extlen_t
0126 xfs_eof_alignment(
0127     struct xfs_inode    *ip)
0128 {
0129     struct xfs_mount    *mp = ip->i_mount;
0130     xfs_extlen_t        align = 0;
0131 
0132     if (!XFS_IS_REALTIME_INODE(ip)) {
0133         /*
0134          * Round up the allocation request to a stripe unit
0135          * (m_dalign) boundary if the file size is >= stripe unit
0136          * size, and we are allocating past the allocation eof.
0137          *
0138          * If mounted with the "-o swalloc" option the alignment is
0139          * increased from the stripe unit size to the stripe width.
0140          */
0141         if (mp->m_swidth && xfs_has_swalloc(mp))
0142             align = mp->m_swidth;
0143         else if (mp->m_dalign)
0144             align = mp->m_dalign;
0145 
0146         if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align))
0147             align = 0;
0148     }
0149 
0150     return align;
0151 }
0152 
0153 /*
0154  * Check if last_fsb is outside the last extent, and if so grow it to the next
0155  * stripe unit boundary.
0156  */
0157 xfs_fileoff_t
0158 xfs_iomap_eof_align_last_fsb(
0159     struct xfs_inode    *ip,
0160     xfs_fileoff_t       end_fsb)
0161 {
0162     struct xfs_ifork    *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
0163     xfs_extlen_t        extsz = xfs_get_extsz_hint(ip);
0164     xfs_extlen_t        align = xfs_eof_alignment(ip);
0165     struct xfs_bmbt_irec    irec;
0166     struct xfs_iext_cursor  icur;
0167 
0168     ASSERT(!xfs_need_iread_extents(ifp));
0169 
0170     /*
0171      * Always round up the allocation request to the extent hint boundary.
0172      */
0173     if (extsz) {
0174         if (align)
0175             align = roundup_64(align, extsz);
0176         else
0177             align = extsz;
0178     }
0179 
0180     if (align) {
0181         xfs_fileoff_t   aligned_end_fsb = roundup_64(end_fsb, align);
0182 
0183         xfs_iext_last(ifp, &icur);
0184         if (!xfs_iext_get_extent(ifp, &icur, &irec) ||
0185             aligned_end_fsb >= irec.br_startoff + irec.br_blockcount)
0186             return aligned_end_fsb;
0187     }
0188 
0189     return end_fsb;
0190 }
0191 
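/*
 * Allocate real blocks for a direct I/O or DAX write that hit a hole or a
 * delalloc reservation.  The resulting mapping for the start of the range is
 * returned in *imap; for DAX the blocks are zeroed and converted to written
 * rather than left unwritten.
 */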
0192 int
0193 xfs_iomap_write_direct(
0194     struct xfs_inode    *ip,
0195     xfs_fileoff_t       offset_fsb,
0196     xfs_fileoff_t       count_fsb,
0197     unsigned int        flags,
0198     struct xfs_bmbt_irec    *imap)
0199 {
0200     struct xfs_mount    *mp = ip->i_mount;
0201     struct xfs_trans    *tp;
0202     xfs_filblks_t       resaligned;
0203     int         nimaps;
0204     unsigned int        dblocks, rblocks;
0205     bool            force = false;
0206     int         error;
0207     int         bmapi_flags = XFS_BMAPI_PREALLOC;
0208     int         nr_exts = XFS_IEXT_ADD_NOSPLIT_CNT;
0209 
0210     ASSERT(count_fsb > 0);
0211 
0212     resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
0213                        xfs_get_extsz_hint(ip));
0214     if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
0215         dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
0216         rblocks = resaligned;
0217     } else {
0218         dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
0219         rblocks = 0;
0220     }
0221 
0222     error = xfs_qm_dqattach(ip);
0223     if (error)
0224         return error;
0225 
0226     /*
0227      * For DAX, we do not allocate unwritten extents, but instead we zero
0228      * the block before we commit the transaction.  Ideally we'd like to do
0229      * this outside the transaction context, but if we commit and then crash
0230      * we may not have zeroed the blocks and this will be exposed on
0231      * recovery of the allocation. Hence we must zero before commit.
0232      *
0233      * Further, if we are mapping unwritten extents here, we need to zero
0234      * and convert them to written so that we don't need an unwritten extent
0235      * callback for DAX. This also means that we need to be able to dip into
0236      * the reserve block pool for bmbt block allocation if there is no space
0237      * left but we need to do unwritten extent conversion.
0238      */
0239     if (flags & IOMAP_DAX) {
0240         bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
0241         if (imap->br_state == XFS_EXT_UNWRITTEN) {
0242             force = true;
0243             nr_exts = XFS_IEXT_WRITE_UNWRITTEN_CNT;
0244             dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
0245         }
0246     }
0247 
0248     error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
0249             rblocks, force, &tp);
0250     if (error)
0251         return error;
0252 
0253     error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts);
0254     if (error == -EFBIG)
0255         error = xfs_iext_count_upgrade(tp, ip, nr_exts);
0256     if (error)
0257         goto out_trans_cancel;
0258 
0259     /*
0260      * From this point onwards we overwrite the imap pointer that the
0261      * caller gave to us.
0262      */
0263     nimaps = 1;
0264     error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0,
0265                 imap, &nimaps);
0266     if (error)
0267         goto out_trans_cancel;
0268 
0269     /*
0270      * Complete the transaction
0271      */
0272     error = xfs_trans_commit(tp);
0273     if (error)
0274         goto out_unlock;
0275 
0276     /*
0277      * If we got no mapping back, the allocation ran out of space.
0278      */
0279     if (nimaps == 0) {
0280         error = -ENOSPC;
0281         goto out_unlock;
0282     }
0283 
0284     if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
0285         error = xfs_alert_fsblock_zero(ip, imap);
0286 
0287 out_unlock:
0288     xfs_iunlock(ip, XFS_ILOCK_EXCL);
0289     return error;
0290 
0291 out_trans_cancel:
0292     xfs_trans_cancel(tp);
0293     goto out_unlock;
0294 }
0295 
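/*
 * Decide whether speculative preallocation should be throttled against this
 * dquot: the quota type must be active, the dquot must have a high watermark
 * set, and the new reservation must reach its low preallocation watermark.
 */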
0296 STATIC bool
0297 xfs_quota_need_throttle(
0298     struct xfs_inode    *ip,
0299     xfs_dqtype_t        type,
0300     xfs_fsblock_t       alloc_blocks)
0301 {
0302     struct xfs_dquot    *dq = xfs_inode_dquot(ip, type);
0303 
0304     if (!dq || !xfs_this_quota_on(ip->i_mount, type))
0305         return false;
0306 
0307     /* no hi watermark, no throttle */
0308     if (!dq->q_prealloc_hi_wmark)
0309         return false;
0310 
0311     /* under the lo watermark, no throttle */
0312     if (dq->q_blk.reserved + alloc_blocks < dq->q_prealloc_lo_wmark)
0313         return false;
0314 
0315     return true;
0316 }
0317 
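/*
 * Work out how hard to throttle preallocation against this dquot: squash it
 * completely at or above the high watermark, otherwise derive a throttle
 * shift from the 5%/3%/1% low-space thresholds and clamp the free space the
 * caller sees to what the quota still allows.
 */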
0318 STATIC void
0319 xfs_quota_calc_throttle(
0320     struct xfs_inode    *ip,
0321     xfs_dqtype_t        type,
0322     xfs_fsblock_t       *qblocks,
0323     int         *qshift,
0324     int64_t         *qfreesp)
0325 {
0326     struct xfs_dquot    *dq = xfs_inode_dquot(ip, type);
0327     int64_t         freesp;
0328     int         shift = 0;
0329 
0330     /* no dq, or over hi wmark, squash the prealloc completely */
0331     if (!dq || dq->q_blk.reserved >= dq->q_prealloc_hi_wmark) {
0332         *qblocks = 0;
0333         *qfreesp = 0;
0334         return;
0335     }
0336 
0337     freesp = dq->q_prealloc_hi_wmark - dq->q_blk.reserved;
0338     if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
0339         shift = 2;
0340         if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
0341             shift += 2;
0342         if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT])
0343             shift += 2;
0344     }
0345 
0346     if (freesp < *qfreesp)
0347         *qfreesp = freesp;
0348 
0349     /* only overwrite the throttle values if we are more aggressive */
0350     if ((freesp >> shift) < (*qblocks >> *qshift)) {
0351         *qblocks = freesp;
0352         *qshift = shift;
0353     }
0354 }
0355 
0356 /*
0357  * If we don't have a user specified preallocation size, dynamically increase
0358  * the preallocation size as the size of the file grows.  Cap the maximum size
0359  * at a single extent or less if the filesystem is near full. The closer the
0360  * filesystem is to being full, the smaller the maximum preallocation.
0361  */
0362 STATIC xfs_fsblock_t
0363 xfs_iomap_prealloc_size(
0364     struct xfs_inode    *ip,
0365     int         whichfork,
0366     loff_t          offset,
0367     loff_t          count,
0368     struct xfs_iext_cursor  *icur)
0369 {
0370     struct xfs_iext_cursor  ncur = *icur;
0371     struct xfs_bmbt_irec    prev, got;
0372     struct xfs_mount    *mp = ip->i_mount;
0373     struct xfs_ifork    *ifp = xfs_ifork_ptr(ip, whichfork);
0374     xfs_fileoff_t       offset_fsb = XFS_B_TO_FSBT(mp, offset);
0375     int64_t         freesp;
0376     xfs_fsblock_t       qblocks;
0377     xfs_fsblock_t       alloc_blocks = 0;
0378     xfs_extlen_t        plen;
0379     int         shift = 0;
0380     int         qshift = 0;
0381 
0382     /*
0383      * As an exception we don't do any preallocation at all if the file is
0384      * smaller than the minimum preallocation and we are using the default
0385      * dynamic preallocation scheme, as it is likely this is the only write
0386      * to the file that is going to be done.
0387      */
0388     if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks))
0389         return 0;
0390 
0391     /*
0392      * Use the minimum preallocation size for small files or if we are
0393      * writing right after a hole.
0394      */
0395     if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
0396         !xfs_iext_prev_extent(ifp, &ncur, &prev) ||
0397         prev.br_startoff + prev.br_blockcount < offset_fsb)
0398         return mp->m_allocsize_blocks;
0399 
0400     /*
0401      * Take the size of the preceding data extents as the basis for the
0402      * preallocation size. Note that we don't care if the previous extents
0403      * are written or not.
0404      */
0405     plen = prev.br_blockcount;
0406     while (xfs_iext_prev_extent(ifp, &ncur, &got)) {
0407         if (plen > XFS_MAX_BMBT_EXTLEN / 2 ||
0408             isnullstartblock(got.br_startblock) ||
0409             got.br_startoff + got.br_blockcount != prev.br_startoff ||
0410             got.br_startblock + got.br_blockcount != prev.br_startblock)
0411             break;
0412         plen += got.br_blockcount;
0413         prev = got;
0414     }
0415 
0416     /*
0417      * If the size of the extents is greater than half the maximum extent
0418      * length, then use the current offset as the basis.  This ensures that
0419      * for large files the preallocation size always extends to
0420      * XFS_MAX_BMBT_EXTLEN rather than falling short due to things like stripe
0421      * unit/width alignment of real extents.
0422      */
0423     alloc_blocks = plen * 2;
0424     if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
0425         alloc_blocks = XFS_B_TO_FSB(mp, offset);
0426     qblocks = alloc_blocks;
0427 
0428     /*
0429      * XFS_MAX_BMBT_EXTLEN is not a power of two value but we round the prealloc
0430      * down to the nearest power of two value after throttling. To prevent
0431      * the round down from unconditionally reducing the maximum supported
0432      * prealloc size, we round up first, apply appropriate throttling, round
0433      * down and cap the value to XFS_MAX_BMBT_EXTLEN.
0434      */
0435     alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN),
0436                        alloc_blocks);
0437 
0438     freesp = percpu_counter_read_positive(&mp->m_fdblocks);
0439     if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
0440         shift = 2;
0441         if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
0442             shift++;
0443         if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
0444             shift++;
0445         if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
0446             shift++;
0447         if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
0448             shift++;
0449     }
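    /*
     * For example, with less than 5% of the filesystem free the shift is 2,
     * so a 64MB preallocation request is cut to 16MB; below 1% free the
     * shift reaches 6 and the same request drops to 1MB, before the
     * power-of-two rounding and m_allocsize_blocks floor applied below.
     */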
0450 
0451     /*
0452      * Check each quota to cap the prealloc size, provide a shift value to
0453      * throttle with and adjust amount of available space.
0454      */
0455     if (xfs_quota_need_throttle(ip, XFS_DQTYPE_USER, alloc_blocks))
0456         xfs_quota_calc_throttle(ip, XFS_DQTYPE_USER, &qblocks, &qshift,
0457                     &freesp);
0458     if (xfs_quota_need_throttle(ip, XFS_DQTYPE_GROUP, alloc_blocks))
0459         xfs_quota_calc_throttle(ip, XFS_DQTYPE_GROUP, &qblocks, &qshift,
0460                     &freesp);
0461     if (xfs_quota_need_throttle(ip, XFS_DQTYPE_PROJ, alloc_blocks))
0462         xfs_quota_calc_throttle(ip, XFS_DQTYPE_PROJ, &qblocks, &qshift,
0463                     &freesp);
0464 
0465     /*
0466      * The final prealloc size is set to the minimum of free space available
0467      * in each of the quotas and the overall filesystem.
0468      *
0469      * The shift throttle value is set to the maximum value as determined by
0470      * the global low free space values and per-quota low free space values.
0471      */
0472     alloc_blocks = min(alloc_blocks, qblocks);
0473     shift = max(shift, qshift);
0474 
0475     if (shift)
0476         alloc_blocks >>= shift;
0477     /*
0478      * rounddown_pow_of_two() returns an undefined result if we pass in
0479      * alloc_blocks = 0.
0480      */
0481     if (alloc_blocks)
0482         alloc_blocks = rounddown_pow_of_two(alloc_blocks);
0483     if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
0484         alloc_blocks = XFS_MAX_BMBT_EXTLEN;
0485 
0486     /*
0487      * If we are still trying to allocate more space than is
0488      * available, squash the prealloc hard. This can happen if we
0489      * have a large file on a small filesystem and the above
0490      * lowspace thresholds are smaller than XFS_MAX_BMBT_EXTLEN.
0491      */
0492     while (alloc_blocks && alloc_blocks >= freesp)
0493         alloc_blocks >>= 4;
0494     if (alloc_blocks < mp->m_allocsize_blocks)
0495         alloc_blocks = mp->m_allocsize_blocks;
0496     trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
0497                       mp->m_allocsize_blocks);
0498     return alloc_blocks;
0499 }
0500 
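/*
 * Convert the unwritten extents in the given byte range to written, looping
 * one transaction per mapping returned by xfs_bmapi_write(), and log the new
 * on-disk inode size as the conversion proceeds.
 */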
0501 int
0502 xfs_iomap_write_unwritten(
0503     xfs_inode_t *ip,
0504     xfs_off_t   offset,
0505     xfs_off_t   count,
0506     bool        update_isize)
0507 {
0508     xfs_mount_t *mp = ip->i_mount;
0509     xfs_fileoff_t   offset_fsb;
0510     xfs_filblks_t   count_fsb;
0511     xfs_filblks_t   numblks_fsb;
0512     int     nimaps;
0513     xfs_trans_t *tp;
0514     xfs_bmbt_irec_t imap;
0515     struct inode    *inode = VFS_I(ip);
0516     xfs_fsize_t i_size;
0517     uint        resblks;
0518     int     error;
0519 
0520     trace_xfs_unwritten_convert(ip, offset, count);
0521 
0522     offset_fsb = XFS_B_TO_FSBT(mp, offset);
0523     count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
0524     count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
0525 
0526     /*
0527      * Reserve enough blocks in this transaction for two complete extent
0528      * btree splits.  We may be converting the middle part of an unwritten
0529      * extent and in this case we will insert two new extents in the btree
0530      * each of which could cause a full split.
0531      *
0532      * This reservation amount will be used in the first call to
0533      * xfs_bmbt_split() to select an AG with enough space to satisfy the
0534      * rest of the operation.
0535      */
0536     resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
0537 
0538     /* Attach dquots so that bmbt splits are accounted correctly. */
0539     error = xfs_qm_dqattach(ip);
0540     if (error)
0541         return error;
0542 
0543     do {
0544         /*
0545          * Set up a transaction to convert the range of extents
0546          * from unwritten to real. Do allocations in a loop until
0547          * we have covered the range passed in.
0548          *
0549          * Note that we can't risk recursing back into the filesystem
0550          * here as we might be asked to write out the same inode that we
0551          * complete here and might deadlock on the iolock.
0552          */
0553         error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks,
0554                 0, true, &tp);
0555         if (error)
0556             return error;
0557 
0558         error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
0559                 XFS_IEXT_WRITE_UNWRITTEN_CNT);
0560         if (error == -EFBIG)
0561             error = xfs_iext_count_upgrade(tp, ip,
0562                     XFS_IEXT_WRITE_UNWRITTEN_CNT);
0563         if (error)
0564             goto error_on_bmapi_transaction;
0565 
0566         /*
0567          * Modify the unwritten extent state of the buffer.
0568          */
0569         nimaps = 1;
0570         error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
0571                     XFS_BMAPI_CONVERT, resblks, &imap,
0572                     &nimaps);
0573         if (error)
0574             goto error_on_bmapi_transaction;
0575 
0576         /*
0577          * Log the updated inode size as we go.  We have to be careful
0578          * to only log it up to the actual write offset if it is
0579          * halfway into a block.
0580          */
0581         i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
0582         if (i_size > offset + count)
0583             i_size = offset + count;
0584         if (update_isize && i_size > i_size_read(inode))
0585             i_size_write(inode, i_size);
0586         i_size = xfs_new_eof(ip, i_size);
0587         if (i_size) {
0588             ip->i_disk_size = i_size;
0589             xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
0590         }
0591 
0592         error = xfs_trans_commit(tp);
0593         xfs_iunlock(ip, XFS_ILOCK_EXCL);
0594         if (error)
0595             return error;
0596 
0597         if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock)))
0598             return xfs_alert_fsblock_zero(ip, &imap);
0599 
0600         if ((numblks_fsb = imap.br_blockcount) == 0) {
0601             /*
0602              * The numblks_fsb value should always get
0603              * smaller, otherwise the loop is stuck.
0604              */
0605             ASSERT(imap.br_blockcount);
0606             break;
0607         }
0608         offset_fsb += numblks_fsb;
0609         count_fsb -= numblks_fsb;
0610     } while (count_fsb > 0);
0611 
0612     return 0;
0613 
0614 error_on_bmapi_transaction:
0615     xfs_trans_cancel(tp);
0616     xfs_iunlock(ip, XFS_ILOCK_EXCL);
0617     return error;
0618 }
0619 
0620 static inline bool
0621 imap_needs_alloc(
0622     struct inode        *inode,
0623     unsigned        flags,
0624     struct xfs_bmbt_irec    *imap,
0625     int         nimaps)
0626 {
0627     /* don't allocate blocks when just zeroing */
0628     if (flags & IOMAP_ZERO)
0629         return false;
0630     if (!nimaps ||
0631         imap->br_startblock == HOLESTARTBLOCK ||
0632         imap->br_startblock == DELAYSTARTBLOCK)
0633         return true;
0634     /* we convert unwritten extents before copying the data for DAX */
0635     if ((flags & IOMAP_DAX) && imap->br_state == XFS_EXT_UNWRITTEN)
0636         return true;
0637     return false;
0638 }
0639 
0640 static inline bool
0641 imap_needs_cow(
0642     struct xfs_inode    *ip,
0643     unsigned int        flags,
0644     struct xfs_bmbt_irec    *imap,
0645     int         nimaps)
0646 {
0647     if (!xfs_is_cow_inode(ip))
0648         return false;
0649 
0650     /* when zeroing we don't have to COW holes or unwritten extents */
0651     if (flags & IOMAP_ZERO) {
0652         if (!nimaps ||
0653             imap->br_startblock == HOLESTARTBLOCK ||
0654             imap->br_state == XFS_EXT_UNWRITTEN)
0655             return false;
0656     }
0657 
0658     return true;
0659 }
0660 
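/*
 * Take the ILOCK for an iomap operation: shared where possible, exclusive if
 * COW may be needed for a write or the extent list still has to be read in,
 * and honour IOMAP_NOWAIT by returning -EAGAIN rather than blocking.
 */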
0661 static int
0662 xfs_ilock_for_iomap(
0663     struct xfs_inode    *ip,
0664     unsigned        flags,
0665     unsigned        *lockmode)
0666 {
0667     unsigned int        mode = *lockmode;
0668     bool            is_write = flags & (IOMAP_WRITE | IOMAP_ZERO);
0669 
0670     /*
0671      * COW writes may allocate delalloc space or convert unwritten COW
0672      * extents, so we need to make sure to take the lock exclusively here.
0673      */
0674     if (xfs_is_cow_inode(ip) && is_write)
0675         mode = XFS_ILOCK_EXCL;
0676 
0677     /*
0678      * Extents not yet cached require exclusive access, don't block.  This
0679      * is an opencoded xfs_ilock_data_map_shared() call but with
0680      * non-blocking behaviour.
0681      */
0682     if (xfs_need_iread_extents(&ip->i_df)) {
0683         if (flags & IOMAP_NOWAIT)
0684             return -EAGAIN;
0685         mode = XFS_ILOCK_EXCL;
0686     }
0687 
0688 relock:
0689     if (flags & IOMAP_NOWAIT) {
0690         if (!xfs_ilock_nowait(ip, mode))
0691             return -EAGAIN;
0692     } else {
0693         xfs_ilock(ip, mode);
0694     }
0695 
0696     /*
0697      * The reflink iflag could have changed since the earlier unlocked
0698      * check, so if we got ILOCK_SHARED for a write but we're now a
0699      * reflink inode we have to switch to ILOCK_EXCL and relock.
0700      */
0701     if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
0702         xfs_iunlock(ip, mode);
0703         mode = XFS_ILOCK_EXCL;
0704         goto relock;
0705     }
0706 
0707     *lockmode = mode;
0708     return 0;
0709 }
0710 
0711 /*
0712  * Check that the imap we are going to return to the caller spans the entire
0713  * range that the caller requested for the IO.
0714  */
0715 static bool
0716 imap_spans_range(
0717     struct xfs_bmbt_irec    *imap,
0718     xfs_fileoff_t       offset_fsb,
0719     xfs_fileoff_t       end_fsb)
0720 {
0721     if (imap->br_startoff > offset_fsb)
0722         return false;
0723     if (imap->br_startoff + imap->br_blockcount < end_fsb)
0724         return false;
0725     return true;
0726 }
0727 
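/*
 * iomap_begin method for direct I/O and DAX writes: hand back an existing
 * mapping when one covers the range, set up copy-on-write for shared blocks,
 * or allocate real blocks via xfs_iomap_write_direct().
 */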
0728 static int
0729 xfs_direct_write_iomap_begin(
0730     struct inode        *inode,
0731     loff_t          offset,
0732     loff_t          length,
0733     unsigned        flags,
0734     struct iomap        *iomap,
0735     struct iomap        *srcmap)
0736 {
0737     struct xfs_inode    *ip = XFS_I(inode);
0738     struct xfs_mount    *mp = ip->i_mount;
0739     struct xfs_bmbt_irec    imap, cmap;
0740     xfs_fileoff_t       offset_fsb = XFS_B_TO_FSBT(mp, offset);
0741     xfs_fileoff_t       end_fsb = xfs_iomap_end_fsb(mp, offset, length);
0742     int         nimaps = 1, error = 0;
0743     bool            shared = false;
0744     u16         iomap_flags = 0;
0745     unsigned int        lockmode = XFS_ILOCK_SHARED;
0746 
0747     ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
0748 
0749     if (xfs_is_shutdown(mp))
0750         return -EIO;
0751 
0752     /*
0753      * Writes that span EOF might trigger an IO size update on completion,
0754      * so consider them to be dirty for the purposes of O_DSYNC even if
0755      * no other metadata changes are pending or have been made here.
0756      */
0757     if (offset + length > i_size_read(inode))
0758         iomap_flags |= IOMAP_F_DIRTY;
0759 
0760     error = xfs_ilock_for_iomap(ip, flags, &lockmode);
0761     if (error)
0762         return error;
0763 
0764     error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
0765                    &nimaps, 0);
0766     if (error)
0767         goto out_unlock;
0768 
0769     if (imap_needs_cow(ip, flags, &imap, nimaps)) {
0770         error = -EAGAIN;
0771         if (flags & IOMAP_NOWAIT)
0772             goto out_unlock;
0773 
0774         /* may drop and re-acquire the ilock */
0775         error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
0776                 &lockmode,
0777                 (flags & IOMAP_DIRECT) || IS_DAX(inode));
0778         if (error)
0779             goto out_unlock;
0780         if (shared)
0781             goto out_found_cow;
0782         end_fsb = imap.br_startoff + imap.br_blockcount;
0783         length = XFS_FSB_TO_B(mp, end_fsb) - offset;
0784     }
0785 
0786     if (imap_needs_alloc(inode, flags, &imap, nimaps))
0787         goto allocate_blocks;
0788 
0789     /*
0790      * NOWAIT and OVERWRITE I/O needs to span the entire requested I/O with
0791      * a single map so that we avoid partial IO failures due to the rest of
0792      * the I/O range not covered by this map triggering an EAGAIN condition
0793      * when it is subsequently mapped and aborting the I/O.
0794      */
0795     if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY)) {
0796         error = -EAGAIN;
0797         if (!imap_spans_range(&imap, offset_fsb, end_fsb))
0798             goto out_unlock;
0799     }
0800 
0801     /*
0802      * For overwrite only I/O, we cannot convert unwritten extents without
0803      * requiring sub-block zeroing.  This can only be done under an
0804      * exclusive IOLOCK, hence return -EAGAIN if this is not a written
0805      * extent to tell the caller to try again.
0806      */
0807     if (flags & IOMAP_OVERWRITE_ONLY) {
0808         error = -EAGAIN;
0809         if (imap.br_state != XFS_EXT_NORM &&
0810                 ((offset | length) & mp->m_blockmask))
0811             goto out_unlock;
0812     }
0813 
0814     xfs_iunlock(ip, lockmode);
0815     trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
0816     return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags);
0817 
0818 allocate_blocks:
0819     error = -EAGAIN;
0820     if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY))
0821         goto out_unlock;
0822 
0823     /*
0824      * We cap the maximum length we map to a sane size to keep the chunks
0825      * of work done here somewhat symmetric with the work that writeback does.
0826      * This is a completely arbitrary number pulled out of thin air as a
0827      * best guess for initial testing.
0828      *
0829      * Note that the value needs to be less than 32 bits wide until the
0830      * lower level functions are updated.
0831      */
0832     length = min_t(loff_t, length, 1024 * PAGE_SIZE);
0833     end_fsb = xfs_iomap_end_fsb(mp, offset, length);
0834 
0835     if (offset + length > XFS_ISIZE(ip))
0836         end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb);
0837     else if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
0838         end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
0839     xfs_iunlock(ip, lockmode);
0840 
0841     error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
0842             flags, &imap);
0843     if (error)
0844         return error;
0845 
0846     trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
0847     return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
0848                  iomap_flags | IOMAP_F_NEW);
0849 
0850 out_found_cow:
0851     xfs_iunlock(ip, lockmode);
0852     length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
0853     trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
0854     if (imap.br_startblock != HOLESTARTBLOCK) {
0855         error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
0856         if (error)
0857             return error;
0858     }
0859     return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
0860 
0861 out_unlock:
0862     if (lockmode)
0863         xfs_iunlock(ip, lockmode);
0864     return error;
0865 }
0866 
0867 const struct iomap_ops xfs_direct_write_iomap_ops = {
0868     .iomap_begin        = xfs_direct_write_iomap_begin,
0869 };
0870 
0871 static int
0872 xfs_dax_write_iomap_end(
0873     struct inode        *inode,
0874     loff_t          pos,
0875     loff_t          length,
0876     ssize_t         written,
0877     unsigned        flags,
0878     struct iomap        *iomap)
0879 {
0880     struct xfs_inode    *ip = XFS_I(inode);
0881 
0882     if (!xfs_is_cow_inode(ip))
0883         return 0;
0884 
0885     if (!written) {
0886         xfs_reflink_cancel_cow_range(ip, pos, length, true);
0887         return 0;
0888     }
0889 
0890     return xfs_reflink_end_cow(ip, pos, written);
0891 }
0892 
0893 const struct iomap_ops xfs_dax_write_iomap_ops = {
0894     .iomap_begin    = xfs_direct_write_iomap_begin,
0895     .iomap_end  = xfs_dax_write_iomap_end,
0896 };
0897 
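/*
 * iomap_begin method for buffered writes: reuse an existing data or COW fork
 * mapping where possible, otherwise reserve delalloc blocks, adding
 * speculative preallocation beyond EOF sized by xfs_iomap_prealloc_size().
 */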
0898 static int
0899 xfs_buffered_write_iomap_begin(
0900     struct inode        *inode,
0901     loff_t          offset,
0902     loff_t          count,
0903     unsigned        flags,
0904     struct iomap        *iomap,
0905     struct iomap        *srcmap)
0906 {
0907     struct xfs_inode    *ip = XFS_I(inode);
0908     struct xfs_mount    *mp = ip->i_mount;
0909     xfs_fileoff_t       offset_fsb = XFS_B_TO_FSBT(mp, offset);
0910     xfs_fileoff_t       end_fsb = xfs_iomap_end_fsb(mp, offset, count);
0911     struct xfs_bmbt_irec    imap, cmap;
0912     struct xfs_iext_cursor  icur, ccur;
0913     xfs_fsblock_t       prealloc_blocks = 0;
0914     bool            eof = false, cow_eof = false, shared = false;
0915     int         allocfork = XFS_DATA_FORK;
0916     int         error = 0;
0917     unsigned int        lockmode = XFS_ILOCK_EXCL;
0918 
0919     if (xfs_is_shutdown(mp))
0920         return -EIO;
0921 
0922     /* we can't use delayed allocations when using extent size hints */
0923     if (xfs_get_extsz_hint(ip))
0924         return xfs_direct_write_iomap_begin(inode, offset, count,
0925                 flags, iomap, srcmap);
0926 
0927     ASSERT(!XFS_IS_REALTIME_INODE(ip));
0928 
0929     error = xfs_ilock_for_iomap(ip, flags, &lockmode);
0930     if (error)
0931         return error;
0932 
0933     if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
0934         XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
0935         error = -EFSCORRUPTED;
0936         goto out_unlock;
0937     }
0938 
0939     XFS_STATS_INC(mp, xs_blk_mapw);
0940 
0941     error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
0942     if (error)
0943         goto out_unlock;
0944 
0945     /*
0946      * Search the data fork first to look up our source mapping.  We
0947      * always need the data fork map, as we have to return it to the
0948      * iomap code so that the higher level write code can read data in to
0949      * perform read-modify-write cycles for unaligned writes.
0950      */
0951     eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
0952     if (eof)
0953         imap.br_startoff = end_fsb; /* fake hole until the end */
0954 
0955     /* We never need to allocate blocks for zeroing a hole. */
0956     if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
0957         xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
0958         goto out_unlock;
0959     }
0960 
0961     /*
0962      * Search the COW fork extent list even if we did not find a data fork
0963      * extent.  This serves two purposes: first this implements the
0964      * speculative preallocation using cowextsize, so that we also unshare
0965      * blocks adjacent to shared blocks instead of just the shared blocks
0966      * themselves.  Second, the lookup in the extent list is generally faster
0967      * than going out to the shared extent tree.
0968      */
0969     if (xfs_is_cow_inode(ip)) {
0970         if (!ip->i_cowfp) {
0971             ASSERT(!xfs_is_reflink_inode(ip));
0972             xfs_ifork_init_cow(ip);
0973         }
0974         cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
0975                 &ccur, &cmap);
0976         if (!cow_eof && cmap.br_startoff <= offset_fsb) {
0977             trace_xfs_reflink_cow_found(ip, &cmap);
0978             goto found_cow;
0979         }
0980     }
0981 
0982     if (imap.br_startoff <= offset_fsb) {
0983         /*
0984          * For reflink files we may need a delalloc reservation when
0985          * overwriting shared extents.   This includes zeroing of
0986          * existing extents that contain data.
0987          */
0988         if (!xfs_is_cow_inode(ip) ||
0989             ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
0990             trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
0991                     &imap);
0992             goto found_imap;
0993         }
0994 
0995         xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
0996 
0997         /* Trim the mapping to the nearest shared extent boundary. */
0998         error = xfs_bmap_trim_cow(ip, &imap, &shared);
0999         if (error)
1000             goto out_unlock;
1001 
1002         /* Not shared?  Just report the (potentially capped) extent. */
1003         if (!shared) {
1004             trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
1005                     &imap);
1006             goto found_imap;
1007         }
1008 
1009         /*
1010          * Fork all the shared blocks from our write offset until the
1011          * end of the extent.
1012          */
1013         allocfork = XFS_COW_FORK;
1014         end_fsb = imap.br_startoff + imap.br_blockcount;
1015     } else {
1016         /*
1017          * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
1018          * pages to keep the chunks of work done here somewhat
1019          * symmetric with the work that writeback does.  This is a completely
1020          * arbitrary number pulled out of thin air.
1021          *
1022          * Note that the value needs to be less than 32 bits wide until
1023          * the lower level functions are updated.
1024          */
1025         count = min_t(loff_t, count, 1024 * PAGE_SIZE);
1026         end_fsb = xfs_iomap_end_fsb(mp, offset, count);
1027 
1028         if (xfs_is_always_cow_inode(ip))
1029             allocfork = XFS_COW_FORK;
1030     }
1031 
1032     error = xfs_qm_dqattach_locked(ip, false);
1033     if (error)
1034         goto out_unlock;
1035 
1036     if (eof && offset + count > XFS_ISIZE(ip)) {
1037         /*
1038          * Determine the initial size of the preallocation.
1039          * We clean up any extra preallocation when the file is closed.
1040          */
1041         if (xfs_has_allocsize(mp))
1042             prealloc_blocks = mp->m_allocsize_blocks;
1043         else
1044             prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork,
1045                         offset, count, &icur);
1046         if (prealloc_blocks) {
1047             xfs_extlen_t    align;
1048             xfs_off_t   end_offset;
1049             xfs_fileoff_t   p_end_fsb;
1050 
1051             end_offset = XFS_ALLOC_ALIGN(mp, offset + count - 1);
1052             p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
1053                     prealloc_blocks;
1054 
1055             align = xfs_eof_alignment(ip);
1056             if (align)
1057                 p_end_fsb = roundup_64(p_end_fsb, align);
1058 
1059             p_end_fsb = min(p_end_fsb,
1060                 XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
1061             ASSERT(p_end_fsb > offset_fsb);
1062             prealloc_blocks = p_end_fsb - end_fsb;
1063         }
1064     }
1065 
1066 retry:
1067     error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
1068             end_fsb - offset_fsb, prealloc_blocks,
1069             allocfork == XFS_DATA_FORK ? &imap : &cmap,
1070             allocfork == XFS_DATA_FORK ? &icur : &ccur,
1071             allocfork == XFS_DATA_FORK ? eof : cow_eof);
1072     switch (error) {
1073     case 0:
1074         break;
1075     case -ENOSPC:
1076     case -EDQUOT:
1077         /* retry without any preallocation */
1078         trace_xfs_delalloc_enospc(ip, offset, count);
1079         if (prealloc_blocks) {
1080             prealloc_blocks = 0;
1081             goto retry;
1082         }
1083         fallthrough;
1084     default:
1085         goto out_unlock;
1086     }
1087 
1088     if (allocfork == XFS_COW_FORK) {
1089         trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap);
1090         goto found_cow;
1091     }
1092 
1093     /*
1094      * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
1095      * them out if the write happens to fail.
1096      */
1097     xfs_iunlock(ip, XFS_ILOCK_EXCL);
1098     trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
1099     return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);
1100 
1101 found_imap:
1102     xfs_iunlock(ip, XFS_ILOCK_EXCL);
1103     return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
1104 
1105 found_cow:
1106     xfs_iunlock(ip, XFS_ILOCK_EXCL);
1107     if (imap.br_startoff <= offset_fsb) {
1108         error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
1109         if (error)
1110             return error;
1111         return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
1112                      IOMAP_F_SHARED);
1113     }
1114 
1115     xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
1116     return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);
1117 
1118 out_unlock:
1119     xfs_iunlock(ip, XFS_ILOCK_EXCL);
1120     return error;
1121 }
1122 
1123 static int
1124 xfs_buffered_write_iomap_end(
1125     struct inode        *inode,
1126     loff_t          offset,
1127     loff_t          length,
1128     ssize_t         written,
1129     unsigned        flags,
1130     struct iomap        *iomap)
1131 {
1132     struct xfs_inode    *ip = XFS_I(inode);
1133     struct xfs_mount    *mp = ip->i_mount;
1134     xfs_fileoff_t       start_fsb;
1135     xfs_fileoff_t       end_fsb;
1136     int         error = 0;
1137 
1138     if (iomap->type != IOMAP_DELALLOC)
1139         return 0;
1140 
1141     /*
1142      * Behave as if the write failed if drop writes is enabled. Set the NEW
1143      * flag to force delalloc cleanup.
1144      */
1145     if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) {
1146         iomap->flags |= IOMAP_F_NEW;
1147         written = 0;
1148     }
1149 
1150     /*
1151      * start_fsb refers to the first unused block after a short write. If
1152      * nothing was written, round offset down to point at the first block in
1153      * the range.
1154      */
1155     if (unlikely(!written))
1156         start_fsb = XFS_B_TO_FSBT(mp, offset);
1157     else
1158         start_fsb = XFS_B_TO_FSB(mp, offset + written);
1159     end_fsb = XFS_B_TO_FSB(mp, offset + length);
1160 
1161     /*
1162      * Trim delalloc blocks if they were allocated by this write and we
1163      * didn't manage to write the whole range.
1164      *
1165      * We don't need to care about racing delalloc as we hold i_mutex
1166      * across the reserve/allocate/unreserve calls. If there are delalloc
1167      * blocks in the range, they are ours.
1168      */
1169     if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) {
1170         truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
1171                      XFS_FSB_TO_B(mp, end_fsb) - 1);
1172 
1173         error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1174                            end_fsb - start_fsb);
1175         if (error && !xfs_is_shutdown(mp)) {
1176             xfs_alert(mp, "%s: unable to clean up ino %lld",
1177                 __func__, ip->i_ino);
1178             return error;
1179         }
1180     }
1181 
1182     return 0;
1183 }
1184 
1185 const struct iomap_ops xfs_buffered_write_iomap_ops = {
1186     .iomap_begin        = xfs_buffered_write_iomap_begin,
1187     .iomap_end      = xfs_buffered_write_iomap_end,
1188 };
1189 
1190 static int
1191 xfs_read_iomap_begin(
1192     struct inode        *inode,
1193     loff_t          offset,
1194     loff_t          length,
1195     unsigned        flags,
1196     struct iomap        *iomap,
1197     struct iomap        *srcmap)
1198 {
1199     struct xfs_inode    *ip = XFS_I(inode);
1200     struct xfs_mount    *mp = ip->i_mount;
1201     struct xfs_bmbt_irec    imap;
1202     xfs_fileoff_t       offset_fsb = XFS_B_TO_FSBT(mp, offset);
1203     xfs_fileoff_t       end_fsb = xfs_iomap_end_fsb(mp, offset, length);
1204     int         nimaps = 1, error = 0;
1205     bool            shared = false;
1206     unsigned int        lockmode = XFS_ILOCK_SHARED;
1207 
1208     ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
1209 
1210     if (xfs_is_shutdown(mp))
1211         return -EIO;
1212 
1213     error = xfs_ilock_for_iomap(ip, flags, &lockmode);
1214     if (error)
1215         return error;
1216     error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
1217                    &nimaps, 0);
1218     if (!error && (flags & IOMAP_REPORT))
1219         error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
1220     xfs_iunlock(ip, lockmode);
1221 
1222     if (error)
1223         return error;
1224     trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
1225     return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
1226                  shared ? IOMAP_F_SHARED : 0);
1227 }
1228 
1229 const struct iomap_ops xfs_read_iomap_ops = {
1230     .iomap_begin        = xfs_read_iomap_begin,
1231 };
1232 
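/*
 * iomap_begin method used for SEEK_HOLE/SEEK_DATA: report the covering data
 * extent, a COW fork extent as unwritten (dirty page cache may be backed by
 * it), or a hole capped at the next data or COW extent.
 */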
1233 static int
1234 xfs_seek_iomap_begin(
1235     struct inode        *inode,
1236     loff_t          offset,
1237     loff_t          length,
1238     unsigned        flags,
1239     struct iomap        *iomap,
1240     struct iomap        *srcmap)
1241 {
1242     struct xfs_inode    *ip = XFS_I(inode);
1243     struct xfs_mount    *mp = ip->i_mount;
1244     xfs_fileoff_t       offset_fsb = XFS_B_TO_FSBT(mp, offset);
1245     xfs_fileoff_t       end_fsb = XFS_B_TO_FSB(mp, offset + length);
1246     xfs_fileoff_t       cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
1247     struct xfs_iext_cursor  icur;
1248     struct xfs_bmbt_irec    imap, cmap;
1249     int         error = 0;
1250     unsigned        lockmode;
1251 
1252     if (xfs_is_shutdown(mp))
1253         return -EIO;
1254 
1255     lockmode = xfs_ilock_data_map_shared(ip);
1256     error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
1257     if (error)
1258         goto out_unlock;
1259 
1260     if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
1261         /*
1262          * If we found a data extent we are done.
1263          */
1264         if (imap.br_startoff <= offset_fsb)
1265             goto done;
1266         data_fsb = imap.br_startoff;
1267     } else {
1268         /*
1269          * Fake a hole until the end of the file.
1270          */
1271         data_fsb = xfs_iomap_end_fsb(mp, offset, length);
1272     }
1273 
1274     /*
1275      * If a COW fork extent covers the hole, report it - capped to the next
1276      * data fork extent:
1277      */
1278     if (xfs_inode_has_cow_data(ip) &&
1279         xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
1280         cow_fsb = cmap.br_startoff;
1281     if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
1282         if (data_fsb < cow_fsb + cmap.br_blockcount)
1283             end_fsb = min(end_fsb, data_fsb);
1284         xfs_trim_extent(&cmap, offset_fsb, end_fsb);
1285         error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
1286                       IOMAP_F_SHARED);
1287         /*
1288          * This is a COW extent, so we must probe the page cache
1289          * because there could be dirty page cache being backed
1290          * by this extent.
1291          */
1292         iomap->type = IOMAP_UNWRITTEN;
1293         goto out_unlock;
1294     }
1295 
1296     /*
1297      * Else report a hole, capped to the next found data or COW extent.
1298      */
1299     if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
1300         imap.br_blockcount = cow_fsb - offset_fsb;
1301     else
1302         imap.br_blockcount = data_fsb - offset_fsb;
1303     imap.br_startoff = offset_fsb;
1304     imap.br_startblock = HOLESTARTBLOCK;
1305     imap.br_state = XFS_EXT_NORM;
1306 done:
1307     xfs_trim_extent(&imap, offset_fsb, end_fsb);
1308     error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
1309 out_unlock:
1310     xfs_iunlock(ip, lockmode);
1311     return error;
1312 }
1313 
1314 const struct iomap_ops xfs_seek_iomap_ops = {
1315     .iomap_begin        = xfs_seek_iomap_begin,
1316 };
1317 
1318 static int
1319 xfs_xattr_iomap_begin(
1320     struct inode        *inode,
1321     loff_t          offset,
1322     loff_t          length,
1323     unsigned        flags,
1324     struct iomap        *iomap,
1325     struct iomap        *srcmap)
1326 {
1327     struct xfs_inode    *ip = XFS_I(inode);
1328     struct xfs_mount    *mp = ip->i_mount;
1329     xfs_fileoff_t       offset_fsb = XFS_B_TO_FSBT(mp, offset);
1330     xfs_fileoff_t       end_fsb = XFS_B_TO_FSB(mp, offset + length);
1331     struct xfs_bmbt_irec    imap;
1332     int         nimaps = 1, error = 0;
1333     unsigned        lockmode;
1334 
1335     if (xfs_is_shutdown(mp))
1336         return -EIO;
1337 
1338     lockmode = xfs_ilock_attr_map_shared(ip);
1339 
1340     /* if there is no attribute fork or no extents, return ENOENT */
1341     if (!xfs_inode_has_attr_fork(ip) || !ip->i_af.if_nextents) {
1342         error = -ENOENT;
1343         goto out_unlock;
1344     }
1345 
1346     ASSERT(ip->i_af.if_format != XFS_DINODE_FMT_LOCAL);
1347     error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
1348                    &nimaps, XFS_BMAPI_ATTRFORK);
1349 out_unlock:
1350     xfs_iunlock(ip, lockmode);
1351 
1352     if (error)
1353         return error;
1354     ASSERT(nimaps);
1355     return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
1356 }
1357 
1358 const struct iomap_ops xfs_xattr_iomap_ops = {
1359     .iomap_begin        = xfs_xattr_iomap_begin,
1360 };
1361 
1362 int
1363 xfs_zero_range(
1364     struct xfs_inode    *ip,
1365     loff_t          pos,
1366     loff_t          len,
1367     bool            *did_zero)
1368 {
1369     struct inode        *inode = VFS_I(ip);
1370 
1371     if (IS_DAX(inode))
1372         return dax_zero_range(inode, pos, len, did_zero,
1373                       &xfs_direct_write_iomap_ops);
1374     return iomap_zero_range(inode, pos, len, did_zero,
1375                 &xfs_buffered_write_iomap_ops);
1376 }
1377 
1378 int
1379 xfs_truncate_page(
1380     struct xfs_inode    *ip,
1381     loff_t          pos,
1382     bool            *did_zero)
1383 {
1384     struct inode        *inode = VFS_I(ip);
1385 
1386     if (IS_DAX(inode))
1387         return dax_truncate_page(inode, pos, did_zero,
1388                     &xfs_direct_write_iomap_ops);
1389     return iomap_truncate_page(inode, pos, did_zero,
1390                    &xfs_buffered_write_iomap_ops);
1391 }