// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * Copyright (c) 2016-2018 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"

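/*
 * Per-writeback context: wraps the generic iomap writeback context and
 * caches the data and COW fork sequence numbers that were sampled when the
 * current mapping was established, so that xfs_imap_valid() can cheaply
 * detect a stale cached mapping.
 */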
struct xfs_writepage_ctx {
    struct iomap_writepage_ctx ctx;
    unsigned int        data_seq;
    unsigned int        cow_seq;
};

static inline struct xfs_writepage_ctx *
XFS_WPC(struct iomap_writepage_ctx *ctx)
{
    return container_of(ctx, struct xfs_writepage_ctx, ctx);
}
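
/*
 * Minimal usage sketch (illustrative, not part of this file): the generic
 * iomap writeback code only ever sees the embedded &wpc->ctx, and XFS_WPC()
 * uses container_of() to recover the containing XFS context from it:
 *
 *	struct xfs_writepage_ctx wpc = { };
 *	struct iomap_writepage_ctx *generic = &wpc.ctx;
 *
 *	ASSERT(XFS_WPC(generic) == &wpc);
 */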

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
{
    return ioend->io_offset + ioend->io_size >
        XFS_I(ioend->io_inode)->i_disk_size;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
int
xfs_setfilesize(
    struct xfs_inode    *ip,
    xfs_off_t       offset,
    size_t          size)
{
    struct xfs_mount    *mp = ip->i_mount;
    struct xfs_trans    *tp;
    xfs_fsize_t     isize;
    int         error;

    error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
    if (error)
        return error;

    xfs_ilock(ip, XFS_ILOCK_EXCL);
    isize = xfs_new_eof(ip, offset + size);
    if (!isize) {
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        xfs_trans_cancel(tp);
        return 0;
    }

    trace_xfs_setfilesize(ip, offset, size);

    ip->i_disk_size = isize;
    xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
    xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

    return xfs_trans_commit(tp);
}
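
/*
 * Note the transaction pattern above: the transaction is allocated before
 * taking the ILOCK, and xfs_trans_ijoin() with XFS_ILOCK_EXCL hands lock
 * ownership to the transaction so that xfs_trans_commit() releases it.  The
 * early-return path never joins the inode and therefore has to unlock and
 * cancel by hand.
 */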

/*
 * IO write completion.
 */
STATIC void
xfs_end_ioend(
    struct iomap_ioend  *ioend)
{
    struct xfs_inode    *ip = XFS_I(ioend->io_inode);
    struct xfs_mount    *mp = ip->i_mount;
    xfs_off_t       offset = ioend->io_offset;
    size_t          size = ioend->io_size;
    unsigned int        nofs_flag;
    int         error;

    /*
     * We can allocate memory here while doing writeback on behalf of
     * memory reclaim.  To avoid memory allocation deadlocks set the
     * task-wide nofs context for the following operations.
     */
    nofs_flag = memalloc_nofs_save();

    /*
     * Just clean up the in-memory structures if the fs has been shut down.
     */
    if (xfs_is_shutdown(mp)) {
        error = -EIO;
        goto done;
    }

    /*
     * Clean up all COW blocks and underlying data fork delalloc blocks on
     * I/O error. The delalloc punch is required because this ioend was
     * mapped to blocks in the COW fork and the associated pages are no
     * longer dirty. If we don't remove delalloc blocks here, they become
     * stale and can corrupt free space accounting on unmount.
     */
    error = blk_status_to_errno(ioend->io_bio->bi_status);
    if (unlikely(error)) {
        if (ioend->io_flags & IOMAP_F_SHARED) {
            xfs_reflink_cancel_cow_range(ip, offset, size, true);
            xfs_bmap_punch_delalloc_range(ip,
                              XFS_B_TO_FSBT(mp, offset),
                              XFS_B_TO_FSB(mp, size));
        }
        goto done;
    }

    /*
     * Success: commit the COW or unwritten blocks if needed.
     */
    if (ioend->io_flags & IOMAP_F_SHARED)
        error = xfs_reflink_end_cow(ip, offset, size);
    else if (ioend->io_type == IOMAP_UNWRITTEN)
        error = xfs_iomap_write_unwritten(ip, offset, size, false);

    if (!error && xfs_ioend_is_append(ioend))
        error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
done:
    iomap_finish_ioends(ioend, error);
    memalloc_nofs_restore(nofs_flag);
}

/*
 * Finish all pending IO completions that require transactional modifications.
 *
 * We try to merge physically and logically contiguous ioends before completion
 * to minimise the number of transactions we need to perform during IO
 * completion.  Both unwritten extent conversion and COW remapping need to
 * iterate and modify one physical extent at a time, so we gain nothing by
 * merging physically discontiguous extents here.
 *
 * The ioend chain we process here is largely unbounded in length, and we may
 * have to perform significant amounts of work on each ioend to complete it.
 * Hence we have to be careful about holding the CPU for too long in this loop.
 */
void
xfs_end_io(
    struct work_struct  *work)
{
    struct xfs_inode    *ip =
        container_of(work, struct xfs_inode, i_ioend_work);
    struct iomap_ioend  *ioend;
    struct list_head    tmp;
    unsigned long       flags;

    spin_lock_irqsave(&ip->i_ioend_lock, flags);
    list_replace_init(&ip->i_ioend_list, &tmp);
    spin_unlock_irqrestore(&ip->i_ioend_lock, flags);

    iomap_sort_ioends(&tmp);
    while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
            io_list))) {
        list_del_init(&ioend->io_list);
        iomap_ioend_try_merge(ioend, &tmp);
        xfs_end_ioend(ioend);
        cond_resched();
    }
}
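
/*
 * Worked example (hypothetical sizes): two 512k unwritten ioends that are
 * contiguous both in file offset and in disk blocks sort next to each other
 * above, get combined by iomap_ioend_try_merge() into one 1MB ioend, and
 * then complete with a single unwritten conversion instead of two.
 */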
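/*
 * Bio completion handler.  This can run in interrupt context, so no
 * transactional work may happen here: just add the ioend to the inode's
 * completion list, and kick the per-inode work item (which runs xfs_end_io()
 * in process context) when the list transitions from empty to non-empty.
 */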
STATIC void
xfs_end_bio(
    struct bio      *bio)
{
    struct iomap_ioend  *ioend = bio->bi_private;
    struct xfs_inode    *ip = XFS_I(ioend->io_inode);
    unsigned long       flags;

    spin_lock_irqsave(&ip->i_ioend_lock, flags);
    if (list_empty(&ip->i_ioend_list))
        WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
                     &ip->i_ioend_work));
    list_add_tail(&ioend->io_list, &ip->i_ioend_list);
    spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
}

/*
 * Fast revalidation of the cached writeback mapping. Return true if the
 * current mapping is valid, false otherwise.
 */
static bool
xfs_imap_valid(
    struct iomap_writepage_ctx  *wpc,
    struct xfs_inode        *ip,
    loff_t              offset)
{
    if (offset < wpc->iomap.offset ||
        offset >= wpc->iomap.offset + wpc->iomap.length)
        return false;
    /*
     * If this is a COW mapping, it is sufficient to check that the mapping
     * covers the offset. Be careful to check this first because the caller
     * can revalidate a COW mapping without updating the data seqno.
     */
    if (wpc->iomap.flags & IOMAP_F_SHARED)
        return true;

    /*
     * This is not a COW mapping. Check the sequence number of the data fork
     * because concurrent changes could have invalidated the extent. Check
     * the COW fork because concurrent changes since the last time we
     * checked (and found nothing at this offset) could have added
     * overlapping blocks.
     */
    if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
        return false;
    if (xfs_inode_has_cow_data(ip) &&
        XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
        return false;
    return true;
}

/*
 * Pass in a delalloc extent and convert it to real extents, return the real
 * extent that maps offset_fsb in wpc->iomap.
 *
 * The current page is held locked so nothing could have removed the block
 * backing offset_fsb, although it could have moved from the COW to the data
 * fork by another thread.
 */
static int
xfs_convert_blocks(
    struct iomap_writepage_ctx *wpc,
    struct xfs_inode    *ip,
    int         whichfork,
    loff_t          offset)
{
    int         error;
    unsigned        *seq;

    if (whichfork == XFS_COW_FORK)
        seq = &XFS_WPC(wpc)->cow_seq;
    else
        seq = &XFS_WPC(wpc)->data_seq;

    /*
     * Attempt to allocate whatever delalloc extent currently backs offset
     * and put the result into wpc->iomap.  Allocate in a loop because it
     * may take several attempts to allocate real blocks for a contiguous
     * delalloc extent if free space is sufficiently fragmented.
     */
    do {
        error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
                &wpc->iomap, seq);
        if (error)
            return error;
    } while (wpc->iomap.offset + wpc->iomap.length <= offset);

    return 0;
}
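
/*
 * Worked example (hypothetical geometry): if offset sits in block 10 of a
 * 16-block delalloc extent and free space is fragmented into 4-block chunks,
 * the first xfs_bmapi_convert_delalloc() call may only produce real blocks
 * [0, 4), which do not yet cover offset.  The loop then converts [4, 8) and
 * [8, 12), at which point wpc->iomap finally covers offset and we return.
 */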

static int
xfs_map_blocks(
    struct iomap_writepage_ctx *wpc,
    struct inode        *inode,
    loff_t          offset)
{
    struct xfs_inode    *ip = XFS_I(inode);
    struct xfs_mount    *mp = ip->i_mount;
    ssize_t         count = i_blocksize(inode);
    xfs_fileoff_t       offset_fsb = XFS_B_TO_FSBT(mp, offset);
    xfs_fileoff_t       end_fsb = XFS_B_TO_FSB(mp, offset + count);
    xfs_fileoff_t       cow_fsb;
    int         whichfork;
    struct xfs_bmbt_irec    imap;
    struct xfs_iext_cursor  icur;
    int         retries = 0;
    int         error = 0;

    if (xfs_is_shutdown(mp))
        return -EIO;

    /*
     * COW fork blocks can overlap data fork blocks even if the blocks
     * aren't shared.  COW I/O always takes precedence, so we must always
     * check for overlap on reflink inodes unless the mapping is already a
     * COW one, or the COW fork hasn't changed from the last time we looked
     * at it.
     *
     * It's safe to check the COW fork if_seq here without the ILOCK because
     * we've indirectly protected against concurrent updates: writeback has
     * the page locked, which prevents concurrent invalidations by reflink
     * and direct I/O and prevents concurrent buffered writes to the same
     * page.  Changes to if_seq always happen under i_lock, which protects
     * against concurrent updates and provides a memory barrier on the way
     * out that ensures that we always see the current value.
     */
    if (xfs_imap_valid(wpc, ip, offset))
        return 0;

    /*
     * If we don't have a valid map, now it's time to get a new one for this
     * offset.  This will convert delayed allocations (including COW ones)
     * into real extents.  If we return without a valid map, it means we
     * landed in a hole and we skip the block.
     */
retry:
    cow_fsb = NULLFILEOFF;
    whichfork = XFS_DATA_FORK;
    xfs_ilock(ip, XFS_ILOCK_SHARED);
    ASSERT(!xfs_need_iread_extents(&ip->i_df));

    /*
     * Check if this offset is covered by a COW extent, and if so use
     * it directly instead of looking up anything in the data fork.
     */
    if (xfs_inode_has_cow_data(ip) &&
        xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
        cow_fsb = imap.br_startoff;
    if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
        XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);

        whichfork = XFS_COW_FORK;
        goto allocate_blocks;
    }

    /*
     * No COW extent overlap. Revalidate now that we may have updated
     * ->cow_seq. If the data mapping is still valid, we're done.
     */
    if (xfs_imap_valid(wpc, ip, offset)) {
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
        return 0;
    }

    /*
     * If we don't have a valid map, now it's time to get a new one for this
     * offset.  This will convert delayed allocations (including COW ones)
     * into real extents.
     */
    if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
        imap.br_startoff = end_fsb; /* fake a hole past EOF */
    XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
    xfs_iunlock(ip, XFS_ILOCK_SHARED);

    /* landed in a hole or beyond EOF? */
    if (imap.br_startoff > offset_fsb) {
        imap.br_blockcount = imap.br_startoff - offset_fsb;
        imap.br_startoff = offset_fsb;
        imap.br_startblock = HOLESTARTBLOCK;
        imap.br_state = XFS_EXT_NORM;
    }

    /*
     * Truncate to the next COW extent if there is one.  This is the only
     * opportunity to do this because we can skip COW fork lookups for the
     * subsequent blocks in the mapping; however, the requirement to treat
     * the COW range separately remains.
     */
    if (cow_fsb != NULLFILEOFF &&
        cow_fsb < imap.br_startoff + imap.br_blockcount)
        imap.br_blockcount = cow_fsb - imap.br_startoff;

    /* got a delalloc extent? */
    if (imap.br_startblock != HOLESTARTBLOCK &&
        isnullstartblock(imap.br_startblock))
        goto allocate_blocks;

    xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
    trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
    return 0;
allocate_blocks:
    error = xfs_convert_blocks(wpc, ip, whichfork, offset);
    if (error) {
        /*
         * If we failed to find the extent in the COW fork we might have
         * raced with a COW to data fork conversion or truncate.
         * Restart the lookup to catch the extent in the data fork for
         * the former case, but prevent additional retries to avoid
         * looping forever for the latter case.
         */
        if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
            goto retry;
        ASSERT(error != -EAGAIN);
        return error;
    }

    /*
     * Due to merging the returned real extent might be larger than the
     * original delalloc one.  Trim the returned extent to the next COW
     * boundary again to force a re-lookup.
     */
    if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
        loff_t      cow_offset = XFS_FSB_TO_B(mp, cow_fsb);

        if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
            wpc->iomap.length = cow_offset - wpc->iomap.offset;
    }

    ASSERT(wpc->iomap.offset <= offset);
    ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
    trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
    return 0;
}
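
/*
 * To summarise the lookup above: a still-valid cached mapping short-circuits
 * everything; otherwise a COW fork extent covering the offset takes
 * precedence, then the data fork is consulted (fabricating a hole mapping if
 * nothing is found), any overlap with a later COW extent is trimmed away, and
 * delalloc extents are converted to real blocks before wpc->iomap is handed
 * back to the iomap writeback code.
 */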

static int
xfs_prepare_ioend(
    struct iomap_ioend  *ioend,
    int         status)
{
    unsigned int        nofs_flag;

    /*
     * We can allocate memory here while doing writeback on behalf of
     * memory reclaim.  To avoid memory allocation deadlocks set the
     * task-wide nofs context for the following operations.
     */
    nofs_flag = memalloc_nofs_save();

    /* Convert CoW extents to regular */
    if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
        status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
                ioend->io_offset, ioend->io_size);
    }

    memalloc_nofs_restore(nofs_flag);

    /* send ioends that might require a transaction to the completion wq */
    if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
        (ioend->io_flags & IOMAP_F_SHARED))
        ioend->io_bio->bi_end_io = xfs_end_bio;
    return status;
}

/*
 * If the page has delalloc blocks on it, we need to punch them out before we
 * invalidate the page.  If we don't, we leave a stale delalloc mapping on the
 * inode that can trip up a later direct I/O read operation on the same region.
 *
 * We prevent this by truncating away the delalloc regions on the page.  Because
 * they are delalloc, we can do this without needing a transaction. Indeed - if
 * we get ENOSPC errors, we have to be able to do this truncation without a
 * transaction as there is no space left for block reservation (typically why we
 * see an ENOSPC in writeback).
 */
static void
xfs_discard_folio(
    struct folio        *folio,
    loff_t          pos)
{
    struct inode        *inode = folio->mapping->host;
    struct xfs_inode    *ip = XFS_I(inode);
    struct xfs_mount    *mp = ip->i_mount;
    size_t          offset = offset_in_folio(folio, pos);
    xfs_fileoff_t       start_fsb = XFS_B_TO_FSBT(mp, pos);
    xfs_fileoff_t       pageoff_fsb = XFS_B_TO_FSBT(mp, offset);
    int         error;

    if (xfs_is_shutdown(mp))
        return;

    xfs_alert_ratelimited(mp,
        "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
            folio, ip->i_ino, pos);

    error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
            i_blocks_per_folio(inode, folio) - pageoff_fsb);
    if (error && !xfs_is_shutdown(mp))
        xfs_alert(mp, "page discard unable to remove delalloc mapping.");
}
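
/*
 * Worked example (assuming 4k filesystem blocks and a 16k folio): a discard
 * starting 8192 bytes into the folio gives offset = 8192, pageoff_fsb = 2 and
 * i_blocks_per_folio() = 4, so the call above punches out the 2 delalloc
 * blocks from start_fsb to the end of the folio.
 */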

static const struct iomap_writeback_ops xfs_writeback_ops = {
    .map_blocks     = xfs_map_blocks,
    .prepare_ioend      = xfs_prepare_ioend,
    .discard_folio      = xfs_discard_folio,
};

STATIC int
xfs_vm_writepages(
    struct address_space    *mapping,
    struct writeback_control *wbc)
{
    struct xfs_writepage_ctx wpc = { };

    /*
     * Writing back data in a transaction context can result in recursive
     * transactions. This is bad, so issue a warning and get out of here.
     */
    if (WARN_ON_ONCE(current->journal_info))
        return 0;

    xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
    return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}

STATIC int
xfs_dax_writepages(
    struct address_space    *mapping,
    struct writeback_control *wbc)
{
    struct xfs_inode    *ip = XFS_I(mapping->host);

    xfs_iflags_clear(ip, XFS_ITRUNCATED);
    return dax_writeback_mapping_range(mapping,
            xfs_inode_buftarg(ip)->bt_daxdev, wbc);
}

STATIC sector_t
xfs_vm_bmap(
    struct address_space    *mapping,
    sector_t        block)
{
    struct xfs_inode    *ip = XFS_I(mapping->host);

    trace_xfs_vm_bmap(ip);

    /*
     * The swap code (ab-)uses ->bmap to get a block mapping and then
     * bypasses the file system for actual I/O.  We really can't allow
     * that on reflink inodes, so we have to skip out here.  And yes,
     * 0 is the magic code for a bmap error.
     *
     * Since we don't pass back blockdev info, we can't return bmap
     * information for rt files either.
     */
    if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
        return 0;
    return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
}

STATIC int
xfs_vm_read_folio(
    struct file     *unused,
    struct folio        *folio)
{
    return iomap_read_folio(folio, &xfs_read_iomap_ops);
}

STATIC void
xfs_vm_readahead(
    struct readahead_control    *rac)
{
    iomap_readahead(rac, &xfs_read_iomap_ops);
}

static int
xfs_iomap_swapfile_activate(
    struct swap_info_struct     *sis,
    struct file         *swap_file,
    sector_t            *span)
{
    sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
    return iomap_swapfile_activate(sis, swap_file, span,
            &xfs_read_iomap_ops);
}

const struct address_space_operations xfs_address_space_operations = {
    .read_folio     = xfs_vm_read_folio,
    .readahead      = xfs_vm_readahead,
    .writepages     = xfs_vm_writepages,
    .dirty_folio        = filemap_dirty_folio,
    .release_folio      = iomap_release_folio,
    .invalidate_folio   = iomap_invalidate_folio,
    .bmap           = xfs_vm_bmap,
    .direct_IO      = noop_direct_IO,
    .migrate_folio      = filemap_migrate_folio,
    .is_partially_uptodate  = iomap_is_partially_uptodate,
    .error_remove_page  = generic_error_remove_page,
    .swap_activate      = xfs_iomap_swapfile_activate,
};

const struct address_space_operations xfs_dax_aops = {
    .writepages     = xfs_dax_writepages,
    .direct_IO      = noop_direct_IO,
    .dirty_folio        = noop_dirty_folio,
    .swap_activate      = xfs_iomap_swapfile_activate,
};