fs/xfs/xfs_file.c

0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
0004  * All Rights Reserved.
0005  */
0006 #include "xfs.h"
0007 #include "xfs_fs.h"
0008 #include "xfs_shared.h"
0009 #include "xfs_format.h"
0010 #include "xfs_log_format.h"
0011 #include "xfs_trans_resv.h"
0012 #include "xfs_mount.h"
0013 #include "xfs_inode.h"
0014 #include "xfs_trans.h"
0015 #include "xfs_inode_item.h"
0016 #include "xfs_bmap.h"
0017 #include "xfs_bmap_util.h"
0018 #include "xfs_dir2.h"
0019 #include "xfs_dir2_priv.h"
0020 #include "xfs_ioctl.h"
0021 #include "xfs_trace.h"
0022 #include "xfs_log.h"
0023 #include "xfs_icache.h"
0024 #include "xfs_pnfs.h"
0025 #include "xfs_iomap.h"
0026 #include "xfs_reflink.h"
0027
0028 #include <linux/dax.h>
0029 #include <linux/falloc.h>
0030 #include <linux/backing-dev.h>
0031 #include <linux/mman.h>
0032 #include <linux/fadvise.h>
0033 #include <linux/mount.h>
0034
0035 static const struct vm_operations_struct xfs_file_vm_ops;
0036
0037 /*
0038  * Decide if the given file range is aligned to the size of the fundamental
0039  * allocation unit for the file.
0040  */
0041 static bool
0042 xfs_is_falloc_aligned(
0043     struct xfs_inode    *ip,
0044     loff_t          pos,
0045     long long int       len)
0046 {
0047     struct xfs_mount    *mp = ip->i_mount;
0048     uint64_t        mask;
0049
0050     if (XFS_IS_REALTIME_INODE(ip)) {
0051         if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
0052             u64 rextbytes;
0053             u32 mod;
0054
0055             rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
0056             div_u64_rem(pos, rextbytes, &mod);
0057             if (mod)
0058                 return false;
0059             div_u64_rem(len, rextbytes, &mod);
0060             return mod == 0;
0061         }
0062         mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
0063     } else {
0064         mask = mp->m_sb.sb_blocksize - 1;
0065     }
0066
0067     return !((pos | len) & mask);
0068 }
0069
0070 /*
0071  * Fsync operations on directories are much simpler than on regular files,
0072  * as there is no file data to flush, and thus also no need for explicit
0073  * cache flush operations, and there are no non-transaction metadata updates
0074  * on directories either.
0075  */
0076 STATIC int
0077 xfs_dir_fsync(
0078     struct file     *file,
0079     loff_t          start,
0080     loff_t          end,
0081     int         datasync)
0082 {
0083     struct xfs_inode    *ip = XFS_I(file->f_mapping->host);
0084
0085     trace_xfs_dir_fsync(ip);
0086     return xfs_log_force_inode(ip);
0087 }
0088
0089 static xfs_csn_t
0090 xfs_fsync_seq(
0091     struct xfs_inode    *ip,
0092     bool            datasync)
0093 {
0094     if (!xfs_ipincount(ip))
0095         return 0;
0096     if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
0097         return 0;
0098     return ip->i_itemp->ili_commit_seq;
0099 }
0100
0101 /*
0102  * All metadata updates are logged, which means that we just have to flush the
0103  * log up to the latest LSN that touched the inode.
0104  *
0105  * If we have concurrent fsync/fdatasync() calls, we need them to all block on
0106  * the log force before we clear the ili_fsync_fields field. This ensures that
0107  * we don't get a racing sync operation that does not wait for the metadata to
0108  * hit the journal before returning.  If we race with clearing ili_fsync_fields,
0109  * then all that will happen is the log force will do nothing as the lsn will
0110  * already be on disk.  We can't race with setting ili_fsync_fields because that
0111  * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
0112  * shared until after the ili_fsync_fields is cleared.
0113  */
0114 static  int
0115 xfs_fsync_flush_log(
0116     struct xfs_inode    *ip,
0117     bool            datasync,
0118     int         *log_flushed)
0119 {
0120     int         error = 0;
0121     xfs_csn_t       seq;
0122
0123     xfs_ilock(ip, XFS_ILOCK_SHARED);
0124     seq = xfs_fsync_seq(ip, datasync);
0125     if (seq) {
0126         error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
0127                       log_flushed);
0128
0129         spin_lock(&ip->i_itemp->ili_lock);
0130         ip->i_itemp->ili_fsync_fields = 0;
0131         spin_unlock(&ip->i_itemp->ili_lock);
0132     }
0133     xfs_iunlock(ip, XFS_ILOCK_SHARED);
0134     return error;
0135 }
0136
0137 STATIC int
0138 xfs_file_fsync(
0139     struct file     *file,
0140     loff_t          start,
0141     loff_t          end,
0142     int         datasync)
0143 {
0144     struct xfs_inode    *ip = XFS_I(file->f_mapping->host);
0145     struct xfs_mount    *mp = ip->i_mount;
0146     int         error, err2;
0147     int         log_flushed = 0;
0148
0149     trace_xfs_file_fsync(ip);
0150
0151     error = file_write_and_wait_range(file, start, end);
0152     if (error)
0153         return error;
0154
0155     if (xfs_is_shutdown(mp))
0156         return -EIO;
0157
0158     xfs_iflags_clear(ip, XFS_ITRUNCATED);
0159
0160     /*
0161      * If we have an RT and/or log subvolume we need to make sure to flush
0162      * the write cache the device used for file data first.  This is to
0163      * ensure newly written file data make it to disk before logging the new
0164      * inode size in case of an extending write.
0165      */
0166     if (XFS_IS_REALTIME_INODE(ip))
0167         error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
0168     else if (mp->m_logdev_targp != mp->m_ddev_targp)
0169         error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
0170
0171     /*
0172      * Any inode that has dirty modifications in the log is pinned.  The
0173      * racy check here for a pinned inode will not catch modifications
0174      * that happen concurrently to the fsync call, but fsync semantics
0175      * only require to sync previously completed I/O.
0176      */
0177     if (xfs_ipincount(ip)) {
0178         err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
0179         if (err2 && !error)
0180             error = err2;
0181     }
0182
0183     /*
0184      * If we only have a single device, and the log force about was
0185      * a no-op we might have to flush the data device cache here.
0186      * This can only happen for fdatasync/O_DSYNC if we were overwriting
0187      * an already allocated file and thus do not have any metadata to
0188      * commit.
0189      */
0190     if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
0191         mp->m_logdev_targp == mp->m_ddev_targp) {
0192         err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
0193         if (err2 && !error)
0194             error = err2;
0195     }
0196
0197     return error;
0198 }
0199
0200 static int
0201 xfs_ilock_iocb(
0202     struct kiocb        *iocb,
0203     unsigned int        lock_mode)
0204 {
0205     struct xfs_inode    *ip = XFS_I(file_inode(iocb->ki_filp));
0206
0207     if (iocb->ki_flags & IOCB_NOWAIT) {
0208         if (!xfs_ilock_nowait(ip, lock_mode))
0209             return -EAGAIN;
0210     } else {
0211         xfs_ilock(ip, lock_mode);
0212     }
0213
0214     return 0;
0215 }
0216
0217 STATIC ssize_t
0218 xfs_file_dio_read(
0219     struct kiocb        *iocb,
0220     struct iov_iter     *to)
0221 {
0222     struct xfs_inode    *ip = XFS_I(file_inode(iocb->ki_filp));
0223     ssize_t         ret;
0224
0225     trace_xfs_file_direct_read(iocb, to);
0226
0227     if (!iov_iter_count(to))
0228         return 0; /* skip atime */
0229
0230     file_accessed(iocb->ki_filp);
0231
0232     ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
0233     if (ret)
0234         return ret;
0235     ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
0236     xfs_iunlock(ip, XFS_IOLOCK_SHARED);
0237
0238     return ret;
0239 }
0240
0241 static noinline ssize_t
0242 xfs_file_dax_read(
0243     struct kiocb        *iocb,
0244     struct iov_iter     *to)
0245 {
0246     struct xfs_inode    *ip = XFS_I(iocb->ki_filp->f_mapping->host);
0247     ssize_t         ret = 0;
0248
0249     trace_xfs_file_dax_read(iocb, to);
0250
0251     if (!iov_iter_count(to))
0252         return 0; /* skip atime */
0253
0254     ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
0255     if (ret)
0256         return ret;
0257     ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
0258     xfs_iunlock(ip, XFS_IOLOCK_SHARED);
0259
0260     file_accessed(iocb->ki_filp);
0261     return ret;
0262 }
0263
0264 STATIC ssize_t
0265 xfs_file_buffered_read(
0266     struct kiocb        *iocb,
0267     struct iov_iter     *to)
0268 {
0269     struct xfs_inode    *ip = XFS_I(file_inode(iocb->ki_filp));
0270     ssize_t         ret;
0271
0272     trace_xfs_file_buffered_read(iocb, to);
0273
0274     ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
0275     if (ret)
0276         return ret;
0277     ret = generic_file_read_iter(iocb, to);
0278     xfs_iunlock(ip, XFS_IOLOCK_SHARED);
0279
0280     return ret;
0281 }
0282
0283 STATIC ssize_t
0284 xfs_file_read_iter(
0285     struct kiocb        *iocb,
0286     struct iov_iter     *to)
0287 {
0288     struct inode        *inode = file_inode(iocb->ki_filp);
0289     struct xfs_mount    *mp = XFS_I(inode)->i_mount;
0290     ssize_t         ret = 0;
0291
0292     XFS_STATS_INC(mp, xs_read_calls);
0293
0294     if (xfs_is_shutdown(mp))
0295         return -EIO;
0296
0297     if (IS_DAX(inode))
0298         ret = xfs_file_dax_read(iocb, to);
0299     else if (iocb->ki_flags & IOCB_DIRECT)
0300         ret = xfs_file_dio_read(iocb, to);
0301     else
0302         ret = xfs_file_buffered_read(iocb, to);
0303
0304     if (ret > 0)
0305         XFS_STATS_ADD(mp, xs_read_bytes, ret);
0306     return ret;
0307 }
0308
0309 /*
0310  * Common pre-write limit and setup checks.
0311  *
0312  * Called with the iolocked held either shared and exclusive according to
0313  * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
0314  * if called for a direct write beyond i_size.
0315  */
0316 STATIC ssize_t
0317 xfs_file_write_checks(
0318     struct kiocb        *iocb,
0319     struct iov_iter     *from,
0320     unsigned int        *iolock)
0321 {
0322     struct file     *file = iocb->ki_filp;
0323     struct inode        *inode = file->f_mapping->host;
0324     struct xfs_inode    *ip = XFS_I(inode);
0325     ssize_t         error = 0;
0326     size_t          count = iov_iter_count(from);
0327     bool            drained_dio = false;
0328     loff_t          isize;
0329
0330 restart:
0331     error = generic_write_checks(iocb, from);
0332     if (error <= 0)
0333         return error;
0334
0335     if (iocb->ki_flags & IOCB_NOWAIT) {
0336         error = break_layout(inode, false);
0337         if (error == -EWOULDBLOCK)
0338             error = -EAGAIN;
0339     } else {
0340         error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
0341     }
0342
0343     if (error)
0344         return error;
0345
0346     /*
0347      * For changing security info in file_remove_privs() we need i_rwsem
0348      * exclusively.
0349      */
0350     if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
0351         xfs_iunlock(ip, *iolock);
0352         *iolock = XFS_IOLOCK_EXCL;
0353         error = xfs_ilock_iocb(iocb, *iolock);
0354         if (error) {
0355             *iolock = 0;
0356             return error;
0357         }
0358         goto restart;
0359     }
0360
0361     /*
0362      * If the offset is beyond the size of the file, we need to zero any
0363      * blocks that fall between the existing EOF and the start of this
0364      * write.  If zeroing is needed and we are currently holding the iolock
0365      * shared, we need to update it to exclusive which implies having to
0366      * redo all checks before.
0367      *
0368      * We need to serialise against EOF updates that occur in IO completions
0369      * here. We want to make sure that nobody is changing the size while we
0370      * do this check until we have placed an IO barrier (i.e.  hold the
0371      * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.  The
0372      * spinlock effectively forms a memory barrier once we have the
0373      * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
0374      * hence be able to correctly determine if we need to run zeroing.
0375      *
0376      * We can do an unlocked check here safely as IO completion can only
0377      * extend EOF. Truncate is locked out at this point, so the EOF can
0378      * not move backwards, only forwards. Hence we only need to take the
0379      * slow path and spin locks when we are at or beyond the current EOF.
0380      */
0381     if (iocb->ki_pos <= i_size_read(inode))
0382         goto out;
0383
0384     spin_lock(&ip->i_flags_lock);
0385     isize = i_size_read(inode);
0386     if (iocb->ki_pos > isize) {
0387         spin_unlock(&ip->i_flags_lock);
0388
0389         if (iocb->ki_flags & IOCB_NOWAIT)
0390             return -EAGAIN;
0391
0392         if (!drained_dio) {
0393             if (*iolock == XFS_IOLOCK_SHARED) {
0394                 xfs_iunlock(ip, *iolock);
0395                 *iolock = XFS_IOLOCK_EXCL;
0396                 xfs_ilock(ip, *iolock);
0397                 iov_iter_reexpand(from, count);
0398             }
0399             /*
0400              * We now have an IO submission barrier in place, but
0401              * AIO can do EOF updates during IO completion and hence
0402              * we now need to wait for all of them to drain. Non-AIO
0403              * DIO will have drained before we are given the
0404              * XFS_IOLOCK_EXCL, and so for most cases this wait is a
0405              * no-op.
0406              */
0407             inode_dio_wait(inode);
0408             drained_dio = true;
0409             goto restart;
0410         }
0411
0412         trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
0413         error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
0414         if (error)
0415             return error;
0416     } else
0417         spin_unlock(&ip->i_flags_lock);
0418
0419 out:
0420     return kiocb_modified(iocb);
0421 }
0422
0423 static int
0424 xfs_dio_write_end_io(
0425     struct kiocb        *iocb,
0426     ssize_t         size,
0427     int         error,
0428     unsigned        flags)
0429 {
0430     struct inode        *inode = file_inode(iocb->ki_filp);
0431     struct xfs_inode    *ip = XFS_I(inode);
0432     loff_t          offset = iocb->ki_pos;
0433     unsigned int        nofs_flag;
0434
0435     trace_xfs_end_io_direct_write(ip, offset, size);
0436
0437     if (xfs_is_shutdown(ip->i_mount))
0438         return -EIO;
0439
0440     if (error)
0441         return error;
0442     if (!size)
0443         return 0;
0444
0445     /*
0446      * Capture amount written on completion as we can't reliably account
0447      * for it on submission.
0448      */
0449     XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
0450
0451     /*
0452      * We can allocate memory here while doing writeback on behalf of
0453      * memory reclaim.  To avoid memory allocation deadlocks set the
0454      * task-wide nofs context for the following operations.
0455      */
0456     nofs_flag = memalloc_nofs_save();
0457
0458     if (flags & IOMAP_DIO_COW) {
0459         error = xfs_reflink_end_cow(ip, offset, size);
0460         if (error)
0461             goto out;
0462     }
0463
0464     /*
0465      * Unwritten conversion updates the in-core isize after extent
0466      * conversion but before updating the on-disk size. Updating isize any
0467      * earlier allows a racing dio read to find unwritten extents before
0468      * they are converted.
0469      */
0470     if (flags & IOMAP_DIO_UNWRITTEN) {
0471         error = xfs_iomap_write_unwritten(ip, offset, size, true);
0472         goto out;
0473     }
0474
0475     /*
0476      * We need to update the in-core inode size here so that we don't end up
0477      * with the on-disk inode size being outside the in-core inode size. We
0478      * have no other method of updating EOF for AIO, so always do it here
0479      * if necessary.
0480      *
0481      * We need to lock the test/set EOF update as we can be racing with
0482      * other IO completions here to update the EOF. Failing to serialise
0483      * here can result in EOF moving backwards and Bad Things Happen when
0484      * that occurs.
0485      *
0486      * As IO completion only ever extends EOF, we can do an unlocked check
0487      * here to avoid taking the spinlock. If we land within the current EOF,
0488      * then we do not need to do an extending update at all, and we don't
0489      * need to take the lock to check this. If we race with an update moving
0490      * EOF, then we'll either still be beyond EOF and need to take the lock,
0491      * or we'll be within EOF and we don't need to take it at all.
0492      */
0493     if (offset + size <= i_size_read(inode))
0494         goto out;
0495
0496     spin_lock(&ip->i_flags_lock);
0497     if (offset + size > i_size_read(inode)) {
0498         i_size_write(inode, offset + size);
0499         spin_unlock(&ip->i_flags_lock);
0500         error = xfs_setfilesize(ip, offset, size);
0501     } else {
0502         spin_unlock(&ip->i_flags_lock);
0503     }
0504
0505 out:
0506     memalloc_nofs_restore(nofs_flag);
0507     return error;
0508 }
0509
0510 static const struct iomap_dio_ops xfs_dio_write_ops = {
0511     .end_io     = xfs_dio_write_end_io,
0512 };
0513
0514 /*
0515  * Handle block aligned direct I/O writes
0516  */
0517 static noinline ssize_t
0518 xfs_file_dio_write_aligned(
0519     struct xfs_inode    *ip,
0520     struct kiocb        *iocb,
0521     struct iov_iter     *from)
0522 {
0523     unsigned int        iolock = XFS_IOLOCK_SHARED;
0524     ssize_t         ret;
0525
0526     ret = xfs_ilock_iocb(iocb, iolock);
0527     if (ret)
0528         return ret;
0529     ret = xfs_file_write_checks(iocb, from, &iolock);
0530     if (ret)
0531         goto out_unlock;
0532
0533     /*
0534      * We don't need to hold the IOLOCK exclusively across the IO, so demote
0535      * the iolock back to shared if we had to take the exclusive lock in
0536      * xfs_file_write_checks() for other reasons.
0537      */
0538     if (iolock == XFS_IOLOCK_EXCL) {
0539         xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
0540         iolock = XFS_IOLOCK_SHARED;
0541     }
0542     trace_xfs_file_direct_write(iocb, from);
0543     ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
0544                &xfs_dio_write_ops, 0, NULL, 0);
0545 out_unlock:
0546     if (iolock)
0547         xfs_iunlock(ip, iolock);
0548     return ret;
0549 }
0550
0551 /*
0552  * Handle block unaligned direct I/O writes
0553  *
0554  * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
0555  * them to be done in parallel with reads and other direct I/O writes.  However,
0556  * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
0557  * to do sub-block zeroing and that requires serialisation against other direct
0558  * I/O to the same block.  In this case we need to serialise the submission of
0559  * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
0560  * In the case where sub-block zeroing is not required, we can do concurrent
0561  * sub-block dios to the same block successfully.
0562  *
0563  * Optimistically submit the I/O using the shared lock first, but use the
0564  * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
0565  * if block allocation or partial block zeroing would be required.  In that case
0566  * we try again with the exclusive lock.
0567  */
0568 static noinline ssize_t
0569 xfs_file_dio_write_unaligned(
0570     struct xfs_inode    *ip,
0571     struct kiocb        *iocb,
0572     struct iov_iter     *from)
0573 {
0574     size_t          isize = i_size_read(VFS_I(ip));
0575     size_t          count = iov_iter_count(from);
0576     unsigned int        iolock = XFS_IOLOCK_SHARED;
0577     unsigned int        flags = IOMAP_DIO_OVERWRITE_ONLY;
0578     ssize_t         ret;
0579
0580     /*
0581      * Extending writes need exclusivity because of the sub-block zeroing
0582      * that the DIO code always does for partial tail blocks beyond EOF, so
0583      * don't even bother trying the fast path in this case.
0584      */
0585     if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
0586         if (iocb->ki_flags & IOCB_NOWAIT)
0587             return -EAGAIN;
0588 retry_exclusive:
0589         iolock = XFS_IOLOCK_EXCL;
0590         flags = IOMAP_DIO_FORCE_WAIT;
0591     }
0592
0593     ret = xfs_ilock_iocb(iocb, iolock);
0594     if (ret)
0595         return ret;
0596
0597     /*
0598      * We can't properly handle unaligned direct I/O to reflink files yet,
0599      * as we can't unshare a partial block.
0600      */
0601     if (xfs_is_cow_inode(ip)) {
0602         trace_xfs_reflink_bounce_dio_write(iocb, from);
0603         ret = -ENOTBLK;
0604         goto out_unlock;
0605     }
0606
0607     ret = xfs_file_write_checks(iocb, from, &iolock);
0608     if (ret)
0609         goto out_unlock;
0610
0611     /*
0612      * If we are doing exclusive unaligned I/O, this must be the only I/O
0613      * in-flight.  Otherwise we risk data corruption due to unwritten extent
0614      * conversions from the AIO end_io handler.  Wait for all other I/O to
0615      * drain first.
0616      */
0617     if (flags & IOMAP_DIO_FORCE_WAIT)
0618         inode_dio_wait(VFS_I(ip));
0619
0620     trace_xfs_file_direct_write(iocb, from);
0621     ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
0622                &xfs_dio_write_ops, flags, NULL, 0);
0623
0624     /*
0625      * Retry unaligned I/O with exclusive blocking semantics if the DIO
0626      * layer rejected it for mapping or locking reasons. If we are doing
0627      * nonblocking user I/O, propagate the error.
0628      */
0629     if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
0630         ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
0631         xfs_iunlock(ip, iolock);
0632         goto retry_exclusive;
0633     }
0634
0635 out_unlock:
0636     if (iolock)
0637         xfs_iunlock(ip, iolock);
0638     return ret;
0639 }
0640
0641 static ssize_t
0642 xfs_file_dio_write(
0643     struct kiocb        *iocb,
0644     struct iov_iter     *from)
0645 {
0646     struct xfs_inode    *ip = XFS_I(file_inode(iocb->ki_filp));
0647     struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
0648     size_t          count = iov_iter_count(from);
0649
0650     /* direct I/O must be aligned to device logical sector size */
0651     if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
0652         return -EINVAL;
0653     if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
0654         return xfs_file_dio_write_unaligned(ip, iocb, from);
0655     return xfs_file_dio_write_aligned(ip, iocb, from);
0656 }
0657
0658 static noinline ssize_t
0659 xfs_file_dax_write(
0660     struct kiocb        *iocb,
0661     struct iov_iter     *from)
0662 {
0663     struct inode        *inode = iocb->ki_filp->f_mapping->host;
0664     struct xfs_inode    *ip = XFS_I(inode);
0665     unsigned int        iolock = XFS_IOLOCK_EXCL;
0666     ssize_t         ret, error = 0;
0667     loff_t          pos;
0668
0669     ret = xfs_ilock_iocb(iocb, iolock);
0670     if (ret)
0671         return ret;
0672     ret = xfs_file_write_checks(iocb, from, &iolock);
0673     if (ret)
0674         goto out;
0675
0676     pos = iocb->ki_pos;
0677
0678     trace_xfs_file_dax_write(iocb, from);
0679     ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
0680     if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
0681         i_size_write(inode, iocb->ki_pos);
0682         error = xfs_setfilesize(ip, pos, ret);
0683     }
0684 out:
0685     if (iolock)
0686         xfs_iunlock(ip, iolock);
0687     if (error)
0688         return error;
0689
0690     if (ret > 0) {
0691         XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
0692
0693         /* Handle various SYNC-type writes */
0694         ret = generic_write_sync(iocb, ret);
0695     }
0696     return ret;
0697 }
0698
0699 STATIC ssize_t
0700 xfs_file_buffered_write(
0701     struct kiocb        *iocb,
0702     struct iov_iter     *from)
0703 {
0704     struct inode        *inode = iocb->ki_filp->f_mapping->host;
0705     struct xfs_inode    *ip = XFS_I(inode);
0706     ssize_t         ret;
0707     bool            cleared_space = false;
0708     unsigned int        iolock;
0709
0710 write_retry:
0711     iolock = XFS_IOLOCK_EXCL;
0712     ret = xfs_ilock_iocb(iocb, iolock);
0713     if (ret)
0714         return ret;
0715
0716     ret = xfs_file_write_checks(iocb, from, &iolock);
0717     if (ret)
0718         goto out;
0719
0720     /* We can write back this queue in page reclaim */
0721     current->backing_dev_info = inode_to_bdi(inode);
0722
0723     trace_xfs_file_buffered_write(iocb, from);
0724     ret = iomap_file_buffered_write(iocb, from,
0725             &xfs_buffered_write_iomap_ops);
0726     if (likely(ret >= 0))
0727         iocb->ki_pos += ret;
0728
0729     /*
0730      * If we hit a space limit, try to free up some lingering preallocated
0731      * space before returning an error. In the case of ENOSPC, first try to
0732      * write back all dirty inodes to free up some of the excess reserved
0733      * metadata space. This reduces the chances that the eofblocks scan
0734      * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
0735      * also behaves as a filter to prevent too many eofblocks scans from
0736      * running at the same time.  Use a synchronous scan to increase the
0737      * effectiveness of the scan.
0738      */
0739     if (ret == -EDQUOT && !cleared_space) {
0740         xfs_iunlock(ip, iolock);
0741         xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
0742         cleared_space = true;
0743         goto write_retry;
0744     } else if (ret == -ENOSPC && !cleared_space) {
0745         struct xfs_icwalk   icw = {0};
0746
0747         cleared_space = true;
0748         xfs_flush_inodes(ip->i_mount);
0749
0750         xfs_iunlock(ip, iolock);
0751         icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
0752         xfs_blockgc_free_space(ip->i_mount, &icw);
0753         goto write_retry;
0754     }
0755
0756     current->backing_dev_info = NULL;
0757 out:
0758     if (iolock)
0759         xfs_iunlock(ip, iolock);
0760
0761     if (ret > 0) {
0762         XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
0763         /* Handle various SYNC-type writes */
0764         ret = generic_write_sync(iocb, ret);
0765     }
0766     return ret;
0767 }
0768
0769 STATIC ssize_t
0770 xfs_file_write_iter(
0771     struct kiocb        *iocb,
0772     struct iov_iter     *from)
0773 {
0774     struct inode        *inode = iocb->ki_filp->f_mapping->host;
0775     struct xfs_inode    *ip = XFS_I(inode);
0776     ssize_t         ret;
0777     size_t          ocount = iov_iter_count(from);
0778
0779     XFS_STATS_INC(ip->i_mount, xs_write_calls);
0780
0781     if (ocount == 0)
0782         return 0;
0783
0784     if (xfs_is_shutdown(ip->i_mount))
0785         return -EIO;
0786
0787     if (IS_DAX(inode))
0788         return xfs_file_dax_write(iocb, from);
0789
0790     if (iocb->ki_flags & IOCB_DIRECT) {
0791         /*
0792          * Allow a directio write to fall back to a buffered
0793          * write *only* in the case that we're doing a reflink
0794          * CoW.  In all other directio scenarios we do not
0795          * allow an operation to fall back to buffered mode.
0796          */
0797         ret = xfs_file_dio_write(iocb, from);
0798         if (ret != -ENOTBLK)
0799             return ret;
0800     }
0801
0802     return xfs_file_buffered_write(iocb, from);
0803 }
0804
0805 static void
0806 xfs_wait_dax_page(
0807     struct inode        *inode)
0808 {
0809     struct xfs_inode        *ip = XFS_I(inode);
0810
0811     xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
0812     schedule();
0813     xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
0814 }
0815
0816 int
0817 xfs_break_dax_layouts(
0818     struct inode        *inode,
0819     bool            *retry)
0820 {
0821     struct page     *page;
0822
0823     ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
0824
0825     page = dax_layout_busy_page(inode->i_mapping);
0826     if (!page)
0827         return 0;
0828
0829     *retry = true;
0830     return ___wait_var_event(&page->_refcount,
0831             atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
0832             0, 0, xfs_wait_dax_page(inode));
0833 }
0834
0835 int
0836 xfs_break_layouts(
0837     struct inode        *inode,
0838     uint            *iolock,
0839     enum layout_break_reason reason)
0840 {
0841     bool            retry;
0842     int         error;
0843
0844     ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
0845
0846     do {
0847         retry = false;
0848         switch (reason) {
0849         case BREAK_UNMAP:
0850             error = xfs_break_dax_layouts(inode, &retry);
0851             if (error || retry)
0852                 break;
0853             fallthrough;
0854         case BREAK_WRITE:
0855             error = xfs_break_leased_layouts(inode, iolock, &retry);
0856             break;
0857         default:
0858             WARN_ON_ONCE(1);
0859             error = -EINVAL;
0860         }
0861     } while (error == 0 && retry);
0862
0863     return error;
0864 }
0865
0866 /* Does this file, inode, or mount want synchronous writes? */
0867 static inline bool xfs_file_sync_writes(struct file *filp)
0868 {
0869     struct xfs_inode    *ip = XFS_I(file_inode(filp));
0870
0871     if (xfs_has_wsync(ip->i_mount))
0872         return true;
0873     if (filp->f_flags & (__O_SYNC | O_DSYNC))
0874         return true;
0875     if (IS_SYNC(file_inode(filp)))
0876         return true;
0877
0878     return false;
0879 }
0880
/* fallocate() mode bits that xfs_file_fallocate() does not reject. */
#define XFS_FALLOC_FL_SUPPORTED     \
        (FALLOC_FL_KEEP_SIZE |      \
         FALLOC_FL_PUNCH_HOLE |     \
         FALLOC_FL_COLLAPSE_RANGE | \
         FALLOC_FL_ZERO_RANGE |     \
         FALLOC_FL_INSERT_RANGE |   \
         FALLOC_FL_UNSHARE_RANGE)
0885
0886 STATIC long
0887 xfs_file_fallocate(
0888     struct file     *file,
0889     int         mode,
0890     loff_t          offset,
0891     loff_t          len)
0892 {
0893     struct inode        *inode = file_inode(file);
0894     struct xfs_inode    *ip = XFS_I(inode);
0895     long            error;
0896     uint            iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
0897     loff_t          new_size = 0;
0898     bool            do_file_insert = false;
0899
0900     if (!S_ISREG(inode->i_mode))
0901         return -EINVAL;
0902     if (mode & ~XFS_FALLOC_FL_SUPPORTED)
0903         return -EOPNOTSUPP;
0904
0905     xfs_ilock(ip, iolock);
0906     error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
0907     if (error)
0908         goto out_unlock;
0909
0910     /*
0911      * Must wait for all AIO to complete before we continue as AIO can
0912      * change the file size on completion without holding any locks we
0913      * currently hold. We must do this first because AIO can update both
0914      * the on disk and in memory inode sizes, and the operations that follow
0915      * require the in-memory size to be fully up-to-date.
0916      */
0917     inode_dio_wait(inode);
0918
0919     /*
0920      * Now AIO and DIO has drained we flush and (if necessary) invalidate
0921      * the cached range over the first operation we are about to run.
0922      *
0923      * We care about zero and collapse here because they both run a hole
0924      * punch over the range first. Because that can zero data, and the range
0925      * of invalidation for the shift operations is much larger, we still do
0926      * the required flush for collapse in xfs_prepare_shift().
0927      *
0928      * Insert has the same range requirements as collapse, and we extend the
0929      * file first which can zero data. Hence insert has the same
0930      * flush/invalidate requirements as collapse and so they are both
0931      * handled at the right time by xfs_prepare_shift().
0932      */
0933     if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
0934             FALLOC_FL_COLLAPSE_RANGE)) {
0935         error = xfs_flush_unmap_range(ip, offset, len);
0936         if (error)
0937             goto out_unlock;
0938     }
0939
0940     error = file_modified(file);
0941     if (error)
0942         goto out_unlock;
0943
0944     if (mode & FALLOC_FL_PUNCH_HOLE) {
0945         error = xfs_free_file_space(ip, offset, len);
0946         if (error)
0947             goto out_unlock;
0948     } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
0949         if (!xfs_is_falloc_aligned(ip, offset, len)) {
0950             error = -EINVAL;
0951             goto out_unlock;
0952         }
0953
0954         /*
0955          * There is no need to overlap collapse range with EOF,
0956          * in which case it is effectively a truncate operation
0957          */
0958         if (offset + len >= i_size_read(inode)) {
0959             error = -EINVAL;
0960             goto out_unlock;
0961         }
0962
0963         new_size = i_size_read(inode) - len;
0964
0965         error = xfs_collapse_file_space(ip, offset, len);
0966         if (error)
0967             goto out_unlock;
0968     } else if (mode & FALLOC_FL_INSERT_RANGE) {
0969         loff_t      isize = i_size_read(inode);
0970
0971         if (!xfs_is_falloc_aligned(ip, offset, len)) {
0972             error = -EINVAL;
0973             goto out_unlock;
0974         }
0975
0976         /*
0977          * New inode size must not exceed ->s_maxbytes, accounting for
0978          * possible signed overflow.
0979          */
0980         if (inode->i_sb->s_maxbytes - isize < len) {
0981             error = -EFBIG;
0982             goto out_unlock;
0983         }
0984         new_size = isize + len;
0985
0986         /* Offset should be less than i_size */
0987         if (offset >= isize) {
0988             error = -EINVAL;
0989             goto out_unlock;
0990         }
0991         do_file_insert = true;
0992     } else {
0993         if (!(mode & FALLOC_FL_KEEP_SIZE) &&
0994             offset + len > i_size_read(inode)) {
0995             new_size = offset + len;
0996             error = inode_newsize_ok(inode, new_size);
0997             if (error)
0998                 goto out_unlock;
0999         }
1000
1001         if (mode & FALLOC_FL_ZERO_RANGE) {
1002             /*
1003              * Punch a hole and prealloc the range.  We use a hole
1004              * punch rather than unwritten extent conversion for two
1005              * reasons:
1006              *
1007              *   1.) Hole punch handles partial block zeroing for us.
1008              *   2.) If prealloc returns ENOSPC, the file range is
1009              *       still zero-valued by virtue of the hole punch.
1010              */
1011             unsigned int blksize = i_blocksize(inode);
1012
1013             trace_xfs_zero_file_space(ip);
1014
1015             error = xfs_free_file_space(ip, offset, len);
1016             if (error)
1017                 goto out_unlock;
1018
1019             len = round_up(offset + len, blksize) -
1020                   round_down(offset, blksize);
1021             offset = round_down(offset, blksize);
1022         } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
1023             error = xfs_reflink_unshare(ip, offset, len);
1024             if (error)
1025                 goto out_unlock;
1026         } else {
1027             /*
1028              * If always_cow mode we can't use preallocations and
1029              * thus should not create them.
1030              */
1031             if (xfs_is_always_cow_inode(ip)) {
1032                 error = -EOPNOTSUPP;
1033                 goto out_unlock;
1034             }
1035         }
1036
1037         if (!xfs_is_always_cow_inode(ip)) {
1038             error = xfs_alloc_file_space(ip, offset, len);
1039             if (error)
1040                 goto out_unlock;
1041         }
1042     }
1043
1044     /* Change file size if needed */
1045     if (new_size) {
1046         struct iattr iattr;
1047
1048         iattr.ia_valid = ATTR_SIZE;
1049         iattr.ia_size = new_size;
1050         error = xfs_vn_setattr_size(file_mnt_user_ns(file),
1051                         file_dentry(file), &iattr);
1052         if (error)
1053             goto out_unlock;
1054     }
1055
1056     /*
1057      * Perform hole insertion now that the file size has been
1058      * updated so that if we crash during the operation we don't
1059      * leave shifted extents past EOF and hence losing access to
1060      * the data that is contained within them.
1061      */
1062     if (do_file_insert) {
1063         error = xfs_insert_file_space(ip, offset, len);
1064         if (error)
1065             goto out_unlock;
1066     }
1067
1068     if (xfs_file_sync_writes(file))
1069         error = xfs_log_force_inode(ip);
1070
1071 out_unlock:
1072     xfs_iunlock(ip, iolock);
1073     return error;
1074 }
1075
1076 STATIC int
1077 xfs_file_fadvise(
1078     struct file *file,
1079     loff_t      start,
1080     loff_t      end,
1081     int     advice)
1082 {
1083     struct xfs_inode *ip = XFS_I(file_inode(file));
1084     int ret;
1085     int lockflags = 0;
1086
1087     /*
1088      * Operations creating pages in page cache need protection from hole
1089      * punching and similar ops
1090      */
1091     if (advice == POSIX_FADV_WILLNEED) {
1092         lockflags = XFS_IOLOCK_SHARED;
1093         xfs_ilock(ip, lockflags);
1094     }
1095     ret = generic_fadvise(file, start, end, advice);
1096     if (lockflags)
1097         xfs_iunlock(ip, lockflags);
1098     return ret;
1099 }
1100
1101 STATIC loff_t
1102 xfs_file_remap_range(
1103     struct file     *file_in,
1104     loff_t          pos_in,
1105     struct file     *file_out,
1106     loff_t          pos_out,
1107     loff_t          len,
1108     unsigned int        remap_flags)
1109 {
1110     struct inode        *inode_in = file_inode(file_in);
1111     struct xfs_inode    *src = XFS_I(inode_in);
1112     struct inode        *inode_out = file_inode(file_out);
1113     struct xfs_inode    *dest = XFS_I(inode_out);
1114     struct xfs_mount    *mp = src->i_mount;
1115     loff_t          remapped = 0;
1116     xfs_extlen_t        cowextsize;
1117     int         ret;
1118
1119     if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1120         return -EINVAL;
1121
1122     if (!xfs_has_reflink(mp))
1123         return -EOPNOTSUPP;
1124
1125     if (xfs_is_shutdown(mp))
1126         return -EIO;
1127
1128     /* Prepare and then clone file data. */
1129     ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1130             &len, remap_flags);
1131     if (ret || len == 0)
1132         return ret;
1133
1134     trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1135
1136     ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1137             &remapped);
1138     if (ret)
1139         goto out_unlock;
1140
1141     /*
1142      * Carry the cowextsize hint from src to dest if we're sharing the
1143      * entire source file to the entire destination file, the source file
1144      * has a cowextsize hint, and the destination file does not.
1145      */
1146     cowextsize = 0;
1147     if (pos_in == 0 && len == i_size_read(inode_in) &&
1148         (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1149         pos_out == 0 && len >= i_size_read(inode_out) &&
1150         !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
1151         cowextsize = src->i_cowextsize;
1152
1153     ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1154             remap_flags);
1155     if (ret)
1156         goto out_unlock;
1157
1158     if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1159         xfs_log_force_inode(dest);
1160 out_unlock:
1161     xfs_iunlock2_io_mmap(src, dest);
1162     if (ret)
1163         trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1164     return remapped > 0 ? remapped : ret;
1165 }
1166
1167 STATIC int
1168 xfs_file_open(
1169     struct inode    *inode,
1170     struct file *file)
1171 {
1172     if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1173         return -EIO;
1174     file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
1175     return generic_file_open(inode, file);
1176 }
1177
1178 STATIC int
1179 xfs_dir_open(
1180     struct inode    *inode,
1181     struct file *file)
1182 {
1183     struct xfs_inode *ip = XFS_I(inode);
1184     unsigned int    mode;
1185     int     error;
1186
1187     error = xfs_file_open(inode, file);
1188     if (error)
1189         return error;
1190
1191     /*
1192      * If there are any blocks, read-ahead block 0 as we're almost
1193      * certain to have the next operation be a read there.
1194      */
1195     mode = xfs_ilock_data_map_shared(ip);
1196     if (ip->i_df.if_nextents > 0)
1197         error = xfs_dir3_data_readahead(ip, 0, 0);
1198     xfs_iunlock(ip, mode);
1199     return error;
1200 }
1201
1202 STATIC int
1203 xfs_file_release(
1204     struct inode    *inode,
1205     struct file *filp)
1206 {
1207     return xfs_release(XFS_I(inode));
1208 }
1209
1210 STATIC int
1211 xfs_file_readdir(
1212     struct file *file,
1213     struct dir_context *ctx)
1214 {
1215     struct inode    *inode = file_inode(file);
1216     xfs_inode_t *ip = XFS_I(inode);
1217     size_t      bufsize;
1218
1219     /*
1220      * The Linux API doesn't pass down the total size of the buffer
1221      * we read into down to the filesystem.  With the filldir concept
1222      * it's not needed for correct information, but the XFS dir2 leaf
1223      * code wants an estimate of the buffer size to calculate it's
1224      * readahead window and size the buffers used for mapping to
1225      * physical blocks.
1226      *
1227      * Try to give it an estimate that's good enough, maybe at some
1228      * point we can change the ->readdir prototype to include the
1229      * buffer size.  For now we use the current glibc buffer size.
1230      */
1231     bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1232
1233     return xfs_readdir(NULL, ip, ctx, bufsize);
1234 }
1235
1236 STATIC loff_t
1237 xfs_file_llseek(
1238     struct file *file,
1239     loff_t      offset,
1240     int     whence)
1241 {
1242     struct inode        *inode = file->f_mapping->host;
1243
1244     if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1245         return -EIO;
1246
1247     switch (whence) {
1248     default:
1249         return generic_file_llseek(file, offset, whence);
1250     case SEEK_HOLE:
1251         offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1252         break;
1253     case SEEK_DATA:
1254         offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1255         break;
1256     }
1257
1258     if (offset < 0)
1259         return offset;
1260     return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1261 }
1262
1263 #ifdef CONFIG_FS_DAX
1264 static int
1265 xfs_dax_fault(
1266     struct vm_fault     *vmf,
1267     enum page_entry_size    pe_size,
1268     bool            write_fault,
1269     pfn_t           *pfn)
1270 {
1271     return dax_iomap_fault(vmf, pe_size, pfn, NULL,
1272             (write_fault && !vmf->cow_page) ?
1273                 &xfs_dax_write_iomap_ops :
1274                 &xfs_read_iomap_ops);
1275 }
1276 #else
1277 static int
1278 xfs_dax_fault(
1279     struct vm_fault     *vmf,
1280     enum page_entry_size    pe_size,
1281     bool            write_fault,
1282     pfn_t           *pfn)
1283 {
1284     return 0;
1285 }
1286 #endif
1287
1288 /*
1289  * Locking for serialisation of IO during page faults. This results in a lock
1290  * ordering of:
1291  *
1292  * mmap_lock (MM)
1293  *   sb_start_pagefault(vfs, freeze)
1294  *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1295  *       page_lock (MM)
1296  *         i_lock (XFS - extent map serialisation)
1297  */
1298 static vm_fault_t
1299 __xfs_filemap_fault(
1300     struct vm_fault     *vmf,
1301     enum page_entry_size    pe_size,
1302     bool            write_fault)
1303 {
1304     struct inode        *inode = file_inode(vmf->vma->vm_file);
1305     struct xfs_inode    *ip = XFS_I(inode);
1306     vm_fault_t      ret;
1307
1308     trace_xfs_filemap_fault(ip, pe_size, write_fault);
1309
1310     if (write_fault) {
1311         sb_start_pagefault(inode->i_sb);
1312         file_update_time(vmf->vma->vm_file);
1313     }
1314
1315     if (IS_DAX(inode)) {
1316         pfn_t pfn;
1317
1318         xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1319         ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
1320         if (ret & VM_FAULT_NEEDDSYNC)
1321             ret = dax_finish_sync_fault(vmf, pe_size, pfn);
1322         xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1323     } else {
1324         if (write_fault) {
1325             xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1326             ret = iomap_page_mkwrite(vmf,
1327                     &xfs_buffered_write_iomap_ops);
1328             xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1329         } else {
1330             ret = filemap_fault(vmf);
1331         }
1332     }
1333
1334     if (write_fault)
1335         sb_end_pagefault(inode->i_sb);
1336     return ret;
1337 }
1338
1339 static inline bool
1340 xfs_is_write_fault(
1341     struct vm_fault     *vmf)
1342 {
1343     return (vmf->flags & FAULT_FLAG_WRITE) &&
1344            (vmf->vma->vm_flags & VM_SHARED);
1345 }
1346
1347 static vm_fault_t
1348 xfs_filemap_fault(
1349     struct vm_fault     *vmf)
1350 {
1351     /* DAX can shortcut the normal fault path on write faults! */
1352     return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
1353             IS_DAX(file_inode(vmf->vma->vm_file)) &&
1354             xfs_is_write_fault(vmf));
1355 }
1356
1357 static vm_fault_t
1358 xfs_filemap_huge_fault(
1359     struct vm_fault     *vmf,
1360     enum page_entry_size    pe_size)
1361 {
1362     if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1363         return VM_FAULT_FALLBACK;
1364
1365     /* DAX can shortcut the normal fault path on write faults! */
1366     return __xfs_filemap_fault(vmf, pe_size,
1367             xfs_is_write_fault(vmf));
1368 }
1369
1370 static vm_fault_t
1371 xfs_filemap_page_mkwrite(
1372     struct vm_fault     *vmf)
1373 {
1374     return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1375 }
1376
1377 /*
1378  * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1379  * on write faults. In reality, it needs to serialise against truncate and
1380  * prepare memory for writing so handle is as standard write fault.
1381  */
1382 static vm_fault_t
1383 xfs_filemap_pfn_mkwrite(
1384     struct vm_fault     *vmf)
1385 {
1386
1387     return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1388 }
1389
1390 static vm_fault_t
1391 xfs_filemap_map_pages(
1392     struct vm_fault     *vmf,
1393     pgoff_t         start_pgoff,
1394     pgoff_t         end_pgoff)
1395 {
1396     struct inode        *inode = file_inode(vmf->vma->vm_file);
1397     vm_fault_t ret;
1398
1399     xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1400     ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
1401     xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1402     return ret;
1403 }
1404
1405 static const struct vm_operations_struct xfs_file_vm_ops = {
1406     .fault      = xfs_filemap_fault,
1407     .huge_fault = xfs_filemap_huge_fault,
1408     .map_pages  = xfs_filemap_map_pages,
1409     .page_mkwrite   = xfs_filemap_page_mkwrite,
1410     .pfn_mkwrite    = xfs_filemap_pfn_mkwrite,
1411 };
1412
1413 STATIC int
1414 xfs_file_mmap(
1415     struct file     *file,
1416     struct vm_area_struct   *vma)
1417 {
1418     struct inode        *inode = file_inode(file);
1419     struct xfs_buftarg  *target = xfs_inode_buftarg(XFS_I(inode));
1420
1421     /*
1422      * We don't support synchronous mappings for non-DAX files and
1423      * for DAX files if underneath dax_device is not synchronous.
1424      */
1425     if (!daxdev_mapping_supported(vma, target->bt_daxdev))
1426         return -EOPNOTSUPP;
1427
1428     file_accessed(file);
1429     vma->vm_ops = &xfs_file_vm_ops;
1430     if (IS_DAX(inode))
1431         vma->vm_flags |= VM_HUGEPAGE;
1432     return 0;
1433 }
1434
1435 const struct file_operations xfs_file_operations = {
1436     .llseek     = xfs_file_llseek,
1437     .read_iter  = xfs_file_read_iter,
1438     .write_iter = xfs_file_write_iter,
1439     .splice_read    = generic_file_splice_read,
1440     .splice_write   = iter_file_splice_write,
1441     .iopoll     = iocb_bio_iopoll,
1442     .unlocked_ioctl = xfs_file_ioctl,
1443 #ifdef CONFIG_COMPAT
1444     .compat_ioctl   = xfs_file_compat_ioctl,
1445 #endif
1446     .mmap       = xfs_file_mmap,
1447     .mmap_supported_flags = MAP_SYNC,
1448     .open       = xfs_file_open,
1449     .release    = xfs_file_release,
1450     .fsync      = xfs_file_fsync,
1451     .get_unmapped_area = thp_get_unmapped_area,
1452     .fallocate  = xfs_file_fallocate,
1453     .fadvise    = xfs_file_fadvise,
1454     .remap_file_range = xfs_file_remap_range,
1455 };
1456
1457 const struct file_operations xfs_dir_file_operations = {
1458     .open       = xfs_dir_open,
1459     .read       = generic_read_dir,
1460     .iterate_shared = xfs_file_readdir,
1461     .llseek     = generic_file_llseek,
1462     .unlocked_ioctl = xfs_file_ioctl,
1463 #ifdef CONFIG_COMPAT
1464     .compat_ioctl   = xfs_file_compat_ioctl,
1465 #endif
1466     .fsync      = xfs_dir_fsync,
1467 };