// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/file.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/file.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4 fs regular file handling primitives
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *  (jj@sunsite.ms.mff.cuni.cz)
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include <linux/backing-dev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

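/*
 * Direct I/O is only supported when none of the following holds for the
 * inode: fscrypt cannot perform DIO on it, fs-verity is enabled, data
 * journalling is in effect, or the data is stored inline in the inode.
 * Callers fall back to buffered I/O otherwise.
 */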
static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
{
    struct inode *inode = file_inode(iocb->ki_filp);

    if (!fscrypt_dio_supported(iocb, iter))
        return false;
    if (fsverity_active(inode))
        return false;
    if (ext4_should_journal_data(inode))
        return false;
    if (ext4_has_inline_data(inode))
        return false;
    return true;
}

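/*
 * Direct I/O read path: take i_rwsem shared (trylock under IOCB_NOWAIT),
 * fall back to buffered reading when DIO is not supported for this
 * inode, and otherwise hand the iter to iomap_dio_rw().
 */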
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
    ssize_t ret;
    struct inode *inode = file_inode(iocb->ki_filp);

    if (iocb->ki_flags & IOCB_NOWAIT) {
        if (!inode_trylock_shared(inode))
            return -EAGAIN;
    } else {
        inode_lock_shared(inode);
    }

    if (!ext4_dio_supported(iocb, to)) {
        inode_unlock_shared(inode);
        /*
         * Fallback to buffered I/O if the operation being performed on
         * the inode is not supported by direct I/O. The IOCB_DIRECT
         * flag needs to be cleared here in order to ensure that the
         * direct I/O path within generic_file_read_iter() is not
         * taken.
         */
        iocb->ki_flags &= ~IOCB_DIRECT;
        return generic_file_read_iter(iocb, to);
    }

    ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0);
    inode_unlock_shared(inode);

    file_accessed(iocb->ki_filp);
    return ret;
}

#ifdef CONFIG_FS_DAX
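/*
 * DAX read path: like the DIO path this takes i_rwsem shared, but the
 * actual copy is done synchronously through dax_iomap_rw() against the
 * DAX device instead of going through the block layer.
 */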
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
    struct inode *inode = file_inode(iocb->ki_filp);
    ssize_t ret;

    if (iocb->ki_flags & IOCB_NOWAIT) {
        if (!inode_trylock_shared(inode))
            return -EAGAIN;
    } else {
        inode_lock_shared(inode);
    }
    /*
     * Recheck under inode lock - at this point we are sure it cannot
     * change anymore
     */
    if (!IS_DAX(inode)) {
        inode_unlock_shared(inode);
        /* Fallback to buffered IO in case we cannot support DAX */
        return generic_file_read_iter(iocb, to);
    }
    ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
    inode_unlock_shared(inode);

    file_accessed(iocb->ki_filp);
    return ret;
}
#endif

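/*
 * Top-level ->read_iter(): bail out on a forced shutdown, skip the
 * atime update for zero-length reads, then dispatch to the DAX, direct
 * or buffered read path.
 */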
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
    struct inode *inode = file_inode(iocb->ki_filp);

    if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
        return -EIO;

    if (!iov_iter_count(to))
        return 0; /* skip atime */

#ifdef CONFIG_FS_DAX
    if (IS_DAX(inode))
        return ext4_dax_read_iter(iocb, to);
#endif
    if (iocb->ki_flags & IOCB_DIRECT)
        return ext4_dio_read_iter(iocb, to);

    return generic_file_read_iter(iocb, to);
}

/*
 * Called when an inode is released. Note that this is different
 * from ext4_file_open: open gets called at every open, but release
 * gets called only when /all/ the files are closed.
 */
static int ext4_release_file(struct inode *inode, struct file *filp)
{
    if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
        ext4_alloc_da_blocks(inode);
        ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
    }
    /* if we are the last writer on the inode, drop the block reservation */
    if ((filp->f_mode & FMODE_WRITE) &&
            (atomic_read(&inode->i_writecount) == 1) &&
            !EXT4_I(inode)->i_reserved_data_blocks) {
        down_write(&EXT4_I(inode)->i_data_sem);
        ext4_discard_preallocations(inode, 0);
        up_write(&EXT4_I(inode)->i_data_sem);
    }
    if (is_dx(inode) && filp->private_data)
        ext4_htree_free_dir_info(filp->private_data);

    return 0;
}

/*
 * This tests whether the IO in question is block-aligned or not.
 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
 * are converted to written only after the IO is complete.  Until they are
 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
 * it needs to zero out portions of the start and/or end block.  If 2 AIO
 * threads are at work on the same unwritten block, they must be synchronized
 * or one thread will zero the other's data, causing corruption.
 */
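/*
 * For example, with a 4KiB block size blockmask is 0xfff: a 512-byte
 * write at pos 4096 makes iov_iter_alignment() report a value with the
 * 512 bit set, so (4096 | 512) & 0xfff == 512 and the IO counts as
 * unaligned even though it starts on a block boundary.
 */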
static bool
ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
{
    struct super_block *sb = inode->i_sb;
    unsigned long blockmask = sb->s_blocksize - 1;

    if ((pos | iov_iter_alignment(from)) & blockmask)
        return true;

    return false;
}

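/* Does the IO extend i_size or the on-disk size (i_disksize)? */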
static bool
ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
{
    if (offset + len > i_size_read(inode) ||
        offset + len > EXT4_I(inode)->i_disksize)
        return true;
    return false;
}

/* Is IO overwriting allocated and initialized blocks? */
static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
{
    struct ext4_map_blocks map;
    unsigned int blkbits = inode->i_blkbits;
    int err, blklen;

    if (pos + len > i_size_read(inode))
        return false;

    map.m_lblk = pos >> blkbits;
    map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
    blklen = map.m_len;

    err = ext4_map_blocks(NULL, inode, &map, 0);
    /*
     * 'err == blklen' means that all of the blocks have been
     * preallocated, regardless of whether they have been initialized
     * or not. To exclude unwritten extents, we need to check m_flags.
     */
    return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
}

static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
                     struct iov_iter *from)
{
    struct inode *inode = file_inode(iocb->ki_filp);
    ssize_t ret;

    if (unlikely(IS_IMMUTABLE(inode)))
        return -EPERM;

    ret = generic_write_checks(iocb, from);
    if (ret <= 0)
        return ret;

    /*
     * If we have encountered a bitmap-format file, the size limit
     * is smaller than s_maxbytes, which is for extent-mapped files.
     */
    if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

        if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
            return -EFBIG;
        iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
    }

    return iov_iter_count(from);
}

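/*
 * Like ext4_generic_write_checks(), but additionally calls
 * file_modified() to update the timestamps and strip setuid/setgid
 * bits before the write proceeds.
 */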
static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
    ssize_t ret, count;

    count = ext4_generic_write_checks(iocb, from);
    if (count <= 0)
        return count;

    ret = file_modified(iocb->ki_filp);
    if (ret)
        return ret;
    return count;
}

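/*
 * Buffered write path: runs under the exclusive i_rwsem and never with
 * IOCB_NOWAIT, since generic_perform_write() may block on page cache
 * allocation and writeback.
 */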
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
                    struct iov_iter *from)
{
    ssize_t ret;
    struct inode *inode = file_inode(iocb->ki_filp);

    if (iocb->ki_flags & IOCB_NOWAIT)
        return -EOPNOTSUPP;

    inode_lock(inode);
    ret = ext4_write_checks(iocb, from);
    if (ret <= 0)
        goto out;

    current->backing_dev_info = inode_to_bdi(inode);
    ret = generic_perform_write(iocb, from);
    current->backing_dev_info = NULL;

out:
    inode_unlock(inode);
    if (likely(ret > 0)) {
        iocb->ki_pos += ret;
        ret = generic_write_sync(iocb, ret);
    }

    return ret;
}

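/*
 * Finish off an extending write: update the on-disk inode size to
 * match what was written, drop the inode from the orphan list once the
 * sizes are consistent, and truncate any blocks that were allocated
 * beyond the amount actually written (e.g. after a short or failed
 * write).
 */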
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
                       ssize_t written, size_t count)
{
    handle_t *handle;
    bool truncate = false;
    u8 blkbits = inode->i_blkbits;
    ext4_lblk_t written_blk, end_blk;
    int ret;

    /*
     * Note that EXT4_I(inode)->i_disksize can get extended up to
     * inode->i_size while the I/O was running due to writeback of delalloc
     * blocks. But, the code in ext4_iomap_alloc() is careful to use
     * zeroed/unwritten extents if this is possible; thus we won't leave
     * uninitialized blocks in a file even if we didn't succeed in writing
     * as much as we intended.
     */
    WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
    if (offset + count <= EXT4_I(inode)->i_disksize) {
        /*
         * We need to ensure that the inode is removed from the orphan
         * list if it has been added prematurely, due to writeback of
         * delalloc blocks.
         */
        if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
            handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);

            if (IS_ERR(handle)) {
                ext4_orphan_del(NULL, inode);
                return PTR_ERR(handle);
            }

            ext4_orphan_del(handle, inode);
            ext4_journal_stop(handle);
        }

        return written;
    }

    if (written < 0)
        goto truncate;

    handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
    if (IS_ERR(handle)) {
        written = PTR_ERR(handle);
        goto truncate;
    }

    if (ext4_update_inode_size(inode, offset + written)) {
        ret = ext4_mark_inode_dirty(handle, inode);
        if (unlikely(ret)) {
            written = ret;
            ext4_journal_stop(handle);
            goto truncate;
        }
    }

    /*
     * We may need to truncate allocated but not written blocks beyond EOF.
     */
    written_blk = ALIGN(offset + written, 1 << blkbits);
    end_blk = ALIGN(offset + count, 1 << blkbits);
    if (written_blk < end_blk && ext4_can_truncate(inode))
        truncate = true;

    /*
     * Remove the inode from the orphan list if it has been extended and
     * everything went OK.
     */
    if (!truncate && inode->i_nlink)
        ext4_orphan_del(handle, inode);
    ext4_journal_stop(handle);

    if (truncate) {
truncate:
        ext4_truncate_failed_write(inode);
        /*
         * If the truncate operation failed early, then the inode may
         * still be on the orphan list. In that case, we need to try to
         * remove the inode from the in-memory linked list.
         */
        if (inode->i_nlink)
            ext4_orphan_del(NULL, inode);
    }

    return written;
}

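/*
 * iomap DIO ->end_io() hook: convert unwritten extents covered by the
 * completed write and, for extending writes, bump the in-memory i_size
 * before the page cache is invalidated.
 */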
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
                 int error, unsigned int flags)
{
    loff_t pos = iocb->ki_pos;
    struct inode *inode = file_inode(iocb->ki_filp);

    if (error)
        return error;

    if (size && flags & IOMAP_DIO_UNWRITTEN) {
        error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
        if (error < 0)
            return error;
    }
    /*
     * If we are extending the file, we have to update i_size here before
     * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
     * buffered reads could zero out too much from page cache pages. Update
     * of on-disk size will happen later in ext4_dio_write_iter() where
     * we have enough information to also perform orphan list handling etc.
     * Note that we perform all extending writes synchronously under
     * i_rwsem held exclusively so i_size update is safe here in that case.
     * If the write was not extending, we cannot see pos > i_size here
     * because operations reducing i_size like truncate wait for all
     * outstanding DIO before updating i_size.
     */
    pos += size;
    if (pos > i_size_read(inode))
        i_size_write(inode, pos);

    return 0;
}

static const struct iomap_dio_ops ext4_dio_write_ops = {
    .end_io = ext4_dio_write_end_io,
};

/*
 * The intention here is to start with a shared lock acquired, then see if any
 * condition requires an exclusive inode lock. If yes, then we restart the
 * whole operation by releasing the shared lock and acquiring the exclusive
 * lock.
 *
 * - For unaligned IO we never take the shared lock, as it may cause data
 *   corruption when two unaligned IOs try to modify the same block, e.g.
 *   while zeroing.
 *
 * - For extending writes we don't take the shared lock either, since they
 *   require updating the inode's i_disksize and/or orphan handling under the
 *   exclusive lock.
 *
 * - Shared locking is thus mostly used for pure overwrites; in all other
 *   cases we switch to the exclusive i_rwsem lock.
 */
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
                     bool *ilock_shared, bool *extend)
{
    struct file *file = iocb->ki_filp;
    struct inode *inode = file_inode(file);
    loff_t offset;
    size_t count;
    ssize_t ret;

restart:
    ret = ext4_generic_write_checks(iocb, from);
    if (ret <= 0)
        goto out;

    offset = iocb->ki_pos;
    count = ret;
    if (ext4_extending_io(inode, offset, count))
        *extend = true;
    /*
     * Determine whether the IO operation will overwrite allocated
     * and initialized blocks.
     * We need exclusive i_rwsem for changing security info
     * in file_modified().
     */
    if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
         !ext4_overwrite_io(inode, offset, count))) {
        if (iocb->ki_flags & IOCB_NOWAIT) {
            ret = -EAGAIN;
            goto out;
        }
        inode_unlock_shared(inode);
        *ilock_shared = false;
        inode_lock(inode);
        goto restart;
    }

    ret = file_modified(file);
    if (ret < 0)
        goto out;

    return count;
out:
    if (*ilock_shared)
        inode_unlock_shared(inode);
    else
        inode_unlock(inode);
    return ret;
}

static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
    ssize_t ret;
    handle_t *handle;
    struct inode *inode = file_inode(iocb->ki_filp);
    loff_t offset = iocb->ki_pos;
    size_t count = iov_iter_count(from);
    const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
    bool extend = false, unaligned_io = false;
    bool ilock_shared = true;

    /*
     * We initially start with shared inode lock unless it is
     * unaligned IO which needs exclusive lock anyways.
     */
    if (ext4_unaligned_io(inode, from, offset)) {
        unaligned_io = true;
        ilock_shared = false;
    }
    /*
     * Quick check here without any i_rwsem lock to see if it is extending
     * IO. A more reliable check is done in ext4_dio_write_checks() with
     * proper locking in place.
     */
    if (offset + count > i_size_read(inode))
        ilock_shared = false;

    if (iocb->ki_flags & IOCB_NOWAIT) {
        if (ilock_shared) {
            if (!inode_trylock_shared(inode))
                return -EAGAIN;
        } else {
            if (!inode_trylock(inode))
                return -EAGAIN;
        }
    } else {
        if (ilock_shared)
            inode_lock_shared(inode);
        else
            inode_lock(inode);
    }

    /* Fallback to buffered I/O if the inode does not support direct I/O. */
    if (!ext4_dio_supported(iocb, from)) {
        if (ilock_shared)
            inode_unlock_shared(inode);
        else
            inode_unlock(inode);
        return ext4_buffered_write_iter(iocb, from);
    }

    ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
    if (ret <= 0)
        return ret;

    /* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */
    if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) {
        ret = -EAGAIN;
        goto out;
    }

    offset = iocb->ki_pos;
    count = ret;

    /*
     * Unaligned direct IOs must be serialized with each other, as the
     * zeroing of partial blocks by two competing unaligned IOs can
     * result in data corruption.
     *
     * So we make sure we don't allow any unaligned IO in flight.
     * For IOs where we need not wait (like unaligned non-AIO DIO),
     * the inode_dio_wait() below may anyway turn out to be a no-op,
     * since we start with the exclusive lock held.
     */
    if (unaligned_io)
        inode_dio_wait(inode);

    if (extend) {
        handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
        if (IS_ERR(handle)) {
            ret = PTR_ERR(handle);
            goto out;
        }

        ret = ext4_orphan_add(handle, inode);
        if (ret) {
            ext4_journal_stop(handle);
            goto out;
        }

        ext4_journal_stop(handle);
    }

    if (ilock_shared)
        iomap_ops = &ext4_iomap_overwrite_ops;
    ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
               (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
               NULL, 0);
    if (ret == -ENOTBLK)
        ret = 0;

    if (extend)
        ret = ext4_handle_inode_extension(inode, offset, ret, count);

out:
    if (ilock_shared)
        inode_unlock_shared(inode);
    else
        inode_unlock(inode);

    if (ret >= 0 && iov_iter_count(from)) {
        ssize_t err;
        loff_t endbyte;

        offset = iocb->ki_pos;
        err = ext4_buffered_write_iter(iocb, from);
        if (err < 0)
            return err;

        /*
         * We need to ensure that the pages within the page cache for
         * the range covered by this I/O are written to disk and
         * invalidated. This is in an attempt to preserve the expected
         * direct I/O semantics in the case we fall back to buffered
         * I/O to complete the remainder of the I/O request.
         */
        ret += err;
        endbyte = offset + err - 1;
        err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
                           offset, endbyte);
        if (!err)
            invalidate_mapping_pages(iocb->ki_filp->f_mapping,
                         offset >> PAGE_SHIFT,
                         endbyte >> PAGE_SHIFT);
    }

    return ret;
}

#ifdef CONFIG_FS_DAX
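/*
 * DAX write path: always runs under the exclusive i_rwsem. If the
 * write goes beyond i_disksize, the inode is put on the orphan list
 * first so a crash mid-write cannot leave blocks past the on-disk
 * size; ext4_handle_inode_extension() cleans this up after
 * dax_iomap_rw() completes.
 */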
static ssize_t
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
    ssize_t ret;
    size_t count;
    loff_t offset;
    handle_t *handle;
    bool extend = false;
    struct inode *inode = file_inode(iocb->ki_filp);

    if (iocb->ki_flags & IOCB_NOWAIT) {
        if (!inode_trylock(inode))
            return -EAGAIN;
    } else {
        inode_lock(inode);
    }

    ret = ext4_write_checks(iocb, from);
    if (ret <= 0)
        goto out;

    offset = iocb->ki_pos;
    count = iov_iter_count(from);

    if (offset + count > EXT4_I(inode)->i_disksize) {
        handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
        if (IS_ERR(handle)) {
            ret = PTR_ERR(handle);
            goto out;
        }

        ret = ext4_orphan_add(handle, inode);
        if (ret) {
            ext4_journal_stop(handle);
            goto out;
        }

        extend = true;
        ext4_journal_stop(handle);
    }

    ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);

    if (extend)
        ret = ext4_handle_inode_extension(inode, offset, ret, count);
out:
    inode_unlock(inode);
    if (ret > 0)
        ret = generic_write_sync(iocb, ret);
    return ret;
}
#endif

static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
    struct inode *inode = file_inode(iocb->ki_filp);

    if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
        return -EIO;

#ifdef CONFIG_FS_DAX
    if (IS_DAX(inode))
        return ext4_dax_write_iter(iocb, from);
#endif
    if (iocb->ki_flags & IOCB_DIRECT)
        return ext4_dio_write_iter(iocb, from);
    else
        return ext4_buffered_write_iter(iocb, from);
}

#ifdef CONFIG_FS_DAX
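/*
 * DAX page fault handler. Write faults run inside a journal handle so
 * that block allocations triggered by the fault are properly logged,
 * and are retried on ENOSPC when ext4_should_retry_alloc() says a
 * retry might succeed.
 */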
static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
        enum page_entry_size pe_size)
{
    int error = 0;
    vm_fault_t result;
    int retries = 0;
    handle_t *handle = NULL;
    struct inode *inode = file_inode(vmf->vma->vm_file);
    struct super_block *sb = inode->i_sb;

    /*
     * We have to distinguish real writes from writes which will result in a
     * COW page; COW writes should *not* poke the journal (the file will not
     * be changed). Doing so would cause unintended failures when mounted
     * read-only.
     *
     * We check for VM_SHARED rather than vmf->cow_page since the latter is
     * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
     * other sizes, dax_iomap_fault will handle splitting / fallback so that
     * we eventually come back with a COW page.
     */
    bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
        (vmf->vma->vm_flags & VM_SHARED);
    struct address_space *mapping = vmf->vma->vm_file->f_mapping;
    pfn_t pfn;

    if (write) {
        sb_start_pagefault(sb);
        file_update_time(vmf->vma->vm_file);
        filemap_invalidate_lock_shared(mapping);
retry:
        handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
                           EXT4_DATA_TRANS_BLOCKS(sb));
        if (IS_ERR(handle)) {
            filemap_invalidate_unlock_shared(mapping);
            sb_end_pagefault(sb);
            return VM_FAULT_SIGBUS;
        }
    } else {
        filemap_invalidate_lock_shared(mapping);
    }
    result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
    if (write) {
        ext4_journal_stop(handle);

        if ((result & VM_FAULT_ERROR) && error == -ENOSPC &&
            ext4_should_retry_alloc(sb, &retries))
            goto retry;
        /* Handling synchronous page fault? */
        if (result & VM_FAULT_NEEDDSYNC)
            result = dax_finish_sync_fault(vmf, pe_size, pfn);
        filemap_invalidate_unlock_shared(mapping);
        sb_end_pagefault(sb);
    } else {
        filemap_invalidate_unlock_shared(mapping);
    }

    return result;
}

static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
{
    return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
}

static const struct vm_operations_struct ext4_dax_vm_ops = {
    .fault      = ext4_dax_fault,
    .huge_fault = ext4_dax_huge_fault,
    .page_mkwrite   = ext4_dax_fault,
    .pfn_mkwrite    = ext4_dax_fault,
};
#else
#define ext4_dax_vm_ops ext4_file_vm_ops
#endif

static const struct vm_operations_struct ext4_file_vm_ops = {
    .fault      = filemap_fault,
    .map_pages  = filemap_map_pages,
    .page_mkwrite   = ext4_page_mkwrite,
};

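/*
 * ->mmap(): pick the DAX or regular vm_ops for the mapping and reject
 * MAP_SYNC mappings that the underlying device cannot honour.
 */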
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
    struct inode *inode = file->f_mapping->host;
    struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
    struct dax_device *dax_dev = sbi->s_daxdev;

    if (unlikely(ext4_forced_shutdown(sbi)))
        return -EIO;

    /*
     * We don't support synchronous mappings for non-DAX files, nor
     * for DAX files if the underlying dax_device is not synchronous.
     */
    if (!daxdev_mapping_supported(vma, dax_dev))
        return -EOPNOTSUPP;

    file_accessed(file);
    if (IS_DAX(file_inode(file))) {
        vma->vm_ops = &ext4_dax_vm_ops;
        vma->vm_flags |= VM_HUGEPAGE;
    } else {
        vma->vm_ops = &ext4_file_vm_ops;
    }
    return 0;
}

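/*
 * Record the mount point of the filesystem in the superblock's
 * s_last_mounted field. Done at most once per mount (guarded by
 * EXT4_MF_MNTDIR_SAMPLED) and skipped entirely on read-only mounts.
 */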
static int ext4_sample_last_mounted(struct super_block *sb,
                    struct vfsmount *mnt)
{
    struct ext4_sb_info *sbi = EXT4_SB(sb);
    struct path path;
    char buf[64], *cp;
    handle_t *handle;
    int err;

    if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
        return 0;

    if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
        return 0;

    ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
    /*
     * Sample where the filesystem has been mounted and
     * store it in the superblock for sysadmin convenience
     * when trying to sort through large numbers of block
     * devices or filesystem images.
     */
    memset(buf, 0, sizeof(buf));
    path.mnt = mnt;
    path.dentry = mnt->mnt_root;
    cp = d_path(&path, buf, sizeof(buf));
    err = 0;
    if (IS_ERR(cp))
        goto out;

    handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
    err = PTR_ERR(handle);
    if (IS_ERR(handle))
        goto out;
    BUFFER_TRACE(sbi->s_sbh, "get_write_access");
    err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
                        EXT4_JTR_NONE);
    if (err)
        goto out_journal;
    lock_buffer(sbi->s_sbh);
    strncpy(sbi->s_es->s_last_mounted, cp,
        sizeof(sbi->s_es->s_last_mounted));
    ext4_superblock_csum_set(sb);
    unlock_buffer(sbi->s_sbh);
    ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
out_journal:
    ext4_journal_stop(handle);
out:
    sb_end_intwrite(sb);
    return err;
}

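/*
 * ->open(): besides sampling the mount point, this runs the fscrypt
 * and fsverity open checks, attaches the jbd2 inode for writable
 * opens, and advertises FMODE_NOWAIT / async buffered read support.
 */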
static int ext4_file_open(struct inode *inode, struct file *filp)
{
    int ret;

    if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
        return -EIO;

    ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
    if (ret)
        return ret;

    ret = fscrypt_file_open(inode, filp);
    if (ret)
        return ret;

    ret = fsverity_file_open(inode, filp);
    if (ret)
        return ret;

    /*
     * Set up the jbd2_inode if we are opening the inode for
     * writing and the journal is present
     */
    if (filp->f_mode & FMODE_WRITE) {
        ret = ext4_inode_attach_jinode(inode);
        if (ret < 0)
            return ret;
    }

    filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
    return dquot_file_open(inode, filp);
}

/*
 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
 * by calling generic_file_llseek_size() with the appropriate maxbytes
 * value for each.
 */
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
    struct inode *inode = file->f_mapping->host;
    loff_t maxbytes;

    if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
        maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
    else
        maxbytes = inode->i_sb->s_maxbytes;

    switch (whence) {
    default:
        return generic_file_llseek_size(file, offset, whence,
                        maxbytes, i_size_read(inode));
    case SEEK_HOLE:
        inode_lock_shared(inode);
        offset = iomap_seek_hole(inode, offset,
                     &ext4_iomap_report_ops);
        inode_unlock_shared(inode);
        break;
    case SEEK_DATA:
        inode_lock_shared(inode);
        offset = iomap_seek_data(inode, offset,
                     &ext4_iomap_report_ops);
        inode_unlock_shared(inode);
        break;
    }

    if (offset < 0)
        return offset;
    return vfs_setpos(file, offset, maxbytes);
}

const struct file_operations ext4_file_operations = {
    .llseek     = ext4_llseek,
    .read_iter  = ext4_file_read_iter,
    .write_iter = ext4_file_write_iter,
    .iopoll     = iocb_bio_iopoll,
    .unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl   = ext4_compat_ioctl,
#endif
    .mmap       = ext4_file_mmap,
    .mmap_supported_flags = MAP_SYNC,
    .open       = ext4_file_open,
    .release    = ext4_release_file,
    .fsync      = ext4_sync_file,
    .get_unmapped_area = thp_get_unmapped_area,
    .splice_read    = generic_file_splice_read,
    .splice_write   = iter_file_splice_write,
    .fallocate  = ext4_fallocate,
};

const struct inode_operations ext4_file_inode_operations = {
    .setattr    = ext4_setattr,
    .getattr    = ext4_file_getattr,
    .listxattr  = ext4_listxattr,
    .get_acl    = ext4_get_acl,
    .set_acl    = ext4_set_acl,
    .fiemap     = ext4_fiemap,
    .fileattr_get   = ext4_fileattr_get,
    .fileattr_set   = ext4_fileattr_set,
};