Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * High-level sync()-related operations
0004  */
0005 
0006 #include <linux/blkdev.h>
0007 #include <linux/kernel.h>
0008 #include <linux/file.h>
0009 #include <linux/fs.h>
0010 #include <linux/slab.h>
0011 #include <linux/export.h>
0012 #include <linux/namei.h>
0013 #include <linux/sched.h>
0014 #include <linux/writeback.h>
0015 #include <linux/syscalls.h>
0016 #include <linux/linkage.h>
0017 #include <linux/pagemap.h>
0018 #include <linux/quotaops.h>
0019 #include <linux/backing-dev.h>
0020 #include "internal.h"
0021 
/* The complete set of flag bits sync_file_range() accepts. */
#define VALID_FLAGS (SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_BEFORE | \
            SYNC_FILE_RANGE_WAIT_AFTER)
0024 
0025 /*
0026  * Write out and wait upon all dirty data associated with this
0027  * superblock.  Filesystem data as well as the underlying block
0028  * device.  Takes the superblock lock.
0029  */
0030 int sync_filesystem(struct super_block *sb)
0031 {
0032     int ret = 0;
0033 
0034     /*
0035      * We need to be protected against the filesystem going from
0036      * r/o to r/w or vice versa.
0037      */
0038     WARN_ON(!rwsem_is_locked(&sb->s_umount));
0039 
0040     /*
0041      * No point in syncing out anything if the filesystem is read-only.
0042      */
0043     if (sb_rdonly(sb))
0044         return 0;
0045 
0046     /*
0047      * Do the filesystem syncing work.  For simple filesystems
0048      * writeback_inodes_sb(sb) just dirties buffers with inodes so we have
0049      * to submit I/O for these buffers via sync_blockdev().  This also
0050      * speeds up the wait == 1 case since in that case write_inode()
0051      * methods call sync_dirty_buffer() and thus effectively write one block
0052      * at a time.
0053      */
0054     writeback_inodes_sb(sb, WB_REASON_SYNC);
0055     if (sb->s_op->sync_fs) {
0056         ret = sb->s_op->sync_fs(sb, 0);
0057         if (ret)
0058             return ret;
0059     }
0060     ret = sync_blockdev_nowait(sb->s_bdev);
0061     if (ret)
0062         return ret;
0063 
0064     sync_inodes_sb(sb);
0065     if (sb->s_op->sync_fs) {
0066         ret = sb->s_op->sync_fs(sb, 1);
0067         if (ret)
0068             return ret;
0069     }
0070     return sync_blockdev(sb->s_bdev);
0071 }
0072 EXPORT_SYMBOL(sync_filesystem);
0073 
/*
 * iterate_supers() callback: sync the inodes of one superblock unless it
 * is read-only.  @arg is not used.
 */
static void sync_inodes_one_sb(struct super_block *sb, void *arg)
{
    if (sb_rdonly(sb))
        return;

    sync_inodes_sb(sb);
}
0079 
0080 static void sync_fs_one_sb(struct super_block *sb, void *arg)
0081 {
0082     if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) &&
0083         sb->s_op->sync_fs)
0084         sb->s_op->sync_fs(sb, *(int *)arg);
0085 }
0086 
0087 /*
0088  * Sync everything. We start by waking flusher threads so that most of
0089  * writeback runs on all devices in parallel. Then we sync all inodes reliably
0090  * which effectively also waits for all flusher threads to finish doing
0091  * writeback. At this point all data is on disk so metadata should be stable
0092  * and we tell filesystems to sync their metadata via ->sync_fs() calls.
0093  * Finally, we writeout all block devices because some filesystems (e.g. ext2)
0094  * just write metadata (such as inodes or bitmaps) to block device page cache
0095  * and do not sync it on their own in ->sync_fs().
0096  */
0097 void ksys_sync(void)
0098 {
0099     int nowait = 0, wait = 1;
0100 
0101     wakeup_flusher_threads(WB_REASON_SYNC);
0102     iterate_supers(sync_inodes_one_sb, NULL);
0103     iterate_supers(sync_fs_one_sb, &nowait);
0104     iterate_supers(sync_fs_one_sb, &wait);
0105     sync_bdevs(false);
0106     sync_bdevs(true);
0107     if (unlikely(laptop_mode))
0108         laptop_sync_completion();
0109 }
0110 
/* sync() system call: run ksys_sync() and always report success. */
SYSCALL_DEFINE0(sync)
{
    ksys_sync();

    return 0;
}
0116 
0117 static void do_sync_work(struct work_struct *work)
0118 {
0119     int nowait = 0;
0120 
0121     /*
0122      * Sync twice to reduce the possibility we skipped some inodes / pages
0123      * because they were temporarily locked
0124      */
0125     iterate_supers(sync_inodes_one_sb, &nowait);
0126     iterate_supers(sync_fs_one_sb, &nowait);
0127     sync_bdevs(false);
0128     iterate_supers(sync_inodes_one_sb, &nowait);
0129     iterate_supers(sync_fs_one_sb, &nowait);
0130     sync_bdevs(false);
0131     printk("Emergency Sync complete\n");
0132     kfree(work);
0133 }
0134 
0135 void emergency_sync(void)
0136 {
0137     struct work_struct *work;
0138 
0139     work = kmalloc(sizeof(*work), GFP_ATOMIC);
0140     if (work) {
0141         INIT_WORK(work, do_sync_work);
0142         schedule_work(work);
0143     }
0144 }
0145 
0146 /*
0147  * sync a single super
0148  */
0149 SYSCALL_DEFINE1(syncfs, int, fd)
0150 {
0151     struct fd f = fdget(fd);
0152     struct super_block *sb;
0153     int ret, ret2;
0154 
0155     if (!f.file)
0156         return -EBADF;
0157     sb = f.file->f_path.dentry->d_sb;
0158 
0159     down_read(&sb->s_umount);
0160     ret = sync_filesystem(sb);
0161     up_read(&sb->s_umount);
0162 
0163     ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err);
0164 
0165     fdput(f);
0166     return ret ? ret : ret2;
0167 }
0168 
0169 /**
0170  * vfs_fsync_range - helper to sync a range of data & metadata to disk
0171  * @file:       file to sync
0172  * @start:      offset in bytes of the beginning of data range to sync
0173  * @end:        offset in bytes of the end of data range (inclusive)
0174  * @datasync:       perform only datasync
0175  *
0176  * Write back data in range @start..@end and metadata for @file to disk.  If
0177  * @datasync is set only metadata needed to access modified file data is
0178  * written.
0179  */
0180 int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
0181 {
0182     struct inode *inode = file->f_mapping->host;
0183 
0184     if (!file->f_op->fsync)
0185         return -EINVAL;
0186     if (!datasync && (inode->i_state & I_DIRTY_TIME))
0187         mark_inode_dirty_sync(inode);
0188     return file->f_op->fsync(file, start, end, datasync);
0189 }
0190 EXPORT_SYMBOL(vfs_fsync_range);
0191 
0192 /**
0193  * vfs_fsync - perform a fsync or fdatasync on a file
0194  * @file:       file to sync
0195  * @datasync:       only perform a fdatasync operation
0196  *
0197  * Write back data and metadata for @file to disk.  If @datasync is
0198  * set only metadata needed to access modified file data is written.
0199  */
0200 int vfs_fsync(struct file *file, int datasync)
0201 {
0202     return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
0203 }
0204 EXPORT_SYMBOL(vfs_fsync);
0205 
0206 static int do_fsync(unsigned int fd, int datasync)
0207 {
0208     struct fd f = fdget(fd);
0209     int ret = -EBADF;
0210 
0211     if (f.file) {
0212         ret = vfs_fsync(f.file, datasync);
0213         fdput(f);
0214     }
0215     return ret;
0216 }
0217 
0218 SYSCALL_DEFINE1(fsync, unsigned int, fd)
0219 {
0220     return do_fsync(fd, 0);
0221 }
0222 
0223 SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
0224 {
0225     return do_fsync(fd, 1);
0226 }
0227 
0228 int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
0229             unsigned int flags)
0230 {
0231     int ret;
0232     struct address_space *mapping;
0233     loff_t endbyte;         /* inclusive */
0234     umode_t i_mode;
0235 
0236     ret = -EINVAL;
0237     if (flags & ~VALID_FLAGS)
0238         goto out;
0239 
0240     endbyte = offset + nbytes;
0241 
0242     if ((s64)offset < 0)
0243         goto out;
0244     if ((s64)endbyte < 0)
0245         goto out;
0246     if (endbyte < offset)
0247         goto out;
0248 
0249     if (sizeof(pgoff_t) == 4) {
0250         if (offset >= (0x100000000ULL << PAGE_SHIFT)) {
0251             /*
0252              * The range starts outside a 32 bit machine's
0253              * pagecache addressing capabilities.  Let it "succeed"
0254              */
0255             ret = 0;
0256             goto out;
0257         }
0258         if (endbyte >= (0x100000000ULL << PAGE_SHIFT)) {
0259             /*
0260              * Out to EOF
0261              */
0262             nbytes = 0;
0263         }
0264     }
0265 
0266     if (nbytes == 0)
0267         endbyte = LLONG_MAX;
0268     else
0269         endbyte--;      /* inclusive */
0270 
0271     i_mode = file_inode(file)->i_mode;
0272     ret = -ESPIPE;
0273     if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
0274             !S_ISLNK(i_mode))
0275         goto out;
0276 
0277     mapping = file->f_mapping;
0278     ret = 0;
0279     if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
0280         ret = file_fdatawait_range(file, offset, endbyte);
0281         if (ret < 0)
0282             goto out;
0283     }
0284 
0285     if (flags & SYNC_FILE_RANGE_WRITE) {
0286         int sync_mode = WB_SYNC_NONE;
0287 
0288         if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) ==
0289                  SYNC_FILE_RANGE_WRITE_AND_WAIT)
0290             sync_mode = WB_SYNC_ALL;
0291 
0292         ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
0293                          sync_mode);
0294         if (ret < 0)
0295             goto out;
0296     }
0297 
0298     if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
0299         ret = file_fdatawait_range(file, offset, endbyte);
0300 
0301 out:
0302     return ret;
0303 }
0304 
0305 /*
0306  * ksys_sync_file_range() permits finely controlled syncing over a segment of
0307  * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
0308  * zero then ksys_sync_file_range() will operate from offset out to EOF.
0309  *
0310  * The flag bits are:
0311  *
0312  * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
0313  * before performing the write.
0314  *
0315  * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
0316  * range which are not presently under writeback. Note that this may block for
0317  * significant periods due to exhaustion of disk request structures.
0318  *
0319  * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
0320  * after performing the write.
0321  *
0322  * Useful combinations of the flag bits are:
0323  *
0324  * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
0325  * in the range which were dirty on entry to ksys_sync_file_range() are placed
0326  * under writeout.  This is a start-write-for-data-integrity operation.
0327  *
0328  * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
0329  * are not presently under writeout.  This is an asynchronous flush-to-disk
0330  * operation.  Not suitable for data integrity operations.
0331  *
0332  * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
0333  * completion of writeout of all pages in the range.  This will be used after an
0334  * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
0335  * for that operation to complete and to return the result.
0336  *
0337  * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER
0338  * (a.k.a. SYNC_FILE_RANGE_WRITE_AND_WAIT):
0339  * a traditional sync() operation.  This is a write-for-data-integrity operation
0340  * which will ensure that all pages in the range which were dirty on entry to
0341  * ksys_sync_file_range() are written to disk.  It should be noted that disk
0342  * caches are not flushed by this call, so there are no guarantees here that the
0343  * data will be available on disk after a crash.
0344  *
0345  *
0346  * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
0347  * I/O errors or ENOSPC conditions and will return those to the caller, after
0348  * clearing the EIO and ENOSPC flags in the address_space.
0349  *
0350  * It should be noted that none of these operations write out the file's
0351  * metadata.  So unless the application is strictly performing overwrites of
0352  * already-instantiated disk blocks, there are no guarantees here that the data
0353  * will be available after a crash.
0354  */
0355 int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
0356              unsigned int flags)
0357 {
0358     int ret;
0359     struct fd f;
0360 
0361     ret = -EBADF;
0362     f = fdget(fd);
0363     if (f.file)
0364         ret = sync_file_range(f.file, offset, nbytes, flags);
0365 
0366     fdput(f);
0367     return ret;
0368 }
0369 
0370 SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
0371                 unsigned int, flags)
0372 {
0373     return ksys_sync_file_range(fd, offset, nbytes, flags);
0374 }
0375 
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_SYNC_FILE_RANGE)
/*
 * Compat entry point for sync_file_range().  The 64-bit offset and length
 * are declared with compat_arg_u64_dual() and reassembled with
 * compat_arg_u64_glue() before being handed to ksys_sync_file_range().
 */
COMPAT_SYSCALL_DEFINE6(sync_file_range, int, fd, compat_arg_u64_dual(offset),
               compat_arg_u64_dual(nbytes), unsigned int, flags)
{
    loff_t start = compat_arg_u64_glue(offset);
    loff_t len = compat_arg_u64_glue(nbytes);

    return ksys_sync_file_range(fd, start, len, flags);
}
#endif
0384 
0385 /* It would be nice if people remember that not all the world's an i386
0386    when they introduce new system calls */
0387 SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
0388                  loff_t, offset, loff_t, nbytes)
0389 {
0390     return ksys_sync_file_range(fd, offset, nbytes, flags);
0391 }