Back to home page

LXR

 
 

    


0001 /*
0002  * High-level sync()-related operations
0003  */
0004 
0005 #include <linux/kernel.h>
0006 #include <linux/file.h>
0007 #include <linux/fs.h>
0008 #include <linux/slab.h>
0009 #include <linux/export.h>
0010 #include <linux/namei.h>
0011 #include <linux/sched.h>
0012 #include <linux/writeback.h>
0013 #include <linux/syscalls.h>
0014 #include <linux/linkage.h>
0015 #include <linux/pagemap.h>
0016 #include <linux/quotaops.h>
0017 #include <linux/backing-dev.h>
0018 #include "internal.h"
0019 
/* Every flag bit accepted by sync_file_range(); any other bit is rejected. */
#define VALID_FLAGS	(SYNC_FILE_RANGE_WAIT_BEFORE | \
			 SYNC_FILE_RANGE_WRITE | \
			 SYNC_FILE_RANGE_WAIT_AFTER)
0022 
0023 /*
0024  * Do the filesystem syncing work. For simple filesystems
0025  * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
0026  * submit IO for these buffers via __sync_blockdev(). This also speeds up the
0027  * wait == 1 case since in that case write_inode() functions do
0028  * sync_dirty_buffer() and thus effectively write one block at a time.
0029  */
0030 static int __sync_filesystem(struct super_block *sb, int wait)
0031 {
0032     if (wait)
0033         sync_inodes_sb(sb);
0034     else
0035         writeback_inodes_sb(sb, WB_REASON_SYNC);
0036 
0037     if (sb->s_op->sync_fs)
0038         sb->s_op->sync_fs(sb, wait);
0039     return __sync_blockdev(sb->s_bdev, wait);
0040 }
0041 
0042 /*
0043  * Write out and wait upon all dirty data associated with this
0044  * superblock.  Filesystem data as well as the underlying block
0045  * device.  Takes the superblock lock.
0046  */
0047 int sync_filesystem(struct super_block *sb)
0048 {
0049     int ret;
0050 
0051     /*
0052      * We need to be protected against the filesystem going from
0053      * r/o to r/w or vice versa.
0054      */
0055     WARN_ON(!rwsem_is_locked(&sb->s_umount));
0056 
0057     /*
0058      * No point in syncing out anything if the filesystem is read-only.
0059      */
0060     if (sb->s_flags & MS_RDONLY)
0061         return 0;
0062 
0063     ret = __sync_filesystem(sb, 0);
0064     if (ret < 0)
0065         return ret;
0066     return __sync_filesystem(sb, 1);
0067 }
0068 EXPORT_SYMBOL(sync_filesystem);
0069 
0070 static void sync_inodes_one_sb(struct super_block *sb, void *arg)
0071 {
0072     if (!(sb->s_flags & MS_RDONLY))
0073         sync_inodes_sb(sb);
0074 }
0075 
0076 static void sync_fs_one_sb(struct super_block *sb, void *arg)
0077 {
0078     if (!(sb->s_flags & MS_RDONLY) && sb->s_op->sync_fs)
0079         sb->s_op->sync_fs(sb, *(int *)arg);
0080 }
0081 
0082 static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
0083 {
0084     filemap_fdatawrite(bdev->bd_inode->i_mapping);
0085 }
0086 
0087 static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
0088 {
0089     /*
0090      * We keep the error status of individual mapping so that
0091      * applications can catch the writeback error using fsync(2).
0092      * See filemap_fdatawait_keep_errors() for details.
0093      */
0094     filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
0095 }
0096 
0097 /*
0098  * Sync everything. We start by waking flusher threads so that most of
0099  * writeback runs on all devices in parallel. Then we sync all inodes reliably
0100  * which effectively also waits for all flusher threads to finish doing
0101  * writeback. At this point all data is on disk so metadata should be stable
0102  * and we tell filesystems to sync their metadata via ->sync_fs() calls.
0103  * Finally, we writeout all block devices because some filesystems (e.g. ext2)
0104  * just write metadata (such as inodes or bitmaps) to block device page cache
0105  * and do not sync it on their own in ->sync_fs().
0106  */
0107 SYSCALL_DEFINE0(sync)
0108 {
0109     int nowait = 0, wait = 1;
0110 
0111     wakeup_flusher_threads(0, WB_REASON_SYNC);
0112     iterate_supers(sync_inodes_one_sb, NULL);
0113     iterate_supers(sync_fs_one_sb, &nowait);
0114     iterate_supers(sync_fs_one_sb, &wait);
0115     iterate_bdevs(fdatawrite_one_bdev, NULL);
0116     iterate_bdevs(fdatawait_one_bdev, NULL);
0117     if (unlikely(laptop_mode))
0118         laptop_sync_completion();
0119     return 0;
0120 }
0121 
0122 static void do_sync_work(struct work_struct *work)
0123 {
0124     int nowait = 0;
0125 
0126     /*
0127      * Sync twice to reduce the possibility we skipped some inodes / pages
0128      * because they were temporarily locked
0129      */
0130     iterate_supers(sync_inodes_one_sb, &nowait);
0131     iterate_supers(sync_fs_one_sb, &nowait);
0132     iterate_bdevs(fdatawrite_one_bdev, NULL);
0133     iterate_supers(sync_inodes_one_sb, &nowait);
0134     iterate_supers(sync_fs_one_sb, &nowait);
0135     iterate_bdevs(fdatawrite_one_bdev, NULL);
0136     printk("Emergency Sync complete\n");
0137     kfree(work);
0138 }
0139 
0140 void emergency_sync(void)
0141 {
0142     struct work_struct *work;
0143 
0144     work = kmalloc(sizeof(*work), GFP_ATOMIC);
0145     if (work) {
0146         INIT_WORK(work, do_sync_work);
0147         schedule_work(work);
0148     }
0149 }
0150 
0151 /*
0152  * sync a single super
0153  */
0154 SYSCALL_DEFINE1(syncfs, int, fd)
0155 {
0156     struct fd f = fdget(fd);
0157     struct super_block *sb;
0158     int ret;
0159 
0160     if (!f.file)
0161         return -EBADF;
0162     sb = f.file->f_path.dentry->d_sb;
0163 
0164     down_read(&sb->s_umount);
0165     ret = sync_filesystem(sb);
0166     up_read(&sb->s_umount);
0167 
0168     fdput(f);
0169     return ret;
0170 }
0171 
0172 /**
0173  * vfs_fsync_range - helper to sync a range of data & metadata to disk
0174  * @file:       file to sync
0175  * @start:      offset in bytes of the beginning of data range to sync
0176  * @end:        offset in bytes of the end of data range (inclusive)
0177  * @datasync:       perform only datasync
0178  *
0179  * Write back data in range @start..@end and metadata for @file to disk.  If
0180  * @datasync is set only metadata needed to access modified file data is
0181  * written.
0182  */
0183 int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
0184 {
0185     struct inode *inode = file->f_mapping->host;
0186 
0187     if (!file->f_op->fsync)
0188         return -EINVAL;
0189     if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
0190         spin_lock(&inode->i_lock);
0191         inode->i_state &= ~I_DIRTY_TIME;
0192         spin_unlock(&inode->i_lock);
0193         mark_inode_dirty_sync(inode);
0194     }
0195     return file->f_op->fsync(file, start, end, datasync);
0196 }
0197 EXPORT_SYMBOL(vfs_fsync_range);
0198 
0199 /**
0200  * vfs_fsync - perform a fsync or fdatasync on a file
0201  * @file:       file to sync
0202  * @datasync:       only perform a fdatasync operation
0203  *
0204  * Write back data and metadata for @file to disk.  If @datasync is
0205  * set only metadata needed to access modified file data is written.
0206  */
0207 int vfs_fsync(struct file *file, int datasync)
0208 {
0209     return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
0210 }
0211 EXPORT_SYMBOL(vfs_fsync);
0212 
0213 static int do_fsync(unsigned int fd, int datasync)
0214 {
0215     struct fd f = fdget(fd);
0216     int ret = -EBADF;
0217 
0218     if (f.file) {
0219         ret = vfs_fsync(f.file, datasync);
0220         fdput(f);
0221     }
0222     return ret;
0223 }
0224 
0225 SYSCALL_DEFINE1(fsync, unsigned int, fd)
0226 {
0227     return do_fsync(fd, 0);
0228 }
0229 
0230 SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
0231 {
0232     return do_fsync(fd, 1);
0233 }
0234 
0235 /*
0236  * sys_sync_file_range() permits finely controlled syncing over a segment of
0237  * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
0238  * zero then sys_sync_file_range() will operate from offset out to EOF.
0239  *
0240  * The flag bits are:
0241  *
0242  * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
0243  * before performing the write.
0244  *
0245  * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
0246  * range which are not presently under writeback. Note that this may block for
0247  * significant periods due to exhaustion of disk request structures.
0248  *
0249  * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
0250  * after performing the write.
0251  *
0252  * Useful combinations of the flag bits are:
0253  *
0254  * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
0255  * in the range which were dirty on entry to sys_sync_file_range() are placed
0256  * under writeout.  This is a start-write-for-data-integrity operation.
0257  *
0258  * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
0259  * are not presently under writeout.  This is an asynchronous flush-to-disk
0260  * operation.  Not suitable for data integrity operations.
0261  *
0262  * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
0263  * completion of writeout of all pages in the range.  This will be used after an
0264  * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
0265  * for that operation to complete and to return the result.
0266  *
0267  * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER:
0268  * a traditional sync() operation.  This is a write-for-data-integrity operation
0269  * which will ensure that all pages in the range which were dirty on entry to
0270  * sys_sync_file_range() are committed to disk.
0271  *
0272  *
0273  * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
0274  * I/O errors or ENOSPC conditions and will return those to the caller, after
0275  * clearing the EIO and ENOSPC flags in the address_space.
0276  *
0277  * It should be noted that none of these operations write out the file's
0278  * metadata.  So unless the application is strictly performing overwrites of
0279  * already-instantiated disk blocks, there are no guarantees here that the data
0280  * will be available after a crash.
0281  */
0282 SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
0283                 unsigned int, flags)
0284 {
0285     int ret;
0286     struct fd f;
0287     struct address_space *mapping;
0288     loff_t endbyte;         /* inclusive */
0289     umode_t i_mode;
0290 
0291     ret = -EINVAL;
0292     if (flags & ~VALID_FLAGS)
0293         goto out;
0294 
0295     endbyte = offset + nbytes;
0296 
0297     if ((s64)offset < 0)
0298         goto out;
0299     if ((s64)endbyte < 0)
0300         goto out;
0301     if (endbyte < offset)
0302         goto out;
0303 
0304     if (sizeof(pgoff_t) == 4) {
0305         if (offset >= (0x100000000ULL << PAGE_SHIFT)) {
0306             /*
0307              * The range starts outside a 32 bit machine's
0308              * pagecache addressing capabilities.  Let it "succeed"
0309              */
0310             ret = 0;
0311             goto out;
0312         }
0313         if (endbyte >= (0x100000000ULL << PAGE_SHIFT)) {
0314             /*
0315              * Out to EOF
0316              */
0317             nbytes = 0;
0318         }
0319     }
0320 
0321     if (nbytes == 0)
0322         endbyte = LLONG_MAX;
0323     else
0324         endbyte--;      /* inclusive */
0325 
0326     ret = -EBADF;
0327     f = fdget(fd);
0328     if (!f.file)
0329         goto out;
0330 
0331     i_mode = file_inode(f.file)->i_mode;
0332     ret = -ESPIPE;
0333     if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
0334             !S_ISLNK(i_mode))
0335         goto out_put;
0336 
0337     mapping = f.file->f_mapping;
0338     if (!mapping) {
0339         ret = -EINVAL;
0340         goto out_put;
0341     }
0342 
0343     ret = 0;
0344     if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
0345         ret = filemap_fdatawait_range(mapping, offset, endbyte);
0346         if (ret < 0)
0347             goto out_put;
0348     }
0349 
0350     if (flags & SYNC_FILE_RANGE_WRITE) {
0351         ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
0352                          WB_SYNC_NONE);
0353         if (ret < 0)
0354             goto out_put;
0355     }
0356 
0357     if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
0358         ret = filemap_fdatawait_range(mapping, offset, endbyte);
0359 
0360 out_put:
0361     fdput(f);
0362 out:
0363     return ret;
0364 }
0365 
0366 /* It would be nice if people remember that not all the world's an i386
0367    when they introduce new system calls */
0368 SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
0369                  loff_t, offset, loff_t, nbytes)
0370 {
0371     return sys_sync_file_range(fd, offset, nbytes, flags);
0372 }