Back to home page

LXR

 
 

    


0001 /*
0002  *  linux/fs/read_write.c
0003  *
0004  *  Copyright (C) 1991, 1992  Linus Torvalds
0005  */
0006 
0007 #include <linux/slab.h> 
0008 #include <linux/stat.h>
0009 #include <linux/fcntl.h>
0010 #include <linux/file.h>
0011 #include <linux/uio.h>
0012 #include <linux/fsnotify.h>
0013 #include <linux/security.h>
0014 #include <linux/export.h>
0015 #include <linux/syscalls.h>
0016 #include <linux/pagemap.h>
0017 #include <linux/splice.h>
0018 #include <linux/compat.h>
0019 #include <linux/mount.h>
0020 #include <linux/fs.h>
0021 #include "internal.h"
0022 
0023 #include <linux/uaccess.h>
0024 #include <asm/unistd.h>
0025 
0026 typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
0027 typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
0028 
0029 const struct file_operations generic_ro_fops = {
0030     .llseek     = generic_file_llseek,
0031     .read_iter  = generic_file_read_iter,
0032     .mmap       = generic_file_readonly_mmap,
0033     .splice_read    = generic_file_splice_read,
0034 };
0035 
0036 EXPORT_SYMBOL(generic_ro_fops);
0037 
0038 static inline int unsigned_offsets(struct file *file)
0039 {
0040     return file->f_mode & FMODE_UNSIGNED_OFFSET;
0041 }
0042 
0043 /**
0044  * vfs_setpos - update the file offset for lseek
0045  * @file:   file structure in question
0046  * @offset: file offset to seek to
0047  * @maxsize:    maximum file size
0048  *
0049  * This is a low-level filesystem helper for updating the file offset to
0050  * the value specified by @offset if the given offset is valid and it is
0051  * not equal to the current file offset.
0052  *
0053  * Return the specified offset on success and -EINVAL on invalid offset.
0054  */
0055 loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
0056 {
0057     if (offset < 0 && !unsigned_offsets(file))
0058         return -EINVAL;
0059     if (offset > maxsize)
0060         return -EINVAL;
0061 
0062     if (offset != file->f_pos) {
0063         file->f_pos = offset;
0064         file->f_version = 0;
0065     }
0066     return offset;
0067 }
0068 EXPORT_SYMBOL(vfs_setpos);
0069 
0070 /**
0071  * generic_file_llseek_size - generic llseek implementation for regular files
0072  * @file:   file structure to seek on
0073  * @offset: file offset to seek to
0074  * @whence: type of seek
0075  * @size:   max size of this file in file system
0076  * @eof:    offset used for SEEK_END position
0077  *
0078  * This is a variant of generic_file_llseek that allows passing in a custom
0079  * maximum file size and a custom EOF position, for e.g. hashed directories
0080  *
0081  * Synchronization:
0082  * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
0083  * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
0084  * read/writes behave like SEEK_SET against seeks.
0085  */
0086 loff_t
0087 generic_file_llseek_size(struct file *file, loff_t offset, int whence,
0088         loff_t maxsize, loff_t eof)
0089 {
0090     switch (whence) {
0091     case SEEK_END:
0092         offset += eof;
0093         break;
0094     case SEEK_CUR:
0095         /*
0096          * Here we special-case the lseek(fd, 0, SEEK_CUR)
0097          * position-querying operation.  Avoid rewriting the "same"
0098          * f_pos value back to the file because a concurrent read(),
0099          * write() or lseek() might have altered it
0100          */
0101         if (offset == 0)
0102             return file->f_pos;
0103         /*
0104          * f_lock protects against read/modify/write race with other
0105          * SEEK_CURs. Note that parallel writes and reads behave
0106          * like SEEK_SET.
0107          */
0108         spin_lock(&file->f_lock);
0109         offset = vfs_setpos(file, file->f_pos + offset, maxsize);
0110         spin_unlock(&file->f_lock);
0111         return offset;
0112     case SEEK_DATA:
0113         /*
0114          * In the generic case the entire file is data, so as long as
0115          * offset isn't at the end of the file then the offset is data.
0116          */
0117         if (offset >= eof)
0118             return -ENXIO;
0119         break;
0120     case SEEK_HOLE:
0121         /*
0122          * There is a virtual hole at the end of the file, so as long as
0123          * offset isn't i_size or larger, return i_size.
0124          */
0125         if (offset >= eof)
0126             return -ENXIO;
0127         offset = eof;
0128         break;
0129     }
0130 
0131     return vfs_setpos(file, offset, maxsize);
0132 }
0133 EXPORT_SYMBOL(generic_file_llseek_size);
0134 
0135 /**
0136  * generic_file_llseek - generic llseek implementation for regular files
0137  * @file:   file structure to seek on
0138  * @offset: file offset to seek to
0139  * @whence: type of seek
0140  *
0141  * This is a generic implemenation of ->llseek useable for all normal local
0142  * filesystems.  It just updates the file offset to the value specified by
0143  * @offset and @whence.
0144  */
0145 loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
0146 {
0147     struct inode *inode = file->f_mapping->host;
0148 
0149     return generic_file_llseek_size(file, offset, whence,
0150                     inode->i_sb->s_maxbytes,
0151                     i_size_read(inode));
0152 }
0153 EXPORT_SYMBOL(generic_file_llseek);
0154 
0155 /**
0156  * fixed_size_llseek - llseek implementation for fixed-sized devices
0157  * @file:   file structure to seek on
0158  * @offset: file offset to seek to
0159  * @whence: type of seek
0160  * @size:   size of the file
0161  *
0162  */
0163 loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
0164 {
0165     switch (whence) {
0166     case SEEK_SET: case SEEK_CUR: case SEEK_END:
0167         return generic_file_llseek_size(file, offset, whence,
0168                         size, size);
0169     default:
0170         return -EINVAL;
0171     }
0172 }
0173 EXPORT_SYMBOL(fixed_size_llseek);
0174 
0175 /**
0176  * no_seek_end_llseek - llseek implementation for fixed-sized devices
0177  * @file:   file structure to seek on
0178  * @offset: file offset to seek to
0179  * @whence: type of seek
0180  *
0181  */
0182 loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
0183 {
0184     switch (whence) {
0185     case SEEK_SET: case SEEK_CUR:
0186         return generic_file_llseek_size(file, offset, whence,
0187                         OFFSET_MAX, 0);
0188     default:
0189         return -EINVAL;
0190     }
0191 }
0192 EXPORT_SYMBOL(no_seek_end_llseek);
0193 
0194 /**
0195  * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
0196  * @file:   file structure to seek on
0197  * @offset: file offset to seek to
0198  * @whence: type of seek
0199  * @size:   maximal offset allowed
0200  *
0201  */
0202 loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
0203 {
0204     switch (whence) {
0205     case SEEK_SET: case SEEK_CUR:
0206         return generic_file_llseek_size(file, offset, whence,
0207                         size, 0);
0208     default:
0209         return -EINVAL;
0210     }
0211 }
0212 EXPORT_SYMBOL(no_seek_end_llseek_size);
0213 
0214 /**
0215  * noop_llseek - No Operation Performed llseek implementation
0216  * @file:   file structure to seek on
0217  * @offset: file offset to seek to
0218  * @whence: type of seek
0219  *
0220  * This is an implementation of ->llseek useable for the rare special case when
0221  * userspace expects the seek to succeed but the (device) file is actually not
0222  * able to perform the seek. In this case you use noop_llseek() instead of
0223  * falling back to the default implementation of ->llseek.
0224  */
0225 loff_t noop_llseek(struct file *file, loff_t offset, int whence)
0226 {
0227     return file->f_pos;
0228 }
0229 EXPORT_SYMBOL(noop_llseek);
0230 
0231 loff_t no_llseek(struct file *file, loff_t offset, int whence)
0232 {
0233     return -ESPIPE;
0234 }
0235 EXPORT_SYMBOL(no_llseek);
0236 
0237 loff_t default_llseek(struct file *file, loff_t offset, int whence)
0238 {
0239     struct inode *inode = file_inode(file);
0240     loff_t retval;
0241 
0242     inode_lock(inode);
0243     switch (whence) {
0244         case SEEK_END:
0245             offset += i_size_read(inode);
0246             break;
0247         case SEEK_CUR:
0248             if (offset == 0) {
0249                 retval = file->f_pos;
0250                 goto out;
0251             }
0252             offset += file->f_pos;
0253             break;
0254         case SEEK_DATA:
0255             /*
0256              * In the generic case the entire file is data, so as
0257              * long as offset isn't at the end of the file then the
0258              * offset is data.
0259              */
0260             if (offset >= inode->i_size) {
0261                 retval = -ENXIO;
0262                 goto out;
0263             }
0264             break;
0265         case SEEK_HOLE:
0266             /*
0267              * There is a virtual hole at the end of the file, so
0268              * as long as offset isn't i_size or larger, return
0269              * i_size.
0270              */
0271             if (offset >= inode->i_size) {
0272                 retval = -ENXIO;
0273                 goto out;
0274             }
0275             offset = inode->i_size;
0276             break;
0277     }
0278     retval = -EINVAL;
0279     if (offset >= 0 || unsigned_offsets(file)) {
0280         if (offset != file->f_pos) {
0281             file->f_pos = offset;
0282             file->f_version = 0;
0283         }
0284         retval = offset;
0285     }
0286 out:
0287     inode_unlock(inode);
0288     return retval;
0289 }
0290 EXPORT_SYMBOL(default_llseek);
0291 
0292 loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
0293 {
0294     loff_t (*fn)(struct file *, loff_t, int);
0295 
0296     fn = no_llseek;
0297     if (file->f_mode & FMODE_LSEEK) {
0298         if (file->f_op->llseek)
0299             fn = file->f_op->llseek;
0300     }
0301     return fn(file, offset, whence);
0302 }
0303 EXPORT_SYMBOL(vfs_llseek);
0304 
0305 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
0306 {
0307     off_t retval;
0308     struct fd f = fdget_pos(fd);
0309     if (!f.file)
0310         return -EBADF;
0311 
0312     retval = -EINVAL;
0313     if (whence <= SEEK_MAX) {
0314         loff_t res = vfs_llseek(f.file, offset, whence);
0315         retval = res;
0316         if (res != (loff_t)retval)
0317             retval = -EOVERFLOW;    /* LFS: should only happen on 32 bit platforms */
0318     }
0319     fdput_pos(f);
0320     return retval;
0321 }
0322 
#ifdef CONFIG_COMPAT
/* Compat entry point for lseek: forwards its arguments to sys_lseek(). */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
    return sys_lseek(fd, offset, whence);
}
#endif
0329 
#ifdef __ARCH_WANT_SYS_LLSEEK
/*
 * llseek: the offset arrives as two halves (@offset_high, @offset_low) and
 * the resulting position is copied out through @result.
 *
 * Returns 0 on success, -EBADF for an invalid descriptor, -EINVAL when
 * @whence exceeds SEEK_MAX, the (int-truncated) error from vfs_llseek(),
 * or -EFAULT when the result cannot be copied to userspace.
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
        unsigned long, offset_low, loff_t __user *, result,
        unsigned int, whence)
{
    struct fd f = fdget_pos(fd);
    loff_t pos;
    int retval;

    if (!f.file)
        return -EBADF;

    if (whence > SEEK_MAX) {
        retval = -EINVAL;
        goto out_putf;
    }

    pos = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
            whence);

    if (pos < 0)
        retval = (int)pos;
    else if (copy_to_user(result, &pos, sizeof(pos)))
        retval = -EFAULT;
    else
        retval = 0;

out_putf:
    fdput_pos(f);
    return retval;
}
#endif
0360 
0361 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
0362 {
0363     struct kiocb kiocb;
0364     ssize_t ret;
0365 
0366     if (!file->f_op->read_iter)
0367         return -EINVAL;
0368 
0369     init_sync_kiocb(&kiocb, file);
0370     kiocb.ki_pos = *ppos;
0371 
0372     iter->type |= READ;
0373     ret = file->f_op->read_iter(&kiocb, iter);
0374     BUG_ON(ret == -EIOCBQUEUED);
0375     if (ret > 0)
0376         *ppos = kiocb.ki_pos;
0377     return ret;
0378 }
0379 EXPORT_SYMBOL(vfs_iter_read);
0380 
0381 ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
0382 {
0383     struct kiocb kiocb;
0384     ssize_t ret;
0385 
0386     if (!file->f_op->write_iter)
0387         return -EINVAL;
0388 
0389     init_sync_kiocb(&kiocb, file);
0390     kiocb.ki_pos = *ppos;
0391 
0392     iter->type |= WRITE;
0393     ret = file->f_op->write_iter(&kiocb, iter);
0394     BUG_ON(ret == -EIOCBQUEUED);
0395     if (ret > 0)
0396         *ppos = kiocb.ki_pos;
0397     return ret;
0398 }
0399 EXPORT_SYMBOL(vfs_iter_write);
0400 
0401 int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
0402 {
0403     struct inode *inode;
0404     loff_t pos;
0405     int retval = -EINVAL;
0406 
0407     inode = file_inode(file);
0408     if (unlikely((ssize_t) count < 0))
0409         return retval;
0410     pos = *ppos;
0411     if (unlikely(pos < 0)) {
0412         if (!unsigned_offsets(file))
0413             return retval;
0414         if (count >= -pos) /* both values are in 0..LLONG_MAX */
0415             return -EOVERFLOW;
0416     } else if (unlikely((loff_t) (pos + count) < 0)) {
0417         if (!unsigned_offsets(file))
0418             return retval;
0419     }
0420 
0421     if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
0422         retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
0423                 read_write == READ ? F_RDLCK : F_WRLCK);
0424         if (retval < 0)
0425             return retval;
0426     }
0427     return security_file_permission(file,
0428                 read_write == READ ? MAY_READ : MAY_WRITE);
0429 }
0430 
0431 static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
0432 {
0433     struct iovec iov = { .iov_base = buf, .iov_len = len };
0434     struct kiocb kiocb;
0435     struct iov_iter iter;
0436     ssize_t ret;
0437 
0438     init_sync_kiocb(&kiocb, filp);
0439     kiocb.ki_pos = *ppos;
0440     iov_iter_init(&iter, READ, &iov, 1, len);
0441 
0442     ret = filp->f_op->read_iter(&kiocb, &iter);
0443     BUG_ON(ret == -EIOCBQUEUED);
0444     *ppos = kiocb.ki_pos;
0445     return ret;
0446 }
0447 
0448 ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
0449            loff_t *pos)
0450 {
0451     if (file->f_op->read)
0452         return file->f_op->read(file, buf, count, pos);
0453     else if (file->f_op->read_iter)
0454         return new_sync_read(file, buf, count, pos);
0455     else
0456         return -EINVAL;
0457 }
0458 EXPORT_SYMBOL(__vfs_read);
0459 
0460 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
0461 {
0462     ssize_t ret;
0463 
0464     if (!(file->f_mode & FMODE_READ))
0465         return -EBADF;
0466     if (!(file->f_mode & FMODE_CAN_READ))
0467         return -EINVAL;
0468     if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
0469         return -EFAULT;
0470 
0471     ret = rw_verify_area(READ, file, pos, count);
0472     if (!ret) {
0473         if (count > MAX_RW_COUNT)
0474             count =  MAX_RW_COUNT;
0475         ret = __vfs_read(file, buf, count, pos);
0476         if (ret > 0) {
0477             fsnotify_access(file);
0478             add_rchar(current, ret);
0479         }
0480         inc_syscr(current);
0481     }
0482 
0483     return ret;
0484 }
0485 
0486 EXPORT_SYMBOL(vfs_read);
0487 
0488 static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
0489 {
0490     struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
0491     struct kiocb kiocb;
0492     struct iov_iter iter;
0493     ssize_t ret;
0494 
0495     init_sync_kiocb(&kiocb, filp);
0496     kiocb.ki_pos = *ppos;
0497     iov_iter_init(&iter, WRITE, &iov, 1, len);
0498 
0499     ret = filp->f_op->write_iter(&kiocb, &iter);
0500     BUG_ON(ret == -EIOCBQUEUED);
0501     if (ret > 0)
0502         *ppos = kiocb.ki_pos;
0503     return ret;
0504 }
0505 
0506 ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
0507             loff_t *pos)
0508 {
0509     if (file->f_op->write)
0510         return file->f_op->write(file, p, count, pos);
0511     else if (file->f_op->write_iter)
0512         return new_sync_write(file, p, count, pos);
0513     else
0514         return -EINVAL;
0515 }
0516 EXPORT_SYMBOL(__vfs_write);
0517 
0518 ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
0519 {
0520     mm_segment_t old_fs;
0521     const char __user *p;
0522     ssize_t ret;
0523 
0524     if (!(file->f_mode & FMODE_CAN_WRITE))
0525         return -EINVAL;
0526 
0527     old_fs = get_fs();
0528     set_fs(get_ds());
0529     p = (__force const char __user *)buf;
0530     if (count > MAX_RW_COUNT)
0531         count =  MAX_RW_COUNT;
0532     ret = __vfs_write(file, p, count, pos);
0533     set_fs(old_fs);
0534     if (ret > 0) {
0535         fsnotify_modify(file);
0536         add_wchar(current, ret);
0537     }
0538     inc_syscw(current);
0539     return ret;
0540 }
0541 
0542 EXPORT_SYMBOL(__kernel_write);
0543 
0544 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
0545 {
0546     ssize_t ret;
0547 
0548     if (!(file->f_mode & FMODE_WRITE))
0549         return -EBADF;
0550     if (!(file->f_mode & FMODE_CAN_WRITE))
0551         return -EINVAL;
0552     if (unlikely(!access_ok(VERIFY_READ, buf, count)))
0553         return -EFAULT;
0554 
0555     ret = rw_verify_area(WRITE, file, pos, count);
0556     if (!ret) {
0557         if (count > MAX_RW_COUNT)
0558             count =  MAX_RW_COUNT;
0559         file_start_write(file);
0560         ret = __vfs_write(file, buf, count, pos);
0561         if (ret > 0) {
0562             fsnotify_modify(file);
0563             add_wchar(current, ret);
0564         }
0565         inc_syscw(current);
0566         file_end_write(file);
0567     }
0568 
0569     return ret;
0570 }
0571 
0572 EXPORT_SYMBOL(vfs_write);
0573 
0574 static inline loff_t file_pos_read(struct file *file)
0575 {
0576     return file->f_pos;
0577 }
0578 
0579 static inline void file_pos_write(struct file *file, loff_t pos)
0580 {
0581     file->f_pos = pos;
0582 }
0583 
0584 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
0585 {
0586     struct fd f = fdget_pos(fd);
0587     ssize_t ret = -EBADF;
0588 
0589     if (f.file) {
0590         loff_t pos = file_pos_read(f.file);
0591         ret = vfs_read(f.file, buf, count, &pos);
0592         if (ret >= 0)
0593             file_pos_write(f.file, pos);
0594         fdput_pos(f);
0595     }
0596     return ret;
0597 }
0598 
0599 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
0600         size_t, count)
0601 {
0602     struct fd f = fdget_pos(fd);
0603     ssize_t ret = -EBADF;
0604 
0605     if (f.file) {
0606         loff_t pos = file_pos_read(f.file);
0607         ret = vfs_write(f.file, buf, count, &pos);
0608         if (ret >= 0)
0609             file_pos_write(f.file, pos);
0610         fdput_pos(f);
0611     }
0612 
0613     return ret;
0614 }
0615 
0616 SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
0617             size_t, count, loff_t, pos)
0618 {
0619     struct fd f;
0620     ssize_t ret = -EBADF;
0621 
0622     if (pos < 0)
0623         return -EINVAL;
0624 
0625     f = fdget(fd);
0626     if (f.file) {
0627         ret = -ESPIPE;
0628         if (f.file->f_mode & FMODE_PREAD)
0629             ret = vfs_read(f.file, buf, count, &pos);
0630         fdput(f);
0631     }
0632 
0633     return ret;
0634 }
0635 
0636 SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
0637              size_t, count, loff_t, pos)
0638 {
0639     struct fd f;
0640     ssize_t ret = -EBADF;
0641 
0642     if (pos < 0)
0643         return -EINVAL;
0644 
0645     f = fdget(fd);
0646     if (f.file) {
0647         ret = -ESPIPE;
0648         if (f.file->f_mode & FMODE_PWRITE)  
0649             ret = vfs_write(f.file, buf, count, &pos);
0650         fdput(f);
0651     }
0652 
0653     return ret;
0654 }
0655 
/*
 * Trim an iovec array in place so that it covers at most @to bytes.
 *
 * Walks the segments while accumulating their lengths; the first segment
 * that reaches or crosses @to is cut down to the remaining byte count and
 * becomes the last one.  Returns the resulting number of segments, which
 * is @nr_segs when the array is shorter than @to.
 */
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
{
    size_t consumed = 0;
    unsigned long idx;

    for (idx = 0; idx < nr_segs; idx++) {
        struct iovec *cur = &iov[idx];

        if (consumed + cur->iov_len >= to) {
            cur->iov_len = to - consumed;
            return idx + 1;
        }
        consumed += cur->iov_len;
    }
    return nr_segs;
}
0675 EXPORT_SYMBOL(iov_shorten);
0676 
0677 static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
0678         loff_t *ppos, iter_fn_t fn, int flags)
0679 {
0680     struct kiocb kiocb;
0681     ssize_t ret;
0682 
0683     if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC))
0684         return -EOPNOTSUPP;
0685 
0686     init_sync_kiocb(&kiocb, filp);
0687     if (flags & RWF_HIPRI)
0688         kiocb.ki_flags |= IOCB_HIPRI;
0689     if (flags & RWF_DSYNC)
0690         kiocb.ki_flags |= IOCB_DSYNC;
0691     if (flags & RWF_SYNC)
0692         kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
0693     kiocb.ki_pos = *ppos;
0694 
0695     ret = fn(&kiocb, iter);
0696     BUG_ON(ret == -EIOCBQUEUED);
0697     *ppos = kiocb.ki_pos;
0698     return ret;
0699 }
0700 
0701 /* Do it by hand, with file-ops */
0702 static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
0703         loff_t *ppos, io_fn_t fn, int flags)
0704 {
0705     ssize_t ret = 0;
0706 
0707     if (flags & ~RWF_HIPRI)
0708         return -EOPNOTSUPP;
0709 
0710     while (iov_iter_count(iter)) {
0711         struct iovec iovec = iov_iter_iovec(iter);
0712         ssize_t nr;
0713 
0714         nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos);
0715 
0716         if (nr < 0) {
0717             if (!ret)
0718                 ret = nr;
0719             break;
0720         }
0721         ret += nr;
0722         if (nr != iovec.iov_len)
0723             break;
0724         iov_iter_advance(iter, nr);
0725     }
0726 
0727     return ret;
0728 }
0729 
/*
 * Map an I/O direction to the access_ok() mode for the user buffer:
 * a READ stores into user space (VERIFY_WRITE), anything else loads
 * from it (VERIFY_READ).
 */
#define vrfy_dir(type) ((type) != READ ? VERIFY_READ : VERIFY_WRITE)
0732 
0733 /**
0734  * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
0735  *     into the kernel and check that it is valid.
0736  *
0737  * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
0738  * @uvector: Pointer to the userspace array.
0739  * @nr_segs: Number of elements in userspace array.
0740  * @fast_segs: Number of elements in @fast_pointer.
0741  * @fast_pointer: Pointer to (usually small on-stack) kernel array.
0742  * @ret_pointer: (output parameter) Pointer to a variable that will point to
0743  *     either @fast_pointer, a newly allocated kernel array, or NULL,
0744  *     depending on which array was used.
0745  *
0746  * This function copies an array of &struct iovec of @nr_segs from
0747  * userspace into the kernel and checks that each element is valid (e.g.
0748  * it does not point to a kernel address or cause overflow by being too
0749  * large, etc.).
0750  *
0751  * As an optimization, the caller may provide a pointer to a small
0752  * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
0753  * (the size of this array, or 0 if unused, should be given in @fast_segs).
0754  *
0755  * @ret_pointer will always point to the array that was used, so the
0756  * caller must take care not to call kfree() on it e.g. in case the
0757  * @fast_pointer array was used and it was allocated on the stack.
0758  *
0759  * Return: The total number of bytes covered by the iovec array on success
0760  *   or a negative error code on error.
0761  */
0762 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
0763                   unsigned long nr_segs, unsigned long fast_segs,
0764                   struct iovec *fast_pointer,
0765                   struct iovec **ret_pointer)
0766 {
0767     unsigned long seg;
0768     ssize_t ret;
0769     struct iovec *iov = fast_pointer;
0770 
0771     /*
0772      * SuS says "The readv() function *may* fail if the iovcnt argument
0773      * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
0774      * traditionally returned zero for zero segments, so...
0775      */
0776     if (nr_segs == 0) {
0777         ret = 0;
0778         goto out;
0779     }
0780 
0781     /*
0782      * First get the "struct iovec" from user memory and
0783      * verify all the pointers
0784      */
0785     if (nr_segs > UIO_MAXIOV) {
0786         ret = -EINVAL;
0787         goto out;
0788     }
0789     if (nr_segs > fast_segs) {
0790         iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
0791         if (iov == NULL) {
0792             ret = -ENOMEM;
0793             goto out;
0794         }
0795     }
0796     if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
0797         ret = -EFAULT;
0798         goto out;
0799     }
0800 
0801     /*
0802      * According to the Single Unix Specification we should return EINVAL
0803      * if an element length is < 0 when cast to ssize_t or if the
0804      * total length would overflow the ssize_t return value of the
0805      * system call.
0806      *
0807      * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
0808      * overflow case.
0809      */
0810     ret = 0;
0811     for (seg = 0; seg < nr_segs; seg++) {
0812         void __user *buf = iov[seg].iov_base;
0813         ssize_t len = (ssize_t)iov[seg].iov_len;
0814 
0815         /* see if we we're about to use an invalid len or if
0816          * it's about to overflow ssize_t */
0817         if (len < 0) {
0818             ret = -EINVAL;
0819             goto out;
0820         }
0821         if (type >= 0
0822             && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
0823             ret = -EFAULT;
0824             goto out;
0825         }
0826         if (len > MAX_RW_COUNT - ret) {
0827             len = MAX_RW_COUNT - ret;
0828             iov[seg].iov_len = len;
0829         }
0830         ret += len;
0831     }
0832 out:
0833     *ret_pointer = iov;
0834     return ret;
0835 }
0836 
0837 static ssize_t do_readv_writev(int type, struct file *file,
0838                    const struct iovec __user * uvector,
0839                    unsigned long nr_segs, loff_t *pos,
0840                    int flags)
0841 {
0842     size_t tot_len;
0843     struct iovec iovstack[UIO_FASTIOV];
0844     struct iovec *iov = iovstack;
0845     struct iov_iter iter;
0846     ssize_t ret;
0847     io_fn_t fn;
0848     iter_fn_t iter_fn;
0849 
0850     ret = import_iovec(type, uvector, nr_segs,
0851                ARRAY_SIZE(iovstack), &iov, &iter);
0852     if (ret < 0)
0853         return ret;
0854 
0855     tot_len = iov_iter_count(&iter);
0856     if (!tot_len)
0857         goto out;
0858     ret = rw_verify_area(type, file, pos, tot_len);
0859     if (ret < 0)
0860         goto out;
0861 
0862     if (type == READ) {
0863         fn = file->f_op->read;
0864         iter_fn = file->f_op->read_iter;
0865     } else {
0866         fn = (io_fn_t)file->f_op->write;
0867         iter_fn = file->f_op->write_iter;
0868         file_start_write(file);
0869     }
0870 
0871     if (iter_fn)
0872         ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
0873     else
0874         ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
0875 
0876     if (type != READ)
0877         file_end_write(file);
0878 
0879 out:
0880     kfree(iov);
0881     if ((ret + (type == READ)) > 0) {
0882         if (type == READ)
0883             fsnotify_access(file);
0884         else
0885             fsnotify_modify(file);
0886     }
0887     return ret;
0888 }
0889 
0890 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
0891           unsigned long vlen, loff_t *pos, int flags)
0892 {
0893     if (!(file->f_mode & FMODE_READ))
0894         return -EBADF;
0895     if (!(file->f_mode & FMODE_CAN_READ))
0896         return -EINVAL;
0897 
0898     return do_readv_writev(READ, file, vec, vlen, pos, flags);
0899 }
0900 
0901 EXPORT_SYMBOL(vfs_readv);
0902 
0903 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
0904            unsigned long vlen, loff_t *pos, int flags)
0905 {
0906     if (!(file->f_mode & FMODE_WRITE))
0907         return -EBADF;
0908     if (!(file->f_mode & FMODE_CAN_WRITE))
0909         return -EINVAL;
0910 
0911     return do_readv_writev(WRITE, file, vec, vlen, pos, flags);
0912 }
0913 
0914 EXPORT_SYMBOL(vfs_writev);
0915 
0916 static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
0917             unsigned long vlen, int flags)
0918 {
0919     struct fd f = fdget_pos(fd);
0920     ssize_t ret = -EBADF;
0921 
0922     if (f.file) {
0923         loff_t pos = file_pos_read(f.file);
0924         ret = vfs_readv(f.file, vec, vlen, &pos, flags);
0925         if (ret >= 0)
0926             file_pos_write(f.file, pos);
0927         fdput_pos(f);
0928     }
0929 
0930     if (ret > 0)
0931         add_rchar(current, ret);
0932     inc_syscr(current);
0933     return ret;
0934 }
0935 
0936 static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
0937              unsigned long vlen, int flags)
0938 {
0939     struct fd f = fdget_pos(fd);
0940     ssize_t ret = -EBADF;
0941 
0942     if (f.file) {
0943         loff_t pos = file_pos_read(f.file);
0944         ret = vfs_writev(f.file, vec, vlen, &pos, flags);
0945         if (ret >= 0)
0946             file_pos_write(f.file, pos);
0947         fdput_pos(f);
0948     }
0949 
0950     if (ret > 0)
0951         add_wchar(current, ret);
0952     inc_syscw(current);
0953     return ret;
0954 }
0955 
0956 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
0957 {
0958 #define HALF_LONG_BITS (BITS_PER_LONG / 2)
0959     return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
0960 }
0961 
0962 static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
0963              unsigned long vlen, loff_t pos, int flags)
0964 {
0965     struct fd f;
0966     ssize_t ret = -EBADF;
0967 
0968     if (pos < 0)
0969         return -EINVAL;
0970 
0971     f = fdget(fd);
0972     if (f.file) {
0973         ret = -ESPIPE;
0974         if (f.file->f_mode & FMODE_PREAD)
0975             ret = vfs_readv(f.file, vec, vlen, &pos, flags);
0976         fdput(f);
0977     }
0978 
0979     if (ret > 0)
0980         add_rchar(current, ret);
0981     inc_syscr(current);
0982     return ret;
0983 }
0984 
0985 static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
0986               unsigned long vlen, loff_t pos, int flags)
0987 {
0988     struct fd f;
0989     ssize_t ret = -EBADF;
0990 
0991     if (pos < 0)
0992         return -EINVAL;
0993 
0994     f = fdget(fd);
0995     if (f.file) {
0996         ret = -ESPIPE;
0997         if (f.file->f_mode & FMODE_PWRITE)
0998             ret = vfs_writev(f.file, vec, vlen, &pos, flags);
0999         fdput(f);
1000     }
1001 
1002     if (ret > 0)
1003         add_wchar(current, ret);
1004     inc_syscw(current);
1005     return ret;
1006 }
1007 
1008 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
1009         unsigned long, vlen)
1010 {
1011     return do_readv(fd, vec, vlen, 0);
1012 }
1013 
1014 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
1015         unsigned long, vlen)
1016 {
1017     return do_writev(fd, vec, vlen, 0);
1018 }
1019 
1020 SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1021         unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1022 {
1023     loff_t pos = pos_from_hilo(pos_h, pos_l);
1024 
1025     return do_preadv(fd, vec, vlen, pos, 0);
1026 }
1027 
1028 SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1029         unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1030         int, flags)
1031 {
1032     loff_t pos = pos_from_hilo(pos_h, pos_l);
1033 
1034     if (pos == -1)
1035         return do_readv(fd, vec, vlen, flags);
1036 
1037     return do_preadv(fd, vec, vlen, pos, flags);
1038 }
1039 
1040 SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1041         unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1042 {
1043     loff_t pos = pos_from_hilo(pos_h, pos_l);
1044 
1045     return do_pwritev(fd, vec, vlen, pos, 0);
1046 }
1047 
1048 SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1049         unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1050         int, flags)
1051 {
1052     loff_t pos = pos_from_hilo(pos_h, pos_l);
1053 
1054     if (pos == -1)
1055         return do_writev(fd, vec, vlen, flags);
1056 
1057     return do_pwritev(fd, vec, vlen, pos, flags);
1058 }
1059 
1060 #ifdef CONFIG_COMPAT
1061 
1062 static ssize_t compat_do_readv_writev(int type, struct file *file,
1063                    const struct compat_iovec __user *uvector,
1064                    unsigned long nr_segs, loff_t *pos,
1065                    int flags)
1066 {
1067     compat_ssize_t tot_len;
1068     struct iovec iovstack[UIO_FASTIOV];
1069     struct iovec *iov = iovstack;
1070     struct iov_iter iter;
1071     ssize_t ret;
1072     io_fn_t fn;
1073     iter_fn_t iter_fn;
1074 
1075     ret = compat_import_iovec(type, uvector, nr_segs,
1076                   UIO_FASTIOV, &iov, &iter);
1077     if (ret < 0)
1078         return ret;
1079 
1080     tot_len = iov_iter_count(&iter);
1081     if (!tot_len)
1082         goto out;
1083     ret = rw_verify_area(type, file, pos, tot_len);
1084     if (ret < 0)
1085         goto out;
1086 
1087     if (type == READ) {
1088         fn = file->f_op->read;
1089         iter_fn = file->f_op->read_iter;
1090     } else {
1091         fn = (io_fn_t)file->f_op->write;
1092         iter_fn = file->f_op->write_iter;
1093         file_start_write(file);
1094     }
1095 
1096     if (iter_fn)
1097         ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
1098     else
1099         ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
1100 
1101     if (type != READ)
1102         file_end_write(file);
1103 
1104 out:
1105     kfree(iov);
1106     if ((ret + (type == READ)) > 0) {
1107         if (type == READ)
1108             fsnotify_access(file);
1109         else
1110             fsnotify_modify(file);
1111     }
1112     return ret;
1113 }
1114 
1115 static size_t compat_readv(struct file *file,
1116                const struct compat_iovec __user *vec,
1117                unsigned long vlen, loff_t *pos, int flags)
1118 {
1119     ssize_t ret = -EBADF;
1120 
1121     if (!(file->f_mode & FMODE_READ))
1122         goto out;
1123 
1124     ret = -EINVAL;
1125     if (!(file->f_mode & FMODE_CAN_READ))
1126         goto out;
1127 
1128     ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags);
1129 
1130 out:
1131     if (ret > 0)
1132         add_rchar(current, ret);
1133     inc_syscr(current);
1134     return ret;
1135 }
1136 
1137 static size_t do_compat_readv(compat_ulong_t fd,
1138                  const struct compat_iovec __user *vec,
1139                  compat_ulong_t vlen, int flags)
1140 {
1141     struct fd f = fdget_pos(fd);
1142     ssize_t ret;
1143     loff_t pos;
1144 
1145     if (!f.file)
1146         return -EBADF;
1147     pos = f.file->f_pos;
1148     ret = compat_readv(f.file, vec, vlen, &pos, flags);
1149     if (ret >= 0)
1150         f.file->f_pos = pos;
1151     fdput_pos(f);
1152     return ret;
1153 
1154 }
1155 
1156 COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1157         const struct compat_iovec __user *,vec,
1158         compat_ulong_t, vlen)
1159 {
1160     return do_compat_readv(fd, vec, vlen, 0);
1161 }
1162 
1163 static long do_compat_preadv64(unsigned long fd,
1164                   const struct compat_iovec __user *vec,
1165                   unsigned long vlen, loff_t pos, int flags)
1166 {
1167     struct fd f;
1168     ssize_t ret;
1169 
1170     if (pos < 0)
1171         return -EINVAL;
1172     f = fdget(fd);
1173     if (!f.file)
1174         return -EBADF;
1175     ret = -ESPIPE;
1176     if (f.file->f_mode & FMODE_PREAD)
1177         ret = compat_readv(f.file, vec, vlen, &pos, flags);
1178     fdput(f);
1179     return ret;
1180 }
1181 
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/*
 * Compat preadv64(): the offset is passed as a single loff_t argument.
 * Only built when the architecture asks for it.
 */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
        const struct compat_iovec __user *,vec,
        unsigned long, vlen, loff_t, pos)
{
    const int no_flags = 0;

    return do_compat_preadv64(fd, vec, vlen, pos, no_flags);
}
#endif
1190 
1191 COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1192         const struct compat_iovec __user *,vec,
1193         compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1194 {
1195     loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1196 
1197     return do_compat_preadv64(fd, vec, vlen, pos, 0);
1198 }
1199 
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
/*
 * Compat preadv64v2(): single loff_t offset plus a flags argument.
 * Only built when the architecture asks for it.
 */
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
        const struct compat_iovec __user *,vec,
        unsigned long, vlen, loff_t, pos, int, flags)
{
    long ret = do_compat_preadv64(fd, vec, vlen, pos, flags);

    return ret;
}
#endif
1208 
1209 COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1210         const struct compat_iovec __user *,vec,
1211         compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
1212         int, flags)
1213 {
1214     loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1215 
1216     if (pos == -1)
1217         return do_compat_readv(fd, vec, vlen, flags);
1218 
1219     return do_compat_preadv64(fd, vec, vlen, pos, flags);
1220 }
1221 
1222 static size_t compat_writev(struct file *file,
1223                 const struct compat_iovec __user *vec,
1224                 unsigned long vlen, loff_t *pos, int flags)
1225 {
1226     ssize_t ret = -EBADF;
1227 
1228     if (!(file->f_mode & FMODE_WRITE))
1229         goto out;
1230 
1231     ret = -EINVAL;
1232     if (!(file->f_mode & FMODE_CAN_WRITE))
1233         goto out;
1234 
1235     ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, 0);
1236 
1237 out:
1238     if (ret > 0)
1239         add_wchar(current, ret);
1240     inc_syscw(current);
1241     return ret;
1242 }
1243 
1244 static size_t do_compat_writev(compat_ulong_t fd,
1245                   const struct compat_iovec __user* vec,
1246                   compat_ulong_t vlen, int flags)
1247 {
1248     struct fd f = fdget_pos(fd);
1249     ssize_t ret;
1250     loff_t pos;
1251 
1252     if (!f.file)
1253         return -EBADF;
1254     pos = f.file->f_pos;
1255     ret = compat_writev(f.file, vec, vlen, &pos, flags);
1256     if (ret >= 0)
1257         f.file->f_pos = pos;
1258     fdput_pos(f);
1259     return ret;
1260 }
1261 
1262 COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1263         const struct compat_iovec __user *, vec,
1264         compat_ulong_t, vlen)
1265 {
1266     return do_compat_writev(fd, vec, vlen, 0);
1267 }
1268 
1269 static long do_compat_pwritev64(unsigned long fd,
1270                    const struct compat_iovec __user *vec,
1271                    unsigned long vlen, loff_t pos, int flags)
1272 {
1273     struct fd f;
1274     ssize_t ret;
1275 
1276     if (pos < 0)
1277         return -EINVAL;
1278     f = fdget(fd);
1279     if (!f.file)
1280         return -EBADF;
1281     ret = -ESPIPE;
1282     if (f.file->f_mode & FMODE_PWRITE)
1283         ret = compat_writev(f.file, vec, vlen, &pos, flags);
1284     fdput(f);
1285     return ret;
1286 }
1287 
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/*
 * Compat pwritev64(): the offset is passed as a single loff_t argument.
 * Only built when the architecture asks for it.
 */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
        const struct compat_iovec __user *,vec,
        unsigned long, vlen, loff_t, pos)
{
    const int no_flags = 0;

    return do_compat_pwritev64(fd, vec, vlen, pos, no_flags);
}
#endif
1296 
1297 COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1298         const struct compat_iovec __user *,vec,
1299         compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1300 {
1301     loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1302 
1303     return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1304 }
1305 
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
/*
 * Compat pwritev64v2(): single loff_t offset plus a flags argument.
 * Only built when the architecture asks for it.
 */
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
        const struct compat_iovec __user *,vec,
        unsigned long, vlen, loff_t, pos, int, flags)
{
    long ret = do_compat_pwritev64(fd, vec, vlen, pos, flags);

    return ret;
}
#endif
1314 
1315 COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1316         const struct compat_iovec __user *,vec,
1317         compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags)
1318 {
1319     loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1320 
1321     if (pos == -1)
1322         return do_compat_writev(fd, vec, vlen, flags);
1323 
1324     return do_compat_pwritev64(fd, vec, vlen, pos, flags);
1325 }
1326 
1327 #endif
1328 
1329 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1330                size_t count, loff_t max)
1331 {
1332     struct fd in, out;
1333     struct inode *in_inode, *out_inode;
1334     loff_t pos;
1335     loff_t out_pos;
1336     ssize_t retval;
1337     int fl;
1338 
1339     /*
1340      * Get input file, and verify that it is ok..
1341      */
1342     retval = -EBADF;
1343     in = fdget(in_fd);
1344     if (!in.file)
1345         goto out;
1346     if (!(in.file->f_mode & FMODE_READ))
1347         goto fput_in;
1348     retval = -ESPIPE;
1349     if (!ppos) {
1350         pos = in.file->f_pos;
1351     } else {
1352         pos = *ppos;
1353         if (!(in.file->f_mode & FMODE_PREAD))
1354             goto fput_in;
1355     }
1356     retval = rw_verify_area(READ, in.file, &pos, count);
1357     if (retval < 0)
1358         goto fput_in;
1359     if (count > MAX_RW_COUNT)
1360         count =  MAX_RW_COUNT;
1361 
1362     /*
1363      * Get output file, and verify that it is ok..
1364      */
1365     retval = -EBADF;
1366     out = fdget(out_fd);
1367     if (!out.file)
1368         goto fput_in;
1369     if (!(out.file->f_mode & FMODE_WRITE))
1370         goto fput_out;
1371     retval = -EINVAL;
1372     in_inode = file_inode(in.file);
1373     out_inode = file_inode(out.file);
1374     out_pos = out.file->f_pos;
1375     retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1376     if (retval < 0)
1377         goto fput_out;
1378 
1379     if (!max)
1380         max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1381 
1382     if (unlikely(pos + count > max)) {
1383         retval = -EOVERFLOW;
1384         if (pos >= max)
1385             goto fput_out;
1386         count = max - pos;
1387     }
1388 
1389     fl = 0;
1390 #if 0
1391     /*
1392      * We need to debate whether we can enable this or not. The
1393      * man page documents EAGAIN return for the output at least,
1394      * and the application is arguably buggy if it doesn't expect
1395      * EAGAIN on a non-blocking file descriptor.
1396      */
1397     if (in.file->f_flags & O_NONBLOCK)
1398         fl = SPLICE_F_NONBLOCK;
1399 #endif
1400     file_start_write(out.file);
1401     retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1402     file_end_write(out.file);
1403 
1404     if (retval > 0) {
1405         add_rchar(current, retval);
1406         add_wchar(current, retval);
1407         fsnotify_access(in.file);
1408         fsnotify_modify(out.file);
1409         out.file->f_pos = out_pos;
1410         if (ppos)
1411             *ppos = pos;
1412         else
1413             in.file->f_pos = pos;
1414     }
1415 
1416     inc_syscr(current);
1417     inc_syscw(current);
1418     if (pos > max)
1419         retval = -EOVERFLOW;
1420 
1421 fput_out:
1422     fdput(out);
1423 fput_in:
1424     fdput(in);
1425 out:
1426     return retval;
1427 }
1428 
1429 SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1430 {
1431     loff_t pos;
1432     off_t off;
1433     ssize_t ret;
1434 
1435     if (offset) {
1436         if (unlikely(get_user(off, offset)))
1437             return -EFAULT;
1438         pos = off;
1439         ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1440         if (unlikely(put_user(pos, offset)))
1441             return -EFAULT;
1442         return ret;
1443     }
1444 
1445     return do_sendfile(out_fd, in_fd, NULL, count, 0);
1446 }
1447 
1448 SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1449 {
1450     loff_t pos;
1451     ssize_t ret;
1452 
1453     if (offset) {
1454         if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1455             return -EFAULT;
1456         ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1457         if (unlikely(put_user(pos, offset)))
1458             return -EFAULT;
1459         return ret;
1460     }
1461 
1462     return do_sendfile(out_fd, in_fd, NULL, count, 0);
1463 }
1464 
1465 #ifdef CONFIG_COMPAT
1466 COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1467         compat_off_t __user *, offset, compat_size_t, count)
1468 {
1469     loff_t pos;
1470     off_t off;
1471     ssize_t ret;
1472 
1473     if (offset) {
1474         if (unlikely(get_user(off, offset)))
1475             return -EFAULT;
1476         pos = off;
1477         ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1478         if (unlikely(put_user(pos, offset)))
1479             return -EFAULT;
1480         return ret;
1481     }
1482 
1483     return do_sendfile(out_fd, in_fd, NULL, count, 0);
1484 }
1485 
1486 COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1487         compat_loff_t __user *, offset, compat_size_t, count)
1488 {
1489     loff_t pos;
1490     ssize_t ret;
1491 
1492     if (offset) {
1493         if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1494             return -EFAULT;
1495         ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1496         if (unlikely(put_user(pos, offset)))
1497             return -EFAULT;
1498         return ret;
1499     }
1500 
1501     return do_sendfile(out_fd, in_fd, NULL, count, 0);
1502 }
1503 #endif
1504 
1505 /*
1506  * copy_file_range() differs from regular file read and write in that it
1507  * specifically allows return partial success.  When it does so is up to
1508  * the copy_file_range method.
1509  */
1510 ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1511                 struct file *file_out, loff_t pos_out,
1512                 size_t len, unsigned int flags)
1513 {
1514     struct inode *inode_in = file_inode(file_in);
1515     struct inode *inode_out = file_inode(file_out);
1516     ssize_t ret;
1517 
1518     if (flags != 0)
1519         return -EINVAL;
1520 
1521     ret = rw_verify_area(READ, file_in, &pos_in, len);
1522     if (unlikely(ret))
1523         return ret;
1524 
1525     ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1526     if (unlikely(ret))
1527         return ret;
1528 
1529     if (!(file_in->f_mode & FMODE_READ) ||
1530         !(file_out->f_mode & FMODE_WRITE) ||
1531         (file_out->f_flags & O_APPEND))
1532         return -EBADF;
1533 
1534     /* this could be relaxed once a method supports cross-fs copies */
1535     if (inode_in->i_sb != inode_out->i_sb)
1536         return -EXDEV;
1537 
1538     if (len == 0)
1539         return 0;
1540 
1541     sb_start_write(inode_out->i_sb);
1542 
1543     /*
1544      * Try cloning first, this is supported by more file systems, and
1545      * more efficient if both clone and copy are supported (e.g. NFS).
1546      */
1547     if (file_in->f_op->clone_file_range) {
1548         ret = file_in->f_op->clone_file_range(file_in, pos_in,
1549                 file_out, pos_out, len);
1550         if (ret == 0) {
1551             ret = len;
1552             goto done;
1553         }
1554     }
1555 
1556     if (file_out->f_op->copy_file_range) {
1557         ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
1558                               pos_out, len, flags);
1559         if (ret != -EOPNOTSUPP)
1560             goto done;
1561     }
1562 
1563     ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1564             len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1565 
1566 done:
1567     if (ret > 0) {
1568         fsnotify_access(file_in);
1569         add_rchar(current, ret);
1570         fsnotify_modify(file_out);
1571         add_wchar(current, ret);
1572     }
1573 
1574     inc_syscr(current);
1575     inc_syscw(current);
1576 
1577     sb_end_write(inode_out->i_sb);
1578 
1579     return ret;
1580 }
1581 EXPORT_SYMBOL(vfs_copy_file_range);
1582 
1583 SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
1584         int, fd_out, loff_t __user *, off_out,
1585         size_t, len, unsigned int, flags)
1586 {
1587     loff_t pos_in;
1588     loff_t pos_out;
1589     struct fd f_in;
1590     struct fd f_out;
1591     ssize_t ret = -EBADF;
1592 
1593     f_in = fdget(fd_in);
1594     if (!f_in.file)
1595         goto out2;
1596 
1597     f_out = fdget(fd_out);
1598     if (!f_out.file)
1599         goto out1;
1600 
1601     ret = -EFAULT;
1602     if (off_in) {
1603         if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
1604             goto out;
1605     } else {
1606         pos_in = f_in.file->f_pos;
1607     }
1608 
1609     if (off_out) {
1610         if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
1611             goto out;
1612     } else {
1613         pos_out = f_out.file->f_pos;
1614     }
1615 
1616     ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
1617                   flags);
1618     if (ret > 0) {
1619         pos_in += ret;
1620         pos_out += ret;
1621 
1622         if (off_in) {
1623             if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
1624                 ret = -EFAULT;
1625         } else {
1626             f_in.file->f_pos = pos_in;
1627         }
1628 
1629         if (off_out) {
1630             if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
1631                 ret = -EFAULT;
1632         } else {
1633             f_out.file->f_pos = pos_out;
1634         }
1635     }
1636 
1637 out:
1638     fdput(f_out);
1639 out1:
1640     fdput(f_in);
1641 out2:
1642     return ret;
1643 }
1644 
1645 static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
1646 {
1647     struct inode *inode = file_inode(file);
1648 
1649     if (unlikely(pos < 0))
1650         return -EINVAL;
1651 
1652      if (unlikely((loff_t) (pos + len) < 0))
1653         return -EINVAL;
1654 
1655     if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
1656         loff_t end = len ? pos + len - 1 : OFFSET_MAX;
1657         int retval;
1658 
1659         retval = locks_mandatory_area(inode, file, pos, end,
1660                 write ? F_WRLCK : F_RDLCK);
1661         if (retval < 0)
1662             return retval;
1663     }
1664 
1665     return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
1666 }
1667 
1668 /*
1669  * Check that the two inodes are eligible for cloning, the ranges make
1670  * sense, and then flush all dirty data.  Caller must ensure that the
1671  * inodes have been locked against any other modifications.
1672  *
1673  * Returns: 0 for "nothing to clone", 1 for "something to clone", or
1674  * the usual negative error code.
1675  */
1676 int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
1677                    struct inode *inode_out, loff_t pos_out,
1678                    u64 *len, bool is_dedupe)
1679 {
1680     loff_t bs = inode_out->i_sb->s_blocksize;
1681     loff_t blen;
1682     loff_t isize;
1683     bool same_inode = (inode_in == inode_out);
1684     int ret;
1685 
1686     /* Don't touch certain kinds of inodes */
1687     if (IS_IMMUTABLE(inode_out))
1688         return -EPERM;
1689 
1690     if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1691         return -ETXTBSY;
1692 
1693     /* Don't reflink dirs, pipes, sockets... */
1694     if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1695         return -EISDIR;
1696     if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1697         return -EINVAL;
1698 
1699     /* Are we going all the way to the end? */
1700     isize = i_size_read(inode_in);
1701     if (isize == 0)
1702         return 0;
1703 
1704     /* Zero length dedupe exits immediately; reflink goes to EOF. */
1705     if (*len == 0) {
1706         if (is_dedupe || pos_in == isize)
1707             return 0;
1708         if (pos_in > isize)
1709             return -EINVAL;
1710         *len = isize - pos_in;
1711     }
1712 
1713     /* Ensure offsets don't wrap and the input is inside i_size */
1714     if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
1715         pos_in + *len > isize)
1716         return -EINVAL;
1717 
1718     /* Don't allow dedupe past EOF in the dest file */
1719     if (is_dedupe) {
1720         loff_t  disize;
1721 
1722         disize = i_size_read(inode_out);
1723         if (pos_out >= disize || pos_out + *len > disize)
1724             return -EINVAL;
1725     }
1726 
1727     /* If we're linking to EOF, continue to the block boundary. */
1728     if (pos_in + *len == isize)
1729         blen = ALIGN(isize, bs) - pos_in;
1730     else
1731         blen = *len;
1732 
1733     /* Only reflink if we're aligned to block boundaries */
1734     if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
1735         !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
1736         return -EINVAL;
1737 
1738     /* Don't allow overlapped reflink within the same file */
1739     if (same_inode) {
1740         if (pos_out + blen > pos_in && pos_out < pos_in + blen)
1741             return -EINVAL;
1742     }
1743 
1744     /* Wait for the completion of any pending IOs on both files */
1745     inode_dio_wait(inode_in);
1746     if (!same_inode)
1747         inode_dio_wait(inode_out);
1748 
1749     ret = filemap_write_and_wait_range(inode_in->i_mapping,
1750             pos_in, pos_in + *len - 1);
1751     if (ret)
1752         return ret;
1753 
1754     ret = filemap_write_and_wait_range(inode_out->i_mapping,
1755             pos_out, pos_out + *len - 1);
1756     if (ret)
1757         return ret;
1758 
1759     /*
1760      * Check that the extents are the same.
1761      */
1762     if (is_dedupe) {
1763         bool        is_same = false;
1764 
1765         ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
1766                 inode_out, pos_out, *len, &is_same);
1767         if (ret)
1768             return ret;
1769         if (!is_same)
1770             return -EBADE;
1771     }
1772 
1773     return 1;
1774 }
1775 EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
1776 
1777 int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
1778         struct file *file_out, loff_t pos_out, u64 len)
1779 {
1780     struct inode *inode_in = file_inode(file_in);
1781     struct inode *inode_out = file_inode(file_out);
1782     int ret;
1783 
1784     if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1785         return -EISDIR;
1786     if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1787         return -EINVAL;
1788 
1789     /*
1790      * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
1791      * the same mount. Practically, they only need to be on the same file
1792      * system.
1793      */
1794     if (inode_in->i_sb != inode_out->i_sb)
1795         return -EXDEV;
1796 
1797     if (!(file_in->f_mode & FMODE_READ) ||
1798         !(file_out->f_mode & FMODE_WRITE) ||
1799         (file_out->f_flags & O_APPEND))
1800         return -EBADF;
1801 
1802     if (!file_in->f_op->clone_file_range)
1803         return -EOPNOTSUPP;
1804 
1805     ret = clone_verify_area(file_in, pos_in, len, false);
1806     if (ret)
1807         return ret;
1808 
1809     ret = clone_verify_area(file_out, pos_out, len, true);
1810     if (ret)
1811         return ret;
1812 
1813     if (pos_in + len > i_size_read(inode_in))
1814         return -EINVAL;
1815 
1816     ret = file_in->f_op->clone_file_range(file_in, pos_in,
1817             file_out, pos_out, len);
1818     if (!ret) {
1819         fsnotify_access(file_in);
1820         fsnotify_modify(file_out);
1821     }
1822 
1823     return ret;
1824 }
1825 EXPORT_SYMBOL(vfs_clone_file_range);
1826 
1827 /*
1828  * Read a page's worth of file data into the page cache.  Return the page
1829  * locked.
1830  */
1831 static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
1832 {
1833     struct address_space *mapping;
1834     struct page *page;
1835     pgoff_t n;
1836 
1837     n = offset >> PAGE_SHIFT;
1838     mapping = inode->i_mapping;
1839     page = read_mapping_page(mapping, n, NULL);
1840     if (IS_ERR(page))
1841         return page;
1842     if (!PageUptodate(page)) {
1843         put_page(page);
1844         return ERR_PTR(-EIO);
1845     }
1846     lock_page(page);
1847     return page;
1848 }
1849 
1850 /*
1851  * Compare extents of two files to see if they are the same.
1852  * Caller must have locked both inodes to prevent write races.
1853  */
1854 int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
1855                   struct inode *dest, loff_t destoff,
1856                   loff_t len, bool *is_same)
1857 {
1858     loff_t src_poff;
1859     loff_t dest_poff;
1860     void *src_addr;
1861     void *dest_addr;
1862     struct page *src_page;
1863     struct page *dest_page;
1864     loff_t cmp_len;
1865     bool same;
1866     int error;
1867 
1868     error = -EINVAL;
1869     same = true;
1870     while (len) {
1871         src_poff = srcoff & (PAGE_SIZE - 1);
1872         dest_poff = destoff & (PAGE_SIZE - 1);
1873         cmp_len = min(PAGE_SIZE - src_poff,
1874                   PAGE_SIZE - dest_poff);
1875         cmp_len = min(cmp_len, len);
1876         if (cmp_len <= 0)
1877             goto out_error;
1878 
1879         src_page = vfs_dedupe_get_page(src, srcoff);
1880         if (IS_ERR(src_page)) {
1881             error = PTR_ERR(src_page);
1882             goto out_error;
1883         }
1884         dest_page = vfs_dedupe_get_page(dest, destoff);
1885         if (IS_ERR(dest_page)) {
1886             error = PTR_ERR(dest_page);
1887             unlock_page(src_page);
1888             put_page(src_page);
1889             goto out_error;
1890         }
1891         src_addr = kmap_atomic(src_page);
1892         dest_addr = kmap_atomic(dest_page);
1893 
1894         flush_dcache_page(src_page);
1895         flush_dcache_page(dest_page);
1896 
1897         if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
1898             same = false;
1899 
1900         kunmap_atomic(dest_addr);
1901         kunmap_atomic(src_addr);
1902         unlock_page(dest_page);
1903         unlock_page(src_page);
1904         put_page(dest_page);
1905         put_page(src_page);
1906 
1907         if (!same)
1908             break;
1909 
1910         srcoff += cmp_len;
1911         destoff += cmp_len;
1912         len -= cmp_len;
1913     }
1914 
1915     *is_same = same;
1916     return 0;
1917 
1918 out_error:
1919     return error;
1920 }
1921 EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
1922 
1923 int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
1924 {
1925     struct file_dedupe_range_info *info;
1926     struct inode *src = file_inode(file);
1927     u64 off;
1928     u64 len;
1929     int i;
1930     int ret;
1931     bool is_admin = capable(CAP_SYS_ADMIN);
1932     u16 count = same->dest_count;
1933     struct file *dst_file;
1934     loff_t dst_off;
1935     ssize_t deduped;
1936 
1937     if (!(file->f_mode & FMODE_READ))
1938         return -EINVAL;
1939 
1940     if (same->reserved1 || same->reserved2)
1941         return -EINVAL;
1942 
1943     off = same->src_offset;
1944     len = same->src_length;
1945 
1946     ret = -EISDIR;
1947     if (S_ISDIR(src->i_mode))
1948         goto out;
1949 
1950     ret = -EINVAL;
1951     if (!S_ISREG(src->i_mode))
1952         goto out;
1953 
1954     ret = clone_verify_area(file, off, len, false);
1955     if (ret < 0)
1956         goto out;
1957     ret = 0;
1958 
1959     if (off + len > i_size_read(src))
1960         return -EINVAL;
1961 
1962     /* pre-format output fields to sane values */
1963     for (i = 0; i < count; i++) {
1964         same->info[i].bytes_deduped = 0ULL;
1965         same->info[i].status = FILE_DEDUPE_RANGE_SAME;
1966     }
1967 
1968     for (i = 0, info = same->info; i < count; i++, info++) {
1969         struct inode *dst;
1970         struct fd dst_fd = fdget(info->dest_fd);
1971 
1972         dst_file = dst_fd.file;
1973         if (!dst_file) {
1974             info->status = -EBADF;
1975             goto next_loop;
1976         }
1977         dst = file_inode(dst_file);
1978 
1979         ret = mnt_want_write_file(dst_file);
1980         if (ret) {
1981             info->status = ret;
1982             goto next_loop;
1983         }
1984 
1985         dst_off = info->dest_offset;
1986         ret = clone_verify_area(dst_file, dst_off, len, true);
1987         if (ret < 0) {
1988             info->status = ret;
1989             goto next_file;
1990         }
1991         ret = 0;
1992 
1993         if (info->reserved) {
1994             info->status = -EINVAL;
1995         } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
1996             info->status = -EINVAL;
1997         } else if (file->f_path.mnt != dst_file->f_path.mnt) {
1998             info->status = -EXDEV;
1999         } else if (S_ISDIR(dst->i_mode)) {
2000             info->status = -EISDIR;
2001         } else if (dst_file->f_op->dedupe_file_range == NULL) {
2002             info->status = -EINVAL;
2003         } else {
2004             deduped = dst_file->f_op->dedupe_file_range(file, off,
2005                             len, dst_file,
2006                             info->dest_offset);
2007             if (deduped == -EBADE)
2008                 info->status = FILE_DEDUPE_RANGE_DIFFERS;
2009             else if (deduped < 0)
2010                 info->status = deduped;
2011             else
2012                 info->bytes_deduped += deduped;
2013         }
2014 
2015 next_file:
2016         mnt_drop_write_file(dst_file);
2017 next_loop:
2018         fdput(dst_fd);
2019 
2020         if (fatal_signal_pending(current))
2021             goto out;
2022     }
2023 
2024 out:
2025     return ret;
2026 }
2027 EXPORT_SYMBOL(vfs_dedupe_file_range);