Back to home page

LXR

 
 

    


0001 /*
0002  * "splice": joining two ropes together by interweaving their strands.
0003  *
0004  * This is the "extended pipe" functionality, where a pipe is used as
0005  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
0006  * buffer that you can use to transfer data from one end to the other.
0007  *
0008  * The traditional unix read/write is extended with a "splice()" operation
0009  * that transfers data buffers to or from a pipe buffer.
0010  *
0011  * Named by Larry McVoy, original implementation from Linus, extended by
0012  * Jens to support splicing to files, network, direct splicing, etc and
0013  * fixing lots of bugs.
0014  *
0015  * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
0016  * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
0017  * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
0018  *
0019  */
0020 #include <linux/bvec.h>
0021 #include <linux/fs.h>
0022 #include <linux/file.h>
0023 #include <linux/pagemap.h>
0024 #include <linux/splice.h>
0025 #include <linux/memcontrol.h>
0026 #include <linux/mm_inline.h>
0027 #include <linux/swap.h>
0028 #include <linux/writeback.h>
0029 #include <linux/export.h>
0030 #include <linux/syscalls.h>
0031 #include <linux/uio.h>
0032 #include <linux/security.h>
0033 #include <linux/gfp.h>
0034 #include <linux/socket.h>
0035 #include <linux/compat.h>
0036 #include "internal.h"
0037 
0038 /*
0039  * Attempt to steal a page from a pipe buffer. This should perhaps go into
0040  * a vm helper function, it's already simplified quite a bit by the
0041  * addition of remove_mapping(). If success is returned, the caller may
0042  * attempt to reuse this page for another destination.
0043  */
0044 static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
0045                      struct pipe_buffer *buf)
0046 {
0047     struct page *page = buf->page;
0048     struct address_space *mapping;
0049 
0050     lock_page(page);
0051 
0052     mapping = page_mapping(page);
0053     if (mapping) {
0054         WARN_ON(!PageUptodate(page));
0055 
0056         /*
0057          * At least for ext2 with nobh option, we need to wait on
0058          * writeback completing on this page, since we'll remove it
0059          * from the pagecache.  Otherwise truncate wont wait on the
0060          * page, allowing the disk blocks to be reused by someone else
0061          * before we actually wrote our data to them. fs corruption
0062          * ensues.
0063          */
0064         wait_on_page_writeback(page);
0065 
0066         if (page_has_private(page) &&
0067             !try_to_release_page(page, GFP_KERNEL))
0068             goto out_unlock;
0069 
0070         /*
0071          * If we succeeded in removing the mapping, set LRU flag
0072          * and return good.
0073          */
0074         if (remove_mapping(mapping, page)) {
0075             buf->flags |= PIPE_BUF_FLAG_LRU;
0076             return 0;
0077         }
0078     }
0079 
0080     /*
0081      * Raced with truncate or failed to remove page from current
0082      * address space, unlock and return failure.
0083      */
0084 out_unlock:
0085     unlock_page(page);
0086     return 1;
0087 }
0088 
0089 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
0090                     struct pipe_buffer *buf)
0091 {
0092     put_page(buf->page);
0093     buf->flags &= ~PIPE_BUF_FLAG_LRU;
0094 }
0095 
0096 /*
0097  * Check whether the contents of buf is OK to access. Since the content
0098  * is a page cache page, IO may be in flight.
0099  */
0100 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
0101                        struct pipe_buffer *buf)
0102 {
0103     struct page *page = buf->page;
0104     int err;
0105 
0106     if (!PageUptodate(page)) {
0107         lock_page(page);
0108 
0109         /*
0110          * Page got truncated/unhashed. This will cause a 0-byte
0111          * splice, if this is the first page.
0112          */
0113         if (!page->mapping) {
0114             err = -ENODATA;
0115             goto error;
0116         }
0117 
0118         /*
0119          * Uh oh, read-error from disk.
0120          */
0121         if (!PageUptodate(page)) {
0122             err = -EIO;
0123             goto error;
0124         }
0125 
0126         /*
0127          * Page is ok afterall, we are done.
0128          */
0129         unlock_page(page);
0130     }
0131 
0132     return 0;
0133 error:
0134     unlock_page(page);
0135     return err;
0136 }
0137 
0138 const struct pipe_buf_operations page_cache_pipe_buf_ops = {
0139     .can_merge = 0,
0140     .confirm = page_cache_pipe_buf_confirm,
0141     .release = page_cache_pipe_buf_release,
0142     .steal = page_cache_pipe_buf_steal,
0143     .get = generic_pipe_buf_get,
0144 };
0145 
0146 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
0147                     struct pipe_buffer *buf)
0148 {
0149     if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
0150         return 1;
0151 
0152     buf->flags |= PIPE_BUF_FLAG_LRU;
0153     return generic_pipe_buf_steal(pipe, buf);
0154 }
0155 
0156 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
0157     .can_merge = 0,
0158     .confirm = generic_pipe_buf_confirm,
0159     .release = page_cache_pipe_buf_release,
0160     .steal = user_page_pipe_buf_steal,
0161     .get = generic_pipe_buf_get,
0162 };
0163 
0164 static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
0165 {
0166     smp_mb();
0167     if (waitqueue_active(&pipe->wait))
0168         wake_up_interruptible(&pipe->wait);
0169     kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
0170 }
0171 
0172 /**
0173  * splice_to_pipe - fill passed data into a pipe
0174  * @pipe:   pipe to fill
0175  * @spd:    data to fill
0176  *
0177  * Description:
0178  *    @spd contains a map of pages and len/offset tuples, along with
0179  *    the struct pipe_buf_operations associated with these pages. This
0180  *    function will link that data to the pipe.
0181  *
0182  */
0183 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
0184                struct splice_pipe_desc *spd)
0185 {
0186     unsigned int spd_pages = spd->nr_pages;
0187     int ret = 0, page_nr = 0;
0188 
0189     if (!spd_pages)
0190         return 0;
0191 
0192     if (unlikely(!pipe->readers)) {
0193         send_sig(SIGPIPE, current, 0);
0194         ret = -EPIPE;
0195         goto out;
0196     }
0197 
0198     while (pipe->nrbufs < pipe->buffers) {
0199         int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
0200         struct pipe_buffer *buf = pipe->bufs + newbuf;
0201 
0202         buf->page = spd->pages[page_nr];
0203         buf->offset = spd->partial[page_nr].offset;
0204         buf->len = spd->partial[page_nr].len;
0205         buf->private = spd->partial[page_nr].private;
0206         buf->ops = spd->ops;
0207         buf->flags = 0;
0208 
0209         pipe->nrbufs++;
0210         page_nr++;
0211         ret += buf->len;
0212 
0213         if (!--spd->nr_pages)
0214             break;
0215     }
0216 
0217     if (!ret)
0218         ret = -EAGAIN;
0219 
0220 out:
0221     while (page_nr < spd_pages)
0222         spd->spd_release(spd, page_nr++);
0223 
0224     return ret;
0225 }
0226 EXPORT_SYMBOL_GPL(splice_to_pipe);
0227 
0228 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
0229 {
0230     int ret;
0231 
0232     if (unlikely(!pipe->readers)) {
0233         send_sig(SIGPIPE, current, 0);
0234         ret = -EPIPE;
0235     } else if (pipe->nrbufs == pipe->buffers) {
0236         ret = -EAGAIN;
0237     } else {
0238         int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
0239         pipe->bufs[newbuf] = *buf;
0240         pipe->nrbufs++;
0241         return buf->len;
0242     }
0243     pipe_buf_release(pipe, buf);
0244     return ret;
0245 }
0246 EXPORT_SYMBOL(add_to_pipe);
0247 
0248 void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
0249 {
0250     put_page(spd->pages[i]);
0251 }
0252 
0253 /*
0254  * Check if we need to grow the arrays holding pages and partial page
0255  * descriptions.
0256  */
0257 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
0258 {
0259     unsigned int buffers = ACCESS_ONCE(pipe->buffers);
0260 
0261     spd->nr_pages_max = buffers;
0262     if (buffers <= PIPE_DEF_BUFFERS)
0263         return 0;
0264 
0265     spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL);
0266     spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL);
0267 
0268     if (spd->pages && spd->partial)
0269         return 0;
0270 
0271     kfree(spd->pages);
0272     kfree(spd->partial);
0273     return -ENOMEM;
0274 }
0275 
0276 void splice_shrink_spd(struct splice_pipe_desc *spd)
0277 {
0278     if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
0279         return;
0280 
0281     kfree(spd->pages);
0282     kfree(spd->partial);
0283 }
0284 
0285 /**
0286  * generic_file_splice_read - splice data from file to a pipe
0287  * @in:     file to splice from
0288  * @ppos:   position in @in
0289  * @pipe:   pipe to splice to
0290  * @len:    number of bytes to splice
0291  * @flags:  splice modifier flags
0292  *
0293  * Description:
0294  *    Will read pages from given file and fill them into a pipe. Can be
0295  *    used as long as it has more or less sane ->read_iter().
0296  *
0297  */
0298 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
0299                  struct pipe_inode_info *pipe, size_t len,
0300                  unsigned int flags)
0301 {
0302     struct iov_iter to;
0303     struct kiocb kiocb;
0304     int idx, ret;
0305 
0306     iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len);
0307     idx = to.idx;
0308     init_sync_kiocb(&kiocb, in);
0309     kiocb.ki_pos = *ppos;
0310     ret = in->f_op->read_iter(&kiocb, &to);
0311     if (ret > 0) {
0312         *ppos = kiocb.ki_pos;
0313         file_accessed(in);
0314     } else if (ret < 0) {
0315         to.idx = idx;
0316         to.iov_offset = 0;
0317         iov_iter_advance(&to, 0); /* to free what was emitted */
0318         /*
0319          * callers of ->splice_read() expect -EAGAIN on
0320          * "can't put anything in there", rather than -EFAULT.
0321          */
0322         if (ret == -EFAULT)
0323             ret = -EAGAIN;
0324     }
0325 
0326     return ret;
0327 }
0328 EXPORT_SYMBOL(generic_file_splice_read);
0329 
0330 const struct pipe_buf_operations default_pipe_buf_ops = {
0331     .can_merge = 0,
0332     .confirm = generic_pipe_buf_confirm,
0333     .release = generic_pipe_buf_release,
0334     .steal = generic_pipe_buf_steal,
0335     .get = generic_pipe_buf_get,
0336 };
0337 
/* Steal callback that always refuses: unconditionally returns 1. */
static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	return 1;
}
0343 
0344 /* Pipe buffer operations for a socket and similar. */
0345 const struct pipe_buf_operations nosteal_pipe_buf_ops = {
0346     .can_merge = 0,
0347     .confirm = generic_pipe_buf_confirm,
0348     .release = generic_pipe_buf_release,
0349     .steal = generic_pipe_buf_nosteal,
0350     .get = generic_pipe_buf_get,
0351 };
0352 EXPORT_SYMBOL(nosteal_pipe_buf_ops);
0353 
0354 static ssize_t kernel_readv(struct file *file, const struct kvec *vec,
0355                 unsigned long vlen, loff_t offset)
0356 {
0357     mm_segment_t old_fs;
0358     loff_t pos = offset;
0359     ssize_t res;
0360 
0361     old_fs = get_fs();
0362     set_fs(get_ds());
0363     /* The cast to a user pointer is valid due to the set_fs() */
0364     res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
0365     set_fs(old_fs);
0366 
0367     return res;
0368 }
0369 
0370 ssize_t kernel_write(struct file *file, const char *buf, size_t count,
0371                 loff_t pos)
0372 {
0373     mm_segment_t old_fs;
0374     ssize_t res;
0375 
0376     old_fs = get_fs();
0377     set_fs(get_ds());
0378     /* The cast to a user pointer is valid due to the set_fs() */
0379     res = vfs_write(file, (__force const char __user *)buf, count, &pos);
0380     set_fs(old_fs);
0381 
0382     return res;
0383 }
0384 EXPORT_SYMBOL(kernel_write);
0385 
0386 static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
0387                  struct pipe_inode_info *pipe, size_t len,
0388                  unsigned int flags)
0389 {
0390     struct kvec *vec, __vec[PIPE_DEF_BUFFERS];
0391     struct iov_iter to;
0392     struct page **pages;
0393     unsigned int nr_pages;
0394     size_t offset, dummy, copied = 0;
0395     ssize_t res;
0396     int i;
0397 
0398     if (pipe->nrbufs == pipe->buffers)
0399         return -EAGAIN;
0400 
0401     /*
0402      * Try to keep page boundaries matching to source pagecache ones -
0403      * it probably won't be much help, but...
0404      */
0405     offset = *ppos & ~PAGE_MASK;
0406 
0407     iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset);
0408 
0409     res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &dummy);
0410     if (res <= 0)
0411         return -ENOMEM;
0412 
0413     BUG_ON(dummy);
0414     nr_pages = DIV_ROUND_UP(res, PAGE_SIZE);
0415 
0416     vec = __vec;
0417     if (nr_pages > PIPE_DEF_BUFFERS) {
0418         vec = kmalloc(nr_pages * sizeof(struct kvec), GFP_KERNEL);
0419         if (unlikely(!vec)) {
0420             res = -ENOMEM;
0421             goto out;
0422         }
0423     }
0424 
0425     pipe->bufs[to.idx].offset = offset;
0426     pipe->bufs[to.idx].len -= offset;
0427 
0428     for (i = 0; i < nr_pages; i++) {
0429         size_t this_len = min_t(size_t, len, PAGE_SIZE - offset);
0430         vec[i].iov_base = page_address(pages[i]) + offset;
0431         vec[i].iov_len = this_len;
0432         len -= this_len;
0433         offset = 0;
0434     }
0435 
0436     res = kernel_readv(in, vec, nr_pages, *ppos);
0437     if (res > 0) {
0438         copied = res;
0439         *ppos += res;
0440     }
0441 
0442     if (vec != __vec)
0443         kfree(vec);
0444 out:
0445     for (i = 0; i < nr_pages; i++)
0446         put_page(pages[i]);
0447     kvfree(pages);
0448     iov_iter_advance(&to, copied);  /* truncates and discards */
0449     return res;
0450 }
0451 
0452 /*
0453  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
0454  * using sendpage(). Return the number of bytes sent.
0455  */
0456 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
0457                 struct pipe_buffer *buf, struct splice_desc *sd)
0458 {
0459     struct file *file = sd->u.file;
0460     loff_t pos = sd->pos;
0461     int more;
0462 
0463     if (!likely(file->f_op->sendpage))
0464         return -EINVAL;
0465 
0466     more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
0467 
0468     if (sd->len < sd->total_len && pipe->nrbufs > 1)
0469         more |= MSG_SENDPAGE_NOTLAST;
0470 
0471     return file->f_op->sendpage(file, buf->page, buf->offset,
0472                     sd->len, &pos, more);
0473 }
0474 
0475 static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
0476 {
0477     smp_mb();
0478     if (waitqueue_active(&pipe->wait))
0479         wake_up_interruptible(&pipe->wait);
0480     kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
0481 }
0482 
0483 /**
0484  * splice_from_pipe_feed - feed available data from a pipe to a file
0485  * @pipe:   pipe to splice from
0486  * @sd:     information to @actor
0487  * @actor:  handler that splices the data
0488  *
0489  * Description:
0490  *    This function loops over the pipe and calls @actor to do the
0491  *    actual moving of a single struct pipe_buffer to the desired
0492  *    destination.  It returns when there's no more buffers left in
0493  *    the pipe or if the requested number of bytes (@sd->total_len)
0494  *    have been copied.  It returns a positive number (one) if the
0495  *    pipe needs to be filled with more data, zero if the required
0496  *    number of bytes have been copied and -errno on error.
0497  *
0498  *    This, together with splice_from_pipe_{begin,end,next}, may be
0499  *    used to implement the functionality of __splice_from_pipe() when
0500  *    locking is required around copying the pipe buffers to the
0501  *    destination.
0502  */
0503 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
0504               splice_actor *actor)
0505 {
0506     int ret;
0507 
0508     while (pipe->nrbufs) {
0509         struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
0510 
0511         sd->len = buf->len;
0512         if (sd->len > sd->total_len)
0513             sd->len = sd->total_len;
0514 
0515         ret = pipe_buf_confirm(pipe, buf);
0516         if (unlikely(ret)) {
0517             if (ret == -ENODATA)
0518                 ret = 0;
0519             return ret;
0520         }
0521 
0522         ret = actor(pipe, buf, sd);
0523         if (ret <= 0)
0524             return ret;
0525 
0526         buf->offset += ret;
0527         buf->len -= ret;
0528 
0529         sd->num_spliced += ret;
0530         sd->len -= ret;
0531         sd->pos += ret;
0532         sd->total_len -= ret;
0533 
0534         if (!buf->len) {
0535             pipe_buf_release(pipe, buf);
0536             pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
0537             pipe->nrbufs--;
0538             if (pipe->files)
0539                 sd->need_wakeup = true;
0540         }
0541 
0542         if (!sd->total_len)
0543             return 0;
0544     }
0545 
0546     return 1;
0547 }
0548 
0549 /**
0550  * splice_from_pipe_next - wait for some data to splice from
0551  * @pipe:   pipe to splice from
0552  * @sd:     information about the splice operation
0553  *
0554  * Description:
0555  *    This function will wait for some data and return a positive
0556  *    value (one) if pipe buffers are available.  It will return zero
0557  *    or -errno if no more data needs to be spliced.
0558  */
0559 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
0560 {
0561     /*
0562      * Check for signal early to make process killable when there are
0563      * always buffers available
0564      */
0565     if (signal_pending(current))
0566         return -ERESTARTSYS;
0567 
0568     while (!pipe->nrbufs) {
0569         if (!pipe->writers)
0570             return 0;
0571 
0572         if (!pipe->waiting_writers && sd->num_spliced)
0573             return 0;
0574 
0575         if (sd->flags & SPLICE_F_NONBLOCK)
0576             return -EAGAIN;
0577 
0578         if (signal_pending(current))
0579             return -ERESTARTSYS;
0580 
0581         if (sd->need_wakeup) {
0582             wakeup_pipe_writers(pipe);
0583             sd->need_wakeup = false;
0584         }
0585 
0586         pipe_wait(pipe);
0587     }
0588 
0589     return 1;
0590 }
0591 
0592 /**
0593  * splice_from_pipe_begin - start splicing from pipe
0594  * @sd:     information about the splice operation
0595  *
0596  * Description:
0597  *    This function should be called before a loop containing
0598  *    splice_from_pipe_next() and splice_from_pipe_feed() to
0599  *    initialize the necessary fields of @sd.
0600  */
0601 static void splice_from_pipe_begin(struct splice_desc *sd)
0602 {
0603     sd->num_spliced = 0;
0604     sd->need_wakeup = false;
0605 }
0606 
0607 /**
0608  * splice_from_pipe_end - finish splicing from pipe
0609  * @pipe:   pipe to splice from
0610  * @sd:     information about the splice operation
0611  *
0612  * Description:
0613  *    This function will wake up pipe writers if necessary.  It should
0614  *    be called after a loop containing splice_from_pipe_next() and
0615  *    splice_from_pipe_feed().
0616  */
0617 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
0618 {
0619     if (sd->need_wakeup)
0620         wakeup_pipe_writers(pipe);
0621 }
0622 
0623 /**
0624  * __splice_from_pipe - splice data from a pipe to given actor
0625  * @pipe:   pipe to splice from
0626  * @sd:     information to @actor
0627  * @actor:  handler that splices the data
0628  *
0629  * Description:
0630  *    This function does little more than loop over the pipe and call
0631  *    @actor to do the actual moving of a single struct pipe_buffer to
0632  *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
0633  *    pipe_to_user.
0634  *
0635  */
0636 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
0637                splice_actor *actor)
0638 {
0639     int ret;
0640 
0641     splice_from_pipe_begin(sd);
0642     do {
0643         cond_resched();
0644         ret = splice_from_pipe_next(pipe, sd);
0645         if (ret > 0)
0646             ret = splice_from_pipe_feed(pipe, sd, actor);
0647     } while (ret > 0);
0648     splice_from_pipe_end(pipe, sd);
0649 
0650     return sd->num_spliced ? sd->num_spliced : ret;
0651 }
0652 EXPORT_SYMBOL(__splice_from_pipe);
0653 
0654 /**
0655  * splice_from_pipe - splice data from a pipe to a file
0656  * @pipe:   pipe to splice from
0657  * @out:    file to splice to
0658  * @ppos:   position in @out
0659  * @len:    how many bytes to splice
0660  * @flags:  splice modifier flags
0661  * @actor:  handler that splices the data
0662  *
0663  * Description:
0664  *    See __splice_from_pipe. This function locks the pipe inode,
0665  *    otherwise it's identical to __splice_from_pipe().
0666  *
0667  */
0668 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
0669              loff_t *ppos, size_t len, unsigned int flags,
0670              splice_actor *actor)
0671 {
0672     ssize_t ret;
0673     struct splice_desc sd = {
0674         .total_len = len,
0675         .flags = flags,
0676         .pos = *ppos,
0677         .u.file = out,
0678     };
0679 
0680     pipe_lock(pipe);
0681     ret = __splice_from_pipe(pipe, &sd, actor);
0682     pipe_unlock(pipe);
0683 
0684     return ret;
0685 }
0686 
0687 /**
0688  * iter_file_splice_write - splice data from a pipe to a file
0689  * @pipe:   pipe info
0690  * @out:    file to write to
0691  * @ppos:   position in @out
0692  * @len:    number of bytes to splice
0693  * @flags:  splice modifier flags
0694  *
0695  * Description:
0696  *    Will either move or copy pages (determined by @flags options) from
0697  *    the given pipe inode to the given file.
0698  *    This one is ->write_iter-based.
0699  *
0700  */
0701 ssize_t
0702 iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
0703               loff_t *ppos, size_t len, unsigned int flags)
0704 {
0705     struct splice_desc sd = {
0706         .total_len = len,
0707         .flags = flags,
0708         .pos = *ppos,
0709         .u.file = out,
0710     };
0711     int nbufs = pipe->buffers;
0712     struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
0713                     GFP_KERNEL);
0714     ssize_t ret;
0715 
0716     if (unlikely(!array))
0717         return -ENOMEM;
0718 
0719     pipe_lock(pipe);
0720 
0721     splice_from_pipe_begin(&sd);
0722     while (sd.total_len) {
0723         struct iov_iter from;
0724         size_t left;
0725         int n, idx;
0726 
0727         ret = splice_from_pipe_next(pipe, &sd);
0728         if (ret <= 0)
0729             break;
0730 
0731         if (unlikely(nbufs < pipe->buffers)) {
0732             kfree(array);
0733             nbufs = pipe->buffers;
0734             array = kcalloc(nbufs, sizeof(struct bio_vec),
0735                     GFP_KERNEL);
0736             if (!array) {
0737                 ret = -ENOMEM;
0738                 break;
0739             }
0740         }
0741 
0742         /* build the vector */
0743         left = sd.total_len;
0744         for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
0745             struct pipe_buffer *buf = pipe->bufs + idx;
0746             size_t this_len = buf->len;
0747 
0748             if (this_len > left)
0749                 this_len = left;
0750 
0751             if (idx == pipe->buffers - 1)
0752                 idx = -1;
0753 
0754             ret = pipe_buf_confirm(pipe, buf);
0755             if (unlikely(ret)) {
0756                 if (ret == -ENODATA)
0757                     ret = 0;
0758                 goto done;
0759             }
0760 
0761             array[n].bv_page = buf->page;
0762             array[n].bv_len = this_len;
0763             array[n].bv_offset = buf->offset;
0764             left -= this_len;
0765         }
0766 
0767         iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
0768                   sd.total_len - left);
0769         ret = vfs_iter_write(out, &from, &sd.pos);
0770         if (ret <= 0)
0771             break;
0772 
0773         sd.num_spliced += ret;
0774         sd.total_len -= ret;
0775         *ppos = sd.pos;
0776 
0777         /* dismiss the fully eaten buffers, adjust the partial one */
0778         while (ret) {
0779             struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
0780             if (ret >= buf->len) {
0781                 ret -= buf->len;
0782                 buf->len = 0;
0783                 pipe_buf_release(pipe, buf);
0784                 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
0785                 pipe->nrbufs--;
0786                 if (pipe->files)
0787                     sd.need_wakeup = true;
0788             } else {
0789                 buf->offset += ret;
0790                 buf->len -= ret;
0791                 ret = 0;
0792             }
0793         }
0794     }
0795 done:
0796     kfree(array);
0797     splice_from_pipe_end(pipe, &sd);
0798 
0799     pipe_unlock(pipe);
0800 
0801     if (sd.num_spliced)
0802         ret = sd.num_spliced;
0803 
0804     return ret;
0805 }
0806 
0807 EXPORT_SYMBOL(iter_file_splice_write);
0808 
0809 static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
0810               struct splice_desc *sd)
0811 {
0812     int ret;
0813     void *data;
0814     loff_t tmp = sd->pos;
0815 
0816     data = kmap(buf->page);
0817     ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
0818     kunmap(buf->page);
0819 
0820     return ret;
0821 }
0822 
0823 static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
0824                      struct file *out, loff_t *ppos,
0825                      size_t len, unsigned int flags)
0826 {
0827     ssize_t ret;
0828 
0829     ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
0830     if (ret > 0)
0831         *ppos += ret;
0832 
0833     return ret;
0834 }
0835 
0836 /**
0837  * generic_splice_sendpage - splice data from a pipe to a socket
0838  * @pipe:   pipe to splice from
0839  * @out:    socket to write to
0840  * @ppos:   position in @out
0841  * @len:    number of bytes to splice
0842  * @flags:  splice modifier flags
0843  *
0844  * Description:
0845  *    Will send @len bytes from the pipe to a network socket. No data copying
0846  *    is involved.
0847  *
0848  */
0849 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
0850                 loff_t *ppos, size_t len, unsigned int flags)
0851 {
0852     return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
0853 }
0854 
0855 EXPORT_SYMBOL(generic_splice_sendpage);
0856 
0857 /*
0858  * Attempt to initiate a splice from pipe to file.
0859  */
0860 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
0861                loff_t *ppos, size_t len, unsigned int flags)
0862 {
0863     ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
0864                 loff_t *, size_t, unsigned int);
0865 
0866     if (out->f_op->splice_write)
0867         splice_write = out->f_op->splice_write;
0868     else
0869         splice_write = default_file_splice_write;
0870 
0871     return splice_write(pipe, out, ppos, len, flags);
0872 }
0873 
0874 /*
0875  * Attempt to initiate a splice from a file to a pipe.
0876  */
0877 static long do_splice_to(struct file *in, loff_t *ppos,
0878              struct pipe_inode_info *pipe, size_t len,
0879              unsigned int flags)
0880 {
0881     ssize_t (*splice_read)(struct file *, loff_t *,
0882                    struct pipe_inode_info *, size_t, unsigned int);
0883     int ret;
0884 
0885     if (unlikely(!(in->f_mode & FMODE_READ)))
0886         return -EBADF;
0887 
0888     ret = rw_verify_area(READ, in, ppos, len);
0889     if (unlikely(ret < 0))
0890         return ret;
0891 
0892     if (unlikely(len > MAX_RW_COUNT))
0893         len = MAX_RW_COUNT;
0894 
0895     if (in->f_op->splice_read)
0896         splice_read = in->f_op->splice_read;
0897     else
0898         splice_read = default_file_splice_read;
0899 
0900     return splice_read(in, ppos, pipe, len, flags);
0901 }
0902 
0903 /**
0904  * splice_direct_to_actor - splices data directly between two non-pipes
0905  * @in:     file to splice from
0906  * @sd:     actor information on where to splice to
0907  * @actor:  handles the data splicing
0908  *
0909  * Description:
0910  *    This is a special case helper to splice directly between two
0911  *    points, without requiring an explicit pipe. Internally an allocated
0912  *    pipe is cached in the process, and reused during the lifetime of
0913  *    that process.
0914  *
0915  */
0916 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
0917                    splice_direct_actor *actor)
0918 {
0919     struct pipe_inode_info *pipe;
0920     long ret, bytes;
0921     umode_t i_mode;
0922     size_t len;
0923     int i, flags, more;
0924 
0925     /*
0926      * We require the input being a regular file, as we don't want to
0927      * randomly drop data for eg socket -> socket splicing. Use the
0928      * piped splicing for that!
0929      */
0930     i_mode = file_inode(in)->i_mode;
0931     if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
0932         return -EINVAL;
0933 
0934     /*
0935      * neither in nor out is a pipe, setup an internal pipe attached to
0936      * 'out' and transfer the wanted data from 'in' to 'out' through that
0937      */
0938     pipe = current->splice_pipe;
0939     if (unlikely(!pipe)) {
0940         pipe = alloc_pipe_info();
0941         if (!pipe)
0942             return -ENOMEM;
0943 
0944         /*
0945          * We don't have an immediate reader, but we'll read the stuff
0946          * out of the pipe right after the splice_to_pipe(). So set
0947          * PIPE_READERS appropriately.
0948          */
0949         pipe->readers = 1;
0950 
0951         current->splice_pipe = pipe;
0952     }
0953 
0954     /*
0955      * Do the splice.
0956      */
0957     ret = 0;
0958     bytes = 0;
0959     len = sd->total_len;
0960     flags = sd->flags;
0961 
0962     /*
0963      * Don't block on output, we have to drain the direct pipe.
0964      */
0965     sd->flags &= ~SPLICE_F_NONBLOCK;
0966     more = sd->flags & SPLICE_F_MORE;
0967 
0968     while (len) {
0969         size_t read_len;
0970         loff_t pos = sd->pos, prev_pos = pos;
0971 
0972         ret = do_splice_to(in, &pos, pipe, len, flags);
0973         if (unlikely(ret <= 0))
0974             goto out_release;
0975 
0976         read_len = ret;
0977         sd->total_len = read_len;
0978 
0979         /*
0980          * If more data is pending, set SPLICE_F_MORE
0981          * If this is the last data and SPLICE_F_MORE was not set
0982          * initially, clears it.
0983          */
0984         if (read_len < len)
0985             sd->flags |= SPLICE_F_MORE;
0986         else if (!more)
0987             sd->flags &= ~SPLICE_F_MORE;
0988         /*
0989          * NOTE: nonblocking mode only applies to the input. We
0990          * must not do the output in nonblocking mode as then we
0991          * could get stuck data in the internal pipe:
0992          */
0993         ret = actor(pipe, sd);
0994         if (unlikely(ret <= 0)) {
0995             sd->pos = prev_pos;
0996             goto out_release;
0997         }
0998 
0999         bytes += ret;
1000         len -= ret;
1001         sd->pos = pos;
1002 
1003         if (ret < read_len) {
1004             sd->pos = prev_pos + ret;
1005             goto out_release;
1006         }
1007     }
1008 
1009 done:
1010     pipe->nrbufs = pipe->curbuf = 0;
1011     file_accessed(in);
1012     return bytes;
1013 
1014 out_release:
1015     /*
1016      * If we did an incomplete transfer we must release
1017      * the pipe buffers in question:
1018      */
1019     for (i = 0; i < pipe->buffers; i++) {
1020         struct pipe_buffer *buf = pipe->bufs + i;
1021 
1022         if (buf->ops)
1023             pipe_buf_release(pipe, buf);
1024     }
1025 
1026     if (!bytes)
1027         bytes = ret;
1028 
1029     goto done;
1030 }
1031 EXPORT_SYMBOL(splice_direct_to_actor);
1032 
1033 static int direct_splice_actor(struct pipe_inode_info *pipe,
1034                    struct splice_desc *sd)
1035 {
1036     struct file *file = sd->u.file;
1037 
1038     return do_splice_from(pipe, file, sd->opos, sd->total_len,
1039                   sd->flags);
1040 }
1041 
1042 /**
1043  * do_splice_direct - splices data directly between two files
1044  * @in:     file to splice from
1045  * @ppos:   input file offset
1046  * @out:    file to splice to
1047  * @opos:   output file offset
1048  * @len:    number of bytes to splice
1049  * @flags:  splice modifier flags
1050  *
1051  * Description:
1052  *    For use by do_sendfile(). splice can easily emulate sendfile, but
1053  *    doing it in the application would incur an extra system call
1054  *    (splice in + splice out, as compared to just sendfile()). So this helper
1055  *    can splice directly through a process-private pipe.
1056  *
1057  */
1058 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1059               loff_t *opos, size_t len, unsigned int flags)
1060 {
1061     struct splice_desc sd = {
1062         .len        = len,
1063         .total_len  = len,
1064         .flags      = flags,
1065         .pos        = *ppos,
1066         .u.file     = out,
1067         .opos       = opos,
1068     };
1069     long ret;
1070 
1071     if (unlikely(!(out->f_mode & FMODE_WRITE)))
1072         return -EBADF;
1073 
1074     if (unlikely(out->f_flags & O_APPEND))
1075         return -EINVAL;
1076 
1077     ret = rw_verify_area(WRITE, out, opos, len);
1078     if (unlikely(ret < 0))
1079         return ret;
1080 
1081     ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1082     if (ret > 0)
1083         *ppos = sd.pos;
1084 
1085     return ret;
1086 }
1087 EXPORT_SYMBOL(do_splice_direct);
1088 
1089 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1090 {
1091     for (;;) {
1092         if (unlikely(!pipe->readers)) {
1093             send_sig(SIGPIPE, current, 0);
1094             return -EPIPE;
1095         }
1096         if (pipe->nrbufs != pipe->buffers)
1097             return 0;
1098         if (flags & SPLICE_F_NONBLOCK)
1099             return -EAGAIN;
1100         if (signal_pending(current))
1101             return -ERESTARTSYS;
1102         pipe->waiting_writers++;
1103         pipe_wait(pipe);
1104         pipe->waiting_writers--;
1105     }
1106 }
1107 
1108 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1109                    struct pipe_inode_info *opipe,
1110                    size_t len, unsigned int flags);
1111 
1112 /*
1113  * Determine where to splice to/from.
1114  */
1115 static long do_splice(struct file *in, loff_t __user *off_in,
1116               struct file *out, loff_t __user *off_out,
1117               size_t len, unsigned int flags)
1118 {
1119     struct pipe_inode_info *ipipe;
1120     struct pipe_inode_info *opipe;
1121     loff_t offset;
1122     long ret;
1123 
1124     ipipe = get_pipe_info(in);
1125     opipe = get_pipe_info(out);
1126 
1127     if (ipipe && opipe) {
1128         if (off_in || off_out)
1129             return -ESPIPE;
1130 
1131         if (!(in->f_mode & FMODE_READ))
1132             return -EBADF;
1133 
1134         if (!(out->f_mode & FMODE_WRITE))
1135             return -EBADF;
1136 
1137         /* Splicing to self would be fun, but... */
1138         if (ipipe == opipe)
1139             return -EINVAL;
1140 
1141         return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1142     }
1143 
1144     if (ipipe) {
1145         if (off_in)
1146             return -ESPIPE;
1147         if (off_out) {
1148             if (!(out->f_mode & FMODE_PWRITE))
1149                 return -EINVAL;
1150             if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1151                 return -EFAULT;
1152         } else {
1153             offset = out->f_pos;
1154         }
1155 
1156         if (unlikely(!(out->f_mode & FMODE_WRITE)))
1157             return -EBADF;
1158 
1159         if (unlikely(out->f_flags & O_APPEND))
1160             return -EINVAL;
1161 
1162         ret = rw_verify_area(WRITE, out, &offset, len);
1163         if (unlikely(ret < 0))
1164             return ret;
1165 
1166         file_start_write(out);
1167         ret = do_splice_from(ipipe, out, &offset, len, flags);
1168         file_end_write(out);
1169 
1170         if (!off_out)
1171             out->f_pos = offset;
1172         else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
1173             ret = -EFAULT;
1174 
1175         return ret;
1176     }
1177 
1178     if (opipe) {
1179         if (off_out)
1180             return -ESPIPE;
1181         if (off_in) {
1182             if (!(in->f_mode & FMODE_PREAD))
1183                 return -EINVAL;
1184             if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1185                 return -EFAULT;
1186         } else {
1187             offset = in->f_pos;
1188         }
1189 
1190         pipe_lock(opipe);
1191         ret = wait_for_space(opipe, flags);
1192         if (!ret)
1193             ret = do_splice_to(in, &offset, opipe, len, flags);
1194         pipe_unlock(opipe);
1195         if (ret > 0)
1196             wakeup_pipe_readers(opipe);
1197         if (!off_in)
1198             in->f_pos = offset;
1199         else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
1200             ret = -EFAULT;
1201 
1202         return ret;
1203     }
1204 
1205     return -EINVAL;
1206 }
1207 
1208 static int iter_to_pipe(struct iov_iter *from,
1209             struct pipe_inode_info *pipe,
1210             unsigned flags)
1211 {
1212     struct pipe_buffer buf = {
1213         .ops = &user_page_pipe_buf_ops,
1214         .flags = flags
1215     };
1216     size_t total = 0;
1217     int ret = 0;
1218     bool failed = false;
1219 
1220     while (iov_iter_count(from) && !failed) {
1221         struct page *pages[16];
1222         ssize_t copied;
1223         size_t start;
1224         int n;
1225 
1226         copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start);
1227         if (copied <= 0) {
1228             ret = copied;
1229             break;
1230         }
1231 
1232         for (n = 0; copied; n++, start = 0) {
1233             int size = min_t(int, copied, PAGE_SIZE - start);
1234             if (!failed) {
1235                 buf.page = pages[n];
1236                 buf.offset = start;
1237                 buf.len = size;
1238                 ret = add_to_pipe(pipe, &buf);
1239                 if (unlikely(ret < 0)) {
1240                     failed = true;
1241                 } else {
1242                     iov_iter_advance(from, ret);
1243                     total += ret;
1244                 }
1245             } else {
1246                 put_page(pages[n]);
1247             }
1248             copied -= size;
1249         }
1250     }
1251     return total ? total : ret;
1252 }
1253 
1254 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1255             struct splice_desc *sd)
1256 {
1257     int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1258     return n == sd->len ? n : -EFAULT;
1259 }
1260 
1261 /*
1262  * For lack of a better implementation, implement vmsplice() to userspace
1263  * as a simple copy of the pipes pages to the user iov.
1264  */
1265 static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
1266                  unsigned long nr_segs, unsigned int flags)
1267 {
1268     struct pipe_inode_info *pipe;
1269     struct splice_desc sd;
1270     long ret;
1271     struct iovec iovstack[UIO_FASTIOV];
1272     struct iovec *iov = iovstack;
1273     struct iov_iter iter;
1274 
1275     pipe = get_pipe_info(file);
1276     if (!pipe)
1277         return -EBADF;
1278 
1279     ret = import_iovec(READ, uiov, nr_segs,
1280                ARRAY_SIZE(iovstack), &iov, &iter);
1281     if (ret < 0)
1282         return ret;
1283 
1284     sd.total_len = iov_iter_count(&iter);
1285     sd.len = 0;
1286     sd.flags = flags;
1287     sd.u.data = &iter;
1288     sd.pos = 0;
1289 
1290     if (sd.total_len) {
1291         pipe_lock(pipe);
1292         ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1293         pipe_unlock(pipe);
1294     }
1295 
1296     kfree(iov);
1297     return ret;
1298 }
1299 
1300 /*
1301  * vmsplice splices a user address range into a pipe. It can be thought of
1302  * as splice-from-memory, where the regular splice is splice-from-file (or
1303  * to file). In both cases the output is a pipe, naturally.
1304  */
1305 static long vmsplice_to_pipe(struct file *file, const struct iovec __user *uiov,
1306                  unsigned long nr_segs, unsigned int flags)
1307 {
1308     struct pipe_inode_info *pipe;
1309     struct iovec iovstack[UIO_FASTIOV];
1310     struct iovec *iov = iovstack;
1311     struct iov_iter from;
1312     long ret;
1313     unsigned buf_flag = 0;
1314 
1315     if (flags & SPLICE_F_GIFT)
1316         buf_flag = PIPE_BUF_FLAG_GIFT;
1317 
1318     pipe = get_pipe_info(file);
1319     if (!pipe)
1320         return -EBADF;
1321 
1322     ret = import_iovec(WRITE, uiov, nr_segs,
1323                ARRAY_SIZE(iovstack), &iov, &from);
1324     if (ret < 0)
1325         return ret;
1326 
1327     pipe_lock(pipe);
1328     ret = wait_for_space(pipe, flags);
1329     if (!ret)
1330         ret = iter_to_pipe(&from, pipe, buf_flag);
1331     pipe_unlock(pipe);
1332     if (ret > 0)
1333         wakeup_pipe_readers(pipe);
1334     kfree(iov);
1335     return ret;
1336 }
1337 
1338 /*
1339  * Note that vmsplice only really supports true splicing _from_ user memory
1340  * to a pipe, not the other way around. Splicing from user memory is a simple
1341  * operation that can be supported without any funky alignment restrictions
1342  * or nasty vm tricks. We simply map in the user memory and fill them into
1343  * a pipe. The reverse isn't quite as easy, though. There are two possible
1344  * solutions for that:
1345  *
1346  *  - memcpy() the data internally, at which point we might as well just
1347  *    do a regular read() on the buffer anyway.
1348  *  - Lots of nasty vm tricks, that are neither fast nor flexible (it
1349  *    has restriction limitations on both ends of the pipe).
1350  *
1351  * Currently we punt and implement it as a normal copy, see pipe_to_user().
1352  *
1353  */
1354 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1355         unsigned long, nr_segs, unsigned int, flags)
1356 {
1357     struct fd f;
1358     long error;
1359 
1360     if (unlikely(nr_segs > UIO_MAXIOV))
1361         return -EINVAL;
1362     else if (unlikely(!nr_segs))
1363         return 0;
1364 
1365     error = -EBADF;
1366     f = fdget(fd);
1367     if (f.file) {
1368         if (f.file->f_mode & FMODE_WRITE)
1369             error = vmsplice_to_pipe(f.file, iov, nr_segs, flags);
1370         else if (f.file->f_mode & FMODE_READ)
1371             error = vmsplice_to_user(f.file, iov, nr_segs, flags);
1372 
1373         fdput(f);
1374     }
1375 
1376     return error;
1377 }
1378 
1379 #ifdef CONFIG_COMPAT
1380 COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
1381             unsigned int, nr_segs, unsigned int, flags)
1382 {
1383     unsigned i;
1384     struct iovec __user *iov;
1385     if (nr_segs > UIO_MAXIOV)
1386         return -EINVAL;
1387     iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
1388     for (i = 0; i < nr_segs; i++) {
1389         struct compat_iovec v;
1390         if (get_user(v.iov_base, &iov32[i].iov_base) ||
1391             get_user(v.iov_len, &iov32[i].iov_len) ||
1392             put_user(compat_ptr(v.iov_base), &iov[i].iov_base) ||
1393             put_user(v.iov_len, &iov[i].iov_len))
1394             return -EFAULT;
1395     }
1396     return sys_vmsplice(fd, iov, nr_segs, flags);
1397 }
1398 #endif
1399 
1400 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1401         int, fd_out, loff_t __user *, off_out,
1402         size_t, len, unsigned int, flags)
1403 {
1404     struct fd in, out;
1405     long error;
1406 
1407     if (unlikely(!len))
1408         return 0;
1409 
1410     error = -EBADF;
1411     in = fdget(fd_in);
1412     if (in.file) {
1413         if (in.file->f_mode & FMODE_READ) {
1414             out = fdget(fd_out);
1415             if (out.file) {
1416                 if (out.file->f_mode & FMODE_WRITE)
1417                     error = do_splice(in.file, off_in,
1418                               out.file, off_out,
1419                               len, flags);
1420                 fdput(out);
1421             }
1422         }
1423         fdput(in);
1424     }
1425     return error;
1426 }
1427 
1428 /*
1429  * Make sure there's data to read. Wait for input if we can, otherwise
1430  * return an appropriate error.
1431  */
1432 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1433 {
1434     int ret;
1435 
1436     /*
1437      * Check ->nrbufs without the inode lock first. This function
1438      * is speculative anyways, so missing one is ok.
1439      */
1440     if (pipe->nrbufs)
1441         return 0;
1442 
1443     ret = 0;
1444     pipe_lock(pipe);
1445 
1446     while (!pipe->nrbufs) {
1447         if (signal_pending(current)) {
1448             ret = -ERESTARTSYS;
1449             break;
1450         }
1451         if (!pipe->writers)
1452             break;
1453         if (!pipe->waiting_writers) {
1454             if (flags & SPLICE_F_NONBLOCK) {
1455                 ret = -EAGAIN;
1456                 break;
1457             }
1458         }
1459         pipe_wait(pipe);
1460     }
1461 
1462     pipe_unlock(pipe);
1463     return ret;
1464 }
1465 
1466 /*
1467  * Make sure there's writeable room. Wait for room if we can, otherwise
1468  * return an appropriate error.
1469  */
1470 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1471 {
1472     int ret;
1473 
1474     /*
1475      * Check ->nrbufs without the inode lock first. This function
1476      * is speculative anyways, so missing one is ok.
1477      */
1478     if (pipe->nrbufs < pipe->buffers)
1479         return 0;
1480 
1481     ret = 0;
1482     pipe_lock(pipe);
1483 
1484     while (pipe->nrbufs >= pipe->buffers) {
1485         if (!pipe->readers) {
1486             send_sig(SIGPIPE, current, 0);
1487             ret = -EPIPE;
1488             break;
1489         }
1490         if (flags & SPLICE_F_NONBLOCK) {
1491             ret = -EAGAIN;
1492             break;
1493         }
1494         if (signal_pending(current)) {
1495             ret = -ERESTARTSYS;
1496             break;
1497         }
1498         pipe->waiting_writers++;
1499         pipe_wait(pipe);
1500         pipe->waiting_writers--;
1501     }
1502 
1503     pipe_unlock(pipe);
1504     return ret;
1505 }
1506 
1507 /*
1508  * Splice contents of ipipe to opipe.
1509  */
1510 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1511                    struct pipe_inode_info *opipe,
1512                    size_t len, unsigned int flags)
1513 {
1514     struct pipe_buffer *ibuf, *obuf;
1515     int ret = 0, nbuf;
1516     bool input_wakeup = false;
1517 
1518 
1519 retry:
1520     ret = ipipe_prep(ipipe, flags);
1521     if (ret)
1522         return ret;
1523 
1524     ret = opipe_prep(opipe, flags);
1525     if (ret)
1526         return ret;
1527 
1528     /*
1529      * Potential ABBA deadlock, work around it by ordering lock
1530      * grabbing by pipe info address. Otherwise two different processes
1531      * could deadlock (one doing tee from A -> B, the other from B -> A).
1532      */
1533     pipe_double_lock(ipipe, opipe);
1534 
1535     do {
1536         if (!opipe->readers) {
1537             send_sig(SIGPIPE, current, 0);
1538             if (!ret)
1539                 ret = -EPIPE;
1540             break;
1541         }
1542 
1543         if (!ipipe->nrbufs && !ipipe->writers)
1544             break;
1545 
1546         /*
1547          * Cannot make any progress, because either the input
1548          * pipe is empty or the output pipe is full.
1549          */
1550         if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
1551             /* Already processed some buffers, break */
1552             if (ret)
1553                 break;
1554 
1555             if (flags & SPLICE_F_NONBLOCK) {
1556                 ret = -EAGAIN;
1557                 break;
1558             }
1559 
1560             /*
1561              * We raced with another reader/writer and haven't
1562              * managed to process any buffers.  A zero return
1563              * value means EOF, so retry instead.
1564              */
1565             pipe_unlock(ipipe);
1566             pipe_unlock(opipe);
1567             goto retry;
1568         }
1569 
1570         ibuf = ipipe->bufs + ipipe->curbuf;
1571         nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1572         obuf = opipe->bufs + nbuf;
1573 
1574         if (len >= ibuf->len) {
1575             /*
1576              * Simply move the whole buffer from ipipe to opipe
1577              */
1578             *obuf = *ibuf;
1579             ibuf->ops = NULL;
1580             opipe->nrbufs++;
1581             ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
1582             ipipe->nrbufs--;
1583             input_wakeup = true;
1584         } else {
1585             /*
1586              * Get a reference to this pipe buffer,
1587              * so we can copy the contents over.
1588              */
1589             pipe_buf_get(ipipe, ibuf);
1590             *obuf = *ibuf;
1591 
1592             /*
1593              * Don't inherit the gift flag, we need to
1594              * prevent multiple steals of this page.
1595              */
1596             obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1597 
1598             obuf->len = len;
1599             opipe->nrbufs++;
1600             ibuf->offset += obuf->len;
1601             ibuf->len -= obuf->len;
1602         }
1603         ret += obuf->len;
1604         len -= obuf->len;
1605     } while (len);
1606 
1607     pipe_unlock(ipipe);
1608     pipe_unlock(opipe);
1609 
1610     /*
1611      * If we put data in the output pipe, wakeup any potential readers.
1612      */
1613     if (ret > 0)
1614         wakeup_pipe_readers(opipe);
1615 
1616     if (input_wakeup)
1617         wakeup_pipe_writers(ipipe);
1618 
1619     return ret;
1620 }
1621 
1622 /*
1623  * Link contents of ipipe to opipe.
1624  */
1625 static int link_pipe(struct pipe_inode_info *ipipe,
1626              struct pipe_inode_info *opipe,
1627              size_t len, unsigned int flags)
1628 {
1629     struct pipe_buffer *ibuf, *obuf;
1630     int ret = 0, i = 0, nbuf;
1631 
1632     /*
1633      * Potential ABBA deadlock, work around it by ordering lock
1634      * grabbing by pipe info address. Otherwise two different processes
1635      * could deadlock (one doing tee from A -> B, the other from B -> A).
1636      */
1637     pipe_double_lock(ipipe, opipe);
1638 
1639     do {
1640         if (!opipe->readers) {
1641             send_sig(SIGPIPE, current, 0);
1642             if (!ret)
1643                 ret = -EPIPE;
1644             break;
1645         }
1646 
1647         /*
1648          * If we have iterated all input buffers or ran out of
1649          * output room, break.
1650          */
1651         if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
1652             break;
1653 
1654         ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
1655         nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1656 
1657         /*
1658          * Get a reference to this pipe buffer,
1659          * so we can copy the contents over.
1660          */
1661         pipe_buf_get(ipipe, ibuf);
1662 
1663         obuf = opipe->bufs + nbuf;
1664         *obuf = *ibuf;
1665 
1666         /*
1667          * Don't inherit the gift flag, we need to
1668          * prevent multiple steals of this page.
1669          */
1670         obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1671 
1672         if (obuf->len > len)
1673             obuf->len = len;
1674 
1675         opipe->nrbufs++;
1676         ret += obuf->len;
1677         len -= obuf->len;
1678         i++;
1679     } while (len);
1680 
1681     /*
1682      * return EAGAIN if we have the potential of some data in the
1683      * future, otherwise just return 0
1684      */
1685     if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1686         ret = -EAGAIN;
1687 
1688     pipe_unlock(ipipe);
1689     pipe_unlock(opipe);
1690 
1691     /*
1692      * If we put data in the output pipe, wakeup any potential readers.
1693      */
1694     if (ret > 0)
1695         wakeup_pipe_readers(opipe);
1696 
1697     return ret;
1698 }
1699 
1700 /*
1701  * This is a tee(1) implementation that works on pipes. It doesn't copy
1702  * any data, it simply references the 'in' pages on the 'out' pipe.
1703  * The 'flags' used are the SPLICE_F_* variants, currently the only
1704  * applicable one is SPLICE_F_NONBLOCK.
1705  */
1706 static long do_tee(struct file *in, struct file *out, size_t len,
1707            unsigned int flags)
1708 {
1709     struct pipe_inode_info *ipipe = get_pipe_info(in);
1710     struct pipe_inode_info *opipe = get_pipe_info(out);
1711     int ret = -EINVAL;
1712 
1713     /*
1714      * Duplicate the contents of ipipe to opipe without actually
1715      * copying the data.
1716      */
1717     if (ipipe && opipe && ipipe != opipe) {
1718         /*
1719          * Keep going, unless we encounter an error. The ipipe/opipe
1720          * ordering doesn't really matter.
1721          */
1722         ret = ipipe_prep(ipipe, flags);
1723         if (!ret) {
1724             ret = opipe_prep(opipe, flags);
1725             if (!ret)
1726                 ret = link_pipe(ipipe, opipe, len, flags);
1727         }
1728     }
1729 
1730     return ret;
1731 }
1732 
1733 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1734 {
1735     struct fd in;
1736     int error;
1737 
1738     if (unlikely(!len))
1739         return 0;
1740 
1741     error = -EBADF;
1742     in = fdget(fdin);
1743     if (in.file) {
1744         if (in.file->f_mode & FMODE_READ) {
1745             struct fd out = fdget(fdout);
1746             if (out.file) {
1747                 if (out.file->f_mode & FMODE_WRITE)
1748                     error = do_tee(in.file, out.file,
1749                             len, flags);
1750                 fdput(out);
1751             }
1752         }
1753         fdput(in);
1754     }
1755 
1756     return error;
1757 }