Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * "splice": joining two ropes together by interweaving their strands.
0004  *
0005  * This is the "extended pipe" functionality, where a pipe is used as
0006  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
0007  * buffer that you can use to transfer data from one end to the other.
0008  *
0009  * The traditional unix read/write is extended with a "splice()" operation
0010  * that transfers data buffers to or from a pipe buffer.
0011  *
0012  * Named by Larry McVoy, original implementation from Linus, extended by
0013  * Jens to support splicing to files, network, direct splicing, etc and
0014  * fixing lots of bugs.
0015  *
0016  * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
0017  * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
0018  * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
0019  *
0020  */
0021 #include <linux/bvec.h>
0022 #include <linux/fs.h>
0023 #include <linux/file.h>
0024 #include <linux/pagemap.h>
0025 #include <linux/splice.h>
0026 #include <linux/memcontrol.h>
0027 #include <linux/mm_inline.h>
0028 #include <linux/swap.h>
0029 #include <linux/writeback.h>
0030 #include <linux/export.h>
0031 #include <linux/syscalls.h>
0032 #include <linux/uio.h>
0033 #include <linux/security.h>
0034 #include <linux/gfp.h>
0035 #include <linux/socket.h>
0036 #include <linux/sched/signal.h>
0037 
0038 #include "internal.h"
0039 
0040 /*
0041  * Attempt to steal a page from a pipe buffer. This should perhaps go into
0042  * a vm helper function, it's already simplified quite a bit by the
0043  * addition of remove_mapping(). If success is returned, the caller may
0044  * attempt to reuse this page for another destination.
0045  */
0046 static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
0047         struct pipe_buffer *buf)
0048 {
0049     struct folio *folio = page_folio(buf->page);
0050     struct address_space *mapping;
0051 
0052     folio_lock(folio);
0053 
0054     mapping = folio_mapping(folio);
0055     if (mapping) {
0056         WARN_ON(!folio_test_uptodate(folio));
0057 
0058         /*
0059          * At least for ext2 with nobh option, we need to wait on
0060          * writeback completing on this folio, since we'll remove it
0061          * from the pagecache.  Otherwise truncate wont wait on the
0062          * folio, allowing the disk blocks to be reused by someone else
0063          * before we actually wrote our data to them. fs corruption
0064          * ensues.
0065          */
0066         folio_wait_writeback(folio);
0067 
0068         if (folio_has_private(folio) &&
0069             !filemap_release_folio(folio, GFP_KERNEL))
0070             goto out_unlock;
0071 
0072         /*
0073          * If we succeeded in removing the mapping, set LRU flag
0074          * and return good.
0075          */
0076         if (remove_mapping(mapping, folio)) {
0077             buf->flags |= PIPE_BUF_FLAG_LRU;
0078             return true;
0079         }
0080     }
0081 
0082     /*
0083      * Raced with truncate or failed to remove folio from current
0084      * address space, unlock and return failure.
0085      */
0086 out_unlock:
0087     folio_unlock(folio);
0088     return false;
0089 }
0090 
0091 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
0092                     struct pipe_buffer *buf)
0093 {
0094     put_page(buf->page);
0095     buf->flags &= ~PIPE_BUF_FLAG_LRU;
0096 }
0097 
0098 /*
0099  * Check whether the contents of buf is OK to access. Since the content
0100  * is a page cache page, IO may be in flight.
0101  */
0102 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
0103                        struct pipe_buffer *buf)
0104 {
0105     struct page *page = buf->page;
0106     int err;
0107 
0108     if (!PageUptodate(page)) {
0109         lock_page(page);
0110 
0111         /*
0112          * Page got truncated/unhashed. This will cause a 0-byte
0113          * splice, if this is the first page.
0114          */
0115         if (!page->mapping) {
0116             err = -ENODATA;
0117             goto error;
0118         }
0119 
0120         /*
0121          * Uh oh, read-error from disk.
0122          */
0123         if (!PageUptodate(page)) {
0124             err = -EIO;
0125             goto error;
0126         }
0127 
0128         /*
0129          * Page is ok afterall, we are done.
0130          */
0131         unlock_page(page);
0132     }
0133 
0134     return 0;
0135 error:
0136     unlock_page(page);
0137     return err;
0138 }
0139 
0140 const struct pipe_buf_operations page_cache_pipe_buf_ops = {
0141     .confirm    = page_cache_pipe_buf_confirm,
0142     .release    = page_cache_pipe_buf_release,
0143     .try_steal  = page_cache_pipe_buf_try_steal,
0144     .get        = generic_pipe_buf_get,
0145 };
0146 
0147 static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
0148         struct pipe_buffer *buf)
0149 {
0150     if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
0151         return false;
0152 
0153     buf->flags |= PIPE_BUF_FLAG_LRU;
0154     return generic_pipe_buf_try_steal(pipe, buf);
0155 }
0156 
0157 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
0158     .release    = page_cache_pipe_buf_release,
0159     .try_steal  = user_page_pipe_buf_try_steal,
0160     .get        = generic_pipe_buf_get,
0161 };
0162 
0163 static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
0164 {
0165     smp_mb();
0166     if (waitqueue_active(&pipe->rd_wait))
0167         wake_up_interruptible(&pipe->rd_wait);
0168     kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
0169 }
0170 
0171 /**
0172  * splice_to_pipe - fill passed data into a pipe
0173  * @pipe:   pipe to fill
0174  * @spd:    data to fill
0175  *
0176  * Description:
0177  *    @spd contains a map of pages and len/offset tuples, along with
0178  *    the struct pipe_buf_operations associated with these pages. This
0179  *    function will link that data to the pipe.
0180  *
0181  */
0182 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
0183                struct splice_pipe_desc *spd)
0184 {
0185     unsigned int spd_pages = spd->nr_pages;
0186     unsigned int tail = pipe->tail;
0187     unsigned int head = pipe->head;
0188     unsigned int mask = pipe->ring_size - 1;
0189     int ret = 0, page_nr = 0;
0190 
0191     if (!spd_pages)
0192         return 0;
0193 
0194     if (unlikely(!pipe->readers)) {
0195         send_sig(SIGPIPE, current, 0);
0196         ret = -EPIPE;
0197         goto out;
0198     }
0199 
0200     while (!pipe_full(head, tail, pipe->max_usage)) {
0201         struct pipe_buffer *buf = &pipe->bufs[head & mask];
0202 
0203         buf->page = spd->pages[page_nr];
0204         buf->offset = spd->partial[page_nr].offset;
0205         buf->len = spd->partial[page_nr].len;
0206         buf->private = spd->partial[page_nr].private;
0207         buf->ops = spd->ops;
0208         buf->flags = 0;
0209 
0210         head++;
0211         pipe->head = head;
0212         page_nr++;
0213         ret += buf->len;
0214 
0215         if (!--spd->nr_pages)
0216             break;
0217     }
0218 
0219     if (!ret)
0220         ret = -EAGAIN;
0221 
0222 out:
0223     while (page_nr < spd_pages)
0224         spd->spd_release(spd, page_nr++);
0225 
0226     return ret;
0227 }
0228 EXPORT_SYMBOL_GPL(splice_to_pipe);
0229 
0230 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
0231 {
0232     unsigned int head = pipe->head;
0233     unsigned int tail = pipe->tail;
0234     unsigned int mask = pipe->ring_size - 1;
0235     int ret;
0236 
0237     if (unlikely(!pipe->readers)) {
0238         send_sig(SIGPIPE, current, 0);
0239         ret = -EPIPE;
0240     } else if (pipe_full(head, tail, pipe->max_usage)) {
0241         ret = -EAGAIN;
0242     } else {
0243         pipe->bufs[head & mask] = *buf;
0244         pipe->head = head + 1;
0245         return buf->len;
0246     }
0247     pipe_buf_release(pipe, buf);
0248     return ret;
0249 }
0250 EXPORT_SYMBOL(add_to_pipe);
0251 
0252 /*
0253  * Check if we need to grow the arrays holding pages and partial page
0254  * descriptions.
0255  */
0256 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
0257 {
0258     unsigned int max_usage = READ_ONCE(pipe->max_usage);
0259 
0260     spd->nr_pages_max = max_usage;
0261     if (max_usage <= PIPE_DEF_BUFFERS)
0262         return 0;
0263 
0264     spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
0265     spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
0266                      GFP_KERNEL);
0267 
0268     if (spd->pages && spd->partial)
0269         return 0;
0270 
0271     kfree(spd->pages);
0272     kfree(spd->partial);
0273     return -ENOMEM;
0274 }
0275 
0276 void splice_shrink_spd(struct splice_pipe_desc *spd)
0277 {
0278     if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
0279         return;
0280 
0281     kfree(spd->pages);
0282     kfree(spd->partial);
0283 }
0284 
0285 /**
0286  * generic_file_splice_read - splice data from file to a pipe
0287  * @in:     file to splice from
0288  * @ppos:   position in @in
0289  * @pipe:   pipe to splice to
0290  * @len:    number of bytes to splice
0291  * @flags:  splice modifier flags
0292  *
0293  * Description:
0294  *    Will read pages from given file and fill them into a pipe. Can be
0295  *    used as long as it has more or less sane ->read_iter().
0296  *
0297  */
0298 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
0299                  struct pipe_inode_info *pipe, size_t len,
0300                  unsigned int flags)
0301 {
0302     struct iov_iter to;
0303     struct kiocb kiocb;
0304     int ret;
0305 
0306     iov_iter_pipe(&to, READ, pipe, len);
0307     init_sync_kiocb(&kiocb, in);
0308     kiocb.ki_pos = *ppos;
0309     ret = call_read_iter(in, &kiocb, &to);
0310     if (ret > 0) {
0311         *ppos = kiocb.ki_pos;
0312         file_accessed(in);
0313     } else if (ret < 0) {
0314         /* free what was emitted */
0315         pipe_discard_from(pipe, to.start_head);
0316         /*
0317          * callers of ->splice_read() expect -EAGAIN on
0318          * "can't put anything in there", rather than -EFAULT.
0319          */
0320         if (ret == -EFAULT)
0321             ret = -EAGAIN;
0322     }
0323 
0324     return ret;
0325 }
0326 EXPORT_SYMBOL(generic_file_splice_read);
0327 
0328 const struct pipe_buf_operations default_pipe_buf_ops = {
0329     .release    = generic_pipe_buf_release,
0330     .try_steal  = generic_pipe_buf_try_steal,
0331     .get        = generic_pipe_buf_get,
0332 };
0333 
0334 /* Pipe buffer operations for a socket and similar. */
0335 const struct pipe_buf_operations nosteal_pipe_buf_ops = {
0336     .release    = generic_pipe_buf_release,
0337     .get        = generic_pipe_buf_get,
0338 };
0339 EXPORT_SYMBOL(nosteal_pipe_buf_ops);
0340 
0341 /*
0342  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
0343  * using sendpage(). Return the number of bytes sent.
0344  */
0345 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
0346                 struct pipe_buffer *buf, struct splice_desc *sd)
0347 {
0348     struct file *file = sd->u.file;
0349     loff_t pos = sd->pos;
0350     int more;
0351 
0352     if (!likely(file->f_op->sendpage))
0353         return -EINVAL;
0354 
0355     more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
0356 
0357     if (sd->len < sd->total_len &&
0358         pipe_occupancy(pipe->head, pipe->tail) > 1)
0359         more |= MSG_SENDPAGE_NOTLAST;
0360 
0361     return file->f_op->sendpage(file, buf->page, buf->offset,
0362                     sd->len, &pos, more);
0363 }
0364 
0365 static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
0366 {
0367     smp_mb();
0368     if (waitqueue_active(&pipe->wr_wait))
0369         wake_up_interruptible(&pipe->wr_wait);
0370     kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
0371 }
0372 
0373 /**
0374  * splice_from_pipe_feed - feed available data from a pipe to a file
0375  * @pipe:   pipe to splice from
0376  * @sd:     information to @actor
0377  * @actor:  handler that splices the data
0378  *
0379  * Description:
0380  *    This function loops over the pipe and calls @actor to do the
0381  *    actual moving of a single struct pipe_buffer to the desired
0382  *    destination.  It returns when there's no more buffers left in
0383  *    the pipe or if the requested number of bytes (@sd->total_len)
0384  *    have been copied.  It returns a positive number (one) if the
0385  *    pipe needs to be filled with more data, zero if the required
0386  *    number of bytes have been copied and -errno on error.
0387  *
0388  *    This, together with splice_from_pipe_{begin,end,next}, may be
0389  *    used to implement the functionality of __splice_from_pipe() when
0390  *    locking is required around copying the pipe buffers to the
0391  *    destination.
0392  */
0393 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
0394               splice_actor *actor)
0395 {
0396     unsigned int head = pipe->head;
0397     unsigned int tail = pipe->tail;
0398     unsigned int mask = pipe->ring_size - 1;
0399     int ret;
0400 
0401     while (!pipe_empty(head, tail)) {
0402         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
0403 
0404         sd->len = buf->len;
0405         if (sd->len > sd->total_len)
0406             sd->len = sd->total_len;
0407 
0408         ret = pipe_buf_confirm(pipe, buf);
0409         if (unlikely(ret)) {
0410             if (ret == -ENODATA)
0411                 ret = 0;
0412             return ret;
0413         }
0414 
0415         ret = actor(pipe, buf, sd);
0416         if (ret <= 0)
0417             return ret;
0418 
0419         buf->offset += ret;
0420         buf->len -= ret;
0421 
0422         sd->num_spliced += ret;
0423         sd->len -= ret;
0424         sd->pos += ret;
0425         sd->total_len -= ret;
0426 
0427         if (!buf->len) {
0428             pipe_buf_release(pipe, buf);
0429             tail++;
0430             pipe->tail = tail;
0431             if (pipe->files)
0432                 sd->need_wakeup = true;
0433         }
0434 
0435         if (!sd->total_len)
0436             return 0;
0437     }
0438 
0439     return 1;
0440 }
0441 
0442 /* We know we have a pipe buffer, but maybe it's empty? */
0443 static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
0444 {
0445     unsigned int tail = pipe->tail;
0446     unsigned int mask = pipe->ring_size - 1;
0447     struct pipe_buffer *buf = &pipe->bufs[tail & mask];
0448 
0449     if (unlikely(!buf->len)) {
0450         pipe_buf_release(pipe, buf);
0451         pipe->tail = tail+1;
0452         return true;
0453     }
0454 
0455     return false;
0456 }
0457 
0458 /**
0459  * splice_from_pipe_next - wait for some data to splice from
0460  * @pipe:   pipe to splice from
0461  * @sd:     information about the splice operation
0462  *
0463  * Description:
0464  *    This function will wait for some data and return a positive
0465  *    value (one) if pipe buffers are available.  It will return zero
0466  *    or -errno if no more data needs to be spliced.
0467  */
0468 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
0469 {
0470     /*
0471      * Check for signal early to make process killable when there are
0472      * always buffers available
0473      */
0474     if (signal_pending(current))
0475         return -ERESTARTSYS;
0476 
0477 repeat:
0478     while (pipe_empty(pipe->head, pipe->tail)) {
0479         if (!pipe->writers)
0480             return 0;
0481 
0482         if (sd->num_spliced)
0483             return 0;
0484 
0485         if (sd->flags & SPLICE_F_NONBLOCK)
0486             return -EAGAIN;
0487 
0488         if (signal_pending(current))
0489             return -ERESTARTSYS;
0490 
0491         if (sd->need_wakeup) {
0492             wakeup_pipe_writers(pipe);
0493             sd->need_wakeup = false;
0494         }
0495 
0496         pipe_wait_readable(pipe);
0497     }
0498 
0499     if (eat_empty_buffer(pipe))
0500         goto repeat;
0501 
0502     return 1;
0503 }
0504 
0505 /**
0506  * splice_from_pipe_begin - start splicing from pipe
0507  * @sd:     information about the splice operation
0508  *
0509  * Description:
0510  *    This function should be called before a loop containing
0511  *    splice_from_pipe_next() and splice_from_pipe_feed() to
0512  *    initialize the necessary fields of @sd.
0513  */
0514 static void splice_from_pipe_begin(struct splice_desc *sd)
0515 {
0516     sd->num_spliced = 0;
0517     sd->need_wakeup = false;
0518 }
0519 
0520 /**
0521  * splice_from_pipe_end - finish splicing from pipe
0522  * @pipe:   pipe to splice from
0523  * @sd:     information about the splice operation
0524  *
0525  * Description:
0526  *    This function will wake up pipe writers if necessary.  It should
0527  *    be called after a loop containing splice_from_pipe_next() and
0528  *    splice_from_pipe_feed().
0529  */
0530 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
0531 {
0532     if (sd->need_wakeup)
0533         wakeup_pipe_writers(pipe);
0534 }
0535 
0536 /**
0537  * __splice_from_pipe - splice data from a pipe to given actor
0538  * @pipe:   pipe to splice from
0539  * @sd:     information to @actor
0540  * @actor:  handler that splices the data
0541  *
0542  * Description:
0543  *    This function does little more than loop over the pipe and call
0544  *    @actor to do the actual moving of a single struct pipe_buffer to
0545  *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
0546  *    pipe_to_user.
0547  *
0548  */
0549 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
0550                splice_actor *actor)
0551 {
0552     int ret;
0553 
0554     splice_from_pipe_begin(sd);
0555     do {
0556         cond_resched();
0557         ret = splice_from_pipe_next(pipe, sd);
0558         if (ret > 0)
0559             ret = splice_from_pipe_feed(pipe, sd, actor);
0560     } while (ret > 0);
0561     splice_from_pipe_end(pipe, sd);
0562 
0563     return sd->num_spliced ? sd->num_spliced : ret;
0564 }
0565 EXPORT_SYMBOL(__splice_from_pipe);
0566 
0567 /**
0568  * splice_from_pipe - splice data from a pipe to a file
0569  * @pipe:   pipe to splice from
0570  * @out:    file to splice to
0571  * @ppos:   position in @out
0572  * @len:    how many bytes to splice
0573  * @flags:  splice modifier flags
0574  * @actor:  handler that splices the data
0575  *
0576  * Description:
0577  *    See __splice_from_pipe. This function locks the pipe inode,
0578  *    otherwise it's identical to __splice_from_pipe().
0579  *
0580  */
0581 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
0582              loff_t *ppos, size_t len, unsigned int flags,
0583              splice_actor *actor)
0584 {
0585     ssize_t ret;
0586     struct splice_desc sd = {
0587         .total_len = len,
0588         .flags = flags,
0589         .pos = *ppos,
0590         .u.file = out,
0591     };
0592 
0593     pipe_lock(pipe);
0594     ret = __splice_from_pipe(pipe, &sd, actor);
0595     pipe_unlock(pipe);
0596 
0597     return ret;
0598 }
0599 
0600 /**
0601  * iter_file_splice_write - splice data from a pipe to a file
0602  * @pipe:   pipe info
0603  * @out:    file to write to
0604  * @ppos:   position in @out
0605  * @len:    number of bytes to splice
0606  * @flags:  splice modifier flags
0607  *
0608  * Description:
0609  *    Will either move or copy pages (determined by @flags options) from
0610  *    the given pipe inode to the given file.
0611  *    This one is ->write_iter-based.
0612  *
0613  */
0614 ssize_t
0615 iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
0616               loff_t *ppos, size_t len, unsigned int flags)
0617 {
0618     struct splice_desc sd = {
0619         .total_len = len,
0620         .flags = flags,
0621         .pos = *ppos,
0622         .u.file = out,
0623     };
0624     int nbufs = pipe->max_usage;
0625     struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
0626                     GFP_KERNEL);
0627     ssize_t ret;
0628 
0629     if (unlikely(!array))
0630         return -ENOMEM;
0631 
0632     pipe_lock(pipe);
0633 
0634     splice_from_pipe_begin(&sd);
0635     while (sd.total_len) {
0636         struct iov_iter from;
0637         unsigned int head, tail, mask;
0638         size_t left;
0639         int n;
0640 
0641         ret = splice_from_pipe_next(pipe, &sd);
0642         if (ret <= 0)
0643             break;
0644 
0645         if (unlikely(nbufs < pipe->max_usage)) {
0646             kfree(array);
0647             nbufs = pipe->max_usage;
0648             array = kcalloc(nbufs, sizeof(struct bio_vec),
0649                     GFP_KERNEL);
0650             if (!array) {
0651                 ret = -ENOMEM;
0652                 break;
0653             }
0654         }
0655 
0656         head = pipe->head;
0657         tail = pipe->tail;
0658         mask = pipe->ring_size - 1;
0659 
0660         /* build the vector */
0661         left = sd.total_len;
0662         for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
0663             struct pipe_buffer *buf = &pipe->bufs[tail & mask];
0664             size_t this_len = buf->len;
0665 
0666             /* zero-length bvecs are not supported, skip them */
0667             if (!this_len)
0668                 continue;
0669             this_len = min(this_len, left);
0670 
0671             ret = pipe_buf_confirm(pipe, buf);
0672             if (unlikely(ret)) {
0673                 if (ret == -ENODATA)
0674                     ret = 0;
0675                 goto done;
0676             }
0677 
0678             array[n].bv_page = buf->page;
0679             array[n].bv_len = this_len;
0680             array[n].bv_offset = buf->offset;
0681             left -= this_len;
0682             n++;
0683         }
0684 
0685         iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left);
0686         ret = vfs_iter_write(out, &from, &sd.pos, 0);
0687         if (ret <= 0)
0688             break;
0689 
0690         sd.num_spliced += ret;
0691         sd.total_len -= ret;
0692         *ppos = sd.pos;
0693 
0694         /* dismiss the fully eaten buffers, adjust the partial one */
0695         tail = pipe->tail;
0696         while (ret) {
0697             struct pipe_buffer *buf = &pipe->bufs[tail & mask];
0698             if (ret >= buf->len) {
0699                 ret -= buf->len;
0700                 buf->len = 0;
0701                 pipe_buf_release(pipe, buf);
0702                 tail++;
0703                 pipe->tail = tail;
0704                 if (pipe->files)
0705                     sd.need_wakeup = true;
0706             } else {
0707                 buf->offset += ret;
0708                 buf->len -= ret;
0709                 ret = 0;
0710             }
0711         }
0712     }
0713 done:
0714     kfree(array);
0715     splice_from_pipe_end(pipe, &sd);
0716 
0717     pipe_unlock(pipe);
0718 
0719     if (sd.num_spliced)
0720         ret = sd.num_spliced;
0721 
0722     return ret;
0723 }
0724 
0725 EXPORT_SYMBOL(iter_file_splice_write);
0726 
0727 /**
0728  * generic_splice_sendpage - splice data from a pipe to a socket
0729  * @pipe:   pipe to splice from
0730  * @out:    socket to write to
0731  * @ppos:   position in @out
0732  * @len:    number of bytes to splice
0733  * @flags:  splice modifier flags
0734  *
0735  * Description:
0736  *    Will send @len bytes from the pipe to a network socket. No data copying
0737  *    is involved.
0738  *
0739  */
0740 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
0741                 loff_t *ppos, size_t len, unsigned int flags)
0742 {
0743     return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
0744 }
0745 
0746 EXPORT_SYMBOL(generic_splice_sendpage);
0747 
0748 static int warn_unsupported(struct file *file, const char *op)
0749 {
0750     pr_debug_ratelimited(
0751         "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
0752         op, file, current->pid, current->comm);
0753     return -EINVAL;
0754 }
0755 
0756 /*
0757  * Attempt to initiate a splice from pipe to file.
0758  */
0759 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
0760                loff_t *ppos, size_t len, unsigned int flags)
0761 {
0762     if (unlikely(!out->f_op->splice_write))
0763         return warn_unsupported(out, "write");
0764     return out->f_op->splice_write(pipe, out, ppos, len, flags);
0765 }
0766 
0767 /*
0768  * Attempt to initiate a splice from a file to a pipe.
0769  */
0770 static long do_splice_to(struct file *in, loff_t *ppos,
0771              struct pipe_inode_info *pipe, size_t len,
0772              unsigned int flags)
0773 {
0774     unsigned int p_space;
0775     int ret;
0776 
0777     if (unlikely(!(in->f_mode & FMODE_READ)))
0778         return -EBADF;
0779 
0780     /* Don't try to read more the pipe has space for. */
0781     p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
0782     len = min_t(size_t, len, p_space << PAGE_SHIFT);
0783 
0784     ret = rw_verify_area(READ, in, ppos, len);
0785     if (unlikely(ret < 0))
0786         return ret;
0787 
0788     if (unlikely(len > MAX_RW_COUNT))
0789         len = MAX_RW_COUNT;
0790 
0791     if (unlikely(!in->f_op->splice_read))
0792         return warn_unsupported(in, "read");
0793     return in->f_op->splice_read(in, ppos, pipe, len, flags);
0794 }
0795 
0796 /**
0797  * splice_direct_to_actor - splices data directly between two non-pipes
0798  * @in:     file to splice from
0799  * @sd:     actor information on where to splice to
0800  * @actor:  handles the data splicing
0801  *
0802  * Description:
0803  *    This is a special case helper to splice directly between two
0804  *    points, without requiring an explicit pipe. Internally an allocated
0805  *    pipe is cached in the process, and reused during the lifetime of
0806  *    that process.
0807  *
0808  */
0809 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
0810                    splice_direct_actor *actor)
0811 {
0812     struct pipe_inode_info *pipe;
0813     long ret, bytes;
0814     size_t len;
0815     int i, flags, more;
0816 
0817     /*
0818      * We require the input to be seekable, as we don't want to randomly
0819      * drop data for eg socket -> socket splicing. Use the piped splicing
0820      * for that!
0821      */
0822     if (unlikely(!(in->f_mode & FMODE_LSEEK)))
0823         return -EINVAL;
0824 
0825     /*
0826      * neither in nor out is a pipe, setup an internal pipe attached to
0827      * 'out' and transfer the wanted data from 'in' to 'out' through that
0828      */
0829     pipe = current->splice_pipe;
0830     if (unlikely(!pipe)) {
0831         pipe = alloc_pipe_info();
0832         if (!pipe)
0833             return -ENOMEM;
0834 
0835         /*
0836          * We don't have an immediate reader, but we'll read the stuff
0837          * out of the pipe right after the splice_to_pipe(). So set
0838          * PIPE_READERS appropriately.
0839          */
0840         pipe->readers = 1;
0841 
0842         current->splice_pipe = pipe;
0843     }
0844 
0845     /*
0846      * Do the splice.
0847      */
0848     ret = 0;
0849     bytes = 0;
0850     len = sd->total_len;
0851     flags = sd->flags;
0852 
0853     /*
0854      * Don't block on output, we have to drain the direct pipe.
0855      */
0856     sd->flags &= ~SPLICE_F_NONBLOCK;
0857     more = sd->flags & SPLICE_F_MORE;
0858 
0859     WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
0860 
0861     while (len) {
0862         size_t read_len;
0863         loff_t pos = sd->pos, prev_pos = pos;
0864 
0865         ret = do_splice_to(in, &pos, pipe, len, flags);
0866         if (unlikely(ret <= 0))
0867             goto out_release;
0868 
0869         read_len = ret;
0870         sd->total_len = read_len;
0871 
0872         /*
0873          * If more data is pending, set SPLICE_F_MORE
0874          * If this is the last data and SPLICE_F_MORE was not set
0875          * initially, clears it.
0876          */
0877         if (read_len < len)
0878             sd->flags |= SPLICE_F_MORE;
0879         else if (!more)
0880             sd->flags &= ~SPLICE_F_MORE;
0881         /*
0882          * NOTE: nonblocking mode only applies to the input. We
0883          * must not do the output in nonblocking mode as then we
0884          * could get stuck data in the internal pipe:
0885          */
0886         ret = actor(pipe, sd);
0887         if (unlikely(ret <= 0)) {
0888             sd->pos = prev_pos;
0889             goto out_release;
0890         }
0891 
0892         bytes += ret;
0893         len -= ret;
0894         sd->pos = pos;
0895 
0896         if (ret < read_len) {
0897             sd->pos = prev_pos + ret;
0898             goto out_release;
0899         }
0900     }
0901 
0902 done:
0903     pipe->tail = pipe->head = 0;
0904     file_accessed(in);
0905     return bytes;
0906 
0907 out_release:
0908     /*
0909      * If we did an incomplete transfer we must release
0910      * the pipe buffers in question:
0911      */
0912     for (i = 0; i < pipe->ring_size; i++) {
0913         struct pipe_buffer *buf = &pipe->bufs[i];
0914 
0915         if (buf->ops)
0916             pipe_buf_release(pipe, buf);
0917     }
0918 
0919     if (!bytes)
0920         bytes = ret;
0921 
0922     goto done;
0923 }
0924 EXPORT_SYMBOL(splice_direct_to_actor);
0925 
0926 static int direct_splice_actor(struct pipe_inode_info *pipe,
0927                    struct splice_desc *sd)
0928 {
0929     struct file *file = sd->u.file;
0930 
0931     return do_splice_from(pipe, file, sd->opos, sd->total_len,
0932                   sd->flags);
0933 }
0934 
0935 /**
0936  * do_splice_direct - splices data directly between two files
0937  * @in:     file to splice from
0938  * @ppos:   input file offset
0939  * @out:    file to splice to
0940  * @opos:   output file offset
0941  * @len:    number of bytes to splice
0942  * @flags:  splice modifier flags
0943  *
0944  * Description:
0945  *    For use by do_sendfile(). splice can easily emulate sendfile, but
0946  *    doing it in the application would incur an extra system call
0947  *    (splice in + splice out, as compared to just sendfile()). So this helper
0948  *    can splice directly through a process-private pipe.
0949  *
0950  */
0951 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
0952               loff_t *opos, size_t len, unsigned int flags)
0953 {
0954     struct splice_desc sd = {
0955         .len        = len,
0956         .total_len  = len,
0957         .flags      = flags,
0958         .pos        = *ppos,
0959         .u.file     = out,
0960         .opos       = opos,
0961     };
0962     long ret;
0963 
0964     if (unlikely(!(out->f_mode & FMODE_WRITE)))
0965         return -EBADF;
0966 
0967     if (unlikely(out->f_flags & O_APPEND))
0968         return -EINVAL;
0969 
0970     ret = rw_verify_area(WRITE, out, opos, len);
0971     if (unlikely(ret < 0))
0972         return ret;
0973 
0974     ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
0975     if (ret > 0)
0976         *ppos = sd.pos;
0977 
0978     return ret;
0979 }
0980 EXPORT_SYMBOL(do_splice_direct);
0981 
0982 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
0983 {
0984     for (;;) {
0985         if (unlikely(!pipe->readers)) {
0986             send_sig(SIGPIPE, current, 0);
0987             return -EPIPE;
0988         }
0989         if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
0990             return 0;
0991         if (flags & SPLICE_F_NONBLOCK)
0992             return -EAGAIN;
0993         if (signal_pending(current))
0994             return -ERESTARTSYS;
0995         pipe_wait_writable(pipe);
0996     }
0997 }
0998 
0999 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1000                    struct pipe_inode_info *opipe,
1001                    size_t len, unsigned int flags);
1002 
1003 long splice_file_to_pipe(struct file *in,
1004              struct pipe_inode_info *opipe,
1005              loff_t *offset,
1006              size_t len, unsigned int flags)
1007 {
1008     long ret;
1009 
1010     pipe_lock(opipe);
1011     ret = wait_for_space(opipe, flags);
1012     if (!ret)
1013         ret = do_splice_to(in, offset, opipe, len, flags);
1014     pipe_unlock(opipe);
1015     if (ret > 0)
1016         wakeup_pipe_readers(opipe);
1017     return ret;
1018 }
1019 
1020 /*
1021  * Determine where to splice to/from.
1022  */
1023 long do_splice(struct file *in, loff_t *off_in, struct file *out,
1024            loff_t *off_out, size_t len, unsigned int flags)
1025 {
1026     struct pipe_inode_info *ipipe;
1027     struct pipe_inode_info *opipe;
1028     loff_t offset;
1029     long ret;
1030 
1031     if (unlikely(!(in->f_mode & FMODE_READ) ||
1032              !(out->f_mode & FMODE_WRITE)))
1033         return -EBADF;
1034 
1035     ipipe = get_pipe_info(in, true);
1036     opipe = get_pipe_info(out, true);
1037 
1038     if (ipipe && opipe) {
1039         if (off_in || off_out)
1040             return -ESPIPE;
1041 
1042         /* Splicing to self would be fun, but... */
1043         if (ipipe == opipe)
1044             return -EINVAL;
1045 
1046         if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1047             flags |= SPLICE_F_NONBLOCK;
1048 
1049         return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1050     }
1051 
1052     if (ipipe) {
1053         if (off_in)
1054             return -ESPIPE;
1055         if (off_out) {
1056             if (!(out->f_mode & FMODE_PWRITE))
1057                 return -EINVAL;
1058             offset = *off_out;
1059         } else {
1060             offset = out->f_pos;
1061         }
1062 
1063         if (unlikely(out->f_flags & O_APPEND))
1064             return -EINVAL;
1065 
1066         ret = rw_verify_area(WRITE, out, &offset, len);
1067         if (unlikely(ret < 0))
1068             return ret;
1069 
1070         if (in->f_flags & O_NONBLOCK)
1071             flags |= SPLICE_F_NONBLOCK;
1072 
1073         file_start_write(out);
1074         ret = do_splice_from(ipipe, out, &offset, len, flags);
1075         file_end_write(out);
1076 
1077         if (!off_out)
1078             out->f_pos = offset;
1079         else
1080             *off_out = offset;
1081 
1082         return ret;
1083     }
1084 
1085     if (opipe) {
1086         if (off_out)
1087             return -ESPIPE;
1088         if (off_in) {
1089             if (!(in->f_mode & FMODE_PREAD))
1090                 return -EINVAL;
1091             offset = *off_in;
1092         } else {
1093             offset = in->f_pos;
1094         }
1095 
1096         if (out->f_flags & O_NONBLOCK)
1097             flags |= SPLICE_F_NONBLOCK;
1098 
1099         ret = splice_file_to_pipe(in, opipe, &offset, len, flags);
1100         if (!off_in)
1101             in->f_pos = offset;
1102         else
1103             *off_in = offset;
1104 
1105         return ret;
1106     }
1107 
1108     return -EINVAL;
1109 }
1110 
1111 static long __do_splice(struct file *in, loff_t __user *off_in,
1112             struct file *out, loff_t __user *off_out,
1113             size_t len, unsigned int flags)
1114 {
1115     struct pipe_inode_info *ipipe;
1116     struct pipe_inode_info *opipe;
1117     loff_t offset, *__off_in = NULL, *__off_out = NULL;
1118     long ret;
1119 
1120     ipipe = get_pipe_info(in, true);
1121     opipe = get_pipe_info(out, true);
1122 
1123     if (ipipe && off_in)
1124         return -ESPIPE;
1125     if (opipe && off_out)
1126         return -ESPIPE;
1127 
1128     if (off_out) {
1129         if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1130             return -EFAULT;
1131         __off_out = &offset;
1132     }
1133     if (off_in) {
1134         if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1135             return -EFAULT;
1136         __off_in = &offset;
1137     }
1138 
1139     ret = do_splice(in, __off_in, out, __off_out, len, flags);
1140     if (ret < 0)
1141         return ret;
1142 
1143     if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
1144         return -EFAULT;
1145     if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
1146         return -EFAULT;
1147 
1148     return ret;
1149 }
1150 
1151 static int iter_to_pipe(struct iov_iter *from,
1152             struct pipe_inode_info *pipe,
1153             unsigned flags)
1154 {
1155     struct pipe_buffer buf = {
1156         .ops = &user_page_pipe_buf_ops,
1157         .flags = flags
1158     };
1159     size_t total = 0;
1160     int ret = 0;
1161 
1162     while (iov_iter_count(from)) {
1163         struct page *pages[16];
1164         ssize_t left;
1165         size_t start;
1166         int i, n;
1167 
1168         left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
1169         if (left <= 0) {
1170             ret = left;
1171             break;
1172         }
1173 
1174         n = DIV_ROUND_UP(left + start, PAGE_SIZE);
1175         for (i = 0; i < n; i++) {
1176             int size = min_t(int, left, PAGE_SIZE - start);
1177 
1178             buf.page = pages[i];
1179             buf.offset = start;
1180             buf.len = size;
1181             ret = add_to_pipe(pipe, &buf);
1182             if (unlikely(ret < 0)) {
1183                 iov_iter_revert(from, left);
1184                 // this one got dropped by add_to_pipe()
1185                 while (++i < n)
1186                     put_page(pages[i]);
1187                 goto out;
1188             }
1189             total += ret;
1190             left -= size;
1191             start = 0;
1192         }
1193     }
1194 out:
1195     return total ? total : ret;
1196 }
1197 
1198 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1199             struct splice_desc *sd)
1200 {
1201     int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1202     return n == sd->len ? n : -EFAULT;
1203 }
1204 
1205 /*
1206  * For lack of a better implementation, implement vmsplice() to userspace
1207  * as a simple copy of the pipes pages to the user iov.
1208  */
1209 static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
1210                  unsigned int flags)
1211 {
1212     struct pipe_inode_info *pipe = get_pipe_info(file, true);
1213     struct splice_desc sd = {
1214         .total_len = iov_iter_count(iter),
1215         .flags = flags,
1216         .u.data = iter
1217     };
1218     long ret = 0;
1219 
1220     if (!pipe)
1221         return -EBADF;
1222 
1223     if (sd.total_len) {
1224         pipe_lock(pipe);
1225         ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1226         pipe_unlock(pipe);
1227     }
1228 
1229     return ret;
1230 }
1231 
1232 /*
1233  * vmsplice splices a user address range into a pipe. It can be thought of
1234  * as splice-from-memory, where the regular splice is splice-from-file (or
1235  * to file). In both cases the output is a pipe, naturally.
1236  */
1237 static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
1238                  unsigned int flags)
1239 {
1240     struct pipe_inode_info *pipe;
1241     long ret = 0;
1242     unsigned buf_flag = 0;
1243 
1244     if (flags & SPLICE_F_GIFT)
1245         buf_flag = PIPE_BUF_FLAG_GIFT;
1246 
1247     pipe = get_pipe_info(file, true);
1248     if (!pipe)
1249         return -EBADF;
1250 
1251     pipe_lock(pipe);
1252     ret = wait_for_space(pipe, flags);
1253     if (!ret)
1254         ret = iter_to_pipe(iter, pipe, buf_flag);
1255     pipe_unlock(pipe);
1256     if (ret > 0)
1257         wakeup_pipe_readers(pipe);
1258     return ret;
1259 }
1260 
1261 static int vmsplice_type(struct fd f, int *type)
1262 {
1263     if (!f.file)
1264         return -EBADF;
1265     if (f.file->f_mode & FMODE_WRITE) {
1266         *type = WRITE;
1267     } else if (f.file->f_mode & FMODE_READ) {
1268         *type = READ;
1269     } else {
1270         fdput(f);
1271         return -EBADF;
1272     }
1273     return 0;
1274 }
1275 
1276 /*
1277  * Note that vmsplice only really supports true splicing _from_ user memory
1278  * to a pipe, not the other way around. Splicing from user memory is a simple
1279  * operation that can be supported without any funky alignment restrictions
1280  * or nasty vm tricks. We simply map in the user memory and fill them into
1281  * a pipe. The reverse isn't quite as easy, though. There are two possible
1282  * solutions for that:
1283  *
1284  *  - memcpy() the data internally, at which point we might as well just
1285  *    do a regular read() on the buffer anyway.
1286  *  - Lots of nasty vm tricks, that are neither fast nor flexible (it
1287  *    has restriction limitations on both ends of the pipe).
1288  *
1289  * Currently we punt and implement it as a normal copy, see pipe_to_user().
1290  *
1291  */
1292 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
1293         unsigned long, nr_segs, unsigned int, flags)
1294 {
1295     struct iovec iovstack[UIO_FASTIOV];
1296     struct iovec *iov = iovstack;
1297     struct iov_iter iter;
1298     ssize_t error;
1299     struct fd f;
1300     int type;
1301 
1302     if (unlikely(flags & ~SPLICE_F_ALL))
1303         return -EINVAL;
1304 
1305     f = fdget(fd);
1306     error = vmsplice_type(f, &type);
1307     if (error)
1308         return error;
1309 
1310     error = import_iovec(type, uiov, nr_segs,
1311                  ARRAY_SIZE(iovstack), &iov, &iter);
1312     if (error < 0)
1313         goto out_fdput;
1314 
1315     if (!iov_iter_count(&iter))
1316         error = 0;
1317     else if (iov_iter_rw(&iter) == WRITE)
1318         error = vmsplice_to_pipe(f.file, &iter, flags);
1319     else
1320         error = vmsplice_to_user(f.file, &iter, flags);
1321 
1322     kfree(iov);
1323 out_fdput:
1324     fdput(f);
1325     return error;
1326 }
1327 
1328 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1329         int, fd_out, loff_t __user *, off_out,
1330         size_t, len, unsigned int, flags)
1331 {
1332     struct fd in, out;
1333     long error;
1334 
1335     if (unlikely(!len))
1336         return 0;
1337 
1338     if (unlikely(flags & ~SPLICE_F_ALL))
1339         return -EINVAL;
1340 
1341     error = -EBADF;
1342     in = fdget(fd_in);
1343     if (in.file) {
1344         out = fdget(fd_out);
1345         if (out.file) {
1346             error = __do_splice(in.file, off_in, out.file, off_out,
1347                         len, flags);
1348             fdput(out);
1349         }
1350         fdput(in);
1351     }
1352     return error;
1353 }
1354 
1355 /*
1356  * Make sure there's data to read. Wait for input if we can, otherwise
1357  * return an appropriate error.
1358  */
1359 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1360 {
1361     int ret;
1362 
1363     /*
1364      * Check the pipe occupancy without the inode lock first. This function
1365      * is speculative anyways, so missing one is ok.
1366      */
1367     if (!pipe_empty(pipe->head, pipe->tail))
1368         return 0;
1369 
1370     ret = 0;
1371     pipe_lock(pipe);
1372 
1373     while (pipe_empty(pipe->head, pipe->tail)) {
1374         if (signal_pending(current)) {
1375             ret = -ERESTARTSYS;
1376             break;
1377         }
1378         if (!pipe->writers)
1379             break;
1380         if (flags & SPLICE_F_NONBLOCK) {
1381             ret = -EAGAIN;
1382             break;
1383         }
1384         pipe_wait_readable(pipe);
1385     }
1386 
1387     pipe_unlock(pipe);
1388     return ret;
1389 }
1390 
1391 /*
1392  * Make sure there's writeable room. Wait for room if we can, otherwise
1393  * return an appropriate error.
1394  */
1395 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1396 {
1397     int ret;
1398 
1399     /*
1400      * Check pipe occupancy without the inode lock first. This function
1401      * is speculative anyways, so missing one is ok.
1402      */
1403     if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1404         return 0;
1405 
1406     ret = 0;
1407     pipe_lock(pipe);
1408 
1409     while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
1410         if (!pipe->readers) {
1411             send_sig(SIGPIPE, current, 0);
1412             ret = -EPIPE;
1413             break;
1414         }
1415         if (flags & SPLICE_F_NONBLOCK) {
1416             ret = -EAGAIN;
1417             break;
1418         }
1419         if (signal_pending(current)) {
1420             ret = -ERESTARTSYS;
1421             break;
1422         }
1423         pipe_wait_writable(pipe);
1424     }
1425 
1426     pipe_unlock(pipe);
1427     return ret;
1428 }
1429 
1430 /*
1431  * Splice contents of ipipe to opipe.
1432  */
1433 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1434                    struct pipe_inode_info *opipe,
1435                    size_t len, unsigned int flags)
1436 {
1437     struct pipe_buffer *ibuf, *obuf;
1438     unsigned int i_head, o_head;
1439     unsigned int i_tail, o_tail;
1440     unsigned int i_mask, o_mask;
1441     int ret = 0;
1442     bool input_wakeup = false;
1443 
1444 
1445 retry:
1446     ret = ipipe_prep(ipipe, flags);
1447     if (ret)
1448         return ret;
1449 
1450     ret = opipe_prep(opipe, flags);
1451     if (ret)
1452         return ret;
1453 
1454     /*
1455      * Potential ABBA deadlock, work around it by ordering lock
1456      * grabbing by pipe info address. Otherwise two different processes
1457      * could deadlock (one doing tee from A -> B, the other from B -> A).
1458      */
1459     pipe_double_lock(ipipe, opipe);
1460 
1461     i_tail = ipipe->tail;
1462     i_mask = ipipe->ring_size - 1;
1463     o_head = opipe->head;
1464     o_mask = opipe->ring_size - 1;
1465 
1466     do {
1467         size_t o_len;
1468 
1469         if (!opipe->readers) {
1470             send_sig(SIGPIPE, current, 0);
1471             if (!ret)
1472                 ret = -EPIPE;
1473             break;
1474         }
1475 
1476         i_head = ipipe->head;
1477         o_tail = opipe->tail;
1478 
1479         if (pipe_empty(i_head, i_tail) && !ipipe->writers)
1480             break;
1481 
1482         /*
1483          * Cannot make any progress, because either the input
1484          * pipe is empty or the output pipe is full.
1485          */
1486         if (pipe_empty(i_head, i_tail) ||
1487             pipe_full(o_head, o_tail, opipe->max_usage)) {
1488             /* Already processed some buffers, break */
1489             if (ret)
1490                 break;
1491 
1492             if (flags & SPLICE_F_NONBLOCK) {
1493                 ret = -EAGAIN;
1494                 break;
1495             }
1496 
1497             /*
1498              * We raced with another reader/writer and haven't
1499              * managed to process any buffers.  A zero return
1500              * value means EOF, so retry instead.
1501              */
1502             pipe_unlock(ipipe);
1503             pipe_unlock(opipe);
1504             goto retry;
1505         }
1506 
1507         ibuf = &ipipe->bufs[i_tail & i_mask];
1508         obuf = &opipe->bufs[o_head & o_mask];
1509 
1510         if (len >= ibuf->len) {
1511             /*
1512              * Simply move the whole buffer from ipipe to opipe
1513              */
1514             *obuf = *ibuf;
1515             ibuf->ops = NULL;
1516             i_tail++;
1517             ipipe->tail = i_tail;
1518             input_wakeup = true;
1519             o_len = obuf->len;
1520             o_head++;
1521             opipe->head = o_head;
1522         } else {
1523             /*
1524              * Get a reference to this pipe buffer,
1525              * so we can copy the contents over.
1526              */
1527             if (!pipe_buf_get(ipipe, ibuf)) {
1528                 if (ret == 0)
1529                     ret = -EFAULT;
1530                 break;
1531             }
1532             *obuf = *ibuf;
1533 
1534             /*
1535              * Don't inherit the gift and merge flags, we need to
1536              * prevent multiple steals of this page.
1537              */
1538             obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1539             obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1540 
1541             obuf->len = len;
1542             ibuf->offset += len;
1543             ibuf->len -= len;
1544             o_len = len;
1545             o_head++;
1546             opipe->head = o_head;
1547         }
1548         ret += o_len;
1549         len -= o_len;
1550     } while (len);
1551 
1552     pipe_unlock(ipipe);
1553     pipe_unlock(opipe);
1554 
1555     /*
1556      * If we put data in the output pipe, wakeup any potential readers.
1557      */
1558     if (ret > 0)
1559         wakeup_pipe_readers(opipe);
1560 
1561     if (input_wakeup)
1562         wakeup_pipe_writers(ipipe);
1563 
1564     return ret;
1565 }
1566 
1567 /*
1568  * Link contents of ipipe to opipe.
1569  */
1570 static int link_pipe(struct pipe_inode_info *ipipe,
1571              struct pipe_inode_info *opipe,
1572              size_t len, unsigned int flags)
1573 {
1574     struct pipe_buffer *ibuf, *obuf;
1575     unsigned int i_head, o_head;
1576     unsigned int i_tail, o_tail;
1577     unsigned int i_mask, o_mask;
1578     int ret = 0;
1579 
1580     /*
1581      * Potential ABBA deadlock, work around it by ordering lock
1582      * grabbing by pipe info address. Otherwise two different processes
1583      * could deadlock (one doing tee from A -> B, the other from B -> A).
1584      */
1585     pipe_double_lock(ipipe, opipe);
1586 
1587     i_tail = ipipe->tail;
1588     i_mask = ipipe->ring_size - 1;
1589     o_head = opipe->head;
1590     o_mask = opipe->ring_size - 1;
1591 
1592     do {
1593         if (!opipe->readers) {
1594             send_sig(SIGPIPE, current, 0);
1595             if (!ret)
1596                 ret = -EPIPE;
1597             break;
1598         }
1599 
1600         i_head = ipipe->head;
1601         o_tail = opipe->tail;
1602 
1603         /*
1604          * If we have iterated all input buffers or run out of
1605          * output room, break.
1606          */
1607         if (pipe_empty(i_head, i_tail) ||
1608             pipe_full(o_head, o_tail, opipe->max_usage))
1609             break;
1610 
1611         ibuf = &ipipe->bufs[i_tail & i_mask];
1612         obuf = &opipe->bufs[o_head & o_mask];
1613 
1614         /*
1615          * Get a reference to this pipe buffer,
1616          * so we can copy the contents over.
1617          */
1618         if (!pipe_buf_get(ipipe, ibuf)) {
1619             if (ret == 0)
1620                 ret = -EFAULT;
1621             break;
1622         }
1623 
1624         *obuf = *ibuf;
1625 
1626         /*
1627          * Don't inherit the gift and merge flag, we need to prevent
1628          * multiple steals of this page.
1629          */
1630         obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1631         obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1632 
1633         if (obuf->len > len)
1634             obuf->len = len;
1635         ret += obuf->len;
1636         len -= obuf->len;
1637 
1638         o_head++;
1639         opipe->head = o_head;
1640         i_tail++;
1641     } while (len);
1642 
1643     pipe_unlock(ipipe);
1644     pipe_unlock(opipe);
1645 
1646     /*
1647      * If we put data in the output pipe, wakeup any potential readers.
1648      */
1649     if (ret > 0)
1650         wakeup_pipe_readers(opipe);
1651 
1652     return ret;
1653 }
1654 
1655 /*
1656  * This is a tee(1) implementation that works on pipes. It doesn't copy
1657  * any data, it simply references the 'in' pages on the 'out' pipe.
1658  * The 'flags' used are the SPLICE_F_* variants, currently the only
1659  * applicable one is SPLICE_F_NONBLOCK.
1660  */
1661 long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
1662 {
1663     struct pipe_inode_info *ipipe = get_pipe_info(in, true);
1664     struct pipe_inode_info *opipe = get_pipe_info(out, true);
1665     int ret = -EINVAL;
1666 
1667     if (unlikely(!(in->f_mode & FMODE_READ) ||
1668              !(out->f_mode & FMODE_WRITE)))
1669         return -EBADF;
1670 
1671     /*
1672      * Duplicate the contents of ipipe to opipe without actually
1673      * copying the data.
1674      */
1675     if (ipipe && opipe && ipipe != opipe) {
1676         if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1677             flags |= SPLICE_F_NONBLOCK;
1678 
1679         /*
1680          * Keep going, unless we encounter an error. The ipipe/opipe
1681          * ordering doesn't really matter.
1682          */
1683         ret = ipipe_prep(ipipe, flags);
1684         if (!ret) {
1685             ret = opipe_prep(opipe, flags);
1686             if (!ret)
1687                 ret = link_pipe(ipipe, opipe, len, flags);
1688         }
1689     }
1690 
1691     return ret;
1692 }
1693 
1694 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1695 {
1696     struct fd in, out;
1697     int error;
1698 
1699     if (unlikely(flags & ~SPLICE_F_ALL))
1700         return -EINVAL;
1701 
1702     if (unlikely(!len))
1703         return 0;
1704 
1705     error = -EBADF;
1706     in = fdget(fdin);
1707     if (in.file) {
1708         out = fdget(fdout);
1709         if (out.file) {
1710             error = do_tee(in.file, out.file, len, flags);
1711             fdput(out);
1712         }
1713         fdput(in);
1714     }
1715 
1716     return error;
1717 }