0001 // SPDX-License-Identifier: GPL-2.0
0002 #include <linux/kernel.h>
0003 #include <linux/errno.h>
0004 #include <linux/fs.h>
0005 #include <linux/file.h>
0006 #include <linux/blk-mq.h>
0007 #include <linux/mm.h>
0008 #include <linux/slab.h>
0009 #include <linux/fsnotify.h>
0010 #include <linux/poll.h>
0011 #include <linux/nospec.h>
0012 #include <linux/compat.h>
0013 #include <linux/io_uring.h>
0014 
0015 #include <uapi/linux/io_uring.h>
0016 
0017 #include "io_uring.h"
0018 #include "opdef.h"
0019 #include "kbuf.h"
0020 #include "rsrc.h"
0021 #include "rw.h"
0022 
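     /*
      * Per-request state for the read/write opcodes. It lives in the command
      * area of struct io_kiocb and is fetched with io_kiocb_to_cmd(). addr/len
      * describe the user buffer (or the provided buffer once one has been
      * selected) and flags carries the RWF_* flags taken from the SQE.
      */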
0023 struct io_rw {
0024     /* NOTE: kiocb has the file as the first member, so don't do it here */
0025     struct kiocb            kiocb;
0026     u64             addr;
0027     u32             len;
0028     rwf_t               flags;
0029 };
0030 
0031 static inline bool io_file_supports_nowait(struct io_kiocb *req)
0032 {
0033     return req->flags & REQ_F_SUPPORT_NOWAIT;
0034 }
0035 
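     /*
      * Prep handler shared by all read/write opcodes: pulls the offset,
      * buf_index, ioprio, address, length and rwf_t flags out of the SQE.
      * For IORING_OP_READ_FIXED/IORING_OP_WRITE_FIXED the registered buffer
      * is looked up here (with array_index_nospec() to avoid speculation on
      * the user-controlled index) and pinned via the rsrc node.
      */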
0036 int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
0037 {
0038     struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
0039     unsigned ioprio;
0040     int ret;
0041 
0042     rw->kiocb.ki_pos = READ_ONCE(sqe->off);
0043     /* used for fixed read/write too - just read unconditionally */
0044     req->buf_index = READ_ONCE(sqe->buf_index);
0045 
0046     if (req->opcode == IORING_OP_READ_FIXED ||
0047         req->opcode == IORING_OP_WRITE_FIXED) {
0048         struct io_ring_ctx *ctx = req->ctx;
0049         u16 index;
0050 
0051         if (unlikely(req->buf_index >= ctx->nr_user_bufs))
0052             return -EFAULT;
0053         index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
0054         req->imu = ctx->user_bufs[index];
0055         io_req_set_rsrc_node(req, ctx, 0);
0056     }
0057 
0058     ioprio = READ_ONCE(sqe->ioprio);
0059     if (ioprio) {
0060         ret = ioprio_check_cap(ioprio);
0061         if (ret)
0062             return ret;
0063 
0064         rw->kiocb.ki_ioprio = ioprio;
0065     } else {
0066         rw->kiocb.ki_ioprio = get_current_ioprio();
0067     }
0068 
0069     rw->addr = READ_ONCE(sqe->addr);
0070     rw->len = READ_ONCE(sqe->len);
0071     rw->flags = READ_ONCE(sqe->rw_flags);
0072     return 0;
0073 }
0074 
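     /*
      * Illustrative sketch only, not part of this file: with liburing, a
      * fixed-buffer read exercising the fields parsed above could be set up
      * roughly like this (QD, fd and iov are assumed to exist, error handling
      * omitted):
      *
      *     struct io_uring ring;
      *     struct io_uring_sqe *sqe;
      *
      *     io_uring_queue_init(QD, &ring, 0);
      *     io_uring_register_buffers(&ring, iov, 1);     // -> ctx->user_bufs[]
      *     sqe = io_uring_get_sqe(&ring);
      *     io_uring_prep_read_fixed(sqe, fd, iov[0].iov_base, iov[0].iov_len,
      *                              0, 0);               // sqe->off, sqe->buf_index
      *     sqe->rw_flags = RWF_NOWAIT;                   // ends up in rw->flags
      *     io_uring_submit(&ring);
      */
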
0075 void io_readv_writev_cleanup(struct io_kiocb *req)
0076 {
0077     struct io_async_rw *io = req->async_data;
0078 
0079     kfree(io->free_iovec);
0080 }
0081 
0082 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
0083 {
0084     switch (ret) {
0085     case -EIOCBQUEUED:
0086         break;
0087     case -ERESTARTSYS:
0088     case -ERESTARTNOINTR:
0089     case -ERESTARTNOHAND:
0090     case -ERESTART_RESTARTBLOCK:
0091         /*
0092          * We can't just restart the syscall, since previously
0093          * submitted sqes may already be in progress. Just fail this
0094          * IO with EINTR.
0095          */
0096         ret = -EINTR;
0097         fallthrough;
0098     default:
0099         kiocb->ki_complete(kiocb, ret);
0100     }
0101 }
0102 
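     /*
      * Decide which file position to use for this IO. An offset of -1 in the
      * SQE means "use the file's current position": for non-stream files we
      * latch f_pos into ki_pos and set REQ_F_CUR_POS so kiocb_done() can
      * write the updated position back. Stream files get a NULL ppos and a
      * ki_pos of 0.
      */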
0103 static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
0104 {
0105     struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
0106 
0107     if (rw->kiocb.ki_pos != -1)
0108         return &rw->kiocb.ki_pos;
0109 
0110     if (!(req->file->f_mode & FMODE_STREAM)) {
0111         req->flags |= REQ_F_CUR_POS;
0112         rw->kiocb.ki_pos = req->file->f_pos;
0113         return &rw->kiocb.ki_pos;
0114     }
0115 
0116     rw->kiocb.ki_pos = 0;
0117     return NULL;
0118 }
0119 
0120 static void io_req_task_queue_reissue(struct io_kiocb *req)
0121 {
0122     req->io_task_work.func = io_queue_iowq;
0123     io_req_task_work_add(req);
0124 }
0125 
0126 #ifdef CONFIG_BLOCK
0127 static bool io_resubmit_prep(struct io_kiocb *req)
0128 {
0129     struct io_async_rw *io = req->async_data;
0130 
0131     if (!req_has_async_data(req))
0132         return !io_req_prep_async(req);
0133     iov_iter_restore(&io->s.iter, &io->s.iter_state);
0134     return true;
0135 }
0136 
0137 static bool io_rw_should_reissue(struct io_kiocb *req)
0138 {
0139     umode_t mode = file_inode(req->file)->i_mode;
0140     struct io_ring_ctx *ctx = req->ctx;
0141 
0142     if (!S_ISBLK(mode) && !S_ISREG(mode))
0143         return false;
0144     if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
0145         !(ctx->flags & IORING_SETUP_IOPOLL)))
0146         return false;
0147     /*
0148      * If ref is dying, we might be running poll reap from the exit work.
0149      * Don't attempt to reissue from that path, just let it fail with
0150      * -EAGAIN.
0151      */
0152     if (percpu_ref_is_dying(&ctx->refs))
0153         return false;
0154     /*
0155      * Play it safe and assume not safe to re-import and reissue if we're
0156      * not in the original thread group (or not in task context).
0157      */
0158     if (!same_thread_group(req->task, current) || !in_task())
0159         return false;
0160     return true;
0161 }
0162 #else
0163 static bool io_resubmit_prep(struct io_kiocb *req)
0164 {
0165     return false;
0166 }
0167 static bool io_rw_should_reissue(struct io_kiocb *req)
0168 {
0169     return false;
0170 }
0171 #endif
0172 
0173 static void kiocb_end_write(struct io_kiocb *req)
0174 {
0175     /*
0176      * Tell lockdep we inherited freeze protection from submission
0177      * thread.
0178      */
0179     if (req->flags & REQ_F_ISREG) {
0180         struct super_block *sb = file_inode(req->file)->i_sb;
0181 
0182         __sb_writers_acquired(sb, SB_FREEZE_WRITE);
0183         sb_end_write(sb);
0184     }
0185 }
0186 
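     /*
      * Completion work shared by the task_work and inline completion paths:
      * drop write freeze protection, fire fsnotify, and if the result doesn't
      * match what was submitted either mark the request for reissue (returns
      * true) or record the short/failed result in the CQE.
      */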
0187 static bool __io_complete_rw_common(struct io_kiocb *req, long res)
0188 {
0189     struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
0190 
0191     if (rw->kiocb.ki_flags & IOCB_WRITE) {
0192         kiocb_end_write(req);
0193         fsnotify_modify(req->file);
0194     } else {
0195         fsnotify_access(req->file);
0196     }
0197     if (unlikely(res != req->cqe.res)) {
0198         if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
0199             io_rw_should_reissue(req)) {
0200             req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
0201             return true;
0202         }
0203         req_set_fail(req);
0204         req->cqe.res = res;
0205     }
0206     return false;
0207 }
0208 
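     /*
      * Fold in bytes completed by earlier attempts so the CQE reports the
      * total transferred. For example (hypothetical numbers): if a buffered
      * read moved 4096 bytes, was retried and the retry returned another
      * 4096, the request completes with 8192; if the retry instead failed,
      * the 4096 already done is reported rather than the error.
      */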
0209 static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
0210 {
0211     struct io_async_rw *io = req->async_data;
0212 
0213     /* add previously done IO, if any */
0214     if (req_has_async_data(req) && io->bytes_done > 0) {
0215         if (res < 0)
0216             res = io->bytes_done;
0217         else
0218             res += io->bytes_done;
0219     }
0220     return res;
0221 }
0222 
0223 static void io_complete_rw(struct kiocb *kiocb, long res)
0224 {
0225     struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
0226     struct io_kiocb *req = cmd_to_io_kiocb(rw);
0227 
0228     if (__io_complete_rw_common(req, res))
0229         return;
0230     io_req_set_res(req, io_fixup_rw_res(req, res), 0);
0231     req->io_task_work.func = io_req_task_complete;
0232     io_req_task_work_add(req);
0233 }
0234 
0235 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
0236 {
0237     struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
0238     struct io_kiocb *req = cmd_to_io_kiocb(rw);
0239 
0240     if (kiocb->ki_flags & IOCB_WRITE)
0241         kiocb_end_write(req);
0242     if (unlikely(res != req->cqe.res)) {
0243         if (res == -EAGAIN && io_rw_should_reissue(req)) {
0244             req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
0245             return;
0246         }
0247         req->cqe.res = res;
0248     }
0249 
0250     /* order with io_iopoll_complete() checking ->iopoll_completed */
0251     smp_store_release(&req->iopoll_completed, 1);
0252 }
0253 
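     /*
      * Common tail for io_read()/io_write() once the lower layers have
      * returned: writes back f_pos for REQ_F_CUR_POS requests, completes
      * successful non-iopoll IO inline, and hands everything else to
      * io_rw_done()/->ki_complete(). A flagged reissue is re-queued to io-wq,
      * or failed with the fixed-up result if re-import isn't possible.
      */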
0254 static int kiocb_done(struct io_kiocb *req, ssize_t ret,
0255                unsigned int issue_flags)
0256 {
0257     struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
0258     unsigned final_ret = io_fixup_rw_res(req, ret);
0259 
0260     if (req->flags & REQ_F_CUR_POS)
0261         req->file->f_pos = rw->kiocb.ki_pos;
0262     if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) {
0263         if (!__io_complete_rw_common(req, ret)) {
0264             io_req_set_res(req, final_ret,
0265                        io_put_kbuf(req, issue_flags));
0266             return IOU_OK;
0267         }
0268     } else {
0269         io_rw_done(&rw->kiocb, ret);
0270     }
0271 
0272     if (req->flags & REQ_F_REISSUE) {
0273         req->flags &= ~REQ_F_REISSUE;
0274         if (io_resubmit_prep(req))
0275             io_req_task_queue_reissue(req);
0276         else
0277             io_req_task_queue_fail(req, final_ret);
0278     }
0279     return IOU_ISSUE_SKIP_COMPLETE;
0280 }
0281 
0282 #ifdef CONFIG_COMPAT
0283 static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
0284                 unsigned int issue_flags)
0285 {
0286     struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
0287     struct compat_iovec __user *uiov;
0288     compat_ssize_t clen;
0289     void __user *buf;
0290     size_t len;
0291 
0292     uiov = u64_to_user_ptr(rw->addr);
0293     if (!access_ok(uiov, sizeof(*uiov)))
0294         return -EFAULT;
0295     if (__get_user(clen, &uiov->iov_len))
0296         return -EFAULT;
0297     if (clen < 0)
0298         return -EINVAL;
0299 
0300     len = clen;
0301     buf = io_buffer_select(req, &len, issue_flags);
0302     if (!buf)
0303         return -ENOBUFS;
0304     rw->addr = (unsigned long) buf;
0305     iov[0].iov_base = buf;
0306     rw->len = iov[0].iov_len = (compat_size_t) len;
0307     return 0;
0308 }
0309 #endif
0310 
0311 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
0312                       unsigned int issue_flags)
0313 {
0314     struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
0315     struct iovec __user *uiov = u64_to_user_ptr(rw->addr);
0316     void __user *buf;
0317     ssize_t len;
0318 
0319     if (copy_from_user(iov, uiov, sizeof(*uiov)))
0320         return -EFAULT;
0321 
0322     len = iov[0].iov_len;
0323     if (len < 0)
0324         return -EINVAL;
0325     buf = io_buffer_select(req, &len, issue_flags);
0326     if (!buf)
0327         return -ENOBUFS;
0328     rw->addr = (unsigned long) buf;
0329     iov[0].iov_base = buf;
0330     rw->len = iov[0].iov_len = len;
0331     return 0;
0332 }
0333 
0334 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
0335                     unsigned int issue_flags)
0336 {
0337     struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
0338 
0339     if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
0340         iov[0].iov_base = u64_to_user_ptr(rw->addr);
0341         iov[0].iov_len = rw->len;
0342         return 0;
0343     }
0344     if (rw->len != 1)
0345         return -EINVAL;
0346 
0347 #ifdef CONFIG_COMPAT
0348     if (req->ctx->compat)
0349         return io_compat_import(req, iov, issue_flags);
0350 #endif
0351 
0352     return __io_iov_buffer_select(req, iov, issue_flags);
0353 }
0354 
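     /*
      * Build the iov_iter for this request. Fixed opcodes import the
      * registered buffer, plain IORING_OP_READ/WRITE map a single range
      * (selecting a provided buffer first if requested), and readv/writev go
      * through __import_iovec(). The return value is a heap-allocated iovec
      * the caller must free (readv/writev with more than UIO_FASTIOV
      * segments), NULL when no allocation was needed, or an ERR_PTR().
      */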
0355 static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
0356                        struct io_rw_state *s,
0357                        unsigned int issue_flags)
0358 {
0359     struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
0360     struct iov_iter *iter = &s->iter;
0361     u8 opcode = req->opcode;
0362     struct iovec *iovec;
0363     void __user *buf;
0364     size_t sqe_len;
0365     ssize_t ret;
0366 
0367     if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
0368         ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
0369         if (ret)
0370             return ERR_PTR(ret);
0371         return NULL;
0372     }
0373 
0374     buf = u64_to_user_ptr(rw->addr);
0375     sqe_len = rw->len;
0376 
0377     if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
0378         if (io_do_buffer_select(req)) {
0379             buf = io_buffer_select(req, &sqe_len, issue_flags);
0380             if (!buf)
0381                 return ERR_PTR(-ENOBUFS);
0382             rw->addr = (unsigned long) buf;
0383             rw->len = sqe_len;
0384         }
0385 
0386         ret = import_single_range(ddir, buf, sqe_len, s->fast_iov, iter);
0387         if (ret)
0388             return ERR_PTR(ret);
0389         return NULL;
0390     }
0391 
0392     iovec = s->fast_iov;
0393     if (req->flags & REQ_F_BUFFER_SELECT) {
0394         ret = io_iov_buffer_select(req, iovec, issue_flags);
0395         if (ret)
0396             return ERR_PTR(ret);
0397         iov_iter_init(iter, ddir, iovec, 1, iovec->iov_len);
0398         return NULL;
0399     }
0400 
0401     ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
0402                   req->ctx->compat);
0403     if (unlikely(ret < 0))
0404         return ERR_PTR(ret);
0405     return iovec;
0406 }
0407 
0408 static inline int io_import_iovec(int rw, struct io_kiocb *req,
0409                   struct iovec **iovec, struct io_rw_state *s,
0410                   unsigned int issue_flags)
0411 {
0412     *iovec = __io_import_iovec(rw, req, s, issue_flags);
0413     if (unlikely(IS_ERR(*iovec)))
0414         return PTR_ERR(*iovec);
0415 
0416     iov_iter_save_state(&s->iter, &s->iter_state);
0417     return 0;
0418 }
0419 
0420 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
0421 {
0422     return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
0423 }
0424 
0425 /*
0426  * For files that don't have ->read_iter() and ->write_iter(), handle them
0427  * by looping over ->read() or ->write() manually.
0428  */
0429 static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
0430 {
0431     struct kiocb *kiocb = &rw->kiocb;
0432     struct file *file = kiocb->ki_filp;
0433     ssize_t ret = 0;
0434     loff_t *ppos;
0435 
0436     /*
0437      * Don't support polled IO through this interface, and we can't
0438      * support non-blocking either. For the latter, this just causes
0439      * the kiocb to be handled from an async context.
0440      */
0441     if (kiocb->ki_flags & IOCB_HIPRI)
0442         return -EOPNOTSUPP;
0443     if ((kiocb->ki_flags & IOCB_NOWAIT) &&
0444         !(kiocb->ki_filp->f_flags & O_NONBLOCK))
0445         return -EAGAIN;
0446 
0447     ppos = io_kiocb_ppos(kiocb);
0448 
0449     while (iov_iter_count(iter)) {
0450         struct iovec iovec;
0451         ssize_t nr;
0452 
0453         if (!iov_iter_is_bvec(iter)) {
0454             iovec = iov_iter_iovec(iter);
0455         } else {
0456             iovec.iov_base = u64_to_user_ptr(rw->addr);
0457             iovec.iov_len = rw->len;
0458         }
0459 
0460         if (ddir == READ) {
0461             nr = file->f_op->read(file, iovec.iov_base,
0462                           iovec.iov_len, ppos);
0463         } else {
0464             nr = file->f_op->write(file, iovec.iov_base,
0465                            iovec.iov_len, ppos);
0466         }
0467 
0468         if (nr < 0) {
0469             if (!ret)
0470                 ret = nr;
0471             break;
0472         }
0473         ret += nr;
0474         if (!iov_iter_is_bvec(iter)) {
0475             iov_iter_advance(iter, nr);
0476         } else {
0477             rw->addr += nr;
0478             rw->len -= nr;
0479             if (!rw->len)
0480                 break;
0481         }
0482         if (nr != iovec.iov_len)
0483             break;
0484     }
0485 
0486     return ret;
0487 }
0488 
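     /*
      * Snapshot the iterator into the request's async data so the IO can be
      * retried later (io-wq or task_work). Inline iovecs are copied into the
      * async fast_iov array; a separately allocated iovec is handed over
      * as-is and freed later via REQ_F_NEED_CLEANUP.
      */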
0489 static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
0490               const struct iovec *fast_iov, struct iov_iter *iter)
0491 {
0492     struct io_async_rw *io = req->async_data;
0493 
0494     memcpy(&io->s.iter, iter, sizeof(*iter));
0495     io->free_iovec = iovec;
0496     io->bytes_done = 0;
0497     /* can only be fixed buffers, no need to do anything */
0498     if (iov_iter_is_bvec(iter))
0499         return;
0500     if (!iovec) {
0501         unsigned iov_off = 0;
0502 
0503         io->s.iter.iov = io->s.fast_iov;
0504         if (iter->iov != fast_iov) {
0505             iov_off = iter->iov - fast_iov;
0506             io->s.iter.iov += iov_off;
0507         }
0508         if (io->s.fast_iov != fast_iov)
0509             memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off,
0510                    sizeof(struct iovec) * iter->nr_segs);
0511     } else {
0512         req->flags |= REQ_F_NEED_CLEANUP;
0513     }
0514 }
0515 
0516 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
0517                  struct io_rw_state *s, bool force)
0518 {
0519     if (!force && !io_op_defs[req->opcode].prep_async)
0520         return 0;
0521     if (!req_has_async_data(req)) {
0522         struct io_async_rw *iorw;
0523 
0524         if (io_alloc_async_data(req)) {
0525             kfree(iovec);
0526             return -ENOMEM;
0527         }
0528 
0529         io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
0530         iorw = req->async_data;
0531         /* we've copied and mapped the iter, ensure state is saved */
0532         iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
0533     }
0534     return 0;
0535 }
0536 
0537 static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
0538 {
0539     struct io_async_rw *iorw = req->async_data;
0540     struct iovec *iov;
0541     int ret;
0542 
0543     /* submission path, ->uring_lock should already be taken */
0544     ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
0545     if (unlikely(ret < 0))
0546         return ret;
0547 
0548     iorw->bytes_done = 0;
0549     iorw->free_iovec = iov;
0550     if (iov)
0551         req->flags |= REQ_F_NEED_CLEANUP;
0552     return 0;
0553 }
0554 
0555 int io_readv_prep_async(struct io_kiocb *req)
0556 {
0557     return io_rw_prep_async(req, READ);
0558 }
0559 
0560 int io_writev_prep_async(struct io_kiocb *req)
0561 {
0562     return io_rw_prep_async(req, WRITE);
0563 }
0564 
0565 /*
0566  * This is our waitqueue callback handler, registered through __folio_lock_async()
0567  * when we initially tried to do the IO with the iocb and armed our waitqueue.
0568  * This gets called when the page is unlocked, and we generally expect that to
0569  * happen when the page IO is completed and the page is now uptodate. This will
0570  * queue a task_work based retry of the operation, attempting to copy the data
0571  * again. If the latter fails because the page was NOT uptodate, then we will
0572  * do a thread based blocking retry of the operation. That's the unexpected
0573  * slow path.
0574  */
0575 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
0576                  int sync, void *arg)
0577 {
0578     struct wait_page_queue *wpq;
0579     struct io_kiocb *req = wait->private;
0580     struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
0581     struct wait_page_key *key = arg;
0582 
0583     wpq = container_of(wait, struct wait_page_queue, wait);
0584 
0585     if (!wake_page_match(wpq, key))
0586         return 0;
0587 
0588     rw->kiocb.ki_flags &= ~IOCB_WAITQ;
0589     list_del_init(&wait->entry);
0590     io_req_task_queue(req);
0591     return 1;
0592 }
0593 
0594 /*
0595  * This controls whether a given IO request should be armed for async page
0596  * based retry. If we return false here, the request is handed to the async
0597  * worker threads for retry. If we're doing buffered reads on a regular file,
0598  * we prepare a private wait_page_queue entry and retry the operation. This
0599  * will either succeed because the page is now uptodate and unlocked, or it
0600  * will register a callback when the page is unlocked at IO completion. Through
0601  * that callback, io_uring uses task_work to setup a retry of the operation.
0602  * That retry will attempt the buffered read again. The retry will generally
0603  * succeed, or in rare cases where it fails, we then fall back to using the
0604  * async worker threads for a blocking retry.
0605  */
0606 static bool io_rw_should_retry(struct io_kiocb *req)
0607 {
0608     struct io_async_rw *io = req->async_data;
0609     struct wait_page_queue *wait = &io->wpq;
0610     struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
0611     struct kiocb *kiocb = &rw->kiocb;
0612 
0613     /* never retry for NOWAIT, we just complete with -EAGAIN */
0614     if (req->flags & REQ_F_NOWAIT)
0615         return false;
0616 
0617     /* Only for buffered IO */
0618     if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
0619         return false;
0620 
0621     /*
0622      * just use poll if we can, and don't attempt if the fs doesn't
0623      * support callback based unlocks
0624      */
0625     if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
0626         return false;
0627 
0628     wait->wait.func = io_async_buf_func;
0629     wait->wait.private = req;
0630     wait->wait.flags = 0;
0631     INIT_LIST_HEAD(&wait->wait.entry);
0632     kiocb->ki_flags |= IOCB_WAITQ;
0633     kiocb->ki_flags &= ~IOCB_NOWAIT;
0634     kiocb->ki_waitq = wait;
0635     return true;
0636 }
0637 
0638 static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter)
0639 {
0640     struct file *file = rw->kiocb.ki_filp;
0641 
0642     if (likely(file->f_op->read_iter))
0643         return call_read_iter(file, &rw->kiocb, iter);
0644     else if (file->f_op->read)
0645         return loop_rw_iter(READ, rw, iter);
0646     else
0647         return -EINVAL;
0648 }
0649 
0650 static bool need_complete_io(struct io_kiocb *req)
0651 {
0652     return req->flags & REQ_F_ISREG ||
0653         S_ISBLK(file_inode(req->file)->i_mode);
0654 }
0655 
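     /*
      * Per-issue kiocb setup: validate the file mode, derive NOWAIT support,
      * apply the RWF_* flags, and pick the completion callback. IOPOLL rings
      * require O_DIRECT files that implement ->iopoll(); conversely,
      * RWF_HIPRI is rejected on non-IOPOLL rings.
      */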
0656 static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
0657 {
0658     struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
0659     struct kiocb *kiocb = &rw->kiocb;
0660     struct io_ring_ctx *ctx = req->ctx;
0661     struct file *file = req->file;
0662     int ret;
0663 
0664     if (unlikely(!file || !(file->f_mode & mode)))
0665         return -EBADF;
0666 
0667     if (!io_req_ffs_set(req))
0668         req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
0669 
0670     kiocb->ki_flags = file->f_iocb_flags;
0671     ret = kiocb_set_rw_flags(kiocb, rw->flags);
0672     if (unlikely(ret))
0673         return ret;
0674 
0675     /*
0676      * If the file is marked O_NONBLOCK, still allow retry for it if it
0677      * supports async. Otherwise it's impossible to use O_NONBLOCK files
0678      * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
0679      */
0680     if ((kiocb->ki_flags & IOCB_NOWAIT) ||
0681         ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
0682         req->flags |= REQ_F_NOWAIT;
0683 
0684     if (ctx->flags & IORING_SETUP_IOPOLL) {
0685         if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
0686             return -EOPNOTSUPP;
0687 
0688         kiocb->private = NULL;
0689         kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
0690         kiocb->ki_complete = io_complete_rw_iopoll;
0691         req->iopoll_completed = 0;
0692     } else {
0693         if (kiocb->ki_flags & IOCB_HIPRI)
0694             return -EINVAL;
0695         kiocb->ki_complete = io_complete_rw;
0696     }
0697 
0698     return 0;
0699 }
0700 
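     /*
      * Issue path for the read opcodes. Imports (or re-imports) the iovec,
      * tries the read with IOCB_NOWAIT first when issued non-blocking, and
      * returns -EAGAIN so the core can poll or punt to io-wq when the file
      * can't make progress without blocking. Partial buffered reads are
      * retried via the page waitqueue (io_rw_should_retry()), with
      * bytes_done accumulating what earlier attempts transferred.
      */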
0701 int io_read(struct io_kiocb *req, unsigned int issue_flags)
0702 {
0703     struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
0704     struct io_rw_state __s, *s = &__s;
0705     struct iovec *iovec;
0706     struct kiocb *kiocb = &rw->kiocb;
0707     bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
0708     struct io_async_rw *io;
0709     ssize_t ret, ret2;
0710     loff_t *ppos;
0711 
0712     if (!req_has_async_data(req)) {
0713         ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
0714         if (unlikely(ret < 0))
0715             return ret;
0716     } else {
0717         io = req->async_data;
0718         s = &io->s;
0719 
0720         /*
0721          * Safe and required to re-import if we're using provided
0722          * buffers, as we dropped the selected one before retry.
0723          */
0724         if (io_do_buffer_select(req)) {
0725             ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
0726             if (unlikely(ret < 0))
0727                 return ret;
0728         }
0729 
0730         /*
0731          * We come here from an earlier attempt, restore our state to
0732          * match in case it doesn't. It's cheap enough that we don't
0733          * need to make this conditional.
0734          */
0735         iov_iter_restore(&s->iter, &s->iter_state);
0736         iovec = NULL;
0737     }
0738     ret = io_rw_init_file(req, FMODE_READ);
0739     if (unlikely(ret)) {
0740         kfree(iovec);
0741         return ret;
0742     }
0743     req->cqe.res = iov_iter_count(&s->iter);
0744 
0745     if (force_nonblock) {
0746         /* If the file doesn't support async, just async punt */
0747         if (unlikely(!io_file_supports_nowait(req))) {
0748             ret = io_setup_async_rw(req, iovec, s, true);
0749             return ret ?: -EAGAIN;
0750         }
0751         kiocb->ki_flags |= IOCB_NOWAIT;
0752     } else {
0753         /* Ensure we clear previously set non-block flag */
0754         kiocb->ki_flags &= ~IOCB_NOWAIT;
0755     }
0756 
0757     ppos = io_kiocb_update_pos(req);
0758 
0759     ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
0760     if (unlikely(ret)) {
0761         kfree(iovec);
0762         return ret;
0763     }
0764 
0765     ret = io_iter_do_read(rw, &s->iter);
0766 
0767     if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
0768         req->flags &= ~REQ_F_REISSUE;
0769         /* if we can poll, just do that */
0770         if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
0771             return -EAGAIN;
0772         /* IOPOLL retry should happen for io-wq threads */
0773         if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
0774             goto done;
0775         /* no retry on NONBLOCK nor RWF_NOWAIT */
0776         if (req->flags & REQ_F_NOWAIT)
0777             goto done;
0778         ret = 0;
0779     } else if (ret == -EIOCBQUEUED) {
0780         if (iovec)
0781             kfree(iovec);
0782         return IOU_ISSUE_SKIP_COMPLETE;
0783     } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
0784            (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) {
0785         /* read all, failed, already did sync or don't want to retry */
0786         goto done;
0787     }
0788 
0789     /*
0790      * Don't depend on the iter state matching what was consumed, or being
0791      * untouched in case of error. Restore it and we'll advance it
0792      * manually if we need to.
0793      */
0794     iov_iter_restore(&s->iter, &s->iter_state);
0795 
0796     ret2 = io_setup_async_rw(req, iovec, s, true);
0797     if (ret2)
0798         return ret2;
0799 
0800     iovec = NULL;
0801     io = req->async_data;
0802     s = &io->s;
0803     /*
0804      * Now use our persistent iterator and state, if we aren't already.
0805      * We've restored and mapped the iter to match.
0806      */
0807 
0808     do {
0809         /*
0810          * We end up here because of a partial read, either from
0811          * above or inside this loop. Advance the iter by the bytes
0812          * that were consumed.
0813          */
0814         iov_iter_advance(&s->iter, ret);
0815         if (!iov_iter_count(&s->iter))
0816             break;
0817         io->bytes_done += ret;
0818         iov_iter_save_state(&s->iter, &s->iter_state);
0819 
0820         /* if we can retry, do so with the callbacks armed */
0821         if (!io_rw_should_retry(req)) {
0822             kiocb->ki_flags &= ~IOCB_WAITQ;
0823             return -EAGAIN;
0824         }
0825 
0826         /*
0827          * Now retry read with the IOCB_WAITQ parts set in the iocb. If
0828          * we get -EIOCBQUEUED, then we'll get a notification when the
0829          * desired page gets unlocked. We can also get a partial read
0830          * here, and if we do, then just retry at the new offset.
0831          */
0832         ret = io_iter_do_read(rw, &s->iter);
0833         if (ret == -EIOCBQUEUED)
0834             return IOU_ISSUE_SKIP_COMPLETE;
0835         /* we got some bytes, but not all. retry. */
0836         kiocb->ki_flags &= ~IOCB_WAITQ;
0837         iov_iter_restore(&s->iter, &s->iter_state);
0838     } while (ret > 0);
0839 done:
0840     /* it's faster to check here than delegate to kfree */
0841     if (iovec)
0842         kfree(iovec);
0843     return kiocb_done(req, ret, issue_flags);
0844 }
0845 
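     /*
      * Issue path for the write opcodes, mirroring io_read(): import the
      * iovec, try a non-blocking write first, and fall back to io-wq (via
      * copy_iov) when that can't work. Freeze protection for regular files
      * is taken here and dropped in kiocb_end_write(); short writes are
      * finished from the worker, with bytes_done recording what has already
      * been written.
      */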
0846 int io_write(struct io_kiocb *req, unsigned int issue_flags)
0847 {
0848     struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
0849     struct io_rw_state __s, *s = &__s;
0850     struct iovec *iovec;
0851     struct kiocb *kiocb = &rw->kiocb;
0852     bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
0853     ssize_t ret, ret2;
0854     loff_t *ppos;
0855 
0856     if (!req_has_async_data(req)) {
0857         ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
0858         if (unlikely(ret < 0))
0859             return ret;
0860     } else {
0861         struct io_async_rw *io = req->async_data;
0862 
0863         s = &io->s;
0864         iov_iter_restore(&s->iter, &s->iter_state);
0865         iovec = NULL;
0866     }
0867     ret = io_rw_init_file(req, FMODE_WRITE);
0868     if (unlikely(ret)) {
0869         kfree(iovec);
0870         return ret;
0871     }
0872     req->cqe.res = iov_iter_count(&s->iter);
0873 
0874     if (force_nonblock) {
0875         /* If the file doesn't support async, just async punt */
0876         if (unlikely(!io_file_supports_nowait(req)))
0877             goto copy_iov;
0878 
0879         /* File path supports NOWAIT for non-direct_IO only for block devices. */
0880         if (!(kiocb->ki_flags & IOCB_DIRECT) &&
0881             !(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) &&
0882             (req->flags & REQ_F_ISREG))
0883             goto copy_iov;
0884 
0885         kiocb->ki_flags |= IOCB_NOWAIT;
0886     } else {
0887         /* Ensure we clear previously set non-block flag */
0888         kiocb->ki_flags &= ~IOCB_NOWAIT;
0889     }
0890 
0891     ppos = io_kiocb_update_pos(req);
0892 
0893     ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
0894     if (unlikely(ret)) {
0895         kfree(iovec);
0896         return ret;
0897     }
0898 
0899     /*
0900      * Open-code file_start_write here to grab freeze protection,
0901      * which will be released by another thread in
0902      * io_complete_rw().  Fool lockdep by telling it the lock got
0903      * released so that it doesn't complain about the held lock when
0904      * we return to userspace.
0905      */
0906     if (req->flags & REQ_F_ISREG) {
0907         sb_start_write(file_inode(req->file)->i_sb);
0908         __sb_writers_release(file_inode(req->file)->i_sb,
0909                     SB_FREEZE_WRITE);
0910     }
0911     kiocb->ki_flags |= IOCB_WRITE;
0912 
0913     if (likely(req->file->f_op->write_iter))
0914         ret2 = call_write_iter(req->file, kiocb, &s->iter);
0915     else if (req->file->f_op->write)
0916         ret2 = loop_rw_iter(WRITE, rw, &s->iter);
0917     else
0918         ret2 = -EINVAL;
0919 
0920     if (req->flags & REQ_F_REISSUE) {
0921         req->flags &= ~REQ_F_REISSUE;
0922         ret2 = -EAGAIN;
0923     }
0924 
0925     /*
0926      * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
0927      * retry them without IOCB_NOWAIT.
0928      */
0929     if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
0930         ret2 = -EAGAIN;
0931     /* no retry on NONBLOCK nor RWF_NOWAIT */
0932     if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
0933         goto done;
0934     if (!force_nonblock || ret2 != -EAGAIN) {
0935         /* IOPOLL retry should happen for io-wq threads */
0936         if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
0937             goto copy_iov;
0938 
0939         if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
0940             struct io_async_rw *rw;
0941 
0942             trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
0943                         req->cqe.res, ret2);
0944 
0945             /* This is a partial write. The file pos has already been
0946              * updated, setup the async struct to complete the request
0947              * in the worker. Also update bytes_done to account for
0948              * the bytes already written.
0949              */
0950             iov_iter_save_state(&s->iter, &s->iter_state);
0951             ret = io_setup_async_rw(req, iovec, s, true);
0952 
0953             rw = req->async_data;
0954             if (rw)
0955                 rw->bytes_done += ret2;
0956 
0957             if (kiocb->ki_flags & IOCB_WRITE)
0958                 kiocb_end_write(req);
0959             return ret ? ret : -EAGAIN;
0960         }
0961 done:
0962         ret = kiocb_done(req, ret2, issue_flags);
0963     } else {
0964 copy_iov:
0965         iov_iter_restore(&s->iter, &s->iter_state);
0966         ret = io_setup_async_rw(req, iovec, s, false);
0967         if (!ret) {
0968             if (kiocb->ki_flags & IOCB_WRITE)
0969                 kiocb_end_write(req);
0970             return -EAGAIN;
0971         }
0972         return ret;
0973     }
0974     /* it's reportedly faster than delegating the null check to kfree() */
0975     if (iovec)
0976         kfree(iovec);
0977     return ret;
0978 }
0979 
0980 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
0981 {
0982     io_commit_cqring_flush(ctx);
0983     if (ctx->flags & IORING_SETUP_SQPOLL)
0984         io_cqring_wake(ctx);
0985 }
0986 
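     /*
      * Poll for completions on an IORING_SETUP_IOPOLL ring: walk the
      * iopoll_list, ask each file's ->iopoll() to reap completions, then post
      * CQEs (in submission order) for the leading run of completed requests
      * and free them. Returns the number of events posted, or a negative
      * error from ->iopoll().
      */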
0987 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
0988 {
0989     struct io_wq_work_node *pos, *start, *prev;
0990     unsigned int poll_flags = BLK_POLL_NOSLEEP;
0991     DEFINE_IO_COMP_BATCH(iob);
0992     int nr_events = 0;
0993 
0994     /*
0995      * Only spin for completions if we don't have multiple devices hanging
0996      * off our complete list.
0997      */
0998     if (ctx->poll_multi_queue || force_nonspin)
0999         poll_flags |= BLK_POLL_ONESHOT;
1000 
1001     wq_list_for_each(pos, start, &ctx->iopoll_list) {
1002         struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
1003         struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
1004         int ret;
1005 
1006         /*
1007          * Move completed and retryable entries to our local lists.
1008          * If we find a request that requires polling, break out
1009          * and complete those lists first, if we have entries there.
1010          */
1011         if (READ_ONCE(req->iopoll_completed))
1012             break;
1013 
1014         ret = rw->kiocb.ki_filp->f_op->iopoll(&rw->kiocb, &iob, poll_flags);
1015         if (unlikely(ret < 0))
1016             return ret;
1017         else if (ret)
1018             poll_flags |= BLK_POLL_ONESHOT;
1019 
1020         /* iopoll may have completed current req */
1021         if (!rq_list_empty(iob.req_list) ||
1022             READ_ONCE(req->iopoll_completed))
1023             break;
1024     }
1025 
1026     if (!rq_list_empty(iob.req_list))
1027         iob.complete(&iob);
1028     else if (!pos)
1029         return 0;
1030 
1031     prev = start;
1032     wq_list_for_each_resume(pos, prev) {
1033         struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
1034 
1035         /* order with io_complete_rw_iopoll(), e.g. ->result updates */
1036         if (!smp_load_acquire(&req->iopoll_completed))
1037             break;
1038         nr_events++;
1039         if (unlikely(req->flags & REQ_F_CQE_SKIP))
1040             continue;
1041 
1042         req->cqe.flags = io_put_kbuf(req, 0);
1043         __io_fill_cqe_req(req->ctx, req);
1044     }
1045 
1046     if (unlikely(!nr_events))
1047         return 0;
1048 
1049     io_commit_cqring(ctx);
1050     io_cqring_ev_posted_iopoll(ctx);
1051     pos = start ? start->next : ctx->iopoll_list.first;
1052     wq_list_cut(&ctx->iopoll_list, prev, start);
1053     io_free_batch_list(ctx, pos);
1054     return nr_events;
1055 }
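
     /*
      * Illustrative sketch only, not part of this file: the iopoll path above
      * is only reachable for rings created with IORING_SETUP_IOPOLL and files
      * opened with O_DIRECT (see io_rw_init_file()). With liburing, roughly:
      *
      *     struct io_uring ring;
      *     struct io_uring_cqe *cqe;
      *
      *     io_uring_queue_init(QD, &ring, IORING_SETUP_IOPOLL);
      *     // ... prep and submit O_DIRECT reads/writes ...
      *     io_uring_wait_cqe(&ring, &cqe);   // kernel reaps via io_do_iopoll()
      */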