// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"

#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))

#define BGID_ARRAY  64

struct io_provide_buf {
    struct file         *file;
    __u64               addr;
    __u32               len;
    __u32               bgid;
    __u16               nbufs;
    __u16               bid;
};

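/*
 * Look up the buffer list for a buffer group ID. Low group IDs
 * (< BGID_ARRAY) live in the fixed ctx->io_bl array; higher IDs are
 * tracked in the ctx->io_bl_xa xarray.
 */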
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
                            unsigned int bgid)
{
    if (ctx->io_bl && bgid < BGID_ARRAY)
        return &ctx->io_bl[bgid];

    return xa_load(&ctx->io_bl_xa, bgid);
}

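/*
 * Register a buffer list under its group ID. Slots in the fixed array are
 * preallocated, so only group IDs >= BGID_ARRAY need an xarray store.
 */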
static int io_buffer_add_list(struct io_ring_ctx *ctx,
                  struct io_buffer_list *bl, unsigned int bgid)
{
    bl->bgid = bgid;
    if (bgid < BGID_ARRAY)
        return 0;

    return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}

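/*
 * Return a legacy (non ring-mapped) provided buffer to its group list so
 * it can be handed out again, clearing the selected state on the request.
 */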
void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
    struct io_ring_ctx *ctx = req->ctx;
    struct io_buffer_list *bl;
    struct io_buffer *buf;

    /*
     * For legacy provided buffer mode, don't recycle if we already did
     * IO to this buffer. For ring-mapped provided buffer mode, we should
     * increment ring->head to explicitly monopolize the buffer to avoid
     * multiple use.
     */
    if (req->flags & REQ_F_PARTIAL_IO)
        return;

    io_ring_submit_lock(ctx, issue_flags);

    buf = req->kbuf;
    bl = io_buffer_get_list(ctx, buf->bgid);
    list_add(&buf->list, &bl->buf_list);
    req->flags &= ~REQ_F_BUFFER_SELECTED;
    req->buf_index = buf->bgid;

    io_ring_submit_unlock(ctx, issue_flags);
    return;
}

unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
    unsigned int cflags;

    /*
     * We can add this buffer back to two lists:
     *
     * 1) The io_buffers_cache list. This one is protected by the
     *    ctx->uring_lock. If we already hold this lock, add back to this
     *    list as we can grab it from issue as well.
     * 2) The io_buffers_comp list. This one is protected by the
     *    ctx->completion_lock.
     *
     * We migrate buffers from the comp_list to the issue cache list
     * when we need one.
     */
    if (req->flags & REQ_F_BUFFER_RING) {
        /* no buffers to recycle for this case */
        cflags = __io_put_kbuf_list(req, NULL);
    } else if (issue_flags & IO_URING_F_UNLOCKED) {
        struct io_ring_ctx *ctx = req->ctx;

        spin_lock(&ctx->completion_lock);
        cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
        spin_unlock(&ctx->completion_lock);
    } else {
        lockdep_assert_held(&req->ctx->uring_lock);

        cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
    }
    return cflags;
}

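/*
 * Pick the first free buffer from a legacy provided buffer list, mark the
 * request as having a selected buffer, and cap *len at the buffer size.
 */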
static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
                          struct io_buffer_list *bl)
{
    if (!list_empty(&bl->buf_list)) {
        struct io_buffer *kbuf;

        kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
        list_del(&kbuf->list);
        if (*len == 0 || *len > kbuf->len)
            *len = kbuf->len;
        req->flags |= REQ_F_BUFFER_SELECTED;
        req->kbuf = kbuf;
        req->buf_index = kbuf->bid;
        return u64_to_user_ptr(kbuf->addr);
    }
    return NULL;
}

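/*
 * Pick a buffer from a ring-mapped buffer group. Userspace publishes new
 * entries by advancing the ring tail; the kernel consumes them from the
 * head. The tail is read with an acquire load so that the entries written
 * before the tail update are visible here.
 */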
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
                      struct io_buffer_list *bl,
                      unsigned int issue_flags)
{
    struct io_uring_buf_ring *br = bl->buf_ring;
    struct io_uring_buf *buf;
    __u16 head = bl->head;

    if (unlikely(smp_load_acquire(&br->tail) == head))
        return NULL;

    head &= bl->mask;
    if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
        buf = &br->bufs[head];
    } else {
        int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
        int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
        buf = page_address(bl->buf_pages[index]);
        buf += off;
    }
    if (*len == 0 || *len > buf->len)
        *len = buf->len;
    req->flags |= REQ_F_BUFFER_RING;
    req->buf_list = bl;
    req->buf_index = buf->bid;

    if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
        /*
         * If we came in unlocked, we have no choice but to consume the
         * buffer here, otherwise nothing ensures that the buffer won't
         * get used by others. This does mean it'll be pinned until the
         * IO completes, coming in unlocked means we're being called from
         * io-wq context and there may be further retries in async hybrid
         * mode. For the locked case, the caller must call commit when
         * the transfer completes (or if we get -EAGAIN and must poll or
         * retry).
         */
        req->buf_list = NULL;
        bl->head++;
    }
    return u64_to_user_ptr(buf->addr);
}

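/*
 * Common entry point for buffer selection: look up the request's buffer
 * group and dispatch to the ring-mapped or legacy selection path under
 * the submit lock.
 */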
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
                  unsigned int issue_flags)
{
    struct io_ring_ctx *ctx = req->ctx;
    struct io_buffer_list *bl;
    void __user *ret = NULL;

    io_ring_submit_lock(req->ctx, issue_flags);

    bl = io_buffer_get_list(ctx, req->buf_index);
    if (likely(bl)) {
        if (bl->buf_nr_pages)
            ret = io_ring_buffer_select(req, len, bl, issue_flags);
        else
            ret = io_provided_buffer_select(req, len, bl);
    }
    io_ring_submit_unlock(req->ctx, issue_flags);
    return ret;
}

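/*
 * Allocate and initialize the fixed array of buffer lists used for the
 * first BGID_ARRAY buffer group IDs.
 */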
static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
    int i;

    ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
                GFP_KERNEL);
    if (!ctx->io_bl)
        return -ENOMEM;

    for (i = 0; i < BGID_ARRAY; i++) {
        INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
        ctx->io_bl[i].bgid = i;
    }

    return 0;
}

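/*
 * Remove up to nbufs buffers from a buffer list and return how many were
 * removed. For a ring-mapped group this unpins the ring pages and counts
 * the not-yet-consumed entries; for a legacy group it walks the list.
 */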
static int __io_remove_buffers(struct io_ring_ctx *ctx,
                   struct io_buffer_list *bl, unsigned nbufs)
{
    unsigned i = 0;

    /* shouldn't happen */
    if (!nbufs)
        return 0;

    if (bl->buf_nr_pages) {
        int j;

        i = bl->buf_ring->tail - bl->head;
        for (j = 0; j < bl->buf_nr_pages; j++)
            unpin_user_page(bl->buf_pages[j]);
        kvfree(bl->buf_pages);
        bl->buf_pages = NULL;
        bl->buf_nr_pages = 0;
        /* make sure it's seen as empty */
        INIT_LIST_HEAD(&bl->buf_list);
        return i;
    }

    /* the head kbuf is the list itself */
    while (!list_empty(&bl->buf_list)) {
        struct io_buffer *nxt;

        nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
        list_del(&nxt->list);
        if (++i == nbufs)
            return i;
        cond_resched();
    }
    i++;

    return i;
}

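/*
 * Tear down all provided buffer state for a ring: the fixed array groups,
 * any xarray-tracked groups, and the pages backing the io_buffer cache.
 */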
void io_destroy_buffers(struct io_ring_ctx *ctx)
{
    struct io_buffer_list *bl;
    unsigned long index;
    int i;

    for (i = 0; i < BGID_ARRAY; i++) {
        if (!ctx->io_bl)
            break;
        __io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
    }

    xa_for_each(&ctx->io_bl_xa, index, bl) {
        xa_erase(&ctx->io_bl_xa, bl->bgid);
        __io_remove_buffers(ctx, bl, -1U);
        kfree(bl);
    }

    while (!list_empty(&ctx->io_buffers_pages)) {
        struct page *page;

        page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
        list_del_init(&page->lru);
        __free_page(page);
    }
}

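/*
 * Prep for IORING_OP_REMOVE_BUFFERS: the number of buffers to remove is
 * carried in sqe->fd and the target group in sqe->buf_group; all other
 * fields must be zero.
 */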
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
    struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
    u64 tmp;

    if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
        sqe->splice_fd_in)
        return -EINVAL;

    tmp = READ_ONCE(sqe->fd);
    if (!tmp || tmp > USHRT_MAX)
        return -EINVAL;

    memset(p, 0, sizeof(*p));
    p->nbufs = tmp;
    p->bgid = READ_ONCE(sqe->buf_group);
    return 0;
}

int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
    struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
    struct io_ring_ctx *ctx = req->ctx;
    struct io_buffer_list *bl;
    int ret = 0;

    io_ring_submit_lock(ctx, issue_flags);

    ret = -ENOENT;
    bl = io_buffer_get_list(ctx, p->bgid);
    if (bl) {
        ret = -EINVAL;
        /* can't use provide/remove buffers command on mapped buffers */
        if (!bl->buf_nr_pages)
            ret = __io_remove_buffers(ctx, bl, p->nbufs);
    }
    if (ret < 0)
        req_set_fail(req);

    /* complete before unlock, IOPOLL may need the lock */
    io_req_set_res(req, ret, 0);
    __io_req_complete(req, issue_flags);
    io_ring_submit_unlock(ctx, issue_flags);
    return IOU_ISSUE_SKIP_COMPLETE;
}

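/*
 * Prep for IORING_OP_PROVIDE_BUFFERS: sqe->addr/len describe the buffer
 * area, sqe->fd the number of buffers, sqe->buf_group the group ID and
 * sqe->off the starting buffer ID. Overflow of len * nbufs and of the
 * address range is rejected up front.
 */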
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
    unsigned long size, tmp_check;
    struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
    u64 tmp;

    if (sqe->rw_flags || sqe->splice_fd_in)
        return -EINVAL;

    tmp = READ_ONCE(sqe->fd);
    if (!tmp || tmp > USHRT_MAX)
        return -E2BIG;
    p->nbufs = tmp;
    p->addr = READ_ONCE(sqe->addr);
    p->len = READ_ONCE(sqe->len);

    if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
                &size))
        return -EOVERFLOW;
    if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
        return -EOVERFLOW;

    size = (unsigned long)p->len * p->nbufs;
    if (!access_ok(u64_to_user_ptr(p->addr), size))
        return -EFAULT;

    p->bgid = READ_ONCE(sqe->buf_group);
    tmp = READ_ONCE(sqe->off);
    if (tmp > USHRT_MAX)
        return -E2BIG;
    p->bid = tmp;
    return 0;
}

static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
    struct io_buffer *buf;
    struct page *page;
    int bufs_in_page;

    /*
     * Completions that don't happen inline (eg not under uring_lock) will
     * add to ->io_buffers_comp. If we don't have any free buffers, check
     * the completion list and splice those entries first.
     */
    if (!list_empty_careful(&ctx->io_buffers_comp)) {
        spin_lock(&ctx->completion_lock);
        if (!list_empty(&ctx->io_buffers_comp)) {
            list_splice_init(&ctx->io_buffers_comp,
                        &ctx->io_buffers_cache);
            spin_unlock(&ctx->completion_lock);
            return 0;
        }
        spin_unlock(&ctx->completion_lock);
    }

    /*
     * No free buffers and no completion entries either. Allocate a new
     * page worth of buffer entries and add those to our freelist.
     */
    page = alloc_page(GFP_KERNEL_ACCOUNT);
    if (!page)
        return -ENOMEM;

    list_add(&page->lru, &ctx->io_buffers_pages);

    buf = page_address(page);
    bufs_in_page = PAGE_SIZE / sizeof(*buf);
    while (bufs_in_page) {
        list_add_tail(&buf->list, &ctx->io_buffers_cache);
        buf++;
        bufs_in_page--;
    }

    return 0;
}

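/*
 * Populate a buffer group with nbufs consecutive buffers starting at
 * pbuf->addr, assigning increasing buffer IDs from pbuf->bid. Returns 0
 * if at least one buffer was added, -ENOMEM otherwise.
 */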
static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
              struct io_buffer_list *bl)
{
    struct io_buffer *buf;
    u64 addr = pbuf->addr;
    int i, bid = pbuf->bid;

    for (i = 0; i < pbuf->nbufs; i++) {
        if (list_empty(&ctx->io_buffers_cache) &&
            io_refill_buffer_cache(ctx))
            break;
        buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
                    list);
        list_move_tail(&buf->list, &bl->buf_list);
        buf->addr = addr;
        buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
        buf->bid = bid;
        buf->bgid = pbuf->bgid;
        addr += pbuf->len;
        bid++;
        cond_resched();
    }

    return i ? 0 : -ENOMEM;
}

int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
    struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
    struct io_ring_ctx *ctx = req->ctx;
    struct io_buffer_list *bl;
    int ret = 0;

    io_ring_submit_lock(ctx, issue_flags);

    if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
        ret = io_init_bl_list(ctx);
        if (ret)
            goto err;
    }

    bl = io_buffer_get_list(ctx, p->bgid);
    if (unlikely(!bl)) {
        bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
        if (!bl) {
            ret = -ENOMEM;
            goto err;
        }
        INIT_LIST_HEAD(&bl->buf_list);
        ret = io_buffer_add_list(ctx, bl, p->bgid);
        if (ret) {
            kfree(bl);
            goto err;
        }
    }
    /* can't add buffers via this command for a mapped buffer ring */
    if (bl->buf_nr_pages) {
        ret = -EINVAL;
        goto err;
    }

    ret = io_add_buffers(ctx, p, bl);
err:
    if (ret < 0)
        req_set_fail(req);
    /* complete before unlock, IOPOLL may need the lock */
    io_req_set_res(req, ret, 0);
    __io_req_complete(req, issue_flags);
    io_ring_submit_unlock(ctx, issue_flags);
    return IOU_ISSUE_SKIP_COMPLETE;
}

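/*
 * Register a ring-mapped provided buffer ring, reached from
 * io_uring_register(2) with IORING_REGISTER_PBUF_RING. Userspace supplies
 * a struct io_uring_buf_reg with the page-aligned ring address, a
 * power-of-two entry count (< 65536) and the buffer group ID; the ring
 * pages are pinned and the group is registered under that ID. liburing
 * wraps this setup in io_uring_register_buf_ring().
 */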
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
    struct io_uring_buf_ring *br;
    struct io_uring_buf_reg reg;
    struct io_buffer_list *bl, *free_bl = NULL;
    struct page **pages;
    int nr_pages;

    if (copy_from_user(&reg, arg, sizeof(reg)))
        return -EFAULT;

    if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
        return -EINVAL;
    if (!reg.ring_addr)
        return -EFAULT;
    if (reg.ring_addr & ~PAGE_MASK)
        return -EINVAL;
    if (!is_power_of_2(reg.ring_entries))
        return -EINVAL;

    /* cannot disambiguate full vs empty due to head/tail size */
    if (reg.ring_entries >= 65536)
        return -EINVAL;

    if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
        int ret = io_init_bl_list(ctx);
        if (ret)
            return ret;
    }

    bl = io_buffer_get_list(ctx, reg.bgid);
    if (bl) {
        /* if mapped buffer ring OR classic exists, don't allow */
        if (bl->buf_nr_pages || !list_empty(&bl->buf_list))
            return -EEXIST;
    } else {
        free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
        if (!bl)
            return -ENOMEM;
    }

    pages = io_pin_pages(reg.ring_addr,
                 struct_size(br, bufs, reg.ring_entries),
                 &nr_pages);
    if (IS_ERR(pages)) {
        kfree(free_bl);
        return PTR_ERR(pages);
    }

    br = page_address(pages[0]);
    bl->buf_pages = pages;
    bl->buf_nr_pages = nr_pages;
    bl->nr_entries = reg.ring_entries;
    bl->buf_ring = br;
    bl->mask = reg.ring_entries - 1;
    io_buffer_add_list(ctx, bl, reg.bgid);
    return 0;
}

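/*
 * Unregister a ring-mapped buffer ring, reached from io_uring_register(2)
 * with IORING_UNREGISTER_PBUF_RING. Any unconsumed entries are dropped,
 * the ring pages are unpinned, and xarray-tracked groups are freed.
 */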
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
    struct io_uring_buf_reg reg;
    struct io_buffer_list *bl;

    if (copy_from_user(&reg, arg, sizeof(reg)))
        return -EFAULT;
    if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
        return -EINVAL;

    bl = io_buffer_get_list(ctx, reg.bgid);
    if (!bl)
        return -ENOENT;
    if (!bl->buf_nr_pages)
        return -EINVAL;

    __io_remove_buffers(ctx, bl, -1U);
    if (bl->bgid >= BGID_ARRAY) {
        xa_erase(&ctx->io_bl_xa, bl->bgid);
        kfree(bl);
    }
    return 0;
}