// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */

0042 #include <linux/kernel.h>
0043 #include <linux/init.h>
0044 #include <linux/errno.h>
0045 #include <linux/syscalls.h>
0046 #include <net/compat.h>
0047 #include <linux/refcount.h>
0048 #include <linux/uio.h>
0049 #include <linux/bits.h>
0050
0051 #include <linux/sched/signal.h>
0052 #include <linux/fs.h>
0053 #include <linux/file.h>
0054 #include <linux/fdtable.h>
0055 #include <linux/mm.h>
0056 #include <linux/mman.h>
0057 #include <linux/percpu.h>
0058 #include <linux/slab.h>
0059 #include <linux/bvec.h>
0060 #include <linux/net.h>
0061 #include <net/sock.h>
0062 #include <net/af_unix.h>
0063 #include <net/scm.h>
0064 #include <linux/anon_inodes.h>
0065 #include <linux/sched/mm.h>
0066 #include <linux/uaccess.h>
0067 #include <linux/nospec.h>
0068 #include <linux/highmem.h>
0069 #include <linux/fsnotify.h>
0070 #include <linux/fadvise.h>
0071 #include <linux/task_work.h>
0072 #include <linux/io_uring.h>
0073 #include <linux/audit.h>
0074 #include <linux/security.h>
0075
0076 #define CREATE_TRACE_POINTS
0077 #include <trace/events/io_uring.h>
0078
0079 #include <uapi/linux/io_uring.h>
0080
0081 #include "io-wq.h"
0082
0083 #include "io_uring.h"
0084 #include "opdef.h"
0085 #include "refs.h"
0086 #include "tctx.h"
0087 #include "sqpoll.h"
0088 #include "fdinfo.h"
0089 #include "kbuf.h"
0090 #include "rsrc.h"
0091 #include "cancel.h"
0092 #include "net.h"
0093 #include "notif.h"
0094
0095 #include "timeout.h"
0096 #include "poll.h"
0097 #include "alloc_cache.h"
0098
0099 #define IORING_MAX_ENTRIES 32768
0100 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
0101
0102 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
0103 IORING_REGISTER_LAST + IORING_OP_LAST)
0104
0105 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
0106 IOSQE_IO_HARDLINK | IOSQE_ASYNC)
0107
0108 #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
0109 IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
0110
0111 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
0112 REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
0113 REQ_F_ASYNC_DATA)
0114
0115 #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
0116 IO_REQ_CLEAN_FLAGS)
0117
0118 #define IO_TCTX_REFS_CACHE_NR (1U << 10)
0119
0120 #define IO_COMPL_BATCH 32
0121 #define IO_REQ_ALLOC_BATCH 8
0122
0123 enum {
0124 IO_CHECK_CQ_OVERFLOW_BIT,
0125 IO_CHECK_CQ_DROPPED_BIT,
0126 };
0127
0128 struct io_defer_entry {
0129 struct list_head list;
0130 struct io_kiocb *req;
0131 u32 seq;
0132 };
0133
0134
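/* requests with any of these flags set must go through io_disarm_next() */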
0135 #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
0136 #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
0137
0138 static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
0139 struct task_struct *task,
0140 bool cancel_all);
0141
0142 static void io_dismantle_req(struct io_kiocb *req);
0143 static void io_clean_op(struct io_kiocb *req);
0144 static void io_queue_sqe(struct io_kiocb *req);
0145
0146 static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
0147
0148 static struct kmem_cache *req_cachep;
0149
0150 struct sock *io_uring_get_socket(struct file *file)
0151 {
0152 #if defined(CONFIG_UNIX)
0153 if (io_is_uring_fops(file)) {
0154 struct io_ring_ctx *ctx = file->private_data;
0155
0156 return ctx->ring_sock->sk;
0157 }
0158 #endif
0159 return NULL;
0160 }
0161 EXPORT_SYMBOL(io_uring_get_socket);
0162
0163 static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
0164 {
0165 if (!wq_list_empty(&ctx->submit_state.compl_reqs))
0166 __io_submit_flush_completions(ctx);
0167 }
0168
0169 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
0170 {
0171 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
0172 }
0173
0174 static bool io_match_linked(struct io_kiocb *head)
0175 {
0176 struct io_kiocb *req;
0177
0178 io_for_each_link(req, head) {
0179 if (req->flags & REQ_F_INFLIGHT)
0180 return true;
0181 }
0182 return false;
0183 }
0184
0185
0186
0187
0188
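/*
 * Decide whether @head should be cancelled on behalf of @task: it must
 * belong to @task (if one is given) and, unless @cancel_all, carry at least
 * one request marked REQ_F_INFLIGHT in its link chain. The chain is walked
 * under ->timeout_lock when a linked timeout could be racing with us.
 */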
0189 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
0190 bool cancel_all)
0191 {
0192 bool matched;
0193
0194 if (task && head->task != task)
0195 return false;
0196 if (cancel_all)
0197 return true;
0198
0199 if (head->flags & REQ_F_LINK_TIMEOUT) {
0200 struct io_ring_ctx *ctx = head->ctx;
0201
0202
0203 spin_lock_irq(&ctx->timeout_lock);
0204 matched = io_match_linked(head);
0205 spin_unlock_irq(&ctx->timeout_lock);
0206 } else {
0207 matched = io_match_linked(head);
0208 }
0209 return matched;
0210 }
0211
0212 static inline void req_fail_link_node(struct io_kiocb *req, int res)
0213 {
0214 req_set_fail(req);
0215 io_req_set_res(req, res, 0);
0216 }
0217
0218 static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
0219 {
0220 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
0221 }
0222
0223 static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
0224 {
0225 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
0226
0227 complete(&ctx->ref_comp);
0228 }
0229
0230 static __cold void io_fallback_req_func(struct work_struct *work)
0231 {
0232 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
0233 fallback_work.work);
0234 struct llist_node *node = llist_del_all(&ctx->fallback_llist);
0235 struct io_kiocb *req, *tmp;
0236 bool locked = false;
0237
0238 percpu_ref_get(&ctx->refs);
0239 llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
0240 req->io_task_work.func(req, &locked);
0241
0242 if (locked) {
0243 io_submit_flush_completions(ctx);
0244 mutex_unlock(&ctx->uring_lock);
0245 }
0246 percpu_ref_put(&ctx->refs);
0247 }
0248
0249 static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
0250 {
0251 unsigned hash_buckets = 1U << bits;
0252 size_t hash_size = hash_buckets * sizeof(table->hbs[0]);
0253
0254 table->hbs = kmalloc(hash_size, GFP_KERNEL);
0255 if (!table->hbs)
0256 return -ENOMEM;
0257
0258 table->hash_bits = bits;
0259 init_hash_table(table, hash_buckets);
0260 return 0;
0261 }
0262
0263 static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
0264 {
0265 struct io_ring_ctx *ctx;
0266 int hash_bits;
0267
0268 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
0269 if (!ctx)
0270 return NULL;
0271
0272 xa_init(&ctx->io_bl_xa);
0273
0274
0275
0276
0277
0278
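	/*
	 * Use 5 bits less than the max CQ entries: with a full and uniformly
	 * spread table that gives roughly 32 entries per hash bucket, while
	 * not overconsuming memory on large rings (hash_bits is clamped to
	 * the range [1, 8]).
	 */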
0279 hash_bits = ilog2(p->cq_entries) - 5;
0280 hash_bits = clamp(hash_bits, 1, 8);
0281 if (io_alloc_hash_table(&ctx->cancel_table, hash_bits))
0282 goto err;
0283 if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits))
0284 goto err;
0285
0286 ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
0287 if (!ctx->dummy_ubuf)
0288 goto err;
0289
0290 ctx->dummy_ubuf->ubuf = -1UL;
0291
0292 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
0293 0, GFP_KERNEL))
0294 goto err;
0295
0296 ctx->flags = p->flags;
0297 init_waitqueue_head(&ctx->sqo_sq_wait);
0298 INIT_LIST_HEAD(&ctx->sqd_list);
0299 INIT_LIST_HEAD(&ctx->cq_overflow_list);
0300 INIT_LIST_HEAD(&ctx->io_buffers_cache);
0301 io_alloc_cache_init(&ctx->apoll_cache);
0302 io_alloc_cache_init(&ctx->netmsg_cache);
0303 init_completion(&ctx->ref_comp);
0304 xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
0305 mutex_init(&ctx->uring_lock);
0306 init_waitqueue_head(&ctx->cq_wait);
0307 spin_lock_init(&ctx->completion_lock);
0308 spin_lock_init(&ctx->timeout_lock);
0309 INIT_WQ_LIST(&ctx->iopoll_list);
0310 INIT_LIST_HEAD(&ctx->io_buffers_pages);
0311 INIT_LIST_HEAD(&ctx->io_buffers_comp);
0312 INIT_LIST_HEAD(&ctx->defer_list);
0313 INIT_LIST_HEAD(&ctx->timeout_list);
0314 INIT_LIST_HEAD(&ctx->ltimeout_list);
0315 spin_lock_init(&ctx->rsrc_ref_lock);
0316 INIT_LIST_HEAD(&ctx->rsrc_ref_list);
0317 INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
0318 init_llist_head(&ctx->rsrc_put_llist);
0319 INIT_LIST_HEAD(&ctx->tctx_list);
0320 ctx->submit_state.free_list.next = NULL;
0321 INIT_WQ_LIST(&ctx->locked_free_list);
0322 INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
0323 INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
0324 return ctx;
0325 err:
0326 kfree(ctx->dummy_ubuf);
0327 kfree(ctx->cancel_table.hbs);
0328 kfree(ctx->cancel_table_locked.hbs);
0329 kfree(ctx->io_bl);
0330 xa_destroy(&ctx->io_bl_xa);
0331 kfree(ctx);
0332 return NULL;
0333 }
0334
0335 static void io_account_cq_overflow(struct io_ring_ctx *ctx)
0336 {
0337 struct io_rings *r = ctx->rings;
0338
0339 WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
0340 ctx->cq_extra--;
0341 }
0342
0343 static bool req_need_defer(struct io_kiocb *req, u32 seq)
0344 {
0345 if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
0346 struct io_ring_ctx *ctx = req->ctx;
0347
0348 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
0349 }
0350
0351 return false;
0352 }
0353
0354 static inline void io_req_track_inflight(struct io_kiocb *req)
0355 {
0356 if (!(req->flags & REQ_F_INFLIGHT)) {
0357 req->flags |= REQ_F_INFLIGHT;
0358 atomic_inc(&req->task->io_uring->inflight_tracked);
0359 }
0360 }
0361
0362 static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
0363 {
0364 if (WARN_ON_ONCE(!req->link))
0365 return NULL;
0366
0367 req->flags &= ~REQ_F_ARM_LTIMEOUT;
0368 req->flags |= REQ_F_LINK_TIMEOUT;
0369
0370
0371 io_req_set_refcount(req);
0372 __io_req_set_refcount(req->link, 2);
0373 return req->link;
0374 }
0375
0376 static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
0377 {
0378 if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
0379 return NULL;
0380 return __io_prep_linked_timeout(req);
0381 }
0382
0383 static noinline void __io_arm_ltimeout(struct io_kiocb *req)
0384 {
0385 io_queue_linked_timeout(__io_prep_linked_timeout(req));
0386 }
0387
0388 static inline void io_arm_ltimeout(struct io_kiocb *req)
0389 {
0390 if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
0391 __io_arm_ltimeout(req);
0392 }
0393
0394 static void io_prep_async_work(struct io_kiocb *req)
0395 {
0396 const struct io_op_def *def = &io_op_defs[req->opcode];
0397 struct io_ring_ctx *ctx = req->ctx;
0398
0399 if (!(req->flags & REQ_F_CREDS)) {
0400 req->flags |= REQ_F_CREDS;
0401 req->creds = get_current_cred();
0402 }
0403
0404 req->work.list.next = NULL;
0405 req->work.flags = 0;
0406 req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
0407 if (req->flags & REQ_F_FORCE_ASYNC)
0408 req->work.flags |= IO_WQ_WORK_CONCURRENT;
0409
0410 if (req->file && !io_req_ffs_set(req))
0411 req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT;
0412
0413 if (req->flags & REQ_F_ISREG) {
0414 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
0415 io_wq_hash_work(&req->work, file_inode(req->file));
0416 } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
0417 if (def->unbound_nonreg_file)
0418 req->work.flags |= IO_WQ_WORK_UNBOUND;
0419 }
0420 }
0421
0422 static void io_prep_async_link(struct io_kiocb *req)
0423 {
0424 struct io_kiocb *cur;
0425
0426 if (req->flags & REQ_F_LINK_TIMEOUT) {
0427 struct io_ring_ctx *ctx = req->ctx;
0428
0429 spin_lock_irq(&ctx->timeout_lock);
0430 io_for_each_link(cur, req)
0431 io_prep_async_work(cur);
0432 spin_unlock_irq(&ctx->timeout_lock);
0433 } else {
0434 io_for_each_link(cur, req)
0435 io_prep_async_work(cur);
0436 }
0437 }
0438
0439 void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
0440 {
0441 struct io_kiocb *link = io_prep_linked_timeout(req);
0442 struct io_uring_task *tctx = req->task->io_uring;
0443
0444 BUG_ON(!tctx);
0445 BUG_ON(!tctx->io_wq);
0446
0447
0448 io_prep_async_link(req);
0449
0450
0451
0452
0453
0454
0455
0456
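	/*
	 * Queueing from a task outside the request owner's thread group is
	 * not expected to happen; if it does, mark the work cancelled so
	 * io-wq runs its cancellation path instead of executing the request.
	 */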
0457 if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
0458 req->work.flags |= IO_WQ_WORK_CANCEL;
0459
0460 trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
0461 io_wq_enqueue(tctx->io_wq, &req->work);
0462 if (link)
0463 io_queue_linked_timeout(link);
0464 }
0465
0466 static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
0467 {
0468 while (!list_empty(&ctx->defer_list)) {
0469 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
0470 struct io_defer_entry, list);
0471
0472 if (req_need_defer(de->req, de->seq))
0473 break;
0474 list_del_init(&de->list);
0475 io_req_task_queue(de->req);
0476 kfree(de);
0477 }
0478 }
0479
0480 static void io_eventfd_signal(struct io_ring_ctx *ctx)
0481 {
0482 struct io_ev_fd *ev_fd;
0483 bool skip;
0484
0485 spin_lock(&ctx->completion_lock);
0486
0487
0488
0489
0490
0491
0492
0493
0494 skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
0495 ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
0496 spin_unlock(&ctx->completion_lock);
0497 if (skip)
0498 return;
0499
0500 rcu_read_lock();
0501
0502
0503
0504
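	/*
	 * Dereference ctx->io_ev_fd once under rcu_read_lock() and use that
	 * value for both the checks below and the final eventfd_signal();
	 * unregistration may free it via RCU at any time.
	 */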
0505 ev_fd = rcu_dereference(ctx->io_ev_fd);
0506
0507
0508
0509
0510
0511
0512 if (unlikely(!ev_fd))
0513 goto out;
0514 if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
0515 goto out;
0516
0517 if (!ev_fd->eventfd_async || io_wq_current_is_worker())
0518 eventfd_signal(ev_fd->cq_ev_fd, 1);
0519 out:
0520 rcu_read_unlock();
0521 }
0522
0523 void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
0524 {
0525 if (ctx->off_timeout_used || ctx->drain_active) {
0526 spin_lock(&ctx->completion_lock);
0527 if (ctx->off_timeout_used)
0528 io_flush_timeouts(ctx);
0529 if (ctx->drain_active)
0530 io_queue_deferred(ctx);
0531 spin_unlock(&ctx->completion_lock);
0532 }
0533 if (ctx->has_evfd)
0534 io_eventfd_signal(ctx);
0535 }
0536
0537 static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
0538 {
0539 io_commit_cqring_flush(ctx);
0540 io_cqring_wake(ctx);
0541 }
0542
0543 static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
0544 __releases(ctx->completion_lock)
0545 {
0546 io_commit_cqring(ctx);
0547 spin_unlock(&ctx->completion_lock);
0548 io_cqring_ev_posted(ctx);
0549 }
0550
0551 void io_cq_unlock_post(struct io_ring_ctx *ctx)
0552 {
0553 __io_cq_unlock_post(ctx);
0554 }
0555
0556
0557 static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
0558 {
0559 bool all_flushed;
0560 size_t cqe_size = sizeof(struct io_uring_cqe);
0561
0562 if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
0563 return false;
0564
0565 if (ctx->flags & IORING_SETUP_CQE32)
0566 cqe_size <<= 1;
0567
0568 io_cq_lock(ctx);
0569 while (!list_empty(&ctx->cq_overflow_list)) {
0570 struct io_uring_cqe *cqe = io_get_cqe(ctx);
0571 struct io_overflow_cqe *ocqe;
0572
0573 if (!cqe && !force)
0574 break;
0575 ocqe = list_first_entry(&ctx->cq_overflow_list,
0576 struct io_overflow_cqe, list);
0577 if (cqe)
0578 memcpy(cqe, &ocqe->cqe, cqe_size);
0579 else
0580 io_account_cq_overflow(ctx);
0581
0582 list_del(&ocqe->list);
0583 kfree(ocqe);
0584 }
0585
0586 all_flushed = list_empty(&ctx->cq_overflow_list);
0587 if (all_flushed) {
0588 clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
0589 atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
0590 }
0591
0592 io_cq_unlock_post(ctx);
0593 return all_flushed;
0594 }
0595
0596 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
0597 {
0598 bool ret = true;
0599
0600 if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
0601
0602 if (ctx->flags & IORING_SETUP_IOPOLL)
0603 mutex_lock(&ctx->uring_lock);
0604 ret = __io_cqring_overflow_flush(ctx, false);
0605 if (ctx->flags & IORING_SETUP_IOPOLL)
0606 mutex_unlock(&ctx->uring_lock);
0607 }
0608
0609 return ret;
0610 }
0611
0612 void __io_put_task(struct task_struct *task, int nr)
0613 {
0614 struct io_uring_task *tctx = task->io_uring;
0615
0616 percpu_counter_sub(&tctx->inflight, nr);
0617 if (unlikely(atomic_read(&tctx->in_idle)))
0618 wake_up(&tctx->wait);
0619 put_task_struct_many(task, nr);
0620 }
0621
0622 void io_task_refs_refill(struct io_uring_task *tctx)
0623 {
0624 unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
0625
0626 percpu_counter_add(&tctx->inflight, refill);
	refcount_add(refill, &current->usage);
0628 tctx->cached_refs += refill;
0629 }
0630
0631 static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
0632 {
0633 struct io_uring_task *tctx = task->io_uring;
0634 unsigned int refs = tctx->cached_refs;
0635
0636 if (refs) {
0637 tctx->cached_refs = 0;
0638 percpu_counter_sub(&tctx->inflight, refs);
0639 put_task_struct_many(task, refs);
0640 }
0641 }
0642
0643 static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
0644 s32 res, u32 cflags, u64 extra1, u64 extra2)
0645 {
0646 struct io_overflow_cqe *ocqe;
0647 size_t ocq_size = sizeof(struct io_overflow_cqe);
0648 bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
0649
0650 if (is_cqe32)
0651 ocq_size += sizeof(struct io_uring_cqe);
0652
0653 ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
0654 trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
0655 if (!ocqe) {
0656
0657
0658
0659
0660
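		/*
		 * Allocation of the overflow entry failed, so the CQE has to
		 * be dropped: account the overflow and flag the ring so the
		 * loss is visible to waiters.
		 */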
0661 io_account_cq_overflow(ctx);
0662 set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
0663 return false;
0664 }
0665 if (list_empty(&ctx->cq_overflow_list)) {
0666 set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
0667 atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
0668
0669 }
0670 ocqe->cqe.user_data = user_data;
0671 ocqe->cqe.res = res;
0672 ocqe->cqe.flags = cflags;
0673 if (is_cqe32) {
0674 ocqe->cqe.big_cqe[0] = extra1;
0675 ocqe->cqe.big_cqe[1] = extra2;
0676 }
0677 list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
0678 return true;
0679 }
0680
0681 bool io_req_cqe_overflow(struct io_kiocb *req)
0682 {
0683 if (!(req->flags & REQ_F_CQE32_INIT)) {
0684 req->extra1 = 0;
0685 req->extra2 = 0;
0686 }
0687 return io_cqring_event_overflow(req->ctx, req->cqe.user_data,
0688 req->cqe.res, req->cqe.flags,
0689 req->extra1, req->extra2);
0690 }
0691
0692
0693
0694
0695
0696
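/*
 * Hand out the next free CQE slot and bump the cached tail. A contiguous
 * range of free entries is cached in ctx->cqe_cached/cqe_sentinel so that
 * subsequent allocations can be served without recomputing the ring state.
 */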
0697 struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
0698 {
0699 struct io_rings *rings = ctx->rings;
0700 unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
0701 unsigned int free, queued, len;
0702
0703
0704
0705 queued = min(__io_cqring_events(ctx), ctx->cq_entries);
0706 free = ctx->cq_entries - queued;
0707
0708 len = min(free, ctx->cq_entries - off);
0709 if (!len)
0710 return NULL;
0711
0712 if (ctx->flags & IORING_SETUP_CQE32) {
0713 off <<= 1;
0714 len <<= 1;
0715 }
0716
0717 ctx->cqe_cached = &rings->cqes[off];
0718 ctx->cqe_sentinel = ctx->cqe_cached + len;
0719
0720 ctx->cached_cq_tail++;
0721 ctx->cqe_cached++;
0722 if (ctx->flags & IORING_SETUP_CQE32)
0723 ctx->cqe_cached++;
0724 return &rings->cqes[off];
0725 }
0726
0727 bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
0728 bool allow_overflow)
0729 {
0730 struct io_uring_cqe *cqe;
0731
0732 ctx->cq_extra++;
0733
0734
0735
0736
0737
0738
0739 cqe = io_get_cqe(ctx);
0740 if (likely(cqe)) {
0741 trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
0742
0743 WRITE_ONCE(cqe->user_data, user_data);
0744 WRITE_ONCE(cqe->res, res);
0745 WRITE_ONCE(cqe->flags, cflags);
0746
0747 if (ctx->flags & IORING_SETUP_CQE32) {
0748 WRITE_ONCE(cqe->big_cqe[0], 0);
0749 WRITE_ONCE(cqe->big_cqe[1], 0);
0750 }
0751 return true;
0752 }
0753
0754 if (allow_overflow)
0755 return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
0756
0757 return false;
0758 }
0759
0760 bool io_post_aux_cqe(struct io_ring_ctx *ctx,
0761 u64 user_data, s32 res, u32 cflags,
0762 bool allow_overflow)
0763 {
0764 bool filled;
0765
0766 io_cq_lock(ctx);
0767 filled = io_fill_cqe_aux(ctx, user_data, res, cflags, allow_overflow);
0768 io_cq_unlock_post(ctx);
0769 return filled;
0770 }
0771
0772 static void __io_req_complete_put(struct io_kiocb *req)
0773 {
0774
0775
0776
0777
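	/*
	 * If this is the last reference to the request, tear it down and
	 * park it on the locked free list for later reuse.
	 */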
0778 if (req_ref_put_and_test(req)) {
0779 struct io_ring_ctx *ctx = req->ctx;
0780
0781 if (req->flags & IO_REQ_LINK_FLAGS) {
0782 if (req->flags & IO_DISARM_MASK)
0783 io_disarm_next(req);
0784 if (req->link) {
0785 io_req_task_queue(req->link);
0786 req->link = NULL;
0787 }
0788 }
0789 io_req_put_rsrc(req);
0790
0791
0792
0793
0794
0795 io_put_kbuf_comp(req);
0796 io_dismantle_req(req);
0797 io_put_task(req->task, 1);
0798 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
0799 ctx->locked_free_nr++;
0800 }
0801 }
0802
0803 void __io_req_complete_post(struct io_kiocb *req)
0804 {
0805 if (!(req->flags & REQ_F_CQE_SKIP))
0806 __io_fill_cqe_req(req->ctx, req);
0807 __io_req_complete_put(req);
0808 }
0809
0810 void io_req_complete_post(struct io_kiocb *req)
0811 {
0812 struct io_ring_ctx *ctx = req->ctx;
0813
0814 io_cq_lock(ctx);
0815 __io_req_complete_post(req);
0816 io_cq_unlock_post(ctx);
0817 }
0818
0819 inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags)
0820 {
0821 io_req_complete_post(req);
0822 }
0823
0824 void io_req_complete_failed(struct io_kiocb *req, s32 res)
0825 {
0826 req_set_fail(req);
0827 io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
0828 io_req_complete_post(req);
0829 }
0830
0831
0832
0833
0834
0835 static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
0836 {
0837 req->ctx = ctx;
0838 req->link = NULL;
0839 req->async_data = NULL;
0840
0841 req->cqe.res = 0;
0842 }
0843
0844 static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
0845 struct io_submit_state *state)
0846 {
0847 spin_lock(&ctx->completion_lock);
0848 wq_list_splice(&ctx->locked_free_list, &state->free_list);
0849 ctx->locked_free_nr = 0;
0850 spin_unlock(&ctx->completion_lock);
0851 }
0852
0853
0854
0855
0856
0857
0858
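/*
 * Refill the request cache: first reclaim requests parked on the locked
 * free list, and only then fall back to bulk allocation from the slab
 * cache. Must be called with ->uring_lock held.
 */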
0859 __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
0860 __must_hold(&ctx->uring_lock)
0861 {
0862 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
0863 void *reqs[IO_REQ_ALLOC_BATCH];
0864 int ret, i;
0865
0866
0867
0868
0869
0870
0871 if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
0872 io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
0873 if (!io_req_cache_empty(ctx))
0874 return true;
0875 }
0876
0877 ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
0878
0879
0880
0881
0882
0883 if (unlikely(ret <= 0)) {
0884 reqs[0] = kmem_cache_alloc(req_cachep, gfp);
0885 if (!reqs[0])
0886 return false;
0887 ret = 1;
0888 }
0889
0890 percpu_ref_get_many(&ctx->refs, ret);
0891 for (i = 0; i < ret; i++) {
0892 struct io_kiocb *req = reqs[i];
0893
0894 io_preinit_req(req, ctx);
0895 io_req_add_to_cache(req, ctx);
0896 }
0897 return true;
0898 }
0899
0900 static inline void io_dismantle_req(struct io_kiocb *req)
0901 {
0902 unsigned int flags = req->flags;
0903
0904 if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
0905 io_clean_op(req);
0906 if (!(flags & REQ_F_FIXED_FILE))
0907 io_put_file(req->file);
0908 }
0909
0910 __cold void io_free_req(struct io_kiocb *req)
0911 {
0912 struct io_ring_ctx *ctx = req->ctx;
0913
0914 io_req_put_rsrc(req);
0915 io_dismantle_req(req);
0916 io_put_task(req->task, 1);
0917
0918 spin_lock(&ctx->completion_lock);
0919 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
0920 ctx->locked_free_nr++;
0921 spin_unlock(&ctx->completion_lock);
0922 }
0923
0924 static void __io_req_find_next_prep(struct io_kiocb *req)
0925 {
0926 struct io_ring_ctx *ctx = req->ctx;
0927
0928 io_cq_lock(ctx);
0929 io_disarm_next(req);
0930 io_cq_unlock_post(ctx);
0931 }
0932
0933 static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
0934 {
0935 struct io_kiocb *nxt;
0936
0937
0938
0939
0940
0941
0942
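	/*
	 * If the request failed or has timeouts/links that need disarming,
	 * take the slow path under the completion lock first; otherwise just
	 * hand back the next request in the link chain.
	 */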
0943 if (unlikely(req->flags & IO_DISARM_MASK))
0944 __io_req_find_next_prep(req);
0945 nxt = req->link;
0946 req->link = NULL;
0947 return nxt;
0948 }
0949
0950 static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
0951 {
0952 if (!ctx)
0953 return;
0954 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
0955 atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
0956 if (*locked) {
0957 io_submit_flush_completions(ctx);
0958 mutex_unlock(&ctx->uring_lock);
0959 *locked = false;
0960 }
0961 percpu_ref_put(&ctx->refs);
0962 }
0963
0964 static unsigned int handle_tw_list(struct llist_node *node,
0965 struct io_ring_ctx **ctx, bool *locked,
0966 struct llist_node *last)
0967 {
0968 unsigned int count = 0;
0969
0970 while (node != last) {
0971 struct llist_node *next = node->next;
0972 struct io_kiocb *req = container_of(node, struct io_kiocb,
0973 io_task_work.node);
0974
0975 prefetch(container_of(next, struct io_kiocb, io_task_work.node));
0976
0977 if (req->ctx != *ctx) {
0978 ctx_flush_and_put(*ctx, locked);
0979 *ctx = req->ctx;
0980
0981 *locked = mutex_trylock(&(*ctx)->uring_lock);
0982 percpu_ref_get(&(*ctx)->refs);
0983 }
0984 req->io_task_work.func(req, locked);
0985 node = next;
0986 count++;
0987 }
0988
0989 return count;
0990 }
0991
0992
0993
0994
0995
0996
0997
0998
0999
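/*
 * io_llist_xchg - swap all entries in a lock-less list
 * @head:	the head of lock-less list to delete all entries
 * @new:	new entry as the head of the list
 *
 * Returns the previous first entry, or NULL if the list was empty. Entries
 * come back ordered newest to oldest.
 */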
1000 static inline struct llist_node *io_llist_xchg(struct llist_head *head,
1001 struct llist_node *new)
1002 {
1003 return xchg(&head->first, new);
1004 }
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
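/*
 * io_llist_cmpxchg - possibly swap all entries in a lock-less list
 * @head:	the head of lock-less list to delete all entries
 * @old:	expected old value of the first entry of the list
 * @new:	new entry as the head of the list
 *
 * Only performs the swap if the first entry is still @old; returns the
 * first entry that was observed either way.
 */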
1015 static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
1016 struct llist_node *old,
1017 struct llist_node *new)
1018 {
1019 return cmpxchg(&head->first, old, new);
1020 }
1021
1022 void tctx_task_work(struct callback_head *cb)
1023 {
1024 bool uring_locked = false;
1025 struct io_ring_ctx *ctx = NULL;
1026 struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
1027 task_work);
1028 struct llist_node fake = {};
1029 struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake);
1030 unsigned int loops = 1;
1031 unsigned int count = handle_tw_list(node, &ctx, &uring_locked, NULL);
1032
1033 node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
1034 while (node != &fake) {
1035 loops++;
1036 node = io_llist_xchg(&tctx->task_list, &fake);
1037 count += handle_tw_list(node, &ctx, &uring_locked, &fake);
1038 node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
1039 }
1040
1041 ctx_flush_and_put(ctx, &uring_locked);
1042
1043
1044 if (unlikely(atomic_read(&tctx->in_idle)))
1045 io_uring_drop_tctx_refs(current);
1046
1047 trace_io_uring_task_work_run(tctx, count, loops);
1048 }
1049
1050 void io_req_task_work_add(struct io_kiocb *req)
1051 {
1052 struct io_uring_task *tctx = req->task->io_uring;
1053 struct io_ring_ctx *ctx = req->ctx;
1054 struct llist_node *node;
1055 bool running;
1056
1057 running = !llist_add(&req->io_task_work.node, &tctx->task_list);
1058
1059
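	/* task_work was already pending for this task, nothing more to do */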
1060 if (running)
1061 return;
1062
1063 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
1064 atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
1065
1066 if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
1067 return;
1068
1069 node = llist_del_all(&tctx->task_list);
1070
1071 while (node) {
1072 req = container_of(node, struct io_kiocb, io_task_work.node);
1073 node = node->next;
1074 if (llist_add(&req->io_task_work.node,
1075 &req->ctx->fallback_llist))
1076 schedule_delayed_work(&req->ctx->fallback_work, 1);
1077 }
1078 }
1079
1080 static void io_req_tw_post(struct io_kiocb *req, bool *locked)
1081 {
1082 io_req_complete_post(req);
1083 }
1084
1085 void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags)
1086 {
1087 io_req_set_res(req, res, cflags);
1088 req->io_task_work.func = io_req_tw_post;
1089 io_req_task_work_add(req);
1090 }
1091
1092 static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
1093 {
1094
1095 io_tw_lock(req->ctx, locked);
1096 io_req_complete_failed(req, req->cqe.res);
1097 }
1098
1099 void io_req_task_submit(struct io_kiocb *req, bool *locked)
1100 {
1101 io_tw_lock(req->ctx, locked);
1102
1103 if (likely(!(req->task->flags & PF_EXITING)))
1104 io_queue_sqe(req);
1105 else
1106 io_req_complete_failed(req, -EFAULT);
1107 }
1108
1109 void io_req_task_queue_fail(struct io_kiocb *req, int ret)
1110 {
1111 io_req_set_res(req, ret, 0);
1112 req->io_task_work.func = io_req_task_cancel;
1113 io_req_task_work_add(req);
1114 }
1115
1116 void io_req_task_queue(struct io_kiocb *req)
1117 {
1118 req->io_task_work.func = io_req_task_submit;
1119 io_req_task_work_add(req);
1120 }
1121
1122 void io_queue_next(struct io_kiocb *req)
1123 {
1124 struct io_kiocb *nxt = io_req_find_next(req);
1125
1126 if (nxt)
1127 io_req_task_queue(nxt);
1128 }
1129
1130 void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
1131 __must_hold(&ctx->uring_lock)
1132 {
1133 struct task_struct *task = NULL;
1134 int task_refs = 0;
1135
1136 do {
1137 struct io_kiocb *req = container_of(node, struct io_kiocb,
1138 comp_list);
1139
1140 if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
1141 if (req->flags & REQ_F_REFCOUNT) {
1142 node = req->comp_list.next;
1143 if (!req_ref_put_and_test(req))
1144 continue;
1145 }
1146 if ((req->flags & REQ_F_POLLED) && req->apoll) {
1147 struct async_poll *apoll = req->apoll;
1148
1149 if (apoll->double_poll)
1150 kfree(apoll->double_poll);
1151 if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache))
1152 kfree(apoll);
1153 req->flags &= ~REQ_F_POLLED;
1154 }
1155 if (req->flags & IO_REQ_LINK_FLAGS)
1156 io_queue_next(req);
1157 if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
1158 io_clean_op(req);
1159 }
1160 if (!(req->flags & REQ_F_FIXED_FILE))
1161 io_put_file(req->file);
1162
1163 io_req_put_rsrc_locked(req, ctx);
1164
1165 if (req->task != task) {
1166 if (task)
1167 io_put_task(task, task_refs);
1168 task = req->task;
1169 task_refs = 0;
1170 }
1171 task_refs++;
1172 node = req->comp_list.next;
1173 io_req_add_to_cache(req, ctx);
1174 } while (node);
1175
1176 if (task)
1177 io_put_task(task, task_refs);
1178 }
1179
1180 static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
1181 __must_hold(&ctx->uring_lock)
1182 {
1183 struct io_wq_work_node *node, *prev;
1184 struct io_submit_state *state = &ctx->submit_state;
1185
1186 spin_lock(&ctx->completion_lock);
1187 wq_list_for_each(node, prev, &state->compl_reqs) {
1188 struct io_kiocb *req = container_of(node, struct io_kiocb,
1189 comp_list);
1190
1191 if (!(req->flags & REQ_F_CQE_SKIP))
1192 __io_fill_cqe_req(ctx, req);
1193 }
1194 __io_cq_unlock_post(ctx);
1195
1196 io_free_batch_list(ctx, state->compl_reqs.first);
1197 INIT_WQ_LIST(&state->compl_reqs);
1198 }
1199
1200
1201
1202
1203
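/*
 * Drop a reference to the request; if it was the last one, free the request
 * and return the next request in the link chain (if any).
 */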
1204 static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
1205 {
1206 struct io_kiocb *nxt = NULL;
1207
1208 if (req_ref_put_and_test(req)) {
1209 if (unlikely(req->flags & IO_REQ_LINK_FLAGS))
1210 nxt = io_req_find_next(req);
1211 io_free_req(req);
1212 }
1213 return nxt;
1214 }
1215
1216 static unsigned io_cqring_events(struct io_ring_ctx *ctx)
1217 {
1218
1219 smp_rmb();
1220 return __io_cqring_events(ctx);
1221 }
1222
1223
1224
1225
1226
1227 static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
1228 {
1229 if (!(ctx->flags & IORING_SETUP_IOPOLL))
1230 return;
1231
1232 mutex_lock(&ctx->uring_lock);
1233 while (!wq_list_empty(&ctx->iopoll_list)) {
1234
1235 if (io_do_iopoll(ctx, true) == 0)
1236 break;
1237
1238
1239
1240
1241
1242 if (need_resched()) {
1243 mutex_unlock(&ctx->uring_lock);
1244 cond_resched();
1245 mutex_lock(&ctx->uring_lock);
1246 }
1247 }
1248 mutex_unlock(&ctx->uring_lock);
1249 }
1250
1251 static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
1252 {
1253 unsigned int nr_events = 0;
1254 int ret = 0;
1255 unsigned long check_cq;
1256
1257 check_cq = READ_ONCE(ctx->check_cq);
1258 if (unlikely(check_cq)) {
1259 if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
1260 __io_cqring_overflow_flush(ctx, false);
1261
1262
1263
1264
1265 if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
1266 return -EBADR;
1267 }
1268
1269
1270
1271
1272
1273 if (io_cqring_events(ctx))
1274 return 0;
1275
1276 do {
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
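		/*
		 * If the iopoll list is empty, a punted submission may not
		 * have had a chance to add its request yet: drop the lock,
		 * run task work, and only keep polling if new work showed up.
		 */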
1287 if (wq_list_empty(&ctx->iopoll_list)) {
1288 u32 tail = ctx->cached_cq_tail;
1289
1290 mutex_unlock(&ctx->uring_lock);
1291 io_run_task_work();
1292 mutex_lock(&ctx->uring_lock);
1293
1294
1295 if (tail != ctx->cached_cq_tail ||
1296 wq_list_empty(&ctx->iopoll_list))
1297 break;
1298 }
1299 ret = io_do_iopoll(ctx, !min);
1300 if (ret < 0)
1301 break;
1302 nr_events += ret;
1303 ret = 0;
1304 } while (nr_events < min && !need_resched());
1305
1306 return ret;
1307 }
1308
1309 void io_req_task_complete(struct io_kiocb *req, bool *locked)
1310 {
1311 if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
1312 unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED;
1313
1314 req->cqe.flags |= io_put_kbuf(req, issue_flags);
1315 }
1316
1317 if (*locked)
1318 io_req_complete_defer(req);
1319 else
1320 io_req_complete_post(req);
1321 }
1322
1323
1324
1325
1326
1327
1328
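/*
 * After the iocb has been issued, it's safe to be found on the poll list.
 * Adding it only after submission ensures that an io_do_iopoll() caller
 * can't see the request while the issuer is still working on it.
 */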
1329 static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
1330 {
1331 struct io_ring_ctx *ctx = req->ctx;
1332 const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
1333
1334
1335 if (unlikely(needs_lock))
1336 mutex_lock(&ctx->uring_lock);
1337
1338
1339
1340
1341
1342
1343 if (wq_list_empty(&ctx->iopoll_list)) {
1344 ctx->poll_multi_queue = false;
1345 } else if (!ctx->poll_multi_queue) {
1346 struct io_kiocb *list_req;
1347
1348 list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
1349 comp_list);
1350 if (list_req->file != req->file)
1351 ctx->poll_multi_queue = true;
1352 }
1353
1354
1355
1356
1357
1358 if (READ_ONCE(req->iopoll_completed))
1359 wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
1360 else
1361 wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
1362
1363 if (unlikely(needs_lock)) {
1364
1365
1366
1367
1368
1369
1370 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
1371 wq_has_sleeper(&ctx->sq_data->wait))
1372 wake_up(&ctx->sq_data->wait);
1373
1374 mutex_unlock(&ctx->uring_lock);
1375 }
1376 }
1377
1378 static bool io_bdev_nowait(struct block_device *bdev)
1379 {
1380 return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
1381 }
1382
1383
1384
1385
1386
1387
1388 static bool __io_file_supports_nowait(struct file *file, umode_t mode)
1389 {
1390 if (S_ISBLK(mode)) {
1391 if (IS_ENABLED(CONFIG_BLOCK) &&
1392 io_bdev_nowait(I_BDEV(file->f_mapping->host)))
1393 return true;
1394 return false;
1395 }
1396 if (S_ISSOCK(mode))
1397 return true;
1398 if (S_ISREG(mode)) {
1399 if (IS_ENABLED(CONFIG_BLOCK) &&
1400 io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
1401 !io_is_uring_fops(file))
1402 return true;
1403 return false;
1404 }
1405
1406
1407 if (file->f_flags & O_NONBLOCK)
1408 return true;
1409 return file->f_mode & FMODE_NOWAIT;
1410 }
1411
1412
1413
1414
1415
1416
1417 unsigned int io_file_get_flags(struct file *file)
1418 {
1419 umode_t mode = file_inode(file)->i_mode;
1420 unsigned int res = 0;
1421
1422 if (S_ISREG(mode))
1423 res |= FFS_ISREG;
1424 if (__io_file_supports_nowait(file, mode))
1425 res |= FFS_NOWAIT;
1426 if (io_file_need_scm(file))
1427 res |= FFS_SCM;
1428 return res;
1429 }
1430
1431 bool io_alloc_async_data(struct io_kiocb *req)
1432 {
1433 WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
1434 req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
1435 if (req->async_data) {
1436 req->flags |= REQ_F_ASYNC_DATA;
1437 return false;
1438 }
1439 return true;
1440 }
1441
1442 int io_req_prep_async(struct io_kiocb *req)
1443 {
1444 const struct io_op_def *def = &io_op_defs[req->opcode];
1445
1446
1447 if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
1448 req->file = io_file_get_normal(req, req->cqe.fd);
1449 if (!def->prep_async)
1450 return 0;
1451 if (WARN_ON_ONCE(req_has_async_data(req)))
1452 return -EFAULT;
1453 if (!io_op_defs[req->opcode].manual_alloc) {
1454 if (io_alloc_async_data(req))
1455 return -EAGAIN;
1456 }
1457 return def->prep_async(req);
1458 }
1459
1460 static u32 io_get_sequence(struct io_kiocb *req)
1461 {
1462 u32 seq = req->ctx->cached_sq_head;
1463 struct io_kiocb *cur;
1464
1465
1466 io_for_each_link(cur, req)
1467 seq--;
1468 return seq;
1469 }
1470
1471 static __cold void io_drain_req(struct io_kiocb *req)
1472 {
1473 struct io_ring_ctx *ctx = req->ctx;
1474 struct io_defer_entry *de;
1475 int ret;
1476 u32 seq = io_get_sequence(req);
1477
1478
1479 spin_lock(&ctx->completion_lock);
1480 if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
1481 spin_unlock(&ctx->completion_lock);
1482 queue:
1483 ctx->drain_active = false;
1484 io_req_task_queue(req);
1485 return;
1486 }
1487 spin_unlock(&ctx->completion_lock);
1488
1489 ret = io_req_prep_async(req);
1490 if (ret) {
1491 fail:
1492 io_req_complete_failed(req, ret);
1493 return;
1494 }
1495 io_prep_async_link(req);
1496 de = kmalloc(sizeof(*de), GFP_KERNEL);
1497 if (!de) {
1498 ret = -ENOMEM;
1499 goto fail;
1500 }
1501
1502 spin_lock(&ctx->completion_lock);
1503 if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
1504 spin_unlock(&ctx->completion_lock);
1505 kfree(de);
1506 goto queue;
1507 }
1508
1509 trace_io_uring_defer(req);
1510 de->req = req;
1511 de->seq = seq;
1512 list_add_tail(&de->list, &ctx->defer_list);
1513 spin_unlock(&ctx->completion_lock);
1514 }
1515
1516 static void io_clean_op(struct io_kiocb *req)
1517 {
1518 if (req->flags & REQ_F_BUFFER_SELECTED) {
1519 spin_lock(&req->ctx->completion_lock);
1520 io_put_kbuf_comp(req);
1521 spin_unlock(&req->ctx->completion_lock);
1522 }
1523
1524 if (req->flags & REQ_F_NEED_CLEANUP) {
1525 const struct io_op_def *def = &io_op_defs[req->opcode];
1526
1527 if (def->cleanup)
1528 def->cleanup(req);
1529 }
1530 if ((req->flags & REQ_F_POLLED) && req->apoll) {
1531 kfree(req->apoll->double_poll);
1532 kfree(req->apoll);
1533 req->apoll = NULL;
1534 }
1535 if (req->flags & REQ_F_INFLIGHT) {
1536 struct io_uring_task *tctx = req->task->io_uring;
1537
1538 atomic_dec(&tctx->inflight_tracked);
1539 }
1540 if (req->flags & REQ_F_CREDS)
1541 put_cred(req->creds);
1542 if (req->flags & REQ_F_ASYNC_DATA) {
1543 kfree(req->async_data);
1544 req->async_data = NULL;
1545 }
1546 req->flags &= ~IO_REQ_CLEAN_FLAGS;
1547 }
1548
1549 static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
1550 {
1551 if (req->file || !io_op_defs[req->opcode].needs_file)
1552 return true;
1553
1554 if (req->flags & REQ_F_FIXED_FILE)
1555 req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
1556 else
1557 req->file = io_file_get_normal(req, req->cqe.fd);
1558
1559 return !!req->file;
1560 }
1561
1562 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
1563 {
1564 const struct io_op_def *def = &io_op_defs[req->opcode];
1565 const struct cred *creds = NULL;
1566 int ret;
1567
1568 if (unlikely(!io_assign_file(req, issue_flags)))
1569 return -EBADF;
1570
1571 if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
1572 creds = override_creds(req->creds);
1573
1574 if (!def->audit_skip)
1575 audit_uring_entry(req->opcode);
1576
1577 ret = def->issue(req, issue_flags);
1578
1579 if (!def->audit_skip)
1580 audit_uring_exit(!ret, ret);
1581
1582 if (creds)
1583 revert_creds(creds);
1584
1585 if (ret == IOU_OK) {
1586 if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1587 io_req_complete_defer(req);
1588 else
1589 io_req_complete_post(req);
1590 } else if (ret != IOU_ISSUE_SKIP_COMPLETE)
1591 return ret;
1592
1593
1594 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
1595 io_iopoll_req_issued(req, issue_flags);
1596
1597 return 0;
1598 }
1599
1600 int io_poll_issue(struct io_kiocb *req, bool *locked)
1601 {
1602 io_tw_lock(req->ctx, locked);
1603 if (unlikely(req->task->flags & PF_EXITING))
1604 return -EFAULT;
1605 return io_issue_sqe(req, IO_URING_F_NONBLOCK);
1606 }
1607
1608 struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
1609 {
1610 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1611
1612 req = io_put_req_find_next(req);
1613 return req ? &req->work : NULL;
1614 }
1615
1616 void io_wq_submit_work(struct io_wq_work *work)
1617 {
1618 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1619 const struct io_op_def *def = &io_op_defs[req->opcode];
1620 unsigned int issue_flags = IO_URING_F_UNLOCKED;
1621 bool needs_poll = false;
1622 int ret = 0, err = -ECANCELED;
1623
1624
1625 if (!(req->flags & REQ_F_REFCOUNT))
1626 __io_req_set_refcount(req, 2);
1627 else
1628 req_ref_get(req);
1629
1630 io_arm_ltimeout(req);
1631
1632
1633 if (work->flags & IO_WQ_WORK_CANCEL) {
1634 fail:
1635 io_req_task_queue_fail(req, err);
1636 return;
1637 }
1638 if (!io_assign_file(req, issue_flags)) {
1639 err = -EBADF;
1640 work->flags |= IO_WQ_WORK_CANCEL;
1641 goto fail;
1642 }
1643
1644 if (req->flags & REQ_F_FORCE_ASYNC) {
1645 bool opcode_poll = def->pollin || def->pollout;
1646
1647 if (opcode_poll && file_can_poll(req->file)) {
1648 needs_poll = true;
1649 issue_flags |= IO_URING_F_NONBLOCK;
1650 }
1651 }
1652
1653 do {
1654 ret = io_issue_sqe(req, issue_flags);
1655 if (ret != -EAGAIN)
1656 break;
1657
1658
1659
1660
1661
1662 if (!needs_poll) {
1663 if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
1664 break;
1665 cond_resched();
1666 continue;
1667 }
1668
1669 if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
1670 return;
1671
1672 needs_poll = false;
1673 issue_flags &= ~IO_URING_F_NONBLOCK;
1674 } while (1);
1675
1676
1677 if (ret < 0)
1678 io_req_task_queue_fail(req, ret);
1679 }
1680
1681 inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
1682 unsigned int issue_flags)
1683 {
1684 struct io_ring_ctx *ctx = req->ctx;
1685 struct file *file = NULL;
1686 unsigned long file_ptr;
1687
1688 io_ring_submit_lock(ctx, issue_flags);
1689
1690 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
1691 goto out;
1692 fd = array_index_nospec(fd, ctx->nr_user_files);
1693 file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
1694 file = (struct file *) (file_ptr & FFS_MASK);
1695 file_ptr &= ~FFS_MASK;
1696
1697 req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
1698 io_req_set_rsrc_node(req, ctx, 0);
1699 WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap));
1700 out:
1701 io_ring_submit_unlock(ctx, issue_flags);
1702 return file;
1703 }
1704
1705 struct file *io_file_get_normal(struct io_kiocb *req, int fd)
1706 {
1707 struct file *file = fget(fd);
1708
1709 trace_io_uring_file_get(req, fd);
1710
1711
1712 if (file && io_is_uring_fops(file))
1713 io_req_track_inflight(req);
1714 return file;
1715 }
1716
1717 static void io_queue_async(struct io_kiocb *req, int ret)
1718 __must_hold(&req->ctx->uring_lock)
1719 {
1720 struct io_kiocb *linked_timeout;
1721
1722 if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
1723 io_req_complete_failed(req, ret);
1724 return;
1725 }
1726
1727 linked_timeout = io_prep_linked_timeout(req);
1728
1729 switch (io_arm_poll_handler(req, 0)) {
1730 case IO_APOLL_READY:
1731 io_kbuf_recycle(req, 0);
1732 io_req_task_queue(req);
1733 break;
1734 case IO_APOLL_ABORTED:
1735
1736
1737
1738
1739 io_kbuf_recycle(req, 0);
1740 io_queue_iowq(req, NULL);
1741 break;
1742 case IO_APOLL_OK:
1743 break;
1744 }
1745
1746 if (linked_timeout)
1747 io_queue_linked_timeout(linked_timeout);
1748 }
1749
1750 static inline void io_queue_sqe(struct io_kiocb *req)
1751 __must_hold(&req->ctx->uring_lock)
1752 {
1753 int ret;
1754
1755 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
1756
1757
1758
1759
1760
1761 if (likely(!ret))
1762 io_arm_ltimeout(req);
1763 else
1764 io_queue_async(req, ret);
1765 }
1766
1767 static void io_queue_sqe_fallback(struct io_kiocb *req)
1768 __must_hold(&req->ctx->uring_lock)
1769 {
1770 if (unlikely(req->flags & REQ_F_FAIL)) {
1771
1772
1773
1774
1775 req->flags &= ~REQ_F_HARDLINK;
1776 req->flags |= REQ_F_LINK;
1777 io_req_complete_failed(req, req->cqe.res);
1778 } else if (unlikely(req->ctx->drain_active)) {
1779 io_drain_req(req);
1780 } else {
1781 int ret = io_req_prep_async(req);
1782
1783 if (unlikely(ret))
1784 io_req_complete_failed(req, ret);
1785 else
1786 io_queue_iowq(req, NULL);
1787 }
1788 }
1789
1790
1791
1792
1793
1794
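/*
 * Check SQE restrictions (opcode and flags).
 *
 * Returns 'true' if the SQE is allowed, 'false' otherwise.
 */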
1795 static inline bool io_check_restriction(struct io_ring_ctx *ctx,
1796 struct io_kiocb *req,
1797 unsigned int sqe_flags)
1798 {
1799 if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
1800 return false;
1801
1802 if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
1803 ctx->restrictions.sqe_flags_required)
1804 return false;
1805
1806 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
1807 ctx->restrictions.sqe_flags_required))
1808 return false;
1809
1810 return true;
1811 }
1812
1813 static void io_init_req_drain(struct io_kiocb *req)
1814 {
1815 struct io_ring_ctx *ctx = req->ctx;
1816 struct io_kiocb *head = ctx->submit_state.link.head;
1817
1818 ctx->drain_active = true;
1819 if (head) {
1820
1821
1822
1823
1824
1825
1826
1827 head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
1828 ctx->drain_next = true;
1829 }
1830 }
1831
1832 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
1833 const struct io_uring_sqe *sqe)
1834 __must_hold(&ctx->uring_lock)
1835 {
1836 const struct io_op_def *def;
1837 unsigned int sqe_flags;
1838 int personality;
1839 u8 opcode;
1840
1841
1842 req->opcode = opcode = READ_ONCE(sqe->opcode);
1843
1844 req->flags = sqe_flags = READ_ONCE(sqe->flags);
1845 req->cqe.user_data = READ_ONCE(sqe->user_data);
1846 req->file = NULL;
1847 req->rsrc_node = NULL;
1848 req->task = current;
1849
1850 if (unlikely(opcode >= IORING_OP_LAST)) {
1851 req->opcode = 0;
1852 return -EINVAL;
1853 }
1854 def = &io_op_defs[opcode];
1855 if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
1856
1857 if (sqe_flags & ~SQE_VALID_FLAGS)
1858 return -EINVAL;
1859 if (sqe_flags & IOSQE_BUFFER_SELECT) {
1860 if (!def->buffer_select)
1861 return -EOPNOTSUPP;
1862 req->buf_index = READ_ONCE(sqe->buf_group);
1863 }
1864 if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
1865 ctx->drain_disabled = true;
1866 if (sqe_flags & IOSQE_IO_DRAIN) {
1867 if (ctx->drain_disabled)
1868 return -EOPNOTSUPP;
1869 io_init_req_drain(req);
1870 }
1871 }
1872 if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
1873 if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
1874 return -EACCES;
1875
1876 if (ctx->drain_active)
1877 req->flags |= REQ_F_FORCE_ASYNC;
1878
1879 if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
1880 ctx->drain_next = false;
1881 ctx->drain_active = true;
1882 req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
1883 }
1884 }
1885
1886 if (!def->ioprio && sqe->ioprio)
1887 return -EINVAL;
1888 if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
1889 return -EINVAL;
1890
1891 if (def->needs_file) {
1892 struct io_submit_state *state = &ctx->submit_state;
1893
1894 req->cqe.fd = READ_ONCE(sqe->fd);
1895
1896
1897
1898
1899
1900 if (state->need_plug && def->plug) {
1901 state->plug_started = true;
1902 state->need_plug = false;
1903 blk_start_plug_nr_ios(&state->plug, state->submit_nr);
1904 }
1905 }
1906
1907 personality = READ_ONCE(sqe->personality);
1908 if (personality) {
1909 int ret;
1910
1911 req->creds = xa_load(&ctx->personalities, personality);
1912 if (!req->creds)
1913 return -EINVAL;
1914 get_cred(req->creds);
1915 ret = security_uring_override_creds(req->creds);
1916 if (ret) {
1917 put_cred(req->creds);
1918 return ret;
1919 }
1920 req->flags |= REQ_F_CREDS;
1921 }
1922
1923 return def->prep(req, sqe);
1924 }
1925
1926 static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
1927 struct io_kiocb *req, int ret)
1928 {
1929 struct io_ring_ctx *ctx = req->ctx;
1930 struct io_submit_link *link = &ctx->submit_state.link;
1931 struct io_kiocb *head = link->head;
1932
1933 trace_io_uring_req_failed(sqe, req, ret);
1934
1935
1936
1937
1938
1939
1940
1941 req_fail_link_node(req, ret);
1942 if (head && !(head->flags & REQ_F_FAIL))
1943 req_fail_link_node(head, -ECANCELED);
1944
1945 if (!(req->flags & IO_REQ_LINK_FLAGS)) {
1946 if (head) {
1947 link->last->link = req;
1948 link->head = NULL;
1949 req = head;
1950 }
1951 io_queue_sqe_fallback(req);
1952 return ret;
1953 }
1954
1955 if (head)
1956 link->last->link = req;
1957 else
1958 link->head = req;
1959 link->last = req;
1960 return 0;
1961 }
1962
1963 static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
1964 const struct io_uring_sqe *sqe)
1965 __must_hold(&ctx->uring_lock)
1966 {
1967 struct io_submit_link *link = &ctx->submit_state.link;
1968 int ret;
1969
1970 ret = io_init_req(ctx, req, sqe);
1971 if (unlikely(ret))
1972 return io_submit_fail_init(sqe, req, ret);
1973
1974
1975 trace_io_uring_submit_sqe(req, true);
1976
1977
1978
1979
1980
1981
1982
1983
1984 if (unlikely(link->head)) {
1985 ret = io_req_prep_async(req);
1986 if (unlikely(ret))
1987 return io_submit_fail_init(sqe, req, ret);
1988
1989 trace_io_uring_link(req, link->head);
1990 link->last->link = req;
1991 link->last = req;
1992
1993 if (req->flags & IO_REQ_LINK_FLAGS)
1994 return 0;
1995
1996 req = link->head;
1997 link->head = NULL;
1998 if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
1999 goto fallback;
2000
2001 } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
2002 REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
2003 if (req->flags & IO_REQ_LINK_FLAGS) {
2004 link->head = req;
2005 link->last = req;
2006 } else {
2007 fallback:
2008 io_queue_sqe_fallback(req);
2009 }
2010 return 0;
2011 }
2012
2013 io_queue_sqe(req);
2014 return 0;
2015 }
2016
2017
2018
2019
2020 static void io_submit_state_end(struct io_ring_ctx *ctx)
2021 {
2022 struct io_submit_state *state = &ctx->submit_state;
2023
2024 if (unlikely(state->link.head))
2025 io_queue_sqe_fallback(state->link.head);
2026
2027 io_submit_flush_completions(ctx);
2028 if (state->plug_started)
2029 blk_finish_plug(&state->plug);
2030 }
2031
2032
2033
2034
2035 static void io_submit_state_start(struct io_submit_state *state,
2036 unsigned int max_ios)
2037 {
2038 state->plug_started = false;
2039 state->need_plug = max_ios > 2;
2040 state->submit_nr = max_ios;
2041
2042 state->link.head = NULL;
2043 }
2044
2045 static void io_commit_sqring(struct io_ring_ctx *ctx)
2046 {
2047 struct io_rings *rings = ctx->rings;
2048
2049
2050
2051
2052
2053
2054 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2055 }
2056
2057
2058
2059
2060
2061
2062
2063
2064
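/*
 * Fetch an sqe, if one is available. Note that this returns a pointer to
 * memory that is mapped by userspace, so reads must be stable: validate
 * fields through READ_ONCE() and don't re-read them after checking.
 */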
2065 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
2066 {
2067 unsigned head, mask = ctx->sq_entries - 1;
2068 unsigned sq_idx = ctx->cached_sq_head++ & mask;
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078 head = READ_ONCE(ctx->sq_array[sq_idx]);
2079 if (likely(head < ctx->sq_entries)) {
2080
2081 if (ctx->flags & IORING_SETUP_SQE128)
2082 head <<= 1;
2083 return &ctx->sq_sqes[head];
2084 }
2085
2086
2087 ctx->cq_extra--;
2088 WRITE_ONCE(ctx->rings->sq_dropped,
2089 READ_ONCE(ctx->rings->sq_dropped) + 1);
2090 return NULL;
2091 }
2092
2093 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
2094 __must_hold(&ctx->uring_lock)
2095 {
2096 unsigned int entries = io_sqring_entries(ctx);
2097 unsigned int left;
2098 int ret;
2099
2100 if (unlikely(!entries))
2101 return 0;
2102
2103 ret = left = min3(nr, ctx->sq_entries, entries);
2104 io_get_task_refs(left);
2105 io_submit_state_start(&ctx->submit_state, left);
2106
2107 do {
2108 const struct io_uring_sqe *sqe;
2109 struct io_kiocb *req;
2110
2111 if (unlikely(!io_alloc_req_refill(ctx)))
2112 break;
2113 req = io_alloc_req(ctx);
2114 sqe = io_get_sqe(ctx);
2115 if (unlikely(!sqe)) {
2116 io_req_add_to_cache(req, ctx);
2117 break;
2118 }
2119
2120
2121
2122
2123
2124 if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
2125 !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
2126 left--;
2127 break;
2128 }
2129 } while (--left);
2130
2131 if (unlikely(left)) {
2132 ret -= left;
2133
2134 if (!ret && io_req_cache_empty(ctx))
2135 ret = -EAGAIN;
2136 current->io_uring->cached_refs += left;
2137 }
2138
2139 io_submit_state_end(ctx);
2140
2141 io_commit_sqring(ctx);
2142 return ret;
2143 }
2144
2145 struct io_wait_queue {
2146 struct wait_queue_entry wq;
2147 struct io_ring_ctx *ctx;
2148 unsigned cq_tail;
2149 unsigned nr_timeouts;
2150 };
2151
2152 static inline bool io_should_wake(struct io_wait_queue *iowq)
2153 {
2154 struct io_ring_ctx *ctx = iowq->ctx;
2155 int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
2156
2157
2158
2159
2160
2161
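	/*
	 * Wake up if we have enough events, or if a timeout occurred since
	 * we started waiting. For timeouts we always want to return to
	 * userspace, regardless of the event count.
	 */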
2162 return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
2163 }
2164
2165 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
2166 int wake_flags, void *key)
2167 {
2168 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
2169 wq);
2170
2171
2172
2173
2174
2175 if (io_should_wake(iowq) ||
2176 test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq))
2177 return autoremove_wake_function(curr, mode, wake_flags, key);
2178 return -1;
2179 }
2180
2181 int io_run_task_work_sig(void)
2182 {
2183 if (io_run_task_work())
2184 return 1;
2185 if (task_sigpending(current))
2186 return -EINTR;
2187 return 0;
2188 }
2189
2190
2191 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
2192 struct io_wait_queue *iowq,
2193 ktime_t timeout)
2194 {
2195 int ret;
2196 unsigned long check_cq;
2197
2198
2199 ret = io_run_task_work_sig();
2200 if (ret || io_should_wake(iowq))
2201 return ret;
2202
2203 check_cq = READ_ONCE(ctx->check_cq);
2204 if (unlikely(check_cq)) {
2205
2206 if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
2207 return 1;
2208 if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
2209 return -EBADR;
2210 }
2211 if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
2212 return -ETIME;
2213 return 1;
2214 }
2215
2216
2217
2218
2219
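/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared CQ ring.
 */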
2220 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
2221 const sigset_t __user *sig, size_t sigsz,
2222 struct __kernel_timespec __user *uts)
2223 {
2224 struct io_wait_queue iowq;
2225 struct io_rings *rings = ctx->rings;
2226 ktime_t timeout = KTIME_MAX;
2227 int ret;
2228
2229 do {
2230 io_cqring_overflow_flush(ctx);
2231 if (io_cqring_events(ctx) >= min_events)
2232 return 0;
2233 if (!io_run_task_work())
2234 break;
2235 } while (1);
2236
2237 if (sig) {
2238 #ifdef CONFIG_COMPAT
2239 if (in_compat_syscall())
2240 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
2241 sigsz);
2242 else
2243 #endif
2244 ret = set_user_sigmask(sig, sigsz);
2245
2246 if (ret)
2247 return ret;
2248 }
2249
2250 if (uts) {
2251 struct timespec64 ts;
2252
2253 if (get_timespec64(&ts, uts))
2254 return -EFAULT;
2255 timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
2256 }
2257
2258 init_waitqueue_func_entry(&iowq.wq, io_wake_function);
2259 iowq.wq.private = current;
2260 INIT_LIST_HEAD(&iowq.wq.entry);
2261 iowq.ctx = ctx;
2262 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
2263 iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
2264
2265 trace_io_uring_cqring_wait(ctx, min_events);
2266 do {
2267
2268 if (!io_cqring_overflow_flush(ctx)) {
2269 ret = -EBUSY;
2270 break;
2271 }
2272 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
2273 TASK_INTERRUPTIBLE);
2274 ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
2275 cond_resched();
2276 } while (ret > 0);
2277
2278 finish_wait(&ctx->cq_wait, &iowq.wq);
2279 restore_saved_sigmask_unless(ret == -EINTR);
2280
2281 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2282 }
2283
2284 static void io_mem_free(void *ptr)
2285 {
2286 struct page *page;
2287
2288 if (!ptr)
2289 return;
2290
2291 page = virt_to_head_page(ptr);
2292 if (put_page_testzero(page))
2293 free_compound_page(page);
2294 }
2295
2296 static void *io_mem_alloc(size_t size)
2297 {
2298 gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
2299
2300 return (void *) __get_free_pages(gfp, get_order(size));
2301 }
2302
2303 static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
2304 unsigned int cq_entries, size_t *sq_offset)
2305 {
2306 struct io_rings *rings;
2307 size_t off, sq_array_size;
2308
2309 off = struct_size(rings, cqes, cq_entries);
2310 if (off == SIZE_MAX)
2311 return SIZE_MAX;
2312 if (ctx->flags & IORING_SETUP_CQE32) {
2313 if (check_shl_overflow(off, 1, &off))
2314 return SIZE_MAX;
2315 }
2316
2317 #ifdef CONFIG_SMP
2318 off = ALIGN(off, SMP_CACHE_BYTES);
2319 if (off == 0)
2320 return SIZE_MAX;
2321 #endif
2322
2323 if (sq_offset)
2324 *sq_offset = off;
2325
2326 sq_array_size = array_size(sizeof(u32), sq_entries);
2327 if (sq_array_size == SIZE_MAX)
2328 return SIZE_MAX;
2329
2330 if (check_add_overflow(off, sq_array_size, &off))
2331 return SIZE_MAX;
2332
2333 return off;
2334 }
2335
2336 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
2337 unsigned int eventfd_async)
2338 {
2339 struct io_ev_fd *ev_fd;
2340 __s32 __user *fds = arg;
2341 int fd;
2342
2343 ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
2344 lockdep_is_held(&ctx->uring_lock));
2345 if (ev_fd)
2346 return -EBUSY;
2347
2348 if (copy_from_user(&fd, fds, sizeof(*fds)))
2349 return -EFAULT;
2350
2351 ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
2352 if (!ev_fd)
2353 return -ENOMEM;
2354
2355 ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
2356 if (IS_ERR(ev_fd->cq_ev_fd)) {
2357 int ret = PTR_ERR(ev_fd->cq_ev_fd);
2358 kfree(ev_fd);
2359 return ret;
2360 }
2361
2362 spin_lock(&ctx->completion_lock);
2363 ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
2364 spin_unlock(&ctx->completion_lock);
2365
2366 ev_fd->eventfd_async = eventfd_async;
2367 ctx->has_evfd = true;
2368 rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
2369 return 0;
2370 }
2371
2372 static void io_eventfd_put(struct rcu_head *rcu)
2373 {
2374 struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
2375
2376 eventfd_ctx_put(ev_fd->cq_ev_fd);
2377 kfree(ev_fd);
2378 }
2379
2380 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
2381 {
2382 struct io_ev_fd *ev_fd;
2383
2384 ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
2385 lockdep_is_held(&ctx->uring_lock));
2386 if (ev_fd) {
2387 ctx->has_evfd = false;
2388 rcu_assign_pointer(ctx->io_ev_fd, NULL);
2389 call_rcu(&ev_fd->rcu, io_eventfd_put);
2390 return 0;
2391 }
2392
2393 return -ENXIO;
2394 }
2395
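/*
 * Free all cached io_kiocb allocations held in the submit state and drop the
 * ctx references they were pinning.
 */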
2396 static void io_req_caches_free(struct io_ring_ctx *ctx)
2397 {
2398 struct io_submit_state *state = &ctx->submit_state;
2399 int nr = 0;
2400
2401 mutex_lock(&ctx->uring_lock);
2402 io_flush_cached_locked_reqs(ctx, state);
2403
2404 while (!io_req_cache_empty(ctx)) {
2405 struct io_wq_work_node *node;
2406 struct io_kiocb *req;
2407
2408 node = wq_stack_extract(&state->free_list);
2409 req = container_of(node, struct io_kiocb, comp_list);
2410 kmem_cache_free(req_cachep, req);
2411 nr++;
2412 }
2413 if (nr)
2414 percpu_ref_put_many(&ctx->refs, nr);
2415 mutex_unlock(&ctx->uring_lock);
2416 }
2417
2418 static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
2419 {
2420 io_sq_thread_finish(ctx);
2421
2422 if (ctx->mm_account) {
2423 mmdrop(ctx->mm_account);
2424 ctx->mm_account = NULL;
2425 }
2426
2427 io_rsrc_refs_drop(ctx);
2428 /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
2429 io_wait_rsrc_data(ctx->buf_data);
2430 io_wait_rsrc_data(ctx->file_data);
2431
2432 mutex_lock(&ctx->uring_lock);
2433 if (ctx->buf_data)
2434 __io_sqe_buffers_unregister(ctx);
2435 if (ctx->file_data)
2436 __io_sqe_files_unregister(ctx);
2437 if (ctx->rings)
2438 __io_cqring_overflow_flush(ctx, true);
2439 io_eventfd_unregister(ctx);
2440 io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
2441 io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
2442 mutex_unlock(&ctx->uring_lock);
2443 io_destroy_buffers(ctx);
2444 if (ctx->sq_creds)
2445 put_cred(ctx->sq_creds);
2446 if (ctx->submitter_task)
2447 put_task_struct(ctx->submitter_task);
2448
2449 /* there are no registered resources left, nobody uses it */
2450 if (ctx->rsrc_node)
2451 io_rsrc_node_destroy(ctx->rsrc_node);
2452 if (ctx->rsrc_backup_node)
2453 io_rsrc_node_destroy(ctx->rsrc_backup_node);
2454 flush_delayed_work(&ctx->rsrc_put_work);
2455 flush_delayed_work(&ctx->fallback_work);
2456
2457 WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
2458 WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
2459
2460 #if defined(CONFIG_UNIX)
2461 if (ctx->ring_sock) {
2462 ctx->ring_sock->file = NULL;
2463 sock_release(ctx->ring_sock);
2464 }
2465 #endif
2466 WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
2467 WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots);
2468
2469 io_mem_free(ctx->rings);
2470 io_mem_free(ctx->sq_sqes);
2471
2472 percpu_ref_exit(&ctx->refs);
2473 free_uid(ctx->user);
2474 io_req_caches_free(ctx);
2475 if (ctx->hash_map)
2476 io_wq_put_hash(ctx->hash_map);
2477 kfree(ctx->cancel_table.hbs);
2478 kfree(ctx->cancel_table_locked.hbs);
2479 kfree(ctx->dummy_ubuf);
2480 kfree(ctx->io_bl);
2481 xa_destroy(&ctx->io_bl_xa);
2482 kfree(ctx);
2483 }
2484
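/* poll support for the ring fd itself, used e.g. when it's added to epoll */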
2485 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
2486 {
2487 struct io_ring_ctx *ctx = file->private_data;
2488 __poll_t mask = 0;
2489
2490 poll_wait(file, &ctx->cq_wait, wait);
2491
2492 /*
2493  * synchronizes with barrier from wq_has_sleeper() call in io_commit_cqring()
2494  */
2495 smp_rmb();
2496 if (!io_sqring_full(ctx))
2497 mask |= EPOLLOUT | EPOLLWRNORM;
2498
2499 /*
2500  * Don't flush the cqring overflow list here, just do a simple check.
2501  * Otherwise there could be an ABBA deadlock:
2502  *      CPU0                    CPU1
2503  *      ----                    ----
2504  * lock(&ctx->uring_lock);
2505  *                              lock(&ep->mtx);
2506  *                              lock(&ctx->uring_lock);
2507  * lock(&ep->mtx);
2508  *
2509  * Users may get EPOLLIN meanwhile seeing nothing in the cqring, which
2510  * pushes them to do the flush.
2511  */
2512 if (io_cqring_events(ctx) ||
2513 test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
2514 mask |= EPOLLIN | EPOLLRDNORM;
2515
2516 return mask;
2517 }
2518
2519 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
2520 {
2521 const struct cred *creds;
2522
2523 creds = xa_erase(&ctx->personalities, id);
2524 if (creds) {
2525 put_cred(creds);
2526 return 0;
2527 }
2528
2529 return -EINVAL;
2530 }
2531
2532 struct io_tctx_exit {
2533 struct callback_head task_work;
2534 struct completion completion;
2535 struct io_ring_ctx *ctx;
2536 };
2537
2538 static __cold void io_tctx_exit_cb(struct callback_head *cb)
2539 {
2540 struct io_uring_task *tctx = current->io_uring;
2541 struct io_tctx_exit *work;
2542
2543 work = container_of(cb, struct io_tctx_exit, task_work);
2544 /*
2545  * When @in_idle, we're in cancellation and it's racy to remove the
2546  * node. It'll be removed by the end of cancellation, just ignore it.
2547  */
2548 if (!atomic_read(&tctx->in_idle))
2549 io_uring_del_tctx_node((unsigned long)work->ctx);
2550 complete(&work->completion);
2551 }
2552
2553 static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
2554 {
2555 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
2556
2557 return req->ctx == data;
2558 }
2559
2560 static __cold void io_ring_exit_work(struct work_struct *work)
2561 {
2562 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
2563 unsigned long timeout = jiffies + HZ * 60 * 5;
2564 unsigned long interval = HZ / 20;
2565 struct io_tctx_exit exit;
2566 struct io_tctx_node *node;
2567 int ret;
2568
2569 /*
2570  * If we're doing polled IO and end up having requests being
2571  * submitted async (out-of-line), then completions can come in while
2572  * we're waiting for refs to drop. We need to reap these manually,
2573  * as nobody else will be looking for them.
2574  */
2575 do {
2576 while (io_uring_try_cancel_requests(ctx, NULL, true))
2577 cond_resched();
2578
2579 if (ctx->sq_data) {
2580 struct io_sq_data *sqd = ctx->sq_data;
2581 struct task_struct *tsk;
2582
2583 io_sq_thread_park(sqd);
2584 tsk = sqd->thread;
2585 if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
2586 io_wq_cancel_cb(tsk->io_uring->io_wq,
2587 io_cancel_ctx_cb, ctx, true);
2588 io_sq_thread_unpark(sqd);
2589 }
2590
2591 io_req_caches_free(ctx);
2592
2593 if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
2594 /* there is little hope left, don't run it too often */
2595 interval = HZ * 60;
2596 }
2597 } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
2598
2599 init_completion(&exit.completion);
2600 init_task_work(&exit.task_work, io_tctx_exit_cb);
2601 exit.ctx = ctx;
2602 /*
2603  * Some may use context even when all refs and requests have been put,
2604  * and they are free to do so while still holding uring_lock or
2605  * completion_lock, see io_req_task_submit(). Apart from other work,
2606  * this lock/unlock section also waits for them to finish.
2607  */
2608 mutex_lock(&ctx->uring_lock);
2609 while (!list_empty(&ctx->tctx_list)) {
2610 WARN_ON_ONCE(time_after(jiffies, timeout));
2611
2612 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
2613 ctx_node);
2614 /* don't spin on a single task if cancellation failed */
2615 list_rotate_left(&ctx->tctx_list);
2616 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
2617 if (WARN_ON_ONCE(ret))
2618 continue;
2619
2620 mutex_unlock(&ctx->uring_lock);
2621 wait_for_completion(&exit.completion);
2622 mutex_lock(&ctx->uring_lock);
2623 }
2624 mutex_unlock(&ctx->uring_lock);
2625 spin_lock(&ctx->completion_lock);
2626 spin_unlock(&ctx->completion_lock);
2627
2628 io_ring_ctx_free(ctx);
2629 }
2630
2631 static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
2632 {
2633 unsigned long index;
2634 struct creds *creds;
2635
2636 mutex_lock(&ctx->uring_lock);
2637 percpu_ref_kill(&ctx->refs);
2638 if (ctx->rings)
2639 __io_cqring_overflow_flush(ctx, true);
2640 xa_for_each(&ctx->personalities, index, creds)
2641 io_unregister_personality(ctx, index);
2642 if (ctx->rings)
2643 io_poll_remove_all(ctx, NULL, true);
2644 mutex_unlock(&ctx->uring_lock);
2645
2646 /* failed during ring init, it couldn't have issued any requests */
2647 if (ctx->rings) {
2648 io_kill_timeouts(ctx, NULL, true);
2649 /* if we failed setting up the ctx, we might not have any rings */
2650 io_iopoll_try_reap_events(ctx);
2651 /* drop cached put refs after potentially doing completions */
2652 if (current->io_uring)
2653 io_uring_drop_tctx_refs(current);
2654 }
2655
2656 INIT_WORK(&ctx->exit_work, io_ring_exit_work);
2657 /*
2658  * Use system_unbound_wq to avoid spawning tons of event kworkers
2659  * if we're exiting a ton of rings at the same time. It just adds
2660  * noise and overhead, there's no discernible change in runtime
2661  * over using system_wq.
2662  */
2663 queue_work(system_unbound_wq, &ctx->exit_work);
2664 }
2665
2666 static int io_uring_release(struct inode *inode, struct file *file)
2667 {
2668 struct io_ring_ctx *ctx = file->private_data;
2669
2670 file->private_data = NULL;
2671 io_ring_ctx_wait_and_kill(ctx);
2672 return 0;
2673 }
2674
2675 struct io_task_cancel {
2676 struct task_struct *task;
2677 bool all;
2678 };
2679
2680 static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
2681 {
2682 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
2683 struct io_task_cancel *cancel = data;
2684
2685 return io_match_task_safe(req, cancel->task, cancel->all);
2686 }
2687
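/*
 * Cancel deferred (IOSQE_IO_DRAIN) requests that match @task by completing
 * them with -ECANCELED. Returns true if anything was cancelled.
 */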
2688 static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
2689 struct task_struct *task,
2690 bool cancel_all)
2691 {
2692 struct io_defer_entry *de;
2693 LIST_HEAD(list);
2694
2695 spin_lock(&ctx->completion_lock);
2696 list_for_each_entry_reverse(de, &ctx->defer_list, list) {
2697 if (io_match_task_safe(de->req, task, cancel_all)) {
2698 list_cut_position(&list, &ctx->defer_list, &de->list);
2699 break;
2700 }
2701 }
2702 spin_unlock(&ctx->completion_lock);
2703 if (list_empty(&list))
2704 return false;
2705
2706 while (!list_empty(&list)) {
2707 de = list_first_entry(&list, struct io_defer_entry, list);
2708 list_del_init(&de->list);
2709 io_req_complete_failed(de->req, -ECANCELED);
2710 kfree(de);
2711 }
2712 return true;
2713 }
2714
2715 static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
2716 {
2717 struct io_tctx_node *node;
2718 enum io_wq_cancel cret;
2719 bool ret = false;
2720
2721 mutex_lock(&ctx->uring_lock);
2722 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
2723 struct io_uring_task *tctx = node->task->io_uring;
2724
2725 /*
2726  * io_wq will stay alive while we hold uring_lock, because it's
2727  * killed after ctx nodes, which requires to take the lock.
2728  */
2729 if (!tctx || !tctx->io_wq)
2730 continue;
2731 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
2732 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
2733 }
2734 mutex_unlock(&ctx->uring_lock);
2735
2736 return ret;
2737 }
2738
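/*
 * One cancelation pass over a ring: io-wq work, deferred requests, poll
 * requests and timeouts matching @task (or everything if @task is NULL).
 * Returns true if it found anything to cancel, so callers loop until a pass
 * comes back idle.
 */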
2739 static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
2740 struct task_struct *task,
2741 bool cancel_all)
2742 {
2743 struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
2744 struct io_uring_task *tctx = task ? task->io_uring : NULL;
2745 enum io_wq_cancel cret;
2746 bool ret = false;
2747
2748 /* failed during ring init, it couldn't have issued any requests */
2749 if (!ctx->rings)
2750 return false;
2751
2752 if (!task) {
2753 ret |= io_uring_try_cancel_iowq(ctx);
2754 } else if (tctx && tctx->io_wq) {
2755 /*
2756  * Cancels requests of all rings, not only @ctx, but
2757  * it's fine as the task is in exit/exec.
2758  */
2759 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
2760 &cancel, true);
2761 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
2762 }
2763
2764 /* SQPOLL thread does its own polling */
2765 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
2766 (ctx->sq_data && ctx->sq_data->thread == current)) {
2767 while (!wq_list_empty(&ctx->iopoll_list)) {
2768 io_iopoll_try_reap_events(ctx);
2769 ret = true;
2770 }
2771 }
2772
2773 ret |= io_cancel_defer_files(ctx, task, cancel_all);
2774 mutex_lock(&ctx->uring_lock);
2775 ret |= io_poll_remove_all(ctx, task, cancel_all);
2776 mutex_unlock(&ctx->uring_lock);
2777 ret |= io_kill_timeouts(ctx, task, cancel_all);
2778 if (task)
2779 ret |= io_run_task_work();
2780 return ret;
2781 }
2782
2783 static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
2784 {
2785 if (tracked)
2786 return atomic_read(&tctx->inflight_tracked);
2787 return percpu_counter_sum(&tctx->inflight);
2788 }
2789
2790 /*
2791  * Find any io_uring ctx that this task has registered or done IO on, and cancel
2792  * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
2793  */
2794 __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
2795 {
2796 struct io_uring_task *tctx = current->io_uring;
2797 struct io_ring_ctx *ctx;
2798 s64 inflight;
2799 DEFINE_WAIT(wait);
2800
2801 WARN_ON_ONCE(sqd && sqd->thread != current);
2802
2803 if (!current->io_uring)
2804 return;
2805 if (tctx->io_wq)
2806 io_wq_exit_start(tctx->io_wq);
2807
2808 atomic_inc(&tctx->in_idle);
2809 do {
2810 bool loop = false;
2811
2812 io_uring_drop_tctx_refs(current);
2813 /* read completions before cancelations */
2814 inflight = tctx_inflight(tctx, !cancel_all);
2815 if (!inflight)
2816 break;
2817
2818 if (!sqd) {
2819 struct io_tctx_node *node;
2820 unsigned long index;
2821
2822 xa_for_each(&tctx->xa, index, node) {
2823 /* sqpoll task will cancel all its requests */
2824 if (node->ctx->sq_data)
2825 continue;
2826 loop |= io_uring_try_cancel_requests(node->ctx,
2827 current, cancel_all);
2828 }
2829 } else {
2830 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
2831 loop |= io_uring_try_cancel_requests(ctx,
2832 current,
2833 cancel_all);
2834 }
2835
2836 if (loop) {
2837 cond_resched();
2838 continue;
2839 }
2840
2841 prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
2842 io_run_task_work();
2843 io_uring_drop_tctx_refs(current);
2844
2845 /*
2846  * If we've seen completions, retry without waiting. This
2847  * avoids a race where a completion comes in before we did
2848  * prepare_to_wait().
2849  */
2850 if (inflight == tctx_inflight(tctx, !cancel_all))
2851 schedule();
2852 finish_wait(&tctx->wait, &wait);
2853 } while (1);
2854
2855 io_uring_clean_tctx(tctx);
2856 if (cancel_all) {
2857 /*
2858  * We shouldn't run task_works after cancel, so just leave
2859  * ->in_idle set for normal exit.
2860  */
2861 atomic_dec(&tctx->in_idle);
2862 /* for exec all current's requests should be gone, kill tctx */
2863 __io_uring_free(current);
2864 }
2865 }
2866
2867 void __io_uring_cancel(bool cancel_all)
2868 {
2869 io_uring_cancel_generic(cancel_all, NULL);
2870 }
2871
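/*
 * Translate an mmap offset (IORING_OFF_*) to the kernel address backing it
 * and make sure the requested length fits within that allocation.
 */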
2872 static void *io_uring_validate_mmap_request(struct file *file,
2873 loff_t pgoff, size_t sz)
2874 {
2875 struct io_ring_ctx *ctx = file->private_data;
2876 loff_t offset = pgoff << PAGE_SHIFT;
2877 struct page *page;
2878 void *ptr;
2879
2880 switch (offset) {
2881 case IORING_OFF_SQ_RING:
2882 case IORING_OFF_CQ_RING:
2883 ptr = ctx->rings;
2884 break;
2885 case IORING_OFF_SQES:
2886 ptr = ctx->sq_sqes;
2887 break;
2888 default:
2889 return ERR_PTR(-EINVAL);
2890 }
2891
2892 page = virt_to_head_page(ptr);
2893 if (sz > page_size(page))
2894 return ERR_PTR(-EINVAL);
2895
2896 return ptr;
2897 }
2898
2899 #ifdef CONFIG_MMU
2900
2901 static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
2902 {
2903 size_t sz = vma->vm_end - vma->vm_start;
2904 unsigned long pfn;
2905 void *ptr;
2906
2907 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
2908 if (IS_ERR(ptr))
2909 return PTR_ERR(ptr);
2910
2911 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
2912 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2913 }
2914
2915 #else
2916
2917 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
2918 {
2919 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
2920 }
2921
2922 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
2923 {
2924 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
2925 }
2926
2927 static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
2928 unsigned long addr, unsigned long len,
2929 unsigned long pgoff, unsigned long flags)
2930 {
2931 void *ptr;
2932
2933 ptr = io_uring_validate_mmap_request(file, pgoff, len);
2934 if (IS_ERR(ptr))
2935 return PTR_ERR(ptr);
2936
2937 return (unsigned long) ptr;
2938 }
2939
2940 #endif
2941
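/*
 * For IOPOLL rings the wait arguments aren't used, so only validate the
 * layout of the extended argument if IORING_ENTER_EXT_ARG was passed.
 */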
2942 static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
2943 {
2944 if (flags & IORING_ENTER_EXT_ARG) {
2945 struct io_uring_getevents_arg arg;
2946
2947 if (argsz != sizeof(arg))
2948 return -EINVAL;
2949 if (copy_from_user(&arg, argp, sizeof(arg)))
2950 return -EFAULT;
2951 }
2952 return 0;
2953 }
2954
2955 static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
2956 struct __kernel_timespec __user **ts,
2957 const sigset_t __user **sig)
2958 {
2959 struct io_uring_getevents_arg arg;
2960
2961 /*
2962  * If EXT_ARG isn't set, then we have no timespec and the argp pointer
2963  * is just a pointer to the sigset_t.
2964  */
2965 if (!(flags & IORING_ENTER_EXT_ARG)) {
2966 *sig = (const sigset_t __user *) argp;
2967 *ts = NULL;
2968 return 0;
2969 }
2970
2971 /*
2972  * EXT_ARG is set - ensure we agree on the size of it and copy in our
2973  * timespec and sigset_t pointers if good.
2974  */
2975 if (*argsz != sizeof(arg))
2976 return -EINVAL;
2977 if (copy_from_user(&arg, argp, sizeof(arg)))
2978 return -EFAULT;
2979 if (arg.pad)
2980 return -EINVAL;
2981 *sig = u64_to_user_ptr(arg.sigmask);
2982 *argsz = arg.sigmask_sz;
2983 *ts = u64_to_user_ptr(arg.ts);
2984 return 0;
2985 }
2986
2987 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
2988 u32, min_complete, u32, flags, const void __user *, argp,
2989 size_t, argsz)
2990 {
2991 struct io_ring_ctx *ctx;
2992 struct fd f;
2993 long ret;
2994
2995 io_run_task_work();
2996
2997 if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
2998 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
2999 IORING_ENTER_REGISTERED_RING)))
3000 return -EINVAL;
3001
3002 /*
3003  * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
3004  * need only dereference our task private array to find it.
3005  */
3006 if (flags & IORING_ENTER_REGISTERED_RING) {
3007 struct io_uring_task *tctx = current->io_uring;
3008
3009 if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
3010 return -EINVAL;
3011 fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
3012 f.file = tctx->registered_rings[fd];
3013 f.flags = 0;
3014 if (unlikely(!f.file))
3015 return -EBADF;
3016 } else {
3017 f = fdget(fd);
3018 if (unlikely(!f.file))
3019 return -EBADF;
3020 ret = -EOPNOTSUPP;
3021 if (unlikely(!io_is_uring_fops(f.file)))
3022 goto out;
3023 }
3024
3025 ctx = f.file->private_data;
3026 ret = -EBADFD;
3027 if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
3028 goto out;
3029
3030 /*
3031  * For SQ polling, the thread will do all submissions and completions.
3032  * Just return the requested submit count, and wake the thread if
3033  * we were asked to.
3034  */
3035 ret = 0;
3036 if (ctx->flags & IORING_SETUP_SQPOLL) {
3037 io_cqring_overflow_flush(ctx);
3038
3039 if (unlikely(ctx->sq_data->thread == NULL)) {
3040 ret = -EOWNERDEAD;
3041 goto out;
3042 }
3043 if (flags & IORING_ENTER_SQ_WAKEUP)
3044 wake_up(&ctx->sq_data->wait);
3045 if (flags & IORING_ENTER_SQ_WAIT) {
3046 ret = io_sqpoll_wait_sq(ctx);
3047 if (ret)
3048 goto out;
3049 }
3050 ret = to_submit;
3051 } else if (to_submit) {
3052 ret = io_uring_add_tctx_node(ctx);
3053 if (unlikely(ret))
3054 goto out;
3055
3056 mutex_lock(&ctx->uring_lock);
3057 ret = io_submit_sqes(ctx, to_submit);
3058 if (ret != to_submit) {
3059 mutex_unlock(&ctx->uring_lock);
3060 goto out;
3061 }
3062 if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll)
3063 goto iopoll_locked;
3064 mutex_unlock(&ctx->uring_lock);
3065 }
3066 if (flags & IORING_ENTER_GETEVENTS) {
3067 int ret2;
3068 if (ctx->syscall_iopoll) {
3069 /*
3070  * We disallow the app entering submit/complete with
3071  * polling, but we still need to lock the ring to
3072  * prevent racing with polled issue that got punted to
3073  * a workqueue.
3074  */
3075 mutex_lock(&ctx->uring_lock);
3076 iopoll_locked:
3077 ret2 = io_validate_ext_arg(flags, argp, argsz);
3078 if (likely(!ret2)) {
3079 min_complete = min(min_complete,
3080 ctx->cq_entries);
3081 ret2 = io_iopoll_check(ctx, min_complete);
3082 }
3083 mutex_unlock(&ctx->uring_lock);
3084 } else {
3085 const sigset_t __user *sig;
3086 struct __kernel_timespec __user *ts;
3087
3088 ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
3089 if (likely(!ret2)) {
3090 min_complete = min(min_complete,
3091 ctx->cq_entries);
3092 ret2 = io_cqring_wait(ctx, min_complete, sig,
3093 argsz, ts);
3094 }
3095 }
3096
3097 if (!ret) {
3098 ret = ret2;
3099
3100 /*
3101  * EBADR indicates that one or more CQE were dropped.
3102  * Once the user has been informed we can clear the bit
3103  * as they are obviously ok with those drops.
3104  */
3105 if (unlikely(ret2 == -EBADR))
3106 clear_bit(IO_CHECK_CQ_DROPPED_BIT,
3107 &ctx->check_cq);
3108 }
3109 }
3110 out:
3111 fdput(f);
3112 return ret;
3113 }
3114
3115 static const struct file_operations io_uring_fops = {
3116 .release = io_uring_release,
3117 .mmap = io_uring_mmap,
3118 #ifndef CONFIG_MMU
3119 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
3120 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
3121 #endif
3122 .poll = io_uring_poll,
3123 #ifdef CONFIG_PROC_FS
3124 .show_fdinfo = io_uring_show_fdinfo,
3125 #endif
3126 };
3127
3128 bool io_is_uring_fops(struct file *file)
3129 {
3130 return file->f_op == &io_uring_fops;
3131 }
3132
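/*
 * Allocate and initialise the SQ/CQ rings and the SQE array for a new ring,
 * honouring IORING_SETUP_SQE128/CQE32 sizing.
 */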
3133 static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
3134 struct io_uring_params *p)
3135 {
3136 struct io_rings *rings;
3137 size_t size, sq_array_offset;
3138
3139 /* make sure these are sane, as we already accounted them */
3140 ctx->sq_entries = p->sq_entries;
3141 ctx->cq_entries = p->cq_entries;
3142
3143 size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
3144 if (size == SIZE_MAX)
3145 return -EOVERFLOW;
3146
3147 rings = io_mem_alloc(size);
3148 if (!rings)
3149 return -ENOMEM;
3150
3151 ctx->rings = rings;
3152 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
3153 rings->sq_ring_mask = p->sq_entries - 1;
3154 rings->cq_ring_mask = p->cq_entries - 1;
3155 rings->sq_ring_entries = p->sq_entries;
3156 rings->cq_ring_entries = p->cq_entries;
3157
3158 if (p->flags & IORING_SETUP_SQE128)
3159 size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
3160 else
3161 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
3162 if (size == SIZE_MAX) {
3163 io_mem_free(ctx->rings);
3164 ctx->rings = NULL;
3165 return -EOVERFLOW;
3166 }
3167
3168 ctx->sq_sqes = io_mem_alloc(size);
3169 if (!ctx->sq_sqes) {
3170 io_mem_free(ctx->rings);
3171 ctx->rings = NULL;
3172 return -ENOMEM;
3173 }
3174
3175 return 0;
3176 }
3177
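/*
 * Reserve an fd, hook the ring into the task's io_uring context, and only
 * then install the file so the fd can't be seen half set up.
 */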
3178 static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
3179 {
3180 int ret, fd;
3181
3182 fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
3183 if (fd < 0)
3184 return fd;
3185
3186 ret = __io_uring_add_tctx_node(ctx, false);
3187 if (ret) {
3188 put_unused_fd(fd);
3189 return ret;
3190 }
3191 fd_install(fd, file);
3192 return fd;
3193 }
3194
3195 /*
3196  * Allocate an anonymous fd, this is what constitutes the application
3197  * visible backing of an io_uring instance. The application mmaps this
3198  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
3199  * we have to tie this fd to a socket for file garbage collection purposes.
3200  */
3201 static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
3202 {
3203 struct file *file;
3204 #if defined(CONFIG_UNIX)
3205 int ret;
3206
3207 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
3208 &ctx->ring_sock);
3209 if (ret)
3210 return ERR_PTR(ret);
3211 #endif
3212
3213 file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
3214 O_RDWR | O_CLOEXEC, NULL);
3215 #if defined(CONFIG_UNIX)
3216 if (IS_ERR(file)) {
3217 sock_release(ctx->ring_sock);
3218 ctx->ring_sock = NULL;
3219 } else {
3220 ctx->ring_sock->file = file;
3221 }
3222 #endif
3223 return file;
3224 }
3225
3226 static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
3227 struct io_uring_params __user *params)
3228 {
3229 struct io_ring_ctx *ctx;
3230 struct file *file;
3231 int ret;
3232
3233 if (!entries)
3234 return -EINVAL;
3235 if (entries > IORING_MAX_ENTRIES) {
3236 if (!(p->flags & IORING_SETUP_CLAMP))
3237 return -EINVAL;
3238 entries = IORING_MAX_ENTRIES;
3239 }
3240
3241 /*
3242  * Use twice as many entries for the CQ ring. It's possible for the
3243  * application to drive a higher depth than the size of the SQ ring,
3244  * since the sqes are only used at submission time. This allows for
3245  * some flexibility in overcommitting a bit. If the application has
3246  * set IORING_SETUP_CQSIZE, it will have passed in the desired number
3247  * of CQ ring entries manually.
3248  */
3249 p->sq_entries = roundup_pow_of_two(entries);
3250 if (p->flags & IORING_SETUP_CQSIZE) {
3251 /*
3252  * If IORING_SETUP_CQSIZE is set, we do the same roundup
3253  * to a power-of-two, if it isn't already. We do NOT impose
3254  * any cq vs sq ring sizing.
3255  */
3256 if (!p->cq_entries)
3257 return -EINVAL;
3258 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
3259 if (!(p->flags & IORING_SETUP_CLAMP))
3260 return -EINVAL;
3261 p->cq_entries = IORING_MAX_CQ_ENTRIES;
3262 }
3263 p->cq_entries = roundup_pow_of_two(p->cq_entries);
3264 if (p->cq_entries < p->sq_entries)
3265 return -EINVAL;
3266 } else {
3267 p->cq_entries = 2 * p->sq_entries;
3268 }
3269
3270 ctx = io_ring_ctx_alloc(p);
3271 if (!ctx)
3272 return -ENOMEM;
3273
3274 /*
3275  * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
3276  * space applications don't need to do io completion events
3277  * polling again, they can rely on io_sq_thread to do polling
3278  * work, which can reduce cpu usage and uring_lock contention.
3279  */
3280 if (ctx->flags & IORING_SETUP_IOPOLL &&
3281 !(ctx->flags & IORING_SETUP_SQPOLL))
3282 ctx->syscall_iopoll = 1;
3283
3284 ctx->compat = in_compat_syscall();
3285 if (!capable(CAP_IPC_LOCK))
3286 ctx->user = get_uid(current_user());
3287
3288 /*
3289  * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
3290  * COOP_TASKRUN is set, then IPIs are never needed by the app.
3291  */
3292 ret = -EINVAL;
3293 if (ctx->flags & IORING_SETUP_SQPOLL) {
3294 /* IPI related flags don't make sense with SQPOLL */
3295 if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
3296 IORING_SETUP_TASKRUN_FLAG))
3297 goto err;
3298 ctx->notify_method = TWA_SIGNAL_NO_IPI;
3299 } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
3300 ctx->notify_method = TWA_SIGNAL_NO_IPI;
3301 } else {
3302 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
3303 goto err;
3304 ctx->notify_method = TWA_SIGNAL;
3305 }
3306
3307 /*
3308  * This is just grabbed for accounting purposes. When a process exits,
3309  * the mm is exited and dropped before the files, hence we need to hang
3310  * on to this mm purely for the purposes of being able to unaccount
3311  * memory (locked/pinned vm). It's not used for anything else.
3312  */
3313 mmgrab(current->mm);
3314 ctx->mm_account = current->mm;
3315
3316 ret = io_allocate_scq_urings(ctx, p);
3317 if (ret)
3318 goto err;
3319
3320 ret = io_sq_offload_create(ctx, p);
3321 if (ret)
3322 goto err;
3323
3324 ret = io_rsrc_node_switch_start(ctx);
3325 if (ret)
3326 goto err;
3327 io_rsrc_node_switch(ctx, NULL);
3328
3329 memset(&p->sq_off, 0, sizeof(p->sq_off));
3330 p->sq_off.head = offsetof(struct io_rings, sq.head);
3331 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
3332 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
3333 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
3334 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
3335 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
3336 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
3337
3338 memset(&p->cq_off, 0, sizeof(p->cq_off));
3339 p->cq_off.head = offsetof(struct io_rings, cq.head);
3340 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
3341 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
3342 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
3343 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
3344 p->cq_off.cqes = offsetof(struct io_rings, cqes);
3345 p->cq_off.flags = offsetof(struct io_rings, cq_flags);
3346
3347 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
3348 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
3349 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
3350 IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
3351 IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
3352 IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
3353 IORING_FEAT_LINKED_FILE;
3354
3355 if (copy_to_user(params, p, sizeof(*p))) {
3356 ret = -EFAULT;
3357 goto err;
3358 }
3359
3360 if (ctx->flags & IORING_SETUP_SINGLE_ISSUER
3361 && !(ctx->flags & IORING_SETUP_R_DISABLED))
3362 ctx->submitter_task = get_task_struct(current);
3363
3364 file = io_uring_get_file(ctx);
3365 if (IS_ERR(file)) {
3366 ret = PTR_ERR(file);
3367 goto err;
3368 }
3369
3370 /*
3371  * Install ring fd as the very last thing, so we don't risk someone
3372  * having closed it before we finish setup
3373  */
3374 ret = io_uring_install_fd(ctx, file);
3375 if (ret < 0) {
3376 /* fput will clean it up */
3377 fput(file);
3378 return ret;
3379 }
3380
3381 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
3382 return ret;
3383 err:
3384 io_ring_ctx_wait_and_kill(ctx);
3385 return ret;
3386 }
3387
3388 /*
3389  * Sets up an aio uring context, and returns the fd. Applications ask for a
3390  * ring size; we return the actual sq/cq ring sizes (among other things) in
3391  * the params structure passed in.
3392  */
3393 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
3394 {
3395 struct io_uring_params p;
3396 int i;
3397
3398 if (copy_from_user(&p, params, sizeof(p)))
3399 return -EFAULT;
3400 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
3401 if (p.resv[i])
3402 return -EINVAL;
3403 }
3404
3405 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
3406 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
3407 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
3408 IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
3409 IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
3410 IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
3411 IORING_SETUP_SINGLE_ISSUER))
3412 return -EINVAL;
3413
3414 return io_uring_create(entries, &p, params);
3415 }
3416
3417 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
3418 struct io_uring_params __user *, params)
3419 {
3420 return io_uring_setup(entries, params);
3421 }
3422
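/*
 * IORING_REGISTER_PROBE: report which opcodes this kernel supports so
 * applications can feature-check at runtime.
 */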
3423 static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
3424 unsigned nr_args)
3425 {
3426 struct io_uring_probe *p;
3427 size_t size;
3428 int i, ret;
3429
3430 size = struct_size(p, ops, nr_args);
3431 if (size == SIZE_MAX)
3432 return -EOVERFLOW;
3433 p = kzalloc(size, GFP_KERNEL);
3434 if (!p)
3435 return -ENOMEM;
3436
3437 ret = -EFAULT;
3438 if (copy_from_user(p, arg, size))
3439 goto out;
3440 ret = -EINVAL;
3441 if (memchr_inv(p, 0, size))
3442 goto out;
3443
3444 p->last_op = IORING_OP_LAST - 1;
3445 if (nr_args > IORING_OP_LAST)
3446 nr_args = IORING_OP_LAST;
3447
3448 for (i = 0; i < nr_args; i++) {
3449 p->ops[i].op = i;
3450 if (!io_op_defs[i].not_supported)
3451 p->ops[i].flags = IO_URING_OP_SUPPORTED;
3452 }
3453 p->ops_len = i;
3454
3455 ret = 0;
3456 if (copy_to_user(arg, p, size))
3457 ret = -EFAULT;
3458 out:
3459 kfree(p);
3460 return ret;
3461 }
3462
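/*
 * IORING_REGISTER_PERSONALITY: stash the caller's credentials and hand back
 * an id that SQEs can reference via sqe->personality.
 */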
3463 static int io_register_personality(struct io_ring_ctx *ctx)
3464 {
3465 const struct cred *creds;
3466 u32 id;
3467 int ret;
3468
3469 creds = get_current_cred();
3470
3471 ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
3472 XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
3473 if (ret < 0) {
3474 put_cred(creds);
3475 return ret;
3476 }
3477 return id;
3478 }
3479
3480 static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
3481 void __user *arg, unsigned int nr_args)
3482 {
3483 struct io_uring_restriction *res;
3484 size_t size;
3485 int i, ret;
3486
3487 /* Restrictions allowed only if rings started disabled */
3488 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
3489 return -EBADFD;
3490
3491 /* We allow only a single restrictions registration */
3492 if (ctx->restrictions.registered)
3493 return -EBUSY;
3494
3495 if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
3496 return -EINVAL;
3497
3498 size = array_size(nr_args, sizeof(*res));
3499 if (size == SIZE_MAX)
3500 return -EOVERFLOW;
3501
3502 res = memdup_user(arg, size);
3503 if (IS_ERR(res))
3504 return PTR_ERR(res);
3505
3506 ret = 0;
3507
3508 for (i = 0; i < nr_args; i++) {
3509 switch (res[i].opcode) {
3510 case IORING_RESTRICTION_REGISTER_OP:
3511 if (res[i].register_op >= IORING_REGISTER_LAST) {
3512 ret = -EINVAL;
3513 goto out;
3514 }
3515
3516 __set_bit(res[i].register_op,
3517 ctx->restrictions.register_op);
3518 break;
3519 case IORING_RESTRICTION_SQE_OP:
3520 if (res[i].sqe_op >= IORING_OP_LAST) {
3521 ret = -EINVAL;
3522 goto out;
3523 }
3524
3525 __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
3526 break;
3527 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
3528 ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
3529 break;
3530 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
3531 ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
3532 break;
3533 default:
3534 ret = -EINVAL;
3535 goto out;
3536 }
3537 }
3538
3539 out:
3540 /* Reset all restrictions if an error happened */
3541 if (ret != 0)
3542 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
3543 else
3544 ctx->restrictions.registered = true;
3545
3546 kfree(res);
3547 return ret;
3548 }
3549
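/*
 * IORING_REGISTER_ENABLE_RINGS: bring a ring created with
 * IORING_SETUP_R_DISABLED live, arming any registered restrictions and
 * waking a waiting SQPOLL thread.
 */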
3550 static int io_register_enable_rings(struct io_ring_ctx *ctx)
3551 {
3552 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
3553 return -EBADFD;
3554
3555 if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task)
3556 ctx->submitter_task = get_task_struct(current);
3557
3558 if (ctx->restrictions.registered)
3559 ctx->restricted = 1;
3560
3561 ctx->flags &= ~IORING_SETUP_R_DISABLED;
3562 if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
3563 wake_up(&ctx->sq_data->wait);
3564 return 0;
3565 }
3566
3567 static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
3568 void __user *arg, unsigned len)
3569 {
3570 struct io_uring_task *tctx = current->io_uring;
3571 cpumask_var_t new_mask;
3572 int ret;
3573
3574 if (!tctx || !tctx->io_wq)
3575 return -EINVAL;
3576
3577 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
3578 return -ENOMEM;
3579
3580 cpumask_clear(new_mask);
3581 if (len > cpumask_size())
3582 len = cpumask_size();
3583
3584 if (in_compat_syscall()) {
3585 ret = compat_get_bitmap(cpumask_bits(new_mask),
3586 (const compat_ulong_t __user *)arg,
3587 len * 8);
3588 } else {
3589 ret = copy_from_user(new_mask, arg, len);
3590 }
3591
3592 if (ret) {
3593 free_cpumask_var(new_mask);
3594 return -EFAULT;
3595 }
3596
3597 ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
3598 free_cpumask_var(new_mask);
3599 return ret;
3600 }
3601
3602 static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
3603 {
3604 struct io_uring_task *tctx = current->io_uring;
3605
3606 if (!tctx || !tctx->io_wq)
3607 return -EINVAL;
3608
3609 return io_wq_cpu_affinity(tctx->io_wq, NULL);
3610 }
3611
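/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: update the [bounded, unbounded] io-wq
 * worker limits and return the previous values in the same array.
 */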
3612 static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
3613 void __user *arg)
3614 __must_hold(&ctx->uring_lock)
3615 {
3616 struct io_tctx_node *node;
3617 struct io_uring_task *tctx = NULL;
3618 struct io_sq_data *sqd = NULL;
3619 __u32 new_count[2];
3620 int i, ret;
3621
3622 if (copy_from_user(new_count, arg, sizeof(new_count)))
3623 return -EFAULT;
3624 for (i = 0; i < ARRAY_SIZE(new_count); i++)
3625 if (new_count[i] > INT_MAX)
3626 return -EINVAL;
3627
3628 if (ctx->flags & IORING_SETUP_SQPOLL) {
3629 sqd = ctx->sq_data;
3630 if (sqd) {
3631 /*
3632  * Observe the correct sqd->lock -> ctx->uring_lock
3633  * ordering. Fine to drop uring_lock here, we hold
3634  * a ref to the ctx.
3635  */
3636 refcount_inc(&sqd->refs);
3637 mutex_unlock(&ctx->uring_lock);
3638 mutex_lock(&sqd->lock);
3639 mutex_lock(&ctx->uring_lock);
3640 if (sqd->thread)
3641 tctx = sqd->thread->io_uring;
3642 }
3643 } else {
3644 tctx = current->io_uring;
3645 }
3646
3647 BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
3648
3649 for (i = 0; i < ARRAY_SIZE(new_count); i++)
3650 if (new_count[i])
3651 ctx->iowq_limits[i] = new_count[i];
3652 ctx->iowq_limits_set = true;
3653
3654 if (tctx && tctx->io_wq) {
3655 ret = io_wq_max_workers(tctx->io_wq, new_count);
3656 if (ret)
3657 goto err;
3658 } else {
3659 memset(new_count, 0, sizeof(new_count));
3660 }
3661
3662 if (sqd) {
3663 mutex_unlock(&sqd->lock);
3664 io_put_sq_data(sqd);
3665 }
3666
3667 if (copy_to_user(arg, new_count, sizeof(new_count)))
3668 return -EFAULT;
3669
3670 /* that's it for SQPOLL, only the SQPOLL task creates requests */
3671 if (sqd)
3672 return 0;
3673
3674 /* now propagate the restriction to all registered users */
3675 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
3676 struct io_uring_task *tctx = node->task->io_uring;
3677
3678 if (WARN_ON_ONCE(!tctx->io_wq))
3679 continue;
3680
3681 for (i = 0; i < ARRAY_SIZE(new_count); i++)
3682 new_count[i] = ctx->iowq_limits[i];
3683
3684 (void)io_wq_max_workers(tctx->io_wq, new_count);
3685 }
3686 return 0;
3687 err:
3688 if (sqd) {
3689 mutex_unlock(&sqd->lock);
3690 io_put_sq_data(sqd);
3691 }
3692 return ret;
3693 }
3694
3695 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
3696 void __user *arg, unsigned nr_args)
3697 __releases(ctx->uring_lock)
3698 __acquires(ctx->uring_lock)
3699 {
3700 int ret;
3701
3702 /*
3703  * We don't quiesce the refs for register anymore and so it can't
3704  * be dying as we're holding a file reference here.
3705  */
3706 if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
3707 return -ENXIO;
3708
3709 if (ctx->restricted) {
3710 if (opcode >= IORING_REGISTER_LAST)
3711 return -EINVAL;
3712 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
3713 if (!test_bit(opcode, ctx->restrictions.register_op))
3714 return -EACCES;
3715 }
3716
3717 switch (opcode) {
3718 case IORING_REGISTER_BUFFERS:
3719 ret = -EFAULT;
3720 if (!arg)
3721 break;
3722 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
3723 break;
3724 case IORING_UNREGISTER_BUFFERS:
3725 ret = -EINVAL;
3726 if (arg || nr_args)
3727 break;
3728 ret = io_sqe_buffers_unregister(ctx);
3729 break;
3730 case IORING_REGISTER_FILES:
3731 ret = -EFAULT;
3732 if (!arg)
3733 break;
3734 ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
3735 break;
3736 case IORING_UNREGISTER_FILES:
3737 ret = -EINVAL;
3738 if (arg || nr_args)
3739 break;
3740 ret = io_sqe_files_unregister(ctx);
3741 break;
3742 case IORING_REGISTER_FILES_UPDATE:
3743 ret = io_register_files_update(ctx, arg, nr_args);
3744 break;
3745 case IORING_REGISTER_EVENTFD:
3746 ret = -EINVAL;
3747 if (nr_args != 1)
3748 break;
3749 ret = io_eventfd_register(ctx, arg, 0);
3750 break;
3751 case IORING_REGISTER_EVENTFD_ASYNC:
3752 ret = -EINVAL;
3753 if (nr_args != 1)
3754 break;
3755 ret = io_eventfd_register(ctx, arg, 1);
3756 break;
3757 case IORING_UNREGISTER_EVENTFD:
3758 ret = -EINVAL;
3759 if (arg || nr_args)
3760 break;
3761 ret = io_eventfd_unregister(ctx);
3762 break;
3763 case IORING_REGISTER_PROBE:
3764 ret = -EINVAL;
3765 if (!arg || nr_args > 256)
3766 break;
3767 ret = io_probe(ctx, arg, nr_args);
3768 break;
3769 case IORING_REGISTER_PERSONALITY:
3770 ret = -EINVAL;
3771 if (arg || nr_args)
3772 break;
3773 ret = io_register_personality(ctx);
3774 break;
3775 case IORING_UNREGISTER_PERSONALITY:
3776 ret = -EINVAL;
3777 if (arg)
3778 break;
3779 ret = io_unregister_personality(ctx, nr_args);
3780 break;
3781 case IORING_REGISTER_ENABLE_RINGS:
3782 ret = -EINVAL;
3783 if (arg || nr_args)
3784 break;
3785 ret = io_register_enable_rings(ctx);
3786 break;
3787 case IORING_REGISTER_RESTRICTIONS:
3788 ret = io_register_restrictions(ctx, arg, nr_args);
3789 break;
3790 case IORING_REGISTER_FILES2:
3791 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
3792 break;
3793 case IORING_REGISTER_FILES_UPDATE2:
3794 ret = io_register_rsrc_update(ctx, arg, nr_args,
3795 IORING_RSRC_FILE);
3796 break;
3797 case IORING_REGISTER_BUFFERS2:
3798 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
3799 break;
3800 case IORING_REGISTER_BUFFERS_UPDATE:
3801 ret = io_register_rsrc_update(ctx, arg, nr_args,
3802 IORING_RSRC_BUFFER);
3803 break;
3804 case IORING_REGISTER_IOWQ_AFF:
3805 ret = -EINVAL;
3806 if (!arg || !nr_args)
3807 break;
3808 ret = io_register_iowq_aff(ctx, arg, nr_args);
3809 break;
3810 case IORING_UNREGISTER_IOWQ_AFF:
3811 ret = -EINVAL;
3812 if (arg || nr_args)
3813 break;
3814 ret = io_unregister_iowq_aff(ctx);
3815 break;
3816 case IORING_REGISTER_IOWQ_MAX_WORKERS:
3817 ret = -EINVAL;
3818 if (!arg || nr_args != 2)
3819 break;
3820 ret = io_register_iowq_max_workers(ctx, arg);
3821 break;
3822 case IORING_REGISTER_RING_FDS:
3823 ret = io_ringfd_register(ctx, arg, nr_args);
3824 break;
3825 case IORING_UNREGISTER_RING_FDS:
3826 ret = io_ringfd_unregister(ctx, arg, nr_args);
3827 break;
3828 case IORING_REGISTER_PBUF_RING:
3829 ret = -EINVAL;
3830 if (!arg || nr_args != 1)
3831 break;
3832 ret = io_register_pbuf_ring(ctx, arg);
3833 break;
3834 case IORING_UNREGISTER_PBUF_RING:
3835 ret = -EINVAL;
3836 if (!arg || nr_args != 1)
3837 break;
3838 ret = io_unregister_pbuf_ring(ctx, arg);
3839 break;
3840 case IORING_REGISTER_SYNC_CANCEL:
3841 ret = -EINVAL;
3842 if (!arg || nr_args != 1)
3843 break;
3844 ret = io_sync_cancel(ctx, arg);
3845 break;
3846 case IORING_REGISTER_FILE_ALLOC_RANGE:
3847 ret = -EINVAL;
3848 if (!arg || nr_args)
3849 break;
3850 ret = io_register_file_alloc_range(ctx, arg);
3851 break;
3852 default:
3853 ret = -EINVAL;
3854 break;
3855 }
3856
3857 return ret;
3858 }
3859
3860 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
3861 void __user *, arg, unsigned int, nr_args)
3862 {
3863 struct io_ring_ctx *ctx;
3864 long ret = -EBADF;
3865 struct fd f;
3866
3867 f = fdget(fd);
3868 if (!f.file)
3869 return -EBADF;
3870
3871 ret = -EOPNOTSUPP;
3872 if (!io_is_uring_fops(f.file))
3873 goto out_fput;
3874
3875 ctx = f.file->private_data;
3876
3877 io_run_task_work();
3878
3879 mutex_lock(&ctx->uring_lock);
3880 ret = __io_uring_register(ctx, opcode, arg, nr_args);
3881 mutex_unlock(&ctx->uring_lock);
3882 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
3883 out_fput:
3884 fdput(f);
3885 return ret;
3886 }
3887
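/*
 * Boot-time init: sanity-check the UAPI struct layouts against their
 * documented offsets, then set up the io_kiocb slab cache.
 */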
3888 static int __init io_uring_init(void)
3889 {
3890 #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \
3891 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
3892 BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \
3893 } while (0)
3894
3895 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
3896 __BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, sizeof(etype), ename)
3897 #define BUILD_BUG_SQE_ELEM_SIZE(eoffset, esize, ename) \
3898 __BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, esize, ename)
3899 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
3900 BUILD_BUG_SQE_ELEM(0, __u8, opcode);
3901 BUILD_BUG_SQE_ELEM(1, __u8, flags);
3902 BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
3903 BUILD_BUG_SQE_ELEM(4, __s32, fd);
3904 BUILD_BUG_SQE_ELEM(8, __u64, off);
3905 BUILD_BUG_SQE_ELEM(8, __u64, addr2);
3906 BUILD_BUG_SQE_ELEM(8, __u32, cmd_op);
3907 BUILD_BUG_SQE_ELEM(12, __u32, __pad1);
3908 BUILD_BUG_SQE_ELEM(16, __u64, addr);
3909 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
3910 BUILD_BUG_SQE_ELEM(24, __u32, len);
3911 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
3912 BUILD_BUG_SQE_ELEM(28, int, rw_flags);
3913 BUILD_BUG_SQE_ELEM(28, __u32, rw_flags);
3914 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
3915 BUILD_BUG_SQE_ELEM(28, __u16, poll_events);
3916 BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
3917 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
3918 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
3919 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
3920 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
3921 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
3922 BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
3923 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
3924 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
3925 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
3926 BUILD_BUG_SQE_ELEM(28, __u32, rename_flags);
3927 BUILD_BUG_SQE_ELEM(28, __u32, unlink_flags);
3928 BUILD_BUG_SQE_ELEM(28, __u32, hardlink_flags);
3929 BUILD_BUG_SQE_ELEM(28, __u32, xattr_flags);
3930 BUILD_BUG_SQE_ELEM(28, __u32, msg_ring_flags);
3931 BUILD_BUG_SQE_ELEM(32, __u64, user_data);
3932 BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
3933 BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
3934 BUILD_BUG_SQE_ELEM(42, __u16, personality);
3935 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
3936 BUILD_BUG_SQE_ELEM(44, __u32, file_index);
3937 BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
3938 BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
3939 BUILD_BUG_SQE_ELEM(48, __u64, addr3);
3940 BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
3941 BUILD_BUG_SQE_ELEM(56, __u64, __pad2);
3942
3943 BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
3944 sizeof(struct io_uring_rsrc_update));
3945 BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
3946 sizeof(struct io_uring_rsrc_update2));
3947
3948 /* io_uring_buf_ring overlays an array of io_uring_buf; tail aliases bufs[0].resv */
3949 BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
3950 BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
3951 offsetof(struct io_uring_buf_ring, tail));
3952
3953 /* should fit into one byte */
3954 BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
3955 BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
3956 BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
3957
3958 BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
3959
3960 BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
3961
3962 io_uring_optable_init();
3963
3964 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
3965 SLAB_ACCOUNT);
3966 return 0;
3967 };
3968 __initcall(io_uring_init);