0001 // SPDX-License-Identifier: GPL-2.0
0002 #include <linux/kernel.h>
0003 #include <linux/errno.h>
0004 #include <linux/fs.h>
0005 #include <linux/file.h>
0006 #include <linux/mm.h>
0007 #include <linux/slab.h>
0008 #include <linux/nospec.h>
0009 #include <linux/hugetlb.h>
0010 #include <linux/compat.h>
0011 #include <linux/io_uring.h>
0012
0013 #include <uapi/linux/io_uring.h>
0014
0015 #include "io_uring.h"
0016 #include "openclose.h"
0017 #include "rsrc.h"
0018
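/*
 * Decoded form of a files/rsrc update request: the user pointer holding
 * the new entries (->arg), how many of them there are (->nr_args), and
 * the offset in the fixed table to start at (->offset, which may be
 * IORING_FILE_INDEX_ALLOC for io_files_update()).
 */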
0019 struct io_rsrc_update {
0020 struct file *file;
0021 u64 arg;
0022 u32 nr_args;
0023 u32 offset;
0024 };
0025
0026 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
0027 struct io_mapped_ubuf **pimu,
0028 struct page **last_hpage);
0029
0030 #define IO_RSRC_REF_BATCH 100
0031
0032 /* hard caps on the number of registered files and buffers */
0033 #define IORING_MAX_FIXED_FILES (1U << 20)
0034 #define IORING_MAX_REG_BUFFERS (1U << 14)
0035
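/*
 * Resource nodes are reference counted per ctx. To avoid touching the
 * percpu ref for every request, up to IO_RSRC_REF_BATCH references are
 * cached in ctx->rsrc_cached_refs (see io_rsrc_refs_refill() below);
 * io_rsrc_refs_drop() returns whatever is still cached to the node.
 */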
0036 void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
0037 __must_hold(&ctx->uring_lock)
0038 {
0039 if (ctx->rsrc_cached_refs) {
0040 io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
0041 ctx->rsrc_cached_refs = 0;
0042 }
0043 }
0044
0045 int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
0046 {
0047 unsigned long page_limit, cur_pages, new_pages;
0048
0049 if (!nr_pages)
0050 return 0;
0051
0052 /* don't allow more pages than we can safely lock */
0053 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
0054
0055 cur_pages = atomic_long_read(&user->locked_vm);
0056 do {
0057 new_pages = cur_pages + nr_pages;
0058 if (new_pages > page_limit)
0059 return -ENOMEM;
0060 } while (!atomic_long_try_cmpxchg(&user->locked_vm,
0061 &cur_pages, new_pages));
0062 return 0;
0063 }
0064
0065 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
0066 {
0067 if (ctx->user)
0068 __io_unaccount_mem(ctx->user, nr_pages);
0069
0070 if (ctx->mm_account)
0071 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
0072 }
0073
0074 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
0075 {
0076 int ret;
0077
0078 if (ctx->user) {
0079 ret = __io_account_mem(ctx->user, nr_pages);
0080 if (ret)
0081 return ret;
0082 }
0083
0084 if (ctx->mm_account)
0085 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
0086
0087 return 0;
0088 }
0089
0090 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
0091 void __user *arg, unsigned index)
0092 {
0093 struct iovec __user *src;
0094
0095 #ifdef CONFIG_COMPAT
0096 if (ctx->compat) {
0097 struct compat_iovec __user *ciovs;
0098 struct compat_iovec ciov;
0099
0100 ciovs = (struct compat_iovec __user *) arg;
0101 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
0102 return -EFAULT;
0103
0104 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
0105 dst->iov_len = ciov.iov_len;
0106 return 0;
0107 }
0108 #endif
0109 src = (struct iovec __user *) arg;
0110 if (copy_from_user(dst, &src[index], sizeof(*dst)))
0111 return -EFAULT;
0112 return 0;
0113 }
0114
0115 static int io_buffer_validate(struct iovec *iov)
0116 {
0117 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
0118
0119 /*
0120  * Don't impose further limits on the size and buffer constraints here;
0121  * we'll return -EINVAL later, when the IO is actually submitted, if
0122  * they turn out to be wrong.
0123  */
0124 if (!iov->iov_base)
0125 return iov->iov_len ? -EFAULT : 0;
0126 if (!iov->iov_len)
0127 return -EFAULT;
0128
0129 /* arbitrary 1G cap, but we need some limit */
0130 if (iov->iov_len > SZ_1G)
0131 return -EFAULT;
0132
0133 if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
0134 return -EOVERFLOW;
0135
0136 return 0;
0137 }
0138
0139 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
0140 {
0141 struct io_mapped_ubuf *imu = *slot;
0142 unsigned int i;
0143
0144 if (imu != ctx->dummy_ubuf) {
0145 for (i = 0; i < imu->nr_bvecs; i++)
0146 unpin_user_page(imu->bvec[i].bv_page);
0147 if (imu->acct_pages)
0148 io_unaccount_mem(ctx, imu->acct_pages);
0149 kvfree(imu);
0150 }
0151 *slot = NULL;
0152 }
0153
0154 void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
0155 __must_hold(&ctx->uring_lock)
0156 {
0157 ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
0158 percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
0159 }
0160
0161 static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
0162 {
0163 struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
0164 struct io_ring_ctx *ctx = rsrc_data->ctx;
0165 struct io_rsrc_put *prsrc, *tmp;
0166
0167 list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
0168 list_del(&prsrc->list);
0169
0170 if (prsrc->tag) {
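/* IOPOLL rings post CQEs under ->uring_lock, so grab it for the post */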
0171 if (ctx->flags & IORING_SETUP_IOPOLL) {
0172 mutex_lock(&ctx->uring_lock);
0173 io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true);
0174 mutex_unlock(&ctx->uring_lock);
0175 } else {
0176 io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true);
0177 }
0178 }
0179
0180 rsrc_data->do_put(ctx, prsrc);
0181 kfree(prsrc);
0182 }
0183
0184 io_rsrc_node_destroy(ref_node);
0185 if (atomic_dec_and_test(&rsrc_data->refs))
0186 complete(&rsrc_data->done);
0187 }
0188
0189 void io_rsrc_put_work(struct work_struct *work)
0190 {
0191 struct io_ring_ctx *ctx;
0192 struct llist_node *node;
0193
0194 ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
0195 node = llist_del_all(&ctx->rsrc_put_llist);
0196
0197 while (node) {
0198 struct io_rsrc_node *ref_node;
0199 struct llist_node *next = node->next;
0200
0201 ref_node = llist_entry(node, struct io_rsrc_node, llist);
0202 __io_rsrc_put_work(ref_node);
0203 node = next;
0204 }
0205 }
0206
0207 void io_wait_rsrc_data(struct io_rsrc_data *data)
0208 {
0209 if (data && !atomic_dec_and_test(&data->refs))
0210 wait_for_completion(&data->done);
0211 }
0212
0213 void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
0214 {
0215 percpu_ref_exit(&ref_node->refs);
0216 kfree(ref_node);
0217 }
0218
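/*
 * Called when the last reference to a resource node is dropped. Nodes
 * are retired in the order they were added to ->rsrc_ref_list: mark
 * this one done, splice any leading run of completed nodes onto the
 * put llist, and schedule rsrc_put_work (without delay if a quiesce is
 * in progress).
 */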
0219 static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
0220 {
0221 struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
0222 struct io_ring_ctx *ctx = node->rsrc_data->ctx;
0223 unsigned long flags;
0224 bool first_add = false;
0225 unsigned long delay = HZ;
0226
0227 spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
0228 node->done = true;
0229
0230 /* if we are mid-quiesce, don't delay the put work */
0231 if (node->rsrc_data->quiesce)
0232 delay = 0;
0233
0234 while (!list_empty(&ctx->rsrc_ref_list)) {
0235 node = list_first_entry(&ctx->rsrc_ref_list,
0236 struct io_rsrc_node, node);
0237
0238 if (!node->done)
0239 break;
0240 list_del(&node->node);
0241 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
0242 }
0243 spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
0244
0245 if (first_add)
0246 mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
0247 }
0248
0249 static struct io_rsrc_node *io_rsrc_node_alloc(void)
0250 {
0251 struct io_rsrc_node *ref_node;
0252
0253 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
0254 if (!ref_node)
0255 return NULL;
0256
0257 if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
0258 0, GFP_KERNEL)) {
0259 kfree(ref_node);
0260 return NULL;
0261 }
0262 INIT_LIST_HEAD(&ref_node->node);
0263 INIT_LIST_HEAD(&ref_node->rsrc_list);
0264 ref_node->done = false;
0265 return ref_node;
0266 }
0267
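/*
 * Retire the current resource node and install the pre-allocated backup
 * node in its place. If @data_to_kill is given, the old node is queued
 * on ->rsrc_ref_list and its percpu ref is killed, so it gets released
 * once all requests that still hold a reference complete.
 */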
0268 void io_rsrc_node_switch(struct io_ring_ctx *ctx,
0269 struct io_rsrc_data *data_to_kill)
0270 __must_hold(&ctx->uring_lock)
0271 {
0272 WARN_ON_ONCE(!ctx->rsrc_backup_node);
0273 WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
0274
0275 io_rsrc_refs_drop(ctx);
0276
0277 if (data_to_kill) {
0278 struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
0279
0280 rsrc_node->rsrc_data = data_to_kill;
0281 spin_lock_irq(&ctx->rsrc_ref_lock);
0282 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
0283 spin_unlock_irq(&ctx->rsrc_ref_lock);
0284
0285 atomic_inc(&data_to_kill->refs);
0286 percpu_ref_kill(&rsrc_node->refs);
0287 ctx->rsrc_node = NULL;
0288 }
0289
0290 if (!ctx->rsrc_node) {
0291 ctx->rsrc_node = ctx->rsrc_backup_node;
0292 ctx->rsrc_backup_node = NULL;
0293 }
0294 }
0295
0296 int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
0297 {
0298 if (ctx->rsrc_backup_node)
0299 return 0;
0300 ctx->rsrc_backup_node = io_rsrc_node_alloc();
0301 return ctx->rsrc_backup_node ? 0 : -ENOMEM;
0302 }
0303
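/*
 * Wait until all in-flight references to @data have been dropped: switch
 * away from the current node, drop the initial data reference, and if
 * references remain, release ->uring_lock and wait interruptibly, running
 * task work and reviving the reference whenever the wait is interrupted.
 */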
0304 __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
0305 struct io_ring_ctx *ctx)
0306 {
0307 int ret;
0308
0309 /* we may drop ->uring_lock, so another task may already be quiescing */
0310 if (data->quiesce)
0311 return -ENXIO;
0312
0313 data->quiesce = true;
0314 do {
0315 ret = io_rsrc_node_switch_start(ctx);
0316 if (ret)
0317 break;
0318 io_rsrc_node_switch(ctx, data);
0319
0320 /* kill the initial ref; if it hits zero we are already quiesced */
0321 if (atomic_dec_and_test(&data->refs))
0322 break;
0323 mutex_unlock(&ctx->uring_lock);
0324 flush_delayed_work(&ctx->rsrc_put_work);
0325 ret = wait_for_completion_interruptible(&data->done);
0326 if (!ret) {
0327 mutex_lock(&ctx->uring_lock);
0328 if (atomic_read(&data->refs) > 0) {
0329 /*
0330  * The references were revived by another task while we were
0331  * unlocked; retry the quiesce.
0332  */
0333 mutex_unlock(&ctx->uring_lock);
0334 } else {
0335 break;
0336 }
0337 }
0338
0339 atomic_inc(&data->refs);
0340 /* wait for any work that may still complete data->done */
0341 flush_delayed_work(&ctx->rsrc_put_work);
0342 reinit_completion(&data->done);
0343
0344 ret = io_run_task_work_sig();
0345 mutex_lock(&ctx->uring_lock);
0346 } while (ret >= 0);
0347 data->quiesce = false;
0348
0349 return ret;
0350 }
0351
0352 static void io_free_page_table(void **table, size_t size)
0353 {
0354 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
0355
0356 for (i = 0; i < nr_tables; i++)
0357 kfree(table[i]);
0358 kfree(table);
0359 }
0360
0361 static void io_rsrc_data_free(struct io_rsrc_data *data)
0362 {
0363 size_t size = data->nr * sizeof(data->tags[0][0]);
0364
0365 if (data->tags)
0366 io_free_page_table((void **)data->tags, size);
0367 kfree(data);
0368 }
0369
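/*
 * Allocate the table as an array of PAGE_SIZE chunks so that large tag
 * arrays don't require one big contiguous allocation.
 */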
0370 static __cold void **io_alloc_page_table(size_t size)
0371 {
0372 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
0373 size_t init_size = size;
0374 void **table;
0375
0376 table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
0377 if (!table)
0378 return NULL;
0379
0380 for (i = 0; i < nr_tables; i++) {
0381 unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
0382
0383 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
0384 if (!table[i]) {
0385 io_free_page_table(table, init_size);
0386 return NULL;
0387 }
0388 size -= this_size;
0389 }
0390 return table;
0391 }
0392
0393 __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
0394 rsrc_put_fn *do_put, u64 __user *utags,
0395 unsigned nr, struct io_rsrc_data **pdata)
0396 {
0397 struct io_rsrc_data *data;
0398 int ret = -ENOMEM;
0399 unsigned i;
0400
0401 data = kzalloc(sizeof(*data), GFP_KERNEL);
0402 if (!data)
0403 return -ENOMEM;
0404 data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
0405 if (!data->tags) {
0406 kfree(data);
0407 return -ENOMEM;
0408 }
0409
0410 data->nr = nr;
0411 data->ctx = ctx;
0412 data->do_put = do_put;
0413 if (utags) {
0414 ret = -EFAULT;
0415 for (i = 0; i < nr; i++) {
0416 u64 *tag_slot = io_get_tag_slot(data, i);
0417
0418 if (copy_from_user(tag_slot, &utags[i],
0419 sizeof(*tag_slot)))
0420 goto fail;
0421 }
0422 }
0423
0424 atomic_set(&data->refs, 1);
0425 init_completion(&data->done);
0426 *pdata = data;
0427 return 0;
0428 fail:
0429 io_rsrc_data_free(data);
0430 return ret;
0431 }
0432
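/*
 * Apply an update to the registered file table: for each entry, an fd of
 * IORING_REGISTER_FILES_SKIP leaves the slot alone, -1 clears it, and any
 * other fd replaces it. Displaced files are queued on the current rsrc
 * node and released once that node's references drain after the switch.
 */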
0433 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
0434 struct io_uring_rsrc_update2 *up,
0435 unsigned nr_args)
0436 {
0437 u64 __user *tags = u64_to_user_ptr(up->tags);
0438 __s32 __user *fds = u64_to_user_ptr(up->data);
0439 struct io_rsrc_data *data = ctx->file_data;
0440 struct io_fixed_file *file_slot;
0441 struct file *file;
0442 int fd, i, err = 0;
0443 unsigned int done;
0444 bool needs_switch = false;
0445
0446 if (!ctx->file_data)
0447 return -ENXIO;
0448 if (up->offset + nr_args > ctx->nr_user_files)
0449 return -EINVAL;
0450
0451 for (done = 0; done < nr_args; done++) {
0452 u64 tag = 0;
0453
0454 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
0455 copy_from_user(&fd, &fds[done], sizeof(fd))) {
0456 err = -EFAULT;
0457 break;
0458 }
0459 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
0460 err = -EINVAL;
0461 break;
0462 }
0463 if (fd == IORING_REGISTER_FILES_SKIP)
0464 continue;
0465
0466 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
0467 file_slot = io_fixed_file_slot(&ctx->file_table, i);
0468
0469 if (file_slot->file_ptr) {
0470 file = (struct file *)(file_slot->file_ptr & FFS_MASK);
0471 err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
0472 if (err)
0473 break;
0474 file_slot->file_ptr = 0;
0475 io_file_bitmap_clear(&ctx->file_table, i);
0476 needs_switch = true;
0477 }
0478 if (fd != -1) {
0479 file = fget(fd);
0480 if (!file) {
0481 err = -EBADF;
0482 break;
0483 }
0484 /*
0485  * Don't allow io_uring instances to be registered. If UNIX
0486  * isn't enabled, then this causes a reference cycle and this
0487  * instance can never get freed. If UNIX is enabled we'll
0488  * handle it just fine, but there's still no point in allowing
0489  * a ring fd, as it doesn't support regular read/write
0490  * anyway.
0491  */
0492 if (io_is_uring_fops(file)) {
0493 fput(file);
0494 err = -EBADF;
0495 break;
0496 }
0497 err = io_scm_file_account(ctx, file);
0498 if (err) {
0499 fput(file);
0500 break;
0501 }
0502 *io_get_tag_slot(data, i) = tag;
0503 io_fixed_file_set(file_slot, file);
0504 io_file_bitmap_set(&ctx->file_table, i);
0505 }
0506 }
0507
0508 if (needs_switch)
0509 io_rsrc_node_switch(ctx, data);
0510 return done ? done : err;
0511 }
0512
0513 static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
0514 struct io_uring_rsrc_update2 *up,
0515 unsigned int nr_args)
0516 {
0517 u64 __user *tags = u64_to_user_ptr(up->tags);
0518 struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
0519 struct page *last_hpage = NULL;
0520 bool needs_switch = false;
0521 __u32 done;
0522 int i, err;
0523
0524 if (!ctx->buf_data)
0525 return -ENXIO;
0526 if (up->offset + nr_args > ctx->nr_user_bufs)
0527 return -EINVAL;
0528
0529 for (done = 0; done < nr_args; done++) {
0530 struct io_mapped_ubuf *imu;
0531 int offset = up->offset + done;
0532 u64 tag = 0;
0533
0534 err = io_copy_iov(ctx, &iov, iovs, done);
0535 if (err)
0536 break;
0537 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
0538 err = -EFAULT;
0539 break;
0540 }
0541 err = io_buffer_validate(&iov);
0542 if (err)
0543 break;
0544 if (!iov.iov_base && tag) {
0545 err = -EINVAL;
0546 break;
0547 }
0548 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
0549 if (err)
0550 break;
0551
0552 i = array_index_nospec(offset, ctx->nr_user_bufs);
0553 if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
0554 err = io_queue_rsrc_removal(ctx->buf_data, i,
0555 ctx->rsrc_node, ctx->user_bufs[i]);
0556 if (unlikely(err)) {
0557 io_buffer_unmap(ctx, &imu);
0558 break;
0559 }
0560 ctx->user_bufs[i] = ctx->dummy_ubuf;
0561 needs_switch = true;
0562 }
0563
0564 ctx->user_bufs[i] = imu;
0565 *io_get_tag_slot(ctx->buf_data, offset) = tag;
0566 }
0567
0568 if (needs_switch)
0569 io_rsrc_node_switch(ctx, ctx->buf_data);
0570 return done ? done : err;
0571 }
0572
0573 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
0574 struct io_uring_rsrc_update2 *up,
0575 unsigned nr_args)
0576 {
0577 __u32 tmp;
0578 int err;
0579
0580 if (check_add_overflow(up->offset, nr_args, &tmp))
0581 return -EOVERFLOW;
0582 err = io_rsrc_node_switch_start(ctx);
0583 if (err)
0584 return err;
0585
0586 switch (type) {
0587 case IORING_RSRC_FILE:
0588 return __io_sqe_files_update(ctx, up, nr_args);
0589 case IORING_RSRC_BUFFER:
0590 return __io_sqe_buffers_update(ctx, up, nr_args);
0591 }
0592 return -EINVAL;
0593 }
0594
0595 int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
0596 unsigned nr_args)
0597 {
0598 struct io_uring_rsrc_update2 up;
0599
0600 if (!nr_args)
0601 return -EINVAL;
0602 memset(&up, 0, sizeof(up));
0603 if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
0604 return -EFAULT;
0605 if (up.resv || up.resv2)
0606 return -EINVAL;
0607 return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
0608 }
0609
0610 int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
0611 unsigned size, unsigned type)
0612 {
0613 struct io_uring_rsrc_update2 up;
0614
0615 if (size != sizeof(up))
0616 return -EINVAL;
0617 if (copy_from_user(&up, arg, sizeof(up)))
0618 return -EFAULT;
0619 if (!up.nr || up.resv || up.resv2)
0620 return -EINVAL;
0621 return __io_register_rsrc_update(ctx, type, &up, up.nr);
0622 }
0623
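/*
 * Handler for the "2" variants of file/buffer registration, which take a
 * struct io_uring_rsrc_register. Roughly, userspace reaches this through
 * the io_uring_register(2) syscall with something like the following
 * (illustrative only, not taken from this file):
 *
 *	struct io_uring_rsrc_register rr = {
 *		.nr   = nr_iovecs,
 *		.data = (__u64)(unsigned long)iovecs,
 *		.tags = (__u64)(unsigned long)tags,
 *	};
 *	io_uring_register(ring_fd, IORING_REGISTER_BUFFERS2, &rr, sizeof(rr));
 */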
0624 __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
0625 unsigned int size, unsigned int type)
0626 {
0627 struct io_uring_rsrc_register rr;
0628
0629 /* size must match exactly; the check lets the struct be extended later */
0630 if (size != sizeof(rr))
0631 return -EINVAL;
0632
0633 memset(&rr, 0, sizeof(rr));
0634 if (copy_from_user(&rr, arg, size))
0635 return -EFAULT;
0636 if (!rr.nr || rr.resv2)
0637 return -EINVAL;
0638 if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
0639 return -EINVAL;
0640
0641 switch (type) {
0642 case IORING_RSRC_FILE:
0643 if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
0644 break;
0645 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
0646 rr.nr, u64_to_user_ptr(rr.tags));
0647 case IORING_RSRC_BUFFER:
0648 if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
0649 break;
0650 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
0651 rr.nr, u64_to_user_ptr(rr.tags));
0652 }
0653 return -EINVAL;
0654 }
0655
0656 int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
0657 {
0658 struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
0659
0660 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
0661 return -EINVAL;
0662 if (sqe->rw_flags || sqe->splice_fd_in)
0663 return -EINVAL;
0664
0665 up->offset = READ_ONCE(sqe->off);
0666 up->nr_args = READ_ONCE(sqe->len);
0667 if (!up->nr_args)
0668 return -EINVAL;
0669 up->arg = READ_ONCE(sqe->addr);
0670 return 0;
0671 }
0672
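/*
 * Slow path for IORING_FILE_INDEX_ALLOC: install each fd into a free
 * fixed slot and copy the chosen index back to userspace. If that copy
 * fails, the just-installed slot is closed again before bailing out.
 */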
0673 static int io_files_update_with_index_alloc(struct io_kiocb *req,
0674 unsigned int issue_flags)
0675 {
0676 struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
0677 __s32 __user *fds = u64_to_user_ptr(up->arg);
0678 unsigned int done;
0679 struct file *file;
0680 int ret, fd;
0681
0682 if (!req->ctx->file_data)
0683 return -ENXIO;
0684
0685 for (done = 0; done < up->nr_args; done++) {
0686 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
0687 ret = -EFAULT;
0688 break;
0689 }
0690
0691 file = fget(fd);
0692 if (!file) {
0693 ret = -EBADF;
0694 break;
0695 }
0696 ret = io_fixed_fd_install(req, issue_flags, file,
0697 IORING_FILE_INDEX_ALLOC);
0698 if (ret < 0)
0699 break;
0700 if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
0701 __io_close_fixed(req->ctx, issue_flags, ret);
0702 ret = -EFAULT;
0703 break;
0704 }
0705 }
0706
0707 if (done)
0708 return done;
0709 return ret;
0710 }
0711
0712 int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
0713 {
0714 struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
0715 struct io_ring_ctx *ctx = req->ctx;
0716 struct io_uring_rsrc_update2 up2;
0717 int ret;
0718
0719 up2.offset = up->offset;
0720 up2.data = up->arg;
0721 up2.nr = 0;
0722 up2.tags = 0;
0723 up2.resv = 0;
0724 up2.resv2 = 0;
0725
0726 if (up->offset == IORING_FILE_INDEX_ALLOC) {
0727 ret = io_files_update_with_index_alloc(req, issue_flags);
0728 } else {
0729 io_ring_submit_lock(ctx, issue_flags);
0730 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
0731 &up2, up->nr_args);
0732 io_ring_submit_unlock(ctx, issue_flags);
0733 }
0734
0735 if (ret < 0)
0736 req_set_fail(req);
0737 io_req_set_res(req, ret, 0);
0738 return IOU_OK;
0739 }
0740
0741 int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
0742 struct io_rsrc_node *node, void *rsrc)
0743 {
0744 u64 *tag_slot = io_get_tag_slot(data, idx);
0745 struct io_rsrc_put *prsrc;
0746
0747 prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
0748 if (!prsrc)
0749 return -ENOMEM;
0750
0751 prsrc->tag = *tag_slot;
0752 *tag_slot = 0;
0753 prsrc->rsrc = rsrc;
0754 list_add(&prsrc->list, &node->rsrc_list);
0755 return 0;
0756 }
0757
0758 void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
0759 {
0760 #if !defined(IO_URING_SCM_ALL)
0761 int i;
0762
0763 for (i = 0; i < ctx->nr_user_files; i++) {
0764 struct file *file = io_file_from_index(&ctx->file_table, i);
0765
0766 if (!file)
0767 continue;
0768 if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
0769 continue;
0770 io_file_bitmap_clear(&ctx->file_table, i);
0771 fput(file);
0772 }
0773 #endif
0774
0775 #if defined(CONFIG_UNIX)
0776 if (ctx->ring_sock) {
0777 struct sock *sock = ctx->ring_sock->sk;
0778 struct sk_buff *skb;
0779
0780 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
0781 kfree_skb(skb);
0782 }
0783 #endif
0784 io_free_file_tables(&ctx->file_table);
0785 io_rsrc_data_free(ctx->file_data);
0786 ctx->file_data = NULL;
0787 ctx->nr_user_files = 0;
0788 }
0789
0790 int io_sqe_files_unregister(struct io_ring_ctx *ctx)
0791 {
0792 unsigned nr = ctx->nr_user_files;
0793 int ret;
0794
0795 if (!ctx->file_data)
0796 return -ENXIO;
0797
0798 /*
0799  * Quiesce may unlock ->uring_lock, and while it's not held
0800  * prevent new requests from using the table.
0801  */
0802 ctx->nr_user_files = 0;
0803 ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
0804 ctx->nr_user_files = nr;
0805 if (!ret)
0806 __io_sqe_files_unregister(ctx);
0807 return ret;
0808 }
0809
0810 /*
0811  * Ensure the UNIX gc is aware of our file set, so we are certain
0812  * that the io_uring can be safely unregistered on process exit,
0813  * even if we still have loose references to the file. The file is
0814  * queued on an SCM_RIGHTS skb on the ring's own socket, which makes
0815  * it visible to the garbage collector as an in-flight fd.
0816  */
0817 int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
0818 {
0819 #if defined(CONFIG_UNIX)
0820 struct sock *sk = ctx->ring_sock->sk;
0821 struct sk_buff_head *head = &sk->sk_receive_queue;
0822 struct scm_fp_list *fpl;
0823 struct sk_buff *skb;
0824
0825 if (likely(!io_file_need_scm(file)))
0826 return 0;
0827
0828 /*
0829  * See if we can merge the file into an existing SCM_RIGHTS
0830  * skb that still has room; otherwise fall back to allocating
0831  * a fresh skb and fp list below.
0832  */
0833 spin_lock_irq(&head->lock);
0834 skb = skb_peek(head);
0835 if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
0836 __skb_unlink(skb, head);
0837 else
0838 skb = NULL;
0839 spin_unlock_irq(&head->lock);
0840
0841 if (!skb) {
0842 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
0843 if (!fpl)
0844 return -ENOMEM;
0845
0846 skb = alloc_skb(0, GFP_KERNEL);
0847 if (!skb) {
0848 kfree(fpl);
0849 return -ENOMEM;
0850 }
0851
0852 fpl->user = get_uid(current_user());
0853 fpl->max = SCM_MAX_FD;
0854 fpl->count = 0;
0855
0856 UNIXCB(skb).fp = fpl;
0857 skb->sk = sk;
0858 skb->destructor = unix_destruct_scm;
0859 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
0860 }
0861
0862 fpl = UNIXCB(skb).fp;
0863 fpl->fp[fpl->count++] = get_file(file);
0864 unix_inflight(fpl->user, file);
0865 skb_queue_head(head, skb);
0866 fput(file);
0867 #endif
0868 return 0;
0869 }
0870
0871 static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
0872 {
0873 struct file *file = prsrc->file;
0874 #if defined(CONFIG_UNIX)
0875 struct sock *sock = ctx->ring_sock->sk;
0876 struct sk_buff_head list, *head = &sock->sk_receive_queue;
0877 struct sk_buff *skb;
0878 int i;
0879
0880 if (!io_file_need_scm(file)) {
0881 fput(file);
0882 return;
0883 }
0884
0885 __skb_queue_head_init(&list);
0886
0887 /*
0888  * Find the skb that holds this file in its SCM_RIGHTS list and,
0889  * when found, remove the entry and close up the file array.
0890  */
0891 skb = skb_dequeue(head);
0892 while (skb) {
0893 struct scm_fp_list *fp;
0894
0895 fp = UNIXCB(skb).fp;
0896 for (i = 0; i < fp->count; i++) {
0897 int left;
0898
0899 if (fp->fp[i] != file)
0900 continue;
0901
0902 unix_notinflight(fp->user, fp->fp[i]);
0903 left = fp->count - 1 - i;
0904 if (left) {
0905 memmove(&fp->fp[i], &fp->fp[i + 1],
0906 left * sizeof(struct file *));
0907 }
0908 fp->count--;
0909 if (!fp->count) {
0910 kfree_skb(skb);
0911 skb = NULL;
0912 } else {
0913 __skb_queue_tail(&list, skb);
0914 }
0915 fput(file);
0916 file = NULL;
0917 break;
0918 }
0919
0920 if (!file)
0921 break;
0922
0923 __skb_queue_tail(&list, skb);
0924
0925 skb = skb_dequeue(head);
0926 }
0927
0928 if (skb_peek(&list)) {
0929 spin_lock_irq(&head->lock);
0930 while ((skb = __skb_dequeue(&list)) != NULL)
0931 __skb_queue_tail(head, skb);
0932 spin_unlock_irq(&head->lock);
0933 }
0934 #else
0935 fput(file);
0936 #endif
0937 }
0938
0939 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
0940 unsigned nr_args, u64 __user *tags)
0941 {
0942 __s32 __user *fds = (__s32 __user *) arg;
0943 struct file *file;
0944 int fd, ret;
0945 unsigned i;
0946
0947 if (ctx->file_data)
0948 return -EBUSY;
0949 if (!nr_args)
0950 return -EINVAL;
0951 if (nr_args > IORING_MAX_FIXED_FILES)
0952 return -EMFILE;
0953 if (nr_args > rlimit(RLIMIT_NOFILE))
0954 return -EMFILE;
0955 ret = io_rsrc_node_switch_start(ctx);
0956 if (ret)
0957 return ret;
0958 ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
0959 &ctx->file_data);
0960 if (ret)
0961 return ret;
0962
0963 if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
0964 io_rsrc_data_free(ctx->file_data);
0965 ctx->file_data = NULL;
0966 return -ENOMEM;
0967 }
0968
0969 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
0970 struct io_fixed_file *file_slot;
0971
0972 if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
0973 ret = -EFAULT;
0974 goto fail;
0975 }
0976
0977 if (!fds || fd == -1) {
0978 ret = -EINVAL;
0979 if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
0980 goto fail;
0981 continue;
0982 }
0983
0984 file = fget(fd);
0985 ret = -EBADF;
0986 if (unlikely(!file))
0987 goto fail;
0988
0989 /*
0990  * Don't allow io_uring instances to be registered. If UNIX
0991  * isn't enabled, then this causes a reference cycle and this
0992  * instance can never get freed. If UNIX is enabled we'll
0993  * handle it just fine, but there's still no point in allowing
0994  * a ring fd, as it doesn't support regular read/write anyway.
0995  */
0996 if (io_is_uring_fops(file)) {
0997 fput(file);
0998 goto fail;
0999 }
1000 ret = io_scm_file_account(ctx, file);
1001 if (ret) {
1002 fput(file);
1003 goto fail;
1004 }
1005 file_slot = io_fixed_file_slot(&ctx->file_table, i);
1006 io_fixed_file_set(file_slot, file);
1007 io_file_bitmap_set(&ctx->file_table, i);
1008 }
1009
1010 /* default the fd allocation range to the whole table */
1011 io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
1012 io_rsrc_node_switch(ctx, NULL);
1013 return 0;
1014 fail:
1015 __io_sqe_files_unregister(ctx);
1016 return ret;
1017 }
1018
1019 static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
1020 {
1021 io_buffer_unmap(ctx, &prsrc->buf);
1022 prsrc->buf = NULL;
1023 }
1024
1025 void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
1026 {
1027 unsigned int i;
1028
1029 for (i = 0; i < ctx->nr_user_bufs; i++)
1030 io_buffer_unmap(ctx, &ctx->user_bufs[i]);
1031 kfree(ctx->user_bufs);
1032 io_rsrc_data_free(ctx->buf_data);
1033 ctx->user_bufs = NULL;
1034 ctx->buf_data = NULL;
1035 ctx->nr_user_bufs = 0;
1036 }
1037
1038 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
1039 {
1040 unsigned nr = ctx->nr_user_bufs;
1041 int ret;
1042
1043 if (!ctx->buf_data)
1044 return -ENXIO;
1045
1046 /*
1047  * Quiesce may unlock ->uring_lock, and while it's not held
1048  * prevent new requests from using the table.
1049  */
1050 ctx->nr_user_bufs = 0;
1051 ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
1052 ctx->nr_user_bufs = nr;
1053 if (!ret)
1054 __io_sqe_buffers_unregister(ctx);
1055 return ret;
1056 }
1057
1058 /*
1059  * Huge (compound) pages must only be accounted once, no matter how
1060  * many registered buffers map into them. Check whether the given
1061  * compound head has been charged already, either by an earlier page
1062  * in the current array or by a previously registered buffer.
1063  *
1064  * Not super efficient, but this only runs at registration time, and
1065  * the caller caches the last compound head to avoid most lookups.
1066  */
1067 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
1068 int nr_pages, struct page *hpage)
1069 {
1070 int i, j;
1071
1072 /* check the page array being registered right now */
1073 for (i = 0; i < nr_pages; i++) {
1074 if (!PageCompound(pages[i]))
1075 continue;
1076 if (compound_head(pages[i]) == hpage)
1077 return true;
1078 }
1079
1080 /* check previously registered buffers */
1081 for (i = 0; i < ctx->nr_user_bufs; i++) {
1082 struct io_mapped_ubuf *imu = ctx->user_bufs[i];
1083
1084 for (j = 0; j < imu->nr_bvecs; j++) {
1085 if (!PageCompound(imu->bvec[j].bv_page))
1086 continue;
1087 if (compound_head(imu->bvec[j].bv_page) == hpage)
1088 return true;
1089 }
1090 }
1091
1092 return false;
1093 }
1094
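/*
 * Work out how many pages to charge against the memlock limit for this
 * buffer. Normal pages count individually; a compound (huge) page is
 * charged once, at its full size, and only if it hasn't already been
 * accounted by this or a previously registered buffer.
 */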
1095 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
1096 int nr_pages, struct io_mapped_ubuf *imu,
1097 struct page **last_hpage)
1098 {
1099 int i, ret;
1100
1101 imu->acct_pages = 0;
1102 for (i = 0; i < nr_pages; i++) {
1103 if (!PageCompound(pages[i])) {
1104 imu->acct_pages++;
1105 } else {
1106 struct page *hpage;
1107
1108 hpage = compound_head(pages[i]);
1109 if (hpage == *last_hpage)
1110 continue;
1111 *last_hpage = hpage;
1112 if (headpage_already_acct(ctx, pages, i, hpage))
1113 continue;
1114 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
1115 }
1116 }
1117
1118 if (!imu->acct_pages)
1119 return 0;
1120
1121 ret = io_account_mem(ctx, imu->acct_pages);
1122 if (ret)
1123 imu->acct_pages = 0;
1124 return ret;
1125 }
1126
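/*
 * Pin the user buffer with FOLL_WRITE | FOLL_LONGTERM and reject
 * file-backed mappings (shmem and hugetlbfs are allowed). Returns the
 * pinned page array, or an ERR_PTR() on failure.
 */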
1127 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
1128 {
1129 unsigned long start, end, nr_pages;
1130 struct vm_area_struct **vmas = NULL;
1131 struct page **pages = NULL;
1132 int i, pret, ret = -ENOMEM;
1133
1134 end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1135 start = ubuf >> PAGE_SHIFT;
1136 nr_pages = end - start;
1137
1138 pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
1139 if (!pages)
1140 goto done;
1141
1142 vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
1143 GFP_KERNEL);
1144 if (!vmas)
1145 goto done;
1146
1147 ret = 0;
1148 mmap_read_lock(current->mm);
1149 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
1150 pages, vmas);
1151 if (pret == nr_pages) {
1152 /* don't support file-backed memory (hugetlbfs and shmem are fine) */
1153 for (i = 0; i < nr_pages; i++) {
1154 struct vm_area_struct *vma = vmas[i];
1155
1156 if (vma_is_shmem(vma))
1157 continue;
1158 if (vma->vm_file &&
1159 !is_file_hugepages(vma->vm_file)) {
1160 ret = -EOPNOTSUPP;
1161 break;
1162 }
1163 }
1164 *npages = nr_pages;
1165 } else {
1166 ret = pret < 0 ? pret : -EFAULT;
1167 }
1168 mmap_read_unlock(current->mm);
1169 if (ret) {
1170 /*
1171  * If we only did a partial map, or found file-backed vmas,
1172  * release any pages we did get.
1173  */
1174 if (pret > 0)
1175 unpin_user_pages(pages, pret);
1176 goto done;
1177 }
1178 ret = 0;
1179 done:
1180 kvfree(vmas);
1181 if (ret < 0) {
1182 kvfree(pages);
1183 pages = ERR_PTR(ret);
1184 }
1185 return pages;
1186 }
1187
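/*
 * Register a single buffer: pin its pages, charge them against the
 * memlock limit, and build the bvec array in a freshly allocated
 * io_mapped_ubuf. A NULL iov_base installs the shared dummy_ubuf so
 * the slot stays sparse.
 */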
1188 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
1189 struct io_mapped_ubuf **pimu,
1190 struct page **last_hpage)
1191 {
1192 struct io_mapped_ubuf *imu = NULL;
1193 struct page **pages = NULL;
1194 unsigned long off;
1195 size_t size;
1196 int ret, nr_pages, i;
1197
1198 *pimu = ctx->dummy_ubuf;
1199 if (!iov->iov_base)
1200 return 0;
1201
1202 ret = -ENOMEM;
1203 pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
1204 &nr_pages);
1205 if (IS_ERR(pages)) {
1206 ret = PTR_ERR(pages);
1207 pages = NULL;
1208 goto done;
1209 }
1210
1211 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
1212 if (!imu)
1213 goto done;
1214
1215 ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
1216 if (ret) {
1217 unpin_user_pages(pages, nr_pages);
1218 goto done;
1219 }
1220
1221 off = (unsigned long) iov->iov_base & ~PAGE_MASK;
1222 size = iov->iov_len;
1223 for (i = 0; i < nr_pages; i++) {
1224 size_t vec_len;
1225
1226 vec_len = min_t(size_t, size, PAGE_SIZE - off);
1227 imu->bvec[i].bv_page = pages[i];
1228 imu->bvec[i].bv_len = vec_len;
1229 imu->bvec[i].bv_offset = off;
1230 off = 0;
1231 size -= vec_len;
1232 }
1233
1234 imu->ubuf = (unsigned long) iov->iov_base;
1235 imu->ubuf_end = imu->ubuf + iov->iov_len;
1236 imu->nr_bvecs = nr_pages;
1237 *pimu = imu;
1238 ret = 0;
1239 done:
1240 if (ret)
1241 kvfree(imu);
1242 kvfree(pages);
1243 return ret;
1244 }
1245
1246 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
1247 {
1248 ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
1249 return ctx->user_bufs ? 0 : -ENOMEM;
1250 }
1251
1252 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
1253 unsigned int nr_args, u64 __user *tags)
1254 {
1255 struct page *last_hpage = NULL;
1256 struct io_rsrc_data *data;
1257 int i, ret;
1258 struct iovec iov;
1259
1260 BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
1261
1262 if (ctx->user_bufs)
1263 return -EBUSY;
1264 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
1265 return -EINVAL;
1266 ret = io_rsrc_node_switch_start(ctx);
1267 if (ret)
1268 return ret;
1269 ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
1270 if (ret)
1271 return ret;
1272 ret = io_buffers_map_alloc(ctx, nr_args);
1273 if (ret) {
1274 io_rsrc_data_free(data);
1275 return ret;
1276 }
1277
1278 for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
1279 if (arg) {
1280 ret = io_copy_iov(ctx, &iov, arg, i);
1281 if (ret)
1282 break;
1283 ret = io_buffer_validate(&iov);
1284 if (ret)
1285 break;
1286 } else {
1287 memset(&iov, 0, sizeof(iov));
1288 }
1289
1290 if (!iov.iov_base && *io_get_tag_slot(data, i)) {
1291 ret = -EINVAL;
1292 break;
1293 }
1294
1295 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
1296 &last_hpage);
1297 if (ret)
1298 break;
1299 }
1300
1301 WARN_ON_ONCE(ctx->buf_data);
1302
1303 ctx->buf_data = data;
1304 if (ret)
1305 __io_sqe_buffers_unregister(ctx);
1306 else
1307 io_rsrc_node_switch(ctx, NULL);
1308 return ret;
1309 }
1310
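/*
 * Import a registered buffer into an iov_iter for a fixed read/write.
 * buf_addr/len must lie entirely within the registered region; the
 * iterator is set up over the buffer's bvecs and advanced to buf_addr.
 */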
1311 int io_import_fixed(int ddir, struct iov_iter *iter,
1312 struct io_mapped_ubuf *imu,
1313 u64 buf_addr, size_t len)
1314 {
1315 u64 buf_end;
1316 size_t offset;
1317
1318 if (WARN_ON_ONCE(!imu))
1319 return -EFAULT;
1320 if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
1321 return -EFAULT;
1322
1323 if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
1324 return -EFAULT;
1325
1326 /*
1327  * buf_addr may not be the start of the registered buffer; size the
1328  * iterator over offset + len, then advance past the leading offset.
1329  */
1330 offset = buf_addr - imu->ubuf;
1331 iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
1332
1333 if (offset) {
1334 /*
1335  * Don't use iov_iter_advance() here, as it's really slow for
1336  * using the latter parts of a big fixed buffer - it iterates
1337  * over each segment manually. We can cheat a bit here, because
1338  * we know that:
1339  *
1340  * 1) it's a BVEC iter, we set it up
1341  * 2) all bvecs are PAGE_SIZE in size, except potentially the
1342  *    first and last bvec
1343  *
1344  * So just find our index, and adjust the iterator afterwards.
1345  * If the offset is within the first bvec (or the whole first
1346  * bvec), just use iov_iter_advance(). This makes it easier
1347  * since we can just skip the first segment, which may not
1348  * be PAGE_SIZE aligned.
1349  */
1350 const struct bio_vec *bvec = imu->bvec;
1351
1352 if (offset <= bvec->bv_len) {
1353 iov_iter_advance(iter, offset);
1354 } else {
1355 unsigned long seg_skip;
1356
1357 /* skip the first (possibly unaligned) vec */
1358 offset -= bvec->bv_len;
1359 seg_skip = 1 + (offset >> PAGE_SHIFT);
1360
1361 iter->bvec = bvec + seg_skip;
1362 iter->nr_segs -= seg_skip;
1363 iter->count -= bvec->bv_len + offset;
1364 iter->iov_offset = offset & ~PAGE_MASK;
1365 }
1366 }
1367
1368 return 0;
1369 }