0001 // SPDX-License-Identifier: GPL-2.0
0002 #include <linux/kernel.h>
0003 #include <linux/errno.h>
0004 #include <linux/fs.h>
0005 #include <linux/file.h>
0006 #include <linux/mm.h>
0007 #include <linux/slab.h>
0008 #include <linux/nospec.h>
0009 #include <linux/hugetlb.h>
0010 #include <linux/compat.h>
0011 #include <linux/io_uring.h>
0012 
0013 #include <uapi/linux/io_uring.h>
0014 
0015 #include "io_uring.h"
0016 #include "openclose.h"
0017 #include "rsrc.h"
0018 
0019 struct io_rsrc_update {
0020     struct file         *file;
0021     u64             arg;
0022     u32             nr_args;
0023     u32             offset;
0024 };
0025 
0026 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
0027                   struct io_mapped_ubuf **pimu,
0028                   struct page **last_hpage);
0029 
0030 #define IO_RSRC_REF_BATCH   100
0031 
0032 /* only define max */
0033 #define IORING_MAX_FIXED_FILES  (1U << 20)
0034 #define IORING_MAX_REG_BUFFERS  (1U << 14)
0035 
0036 void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
0037     __must_hold(&ctx->uring_lock)
0038 {
0039     if (ctx->rsrc_cached_refs) {
0040         io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
0041         ctx->rsrc_cached_refs = 0;
0042     }
0043 }
0044 
0045 int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
0046 {
0047     unsigned long page_limit, cur_pages, new_pages;
0048 
0049     if (!nr_pages)
0050         return 0;
0051 
0052     /* Don't allow more pages than we can safely lock */
0053     page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
0054 
0055     cur_pages = atomic_long_read(&user->locked_vm);
0056     do {
0057         new_pages = cur_pages + nr_pages;
0058         if (new_pages > page_limit)
0059             return -ENOMEM;
0060     } while (!atomic_long_try_cmpxchg(&user->locked_vm,
0061                       &cur_pages, new_pages));
0062     return 0;
0063 }
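
/*
 * Note on __io_account_mem(): pinned pages are charged to the user's
 * locked_vm and checked against RLIMIT_MEMLOCK without taking a lock.
 * atomic_long_try_cmpxchg() reloads cur_pages on failure, so the limit
 * check is always redone against the freshly observed value. As a rough
 * example (assuming a 4 KiB PAGE_SIZE), an RLIMIT_MEMLOCK of 64 MiB gives
 * page_limit = 64 MiB >> 12 = 16384 pages.
 */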
0064 
0065 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
0066 {
0067     if (ctx->user)
0068         __io_unaccount_mem(ctx->user, nr_pages);
0069 
0070     if (ctx->mm_account)
0071         atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
0072 }
0073 
0074 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
0075 {
0076     int ret;
0077 
0078     if (ctx->user) {
0079         ret = __io_account_mem(ctx->user, nr_pages);
0080         if (ret)
0081             return ret;
0082     }
0083 
0084     if (ctx->mm_account)
0085         atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
0086 
0087     return 0;
0088 }
0089 
0090 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
0091                void __user *arg, unsigned index)
0092 {
0093     struct iovec __user *src;
0094 
0095 #ifdef CONFIG_COMPAT
0096     if (ctx->compat) {
0097         struct compat_iovec __user *ciovs;
0098         struct compat_iovec ciov;
0099 
0100         ciovs = (struct compat_iovec __user *) arg;
0101         if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
0102             return -EFAULT;
0103 
0104         dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
0105         dst->iov_len = ciov.iov_len;
0106         return 0;
0107     }
0108 #endif
0109     src = (struct iovec __user *) arg;
0110     if (copy_from_user(dst, &src[index], sizeof(*dst)))
0111         return -EFAULT;
0112     return 0;
0113 }
0114 
0115 static int io_buffer_validate(struct iovec *iov)
0116 {
0117     unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
0118 
0119     /*
0120      * Don't impose further limits on the size and buffer
0121      * constraints here, we'll -EINVAL later when IO is
0122      * submitted if they are wrong.
0123      */
0124     if (!iov->iov_base)
0125         return iov->iov_len ? -EFAULT : 0;
0126     if (!iov->iov_len)
0127         return -EFAULT;
0128 
0129     /* arbitrary limit, but we need something */
0130     if (iov->iov_len > SZ_1G)
0131         return -EFAULT;
0132 
0133     if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
0134         return -EOVERFLOW;
0135 
0136     return 0;
0137 }
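
/*
 * Note on io_buffer_validate(): a NULL iov_base is only accepted together
 * with a zero length (that combination marks a sparse/empty buffer slot),
 * while a zero length with a non-NULL base is rejected. Individual buffers
 * are capped at 1 GiB, and base + page-rounded length must not wrap the
 * address space.
 */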
0138 
0139 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
0140 {
0141     struct io_mapped_ubuf *imu = *slot;
0142     unsigned int i;
0143 
0144     if (imu != ctx->dummy_ubuf) {
0145         for (i = 0; i < imu->nr_bvecs; i++)
0146             unpin_user_page(imu->bvec[i].bv_page);
0147         if (imu->acct_pages)
0148             io_unaccount_mem(ctx, imu->acct_pages);
0149         kvfree(imu);
0150     }
0151     *slot = NULL;
0152 }
0153 
0154 void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
0155     __must_hold(&ctx->uring_lock)
0156 {
0157     ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
0158     percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
0159 }
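
/*
 * Note on the rsrc reference cache: io_rsrc_refs_refill() grabs
 * IO_RSRC_REF_BATCH (100) percpu references in one go and records them in
 * ctx->rsrc_cached_refs, so the submission path can consume cached
 * references without touching the percpu counter for every request.
 * io_rsrc_refs_drop() returns whatever is still cached, e.g. when the
 * current node is switched out.
 */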
0160 
0161 static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
0162 {
0163     struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
0164     struct io_ring_ctx *ctx = rsrc_data->ctx;
0165     struct io_rsrc_put *prsrc, *tmp;
0166 
0167     list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
0168         list_del(&prsrc->list);
0169 
0170         if (prsrc->tag) {
0171             if (ctx->flags & IORING_SETUP_IOPOLL) {
0172                 mutex_lock(&ctx->uring_lock);
0173                 io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true);
0174                 mutex_unlock(&ctx->uring_lock);
0175             } else {
0176                 io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true);
0177             }
0178         }
0179 
0180         rsrc_data->do_put(ctx, prsrc);
0181         kfree(prsrc);
0182     }
0183 
0184     io_rsrc_node_destroy(ref_node);
0185     if (atomic_dec_and_test(&rsrc_data->refs))
0186         complete(&rsrc_data->done);
0187 }
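
/*
 * Note on __io_rsrc_put_work(): once a retired node's percpu ref hits
 * zero, every io_rsrc_put queued on it is processed here. If the resource
 * was registered with a non-zero tag, a CQE carrying that tag as user_data
 * (with res 0) is posted so userspace knows the old file/buffer has
 * finally been released. IOPOLL rings post completions under ->uring_lock,
 * hence the locked variant of the call.
 */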
0188 
0189 void io_rsrc_put_work(struct work_struct *work)
0190 {
0191     struct io_ring_ctx *ctx;
0192     struct llist_node *node;
0193 
0194     ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
0195     node = llist_del_all(&ctx->rsrc_put_llist);
0196 
0197     while (node) {
0198         struct io_rsrc_node *ref_node;
0199         struct llist_node *next = node->next;
0200 
0201         ref_node = llist_entry(node, struct io_rsrc_node, llist);
0202         __io_rsrc_put_work(ref_node);
0203         node = next;
0204     }
0205 }
0206 
0207 void io_wait_rsrc_data(struct io_rsrc_data *data)
0208 {
0209     if (data && !atomic_dec_and_test(&data->refs))
0210         wait_for_completion(&data->done);
0211 }
0212 
0213 void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
0214 {
0215     percpu_ref_exit(&ref_node->refs);
0216     kfree(ref_node);
0217 }
0218 
0219 static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
0220 {
0221     struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
0222     struct io_ring_ctx *ctx = node->rsrc_data->ctx;
0223     unsigned long flags;
0224     bool first_add = false;
0225     unsigned long delay = HZ;
0226 
0227     spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
0228     node->done = true;
0229 
0230     /* if we are mid-quiesce then do not delay */
0231     if (node->rsrc_data->quiesce)
0232         delay = 0;
0233 
0234     while (!list_empty(&ctx->rsrc_ref_list)) {
0235         node = list_first_entry(&ctx->rsrc_ref_list,
0236                         struct io_rsrc_node, node);
0237         /* recycle ref nodes in order */
0238         if (!node->done)
0239             break;
0240         list_del(&node->node);
0241         first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
0242     }
0243     spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
0244 
0245     if (first_add)
0246         mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
0247 }
0248 
0249 static struct io_rsrc_node *io_rsrc_node_alloc(void)
0250 {
0251     struct io_rsrc_node *ref_node;
0252 
0253     ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
0254     if (!ref_node)
0255         return NULL;
0256 
0257     if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
0258                 0, GFP_KERNEL)) {
0259         kfree(ref_node);
0260         return NULL;
0261     }
0262     INIT_LIST_HEAD(&ref_node->node);
0263     INIT_LIST_HEAD(&ref_node->rsrc_list);
0264     ref_node->done = false;
0265     return ref_node;
0266 }
0267 
0268 void io_rsrc_node_switch(struct io_ring_ctx *ctx,
0269              struct io_rsrc_data *data_to_kill)
0270     __must_hold(&ctx->uring_lock)
0271 {
0272     WARN_ON_ONCE(!ctx->rsrc_backup_node);
0273     WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
0274 
0275     io_rsrc_refs_drop(ctx);
0276 
0277     if (data_to_kill) {
0278         struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
0279 
0280         rsrc_node->rsrc_data = data_to_kill;
0281         spin_lock_irq(&ctx->rsrc_ref_lock);
0282         list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
0283         spin_unlock_irq(&ctx->rsrc_ref_lock);
0284 
0285         atomic_inc(&data_to_kill->refs);
0286         percpu_ref_kill(&rsrc_node->refs);
0287         ctx->rsrc_node = NULL;
0288     }
0289 
0290     if (!ctx->rsrc_node) {
0291         ctx->rsrc_node = ctx->rsrc_backup_node;
0292         ctx->rsrc_backup_node = NULL;
0293     }
0294 }
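
/*
 * Note on node switching: io_rsrc_node_switch_start() pre-allocates a
 * backup node so the switch itself cannot fail with -ENOMEM. When a
 * resource table is being updated or torn down, the current node is
 * attached to data_to_kill, queued on ->rsrc_ref_list and its percpu ref
 * killed; requests in flight may still hold references to it, and the
 * queued removals only run once those drop. The pre-allocated backup then
 * becomes the new current node.
 */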
0295 
0296 int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
0297 {
0298     if (ctx->rsrc_backup_node)
0299         return 0;
0300     ctx->rsrc_backup_node = io_rsrc_node_alloc();
0301     return ctx->rsrc_backup_node ? 0 : -ENOMEM;
0302 }
0303 
0304 __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
0305                       struct io_ring_ctx *ctx)
0306 {
0307     int ret;
0308 
0309     /* As we may drop ->uring_lock, other task may have started quiesce */
0310     if (data->quiesce)
0311         return -ENXIO;
0312 
0313     data->quiesce = true;
0314     do {
0315         ret = io_rsrc_node_switch_start(ctx);
0316         if (ret)
0317             break;
0318         io_rsrc_node_switch(ctx, data);
0319 
0320         /* kill initial ref, already quiesced if zero */
0321         if (atomic_dec_and_test(&data->refs))
0322             break;
0323         mutex_unlock(&ctx->uring_lock);
0324         flush_delayed_work(&ctx->rsrc_put_work);
0325         ret = wait_for_completion_interruptible(&data->done);
0326         if (!ret) {
0327             mutex_lock(&ctx->uring_lock);
0328             if (atomic_read(&data->refs) > 0) {
0329                 /*
0330                  * it has been revived by another thread while
0331                  * we were unlocked
0332                  */
0333                 mutex_unlock(&ctx->uring_lock);
0334             } else {
0335                 break;
0336             }
0337         }
0338 
0339         atomic_inc(&data->refs);
0340         /* wait for all works potentially completing data->done */
0341         flush_delayed_work(&ctx->rsrc_put_work);
0342         reinit_completion(&data->done);
0343 
0344         ret = io_run_task_work_sig();
0345         mutex_lock(&ctx->uring_lock);
0346     } while (ret >= 0);
0347     data->quiesce = false;
0348 
0349     return ret;
0350 }
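
/*
 * Note on quiescing: the loop above drops the table's initial reference
 * and waits for data->done, which completes once every retired node has
 * released its reference. ->uring_lock is dropped while waiting, so after
 * waking up the code re-checks whether another thread revived the data
 * and, if so, re-takes a reference and retries. The interruptible wait
 * plus io_run_task_work_sig() lets signals and task_work break out of a
 * long quiesce.
 */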
0351 
0352 static void io_free_page_table(void **table, size_t size)
0353 {
0354     unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
0355 
0356     for (i = 0; i < nr_tables; i++)
0357         kfree(table[i]);
0358     kfree(table);
0359 }
0360 
0361 static void io_rsrc_data_free(struct io_rsrc_data *data)
0362 {
0363     size_t size = data->nr * sizeof(data->tags[0][0]);
0364 
0365     if (data->tags)
0366         io_free_page_table((void **)data->tags, size);
0367     kfree(data);
0368 }
0369 
0370 static __cold void **io_alloc_page_table(size_t size)
0371 {
0372     unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
0373     size_t init_size = size;
0374     void **table;
0375 
0376     table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
0377     if (!table)
0378         return NULL;
0379 
0380     for (i = 0; i < nr_tables; i++) {
0381         unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
0382 
0383         table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
0384         if (!table[i]) {
0385             io_free_page_table(table, init_size);
0386             return NULL;
0387         }
0388         size -= this_size;
0389     }
0390     return table;
0391 }
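
/*
 * Note on the tag tables: io_alloc_page_table() builds a two-level table
 * out of page-sized chunks so that large tag arrays never need a
 * high-order allocation; io_get_tag_slot() (in rsrc.h) maps a flat index
 * onto that table. As a sketch, assuming 4 KiB pages: the maximum of
 * 1 << 20 fixed files needs 8 MiB of u64 tags, i.e. 2048 separate
 * page-sized chunks.
 */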
0392 
0393 __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
0394                      rsrc_put_fn *do_put, u64 __user *utags,
0395                      unsigned nr, struct io_rsrc_data **pdata)
0396 {
0397     struct io_rsrc_data *data;
0398     int ret = -ENOMEM;
0399     unsigned i;
0400 
0401     data = kzalloc(sizeof(*data), GFP_KERNEL);
0402     if (!data)
0403         return -ENOMEM;
0404     data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
0405     if (!data->tags) {
0406         kfree(data);
0407         return -ENOMEM;
0408     }
0409 
0410     data->nr = nr;
0411     data->ctx = ctx;
0412     data->do_put = do_put;
0413     if (utags) {
0414         ret = -EFAULT;
0415         for (i = 0; i < nr; i++) {
0416             u64 *tag_slot = io_get_tag_slot(data, i);
0417 
0418             if (copy_from_user(tag_slot, &utags[i],
0419                        sizeof(*tag_slot)))
0420                 goto fail;
0421         }
0422     }
0423 
0424     atomic_set(&data->refs, 1);
0425     init_completion(&data->done);
0426     *pdata = data;
0427     return 0;
0428 fail:
0429     io_rsrc_data_free(data);
0430     return ret;
0431 }
0432 
0433 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
0434                  struct io_uring_rsrc_update2 *up,
0435                  unsigned nr_args)
0436 {
0437     u64 __user *tags = u64_to_user_ptr(up->tags);
0438     __s32 __user *fds = u64_to_user_ptr(up->data);
0439     struct io_rsrc_data *data = ctx->file_data;
0440     struct io_fixed_file *file_slot;
0441     struct file *file;
0442     int fd, i, err = 0;
0443     unsigned int done;
0444     bool needs_switch = false;
0445 
0446     if (!ctx->file_data)
0447         return -ENXIO;
0448     if (up->offset + nr_args > ctx->nr_user_files)
0449         return -EINVAL;
0450 
0451     for (done = 0; done < nr_args; done++) {
0452         u64 tag = 0;
0453 
0454         if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
0455             copy_from_user(&fd, &fds[done], sizeof(fd))) {
0456             err = -EFAULT;
0457             break;
0458         }
0459         if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
0460             err = -EINVAL;
0461             break;
0462         }
0463         if (fd == IORING_REGISTER_FILES_SKIP)
0464             continue;
0465 
0466         i = array_index_nospec(up->offset + done, ctx->nr_user_files);
0467         file_slot = io_fixed_file_slot(&ctx->file_table, i);
0468 
0469         if (file_slot->file_ptr) {
0470             file = (struct file *)(file_slot->file_ptr & FFS_MASK);
0471             err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
0472             if (err)
0473                 break;
0474             file_slot->file_ptr = 0;
0475             io_file_bitmap_clear(&ctx->file_table, i);
0476             needs_switch = true;
0477         }
0478         if (fd != -1) {
0479             file = fget(fd);
0480             if (!file) {
0481                 err = -EBADF;
0482                 break;
0483             }
0484             /*
0485              * Don't allow io_uring instances to be registered. If
0486              * UNIX isn't enabled, then this causes a reference
0487              * cycle and this instance can never get freed. If UNIX
0488              * is enabled we'll handle it just fine, but there's
0489              * still no point in allowing a ring fd as it doesn't
0490              * support regular read/write anyway.
0491              */
0492             if (io_is_uring_fops(file)) {
0493                 fput(file);
0494                 err = -EBADF;
0495                 break;
0496             }
0497             err = io_scm_file_account(ctx, file);
0498             if (err) {
0499                 fput(file);
0500                 break;
0501             }
0502             *io_get_tag_slot(data, i) = tag;
0503             io_fixed_file_set(file_slot, file);
0504             io_file_bitmap_set(&ctx->file_table, i);
0505         }
0506     }
0507 
0508     if (needs_switch)
0509         io_rsrc_node_switch(ctx, data);
0510     return done ? done : err;
0511 }
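
/*
 * Note on __io_sqe_files_update() semantics: for each slot in the range,
 * fd == -1 clears the slot, IORING_REGISTER_FILES_SKIP leaves it alone,
 * and any other fd installs a new file (ring fds are refused). Displaced
 * files are not dropped here; they are queued on the current rsrc node and
 * freed once all requests referencing that node complete, which is why a
 * node switch is done when anything was removed. The return value is the
 * number of entries processed, or the error if none were.
 */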
0512 
0513 static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
0514                    struct io_uring_rsrc_update2 *up,
0515                    unsigned int nr_args)
0516 {
0517     u64 __user *tags = u64_to_user_ptr(up->tags);
0518     struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
0519     struct page *last_hpage = NULL;
0520     bool needs_switch = false;
0521     __u32 done;
0522     int i, err;
0523 
0524     if (!ctx->buf_data)
0525         return -ENXIO;
0526     if (up->offset + nr_args > ctx->nr_user_bufs)
0527         return -EINVAL;
0528 
0529     for (done = 0; done < nr_args; done++) {
0530         struct io_mapped_ubuf *imu;
0531         int offset = up->offset + done;
0532         u64 tag = 0;
0533 
0534         err = io_copy_iov(ctx, &iov, iovs, done);
0535         if (err)
0536             break;
0537         if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
0538             err = -EFAULT;
0539             break;
0540         }
0541         err = io_buffer_validate(&iov);
0542         if (err)
0543             break;
0544         if (!iov.iov_base && tag) {
0545             err = -EINVAL;
0546             break;
0547         }
0548         err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
0549         if (err)
0550             break;
0551 
0552         i = array_index_nospec(offset, ctx->nr_user_bufs);
0553         if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
0554             err = io_queue_rsrc_removal(ctx->buf_data, i,
0555                             ctx->rsrc_node, ctx->user_bufs[i]);
0556             if (unlikely(err)) {
0557                 io_buffer_unmap(ctx, &imu);
0558                 break;
0559             }
0560             ctx->user_bufs[i] = ctx->dummy_ubuf;
0561             needs_switch = true;
0562         }
0563 
0564         ctx->user_bufs[i] = imu;
0565         *io_get_tag_slot(ctx->buf_data, offset) = tag;
0566     }
0567 
0568     if (needs_switch)
0569         io_rsrc_node_switch(ctx, ctx->buf_data);
0570     return done ? done : err;
0571 }
0572 
0573 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
0574                      struct io_uring_rsrc_update2 *up,
0575                      unsigned nr_args)
0576 {
0577     __u32 tmp;
0578     int err;
0579 
0580     if (check_add_overflow(up->offset, nr_args, &tmp))
0581         return -EOVERFLOW;
0582     err = io_rsrc_node_switch_start(ctx);
0583     if (err)
0584         return err;
0585 
0586     switch (type) {
0587     case IORING_RSRC_FILE:
0588         return __io_sqe_files_update(ctx, up, nr_args);
0589     case IORING_RSRC_BUFFER:
0590         return __io_sqe_buffers_update(ctx, up, nr_args);
0591     }
0592     return -EINVAL;
0593 }
0594 
0595 int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
0596                  unsigned nr_args)
0597 {
0598     struct io_uring_rsrc_update2 up;
0599 
0600     if (!nr_args)
0601         return -EINVAL;
0602     memset(&up, 0, sizeof(up));
0603     if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
0604         return -EFAULT;
0605     if (up.resv || up.resv2)
0606         return -EINVAL;
0607     return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
0608 }
0609 
0610 int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
0611                 unsigned size, unsigned type)
0612 {
0613     struct io_uring_rsrc_update2 up;
0614 
0615     if (size != sizeof(up))
0616         return -EINVAL;
0617     if (copy_from_user(&up, arg, sizeof(up)))
0618         return -EFAULT;
0619     if (!up.nr || up.resv || up.resv2)
0620         return -EINVAL;
0621     return __io_register_rsrc_update(ctx, type, &up, up.nr);
0622 }
0623 
0624 __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
0625                 unsigned int size, unsigned int type)
0626 {
0627     struct io_uring_rsrc_register rr;
0628 
0629     /* keep it extendible */
0630     if (size != sizeof(rr))
0631         return -EINVAL;
0632 
0633     memset(&rr, 0, sizeof(rr));
0634     if (copy_from_user(&rr, arg, size))
0635         return -EFAULT;
0636     if (!rr.nr || rr.resv2)
0637         return -EINVAL;
0638     if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
0639         return -EINVAL;
0640 
0641     switch (type) {
0642     case IORING_RSRC_FILE:
0643         if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
0644             break;
0645         return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
0646                          rr.nr, u64_to_user_ptr(rr.tags));
0647     case IORING_RSRC_BUFFER:
0648         if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
0649             break;
0650         return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
0651                            rr.nr, u64_to_user_ptr(rr.tags));
0652     }
0653     return -EINVAL;
0654 }
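
/*
 * Note on io_register_rsrc(): this backs IORING_REGISTER_FILES2 and
 * IORING_REGISTER_BUFFERS2, where userspace passes a struct
 * io_uring_rsrc_register plus its size (hence the "keep it extendible"
 * size check above). With IORING_RSRC_REGISTER_SPARSE set, rr.data must be
 * zero and an empty table of rr.nr slots is registered, to be filled in
 * later via the update paths. A minimal userspace sketch using the raw
 * syscall (error handling omitted; liburing has wrappers such as
 * io_uring_register_files_sparse() for this):
 *
 *	struct io_uring_rsrc_register rr = {
 *		.nr	= 64,
 *		.flags	= IORING_RSRC_REGISTER_SPARSE,
 *	};
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_FILES2,
 *		&rr, sizeof(rr));
 */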
0655 
0656 int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
0657 {
0658     struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
0659 
0660     if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
0661         return -EINVAL;
0662     if (sqe->rw_flags || sqe->splice_fd_in)
0663         return -EINVAL;
0664 
0665     up->offset = READ_ONCE(sqe->off);
0666     up->nr_args = READ_ONCE(sqe->len);
0667     if (!up->nr_args)
0668         return -EINVAL;
0669     up->arg = READ_ONCE(sqe->addr);
0670     return 0;
0671 }
0672 
0673 static int io_files_update_with_index_alloc(struct io_kiocb *req,
0674                         unsigned int issue_flags)
0675 {
0676     struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
0677     __s32 __user *fds = u64_to_user_ptr(up->arg);
0678     unsigned int done;
0679     struct file *file;
0680     int ret, fd;
0681 
0682     if (!req->ctx->file_data)
0683         return -ENXIO;
0684 
0685     for (done = 0; done < up->nr_args; done++) {
0686         if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
0687             ret = -EFAULT;
0688             break;
0689         }
0690 
0691         file = fget(fd);
0692         if (!file) {
0693             ret = -EBADF;
0694             break;
0695         }
0696         ret = io_fixed_fd_install(req, issue_flags, file,
0697                       IORING_FILE_INDEX_ALLOC);
0698         if (ret < 0)
0699             break;
0700         if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
0701             __io_close_fixed(req->ctx, issue_flags, ret);
0702             ret = -EFAULT;
0703             break;
0704         }
0705     }
0706 
0707     if (done)
0708         return done;
0709     return ret;
0710 }
0711 
0712 int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
0713 {
0714     struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
0715     struct io_ring_ctx *ctx = req->ctx;
0716     struct io_uring_rsrc_update2 up2;
0717     int ret;
0718 
0719     up2.offset = up->offset;
0720     up2.data = up->arg;
0721     up2.nr = 0;
0722     up2.tags = 0;
0723     up2.resv = 0;
0724     up2.resv2 = 0;
0725 
0726     if (up->offset == IORING_FILE_INDEX_ALLOC) {
0727         ret = io_files_update_with_index_alloc(req, issue_flags);
0728     } else {
0729         io_ring_submit_lock(ctx, issue_flags);
0730         ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
0731                         &up2, up->nr_args);
0732         io_ring_submit_unlock(ctx, issue_flags);
0733     }
0734 
0735     if (ret < 0)
0736         req_set_fail(req);
0737     io_req_set_res(req, ret, 0);
0738     return IOU_OK;
0739 }
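
/*
 * Note on IORING_OP_FILES_UPDATE: sqe->off selects the first fixed-file
 * slot to update and sqe->addr points at an array of fds. The special
 * offset IORING_FILE_INDEX_ALLOC asks the kernel to pick free slots
 * itself; the chosen slot index is written back into the user fd array for
 * each installed file. The CQE result is the number of files processed, or
 * a negative error if nothing was updated.
 */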
0740 
0741 int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
0742               struct io_rsrc_node *node, void *rsrc)
0743 {
0744     u64 *tag_slot = io_get_tag_slot(data, idx);
0745     struct io_rsrc_put *prsrc;
0746 
0747     prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
0748     if (!prsrc)
0749         return -ENOMEM;
0750 
0751     prsrc->tag = *tag_slot;
0752     *tag_slot = 0;
0753     prsrc->rsrc = rsrc;
0754     list_add(&prsrc->list, &node->rsrc_list);
0755     return 0;
0756 }
0757 
0758 void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
0759 {
0760 #if !defined(IO_URING_SCM_ALL)
0761     int i;
0762 
0763     for (i = 0; i < ctx->nr_user_files; i++) {
0764         struct file *file = io_file_from_index(&ctx->file_table, i);
0765 
0766         if (!file)
0767             continue;
0768         if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
0769             continue;
0770         io_file_bitmap_clear(&ctx->file_table, i);
0771         fput(file);
0772     }
0773 #endif
0774 
0775 #if defined(CONFIG_UNIX)
0776     if (ctx->ring_sock) {
0777         struct sock *sock = ctx->ring_sock->sk;
0778         struct sk_buff *skb;
0779 
0780         while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
0781             kfree_skb(skb);
0782     }
0783 #endif
0784     io_free_file_tables(&ctx->file_table);
0785     io_rsrc_data_free(ctx->file_data);
0786     ctx->file_data = NULL;
0787     ctx->nr_user_files = 0;
0788 }
0789 
0790 int io_sqe_files_unregister(struct io_ring_ctx *ctx)
0791 {
0792     unsigned nr = ctx->nr_user_files;
0793     int ret;
0794 
0795     if (!ctx->file_data)
0796         return -ENXIO;
0797 
0798     /*
0799      * Quiesce may unlock ->uring_lock, and while it's not held
0800      * prevent new requests from using the table.
0801      */
0802     ctx->nr_user_files = 0;
0803     ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
0804     ctx->nr_user_files = nr;
0805     if (!ret)
0806         __io_sqe_files_unregister(ctx);
0807     return ret;
0808 }
0809 
0810 /*
0811  * Ensure the UNIX gc is aware of our file set, so we are certain that
0812  * the io_uring can be safely unregistered on process exit, even if we have
0813  * loops in the file referencing. We account only files that can hold other
0814  * files because otherwise they can't form a loop and so are not interesting
0815  * for GC.
0816  */
0817 int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
0818 {
0819 #if defined(CONFIG_UNIX)
0820     struct sock *sk = ctx->ring_sock->sk;
0821     struct sk_buff_head *head = &sk->sk_receive_queue;
0822     struct scm_fp_list *fpl;
0823     struct sk_buff *skb;
0824 
0825     if (likely(!io_file_need_scm(file)))
0826         return 0;
0827 
0828     /*
0829      * See if we can merge this file into an existing skb SCM_RIGHTS
0830      * file set. If there's no room, fall back to allocating a new skb
0831      * and filling it in.
0832      */
0833     spin_lock_irq(&head->lock);
0834     skb = skb_peek(head);
0835     if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
0836         __skb_unlink(skb, head);
0837     else
0838         skb = NULL;
0839     spin_unlock_irq(&head->lock);
0840 
0841     if (!skb) {
0842         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
0843         if (!fpl)
0844             return -ENOMEM;
0845 
0846         skb = alloc_skb(0, GFP_KERNEL);
0847         if (!skb) {
0848             kfree(fpl);
0849             return -ENOMEM;
0850         }
0851 
0852         fpl->user = get_uid(current_user());
0853         fpl->max = SCM_MAX_FD;
0854         fpl->count = 0;
0855 
0856         UNIXCB(skb).fp = fpl;
0857         skb->sk = sk;
0858         skb->destructor = unix_destruct_scm;
0859         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
0860     }
0861 
0862     fpl = UNIXCB(skb).fp;
0863     fpl->fp[fpl->count++] = get_file(file);
0864     unix_inflight(fpl->user, file);
0865     skb_queue_head(head, skb);
0866     fput(file);
0867 #endif
0868     return 0;
0869 }
0870 
0871 static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
0872 {
0873     struct file *file = prsrc->file;
0874 #if defined(CONFIG_UNIX)
0875     struct sock *sock = ctx->ring_sock->sk;
0876     struct sk_buff_head list, *head = &sock->sk_receive_queue;
0877     struct sk_buff *skb;
0878     int i;
0879 
0880     if (!io_file_need_scm(file)) {
0881         fput(file);
0882         return;
0883     }
0884 
0885     __skb_queue_head_init(&list);
0886 
0887     /*
0888      * Find the skb that holds this file in its SCM_RIGHTS. When found,
0889      * remove this entry and rearrange the file array.
0890      */
0891     skb = skb_dequeue(head);
0892     while (skb) {
0893         struct scm_fp_list *fp;
0894 
0895         fp = UNIXCB(skb).fp;
0896         for (i = 0; i < fp->count; i++) {
0897             int left;
0898 
0899             if (fp->fp[i] != file)
0900                 continue;
0901 
0902             unix_notinflight(fp->user, fp->fp[i]);
0903             left = fp->count - 1 - i;
0904             if (left) {
0905                 memmove(&fp->fp[i], &fp->fp[i + 1],
0906                         left * sizeof(struct file *));
0907             }
0908             fp->count--;
0909             if (!fp->count) {
0910                 kfree_skb(skb);
0911                 skb = NULL;
0912             } else {
0913                 __skb_queue_tail(&list, skb);
0914             }
0915             fput(file);
0916             file = NULL;
0917             break;
0918         }
0919 
0920         if (!file)
0921             break;
0922 
0923         __skb_queue_tail(&list, skb);
0924 
0925         skb = skb_dequeue(head);
0926     }
0927 
0928     if (skb_peek(&list)) {
0929         spin_lock_irq(&head->lock);
0930         while ((skb = __skb_dequeue(&list)) != NULL)
0931             __skb_queue_tail(head, skb);
0932         spin_unlock_irq(&head->lock);
0933     }
0934 #else
0935     fput(file);
0936 #endif
0937 }
0938 
0939 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
0940               unsigned nr_args, u64 __user *tags)
0941 {
0942     __s32 __user *fds = (__s32 __user *) arg;
0943     struct file *file;
0944     int fd, ret;
0945     unsigned i;
0946 
0947     if (ctx->file_data)
0948         return -EBUSY;
0949     if (!nr_args)
0950         return -EINVAL;
0951     if (nr_args > IORING_MAX_FIXED_FILES)
0952         return -EMFILE;
0953     if (nr_args > rlimit(RLIMIT_NOFILE))
0954         return -EMFILE;
0955     ret = io_rsrc_node_switch_start(ctx);
0956     if (ret)
0957         return ret;
0958     ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
0959                  &ctx->file_data);
0960     if (ret)
0961         return ret;
0962 
0963     if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
0964         io_rsrc_data_free(ctx->file_data);
0965         ctx->file_data = NULL;
0966         return -ENOMEM;
0967     }
0968 
0969     for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
0970         struct io_fixed_file *file_slot;
0971 
0972         if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
0973             ret = -EFAULT;
0974             goto fail;
0975         }
0976         /* allow sparse sets */
0977         if (!fds || fd == -1) {
0978             ret = -EINVAL;
0979             if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
0980                 goto fail;
0981             continue;
0982         }
0983 
0984         file = fget(fd);
0985         ret = -EBADF;
0986         if (unlikely(!file))
0987             goto fail;
0988 
0989         /*
0990          * Don't allow io_uring instances to be registered. If UNIX
0991          * isn't enabled, then this causes a reference cycle and this
0992          * instance can never get freed. If UNIX is enabled we'll
0993          * handle it just fine, but there's still no point in allowing
0994          * a ring fd as it doesn't support regular read/write anyway.
0995          */
0996         if (io_is_uring_fops(file)) {
0997             fput(file);
0998             goto fail;
0999         }
1000         ret = io_scm_file_account(ctx, file);
1001         if (ret) {
1002             fput(file);
1003             goto fail;
1004         }
1005         file_slot = io_fixed_file_slot(&ctx->file_table, i);
1006         io_fixed_file_set(file_slot, file);
1007         io_file_bitmap_set(&ctx->file_table, i);
1008     }
1009 
1010     /* default it to the whole table */
1011     io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
1012     io_rsrc_node_switch(ctx, NULL);
1013     return 0;
1014 fail:
1015     __io_sqe_files_unregister(ctx);
1016     return ret;
1017 }
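
/*
 * Note on io_sqe_files_register(): the initial registration accepts up to
 * IORING_MAX_FIXED_FILES (1M) slots, bounded further by RLIMIT_NOFILE. An
 * fd of -1 (or a NULL fd array, as used by the sparse path) leaves the
 * slot empty, but only if no tag was supplied for it. io_uring fds are
 * rejected to avoid reference cycles, and any partial registration is
 * unwound through the fail: path.
 */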
1018 
1019 static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
1020 {
1021     io_buffer_unmap(ctx, &prsrc->buf);
1022     prsrc->buf = NULL;
1023 }
1024 
1025 void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
1026 {
1027     unsigned int i;
1028 
1029     for (i = 0; i < ctx->nr_user_bufs; i++)
1030         io_buffer_unmap(ctx, &ctx->user_bufs[i]);
1031     kfree(ctx->user_bufs);
1032     io_rsrc_data_free(ctx->buf_data);
1033     ctx->user_bufs = NULL;
1034     ctx->buf_data = NULL;
1035     ctx->nr_user_bufs = 0;
1036 }
1037 
1038 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
1039 {
1040     unsigned nr = ctx->nr_user_bufs;
1041     int ret;
1042 
1043     if (!ctx->buf_data)
1044         return -ENXIO;
1045 
1046     /*
1047      * Quiesce may unlock ->uring_lock, and while it's not held
1048      * prevent new requests from using the table.
1049      */
1050     ctx->nr_user_bufs = 0;
1051     ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
1052     ctx->nr_user_bufs = nr;
1053     if (!ret)
1054         __io_sqe_buffers_unregister(ctx);
1055     return ret;
1056 }
1057 
1058 /*
1059  * Not super efficient, but this only runs at registration time. And we do cache
1060  * the last compound head, so generally we'll only do a full search if we don't
1061  * match that one.
1062  *
1063  * We check if the given compound head page has already been accounted, to
1064  * avoid double accounting it. This allows us to account the full size of the
1065  * page, not just the constituent pages of a huge page.
1066  */
1067 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
1068                   int nr_pages, struct page *hpage)
1069 {
1070     int i, j;
1071 
1072     /* check current page array */
1073     for (i = 0; i < nr_pages; i++) {
1074         if (!PageCompound(pages[i]))
1075             continue;
1076         if (compound_head(pages[i]) == hpage)
1077             return true;
1078     }
1079 
1080     /* check previously registered pages */
1081     for (i = 0; i < ctx->nr_user_bufs; i++) {
1082         struct io_mapped_ubuf *imu = ctx->user_bufs[i];
1083 
1084         for (j = 0; j < imu->nr_bvecs; j++) {
1085             if (!PageCompound(imu->bvec[j].bv_page))
1086                 continue;
1087             if (compound_head(imu->bvec[j].bv_page) == hpage)
1088                 return true;
1089         }
1090     }
1091 
1092     return false;
1093 }
1094 
1095 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
1096                  int nr_pages, struct io_mapped_ubuf *imu,
1097                  struct page **last_hpage)
1098 {
1099     int i, ret;
1100 
1101     imu->acct_pages = 0;
1102     for (i = 0; i < nr_pages; i++) {
1103         if (!PageCompound(pages[i])) {
1104             imu->acct_pages++;
1105         } else {
1106             struct page *hpage;
1107 
1108             hpage = compound_head(pages[i]);
1109             if (hpage == *last_hpage)
1110                 continue;
1111             *last_hpage = hpage;
1112             if (headpage_already_acct(ctx, pages, i, hpage))
1113                 continue;
1114             imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
1115         }
1116     }
1117 
1118     if (!imu->acct_pages)
1119         return 0;
1120 
1121     ret = io_account_mem(ctx, imu->acct_pages);
1122     if (ret)
1123         imu->acct_pages = 0;
1124     return ret;
1125 }
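
/*
 * Note on accounting of huge pages: ordinary pages are charged one page
 * each, but a compound (huge) page is charged once at its full size the
 * first time its head page is seen, using *last_hpage and
 * headpage_already_acct() to avoid charging it again. Assuming 4 KiB base
 * pages, a single 2 MiB huge page therefore accounts as 512 pages even if
 * the buffer only touches part of it.
 */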
1126 
1127 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
1128 {
1129     unsigned long start, end, nr_pages;
1130     struct vm_area_struct **vmas = NULL;
1131     struct page **pages = NULL;
1132     int i, pret, ret = -ENOMEM;
1133 
1134     end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1135     start = ubuf >> PAGE_SHIFT;
1136     nr_pages = end - start;
1137 
1138     pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
1139     if (!pages)
1140         goto done;
1141 
1142     vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
1143                   GFP_KERNEL);
1144     if (!vmas)
1145         goto done;
1146 
1147     ret = 0;
1148     mmap_read_lock(current->mm);
1149     pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
1150                   pages, vmas);
1151     if (pret == nr_pages) {
1152         /* don't support file backed memory */
1153         for (i = 0; i < nr_pages; i++) {
1154             struct vm_area_struct *vma = vmas[i];
1155 
1156             if (vma_is_shmem(vma))
1157                 continue;
1158             if (vma->vm_file &&
1159                 !is_file_hugepages(vma->vm_file)) {
1160                 ret = -EOPNOTSUPP;
1161                 break;
1162             }
1163         }
1164         *npages = nr_pages;
1165     } else {
1166         ret = pret < 0 ? pret : -EFAULT;
1167     }
1168     mmap_read_unlock(current->mm);
1169     if (ret) {
1170         /*
1171          * if we did partial map, or found file backed vmas,
1172          * release any pages we did get
1173          */
1174         if (pret > 0)
1175             unpin_user_pages(pages, pret);
1176         goto done;
1177     }
1178     ret = 0;
1179 done:
1180     kvfree(vmas);
1181     if (ret < 0) {
1182         kvfree(pages);
1183         pages = ERR_PTR(ret);
1184     }
1185     return pages;
1186 }
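
/*
 * Note on io_pin_pages(): the user range is rounded out to page boundaries
 * and pinned with FOLL_WRITE | FOLL_LONGTERM. File-backed mappings are
 * refused with -EOPNOTSUPP unless they are shmem or hugetlbfs; anonymous
 * memory is always fine. On a short pin or a rejected VMA, every page that
 * was pinned is released again and an error pointer is returned.
 */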
1187 
1188 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
1189                   struct io_mapped_ubuf **pimu,
1190                   struct page **last_hpage)
1191 {
1192     struct io_mapped_ubuf *imu = NULL;
1193     struct page **pages = NULL;
1194     unsigned long off;
1195     size_t size;
1196     int ret, nr_pages, i;
1197 
1198     *pimu = ctx->dummy_ubuf;
1199     if (!iov->iov_base)
1200         return 0;
1201 
1202     ret = -ENOMEM;
1203     pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
1204                 &nr_pages);
1205     if (IS_ERR(pages)) {
1206         ret = PTR_ERR(pages);
1207         pages = NULL;
1208         goto done;
1209     }
1210 
1211     imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
1212     if (!imu)
1213         goto done;
1214 
1215     ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
1216     if (ret) {
1217         unpin_user_pages(pages, nr_pages);
1218         goto done;
1219     }
1220 
1221     off = (unsigned long) iov->iov_base & ~PAGE_MASK;
1222     size = iov->iov_len;
1223     for (i = 0; i < nr_pages; i++) {
1224         size_t vec_len;
1225 
1226         vec_len = min_t(size_t, size, PAGE_SIZE - off);
1227         imu->bvec[i].bv_page = pages[i];
1228         imu->bvec[i].bv_len = vec_len;
1229         imu->bvec[i].bv_offset = off;
1230         off = 0;
1231         size -= vec_len;
1232     }
1233     /* store original address for later verification */
1234     imu->ubuf = (unsigned long) iov->iov_base;
1235     imu->ubuf_end = imu->ubuf + iov->iov_len;
1236     imu->nr_bvecs = nr_pages;
1237     *pimu = imu;
1238     ret = 0;
1239 done:
1240     if (ret)
1241         kvfree(imu);
1242     kvfree(pages);
1243     return ret;
1244 }
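
/*
 * Worked example for the bvec setup above (assuming 4 KiB pages): for an
 * iovec whose base sits 512 bytes into its first page with iov_len 8192,
 * io_pin_pages() pins 3 pages and the bvec array becomes
 *	bvec[0] = { .bv_offset = 512, .bv_len = 3584 }
 *	bvec[1] = { .bv_offset = 0,   .bv_len = 4096 }
 *	bvec[2] = { .bv_offset = 0,   .bv_len = 512  }
 * which together cover exactly iov_len bytes.
 */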
1245 
1246 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
1247 {
1248     ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
1249     return ctx->user_bufs ? 0 : -ENOMEM;
1250 }
1251 
1252 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
1253                 unsigned int nr_args, u64 __user *tags)
1254 {
1255     struct page *last_hpage = NULL;
1256     struct io_rsrc_data *data;
1257     int i, ret;
1258     struct iovec iov;
1259 
1260     BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
1261 
1262     if (ctx->user_bufs)
1263         return -EBUSY;
1264     if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
1265         return -EINVAL;
1266     ret = io_rsrc_node_switch_start(ctx);
1267     if (ret)
1268         return ret;
1269     ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
1270     if (ret)
1271         return ret;
1272     ret = io_buffers_map_alloc(ctx, nr_args);
1273     if (ret) {
1274         io_rsrc_data_free(data);
1275         return ret;
1276     }
1277 
1278     for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
1279         if (arg) {
1280             ret = io_copy_iov(ctx, &iov, arg, i);
1281             if (ret)
1282                 break;
1283             ret = io_buffer_validate(&iov);
1284             if (ret)
1285                 break;
1286         } else {
1287             memset(&iov, 0, sizeof(iov));
1288         }
1289 
1290         if (!iov.iov_base && *io_get_tag_slot(data, i)) {
1291             ret = -EINVAL;
1292             break;
1293         }
1294 
1295         ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
1296                          &last_hpage);
1297         if (ret)
1298             break;
1299     }
1300 
1301     WARN_ON_ONCE(ctx->buf_data);
1302 
1303     ctx->buf_data = data;
1304     if (ret)
1305         __io_sqe_buffers_unregister(ctx);
1306     else
1307         io_rsrc_node_switch(ctx, NULL);
1308     return ret;
1309 }
1310 
1311 int io_import_fixed(int ddir, struct iov_iter *iter,
1312                struct io_mapped_ubuf *imu,
1313                u64 buf_addr, size_t len)
1314 {
1315     u64 buf_end;
1316     size_t offset;
1317 
1318     if (WARN_ON_ONCE(!imu))
1319         return -EFAULT;
1320     if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
1321         return -EFAULT;
1322     /* not inside the mapped region */
1323     if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
1324         return -EFAULT;
1325 
1326     /*
1327      * May not be a start of buffer, set size appropriately
1328      * and advance us to the beginning.
1329      */
1330     offset = buf_addr - imu->ubuf;
1331     iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
1332 
1333     if (offset) {
1334         /*
1335          * Don't use iov_iter_advance() here, as it's really slow for
1336          * using the latter parts of a big fixed buffer - it iterates
1337          * over each segment manually. We can cheat a bit here, because
1338          * we know that:
1339          *
1340          * 1) it's a BVEC iter, we set it up
1341          * 2) all bvecs are PAGE_SIZE in size, except potentially the
1342          *    first and last bvec
1343          *
1344          * So just find our index, and adjust the iterator afterwards.
1345          * If the offset is within the first bvec (or the whole first
1346          * bvec), just use iov_iter_advance(). This makes it easier
1347          * since we can just skip the first segment, which may not
1348          * be PAGE_SIZE aligned.
1349          */
1350         const struct bio_vec *bvec = imu->bvec;
1351 
1352         if (offset <= bvec->bv_len) {
1353             iov_iter_advance(iter, offset);
1354         } else {
1355             unsigned long seg_skip;
1356 
1357             /* skip first vec */
1358             offset -= bvec->bv_len;
1359             seg_skip = 1 + (offset >> PAGE_SHIFT);
1360 
1361             iter->bvec = bvec + seg_skip;
1362             iter->nr_segs -= seg_skip;
1363             iter->count -= bvec->bv_len + offset;
1364             iter->iov_offset = offset & ~PAGE_MASK;
1365         }
1366     }
1367 
1368     return 0;
1369 }
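
/*
 * Worked example for the fast-forward in io_import_fixed() (continuing the
 * 3-bvec sketch above, 4 KiB pages): if buf_addr points 6000 bytes into
 * the registered buffer, offset (6000) exceeds the first bvec's 3584
 * bytes, so offset becomes 2416, seg_skip = 1 + (2416 >> 12) = 1, the
 * iterator starts at bvec[1] with iov_offset 2416, and iter->count is
 * reduced by 3584 + 2416 = 6000, leaving exactly len bytes to transfer.
 */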