#ifndef IO_URING_TYPES_H
#define IO_URING_TYPES_H

#include <linux/blkdev.h>
#include <linux/task_work.h>
#include <linux/bitmap.h>
#include <linux/llist.h>
#include <uapi/linux/io_uring.h>

struct io_wq_work_node {
    struct io_wq_work_node *next;
};

struct io_wq_work_list {
    struct io_wq_work_node *first;
    struct io_wq_work_node *last;
};

struct io_wq_work {
    struct io_wq_work_node list;
    unsigned flags;
    /* place it here instead of io_kiocb as it fills padding and saves 4B */
    int cancel_seq;
};

struct io_fixed_file {
    /* file * with additional FFS_* flags */
    unsigned long file_ptr;
};

struct io_file_table {
    struct io_fixed_file *files;
    unsigned long *bitmap;
    unsigned int alloc_hint;
};

struct io_notif;
struct io_notif_slot;

struct io_hash_bucket {
    spinlock_t      lock;
    struct hlist_head   list;
} ____cacheline_aligned_in_smp;

struct io_hash_table {
    struct io_hash_bucket   *hbs;
    unsigned        hash_bits;
};
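
/*
 * Illustrative sketch, not part of this header: cancelation lookups are
 * assumed to pick a bucket by reducing a hash of the request's user_data
 * to hash_bits bits, roughly:
 *
 *	u32 index = hash_long(user_data, table->hash_bits);
 *	struct io_hash_bucket *hb = &table->hbs[index];
 *
 *	spin_lock(&hb->lock);
 *	hlist_add_head(&req->hash_node, &hb->list);
 *	spin_unlock(&hb->lock);
 */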

/*
 * Arbitrary limit, can be raised if need be
 */
#define IO_RINGFD_REG_MAX 16

struct io_uring_task {
    /* submission side */
    int             cached_refs;
    const struct io_ring_ctx    *last;
    struct io_wq            *io_wq;
    struct file         *registered_rings[IO_RINGFD_REG_MAX];

    struct xarray           xa;
    struct wait_queue_head      wait;
    atomic_t            in_idle;
    atomic_t            inflight_tracked;
    struct percpu_counter       inflight;

    struct { /* task_work */
        struct llist_head   task_list;
        struct callback_head    task_work;
    } ____cacheline_aligned_in_smp;
};

struct io_uring {
    u32 head ____cacheline_aligned_in_smp;
    u32 tail ____cacheline_aligned_in_smp;
};

/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
    /*
     * Head and tail offsets into the ring; the offsets need to be
     * masked to get valid indices.
     *
     * The kernel controls the head of the sq ring and the tail of the
     * cq ring, and the application controls the tail of the sq ring and
     * the head of the cq ring. (A simplified reaping sketch follows this
     * struct definition.)
     */
    struct io_uring     sq, cq;
    /*
     * Bitmasks to apply to head and tail offsets (constant, equals
     * ring_entries - 1)
     */
    u32         sq_ring_mask, cq_ring_mask;
    /* Ring sizes (constant, power of 2) */
    u32         sq_ring_entries, cq_ring_entries;
    /*
     * Number of invalid entries dropped by the kernel due to an
     * invalid index stored in the array.
     *
     * Written by the kernel, shouldn't be modified by the
     * application (i.e. get the number of "new events" by comparing to
     * a cached value).
     *
     * After a new SQ head value has been read by the application, this
     * counter includes all submissions that were dropped reaching
     * the new SQ head (and possibly more).
     */
    u32         sq_dropped;
    /*
     * Runtime SQ flags
     *
     * Written by the kernel, shouldn't be modified by the
     * application.
     *
     * The application needs a full memory barrier before checking
     * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
     */
    atomic_t        sq_flags;
    /*
     * Runtime CQ flags
     *
     * Written by the application, shouldn't be modified by the
     * kernel.
     */
    u32         cq_flags;
    /*
     * Number of completion events lost because the queue was full;
     * this should be avoided by the application by making sure
     * there are not more requests pending than there is space in
     * the completion queue.
     *
     * Written by the kernel, shouldn't be modified by the
     * application (i.e. get the number of "new events" by comparing to
     * a cached value).
     *
     * As completion events come in out of order, this counter is not
     * ordered with any other data.
     */
    u32         cq_overflow;
    /*
     * Ring buffer of completion events.
     *
     * The kernel writes completion events fresh every time they are
     * produced, so the application is allowed to modify pending
     * entries.
     */
    struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
};
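
/*
 * Illustrative sketch, not part of this header: with the shared layout
 * above, the application owns cq.head and the kernel owns cq.tail, so a
 * userspace reaper is assumed to look roughly like this (consume() is a
 * hypothetical placeholder; a real implementation such as liburing uses
 * the equivalent userspace acquire/release operations):
 *
 *	unsigned head = rings->cq.head;
 *	unsigned tail = smp_load_acquire(&rings->cq.tail);
 *
 *	while (head != tail) {
 *		struct io_uring_cqe *cqe = &rings->cqes[head & rings->cq_ring_mask];
 *		consume(cqe);
 *		head++;
 *	}
 *	smp_store_release(&rings->cq.head, head);
 */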

struct io_restriction {
    DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
    DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
    u8 sqe_flags_allowed;
    u8 sqe_flags_required;
    bool registered;
};

struct io_submit_link {
    struct io_kiocb     *head;
    struct io_kiocb     *last;
};

struct io_submit_state {
    /* inline/task_work completion list, under ->uring_lock */
    struct io_wq_work_node  free_list;
    /* batch completion logic */
    struct io_wq_work_list  compl_reqs;
    struct io_submit_link   link;

    bool            plug_started;
    bool            need_plug;
    unsigned short      submit_nr;
    struct blk_plug     plug;
};

struct io_ev_fd {
    struct eventfd_ctx  *cq_ev_fd;
    unsigned int        eventfd_async: 1;
    struct rcu_head     rcu;
};

struct io_alloc_cache {
    struct hlist_head   list;
    unsigned int        nr_cached;
};

struct io_ring_ctx {
    /* const or read-mostly hot data */
    struct {
        struct percpu_ref   refs;

        struct io_rings     *rings;
        unsigned int        flags;
        enum task_work_notify_mode  notify_method;
        unsigned int        compat: 1;
        unsigned int        drain_next: 1;
        unsigned int        restricted: 1;
        unsigned int        off_timeout_used: 1;
        unsigned int        drain_active: 1;
        unsigned int        drain_disabled: 1;
        unsigned int        has_evfd: 1;
        unsigned int        syscall_iopoll: 1;
    } ____cacheline_aligned_in_smp;

    /* submission data */
    struct {
        struct mutex        uring_lock;

        /*
         * Ring buffer of indices into the array of io_uring_sqe, which is
         * mmapped by the application using the IORING_OFF_SQES offset.
         *
         * This indirection could e.g. be used to assign fixed
         * io_uring_sqe entries to operations and only submit them to
         * the queue when needed.
         *
         * The kernel modifies neither the indices array nor the entries
         * array. (A simplified lookup sketch follows this struct.)
         */
        u32         *sq_array;
        struct io_uring_sqe *sq_sqes;
        unsigned        cached_sq_head;
        unsigned        sq_entries;

        /*
         * Fixed resources fast path, should be accessed only under
         * uring_lock, and updated through io_uring_register(2)
         */
        struct io_rsrc_node *rsrc_node;
        int         rsrc_cached_refs;
        atomic_t        cancel_seq;
        struct io_file_table    file_table;
        unsigned        nr_user_files;
        unsigned        nr_user_bufs;
        struct io_mapped_ubuf   **user_bufs;
        struct io_notif_slot    *notif_slots;
        unsigned        nr_notif_slots;

        struct io_submit_state  submit_state;

        struct io_buffer_list   *io_bl;
        struct xarray       io_bl_xa;
        struct list_head    io_buffers_cache;

        struct io_hash_table    cancel_table_locked;
        struct list_head    cq_overflow_list;
        struct io_alloc_cache   apoll_cache;
        struct io_alloc_cache   netmsg_cache;
    } ____cacheline_aligned_in_smp;

    /* IRQ completion list, under ->completion_lock */
    struct io_wq_work_list  locked_free_list;
    unsigned int        locked_free_nr;

    const struct cred   *sq_creds;  /* cred used for __io_sq_thread() */
    struct io_sq_data   *sq_data;   /* if using sq thread polling */

    struct wait_queue_head  sqo_sq_wait;
    struct list_head    sqd_list;

    unsigned long       check_cq;

    unsigned int        file_alloc_start;
    unsigned int        file_alloc_end;

    struct xarray       personalities;
    u32         pers_next;

    struct {
        /*
         * We cache a range of free CQEs we can use; once exhausted it
         * should go through a slower range setup, see __io_get_cqe()
         */
        struct io_uring_cqe *cqe_cached;
        struct io_uring_cqe *cqe_sentinel;

        unsigned        cached_cq_tail;
        unsigned        cq_entries;
        struct io_ev_fd __rcu   *io_ev_fd;
        struct wait_queue_head  cq_wait;
        unsigned        cq_extra;
    } ____cacheline_aligned_in_smp;

    struct {
        spinlock_t      completion_lock;

        /*
         * ->iopoll_list is protected by the ctx->uring_lock for
         * io_uring instances that don't use IORING_SETUP_SQPOLL.
         * For SQPOLL, only the single threaded io_sq_thread() will
         * manipulate the list, hence no extra locking is needed there.
         */
        struct io_wq_work_list  iopoll_list;
        struct io_hash_table    cancel_table;
        bool            poll_multi_queue;

        struct list_head    io_buffers_comp;
    } ____cacheline_aligned_in_smp;

    /* timeouts */
    struct {
        spinlock_t      timeout_lock;
        atomic_t        cq_timeouts;
        struct list_head    timeout_list;
        struct list_head    ltimeout_list;
        unsigned        cq_last_tm_flush;
    } ____cacheline_aligned_in_smp;

    /* Keep this last, we don't need it for the fast path */

    struct io_restriction       restrictions;
    struct task_struct      *submitter_task;

    /* slow path rsrc auxiliary data, used by update/register */
    struct io_rsrc_node     *rsrc_backup_node;
    struct io_mapped_ubuf       *dummy_ubuf;
    struct io_rsrc_data     *file_data;
    struct io_rsrc_data     *buf_data;

    struct delayed_work     rsrc_put_work;
    struct llist_head       rsrc_put_llist;
    struct list_head        rsrc_ref_list;
    spinlock_t          rsrc_ref_lock;

    struct list_head        io_buffers_pages;

#if defined(CONFIG_UNIX)
    struct socket       *ring_sock;
#endif
    /* hashed buffered write serialization */
    struct io_wq_hash       *hash_map;

    /* Only used for accounting purposes */
    struct user_struct      *user;
    struct mm_struct        *mm_account;

    /* ctx exit and cancelation */
    struct llist_head       fallback_llist;
    struct delayed_work     fallback_work;
    struct work_struct      exit_work;
    struct list_head        tctx_list;
    struct completion       ref_comp;

    /* io-wq management, e.g. thread count */
    u32             iowq_limits[2];
    bool                iowq_limits_set;

    struct list_head        defer_list;
    unsigned            sq_thread_idle;
    /* protected by ->completion_lock */
    unsigned            evfd_last_cq_tail;
};
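
/*
 * Illustrative sketch, not part of this header: given the sq_array
 * indirection in the submission block above, fetching the next SQE is
 * assumed to boil down to roughly the following (simplified; the real
 * lookup lives in io_uring/io_uring.c):
 *
 *	unsigned head = ctx->cached_sq_head++ & (ctx->sq_entries - 1);
 *	unsigned idx = READ_ONCE(ctx->sq_array[head]);
 *
 *	if (idx < ctx->sq_entries)
 *		sqe = &ctx->sq_sqes[idx];
 *	else
 *		the entry is counted in rings->sq_dropped and skipped
 */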

enum {
    REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
    REQ_F_IO_DRAIN_BIT  = IOSQE_IO_DRAIN_BIT,
    REQ_F_LINK_BIT      = IOSQE_IO_LINK_BIT,
    REQ_F_HARDLINK_BIT  = IOSQE_IO_HARDLINK_BIT,
    REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
    REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
    REQ_F_CQE_SKIP_BIT  = IOSQE_CQE_SKIP_SUCCESS_BIT,

    /* first byte is taken by user flags, shift it to not overlap */
    REQ_F_FAIL_BIT      = 8,
    REQ_F_INFLIGHT_BIT,
    REQ_F_CUR_POS_BIT,
    REQ_F_NOWAIT_BIT,
    REQ_F_LINK_TIMEOUT_BIT,
    REQ_F_NEED_CLEANUP_BIT,
    REQ_F_POLLED_BIT,
    REQ_F_BUFFER_SELECTED_BIT,
    REQ_F_BUFFER_RING_BIT,
    REQ_F_REISSUE_BIT,
    REQ_F_CREDS_BIT,
    REQ_F_REFCOUNT_BIT,
    REQ_F_ARM_LTIMEOUT_BIT,
    REQ_F_ASYNC_DATA_BIT,
    REQ_F_SKIP_LINK_CQES_BIT,
    REQ_F_SINGLE_POLL_BIT,
    REQ_F_DOUBLE_POLL_BIT,
    REQ_F_PARTIAL_IO_BIT,
    REQ_F_CQE32_INIT_BIT,
    REQ_F_APOLL_MULTISHOT_BIT,
    REQ_F_CLEAR_POLLIN_BIT,
    REQ_F_HASH_LOCKED_BIT,
    /* keep async read/write and isreg together and in order */
    REQ_F_SUPPORT_NOWAIT_BIT,
    REQ_F_ISREG_BIT,

    /* not a real bit, just to check we're not overflowing the space */
    __REQ_F_LAST_BIT,
};

enum {
    /* ctx owns file */
    REQ_F_FIXED_FILE    = BIT(REQ_F_FIXED_FILE_BIT),
    /* drain existing IO first */
    REQ_F_IO_DRAIN      = BIT(REQ_F_IO_DRAIN_BIT),
    /* linked sqes */
    REQ_F_LINK      = BIT(REQ_F_LINK_BIT),
    /* doesn't sever on completion < 0 */
    REQ_F_HARDLINK      = BIT(REQ_F_HARDLINK_BIT),
    /* IOSQE_ASYNC */
    REQ_F_FORCE_ASYNC   = BIT(REQ_F_FORCE_ASYNC_BIT),
    /* IOSQE_BUFFER_SELECT */
    REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
    /* IOSQE_CQE_SKIP_SUCCESS */
    REQ_F_CQE_SKIP      = BIT(REQ_F_CQE_SKIP_BIT),

    /* fail rest of links */
    REQ_F_FAIL      = BIT(REQ_F_FAIL_BIT),
    /* on inflight list, should be cancelled and waited on exit reliably */
    REQ_F_INFLIGHT      = BIT(REQ_F_INFLIGHT_BIT),
    /* read/write uses file position */
    REQ_F_CUR_POS       = BIT(REQ_F_CUR_POS_BIT),
    /* must not punt to workers */
    REQ_F_NOWAIT        = BIT(REQ_F_NOWAIT_BIT),
    /* has or had linked timeout */
    REQ_F_LINK_TIMEOUT  = BIT(REQ_F_LINK_TIMEOUT_BIT),
    /* needs cleanup */
    REQ_F_NEED_CLEANUP  = BIT(REQ_F_NEED_CLEANUP_BIT),
    /* already went through poll handler */
    REQ_F_POLLED        = BIT(REQ_F_POLLED_BIT),
    /* buffer already selected */
    REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
    /* buffer selected from ring, needs commit */
    REQ_F_BUFFER_RING   = BIT(REQ_F_BUFFER_RING_BIT),
    /* caller should reissue async */
    REQ_F_REISSUE       = BIT(REQ_F_REISSUE_BIT),
    /* supports async reads/writes */
    REQ_F_SUPPORT_NOWAIT    = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
    /* regular file */
    REQ_F_ISREG     = BIT(REQ_F_ISREG_BIT),
    /* has creds assigned */
    REQ_F_CREDS     = BIT(REQ_F_CREDS_BIT),
    /* skip refcounting if not set */
    REQ_F_REFCOUNT      = BIT(REQ_F_REFCOUNT_BIT),
    /* there is a linked timeout that has to be armed */
    REQ_F_ARM_LTIMEOUT  = BIT(REQ_F_ARM_LTIMEOUT_BIT),
    /* ->async_data allocated */
    REQ_F_ASYNC_DATA    = BIT(REQ_F_ASYNC_DATA_BIT),
    /* don't post CQEs while failing linked requests */
    REQ_F_SKIP_LINK_CQES    = BIT(REQ_F_SKIP_LINK_CQES_BIT),
    /* single poll may be active */
    REQ_F_SINGLE_POLL   = BIT(REQ_F_SINGLE_POLL_BIT),
    /* double poll may be active */
    REQ_F_DOUBLE_POLL   = BIT(REQ_F_DOUBLE_POLL_BIT),
    /* request has already done partial IO */
    REQ_F_PARTIAL_IO    = BIT(REQ_F_PARTIAL_IO_BIT),
    /* fast poll multishot mode */
    REQ_F_APOLL_MULTISHOT   = BIT(REQ_F_APOLL_MULTISHOT_BIT),
    /* ->extra1 and ->extra2 are initialised */
    REQ_F_CQE32_INIT    = BIT(REQ_F_CQE32_INIT_BIT),
    /* recvmsg special flag, clear EPOLLIN */
    REQ_F_CLEAR_POLLIN  = BIT(REQ_F_CLEAR_POLLIN_BIT),
    /* hashed into ->cancel_hash_locked, protected by ->uring_lock */
    REQ_F_HASH_LOCKED   = BIT(REQ_F_HASH_LOCKED_BIT),
};
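
/*
 * Illustrative sketch, not part of this header: because the low REQ_F_*
 * bits mirror the user-visible IOSQE_* bits one-to-one, the SQE flags byte
 * can be copied straight into req->flags, with kernel-internal state ORed
 * in above bit 7 (assumed, simplified):
 *
 *	req->flags = READ_ONCE(sqe->flags);	(the IOSQE_* byte)
 *	...
 *	req->flags |= REQ_F_NEED_CLEANUP;	(kernel-only bit, >= bit 8)
 *	if (req->flags & REQ_F_LINK)
 *		...
 */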

typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);

struct io_task_work {
    struct llist_node       node;
    io_req_tw_func_t        func;
};

struct io_cqe {
    __u64   user_data;
    __s32   res;
    /* fd initially, then cflags for completion */
    union {
        __u32   flags;
        int fd;
    };
};

/*
 * Each request type overlays its private data structure on top of this one.
 * They must not exceed this one in size.
 */
struct io_cmd_data {
    struct file     *file;
    /* each command gets 56 bytes of data */
    __u8            data[56];
};

static inline void io_kiocb_cmd_sz_check(size_t cmd_sz)
{
    BUILD_BUG_ON(cmd_sz > sizeof(struct io_cmd_data));
}
#define io_kiocb_to_cmd(req, cmd_type) ( \
    io_kiocb_cmd_sz_check(sizeof(cmd_type)) , \
    ((cmd_type *)&(req)->cmd) \
)
#define cmd_to_io_kiocb(ptr)    ((struct io_kiocb *) ptr)
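
/*
 * Illustrative sketch, not part of this header: a per-opcode handler is
 * assumed to overlay its private struct on req->cmd via io_kiocb_to_cmd().
 * "struct io_example" and io_example_prep() below are hypothetical; the
 * BUILD_BUG_ON() in io_kiocb_cmd_sz_check() rejects any type larger than
 * struct io_cmd_data.
 *
 *	struct io_example {
 *		struct file	*file;	(must stay first, see io_kiocb below)
 *		__u64		addr;
 *		__u32		len;
 *	};
 *
 *	static int io_example_prep(struct io_kiocb *req,
 *				   const struct io_uring_sqe *sqe)
 *	{
 *		struct io_example *ex = io_kiocb_to_cmd(req, struct io_example);
 *
 *		ex->addr = READ_ONCE(sqe->addr);
 *		ex->len = READ_ONCE(sqe->len);
 *		return 0;
 *	}
 */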

struct io_kiocb {
    union {
        /*
         * NOTE! Each of the io_kiocb union members has the file pointer
         * as the first entry in their struct definition. So you can
         * access the file pointer through any of the sub-structs,
         * or directly as just 'file' in this struct.
         */
        struct file     *file;
        struct io_cmd_data  cmd;
    };

    u8              opcode;
    /* polled IO has completed */
    u8              iopoll_completed;
    /*
     * Can be either a fixed buffer index, or used with provided buffers.
     * For the latter, before issue it points to the buffer group ID,
     * and after selection it points to the buffer ID itself.
     */
    u16             buf_index;
    unsigned int            flags;

    struct io_cqe           cqe;

    struct io_ring_ctx      *ctx;
    struct task_struct      *task;

    struct io_rsrc_node     *rsrc_node;

    union {
        /* store used ubuf, so we can prevent reloading */
        struct io_mapped_ubuf   *imu;

        /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
        struct io_buffer    *kbuf;

        /*
         * stores buffer ID for ring provided buffers, valid IFF
         * REQ_F_BUFFER_RING is set.
         */
        struct io_buffer_list   *buf_list;
    };

    union {
        /* used by request caches, completion batching and iopoll */
        struct io_wq_work_node  comp_list;
        /* cache ->apoll->events */
        __poll_t apoll_events;
    };
    atomic_t            refs;
    atomic_t            poll_refs;
    struct io_task_work     io_task_work;
    /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
    union {
        struct hlist_node   hash_node;
        struct {
            u64     extra1;
            u64     extra2;
        };
    };
    /* internal polling, see IORING_FEAT_FAST_POLL */
    struct async_poll       *apoll;
    /* opcode allocated if it needs to store data for async defer */
    void                *async_data;
    /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
    struct io_kiocb         *link;
    /* custom credentials, valid IFF REQ_F_CREDS is set */
    const struct cred       *creds;
    struct io_wq_work       work;
};
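
/*
 * Illustrative sketch, not part of this header: per the NOTE in the first
 * union above, every per-opcode struct begins with its struct file
 * pointer, so for the hypothetical struct io_example shown earlier the
 * following accesses are assumed to alias:
 *
 *	struct io_example *ex = io_kiocb_to_cmd(req, struct io_example);
 *
 *	ex->file == req->file		(same storage, first union member)
 *	cmd_to_io_kiocb(ex) == req	(cmd sits at offset 0 of io_kiocb)
 */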

struct io_overflow_cqe {
    struct list_head list;
    struct io_uring_cqe cqe;
};

#endif