include/linux/blk-mq.h

0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 #ifndef BLK_MQ_H
0003 #define BLK_MQ_H
0004
0005 #include <linux/blkdev.h>
0006 #include <linux/sbitmap.h>
0007 #include <linux/lockdep.h>
0008 #include <linux/scatterlist.h>
0009 #include <linux/prefetch.h>
0010
0011 struct blk_mq_tags;
0012 struct blk_flush_queue;
0013
0014 #define BLKDEV_MIN_RQ   4
0015 #define BLKDEV_DEFAULT_RQ   128
0016
0017 typedef void (rq_end_io_fn)(struct request *, blk_status_t);
0018
0019 /*
0020  * request flags */
0021 typedef __u32 __bitwise req_flags_t;
0022
0023 /* drive already may have started this one */
0024 #define RQF_STARTED     ((__force req_flags_t)(1 << 1))
0025 /* may not be passed by ioscheduler */
0026 #define RQF_SOFTBARRIER     ((__force req_flags_t)(1 << 3))
0027 /* request for flush sequence */
0028 #define RQF_FLUSH_SEQ       ((__force req_flags_t)(1 << 4))
0029 /* merge of different types, fail separately */
0030 #define RQF_MIXED_MERGE     ((__force req_flags_t)(1 << 5))
0031 /* track inflight for MQ */
0032 #define RQF_MQ_INFLIGHT     ((__force req_flags_t)(1 << 6))
0033 /* don't call prep for this one */
0034 #define RQF_DONTPREP        ((__force req_flags_t)(1 << 7))
0035 /* vaguely specified driver internal error.  Ignored by the block layer */
0036 #define RQF_FAILED      ((__force req_flags_t)(1 << 10))
0037 /* don't warn about errors */
0038 #define RQF_QUIET       ((__force req_flags_t)(1 << 11))
0039 /* elevator private data attached */
0040 #define RQF_ELVPRIV     ((__force req_flags_t)(1 << 12))
0041 /* account into disk and partition IO statistics */
0042 #define RQF_IO_STAT     ((__force req_flags_t)(1 << 13))
0043 /* runtime pm request */
0044 #define RQF_PM          ((__force req_flags_t)(1 << 15))
0045 /* on IO scheduler merge hash */
0046 #define RQF_HASHED      ((__force req_flags_t)(1 << 16))
0047 /* track IO completion time */
0048 #define RQF_STATS       ((__force req_flags_t)(1 << 17))
0049 /* Look at ->special_vec for the actual data payload instead of the
0050    bio chain. */
0051 #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18))
0052 /* The per-zone write lock is held for this request */
0053 #define RQF_ZONE_WRITE_LOCKED   ((__force req_flags_t)(1 << 19))
0054 /* already slept for hybrid poll */
0055 #define RQF_MQ_POLL_SLEPT   ((__force req_flags_t)(1 << 20))
0056 /* ->timeout has been called, don't expire again */
0057 #define RQF_TIMED_OUT       ((__force req_flags_t)(1 << 21))
0058 /* queue has elevator attached */
0059 #define RQF_ELV         ((__force req_flags_t)(1 << 22))
0060 #define RQF_RESV            ((__force req_flags_t)(1 << 23))
0061
0062 /* flags that prevent us from merging requests: */
0063 #define RQF_NOMERGE_FLAGS \
0064     (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
0065
0066 enum mq_rq_state {
0067     MQ_RQ_IDLE      = 0,
0068     MQ_RQ_IN_FLIGHT     = 1,
0069     MQ_RQ_COMPLETE      = 2,
0070 };
0071
0072 /*
0073  * Try to put the fields that are referenced together in the same cacheline.
0074  *
0075  * If you modify this structure, make sure to update blk_rq_init() and
0076  * especially blk_mq_rq_ctx_init() to take care of the added fields.
0077  */
0078 struct request {
0079     struct request_queue *q;
0080     struct blk_mq_ctx *mq_ctx;
0081     struct blk_mq_hw_ctx *mq_hctx;
0082
0083     blk_opf_t cmd_flags;        /* op and common flags */
0084     req_flags_t rq_flags;
0085
0086     int tag;
0087     int internal_tag;
0088
0089     unsigned int timeout;
0090
0091     /* the following two fields are internal, NEVER access directly */
0092     unsigned int __data_len;    /* total data len */
0093     sector_t __sector;      /* sector cursor */
0094
0095     struct bio *bio;
0096     struct bio *biotail;
0097
0098     union {
0099         struct list_head queuelist;
0100         struct request *rq_next;
0101     };
0102
0103     struct block_device *part;
0104 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
0105     /* Time that the first bio started allocating this request. */
0106     u64 alloc_time_ns;
0107 #endif
0108     /* Time that this request was allocated for this IO. */
0109     u64 start_time_ns;
0110     /* Time that I/O was submitted to the device. */
0111     u64 io_start_time_ns;
0112
0113 #ifdef CONFIG_BLK_WBT
0114     unsigned short wbt_flags;
0115 #endif
0116     /*
0117      * rq sectors used for blk stats. It has the same value
0118      * with blk_rq_sectors(rq), except that it never be zeroed
0119      * by completion.
0120      */
0121     unsigned short stats_sectors;
0122
0123     /*
0124      * Number of scatter-gather DMA addr+len pairs after
0125      * physical address coalescing is performed.
0126      */
0127     unsigned short nr_phys_segments;
0128
0129 #ifdef CONFIG_BLK_DEV_INTEGRITY
0130     unsigned short nr_integrity_segments;
0131 #endif
0132
0133 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
0134     struct bio_crypt_ctx *crypt_ctx;
0135     struct blk_crypto_keyslot *crypt_keyslot;
0136 #endif
0137
0138     unsigned short write_hint;
0139     unsigned short ioprio;
0140
0141     enum mq_rq_state state;
0142     atomic_t ref;
0143
0144     unsigned long deadline;
0145
0146     /*
0147      * The hash is used inside the scheduler, and killed once the
0148      * request reaches the dispatch list. The ipi_list is only used
0149      * to queue the request for softirq completion, which is long
0150      * after the request has been unhashed (and even removed from
0151      * the dispatch list).
0152      */
0153     union {
0154         struct hlist_node hash; /* merge hash */
0155         struct llist_node ipi_list;
0156     };
0157
0158     /*
0159      * The rb_node is only used inside the io scheduler, requests
0160      * are pruned when moved to the dispatch queue. So let the
0161      * completion_data share space with the rb_node.
0162      */
0163     union {
0164         struct rb_node rb_node; /* sort/lookup */
0165         struct bio_vec special_vec;
0166         void *completion_data;
0167     };
0168
0169
0170     /*
0171      * Three pointers are available for the IO schedulers, if they need
0172      * more they have to dynamically allocate it.  Flush requests are
0173      * never put on the IO scheduler. So let the flush fields share
0174      * space with the elevator data.
0175      */
0176     union {
0177         struct {
0178             struct io_cq        *icq;
0179             void            *priv[2];
0180         } elv;
0181
0182         struct {
0183             unsigned int        seq;
0184             struct list_head    list;
0185             rq_end_io_fn        *saved_end_io;
0186         } flush;
0187     };
0188
0189     union {
0190         struct __call_single_data csd;
0191         u64 fifo_time;
0192     };
0193
0194     /*
0195      * completion callback.
0196      */
0197     rq_end_io_fn *end_io;
0198     void *end_io_data;
0199 };
0200
0201 static inline enum req_op req_op(const struct request *req)
0202 {
0203     return req->cmd_flags & REQ_OP_MASK;
0204 }
0205
0206 static inline bool blk_rq_is_passthrough(struct request *rq)
0207 {
0208     return blk_op_is_passthrough(req_op(rq));
0209 }
0210
0211 static inline unsigned short req_get_ioprio(struct request *req)
0212 {
0213     return req->ioprio;
0214 }
0215
0216 #define rq_data_dir(rq)     (op_is_write(req_op(rq)) ? WRITE : READ)
0217
0218 #define rq_dma_dir(rq) \
0219     (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
0220
0221 #define rq_list_add(listptr, rq)    do {        \
0222     (rq)->rq_next = *(listptr);         \
0223     *(listptr) = rq;                \
0224 } while (0)
0225
0226 #define rq_list_pop(listptr)                \
0227 ({                          \
0228     struct request *__req = NULL;           \
0229     if ((listptr) && *(listptr))    {       \
0230         __req = *(listptr);         \
0231         *(listptr) = __req->rq_next;        \
0232     }                       \
0233     __req;                      \
0234 })
0235
0236 #define rq_list_peek(listptr)               \
0237 ({                          \
0238     struct request *__req = NULL;           \
0239     if ((listptr) && *(listptr))            \
0240         __req = *(listptr);         \
0241     __req;                      \
0242 })
0243
0244 #define rq_list_for_each(listptr, pos)          \
0245     for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos))
0246
0247 #define rq_list_for_each_safe(listptr, pos, nxt)            \
0248     for (pos = rq_list_peek((listptr)), nxt = rq_list_next(pos);    \
0249         pos; pos = nxt, nxt = pos ? rq_list_next(pos) : NULL)
0250
0251 #define rq_list_next(rq)    (rq)->rq_next
0252 #define rq_list_empty(list) ((list) == (struct request *) NULL)
0253
0254 /**
0255  * rq_list_move() - move a struct request from one list to another
0256  * @src: The source list @rq is currently in
0257  * @dst: The destination list that @rq will be appended to
0258  * @rq: The request to move
0259  * @prev: The request preceding @rq in @src (NULL if @rq is the head)
0260  */
0261 static inline void rq_list_move(struct request **src, struct request **dst,
0262                 struct request *rq, struct request *prev)
0263 {
0264     if (prev)
0265         prev->rq_next = rq->rq_next;
0266     else
0267         *src = rq->rq_next;
0268     rq_list_add(dst, rq);
0269 }
0270
0271 enum blk_eh_timer_return {
0272     BLK_EH_DONE,        /* drivers has completed the command */
0273     BLK_EH_RESET_TIMER, /* reset timer and try again */
0274 };
0275
0276 #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
0277 #define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
0278
0279 /**
0280  * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
0281  * block device
0282  */
0283 struct blk_mq_hw_ctx {
0284     struct {
0285         /** @lock: Protects the dispatch list. */
0286         spinlock_t      lock;
0287         /**
0288          * @dispatch: Used for requests that are ready to be
0289          * dispatched to the hardware but for some reason (e.g. lack of
0290          * resources) could not be sent to the hardware. As soon as the
0291          * driver can send new requests, requests at this list will
0292          * be sent first for a fairer dispatch.
0293          */
0294         struct list_head    dispatch;
0295          /**
0296           * @state: BLK_MQ_S_* flags. Defines the state of the hw
0297           * queue (active, scheduled to restart, stopped).
0298           */
0299         unsigned long       state;
0300     } ____cacheline_aligned_in_smp;
0301
0302     /**
0303      * @run_work: Used for scheduling a hardware queue run at a later time.
0304      */
0305     struct delayed_work run_work;
0306     /** @cpumask: Map of available CPUs where this hctx can run. */
0307     cpumask_var_t       cpumask;
0308     /**
0309      * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
0310      * selection from @cpumask.
0311      */
0312     int         next_cpu;
0313     /**
0314      * @next_cpu_batch: Counter of how many works left in the batch before
0315      * changing to the next CPU.
0316      */
0317     int         next_cpu_batch;
0318
0319     /** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
0320     unsigned long       flags;
0321
0322     /**
0323      * @sched_data: Pointer owned by the IO scheduler attached to a request
0324      * queue. It's up to the IO scheduler how to use this pointer.
0325      */
0326     void            *sched_data;
0327     /**
0328      * @queue: Pointer to the request queue that owns this hardware context.
0329      */
0330     struct request_queue    *queue;
0331     /** @fq: Queue of requests that need to perform a flush operation. */
0332     struct blk_flush_queue  *fq;
0333
0334     /**
0335      * @driver_data: Pointer to data owned by the block driver that created
0336      * this hctx
0337      */
0338     void            *driver_data;
0339
0340     /**
0341      * @ctx_map: Bitmap for each software queue. If bit is on, there is a
0342      * pending request in that software queue.
0343      */
0344     struct sbitmap      ctx_map;
0345
0346     /**
0347      * @dispatch_from: Software queue to be used when no scheduler was
0348      * selected.
0349      */
0350     struct blk_mq_ctx   *dispatch_from;
0351     /**
0352      * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
0353      * decide if the hw_queue is busy using Exponential Weighted Moving
0354      * Average algorithm.
0355      */
0356     unsigned int        dispatch_busy;
0357
0358     /** @type: HCTX_TYPE_* flags. Type of hardware queue. */
0359     unsigned short      type;
0360     /** @nr_ctx: Number of software queues. */
0361     unsigned short      nr_ctx;
0362     /** @ctxs: Array of software queues. */
0363     struct blk_mq_ctx   **ctxs;
0364
0365     /** @dispatch_wait_lock: Lock for dispatch_wait queue. */
0366     spinlock_t      dispatch_wait_lock;
0367     /**
0368      * @dispatch_wait: Waitqueue to put requests when there is no tag
0369      * available at the moment, to wait for another try in the future.
0370      */
0371     wait_queue_entry_t  dispatch_wait;
0372
0373     /**
0374      * @wait_index: Index of next available dispatch_wait queue to insert
0375      * requests.
0376      */
0377     atomic_t        wait_index;
0378
0379     /**
0380      * @tags: Tags owned by the block driver. A tag at this set is only
0381      * assigned when a request is dispatched from a hardware queue.
0382      */
0383     struct blk_mq_tags  *tags;
0384     /**
0385      * @sched_tags: Tags owned by I/O scheduler. If there is an I/O
0386      * scheduler associated with a request queue, a tag is assigned when
0387      * that request is allocated. Else, this member is not used.
0388      */
0389     struct blk_mq_tags  *sched_tags;
0390
0391     /** @queued: Number of queued requests. */
0392     unsigned long       queued;
0393     /** @run: Number of dispatched requests. */
0394     unsigned long       run;
0395
0396     /** @numa_node: NUMA node the storage adapter has been connected to. */
0397     unsigned int        numa_node;
0398     /** @queue_num: Index of this hardware queue. */
0399     unsigned int        queue_num;
0400
0401     /**
0402      * @nr_active: Number of active requests. Only used when a tag set is
0403      * shared across request queues.
0404      */
0405     atomic_t        nr_active;
0406
0407     /** @cpuhp_online: List to store request if CPU is going to die */
0408     struct hlist_node   cpuhp_online;
0409     /** @cpuhp_dead: List to store request if some CPU die. */
0410     struct hlist_node   cpuhp_dead;
0411     /** @kobj: Kernel object for sysfs. */
0412     struct kobject      kobj;
0413
0414 #ifdef CONFIG_BLK_DEBUG_FS
0415     /**
0416      * @debugfs_dir: debugfs directory for this hardware queue. Named
0417      * as cpu<cpu_number>.
0418      */
0419     struct dentry       *debugfs_dir;
0420     /** @sched_debugfs_dir: debugfs directory for the scheduler. */
0421     struct dentry       *sched_debugfs_dir;
0422 #endif
0423
0424     /**
0425      * @hctx_list: if this hctx is not in use, this is an entry in
0426      * q->unused_hctx_list.
0427      */
0428     struct list_head    hctx_list;
0429 };
0430
0431 /**
0432  * struct blk_mq_queue_map - Map software queues to hardware queues
0433  * @mq_map:       CPU ID to hardware queue index map. This is an array
0434  *  with nr_cpu_ids elements. Each element has a value in the range
0435  *  [@queue_offset, @queue_offset + @nr_queues).
0436  * @nr_queues:    Number of hardware queues to map CPU IDs onto.
0437  * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
0438  *  driver to map each hardware queue type (enum hctx_type) onto a distinct
0439  *  set of hardware queues.
0440  */
0441 struct blk_mq_queue_map {
0442     unsigned int *mq_map;
0443     unsigned int nr_queues;
0444     unsigned int queue_offset;
0445 };
0446
0447 /**
0448  * enum hctx_type - Type of hardware queue
0449  * @HCTX_TYPE_DEFAULT:  All I/O not otherwise accounted for.
0450  * @HCTX_TYPE_READ: Just for READ I/O.
0451  * @HCTX_TYPE_POLL: Polled I/O of any kind.
0452  * @HCTX_MAX_TYPES: Number of types of hctx.
0453  */
0454 enum hctx_type {
0455     HCTX_TYPE_DEFAULT,
0456     HCTX_TYPE_READ,
0457     HCTX_TYPE_POLL,
0458
0459     HCTX_MAX_TYPES,
0460 };
0461
0462 /**
0463  * struct blk_mq_tag_set - tag set that can be shared between request queues
0464  * @map:       One or more ctx -> hctx mappings. One map exists for each
0465  *         hardware queue type (enum hctx_type) that the driver wishes
0466  *         to support. There are no restrictions on maps being of the
0467  *         same size, and it's perfectly legal to share maps between
0468  *         types.
0469  * @nr_maps:       Number of elements in the @map array. A number in the range
0470  *         [1, HCTX_MAX_TYPES].
0471  * @ops:       Pointers to functions that implement block driver behavior.
0472  * @nr_hw_queues:  Number of hardware queues supported by the block driver that
0473  *         owns this data structure.
0474  * @queue_depth:   Number of tags per hardware queue, reserved tags included.
0475  * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
0476  *         allocations.
0477  * @cmd_size:      Number of additional bytes to allocate per request. The block
0478  *         driver owns these additional bytes.
0479  * @numa_node:     NUMA node the storage adapter has been connected to.
0480  * @timeout:       Request processing timeout in jiffies.
0481  * @flags:     Zero or more BLK_MQ_F_* flags.
0482  * @driver_data:   Pointer to data owned by the block driver that created this
0483  *         tag set.
0484  * @tags:      Tag sets. One tag set per hardware queue. Has @nr_hw_queues
0485  *         elements.
0486  * @shared_tags:
0487  *         Shared set of tags. Has @nr_hw_queues elements. If set,
0488  *         shared by all @tags.
0489  * @tag_list_lock: Serializes tag_list accesses.
0490  * @tag_list:      List of the request queues that use this tag set. See also
0491  *         request_queue.tag_set_list.
0492  */
0493 struct blk_mq_tag_set {
0494     struct blk_mq_queue_map map[HCTX_MAX_TYPES];
0495     unsigned int        nr_maps;
0496     const struct blk_mq_ops *ops;
0497     unsigned int        nr_hw_queues;
0498     unsigned int        queue_depth;
0499     unsigned int        reserved_tags;
0500     unsigned int        cmd_size;
0501     int         numa_node;
0502     unsigned int        timeout;
0503     unsigned int        flags;
0504     void            *driver_data;
0505
0506     struct blk_mq_tags  **tags;
0507
0508     struct blk_mq_tags  *shared_tags;
0509
0510     struct mutex        tag_list_lock;
0511     struct list_head    tag_list;
0512 };
0513
0514 /**
0515  * struct blk_mq_queue_data - Data about a request inserted in a queue
0516  *
0517  * @rq:   Request pointer.
0518  * @last: If it is the last request in the queue.
0519  */
0520 struct blk_mq_queue_data {
0521     struct request *rq;
0522     bool last;
0523 };
0524
0525 typedef bool (busy_tag_iter_fn)(struct request *, void *);
0526
0527 /**
0528  * struct blk_mq_ops - Callback functions that implements block driver
0529  * behaviour.
0530  */
0531 struct blk_mq_ops {
0532     /**
0533      * @queue_rq: Queue a new request from block IO.
0534      */
0535     blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
0536                  const struct blk_mq_queue_data *);
0537
0538     /**
0539      * @commit_rqs: If a driver uses bd->last to judge when to submit
0540      * requests to hardware, it must define this function. In case of errors
0541      * that make us stop issuing further requests, this hook serves the
0542      * purpose of kicking the hardware (which the last request otherwise
0543      * would have done).
0544      */
0545     void (*commit_rqs)(struct blk_mq_hw_ctx *);
0546
0547     /**
0548      * @queue_rqs: Queue a list of new requests. Driver is guaranteed
0549      * that each request belongs to the same queue. If the driver doesn't
0550      * empty the @rqlist completely, then the rest will be queued
0551      * individually by the block layer upon return.
0552      */
0553     void (*queue_rqs)(struct request **rqlist);
0554
0555     /**
0556      * @get_budget: Reserve budget before queue request, once .queue_rq is
0557      * run, it is driver's responsibility to release the
0558      * reserved budget. Also we have to handle failure case
0559      * of .get_budget for avoiding I/O deadlock.
0560      */
0561     int (*get_budget)(struct request_queue *);
0562
0563     /**
0564      * @put_budget: Release the reserved budget.
0565      */
0566     void (*put_budget)(struct request_queue *, int);
0567
0568     /**
0569      * @set_rq_budget_token: store rq's budget token
0570      */
0571     void (*set_rq_budget_token)(struct request *, int);
0572     /**
0573      * @get_rq_budget_token: retrieve rq's budget token
0574      */
0575     int (*get_rq_budget_token)(struct request *);
0576
0577     /**
0578      * @timeout: Called on request timeout.
0579      */
0580     enum blk_eh_timer_return (*timeout)(struct request *);
0581
0582     /**
0583      * @poll: Called to poll for completion of a specific tag.
0584      */
0585     int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);
0586
0587     /**
0588      * @complete: Mark the request as complete.
0589      */
0590     void (*complete)(struct request *);
0591
0592     /**
0593      * @init_hctx: Called when the block layer side of a hardware queue has
0594      * been set up, allowing the driver to allocate/init matching
0595      * structures.
0596      */
0597     int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
0598     /**
0599      * @exit_hctx: Ditto for exit/teardown.
0600      */
0601     void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
0602
0603     /**
0604      * @init_request: Called for every command allocated by the block layer
0605      * to allow the driver to set up driver specific data.
0606      *
0607      * Tag greater than or equal to queue_depth is for setting up
0608      * flush request.
0609      */
0610     int (*init_request)(struct blk_mq_tag_set *set, struct request *,
0611                 unsigned int, unsigned int);
0612     /**
0613      * @exit_request: Ditto for exit/teardown.
0614      */
0615     void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
0616                  unsigned int);
0617
0618     /**
0619      * @cleanup_rq: Called before freeing one request which isn't completed
0620      * yet, and usually for freeing the driver private data.
0621      */
0622     void (*cleanup_rq)(struct request *);
0623
0624     /**
0625      * @busy: If set, returns whether or not this queue currently is busy.
0626      */
0627     bool (*busy)(struct request_queue *);
0628
0629     /**
0630      * @map_queues: This allows drivers specify their own queue mapping by
0631      * overriding the setup-time function that builds the mq_map.
0632      */
0633     int (*map_queues)(struct blk_mq_tag_set *set);
0634
0635 #ifdef CONFIG_BLK_DEBUG_FS
0636     /**
0637      * @show_rq: Used by the debugfs implementation to show driver-specific
0638      * information about a request.
0639      */
0640     void (*show_rq)(struct seq_file *m, struct request *rq);
0641 #endif
0642 };
0643
0644 enum {
0645     BLK_MQ_F_SHOULD_MERGE   = 1 << 0,
0646     BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
0647     /*
0648      * Set when this device requires underlying blk-mq device for
0649      * completing IO:
0650      */
0651     BLK_MQ_F_STACKING   = 1 << 2,
0652     BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
0653     BLK_MQ_F_BLOCKING   = 1 << 5,
0654     /* Do not allow an I/O scheduler to be configured. */
0655     BLK_MQ_F_NO_SCHED   = 1 << 6,
0656     /*
0657      * Select 'none' during queue registration in case of a single hwq
0658      * or shared hwqs instead of 'mq-deadline'.
0659      */
0660     BLK_MQ_F_NO_SCHED_BY_DEFAULT    = 1 << 7,
0661     BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
0662     BLK_MQ_F_ALLOC_POLICY_BITS = 1,
0663
0664     BLK_MQ_S_STOPPED    = 0,
0665     BLK_MQ_S_TAG_ACTIVE = 1,
0666     BLK_MQ_S_SCHED_RESTART  = 2,
0667
0668     /* hw queue is inactive after all its CPUs become offline */
0669     BLK_MQ_S_INACTIVE   = 3,
0670
0671     BLK_MQ_MAX_DEPTH    = 10240,
0672
0673     BLK_MQ_CPU_WORK_BATCH   = 8,
0674 };
0675 #define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
0676     ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
0677         ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
0678 #define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
0679     ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
0680         << BLK_MQ_F_ALLOC_POLICY_START_BIT)
0681
0682 #define BLK_MQ_NO_HCTX_IDX  (-1U)
0683
0684 struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
0685         struct lock_class_key *lkclass);
0686 #define blk_mq_alloc_disk(set, queuedata)               \
0687 ({                                  \
0688     static struct lock_class_key __key;             \
0689                                     \
0690     __blk_mq_alloc_disk(set, queuedata, &__key);            \
0691 })
0692 struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
0693         struct lock_class_key *lkclass);
0694 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
0695 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
0696         struct request_queue *q);
0697 void blk_mq_destroy_queue(struct request_queue *);
0698
0699 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
0700 int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
0701         const struct blk_mq_ops *ops, unsigned int queue_depth,
0702         unsigned int set_flags);
0703 void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
0704
0705 void blk_mq_free_request(struct request *rq);
0706
0707 bool blk_mq_queue_inflight(struct request_queue *q);
0708
0709 enum {
0710     /* return when out of requests */
0711     BLK_MQ_REQ_NOWAIT   = (__force blk_mq_req_flags_t)(1 << 0),
0712     /* allocate from reserved pool */
0713     BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1),
0714     /* set RQF_PM */
0715     BLK_MQ_REQ_PM       = (__force blk_mq_req_flags_t)(1 << 2),
0716 };
0717
0718 struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
0719         blk_mq_req_flags_t flags);
0720 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
0721         blk_opf_t opf, blk_mq_req_flags_t flags,
0722         unsigned int hctx_idx);
0723
0724 /*
0725  * Tag address space map.
0726  */
0727 struct blk_mq_tags {
0728     unsigned int nr_tags;
0729     unsigned int nr_reserved_tags;
0730
0731     atomic_t active_queues;
0732
0733     struct sbitmap_queue bitmap_tags;
0734     struct sbitmap_queue breserved_tags;
0735
0736     struct request **rqs;
0737     struct request **static_rqs;
0738     struct list_head page_list;
0739
0740     /*
0741      * used to clear request reference in rqs[] before freeing one
0742      * request pool
0743      */
0744     spinlock_t lock;
0745 };
0746
0747 static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
0748                            unsigned int tag)
0749 {
0750     if (tag < tags->nr_tags) {
0751         prefetch(tags->rqs[tag]);
0752         return tags->rqs[tag];
0753     }
0754
0755     return NULL;
0756 }
0757
0758 enum {
0759     BLK_MQ_UNIQUE_TAG_BITS = 16,
0760     BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
0761 };
0762
0763 u32 blk_mq_unique_tag(struct request *rq);
0764
0765 static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
0766 {
0767     return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
0768 }
0769
0770 static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
0771 {
0772     return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
0773 }
0774
0775 /**
0776  * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
0777  * @rq: target request.
0778  */
0779 static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
0780 {
0781     return READ_ONCE(rq->state);
0782 }
0783
0784 static inline int blk_mq_request_started(struct request *rq)
0785 {
0786     return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
0787 }
0788
0789 static inline int blk_mq_request_completed(struct request *rq)
0790 {
0791     return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
0792 }
0793
0794 /*
0795  *
0796  * Set the state to complete when completing a request from inside ->queue_rq.
0797  * This is used by drivers that want to ensure special complete actions that
0798  * need access to the request are called on failure, e.g. by nvme for
0799  * multipathing.
0800  */
0801 static inline void blk_mq_set_request_complete(struct request *rq)
0802 {
0803     WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
0804 }
0805
0806 /*
0807  * Complete the request directly instead of deferring it to softirq or
0808  * completing it another CPU. Useful in preemptible instead of an interrupt.
0809  */
0810 static inline void blk_mq_complete_request_direct(struct request *rq,
0811            void (*complete)(struct request *rq))
0812 {
0813     WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
0814     complete(rq);
0815 }
0816
0817 void blk_mq_start_request(struct request *rq);
0818 void blk_mq_end_request(struct request *rq, blk_status_t error);
0819 void __blk_mq_end_request(struct request *rq, blk_status_t error);
0820 void blk_mq_end_request_batch(struct io_comp_batch *ib);
0821
0822 /*
0823  * Only need start/end time stamping if we have iostat or
0824  * blk stats enabled, or using an IO scheduler.
0825  */
0826 static inline bool blk_mq_need_time_stamp(struct request *rq)
0827 {
0828     return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV));
0829 }
0830
0831 static inline bool blk_mq_is_reserved_rq(struct request *rq)
0832 {
0833     return rq->rq_flags & RQF_RESV;
0834 }
0835
0836 /*
0837  * Batched completions only work when there is no I/O error and no special
0838  * ->end_io handler.
0839  */
0840 static inline bool blk_mq_add_to_batch(struct request *req,
0841                        struct io_comp_batch *iob, int ioerror,
0842                        void (*complete)(struct io_comp_batch *))
0843 {
0844     if (!iob || (req->rq_flags & RQF_ELV) || req->end_io || ioerror)
0845         return false;
0846     if (!iob->complete)
0847         iob->complete = complete;
0848     else if (iob->complete != complete)
0849         return false;
0850     iob->need_ts |= blk_mq_need_time_stamp(req);
0851     rq_list_add(&iob->req_list, req);
0852     return true;
0853 }
0854
0855 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
0856 void blk_mq_kick_requeue_list(struct request_queue *q);
0857 void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
0858 void blk_mq_complete_request(struct request *rq);
0859 bool blk_mq_complete_request_remote(struct request *rq);
0860 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
0861 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
0862 void blk_mq_stop_hw_queues(struct request_queue *q);
0863 void blk_mq_start_hw_queues(struct request_queue *q);
0864 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
0865 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
0866 void blk_mq_quiesce_queue(struct request_queue *q);
0867 void blk_mq_wait_quiesce_done(struct request_queue *q);
0868 void blk_mq_unquiesce_queue(struct request_queue *q);
0869 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
0870 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
0871 void blk_mq_run_hw_queues(struct request_queue *q, bool async);
0872 void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);
0873 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
0874         busy_tag_iter_fn *fn, void *priv);
0875 void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
0876 void blk_mq_freeze_queue(struct request_queue *q);
0877 void blk_mq_unfreeze_queue(struct request_queue *q);
0878 void blk_freeze_queue_start(struct request_queue *q);
0879 void blk_mq_freeze_queue_wait(struct request_queue *q);
0880 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
0881                      unsigned long timeout);
0882
0883 int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
0884 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
0885
0886 void blk_mq_quiesce_queue_nowait(struct request_queue *q);
0887
0888 unsigned int blk_mq_rq_cpu(struct request *rq);
0889
0890 bool __blk_should_fake_timeout(struct request_queue *q);
0891 static inline bool blk_should_fake_timeout(struct request_queue *q)
0892 {
0893     if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) &&
0894         test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
0895         return __blk_should_fake_timeout(q);
0896     return false;
0897 }
0898
0899 /**
0900  * blk_mq_rq_from_pdu - cast a PDU to a request
0901  * @pdu: the PDU (Protocol Data Unit) to be casted
0902  *
0903  * Return: request
0904  *
0905  * Driver command data is immediately after the request. So subtract request
0906  * size to get back to the original request.
0907  */
0908 static inline struct request *blk_mq_rq_from_pdu(void *pdu)
0909 {
0910     return pdu - sizeof(struct request);
0911 }
0912
0913 /**
0914  * blk_mq_rq_to_pdu - cast a request to a PDU
0915  * @rq: the request to be casted
0916  *
0917  * Return: pointer to the PDU
0918  *
0919  * Driver command data is immediately after the request. So add request to get
0920  * the PDU.
0921  */
0922 static inline void *blk_mq_rq_to_pdu(struct request *rq)
0923 {
0924     return rq + 1;
0925 }
0926
0927 #define queue_for_each_hw_ctx(q, hctx, i)               \
0928     xa_for_each(&(q)->hctx_table, (i), (hctx))
0929
0930 #define hctx_for_each_ctx(hctx, ctx, i)                 \
0931     for ((i) = 0; (i) < (hctx)->nr_ctx &&               \
0932          ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)
0933
0934 static inline void blk_mq_cleanup_rq(struct request *rq)
0935 {
0936     if (rq->q->mq_ops->cleanup_rq)
0937         rq->q->mq_ops->cleanup_rq(rq);
0938 }
0939
0940 static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
0941         unsigned int nr_segs)
0942 {
0943     rq->nr_phys_segments = nr_segs;
0944     rq->__data_len = bio->bi_iter.bi_size;
0945     rq->bio = rq->biotail = bio;
0946     rq->ioprio = bio_prio(bio);
0947 }
0948
0949 void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
0950         struct lock_class_key *key);
0951
0952 static inline bool rq_is_sync(struct request *rq)
0953 {
0954     return op_is_sync(rq->cmd_flags);
0955 }
0956
0957 void blk_rq_init(struct request_queue *q, struct request *rq);
0958 int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
0959         struct bio_set *bs, gfp_t gfp_mask,
0960         int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
0961 void blk_rq_unprep_clone(struct request *rq);
0962 blk_status_t blk_insert_cloned_request(struct request *rq);
0963
0964 struct rq_map_data {
0965     struct page **pages;
0966     int page_order;
0967     int nr_entries;
0968     unsigned long offset;
0969     int null_mapped;
0970     int from_user;
0971 };
0972
0973 int blk_rq_map_user(struct request_queue *, struct request *,
0974         struct rq_map_data *, void __user *, unsigned long, gfp_t);
0975 int blk_rq_map_user_iov(struct request_queue *, struct request *,
0976         struct rq_map_data *, const struct iov_iter *, gfp_t);
0977 int blk_rq_unmap_user(struct bio *);
0978 int blk_rq_map_kern(struct request_queue *, struct request *, void *,
0979         unsigned int, gfp_t);
0980 int blk_rq_append_bio(struct request *rq, struct bio *bio);
0981 void blk_execute_rq_nowait(struct request *rq, bool at_head);
0982 blk_status_t blk_execute_rq(struct request *rq, bool at_head);
0983
0984 struct req_iterator {
0985     struct bvec_iter iter;
0986     struct bio *bio;
0987 };
0988
0989 #define __rq_for_each_bio(_bio, rq) \
0990     if ((rq->bio))          \
0991         for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
0992
0993 #define rq_for_each_segment(bvl, _rq, _iter)            \
0994     __rq_for_each_bio(_iter.bio, _rq)           \
0995         bio_for_each_segment(bvl, _iter.bio, _iter.iter)
0996
0997 #define rq_for_each_bvec(bvl, _rq, _iter)           \
0998     __rq_for_each_bio(_iter.bio, _rq)           \
0999         bio_for_each_bvec(bvl, _iter.bio, _iter.iter)
1000
1001 #define rq_iter_last(bvec, _iter)               \
1002         (_iter.bio->bi_next == NULL &&          \
1003          bio_iter_last(bvec, _iter.iter))
1004
1005 /*
1006  * blk_rq_pos()         : the current sector
1007  * blk_rq_bytes()       : bytes left in the entire request
1008  * blk_rq_cur_bytes()       : bytes left in the current segment
1009  * blk_rq_sectors()     : sectors left in the entire request
1010  * blk_rq_cur_sectors()     : sectors left in the current segment
1011  * blk_rq_stats_sectors()   : sectors of the entire request used for stats
1012  */
1013 static inline sector_t blk_rq_pos(const struct request *rq)
1014 {
1015     return rq->__sector;
1016 }
1017
1018 static inline unsigned int blk_rq_bytes(const struct request *rq)
1019 {
1020     return rq->__data_len;
1021 }
1022
1023 static inline int blk_rq_cur_bytes(const struct request *rq)
1024 {
1025     if (!rq->bio)
1026         return 0;
1027     if (!bio_has_data(rq->bio)) /* dataless requests such as discard */
1028         return rq->bio->bi_iter.bi_size;
1029     return bio_iovec(rq->bio).bv_len;
1030 }
1031
1032 static inline unsigned int blk_rq_sectors(const struct request *rq)
1033 {
1034     return blk_rq_bytes(rq) >> SECTOR_SHIFT;
1035 }
1036
1037 static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
1038 {
1039     return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
1040 }
1041
1042 static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
1043 {
1044     return rq->stats_sectors;
1045 }
1046
1047 /*
1048  * Some commands like WRITE SAME have a payload or data transfer size which
1049  * is different from the size of the request.  Any driver that supports such
1050  * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
1051  * calculate the data transfer size.
1052  */
1053 static inline unsigned int blk_rq_payload_bytes(struct request *rq)
1054 {
1055     if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
1056         return rq->special_vec.bv_len;
1057     return blk_rq_bytes(rq);
1058 }
1059
1060 /*
1061  * Return the first full biovec in the request.  The caller needs to check that
1062  * there are any bvecs before calling this helper.
1063  */
1064 static inline struct bio_vec req_bvec(struct request *rq)
1065 {
1066     if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
1067         return rq->special_vec;
1068     return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
1069 }
1070
1071 static inline unsigned int blk_rq_count_bios(struct request *rq)
1072 {
1073     unsigned int nr_bios = 0;
1074     struct bio *bio;
1075
1076     __rq_for_each_bio(bio, rq)
1077         nr_bios++;
1078
1079     return nr_bios;
1080 }
1081
1082 void blk_steal_bios(struct bio_list *list, struct request *rq);
1083
1084 /*
1085  * Request completion related functions.
1086  *
1087  * blk_update_request() completes given number of bytes and updates
1088  * the request without completing it.
1089  */
1090 bool blk_update_request(struct request *rq, blk_status_t error,
1091                    unsigned int nr_bytes);
1092 void blk_abort_request(struct request *);
1093
1094 /*
1095  * Number of physical segments as sent to the device.
1096  *
1097  * Normally this is the number of discontiguous data segments sent by the
1098  * submitter.  But for data-less command like discard we might have no
1099  * actual data segments submitted, but the driver might have to add it's
1100  * own special payload.  In that case we still return 1 here so that this
1101  * special payload will be mapped.
1102  */
1103 static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
1104 {
1105     if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
1106         return 1;
1107     return rq->nr_phys_segments;
1108 }
1109
1110 /*
1111  * Number of discard segments (or ranges) the driver needs to fill in.
1112  * Each discard bio merged into a request is counted as one segment.
1113  */
1114 static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
1115 {
1116     return max_t(unsigned short, rq->nr_phys_segments, 1);
1117 }
1118
1119 int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
1120         struct scatterlist *sglist, struct scatterlist **last_sg);
1121 static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
1122         struct scatterlist *sglist)
1123 {
1124     struct scatterlist *last_sg = NULL;
1125
1126     return __blk_rq_map_sg(q, rq, sglist, &last_sg);
1127 }
1128 void blk_dump_rq_flags(struct request *, char *);
1129
1130 #ifdef CONFIG_BLK_DEV_ZONED
1131 static inline unsigned int blk_rq_zone_no(struct request *rq)
1132 {
1133     return disk_zone_no(rq->q->disk, blk_rq_pos(rq));
1134 }
1135
1136 static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
1137 {
1138     return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq));
1139 }
1140
1141 bool blk_req_needs_zone_write_lock(struct request *rq);
1142 bool blk_req_zone_write_trylock(struct request *rq);
1143 void __blk_req_zone_write_lock(struct request *rq);
1144 void __blk_req_zone_write_unlock(struct request *rq);
1145
1146 static inline void blk_req_zone_write_lock(struct request *rq)
1147 {
1148     if (blk_req_needs_zone_write_lock(rq))
1149         __blk_req_zone_write_lock(rq);
1150 }
1151
1152 static inline void blk_req_zone_write_unlock(struct request *rq)
1153 {
1154     if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
1155         __blk_req_zone_write_unlock(rq);
1156 }
1157
1158 static inline bool blk_req_zone_is_write_locked(struct request *rq)
1159 {
1160     return rq->q->disk->seq_zones_wlock &&
1161         test_bit(blk_rq_zone_no(rq), rq->q->disk->seq_zones_wlock);
1162 }
1163
1164 static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
1165 {
1166     if (!blk_req_needs_zone_write_lock(rq))
1167         return true;
1168     return !blk_req_zone_is_write_locked(rq);
1169 }
1170 #else /* CONFIG_BLK_DEV_ZONED */
1171 static inline bool blk_req_needs_zone_write_lock(struct request *rq)
1172 {
1173     return false;
1174 }
1175
1176 static inline void blk_req_zone_write_lock(struct request *rq)
1177 {
1178 }
1179
1180 static inline void blk_req_zone_write_unlock(struct request *rq)
1181 {
1182 }
1183 static inline bool blk_req_zone_is_write_locked(struct request *rq)
1184 {
1185     return false;
1186 }
1187
1188 static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
1189 {
1190     return true;
1191 }
1192 #endif /* CONFIG_BLK_DEV_ZONED */
1193
1194 #endif /* BLK_MQ_H */