0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /*
0003  * Userspace block device - block device whose IO is handled from userspace
0004  *
0005  * Makes full use of io_uring passthrough commands for communicating with
0006  * the ublk userspace daemon (ublksrvd) to handle basic IO requests.
0007  *
0008  * Copyright 2022 Ming Lei <ming.lei@redhat.com>
0009  *
0010  * (part of code stolen from loop.c)
0011  */
0012 #include <linux/module.h>
0013 #include <linux/moduleparam.h>
0014 #include <linux/sched.h>
0015 #include <linux/fs.h>
0016 #include <linux/pagemap.h>
0017 #include <linux/file.h>
0018 #include <linux/stat.h>
0019 #include <linux/errno.h>
0020 #include <linux/major.h>
0021 #include <linux/wait.h>
0022 #include <linux/blkdev.h>
0023 #include <linux/init.h>
0024 #include <linux/swap.h>
0025 #include <linux/slab.h>
0026 #include <linux/compat.h>
0027 #include <linux/mutex.h>
0028 #include <linux/writeback.h>
0029 #include <linux/completion.h>
0030 #include <linux/highmem.h>
0031 #include <linux/sysfs.h>
0032 #include <linux/miscdevice.h>
0033 #include <linux/falloc.h>
0034 #include <linux/uio.h>
0035 #include <linux/ioprio.h>
0036 #include <linux/sched/mm.h>
0037 #include <linux/uaccess.h>
0038 #include <linux/cdev.h>
0039 #include <linux/io_uring.h>
0040 #include <linux/blk-mq.h>
0041 #include <linux/delay.h>
0042 #include <linux/mm.h>
0043 #include <asm/page.h>
0044 #include <linux/task_work.h>
0045 #include <uapi/linux/ublk_cmd.h>
0046 
0047 #define UBLK_MINORS     (1U << MINORBITS)
0048 
0049 /* All UBLK_F_* have to be included into UBLK_F_ALL */
0050 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
0051         | UBLK_F_URING_CMD_COMP_IN_TASK \
0052         | UBLK_F_NEED_GET_DATA)
0053 
0054 /* All UBLK_PARAM_TYPE_* should be included here */
0055 #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
0056 
0057 struct ublk_rq_data {
0058     struct callback_head work;
0059 };
0060 
0061 struct ublk_uring_cmd_pdu {
0062     struct request *req;
0063 };
0064 
0065 /*
0066  * io command is active: sqe cmd is received, and its cqe isn't done
0067  *
0068  * If the flag is set, the io command is owned by the ublk driver, and is
0069  * waiting for an incoming blk-mq request from the ublk block device.
0070  *
0071  * If the flag is cleared, the io command has been completed, and is owned
0072  * by the ublk server.
0073  */
0074 #define UBLK_IO_FLAG_ACTIVE 0x01
0075 
0076 /*
0077  * IO command is completed via cqe, is being handled by ublksrv, and
0078  * has not been committed yet
0079  *
0080  * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be
0081  * used for cross verification
0082  */
0083 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
0084 
0085 /*
0086  * IO command is aborted, so this flag is set in case of
0087  * !UBLK_IO_FLAG_ACTIVE.
0088  *
0089  * After this flag is observed, any pending or new incoming request
0090  * associated with this io command will be failed immediately
0091  */
0092 #define UBLK_IO_FLAG_ABORTED 0x04
0093 
0094 /*
0095  * UBLK_IO_FLAG_NEED_GET_DATA is set when the IO command needs to get the
0096  * data buffer address from ublksrv.
0097  *
0098  * Then, bio data can be copied into this data buffer for a WRITE request
0099  * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
0100  */
0101 #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
0102 
0103 struct ublk_io {
0104     /* userspace buffer address from io cmd */
0105     __u64   addr;
0106     unsigned int flags;
0107     int res;
0108 
0109     struct io_uring_cmd *cmd;
0110 };
0111 
0112 struct ublk_queue {
0113     int q_id;
0114     int q_depth;
0115 
0116     unsigned long flags;
0117     struct task_struct  *ubq_daemon;
0118     char *io_cmd_buf;
0119 
0120     unsigned long io_addr;  /* mapped vm address */
0121     unsigned int max_io_sz;
0122     bool abort_work_pending;
0123     unsigned short nr_io_ready; /* how many ios setup */
0124     struct ublk_device *dev;
0125     struct ublk_io ios[0];
0126 };
0127 
0128 #define UBLK_DAEMON_MONITOR_PERIOD  (5 * HZ)
0129 
0130 struct ublk_device {
0131     struct gendisk      *ub_disk;
0132 
0133     char    *__queues;
0134 
0135     unsigned short  queue_size;
0136     struct ublksrv_ctrl_dev_info    dev_info;
0137 
0138     struct blk_mq_tag_set   tag_set;
0139 
0140     struct cdev     cdev;
0141     struct device       cdev_dev;
0142 
0143 #define UB_STATE_OPEN       0
0144 #define UB_STATE_USED       1
0145     unsigned long       state;
0146     int         ub_number;
0147 
0148     struct mutex        mutex;
0149 
0150     spinlock_t      mm_lock;
0151     struct mm_struct    *mm;
0152 
0153     struct ublk_params  params;
0154 
0155     struct completion   completion;
0156     unsigned int        nr_queues_ready;
0157     atomic_t        nr_aborted_queues;
0158 
0159     /*
0160      * Our ubq->daemon may be killed without any notification, so
0161      * monitor each queue's daemon periodically
0162      */
0163     struct delayed_work monitor_work;
0164     struct work_struct  stop_work;
0165 };
0166 
0167 /* header of ublk_params */
0168 struct ublk_params_header {
0169     __u32   len;
0170     __u32   types;
0171 };
0172 
0173 static dev_t ublk_chr_devt;
0174 static struct class *ublk_chr_class;
0175 
0176 static DEFINE_IDR(ublk_index_idr);
0177 static DEFINE_SPINLOCK(ublk_idr_lock);
0178 static wait_queue_head_t ublk_idr_wq;   /* wait until one idr is freed */
0179 
0180 static DEFINE_MUTEX(ublk_ctl_mutex);
0181 
0182 static struct miscdevice ublk_misc;
0183 
0184 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
0185 {
0186     struct request_queue *q = ub->ub_disk->queue;
0187     const struct ublk_param_basic *p = &ub->params.basic;
0188 
0189     blk_queue_logical_block_size(q, 1 << p->logical_bs_shift);
0190     blk_queue_physical_block_size(q, 1 << p->physical_bs_shift);
0191     blk_queue_io_min(q, 1 << p->io_min_shift);
0192     blk_queue_io_opt(q, 1 << p->io_opt_shift);
0193 
0194     blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE,
0195             p->attrs & UBLK_ATTR_FUA);
0196     if (p->attrs & UBLK_ATTR_ROTATIONAL)
0197         blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
0198     else
0199         blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
0200 
0201     blk_queue_max_hw_sectors(q, p->max_sectors);
0202     blk_queue_chunk_sectors(q, p->chunk_sectors);
0203     blk_queue_virt_boundary(q, p->virt_boundary_mask);
0204 
0205     if (p->attrs & UBLK_ATTR_READ_ONLY)
0206         set_disk_ro(ub->ub_disk, true);
0207 
0208     set_capacity(ub->ub_disk, p->dev_sectors);
0209 }
0210 
0211 static void ublk_dev_param_discard_apply(struct ublk_device *ub)
0212 {
0213     struct request_queue *q = ub->ub_disk->queue;
0214     const struct ublk_param_discard *p = &ub->params.discard;
0215 
0216     q->limits.discard_alignment = p->discard_alignment;
0217     q->limits.discard_granularity = p->discard_granularity;
0218     blk_queue_max_discard_sectors(q, p->max_discard_sectors);
0219     blk_queue_max_write_zeroes_sectors(q,
0220             p->max_write_zeroes_sectors);
0221     blk_queue_max_discard_segments(q, p->max_discard_segments);
0222 }
0223 
0224 static int ublk_validate_params(const struct ublk_device *ub)
0225 {
0226     /* basic param is the only one which must be set */
0227     if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
0228         const struct ublk_param_basic *p = &ub->params.basic;
0229 
0230         if (p->logical_bs_shift > PAGE_SHIFT)
0231             return -EINVAL;
0232 
0233         if (p->logical_bs_shift > p->physical_bs_shift)
0234             return -EINVAL;
0235 
0236         if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
0237             return -EINVAL;
0238     } else
0239         return -EINVAL;
0240 
0241     if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
0242         const struct ublk_param_discard *p = &ub->params.discard;
0243 
0244         /* So far, only support single segment discard */
0245         if (p->max_discard_sectors && p->max_discard_segments != 1)
0246             return -EINVAL;
0247 
0248         if (!p->discard_granularity)
0249             return -EINVAL;
0250     }
0251 
0252     return 0;
0253 }
0254 
0255 static int ublk_apply_params(struct ublk_device *ub)
0256 {
0257     if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
0258         return -EINVAL;
0259 
0260     ublk_dev_param_basic_apply(ub);
0261 
0262     if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
0263         ublk_dev_param_discard_apply(ub);
0264 
0265     return 0;
0266 }
0267 
0268 static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
0269 {
0270     if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
0271             !(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK))
0272         return true;
0273     return false;
0274 }
0275 
0276 static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
0277 {
0278     if (ubq->flags & UBLK_F_NEED_GET_DATA)
0279         return true;
0280     return false;
0281 }
0282 
0283 static struct ublk_device *ublk_get_device(struct ublk_device *ub)
0284 {
0285     if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
0286         return ub;
0287     return NULL;
0288 }
0289 
0290 static void ublk_put_device(struct ublk_device *ub)
0291 {
0292     put_device(&ub->cdev_dev);
0293 }
0294 
0295 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
0296         int qid)
0297 {
0298     return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
0299 }
0300 
0301 static inline bool ublk_rq_has_data(const struct request *rq)
0302 {
0303     return rq->bio && bio_has_data(rq->bio);
0304 }
0305 
0306 static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
0307         int tag)
0308 {
0309     return (struct ublksrv_io_desc *)
0310         &(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
0311 }
0312 
0313 static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
0314 {
0315     return ublk_get_queue(ub, q_id)->io_cmd_buf;
0316 }
0317 
0318 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
0319 {
0320     struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
0321 
0322     return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc),
0323             PAGE_SIZE);
0324 }
0325 
0326 static void ublk_free_disk(struct gendisk *disk)
0327 {
0328     struct ublk_device *ub = disk->private_data;
0329 
0330     clear_bit(UB_STATE_USED, &ub->state);
0331     put_device(&ub->cdev_dev);
0332 }
0333 
0334 static const struct block_device_operations ub_fops = {
0335     .owner =    THIS_MODULE,
0336     .free_disk =    ublk_free_disk,
0337 };
0338 
0339 #define UBLK_MAX_PIN_PAGES  32
0340 
0341 struct ublk_map_data {
0342     const struct ublk_queue *ubq;
0343     const struct request *rq;
0344     const struct ublk_io *io;
0345     unsigned max_bytes;
0346 };
0347 
0348 struct ublk_io_iter {
0349     struct page *pages[UBLK_MAX_PIN_PAGES];
0350     unsigned pg_off;    /* offset in the 1st page in pages */
0351     int nr_pages;       /* how many page pointers in pages */
0352     struct bio *bio;
0353     struct bvec_iter iter;
0354 };
0355 
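     /*
      * Copy between the pinned user pages in @data and the request's bio
      * segments, advancing both iterators. @to_vm selects the direction:
      * true copies bio data into the pinned pages, false copies the pinned
      * pages back into the bio. Returns the number of bytes copied.
      */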
0356 static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data,
0357         unsigned max_bytes, bool to_vm)
0358 {
0359     const unsigned total = min_t(unsigned, max_bytes,
0360             PAGE_SIZE - data->pg_off +
0361             ((data->nr_pages - 1) << PAGE_SHIFT));
0362     unsigned done = 0;
0363     unsigned pg_idx = 0;
0364 
0365     while (done < total) {
0366         struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
0367         const unsigned int bytes = min3(bv.bv_len, total - done,
0368                 (unsigned)(PAGE_SIZE - data->pg_off));
0369         void *bv_buf = bvec_kmap_local(&bv);
0370         void *pg_buf = kmap_local_page(data->pages[pg_idx]);
0371 
0372         if (to_vm)
0373             memcpy(pg_buf + data->pg_off, bv_buf, bytes);
0374         else
0375             memcpy(bv_buf, pg_buf + data->pg_off, bytes);
0376 
0377         kunmap_local(pg_buf);
0378         kunmap_local(bv_buf);
0379 
0380         /* advance page array */
0381         data->pg_off += bytes;
0382         if (data->pg_off == PAGE_SIZE) {
0383             pg_idx += 1;
0384             data->pg_off = 0;
0385         }
0386 
0387         done += bytes;
0388 
0389         /* advance bio */
0390         bio_advance_iter_single(data->bio, &data->iter, bytes);
0391         if (!data->iter.bi_size) {
0392             data->bio = data->bio->bi_next;
0393             if (data->bio == NULL)
0394                 break;
0395             data->iter = data->bio->bi_iter;
0396         }
0397     }
0398 
0399     return done;
0400 }
0401 
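     /*
      * Pin the userspace buffer at io->addr with get_user_pages_fast() in
      * batches of up to UBLK_MAX_PIN_PAGES pages and copy request data
      * to/from it via ublk_copy_io_pages(). Pages are released after each
      * batch (and dirtied when written to); returns the number of pages
      * processed.
      */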
0402 static inline int ublk_copy_user_pages(struct ublk_map_data *data,
0403         bool to_vm)
0404 {
0405     const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0;
0406     const unsigned long start_vm = data->io->addr;
0407     unsigned int done = 0;
0408     struct ublk_io_iter iter = {
0409         .pg_off = start_vm & (PAGE_SIZE - 1),
0410         .bio    = data->rq->bio,
0411         .iter   = data->rq->bio->bi_iter,
0412     };
0413     const unsigned int nr_pages = round_up(data->max_bytes +
0414             (start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT;
0415 
0416     while (done < nr_pages) {
0417         const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES,
0418                 nr_pages - done);
0419         unsigned i, len;
0420 
0421         iter.nr_pages = get_user_pages_fast(start_vm +
0422                 (done << PAGE_SHIFT), to_pin, gup_flags,
0423                 iter.pages);
0424         if (iter.nr_pages <= 0)
0425             return done == 0 ? iter.nr_pages : done;
0426         len = ublk_copy_io_pages(&iter, data->max_bytes, to_vm);
0427         for (i = 0; i < iter.nr_pages; i++) {
0428             if (to_vm)
0429                 set_page_dirty(iter.pages[i]);
0430             put_page(iter.pages[i]);
0431         }
0432         data->max_bytes -= len;
0433         done += iter.nr_pages;
0434     }
0435 
0436     return done;
0437 }
0438 
0439 static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
0440         struct ublk_io *io)
0441 {
0442     const unsigned int rq_bytes = blk_rq_bytes(req);
0443     /*
0444      * no zero copy, we delay copying WRITE request data into the ublksrv
0445      * context and the big benefit is that pinning pages in the current
0446      * context is pretty fast, see ublk_copy_user_pages
0447      */
0448     if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
0449         return rq_bytes;
0450 
0451     if (ublk_rq_has_data(req)) {
0452         struct ublk_map_data data = {
0453             .ubq    =   ubq,
0454             .rq =   req,
0455             .io =   io,
0456             .max_bytes =    rq_bytes,
0457         };
0458 
0459         ublk_copy_user_pages(&data, true);
0460 
0461         return rq_bytes - data.max_bytes;
0462     }
0463     return rq_bytes;
0464 }
0465 
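     /*
      * For a completed READ request, copy the data the ublk server placed in
      * its buffer (io->addr) back into the request's bio pages and return the
      * number of bytes copied (at most io->res); other requests just return
      * blk_rq_bytes().
      */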
0466 static int ublk_unmap_io(const struct ublk_queue *ubq,
0467         const struct request *req,
0468         struct ublk_io *io)
0469 {
0470     const unsigned int rq_bytes = blk_rq_bytes(req);
0471 
0472     if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) {
0473         struct ublk_map_data data = {
0474             .ubq    =   ubq,
0475             .rq =   req,
0476             .io =   io,
0477             .max_bytes =    io->res,
0478         };
0479 
0480         WARN_ON_ONCE(io->res > rq_bytes);
0481 
0482         ublk_copy_user_pages(&data, false);
0483 
0484         return io->res - data.max_bytes;
0485     }
0486     return rq_bytes;
0487 }
0488 
0489 static inline unsigned int ublk_req_build_flags(struct request *req)
0490 {
0491     unsigned flags = 0;
0492 
0493     if (req->cmd_flags & REQ_FAILFAST_DEV)
0494         flags |= UBLK_IO_F_FAILFAST_DEV;
0495 
0496     if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
0497         flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
0498 
0499     if (req->cmd_flags & REQ_FAILFAST_DRIVER)
0500         flags |= UBLK_IO_F_FAILFAST_DRIVER;
0501 
0502     if (req->cmd_flags & REQ_META)
0503         flags |= UBLK_IO_F_META;
0504 
0505     if (req->cmd_flags & REQ_FUA)
0506         flags |= UBLK_IO_F_FUA;
0507 
0508     if (req->cmd_flags & REQ_NOUNMAP)
0509         flags |= UBLK_IO_F_NOUNMAP;
0510 
0511     if (req->cmd_flags & REQ_SWAP)
0512         flags |= UBLK_IO_F_SWAP;
0513 
0514     return flags;
0515 }
0516 
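     /*
      * Translate a blk-mq request into the ublksrv_io_desc slot (indexed by
      * tag) of the per-queue command buffer that is mmap'ed by the ublk
      * server.
      */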
0517 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
0518 {
0519     struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
0520     struct ublk_io *io = &ubq->ios[req->tag];
0521     u32 ublk_op;
0522 
0523     switch (req_op(req)) {
0524     case REQ_OP_READ:
0525         ublk_op = UBLK_IO_OP_READ;
0526         break;
0527     case REQ_OP_WRITE:
0528         ublk_op = UBLK_IO_OP_WRITE;
0529         break;
0530     case REQ_OP_FLUSH:
0531         ublk_op = UBLK_IO_OP_FLUSH;
0532         break;
0533     case REQ_OP_DISCARD:
0534         ublk_op = UBLK_IO_OP_DISCARD;
0535         break;
0536     case REQ_OP_WRITE_ZEROES:
0537         ublk_op = UBLK_IO_OP_WRITE_ZEROES;
0538         break;
0539     default:
0540         return BLK_STS_IOERR;
0541     }
0542 
0543     /* need to translate since kernel may change */
0544     iod->op_flags = ublk_op | ublk_req_build_flags(req);
0545     iod->nr_sectors = blk_rq_sectors(req);
0546     iod->start_sector = blk_rq_pos(req);
0547     iod->addr = io->addr;
0548 
0549     return BLK_STS_OK;
0550 }
0551 
0552 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
0553         struct io_uring_cmd *ioucmd)
0554 {
0555     return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu;
0556 }
0557 
0558 static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
0559 {
0560     return ubq->ubq_daemon->flags & PF_EXITING;
0561 }
0562 
0563 /* todo: handle partial completion */
0564 static void ublk_complete_rq(struct request *req)
0565 {
0566     struct ublk_queue *ubq = req->mq_hctx->driver_data;
0567     struct ublk_io *io = &ubq->ios[req->tag];
0568     unsigned int unmapped_bytes;
0569 
0570     /* fail the read IO if nothing was read */
0571     if (!io->res && req_op(req) == REQ_OP_READ)
0572         io->res = -EIO;
0573 
0574     if (io->res < 0) {
0575         blk_mq_end_request(req, errno_to_blk_status(io->res));
0576         return;
0577     }
0578 
0579     /*
0580      * FLUSH or DISCARD usually won't return a byte count, so end them
0581      * directly.
0582      *
0583      * Neither of the two needs unmapping.
0584      */
0585     if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) {
0586         blk_mq_end_request(req, BLK_STS_OK);
0587         return;
0588     }
0589 
0590     /* for a READ request, copy the data at iod->addr back into the rq buffers */
0591     unmapped_bytes = ublk_unmap_io(ubq, req, io);
0592 
0593     /*
0594      * Extremely unlikely since the data was filled in just before.
0595      *
0596      * Simply truncate io->res for this unlikely case.
0597      */
0598     if (unlikely(unmapped_bytes < io->res))
0599         io->res = unmapped_bytes;
0600 
0601     if (blk_update_request(req, BLK_STS_OK, io->res))
0602         blk_mq_requeue_request(req, true);
0603     else
0604         __blk_mq_end_request(req, BLK_STS_OK);
0605 }
0606 
0607 /*
0608  * Since __ublk_rq_task_work always fails requests immediately during
0609  * exiting, __ublk_fail_req() is only called from abort context during
0610  * exiting. So no lock is needed.
0611  *
0612  * Also, aborting may not have started yet; keep in mind that one failed
0613  * request may be issued by the block layer again.
0614  */
0615 static void __ublk_fail_req(struct ublk_io *io, struct request *req)
0616 {
0617     WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
0618 
0619     if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
0620         io->flags |= UBLK_IO_FLAG_ABORTED;
0621         blk_mq_end_request(req, BLK_STS_IOERR);
0622     }
0623 }
0624 
0625 static void ubq_complete_io_cmd(struct ublk_io *io, int res)
0626 {
0627     /* mark this cmd owned by ublksrv */
0628     io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
0629 
0630     /*
0631      * clear ACTIVE since we are done with this sqe/cmd slot
0632      * A new io cmd can only be accepted while the slot is not active.
0633      */
0634     io->flags &= ~UBLK_IO_FLAG_ACTIVE;
0635 
0636     /* tell ublksrv one io request is coming */
0637     io_uring_cmd_done(io->cmd, res, 0);
0638 }
0639 
0640 #define UBLK_REQUEUE_DELAY_MS   3
0641 
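     /*
      * Runs in the ubq_daemon task context: fail the request if the daemon is
      * exiting; otherwise map WRITE data into the server buffer (or first ask
      * the server for a buffer via UBLK_IO_RES_NEED_GET_DATA), then complete
      * the pending io command with UBLK_IO_RES_OK so ublksrv handles the
      * request.
      */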
0642 static inline void __ublk_rq_task_work(struct request *req)
0643 {
0644     struct ublk_queue *ubq = req->mq_hctx->driver_data;
0645     struct ublk_device *ub = ubq->dev;
0646     int tag = req->tag;
0647     struct ublk_io *io = &ubq->ios[tag];
0648     bool task_exiting = current != ubq->ubq_daemon || ubq_daemon_is_dying(ubq);
0649     unsigned int mapped_bytes;
0650 
0651     pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
0652             __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
0653             ublk_get_iod(ubq, req->tag)->addr);
0654 
0655     if (unlikely(task_exiting)) {
0656         blk_mq_end_request(req, BLK_STS_IOERR);
0657         mod_delayed_work(system_wq, &ub->monitor_work, 0);
0658         return;
0659     }
0660 
0661     if (ublk_need_get_data(ubq) &&
0662             (req_op(req) == REQ_OP_WRITE ||
0663             req_op(req) == REQ_OP_FLUSH)) {
0664         /*
0665          * We have not handled the UBLK_IO_NEED_GET_DATA command yet,
0666          * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
0667          * and notify it.
0668          */
0669         if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
0670             io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
0671             pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
0672                     __func__, io->cmd->cmd_op, ubq->q_id,
0673                     req->tag, io->flags);
0674             ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA);
0675             return;
0676         }
0677         /*
0678          * We have handled the UBLK_IO_NEED_GET_DATA command,
0679          * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
0680          * do the copy work.
0681          */
0682         io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
0683         /* update iod->addr because ublksrv may have passed a new io buffer */
0684         ublk_get_iod(ubq, req->tag)->addr = io->addr;
0685         pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
0686                 __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
0687                 ublk_get_iod(ubq, req->tag)->addr);
0688     }
0689 
0690     mapped_bytes = ublk_map_io(ubq, req, io);
0691 
0692     /* partially mapped, update io descriptor */
0693     if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
0694         /*
0695          * Nothing mapped, retry until we succeed.
0696          *
0697          * We may never succeed in mapping any bytes here because
0698          * of OOM. TODO: reserve one buffer with single page pinned
0699          * for providing forward progress guarantee.
0700          */
0701         if (unlikely(!mapped_bytes)) {
0702             blk_mq_requeue_request(req, false);
0703             blk_mq_delay_kick_requeue_list(req->q,
0704                     UBLK_REQUEUE_DELAY_MS);
0705             return;
0706         }
0707 
0708         ublk_get_iod(ubq, req->tag)->nr_sectors =
0709             mapped_bytes >> 9;
0710     }
0711 
0712     ubq_complete_io_cmd(io, UBLK_IO_RES_OK);
0713 }
0714 
0715 static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
0716 {
0717     struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
0718 
0719     __ublk_rq_task_work(pdu->req);
0720 }
0721 
0722 static void ublk_rq_task_work_fn(struct callback_head *work)
0723 {
0724     struct ublk_rq_data *data = container_of(work,
0725             struct ublk_rq_data, work);
0726     struct request *req = blk_mq_rq_from_pdu(data);
0727 
0728     __ublk_rq_task_work(req);
0729 }
0730 
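     /*
      * .queue_rq: fill the io descriptor and dispatch the request to the
      * queue's daemon, either via task_work_add() or via
      * io_uring_cmd_complete_in_task() on the pending io command.
      */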
0731 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
0732         const struct blk_mq_queue_data *bd)
0733 {
0734     struct ublk_queue *ubq = hctx->driver_data;
0735     struct request *rq = bd->rq;
0736     blk_status_t res;
0737 
0738     /* fill iod to slot in io cmd buffer */
0739     res = ublk_setup_iod(ubq, rq);
0740     if (unlikely(res != BLK_STS_OK))
0741         return BLK_STS_IOERR;
0742 
0743     blk_mq_start_request(bd->rq);
0744 
0745     if (unlikely(ubq_daemon_is_dying(ubq))) {
0746  fail:
0747         mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
0748         return BLK_STS_IOERR;
0749     }
0750 
0751     if (ublk_can_use_task_work(ubq)) {
0752         struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
0753         enum task_work_notify_mode notify_mode = bd->last ?
0754             TWA_SIGNAL_NO_IPI : TWA_NONE;
0755 
0756         if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode))
0757             goto fail;
0758     } else {
0759         struct ublk_io *io = &ubq->ios[rq->tag];
0760         struct io_uring_cmd *cmd = io->cmd;
0761         struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
0762 
0763         /*
0764          * If the check passes, we know that this is a re-issued request aborted
0765          * previously in monitor_work because the ubq_daemon (cmd's task) is
0766          * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore
0767          * because this ioucmd's io_uring context may be freed now if no inflight
0768          * ioucmd exists. Otherwise we may cause a null-deref in ctx->fallback_work.
0769          *
0770          * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request (releasing
0771          * the tag). Then the request is re-started (allocating the tag) and we are here.
0772          * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
0773          * guarantees that this is a re-issued request aborted previously.
0774          */
0775         if ((io->flags & UBLK_IO_FLAG_ABORTED))
0776             goto fail;
0777 
0778         pdu->req = rq;
0779         io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
0780     }
0781 
0782     return BLK_STS_OK;
0783 }
0784 
0785 static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
0786 {
0787     struct ublk_queue *ubq = hctx->driver_data;
0788 
0789     if (ublk_can_use_task_work(ubq))
0790         __set_notify_signal(ubq->ubq_daemon);
0791 }
0792 
0793 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
0794         unsigned int hctx_idx)
0795 {
0796     struct ublk_device *ub = driver_data;
0797     struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
0798 
0799     hctx->driver_data = ubq;
0800     return 0;
0801 }
0802 
0803 static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req,
0804         unsigned int hctx_idx, unsigned int numa_node)
0805 {
0806     struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
0807 
0808     init_task_work(&data->work, ublk_rq_task_work_fn);
0809     return 0;
0810 }
0811 
0812 static const struct blk_mq_ops ublk_mq_ops = {
0813     .queue_rq       = ublk_queue_rq,
0814     .commit_rqs     = ublk_commit_rqs,
0815     .init_hctx  = ublk_init_hctx,
0816     .init_request   = ublk_init_rq,
0817 };
0818 
0819 static int ublk_ch_open(struct inode *inode, struct file *filp)
0820 {
0821     struct ublk_device *ub = container_of(inode->i_cdev,
0822             struct ublk_device, cdev);
0823 
0824     if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
0825         return -EBUSY;
0826     filp->private_data = ub;
0827     return 0;
0828 }
0829 
0830 static int ublk_ch_release(struct inode *inode, struct file *filp)
0831 {
0832     struct ublk_device *ub = filp->private_data;
0833 
0834     clear_bit(UB_STATE_OPEN, &ub->state);
0835     return 0;
0836 }
0837 
0838 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
0839 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
0840 {
0841     struct ublk_device *ub = filp->private_data;
0842     size_t sz = vma->vm_end - vma->vm_start;
0843     unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
0844     unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
0845     int q_id, ret = 0;
0846 
0847     spin_lock(&ub->mm_lock);
0848     if (!ub->mm)
0849         ub->mm = current->mm;
0850     if (current->mm != ub->mm)
0851         ret = -EINVAL;
0852     spin_unlock(&ub->mm_lock);
0853 
0854     if (ret)
0855         return ret;
0856 
0857     if (vma->vm_flags & VM_WRITE)
0858         return -EPERM;
0859 
0860     end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
0861     if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
0862         return -EINVAL;
0863 
0864     q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
0865     pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
0866             __func__, q_id, current->pid, vma->vm_start,
0867             phys_off, (unsigned long)sz);
0868 
0869     if (sz != ublk_queue_cmd_buf_size(ub, q_id))
0870         return -EINVAL;
0871 
0872     pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
0873     return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
0874 }
0875 
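     /*
      * Commit the result reported by the server for one io command and
      * complete the corresponding blk-mq request.
      */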
0876 static void ublk_commit_completion(struct ublk_device *ub,
0877         struct ublksrv_io_cmd *ub_cmd)
0878 {
0879     u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
0880     struct ublk_queue *ubq = ublk_get_queue(ub, qid);
0881     struct ublk_io *io = &ubq->ios[tag];
0882     struct request *req;
0883 
0884     /* now this cmd slot is owned by the ublk driver */
0885     io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
0886     io->res = ub_cmd->result;
0887 
0888     /* find the io request and complete */
0889     req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
0890 
0891     if (req && likely(!blk_should_fake_timeout(req->q)))
0892         ublk_complete_rq(req);
0893 }
0894 
0895 /*
0896  * When ->ubq_daemon is exiting, either new requests are ended immediately,
0897  * or any queued io commands are drained, so it is safe to abort the queue
0898  * locklessly
0899  */
0900 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
0901 {
0902     int i;
0903 
0904     if (!ublk_get_device(ub))
0905         return;
0906 
0907     for (i = 0; i < ubq->q_depth; i++) {
0908         struct ublk_io *io = &ubq->ios[i];
0909 
0910         if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
0911             struct request *rq;
0912 
0913             /*
0914              * Either we fail the request or ublk_rq_task_work_fn
0915              * will do it
0916              */
0917             rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
0918             if (rq)
0919                 __ublk_fail_req(io, rq);
0920         }
0921     }
0922     ublk_put_device(ub);
0923 }
0924 
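     /*
      * Periodically check each queue's daemon; if one is dying, schedule
      * stop_work and abort the queue so inflight requests are failed instead
      * of hanging.
      */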
0925 static void ublk_daemon_monitor_work(struct work_struct *work)
0926 {
0927     struct ublk_device *ub =
0928         container_of(work, struct ublk_device, monitor_work.work);
0929     int i;
0930 
0931     for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
0932         struct ublk_queue *ubq = ublk_get_queue(ub, i);
0933 
0934         if (ubq_daemon_is_dying(ubq)) {
0935             schedule_work(&ub->stop_work);
0936 
0937             /* abort the queue to make forward progress */
0938             ublk_abort_queue(ub, ubq);
0939         }
0940     }
0941 
0942     /*
0943      * We can't schedule monitor work after ublk_remove() is started.
0944      *
0945      * No need for ub->mutex: monitor work is canceled after the state is
0946      * marked as DEAD, so the DEAD state is observed reliably.
0947      */
0948     if (ub->dev_info.state != UBLK_S_DEV_DEAD)
0949         schedule_delayed_work(&ub->monitor_work,
0950                 UBLK_DAEMON_MONITOR_PERIOD);
0951 }
0952 
0953 static inline bool ublk_queue_ready(struct ublk_queue *ubq)
0954 {
0955     return ubq->nr_io_ready == ubq->q_depth;
0956 }
0957 
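     /*
      * Complete every still-active io command with UBLK_IO_RES_ABORT so the
      * server's pending SQEs are released, and mark the queue as not ready.
      */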
0958 static void ublk_cancel_queue(struct ublk_queue *ubq)
0959 {
0960     int i;
0961 
0962     if (!ublk_queue_ready(ubq))
0963         return;
0964 
0965     for (i = 0; i < ubq->q_depth; i++) {
0966         struct ublk_io *io = &ubq->ios[i];
0967 
0968         if (io->flags & UBLK_IO_FLAG_ACTIVE)
0969             io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0);
0970     }
0971 
0972     /* all io commands are canceled */
0973     ubq->nr_io_ready = 0;
0974 }
0975 
0976 /* Cancel all pending commands, must be called after del_gendisk() returns */
0977 static void ublk_cancel_dev(struct ublk_device *ub)
0978 {
0979     int i;
0980 
0981     for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
0982         ublk_cancel_queue(ublk_get_queue(ub, i));
0983 }
0984 
0985 static void ublk_stop_dev(struct ublk_device *ub)
0986 {
0987     mutex_lock(&ub->mutex);
0988     if (ub->dev_info.state != UBLK_S_DEV_LIVE)
0989         goto unlock;
0990 
0991     del_gendisk(ub->ub_disk);
0992     ub->dev_info.state = UBLK_S_DEV_DEAD;
0993     ub->dev_info.ublksrv_pid = -1;
0994     put_disk(ub->ub_disk);
0995     ub->ub_disk = NULL;
0996  unlock:
0997     ublk_cancel_dev(ub);
0998     mutex_unlock(&ub->mutex);
0999     cancel_delayed_work_sync(&ub->monitor_work);
1000 }
1001 
1002 /* device can only be started after all IOs are ready */
1003 static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
1004 {
1005     mutex_lock(&ub->mutex);
1006     ubq->nr_io_ready++;
1007     if (ublk_queue_ready(ubq)) {
1008         ubq->ubq_daemon = current;
1009         get_task_struct(ubq->ubq_daemon);
1010         ub->nr_queues_ready++;
1011     }
1012     if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
1013         complete_all(&ub->completion);
1014     mutex_unlock(&ub->mutex);
1015 }
1016 
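     /*
      * Re-dispatch the request to the daemon context so __ublk_rq_task_work()
      * can copy WRITE data into the buffer the server has just provided via
      * UBLK_IO_NEED_GET_DATA.
      */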
1017 static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
1018         int tag, struct io_uring_cmd *cmd)
1019 {
1020     struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
1021     struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
1022 
1023     if (ublk_can_use_task_work(ubq)) {
1024         struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
1025 
1026         /* should not fail since we only call it from the ubq->ubq_daemon task */
1027         task_work_add(ubq->ubq_daemon, &data->work, TWA_SIGNAL_NO_IPI);
1028     } else {
1029         struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1030 
1031         pdu->req = req;
1032         io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
1033     }
1034 }
1035 
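     /*
      * Handle the UBLK_IO_* uring commands issued by the ublk server on the
      * per-device char device (ublkcN). The normal flow: the server queues
      * UBLK_IO_FETCH_REQ for every tag, the driver completes a command when a
      * request arrives, and the server handles the IO and then issues
      * UBLK_IO_COMMIT_AND_FETCH_REQ with the result, which commits the old
      * request and re-arms the tag.
      */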
1036 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
1037 {
1038     struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
1039     struct ublk_device *ub = cmd->file->private_data;
1040     struct ublk_queue *ubq;
1041     struct ublk_io *io;
1042     u32 cmd_op = cmd->cmd_op;
1043     unsigned tag = ub_cmd->tag;
1044     int ret = -EINVAL;
1045 
1046     pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
1047             __func__, cmd->cmd_op, ub_cmd->q_id, tag,
1048             ub_cmd->result);
1049 
1050     if (!(issue_flags & IO_URING_F_SQE128))
1051         goto out;
1052 
1053     if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
1054         goto out;
1055 
1056     ubq = ublk_get_queue(ub, ub_cmd->q_id);
1057     if (!ubq || ub_cmd->q_id != ubq->q_id)
1058         goto out;
1059 
1060     if (ubq->ubq_daemon && ubq->ubq_daemon != current)
1061         goto out;
1062 
1063     if (tag >= ubq->q_depth)
1064         goto out;
1065 
1066     io = &ubq->ios[tag];
1067 
1068     /* there is pending io cmd, something must be wrong */
1069     if (io->flags & UBLK_IO_FLAG_ACTIVE) {
1070         ret = -EBUSY;
1071         goto out;
1072     }
1073 
1074     /*
1075      * ensure that the user issues UBLK_IO_NEED_GET_DATA
1076      * iff the driver has set UBLK_IO_FLAG_NEED_GET_DATA.
1077      */
1078     if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
1079             ^ (cmd_op == UBLK_IO_NEED_GET_DATA))
1080         goto out;
1081 
1082     switch (cmd_op) {
1083     case UBLK_IO_FETCH_REQ:
1084         /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
1085         if (ublk_queue_ready(ubq)) {
1086             ret = -EBUSY;
1087             goto out;
1088         }
1089         /*
1090          * The io is being handled by the server, so COMMIT_AND_FETCH_REQ
1091          * is expected instead of FETCH_REQ
1092          */
1093         if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
1094             goto out;
1095         /* FETCH_REQ has to provide IO buffer */
1096         if (!ub_cmd->addr)
1097             goto out;
1098         io->cmd = cmd;
1099         io->flags |= UBLK_IO_FLAG_ACTIVE;
1100         io->addr = ub_cmd->addr;
1101 
1102         ublk_mark_io_ready(ub, ubq);
1103         break;
1104     case UBLK_IO_COMMIT_AND_FETCH_REQ:
1105         /* COMMIT_AND_FETCH_REQ has to provide IO buffer */
1106         if (!ub_cmd->addr)
1107             goto out;
1108         if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
1109             goto out;
1110         io->addr = ub_cmd->addr;
1111         io->flags |= UBLK_IO_FLAG_ACTIVE;
1112         io->cmd = cmd;
1113         ublk_commit_completion(ub, ub_cmd);
1114         break;
1115     case UBLK_IO_NEED_GET_DATA:
1116         if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
1117             goto out;
1118         io->addr = ub_cmd->addr;
1119         io->cmd = cmd;
1120         io->flags |= UBLK_IO_FLAG_ACTIVE;
1121         ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag, cmd);
1122         break;
1123     default:
1124         goto out;
1125     }
1126     return -EIOCBQUEUED;
1127 
1128  out:
1129     io_uring_cmd_done(cmd, ret, 0);
1130     pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
1131             __func__, cmd_op, tag, ret, io->flags);
1132     return -EIOCBQUEUED;
1133 }
1134 
1135 static const struct file_operations ublk_ch_fops = {
1136     .owner = THIS_MODULE,
1137     .open = ublk_ch_open,
1138     .release = ublk_ch_release,
1139     .llseek = no_llseek,
1140     .uring_cmd = ublk_ch_uring_cmd,
1141     .mmap = ublk_ch_mmap,
1142 };
1143 
1144 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
1145 {
1146     int size = ublk_queue_cmd_buf_size(ub, q_id);
1147     struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
1148 
1149     if (ubq->ubq_daemon)
1150         put_task_struct(ubq->ubq_daemon);
1151     if (ubq->io_cmd_buf)
1152         free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
1153 }
1154 
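     /*
      * Allocate the per-queue command buffer (an array of ublksrv_io_desc,
      * one per tag), which is later mmap'ed by the ublk server via
      * ublk_ch_mmap().
      */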
1155 static int ublk_init_queue(struct ublk_device *ub, int q_id)
1156 {
1157     struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
1158     gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
1159     void *ptr;
1160     int size;
1161 
1162     ubq->flags = ub->dev_info.flags;
1163     ubq->q_id = q_id;
1164     ubq->q_depth = ub->dev_info.queue_depth;
1165     size = ublk_queue_cmd_buf_size(ub, q_id);
1166 
1167     ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
1168     if (!ptr)
1169         return -ENOMEM;
1170 
1171     ubq->io_cmd_buf = ptr;
1172     ubq->dev = ub;
1173     return 0;
1174 }
1175 
1176 static void ublk_deinit_queues(struct ublk_device *ub)
1177 {
1178     int nr_queues = ub->dev_info.nr_hw_queues;
1179     int i;
1180 
1181     if (!ub->__queues)
1182         return;
1183 
1184     for (i = 0; i < nr_queues; i++)
1185         ublk_deinit_queue(ub, i);
1186     kfree(ub->__queues);
1187 }
1188 
1189 static int ublk_init_queues(struct ublk_device *ub)
1190 {
1191     int nr_queues = ub->dev_info.nr_hw_queues;
1192     int depth = ub->dev_info.queue_depth;
1193     int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
1194     int i, ret = -ENOMEM;
1195 
1196     ub->queue_size = ubq_size;
1197     ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL);
1198     if (!ub->__queues)
1199         return ret;
1200 
1201     for (i = 0; i < nr_queues; i++) {
1202         if (ublk_init_queue(ub, i))
1203             goto fail;
1204     }
1205 
1206     init_completion(&ub->completion);
1207     return 0;
1208 
1209  fail:
1210     ublk_deinit_queues(ub);
1211     return ret;
1212 }
1213 
1214 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
1215 {
1216     int i = idx;
1217     int err;
1218 
1219     spin_lock(&ublk_idr_lock);
1220     /* allocate id, if @idx >= 0, we're requesting that specific id */
1221     if (i >= 0) {
1222         err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
1223         if (err == -ENOSPC)
1224             err = -EEXIST;
1225     } else {
1226         err = idr_alloc(&ublk_index_idr, ub, 0, 0, GFP_NOWAIT);
1227     }
1228     spin_unlock(&ublk_idr_lock);
1229 
1230     if (err >= 0)
1231         ub->ub_number = err;
1232 
1233     return err;
1234 }
1235 
1236 static void ublk_free_dev_number(struct ublk_device *ub)
1237 {
1238     spin_lock(&ublk_idr_lock);
1239     idr_remove(&ublk_index_idr, ub->ub_number);
1240     wake_up_all(&ublk_idr_wq);
1241     spin_unlock(&ublk_idr_lock);
1242 }
1243 
1244 static void ublk_cdev_rel(struct device *dev)
1245 {
1246     struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
1247 
1248     blk_mq_free_tag_set(&ub->tag_set);
1249     ublk_deinit_queues(ub);
1250     ublk_free_dev_number(ub);
1251     mutex_destroy(&ub->mutex);
1252     kfree(ub);
1253 }
1254 
1255 static int ublk_add_chdev(struct ublk_device *ub)
1256 {
1257     struct device *dev = &ub->cdev_dev;
1258     int minor = ub->ub_number;
1259     int ret;
1260 
1261     dev->parent = ublk_misc.this_device;
1262     dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
1263     dev->class = ublk_chr_class;
1264     dev->release = ublk_cdev_rel;
1265     device_initialize(dev);
1266 
1267     ret = dev_set_name(dev, "ublkc%d", minor);
1268     if (ret)
1269         goto fail;
1270 
1271     cdev_init(&ub->cdev, &ublk_ch_fops);
1272     ret = cdev_device_add(&ub->cdev, dev);
1273     if (ret)
1274         goto fail;
1275     return 0;
1276  fail:
1277     put_device(dev);
1278     return ret;
1279 }
1280 
1281 static void ublk_stop_work_fn(struct work_struct *work)
1282 {
1283     struct ublk_device *ub =
1284         container_of(work, struct ublk_device, stop_work);
1285 
1286     ublk_stop_dev(ub);
1287 }
1288 
1289 /* align max io buffer size with PAGE_SIZE */
1290 static void ublk_align_max_io_size(struct ublk_device *ub)
1291 {
1292     unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
1293 
1294     ub->dev_info.max_io_buf_bytes =
1295         round_down(max_io_bytes, PAGE_SIZE);
1296 }
1297 
1298 static int ublk_add_tag_set(struct ublk_device *ub)
1299 {
1300     ub->tag_set.ops = &ublk_mq_ops;
1301     ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
1302     ub->tag_set.queue_depth = ub->dev_info.queue_depth;
1303     ub->tag_set.numa_node = NUMA_NO_NODE;
1304     ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
1305     ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
1306     ub->tag_set.driver_data = ub;
1307     return blk_mq_alloc_tag_set(&ub->tag_set);
1308 }
1309 
1310 static void ublk_remove(struct ublk_device *ub)
1311 {
1312     ublk_stop_dev(ub);
1313     cancel_work_sync(&ub->stop_work);
1314     cdev_device_del(&ub->cdev, &ub->cdev_dev);
1315     put_device(&ub->cdev_dev);
1316 }
1317 
1318 static struct ublk_device *ublk_get_device_from_id(int idx)
1319 {
1320     struct ublk_device *ub = NULL;
1321 
1322     if (idx < 0)
1323         return NULL;
1324 
1325     spin_lock(&ublk_idr_lock);
1326     ub = idr_find(&ublk_index_idr, idx);
1327     if (ub)
1328         ub = ublk_get_device(ub);
1329     spin_unlock(&ublk_idr_lock);
1330 
1331     return ub;
1332 }
1333 
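     /*
      * START_DEV: wait until every queue has all of its io commands fetched,
      * then allocate the gendisk, apply the staged parameters, add the ublkbN
      * disk and mark the device LIVE.
      */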
1334 static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
1335 {
1336     struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1337     int ublksrv_pid = (int)header->data[0];
1338     struct ublk_device *ub;
1339     struct gendisk *disk;
1340     int ret = -EINVAL;
1341 
1342     if (ublksrv_pid <= 0)
1343         return -EINVAL;
1344 
1345     ub = ublk_get_device_from_id(header->dev_id);
1346     if (!ub)
1347         return -EINVAL;
1348 
1349     wait_for_completion_interruptible(&ub->completion);
1350 
1351     schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
1352 
1353     mutex_lock(&ub->mutex);
1354     if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
1355         test_bit(UB_STATE_USED, &ub->state)) {
1356         ret = -EEXIST;
1357         goto out_unlock;
1358     }
1359 
1360     disk = blk_mq_alloc_disk(&ub->tag_set, ub);
1361     if (IS_ERR(disk)) {
1362         ret = PTR_ERR(disk);
1363         goto out_unlock;
1364     }
1365     sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
1366     disk->fops = &ub_fops;
1367     disk->private_data = ub;
1368 
1369     ub->dev_info.ublksrv_pid = ublksrv_pid;
1370     ub->ub_disk = disk;
1371 
1372     ret = ublk_apply_params(ub);
1373     if (ret)
1374         goto out_put_disk;
1375 
1376     get_device(&ub->cdev_dev);
1377     ret = add_disk(disk);
1378     if (ret) {
1379         /*
1380          * Has to drop the reference since ->free_disk won't be
1381          * called in case of add_disk failure.
1382          */
1383         ublk_put_device(ub);
1384         goto out_put_disk;
1385     }
1386     set_bit(UB_STATE_USED, &ub->state);
1387     ub->dev_info.state = UBLK_S_DEV_LIVE;
1388 out_put_disk:
1389     if (ret)
1390         put_disk(disk);
1391 out_unlock:
1392     mutex_unlock(&ub->mutex);
1393     ublk_put_device(ub);
1394     return ret;
1395 }
1396 
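     /*
      * GET_QUEUE_AFFINITY: report the cpumask of CPUs mapped to the given hw
      * queue, so the server can pin each queue's daemon thread accordingly.
      */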
1397 static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd)
1398 {
1399     struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1400     void __user *argp = (void __user *)(unsigned long)header->addr;
1401     struct ublk_device *ub;
1402     cpumask_var_t cpumask;
1403     unsigned long queue;
1404     unsigned int retlen;
1405     unsigned int i;
1406     int ret = -EINVAL;
1407     
1408     if (header->len * BITS_PER_BYTE < nr_cpu_ids)
1409         return -EINVAL;
1410     if (header->len & (sizeof(unsigned long)-1))
1411         return -EINVAL;
1412     if (!header->addr)
1413         return -EINVAL;
1414 
1415     ub = ublk_get_device_from_id(header->dev_id);
1416     if (!ub)
1417         return -EINVAL;
1418 
1419     queue = header->data[0];
1420     if (queue >= ub->dev_info.nr_hw_queues)
1421         goto out_put_device;
1422 
1423     ret = -ENOMEM;
1424     if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
1425         goto out_put_device;
1426 
1427     for_each_possible_cpu(i) {
1428         if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
1429             cpumask_set_cpu(i, cpumask);
1430     }
1431 
1432     ret = -EFAULT;
1433     retlen = min_t(unsigned short, header->len, cpumask_size());
1434     if (copy_to_user(argp, cpumask, retlen))
1435         goto out_free_cpumask;
1436     if (retlen != header->len &&
1437         clear_user(argp + retlen, header->len - retlen))
1438         goto out_free_cpumask;
1439 
1440     ret = 0;
1441 out_free_cpumask:
1442     free_cpumask_var(cpumask);
1443 out_put_device:
1444     ublk_put_device(ub);
1445     return ret;
1446 }
1447 
1448 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
1449 {
1450     pr_devel("%s: dev id %d flags %llx\n", __func__,
1451             info->dev_id, info->flags);
1452     pr_devel("\t nr_hw_queues %d queue_depth %d\n",
1453             info->nr_hw_queues, info->queue_depth);
1454 }
1455 
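     /*
      * ADD_DEV: allocate a ublk_device, assign a device number, initialize
      * the queues and the blk-mq tag set, copy the (feature-filtered)
      * dev_info back to userspace, and finally register the ublkcN char
      * device so the server can set itself up.
      */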
1456 static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
1457 {
1458     struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1459     void __user *argp = (void __user *)(unsigned long)header->addr;
1460     struct ublksrv_ctrl_dev_info info;
1461     struct ublk_device *ub;
1462     int ret = -EINVAL;
1463 
1464     if (header->len < sizeof(info) || !header->addr)
1465         return -EINVAL;
1466     if (header->queue_id != (u16)-1) {
1467         pr_warn("%s: queue_id is wrong %x\n",
1468             __func__, header->queue_id);
1469         return -EINVAL;
1470     }
1471     if (copy_from_user(&info, argp, sizeof(info)))
1472         return -EFAULT;
1473     ublk_dump_dev_info(&info);
1474     if (header->dev_id != info.dev_id) {
1475         pr_warn("%s: dev id not match %u %u\n",
1476             __func__, header->dev_id, info.dev_id);
1477         return -EINVAL;
1478     }
1479 
1480     ret = mutex_lock_killable(&ublk_ctl_mutex);
1481     if (ret)
1482         return ret;
1483 
1484     ret = -ENOMEM;
1485     ub = kzalloc(sizeof(*ub), GFP_KERNEL);
1486     if (!ub)
1487         goto out_unlock;
1488     mutex_init(&ub->mutex);
1489     spin_lock_init(&ub->mm_lock);
1490     INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
1491     INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work);
1492 
1493     ret = ublk_alloc_dev_number(ub, header->dev_id);
1494     if (ret < 0)
1495         goto out_free_ub;
1496 
1497     memcpy(&ub->dev_info, &info, sizeof(info));
1498 
1499     /* update device id */
1500     ub->dev_info.dev_id = ub->ub_number;
1501 
1502     /*
1503      * The 64bit flags will be copied back to userspace as the feature
1504      * negotiation result, so we have to clear the flags which the driver
1505      * doesn't support yet, then userspace can get the correct flags
1506      * (features) to handle.
1507      */
1508     ub->dev_info.flags &= UBLK_F_ALL;
1509 
1510     /* We are not ready to support zero copy */
1511     ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
1512 
1513     ub->dev_info.nr_hw_queues = min_t(unsigned int,
1514             ub->dev_info.nr_hw_queues, nr_cpu_ids);
1515     ublk_align_max_io_size(ub);
1516 
1517     ret = ublk_init_queues(ub);
1518     if (ret)
1519         goto out_free_dev_number;
1520 
1521     ret = ublk_add_tag_set(ub);
1522     if (ret)
1523         goto out_deinit_queues;
1524 
1525     ret = -EFAULT;
1526     if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
1527         goto out_free_tag_set;
1528 
1529     /*
1530      * Add the char dev so that the ublksrv daemon can be set up.
1531      * ublk_add_chdev() will clean up everything if it fails.
1532      */
1533     ret = ublk_add_chdev(ub);
1534     goto out_unlock;
1535 
1536 out_free_tag_set:
1537     blk_mq_free_tag_set(&ub->tag_set);
1538 out_deinit_queues:
1539     ublk_deinit_queues(ub);
1540 out_free_dev_number:
1541     ublk_free_dev_number(ub);
1542 out_free_ub:
1543     mutex_destroy(&ub->mutex);
1544     kfree(ub);
1545 out_unlock:
1546     mutex_unlock(&ublk_ctl_mutex);
1547     return ret;
1548 }
1549 
1550 static inline bool ublk_idr_freed(int id)
1551 {
1552     void *ptr;
1553 
1554     spin_lock(&ublk_idr_lock);
1555     ptr = idr_find(&ublk_index_idr, id);
1556     spin_unlock(&ublk_idr_lock);
1557 
1558     return ptr == NULL;
1559 }
1560 
1561 static int ublk_ctrl_del_dev(int idx)
1562 {
1563     struct ublk_device *ub;
1564     int ret;
1565 
1566     ret = mutex_lock_killable(&ublk_ctl_mutex);
1567     if (ret)
1568         return ret;
1569 
1570     ub = ublk_get_device_from_id(idx);
1571     if (ub) {
1572         ublk_remove(ub);
1573         ublk_put_device(ub);
1574         ret = 0;
1575     } else {
1576         ret = -ENODEV;
1577     }
1578 
1579     /*
1580      * Wait until the idr entry is removed, then it can be reused after
1581      * the DEL_DEV command returns.
1582      */
1583     if (!ret)
1584         wait_event(ublk_idr_wq, ublk_idr_freed(idx));
1585     mutex_unlock(&ublk_ctl_mutex);
1586 
1587     return ret;
1588 }
1589 
1590 static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
1591 {
1592     struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1593 
1594     pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
1595             __func__, cmd->cmd_op, header->dev_id, header->queue_id,
1596             header->data[0], header->addr, header->len);
1597 }
1598 
1599 static int ublk_ctrl_stop_dev(struct io_uring_cmd *cmd)
1600 {
1601     struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1602     struct ublk_device *ub;
1603 
1604     ub = ublk_get_device_from_id(header->dev_id);
1605     if (!ub)
1606         return -EINVAL;
1607 
1608     ublk_stop_dev(ub);
1609     cancel_work_sync(&ub->stop_work);
1610 
1611     ublk_put_device(ub);
1612     return 0;
1613 }
1614 
1615 static int ublk_ctrl_get_dev_info(struct io_uring_cmd *cmd)
1616 {
1617     struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1618     void __user *argp = (void __user *)(unsigned long)header->addr;
1619     struct ublk_device *ub;
1620     int ret = 0;
1621 
1622     if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
1623         return -EINVAL;
1624 
1625     ub = ublk_get_device_from_id(header->dev_id);
1626     if (!ub)
1627         return -EINVAL;
1628 
1629     if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info)))
1630         ret = -EFAULT;
1631     ublk_put_device(ub);
1632 
1633     return ret;
1634 }
1635 
1636 static int ublk_ctrl_get_params(struct io_uring_cmd *cmd)
1637 {
1638     struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1639     void __user *argp = (void __user *)(unsigned long)header->addr;
1640     struct ublk_params_header ph;
1641     struct ublk_device *ub;
1642     int ret;
1643 
1644     if (header->len <= sizeof(ph) || !header->addr)
1645         return -EINVAL;
1646 
1647     if (copy_from_user(&ph, argp, sizeof(ph)))
1648         return -EFAULT;
1649 
1650     if (ph.len > header->len || !ph.len)
1651         return -EINVAL;
1652 
1653     if (ph.len > sizeof(struct ublk_params))
1654         ph.len = sizeof(struct ublk_params);
1655 
1656     ub = ublk_get_device_from_id(header->dev_id);
1657     if (!ub)
1658         return -EINVAL;
1659 
1660     mutex_lock(&ub->mutex);
1661     if (copy_to_user(argp, &ub->params, ph.len))
1662         ret = -EFAULT;
1663     else
1664         ret = 0;
1665     mutex_unlock(&ub->mutex);
1666 
1667     ublk_put_device(ub);
1668     return ret;
1669 }
1670 
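     /*
      * SET_PARAMS: copy a (possibly truncated) struct ublk_params from
      * userspace; only allowed while the device is not LIVE. The staged
      * parameters are validated here and applied later by START_DEV.
      */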
1671 static int ublk_ctrl_set_params(struct io_uring_cmd *cmd)
1672 {
1673     struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1674     void __user *argp = (void __user *)(unsigned long)header->addr;
1675     struct ublk_params_header ph;
1676     struct ublk_device *ub;
1677     int ret = -EFAULT;
1678 
1679     if (header->len <= sizeof(ph) || !header->addr)
1680         return -EINVAL;
1681 
1682     if (copy_from_user(&ph, argp, sizeof(ph)))
1683         return -EFAULT;
1684 
1685     if (ph.len > header->len || !ph.len || !ph.types)
1686         return -EINVAL;
1687 
1688     if (ph.len > sizeof(struct ublk_params))
1689         ph.len = sizeof(struct ublk_params);
1690 
1691     ub = ublk_get_device_from_id(header->dev_id);
1692     if (!ub)
1693         return -EINVAL;
1694 
1695     /* parameters can only be changed when device isn't live */
1696     mutex_lock(&ub->mutex);
1697     if (ub->dev_info.state == UBLK_S_DEV_LIVE) {
1698         ret = -EACCES;
1699     } else if (copy_from_user(&ub->params, argp, ph.len)) {
1700         ret = -EFAULT;
1701     } else {
1702         /* clear all we don't support yet */
1703         ub->params.types &= UBLK_PARAM_TYPE_ALL;
1704         ret = ublk_validate_params(ub);
1705     }
1706     mutex_unlock(&ub->mutex);
1707     ublk_put_device(ub);
1708 
1709     return ret;
1710 }
1711 
1712 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
1713         unsigned int issue_flags)
1714 {
1715     struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1716     int ret = -EINVAL;
1717 
1718     ublk_ctrl_cmd_dump(cmd);
1719 
1720     if (!(issue_flags & IO_URING_F_SQE128))
1721         goto out;
1722 
1723     ret = -EPERM;
1724     if (!capable(CAP_SYS_ADMIN))
1725         goto out;
1726 
1727     ret = -ENODEV;
1728     switch (cmd->cmd_op) {
1729     case UBLK_CMD_START_DEV:
1730         ret = ublk_ctrl_start_dev(cmd);
1731         break;
1732     case UBLK_CMD_STOP_DEV:
1733         ret = ublk_ctrl_stop_dev(cmd);
1734         break;
1735     case UBLK_CMD_GET_DEV_INFO:
1736         ret = ublk_ctrl_get_dev_info(cmd);
1737         break;
1738     case UBLK_CMD_ADD_DEV:
1739         ret = ublk_ctrl_add_dev(cmd);
1740         break;
1741     case UBLK_CMD_DEL_DEV:
1742         ret = ublk_ctrl_del_dev(header->dev_id);
1743         break;
1744     case UBLK_CMD_GET_QUEUE_AFFINITY:
1745         ret = ublk_ctrl_get_queue_affinity(cmd);
1746         break;
1747     case UBLK_CMD_GET_PARAMS:
1748         ret = ublk_ctrl_get_params(cmd);
1749         break;
1750     case UBLK_CMD_SET_PARAMS:
1751         ret = ublk_ctrl_set_params(cmd);
1752         break;
1753     default:
1754         break;
1755     }
1756  out:
1757     io_uring_cmd_done(cmd, ret, 0);
1758     pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
1759             __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
1760     return -EIOCBQUEUED;
1761 }
1762 
1763 static const struct file_operations ublk_ctl_fops = {
1764     .open       = nonseekable_open,
1765     .uring_cmd      = ublk_ctrl_uring_cmd,
1766     .owner      = THIS_MODULE,
1767     .llseek     = noop_llseek,
1768 };
1769 
1770 static struct miscdevice ublk_misc = {
1771     .minor      = MISC_DYNAMIC_MINOR,
1772     .name       = "ublk-control",
1773     .fops       = &ublk_ctl_fops,
1774 };
1775 
1776 static int __init ublk_init(void)
1777 {
1778     int ret;
1779 
1780     init_waitqueue_head(&ublk_idr_wq);
1781 
1782     ret = misc_register(&ublk_misc);
1783     if (ret)
1784         return ret;
1785 
1786     ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
1787     if (ret)
1788         goto unregister_mis;
1789 
1790     ublk_chr_class = class_create(THIS_MODULE, "ublk-char");
1791     if (IS_ERR(ublk_chr_class)) {
1792         ret = PTR_ERR(ublk_chr_class);
1793         goto free_chrdev_region;
1794     }
1795     return 0;
1796 
1797 free_chrdev_region:
1798     unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
1799 unregister_mis:
1800     misc_deregister(&ublk_misc);
1801     return ret;
1802 }
1803 
1804 static void __exit ublk_exit(void)
1805 {
1806     struct ublk_device *ub;
1807     int id;
1808 
1809     class_destroy(ublk_chr_class);
1810 
1811     misc_deregister(&ublk_misc);
1812 
1813     idr_for_each_entry(&ublk_index_idr, ub, id)
1814         ublk_remove(ub);
1815 
1816     idr_destroy(&ublk_index_idr);
1817     unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
1818 }
1819 
1820 module_init(ublk_init);
1821 module_exit(ublk_exit);
1822 
1823 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
1824 MODULE_LICENSE("GPL");