0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and
0004  * Shaohua Li <shli@fb.com>
0005  */
0006 #include <linux/module.h>
0007 
0008 #include <linux/moduleparam.h>
0009 #include <linux/sched.h>
0010 #include <linux/fs.h>
0011 #include <linux/init.h>
0012 #include "null_blk.h"
0013 
0014 #undef pr_fmt
0015 #define pr_fmt(fmt) "null_blk: " fmt
0016 
0017 #define FREE_BATCH      16
0018 
0019 #define TICKS_PER_SEC       50ULL
0020 #define TIMER_INTERVAL      (NSEC_PER_SEC / TICKS_PER_SEC)
0021 
0022 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
0023 static DECLARE_FAULT_ATTR(null_timeout_attr);
0024 static DECLARE_FAULT_ATTR(null_requeue_attr);
0025 static DECLARE_FAULT_ATTR(null_init_hctx_attr);
0026 #endif
0027 
0028 static inline u64 mb_per_tick(int mbps)
0029 {
0030     return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
0031 }
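/*
 * Worked example for the helper above (integer math, values illustrative):
 * with TICKS_PER_SEC = 50 the bandwidth timer fires every 20 ms and
 * (1 << 20) / 50 = 20971 bytes per tick per MiB/s. For mbps = 100 the
 * per-tick budget is therefore 20971 * 100 = 2,097,100 bytes, which adds
 * up to roughly 100 MiB over one second of ticks.
 */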
0032 
0033 /*
0034  * Status flags for nullb_device.
0035  *
0036  * CONFIGURED:  Device has been configured and turned on. Cannot reconfigure.
0037  * UP:      Device is currently on and visible in userspace.
0038  * THROTTLED:   Device is being throttled.
0039  * CACHE:   Device is using a write-back cache.
0040  */
0041 enum nullb_device_flags {
0042     NULLB_DEV_FL_CONFIGURED = 0,
0043     NULLB_DEV_FL_UP     = 1,
0044     NULLB_DEV_FL_THROTTLED  = 2,
0045     NULLB_DEV_FL_CACHE  = 3,
0046 };
0047 
0048 #define MAP_SZ      ((PAGE_SIZE >> SECTOR_SHIFT) + 2)
0049 /*
0050  * nullb_page is a page in memory for nullb devices.
0051  *
0052  * @page:   The page holding the data.
0053  * @bitmap: The bitmap represents which sectors in the page have data.
0054  *      Each bit represents one block size. For example, sector 8
0055  *      will use the 7th bit
0056  * The highest 2 bits of the bitmap are for special purposes. LOCK means the
0057  * cache page is being flushed to storage. FREE means the cache page is freed
0058  * and should be skipped when flushing to storage. Please see
0059  * null_make_cache_space().
0060  */
0061 struct nullb_page {
0062     struct page *page;
0063     DECLARE_BITMAP(bitmap, MAP_SZ);
0064 };
0065 #define NULLB_PAGE_LOCK (MAP_SZ - 1)
0066 #define NULLB_PAGE_FREE (MAP_SZ - 2)
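/*
 * Layout sketch, assuming a 4 KiB PAGE_SIZE: MAP_SZ is (4096 >> 9) + 2 = 10,
 * so bits 0..7 track data present in the page (one bit per block, indexed by
 * sector & SECTOR_MASK in the code below), bit 8 is NULLB_PAGE_FREE and
 * bit 9 is NULLB_PAGE_LOCK.
 */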
0067 
0068 static LIST_HEAD(nullb_list);
0069 static struct mutex lock;
0070 static int null_major;
0071 static DEFINE_IDA(nullb_indexes);
0072 static struct blk_mq_tag_set tag_set;
0073 
0074 enum {
0075     NULL_IRQ_NONE       = 0,
0076     NULL_IRQ_SOFTIRQ    = 1,
0077     NULL_IRQ_TIMER      = 2,
0078 };
0079 
0080 static bool g_virt_boundary = false;
0081 module_param_named(virt_boundary, g_virt_boundary, bool, 0444);
0082 MODULE_PARM_DESC(virt_boundary, "Require a virtual boundary for the device. Default: False");
0083 
0084 static int g_no_sched;
0085 module_param_named(no_sched, g_no_sched, int, 0444);
0086 MODULE_PARM_DESC(no_sched, "No io scheduler");
0087 
0088 static int g_submit_queues = 1;
0089 module_param_named(submit_queues, g_submit_queues, int, 0444);
0090 MODULE_PARM_DESC(submit_queues, "Number of submission queues");
0091 
0092 static int g_poll_queues = 1;
0093 module_param_named(poll_queues, g_poll_queues, int, 0444);
0094 MODULE_PARM_DESC(poll_queues, "Number of IOPOLL submission queues");
0095 
0096 static int g_home_node = NUMA_NO_NODE;
0097 module_param_named(home_node, g_home_node, int, 0444);
0098 MODULE_PARM_DESC(home_node, "Home node for the device");
0099 
0100 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
0101 /*
0102  * For more details about fault injection, please refer to
0103  * Documentation/fault-injection/fault-injection.rst.
0104  */
0105 static char g_timeout_str[80];
0106 module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444);
0107 MODULE_PARM_DESC(timeout, "Fault injection. timeout=<interval>,<probability>,<space>,<times>");
0108 
0109 static char g_requeue_str[80];
0110 module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444);
0111 MODULE_PARM_DESC(requeue, "Fault injection. requeue=<interval>,<probability>,<space>,<times>");
0112 
0113 static char g_init_hctx_str[80];
0114 module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444);
0115 MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>");
0116 #endif
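/*
 * Illustrative use of the fault attributes above (values are examples only),
 * following the generic <interval>,<probability>,<space>,<times> syntax from
 * Documentation/fault-injection/fault-injection.rst:
 *
 *   modprobe null_blk timeout="1,100,0,-1"
 *
 * which fakes a timeout on every request (probability 100, unlimited times).
 */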
0117 
0118 static int g_queue_mode = NULL_Q_MQ;
0119 
0120 static int null_param_store_val(const char *str, int *val, int min, int max)
0121 {
0122     int ret, new_val;
0123 
0124     ret = kstrtoint(str, 10, &new_val);
0125     if (ret)
0126         return -EINVAL;
0127 
0128     if (new_val < min || new_val > max)
0129         return -EINVAL;
0130 
0131     *val = new_val;
0132     return 0;
0133 }
0134 
0135 static int null_set_queue_mode(const char *str, const struct kernel_param *kp)
0136 {
0137     return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ);
0138 }
0139 
0140 static const struct kernel_param_ops null_queue_mode_param_ops = {
0141     .set    = null_set_queue_mode,
0142     .get    = param_get_int,
0143 };
0144 
0145 device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444);
0146 MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");
0147 
0148 static int g_gb = 250;
0149 module_param_named(gb, g_gb, int, 0444);
0150 MODULE_PARM_DESC(gb, "Size in GB");
0151 
0152 static int g_bs = 512;
0153 module_param_named(bs, g_bs, int, 0444);
0154 MODULE_PARM_DESC(bs, "Block size (in bytes)");
0155 
0156 static int g_max_sectors;
0157 module_param_named(max_sectors, g_max_sectors, int, 0444);
0158 MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)");
0159 
0160 static unsigned int nr_devices = 1;
0161 module_param(nr_devices, uint, 0444);
0162 MODULE_PARM_DESC(nr_devices, "Number of devices to register");
0163 
0164 static bool g_blocking;
0165 module_param_named(blocking, g_blocking, bool, 0444);
0166 MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
0167 
0168 static bool shared_tags;
0169 module_param(shared_tags, bool, 0444);
0170 MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");
0171 
0172 static bool g_shared_tag_bitmap;
0173 module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444);
0174 MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq");
0175 
0176 static int g_irqmode = NULL_IRQ_SOFTIRQ;
0177 
0178 static int null_set_irqmode(const char *str, const struct kernel_param *kp)
0179 {
0180     return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE,
0181                     NULL_IRQ_TIMER);
0182 }
0183 
0184 static const struct kernel_param_ops null_irqmode_param_ops = {
0185     .set    = null_set_irqmode,
0186     .get    = param_get_int,
0187 };
0188 
0189 device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444);
0190 MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");
0191 
0192 static unsigned long g_completion_nsec = 10000;
0193 module_param_named(completion_nsec, g_completion_nsec, ulong, 0444);
0194 MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");
0195 
0196 static int g_hw_queue_depth = 64;
0197 module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444);
0198 MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");
0199 
0200 static bool g_use_per_node_hctx;
0201 module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
0202 MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
0203 
0204 static bool g_memory_backed;
0205 module_param_named(memory_backed, g_memory_backed, bool, 0444);
0206 MODULE_PARM_DESC(memory_backed, "Create a memory-backed block device. Default: false");
0207 
0208 static bool g_discard;
0209 module_param_named(discard, g_discard, bool, 0444);
0210 MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed null_blk device). Default: false");
0211 
0212 static unsigned long g_cache_size;
0213 module_param_named(cache_size, g_cache_size, ulong, 0444);
0214 MODULE_PARM_DESC(cache_size, "Cache size in MiB for memory-backed device. Default: 0 (none)");
0215 
0216 static unsigned int g_mbps;
0217 module_param_named(mbps, g_mbps, uint, 0444);
0218 MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)");
0219 
0220 static bool g_zoned;
0221 module_param_named(zoned, g_zoned, bool, S_IRUGO);
0222 MODULE_PARM_DESC(zoned, "Make the device a host-managed zoned block device. Default: false");
0223 
0224 static unsigned long g_zone_size = 256;
0225 module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
0226 MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two. Default: 256");
0227 
0228 static unsigned long g_zone_capacity;
0229 module_param_named(zone_capacity, g_zone_capacity, ulong, 0444);
0230 MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size");
0231 
0232 static unsigned int g_zone_nr_conv;
0233 module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
0234 MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0");
0235 
0236 static unsigned int g_zone_max_open;
0237 module_param_named(zone_max_open, g_zone_max_open, uint, 0444);
0238 MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)");
0239 
0240 static unsigned int g_zone_max_active;
0241 module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
0242 MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)");
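/*
 * The module parameters above provide the defaults inherited by every device
 * created at load time (see null_alloc_dev()). A typical invocation might
 * look like this (illustrative values):
 *
 *   modprobe null_blk nr_devices=2 gb=16 bs=4096 queue_mode=2 irqmode=1
 *
 * which registers two 16 GB, 4 KiB-block multiqueue devices completing
 * requests via softirq.
 */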
0243 
0244 static struct nullb_device *null_alloc_dev(void);
0245 static void null_free_dev(struct nullb_device *dev);
0246 static void null_del_dev(struct nullb *nullb);
0247 static int null_add_dev(struct nullb_device *dev);
0248 static struct nullb *null_find_dev_by_name(const char *name);
0249 static void null_free_device_storage(struct nullb_device *dev, bool is_cache);
0250 
0251 static inline struct nullb_device *to_nullb_device(struct config_item *item)
0252 {
0253     return item ? container_of(item, struct nullb_device, item) : NULL;
0254 }
0255 
0256 static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page)
0257 {
0258     return snprintf(page, PAGE_SIZE, "%u\n", val);
0259 }
0260 
0261 static inline ssize_t nullb_device_ulong_attr_show(unsigned long val,
0262     char *page)
0263 {
0264     return snprintf(page, PAGE_SIZE, "%lu\n", val);
0265 }
0266 
0267 static inline ssize_t nullb_device_bool_attr_show(bool val, char *page)
0268 {
0269     return snprintf(page, PAGE_SIZE, "%u\n", val);
0270 }
0271 
0272 static ssize_t nullb_device_uint_attr_store(unsigned int *val,
0273     const char *page, size_t count)
0274 {
0275     unsigned int tmp;
0276     int result;
0277 
0278     result = kstrtouint(page, 0, &tmp);
0279     if (result < 0)
0280         return result;
0281 
0282     *val = tmp;
0283     return count;
0284 }
0285 
0286 static ssize_t nullb_device_ulong_attr_store(unsigned long *val,
0287     const char *page, size_t count)
0288 {
0289     int result;
0290     unsigned long tmp;
0291 
0292     result = kstrtoul(page, 0, &tmp);
0293     if (result < 0)
0294         return result;
0295 
0296     *val = tmp;
0297     return count;
0298 }
0299 
0300 static ssize_t nullb_device_bool_attr_store(bool *val, const char *page,
0301     size_t count)
0302 {
0303     bool tmp;
0304     int result;
0305 
0306     result = kstrtobool(page,  &tmp);
0307     if (result < 0)
0308         return result;
0309 
0310     *val = tmp;
0311     return count;
0312 }
0313 
0314 /* The following macro should only be used with TYPE = {uint, ulong, bool}. */
0315 #define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY)                \
0316 static ssize_t                              \
0317 nullb_device_##NAME##_show(struct config_item *item, char *page)    \
0318 {                                   \
0319     return nullb_device_##TYPE##_attr_show(             \
0320                 to_nullb_device(item)->NAME, page); \
0321 }                                   \
0322 static ssize_t                              \
0323 nullb_device_##NAME##_store(struct config_item *item, const char *page, \
0324                 size_t count)               \
0325 {                                   \
0326     int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\
0327     struct nullb_device *dev = to_nullb_device(item);       \
0328     TYPE new_value = 0;                     \
0329     int ret;                            \
0330                                     \
0331     ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\
0332     if (ret < 0)                            \
0333         return ret;                     \
0334     if (apply_fn)                           \
0335         ret = apply_fn(dev, new_value);             \
0336     else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags))    \
0337         ret = -EBUSY;                       \
0338     if (ret < 0)                            \
0339         return ret;                     \
0340     dev->NAME = new_value;                      \
0341     return count;                           \
0342 }                                   \
0343 CONFIGFS_ATTR(nullb_device_, NAME);
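/*
 * For reference, NULLB_DEVICE_ATTR(mbps, uint, NULL) expands to a
 * nullb_device_mbps_show() that prints dev->mbps via
 * nullb_device_uint_attr_show(), and a nullb_device_mbps_store() that parses
 * the new value, returns -EBUSY once NULLB_DEV_FL_CONFIGURED is set (because
 * no apply function was given) and otherwise assigns dev->mbps. Attributes
 * that pass an APPLY callback (submit_queues, poll_queues) let that callback
 * validate and apply the change instead of the CONFIGURED check.
 */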
0344 
0345 static int nullb_update_nr_hw_queues(struct nullb_device *dev,
0346                      unsigned int submit_queues,
0347                      unsigned int poll_queues)
0348 
0349 {
0350     struct blk_mq_tag_set *set;
0351     int ret, nr_hw_queues;
0352 
0353     if (!dev->nullb)
0354         return 0;
0355 
0356     /*
0357      * Make sure at least one submit queue exists.
0358      */
0359     if (!submit_queues)
0360         return -EINVAL;
0361 
0362     /*
0363      * Make sure that null_init_hctx() does not access nullb->queues[] past
0364      * the end of that array.
0365      */
0366     if (submit_queues > nr_cpu_ids || poll_queues > g_poll_queues)
0367         return -EINVAL;
0368 
0369     /*
0370      * Keep previous and new queue numbers in nullb_device for reference in
0371      * the callback function null_map_queues().
0372      */
0373     dev->prev_submit_queues = dev->submit_queues;
0374     dev->prev_poll_queues = dev->poll_queues;
0375     dev->submit_queues = submit_queues;
0376     dev->poll_queues = poll_queues;
0377 
0378     set = dev->nullb->tag_set;
0379     nr_hw_queues = submit_queues + poll_queues;
0380     blk_mq_update_nr_hw_queues(set, nr_hw_queues);
0381     ret = set->nr_hw_queues == nr_hw_queues ? 0 : -ENOMEM;
0382 
0383     if (ret) {
0384         /* on error, revert the queue numbers */
0385         dev->submit_queues = dev->prev_submit_queues;
0386         dev->poll_queues = dev->prev_poll_queues;
0387     }
0388 
0389     return ret;
0390 }
0391 
0392 static int nullb_apply_submit_queues(struct nullb_device *dev,
0393                      unsigned int submit_queues)
0394 {
0395     return nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues);
0396 }
0397 
0398 static int nullb_apply_poll_queues(struct nullb_device *dev,
0399                    unsigned int poll_queues)
0400 {
0401     return nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues);
0402 }
0403 
0404 NULLB_DEVICE_ATTR(size, ulong, NULL);
0405 NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL);
0406 NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues);
0407 NULLB_DEVICE_ATTR(poll_queues, uint, nullb_apply_poll_queues);
0408 NULLB_DEVICE_ATTR(home_node, uint, NULL);
0409 NULLB_DEVICE_ATTR(queue_mode, uint, NULL);
0410 NULLB_DEVICE_ATTR(blocksize, uint, NULL);
0411 NULLB_DEVICE_ATTR(max_sectors, uint, NULL);
0412 NULLB_DEVICE_ATTR(irqmode, uint, NULL);
0413 NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL);
0414 NULLB_DEVICE_ATTR(index, uint, NULL);
0415 NULLB_DEVICE_ATTR(blocking, bool, NULL);
0416 NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL);
0417 NULLB_DEVICE_ATTR(memory_backed, bool, NULL);
0418 NULLB_DEVICE_ATTR(discard, bool, NULL);
0419 NULLB_DEVICE_ATTR(mbps, uint, NULL);
0420 NULLB_DEVICE_ATTR(cache_size, ulong, NULL);
0421 NULLB_DEVICE_ATTR(zoned, bool, NULL);
0422 NULLB_DEVICE_ATTR(zone_size, ulong, NULL);
0423 NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
0424 NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
0425 NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
0426 NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
0427 NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
0428 NULLB_DEVICE_ATTR(no_sched, bool, NULL);
0429 NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
0430 
0431 static ssize_t nullb_device_power_show(struct config_item *item, char *page)
0432 {
0433     return nullb_device_bool_attr_show(to_nullb_device(item)->power, page);
0434 }
0435 
0436 static ssize_t nullb_device_power_store(struct config_item *item,
0437                      const char *page, size_t count)
0438 {
0439     struct nullb_device *dev = to_nullb_device(item);
0440     bool newp = false;
0441     ssize_t ret;
0442 
0443     ret = nullb_device_bool_attr_store(&newp, page, count);
0444     if (ret < 0)
0445         return ret;
0446 
0447     if (!dev->power && newp) {
0448         if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags))
0449             return count;
0450         ret = null_add_dev(dev);
0451         if (ret) {
0452             clear_bit(NULLB_DEV_FL_UP, &dev->flags);
0453             return ret;
0454         }
0455 
0456         set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
0457         dev->power = newp;
0458     } else if (dev->power && !newp) {
0459         if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
0460             mutex_lock(&lock);
0461             dev->power = newp;
0462             null_del_dev(dev->nullb);
0463             mutex_unlock(&lock);
0464         }
0465         clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
0466     }
0467 
0468     return count;
0469 }
0470 
0471 CONFIGFS_ATTR(nullb_device_, power);
0472 
0473 static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page)
0474 {
0475     struct nullb_device *t_dev = to_nullb_device(item);
0476 
0477     return badblocks_show(&t_dev->badblocks, page, 0);
0478 }
0479 
0480 static ssize_t nullb_device_badblocks_store(struct config_item *item,
0481                      const char *page, size_t count)
0482 {
0483     struct nullb_device *t_dev = to_nullb_device(item);
0484     char *orig, *buf, *tmp;
0485     u64 start, end;
0486     int ret;
0487 
0488     orig = kstrndup(page, count, GFP_KERNEL);
0489     if (!orig)
0490         return -ENOMEM;
0491 
0492     buf = strstrip(orig);
0493 
0494     ret = -EINVAL;
0495     if (buf[0] != '+' && buf[0] != '-')
0496         goto out;
0497     tmp = strchr(&buf[1], '-');
0498     if (!tmp)
0499         goto out;
0500     *tmp = '\0';
0501     ret = kstrtoull(buf + 1, 0, &start);
0502     if (ret)
0503         goto out;
0504     ret = kstrtoull(tmp + 1, 0, &end);
0505     if (ret)
0506         goto out;
0507     ret = -EINVAL;
0508     if (start > end)
0509         goto out;
0510     /* enable badblocks */
0511     cmpxchg(&t_dev->badblocks.shift, -1, 0);
0512     if (buf[0] == '+')
0513         ret = badblocks_set(&t_dev->badblocks, start,
0514             end - start + 1, 1);
0515     else
0516         ret = badblocks_clear(&t_dev->badblocks, start,
0517             end - start + 1);
0518     if (ret == 0)
0519         ret = count;
0520 out:
0521     kfree(orig);
0522     return ret;
0523 }
0524 CONFIGFS_ATTR(nullb_device_, badblocks);
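/*
 * Example writes accepted by the store handler above (sector numbers are
 * arbitrary): "+0-1023" marks 512-byte sectors 0 through 1023 inclusive as
 * bad, and "-0-1023" clears the same range again. The first write also
 * enables badblocks tracking by flipping badblocks.shift from -1 to 0.
 */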
0525 
0526 static struct configfs_attribute *nullb_device_attrs[] = {
0527     &nullb_device_attr_size,
0528     &nullb_device_attr_completion_nsec,
0529     &nullb_device_attr_submit_queues,
0530     &nullb_device_attr_poll_queues,
0531     &nullb_device_attr_home_node,
0532     &nullb_device_attr_queue_mode,
0533     &nullb_device_attr_blocksize,
0534     &nullb_device_attr_max_sectors,
0535     &nullb_device_attr_irqmode,
0536     &nullb_device_attr_hw_queue_depth,
0537     &nullb_device_attr_index,
0538     &nullb_device_attr_blocking,
0539     &nullb_device_attr_use_per_node_hctx,
0540     &nullb_device_attr_power,
0541     &nullb_device_attr_memory_backed,
0542     &nullb_device_attr_discard,
0543     &nullb_device_attr_mbps,
0544     &nullb_device_attr_cache_size,
0545     &nullb_device_attr_badblocks,
0546     &nullb_device_attr_zoned,
0547     &nullb_device_attr_zone_size,
0548     &nullb_device_attr_zone_capacity,
0549     &nullb_device_attr_zone_nr_conv,
0550     &nullb_device_attr_zone_max_open,
0551     &nullb_device_attr_zone_max_active,
0552     &nullb_device_attr_virt_boundary,
0553     &nullb_device_attr_no_sched,
0554     &nullb_device_attr_shared_tag_bitmap,
0555     NULL,
0556 };
0557 
0558 static void nullb_device_release(struct config_item *item)
0559 {
0560     struct nullb_device *dev = to_nullb_device(item);
0561 
0562     null_free_device_storage(dev, false);
0563     null_free_dev(dev);
0564 }
0565 
0566 static struct configfs_item_operations nullb_device_ops = {
0567     .release    = nullb_device_release,
0568 };
0569 
0570 static const struct config_item_type nullb_device_type = {
0571     .ct_item_ops    = &nullb_device_ops,
0572     .ct_attrs   = nullb_device_attrs,
0573     .ct_owner   = THIS_MODULE,
0574 };
0575 
0576 static struct
0577 config_item *nullb_group_make_item(struct config_group *group, const char *name)
0578 {
0579     struct nullb_device *dev;
0580 
0581     if (null_find_dev_by_name(name))
0582         return ERR_PTR(-EEXIST);
0583 
0584     dev = null_alloc_dev();
0585     if (!dev)
0586         return ERR_PTR(-ENOMEM);
0587 
0588     config_item_init_type_name(&dev->item, name, &nullb_device_type);
0589 
0590     return &dev->item;
0591 }
0592 
0593 static void
0594 nullb_group_drop_item(struct config_group *group, struct config_item *item)
0595 {
0596     struct nullb_device *dev = to_nullb_device(item);
0597 
0598     if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
0599         mutex_lock(&lock);
0600         dev->power = false;
0601         null_del_dev(dev->nullb);
0602         mutex_unlock(&lock);
0603     }
0604 
0605     config_item_put(item);
0606 }
0607 
0608 static ssize_t memb_group_features_show(struct config_item *item, char *page)
0609 {
0610     return snprintf(page, PAGE_SIZE,
0611             "badblocks,blocking,blocksize,cache_size,"
0612             "completion_nsec,discard,home_node,hw_queue_depth,"
0613             "irqmode,max_sectors,mbps,memory_backed,no_sched,"
0614             "poll_queues,power,queue_mode,shared_tag_bitmap,size,"
0615             "submit_queues,use_per_node_hctx,virt_boundary,zoned,"
0616             "zone_capacity,zone_max_active,zone_max_open,"
0617             "zone_nr_conv,zone_size\n");
0618 }
0619 
0620 CONFIGFS_ATTR_RO(memb_group_, features);
0621 
0622 static struct configfs_attribute *nullb_group_attrs[] = {
0623     &memb_group_attr_features,
0624     NULL,
0625 };
0626 
0627 static struct configfs_group_operations nullb_group_ops = {
0628     .make_item  = nullb_group_make_item,
0629     .drop_item  = nullb_group_drop_item,
0630 };
0631 
0632 static const struct config_item_type nullb_group_type = {
0633     .ct_group_ops   = &nullb_group_ops,
0634     .ct_attrs   = nullb_group_attrs,
0635     .ct_owner   = THIS_MODULE,
0636 };
0637 
0638 static struct configfs_subsystem nullb_subsys = {
0639     .su_group = {
0640         .cg_item = {
0641             .ci_namebuf = "nullb",
0642             .ci_type = &nullb_group_type,
0643         },
0644     },
0645 };
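/*
 * With the subsystem above registered, devices can be created at runtime
 * through configfs. A minimal example, assuming configfs is mounted at
 * /sys/kernel/config:
 *
 *   mkdir /sys/kernel/config/nullb/nullb1
 *   echo 4096 > /sys/kernel/config/nullb/nullb1/blocksize
 *   echo 1 > /sys/kernel/config/nullb/nullb1/memory_backed
 *   echo 1 > /sys/kernel/config/nullb/nullb1/power
 *
 * The final write to "power" calls null_add_dev() and makes the disk
 * visible; writing 0 tears it down again.
 */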
0646 
0647 static inline int null_cache_active(struct nullb *nullb)
0648 {
0649     return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
0650 }
0651 
0652 static struct nullb_device *null_alloc_dev(void)
0653 {
0654     struct nullb_device *dev;
0655 
0656     dev = kzalloc(sizeof(*dev), GFP_KERNEL);
0657     if (!dev)
0658         return NULL;
0659     INIT_RADIX_TREE(&dev->data, GFP_ATOMIC);
0660     INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC);
0661     if (badblocks_init(&dev->badblocks, 0)) {
0662         kfree(dev);
0663         return NULL;
0664     }
0665 
0666     dev->size = g_gb * 1024;
0667     dev->completion_nsec = g_completion_nsec;
0668     dev->submit_queues = g_submit_queues;
0669     dev->prev_submit_queues = g_submit_queues;
0670     dev->poll_queues = g_poll_queues;
0671     dev->prev_poll_queues = g_poll_queues;
0672     dev->home_node = g_home_node;
0673     dev->queue_mode = g_queue_mode;
0674     dev->blocksize = g_bs;
0675     dev->max_sectors = g_max_sectors;
0676     dev->irqmode = g_irqmode;
0677     dev->hw_queue_depth = g_hw_queue_depth;
0678     dev->blocking = g_blocking;
0679     dev->memory_backed = g_memory_backed;
0680     dev->discard = g_discard;
0681     dev->cache_size = g_cache_size;
0682     dev->mbps = g_mbps;
0683     dev->use_per_node_hctx = g_use_per_node_hctx;
0684     dev->zoned = g_zoned;
0685     dev->zone_size = g_zone_size;
0686     dev->zone_capacity = g_zone_capacity;
0687     dev->zone_nr_conv = g_zone_nr_conv;
0688     dev->zone_max_open = g_zone_max_open;
0689     dev->zone_max_active = g_zone_max_active;
0690     dev->virt_boundary = g_virt_boundary;
0691     dev->no_sched = g_no_sched;
0692     dev->shared_tag_bitmap = g_shared_tag_bitmap;
0693     return dev;
0694 }
0695 
0696 static void null_free_dev(struct nullb_device *dev)
0697 {
0698     if (!dev)
0699         return;
0700 
0701     null_free_zoned_dev(dev);
0702     badblocks_exit(&dev->badblocks);
0703     kfree(dev);
0704 }
0705 
0706 static void put_tag(struct nullb_queue *nq, unsigned int tag)
0707 {
0708     clear_bit_unlock(tag, nq->tag_map);
0709 
0710     if (waitqueue_active(&nq->wait))
0711         wake_up(&nq->wait);
0712 }
0713 
0714 static unsigned int get_tag(struct nullb_queue *nq)
0715 {
0716     unsigned int tag;
0717 
0718     do {
0719         tag = find_first_zero_bit(nq->tag_map, nq->queue_depth);
0720         if (tag >= nq->queue_depth)
0721             return -1U;
0722     } while (test_and_set_bit_lock(tag, nq->tag_map));
0723 
0724     return tag;
0725 }
0726 
0727 static void free_cmd(struct nullb_cmd *cmd)
0728 {
0729     put_tag(cmd->nq, cmd->tag);
0730 }
0731 
0732 static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer);
0733 
0734 static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
0735 {
0736     struct nullb_cmd *cmd;
0737     unsigned int tag;
0738 
0739     tag = get_tag(nq);
0740     if (tag != -1U) {
0741         cmd = &nq->cmds[tag];
0742         cmd->tag = tag;
0743         cmd->error = BLK_STS_OK;
0744         cmd->nq = nq;
0745         if (nq->dev->irqmode == NULL_IRQ_TIMER) {
0746             hrtimer_init(&cmd->timer, CLOCK_MONOTONIC,
0747                      HRTIMER_MODE_REL);
0748             cmd->timer.function = null_cmd_timer_expired;
0749         }
0750         return cmd;
0751     }
0752 
0753     return NULL;
0754 }
0755 
0756 static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, struct bio *bio)
0757 {
0758     struct nullb_cmd *cmd;
0759     DEFINE_WAIT(wait);
0760 
0761     do {
0762         /*
0763          * This avoids multiple return statements, multiple calls to
0764          * __alloc_cmd() and a fast path call to prepare_to_wait().
0765          */
0766         cmd = __alloc_cmd(nq);
0767         if (cmd) {
0768             cmd->bio = bio;
0769             return cmd;
0770         }
0771         prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE);
0772         io_schedule();
0773         finish_wait(&nq->wait, &wait);
0774     } while (1);
0775 }
0776 
0777 static void end_cmd(struct nullb_cmd *cmd)
0778 {
0779     int queue_mode = cmd->nq->dev->queue_mode;
0780 
0781     switch (queue_mode)  {
0782     case NULL_Q_MQ:
0783         blk_mq_end_request(cmd->rq, cmd->error);
0784         return;
0785     case NULL_Q_BIO:
0786         cmd->bio->bi_status = cmd->error;
0787         bio_endio(cmd->bio);
0788         break;
0789     }
0790 
0791     free_cmd(cmd);
0792 }
0793 
0794 static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
0795 {
0796     end_cmd(container_of(timer, struct nullb_cmd, timer));
0797 
0798     return HRTIMER_NORESTART;
0799 }
0800 
0801 static void null_cmd_end_timer(struct nullb_cmd *cmd)
0802 {
0803     ktime_t kt = cmd->nq->dev->completion_nsec;
0804 
0805     hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL);
0806 }
0807 
0808 static void null_complete_rq(struct request *rq)
0809 {
0810     end_cmd(blk_mq_rq_to_pdu(rq));
0811 }
0812 
0813 static struct nullb_page *null_alloc_page(void)
0814 {
0815     struct nullb_page *t_page;
0816 
0817     t_page = kmalloc(sizeof(struct nullb_page), GFP_NOIO);
0818     if (!t_page)
0819         return NULL;
0820 
0821     t_page->page = alloc_pages(GFP_NOIO, 0);
0822     if (!t_page->page) {
0823         kfree(t_page);
0824         return NULL;
0825     }
0826 
0827     memset(t_page->bitmap, 0, sizeof(t_page->bitmap));
0828     return t_page;
0829 }
0830 
0831 static void null_free_page(struct nullb_page *t_page)
0832 {
0833     __set_bit(NULLB_PAGE_FREE, t_page->bitmap);
0834     if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap))
0835         return;
0836     __free_page(t_page->page);
0837     kfree(t_page);
0838 }
0839 
0840 static bool null_page_empty(struct nullb_page *page)
0841 {
0842     int size = MAP_SZ - 2;
0843 
0844     return find_first_bit(page->bitmap, size) == size;
0845 }
0846 
0847 static void null_free_sector(struct nullb *nullb, sector_t sector,
0848     bool is_cache)
0849 {
0850     unsigned int sector_bit;
0851     u64 idx;
0852     struct nullb_page *t_page, *ret;
0853     struct radix_tree_root *root;
0854 
0855     root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
0856     idx = sector >> PAGE_SECTORS_SHIFT;
0857     sector_bit = (sector & SECTOR_MASK);
0858 
0859     t_page = radix_tree_lookup(root, idx);
0860     if (t_page) {
0861         __clear_bit(sector_bit, t_page->bitmap);
0862 
0863         if (null_page_empty(t_page)) {
0864             ret = radix_tree_delete_item(root, idx, t_page);
0865             WARN_ON(ret != t_page);
0866             null_free_page(ret);
0867             if (is_cache)
0868                 nullb->dev->curr_cache -= PAGE_SIZE;
0869         }
0870     }
0871 }
0872 
0873 static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx,
0874     struct nullb_page *t_page, bool is_cache)
0875 {
0876     struct radix_tree_root *root;
0877 
0878     root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
0879 
0880     if (radix_tree_insert(root, idx, t_page)) {
0881         null_free_page(t_page);
0882         t_page = radix_tree_lookup(root, idx);
0883         WARN_ON(!t_page || t_page->page->index != idx);
0884     } else if (is_cache)
0885         nullb->dev->curr_cache += PAGE_SIZE;
0886 
0887     return t_page;
0888 }
0889 
0890 static void null_free_device_storage(struct nullb_device *dev, bool is_cache)
0891 {
0892     unsigned long pos = 0;
0893     int nr_pages;
0894     struct nullb_page *ret, *t_pages[FREE_BATCH];
0895     struct radix_tree_root *root;
0896 
0897     root = is_cache ? &dev->cache : &dev->data;
0898 
0899     do {
0900         int i;
0901 
0902         nr_pages = radix_tree_gang_lookup(root,
0903                 (void **)t_pages, pos, FREE_BATCH);
0904 
0905         for (i = 0; i < nr_pages; i++) {
0906             pos = t_pages[i]->page->index;
0907             ret = radix_tree_delete_item(root, pos, t_pages[i]);
0908             WARN_ON(ret != t_pages[i]);
0909             null_free_page(ret);
0910         }
0911 
0912         pos++;
0913     } while (nr_pages == FREE_BATCH);
0914 
0915     if (is_cache)
0916         dev->curr_cache = 0;
0917 }
0918 
0919 static struct nullb_page *__null_lookup_page(struct nullb *nullb,
0920     sector_t sector, bool for_write, bool is_cache)
0921 {
0922     unsigned int sector_bit;
0923     u64 idx;
0924     struct nullb_page *t_page;
0925     struct radix_tree_root *root;
0926 
0927     idx = sector >> PAGE_SECTORS_SHIFT;
0928     sector_bit = (sector & SECTOR_MASK);
0929 
0930     root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
0931     t_page = radix_tree_lookup(root, idx);
0932     WARN_ON(t_page && t_page->page->index != idx);
0933 
0934     if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap)))
0935         return t_page;
0936 
0937     return NULL;
0938 }
0939 
0940 static struct nullb_page *null_lookup_page(struct nullb *nullb,
0941     sector_t sector, bool for_write, bool ignore_cache)
0942 {
0943     struct nullb_page *page = NULL;
0944 
0945     if (!ignore_cache)
0946         page = __null_lookup_page(nullb, sector, for_write, true);
0947     if (page)
0948         return page;
0949     return __null_lookup_page(nullb, sector, for_write, false);
0950 }
0951 
0952 static struct nullb_page *null_insert_page(struct nullb *nullb,
0953                        sector_t sector, bool ignore_cache)
0954     __releases(&nullb->lock)
0955     __acquires(&nullb->lock)
0956 {
0957     u64 idx;
0958     struct nullb_page *t_page;
0959 
0960     t_page = null_lookup_page(nullb, sector, true, ignore_cache);
0961     if (t_page)
0962         return t_page;
0963 
0964     spin_unlock_irq(&nullb->lock);
0965 
0966     t_page = null_alloc_page();
0967     if (!t_page)
0968         goto out_lock;
0969 
0970     if (radix_tree_preload(GFP_NOIO))
0971         goto out_freepage;
0972 
0973     spin_lock_irq(&nullb->lock);
0974     idx = sector >> PAGE_SECTORS_SHIFT;
0975     t_page->page->index = idx;
0976     t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache);
0977     radix_tree_preload_end();
0978 
0979     return t_page;
0980 out_freepage:
0981     null_free_page(t_page);
0982 out_lock:
0983     spin_lock_irq(&nullb->lock);
0984     return null_lookup_page(nullb, sector, true, ignore_cache);
0985 }
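/*
 * Note that nullb->lock is intentionally dropped across null_alloc_page()
 * and radix_tree_preload() above, since both may block. If another caller
 * inserts a page for the same index while the lock is released,
 * radix_tree_insert() fails inside null_radix_tree_insert(), the freshly
 * allocated page is thrown away and the winner's page is returned instead.
 */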
0986 
0987 static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page)
0988 {
0989     int i;
0990     unsigned int offset;
0991     u64 idx;
0992     struct nullb_page *t_page, *ret;
0993     void *dst, *src;
0994 
0995     idx = c_page->page->index;
0996 
0997     t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true);
0998 
0999     __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap);
1000     if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) {
1001         null_free_page(c_page);
1002         if (t_page && null_page_empty(t_page)) {
1003             ret = radix_tree_delete_item(&nullb->dev->data,
1004                 idx, t_page);
1005             null_free_page(t_page);
1006         }
1007         return 0;
1008     }
1009 
1010     if (!t_page)
1011         return -ENOMEM;
1012 
1013     src = kmap_atomic(c_page->page);
1014     dst = kmap_atomic(t_page->page);
1015 
1016     for (i = 0; i < PAGE_SECTORS;
1017             i += (nullb->dev->blocksize >> SECTOR_SHIFT)) {
1018         if (test_bit(i, c_page->bitmap)) {
1019             offset = (i << SECTOR_SHIFT);
1020             memcpy(dst + offset, src + offset,
1021                 nullb->dev->blocksize);
1022             __set_bit(i, t_page->bitmap);
1023         }
1024     }
1025 
1026     kunmap_atomic(dst);
1027     kunmap_atomic(src);
1028 
1029     ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page);
1030     null_free_page(ret);
1031     nullb->dev->curr_cache -= PAGE_SIZE;
1032 
1033     return 0;
1034 }
1035 
1036 static int null_make_cache_space(struct nullb *nullb, unsigned long n)
1037 {
1038     int i, err, nr_pages;
1039     struct nullb_page *c_pages[FREE_BATCH];
1040     unsigned long flushed = 0, one_round;
1041 
1042 again:
1043     if ((nullb->dev->cache_size * 1024 * 1024) >
1044          nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0)
1045         return 0;
1046 
1047     nr_pages = radix_tree_gang_lookup(&nullb->dev->cache,
1048             (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH);
1049     /*
1050      * null_flush_cache_page() could drop nullb->lock before it is done with
1051      * the c_pages. To avoid a race, don't allow those pages to be freed.
1052      */
1053     for (i = 0; i < nr_pages; i++) {
1054         nullb->cache_flush_pos = c_pages[i]->page->index;
1055         /*
1056          * We found the page which is being flushed to disk by other
1057          * threads
1058          */
1059         if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap))
1060             c_pages[i] = NULL;
1061         else
1062             __set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap);
1063     }
1064 
1065     one_round = 0;
1066     for (i = 0; i < nr_pages; i++) {
1067         if (c_pages[i] == NULL)
1068             continue;
1069         err = null_flush_cache_page(nullb, c_pages[i]);
1070         if (err)
1071             return err;
1072         one_round++;
1073     }
1074     flushed += one_round << PAGE_SHIFT;
1075 
1076     if (n > flushed) {
1077         if (nr_pages == 0)
1078             nullb->cache_flush_pos = 0;
1079         if (one_round == 0) {
1080             /* give other threads a chance */
1081             spin_unlock_irq(&nullb->lock);
1082             spin_lock_irq(&nullb->lock);
1083         }
1084         goto again;
1085     }
1086     return 0;
1087 }
1088 
1089 static int copy_to_nullb(struct nullb *nullb, struct page *source,
1090     unsigned int off, sector_t sector, size_t n, bool is_fua)
1091 {
1092     size_t temp, count = 0;
1093     unsigned int offset;
1094     struct nullb_page *t_page;
1095     void *dst, *src;
1096 
1097     while (count < n) {
1098         temp = min_t(size_t, nullb->dev->blocksize, n - count);
1099 
1100         if (null_cache_active(nullb) && !is_fua)
1101             null_make_cache_space(nullb, PAGE_SIZE);
1102 
1103         offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
1104         t_page = null_insert_page(nullb, sector,
1105             !null_cache_active(nullb) || is_fua);
1106         if (!t_page)
1107             return -ENOSPC;
1108 
1109         src = kmap_atomic(source);
1110         dst = kmap_atomic(t_page->page);
1111         memcpy(dst + offset, src + off + count, temp);
1112         kunmap_atomic(dst);
1113         kunmap_atomic(src);
1114 
1115         __set_bit(sector & SECTOR_MASK, t_page->bitmap);
1116 
1117         if (is_fua)
1118             null_free_sector(nullb, sector, true);
1119 
1120         count += temp;
1121         sector += temp >> SECTOR_SHIFT;
1122     }
1123     return 0;
1124 }
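/*
 * Note on the FUA handling above: a FUA write bypasses the cache tree
 * (null_insert_page() is called with ignore_cache == true), and any stale
 * copy of the sector still sitting in the cache is dropped so a later read
 * cannot return the older cached data.
 */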
1125 
1126 static int copy_from_nullb(struct nullb *nullb, struct page *dest,
1127     unsigned int off, sector_t sector, size_t n)
1128 {
1129     size_t temp, count = 0;
1130     unsigned int offset;
1131     struct nullb_page *t_page;
1132     void *dst, *src;
1133 
1134     while (count < n) {
1135         temp = min_t(size_t, nullb->dev->blocksize, n - count);
1136 
1137         offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
1138         t_page = null_lookup_page(nullb, sector, false,
1139             !null_cache_active(nullb));
1140 
1141         dst = kmap_atomic(dest);
1142         if (!t_page) {
1143             memset(dst + off + count, 0, temp);
1144             goto next;
1145         }
1146         src = kmap_atomic(t_page->page);
1147         memcpy(dst + off + count, src + offset, temp);
1148         kunmap_atomic(src);
1149 next:
1150         kunmap_atomic(dst);
1151 
1152         count += temp;
1153         sector += temp >> SECTOR_SHIFT;
1154     }
1155     return 0;
1156 }
1157 
1158 static void nullb_fill_pattern(struct nullb *nullb, struct page *page,
1159                    unsigned int len, unsigned int off)
1160 {
1161     void *dst;
1162 
1163     dst = kmap_atomic(page);
1164     memset(dst + off, 0xFF, len);
1165     kunmap_atomic(dst);
1166 }
1167 
1168 blk_status_t null_handle_discard(struct nullb_device *dev,
1169                  sector_t sector, sector_t nr_sectors)
1170 {
1171     struct nullb *nullb = dev->nullb;
1172     size_t n = nr_sectors << SECTOR_SHIFT;
1173     size_t temp;
1174 
1175     spin_lock_irq(&nullb->lock);
1176     while (n > 0) {
1177         temp = min_t(size_t, n, dev->blocksize);
1178         null_free_sector(nullb, sector, false);
1179         if (null_cache_active(nullb))
1180             null_free_sector(nullb, sector, true);
1181         sector += temp >> SECTOR_SHIFT;
1182         n -= temp;
1183     }
1184     spin_unlock_irq(&nullb->lock);
1185 
1186     return BLK_STS_OK;
1187 }
1188 
1189 static int null_handle_flush(struct nullb *nullb)
1190 {
1191     int err;
1192 
1193     if (!null_cache_active(nullb))
1194         return 0;
1195 
1196     spin_lock_irq(&nullb->lock);
1197     while (true) {
1198         err = null_make_cache_space(nullb,
1199             nullb->dev->cache_size * 1024 * 1024);
1200         if (err || nullb->dev->curr_cache == 0)
1201             break;
1202     }
1203 
1204     WARN_ON(!radix_tree_empty(&nullb->dev->cache));
1205     spin_unlock_irq(&nullb->lock);
1206     return err;
1207 }
1208 
1209 static int null_transfer(struct nullb *nullb, struct page *page,
1210     unsigned int len, unsigned int off, bool is_write, sector_t sector,
1211     bool is_fua)
1212 {
1213     struct nullb_device *dev = nullb->dev;
1214     unsigned int valid_len = len;
1215     int err = 0;
1216 
1217     if (!is_write) {
1218         if (dev->zoned)
1219             valid_len = null_zone_valid_read_len(nullb,
1220                 sector, len);
1221 
1222         if (valid_len) {
1223             err = copy_from_nullb(nullb, page, off,
1224                 sector, valid_len);
1225             off += valid_len;
1226             len -= valid_len;
1227         }
1228 
1229         if (len)
1230             nullb_fill_pattern(nullb, page, len, off);
1231         flush_dcache_page(page);
1232     } else {
1233         flush_dcache_page(page);
1234         err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
1235     }
1236 
1237     return err;
1238 }
1239 
1240 static int null_handle_rq(struct nullb_cmd *cmd)
1241 {
1242     struct request *rq = cmd->rq;
1243     struct nullb *nullb = cmd->nq->dev->nullb;
1244     int err;
1245     unsigned int len;
1246     sector_t sector = blk_rq_pos(rq);
1247     struct req_iterator iter;
1248     struct bio_vec bvec;
1249 
1250     spin_lock_irq(&nullb->lock);
1251     rq_for_each_segment(bvec, rq, iter) {
1252         len = bvec.bv_len;
1253         err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
1254                      op_is_write(req_op(rq)), sector,
1255                      rq->cmd_flags & REQ_FUA);
1256         if (err) {
1257             spin_unlock_irq(&nullb->lock);
1258             return err;
1259         }
1260         sector += len >> SECTOR_SHIFT;
1261     }
1262     spin_unlock_irq(&nullb->lock);
1263 
1264     return 0;
1265 }
1266 
1267 static int null_handle_bio(struct nullb_cmd *cmd)
1268 {
1269     struct bio *bio = cmd->bio;
1270     struct nullb *nullb = cmd->nq->dev->nullb;
1271     int err;
1272     unsigned int len;
1273     sector_t sector = bio->bi_iter.bi_sector;
1274     struct bio_vec bvec;
1275     struct bvec_iter iter;
1276 
1277     spin_lock_irq(&nullb->lock);
1278     bio_for_each_segment(bvec, bio, iter) {
1279         len = bvec.bv_len;
1280         err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
1281                      op_is_write(bio_op(bio)), sector,
1282                      bio->bi_opf & REQ_FUA);
1283         if (err) {
1284             spin_unlock_irq(&nullb->lock);
1285             return err;
1286         }
1287         sector += len >> SECTOR_SHIFT;
1288     }
1289     spin_unlock_irq(&nullb->lock);
1290     return 0;
1291 }
1292 
1293 static void null_stop_queue(struct nullb *nullb)
1294 {
1295     struct request_queue *q = nullb->q;
1296 
1297     if (nullb->dev->queue_mode == NULL_Q_MQ)
1298         blk_mq_stop_hw_queues(q);
1299 }
1300 
1301 static void null_restart_queue_async(struct nullb *nullb)
1302 {
1303     struct request_queue *q = nullb->q;
1304 
1305     if (nullb->dev->queue_mode == NULL_Q_MQ)
1306         blk_mq_start_stopped_hw_queues(q, true);
1307 }
1308 
1309 static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
1310 {
1311     struct nullb_device *dev = cmd->nq->dev;
1312     struct nullb *nullb = dev->nullb;
1313     blk_status_t sts = BLK_STS_OK;
1314     struct request *rq = cmd->rq;
1315 
1316     if (!hrtimer_active(&nullb->bw_timer))
1317         hrtimer_restart(&nullb->bw_timer);
1318 
1319     if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) {
1320         null_stop_queue(nullb);
1321         /* race with timer */
1322         if (atomic_long_read(&nullb->cur_bytes) > 0)
1323             null_restart_queue_async(nullb);
1324         /* requeue request */
1325         sts = BLK_STS_DEV_RESOURCE;
1326     }
1327     return sts;
1328 }
1329 
1330 static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd,
1331                          sector_t sector,
1332                          sector_t nr_sectors)
1333 {
1334     struct badblocks *bb = &cmd->nq->dev->badblocks;
1335     sector_t first_bad;
1336     int bad_sectors;
1337 
1338     if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors))
1339         return BLK_STS_IOERR;
1340 
1341     return BLK_STS_OK;
1342 }
1343 
1344 static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd,
1345                              enum req_op op,
1346                              sector_t sector,
1347                              sector_t nr_sectors)
1348 {
1349     struct nullb_device *dev = cmd->nq->dev;
1350     int err;
1351 
1352     if (op == REQ_OP_DISCARD)
1353         return null_handle_discard(dev, sector, nr_sectors);
1354 
1355     if (dev->queue_mode == NULL_Q_BIO)
1356         err = null_handle_bio(cmd);
1357     else
1358         err = null_handle_rq(cmd);
1359 
1360     return errno_to_blk_status(err);
1361 }
1362 
1363 static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd)
1364 {
1365     struct nullb_device *dev = cmd->nq->dev;
1366     struct bio *bio;
1367 
1368     if (dev->memory_backed)
1369         return;
1370 
1371     if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) {
1372         zero_fill_bio(cmd->bio);
1373     } else if (req_op(cmd->rq) == REQ_OP_READ) {
1374         __rq_for_each_bio(bio, cmd->rq)
1375             zero_fill_bio(bio);
1376     }
1377 }
1378 
1379 static inline void nullb_complete_cmd(struct nullb_cmd *cmd)
1380 {
1381     /*
1382      * Since root privileges are required to configure the null_blk
1383      * driver, it is fine that this driver does not initialize the
1384      * data buffers of read commands. Zero-initialize these buffers
1385      * anyway if KMSAN is enabled to prevent that KMSAN complains
1386      * about null_blk not initializing read data buffers.
1387      */
1388     if (IS_ENABLED(CONFIG_KMSAN))
1389         nullb_zero_read_cmd_buffer(cmd);
1390 
1391     /* Complete IO by inline, softirq or timer */
1392     switch (cmd->nq->dev->irqmode) {
1393     case NULL_IRQ_SOFTIRQ:
1394         switch (cmd->nq->dev->queue_mode) {
1395         case NULL_Q_MQ:
1396             if (likely(!blk_should_fake_timeout(cmd->rq->q)))
1397                 blk_mq_complete_request(cmd->rq);
1398             break;
1399         case NULL_Q_BIO:
1400             /*
1401              * XXX: no proper submitting cpu information available.
1402              */
1403             end_cmd(cmd);
1404             break;
1405         }
1406         break;
1407     case NULL_IRQ_NONE:
1408         end_cmd(cmd);
1409         break;
1410     case NULL_IRQ_TIMER:
1411         null_cmd_end_timer(cmd);
1412         break;
1413     }
1414 }
1415 
1416 blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op,
1417                   sector_t sector, unsigned int nr_sectors)
1418 {
1419     struct nullb_device *dev = cmd->nq->dev;
1420     blk_status_t ret;
1421 
1422     if (dev->badblocks.shift != -1) {
1423         ret = null_handle_badblocks(cmd, sector, nr_sectors);
1424         if (ret != BLK_STS_OK)
1425             return ret;
1426     }
1427 
1428     if (dev->memory_backed)
1429         return null_handle_memory_backed(cmd, op, sector, nr_sectors);
1430 
1431     return BLK_STS_OK;
1432 }
1433 
1434 static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector,
1435                     sector_t nr_sectors, enum req_op op)
1436 {
1437     struct nullb_device *dev = cmd->nq->dev;
1438     struct nullb *nullb = dev->nullb;
1439     blk_status_t sts;
1440 
1441     if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
1442         sts = null_handle_throttled(cmd);
1443         if (sts != BLK_STS_OK)
1444             return sts;
1445     }
1446 
1447     if (op == REQ_OP_FLUSH) {
1448         cmd->error = errno_to_blk_status(null_handle_flush(nullb));
1449         goto out;
1450     }
1451 
1452     if (dev->zoned)
1453         sts = null_process_zoned_cmd(cmd, op, sector, nr_sectors);
1454     else
1455         sts = null_process_cmd(cmd, op, sector, nr_sectors);
1456 
1457     /* Do not overwrite errors (e.g. timeout errors) */
1458     if (cmd->error == BLK_STS_OK)
1459         cmd->error = sts;
1460 
1461 out:
1462     nullb_complete_cmd(cmd);
1463     return BLK_STS_OK;
1464 }
1465 
1466 static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer)
1467 {
1468     struct nullb *nullb = container_of(timer, struct nullb, bw_timer);
1469     ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
1470     unsigned int mbps = nullb->dev->mbps;
1471 
1472     if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps))
1473         return HRTIMER_NORESTART;
1474 
1475     atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps));
1476     null_restart_queue_async(nullb);
1477 
1478     hrtimer_forward_now(&nullb->bw_timer, timer_interval);
1479 
1480     return HRTIMER_RESTART;
1481 }
1482 
1483 static void nullb_setup_bwtimer(struct nullb *nullb)
1484 {
1485     ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
1486 
1487     hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1488     nullb->bw_timer.function = nullb_bwtimer_fn;
1489     atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps));
1490     hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL);
1491 }
1492 
1493 static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
1494 {
1495     int index = 0;
1496 
1497     if (nullb->nr_queues != 1)
1498         index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues);
1499 
1500     return &nullb->queues[index];
1501 }
1502 
1503 static void null_submit_bio(struct bio *bio)
1504 {
1505     sector_t sector = bio->bi_iter.bi_sector;
1506     sector_t nr_sectors = bio_sectors(bio);
1507     struct nullb *nullb = bio->bi_bdev->bd_disk->private_data;
1508     struct nullb_queue *nq = nullb_to_queue(nullb);
1509 
1510     null_handle_cmd(alloc_cmd(nq, bio), sector, nr_sectors, bio_op(bio));
1511 }
1512 
1513 static bool should_timeout_request(struct request *rq)
1514 {
1515 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
1516     if (g_timeout_str[0])
1517         return should_fail(&null_timeout_attr, 1);
1518 #endif
1519     return false;
1520 }
1521 
1522 static bool should_requeue_request(struct request *rq)
1523 {
1524 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
1525     if (g_requeue_str[0])
1526         return should_fail(&null_requeue_attr, 1);
1527 #endif
1528     return false;
1529 }
1530 
1531 static int null_map_queues(struct blk_mq_tag_set *set)
1532 {
1533     struct nullb *nullb = set->driver_data;
1534     int i, qoff;
1535     unsigned int submit_queues = g_submit_queues;
1536     unsigned int poll_queues = g_poll_queues;
1537 
1538     if (nullb) {
1539         struct nullb_device *dev = nullb->dev;
1540 
1541         /*
1542          * Refer nr_hw_queues of the tag set to check if the expected
1543          * number of hardware queues are prepared. If block layer failed
1544          * to prepare them, use previous numbers of submit queues and
1545          * poll queues to map queues.
1546          */
1547         if (set->nr_hw_queues ==
1548             dev->submit_queues + dev->poll_queues) {
1549             submit_queues = dev->submit_queues;
1550             poll_queues = dev->poll_queues;
1551         } else if (set->nr_hw_queues ==
1552                dev->prev_submit_queues + dev->prev_poll_queues) {
1553             submit_queues = dev->prev_submit_queues;
1554             poll_queues = dev->prev_poll_queues;
1555         } else {
1556             pr_warn("tag set has unexpected nr_hw_queues: %d\n",
1557                 set->nr_hw_queues);
1558             return -EINVAL;
1559         }
1560     }
1561 
1562     for (i = 0, qoff = 0; i < set->nr_maps; i++) {
1563         struct blk_mq_queue_map *map = &set->map[i];
1564 
1565         switch (i) {
1566         case HCTX_TYPE_DEFAULT:
1567             map->nr_queues = submit_queues;
1568             break;
1569         case HCTX_TYPE_READ:
1570             map->nr_queues = 0;
1571             continue;
1572         case HCTX_TYPE_POLL:
1573             map->nr_queues = poll_queues;
1574             break;
1575         }
1576         map->queue_offset = qoff;
1577         qoff += map->nr_queues;
1578         blk_mq_map_queues(map);
1579     }
1580 
1581     return 0;
1582 }
1583 
1584 static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
1585 {
1586     struct nullb_queue *nq = hctx->driver_data;
1587     LIST_HEAD(list);
1588     int nr = 0;
1589 
1590     spin_lock(&nq->poll_lock);
1591     list_splice_init(&nq->poll_list, &list);
1592     spin_unlock(&nq->poll_lock);
1593 
1594     while (!list_empty(&list)) {
1595         struct nullb_cmd *cmd;
1596         struct request *req;
1597 
1598         req = list_first_entry(&list, struct request, queuelist);
1599         list_del_init(&req->queuelist);
1600         cmd = blk_mq_rq_to_pdu(req);
1601         cmd->error = null_process_cmd(cmd, req_op(req), blk_rq_pos(req),
1602                         blk_rq_sectors(req));
1603         if (!blk_mq_add_to_batch(req, iob, (__force int) cmd->error,
1604                     blk_mq_end_request_batch))
1605             end_cmd(cmd);
1606         nr++;
1607     }
1608 
1609     return nr;
1610 }
1611 
1612 static enum blk_eh_timer_return null_timeout_rq(struct request *rq)
1613 {
1614     struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1615     struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
1616 
1617     pr_info("rq %p timed out\n", rq);
1618 
1619     if (hctx->type == HCTX_TYPE_POLL) {
1620         struct nullb_queue *nq = hctx->driver_data;
1621 
1622         spin_lock(&nq->poll_lock);
1623         list_del_init(&rq->queuelist);
1624         spin_unlock(&nq->poll_lock);
1625     }
1626 
1627     /*
1628      * If the device is marked as blocking (i.e. memory backed or zoned
1629      * device), the submission path may be blocked waiting for resources
1630      * and cause real timeouts. For these real timeouts, the submission
1631      * path will complete the request using blk_mq_complete_request().
1632      * Only fake timeouts need to execute blk_mq_complete_request() here.
1633      */
1634     cmd->error = BLK_STS_TIMEOUT;
1635     if (cmd->fake_timeout || hctx->type == HCTX_TYPE_POLL)
1636         blk_mq_complete_request(rq);
1637     return BLK_EH_DONE;
1638 }
1639 
1640 static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
1641              const struct blk_mq_queue_data *bd)
1642 {
1643     struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
1644     struct nullb_queue *nq = hctx->driver_data;
1645     sector_t nr_sectors = blk_rq_sectors(bd->rq);
1646     sector_t sector = blk_rq_pos(bd->rq);
1647     const bool is_poll = hctx->type == HCTX_TYPE_POLL;
1648 
1649     might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
1650 
1651     if (!is_poll && nq->dev->irqmode == NULL_IRQ_TIMER) {
1652         hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1653         cmd->timer.function = null_cmd_timer_expired;
1654     }
1655     cmd->rq = bd->rq;
1656     cmd->error = BLK_STS_OK;
1657     cmd->nq = nq;
1658     cmd->fake_timeout = should_timeout_request(bd->rq);
1659 
1660     blk_mq_start_request(bd->rq);
1661 
1662     if (should_requeue_request(bd->rq)) {
1663         /*
1664          * Alternate between hitting the core BUSY path and the
1665          * driver-driven requeue path
1666          */
1667         nq->requeue_selection++;
1668         if (nq->requeue_selection & 1)
1669             return BLK_STS_RESOURCE;
1670         else {
1671             blk_mq_requeue_request(bd->rq, true);
1672             return BLK_STS_OK;
1673         }
1674     }
1675 
1676     if (is_poll) {
1677         spin_lock(&nq->poll_lock);
1678         list_add_tail(&bd->rq->queuelist, &nq->poll_list);
1679         spin_unlock(&nq->poll_lock);
1680         return BLK_STS_OK;
1681     }
1682     if (cmd->fake_timeout)
1683         return BLK_STS_OK;
1684 
1685     return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq));
1686 }
1687 
1688 static void cleanup_queue(struct nullb_queue *nq)
1689 {
1690     bitmap_free(nq->tag_map);
1691     kfree(nq->cmds);
1692 }
1693 
1694 static void cleanup_queues(struct nullb *nullb)
1695 {
1696     int i;
1697 
1698     for (i = 0; i < nullb->nr_queues; i++)
1699         cleanup_queue(&nullb->queues[i]);
1700 
1701     kfree(nullb->queues);
1702 }
1703 
1704 static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
1705 {
1706     struct nullb_queue *nq = hctx->driver_data;
1707     struct nullb *nullb = nq->dev->nullb;
1708 
1709     nullb->nr_queues--;
1710 }
1711 
1712 static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
1713 {
1714     init_waitqueue_head(&nq->wait);
1715     nq->queue_depth = nullb->queue_depth;
1716     nq->dev = nullb->dev;
1717     INIT_LIST_HEAD(&nq->poll_list);
1718     spin_lock_init(&nq->poll_lock);
1719 }
1720 
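/*
 * Bind a hardware context to its nullb_queue and initialize the queue state;
 * initialization can be made to fail via the init_hctx fault attribute.
 */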
1721 static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
1722               unsigned int hctx_idx)
1723 {
1724     struct nullb *nullb = hctx->queue->queuedata;
1725     struct nullb_queue *nq;
1726 
1727 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
1728     if (g_init_hctx_str[0] && should_fail(&null_init_hctx_attr, 1))
1729         return -EFAULT;
1730 #endif
1731 
1732     nq = &nullb->queues[hctx_idx];
1733     hctx->driver_data = nq;
1734     null_init_queue(nullb, nq);
1735     nullb->nr_queues++;
1736 
1737     return 0;
1738 }
1739 
1740 static const struct blk_mq_ops null_mq_ops = {
1741     .queue_rq       = null_queue_rq,
1742     .complete   = null_complete_rq,
1743     .timeout    = null_timeout_rq,
1744     .poll       = null_poll,
1745     .map_queues = null_map_queues,
1746     .init_hctx  = null_init_hctx,
1747     .exit_hctx  = null_exit_hctx,
1748 };
1749 
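/*
 * Tear down a device: drop its index, unlink it from nullb_list, delete the
 * gendisk, cancel bandwidth throttling, and release the tag set, the queues
 * and any cached pages.
 */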
1750 static void null_del_dev(struct nullb *nullb)
1751 {
1752     struct nullb_device *dev;
1753 
1754     if (!nullb)
1755         return;
1756 
1757     dev = nullb->dev;
1758 
1759     ida_simple_remove(&nullb_indexes, nullb->index);
1760 
1761     list_del_init(&nullb->list);
1762 
1763     del_gendisk(nullb->disk);
1764 
1765     if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
1766         hrtimer_cancel(&nullb->bw_timer);
1767         atomic_long_set(&nullb->cur_bytes, LONG_MAX);
1768         null_restart_queue_async(nullb);
1769     }
1770 
1771     put_disk(nullb->disk);
1772     if (dev->queue_mode == NULL_Q_MQ &&
1773         nullb->tag_set == &nullb->__tag_set)
1774         blk_mq_free_tag_set(nullb->tag_set);
1775     cleanup_queues(nullb);
1776     if (null_cache_active(nullb))
1777         null_free_device_storage(nullb->dev, true);
1778     kfree(nullb);
1779     dev->nullb = NULL;
1780 }
1781 
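/*
 * Discard is only supported on memory-backed, non-zoned devices; otherwise
 * the discard option is cleared and an informational message is logged.
 */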
1782 static void null_config_discard(struct nullb *nullb)
1783 {
1784     if (!nullb->dev->discard)
1785         return;
1786 
1787     if (!nullb->dev->memory_backed) {
1788         nullb->dev->discard = false;
1789         pr_info("discard option is ignored without memory backing\n");
1790         return;
1791     }
1792 
1793     if (nullb->dev->zoned) {
1794         nullb->dev->discard = false;
1795         pr_info("discard option is ignored in zoned mode\n");
1796         return;
1797     }
1798 
1799     nullb->q->limits.discard_granularity = nullb->dev->blocksize;
1800     blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9);
1801 }
1802 
1803 static const struct block_device_operations null_bio_ops = {
1804     .owner      = THIS_MODULE,
1805     .submit_bio = null_submit_bio,
1806     .report_zones   = null_report_zones,
1807 };
1808 
1809 static const struct block_device_operations null_rq_ops = {
1810     .owner      = THIS_MODULE,
1811     .report_zones   = null_report_zones,
1812 };
1813 
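/*
 * Allocate the command array and free-tag bitmap for a queue and mark every
 * command as unused (tag == -1U). Only the NULL_Q_BIO path uses these, via
 * init_driver_queues(); blk-mq queues carry their commands in the request PDU.
 */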
1814 static int setup_commands(struct nullb_queue *nq)
1815 {
1816     struct nullb_cmd *cmd;
1817     int i;
1818 
1819     nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL);
1820     if (!nq->cmds)
1821         return -ENOMEM;
1822 
1823     nq->tag_map = bitmap_zalloc(nq->queue_depth, GFP_KERNEL);
1824     if (!nq->tag_map) {
1825         kfree(nq->cmds);
1826         return -ENOMEM;
1827     }
1828 
1829     for (i = 0; i < nq->queue_depth; i++) {
1830         cmd = &nq->cmds[i];
1831         cmd->tag = -1U;
1832     }
1833 
1834     return 0;
1835 }
1836 
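/*
 * Allocate one nullb_queue per possible CPU plus the requested poll queues,
 * and record the configured hardware queue depth.
 */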
1837 static int setup_queues(struct nullb *nullb)
1838 {
1839     int nqueues = nr_cpu_ids;
1840 
1841     if (g_poll_queues)
1842         nqueues += g_poll_queues;
1843 
1844     nullb->queues = kcalloc(nqueues, sizeof(struct nullb_queue),
1845                 GFP_KERNEL);
1846     if (!nullb->queues)
1847         return -ENOMEM;
1848 
1849     nullb->queue_depth = nullb->dev->hw_queue_depth;
1850     return 0;
1851 }
1852 
1853 static int init_driver_queues(struct nullb *nullb)
1854 {
1855     struct nullb_queue *nq;
1856     int i, ret = 0;
1857 
1858     for (i = 0; i < nullb->dev->submit_queues; i++) {
1859         nq = &nullb->queues[i];
1860 
1861         null_init_queue(nullb, nq);
1862 
1863         ret = setup_commands(nq);
1864         if (ret)
1865             return ret;
1866         nullb->nr_queues++;
1867     }
1868     return 0;
1869 }
1870 
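/*
 * Set the disk capacity, major/minor numbers, fops and name, register
 * zoned-device information if needed, then add the disk.
 */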
1871 static int null_gendisk_register(struct nullb *nullb)
1872 {
1873     sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT;
1874     struct gendisk *disk = nullb->disk;
1875 
1876     set_capacity(disk, size);
1877 
1878     disk->major     = null_major;
1879     disk->first_minor   = nullb->index;
1880     disk->minors        = 1;
1881     if (queue_is_mq(nullb->q))
1882         disk->fops      = &null_rq_ops;
1883     else
1884         disk->fops      = &null_bio_ops;
1885     disk->private_data  = nullb;
1886     strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
1887 
1888     if (nullb->dev->zoned) {
1889         int ret = null_register_zoned_dev(nullb);
1890 
1891         if (ret)
1892             return ret;
1893     }
1894 
1895     return add_disk(disk);
1896 }
1897 
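/*
 * Fill in and allocate a blk-mq tag set, using the per-device configuration
 * when a nullb is given or the global module parameters for the shared tag
 * set. With poll queues enabled, three queue maps (default/read/poll) are
 * used instead of one.
 */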
1898 static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
1899 {
1900     unsigned int flags = BLK_MQ_F_SHOULD_MERGE;
1901     int hw_queues, numa_node;
1902     unsigned int queue_depth;
1903     int poll_queues;
1904 
1905     if (nullb) {
1906         hw_queues = nullb->dev->submit_queues;
1907         poll_queues = nullb->dev->poll_queues;
1908         queue_depth = nullb->dev->hw_queue_depth;
1909         numa_node = nullb->dev->home_node;
1910         if (nullb->dev->no_sched)
1911             flags |= BLK_MQ_F_NO_SCHED;
1912         if (nullb->dev->shared_tag_bitmap)
1913             flags |= BLK_MQ_F_TAG_HCTX_SHARED;
1914         if (nullb->dev->blocking)
1915             flags |= BLK_MQ_F_BLOCKING;
1916     } else {
1917         hw_queues = g_submit_queues;
1918         poll_queues = g_poll_queues;
1919         queue_depth = g_hw_queue_depth;
1920         numa_node = g_home_node;
1921         if (g_no_sched)
1922             flags |= BLK_MQ_F_NO_SCHED;
1923         if (g_shared_tag_bitmap)
1924             flags |= BLK_MQ_F_TAG_HCTX_SHARED;
1925         if (g_blocking)
1926             flags |= BLK_MQ_F_BLOCKING;
1927     }
1928 
1929     set->ops = &null_mq_ops;
1930     set->cmd_size   = sizeof(struct nullb_cmd);
1931     set->flags = flags;
1932     set->driver_data = nullb;
1933     set->nr_hw_queues = hw_queues;
1934     set->queue_depth = queue_depth;
1935     set->numa_node = numa_node;
1936     if (poll_queues) {
1937         set->nr_hw_queues += poll_queues;
1938         set->nr_maps = 3;
1939     } else {
1940         set->nr_maps = 1;
1941     }
1942 
1943     return blk_mq_alloc_tag_set(set);
1944 }
1945 
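/*
 * Sanitize a device configuration before bringing the device up: clamp the
 * block size, submit/poll queue counts, cache size and bandwidth limit,
 * force blocking mode for memory-backed devices, and reject a zone size that
 * is not a power of two.
 */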
1946 static int null_validate_conf(struct nullb_device *dev)
1947 {
1948     dev->blocksize = round_down(dev->blocksize, 512);
1949     dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
1950 
1951     if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
1952         if (dev->submit_queues != nr_online_nodes)
1953             dev->submit_queues = nr_online_nodes;
1954     } else if (dev->submit_queues > nr_cpu_ids)
1955         dev->submit_queues = nr_cpu_ids;
1956     else if (dev->submit_queues == 0)
1957         dev->submit_queues = 1;
1958     dev->prev_submit_queues = dev->submit_queues;
1959 
1960     if (dev->poll_queues > g_poll_queues)
1961         dev->poll_queues = g_poll_queues;
1962     dev->prev_poll_queues = dev->poll_queues;
1963 
1964     dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ);
1965     dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER);
1966 
1967     /* Memory-backed devices allocate memory in the I/O path, so set blocking */
1968     if (dev->memory_backed)
1969         dev->blocking = true;
1970     else /* a cache is meaningless without memory backing */
1971         dev->cache_size = 0;
1972     dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024,
1973                         dev->cache_size);
1974     dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps);
1975     /* a BIO-based queue cannot be stopped, so bandwidth throttling is unsupported */
1976     if (dev->queue_mode == NULL_Q_BIO)
1977         dev->mbps = 0;
1978 
1979     if (dev->zoned &&
1980         (!dev->zone_size || !is_power_of_2(dev->zone_size))) {
1981         pr_err("zone_size must be power-of-two\n");
1982         return -EINVAL;
1983     }
1984 
1985     return 0;
1986 }
1987 
1988 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
1989 static bool __null_setup_fault(struct fault_attr *attr, char *str)
1990 {
1991     if (!str[0])
1992         return true;
1993 
1994     if (!setup_fault_attr(attr, str))
1995         return false;
1996 
1997     attr->verbose = 0;
1998     return true;
1999 }
2000 #endif
2001 
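/*
 * Parse the timeout, requeue and init_hctx fault-injection strings when fault
 * injection is built in; always succeeds otherwise.
 */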
2002 static bool null_setup_fault(void)
2003 {
2004 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
2005     if (!__null_setup_fault(&null_timeout_attr, g_timeout_str))
2006         return false;
2007     if (!__null_setup_fault(&null_requeue_attr, g_requeue_str))
2008         return false;
2009     if (!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str))
2010         return false;
2011 #endif
2012     return true;
2013 }
2014 
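/*
 * Create one nullb device from a validated configuration: allocate the nullb,
 * set up its queues, tag set and gendisk, apply the throttling, cache, zoned
 * and discard settings, assign an index and register the disk. On failure,
 * everything is unwound in reverse order.
 */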
2015 static int null_add_dev(struct nullb_device *dev)
2016 {
2017     struct nullb *nullb;
2018     int rv;
2019 
2020     rv = null_validate_conf(dev);
2021     if (rv)
2022         return rv;
2023 
2024     nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node);
2025     if (!nullb) {
2026         rv = -ENOMEM;
2027         goto out;
2028     }
2029     nullb->dev = dev;
2030     dev->nullb = nullb;
2031 
2032     spin_lock_init(&nullb->lock);
2033 
2034     rv = setup_queues(nullb);
2035     if (rv)
2036         goto out_free_nullb;
2037 
2038     if (dev->queue_mode == NULL_Q_MQ) {
2039         if (shared_tags) {
2040             nullb->tag_set = &tag_set;
2041             rv = 0;
2042         } else {
2043             nullb->tag_set = &nullb->__tag_set;
2044             rv = null_init_tag_set(nullb, nullb->tag_set);
2045         }
2046 
2047         if (rv)
2048             goto out_cleanup_queues;
2049 
2050         if (!null_setup_fault())
2051             goto out_cleanup_tags;
2052 
2053         nullb->tag_set->timeout = 5 * HZ;
2054         nullb->disk = blk_mq_alloc_disk(nullb->tag_set, nullb);
2055         if (IS_ERR(nullb->disk)) {
2056             rv = PTR_ERR(nullb->disk);
2057             goto out_cleanup_tags;
2058         }
2059         nullb->q = nullb->disk->queue;
2060     } else if (dev->queue_mode == NULL_Q_BIO) {
2061         rv = -ENOMEM;
2062         nullb->disk = blk_alloc_disk(nullb->dev->home_node);
2063         if (!nullb->disk)
2064             goto out_cleanup_queues;
2065 
2066         nullb->q = nullb->disk->queue;
2067         rv = init_driver_queues(nullb);
2068         if (rv)
2069             goto out_cleanup_disk;
2070     }
2071 
2072     if (dev->mbps) {
2073         set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags);
2074         nullb_setup_bwtimer(nullb);
2075     }
2076 
2077     if (dev->cache_size > 0) {
2078         set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
2079         blk_queue_write_cache(nullb->q, true, true);
2080     }
2081 
2082     if (dev->zoned) {
2083         rv = null_init_zoned_dev(dev, nullb->q);
2084         if (rv)
2085             goto out_cleanup_disk;
2086     }
2087 
2088     nullb->q->queuedata = nullb;
2089     blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
2090     blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);
2091 
2092     mutex_lock(&lock);
2093     rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
2094     if (rv < 0) {
2095         mutex_unlock(&lock);
2096         goto out_cleanup_zone;
2097     }
2098     nullb->index = rv;
2099     dev->index = rv;
2100     mutex_unlock(&lock);
2101 
2102     blk_queue_logical_block_size(nullb->q, dev->blocksize);
2103     blk_queue_physical_block_size(nullb->q, dev->blocksize);
2104     if (!dev->max_sectors)
2105         dev->max_sectors = queue_max_hw_sectors(nullb->q);
2106     dev->max_sectors = min_t(unsigned int, dev->max_sectors,
2107                  BLK_DEF_MAX_SECTORS);
2108     blk_queue_max_hw_sectors(nullb->q, dev->max_sectors);
2109 
2110     if (dev->virt_boundary)
2111         blk_queue_virt_boundary(nullb->q, PAGE_SIZE - 1);
2112 
2113     null_config_discard(nullb);
2114 
2115     if (config_item_name(&dev->item)) {
2116         /* Use configfs dir name as the device name */
2117         snprintf(nullb->disk_name, sizeof(nullb->disk_name),
2118              "%s", config_item_name(&dev->item));
2119     } else {
2120         sprintf(nullb->disk_name, "nullb%d", nullb->index);
2121     }
2122 
2123     rv = null_gendisk_register(nullb);
2124     if (rv)
2125         goto out_ida_free;
2126 
2127     mutex_lock(&lock);
2128     list_add_tail(&nullb->list, &nullb_list);
2129     mutex_unlock(&lock);
2130 
2131     pr_info("disk %s created\n", nullb->disk_name);
2132 
2133     return 0;
2134 
2135 out_ida_free:
2136     ida_free(&nullb_indexes, nullb->index);
2137 out_cleanup_zone:
2138     null_free_zoned_dev(dev);
2139 out_cleanup_disk:
2140     put_disk(nullb->disk);
2141 out_cleanup_tags:
2142     if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
2143         blk_mq_free_tag_set(nullb->tag_set);
2144 out_cleanup_queues:
2145     cleanup_queues(nullb);
2146 out_free_nullb:
2147     kfree(nullb);
2148     dev->nullb = NULL;
2149 out:
2150     return rv;
2151 }
2152 
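/* Find a live device by disk name, under the global lock. */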
2153 static struct nullb *null_find_dev_by_name(const char *name)
2154 {
2155     struct nullb *nullb = NULL, *nb;
2156 
2157     mutex_lock(&lock);
2158     list_for_each_entry(nb, &nullb_list, list) {
2159         if (strcmp(nb->disk_name, name) == 0) {
2160             nullb = nb;
2161             break;
2162         }
2163     }
2164     mutex_unlock(&lock);
2165 
2166     return nullb;
2167 }
2168 
2169 static int null_create_dev(void)
2170 {
2171     struct nullb_device *dev;
2172     int ret;
2173 
2174     dev = null_alloc_dev();
2175     if (!dev)
2176         return -ENOMEM;
2177 
2178     ret = null_add_dev(dev);
2179     if (ret) {
2180         null_free_dev(dev);
2181         return ret;
2182     }
2183 
2184     return 0;
2185 }
2186 
2187 static void null_destroy_dev(struct nullb *nullb)
2188 {
2189     struct nullb_device *dev = nullb->dev;
2190 
2191     null_del_dev(nullb);
2192     null_free_dev(dev);
2193 }
2194 
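/*
 * Module load: validate the global parameters, allocate the shared tag set if
 * requested, register the configfs subsystem and the block major, then create
 * the initial nr_devices devices.
 */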
2195 static int __init null_init(void)
2196 {
2197     int ret = 0;
2198     unsigned int i;
2199     struct nullb *nullb;
2200 
2201     if (g_bs > PAGE_SIZE) {
2202         pr_warn("invalid block size\n");
2203         pr_warn("defaults block size to %lu\n", PAGE_SIZE);
2204         g_bs = PAGE_SIZE;
2205     }
2206 
2207     if (g_max_sectors > BLK_DEF_MAX_SECTORS) {
2208         pr_warn("invalid max sectors\n");
2209         pr_warn("defaults max sectors to %u\n", BLK_DEF_MAX_SECTORS);
2210         g_max_sectors = BLK_DEF_MAX_SECTORS;
2211     }
2212 
2213     if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) {
2214         pr_err("invalid home_node value\n");
2215         g_home_node = NUMA_NO_NODE;
2216     }
2217 
2218     if (g_queue_mode == NULL_Q_RQ) {
2219         pr_err("legacy IO path is no longer available\n");
2220         return -EINVAL;
2221     }
2222 
2223     if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
2224         if (g_submit_queues != nr_online_nodes) {
2225             pr_warn("submit_queues param is set to %u.\n",
2226                 nr_online_nodes);
2227             g_submit_queues = nr_online_nodes;
2228         }
2229     } else if (g_submit_queues > nr_cpu_ids) {
2230         g_submit_queues = nr_cpu_ids;
2231     } else if (g_submit_queues <= 0) {
2232         g_submit_queues = 1;
2233     }
2234 
2235     if (g_queue_mode == NULL_Q_MQ && shared_tags) {
2236         ret = null_init_tag_set(NULL, &tag_set);
2237         if (ret)
2238             return ret;
2239     }
2240 
2241     config_group_init(&nullb_subsys.su_group);
2242     mutex_init(&nullb_subsys.su_mutex);
2243 
2244     ret = configfs_register_subsystem(&nullb_subsys);
2245     if (ret)
2246         goto err_tagset;
2247 
2248     mutex_init(&lock);
2249 
2250     null_major = register_blkdev(0, "nullb");
2251     if (null_major < 0) {
2252         ret = null_major;
2253         goto err_conf;
2254     }
2255 
2256     for (i = 0; i < nr_devices; i++) {
2257         ret = null_create_dev();
2258         if (ret)
2259             goto err_dev;
2260     }
2261 
2262     pr_info("module loaded\n");
2263     return 0;
2264 
2265 err_dev:
2266     while (!list_empty(&nullb_list)) {
2267         nullb = list_entry(nullb_list.next, struct nullb, list);
2268         null_destroy_dev(nullb);
2269     }
2270     unregister_blkdev(null_major, "nullb");
2271 err_conf:
2272     configfs_unregister_subsystem(&nullb_subsys);
2273 err_tagset:
2274     if (g_queue_mode == NULL_Q_MQ && shared_tags)
2275         blk_mq_free_tag_set(&tag_set);
2276     return ret;
2277 }
2278 
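/*
 * Module unload: unregister configfs and the block major, destroy every
 * remaining device, and free the shared tag set if one was used.
 */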
2279 static void __exit null_exit(void)
2280 {
2281     struct nullb *nullb;
2282 
2283     configfs_unregister_subsystem(&nullb_subsys);
2284 
2285     unregister_blkdev(null_major, "nullb");
2286 
2287     mutex_lock(&lock);
2288     while (!list_empty(&nullb_list)) {
2289         nullb = list_entry(nullb_list.next, struct nullb, list);
2290         null_destroy_dev(nullb);
2291     }
2292     mutex_unlock(&lock);
2293 
2294     if (g_queue_mode == NULL_Q_MQ && shared_tags)
2295         blk_mq_free_tag_set(&tag_set);
2296 }
2297 
2298 module_init(null_init);
2299 module_exit(null_exit);
2300 
2301 MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>");
2302 MODULE_LICENSE("GPL");