0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Copyright (c) 2017-2018 Christoph Hellwig.
0004  */
0005 
0006 #include <linux/backing-dev.h>
0007 #include <linux/moduleparam.h>
0008 #include <linux/vmalloc.h>
0009 #include <trace/events/block.h>
0010 #include "nvme.h"
0011 
0012 bool multipath = true;
0013 module_param(multipath, bool, 0444);
0014 MODULE_PARM_DESC(multipath,
0015     "turn on native support for multiple controllers per subsystem");
0016 
0017 static const char *nvme_iopolicy_names[] = {
0018     [NVME_IOPOLICY_NUMA]    = "numa",
0019     [NVME_IOPOLICY_RR]  = "round-robin",
0020 };
0021 
0022 static int iopolicy = NVME_IOPOLICY_NUMA;
0023 
0024 static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
0025 {
0026     if (!val)
0027         return -EINVAL;
0028     if (!strncmp(val, "numa", 4))
0029         iopolicy = NVME_IOPOLICY_NUMA;
0030     else if (!strncmp(val, "round-robin", 11))
0031         iopolicy = NVME_IOPOLICY_RR;
0032     else
0033         return -EINVAL;
0034 
0035     return 0;
0036 }
0037 
0038 static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
0039 {
0040     return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
0041 }
0042 
0043 module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
0044     &iopolicy, 0644);
0045 MODULE_PARM_DESC(iopolicy,
0046     "Default multipath I/O policy; 'numa' (default) or 'round-robin'");
0047 
0048 void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
0049 {
0050     subsys->iopolicy = iopolicy;
0051 }
0052 
0053 void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
0054 {
0055     struct nvme_ns_head *h;
0056 
0057     lockdep_assert_held(&subsys->lock);
0058     list_for_each_entry(h, &subsys->nsheads, entry)
0059         if (h->disk)
0060             blk_mq_unfreeze_queue(h->disk->queue);
0061 }
0062 
0063 void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
0064 {
0065     struct nvme_ns_head *h;
0066 
0067     lockdep_assert_held(&subsys->lock);
0068     list_for_each_entry(h, &subsys->nsheads, entry)
0069         if (h->disk)
0070             blk_mq_freeze_queue_wait(h->disk->queue);
0071 }
0072 
0073 void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
0074 {
0075     struct nvme_ns_head *h;
0076 
0077     lockdep_assert_held(&subsys->lock);
0078     list_for_each_entry(h, &subsys->nsheads, entry)
0079         if (h->disk)
0080             blk_freeze_queue_start(h->disk->queue);
0081 }
0082 
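     /*
      * Fail over a request to another path: clear the cached current path,
      * kick a re-read of the ANA log on ANA errors, move the request's bios
      * (minus any polling state) onto the ns_head requeue list, complete the
      * original request, and schedule the requeue work to retry the bios.
      */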
0083 void nvme_failover_req(struct request *req)
0084 {
0085     struct nvme_ns *ns = req->q->queuedata;
0086     u16 status = nvme_req(req)->status & 0x7ff;
0087     unsigned long flags;
0088     struct bio *bio;
0089 
0090     nvme_mpath_clear_current_path(ns);
0091 
0092     /*
0093      * If we got back an ANA error, we know the controller is alive but not
0094      * ready to serve this namespace.  Kick off a re-read of the ANA
0095      * information page, and just try any other available path for now.
0096      */
0097     if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
0098         set_bit(NVME_NS_ANA_PENDING, &ns->flags);
0099         queue_work(nvme_wq, &ns->ctrl->ana_work);
0100     }
0101 
0102     spin_lock_irqsave(&ns->head->requeue_lock, flags);
0103     for (bio = req->bio; bio; bio = bio->bi_next) {
0104         bio_set_dev(bio, ns->head->disk->part0);
0105         if (bio->bi_opf & REQ_POLLED) {
0106             bio->bi_opf &= ~REQ_POLLED;
0107             bio->bi_cookie = BLK_QC_T_NONE;
0108         }
0109     }
0110     blk_steal_bios(&ns->head->requeue_list, req);
0111     spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
0112 
0113     blk_mq_end_request(req, 0);
0114     kblockd_schedule_work(&ns->head->requeue_work);
0115 }
0116 
0117 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
0118 {
0119     struct nvme_ns *ns;
0120 
0121     down_read(&ctrl->namespaces_rwsem);
0122     list_for_each_entry(ns, &ctrl->namespaces, list) {
0123         if (!ns->head->disk)
0124             continue;
0125         kblockd_schedule_work(&ns->head->requeue_work);
0126         if (ctrl->state == NVME_CTRL_LIVE)
0127             disk_uevent(ns->head->disk, KOBJ_CHANGE);
0128     }
0129     up_read(&ctrl->namespaces_rwsem);
0130 }
0131 
0132 static const char *nvme_ana_state_names[] = {
0133     [0]             = "invalid state",
0134     [NVME_ANA_OPTIMIZED]        = "optimized",
0135     [NVME_ANA_NONOPTIMIZED]     = "non-optimized",
0136     [NVME_ANA_INACCESSIBLE]     = "inaccessible",
0137     [NVME_ANA_PERSISTENT_LOSS]  = "persistent-loss",
0138     [NVME_ANA_CHANGE]       = "change",
0139 };
0140 
0141 bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
0142 {
0143     struct nvme_ns_head *head = ns->head;
0144     bool changed = false;
0145     int node;
0146 
0147     if (!head)
0148         goto out;
0149 
0150     for_each_node(node) {
0151         if (ns == rcu_access_pointer(head->current_path[node])) {
0152             rcu_assign_pointer(head->current_path[node], NULL);
0153             changed = true;
0154         }
0155     }
0156 out:
0157     return changed;
0158 }
0159 
0160 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
0161 {
0162     struct nvme_ns *ns;
0163 
0164     down_read(&ctrl->namespaces_rwsem);
0165     list_for_each_entry(ns, &ctrl->namespaces, list) {
0166         nvme_mpath_clear_current_path(ns);
0167         kblockd_schedule_work(&ns->head->requeue_work);
0168     }
0169     up_read(&ctrl->namespaces_rwsem);
0170 }
0171 
0172 void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
0173 {
0174     struct nvme_ns_head *head = ns->head;
0175     sector_t capacity = get_capacity(head->disk);
0176     int node;
0177 
0178     list_for_each_entry_rcu(ns, &head->list, siblings) {
0179         if (capacity != get_capacity(ns->disk))
0180             clear_bit(NVME_NS_READY, &ns->flags);
0181     }
0182 
0183     for_each_node(node)
0184         rcu_assign_pointer(head->current_path[node], NULL);
0185 }
0186 
0187 static bool nvme_path_is_disabled(struct nvme_ns *ns)
0188 {
0189     /*
0190      * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
0191      * still be able to complete assuming that the controller is connected.
0192      * Otherwise it will fail immediately and return to the requeue list.
0193      */
0194     if (ns->ctrl->state != NVME_CTRL_LIVE &&
0195         ns->ctrl->state != NVME_CTRL_DELETING)
0196         return true;
0197     if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
0198         !test_bit(NVME_NS_READY, &ns->flags))
0199         return true;
0200     return false;
0201 }
0202 
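     /*
      * Find a usable path for @node and cache it in head->current_path[node].
      * With the NUMA policy, candidates are ranked by NUMA distance to the
      * submitting node; otherwise all candidates rank equally.  Optimized
      * paths are preferred, non-optimized paths serve as a fallback.
      */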
0203 static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
0204 {
0205     int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
0206     struct nvme_ns *found = NULL, *fallback = NULL, *ns;
0207 
0208     list_for_each_entry_rcu(ns, &head->list, siblings) {
0209         if (nvme_path_is_disabled(ns))
0210             continue;
0211 
0212         if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
0213             distance = node_distance(node, ns->ctrl->numa_node);
0214         else
0215             distance = LOCAL_DISTANCE;
0216 
0217         switch (ns->ana_state) {
0218         case NVME_ANA_OPTIMIZED:
0219             if (distance < found_distance) {
0220                 found_distance = distance;
0221                 found = ns;
0222             }
0223             break;
0224         case NVME_ANA_NONOPTIMIZED:
0225             if (distance < fallback_distance) {
0226                 fallback_distance = distance;
0227                 fallback = ns;
0228             }
0229             break;
0230         default:
0231             break;
0232         }
0233     }
0234 
0235     if (!found)
0236         found = fallback;
0237     if (found)
0238         rcu_assign_pointer(head->current_path[node], found);
0239     return found;
0240 }
0241 
0242 static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
0243         struct nvme_ns *ns)
0244 {
0245     ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
0246             siblings);
0247     if (ns)
0248         return ns;
0249     return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
0250 }
0251 
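     /*
      * Round-robin selection: walk the sibling list starting after @old and
      * return the first enabled optimized path, remembering an enabled
      * non-optimized path as a fallback.  The old path is reused when it is
      * still optimized or when no other usable path was found.
      */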
0252 static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
0253         int node, struct nvme_ns *old)
0254 {
0255     struct nvme_ns *ns, *found = NULL;
0256 
0257     if (list_is_singular(&head->list)) {
0258         if (nvme_path_is_disabled(old))
0259             return NULL;
0260         return old;
0261     }
0262 
0263     for (ns = nvme_next_ns(head, old);
0264          ns && ns != old;
0265          ns = nvme_next_ns(head, ns)) {
0266         if (nvme_path_is_disabled(ns))
0267             continue;
0268 
0269         if (ns->ana_state == NVME_ANA_OPTIMIZED) {
0270             found = ns;
0271             goto out;
0272         }
0273         if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
0274             found = ns;
0275     }
0276 
0277     /*
0278      * The loop above skips the current path for round-robin semantics.
0279      * Fall back to the current path if either:
0280      *  - no other optimized path found and current is optimized,
0281      *  - no other usable path found and current is usable.
0282      */
0283     if (!nvme_path_is_disabled(old) &&
0284         (old->ana_state == NVME_ANA_OPTIMIZED ||
0285          (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
0286         return old;
0287 
0288     if (!found)
0289         return NULL;
0290 out:
0291     rcu_assign_pointer(head->current_path[node], found);
0292     return found;
0293 }
0294 
0295 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
0296 {
0297     return ns->ctrl->state == NVME_CTRL_LIVE &&
0298         ns->ana_state == NVME_ANA_OPTIMIZED;
0299 }
0300 
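     /*
      * Return the path to use for the local NUMA node, using the cached
      * current_path when it is still optimized.  Falls back to a full search
      * if there is no cached path or it is no longer optimized, and defers to
      * nvme_round_robin_path() when the round-robin policy is in effect.
      * Callers must hold head->srcu.
      */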
0301 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
0302 {
0303     int node = numa_node_id();
0304     struct nvme_ns *ns;
0305 
0306     ns = srcu_dereference(head->current_path[node], &head->srcu);
0307     if (unlikely(!ns))
0308         return __nvme_find_path(head, node);
0309 
0310     if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
0311         return nvme_round_robin_path(head, node, ns);
0312     if (unlikely(!nvme_path_is_optimized(ns)))
0313         return __nvme_find_path(head, node);
0314     return ns;
0315 }
0316 
0317 static bool nvme_available_path(struct nvme_ns_head *head)
0318 {
0319     struct nvme_ns *ns;
0320 
0321     list_for_each_entry_rcu(ns, &head->list, siblings) {
0322         if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
0323             continue;
0324         switch (ns->ctrl->state) {
0325         case NVME_CTRL_LIVE:
0326         case NVME_CTRL_RESETTING:
0327         case NVME_CTRL_CONNECTING:
0328             /* fallthru */
0329             return true;
0330         default:
0331             break;
0332         }
0333     }
0334     return false;
0335 }
0336 
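     /*
      * submit_bio entry point for the multipath (head) node: pick a path
      * under SRCU and remap the bio to it, park the bio on the requeue list
      * if no path is usable right now but one may recover, or fail the bio
      * if no path is available at all.
      */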
0337 static void nvme_ns_head_submit_bio(struct bio *bio)
0338 {
0339     struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
0340     struct device *dev = disk_to_dev(head->disk);
0341     struct nvme_ns *ns;
0342     int srcu_idx;
0343 
0344     /*
0345      * The namespace might be going away and the bio might be moved to a
0346      * different queue via blk_steal_bios(), so we need to use the bio_split
0347      * pool from the original queue to allocate the bvecs from.
0348      */
0349     bio = bio_split_to_limits(bio);
0350 
0351     srcu_idx = srcu_read_lock(&head->srcu);
0352     ns = nvme_find_path(head);
0353     if (likely(ns)) {
0354         bio_set_dev(bio, ns->disk->part0);
0355         bio->bi_opf |= REQ_NVME_MPATH;
0356         trace_block_bio_remap(bio, disk_devt(ns->head->disk),
0357                       bio->bi_iter.bi_sector);
0358         submit_bio_noacct(bio);
0359     } else if (nvme_available_path(head)) {
0360         dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
0361 
0362         spin_lock_irq(&head->requeue_lock);
0363         bio_list_add(&head->requeue_list, bio);
0364         spin_unlock_irq(&head->requeue_lock);
0365     } else {
0366         dev_warn_ratelimited(dev, "no available path - failing I/O\n");
0367 
0368         bio_io_error(bio);
0369     }
0370 
0371     srcu_read_unlock(&head->srcu, srcu_idx);
0372 }
0373 
0374 static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
0375 {
0376     if (!nvme_tryget_ns_head(bdev->bd_disk->private_data))
0377         return -ENXIO;
0378     return 0;
0379 }
0380 
0381 static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
0382 {
0383     nvme_put_ns_head(disk->private_data);
0384 }
0385 
0386 #ifdef CONFIG_BLK_DEV_ZONED
0387 static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
0388         unsigned int nr_zones, report_zones_cb cb, void *data)
0389 {
0390     struct nvme_ns_head *head = disk->private_data;
0391     struct nvme_ns *ns;
0392     int srcu_idx, ret = -EWOULDBLOCK;
0393 
0394     srcu_idx = srcu_read_lock(&head->srcu);
0395     ns = nvme_find_path(head);
0396     if (ns)
0397         ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
0398     srcu_read_unlock(&head->srcu, srcu_idx);
0399     return ret;
0400 }
0401 #else
0402 #define nvme_ns_head_report_zones   NULL
0403 #endif /* CONFIG_BLK_DEV_ZONED */
0404 
0405 const struct block_device_operations nvme_ns_head_ops = {
0406     .owner      = THIS_MODULE,
0407     .submit_bio = nvme_ns_head_submit_bio,
0408     .open       = nvme_ns_head_open,
0409     .release    = nvme_ns_head_release,
0410     .ioctl      = nvme_ns_head_ioctl,
0411     .compat_ioctl   = blkdev_compat_ptr_ioctl,
0412     .getgeo     = nvme_getgeo,
0413     .report_zones   = nvme_ns_head_report_zones,
0414     .pr_ops     = &nvme_pr_ops,
0415 };
0416 
0417 static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
0418 {
0419     return container_of(cdev, struct nvme_ns_head, cdev);
0420 }
0421 
0422 static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
0423 {
0424     if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
0425         return -ENXIO;
0426     return 0;
0427 }
0428 
0429 static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
0430 {
0431     nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
0432     return 0;
0433 }
0434 
0435 static const struct file_operations nvme_ns_head_chr_fops = {
0436     .owner      = THIS_MODULE,
0437     .open       = nvme_ns_head_chr_open,
0438     .release    = nvme_ns_head_chr_release,
0439     .unlocked_ioctl = nvme_ns_head_chr_ioctl,
0440     .compat_ioctl   = compat_ptr_ioctl,
0441     .uring_cmd  = nvme_ns_head_chr_uring_cmd,
0442 };
0443 
0444 static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
0445 {
0446     int ret;
0447 
0448     head->cdev_device.parent = &head->subsys->dev;
0449     ret = dev_set_name(&head->cdev_device, "ng%dn%d",
0450                head->subsys->instance, head->instance);
0451     if (ret)
0452         return ret;
0453     ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
0454                 &nvme_ns_head_chr_fops, THIS_MODULE);
0455     return ret;
0456 }
0457 
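     /*
      * Flush the ns_head requeue list: resubmit each parked bio so it passes
      * through nvme_ns_head_submit_bio() again and gets a freshly selected
      * path.
      */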
0458 static void nvme_requeue_work(struct work_struct *work)
0459 {
0460     struct nvme_ns_head *head =
0461         container_of(work, struct nvme_ns_head, requeue_work);
0462     struct bio *bio, *next;
0463 
0464     spin_lock_irq(&head->requeue_lock);
0465     next = bio_list_get(&head->requeue_list);
0466     spin_unlock_irq(&head->requeue_lock);
0467 
0468     while ((bio = next) != NULL) {
0469         next = bio->bi_next;
0470         bio->bi_next = NULL;
0471 
0472         submit_bio_noacct(bio);
0473     }
0474 }
0475 
0476 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
0477 {
0478     bool vwc = false;
0479 
0480     mutex_init(&head->lock);
0481     bio_list_init(&head->requeue_list);
0482     spin_lock_init(&head->requeue_lock);
0483     INIT_WORK(&head->requeue_work, nvme_requeue_work);
0484 
0485     /*
0486      * Add a multipath node if the subsystem supports multiple controllers.
0487      * We also do this for private namespaces as the namespace sharing flag
0488      * could change after a rescan.
0489      */
0490     if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
0491         !nvme_is_unique_nsid(ctrl, head) || !multipath)
0492         return 0;
0493 
0494     head->disk = blk_alloc_disk(ctrl->numa_node);
0495     if (!head->disk)
0496         return -ENOMEM;
0497     head->disk->fops = &nvme_ns_head_ops;
0498     head->disk->private_data = head;
0499     sprintf(head->disk->disk_name, "nvme%dn%d",
0500             ctrl->subsys->instance, head->instance);
0501 
0502     blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
0503     blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
0504     /*
0505      * This assumes all controllers that refer to a namespace either
0506      * support poll queues or not.  That is not a strict guarantee,
0507      * but if the assumption is wrong the effect is only suboptimal
0508      * performance, not a correctness problem.
0509      */
0510     if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL &&
0511         ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
0512         blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);
0513 
0514     /* set to a default value of 512 until the disk is validated */
0515     blk_queue_logical_block_size(head->disk->queue, 512);
0516     blk_set_stacking_limits(&head->disk->queue->limits);
0517 
0518     /* we need to propagate up the VWC settings */
0519     if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
0520         vwc = true;
0521     blk_queue_write_cache(head->disk->queue, vwc, vwc);
0522     return 0;
0523 }
0524 
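     /*
      * Mark this path as usable for the multipath node: register the head
      * gendisk and char device on first use (NVME_NSHEAD_DISK_LIVE guards
      * against concurrent registration), pre-populate the per-node path
      * cache if this path is optimized, and kick the requeue work so any
      * parked I/O can make progress.
      */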
0525 static void nvme_mpath_set_live(struct nvme_ns *ns)
0526 {
0527     struct nvme_ns_head *head = ns->head;
0528     int rc;
0529 
0530     if (!head->disk)
0531         return;
0532 
0533     /*
0534      * test_and_set_bit() is used because it is protecting against two nvme
0535      * paths simultaneously calling device_add_disk() on the same namespace
0536      * head.
0537      */
0538     if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
0539         rc = device_add_disk(&head->subsys->dev, head->disk,
0540                      nvme_ns_id_attr_groups);
0541         if (rc) {
0542             clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
0543             return;
0544         }
0545         nvme_add_ns_head_cdev(head);
0546     }
0547 
0548     mutex_lock(&head->lock);
0549     if (nvme_path_is_optimized(ns)) {
0550         int node, srcu_idx;
0551 
0552         srcu_idx = srcu_read_lock(&head->srcu);
0553         for_each_node(node)
0554             __nvme_find_path(head, node);
0555         srcu_read_unlock(&head->srcu, srcu_idx);
0556     }
0557     mutex_unlock(&head->lock);
0558 
0559     synchronize_srcu(&head->srcu);
0560     kblockd_schedule_work(&head->requeue_work);
0561 }
0562 
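     /*
      * Walk the ANA log page in ctrl->ana_log_buf and invoke @cb for each
      * group descriptor, sanity checking every descriptor against the log
      * size and the controller limits.  Must be called with ctrl->ana_lock
      * held.
      */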
0563 static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
0564         int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
0565             void *))
0566 {
0567     void *base = ctrl->ana_log_buf;
0568     size_t offset = sizeof(struct nvme_ana_rsp_hdr);
0569     int error, i;
0570 
0571     lockdep_assert_held(&ctrl->ana_lock);
0572 
0573     for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
0574         struct nvme_ana_group_desc *desc = base + offset;
0575         u32 nr_nsids;
0576         size_t nsid_buf_size;
0577 
0578         if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
0579             return -EINVAL;
0580 
0581         nr_nsids = le32_to_cpu(desc->nnsids);
0582         nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);
0583 
0584         if (WARN_ON_ONCE(desc->grpid == 0))
0585             return -EINVAL;
0586         if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
0587             return -EINVAL;
0588         if (WARN_ON_ONCE(desc->state == 0))
0589             return -EINVAL;
0590         if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
0591             return -EINVAL;
0592 
0593         offset += sizeof(*desc);
0594         if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
0595             return -EINVAL;
0596 
0597         error = cb(ctrl, desc, data);
0598         if (error)
0599             return error;
0600 
0601         offset += nsid_buf_size;
0602     }
0603 
0604     return 0;
0605 }
0606 
0607 static inline bool nvme_state_is_live(enum nvme_ana_state state)
0608 {
0609     return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
0610 }
0611 
0612 static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
0613         struct nvme_ns *ns)
0614 {
0615     ns->ana_grpid = le32_to_cpu(desc->grpid);
0616     ns->ana_state = desc->state;
0617     clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
0618     /*
0619      * nvme_mpath_set_live() will trigger I/O to the multipath path device
0620      * and in turn to this path device.  However we cannot accept this I/O
0621      * if the controller is not live.  This may deadlock if called from
0622      * nvme_mpath_init_identify() and the ctrl will never complete
0623      * initialization, preventing I/O from completing.  For this case we
0624      * will reprocess the ANA log page in nvme_mpath_update() once the
0625      * controller is ready.
0626      */
0627     if (nvme_state_is_live(ns->ana_state) &&
0628         ns->ctrl->state == NVME_CTRL_LIVE)
0629         nvme_mpath_set_live(ns);
0630 }
0631 
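     /*
      * Per-group callback for nvme_parse_ana_log(): apply the descriptor's
      * state to every namespace listed in it.  The single forward scan
      * relies on both ctrl->namespaces and desc->nsids being ordered by
      * namespace ID.
      */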
0632 static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
0633         struct nvme_ana_group_desc *desc, void *data)
0634 {
0635     u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
0636     unsigned *nr_change_groups = data;
0637     struct nvme_ns *ns;
0638 
0639     dev_dbg(ctrl->device, "ANA group %d: %s.\n",
0640             le32_to_cpu(desc->grpid),
0641             nvme_ana_state_names[desc->state]);
0642 
0643     if (desc->state == NVME_ANA_CHANGE)
0644         (*nr_change_groups)++;
0645 
0646     if (!nr_nsids)
0647         return 0;
0648 
0649     down_read(&ctrl->namespaces_rwsem);
0650     list_for_each_entry(ns, &ctrl->namespaces, list) {
0651         unsigned nsid;
0652 again:
0653         nsid = le32_to_cpu(desc->nsids[n]);
0654         if (ns->head->ns_id < nsid)
0655             continue;
0656         if (ns->head->ns_id == nsid)
0657             nvme_update_ns_ana_state(desc, ns);
0658         if (++n == nr_nsids)
0659             break;
0660         if (ns->head->ns_id > nsid)
0661             goto again;
0662     }
0663     up_read(&ctrl->namespaces_rwsem);
0664     return 0;
0665 }
0666 
0667 static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
0668 {
0669     u32 nr_change_groups = 0;
0670     int error;
0671 
0672     mutex_lock(&ctrl->ana_lock);
0673     error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
0674             ctrl->ana_log_buf, ctrl->ana_log_size, 0);
0675     if (error) {
0676         dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
0677         goto out_unlock;
0678     }
0679 
0680     error = nvme_parse_ana_log(ctrl, &nr_change_groups,
0681             nvme_update_ana_state);
0682     if (error)
0683         goto out_unlock;
0684 
0685     /*
0686      * In theory we should have an ANATT timer per group as they might enter
0687      * the change state at different times.  But that is a lot of overhead
0688      * just to protect against a target that keeps entering new change
0689      * states while never finishing previous ones.  We'll still
0690      * eventually time out once all groups are in change state, so this
0691      * isn't a big deal.
0692      *
0693      * We also double the ANATT value to provide some slack for transports
0694      * or AEN processing overhead.
0695      */
0696     if (nr_change_groups)
0697         mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
0698     else
0699         del_timer_sync(&ctrl->anatt_timer);
0700 out_unlock:
0701     mutex_unlock(&ctrl->ana_lock);
0702     return error;
0703 }
0704 
0705 static void nvme_ana_work(struct work_struct *work)
0706 {
0707     struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
0708 
0709     if (ctrl->state != NVME_CTRL_LIVE)
0710         return;
0711 
0712     nvme_read_ana_log(ctrl);
0713 }
0714 
0715 void nvme_mpath_update(struct nvme_ctrl *ctrl)
0716 {
0717     u32 nr_change_groups = 0;
0718 
0719     if (!ctrl->ana_log_buf)
0720         return;
0721 
0722     mutex_lock(&ctrl->ana_lock);
0723     nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
0724     mutex_unlock(&ctrl->ana_lock);
0725 }
0726 
0727 static void nvme_anatt_timeout(struct timer_list *t)
0728 {
0729     struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
0730 
0731     dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
0732     nvme_reset_ctrl(ctrl);
0733 }
0734 
0735 void nvme_mpath_stop(struct nvme_ctrl *ctrl)
0736 {
0737     if (!nvme_ctrl_use_ana(ctrl))
0738         return;
0739     del_timer_sync(&ctrl->anatt_timer);
0740     cancel_work_sync(&ctrl->ana_work);
0741 }
0742 
0743 #define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
0744     struct device_attribute subsys_attr_##_name =   \
0745         __ATTR(_name, _mode, _show, _store)
0746 
0747 static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
0748         struct device_attribute *attr, char *buf)
0749 {
0750     struct nvme_subsystem *subsys =
0751         container_of(dev, struct nvme_subsystem, dev);
0752 
0753     return sysfs_emit(buf, "%s\n",
0754               nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
0755 }
0756 
0757 static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
0758         struct device_attribute *attr, const char *buf, size_t count)
0759 {
0760     struct nvme_subsystem *subsys =
0761         container_of(dev, struct nvme_subsystem, dev);
0762     int i;
0763 
0764     for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
0765         if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
0766             WRITE_ONCE(subsys->iopolicy, i);
0767             return count;
0768         }
0769     }
0770 
0771     return -EINVAL;
0772 }
0773 SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
0774               nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
0775 
0776 static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
0777         char *buf)
0778 {
0779     return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
0780 }
0781 DEVICE_ATTR_RO(ana_grpid);
0782 
0783 static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
0784         char *buf)
0785 {
0786     struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
0787 
0788     return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
0789 }
0790 DEVICE_ATTR_RO(ana_state);
0791 
0792 static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
0793         struct nvme_ana_group_desc *desc, void *data)
0794 {
0795     struct nvme_ana_group_desc *dst = data;
0796 
0797     if (desc->grpid != dst->grpid)
0798         return 0;
0799 
0800     *dst = *desc;
0801     return -ENXIO; /* just break out of the loop */
0802 }
0803 
0804 void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
0805 {
0806     if (nvme_ctrl_use_ana(ns->ctrl)) {
0807         struct nvme_ana_group_desc desc = {
0808             .grpid = anagrpid,
0809             .state = 0,
0810         };
0811 
0812         mutex_lock(&ns->ctrl->ana_lock);
0813         ns->ana_grpid = le32_to_cpu(anagrpid);
0814         nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
0815         mutex_unlock(&ns->ctrl->ana_lock);
0816         if (desc.state) {
0817             /* found the group desc: update */
0818             nvme_update_ns_ana_state(&desc, ns);
0819         } else {
0820             /* group desc not found: trigger a re-read */
0821             set_bit(NVME_NS_ANA_PENDING, &ns->flags);
0822             queue_work(nvme_wq, &ns->ctrl->ana_work);
0823         }
0824     } else {
0825         ns->ana_state = NVME_ANA_OPTIMIZED;
0826         nvme_mpath_set_live(ns);
0827     }
0828 
0829     if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
0830         blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
0831                    ns->head->disk->queue);
0832 #ifdef CONFIG_BLK_DEV_ZONED
0833     if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
0834         ns->head->disk->nr_zones = ns->disk->nr_zones;
0835 #endif
0836 }
0837 
0838 void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
0839 {
0840     if (!head->disk)
0841         return;
0842     kblockd_schedule_work(&head->requeue_work);
0843     if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
0844         nvme_cdev_del(&head->cdev, &head->cdev_device);
0845         del_gendisk(head->disk);
0846     }
0847 }
0848 
0849 void nvme_mpath_remove_disk(struct nvme_ns_head *head)
0850 {
0851     if (!head->disk)
0852         return;
0853     blk_mark_disk_dead(head->disk);
0854     /* make sure all pending bios are cleaned up */
0855     kblockd_schedule_work(&head->requeue_work);
0856     flush_work(&head->requeue_work);
0857     put_disk(head->disk);
0858 }
0859 
0860 void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
0861 {
0862     mutex_init(&ctrl->ana_lock);
0863     timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
0864     INIT_WORK(&ctrl->ana_work, nvme_ana_work);
0865 }
0866 
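     /*
      * Set up ANA support from the Identify Controller data: validate MNAN,
      * record the ANA capabilities and limits, size and (re)allocate the ANA
      * log buffer, and read the log once.  Returns 0 with ANA left disabled
      * when multipath or ANA is not supported, or when the log would exceed
      * MDTS.
      */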
0867 int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
0868 {
0869     size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
0870     size_t ana_log_size;
0871     int error = 0;
0872 
0873     /* check if multipath is enabled and we have the capability */
0874     if (!multipath || !ctrl->subsys ||
0875         !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
0876         return 0;
0877 
0878     if (!ctrl->max_namespaces ||
0879         ctrl->max_namespaces > le32_to_cpu(id->nn)) {
0880         dev_err(ctrl->device,
0881             "Invalid MNAN value %u\n", ctrl->max_namespaces);
0882         return -EINVAL;
0883     }
0884 
0885     ctrl->anacap = id->anacap;
0886     ctrl->anatt = id->anatt;
0887     ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
0888     ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
0889 
0890     ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
0891         ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
0892         ctrl->max_namespaces * sizeof(__le32);
0893     if (ana_log_size > max_transfer_size) {
0894         dev_err(ctrl->device,
0895             "ANA log page size (%zd) larger than MDTS (%zd).\n",
0896             ana_log_size, max_transfer_size);
0897         dev_err(ctrl->device, "disabling ANA support.\n");
0898         goto out_uninit;
0899     }
0900     if (ana_log_size > ctrl->ana_log_size) {
0901         nvme_mpath_stop(ctrl);
0902         nvme_mpath_uninit(ctrl);
0903         ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
0904         if (!ctrl->ana_log_buf)
0905             return -ENOMEM;
0906     }
0907     ctrl->ana_log_size = ana_log_size;
0908     error = nvme_read_ana_log(ctrl);
0909     if (error)
0910         goto out_uninit;
0911     return 0;
0912 
0913 out_uninit:
0914     nvme_mpath_uninit(ctrl);
0915     return error;
0916 }
0917 
0918 void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
0919 {
0920     kvfree(ctrl->ana_log_buf);
0921     ctrl->ana_log_buf = NULL;
0922     ctrl->ana_log_size = 0;
0923 }