// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2018 Christoph Hellwig.
 */
#include <linux/backing-dev.h>
#include <linux/moduleparam.h>
#include <linux/vmalloc.h>
#include <trace/events/block.h>
#include "nvme.h"

bool multipath = true;
module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");
static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= "numa",
	[NVME_IOPOLICY_RR]	= "round-robin",
};

static int iopolicy = NVME_IOPOLICY_NUMA;

static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
{
	if (!val)
		return -EINVAL;
	if (!strncmp(val, "numa", 4))
		iopolicy = NVME_IOPOLICY_NUMA;
	else if (!strncmp(val, "round-robin", 11))
		iopolicy = NVME_IOPOLICY_RR;
	else
		return -EINVAL;

	return 0;
}

static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
{
	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
}

module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
	&iopolicy, 0644);
MODULE_PARM_DESC(iopolicy,
	"Default multipath I/O policy; 'numa' (default) or 'round-robin'");
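
/*
 * Illustrative usage only (assumes this file is built into the nvme-core
 * module, so the parameter shows up under nvme_core): the default policy can
 * be picked at load time or changed later through the module parameter, e.g.:
 *
 *	modprobe nvme_core iopolicy=round-robin
 *	echo numa > /sys/module/nvme_core/parameters/iopolicy
 */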

void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
	subsys->iopolicy = iopolicy;
}

void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_unfreeze_queue(h->disk->queue);
}

void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_freeze_queue_wait(h->disk->queue);
}

void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_freeze_queue_start(h->disk->queue);
}

void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status & 0x7ff;
	unsigned long flags;
	struct bio *bio;

	nvme_mpath_clear_current_path(ns);

	/*
	 * If we got back an ANA error, we know the controller is alive but
	 * not ready to serve this namespace.  Kick off a re-read of the ANA
	 * information page, and just try any other available path for now.
	 */
	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
		queue_work(nvme_wq, &ns->ctrl->ana_work);
	}

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	for (bio = req->bio; bio; bio = bio->bi_next) {
		bio_set_dev(bio, ns->head->disk->part0);
		if (bio->bi_opf & REQ_POLLED) {
			/*
			 * The requeued bio will be resubmitted from a
			 * workqueue, so it can no longer be polled for
			 * completion; clear the polling state.
			 */
			bio->bi_opf &= ~REQ_POLLED;
			bio->bi_cookie = BLK_QC_T_NONE;
		}
	}
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	blk_mq_end_request(req, 0);
	kblockd_schedule_work(&ns->head->requeue_work);
}

void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (!ns->head->disk)
			continue;
		kblockd_schedule_work(&ns->head->requeue_work);
		if (ctrl->state == NVME_CTRL_LIVE)
			disk_uevent(ns->head->disk, KOBJ_CHANGE);
	}
	up_read(&ctrl->namespaces_rwsem);
}

static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};

bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	bool changed = false;
	int node;

	if (!head)
		goto out;

	for_each_node(node) {
		if (ns == rcu_access_pointer(head->current_path[node])) {
			rcu_assign_pointer(head->current_path[node], NULL);
			changed = true;
		}
	}
out:
	return changed;
}

void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		nvme_mpath_clear_current_path(ns);
		kblockd_schedule_work(&ns->head->requeue_work);
	}
	up_read(&ctrl->namespaces_rwsem);
}

void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	sector_t capacity = get_capacity(head->disk);
	int node;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (capacity != get_capacity(ns->disk))
			clear_bit(NVME_NS_READY, &ns->flags);
	}

	for_each_node(node)
		rcu_assign_pointer(head->current_path[node], NULL);
}

static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
	/*
	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
	 * still be able to complete assuming that the controller is connected.
	 * Otherwise it will fail immediately and return to the requeue list.
	 */
	if (ns->ctrl->state != NVME_CTRL_LIVE &&
	    ns->ctrl->state != NVME_CTRL_DELETING)
		return true;
	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
	    !test_bit(NVME_NS_READY, &ns->flags))
		return true;
	return false;
}

static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
	struct nvme_ns *found = NULL, *fallback = NULL, *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
			distance = node_distance(node, ns->ctrl->numa_node);
		else
			distance = LOCAL_DISTANCE;

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (distance < found_distance) {
				found_distance = distance;
				found = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (distance < fallback_distance) {
				fallback_distance = distance;
				fallback = ns;
			}
			break;
		default:
			break;
		}
	}

	if (!found)
		found = fallback;
	if (found)
		rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
		struct nvme_ns *ns)
{
	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
			siblings);
	if (ns)
		return ns;
	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}

static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
		int node, struct nvme_ns *old)
{
	struct nvme_ns *ns, *found = NULL;

	if (list_is_singular(&head->list)) {
		if (nvme_path_is_disabled(old))
			return NULL;
		return old;
	}

	for (ns = nvme_next_ns(head, old);
	     ns && ns != old;
	     ns = nvme_next_ns(head, ns)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
			found = ns;
			goto out;
		}
		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
			found = ns;
	}

	/*
	 * The loop above skips the current path for round-robin semantics.
	 * Fall back to the current path if either:
	 *  - no other optimized path found and current is optimized,
	 *  - no other usable path found and current is usable.
	 */
	if (!nvme_path_is_disabled(old) &&
	    (old->ana_state == NVME_ANA_OPTIMIZED ||
	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
		return old;

	if (!found)
		return NULL;
out:
	rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
	return ns->ctrl->state == NVME_CTRL_LIVE &&
		ns->ana_state == NVME_ANA_OPTIMIZED;
}

/* The caller must hold the SRCU read lock on head->srcu. */
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
	int node = numa_node_id();
	struct nvme_ns *ns;

	ns = srcu_dereference(head->current_path[node], &head->srcu);
	if (unlikely(!ns))
		return __nvme_find_path(head, node);

	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
		return nvme_round_robin_path(head, node, ns);
	if (unlikely(!nvme_path_is_optimized(ns)))
		return __nvme_find_path(head, node);
	return ns;
}

static bool nvme_available_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
			continue;
		switch (ns->ctrl->state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			/* the controller is live or may become live again */
			return true;
		default:
			break;
		}
	}
	return false;
}

static void nvme_ns_head_submit_bio(struct bio *bio)
{
	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	int srcu_idx;

	/*
	 * The namespace might be going away and the bio might be moved to a
	 * different queue via blk_steal_bios(), so we need to use the bio_split
	 * pool from the original queue to allocate the bvecs from.
	 */
	bio = bio_split_to_limits(bio);

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		bio_set_dev(bio, ns->disk->part0);
		bio->bi_opf |= REQ_NVME_MPATH;
		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
				      bio->bi_iter.bi_sector);
		submit_bio_noacct(bio);
	} else if (nvme_available_path(head)) {
		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");

		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no available path - failing I/O\n");

		bio_io_error(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
}

static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
{
	if (!nvme_tryget_ns_head(bdev->bd_disk->private_data))
		return -ENXIO;
	return 0;
}

static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
{
	nvme_put_ns_head(disk->private_data);
}

#ifdef CONFIG_BLK_DEV_ZONED
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (ns)
		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}
#else
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */

const struct block_device_operations nvme_ns_head_ops = {
	.owner		= THIS_MODULE,
	.submit_bio	= nvme_ns_head_submit_bio,
	.open		= nvme_ns_head_open,
	.release	= nvme_ns_head_release,
	.ioctl		= nvme_ns_head_ioctl,
	.compat_ioctl	= blkdev_compat_ptr_ioctl,
	.getgeo		= nvme_getgeo,
	.report_zones	= nvme_ns_head_report_zones,
	.pr_ops		= &nvme_pr_ops,
};

static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
{
	return container_of(cdev, struct nvme_ns_head, cdev);
}

static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
{
	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
		return -ENXIO;
	return 0;
}

static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
{
	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
	return 0;
}

static const struct file_operations nvme_ns_head_chr_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_ns_head_chr_open,
	.release	= nvme_ns_head_chr_release,
	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
};

static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
{
	int ret;

	head->cdev_device.parent = &head->subsys->dev;
	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
			   head->subsys->instance, head->instance);
	if (ret)
		return ret;
	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
			    &nvme_ns_head_chr_fops, THIS_MODULE);
	return ret;
}

static void nvme_requeue_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, requeue_work);
	struct bio *bio, *next;

	spin_lock_irq(&head->requeue_lock);
	next = bio_list_get(&head->requeue_list);
	spin_unlock_irq(&head->requeue_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_next;
		bio->bi_next = NULL;

		submit_bio_noacct(bio);
	}
}

int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	bool vwc = false;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);

	/*
	 * Add a multipath node if the subsystem supports multiple controllers.
	 * We also do this for private namespaces as the namespace sharing flag
	 * could change after a rescan.
	 */
	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
	    !nvme_is_unique_nsid(ctrl, head) || !multipath)
		return 0;

	head->disk = blk_alloc_disk(ctrl->numa_node);
	if (!head->disk)
		return -ENOMEM;
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);

	blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);

	/*
	 * This assumes all controllers that refer to a namespace either
	 * support poll queues or not.  That is not a strict guarantee,
	 * but if the assumption is wrong the effect is only suboptimal
	 * performance but not a correctness problem.
	 */
	if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL &&
	    ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
		blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);

	/* set to a default value of 512 until the disk is validated */
	blk_queue_logical_block_size(head->disk->queue, 512);
	blk_set_stacking_limits(&head->disk->queue->limits);

	/* we need to propagate up the volatile write cache setting */
	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
		vwc = true;
	blk_queue_write_cache(head->disk->queue, vwc, vwc);
	return 0;
}

static void nvme_mpath_set_live(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	int rc;

	if (!head->disk)
		return;

	/*
	 * test_and_set_bit() is used because it is protecting against two nvme
	 * paths simultaneously calling device_add_disk() on the same namespace
	 * head.
	 */
	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		rc = device_add_disk(&head->subsys->dev, head->disk,
				     nvme_ns_id_attr_groups);
		if (rc) {
			clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
			return;
		}
		nvme_add_ns_head_cdev(head);
	}

	mutex_lock(&head->lock);
	if (nvme_path_is_optimized(ns)) {
		int node, srcu_idx;

		srcu_idx = srcu_read_lock(&head->srcu);
		for_each_node(node)
			__nvme_find_path(head, node);
		srcu_read_unlock(&head->srcu, srcu_idx);
	}
	mutex_unlock(&head->lock);

	synchronize_srcu(&head->srcu);
	kblockd_schedule_work(&head->requeue_work);
}

static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
			  void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids;
		size_t nsid_buf_size;

		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;

		nr_nsids = le32_to_cpu(desc->nnsids);
		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
	}

	return 0;
}
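
/*
 * Illustrative sketch only, derived from the parser above rather than quoted
 * from the specification: the ANA log buffer that nvme_parse_ana_log() walks
 * is laid out as a header followed by ->ngrps group descriptors, each of
 * which is immediately followed by its namespace ID list:
 *
 *	struct nvme_ana_rsp_hdr		header, ->ngrps = descriptor count
 *	struct nvme_ana_group_desc	descriptor 0, ->nnsids = NSID count
 *	__le32 nsids[nnsids]		namespace IDs in descriptor 0's group
 *	struct nvme_ana_group_desc	descriptor 1
 *	__le32 nsids[nnsids]
 *	...
 */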

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);

	/*
	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
	 * and in turn to this path device.  However we cannot accept this I/O
	 * if the controller is not live.  This may deadlock if called from
	 * nvme_mpath_init_identify() and the controller will never complete
	 * initialization, preventing I/O from completing.  For this case we
	 * will reprocess the ANA log page in nvme_mpath_update() once the
	 * controller is ready.
	 */
	if (nvme_state_is_live(ns->ana_state) &&
	    ns->ctrl->state == NVME_CTRL_LIVE)
		nvme_mpath_set_live(ns);
}

static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		unsigned nsid;
again:
		nsid = le32_to_cpu(desc->nsids[n]);
		if (ns->head->ns_id < nsid)
			continue;
		if (ns->head->ns_id == nsid)
			nvme_update_ns_ana_state(desc, ns);
		if (++n == nr_nsids)
			break;
		if (ns->head->ns_id > nsid)
			goto again;
	}
	up_read(&ctrl->namespaces_rwsem);
	return 0;
}

static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;
	int error;

	mutex_lock(&ctrl->ana_lock);
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
	if (error) {
		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
		goto out_unlock;
	}

	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
				   nvme_update_ana_state);
	if (error)
		goto out_unlock;

	/*
	 * In theory we should have an ANATT timer per group as they might
	 * enter the change state at different times.  But that is a lot of
	 * overhead just to protect against a target that keeps entering new
	 * change states while never finishing previous ones.  We'll still
	 * eventually time out, so this is not a major problem, even if it
	 * takes a while to transition to the new state.
	 */
	if (nr_change_groups)
		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
	else
		del_timer_sync(&ctrl->anatt_timer);
out_unlock:
	mutex_unlock(&ctrl->ana_lock);
	return error;
}

static void nvme_ana_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

	if (ctrl->state != NVME_CTRL_LIVE)
		return;

	nvme_read_ana_log(ctrl);
}

void nvme_mpath_update(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;

	if (!ctrl->ana_log_buf)
		return;

	mutex_lock(&ctrl->ana_lock);
	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
	mutex_unlock(&ctrl->ana_lock);
}

static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
	if (!nvme_ctrl_use_ana(ctrl))
		return;
	del_timer_sync(&ctrl->anatt_timer);
	cancel_work_sync(&ctrl->ana_work);
}

#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
	struct device_attribute subsys_attr_##_name =	\
		__ATTR(_name, _mode, _show, _store)

static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);

	return sysfs_emit(buf, "%s\n",
			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}

static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);
	int i;

	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
			WRITE_ONCE(subsys->iopolicy, i);
			return count;
		}
	}

	return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
	       nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
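
/*
 * Illustrative usage only: the attribute above lets the I/O policy be
 * overridden per subsystem at runtime.  The exact sysfs path depends on how
 * the subsystem device is registered on a given kernel, typically something
 * like:
 *
 *	echo round-robin > /sys/class/nvme-subsystem/nvme-subsys0/iopolicy
 */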

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	struct nvme_ana_group_desc *dst = data;

	if (desc->grpid != dst->grpid)
		return 0;

	*dst = *desc;
	return -ENXIO; /* just break out of the loop */
}

void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		struct nvme_ana_group_desc desc = {
			.grpid = anagrpid,
			.state = 0,
		};

		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(anagrpid);
		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
		mutex_unlock(&ns->ctrl->ana_lock);
		if (desc.state) {
			/* found the group desc: update */
			nvme_update_ns_ana_state(&desc, ns);
		} else {
			/* group desc not found: trigger a re-read */
			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
			queue_work(nvme_wq, &ns->ctrl->ana_work);
		}
	} else {
		ns->ana_state = NVME_ANA_OPTIMIZED;
		nvme_mpath_set_live(ns);
	}

	if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
				   ns->head->disk->queue);
#ifdef CONFIG_BLK_DEV_ZONED
	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
		ns->head->disk->nr_zones = ns->disk->nr_zones;
#endif
}

void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	kblockd_schedule_work(&head->requeue_work);
	if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		nvme_cdev_del(&head->cdev, &head->cdev_device);
		del_gendisk(head->disk);
	}
}

void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	blk_mark_disk_dead(head->disk);
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	put_disk(head->disk);
}

void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
{
	mutex_init(&ctrl->ana_lock);
	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
}

int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
	size_t ana_log_size;
	int error = 0;

	/* check if multipath is enabled and we have the capability */
	if (!multipath || !ctrl->subsys ||
	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
		return 0;

	if (!ctrl->max_namespaces ||
	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
		dev_err(ctrl->device,
			"Invalid MNAN value %u\n", ctrl->max_namespaces);
		return -EINVAL;
	}

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
		ctrl->max_namespaces * sizeof(__le32);
	if (ana_log_size > max_transfer_size) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%zd).\n",
			ana_log_size, max_transfer_size);
		dev_err(ctrl->device, "disabling ANA support.\n");
		goto out_uninit;
	}
	if (ana_log_size > ctrl->ana_log_size) {
		nvme_mpath_stop(ctrl);
		nvme_mpath_uninit(ctrl);
		ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
		if (!ctrl->ana_log_buf)
			return -ENOMEM;
	}
	ctrl->ana_log_size = ana_log_size;
	error = nvme_read_ana_log(ctrl);
	if (error)
		goto out_uninit;
	return 0;

out_uninit:
	nvme_mpath_uninit(ctrl);
	return error;
}


void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
	kvfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
	ctrl->ana_log_size = 0;
}