// SPDX-License-Identifier: GPL-2.0-only
/*
 * Persistent Memory Driver
 *
 * Copyright (c) 2014-2015, Intel Corporation.
 * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
 * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
 */

#include <linux/blkdev.h>
#include <linux/pagemap.h>
#include <linux/hdreg.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/set_memory.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/badblocks.h>
#include <linux/memremap.h>
#include <linux/vmalloc.h>
#include <linux/blk-mq.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/uio.h>
#include <linux/dax.h>
#include <linux/nd.h>
#include <linux/mm.h>
#include <asm/cacheflush.h>
#include "pmem.h"
#include "btt.h"
#include "pfn.h"
#include "nd.h"

static struct device *to_dev(struct pmem_device *pmem)
{
    /*
     * nvdimm bus services need a 'dev' parameter, and we record the device
     * at init in bb.dev.
     */
    return pmem->bb.dev;
}

static struct nd_region *to_region(struct pmem_device *pmem)
{
    return to_nd_region(to_dev(pmem)->parent);
}

static phys_addr_t pmem_to_phys(struct pmem_device *pmem, phys_addr_t offset)
{
    return pmem->phys_addr + offset;
}

static sector_t to_sect(struct pmem_device *pmem, phys_addr_t offset)
{
    return (offset - pmem->data_offset) >> SECTOR_SHIFT;
}

static phys_addr_t to_offset(struct pmem_device *pmem, sector_t sector)
{
    return (sector << SECTOR_SHIFT) + pmem->data_offset;
}

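/*
 * Mark pages in the given range as present again: clear the per-page
 * pmem poison flag and restore the kernel linear mapping that the
 * machine-check handler tore down when the poison was first reported.
 */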
static void pmem_mkpage_present(struct pmem_device *pmem, phys_addr_t offset,
        unsigned int len)
{
    phys_addr_t phys = pmem_to_phys(pmem, offset);
    unsigned long pfn_start, pfn_end, pfn;

    /* only pmem in the linear map supports HWPoison */
    if (is_vmalloc_addr(pmem->virt_addr))
        return;

    pfn_start = PHYS_PFN(phys);
    pfn_end = pfn_start + PHYS_PFN(len);
    for (pfn = pfn_start; pfn < pfn_end; pfn++) {
        struct page *page = pfn_to_page(pfn);

        /*
         * Note, no need to hold a get_dev_pagemap() reference
         * here since we're in the driver I/O path and
         * outstanding I/O requests pin the dev_pagemap.
         */
        if (test_and_clear_pmem_poison(page))
            clear_mce_nospec(pfn);
    }
}

static void pmem_clear_bb(struct pmem_device *pmem, sector_t sector, long blks)
{
    if (blks == 0)
        return;
    badblocks_clear(&pmem->bb, sector, blks);
    if (pmem->bb_state)
        sysfs_notify_dirent(pmem->bb_state);
}

static long __pmem_clear_poison(struct pmem_device *pmem,
        phys_addr_t offset, unsigned int len)
{
    phys_addr_t phys = pmem_to_phys(pmem, offset);
    long cleared = nvdimm_clear_poison(to_dev(pmem), phys, len);

    if (cleared > 0) {
        pmem_mkpage_present(pmem, offset, cleared);
        arch_invalidate_pmem(pmem->virt_addr + offset, len);
    }
    return cleared;
}

static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
        phys_addr_t offset, unsigned int len)
{
    long cleared = __pmem_clear_poison(pmem, offset, len);

    if (cleared < 0)
        return BLK_STS_IOERR;

    pmem_clear_bb(pmem, to_sect(pmem, offset), cleared >> SECTOR_SHIFT);
    if (cleared < len)
        return BLK_STS_IOERR;
    return BLK_STS_OK;
}

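/*
 * Copy helpers for the bio path. Bio pages may live in highmem, so each
 * page is mapped with kmap_atomic() and at most one page is copied per
 * iteration. Writes use memcpy_flushcache() so the destination is not
 * left dirty in the CPU cache; reads use copy_mc_to_kernel() so that
 * consumed poison surfaces as an I/O error rather than a kernel crash.
 */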
static void write_pmem(void *pmem_addr, struct page *page,
        unsigned int off, unsigned int len)
{
    unsigned int chunk;
    void *mem;

    while (len) {
        mem = kmap_atomic(page);
        chunk = min_t(unsigned int, len, PAGE_SIZE - off);
        memcpy_flushcache(pmem_addr, mem + off, chunk);
        kunmap_atomic(mem);
        len -= chunk;
        off = 0;
        page++;
        pmem_addr += chunk;
    }
}

static blk_status_t read_pmem(struct page *page, unsigned int off,
        void *pmem_addr, unsigned int len)
{
    unsigned int chunk;
    unsigned long rem;
    void *mem;

    while (len) {
        mem = kmap_atomic(page);
        chunk = min_t(unsigned int, len, PAGE_SIZE - off);
        rem = copy_mc_to_kernel(mem + off, pmem_addr, chunk);
        kunmap_atomic(mem);
        if (rem)
            return BLK_STS_IOERR;
        len -= chunk;
        off = 0;
        page++;
        pmem_addr += chunk;
    }
    return BLK_STS_OK;
}

static blk_status_t pmem_do_read(struct pmem_device *pmem,
            struct page *page, unsigned int page_off,
            sector_t sector, unsigned int len)
{
    blk_status_t rc;
    phys_addr_t pmem_off = to_offset(pmem, sector);
    void *pmem_addr = pmem->virt_addr + pmem_off;

    if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
        return BLK_STS_IOERR;

    rc = read_pmem(page, page_off, pmem_addr, len);
    flush_dcache_page(page);
    return rc;
}

static blk_status_t pmem_do_write(struct pmem_device *pmem,
            struct page *page, unsigned int page_off,
            sector_t sector, unsigned int len)
{
    phys_addr_t pmem_off = to_offset(pmem, sector);
    void *pmem_addr = pmem->virt_addr + pmem_off;

    if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) {
        blk_status_t rc = pmem_clear_poison(pmem, pmem_off, len);

        if (rc != BLK_STS_OK)
            return rc;
    }

    flush_dcache_page(page);
    write_pmem(pmem_addr, page, page_off, len);

    return BLK_STS_OK;
}

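/*
 * Bio front end: honour REQ_PREFLUSH before touching the data and
 * REQ_FUA afterwards via nvdimm_flush(), and copy each segment
 * synchronously since pmem completes I/O in the submission context.
 */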
static void pmem_submit_bio(struct bio *bio)
{
    int ret = 0;
    blk_status_t rc = 0;
    bool do_acct;
    unsigned long start;
    struct bio_vec bvec;
    struct bvec_iter iter;
    struct pmem_device *pmem = bio->bi_bdev->bd_disk->private_data;
    struct nd_region *nd_region = to_region(pmem);

    if (bio->bi_opf & REQ_PREFLUSH)
        ret = nvdimm_flush(nd_region, bio);

    do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue);
    if (do_acct)
        start = bio_start_io_acct(bio);
    bio_for_each_segment(bvec, bio, iter) {
        if (op_is_write(bio_op(bio)))
            rc = pmem_do_write(pmem, bvec.bv_page, bvec.bv_offset,
                iter.bi_sector, bvec.bv_len);
        else
            rc = pmem_do_read(pmem, bvec.bv_page, bvec.bv_offset,
                iter.bi_sector, bvec.bv_len);
        if (rc) {
            bio->bi_status = rc;
            break;
        }
    }
    if (do_acct)
        bio_end_io_acct(bio, start);

    if (bio->bi_opf & REQ_FUA)
        ret = nvdimm_flush(nd_region, bio);

    if (ret)
        bio->bi_status = errno_to_blk_status(ret);

    bio_endio(bio);
}

static int pmem_rw_page(struct block_device *bdev, sector_t sector,
               struct page *page, enum req_op op)
{
    struct pmem_device *pmem = bdev->bd_disk->private_data;
    blk_status_t rc;

    if (op_is_write(op))
        rc = pmem_do_write(pmem, page, 0, sector, thp_size(page));
    else
        rc = pmem_do_read(pmem, page, 0, sector, thp_size(page));
    /*
     * The ->rw_page interface is subtle and tricky.  The core
     * retries on any error, so we can only invoke page_endio() in
     * the successful completion case.  Otherwise, we'll see crashes
     * caused by double completion.
     */
    if (rc == 0)
        page_endio(page, op_is_write(op), 0);

    return blk_status_to_errno(rc);
}

/* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
__weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
        long nr_pages, enum dax_access_mode mode, void **kaddr,
        pfn_t *pfn)
{
    resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
    sector_t sector = PFN_PHYS(pgoff) >> SECTOR_SHIFT;
    unsigned int num = PFN_PHYS(nr_pages) >> SECTOR_SHIFT;
    struct badblocks *bb = &pmem->bb;
    sector_t first_bad;
    int num_bad;

    if (kaddr)
        *kaddr = pmem->virt_addr + offset;
    if (pfn)
        *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);

    if (bb->count &&
        badblocks_check(bb, sector, num, &first_bad, &num_bad)) {
        long actual_nr;

        if (mode != DAX_RECOVERY_WRITE)
            return -EIO;

        /*
         * The recovery stride is set to the kernel page size because
         * the underlying driver and firmware clear-poison functions
         * don't appear to handle large chunks (such as 2MiB) reliably.
         */
        actual_nr = PHYS_PFN(
            PAGE_ALIGN((first_bad - sector) << SECTOR_SHIFT));
        dev_dbg(pmem->bb.dev, "start sector(%llu), nr_pages(%ld), first_bad(%llu), actual_nr(%ld)\n",
                sector, nr_pages, first_bad, actual_nr);
        if (actual_nr)
            return actual_nr;
        return 1;
    }

    /*
     * If badblocks are present but not in this range, limit the known
     * good range to the requested range.
     */
    if (bb->count)
        return nr_pages;
    return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);
}

static const struct block_device_operations pmem_fops = {
    .owner =        THIS_MODULE,
    .submit_bio =       pmem_submit_bio,
    .rw_page =      pmem_rw_page,
};

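/*
 * Zero one page by writing ZERO_PAGE(0) through the regular write path;
 * pmem_do_write() clears any known poison in the range before writing.
 */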
static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
                    size_t nr_pages)
{
    struct pmem_device *pmem = dax_get_private(dax_dev);

    return blk_status_to_errno(pmem_do_write(pmem, ZERO_PAGE(0), 0,
                   PFN_PHYS(pgoff) >> SECTOR_SHIFT,
                   PAGE_SIZE));
}

static long pmem_dax_direct_access(struct dax_device *dax_dev,
        pgoff_t pgoff, long nr_pages, enum dax_access_mode mode,
        void **kaddr, pfn_t *pfn)
{
    struct pmem_device *pmem = dax_get_private(dax_dev);

    return __pmem_direct_access(pmem, pgoff, nr_pages, mode, kaddr, pfn);
}

/*
 * The recovery write thread starts out as a normal pwrite thread; when
 * the filesystem is told about a potential media error in the range, it
 * turns the normal pwrite into a dax_recovery_write.
 *
 * The recovery write consists of clearing the media poison, clearing the
 * page's HWPoison bit, re-enabling page-wide read-write permission,
 * flushing the caches, and finally writing.  A competing pread thread
 * is held off during the recovery process since the data read back
 * might not be valid; this is achieved by clearing the badblock records
 * only after the recovery write is complete.  Competing recovery write
 * threads are already serialized by the writer lock held by
 * dax_iomap_rw().
 */
static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
        void *addr, size_t bytes, struct iov_iter *i)
{
    struct pmem_device *pmem = dax_get_private(dax_dev);
    size_t olen, len, off;
    phys_addr_t pmem_off;
    struct device *dev = pmem->bb.dev;
    long cleared;

    off = offset_in_page(addr);
    len = PFN_PHYS(PFN_UP(off + bytes));
    if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) >> SECTOR_SHIFT, len))
        return _copy_from_iter_flushcache(addr, bytes, i);

    /*
     * A range that is not page-aligned cannot be recovered. This should
     * not happen unless something else went wrong.
     */
    if (off || !PAGE_ALIGNED(bytes)) {
        dev_dbg(dev, "Found poison, but addr(%p) or bytes(%#zx) not page aligned\n",
            addr, bytes);
        return 0;
    }

    pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
    cleared = __pmem_clear_poison(pmem, pmem_off, len);
    if (cleared > 0 && cleared < len) {
        dev_dbg(dev, "poison cleared only %ld out of %zu bytes\n",
            cleared, len);
        return 0;
    }
    if (cleared < 0) {
        dev_dbg(dev, "poison clear failed: %ld\n", cleared);
        return 0;
    }

    olen = _copy_from_iter_flushcache(addr, bytes, i);
    pmem_clear_bb(pmem, to_sect(pmem, pmem_off), cleared >> SECTOR_SHIFT);

    return olen;
}

static const struct dax_operations pmem_dax_ops = {
    .direct_access = pmem_dax_direct_access,
    .zero_page_range = pmem_dax_zero_page_range,
    .recovery_write = pmem_recovery_write,
};

static ssize_t write_cache_show(struct device *dev,
        struct device_attribute *attr, char *buf)
{
    struct pmem_device *pmem = dev_to_disk(dev)->private_data;

    return sprintf(buf, "%d\n", !!dax_write_cache_enabled(pmem->dax_dev));
}

static ssize_t write_cache_store(struct device *dev,
        struct device_attribute *attr, const char *buf, size_t len)
{
    struct pmem_device *pmem = dev_to_disk(dev)->private_data;
    bool write_cache;
    int rc;

    rc = strtobool(buf, &write_cache);
    if (rc)
        return rc;
    dax_write_cache(pmem->dax_dev, write_cache);
    return len;
}
static DEVICE_ATTR_RW(write_cache);

static umode_t dax_visible(struct kobject *kobj, struct attribute *a, int n)
{
#ifndef CONFIG_ARCH_HAS_PMEM_API
    if (a == &dev_attr_write_cache.attr)
        return 0;
#endif
    return a->mode;
}

static struct attribute *dax_attributes[] = {
    &dev_attr_write_cache.attr,
    NULL,
};

static const struct attribute_group dax_attribute_group = {
    .name       = "dax",
    .attrs      = dax_attributes,
    .is_visible = dax_visible,
};

static const struct attribute_group *pmem_attribute_groups[] = {
    &dax_attribute_group,
    NULL,
};

static void pmem_release_disk(void *__pmem)
{
    struct pmem_device *pmem = __pmem;

    dax_remove_host(pmem->disk);
    kill_dax(pmem->dax_dev);
    put_dax(pmem->dax_dev);
    del_gendisk(pmem->disk);

    put_disk(pmem->disk);
}

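/*
 * Called by the memory-failure path for poison discovered in this
 * device's pages: translate the pfn back into a namespace offset and
 * let the dax holder (typically a filesystem) react to the failure.
 */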
static int pmem_pagemap_memory_failure(struct dev_pagemap *pgmap,
        unsigned long pfn, unsigned long nr_pages, int mf_flags)
{
    struct pmem_device *pmem =
            container_of(pgmap, struct pmem_device, pgmap);
    u64 offset = PFN_PHYS(pfn) - pmem->phys_addr - pmem->data_offset;
    u64 len = nr_pages << PAGE_SHIFT;

    return dax_holder_notify_failure(pmem->dax_dev, offset, len, mf_flags);
}

static const struct dev_pagemap_ops fsdax_pagemap_ops = {
    .memory_failure     = pmem_pagemap_memory_failure,
};

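/*
 * Set up the block device and dax device for a pmem namespace. The
 * backing memory is mapped in one of three ways: via an nd_pfn info
 * block, via devm_memremap_pages() when page structs are requested, or
 * with a plain devm_memremap() for raw access without struct pages.
 */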
static int pmem_attach_disk(struct device *dev,
        struct nd_namespace_common *ndns)
{
    struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
    struct nd_region *nd_region = to_nd_region(dev->parent);
    int nid = dev_to_node(dev), fua;
    struct resource *res = &nsio->res;
    struct range bb_range;
    struct nd_pfn *nd_pfn = NULL;
    struct dax_device *dax_dev;
    struct nd_pfn_sb *pfn_sb;
    struct pmem_device *pmem;
    struct request_queue *q;
    struct gendisk *disk;
    void *addr;
    int rc;

    pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
    if (!pmem)
        return -ENOMEM;

    rc = devm_namespace_enable(dev, ndns, nd_info_block_reserve());
    if (rc)
        return rc;

    /* while nsio_rw_bytes is active, parse a pfn info block if present */
    if (is_nd_pfn(dev)) {
        nd_pfn = to_nd_pfn(dev);
        rc = nvdimm_setup_pfn(nd_pfn, &pmem->pgmap);
        if (rc)
            return rc;
    }

    /* we're attaching a block device, disable raw namespace access */
    devm_namespace_disable(dev, ndns);

    dev_set_drvdata(dev, pmem);
    pmem->phys_addr = res->start;
    pmem->size = resource_size(res);
    fua = nvdimm_has_flush(nd_region);
    if (!IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) || fua < 0) {
        dev_warn(dev, "unable to guarantee persistence of writes\n");
        fua = 0;
    }

    if (!devm_request_mem_region(dev, res->start, resource_size(res),
                dev_name(&ndns->dev))) {
        dev_warn(dev, "could not reserve region %pR\n", res);
        return -EBUSY;
    }

    disk = blk_alloc_disk(nid);
    if (!disk)
        return -ENOMEM;
    q = disk->queue;

    pmem->disk = disk;
    pmem->pgmap.owner = pmem;
    pmem->pfn_flags = PFN_DEV;
    if (is_nd_pfn(dev)) {
        pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
        pmem->pgmap.ops = &fsdax_pagemap_ops;
        addr = devm_memremap_pages(dev, &pmem->pgmap);
        pfn_sb = nd_pfn->pfn_sb;
        pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
        pmem->pfn_pad = resource_size(res) -
            range_len(&pmem->pgmap.range);
        pmem->pfn_flags |= PFN_MAP;
        bb_range = pmem->pgmap.range;
        bb_range.start += pmem->data_offset;
    } else if (pmem_should_map_pages(dev)) {
        pmem->pgmap.range.start = res->start;
        pmem->pgmap.range.end = res->end;
        pmem->pgmap.nr_range = 1;
        pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
        pmem->pgmap.ops = &fsdax_pagemap_ops;
        addr = devm_memremap_pages(dev, &pmem->pgmap);
        pmem->pfn_flags |= PFN_MAP;
        bb_range = pmem->pgmap.range;
    } else {
        addr = devm_memremap(dev, pmem->phys_addr,
                pmem->size, ARCH_MEMREMAP_PMEM);
        bb_range.start = res->start;
        bb_range.end = res->end;
    }

    if (IS_ERR(addr)) {
        rc = PTR_ERR(addr);
        goto out;
    }
    pmem->virt_addr = addr;

    blk_queue_write_cache(q, true, fua);
    blk_queue_physical_block_size(q, PAGE_SIZE);
    blk_queue_logical_block_size(q, pmem_sector_size(ndns));
    blk_queue_max_hw_sectors(q, UINT_MAX);
    blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
    if (pmem->pfn_flags & PFN_MAP)
        blk_queue_flag_set(QUEUE_FLAG_DAX, q);

    disk->fops      = &pmem_fops;
    disk->private_data  = pmem;
    nvdimm_namespace_disk_name(ndns, disk->disk_name);
    set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
            / 512);
    if (devm_init_badblocks(dev, &pmem->bb))
        return -ENOMEM;
    nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_range);
    disk->bb = &pmem->bb;

    dax_dev = alloc_dax(pmem, &pmem_dax_ops);
    if (IS_ERR(dax_dev)) {
        rc = PTR_ERR(dax_dev);
        goto out;
    }
    set_dax_nocache(dax_dev);
    set_dax_nomc(dax_dev);
    if (is_nvdimm_sync(nd_region))
        set_dax_synchronous(dax_dev);
    rc = dax_add_host(dax_dev, disk);
    if (rc)
        goto out_cleanup_dax;
    dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
    pmem->dax_dev = dax_dev;

    rc = device_add_disk(dev, disk, pmem_attribute_groups);
    if (rc)
        goto out_remove_host;
    if (devm_add_action_or_reset(dev, pmem_release_disk, pmem))
        return -ENOMEM;

    nvdimm_check_and_set_ro(disk);

    pmem->bb_state = sysfs_get_dirent(disk_to_dev(disk)->kobj.sd,
                      "badblocks");
    if (!pmem->bb_state)
        dev_warn(dev, "'badblocks' notification disabled\n");
    return 0;

out_remove_host:
    dax_remove_host(pmem->disk);
out_cleanup_dax:
    kill_dax(pmem->dax_dev);
    put_dax(pmem->dax_dev);
out:
    put_disk(pmem->disk);
    return rc;
}

static int nd_pmem_probe(struct device *dev)
{
    int ret;
    struct nd_namespace_common *ndns;

    ndns = nvdimm_namespace_common_probe(dev);
    if (IS_ERR(ndns))
        return PTR_ERR(ndns);

    if (is_nd_btt(dev))
        return nvdimm_namespace_attach_btt(ndns);

    if (is_nd_pfn(dev))
        return pmem_attach_disk(dev, ndns);

    ret = devm_namespace_enable(dev, ndns, nd_info_block_reserve());
    if (ret)
        return ret;

    ret = nd_btt_probe(dev, ndns);
    if (ret == 0)
        return -ENXIO;

    /*
     * We have two failure conditions here: either there is no
     * info reserve block, or we found a valid info reserve block
     * but failed to initialize the pfn superblock.
     *
     * In the first case, treat the namespace as a raw pmem namespace
     * and attach a disk.
     *
     * In the latter case, consider this a success and advance the
     * namespace seed.
     */
    ret = nd_pfn_probe(dev, ndns);
    if (ret == 0)
        return -ENXIO;
    else if (ret == -EOPNOTSUPP)
        return ret;

    ret = nd_dax_probe(dev, ndns);
    if (ret == 0)
        return -ENXIO;
    else if (ret == -EOPNOTSUPP)
        return ret;

    /* probe complete, attach handles namespace enabling */
    devm_namespace_disable(dev, ndns);

    return pmem_attach_disk(dev, ndns);
}

static void nd_pmem_remove(struct device *dev)
{
    struct pmem_device *pmem = dev_get_drvdata(dev);

    if (is_nd_btt(dev))
        nvdimm_namespace_detach_btt(to_nd_btt(dev));
    else {
        /*
         * Note, this assumes device_lock() context to not
         * race nd_pmem_notify()
         */
        sysfs_put(pmem->bb_state);
        pmem->bb_state = NULL;
    }
    nvdimm_flush(to_nd_region(dev->parent), NULL);
}

static void nd_pmem_shutdown(struct device *dev)
{
    nvdimm_flush(to_nd_region(dev->parent), NULL);
}

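/*
 * Re-scan the region's poison list and refresh the badblocks for this
 * namespace (or the BTT's backing namespace), then poke the sysfs
 * 'badblocks' attribute so userspace can re-read it.
 */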
static void pmem_revalidate_poison(struct device *dev)
{
    struct nd_region *nd_region;
    resource_size_t offset = 0, end_trunc = 0;
    struct nd_namespace_common *ndns;
    struct nd_namespace_io *nsio;
    struct badblocks *bb;
    struct range range;
    struct kernfs_node *bb_state;

    if (is_nd_btt(dev)) {
        struct nd_btt *nd_btt = to_nd_btt(dev);

        ndns = nd_btt->ndns;
        nd_region = to_nd_region(ndns->dev.parent);
        nsio = to_nd_namespace_io(&ndns->dev);
        bb = &nsio->bb;
        bb_state = NULL;
    } else {
        struct pmem_device *pmem = dev_get_drvdata(dev);

        nd_region = to_region(pmem);
        bb = &pmem->bb;
        bb_state = pmem->bb_state;

        if (is_nd_pfn(dev)) {
            struct nd_pfn *nd_pfn = to_nd_pfn(dev);
            struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;

            ndns = nd_pfn->ndns;
            offset = pmem->data_offset +
                    __le32_to_cpu(pfn_sb->start_pad);
            end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
        } else {
            ndns = to_ndns(dev);
        }

        nsio = to_nd_namespace_io(&ndns->dev);
    }

    range.start = nsio->res.start + offset;
    range.end = nsio->res.end - end_trunc;
    nvdimm_badblocks_populate(nd_region, bb, &range);
    if (bb_state)
        sysfs_notify_dirent(bb_state);
}

static void pmem_revalidate_region(struct device *dev)
{
    struct pmem_device *pmem;

    if (is_nd_btt(dev)) {
        struct nd_btt *nd_btt = to_nd_btt(dev);
        struct btt *btt = nd_btt->btt;

        nvdimm_check_and_set_ro(btt->btt_disk);
        return;
    }

    pmem = dev_get_drvdata(dev);
    nvdimm_check_and_set_ro(pmem->disk);
}

static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
{
    switch (event) {
    case NVDIMM_REVALIDATE_POISON:
        pmem_revalidate_poison(dev);
        break;
    case NVDIMM_REVALIDATE_REGION:
        pmem_revalidate_region(dev);
        break;
    default:
        dev_WARN_ONCE(dev, 1, "notify: unknown event: %d\n", event);
        break;
    }
}

MODULE_ALIAS("pmem");
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
static struct nd_device_driver nd_pmem_driver = {
    .probe = nd_pmem_probe,
    .remove = nd_pmem_remove,
    .notify = nd_pmem_notify,
    .shutdown = nd_pmem_shutdown,
    .drv = {
        .name = "nd_pmem",
    },
    .type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
};

module_nd_driver(nd_pmem_driver);

MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
MODULE_LICENSE("GPL v2");