0001 /*
0002  * fs/dax.c - Direct Access filesystem code
0003  * Copyright (c) 2013-2014 Intel Corporation
0004  * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
0005  * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
0006  *
0007  * This program is free software; you can redistribute it and/or modify it
0008  * under the terms and conditions of the GNU General Public License,
0009  * version 2, as published by the Free Software Foundation.
0010  *
0011  * This program is distributed in the hope it will be useful, but WITHOUT
0012  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
0013  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
0014  * more details.
0015  */
0016 
0017 #include <linux/atomic.h>
0018 #include <linux/blkdev.h>
0019 #include <linux/buffer_head.h>
0020 #include <linux/dax.h>
0021 #include <linux/fs.h>
0022 #include <linux/genhd.h>
0023 #include <linux/highmem.h>
0024 #include <linux/memcontrol.h>
0025 #include <linux/mm.h>
0026 #include <linux/mutex.h>
0027 #include <linux/pagevec.h>
0028 #include <linux/pmem.h>
0029 #include <linux/sched.h>
0030 #include <linux/uio.h>
0031 #include <linux/vmstat.h>
0032 #include <linux/pfn_t.h>
0033 #include <linux/sizes.h>
0034 #include <linux/mmu_notifier.h>
0035 #include <linux/iomap.h>
0036 #include "internal.h"
0037 
0038 /* We choose 4096 entries - same as per-zone page wait tables */
0039 #define DAX_WAIT_TABLE_BITS 12
0040 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
0041 
0042 static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
0043 
0044 static int __init init_dax_wait_table(void)
0045 {
0046     int i;
0047 
0048     for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
0049         init_waitqueue_head(wait_table + i);
0050     return 0;
0051 }
0052 fs_initcall(init_dax_wait_table);
0053 
0054 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
0055 {
0056     struct request_queue *q = bdev->bd_queue;
0057     long rc = -EIO;
0058 
0059     dax->addr = ERR_PTR(-EIO);
0060     if (blk_queue_enter(q, true) != 0)
0061         return rc;
0062 
0063     rc = bdev_direct_access(bdev, dax);
0064     if (rc < 0) {
0065         dax->addr = ERR_PTR(rc);
0066         blk_queue_exit(q);
0067         return rc;
0068     }
0069     return rc;
0070 }
0071 
0072 static void dax_unmap_atomic(struct block_device *bdev,
0073         const struct blk_dax_ctl *dax)
0074 {
0075     if (IS_ERR(dax->addr))
0076         return;
0077     blk_queue_exit(bdev->bd_queue);
0078 }
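/*
 * dax_map_atomic()/dax_unmap_atomic() bracket every direct access to the
 * device in this file: the map call pins the request queue and has
 * bdev_direct_access() fill in dax->addr and dax->pfn, and the unmap call
 * drops the queue reference again.  A minimal sketch of the pattern used by
 * the callers below (the buffer and length are only illustrative):
 *
 *     struct blk_dax_ctl dax = { .sector = sector, .size = PAGE_SIZE };
 *
 *     if (dax_map_atomic(bdev, &dax) < 0)
 *         return PTR_ERR(dax.addr);
 *     memcpy_from_pmem(buf, dax.addr, len);
 *     dax_unmap_atomic(bdev, &dax);
 */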
0079 
0080 static int dax_is_pmd_entry(void *entry)
0081 {
0082     return (unsigned long)entry & RADIX_DAX_PMD;
0083 }
0084 
0085 static int dax_is_pte_entry(void *entry)
0086 {
0087     return !((unsigned long)entry & RADIX_DAX_PMD);
0088 }
0089 
0090 static int dax_is_zero_entry(void *entry)
0091 {
0092     return (unsigned long)entry & RADIX_DAX_HZP;
0093 }
0094 
0095 static int dax_is_empty_entry(void *entry)
0096 {
0097     return (unsigned long)entry & RADIX_DAX_EMPTY;
0098 }
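/*
 * The predicates above test flag bits that dax_radix_locked_entry() (see the
 * RADIX_DAX_* definitions in include/linux/dax.h) packs into an exceptional
 * radix tree entry together with the sector number.  Roughly (the exact bit
 * layout lives in the header; this is only a sketch):
 *
 *     entry = RADIX_TREE_EXCEPTIONAL_ENTRY | RADIX_DAX_ENTRY_LOCK
 *             | (RADIX_DAX_PMD and/or RADIX_DAX_HZP / RADIX_DAX_EMPTY)
 *             | ((unsigned long)sector << RADIX_DAX_SHIFT);
 *
 * which lets each helper answer its question with a single bit test.
 */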
0099 
0100 struct page *read_dax_sector(struct block_device *bdev, sector_t n)
0101 {
0102     struct page *page = alloc_pages(GFP_KERNEL, 0);
0103     struct blk_dax_ctl dax = {
0104         .size = PAGE_SIZE,
0105         .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
0106     };
0107     long rc;
0108 
0109     if (!page)
0110         return ERR_PTR(-ENOMEM);
0111 
0112     rc = dax_map_atomic(bdev, &dax);
0113     if (rc < 0)
0114         return ERR_PTR(rc);
0115     memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
0116     dax_unmap_atomic(bdev, &dax);
0117     return page;
0118 }
0119 
0120 /*
0121  * DAX radix tree locking
0122  */
0123 struct exceptional_entry_key {
0124     struct address_space *mapping;
0125     pgoff_t entry_start;
0126 };
0127 
0128 struct wait_exceptional_entry_queue {
0129     wait_queue_t wait;
0130     struct exceptional_entry_key key;
0131 };
0132 
0133 static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
0134         pgoff_t index, void *entry, struct exceptional_entry_key *key)
0135 {
0136     unsigned long hash;
0137 
0138     /*
0139      * If 'entry' is a PMD, align the 'index' that we use for the wait
0140      * queue to the start of that PMD.  This ensures that all offsets in
0141      * the range covered by the PMD map to the same bit lock.
0142      */
0143     if (dax_is_pmd_entry(entry))
0144         index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
0145 
0146     key->mapping = mapping;
0147     key->entry_start = index;
0148 
0149     hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
0150     return wait_table + hash;
0151 }
0152 
0153 static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
0154                        int sync, void *keyp)
0155 {
0156     struct exceptional_entry_key *key = keyp;
0157     struct wait_exceptional_entry_queue *ewait =
0158         container_of(wait, struct wait_exceptional_entry_queue, wait);
0159 
0160     if (key->mapping != ewait->key.mapping ||
0161         key->entry_start != ewait->key.entry_start)
0162         return 0;
0163     return autoremove_wake_function(wait, mode, sync, NULL);
0164 }
0165 
0166 /*
0167  * Check whether the given slot is locked. The function must be called with
0168  * mapping->tree_lock held
0169  */
0170 static inline int slot_locked(struct address_space *mapping, void **slot)
0171 {
0172     unsigned long entry = (unsigned long)
0173         radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
0174     return entry & RADIX_DAX_ENTRY_LOCK;
0175 }
0176 
0177 /*
0178  * Mark the given slot as locked. The function must be called with
0179  * mapping->tree_lock held
0180  */
0181 static inline void *lock_slot(struct address_space *mapping, void **slot)
0182 {
0183     unsigned long entry = (unsigned long)
0184         radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
0185 
0186     entry |= RADIX_DAX_ENTRY_LOCK;
0187     radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
0188     return (void *)entry;
0189 }
0190 
0191 /*
0192  * Mark the given slot as unlocked. The function must be called with
0193  * mapping->tree_lock held
0194  */
0195 static inline void *unlock_slot(struct address_space *mapping, void **slot)
0196 {
0197     unsigned long entry = (unsigned long)
0198         radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
0199 
0200     entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
0201     radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
0202     return (void *)entry;
0203 }
0204 
0205 /*
0206  * Look up the entry in the radix tree and wait for it to become unlocked
0207  * if it is an exceptional entry, then return it. The caller must call
0208  * put_unlocked_mapping_entry() if it decides not to lock the entry, or
0209  * put_locked_mapping_entry() once it has locked the entry and later wants
0210  * to unlock it.
0211  *
0212  * The function must be called with mapping->tree_lock held.
0213  */
0214 static void *get_unlocked_mapping_entry(struct address_space *mapping,
0215                     pgoff_t index, void ***slotp)
0216 {
0217     void *entry, **slot;
0218     struct wait_exceptional_entry_queue ewait;
0219     wait_queue_head_t *wq;
0220 
0221     init_wait(&ewait.wait);
0222     ewait.wait.func = wake_exceptional_entry_func;
0223 
0224     for (;;) {
0225         entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
0226                       &slot);
0227         if (!entry || !radix_tree_exceptional_entry(entry) ||
0228             !slot_locked(mapping, slot)) {
0229             if (slotp)
0230                 *slotp = slot;
0231             return entry;
0232         }
0233 
0234         wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
0235         prepare_to_wait_exclusive(wq, &ewait.wait,
0236                       TASK_UNINTERRUPTIBLE);
0237         spin_unlock_irq(&mapping->tree_lock);
0238         schedule();
0239         finish_wait(wq, &ewait.wait);
0240         spin_lock_irq(&mapping->tree_lock);
0241     }
0242 }
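/*
 * Taken together, these helpers implement the following locking protocol,
 * shown here in sketch form (want_entry() is a stand-in for whatever test a
 * caller applies; error handling elided).  put_locked_mapping_entry() is
 * called only when the entry was actually locked:
 *
 *     spin_lock_irq(&mapping->tree_lock);
 *     entry = get_unlocked_mapping_entry(mapping, index, &slot);
 *     if (want_entry(entry))
 *         entry = lock_slot(mapping, slot);
 *     else
 *         put_unlocked_mapping_entry(mapping, index, entry);
 *     spin_unlock_irq(&mapping->tree_lock);
 *     ...
 *     put_locked_mapping_entry(mapping, index, entry);
 */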
0243 
0244 static void dax_unlock_mapping_entry(struct address_space *mapping,
0245                      pgoff_t index)
0246 {
0247     void *entry, **slot;
0248 
0249     spin_lock_irq(&mapping->tree_lock);
0250     entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
0251     if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
0252              !slot_locked(mapping, slot))) {
0253         spin_unlock_irq(&mapping->tree_lock);
0254         return;
0255     }
0256     unlock_slot(mapping, slot);
0257     spin_unlock_irq(&mapping->tree_lock);
0258     dax_wake_mapping_entry_waiter(mapping, index, entry, false);
0259 }
0260 
0261 static void put_locked_mapping_entry(struct address_space *mapping,
0262                      pgoff_t index, void *entry)
0263 {
0264     if (!radix_tree_exceptional_entry(entry)) {
0265         unlock_page(entry);
0266         put_page(entry);
0267     } else {
0268         dax_unlock_mapping_entry(mapping, index);
0269     }
0270 }
0271 
0272 /*
0273  * Called when we are done with the radix tree entry we looked up via
0274  * get_unlocked_mapping_entry() and which we didn't lock in the end.
0275  */
0276 static void put_unlocked_mapping_entry(struct address_space *mapping,
0277                        pgoff_t index, void *entry)
0278 {
0279     if (!radix_tree_exceptional_entry(entry))
0280         return;
0281 
0282     /* We have to wake up next waiter for the radix tree entry lock */
0283     dax_wake_mapping_entry_waiter(mapping, index, entry, false);
0284 }
0285 
0286 /*
0287  * Find the radix tree entry at the given index. If it points to a page,
0288  * return with the page locked. If it points to an exceptional entry, return
0289  * with the radix tree entry locked. If the radix tree doesn't contain the
0290  * given index, create an empty exceptional entry and return with it locked.
0291  *
0292  * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
0293  * either return that locked entry or will return an error.  This error will
0294  * happen if there are any 4k entries (either zero pages or DAX entries)
0295  * within the 2MiB range that we are requesting.
0296  *
0297  * We always favor 4k entries over 2MiB entries. There isn't a flow where we
0298  * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
0299  * insertion will fail if it finds any 4k entries already in the tree, and a
0300  * 4k insertion will cause an existing 2MiB entry to be unmapped and
0301  * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
0302  * well as 2MiB empty entries.
0303  *
0304  * The exception to this downgrade path is for 2MiB DAX PMD entries that have
0305  * real storage backing them.  We will leave these real 2MiB DAX entries in
0306  * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
0307  *
0308  * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
0309  * persistent memory the benefit is doubtful. We can add that later if we can
0310  * show it helps.
0311  */
0312 static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
0313         unsigned long size_flag)
0314 {
0315     bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
0316     void *entry, **slot;
0317 
0318 restart:
0319     spin_lock_irq(&mapping->tree_lock);
0320     entry = get_unlocked_mapping_entry(mapping, index, &slot);
0321 
0322     if (entry) {
0323         if (size_flag & RADIX_DAX_PMD) {
0324             if (!radix_tree_exceptional_entry(entry) ||
0325                 dax_is_pte_entry(entry)) {
0326                 put_unlocked_mapping_entry(mapping, index,
0327                         entry);
0328                 entry = ERR_PTR(-EEXIST);
0329                 goto out_unlock;
0330             }
0331         } else { /* trying to grab a PTE entry */
0332             if (radix_tree_exceptional_entry(entry) &&
0333                 dax_is_pmd_entry(entry) &&
0334                 (dax_is_zero_entry(entry) ||
0335                  dax_is_empty_entry(entry))) {
0336                 pmd_downgrade = true;
0337             }
0338         }
0339     }
0340 
0341     /* No entry for given index? Make sure radix tree is big enough. */
0342     if (!entry || pmd_downgrade) {
0343         int err;
0344 
0345         if (pmd_downgrade) {
0346             /*
0347              * Make sure 'entry' remains valid while we drop
0348              * mapping->tree_lock.
0349              */
0350             entry = lock_slot(mapping, slot);
0351         }
0352 
0353         spin_unlock_irq(&mapping->tree_lock);
0354         /*
0355          * Besides huge zero pages, the only other entries that get
0356          * downgraded are empty entries, which don't need to be
0357          * unmapped.
0358          */
0359         if (pmd_downgrade && dax_is_zero_entry(entry))
0360             unmap_mapping_range(mapping,
0361                 (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
0362 
0363         err = radix_tree_preload(
0364                 mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
0365         if (err) {
0366             if (pmd_downgrade)
0367                 put_locked_mapping_entry(mapping, index, entry);
0368             return ERR_PTR(err);
0369         }
0370         spin_lock_irq(&mapping->tree_lock);
0371 
0372         if (pmd_downgrade) {
0373             radix_tree_delete(&mapping->page_tree, index);
0374             mapping->nrexceptional--;
0375             dax_wake_mapping_entry_waiter(mapping, index, entry,
0376                     true);
0377         }
0378 
0379         entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
0380 
0381         err = __radix_tree_insert(&mapping->page_tree, index,
0382                 dax_radix_order(entry), entry);
0383         radix_tree_preload_end();
0384         if (err) {
0385             spin_unlock_irq(&mapping->tree_lock);
0386             /*
0387              * Someone already created the entry?  This is a
0388              * normal failure when inserting PMDs in a range
0389              * that already contains PTEs.  In that case we want
0390              * to return -EEXIST immediately.
0391              */
0392             if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD))
0393                 goto restart;
0394             /*
0395              * Our insertion of a DAX PMD entry failed, most
0396              * likely because it collided with a PTE sized entry
0397              * at a different index in the PMD range.  We haven't
0398              * inserted anything into the radix tree and have no
0399              * waiters to wake.
0400              */
0401             return ERR_PTR(err);
0402         }
0403         /* Good, we have inserted empty locked entry into the tree. */
0404         mapping->nrexceptional++;
0405         spin_unlock_irq(&mapping->tree_lock);
0406         return entry;
0407     }
0408     /* Normal page in radix tree? */
0409     if (!radix_tree_exceptional_entry(entry)) {
0410         struct page *page = entry;
0411 
0412         get_page(page);
0413         spin_unlock_irq(&mapping->tree_lock);
0414         lock_page(page);
0415         /* Page got truncated? Retry... */
0416         if (unlikely(page->mapping != mapping)) {
0417             unlock_page(page);
0418             put_page(page);
0419             goto restart;
0420         }
0421         return page;
0422     }
0423     entry = lock_slot(mapping, slot);
0424  out_unlock:
0425     spin_unlock_irq(&mapping->tree_lock);
0426     return entry;
0427 }
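/*
 * To summarize the contract: on success grab_mapping_entry() hands back
 * either a locked page (the hole page case) or a locked exceptional radix
 * tree entry, and the caller must eventually release it with
 * put_locked_mapping_entry().  On failure it returns an ERR_PTR(), with
 * -EEXIST in particular when a PMD entry was requested but incompatible
 * PTE entries already populate the range.
 */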
0428 
0429 /*
0430  * We do not necessarily hold the mapping->tree_lock when we call this
0431  * function so it is possible that 'entry' is no longer a valid item in the
0432  * radix tree.  This is okay because all we really need to do is to find the
0433  * correct waitqueue where tasks might be waiting for that old 'entry' and
0434  * wake them.
0435  */
0436 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
0437         pgoff_t index, void *entry, bool wake_all)
0438 {
0439     struct exceptional_entry_key key;
0440     wait_queue_head_t *wq;
0441 
0442     wq = dax_entry_waitqueue(mapping, index, entry, &key);
0443 
0444     /*
0445      * Checking for locked entry and prepare_to_wait_exclusive() happens
0446      * under mapping->tree_lock, ditto for entry handling in our callers.
0447      * So at this point all tasks that could have seen our entry locked
0448      * must be in the waitqueue and the following check will see them.
0449      */
0450     if (waitqueue_active(wq))
0451         __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
0452 }
0453 
0454 static int __dax_invalidate_mapping_entry(struct address_space *mapping,
0455                       pgoff_t index, bool trunc)
0456 {
0457     int ret = 0;
0458     void *entry;
0459     struct radix_tree_root *page_tree = &mapping->page_tree;
0460 
0461     spin_lock_irq(&mapping->tree_lock);
0462     entry = get_unlocked_mapping_entry(mapping, index, NULL);
0463     if (!entry || !radix_tree_exceptional_entry(entry))
0464         goto out;
0465     if (!trunc &&
0466         (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
0467          radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
0468         goto out;
0469     radix_tree_delete(page_tree, index);
0470     mapping->nrexceptional--;
0471     ret = 1;
0472 out:
0473     put_unlocked_mapping_entry(mapping, index, entry);
0474     spin_unlock_irq(&mapping->tree_lock);
0475     return ret;
0476 }
0477 /*
0478  * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
0479  * entry to get unlocked before deleting it.
0480  */
0481 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
0482 {
0483     int ret = __dax_invalidate_mapping_entry(mapping, index, true);
0484 
0485     /*
0486      * This gets called from truncate / punch_hole path. As such, the caller
0487      * must hold locks protecting against concurrent modifications of the
0488      * radix tree (usually fs-private i_mmap_sem for writing). Since the
0489      * caller has seen an exceptional entry for this index, we had better
0490      * find it at that index as well...
0491      */
0492     WARN_ON_ONCE(!ret);
0493     return ret;
0494 }
0495 
0496 /*
0497  * Invalidate exceptional DAX entry if easily possible. This handles DAX
0498  * entries for invalidate_inode_pages() so we evict the entry only if we can
0499  * do so without blocking.
0500  */
0501 int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
0502 {
0503     int ret = 0;
0504     void *entry, **slot;
0505     struct radix_tree_root *page_tree = &mapping->page_tree;
0506 
0507     spin_lock_irq(&mapping->tree_lock);
0508     entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
0509     if (!entry || !radix_tree_exceptional_entry(entry) ||
0510         slot_locked(mapping, slot))
0511         goto out;
0512     if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
0513         radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
0514         goto out;
0515     radix_tree_delete(page_tree, index);
0516     mapping->nrexceptional--;
0517     ret = 1;
0518 out:
0519     spin_unlock_irq(&mapping->tree_lock);
0520     if (ret)
0521         dax_wake_mapping_entry_waiter(mapping, index, entry, true);
0522     return ret;
0523 }
0524 
0525 /*
0526  * Invalidate exceptional DAX entry if it is clean.
0527  */
0528 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
0529                       pgoff_t index)
0530 {
0531     return __dax_invalidate_mapping_entry(mapping, index, false);
0532 }
0533 
0534 /*
0535  * The user has performed a load from a hole in the file.  Allocating
0536  * a new page in the file would cause excessive storage usage for
0537  * workloads with sparse files.  We allocate a page cache page instead.
0538  * We'll kick it out of the page cache if it's ever written to,
0539  * otherwise it will simply fall out of the page cache under memory
0540  * pressure without ever having been dirtied.
0541  */
0542 static int dax_load_hole(struct address_space *mapping, void **entry,
0543              struct vm_fault *vmf)
0544 {
0545     struct page *page;
0546     int ret;
0547 
0548     /* Hole page already exists? Return it...  */
0549     if (!radix_tree_exceptional_entry(*entry)) {
0550         page = *entry;
0551         goto out;
0552     }
0553 
0554     /* This will replace locked radix tree entry with a hole page */
0555     page = find_or_create_page(mapping, vmf->pgoff,
0556                    vmf->gfp_mask | __GFP_ZERO);
0557     if (!page)
0558         return VM_FAULT_OOM;
0559  out:
0560     vmf->page = page;
0561     ret = finish_fault(vmf);
0562     vmf->page = NULL;
0563     *entry = page;
0564     if (!ret) {
0565         /* Grab reference for PTE that is now referencing the page */
0566         get_page(page);
0567         return VM_FAULT_NOPAGE;
0568     }
0569     return ret;
0570 }
0571 
0572 static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
0573         struct page *to, unsigned long vaddr)
0574 {
0575     struct blk_dax_ctl dax = {
0576         .sector = sector,
0577         .size = size,
0578     };
0579     void *vto;
0580 
0581     if (dax_map_atomic(bdev, &dax) < 0)
0582         return PTR_ERR(dax.addr);
0583     vto = kmap_atomic(to);
0584     copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
0585     kunmap_atomic(vto);
0586     dax_unmap_atomic(bdev, &dax);
0587     return 0;
0588 }
0589 
0590 /*
0591  * By this point grab_mapping_entry() has ensured that we have a locked entry
0592  * of the appropriate size so we don't have to worry about downgrading PMDs to
0593  * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
0594  * already in the tree, we will skip the insertion and just dirty the PMD as
0595  * appropriate.
0596  */
0597 static void *dax_insert_mapping_entry(struct address_space *mapping,
0598                       struct vm_fault *vmf,
0599                       void *entry, sector_t sector,
0600                       unsigned long flags)
0601 {
0602     struct radix_tree_root *page_tree = &mapping->page_tree;
0603     int error = 0;
0604     bool hole_fill = false;
0605     void *new_entry;
0606     pgoff_t index = vmf->pgoff;
0607 
0608     if (vmf->flags & FAULT_FLAG_WRITE)
0609         __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
0610 
0611     /* Replacing hole page with block mapping? */
0612     if (!radix_tree_exceptional_entry(entry)) {
0613         hole_fill = true;
0614         /*
0615          * Unmap the page now before we remove it from page cache below.
0616          * The page is locked so it cannot be faulted in again.
0617          */
0618         unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
0619                     PAGE_SIZE, 0);
0620         error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
0621         if (error)
0622             return ERR_PTR(error);
0623     } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
0624         /* replacing huge zero page with PMD block mapping */
0625         unmap_mapping_range(mapping,
0626             (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
0627     }
0628 
0629     spin_lock_irq(&mapping->tree_lock);
0630     new_entry = dax_radix_locked_entry(sector, flags);
0631 
0632     if (hole_fill) {
0633         __delete_from_page_cache(entry, NULL);
0634         /* Drop pagecache reference */
0635         put_page(entry);
0636         error = __radix_tree_insert(page_tree, index,
0637                 dax_radix_order(new_entry), new_entry);
0638         if (error) {
0639             new_entry = ERR_PTR(error);
0640             goto unlock;
0641         }
0642         mapping->nrexceptional++;
0643     } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
0644         /*
0645          * Only swap our new entry into the radix tree if the current
0646          * entry is a zero page or an empty entry.  If a normal PTE or
0647          * PMD entry is already in the tree, we leave it alone.  This
0648          * means that if we are trying to insert a PTE and the
0649          * existing entry is a PMD, we will just leave the PMD in the
0650          * tree and dirty it if necessary.
0651          */
0652         struct radix_tree_node *node;
0653         void **slot;
0654         void *ret;
0655 
0656         ret = __radix_tree_lookup(page_tree, index, &node, &slot);
0657         WARN_ON_ONCE(ret != entry);
0658         __radix_tree_replace(page_tree, node, slot,
0659                      new_entry, NULL, NULL);
0660     }
0661     if (vmf->flags & FAULT_FLAG_WRITE)
0662         radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
0663  unlock:
0664     spin_unlock_irq(&mapping->tree_lock);
0665     if (hole_fill) {
0666         radix_tree_preload_end();
0667         /*
0668          * We don't need the hole page anymore; it has been replaced with
0669          * a locked radix tree entry now.
0670          */
0671         if (mapping->a_ops->freepage)
0672             mapping->a_ops->freepage(entry);
0673         unlock_page(entry);
0674         put_page(entry);
0675     }
0676     return new_entry;
0677 }
0678 
0679 static inline unsigned long
0680 pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
0681 {
0682     unsigned long address;
0683 
0684     address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
0685     VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
0686     return address;
0687 }
0688 
0689 /* Walk all mappings of a given index of a file and writeprotect them */
0690 static void dax_mapping_entry_mkclean(struct address_space *mapping,
0691                       pgoff_t index, unsigned long pfn)
0692 {
0693     struct vm_area_struct *vma;
0694     pte_t pte, *ptep = NULL;
0695     pmd_t *pmdp = NULL;
0696     spinlock_t *ptl;
0697     bool changed;
0698 
0699     i_mmap_lock_read(mapping);
0700     vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
0701         unsigned long address;
0702 
0703         cond_resched();
0704 
0705         if (!(vma->vm_flags & VM_SHARED))
0706             continue;
0707 
0708         address = pgoff_address(index, vma);
0709         changed = false;
0710         if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl))
0711             continue;
0712 
0713         if (pmdp) {
0714 #ifdef CONFIG_FS_DAX_PMD
0715             pmd_t pmd;
0716 
0717             if (pfn != pmd_pfn(*pmdp))
0718                 goto unlock_pmd;
0719             if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
0720                 goto unlock_pmd;
0721 
0722             flush_cache_page(vma, address, pfn);
0723             pmd = pmdp_huge_clear_flush(vma, address, pmdp);
0724             pmd = pmd_wrprotect(pmd);
0725             pmd = pmd_mkclean(pmd);
0726             set_pmd_at(vma->vm_mm, address, pmdp, pmd);
0727             changed = true;
0728 unlock_pmd:
0729             spin_unlock(ptl);
0730 #endif
0731         } else {
0732             if (pfn != pte_pfn(*ptep))
0733                 goto unlock_pte;
0734             if (!pte_dirty(*ptep) && !pte_write(*ptep))
0735                 goto unlock_pte;
0736 
0737             flush_cache_page(vma, address, pfn);
0738             pte = ptep_clear_flush(vma, address, ptep);
0739             pte = pte_wrprotect(pte);
0740             pte = pte_mkclean(pte);
0741             set_pte_at(vma->vm_mm, address, ptep, pte);
0742             changed = true;
0743 unlock_pte:
0744             pte_unmap_unlock(ptep, ptl);
0745         }
0746 
0747         if (changed)
0748             mmu_notifier_invalidate_page(vma->vm_mm, address);
0749     }
0750     i_mmap_unlock_read(mapping);
0751 }
0752 
0753 static int dax_writeback_one(struct block_device *bdev,
0754         struct address_space *mapping, pgoff_t index, void *entry)
0755 {
0756     struct radix_tree_root *page_tree = &mapping->page_tree;
0757     struct blk_dax_ctl dax;
0758     void *entry2, **slot;
0759     int ret = 0;
0760 
0761     /*
0762      * A page got tagged dirty in DAX mapping? Something is seriously
0763      * wrong.
0764      */
0765     if (WARN_ON(!radix_tree_exceptional_entry(entry)))
0766         return -EIO;
0767 
0768     spin_lock_irq(&mapping->tree_lock);
0769     entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
0770     /* Entry got punched out / reallocated? */
0771     if (!entry2 || !radix_tree_exceptional_entry(entry2))
0772         goto put_unlocked;
0773     /*
0774      * Entry got reallocated elsewhere? No need to write it back. We have to
0775      * compare sectors, as we must not bail out due to a difference in the
0776      * lock bit or entry type.
0777      */
0778     if (dax_radix_sector(entry2) != dax_radix_sector(entry))
0779         goto put_unlocked;
0780     if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
0781                 dax_is_zero_entry(entry))) {
0782         ret = -EIO;
0783         goto put_unlocked;
0784     }
0785 
0786     /* Another fsync thread may have already written back this entry */
0787     if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
0788         goto put_unlocked;
0789     /* Lock the entry to serialize with page faults */
0790     entry = lock_slot(mapping, slot);
0791     /*
0792      * We can clear the tag now but we have to be careful so that concurrent
0793      * dax_writeback_one() calls for the same index cannot finish before we
0794      * actually flush the caches. This is achieved as the calls will look
0795      * at the entry only under tree_lock and once they do that they will
0796      * see the entry locked and wait for it to unlock.
0797      */
0798     radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
0799     spin_unlock_irq(&mapping->tree_lock);
0800 
0801     /*
0802      * Even if dax_writeback_mapping_range() was given a wbc->range_start
0803      * in the middle of a PMD, the 'index' we are given will be aligned to
0804      * the start index of the PMD, as will the sector we pull from
0805      * 'entry'.  This allows us to flush for PMD_SIZE and not have to
0806      * worry about partial PMD writebacks.
0807      */
0808     dax.sector = dax_radix_sector(entry);
0809     dax.size = PAGE_SIZE << dax_radix_order(entry);
0810 
0811     /*
0812      * We cannot hold tree_lock while calling dax_map_atomic() because it
0813      * eventually calls cond_resched().
0814      */
0815     ret = dax_map_atomic(bdev, &dax);
0816     if (ret < 0) {
0817         put_locked_mapping_entry(mapping, index, entry);
0818         return ret;
0819     }
0820 
0821     if (WARN_ON_ONCE(ret < dax.size)) {
0822         ret = -EIO;
0823         goto unmap;
0824     }
0825 
0826     dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
0827     wb_cache_pmem(dax.addr, dax.size);
0828     /*
0829      * After we have flushed the cache, we can clear the dirty tag. There
0830      * cannot be new dirty data in the pfn after the flush has completed as
0831      * the pfn mappings are writeprotected and fault waits for mapping
0832      * entry lock.
0833      */
0834     spin_lock_irq(&mapping->tree_lock);
0835     radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
0836     spin_unlock_irq(&mapping->tree_lock);
0837  unmap:
0838     dax_unmap_atomic(bdev, &dax);
0839     put_locked_mapping_entry(mapping, index, entry);
0840     return ret;
0841 
0842  put_unlocked:
0843     put_unlocked_mapping_entry(mapping, index, entry2);
0844     spin_unlock_irq(&mapping->tree_lock);
0845     return ret;
0846 }
0847 
0848 /*
0849  * Flush the mapping to the persistent domain within the byte range of [start,
0850  * end]. This is required by data integrity operations to ensure file data is
0851  * on persistent storage prior to completion of the operation.
0852  */
0853 int dax_writeback_mapping_range(struct address_space *mapping,
0854         struct block_device *bdev, struct writeback_control *wbc)
0855 {
0856     struct inode *inode = mapping->host;
0857     pgoff_t start_index, end_index;
0858     pgoff_t indices[PAGEVEC_SIZE];
0859     struct pagevec pvec;
0860     bool done = false;
0861     int i, ret = 0;
0862 
0863     if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
0864         return -EIO;
0865 
0866     if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
0867         return 0;
0868 
0869     start_index = wbc->range_start >> PAGE_SHIFT;
0870     end_index = wbc->range_end >> PAGE_SHIFT;
0871 
0872     tag_pages_for_writeback(mapping, start_index, end_index);
0873 
0874     pagevec_init(&pvec, 0);
0875     while (!done) {
0876         pvec.nr = find_get_entries_tag(mapping, start_index,
0877                 PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
0878                 pvec.pages, indices);
0879 
0880         if (pvec.nr == 0)
0881             break;
0882 
0883         for (i = 0; i < pvec.nr; i++) {
0884             if (indices[i] > end_index) {
0885                 done = true;
0886                 break;
0887             }
0888 
0889             ret = dax_writeback_one(bdev, mapping, indices[i],
0890                     pvec.pages[i]);
0891             if (ret < 0)
0892                 return ret;
0893         }
0894     }
0895     return 0;
0896 }
0897 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
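/*
 * A filesystem normally calls this from its ->writepages() method once the
 * mapping is known to be in DAX mode, roughly as in the sketch below (the
 * helper that resolves the inode's block device is filesystem specific and
 * only illustrative):
 *
 *     static int fs_writepages(struct address_space *mapping,
 *                              struct writeback_control *wbc)
 *     {
 *         if (dax_mapping(mapping))
 *             return dax_writeback_mapping_range(mapping,
 *                     fs_inode_bdev(mapping->host), wbc);
 *         return generic_writepages(mapping, wbc);
 *     }
 */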
0898 
0899 static int dax_insert_mapping(struct address_space *mapping,
0900         struct block_device *bdev, sector_t sector, size_t size,
0901         void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
0902 {
0903     unsigned long vaddr = vmf->address;
0904     struct blk_dax_ctl dax = {
0905         .sector = sector,
0906         .size = size,
0907     };
0908     void *ret;
0909     void *entry = *entryp;
0910 
0911     if (dax_map_atomic(bdev, &dax) < 0)
0912         return PTR_ERR(dax.addr);
0913     dax_unmap_atomic(bdev, &dax);
0914 
0915     ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
0916     if (IS_ERR(ret))
0917         return PTR_ERR(ret);
0918     *entryp = ret;
0919 
0920     return vm_insert_mixed(vma, vaddr, dax.pfn);
0921 }
0922 
0923 /**
0924  * dax_pfn_mkwrite - handle first write to DAX page
0925  * @vma: The virtual memory area where the fault occurred
0926  * @vmf: The description of the fault
0927  */
0928 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
0929 {
0930     struct file *file = vma->vm_file;
0931     struct address_space *mapping = file->f_mapping;
0932     void *entry, **slot;
0933     pgoff_t index = vmf->pgoff;
0934 
0935     spin_lock_irq(&mapping->tree_lock);
0936     entry = get_unlocked_mapping_entry(mapping, index, &slot);
0937     if (!entry || !radix_tree_exceptional_entry(entry)) {
0938         if (entry)
0939             put_unlocked_mapping_entry(mapping, index, entry);
0940         spin_unlock_irq(&mapping->tree_lock);
0941         return VM_FAULT_NOPAGE;
0942     }
0943     radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
0944     entry = lock_slot(mapping, slot);
0945     spin_unlock_irq(&mapping->tree_lock);
0946     /*
0947      * If we race with somebody updating the PTE and finish_mkwrite_fault()
0948      * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
0949      * the fault in either case.
0950      */
0951     finish_mkwrite_fault(vmf);
0952     put_locked_mapping_entry(mapping, index, entry);
0953     return VM_FAULT_NOPAGE;
0954 }
0955 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
0956 
0957 static bool dax_range_is_aligned(struct block_device *bdev,
0958                  unsigned int offset, unsigned int length)
0959 {
0960     unsigned short sector_size = bdev_logical_block_size(bdev);
0961 
0962     if (!IS_ALIGNED(offset, sector_size))
0963         return false;
0964     if (!IS_ALIGNED(length, sector_size))
0965         return false;
0966 
0967     return true;
0968 }
0969 
0970 int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
0971         unsigned int offset, unsigned int length)
0972 {
0973     struct blk_dax_ctl dax = {
0974         .sector     = sector,
0975         .size       = PAGE_SIZE,
0976     };
0977 
0978     if (dax_range_is_aligned(bdev, offset, length)) {
0979         sector_t start_sector = dax.sector + (offset >> 9);
0980 
0981         return blkdev_issue_zeroout(bdev, start_sector,
0982                 length >> 9, GFP_NOFS, true);
0983     } else {
0984         if (dax_map_atomic(bdev, &dax) < 0)
0985             return PTR_ERR(dax.addr);
0986         clear_pmem(dax.addr + offset, length);
0987         dax_unmap_atomic(bdev, &dax);
0988     }
0989     return 0;
0990 }
0991 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
0992 
0993 static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
0994 {
0995     return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
0996 }
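/*
 * For example, with 4k pages an extent with iomap->offset == 0 and
 * iomap->blkno == 1000 maps file position 0x3000 to sector
 * 1000 + (0x3000 >> 9) == 1024.
 */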
0997 
0998 static loff_t
0999 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1000         struct iomap *iomap)
1001 {
1002     struct iov_iter *iter = data;
1003     loff_t end = pos + length, done = 0;
1004     ssize_t ret = 0;
1005 
1006     if (iov_iter_rw(iter) == READ) {
1007         end = min(end, i_size_read(inode));
1008         if (pos >= end)
1009             return 0;
1010 
1011         if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
1012             return iov_iter_zero(min(length, end - pos), iter);
1013     }
1014 
1015     if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
1016         return -EIO;
1017 
1018     /*
1019      * A write can allocate a block for an area which has a hole page mapped
1020      * into the page tables. We have to tear down these mappings so that data
1021      * written by write(2) is visible via mmap.
1022      */
1023     if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
1024         invalidate_inode_pages2_range(inode->i_mapping,
1025                           pos >> PAGE_SHIFT,
1026                           (end - 1) >> PAGE_SHIFT);
1027     }
1028 
1029     while (pos < end) {
1030         unsigned offset = pos & (PAGE_SIZE - 1);
1031         struct blk_dax_ctl dax = { 0 };
1032         ssize_t map_len;
1033 
1034         if (fatal_signal_pending(current)) {
1035             ret = -EINTR;
1036             break;
1037         }
1038 
1039         dax.sector = dax_iomap_sector(iomap, pos);
1040         dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
1041         map_len = dax_map_atomic(iomap->bdev, &dax);
1042         if (map_len < 0) {
1043             ret = map_len;
1044             break;
1045         }
1046 
1047         dax.addr += offset;
1048         map_len -= offset;
1049         if (map_len > end - pos)
1050             map_len = end - pos;
1051 
1052         if (iov_iter_rw(iter) == WRITE)
1053             map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
1054         else
1055             map_len = copy_to_iter(dax.addr, map_len, iter);
1056         dax_unmap_atomic(iomap->bdev, &dax);
1057         if (map_len <= 0) {
1058             ret = map_len ? map_len : -EFAULT;
1059             break;
1060         }
1061 
1062         pos += map_len;
1063         length -= map_len;
1064         done += map_len;
1065     }
1066 
1067     return done ? done : ret;
1068 }
1069 
1070 /**
1071  * dax_iomap_rw - Perform I/O to a DAX file
1072  * @iocb:   The control block for this I/O
1073  * @iter:   The addresses to do I/O from or to
1074  * @ops:    iomap ops passed from the file system
1075  *
1076  * This function performs read and write operations to directly mapped
1077  * persistent memory.  The caller needs to take care of read/write exclusion
1078  * and evicting any page cache pages in the region under I/O.
1079  */
1080 ssize_t
1081 dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
1082         struct iomap_ops *ops)
1083 {
1084     struct address_space *mapping = iocb->ki_filp->f_mapping;
1085     struct inode *inode = mapping->host;
1086     loff_t pos = iocb->ki_pos, ret = 0, done = 0;
1087     unsigned flags = 0;
1088 
1089     if (iov_iter_rw(iter) == WRITE)
1090         flags |= IOMAP_WRITE;
1091 
1092     while (iov_iter_count(iter)) {
1093         ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
1094                 iter, dax_iomap_actor);
1095         if (ret <= 0)
1096             break;
1097         pos += ret;
1098         done += ret;
1099     }
1100 
1101     iocb->ki_pos += done;
1102     return done ? done : ret;
1103 }
1104 EXPORT_SYMBOL_GPL(dax_iomap_rw);
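/*
 * Callers wire this up in their ->read_iter/->write_iter methods and provide
 * the read/write exclusion noted above themselves.  A sketch of the read
 * side, with fs_iomap_ops standing in for the filesystem's own iomap_ops:
 *
 *     static ssize_t fs_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
 *     {
 *         struct inode *inode = file_inode(iocb->ki_filp);
 *         ssize_t ret;
 *
 *         inode_lock_shared(inode);
 *         ret = dax_iomap_rw(iocb, to, &fs_iomap_ops);
 *         inode_unlock_shared(inode);
 *         return ret;
 *     }
 */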
1105 
1106 static int dax_fault_return(int error)
1107 {
1108     if (error == 0)
1109         return VM_FAULT_NOPAGE;
1110     if (error == -ENOMEM)
1111         return VM_FAULT_OOM;
1112     return VM_FAULT_SIGBUS;
1113 }
1114 
1115 /**
1116  * dax_iomap_fault - handle a page fault on a DAX file
1117  * @vma: The virtual memory area where the fault occurred
1118  * @vmf: The description of the fault
1119  * @ops: iomap ops passed from the file system
1120  *
1121  * When a page fault occurs, filesystems may call this helper in their fault
1122  * or mkwrite handler for DAX files. Assumes the caller has done all the
1123  * necessary locking for the page fault to proceed successfully.
1124  */
1125 int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1126             struct iomap_ops *ops)
1127 {
1128     struct address_space *mapping = vma->vm_file->f_mapping;
1129     struct inode *inode = mapping->host;
1130     unsigned long vaddr = vmf->address;
1131     loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
1132     sector_t sector;
1133     struct iomap iomap = { 0 };
1134     unsigned flags = IOMAP_FAULT;
1135     int error, major = 0;
1136     int vmf_ret = 0;
1137     void *entry;
1138 
1139     /*
1140      * Check whether offset isn't beyond end of file now. Caller is supposed
1141      * to hold locks serializing us with truncate / punch hole so this is
1142      * a reliable test.
1143      */
1144     if (pos >= i_size_read(inode))
1145         return VM_FAULT_SIGBUS;
1146 
1147     if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
1148         flags |= IOMAP_WRITE;
1149 
1150     /*
1151      * Note that we don't bother to use iomap_apply here: DAX requires
1152      * the filesystem block size to be equal to the page size, which means
1153      * that we never have to deal with more than a single extent here.
1154      */
1155     error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
1156     if (error)
1157         return dax_fault_return(error);
1158     if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
1159         vmf_ret = dax_fault_return(-EIO);   /* fs corruption? */
1160         goto finish_iomap;
1161     }
1162 
1163     entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
1164     if (IS_ERR(entry)) {
1165         vmf_ret = dax_fault_return(PTR_ERR(entry));
1166         goto finish_iomap;
1167     }
1168 
1169     sector = dax_iomap_sector(&iomap, pos);
1170 
1171     if (vmf->cow_page) {
1172         switch (iomap.type) {
1173         case IOMAP_HOLE:
1174         case IOMAP_UNWRITTEN:
1175             clear_user_highpage(vmf->cow_page, vaddr);
1176             break;
1177         case IOMAP_MAPPED:
1178             error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
1179                     vmf->cow_page, vaddr);
1180             break;
1181         default:
1182             WARN_ON_ONCE(1);
1183             error = -EIO;
1184             break;
1185         }
1186 
1187         if (error)
1188             goto error_unlock_entry;
1189 
1190         __SetPageUptodate(vmf->cow_page);
1191         vmf_ret = finish_fault(vmf);
1192         if (!vmf_ret)
1193             vmf_ret = VM_FAULT_DONE_COW;
1194         goto unlock_entry;
1195     }
1196 
1197     switch (iomap.type) {
1198     case IOMAP_MAPPED:
1199         if (iomap.flags & IOMAP_F_NEW) {
1200             count_vm_event(PGMAJFAULT);
1201             mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1202             major = VM_FAULT_MAJOR;
1203         }
1204         error = dax_insert_mapping(mapping, iomap.bdev, sector,
1205                 PAGE_SIZE, &entry, vma, vmf);
1206         /* -EBUSY is fine, somebody else faulted on the same PTE */
1207         if (error == -EBUSY)
1208             error = 0;
1209         break;
1210     case IOMAP_UNWRITTEN:
1211     case IOMAP_HOLE:
1212         if (!(vmf->flags & FAULT_FLAG_WRITE)) {
1213             vmf_ret = dax_load_hole(mapping, &entry, vmf);
1214             goto unlock_entry;
1215         }
1216         /*FALLTHRU*/
1217     default:
1218         WARN_ON_ONCE(1);
1219         error = -EIO;
1220         break;
1221     }
1222 
1223  error_unlock_entry:
1224     vmf_ret = dax_fault_return(error) | major;
1225  unlock_entry:
1226     put_locked_mapping_entry(mapping, vmf->pgoff, entry);
1227  finish_iomap:
1228     if (ops->iomap_end) {
1229         int copied = PAGE_SIZE;
1230 
1231         if (vmf_ret & VM_FAULT_ERROR)
1232             copied = 0;
1233         /*
1234          * The fault is done by now and there's no way back (another
1235          * thread may already be happily using the PTE we have installed).
1236          * Just ignore the error from ->iomap_end since we cannot do much
1237          * with it.
1238          */
1239         ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
1240     }
1241     return vmf_ret;
1242 }
1243 EXPORT_SYMBOL_GPL(dax_iomap_fault);
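/*
 * A filesystem's ->fault handler typically forwards here after taking its
 * own lock that serializes faults against truncate, for example (the
 * fs_mmap_lock_*() helpers and fs_iomap_ops are placeholders for the
 * filesystem's own):
 *
 *     static int fs_filemap_fault(struct vm_area_struct *vma,
 *                                 struct vm_fault *vmf)
 *     {
 *         struct inode *inode = file_inode(vma->vm_file);
 *         int ret;
 *
 *         fs_mmap_lock_shared(inode);
 *         ret = dax_iomap_fault(vma, vmf, &fs_iomap_ops);
 *         fs_mmap_unlock_shared(inode);
 *         return ret;
 *     }
 */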
1244 
1245 #ifdef CONFIG_FS_DAX_PMD
1246 /*
1247  * The 'colour' (i.e. low bits) within a PMD of a page offset.  This comes up
1248  * more often than one might expect in the functions below.
1249  */
1250 #define PG_PMD_COLOUR   ((PMD_SIZE >> PAGE_SHIFT) - 1)
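/*
 * With 4k pages and a 2MiB PMD this evaluates to 511: a page offset is
 * suitably aligned for a PMD entry exactly when (pgoff & PG_PMD_COLOUR) == 0,
 * and (pgoff | PG_PMD_COLOUR) is the last offset covered by that PMD.
 */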
1251 
1252 static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
1253         struct vm_fault *vmf, unsigned long address,
1254         struct iomap *iomap, loff_t pos, bool write, void **entryp)
1255 {
1256     struct address_space *mapping = vma->vm_file->f_mapping;
1257     struct block_device *bdev = iomap->bdev;
1258     struct blk_dax_ctl dax = {
1259         .sector = dax_iomap_sector(iomap, pos),
1260         .size = PMD_SIZE,
1261     };
1262     long length = dax_map_atomic(bdev, &dax);
1263     void *ret;
1264 
1265     if (length < 0) /* dax_map_atomic() failed */
1266         return VM_FAULT_FALLBACK;
1267     if (length < PMD_SIZE)
1268         goto unmap_fallback;
1269     if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
1270         goto unmap_fallback;
1271     if (!pfn_t_devmap(dax.pfn))
1272         goto unmap_fallback;
1273 
1274     dax_unmap_atomic(bdev, &dax);
1275 
1276     ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
1277             RADIX_DAX_PMD);
1278     if (IS_ERR(ret))
1279         return VM_FAULT_FALLBACK;
1280     *entryp = ret;
1281 
1282     return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
1283 
1284  unmap_fallback:
1285     dax_unmap_atomic(bdev, &dax);
1286     return VM_FAULT_FALLBACK;
1287 }
1288 
1289 static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
1290         struct vm_fault *vmf, unsigned long address,
1291         struct iomap *iomap, void **entryp)
1292 {
1293     struct address_space *mapping = vma->vm_file->f_mapping;
1294     unsigned long pmd_addr = address & PMD_MASK;
1295     struct page *zero_page;
1296     spinlock_t *ptl;
1297     pmd_t pmd_entry;
1298     void *ret;
1299 
1300     zero_page = mm_get_huge_zero_page(vma->vm_mm);
1301 
1302     if (unlikely(!zero_page))
1303         return VM_FAULT_FALLBACK;
1304 
1305     ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
1306             RADIX_DAX_PMD | RADIX_DAX_HZP);
1307     if (IS_ERR(ret))
1308         return VM_FAULT_FALLBACK;
1309     *entryp = ret;
1310 
1311     ptl = pmd_lock(vma->vm_mm, pmd);
1312     if (!pmd_none(*pmd)) {
1313         spin_unlock(ptl);
1314         return VM_FAULT_FALLBACK;
1315     }
1316 
1317     pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
1318     pmd_entry = pmd_mkhuge(pmd_entry);
1319     set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
1320     spin_unlock(ptl);
1321     return VM_FAULT_NOPAGE;
1322 }
1323 
1324 int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
1325         pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
1326 {
1327     struct address_space *mapping = vma->vm_file->f_mapping;
1328     unsigned long pmd_addr = address & PMD_MASK;
1329     bool write = flags & FAULT_FLAG_WRITE;
1330     unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
1331     struct inode *inode = mapping->host;
1332     int result = VM_FAULT_FALLBACK;
1333     struct iomap iomap = { 0 };
1334     pgoff_t max_pgoff, pgoff;
1335     struct vm_fault vmf;
1336     void *entry;
1337     loff_t pos;
1338     int error;
1339 
1340     /* Fall back to PTEs if we're going to COW */
1341     if (write && !(vma->vm_flags & VM_SHARED))
1342         goto fallback;
1343 
1344     /* If the PMD would extend outside the VMA */
1345     if (pmd_addr < vma->vm_start)
1346         goto fallback;
1347     if ((pmd_addr + PMD_SIZE) > vma->vm_end)
1348         goto fallback;
1349 
1350     /*
1351      * Check whether offset isn't beyond end of file now. Caller is
1352      * supposed to hold locks serializing us with truncate / punch hole so
1353      * this is a reliable test.
1354      */
1355     pgoff = linear_page_index(vma, pmd_addr);
1356     max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
1357 
1358     if (pgoff > max_pgoff)
1359         return VM_FAULT_SIGBUS;
1360 
1361     /* If the PMD would extend beyond the file size */
1362     if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
1363         goto fallback;
1364 
1365     /*
1366      * Note that we don't use iomap_apply here.  We aren't doing I/O, only
1367      * setting up a mapping, so really we're using iomap_begin() as a way
1368      * to look up our filesystem block.
1369      */
1370     pos = (loff_t)pgoff << PAGE_SHIFT;
1371     error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
1372     if (error)
1373         goto fallback;
1374 
1375     if (iomap.offset + iomap.length < pos + PMD_SIZE)
1376         goto finish_iomap;
1377 
1378     /*
1379      * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
1380      * PMD or a HZP entry.  If it can't (because a 4k page is already in
1381      * the tree, for instance), it will return -EEXIST and we just fall
1382      * back to 4k entries.
1383      */
1384     entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
1385     if (IS_ERR(entry))
1386         goto finish_iomap;
1387 
1388     vmf.pgoff = pgoff;
1389     vmf.flags = flags;
1390     vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
1391 
1392     switch (iomap.type) {
1393     case IOMAP_MAPPED:
1394         result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
1395                 &iomap, pos, write, &entry);
1396         break;
1397     case IOMAP_UNWRITTEN:
1398     case IOMAP_HOLE:
1399         if (WARN_ON_ONCE(write))
1400             goto unlock_entry;
1401         result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
1402                 &entry);
1403         break;
1404     default:
1405         WARN_ON_ONCE(1);
1406         break;
1407     }
1408 
1409  unlock_entry:
1410     put_locked_mapping_entry(mapping, pgoff, entry);
1411  finish_iomap:
1412     if (ops->iomap_end) {
1413         int copied = PMD_SIZE;
1414 
1415         if (result == VM_FAULT_FALLBACK)
1416             copied = 0;
1417         /*
1418          * The fault is done by now and there's no way back (another
1419          * thread may already be happily using the PMD we have installed).
1420          * Just ignore the error from ->iomap_end since we cannot do much
1421          * with it.
1422          */
1423         ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
1424                 &iomap);
1425     }
1426  fallback:
1427     if (result == VM_FAULT_FALLBACK) {
1428         split_huge_pmd(vma, pmd, address);
1429         count_vm_event(THP_FAULT_FALLBACK);
1430     }
1431     return result;
1432 }
1433 EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
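/*
 * In kernels of this vintage a filesystem hooks this up as its
 * vm_operations_struct ->pmd_fault handler, mirroring the PTE case above
 * (fs_iomap_ops is again a placeholder for the filesystem's own iomap_ops):
 *
 *     static int fs_filemap_pmd_fault(struct vm_area_struct *vma,
 *             unsigned long addr, pmd_t *pmd, unsigned int flags)
 *     {
 *         return dax_iomap_pmd_fault(vma, addr, pmd, flags, &fs_iomap_ops);
 *     }
 */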
1434 #endif /* CONFIG_FS_DAX_PMD */