/*
 * fs/dax.c - Direct Access filesystem code
 *
 * DAX support: map file offsets directly to persistent memory through the
 * iomap infrastructure, bypassing the page cache.
 */

0009 #include <linux/atomic.h>
0010 #include <linux/blkdev.h>
0011 #include <linux/buffer_head.h>
0012 #include <linux/dax.h>
0013 #include <linux/fs.h>
0014 #include <linux/highmem.h>
0015 #include <linux/memcontrol.h>
0016 #include <linux/mm.h>
0017 #include <linux/mutex.h>
0018 #include <linux/pagevec.h>
0019 #include <linux/sched.h>
0020 #include <linux/sched/signal.h>
0021 #include <linux/uio.h>
0022 #include <linux/vmstat.h>
0023 #include <linux/pfn_t.h>
0024 #include <linux/sizes.h>
0025 #include <linux/mmu_notifier.h>
0026 #include <linux/iomap.h>
0027 #include <linux/rmap.h>
0028 #include <asm/pgalloc.h>
0029
0030 #define CREATE_TRACE_POINTS
0031 #include <trace/events/fs_dax.h>
0032
0033 static inline unsigned int pe_order(enum page_entry_size pe_size)
0034 {
0035 if (pe_size == PE_SIZE_PTE)
0036 return PAGE_SHIFT - PAGE_SHIFT;
0037 if (pe_size == PE_SIZE_PMD)
0038 return PMD_SHIFT - PAGE_SHIFT;
0039 if (pe_size == PE_SIZE_PUD)
0040 return PUD_SHIFT - PAGE_SHIFT;
0041 return ~0;
0042 }

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

/* The 'colour' (ie low bits) within a PMD of a page offset */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT)

/* The order of a PMD entry */
#define PMD_ORDER	(PMD_SHIFT - PAGE_SHIFT)

0055 static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
0056
0057 static int __init init_dax_wait_table(void)
0058 {
0059 int i;
0060
0061 for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
0062 init_waitqueue_head(wait_table + i);
0063 return 0;
0064 }
0065 fs_initcall(init_dax_wait_table);

/*
 * DAX pagecache entries use XArray value entries so they can't be mistaken
 * for pages.  We use one bit for locking, one bit for the entry size (PMD)
 * and two more to tell us if the entry is a zero page or an empty entry that
 * is just used for locking.  In total four special bits.
 *
 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 * block allocation.
 */
#define DAX_SHIFT	(4)
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)
0082
0083 static unsigned long dax_to_pfn(void *entry)
0084 {
0085 return xa_to_value(entry) >> DAX_SHIFT;
0086 }
0087
0088 static void *dax_make_entry(pfn_t pfn, unsigned long flags)
0089 {
0090 return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
0091 }
0092
0093 static bool dax_is_locked(void *entry)
0094 {
0095 return xa_to_value(entry) & DAX_LOCKED;
0096 }
0097
0098 static unsigned int dax_entry_order(void *entry)
0099 {
0100 if (xa_to_value(entry) & DAX_PMD)
0101 return PMD_ORDER;
0102 return 0;
0103 }
0104
0105 static unsigned long dax_is_pmd_entry(void *entry)
0106 {
0107 return xa_to_value(entry) & DAX_PMD;
0108 }
0109
0110 static bool dax_is_pte_entry(void *entry)
0111 {
0112 return !(xa_to_value(entry) & DAX_PMD);
0113 }
0114
0115 static int dax_is_zero_entry(void *entry)
0116 {
0117 return xa_to_value(entry) & DAX_ZERO_PAGE;
0118 }
0119
0120 static int dax_is_empty_entry(void *entry)
0121 {
0122 return xa_to_value(entry) & DAX_EMPTY;
0123 }
0124
/*
 * true if the entry that was found is of a smaller order than the entry
 * we were looking for
 */
static bool dax_is_conflict(void *entry)
{
	return entry == XA_RETRY_ENTRY;
}
0133
0134
0135
0136
0137 struct exceptional_entry_key {
0138 struct xarray *xa;
0139 pgoff_t entry_start;
0140 };
0141
0142 struct wait_exceptional_entry_queue {
0143 wait_queue_entry_t wait;
0144 struct exceptional_entry_key key;
0145 };

/**
 * enum dax_wake_mode: waitqueue wakeup behaviour
 * @WAKE_ALL: wake all waiters in the waitqueue
 * @WAKE_NEXT: wake only the first waiter in the waitqueue
 */
enum dax_wake_mode {
	WAKE_ALL,
	WAKE_NEXT,
};
0156
0157 static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
0158 void *entry, struct exceptional_entry_key *key)
0159 {
0160 unsigned long hash;
0161 unsigned long index = xas->xa_index;

	/*
	 * If 'entry' is a PMD, align the 'index' that we use for the wait
	 * queue to the start of that PMD.  This ensures that all offsets in
	 * the range covered by the PMD map to the same bit lock.
	 */
	if (dax_is_pmd_entry(entry))
		index &= ~PG_PMD_COLOUR;
	key->xa = xas->xa;
	key->entry_start = index;
0172
0173 hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
0174 return wait_table + hash;
0175 }
0176
0177 static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
0178 unsigned int mode, int sync, void *keyp)
0179 {
0180 struct exceptional_entry_key *key = keyp;
0181 struct wait_exceptional_entry_queue *ewait =
0182 container_of(wait, struct wait_exceptional_entry_queue, wait);
0183
0184 if (key->xa != ewait->key.xa ||
0185 key->entry_start != ewait->key.entry_start)
0186 return 0;
0187 return autoremove_wake_function(wait, mode, sync, NULL);
0188 }
0189
/*
 * @entry may no longer be the entry at the index in the mapping.
 * The important information it's conveying is whether the entry at
 * this index used to be a PMD entry.
 */
static void dax_wake_entry(struct xa_state *xas, void *entry,
			   enum dax_wake_mode mode)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(xas, entry, &key);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under the i_pages lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * are in the waitqueue and the one we care about is actually woken up.
	 */
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
}
0212
/*
 * Look up entry in page cache, wait for it to become unlocked if it
 * is a DAX entry and return it.  The caller must subsequently call
 * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
 * if it did.  The entry returned may have a larger order than @order.
 * If @order is larger than the order of the entry found in i_pages, this
 * function returns a dax_is_conflict entry.
 *
 * Must be called with the i_pages lock held.
 */
0223 static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
0224 {
0225 void *entry;
0226 struct wait_exceptional_entry_queue ewait;
0227 wait_queue_head_t *wq;
0228
0229 init_wait(&ewait.wait);
0230 ewait.wait.func = wake_exceptional_entry_func;
0231
0232 for (;;) {
0233 entry = xas_find_conflict(xas);
0234 if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
0235 return entry;
0236 if (dax_entry_order(entry) < order)
0237 return XA_RETRY_ENTRY;
0238 if (!dax_is_locked(entry))
0239 return entry;
0240
0241 wq = dax_entry_waitqueue(xas, entry, &ewait.key);
0242 prepare_to_wait_exclusive(wq, &ewait.wait,
0243 TASK_UNINTERRUPTIBLE);
0244 xas_unlock_irq(xas);
0245 xas_reset(xas);
0246 schedule();
0247 finish_wait(wq, &ewait.wait);
0248 xas_lock_irq(xas);
0249 }
0250 }
0251
0252
0253
0254
0255
0256
0257 static void wait_entry_unlocked(struct xa_state *xas, void *entry)
0258 {
0259 struct wait_exceptional_entry_queue ewait;
0260 wait_queue_head_t *wq;
0261
0262 init_wait(&ewait.wait);
0263 ewait.wait.func = wake_exceptional_entry_func;
0264
0265 wq = dax_entry_waitqueue(xas, entry, &ewait.key);
	/*
	 * Unlike get_unlocked_entry() there is no guarantee that this
	 * path ever successfully retrieves an unlocked entry before an
	 * inode dies.  Perform a non-exclusive wait in case this path
	 * never successfully performs its own wake up.
	 */
0272 prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
0273 xas_unlock_irq(xas);
0274 schedule();
0275 finish_wait(wq, &ewait.wait);
0276 }
0277
0278 static void put_unlocked_entry(struct xa_state *xas, void *entry,
0279 enum dax_wake_mode mode)
0280 {
0281 if (entry && !dax_is_conflict(entry))
0282 dax_wake_entry(xas, entry, mode);
0283 }
0284
/*
 * We used the xa_state to get the entry, but then we locked the entry and
 * dropped the xa_lock, so we know the xa_state is stale and must be reset
 * before use.
 */
0290 static void dax_unlock_entry(struct xa_state *xas, void *entry)
0291 {
0292 void *old;
0293
0294 BUG_ON(dax_is_locked(entry));
0295 xas_reset(xas);
0296 xas_lock_irq(xas);
0297 old = xas_store(xas, entry);
0298 xas_unlock_irq(xas);
0299 BUG_ON(!dax_is_locked(old));
0300 dax_wake_entry(xas, entry, WAKE_NEXT);
0301 }
0302
0303
0304
0305
0306 static void *dax_lock_entry(struct xa_state *xas, void *entry)
0307 {
0308 unsigned long v = xa_to_value(entry);
0309 return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
0310 }
0311
0312 static unsigned long dax_entry_size(void *entry)
0313 {
0314 if (dax_is_zero_entry(entry))
0315 return 0;
0316 else if (dax_is_empty_entry(entry))
0317 return 0;
0318 else if (dax_is_pmd_entry(entry))
0319 return PMD_SIZE;
0320 else
0321 return PAGE_SIZE;
0322 }
0323
0324 static unsigned long dax_end_pfn(void *entry)
0325 {
0326 return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
0327 }
0328
/*
 * Iterate through all mapped pfns represented by an entry, i.e. skip
 * 'empty' and 'zero' entries.
 */
0333 #define for_each_mapped_pfn(entry, pfn) \
0334 for (pfn = dax_to_pfn(entry); \
0335 pfn < dax_end_pfn(entry); pfn++)
0336
0337 static inline bool dax_mapping_is_cow(struct address_space *mapping)
0338 {
0339 return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
0340 }
0341
0342
0343
0344
0345 static inline void dax_mapping_set_cow(struct page *page)
0346 {
0347 if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
0348
0349
0350
0351
0352 if (page->mapping)
0353 page->index = 1;
0354 page->mapping = (void *)PAGE_MAPPING_DAX_COW;
0355 }
0356 page->index++;
0357 }
0358
/*
 * When called from dax_insert_entry(), the cow flag indicates that the
 * entry is shared by multiple files.  In that case page->mapping is set
 * to the PAGE_MAPPING_DAX_COW sentinel and page->index is used as a
 * refcount instead of a file offset.
 */
0364 static void dax_associate_entry(void *entry, struct address_space *mapping,
0365 struct vm_area_struct *vma, unsigned long address, bool cow)
0366 {
0367 unsigned long size = dax_entry_size(entry), pfn, index;
0368 int i = 0;
0369
0370 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
0371 return;
0372
0373 index = linear_page_index(vma, address & ~(size - 1));
0374 for_each_mapped_pfn(entry, pfn) {
0375 struct page *page = pfn_to_page(pfn);
0376
0377 if (cow) {
0378 dax_mapping_set_cow(page);
0379 } else {
0380 WARN_ON_ONCE(page->mapping);
0381 page->mapping = mapping;
0382 page->index = index + i++;
0383 }
0384 }
0385 }
0386
0387 static void dax_disassociate_entry(void *entry, struct address_space *mapping,
0388 bool trunc)
0389 {
0390 unsigned long pfn;
0391
0392 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
0393 return;
0394
0395 for_each_mapped_pfn(entry, pfn) {
0396 struct page *page = pfn_to_page(pfn);
0397
0398 WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
0399 if (dax_mapping_is_cow(page->mapping)) {
0400
0401 if (page->index-- > 0)
0402 continue;
0403 } else
0404 WARN_ON_ONCE(page->mapping && page->mapping != mapping);
0405 page->mapping = NULL;
0406 page->index = 0;
0407 }
0408 }
0409
0410 static struct page *dax_busy_page(void *entry)
0411 {
0412 unsigned long pfn;
0413
0414 for_each_mapped_pfn(entry, pfn) {
0415 struct page *page = pfn_to_page(pfn);
0416
0417 if (page_ref_count(page) > 1)
0418 return page;
0419 }
0420 return NULL;
0421 }
0422
/*
 * dax_lock_page - Lock the DAX entry corresponding to a page
 * @page: The page whose entry we want to lock
 *
 * Context: Process context.
 * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
 * not be locked.
 */
0431 dax_entry_t dax_lock_page(struct page *page)
0432 {
0433 XA_STATE(xas, NULL, 0);
0434 void *entry;
0435
0436
0437 rcu_read_lock();
0438 for (;;) {
0439 struct address_space *mapping = READ_ONCE(page->mapping);
0440
0441 entry = NULL;
0442 if (!mapping || !dax_mapping(mapping))
0443 break;
0444
0445
0446
0447
0448
0449
0450
0451
0452 entry = (void *)~0UL;
0453 if (S_ISCHR(mapping->host->i_mode))
0454 break;
0455
0456 xas.xa = &mapping->i_pages;
0457 xas_lock_irq(&xas);
0458 if (mapping != page->mapping) {
0459 xas_unlock_irq(&xas);
0460 continue;
0461 }
0462 xas_set(&xas, page->index);
0463 entry = xas_load(&xas);
0464 if (dax_is_locked(entry)) {
0465 rcu_read_unlock();
0466 wait_entry_unlocked(&xas, entry);
0467 rcu_read_lock();
0468 continue;
0469 }
0470 dax_lock_entry(&xas, entry);
0471 xas_unlock_irq(&xas);
0472 break;
0473 }
0474 rcu_read_unlock();
0475 return (dax_entry_t)entry;
0476 }
0477
0478 void dax_unlock_page(struct page *page, dax_entry_t cookie)
0479 {
0480 struct address_space *mapping = page->mapping;
0481 XA_STATE(xas, &mapping->i_pages, page->index);
0482
0483 if (S_ISCHR(mapping->host->i_mode))
0484 return;
0485
0486 dax_unlock_entry(&xas, (void *)cookie);
0487 }
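
/*
 * Illustrative sketch (not compiled): how a caller such as the memory
 * failure path might use dax_lock_page()/dax_unlock_page() to keep the
 * page's DAX entry stable while it inspects page->mapping and page->index.
 * The helper collect_and_kill_procs() is hypothetical.
 */
#if 0
static int handle_dax_page_error(struct page *page, unsigned long pfn)
{
	dax_entry_t cookie;

	/* Lock the DAX entry; 0 means the mapping went away under us. */
	cookie = dax_lock_page(page);
	if (!cookie)
		return -EBUSY;

	/*
	 * page->mapping and page->index are now stable, so the owning file
	 * and offset can be resolved, e.g. to notify the filesystem or to
	 * signal the processes mapping this range (hypothetical helper).
	 */
	collect_and_kill_procs(page->mapping, page->index, pfn);

	dax_unlock_page(page, cookie);
	return 0;
}
#endif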
0488
/*
 * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
 * @mapping: the file's mapping whose entry we want to lock
 * @index: the offset within this file
 * @page: output the dax page corresponding to this dax entry
 *
 * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
 * could not be locked.
 */
0498 dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
0499 struct page **page)
0500 {
0501 XA_STATE(xas, NULL, 0);
0502 void *entry;
0503
0504 rcu_read_lock();
0505 for (;;) {
0506 entry = NULL;
0507 if (!dax_mapping(mapping))
0508 break;
0509
0510 xas.xa = &mapping->i_pages;
0511 xas_lock_irq(&xas);
0512 xas_set(&xas, index);
0513 entry = xas_load(&xas);
0514 if (dax_is_locked(entry)) {
0515 rcu_read_unlock();
0516 wait_entry_unlocked(&xas, entry);
0517 rcu_read_lock();
0518 continue;
0519 }
0520 if (!entry ||
0521 dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
0522
0523
0524
0525
0526
0527
0528
0529 entry = (void *)~0UL;
0530 } else {
0531 *page = pfn_to_page(dax_to_pfn(entry));
0532 dax_lock_entry(&xas, entry);
0533 }
0534 xas_unlock_irq(&xas);
0535 break;
0536 }
0537 rcu_read_unlock();
0538 return (dax_entry_t)entry;
0539 }
0540
0541 void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
0542 dax_entry_t cookie)
0543 {
0544 XA_STATE(xas, &mapping->i_pages, index);
0545
0546 if (cookie == ~0UL)
0547 return;
0548
0549 dax_unlock_entry(&xas, (void *)cookie);
0550 }
0551
/*
 * Find page cache entry at given index. If it is a DAX entry, return it
 * with the entry locked. If the page cache doesn't contain an entry at
 * that index, add a locked empty entry.
 *
 * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return VM_FAULT_FALLBACK.
 * This will happen if there are any PTE entries within the PMD range
 * that we are requesting.
 *
 * We always favor PTE entries over PMD entries. There isn't a flow where
 * we evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
 * insertion will fail if it finds any PTE entries already in the tree,
 * and a PTE insertion will cause an existing PMD entry to be unmapped
 * and downgraded to a PTE entry.  This happens for both PMD zero pages
 * as well as PMD empty entries.
 *
 * The exception to this downgrade path is for PMD entries that have
 * real storage backing them.  We will leave these real PMD entries in
 * the tree, and PTE writes will simply dirty the entire PMD entry.
 *
 * On error, return a VM_FAULT code.
 */
0581 static void *grab_mapping_entry(struct xa_state *xas,
0582 struct address_space *mapping, unsigned int order)
0583 {
0584 unsigned long index = xas->xa_index;
0585 bool pmd_downgrade;
0586 void *entry;
0587
0588 retry:
0589 pmd_downgrade = false;
0590 xas_lock_irq(xas);
0591 entry = get_unlocked_entry(xas, order);
0592
0593 if (entry) {
0594 if (dax_is_conflict(entry))
0595 goto fallback;
0596 if (!xa_is_value(entry)) {
0597 xas_set_err(xas, -EIO);
0598 goto out_unlock;
0599 }
0600
0601 if (order == 0) {
0602 if (dax_is_pmd_entry(entry) &&
0603 (dax_is_zero_entry(entry) ||
0604 dax_is_empty_entry(entry))) {
0605 pmd_downgrade = true;
0606 }
0607 }
0608 }
0609
0610 if (pmd_downgrade) {
		/*
		 * Make sure 'entry' remains valid while we drop
		 * the i_pages lock.
		 */
		dax_lock_entry(xas, entry);

		/*
		 * Besides huge zero pages the only other thing that gets
		 * downgraded are empty entries which don't need to be
		 * unmapped.
		 */
0622 if (dax_is_zero_entry(entry)) {
0623 xas_unlock_irq(xas);
0624 unmap_mapping_pages(mapping,
0625 xas->xa_index & ~PG_PMD_COLOUR,
0626 PG_PMD_NR, false);
0627 xas_reset(xas);
0628 xas_lock_irq(xas);
0629 }
0630
0631 dax_disassociate_entry(entry, mapping, false);
0632 xas_store(xas, NULL);
0633 dax_wake_entry(xas, entry, WAKE_ALL);
0634 mapping->nrpages -= PG_PMD_NR;
0635 entry = NULL;
0636 xas_set(xas, index);
0637 }
0638
0639 if (entry) {
0640 dax_lock_entry(xas, entry);
0641 } else {
0642 unsigned long flags = DAX_EMPTY;
0643
0644 if (order > 0)
0645 flags |= DAX_PMD;
0646 entry = dax_make_entry(pfn_to_pfn_t(0), flags);
0647 dax_lock_entry(xas, entry);
0648 if (xas_error(xas))
0649 goto out_unlock;
0650 mapping->nrpages += 1UL << order;
0651 }
0652
0653 out_unlock:
0654 xas_unlock_irq(xas);
0655 if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
0656 goto retry;
0657 if (xas->xa_node == XA_ERROR(-ENOMEM))
0658 return xa_mk_internal(VM_FAULT_OOM);
0659 if (xas_error(xas))
0660 return xa_mk_internal(VM_FAULT_SIGBUS);
0661 return entry;
0662 fallback:
0663 xas_unlock_irq(xas);
0664 return xa_mk_internal(VM_FAULT_FALLBACK);
0665 }
0666
/**
 * dax_layout_busy_page_range - find first pinned page in @mapping
 * @mapping: address space to scan for a page with ref count > 1
 * @start: Starting offset. Page containing 'start' is included.
 * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
 *       pages from 'start' till the end of file are included.
 *
 * DAX requires ZONE_DEVICE mapped pages. These pages are never
 * 'onlined' to the page allocator so they are considered idle when
 * page->count == 1. A filesystem uses this interface to determine if
 * any page in the mapping is busy, i.e. for DMA, or other
 * get_user_pages() usages.
 *
 * It is expected that the filesystem is holding locks to block the
 * establishment of new mappings in this address_space. I.e. it expects
 * to be able to run unmap_mapping_range() and subsequently not race
 * mapping_mapped() becoming true.
 */
0685 struct page *dax_layout_busy_page_range(struct address_space *mapping,
0686 loff_t start, loff_t end)
0687 {
0688 void *entry;
0689 unsigned int scanned = 0;
0690 struct page *page = NULL;
0691 pgoff_t start_idx = start >> PAGE_SHIFT;
0692 pgoff_t end_idx;
0693 XA_STATE(xas, &mapping->i_pages, start_idx);
0694
0695
0696
0697
0698 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
0699 return NULL;
0700
0701 if (!dax_mapping(mapping) || !mapping_mapped(mapping))
0702 return NULL;
0703
0704
0705 if (end == LLONG_MAX)
0706 end_idx = ULONG_MAX;
0707 else
0708 end_idx = end >> PAGE_SHIFT;
0709
0710
0711
0712
0713
0714
0715
0716
0717
0718
0719
0720
0721 unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
0722
0723 xas_lock_irq(&xas);
0724 xas_for_each(&xas, entry, end_idx) {
0725 if (WARN_ON_ONCE(!xa_is_value(entry)))
0726 continue;
0727 if (unlikely(dax_is_locked(entry)))
0728 entry = get_unlocked_entry(&xas, 0);
0729 if (entry)
0730 page = dax_busy_page(entry);
0731 put_unlocked_entry(&xas, entry, WAKE_NEXT);
0732 if (page)
0733 break;
0734 if (++scanned % XA_CHECK_SCHED)
0735 continue;
0736
0737 xas_pause(&xas);
0738 xas_unlock_irq(&xas);
0739 cond_resched();
0740 xas_lock_irq(&xas);
0741 }
0742 xas_unlock_irq(&xas);
0743 return page;
0744 }
0745 EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
0746
0747 struct page *dax_layout_busy_page(struct address_space *mapping)
0748 {
0749 return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
0750 }
0751 EXPORT_SYMBOL_GPL(dax_layout_busy_page);
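
/*
 * Illustrative sketch (not compiled): a filesystem typically calls
 * dax_layout_busy_page() before truncating or punching a hole, and waits
 * for any page still pinned (e.g. for DMA) to be released.  The locks that
 * block new mappings are the caller's responsibility and are omitted here;
 * fs_dax_wait_page_idle() is a hypothetical wrapper that waits for the
 * page refcount to drop back to 1.
 */
#if 0
static int example_break_dax_layouts(struct inode *inode)
{
	struct page *page;

	for (;;) {
		page = dax_layout_busy_page(inode->i_mapping);
		if (!page)
			return 0;
		/* Wait for the elevated reference to be dropped, then rescan. */
		fs_dax_wait_page_idle(page);
	}
}
#endif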
0752
0753 static int __dax_invalidate_entry(struct address_space *mapping,
0754 pgoff_t index, bool trunc)
0755 {
0756 XA_STATE(xas, &mapping->i_pages, index);
0757 int ret = 0;
0758 void *entry;
0759
0760 xas_lock_irq(&xas);
0761 entry = get_unlocked_entry(&xas, 0);
0762 if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
0763 goto out;
0764 if (!trunc &&
0765 (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
0766 xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
0767 goto out;
0768 dax_disassociate_entry(entry, mapping, trunc);
0769 xas_store(&xas, NULL);
0770 mapping->nrpages -= 1UL << dax_entry_order(entry);
0771 ret = 1;
0772 out:
0773 put_unlocked_entry(&xas, entry, WAKE_ALL);
0774 xas_unlock_irq(&xas);
0775 return ret;
0776 }
0777
/*
 * Delete DAX entry at @index from @mapping.  Wait for it
 * to be unlocked before deleting it.
 */
0782 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
0783 {
0784 int ret = __dax_invalidate_entry(mapping, index, true);
0785
	/*
	 * This gets called from truncate / punch_hole path. As such, the
	 * caller must hold locks protecting against concurrent modifications
	 * of the page cache (usually fs-private i_mmap_sem for writing).
	 * Since the caller has seen a DAX entry for this index, we better
	 * find it or be otherwise able to invalidate it (usually a dirty
	 * entry is pending writeback).
	 */
0793 WARN_ON_ONCE(!ret);
0794 return ret;
0795 }
0796
/*
 * Invalidate DAX entry if it is clean.
 */
0800 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
0801 pgoff_t index)
0802 {
0803 return __dax_invalidate_entry(mapping, index, false);
0804 }
0805
0806 static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
0807 {
0808 return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
0809 }
0810
0811 static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
0812 {
0813 pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
0814 void *vto, *kaddr;
0815 long rc;
0816 int id;
0817
0818 id = dax_read_lock();
0819 rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
0820 &kaddr, NULL);
0821 if (rc < 0) {
0822 dax_read_unlock(id);
0823 return rc;
0824 }
0825 vto = kmap_atomic(vmf->cow_page);
0826 copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
0827 kunmap_atomic(vto);
0828 dax_read_unlock(id);
0829 return 0;
0830 }
0831
/*
 * MAP_SYNC on a dax mapping guarantees dirty metadata is
 * flushed on write-faults (non-cow), but not read-faults.
 */
0836 static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
0837 struct vm_area_struct *vma)
0838 {
0839 return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
0840 (iter->iomap.flags & IOMAP_F_DIRTY);
0841 }
0842
0843 static bool dax_fault_is_cow(const struct iomap_iter *iter)
0844 {
0845 return (iter->flags & IOMAP_WRITE) &&
0846 (iter->iomap.flags & IOMAP_F_SHARED);
0847 }
0848
/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
 */
0856 static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
0857 const struct iomap_iter *iter, void *entry, pfn_t pfn,
0858 unsigned long flags)
0859 {
0860 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
0861 void *new_entry = dax_make_entry(pfn, flags);
0862 bool dirty = !dax_fault_is_synchronous(iter, vmf->vma);
0863 bool cow = dax_fault_is_cow(iter);
0864
0865 if (dirty)
0866 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
0867
0868 if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
0869 unsigned long index = xas->xa_index;
0870
0871 if (dax_is_pmd_entry(entry))
0872 unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
0873 PG_PMD_NR, false);
0874 else
0875 unmap_mapping_pages(mapping, index, 1, false);
0876 }
0877
0878 xas_reset(xas);
0879 xas_lock_irq(xas);
0880 if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
0881 void *old;
0882
0883 dax_disassociate_entry(entry, mapping, false);
0884 dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
0885 cow);
0886
0887
0888
0889
0890
0891
0892
0893
0894 old = dax_lock_entry(xas, new_entry);
0895 WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
0896 DAX_LOCKED));
0897 entry = new_entry;
0898 } else {
0899 xas_load(xas);
0900 }
0901
0902 if (dirty)
0903 xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
0904
0905 if (cow)
0906 xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
0907
0908 xas_unlock_irq(xas);
0909 return entry;
0910 }
0911
0912 static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
0913 struct address_space *mapping, void *entry)
0914 {
0915 unsigned long pfn, index, count, end;
0916 long ret = 0;
0917 struct vm_area_struct *vma;
0918
0919
0920
0921
0922
0923 if (WARN_ON(!xa_is_value(entry)))
0924 return -EIO;
0925
0926 if (unlikely(dax_is_locked(entry))) {
0927 void *old_entry = entry;
0928
0929 entry = get_unlocked_entry(xas, 0);
0930
0931
0932 if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
0933 goto put_unlocked;
0934
0935
0936
0937
0938
0939 if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
0940 goto put_unlocked;
0941 if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
0942 dax_is_zero_entry(entry))) {
0943 ret = -EIO;
0944 goto put_unlocked;
0945 }
0946
0947
0948 if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
0949 goto put_unlocked;
0950 }
0951
0952
0953 dax_lock_entry(xas, entry);
0954
0955
0956
0957
0958
0959
0960
0961
0962 xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
0963 xas_unlock_irq(xas);
0964
0965
0966
0967
0968
0969
0970
0971
0972 pfn = dax_to_pfn(entry);
0973 count = 1UL << dax_entry_order(entry);
0974 index = xas->xa_index & ~(count - 1);
0975 end = index + count - 1;
0976
0977
0978 i_mmap_lock_read(mapping);
0979 vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
0980 pfn_mkclean_range(pfn, count, index, vma);
0981 cond_resched();
0982 }
0983 i_mmap_unlock_read(mapping);
0984
0985 dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
0986
0987
0988
0989
0990
0991
0992 xas_reset(xas);
0993 xas_lock_irq(xas);
0994 xas_store(xas, entry);
0995 xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
0996 dax_wake_entry(xas, entry, WAKE_NEXT);
0997
0998 trace_dax_writeback_one(mapping->host, index, count);
0999 return ret;
1000
1001 put_unlocked:
1002 put_unlocked_entry(xas, entry, WAKE_NEXT);
1003 return ret;
1004 }
1005
/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
1011 int dax_writeback_mapping_range(struct address_space *mapping,
1012 struct dax_device *dax_dev, struct writeback_control *wbc)
1013 {
1014 XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
1015 struct inode *inode = mapping->host;
1016 pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
1017 void *entry;
1018 int ret = 0;
1019 unsigned int scanned = 0;
1020
1021 if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
1022 return -EIO;
1023
1024 if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
1025 return 0;
1026
1027 trace_dax_writeback_range(inode, xas.xa_index, end_index);
1028
1029 tag_pages_for_writeback(mapping, xas.xa_index, end_index);
1030
1031 xas_lock_irq(&xas);
1032 xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
1033 ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
1034 if (ret < 0) {
1035 mapping_set_error(mapping, ret);
1036 break;
1037 }
1038 if (++scanned % XA_CHECK_SCHED)
1039 continue;
1040
1041 xas_pause(&xas);
1042 xas_unlock_irq(&xas);
1043 cond_resched();
1044 xas_lock_irq(&xas);
1045 }
1046 xas_unlock_irq(&xas);
1047 trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
1048 return ret;
1049 }
1050 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
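
/*
 * Illustrative sketch (not compiled): a DAX-enabled filesystem usually wires
 * dax_writeback_mapping_range() into its ->writepages() method so that
 * fsync()/msync() flush CPU caches for dirty DAX entries.  How the
 * dax_device is looked up is filesystem specific; struct example_sb_info
 * and its s_daxdev field below are hypothetical.
 */
#if 0
static int example_dax_writepages(struct address_space *mapping,
				  struct writeback_control *wbc)
{
	struct example_sb_info *sbi = mapping->host->i_sb->s_fs_info;

	return dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
}
#endif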
1051
1052 static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
1053 size_t size, void **kaddr, pfn_t *pfnp)
1054 {
1055 pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
1056 int id, rc = 0;
1057 long length;
1058
1059 id = dax_read_lock();
1060 length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
1061 DAX_ACCESS, kaddr, pfnp);
1062 if (length < 0) {
1063 rc = length;
1064 goto out;
1065 }
1066 if (!pfnp)
1067 goto out_check_addr;
1068 rc = -EINVAL;
1069 if (PFN_PHYS(length) < size)
1070 goto out;
1071 if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
1072 goto out;
1073
1074 if (length > 1 && !pfn_t_devmap(*pfnp))
1075 goto out;
1076 rc = 0;
1077
1078 out_check_addr:
1079 if (!kaddr)
1080 goto out;
1081 if (!*kaddr)
1082 rc = -EFAULT;
1083 out:
1084 dax_read_unlock(id);
1085 return rc;
1086 }
1087
/**
 * dax_iomap_cow_copy - Copy the data from source to destination before write
 * @pos:	address to do copy from.
 * @length:	size of copy operation.
 * @align_size:	aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
 * @srcmap:	iomap srcmap
 * @daddr:	destination address to copy to.
 *
 * This can be called from two places: during DIO, or for a write fault from
 * dax_iomap_fault().  When @pos and @length cover the whole @align_size
 * aligned block, the entire block is copied from @srcmap; otherwise only the
 * unaligned head and tail around [@pos, @pos + @length) are copied, since
 * the middle will be overwritten with the new data anyway.
 */
1102 static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
1103 const struct iomap *srcmap, void *daddr)
1104 {
1105 loff_t head_off = pos & (align_size - 1);
1106 size_t size = ALIGN(head_off + length, align_size);
1107 loff_t end = pos + length;
1108 loff_t pg_end = round_up(end, align_size);
1109 bool copy_all = head_off == 0 && end == pg_end;
1110 void *saddr = 0;
1111 int ret = 0;
1112
1113 ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
1114 if (ret)
1115 return ret;
1116
1117 if (copy_all) {
1118 ret = copy_mc_to_kernel(daddr, saddr, length);
1119 return ret ? -EIO : 0;
1120 }
1121
1122
1123 if (head_off) {
1124 ret = copy_mc_to_kernel(daddr, saddr, head_off);
1125 if (ret)
1126 return -EIO;
1127 }
1128
1129
1130 if (end < pg_end) {
1131 loff_t tail_off = head_off + length;
1132 loff_t tail_len = pg_end - end;
1133
1134 ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off,
1135 tail_len);
1136 if (ret)
1137 return -EIO;
1138 }
1139 return 0;
1140 }
1141
/*
 * The user has performed a load from a hole in the file.  Allocating a new
 * page in the file would cause excessive storage usage for workloads with
 * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
 * If this page is ever written to we will re-fault and change the mapping to
 * point to real DAX storage instead.
 */
1149 static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
1150 const struct iomap_iter *iter, void **entry)
1151 {
1152 struct inode *inode = iter->inode;
1153 unsigned long vaddr = vmf->address;
1154 pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
1155 vm_fault_t ret;
1156
1157 *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
1158
1159 ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
1160 trace_dax_load_hole(inode, vmf, ret);
1161 return ret;
1162 }
1163
1164 #ifdef CONFIG_FS_DAX_PMD
1165 static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
1166 const struct iomap_iter *iter, void **entry)
1167 {
1168 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1169 unsigned long pmd_addr = vmf->address & PMD_MASK;
1170 struct vm_area_struct *vma = vmf->vma;
1171 struct inode *inode = mapping->host;
1172 pgtable_t pgtable = NULL;
1173 struct page *zero_page;
1174 spinlock_t *ptl;
1175 pmd_t pmd_entry;
1176 pfn_t pfn;
1177
1178 zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
1179
1180 if (unlikely(!zero_page))
1181 goto fallback;
1182
1183 pfn = page_to_pfn_t(zero_page);
1184 *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
1185 DAX_PMD | DAX_ZERO_PAGE);
1186
1187 if (arch_needs_pgtable_deposit()) {
1188 pgtable = pte_alloc_one(vma->vm_mm);
1189 if (!pgtable)
1190 return VM_FAULT_OOM;
1191 }
1192
1193 ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1194 if (!pmd_none(*(vmf->pmd))) {
1195 spin_unlock(ptl);
1196 goto fallback;
1197 }
1198
1199 if (pgtable) {
1200 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
1201 mm_inc_nr_ptes(vma->vm_mm);
1202 }
1203 pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
1204 pmd_entry = pmd_mkhuge(pmd_entry);
1205 set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
1206 spin_unlock(ptl);
1207 trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
1208 return VM_FAULT_NOPAGE;
1209
1210 fallback:
1211 if (pgtable)
1212 pte_free(vma->vm_mm, pgtable);
1213 trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
1214 return VM_FAULT_FALLBACK;
1215 }
1216 #else
1217 static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
1218 const struct iomap_iter *iter, void **entry)
1219 {
1220 return VM_FAULT_FALLBACK;
1221 }
1222 #endif
1223
1224 static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
1225 {
1226 const struct iomap *iomap = &iter->iomap;
1227 const struct iomap *srcmap = iomap_iter_srcmap(iter);
1228 unsigned offset = offset_in_page(pos);
1229 pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
1230 void *kaddr;
1231 long ret;
1232
1233 ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
1234 NULL);
1235 if (ret < 0)
1236 return ret;
1237 memset(kaddr + offset, 0, size);
1238 if (srcmap->addr != iomap->addr) {
1239 ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap,
1240 kaddr);
1241 if (ret < 0)
1242 return ret;
1243 dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE);
1244 } else
1245 dax_flush(iomap->dax_dev, kaddr + offset, size);
1246 return ret;
1247 }
1248
1249 static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
1250 {
1251 const struct iomap *iomap = &iter->iomap;
1252 const struct iomap *srcmap = iomap_iter_srcmap(iter);
1253 loff_t pos = iter->pos;
1254 u64 length = iomap_length(iter);
1255 s64 written = 0;
1256
1257
1258 if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
1259 return length;
1260
1261 do {
1262 unsigned offset = offset_in_page(pos);
1263 unsigned size = min_t(u64, PAGE_SIZE - offset, length);
1264 pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
1265 long rc;
1266 int id;
1267
1268 id = dax_read_lock();
1269 if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
1270 rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
1271 else
1272 rc = dax_memzero(iter, pos, size);
1273 dax_read_unlock(id);
1274
1275 if (rc < 0)
1276 return rc;
1277 pos += size;
1278 length -= size;
1279 written += size;
1280 } while (length > 0);
1281
1282 if (did_zero)
1283 *did_zero = true;
1284 return written;
1285 }
1286
1287 int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
1288 const struct iomap_ops *ops)
1289 {
1290 struct iomap_iter iter = {
1291 .inode = inode,
1292 .pos = pos,
1293 .len = len,
1294 .flags = IOMAP_DAX | IOMAP_ZERO,
1295 };
1296 int ret;
1297
1298 while ((ret = iomap_iter(&iter, ops)) > 0)
1299 iter.processed = dax_zero_iter(&iter, did_zero);
1300 return ret;
1301 }
1302 EXPORT_SYMBOL_GPL(dax_zero_range);
1303
1304 int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
1305 const struct iomap_ops *ops)
1306 {
1307 unsigned int blocksize = i_blocksize(inode);
1308 unsigned int off = pos & (blocksize - 1);
1309
1310
1311 if (!off)
1312 return 0;
1313 return dax_zero_range(inode, pos, blocksize - off, did_zero, ops);
1314 }
1315 EXPORT_SYMBOL_GPL(dax_truncate_page);
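
/*
 * Illustrative sketch (not compiled): on truncate a filesystem zeroes the
 * partial tail block with dax_truncate_page() before shrinking i_size, so
 * that stale data beyond the new EOF is not exposed through mmap.  The
 * example_iomap_ops instance is hypothetical.
 */
#if 0
static int example_dax_setsize(struct inode *inode, loff_t newsize)
{
	int error;

	/* Zero from the new EOF to the end of its filesystem block. */
	error = dax_truncate_page(inode, newsize, NULL, &example_iomap_ops);
	if (error)
		return error;

	truncate_setsize(inode, newsize);
	return 0;
}
#endif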
1316
1317 static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
1318 struct iov_iter *iter)
1319 {
1320 const struct iomap *iomap = &iomi->iomap;
1321 const struct iomap *srcmap = &iomi->srcmap;
1322 loff_t length = iomap_length(iomi);
1323 loff_t pos = iomi->pos;
1324 struct dax_device *dax_dev = iomap->dax_dev;
1325 loff_t end = pos + length, done = 0;
1326 bool write = iov_iter_rw(iter) == WRITE;
1327 ssize_t ret = 0;
1328 size_t xfer;
1329 int id;
1330
1331 if (!write) {
1332 end = min(end, i_size_read(iomi->inode));
1333 if (pos >= end)
1334 return 0;
1335
1336 if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
1337 return iov_iter_zero(min(length, end - pos), iter);
1338 }
1339
1340
1341
1342
1343
1344 if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
1345 !(iomap->flags & IOMAP_F_SHARED)))
1346 return -EIO;
1347
1348
1349
1350
1351
1352
1353 if (iomap->flags & IOMAP_F_NEW) {
1354 invalidate_inode_pages2_range(iomi->inode->i_mapping,
1355 pos >> PAGE_SHIFT,
1356 (end - 1) >> PAGE_SHIFT);
1357 }
1358
1359 id = dax_read_lock();
1360 while (pos < end) {
1361 unsigned offset = pos & (PAGE_SIZE - 1);
1362 const size_t size = ALIGN(length + offset, PAGE_SIZE);
1363 pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
1364 ssize_t map_len;
1365 bool recovery = false;
1366 void *kaddr;
1367
1368 if (fatal_signal_pending(current)) {
1369 ret = -EINTR;
1370 break;
1371 }
1372
1373 map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
1374 DAX_ACCESS, &kaddr, NULL);
1375 if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
1376 map_len = dax_direct_access(dax_dev, pgoff,
1377 PHYS_PFN(size), DAX_RECOVERY_WRITE,
1378 &kaddr, NULL);
1379 if (map_len > 0)
1380 recovery = true;
1381 }
1382 if (map_len < 0) {
1383 ret = map_len;
1384 break;
1385 }
1386
1387 if (write &&
1388 srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
1389 ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap,
1390 kaddr);
1391 if (ret)
1392 break;
1393 }
1394
1395 map_len = PFN_PHYS(map_len);
1396 kaddr += offset;
1397 map_len -= offset;
1398 if (map_len > end - pos)
1399 map_len = end - pos;
1400
1401 if (recovery)
1402 xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
1403 map_len, iter);
1404 else if (write)
1405 xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
1406 map_len, iter);
1407 else
1408 xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
1409 map_len, iter);
1410
1411 pos += xfer;
1412 length -= xfer;
1413 done += xfer;
1414
1415 if (xfer == 0)
1416 ret = -EFAULT;
1417 if (xfer < map_len)
1418 break;
1419 }
1420 dax_read_unlock(id);
1421
1422 return done ? done : ret;
1423 }
1424
/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb:	The control block for this I/O
 * @iter:	The addresses to do I/O from or to
 * @ops:	iomap ops passed from the file system
 *
 * This function performs read and write operations to directly mapped
 * persistent memory.  The caller needs to take care of read/write exclusion
 * and evicting any page cache pages in the region under I/O.
 */
1435 ssize_t
1436 dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
1437 const struct iomap_ops *ops)
1438 {
1439 struct iomap_iter iomi = {
1440 .inode = iocb->ki_filp->f_mapping->host,
1441 .pos = iocb->ki_pos,
1442 .len = iov_iter_count(iter),
1443 .flags = IOMAP_DAX,
1444 };
1445 loff_t done = 0;
1446 int ret;
1447
1448 if (!iomi.len)
1449 return 0;
1450
1451 if (iov_iter_rw(iter) == WRITE) {
1452 lockdep_assert_held_write(&iomi.inode->i_rwsem);
1453 iomi.flags |= IOMAP_WRITE;
1454 } else {
1455 lockdep_assert_held(&iomi.inode->i_rwsem);
1456 }
1457
1458 if (iocb->ki_flags & IOCB_NOWAIT)
1459 iomi.flags |= IOMAP_NOWAIT;
1460
1461 while ((ret = iomap_iter(&iomi, ops)) > 0)
1462 iomi.processed = dax_iomap_iter(&iomi, iter);
1463
1464 done = iomi.pos - iocb->ki_pos;
1465 iocb->ki_pos = iomi.pos;
1466 return done ? done : ret;
1467 }
1468 EXPORT_SYMBOL_GPL(dax_iomap_rw);
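
/*
 * Illustrative sketch (not compiled): a filesystem's ->read_iter() for a
 * DAX inode can be little more than a call to dax_iomap_rw() under the
 * shared inode lock, honouring IOCB_NOWAIT.  example_iomap_ops is a
 * hypothetical iomap_ops instance.
 */
#if 0
static ssize_t example_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (!inode_trylock_shared(inode)) {
		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;
		inode_lock_shared(inode);
	}
	ret = dax_iomap_rw(iocb, to, &example_iomap_ops);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}
#endif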
1469
1470 static vm_fault_t dax_fault_return(int error)
1471 {
1472 if (error == 0)
1473 return VM_FAULT_NOPAGE;
1474 return vmf_error(error);
1475 }
1476
/*
 * When handling a synchronous page fault and the inode needs an fsync, we
 * can insert the PTE/PMD into the page tables only after that fsync has
 * happened.  Skip insertion for now and return the pfn so that the caller
 * can insert it after the fsync is done.
 */
1483 static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
1484 {
1485 if (WARN_ON_ONCE(!pfnp))
1486 return VM_FAULT_SIGBUS;
1487 *pfnp = pfn;
1488 return VM_FAULT_NEEDDSYNC;
1489 }
1490
1491 static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
1492 const struct iomap_iter *iter)
1493 {
1494 vm_fault_t ret;
1495 int error = 0;
1496
1497 switch (iter->iomap.type) {
1498 case IOMAP_HOLE:
1499 case IOMAP_UNWRITTEN:
1500 clear_user_highpage(vmf->cow_page, vmf->address);
1501 break;
1502 case IOMAP_MAPPED:
1503 error = copy_cow_page_dax(vmf, iter);
1504 break;
1505 default:
1506 WARN_ON_ONCE(1);
1507 error = -EIO;
1508 break;
1509 }
1510
1511 if (error)
1512 return dax_fault_return(error);
1513
1514 __SetPageUptodate(vmf->cow_page);
1515 ret = finish_fault(vmf);
1516 if (!ret)
1517 return VM_FAULT_DONE_COW;
1518 return ret;
1519 }
1520
/**
 * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
 * @vmf:	vm fault instance
 * @iter:	iomap iter
 * @pfnp:	pfn to be returned
 * @xas:	the dax mapping tree of a file
 * @entry:	an unlocked dax entry to be inserted
 * @pmd:	distinguish whether it is a pmd fault
 */
1530 static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
1531 const struct iomap_iter *iter, pfn_t *pfnp,
1532 struct xa_state *xas, void **entry, bool pmd)
1533 {
1534 const struct iomap *iomap = &iter->iomap;
1535 const struct iomap *srcmap = &iter->srcmap;
1536 size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
1537 loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
1538 bool write = iter->flags & IOMAP_WRITE;
1539 unsigned long entry_flags = pmd ? DAX_PMD : 0;
1540 int err = 0;
1541 pfn_t pfn;
1542 void *kaddr;
1543
1544 if (!pmd && vmf->cow_page)
1545 return dax_fault_cow_page(vmf, iter);
1546
1547
1548 if (!write &&
1549 (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
1550 if (!pmd)
1551 return dax_load_hole(xas, vmf, iter, entry);
1552 return dax_pmd_load_hole(xas, vmf, iter, entry);
1553 }
1554
1555 if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
1556 WARN_ON_ONCE(1);
1557 return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
1558 }
1559
1560 err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
1561 if (err)
1562 return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
1563
1564 *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);
1565
1566 if (write &&
1567 srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
1568 err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr);
1569 if (err)
1570 return dax_fault_return(err);
1571 }
1572
1573 if (dax_fault_is_synchronous(iter, vmf->vma))
1574 return dax_fault_synchronous_pfnp(pfnp, pfn);
1575
1576
1577 if (pmd)
1578 return vmf_insert_pfn_pmd(vmf, pfn, write);
1579
1580
1581 if (write)
1582 return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
1583 return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
1584 }
1585
1586 static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
1587 int *iomap_errp, const struct iomap_ops *ops)
1588 {
1589 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1590 XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
1591 struct iomap_iter iter = {
1592 .inode = mapping->host,
1593 .pos = (loff_t)vmf->pgoff << PAGE_SHIFT,
1594 .len = PAGE_SIZE,
1595 .flags = IOMAP_DAX | IOMAP_FAULT,
1596 };
1597 vm_fault_t ret = 0;
1598 void *entry;
1599 int error;
1600
1601 trace_dax_pte_fault(iter.inode, vmf, ret);
1602
1603
1604
1605
1606
1607 if (iter.pos >= i_size_read(iter.inode)) {
1608 ret = VM_FAULT_SIGBUS;
1609 goto out;
1610 }
1611
1612 if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
1613 iter.flags |= IOMAP_WRITE;
1614
1615 entry = grab_mapping_entry(&xas, mapping, 0);
1616 if (xa_is_internal(entry)) {
1617 ret = xa_to_internal(entry);
1618 goto out;
1619 }
1620
1621
1622
1623
1624
1625
1626
1627 if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
1628 ret = VM_FAULT_NOPAGE;
1629 goto unlock_entry;
1630 }
1631
1632 while ((error = iomap_iter(&iter, ops)) > 0) {
1633 if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
1634 iter.processed = -EIO;
1635 continue;
1636 }
1637
1638 ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false);
1639 if (ret != VM_FAULT_SIGBUS &&
1640 (iter.iomap.flags & IOMAP_F_NEW)) {
1641 count_vm_event(PGMAJFAULT);
1642 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
1643 ret |= VM_FAULT_MAJOR;
1644 }
1645
1646 if (!(ret & VM_FAULT_ERROR))
1647 iter.processed = PAGE_SIZE;
1648 }
1649
1650 if (iomap_errp)
1651 *iomap_errp = error;
1652 if (!ret && error)
1653 ret = dax_fault_return(error);
1654
1655 unlock_entry:
1656 dax_unlock_entry(&xas, entry);
1657 out:
1658 trace_dax_pte_fault_done(iter.inode, vmf, ret);
1659 return ret;
1660 }
1661
1662 #ifdef CONFIG_FS_DAX_PMD
1663 static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
1664 pgoff_t max_pgoff)
1665 {
1666 unsigned long pmd_addr = vmf->address & PMD_MASK;
1667 bool write = vmf->flags & FAULT_FLAG_WRITE;
1668
1669
1670
1671
1672
1673
1674
1675 if ((vmf->pgoff & PG_PMD_COLOUR) !=
1676 ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
1677 return true;
1678
1679
1680 if (write && !(vmf->vma->vm_flags & VM_SHARED))
1681 return true;
1682
1683
1684 if (pmd_addr < vmf->vma->vm_start)
1685 return true;
1686 if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
1687 return true;
1688
1689
1690 if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff)
1691 return true;
1692
1693 return false;
1694 }
1695
1696 static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1697 const struct iomap_ops *ops)
1698 {
1699 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1700 XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
1701 struct iomap_iter iter = {
1702 .inode = mapping->host,
1703 .len = PMD_SIZE,
1704 .flags = IOMAP_DAX | IOMAP_FAULT,
1705 };
1706 vm_fault_t ret = VM_FAULT_FALLBACK;
1707 pgoff_t max_pgoff;
1708 void *entry;
1709 int error;
1710
1711 if (vmf->flags & FAULT_FLAG_WRITE)
1712 iter.flags |= IOMAP_WRITE;
1713
1714
1715
1716
1717
1718
1719 max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);
1720
1721 trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0);
1722
1723 if (xas.xa_index >= max_pgoff) {
1724 ret = VM_FAULT_SIGBUS;
1725 goto out;
1726 }
1727
1728 if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
1729 goto fallback;
1730
1731
1732
1733
1734
1735
1736
1737 entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
1738 if (xa_is_internal(entry)) {
1739 ret = xa_to_internal(entry);
1740 goto fallback;
1741 }
1742
1743
1744
1745
1746
1747
1748
1749 if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
1750 !pmd_devmap(*vmf->pmd)) {
1751 ret = 0;
1752 goto unlock_entry;
1753 }
1754
1755 iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT;
1756 while ((error = iomap_iter(&iter, ops)) > 0) {
1757 if (iomap_length(&iter) < PMD_SIZE)
1758 continue;
1759
1760 ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
1761 if (ret != VM_FAULT_FALLBACK)
1762 iter.processed = PMD_SIZE;
1763 }
1764
1765 unlock_entry:
1766 dax_unlock_entry(&xas, entry);
1767 fallback:
1768 if (ret == VM_FAULT_FALLBACK) {
1769 split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
1770 count_vm_event(THP_FAULT_FALLBACK);
1771 }
1772 out:
1773 trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret);
1774 return ret;
1775 }
1776 #else
1777 static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1778 const struct iomap_ops *ops)
1779 {
1780 return VM_FAULT_FALLBACK;
1781 }
1782 #endif
1783
/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vmf: The description of the fault
 * @pe_size: Size of the page to fault in
 * @pfnp: PFN to insert for synchronous faults if fsync is required
 * @iomap_errp: Storage for detailed error code in case of error
 * @ops: Iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in
 * their fault handler for DAX files. dax_iomap_fault() assumes the caller
 * has done all the necessary locking for page fault to proceed
 * successfully.
 */
1797 vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
1798 pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
1799 {
1800 switch (pe_size) {
1801 case PE_SIZE_PTE:
1802 return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
1803 case PE_SIZE_PMD:
1804 return dax_iomap_pmd_fault(vmf, pfnp, ops);
1805 default:
1806 return VM_FAULT_FALLBACK;
1807 }
1808 }
1809 EXPORT_SYMBOL_GPL(dax_iomap_fault);
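
/*
 * Illustrative sketch (not compiled): a filesystem's fault handlers forward
 * to dax_iomap_fault(), taking whatever lock serializes faults against
 * truncate (shown here as the generic mapping invalidate lock), and finish
 * MAP_SYNC write faults with dax_finish_sync_fault() (defined later in this
 * file).  example_iomap_ops is hypothetical.
 */
#if 0
static vm_fault_t example_dax_huge_fault(struct vm_fault *vmf,
					 enum page_entry_size pe_size)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	vm_fault_t ret;
	pfn_t pfn;

	filemap_invalidate_lock_shared(inode->i_mapping);
	ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &example_iomap_ops);
	filemap_invalidate_unlock_shared(inode->i_mapping);

	/* Synchronous write fault: persist metadata, then map the pfn. */
	if (ret & VM_FAULT_NEEDDSYNC)
		ret = dax_finish_sync_fault(vmf, pe_size, pfn);
	return ret;
}

static vm_fault_t example_dax_fault(struct vm_fault *vmf)
{
	return example_dax_huge_fault(vmf, PE_SIZE_PTE);
}
#endif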
1810
/*
 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
 * @vmf: The description of the fault
 * @pfn: PFN to insert
 * @order: Order of entry to insert.
 *
 * This function inserts a writeable PTE or PMD entry into the page tables
 * for an mmaped DAX file.  It also marks the page cache entry as dirty.
 */
1820 static vm_fault_t
1821 dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
1822 {
1823 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1824 XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
1825 void *entry;
1826 vm_fault_t ret;
1827
1828 xas_lock_irq(&xas);
1829 entry = get_unlocked_entry(&xas, order);
1830
1831 if (!entry || dax_is_conflict(entry) ||
1832 (order == 0 && !dax_is_pte_entry(entry))) {
1833 put_unlocked_entry(&xas, entry, WAKE_NEXT);
1834 xas_unlock_irq(&xas);
1835 trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
1836 VM_FAULT_NOPAGE);
1837 return VM_FAULT_NOPAGE;
1838 }
1839 xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
1840 dax_lock_entry(&xas, entry);
1841 xas_unlock_irq(&xas);
1842 if (order == 0)
1843 ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
1844 #ifdef CONFIG_FS_DAX_PMD
1845 else if (order == PMD_ORDER)
1846 ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
1847 #endif
1848 else
1849 ret = VM_FAULT_FALLBACK;
1850 dax_unlock_entry(&xas, entry);
1851 trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
1852 return ret;
1853 }
1854
/**
 * dax_finish_sync_fault - finish synchronous page fault
 * @vmf: The description of the fault
 * @pe_size: Size of entry to be inserted
 * @pfn: PFN to insert
 *
 * This function ensures that the file range touched by the page fault is
 * stored persistently on the media and handles inserting of the appropriate
 * page table entry.
 */
1865 vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
1866 enum page_entry_size pe_size, pfn_t pfn)
1867 {
1868 int err;
1869 loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
1870 unsigned int order = pe_order(pe_size);
1871 size_t len = PAGE_SIZE << order;
1872
1873 err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
1874 if (err)
1875 return VM_FAULT_SIGBUS;
1876 return dax_insert_pfn_mkwrite(vmf, pfn, order);
1877 }
1878 EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
1879
1880 static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
1881 struct iomap_iter *it_dest, u64 len, bool *same)
1882 {
1883 const struct iomap *smap = &it_src->iomap;
1884 const struct iomap *dmap = &it_dest->iomap;
1885 loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
1886 void *saddr, *daddr;
1887 int id, ret;
1888
1889 len = min(len, min(smap->length, dmap->length));
1890
1891 if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
1892 *same = true;
1893 return len;
1894 }
1895
1896 if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
1897 *same = false;
1898 return 0;
1899 }
1900
1901 id = dax_read_lock();
1902 ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
1903 &saddr, NULL);
1904 if (ret < 0)
1905 goto out_unlock;
1906
1907 ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
1908 &daddr, NULL);
1909 if (ret < 0)
1910 goto out_unlock;
1911
1912 *same = !memcmp(saddr, daddr, len);
1913 if (!*same)
1914 len = 0;
1915 dax_read_unlock(id);
1916 return len;
1917
1918 out_unlock:
1919 dax_read_unlock(id);
1920 return -EIO;
1921 }
1922
1923 int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
1924 struct inode *dst, loff_t dstoff, loff_t len, bool *same,
1925 const struct iomap_ops *ops)
1926 {
1927 struct iomap_iter src_iter = {
1928 .inode = src,
1929 .pos = srcoff,
1930 .len = len,
1931 .flags = IOMAP_DAX,
1932 };
1933 struct iomap_iter dst_iter = {
1934 .inode = dst,
1935 .pos = dstoff,
1936 .len = len,
1937 .flags = IOMAP_DAX,
1938 };
1939 int ret;
1940
1941 while ((ret = iomap_iter(&src_iter, ops)) > 0) {
1942 while ((ret = iomap_iter(&dst_iter, ops)) > 0) {
1943 dst_iter.processed = dax_range_compare_iter(&src_iter,
1944 &dst_iter, len, same);
1945 }
1946 if (ret <= 0)
1947 src_iter.processed = ret;
1948 }
1949 return ret;
1950 }
1951
1952 int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
1953 struct file *file_out, loff_t pos_out,
1954 loff_t *len, unsigned int remap_flags,
1955 const struct iomap_ops *ops)
1956 {
1957 return __generic_remap_file_range_prep(file_in, pos_in, file_out,
1958 pos_out, len, remap_flags, ops);
1959 }
1960 EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
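
/*
 * Illustrative sketch (not compiled): a filesystem that supports reflink or
 * dedupe on DAX files calls dax_remap_file_range_prep() from its
 * ->remap_file_range() preparation step so that dedupe comparisons read the
 * data through DAX rather than the page cache.  The surrounding locking is
 * omitted; example_read_iomap_ops and example_share_extents() are
 * hypothetical.
 */
#if 0
static loff_t example_remap_file_range(struct file *file_in, loff_t pos_in,
				       struct file *file_out, loff_t pos_out,
				       loff_t len, unsigned int remap_flags)
{
	int ret;

	ret = dax_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
					&len, remap_flags,
					&example_read_iomap_ops);
	if (ret < 0 || len == 0)
		return ret;

	/* Filesystem-specific extent sharing would happen here. */
	return example_share_extents(file_in, pos_in, file_out, pos_out, len);
}
#endif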