/*
 * Memory failure (HWPoison) handling.
 *
 * High level handler for hardware memory corruption reported by the
 * machine check / platform error code. Corrupted pages are taken out of
 * service: they are unmapped from user space, dropped from the page
 * cache or swap cache where possible, and the processes using them are
 * sent SIGBUS when the data cannot be recovered. Also implements soft
 * offlining (migrating data away from a page that is still readable)
 * and software unpoisoning for error injection tests.
 */

0037 #define pr_fmt(fmt) "Memory failure: " fmt
0038
0039 #include <linux/kernel.h>
0040 #include <linux/mm.h>
0041 #include <linux/page-flags.h>
0042 #include <linux/kernel-page-flags.h>
0043 #include <linux/sched/signal.h>
0044 #include <linux/sched/task.h>
0045 #include <linux/dax.h>
0046 #include <linux/ksm.h>
0047 #include <linux/rmap.h>
0048 #include <linux/export.h>
0049 #include <linux/pagemap.h>
0050 #include <linux/swap.h>
0051 #include <linux/backing-dev.h>
0052 #include <linux/migrate.h>
0053 #include <linux/suspend.h>
0054 #include <linux/slab.h>
0055 #include <linux/swapops.h>
0056 #include <linux/hugetlb.h>
0057 #include <linux/memory_hotplug.h>
0058 #include <linux/mm_inline.h>
0059 #include <linux/memremap.h>
0060 #include <linux/kfifo.h>
0061 #include <linux/ratelimit.h>
0062 #include <linux/page-isolation.h>
0063 #include <linux/pagewalk.h>
0064 #include <linux/shmem_fs.h>
0065 #include "swap.h"
0066 #include "internal.h"
0067 #include "ras/ras_event.h"
0068
0069 int sysctl_memory_failure_early_kill __read_mostly = 0;
0070
0071 int sysctl_memory_failure_recovery __read_mostly = 1;
0072
0073 atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
0074
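/*
 * Set once a real (not software-injected) hardware error has been
 * handled; unpoison_memory() refuses to run after that point.
 */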
0075 static bool hw_memory_failure __read_mostly = false;
0076
/*
 * Make sure a poisoned page can never be allocated again: dissolve it if
 * it is a free hugetlb page, then try to take it off the buddy freelist.
 *
 * Return: > 0 if the page was taken off the buddy list, 0 if it was not,
 * < 0 if dissolving a hugetlb page failed.
 */
0083 static int __page_handle_poison(struct page *page)
0084 {
0085 int ret;
0086
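/* Keep the page off the per-cpu lists while we pull it out of buddy. */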
0087 zone_pcp_disable(page_zone(page));
0088 ret = dissolve_free_huge_page(page);
0089 if (!ret)
0090 ret = take_page_off_buddy(page);
0091 zone_pcp_enable(page_zone(page));
0092
0093 return ret;
0094 }
0095
0096 static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
0097 {
0098 if (hugepage_or_freepage) {
/*
 * Doing this check for free pages is also fine since
 * dissolve_free_huge_page() returns 0 for non-hugetlb pages as well.
 */
0103 if (__page_handle_poison(page) <= 0)
/*
 * We could fail to take the page off the buddy list, e.g. due to a
 * racing allocation. That is acceptable for soft offline: the page is
 * not actually broken, and whoever got it may keep using it.
 */
0111 return false;
0112 }
0113
0114 SetPageHWPoison(page);
0115 if (release)
0116 put_page(page);
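/* Hold a reference so the poisoned page is never handed back to the allocator. */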
0117 page_ref_inc(page);
0118 num_poisoned_pages_inc();
0119
0120 return true;
0121 }
0122
0123 #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
0124
0125 u32 hwpoison_filter_enable = 0;
0126 u32 hwpoison_filter_dev_major = ~0U;
0127 u32 hwpoison_filter_dev_minor = ~0U;
0128 u64 hwpoison_filter_flags_mask;
0129 u64 hwpoison_filter_flags_value;
0130 EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
0131 EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
0132 EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
0133 EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
0134 EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
0135
0136 static int hwpoison_filter_dev(struct page *p)
0137 {
0138 struct address_space *mapping;
0139 dev_t dev;
0140
0141 if (hwpoison_filter_dev_major == ~0U &&
0142 hwpoison_filter_dev_minor == ~0U)
0143 return 0;
0144
0145 mapping = page_mapping(p);
0146 if (mapping == NULL || mapping->host == NULL)
0147 return -EINVAL;
0148
0149 dev = mapping->host->i_sb->s_dev;
0150 if (hwpoison_filter_dev_major != ~0U &&
0151 hwpoison_filter_dev_major != MAJOR(dev))
0152 return -EINVAL;
0153 if (hwpoison_filter_dev_minor != ~0U &&
0154 hwpoison_filter_dev_minor != MINOR(dev))
0155 return -EINVAL;
0156
0157 return 0;
0158 }
0159
0160 static int hwpoison_filter_flags(struct page *p)
0161 {
0162 if (!hwpoison_filter_flags_mask)
0163 return 0;
0164
0165 if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
0166 hwpoison_filter_flags_value)
0167 return 0;
0168 else
0169 return -EINVAL;
0170 }
0171
/*
 * Optionally restrict injected errors to pages charged to a specific
 * memory cgroup: when hwpoison_filter_memcg is set, only pages whose
 * cgroup inode number matches it pass the filter.
 */
0182 #ifdef CONFIG_MEMCG
0183 u64 hwpoison_filter_memcg;
0184 EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
0185 static int hwpoison_filter_task(struct page *p)
0186 {
0187 if (!hwpoison_filter_memcg)
0188 return 0;
0189
0190 if (page_cgroup_ino(p) != hwpoison_filter_memcg)
0191 return -EINVAL;
0192
0193 return 0;
0194 }
0195 #else
0196 static int hwpoison_filter_task(struct page *p) { return 0; }
0197 #endif
0198
0199 int hwpoison_filter(struct page *p)
0200 {
0201 if (!hwpoison_filter_enable)
0202 return 0;
0203
0204 if (hwpoison_filter_dev(p))
0205 return -EINVAL;
0206
0207 if (hwpoison_filter_flags(p))
0208 return -EINVAL;
0209
0210 if (hwpoison_filter_task(p))
0211 return -EINVAL;
0212
0213 return 0;
0214 }
0215 #else
0216 int hwpoison_filter(struct page *p)
0217 {
0218 return 0;
0219 }
0220 #endif
0221
0222 EXPORT_SYMBOL_GPL(hwpoison_filter);
0223
/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them. There is no
 * convenient way to get back from a page to the processes mapping it,
 * so a brute-force search over all running processes is used. Machine
 * checks are rare, so this is not a performance problem.
 *
 * The page is kept referenced so it is not actually freed while the
 * error is being handled, and some races between error detection and
 * handling are tolerated.
 */
0246 struct to_kill {
0247 struct list_head nd;
0248 struct task_struct *tsk;
0249 unsigned long addr;
0250 short size_shift;
0251 };
0252
/*
 * Send a SIGBUS to a process that has the poisoned page mapped:
 * "action required" (BUS_MCEERR_AR) if the error happened in the
 * current execution context, "action optional" (BUS_MCEERR_AO)
 * otherwise.
 */
0258 static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
0259 {
0260 struct task_struct *t = tk->tsk;
0261 short addr_lsb = tk->size_shift;
0262 int ret = 0;
0263
0264 pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
0265 pfn, t->comm, t->pid);
0266
0267 if ((flags & MF_ACTION_REQUIRED) && (t == current))
0268 ret = force_sig_mceerr(BUS_MCEERR_AR,
0269 (void __user *)tk->addr, addr_lsb);
0270 else
0271
0272
0273
0274
0275
0276
0277
0278
0279 ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
0280 addr_lsb, t);
0281 if (ret < 0)
0282 pr_info("Error sending signal to %s:%d: %d\n",
0283 t->comm, t->pid, ret);
0284 return ret;
0285 }
0286
/*
 * Unknown page type encountered. Try to turn it into an LRU page by
 * draining the per-cpu LRU caches.
 */
0291 void shake_page(struct page *p)
0292 {
0293 if (PageHuge(p))
0294 return;
0295
0296 if (!PageSlab(p)) {
0297 lru_add_drain_all();
0298 if (PageLRU(p) || is_free_buddy_page(p))
0299 return;
0300 }
0301
0302
0303
0304
0305
0306 }
0307 EXPORT_SYMBOL_GPL(shake_page);
0308
0309 static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
0310 unsigned long address)
0311 {
0312 unsigned long ret = 0;
0313 pgd_t *pgd;
0314 p4d_t *p4d;
0315 pud_t *pud;
0316 pmd_t *pmd;
0317 pte_t *pte;
0318
0319 VM_BUG_ON_VMA(address == -EFAULT, vma);
0320 pgd = pgd_offset(vma->vm_mm, address);
0321 if (!pgd_present(*pgd))
0322 return 0;
0323 p4d = p4d_offset(pgd, address);
0324 if (!p4d_present(*p4d))
0325 return 0;
0326 pud = pud_offset(p4d, address);
0327 if (!pud_present(*pud))
0328 return 0;
0329 if (pud_devmap(*pud))
0330 return PUD_SHIFT;
0331 pmd = pmd_offset(pud, address);
0332 if (!pmd_present(*pmd))
0333 return 0;
0334 if (pmd_devmap(*pmd))
0335 return PMD_SHIFT;
0336 pte = pte_offset_map(pmd, address);
0337 if (pte_present(*pte) && pte_devmap(*pte))
0338 ret = PAGE_SHIFT;
0339 pte_unmap(pte);
0340 return ret;
0341 }
0342
/*
 * Sentinel pgoff value passed to add_to_kill() when the page is not an
 * fsdax page, i.e. when page->index/page->mapping are enough to find
 * the user virtual address.
 */
0348 #define FSDAX_INVALID_PGOFF ULONG_MAX
0349
/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 *
 * @fsdax_pgoff is used only when @p is an fsdax page and a filesystem
 * with a memory failure handler has claimed the event; in all other
 * cases page->index and page->mapping are sufficient to compute the
 * user address.
 */
0360 static void add_to_kill(struct task_struct *tsk, struct page *p,
0361 pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
0362 struct list_head *to_kill)
0363 {
0364 struct to_kill *tk;
0365
0366 tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
0367 if (!tk) {
0368 pr_err("Out of memory while handling machine check\n");
0369 return;
0370 }
0371
0372 tk->addr = page_address_in_vma(p, vma);
0373 if (is_zone_device_page(p)) {
0374 if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
0375 tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
0376 tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
0377 } else
0378 tk->size_shift = page_shift(compound_head(p));
0379
/*
 * Keep the entry even when "tk->addr == -EFAULT" so that a SIGKILL can
 * be sent later. "tk->size_shift" is always non-zero for
 * !is_zone_device_page(), so "tk->size_shift == 0" effectively means
 * this VMA does not map the poisoned page on ZONE_DEVICE: a devdax page
 * may be mapped N times but not every VMA necessarily contains it, so
 * only deliver SIGBUS for the VMAs that actually map it.
 */
0390 if (tk->addr == -EFAULT) {
0391 pr_info("Unable to find user space address %lx in %s\n",
0392 page_to_pfn(p), tsk->comm);
0393 } else if (tk->size_shift == 0) {
0394 kfree(tk);
0395 return;
0396 }
0397
0398 get_task_struct(tsk);
0399 tk->tsk = tsk;
0400 list_add_tail(&tk->nd, to_kill);
0401 }
0402
/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when FORCEKILL is set, otherwise just free the list
 * (this is used for clean pages which do not need killing). When FAIL is
 * set (the page could not be unmapped) or no user address was found,
 * send an uncatchable SIGKILL instead of SIGBUS.
 */
0411 static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
0412 unsigned long pfn, int flags)
0413 {
0414 struct to_kill *tk, *next;
0415
0416 list_for_each_entry_safe (tk, next, to_kill, nd) {
0417 if (forcekill) {
0418
0419
0420
0421
0422
0423 if (fail || tk->addr == -EFAULT) {
0424 pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
0425 pfn, tk->tsk->comm, tk->tsk->pid);
0426 do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
0427 tk->tsk, PIDTYPE_PID);
0428 }
0429
0430
0431
0432
0433
0434
0435
0436 else if (kill_proc(tk, pfn, flags) < 0)
0437 pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
0438 pfn, tk->tsk->comm, tk->tsk->pid);
0439 }
0440 put_task_struct(tk->tsk);
0441 kfree(tk);
0442 }
0443 }
0444
/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group. Return the first such thread, or NULL if
 * there is none. The caller already holds read_lock(&tasklist_lock).
 */
0453 static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
0454 {
0455 struct task_struct *t;
0456
0457 for_each_thread(tsk, t) {
0458 if (t->flags & PF_MCE_PROCESS) {
0459 if (t->flags & PF_MCE_EARLY)
0460 return t;
0461 } else {
0462 if (sysctl_memory_failure_early_kill)
0463 return t;
0464 }
0465 }
0466 return NULL;
0467 }
0468
/*
 * Determine whether a given process is an "early kill" process, i.e. one
 * that expects to be signaled as soon as some of its pages are hwpoisoned.
 * Returns the task_struct of the thread to signal, or NULL. With
 * force_early (action required), the current thread is signaled directly
 * when the error is in its own address space.
 */
0481 static struct task_struct *task_early_kill(struct task_struct *tsk,
0482 int force_early)
0483 {
0484 if (!tsk->mm)
0485 return NULL;
0486
0487
0488
0489
0490 if (force_early && tsk->mm == current->mm)
0491 return current;
0492
0493 return find_early_kill_thread(tsk);
0494 }
0495
/*
 * Collect processes when the error hit an anonymous page.
 */
0499 static void collect_procs_anon(struct page *page, struct list_head *to_kill,
0500 int force_early)
0501 {
0502 struct folio *folio = page_folio(page);
0503 struct vm_area_struct *vma;
0504 struct task_struct *tsk;
0505 struct anon_vma *av;
0506 pgoff_t pgoff;
0507
0508 av = folio_lock_anon_vma_read(folio, NULL);
0509 if (av == NULL)
0510 return;
0511
0512 pgoff = page_to_pgoff(page);
0513 read_lock(&tasklist_lock);
0514 for_each_process (tsk) {
0515 struct anon_vma_chain *vmac;
0516 struct task_struct *t = task_early_kill(tsk, force_early);
0517
0518 if (!t)
0519 continue;
0520 anon_vma_interval_tree_foreach(vmac, &av->rb_root,
0521 pgoff, pgoff) {
0522 vma = vmac->vma;
0523 if (!page_mapped_in_vma(page, vma))
0524 continue;
0525 if (vma->vm_mm == t->mm)
0526 add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma,
0527 to_kill);
0528 }
0529 }
0530 read_unlock(&tasklist_lock);
0531 page_unlock_anon_vma_read(av);
0532 }
0533
/*
 * Collect processes when the error hit a file mapped page.
 */
0537 static void collect_procs_file(struct page *page, struct list_head *to_kill,
0538 int force_early)
0539 {
0540 struct vm_area_struct *vma;
0541 struct task_struct *tsk;
0542 struct address_space *mapping = page->mapping;
0543 pgoff_t pgoff;
0544
0545 i_mmap_lock_read(mapping);
0546 read_lock(&tasklist_lock);
0547 pgoff = page_to_pgoff(page);
0548 for_each_process(tsk) {
0549 struct task_struct *t = task_early_kill(tsk, force_early);
0550
0551 if (!t)
0552 continue;
0553 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
0554 pgoff) {
0555
0556
0557
0558
0559
0560
0561
0562 if (vma->vm_mm == t->mm)
0563 add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma,
0564 to_kill);
0565 }
0566 }
0567 read_unlock(&tasklist_lock);
0568 i_mmap_unlock_read(mapping);
0569 }
0570
0571 #ifdef CONFIG_FS_DAX
/*
 * Collect processes when the error hit an fsdax page.
 */
0575 static void collect_procs_fsdax(struct page *page,
0576 struct address_space *mapping, pgoff_t pgoff,
0577 struct list_head *to_kill)
0578 {
0579 struct vm_area_struct *vma;
0580 struct task_struct *tsk;
0581
0582 i_mmap_lock_read(mapping);
0583 read_lock(&tasklist_lock);
0584 for_each_process(tsk) {
0585 struct task_struct *t = task_early_kill(tsk, true);
0586
0587 if (!t)
0588 continue;
0589 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
0590 if (vma->vm_mm == t->mm)
0591 add_to_kill(t, page, pgoff, vma, to_kill);
0592 }
0593 }
0594 read_unlock(&tasklist_lock);
0595 i_mmap_unlock_read(mapping);
0596 }
0597 #endif
0598
/*
 * Collect the processes who have the corrupted page mapped to kill.
 */
0602 static void collect_procs(struct page *page, struct list_head *tokill,
0603 int force_early)
0604 {
0605 if (!page->mapping)
0606 return;
0607
0608 if (PageAnon(page))
0609 collect_procs_anon(page, tokill, force_early);
0610 else
0611 collect_procs_file(page, tokill, force_early);
0612 }
0613
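/*
 * State for the page table walk used by kill_accessing_process() to find
 * the user address that maps a known-poisoned pfn.
 */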
0614 struct hwp_walk {
0615 struct to_kill tk;
0616 unsigned long pfn;
0617 int flags;
0618 };
0619
0620 static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
0621 {
0622 tk->addr = addr;
0623 tk->size_shift = shift;
0624 }
0625
0626 static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
0627 unsigned long poisoned_pfn, struct to_kill *tk)
0628 {
0629 unsigned long pfn = 0;
0630
0631 if (pte_present(pte)) {
0632 pfn = pte_pfn(pte);
0633 } else {
0634 swp_entry_t swp = pte_to_swp_entry(pte);
0635
0636 if (is_hwpoison_entry(swp))
0637 pfn = hwpoison_entry_to_pfn(swp);
0638 }
0639
0640 if (!pfn || pfn != poisoned_pfn)
0641 return 0;
0642
0643 set_to_kill(tk, addr, shift);
0644 return 1;
0645 }
0646
0647 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
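/*
 * Check whether the poisoned pfn is mapped by this huge pmd and, if so,
 * record the exact user virtual address of the poisoned subpage.
 */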
0648 static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
0649 struct hwp_walk *hwp)
0650 {
0651 pmd_t pmd = *pmdp;
0652 unsigned long pfn;
0653 unsigned long hwpoison_vaddr;
0654
0655 if (!pmd_present(pmd))
0656 return 0;
0657 pfn = pmd_pfn(pmd);
0658 if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) {
0659 hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT);
0660 set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT);
0661 return 1;
0662 }
0663 return 0;
0664 }
0665 #else
0666 static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
0667 struct hwp_walk *hwp)
0668 {
0669 return 0;
0670 }
0671 #endif
0672
0673 static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
0674 unsigned long end, struct mm_walk *walk)
0675 {
0676 struct hwp_walk *hwp = walk->private;
0677 int ret = 0;
0678 pte_t *ptep, *mapped_pte;
0679 spinlock_t *ptl;
0680
0681 ptl = pmd_trans_huge_lock(pmdp, walk->vma);
0682 if (ptl) {
0683 ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp);
0684 spin_unlock(ptl);
0685 goto out;
0686 }
0687
0688 if (pmd_trans_unstable(pmdp))
0689 goto out;
0690
0691 mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp,
0692 addr, &ptl);
0693 for (; addr != end; ptep++, addr += PAGE_SIZE) {
0694 ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT,
0695 hwp->pfn, &hwp->tk);
0696 if (ret == 1)
0697 break;
0698 }
0699 pte_unmap_unlock(mapped_pte, ptl);
0700 out:
0701 cond_resched();
0702 return ret;
0703 }
0704
0705 #ifdef CONFIG_HUGETLB_PAGE
0706 static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
0707 unsigned long addr, unsigned long end,
0708 struct mm_walk *walk)
0709 {
0710 struct hwp_walk *hwp = walk->private;
0711 pte_t pte = huge_ptep_get(ptep);
0712 struct hstate *h = hstate_vma(walk->vma);
0713
0714 return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
0715 hwp->pfn, &hwp->tk);
0716 }
0717 #else
0718 #define hwpoison_hugetlb_range NULL
0719 #endif
0720
0721 static const struct mm_walk_ops hwp_walk_ops = {
0722 .pmd_entry = hwpoison_pte_range,
0723 .hugetlb_entry = hwpoison_hugetlb_range,
0724 };
0725
/*
 * The page is already known to be hwpoisoned; walk the page tables of the
 * given (current) process to find the user address mapping the poisoned
 * pfn and send it a SIGBUS. Used on the "already poisoned" path when
 * MF_ACTION_REQUIRED is set.
 *
 * Returns -EHWPOISON if a mapping was found and the process was signaled,
 * -EFAULT otherwise.
 */
0739 static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
0740 int flags)
0741 {
0742 int ret;
0743 struct hwp_walk priv = {
0744 .pfn = pfn,
0745 };
0746 priv.tk.tsk = p;
0747
0748 if (!p->mm)
0749 return -EFAULT;
0750
0751 mmap_read_lock(p->mm);
0752 ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
0753 (void *)&priv);
0754 if (ret == 1 && priv.tk.addr)
0755 kill_proc(&priv.tk, pfn, flags);
0756 else
0757 ret = 0;
0758 mmap_read_unlock(p->mm);
0759 return ret > 0 ? -EHWPOISON : -EFAULT;
0760 }
0761
0762 static const char *action_name[] = {
0763 [MF_IGNORED] = "Ignored",
0764 [MF_FAILED] = "Failed",
0765 [MF_DELAYED] = "Delayed",
0766 [MF_RECOVERED] = "Recovered",
0767 };
0768
0769 static const char * const action_page_types[] = {
0770 [MF_MSG_KERNEL] = "reserved kernel page",
0771 [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
0772 [MF_MSG_SLAB] = "kernel slab page",
0773 [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
0774 [MF_MSG_HUGE] = "huge page",
0775 [MF_MSG_FREE_HUGE] = "free huge page",
0776 [MF_MSG_UNMAP_FAILED] = "unmapping failed page",
0777 [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
0778 [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
0779 [MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
0780 [MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
0781 [MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
0782 [MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
0783 [MF_MSG_DIRTY_LRU] = "dirty LRU page",
0784 [MF_MSG_CLEAN_LRU] = "clean LRU page",
0785 [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
0786 [MF_MSG_BUDDY] = "free buddy page",
0787 [MF_MSG_DAX] = "dax page",
0788 [MF_MSG_UNSPLIT_THP] = "unsplit thp",
0789 [MF_MSG_UNKNOWN] = "unknown page",
0790 };
0791
/*
 * Isolate the page from the LRU so it cannot be re-used while the error
 * is handled: clear flags the buddy allocator would complain about,
 * uncharge it from its memcg, and drop the reference that isolation took.
 * Returns 0 on success, -EIO if the page could not be isolated.
 */
0798 static int delete_from_lru_cache(struct page *p)
0799 {
0800 if (!isolate_lru_page(p)) {
0801
0802
0803
0804
0805 ClearPageActive(p);
0806 ClearPageUnevictable(p);
0807
0808
0809
0810
0811
0812 mem_cgroup_uncharge(page_folio(p));
0813
0814
0815
0816
0817 put_page(p);
0818 return 0;
0819 }
0820 return -EIO;
0821 }
0822
0823 static int truncate_error_page(struct page *p, unsigned long pfn,
0824 struct address_space *mapping)
0825 {
0826 int ret = MF_FAILED;
0827
0828 if (mapping->a_ops->error_remove_page) {
0829 int err = mapping->a_ops->error_remove_page(mapping, p);
0830
0831 if (err != 0) {
0832 pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
0833 } else if (page_has_private(p) &&
0834 !try_to_release_page(p, GFP_NOIO)) {
0835 pr_info("%#lx: failed to release buffers\n", pfn);
0836 } else {
0837 ret = MF_RECOVERED;
0838 }
0839 } else {
0840
0841
0842
0843
0844 if (invalidate_inode_page(p))
0845 ret = MF_RECOVERED;
0846 else
0847 pr_info("%#lx: Failed to invalidate\n", pfn);
0848 }
0849
0850 return ret;
0851 }
0852
0853 struct page_state {
0854 unsigned long mask;
0855 unsigned long res;
0856 enum mf_action_page_type type;
0857
0858
0859 int (*action)(struct page_state *ps, struct page *p);
0860 };
0861
/*
 * Return true if the page is still referenced by others, otherwise
 * return false. @extra_pins is true when one additional refcount is
 * expected (e.g. the swap cache or shmem page cache reference).
 */
0868 static bool has_extra_refcount(struct page_state *ps, struct page *p,
0869 bool extra_pins)
0870 {
0871 int count = page_count(p) - 1;
0872
0873 if (extra_pins)
0874 count -= 1;
0875
0876 if (count > 0) {
0877 pr_err("%#lx: %s still referenced by %d users\n",
0878 page_to_pfn(p), action_page_types[ps->type], count);
0879 return true;
0880 }
0881
0882 return false;
0883 }
0884
/*
 * Error hit a kernel page.
 * Do nothing and hope for the best; for a few cases that is actually the
 * right thing, and there is little else we could do.
 */
0890 static int me_kernel(struct page_state *ps, struct page *p)
0891 {
0892 unlock_page(p);
0893 return MF_IGNORED;
0894 }
0895
/*
 * Page in unknown state. Do nothing.
 */
0899 static int me_unknown(struct page_state *ps, struct page *p)
0900 {
0901 pr_err("%#lx: Unknown page state\n", page_to_pfn(p));
0902 unlock_page(p);
0903 return MF_FAILED;
0904 }
0905
/*
 * Clean (or cleaned) page cache page.
 */
0909 static int me_pagecache_clean(struct page_state *ps, struct page *p)
0910 {
0911 int ret;
0912 struct address_space *mapping;
0913 bool extra_pins;
0914
0915 delete_from_lru_cache(p);
0916
0917
0918
0919
0920
0921 if (PageAnon(p)) {
0922 ret = MF_RECOVERED;
0923 goto out;
0924 }
0925
0926
0927
0928
0929
0930
0931
0932
0933 mapping = page_mapping(p);
0934 if (!mapping) {
0935
0936
0937
0938 ret = MF_FAILED;
0939 goto out;
0940 }
0941
0942
0943
0944
0945
0946 extra_pins = shmem_mapping(mapping);
0947
0948
0949
0950
0951
0952
0953 ret = truncate_error_page(p, page_to_pfn(p), mapping);
0954 if (has_extra_refcount(ps, p, extra_pins))
0955 ret = MF_FAILED;
0956
0957 out:
0958 unlock_page(p);
0959
0960 return ret;
0961 }
0962
/*
 * Dirty pagecache page.
 * The data cannot be written back, so the error has to be reported to
 * the application through the mapping.
 */
0968 static int me_pagecache_dirty(struct page_state *ps, struct page *p)
0969 {
0970 struct address_space *mapping = page_mapping(p);
0971
0972 SetPageError(p);
0973
0974 if (mapping) {
/*
 * Report the IO error through the mapping (AS_EIO), so that a
 * subsequent write(), fsync() etc. on the file returns -EIO and the
 * application learns that its dirty data was lost.
 *
 * Since the page itself is dropped, AS_EIO is the only channel left:
 * it is cleared by the first operation that reports it, so an
 * application doing other operations on the fd before fsync() may
 * miss the error. That is no worse than other kernel IO error paths,
 * so we assume applications do the right thing on the first EIO.
 */
1009 mapping_set_error(mapping, -EIO);
1010 }
1011
1012 return me_pagecache_clean(ps, p);
1013 }
1014
/*
 * Clean and dirty swap cache.
 *
 * A dirty swap cache page is tricky: it may be referenced both by normal
 * PTEs and by swap PTEs (e.g. freshly swapped in). After unmapping, the
 * remaining references are swap PTEs, and we:
 *  - clear the dirty bit to prevent IO
 *  - remove the page from the LRU
 *  - but keep it in the swap cache, so that a later fault on it is
 *    caught in do_swap_page() and the accessing process is killed.
 *
 * Clean swap cache pages can simply be dropped: a later fault brings
 * back the known-good data from disk.
 */
1034 static int me_swapcache_dirty(struct page_state *ps, struct page *p)
1035 {
1036 int ret;
1037 bool extra_pins = false;
1038
1039 ClearPageDirty(p);
1040
1041 ClearPageUptodate(p);
1042
1043 ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
1044 unlock_page(p);
1045
1046 if (ret == MF_DELAYED)
1047 extra_pins = true;
1048
1049 if (has_extra_refcount(ps, p, extra_pins))
1050 ret = MF_FAILED;
1051
1052 return ret;
1053 }
1054
1055 static int me_swapcache_clean(struct page_state *ps, struct page *p)
1056 {
1057 struct folio *folio = page_folio(p);
1058 int ret;
1059
1060 delete_from_swap_cache(folio);
1061
1062 ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
1063 folio_unlock(folio);
1064
1065 if (has_extra_refcount(ps, p, false))
1066 ret = MF_FAILED;
1067
1068 return ret;
1069 }
1070
/*
 * Huge pages: errors are contained at hugepage granularity. A mapped
 * file hugepage is truncated; otherwise the hugepage is dissolved so
 * that its healthy raw pages can be reused.
 */
1077 static int me_huge_page(struct page_state *ps, struct page *p)
1078 {
1079 int res;
1080 struct page *hpage = compound_head(p);
1081 struct address_space *mapping;
1082
1083 if (!PageHuge(hpage))
1084 return MF_DELAYED;
1085
1086 mapping = page_mapping(hpage);
1087 if (mapping) {
1088 res = truncate_error_page(hpage, page_to_pfn(p), mapping);
1089 unlock_page(hpage);
1090 } else {
1091 unlock_page(hpage);
1092
1093
1094
1095
1096
1097 put_page(hpage);
1098 if (__page_handle_poison(p) >= 0) {
1099 page_ref_inc(p);
1100 res = MF_RECOVERED;
1101 } else {
1102 res = MF_FAILED;
1103 }
1104 }
1105
1106 if (has_extra_refcount(ps, p, false))
1107 res = MF_FAILED;
1108
1109 return res;
1110 }
1111
/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits; the table
 * below is matched in order and the first matching entry's handler is
 * called. The page can be hit at any point of its life cycle, so the
 * handlers have to be extremely careful.
 *
 * The table is not complete; for any state that is not listed no
 * recovery is attempted (catch-all entry at the end).
 */
1125 #define dirty (1UL << PG_dirty)
1126 #define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
1127 #define unevict (1UL << PG_unevictable)
1128 #define mlock (1UL << PG_mlocked)
1129 #define lru (1UL << PG_lru)
1130 #define head (1UL << PG_head)
1131 #define slab (1UL << PG_slab)
1132 #define reserved (1UL << PG_reserved)
1133
1134 static struct page_state error_states[] = {
1135 { reserved, reserved, MF_MSG_KERNEL, me_kernel },
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146 { slab, slab, MF_MSG_SLAB, me_kernel },
1147
1148 { head, head, MF_MSG_HUGE, me_huge_page },
1149
1150 { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
1151 { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
1152
1153 { mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
1154 { mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
1155
1156 { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
1157 { unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
1158
1159 { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty },
1160 { lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean },
1161
1162
1163
1164
1165 { 0, 0, MF_MSG_UNKNOWN, me_unknown },
1166 };
1167
1168 #undef dirty
1169 #undef sc
1170 #undef unevict
1171 #undef mlock
1172 #undef lru
1173 #undef head
1174 #undef slab
1175 #undef reserved
1176
/*
 * The "Dirty/Clean" indication is not 100% accurate because PG_dirty can
 * be set outside the page lock; see the comment above set_page_dirty().
 */
1181 static void action_result(unsigned long pfn, enum mf_action_page_type type,
1182 enum mf_result result)
1183 {
1184 trace_memory_failure_event(pfn, type, result);
1185
1186 num_poisoned_pages_inc();
1187 pr_err("%#lx: recovery action for %s: %s\n",
1188 pfn, action_page_types[type], action_name[result]);
1189 }
1190
1191 static int page_action(struct page_state *ps, struct page *p,
1192 unsigned long pfn)
1193 {
1194 int result;
1195
1196
1197 result = ps->action(ps, p);
1198
1199 action_result(pfn, ps->type, result);
1200
1201
1202
1203
1204
1205
1206 return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
1207 }
1208
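/*
 * A poisoned page that has been taken off the buddy freelist is marked
 * with MAGIC_HWPOISON in page->private, so that unpoisoning knows it has
 * to be given back to buddy (see unpoison_memory()).
 */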
1209 static inline bool PageHWPoisonTakenOff(struct page *page)
1210 {
1211 return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON;
1212 }
1213
1214 void SetPageHWPoisonTakenOff(struct page *page)
1215 {
1216 set_page_private(page, MAGIC_HWPOISON);
1217 }
1218
1219 void ClearPageHWPoisonTakenOff(struct page *page)
1220 {
1221 if (PageHWPoison(page))
1222 set_page_private(page, 0);
1223 }
1224
/*
 * Return true if the page type is one the hwpoison mechanism can handle,
 * i.e. one for which get_hwpoison_page() can safely take a reference.
 */
1231 static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
1232 {
1233
1234 if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
1235 return true;
1236
1237 return PageLRU(page) || is_free_buddy_page(page);
1238 }
1239
1240 static int __get_hwpoison_page(struct page *page, unsigned long flags)
1241 {
1242 struct page *head = compound_head(page);
1243 int ret = 0;
1244 bool hugetlb = false;
1245
1246 ret = get_hwpoison_huge_page(head, &hugetlb);
1247 if (hugetlb)
1248 return ret;
1249
1250
1251
1252
1253
1254
1255 if (!HWPoisonHandlable(head, flags))
1256 return -EBUSY;
1257
1258 if (get_page_unless_zero(head)) {
1259 if (head == compound_head(page))
1260 return 1;
1261
1262 pr_info("%#lx cannot catch tail\n", page_to_pfn(page));
1263 put_page(head);
1264 }
1265
1266 return 0;
1267 }
1268
1269 static int get_any_page(struct page *p, unsigned long flags)
1270 {
1271 int ret = 0, pass = 0;
1272 bool count_increased = false;
1273
1274 if (flags & MF_COUNT_INCREASED)
1275 count_increased = true;
1276
1277 try_again:
1278 if (!count_increased) {
1279 ret = __get_hwpoison_page(p, flags);
1280 if (!ret) {
1281 if (page_count(p)) {
1282
1283 if (pass++ < 3)
1284 goto try_again;
1285 ret = -EBUSY;
1286 } else if (!PageHuge(p) && !is_free_buddy_page(p)) {
1287
1288 if (pass++ < 3)
1289 goto try_again;
1290 ret = -EIO;
1291 }
1292 goto out;
1293 } else if (ret == -EBUSY) {
1294
1295
1296
1297
1298 if (pass++ < 3) {
1299 shake_page(p);
1300 goto try_again;
1301 }
1302 ret = -EIO;
1303 goto out;
1304 }
1305 }
1306
1307 if (PageHuge(p) || HWPoisonHandlable(p, flags)) {
1308 ret = 1;
1309 } else {
1310
1311
1312
1313
1314 if (pass++ < 3) {
1315 put_page(p);
1316 shake_page(p);
1317 count_increased = false;
1318 goto try_again;
1319 }
1320 put_page(p);
1321 ret = -EIO;
1322 }
1323 out:
1324 if (ret == -EIO)
1325 pr_err("%#lx: unhandlable page.\n", page_to_pfn(p));
1326
1327 return ret;
1328 }
1329
1330 static int __get_unpoison_page(struct page *page)
1331 {
1332 struct page *head = compound_head(page);
1333 int ret = 0;
1334 bool hugetlb = false;
1335
1336 ret = get_hwpoison_huge_page(head, &hugetlb);
1337 if (hugetlb)
1338 return ret;
1339
1340
1341
1342
1343
1344
1345 if (PageHWPoisonTakenOff(page))
1346 return -EHWPOISON;
1347
1348 return get_page_unless_zero(page) ? 1 : 0;
1349 }
1350
/**
 * get_hwpoison_page() - Get refcount for memory error handling
 * @p:		Raw error page (hit by memory error)
 * @flags:	Flags controlling behavior of memory failure handling
 *
 * Take a page refcount on an error page after checking that it is in a
 * state we know how to handle (e.g. an LRU, free buddy or hugetlb page).
 * Memory errors can hit any page at any time, so races with normal page
 * state changes are tolerated: this function only takes a reference, and
 * later state changes are handled elsewhere.
 *
 * Return: 1 on success for in-use pages, 0 when no reference was taken,
 * -EIO for pages we cannot handle, -EBUSY when we raced with page
 * lifecycle operations, -EHWPOISON when the page is already poisoned and
 * has been taken off the buddy list (MF_UNPOISON only).
 */
1378 static int get_hwpoison_page(struct page *p, unsigned long flags)
1379 {
1380 int ret;
1381
1382 zone_pcp_disable(page_zone(p));
1383 if (flags & MF_UNPOISON)
1384 ret = __get_unpoison_page(p);
1385 else
1386 ret = get_any_page(p, flags);
1387 zone_pcp_enable(page_zone(p));
1388
1389 return ret;
1390 }
1391
/*
 * Do all that is necessary to remove user space mappings: unmap the page
 * from every process and kill the processes if required.
 */
1396 static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
1397 int flags, struct page *hpage)
1398 {
1399 struct folio *folio = page_folio(hpage);
1400 enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC;
1401 struct address_space *mapping;
1402 LIST_HEAD(tokill);
1403 bool unmap_success;
1404 int kill = 1, forcekill;
1405 bool mlocked = PageMlocked(hpage);
1406
1407
1408
1409
1410
1411 if (PageReserved(p) || PageSlab(p))
1412 return true;
1413 if (!(PageLRU(hpage) || PageHuge(p)))
1414 return true;
1415
1416
1417
1418
1419
1420 if (!page_mapped(hpage))
1421 return true;
1422
1423 if (PageKsm(p)) {
1424 pr_err("%#lx: can't handle KSM pages.\n", pfn);
1425 return false;
1426 }
1427
1428 if (PageSwapCache(p)) {
1429 pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
1430 ttu |= TTU_IGNORE_HWPOISON;
1431 }
1432
1433
1434
1435
1436
1437
1438
1439 mapping = page_mapping(hpage);
1440 if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
1441 mapping_can_writeback(mapping)) {
1442 if (page_mkclean(hpage)) {
1443 SetPageDirty(hpage);
1444 } else {
1445 kill = 0;
1446 ttu |= TTU_IGNORE_HWPOISON;
1447 pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
1448 pfn);
1449 }
1450 }
1451
/*
 * Collect the processes that have the page mapped before unmapping it,
 * because try_to_unmap() takes the rmap data structures down. Errors
 * from this are ignored; there is nothing else that can be done.
 */
1460 if (kill)
1461 collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
1462
1463 if (PageHuge(hpage) && !PageAnon(hpage)) {
1464
1465
1466
1467
1468
1469
1470
1471 mapping = hugetlb_page_mapping_lock_write(hpage);
1472 if (mapping) {
1473 try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
1474 i_mmap_unlock_write(mapping);
1475 } else
1476 pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn);
1477 } else {
1478 try_to_unmap(folio, ttu);
1479 }
1480
1481 unmap_success = !page_mapped(hpage);
1482 if (!unmap_success)
1483 pr_err("%#lx: failed to unmap page (mapcount=%d)\n",
1484 pfn, page_mapcount(hpage));
1485
1486
1487
1488
1489
1490 if (mlocked)
1491 shake_page(hpage);
1492
/*
 * Now that the dirty bit has been propagated to the struct page and all
 * unmaps are done, decide whether killing is needed: only kill when the
 * page was dirty or the caller insists (MF_MUST_KILL), otherwise the
 * tokill list is merely freed. When unmapping failed, use a forced,
 * uncatchable kill to prevent any further access to the poisoned memory.
 */
1503 forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
1504 kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
1505
1506 return unmap_success;
1507 }
1508
1509 static int identify_page_state(unsigned long pfn, struct page *p,
1510 unsigned long page_flags)
1511 {
1512 struct page_state *ps;
1513
/*
 * The first pass uses the current page flags, which may not have any
 * relevant bits set. The second pass, using the page flags saved by the
 * caller, is only done when the first one finds no match.
 */
1519 for (ps = error_states;; ps++)
1520 if ((p->flags & ps->mask) == ps->res)
1521 break;
1522
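/*
 * Fold in the current dirty bit: it may have been set after the caller
 * took its page_flags snapshot (e.g. while unmapping).
 */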
1523 page_flags |= (p->flags & (1UL << PG_dirty));
1524
1525 if (!ps->mask)
1526 for (ps = error_states;; ps++)
1527 if ((page_flags & ps->mask) == ps->res)
1528 break;
1529 return page_action(ps, p, pfn);
1530 }
1531
1532 static int try_to_split_thp_page(struct page *page, const char *msg)
1533 {
1534 lock_page(page);
1535 if (unlikely(split_huge_page(page))) {
1536 unsigned long pfn = page_to_pfn(page);
1537
1538 unlock_page(page);
1539 pr_info("%s: %#lx: thp split failed\n", msg, pfn);
1540 put_page(page);
1541 return -EBUSY;
1542 }
1543 unlock_page(page);
1544
1545 return 0;
1546 }
1547
1548 static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
1549 struct address_space *mapping, pgoff_t index, int flags)
1550 {
1551 struct to_kill *tk;
1552 unsigned long size = 0;
1553
1554 list_for_each_entry(tk, to_kill, nd)
1555 if (tk->size_shift)
1556 size = max(size, 1UL << tk->size_shift);
1557
1558 if (size) {
/*
 * Unmap the largest mapping to avoid breaking up device-dax mappings,
 * which are constant size. The actual size of the mapping being torn
 * down is communicated in siginfo, see kill_proc().
 */
1565 loff_t start = (index << PAGE_SHIFT) & ~(size - 1);
1566
1567 unmap_mapping_range(mapping, start, size, 0);
1568 }
1569
1570 kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
1571 }
1572
1573 static int mf_generic_kill_procs(unsigned long long pfn, int flags,
1574 struct dev_pagemap *pgmap)
1575 {
1576 struct page *page = pfn_to_page(pfn);
1577 LIST_HEAD(to_kill);
1578 dax_entry_t cookie;
1579 int rc = 0;
1580
1581
1582
1583
1584
1585 page = compound_head(page);
1586
/*
 * Prevent the inode from being freed while we are interrogating the
 * address_space. This would normally be done with lock_page(), but dax
 * pages do not use the page lock; it also prevents changes to the
 * mapping of this pfn until poison signaling is complete.
 */
1594 cookie = dax_lock_page(page);
1595 if (!cookie)
1596 return -EBUSY;
1597
1598 if (hwpoison_filter(page)) {
1599 rc = -EOPNOTSUPP;
1600 goto unlock;
1601 }
1602
1603 switch (pgmap->type) {
1604 case MEMORY_DEVICE_PRIVATE:
1605 case MEMORY_DEVICE_COHERENT:
1606
1607
1608
1609
1610 rc = -ENXIO;
1611 goto unlock;
1612 default:
1613 break;
1614 }
1615
1616
1617
1618
1619
1620 SetPageHWPoison(page);
1621
/*
 * Unlike system RAM, there is no possibility to swap in a different
 * physical page at a given virtual address, so all user space
 * consumption of ZONE_DEVICE memory necessitates SIGBUS (MF_MUST_KILL).
 */
1628 flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
1629 collect_procs(page, &to_kill, true);
1630
1631 unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags);
1632 unlock:
1633 dax_unlock_page(page, cookie);
1634 return rc;
1635 }
1636
1637 #ifdef CONFIG_FS_DAX
/**
 * mf_dax_kill_procs - Collect and kill processes that use this file range
 * @mapping:	address_space of the file in use
 * @index:	start pgoff of the range within the file
 * @count:	length of the range, in unit of PAGE_SIZE
 * @mf_flags:	memory failure flags
 */
1645 int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
1646 unsigned long count, int mf_flags)
1647 {
1648 LIST_HEAD(to_kill);
1649 dax_entry_t cookie;
1650 struct page *page;
1651 size_t end = index + count;
1652
1653 mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
1654
1655 for (; index < end; index++) {
1656 page = NULL;
1657 cookie = dax_lock_mapping_entry(mapping, index, &page);
1658 if (!cookie)
1659 return -EBUSY;
1660 if (!page)
1661 goto unlock;
1662
1663 SetPageHWPoison(page);
1664
1665 collect_procs_fsdax(page, mapping, index, &to_kill);
1666 unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
1667 index, mf_flags);
1668 unlock:
1669 dax_unlock_mapping_entry(mapping, index, cookie);
1670 }
1671 return 0;
1672 }
1673 EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
1674 #endif
1675
1676 #ifdef CONFIG_HUGETLB_PAGE
/*
 * Bookkeeping for which raw (base) pages inside a hugepage are
 * hwpoisoned. Entries are linked off the ->private field of one of the
 * hugepage's subpages (see raw_hwp_list_head()).
 */
1682 struct raw_hwp_page {
1683 struct llist_node node;
1684 struct page *page;
1685 };
1686
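/* The list head is stored in ->private of the SUBPAGE_INDEX_HWPOISON subpage. */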
1687 static inline struct llist_head *raw_hwp_list_head(struct page *hpage)
1688 {
1689 return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON);
1690 }
1691
1692 static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag)
1693 {
1694 struct llist_head *head;
1695 struct llist_node *t, *tnode;
1696 unsigned long count = 0;
1697
1698 head = raw_hwp_list_head(hpage);
1699 llist_for_each_safe(tnode, t, head->first) {
1700 struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
1701
1702 if (move_flag)
1703 SetPageHWPoison(p->page);
1704 kfree(p);
1705 count++;
1706 }
1707 llist_del_all(head);
1708 return count;
1709 }
1710
1711 static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
1712 {
1713 struct llist_head *head;
1714 struct raw_hwp_page *raw_hwp;
1715 struct llist_node *t, *tnode;
1716 int ret = TestSetPageHWPoison(hpage) ? -EHWPOISON : 0;
1717
1718
1719
1720
1721
1722
1723 if (HPageRawHwpUnreliable(hpage))
1724 return -EHWPOISON;
1725 head = raw_hwp_list_head(hpage);
1726 llist_for_each_safe(tnode, t, head->first) {
1727 struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
1728
1729 if (p->page == page)
1730 return -EHWPOISON;
1731 }
1732
1733 raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
1734 if (raw_hwp) {
1735 raw_hwp->page = page;
1736 llist_add(&raw_hwp->node, head);
1737
1738 if (ret)
1739 num_poisoned_pages_inc();
1740 } else {
1741
1742
1743
1744
1745
1746 SetHPageRawHwpUnreliable(hpage);
1747
1748
1749
1750
1751 __free_raw_hwp_pages(hpage, false);
1752 }
1753 return ret;
1754 }
1755
1756 static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag)
1757 {
1758
1759
1760
1761
1762 if (move_flag && HPageVmemmapOptimized(hpage))
1763 return 0;
1764
1765
1766
1767
1768
1769 if (HPageRawHwpUnreliable(hpage))
1770 return 0;
1771
1772 return __free_raw_hwp_pages(hpage, move_flag);
1773 }
1774
1775 void hugetlb_clear_page_hwpoison(struct page *hpage)
1776 {
1777 if (HPageRawHwpUnreliable(hpage))
1778 return;
1779 ClearPageHWPoison(hpage);
1780 free_raw_hwp_pages(hpage, true);
1781 }
1782
/*
 * Called from hugetlb code with hugetlb_lock held.
 *
 * Return values:
 *   0             - free hugepage
 *   1             - in-use hugepage
 *   2             - not a hugepage
 *   -EBUSY        - the hugepage is busy (try to retry)
 *   -EHWPOISON    - the hugepage is already hwpoisoned
 */
1793 int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
1794 {
1795 struct page *page = pfn_to_page(pfn);
1796 struct page *head = compound_head(page);
1797 int ret = 2;
1798 bool count_increased = false;
1799
1800 if (!PageHeadHuge(head))
1801 goto out;
1802
1803 if (flags & MF_COUNT_INCREASED) {
1804 ret = 1;
1805 count_increased = true;
1806 } else if (HPageFreed(head)) {
1807 ret = 0;
1808 } else if (HPageMigratable(head)) {
1809 ret = get_page_unless_zero(head);
1810 if (ret)
1811 count_increased = true;
1812 } else {
1813 ret = -EBUSY;
1814 if (!(flags & MF_NO_RETRY))
1815 goto out;
1816 }
1817
1818 if (hugetlb_set_page_hwpoison(head, page)) {
1819 ret = -EHWPOISON;
1820 goto out;
1821 }
1822
1823 return ret;
1824 out:
1825 if (count_increased)
1826 put_page(head);
1827 return ret;
1828 }
1829
/*
 * Taking a refcount on a hugetlb page needs extra care about races with
 * hugepage allocation/free/demotion, so the prechecks (pinning and
 * testing/setting PageHWPoison) are done in hugetlb code under
 * hugetlb_lock, via get_huge_page_for_hwpoison().
 */
1836 static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
1837 {
1838 int res;
1839 struct page *p = pfn_to_page(pfn);
1840 struct page *head;
1841 unsigned long page_flags;
1842
1843 *hugetlb = 1;
1844 retry:
1845 res = get_huge_page_for_hwpoison(pfn, flags);
1846 if (res == 2) {
1847 *hugetlb = 0;
1848 return 0;
1849 } else if (res == -EHWPOISON) {
1850 pr_err("%#lx: already hardware poisoned\n", pfn);
1851 if (flags & MF_ACTION_REQUIRED) {
1852 head = compound_head(p);
1853 res = kill_accessing_process(current, page_to_pfn(head), flags);
1854 }
1855 return res;
1856 } else if (res == -EBUSY) {
1857 if (!(flags & MF_NO_RETRY)) {
1858 flags |= MF_NO_RETRY;
1859 goto retry;
1860 }
1861 action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
1862 return res;
1863 }
1864
1865 head = compound_head(p);
1866 lock_page(head);
1867
1868 if (hwpoison_filter(p)) {
1869 hugetlb_clear_page_hwpoison(head);
1870 res = -EOPNOTSUPP;
1871 goto out;
1872 }
1873
1874
1875
1876
1877
1878 if (res == 0) {
1879 unlock_page(head);
1880 if (__page_handle_poison(p) >= 0) {
1881 page_ref_inc(p);
1882 res = MF_RECOVERED;
1883 } else {
1884 res = MF_FAILED;
1885 }
1886 action_result(pfn, MF_MSG_FREE_HUGE, res);
1887 return res == MF_RECOVERED ? 0 : -EBUSY;
1888 }
1889
1890 page_flags = head->flags;
1891
1892 if (!hwpoison_user_mappings(p, pfn, flags, head)) {
1893 action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
1894 res = -EBUSY;
1895 goto out;
1896 }
1897
1898 return identify_page_state(pfn, p, page_flags);
1899 out:
1900 unlock_page(head);
1901 return res;
1902 }
1903
1904 #else
1905 static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
1906 {
1907 return 0;
1908 }
1909
1910 static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag)
1911 {
1912 return 0;
1913 }
1914 #endif
1915
1916 static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
1917 struct dev_pagemap *pgmap)
1918 {
1919 struct page *page = pfn_to_page(pfn);
1920 int rc = -ENXIO;
1921
1922 if (flags & MF_COUNT_INCREASED)
1923
1924
1925
1926 put_page(page);
1927
1928
1929 if (!pgmap_pfn_valid(pgmap, pfn))
1930 goto out;
1931
/*
 * Call the driver's memory_failure handler if there is one, otherwise
 * fall back to the generic handling below.
 */
1936 if (pgmap_has_memory_failure(pgmap)) {
1937 rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
1938
1939
1940
1941
1942 if (rc != -EOPNOTSUPP)
1943 goto out;
1944 }
1945
1946 rc = mf_generic_kill_procs(pfn, flags, pgmap);
1947 out:
1948
1949 put_dev_pagemap(pgmap);
1950 action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
1951 return rc;
1952 }
1953
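/* Serializes memory_failure(), unpoison_memory() and soft_offline_page(). */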
1954 static DEFINE_MUTEX(mf_mutex);
1955
/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code of an
 * architecture when it detects hardware memory corruption of a page.
 * It tries its best to recover, which includes dropping pages, killing
 * processes etc.
 *
 * It is mainly useful for corruptions detected outside the current
 * execution context (e.g. by a background scrubber), and must run in
 * process context.
 *
 * Return: 0 for a successfully handled error, -EOPNOTSUPP when the event
 * was filtered by hwpoison_filter(), another negative errno on failure.
 */
1977 int memory_failure(unsigned long pfn, int flags)
1978 {
1979 struct page *p;
1980 struct page *hpage;
1981 struct dev_pagemap *pgmap;
1982 int res = 0;
1983 unsigned long page_flags;
1984 bool retry = true;
1985 int hugetlb = 0;
1986
1987 if (!sysctl_memory_failure_recovery)
1988 panic("Memory failure on page %lx", pfn);
1989
1990 mutex_lock(&mf_mutex);
1991
1992 if (!(flags & MF_SW_SIMULATED))
1993 hw_memory_failure = true;
1994
1995 p = pfn_to_online_page(pfn);
1996 if (!p) {
1997 res = arch_memory_failure(pfn, flags);
1998 if (res == 0)
1999 goto unlock_mutex;
2000
2001 if (pfn_valid(pfn)) {
2002 pgmap = get_dev_pagemap(pfn, NULL);
2003 if (pgmap) {
2004 res = memory_failure_dev_pagemap(pfn, flags,
2005 pgmap);
2006 goto unlock_mutex;
2007 }
2008 }
2009 pr_err("%#lx: memory outside kernel control\n", pfn);
2010 res = -ENXIO;
2011 goto unlock_mutex;
2012 }
2013
2014 try_again:
2015 res = try_memory_failure_hugetlb(pfn, flags, &hugetlb);
2016 if (hugetlb)
2017 goto unlock_mutex;
2018
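/*
 * Atomically mark the page as poisoned; if it was already marked,
 * another handler owns it and we only (optionally) signal the current
 * process and drop any reference we were passed.
 */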
2019 if (TestSetPageHWPoison(p)) {
2020 pr_err("%#lx: already hardware poisoned\n", pfn);
2021 res = -EHWPOISON;
2022 if (flags & MF_ACTION_REQUIRED)
2023 res = kill_accessing_process(current, pfn, flags);
2024 if (flags & MF_COUNT_INCREASED)
2025 put_page(p);
2026 goto unlock_mutex;
2027 }
2028
2029 hpage = compound_head(p);

/*
 * We need/can do nothing about count=0 pages.
 * 1) It is a free page, and therefore in safe hands: check_new_page()
 *    will be the gatekeeper.
 * 2) It is part of a non-compound high order page: some kernel user
 *    owns it and we cannot stop them from reading or writing it; hope
 *    it gets freed eventually. Bumping the count up from 0 directly
 *    would be dangerous (page_ref_freeze()/unfreeze() mismatch).
 */
2042 if (!(flags & MF_COUNT_INCREASED)) {
2043 res = get_hwpoison_page(p, flags);
2044 if (!res) {
2045 if (is_free_buddy_page(p)) {
2046 if (take_page_off_buddy(p)) {
2047 page_ref_inc(p);
2048 res = MF_RECOVERED;
2049 } else {
2050
2051 if (retry) {
2052 ClearPageHWPoison(p);
2053 retry = false;
2054 goto try_again;
2055 }
2056 res = MF_FAILED;
2057 }
2058 action_result(pfn, MF_MSG_BUDDY, res);
2059 res = res == MF_RECOVERED ? 0 : -EBUSY;
2060 } else {
2061 action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
2062 res = -EBUSY;
2063 }
2064 goto unlock_mutex;
2065 } else if (res < 0) {
2066 action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
2067 res = -EBUSY;
2068 goto unlock_mutex;
2069 }
2070 }
2071
2072 if (PageTransHuge(hpage)) {
/*
 * PageHasHWPoisoned must be set after the refcount is bumped, otherwise
 * it may race with THP splitting. It cannot be set in
 * get_hwpoison_page() because that is also used by soft offline and is
 * only called for !MF_COUNT_INCREASED, so here is the best place.
 */
2086 SetPageHasHWPoisoned(hpage);
2087 if (try_to_split_thp_page(p, "Memory Failure") < 0) {
2088 action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
2089 res = -EBUSY;
2090 goto unlock_mutex;
2091 }
2092 VM_BUG_ON_PAGE(!page_count(p), p);
2093 }
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103 shake_page(p);
2104
2105 lock_page(p);
2106
/*
 * We only intend to deal with non-compound pages here; the page could
 * however have become part of a compound page due to a race window.
 * If so, retry once in the hope of handling it next round.
 */
2113 if (PageCompound(p)) {
2114 if (retry) {
2115 ClearPageHWPoison(p);
2116 unlock_page(p);
2117 put_page(p);
2118 flags &= ~MF_COUNT_INCREASED;
2119 retry = false;
2120 goto try_again;
2121 }
2122 action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
2123 res = -EBUSY;
2124 goto unlock_page;
2125 }
2126
/*
 * Page flags decide what action is taken, but they may be modified by
 * the containment action itself (for example PG_mlocked is cleared by
 * page_remove_rmap() during unmapping), so save a copy now to classify
 * the page correctly later.
 */
2134 page_flags = p->flags;
2135
2136 if (hwpoison_filter(p)) {
2137 TestClearPageHWPoison(p);
2138 unlock_page(p);
2139 put_page(p);
2140 res = -EOPNOTSUPP;
2141 goto unlock_mutex;
2142 }
2143
/*
 * A page can lose its LRU flag without the page lock while it is still
 * under writeback; wait for writeback to finish in that case before
 * classifying it, otherwise evicting the inode may trip over the page.
 */
2149 if (!PageLRU(p) && !PageWriteback(p))
2150 goto identify_page_state;
2151
/*
 * It is very difficult to mess with pages currently under IO and in
 * many cases impossible to flush them successfully, so wait for the IO
 * to complete first.
 */
2156 wait_on_page_writeback(p);
2157
/*
 * Now take care of user space mappings.
 * Abort on failure: __filemap_remove_folio() assumes an unmapped page.
 */
2162 if (!hwpoison_user_mappings(p, pfn, flags, p)) {
2163 action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
2164 res = -EBUSY;
2165 goto unlock_page;
2166 }
2167
2168
2169
2170
2171 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
2172 action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
2173 res = -EBUSY;
2174 goto unlock_page;
2175 }
2176
2177 identify_page_state:
2178 res = identify_page_state(pfn, p, page_flags);
2179 mutex_unlock(&mf_mutex);
2180 return res;
2181 unlock_page:
2182 unlock_page(p);
2183 unlock_mutex:
2184 mutex_unlock(&mf_mutex);
2185 return res;
2186 }
2187 EXPORT_SYMBOL_GPL(memory_failure);
2188
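/*
 * Per-CPU kfifo of pending memory failure events, filled from (possibly
 * atomic) error-reporting context and drained by a workqueue in process
 * context (see memory_failure_queue() and memory_failure_work_func()).
 */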
2189 #define MEMORY_FAILURE_FIFO_ORDER 4
2190 #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
2191
2192 struct memory_failure_entry {
2193 unsigned long pfn;
2194 int flags;
2195 };
2196
2197 struct memory_failure_cpu {
2198 DECLARE_KFIFO(fifo, struct memory_failure_entry,
2199 MEMORY_FAILURE_FIFO_SIZE);
2200 spinlock_t lock;
2201 struct work_struct work;
2202 };
2203
2204 static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
2205
/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: Flags for memory failure handling
 *
 * This function is called by the low level hardware error handler when
 * it detects hardware memory corruption of a page. It schedules the
 * recovery of the error page (dropping pages, killing processes etc.)
 * to be done later in process context.
 *
 * Can run in IRQ context.
 */
2222 void memory_failure_queue(unsigned long pfn, int flags)
2223 {
2224 struct memory_failure_cpu *mf_cpu;
2225 unsigned long proc_flags;
2226 struct memory_failure_entry entry = {
2227 .pfn = pfn,
2228 .flags = flags,
2229 };
2230
2231 mf_cpu = &get_cpu_var(memory_failure_cpu);
2232 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
2233 if (kfifo_put(&mf_cpu->fifo, entry))
2234 schedule_work_on(smp_processor_id(), &mf_cpu->work);
2235 else
2236 pr_err("buffer overflow when queuing memory failure at %#lx\n",
2237 pfn);
2238 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
2239 put_cpu_var(memory_failure_cpu);
2240 }
2241 EXPORT_SYMBOL_GPL(memory_failure_queue);
2242
2243 static void memory_failure_work_func(struct work_struct *work)
2244 {
2245 struct memory_failure_cpu *mf_cpu;
2246 struct memory_failure_entry entry = { 0, };
2247 unsigned long proc_flags;
2248 int gotten;
2249
2250 mf_cpu = container_of(work, struct memory_failure_cpu, work);
2251 for (;;) {
2252 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
2253 gotten = kfifo_get(&mf_cpu->fifo, &entry);
2254 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
2255 if (!gotten)
2256 break;
2257 if (entry.flags & MF_SOFT_OFFLINE)
2258 soft_offline_page(entry.pfn, entry.flags);
2259 else
2260 memory_failure(entry.pfn, entry.flags);
2261 }
2262 }
2263
/*
 * Process memory_failure work queued on the specified CPU.
 * Used to avoid return-to-userspace racing with the memory_failure workqueue.
 */
2268 void memory_failure_queue_kick(int cpu)
2269 {
2270 struct memory_failure_cpu *mf_cpu;
2271
2272 mf_cpu = &per_cpu(memory_failure_cpu, cpu);
2273 cancel_work_sync(&mf_cpu->work);
2274 memory_failure_work_func(&mf_cpu->work);
2275 }
2276
2277 static int __init memory_failure_init(void)
2278 {
2279 struct memory_failure_cpu *mf_cpu;
2280 int cpu;
2281
2282 for_each_possible_cpu(cpu) {
2283 mf_cpu = &per_cpu(memory_failure_cpu, cpu);
2284 spin_lock_init(&mf_cpu->lock);
2285 INIT_KFIFO(mf_cpu->fifo);
2286 INIT_WORK(&mf_cpu->work, memory_failure_work_func);
2287 }
2288
2289 return 0;
2290 }
2291 core_initcall(memory_failure_init);
2292
2293 #undef pr_fmt
2294 #define pr_fmt(fmt) "" fmt
2295 #define unpoison_pr_info(fmt, pfn, rs) \
2296 ({ \
2297 if (__ratelimit(rs)) \
2298 pr_info(fmt, pfn); \
2299 })
2300
/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by memory_failure()
 * earlier. This only works for errors injected by software, not for
 * real hardware failures.
 *
 * Returns 0 for success, otherwise -errno.
 */
2313 int unpoison_memory(unsigned long pfn)
2314 {
2315 struct page *page;
2316 struct page *p;
2317 int ret = -EBUSY;
2318 int freeit = 0;
2319 unsigned long count = 1;
2320 static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
2321 DEFAULT_RATELIMIT_BURST);
2322
2323 if (!pfn_valid(pfn))
2324 return -ENXIO;
2325
2326 p = pfn_to_page(pfn);
2327 page = compound_head(p);
2328
2329 mutex_lock(&mf_mutex);
2330
2331 if (hw_memory_failure) {
2332 unpoison_pr_info("Unpoison: Disabled after HW memory failure %#lx\n",
2333 pfn, &unpoison_rs);
2334 ret = -EOPNOTSUPP;
2335 goto unlock_mutex;
2336 }
2337
2338 if (!PageHWPoison(p)) {
2339 unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
2340 pfn, &unpoison_rs);
2341 goto unlock_mutex;
2342 }
2343
2344 if (page_count(page) > 1) {
2345 unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
2346 pfn, &unpoison_rs);
2347 goto unlock_mutex;
2348 }
2349
2350 if (page_mapped(page)) {
2351 unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
2352 pfn, &unpoison_rs);
2353 goto unlock_mutex;
2354 }
2355
2356 if (page_mapping(page)) {
2357 unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
2358 pfn, &unpoison_rs);
2359 goto unlock_mutex;
2360 }
2361
2362 if (PageSlab(page) || PageTable(page))
2363 goto unlock_mutex;
2364
2365 ret = get_hwpoison_page(p, MF_UNPOISON);
2366 if (!ret) {
2367 if (PageHuge(p)) {
2368 count = free_raw_hwp_pages(page, false);
2369 if (count == 0) {
2370 ret = -EBUSY;
2371 goto unlock_mutex;
2372 }
2373 }
2374 ret = TestClearPageHWPoison(page) ? 0 : -EBUSY;
2375 } else if (ret < 0) {
2376 if (ret == -EHWPOISON) {
2377 ret = put_page_back_buddy(p) ? 0 : -EBUSY;
2378 } else
2379 unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
2380 pfn, &unpoison_rs);
2381 } else {
2382 if (PageHuge(p)) {
2383 count = free_raw_hwp_pages(page, false);
2384 if (count == 0) {
2385 ret = -EBUSY;
2386 goto unlock_mutex;
2387 }
2388 }
2389 freeit = !!TestClearPageHWPoison(p);
2390
2391 put_page(page);
2392 if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) {
2393 put_page(page);
2394 ret = 0;
2395 }
2396 }
2397
2398 unlock_mutex:
2399 mutex_unlock(&mf_mutex);
2400 if (!ret || freeit) {
2401 num_poisoned_pages_sub(count);
2402 unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
2403 page_to_pfn(p), &unpoison_rs);
2404 }
2405 return ret;
2406 }
2407 EXPORT_SYMBOL(unpoison_memory);
2408
2409 static bool isolate_page(struct page *page, struct list_head *pagelist)
2410 {
2411 bool isolated = false;
2412 bool lru = PageLRU(page);
2413
2414 if (PageHuge(page)) {
2415 isolated = !isolate_hugetlb(page, pagelist);
2416 } else {
2417 if (lru)
2418 isolated = !isolate_lru_page(page);
2419 else
2420 isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE);
2421
2422 if (isolated)
2423 list_add(&page->lru, pagelist);
2424 }
2425
2426 if (isolated && lru)
2427 inc_node_page_state(page, NR_ISOLATED_ANON +
2428 page_is_file_lru(page));
2429
/*
 * If isolation succeeded, we grabbed another refcount on the page, so we
 * can safely drop the earlier get_hwpoison_page() reference. If it
 * failed, we are going to return an error anyway, so drop that
 * reference too.
 */
2437 put_page(page);
2438 return isolated;
2439 }
2440
/*
 * Handles both hugetlb and regular pages: a clean, unmapped page cache
 * page is simply invalidated, otherwise the contents are migrated away
 * before the page is taken out of service.
 */
2446 static int __soft_offline_page(struct page *page)
2447 {
2448 long ret = 0;
2449 unsigned long pfn = page_to_pfn(page);
2450 struct page *hpage = compound_head(page);
2451 char const *msg_page[] = {"page", "hugepage"};
2452 bool huge = PageHuge(page);
2453 LIST_HEAD(pagelist);
2454 struct migration_target_control mtc = {
2455 .nid = NUMA_NO_NODE,
2456 .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
2457 };
2458
2459 lock_page(page);
2460 if (!PageHuge(page))
2461 wait_on_page_writeback(page);
2462 if (PageHWPoison(page)) {
2463 unlock_page(page);
2464 put_page(page);
2465 pr_info("soft offline: %#lx page already poisoned\n", pfn);
2466 return 0;
2467 }
2468
2469 if (!PageHuge(page) && PageLRU(page) && !PageSwapCache(page))
/*
 * Try to invalidate first. This should work for clean, unmapped
 * page cache pages.
 */
2474 ret = invalidate_inode_page(page);
2475 unlock_page(page);
2476
2477 if (ret) {
2478 pr_info("soft_offline: %#lx: invalidated\n", pfn);
2479 page_handle_poison(page, false, true);
2480 return 0;
2481 }
2482
2483 if (isolate_page(hpage, &pagelist)) {
2484 ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
2485 (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL);
2486 if (!ret) {
2487 bool release = !huge;
2488
2489 if (!page_handle_poison(page, huge, release))
2490 ret = -EBUSY;
2491 } else {
2492 if (!list_empty(&pagelist))
2493 putback_movable_pages(&pagelist);
2494
2495 pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n",
2496 pfn, msg_page[huge], ret, &page->flags);
2497 if (ret > 0)
2498 ret = -EBUSY;
2499 }
2500 } else {
2501 pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %pGp\n",
2502 pfn, msg_page[huge], page_count(page), &page->flags);
2503 ret = -EBUSY;
2504 }
2505 return ret;
2506 }
2507
2508 static int soft_offline_in_use_page(struct page *page)
2509 {
2510 struct page *hpage = compound_head(page);
2511
2512 if (!PageHuge(page) && PageTransHuge(hpage))
2513 if (try_to_split_thp_page(page, "soft offline") < 0)
2514 return -EBUSY;
2515 return __soft_offline_page(page);
2516 }
2517
2518 static int soft_offline_free_page(struct page *page)
2519 {
2520 int rc = 0;
2521
2522 if (!page_handle_poison(page, true, false))
2523 rc = -EBUSY;
2524
2525 return rc;
2526 }
2527
2528 static void put_ref_page(struct page *page)
2529 {
2530 if (page)
2531 put_page(page);
2532 }
2533
/**
 * soft_offline_page - Soft offline a page.
 * @pfn: pfn to soft-offline
 * @flags: flags. Same as memory_failure().
 *
 * Soft offline a page, by migration or invalidation, without killing
 * anything. This is for the case when a page is not corrupted yet (so
 * it is still valid to access), but has had a number of corrected
 * errors and is better taken out. The policy of when to do that is
 * maintained by user space.
 *
 * This should never impact any application or cause data loss, however
 * it might take some time, and is not a 100% solution for all memory.
 *
 * Returns 0 on success, -EOPNOTSUPP if the event was filtered by
 * hwpoison_filter(), otherwise a negative errno.
 */
2558 int soft_offline_page(unsigned long pfn, int flags)
2559 {
2560 int ret;
2561 bool try_again = true;
2562 struct page *page, *ref_page = NULL;
2563
2564 WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED));
2565
2566 if (!pfn_valid(pfn))
2567 return -ENXIO;
2568 if (flags & MF_COUNT_INCREASED)
2569 ref_page = pfn_to_page(pfn);
2570
2571
2572 page = pfn_to_online_page(pfn);
2573 if (!page) {
2574 put_ref_page(ref_page);
2575 return -EIO;
2576 }
2577
2578 mutex_lock(&mf_mutex);
2579
2580 if (PageHWPoison(page)) {
2581 pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
2582 put_ref_page(ref_page);
2583 mutex_unlock(&mf_mutex);
2584 return 0;
2585 }
2586
2587 retry:
2588 get_online_mems();
2589 ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE);
2590 put_online_mems();
2591
2592 if (hwpoison_filter(page)) {
2593 if (ret > 0)
2594 put_page(page);
2595 else
2596 put_ref_page(ref_page);
2597
2598 mutex_unlock(&mf_mutex);
2599 return -EOPNOTSUPP;
2600 }
2601
2602 if (ret > 0) {
2603 ret = soft_offline_in_use_page(page);
2604 } else if (ret == 0) {
2605 if (soft_offline_free_page(page) && try_again) {
2606 try_again = false;
2607 flags &= ~MF_COUNT_INCREASED;
2608 goto retry;
2609 }
2610 }
2611
2612 mutex_unlock(&mf_mutex);
2613
2614 return ret;
2615 }
2616
2617 void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
2618 {
2619 int i;
2620
/*
 * A further optimization would be per-section counters of poisoned
 * pages, but that needs more space in the memmap, so for now just do a
 * quick global check to speed up the common case of no bad pages.
 */
2627 if (atomic_long_read(&num_poisoned_pages) == 0)
2628 return;
2629
2630 for (i = 0; i < nr_pages; i++) {
2631 if (PageHWPoison(&memmap[i])) {
2632 num_poisoned_pages_dec();
2633 ClearPageHWPoison(&memmap[i]);
2634 }
2635 }
2636 }