0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Copyright (C) 2008, 2009 Intel Corporation
0004  * Authors: Andi Kleen, Fengguang Wu
0005  *
0006  * High level machine check handler. Handles pages reported by the
0007  * hardware as being corrupted usually due to a multi-bit ECC memory or cache
0008  * failure.
0009  * 
0010  * In addition there is a "soft offline" entry point that allows stopping
0011  * the use of not-yet-corrupted-but-suspicious pages without killing anything.
0012  *
0013  * Handles page cache pages in various states.  The tricky part
0014  * here is that we can access any page asynchronously with respect to
0015  * other VM users, because memory failures could happen anytime and 
0016  * anywhere. This could violate some of their assumptions. This is why 
0017  * this code has to be extremely careful. Generally it tries to use 
0018  * normal locking rules, as in get the standard locks, even if that means 
0019  * the error handling takes potentially a long time.
0020  *
0021  * It can be very tempting to add handling for obscure cases here.
0022  * In general any code for handling new cases should only be added iff:
0023  * - You know how to test it.
0024  * - You have a test that can be added to mce-test
0025  *   https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
0026  * - The case actually shows up as a frequent (top 10) page state in
0027  *   tools/vm/page-types when running a real workload.
0028  * 
0029  * There are several operations here with exponential complexity because
0030  * of unsuitable VM data structures. For example the operation to map back 
0031  * from RMAP chains to processes has to walk the complete process list and 
0032  * has non-linear complexity in the number of processes. But since memory
0033  * corruptions are rare we hope to get away with this. This avoids impacting
0034  * the core VM.
0035  */
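
/*
 * For illustration only: a minimal userspace sketch of how a poisoned page is
 * usually injected when exercising this handler. This uses the documented
 * madvise(MADV_HWPOISON) test hook, which is available with
 * CONFIG_MEMORY_FAILURE and a privileged caller; the exact capability and
 * behaviour are configuration dependent, so treat this as a sketch:
 *
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	int inject_poison(void)
 *	{
 *		long sz = sysconf(_SC_PAGESIZE);
 *		void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		*(volatile char *)p = 1;		(fault the page in)
 *		return madvise(p, sz, MADV_HWPOISON);
 *	}
 *
 * The caller may receive SIGBUS and have the page unmapped, which is how the
 * kill paths below are normally observed in testing.
 */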
0036 
0037 #define pr_fmt(fmt) "Memory failure: " fmt
0038 
0039 #include <linux/kernel.h>
0040 #include <linux/mm.h>
0041 #include <linux/page-flags.h>
0042 #include <linux/kernel-page-flags.h>
0043 #include <linux/sched/signal.h>
0044 #include <linux/sched/task.h>
0045 #include <linux/dax.h>
0046 #include <linux/ksm.h>
0047 #include <linux/rmap.h>
0048 #include <linux/export.h>
0049 #include <linux/pagemap.h>
0050 #include <linux/swap.h>
0051 #include <linux/backing-dev.h>
0052 #include <linux/migrate.h>
0053 #include <linux/suspend.h>
0054 #include <linux/slab.h>
0055 #include <linux/swapops.h>
0056 #include <linux/hugetlb.h>
0057 #include <linux/memory_hotplug.h>
0058 #include <linux/mm_inline.h>
0059 #include <linux/memremap.h>
0060 #include <linux/kfifo.h>
0061 #include <linux/ratelimit.h>
0062 #include <linux/page-isolation.h>
0063 #include <linux/pagewalk.h>
0064 #include <linux/shmem_fs.h>
0065 #include "swap.h"
0066 #include "internal.h"
0067 #include "ras/ras_event.h"
0068 
0069 int sysctl_memory_failure_early_kill __read_mostly = 0;
0070 
0071 int sysctl_memory_failure_recovery __read_mostly = 1;
0072 
0073 atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
0074 
0075 static bool hw_memory_failure __read_mostly = false;
0076 
0077 /*
0078  * Return values:
0079  *   1:   the page is dissolved (if needed) and taken off from buddy,
0080  *   0:   the page is dissolved (if needed) and not taken off from buddy,
0081  *   < 0: failed to dissolve.
0082  */
0083 static int __page_handle_poison(struct page *page)
0084 {
0085     int ret;
0086 
0087     zone_pcp_disable(page_zone(page));
0088     ret = dissolve_free_huge_page(page);
0089     if (!ret)
0090         ret = take_page_off_buddy(page);
0091     zone_pcp_enable(page_zone(page));
0092 
0093     return ret;
0094 }
0095 
0096 static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
0097 {
0098     if (hugepage_or_freepage) {
0099         /*
0100          * Doing this check for free pages is also fine since dissolve_free_huge_page
0101          * returns 0 for non-hugetlb pages as well.
0102          */
0103         if (__page_handle_poison(page) <= 0)
0104             /*
0105              * We could fail to take off the target page from buddy
0106              * for example due to racy page allocation, but that's
0107              * acceptable because soft-offlined page is not broken
0108              * and if someone really wants to use it, they should
0109              * take it.
0110              */
0111             return false;
0112     }
0113 
0114     SetPageHWPoison(page);
0115     if (release)
0116         put_page(page);
0117     page_ref_inc(page);
0118     num_poisoned_pages_inc();
0119 
0120     return true;
0121 }
0122 
0123 #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
0124 
0125 u32 hwpoison_filter_enable = 0;
0126 u32 hwpoison_filter_dev_major = ~0U;
0127 u32 hwpoison_filter_dev_minor = ~0U;
0128 u64 hwpoison_filter_flags_mask;
0129 u64 hwpoison_filter_flags_value;
0130 EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
0131 EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
0132 EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
0133 EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
0134 EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
0135 
0136 static int hwpoison_filter_dev(struct page *p)
0137 {
0138     struct address_space *mapping;
0139     dev_t dev;
0140 
0141     if (hwpoison_filter_dev_major == ~0U &&
0142         hwpoison_filter_dev_minor == ~0U)
0143         return 0;
0144 
0145     mapping = page_mapping(p);
0146     if (mapping == NULL || mapping->host == NULL)
0147         return -EINVAL;
0148 
0149     dev = mapping->host->i_sb->s_dev;
0150     if (hwpoison_filter_dev_major != ~0U &&
0151         hwpoison_filter_dev_major != MAJOR(dev))
0152         return -EINVAL;
0153     if (hwpoison_filter_dev_minor != ~0U &&
0154         hwpoison_filter_dev_minor != MINOR(dev))
0155         return -EINVAL;
0156 
0157     return 0;
0158 }
0159 
0160 static int hwpoison_filter_flags(struct page *p)
0161 {
0162     if (!hwpoison_filter_flags_mask)
0163         return 0;
0164 
0165     if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
0166                     hwpoison_filter_flags_value)
0167         return 0;
0168     else
0169         return -EINVAL;
0170 }
0171 
0172 /*
0173  * This allows stress tests to limit test scope to a collection of tasks
0174  * by putting them under some memcg. This prevents killing unrelated/important
0175  * processes such as /sbin/init. Note that the target task may share clean
0176  * pages with init (e.g. libc text), which is harmless. If the target task
0177  * shares _dirty_ pages with another task B, the test scheme must make sure B
0178  * is also included in the memcg. Finally, due to race conditions this filter
0179  * can only guarantee that the page either belongs to the memcg tasks, or is
0180  * a freed page.
0181  */
0182 #ifdef CONFIG_MEMCG
0183 u64 hwpoison_filter_memcg;
0184 EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
0185 static int hwpoison_filter_task(struct page *p)
0186 {
0187     if (!hwpoison_filter_memcg)
0188         return 0;
0189 
0190     if (page_cgroup_ino(p) != hwpoison_filter_memcg)
0191         return -EINVAL;
0192 
0193     return 0;
0194 }
0195 #else
0196 static int hwpoison_filter_task(struct page *p) { return 0; }
0197 #endif
0198 
0199 int hwpoison_filter(struct page *p)
0200 {
0201     if (!hwpoison_filter_enable)
0202         return 0;
0203 
0204     if (hwpoison_filter_dev(p))
0205         return -EINVAL;
0206 
0207     if (hwpoison_filter_flags(p))
0208         return -EINVAL;
0209 
0210     if (hwpoison_filter_task(p))
0211         return -EINVAL;
0212 
0213     return 0;
0214 }
0215 #else
0216 int hwpoison_filter(struct page *p)
0217 {
0218     return 0;
0219 }
0220 #endif
0221 
0222 EXPORT_SYMBOL_GPL(hwpoison_filter);
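
/*
 * For illustration only: with CONFIG_HWPOISON_INJECT these filter knobs are
 * exposed through debugfs (conventionally under /sys/kernel/debug/hwpoison/,
 * see Documentation/vm/hwpoison.rst; treat the exact file names below as
 * illustrative rather than authoritative). A hedged userspace sketch that
 * restricts injection to one block device before corrupting a pfn:
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	static void hwpoison_set(const char *file, const char *val)
 *	{
 *		char path[128];
 *		int fd;
 *
 *		snprintf(path, sizeof(path),
 *			 "/sys/kernel/debug/hwpoison/%s", file);
 *		fd = open(path, O_WRONLY);
 *		if (fd >= 0) {
 *			dprintf(fd, "%s", val);
 *			close(fd);
 *		}
 *	}
 *
 *	hwpoison_set("hwpoison-filter-dev-major", "8");
 *	hwpoison_set("hwpoison-filter-dev-minor", "0");
 *	hwpoison_set("hwpoison-filter-enable", "1");
 *	hwpoison_set("corrupt-pfn", "0x1234");
 */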
0223 
0224 /*
0225  * Kill all processes that have a poisoned page mapped and then isolate
0226  * the page.
0227  *
0228  * General strategy:
0229  * Find all processes having the page mapped and kill them.
0230  * But we keep a page reference around so that the page is not
0231  * actually freed yet.
0232  * Then stash the page away
0233  *
0234  * There's no convenient way to get back to mapped processes
0235  * from the VMAs. So do a brute-force search over all
0236  * running processes.
0237  *
0238  * Remember that machine checks are not common (or rather
0239  * if they are common you have other problems), so this shouldn't
0240  * be a performance issue.
0241  *
0242  * Also there are some races possible while we get from the
0243  * error detection to actually handle it.
0244  */
0245 
0246 struct to_kill {
0247     struct list_head nd;
0248     struct task_struct *tsk;
0249     unsigned long addr;
0250     short size_shift;
0251 };
0252 
0253 /*
0254  * Send all the processes who have the page mapped a signal.
0255  * ``action optional'' if they are not immediately affected by the error
0256  * ``action required'' if error happened in current execution context
0257  */
0258 static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
0259 {
0260     struct task_struct *t = tk->tsk;
0261     short addr_lsb = tk->size_shift;
0262     int ret = 0;
0263 
0264     pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
0265             pfn, t->comm, t->pid);
0266 
0267     if ((flags & MF_ACTION_REQUIRED) && (t == current))
0268         ret = force_sig_mceerr(BUS_MCEERR_AR,
0269                  (void __user *)tk->addr, addr_lsb);
0270     else
0271         /*
0272          * Signal other processes sharing the page if they have
0273          * PF_MCE_EARLY set.
0274          * Don't use force here, it's convenient if the signal
0275          * can be temporarily blocked.
0276          * This could cause a loop when the user sets SIGBUS
0277          * to SIG_IGN, but hopefully no one will do that?
0278          */
0279         ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
0280                       addr_lsb, t);  /* synchronous? */
0281     if (ret < 0)
0282         pr_info("Error sending signal to %s:%d: %d\n",
0283             t->comm, t->pid, ret);
0284     return ret;
0285 }
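
/*
 * For illustration only: on the receiving side, a process that wants to act
 * on these signals installs a SA_SIGINFO handler and distinguishes the two
 * cases via si_code, as documented in sigaction(2). A minimal hedged sketch:
 *
 *	#include <signal.h>
 *
 *	static void mce_handler(int sig, siginfo_t *si, void *ucontext)
 *	{
 *		if (si->si_code == BUS_MCEERR_AR) {
 *			(error in the current execution context: si->si_addr
 *			 points into the poisoned mapping and si->si_addr_lsb
 *			 gives the granularity, e.g. PAGE_SHIFT)
 *		} else if (si->si_code == BUS_MCEERR_AO) {
 *			(advisory report: the data has not been consumed yet)
 *		}
 *	}
 *
 *	struct sigaction sa = {
 *		.sa_sigaction	= mce_handler,
 *		.sa_flags	= SA_SIGINFO,
 *	};
 *	sigaction(SIGBUS, &sa, NULL);
 */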
0286 
0287 /*
0288  * Unknown page type encountered. Try to check whether it can become PageLRU
0289  * by calling lru_add_drain_all().
0290  */
0291 void shake_page(struct page *p)
0292 {
0293     if (PageHuge(p))
0294         return;
0295 
0296     if (!PageSlab(p)) {
0297         lru_add_drain_all();
0298         if (PageLRU(p) || is_free_buddy_page(p))
0299             return;
0300     }
0301 
0302     /*
0303      * TODO: Could shrink slab caches here if a lightweight range-based
0304      * shrinker will be available.
0305      */
0306 }
0307 EXPORT_SYMBOL_GPL(shake_page);
0308 
0309 static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
0310         unsigned long address)
0311 {
0312     unsigned long ret = 0;
0313     pgd_t *pgd;
0314     p4d_t *p4d;
0315     pud_t *pud;
0316     pmd_t *pmd;
0317     pte_t *pte;
0318 
0319     VM_BUG_ON_VMA(address == -EFAULT, vma);
0320     pgd = pgd_offset(vma->vm_mm, address);
0321     if (!pgd_present(*pgd))
0322         return 0;
0323     p4d = p4d_offset(pgd, address);
0324     if (!p4d_present(*p4d))
0325         return 0;
0326     pud = pud_offset(p4d, address);
0327     if (!pud_present(*pud))
0328         return 0;
0329     if (pud_devmap(*pud))
0330         return PUD_SHIFT;
0331     pmd = pmd_offset(pud, address);
0332     if (!pmd_present(*pmd))
0333         return 0;
0334     if (pmd_devmap(*pmd))
0335         return PMD_SHIFT;
0336     pte = pte_offset_map(pmd, address);
0337     if (pte_present(*pte) && pte_devmap(*pte))
0338         ret = PAGE_SHIFT;
0339     pte_unmap(pte);
0340     return ret;
0341 }
0342 
0343 /*
0344  * Failure handling: if we can't find or can't kill a process there's
0345  * not much we can do.  We just print a message and ignore otherwise.
0346  */
0347 
0348 #define FSDAX_INVALID_PGOFF ULONG_MAX
0349 
0350 /*
0351  * Schedule a process for later kill.
0352  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
0353  *
0354  * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
0355  * filesystem with a memory failure handler has claimed the
0356  * memory_failure event. In all other cases, page->index and
0357  * page->mapping are sufficient for mapping the page back to its
0358  * corresponding user virtual address.
0359  */
0360 static void add_to_kill(struct task_struct *tsk, struct page *p,
0361             pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
0362             struct list_head *to_kill)
0363 {
0364     struct to_kill *tk;
0365 
0366     tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
0367     if (!tk) {
0368         pr_err("Out of memory while machine check handling\n");
0369         return;
0370     }
0371 
0372     tk->addr = page_address_in_vma(p, vma);
0373     if (is_zone_device_page(p)) {
0374         if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
0375             tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
0376         tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
0377     } else
0378         tk->size_shift = page_shift(compound_head(p));
0379 
0380     /*
0381      * Send SIGKILL if "tk->addr == -EFAULT". Also, since
0382      * "tk->size_shift" is always non-zero for !is_zone_device_page(),
0383      * "tk->size_shift == 0" effectively checks for no mapping on
0384      * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
0385      * to a process' address space, it's possible not all N VMAs
0386      * contain mappings for the page, but at least one VMA does.
0387      * Only deliver SIGBUS with payload derived from the VMA that
0388      * has a mapping for the page.
0389      */
0390     if (tk->addr == -EFAULT) {
0391         pr_info("Unable to find user space address %lx in %s\n",
0392             page_to_pfn(p), tsk->comm);
0393     } else if (tk->size_shift == 0) {
0394         kfree(tk);
0395         return;
0396     }
0397 
0398     get_task_struct(tsk);
0399     tk->tsk = tsk;
0400     list_add_tail(&tk->nd, to_kill);
0401 }
0402 
0403 /*
0404  * Kill the processes that have been collected earlier.
0405  *
0406  * Only do anything when FORCEKILL is set, otherwise just free the
0407  * list (this is used for clean pages which do not need killing).
0408  * Also when FAIL is set do a force kill because something went
0409  * wrong earlier.
0410  */
0411 static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
0412         unsigned long pfn, int flags)
0413 {
0414     struct to_kill *tk, *next;
0415 
0416     list_for_each_entry_safe (tk, next, to_kill, nd) {
0417         if (forcekill) {
0418             /*
0419              * In case something went wrong with munmapping
0420              * make sure the process doesn't catch the
0421              * signal and then access the memory. Just kill it.
0422              */
0423             if (fail || tk->addr == -EFAULT) {
0424                 pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
0425                        pfn, tk->tsk->comm, tk->tsk->pid);
0426                 do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
0427                          tk->tsk, PIDTYPE_PID);
0428             }
0429 
0430             /*
0431              * In theory the process could have mapped
0432              * something else on the address in-between. We could
0433              * check for that, but we need to tell the
0434              * process anyways.
0435              */
0436             else if (kill_proc(tk, pfn, flags) < 0)
0437                 pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
0438                        pfn, tk->tsk->comm, tk->tsk->pid);
0439         }
0440         put_task_struct(tk->tsk);
0441         kfree(tk);
0442     }
0443 }
0444 
0445 /*
0446  * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
0447  * on behalf of the thread group. Return task_struct of the (first found)
0448  * dedicated thread if found, and return NULL otherwise.
0449  *
0450  * We already hold read_lock(&tasklist_lock) in the caller, so we don't
0451  * have to call rcu_read_lock/unlock() in this function.
0452  */
0453 static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
0454 {
0455     struct task_struct *t;
0456 
0457     for_each_thread(tsk, t) {
0458         if (t->flags & PF_MCE_PROCESS) {
0459             if (t->flags & PF_MCE_EARLY)
0460                 return t;
0461         } else {
0462             if (sysctl_memory_failure_early_kill)
0463                 return t;
0464         }
0465     }
0466     return NULL;
0467 }
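
/*
 * For illustration only: PF_MCE_PROCESS and PF_MCE_EARLY reflect the
 * per-thread policy set from userspace with prctl(2), while the
 * vm.memory_failure_early_kill sysctl provides the default consulted above.
 * A hedged sketch of a thread opting in to early kill:
 *
 *	#include <sys/prctl.h>
 *
 *	prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
 *
 * PR_MCE_KILL_LATE and PR_MCE_KILL_DEFAULT select the other policies, and
 * prctl(PR_MCE_KILL, PR_MCE_KILL_CLEAR, 0, 0, 0) reverts to the system-wide
 * default, i.e. the sysctl above.
 */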
0468 
0469 /*
0470  * Determine whether a given process is "early kill" process which expects
0471  * to be signaled when some page under the process is hwpoisoned.
0472  * Return task_struct of the dedicated thread (main thread unless explicitly
0473  * specified) if the process is "early kill" and otherwise returns NULL.
0474  *
0475  * Note that the above is true for the Action Optional case. For the Action
0476  * Required case, it's only meaningful to the current thread which needs to be
0477  * signaled with SIGBUS. The error is Action Optional for other non-current
0478  * processes sharing the same error page; if such a process is "early kill",
0479  * the task_struct of the dedicated thread will also be returned.
0480  */
0481 static struct task_struct *task_early_kill(struct task_struct *tsk,
0482                        int force_early)
0483 {
0484     if (!tsk->mm)
0485         return NULL;
0486     /*
0487      * Comparing ->mm here because current task might represent
0488      * a subthread, while tsk always points to the main thread.
0489      */
0490     if (force_early && tsk->mm == current->mm)
0491         return current;
0492 
0493     return find_early_kill_thread(tsk);
0494 }
0495 
0496 /*
0497  * Collect processes when the error hit an anonymous page.
0498  */
0499 static void collect_procs_anon(struct page *page, struct list_head *to_kill,
0500                 int force_early)
0501 {
0502     struct folio *folio = page_folio(page);
0503     struct vm_area_struct *vma;
0504     struct task_struct *tsk;
0505     struct anon_vma *av;
0506     pgoff_t pgoff;
0507 
0508     av = folio_lock_anon_vma_read(folio, NULL);
0509     if (av == NULL) /* Not actually mapped anymore */
0510         return;
0511 
0512     pgoff = page_to_pgoff(page);
0513     read_lock(&tasklist_lock);
0514     for_each_process (tsk) {
0515         struct anon_vma_chain *vmac;
0516         struct task_struct *t = task_early_kill(tsk, force_early);
0517 
0518         if (!t)
0519             continue;
0520         anon_vma_interval_tree_foreach(vmac, &av->rb_root,
0521                            pgoff, pgoff) {
0522             vma = vmac->vma;
0523             if (!page_mapped_in_vma(page, vma))
0524                 continue;
0525             if (vma->vm_mm == t->mm)
0526                 add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma,
0527                         to_kill);
0528         }
0529     }
0530     read_unlock(&tasklist_lock);
0531     page_unlock_anon_vma_read(av);
0532 }
0533 
0534 /*
0535  * Collect processes when the error hit a file mapped page.
0536  */
0537 static void collect_procs_file(struct page *page, struct list_head *to_kill,
0538                 int force_early)
0539 {
0540     struct vm_area_struct *vma;
0541     struct task_struct *tsk;
0542     struct address_space *mapping = page->mapping;
0543     pgoff_t pgoff;
0544 
0545     i_mmap_lock_read(mapping);
0546     read_lock(&tasklist_lock);
0547     pgoff = page_to_pgoff(page);
0548     for_each_process(tsk) {
0549         struct task_struct *t = task_early_kill(tsk, force_early);
0550 
0551         if (!t)
0552             continue;
0553         vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
0554                       pgoff) {
0555             /*
0556              * Send early kill signal to tasks where a vma covers
0557              * the page but the corrupted page is not necessarily
0558              * mapped in its pte.
0559              * Assume applications that requested early kill want
0560              * to be informed of all such data corruptions.
0561              */
0562             if (vma->vm_mm == t->mm)
0563                 add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma,
0564                         to_kill);
0565         }
0566     }
0567     read_unlock(&tasklist_lock);
0568     i_mmap_unlock_read(mapping);
0569 }
0570 
0571 #ifdef CONFIG_FS_DAX
0572 /*
0573  * Collect processes when the error hit a fsdax page.
0574  */
0575 static void collect_procs_fsdax(struct page *page,
0576         struct address_space *mapping, pgoff_t pgoff,
0577         struct list_head *to_kill)
0578 {
0579     struct vm_area_struct *vma;
0580     struct task_struct *tsk;
0581 
0582     i_mmap_lock_read(mapping);
0583     read_lock(&tasklist_lock);
0584     for_each_process(tsk) {
0585         struct task_struct *t = task_early_kill(tsk, true);
0586 
0587         if (!t)
0588             continue;
0589         vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
0590             if (vma->vm_mm == t->mm)
0591                 add_to_kill(t, page, pgoff, vma, to_kill);
0592         }
0593     }
0594     read_unlock(&tasklist_lock);
0595     i_mmap_unlock_read(mapping);
0596 }
0597 #endif /* CONFIG_FS_DAX */
0598 
0599 /*
0600  * Collect the processes who have the corrupted page mapped to kill.
0601  */
0602 static void collect_procs(struct page *page, struct list_head *tokill,
0603                 int force_early)
0604 {
0605     if (!page->mapping)
0606         return;
0607 
0608     if (PageAnon(page))
0609         collect_procs_anon(page, tokill, force_early);
0610     else
0611         collect_procs_file(page, tokill, force_early);
0612 }
0613 
0614 struct hwp_walk {
0615     struct to_kill tk;
0616     unsigned long pfn;
0617     int flags;
0618 };
0619 
0620 static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
0621 {
0622     tk->addr = addr;
0623     tk->size_shift = shift;
0624 }
0625 
0626 static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
0627                 unsigned long poisoned_pfn, struct to_kill *tk)
0628 {
0629     unsigned long pfn = 0;
0630 
0631     if (pte_present(pte)) {
0632         pfn = pte_pfn(pte);
0633     } else {
0634         swp_entry_t swp = pte_to_swp_entry(pte);
0635 
0636         if (is_hwpoison_entry(swp))
0637             pfn = hwpoison_entry_to_pfn(swp);
0638     }
0639 
0640     if (!pfn || pfn != poisoned_pfn)
0641         return 0;
0642 
0643     set_to_kill(tk, addr, shift);
0644     return 1;
0645 }
0646 
0647 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
0648 static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
0649                       struct hwp_walk *hwp)
0650 {
0651     pmd_t pmd = *pmdp;
0652     unsigned long pfn;
0653     unsigned long hwpoison_vaddr;
0654 
0655     if (!pmd_present(pmd))
0656         return 0;
0657     pfn = pmd_pfn(pmd);
0658     if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) {
0659         hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT);
0660         set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT);
0661         return 1;
0662     }
0663     return 0;
0664 }
0665 #else
0666 static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
0667                       struct hwp_walk *hwp)
0668 {
0669     return 0;
0670 }
0671 #endif
0672 
0673 static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
0674                   unsigned long end, struct mm_walk *walk)
0675 {
0676     struct hwp_walk *hwp = walk->private;
0677     int ret = 0;
0678     pte_t *ptep, *mapped_pte;
0679     spinlock_t *ptl;
0680 
0681     ptl = pmd_trans_huge_lock(pmdp, walk->vma);
0682     if (ptl) {
0683         ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp);
0684         spin_unlock(ptl);
0685         goto out;
0686     }
0687 
0688     if (pmd_trans_unstable(pmdp))
0689         goto out;
0690 
0691     mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp,
0692                         addr, &ptl);
0693     for (; addr != end; ptep++, addr += PAGE_SIZE) {
0694         ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT,
0695                          hwp->pfn, &hwp->tk);
0696         if (ret == 1)
0697             break;
0698     }
0699     pte_unmap_unlock(mapped_pte, ptl);
0700 out:
0701     cond_resched();
0702     return ret;
0703 }
0704 
0705 #ifdef CONFIG_HUGETLB_PAGE
0706 static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
0707                 unsigned long addr, unsigned long end,
0708                 struct mm_walk *walk)
0709 {
0710     struct hwp_walk *hwp = walk->private;
0711     pte_t pte = huge_ptep_get(ptep);
0712     struct hstate *h = hstate_vma(walk->vma);
0713 
0714     return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
0715                       hwp->pfn, &hwp->tk);
0716 }
0717 #else
0718 #define hwpoison_hugetlb_range  NULL
0719 #endif
0720 
0721 static const struct mm_walk_ops hwp_walk_ops = {
0722     .pmd_entry = hwpoison_pte_range,
0723     .hugetlb_entry = hwpoison_hugetlb_range,
0724 };
0725 
0726 /*
0727  * Sends SIGBUS to the current process with error info.
0728  *
0729  * This function is intended to handle "Action Required" MCEs on already
0730  * hardware poisoned pages. They could happen, for example, when
0731  * memory_failure() failed to unmap the error page at the first call, or
0732  * when multiple local machine checks happened on different CPUs.
0733  *
0734  * MCE handler currently has no easy access to the error virtual address,
0735  * so this function walks the page tables to find it. The returned virtual
0736  * address is correct in most cases, but it could be wrong when the application
0737  * process has multiple entries mapping the error page.
0738  */
0739 static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
0740                   int flags)
0741 {
0742     int ret;
0743     struct hwp_walk priv = {
0744         .pfn = pfn,
0745     };
0746     priv.tk.tsk = p;
0747 
0748     if (!p->mm)
0749         return -EFAULT;
0750 
0751     mmap_read_lock(p->mm);
0752     ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
0753                   (void *)&priv);
0754     if (ret == 1 && priv.tk.addr)
0755         kill_proc(&priv.tk, pfn, flags);
0756     else
0757         ret = 0;
0758     mmap_read_unlock(p->mm);
0759     return ret > 0 ? -EHWPOISON : -EFAULT;
0760 }
0761 
0762 static const char *action_name[] = {
0763     [MF_IGNORED] = "Ignored",
0764     [MF_FAILED] = "Failed",
0765     [MF_DELAYED] = "Delayed",
0766     [MF_RECOVERED] = "Recovered",
0767 };
0768 
0769 static const char * const action_page_types[] = {
0770     [MF_MSG_KERNEL]         = "reserved kernel page",
0771     [MF_MSG_KERNEL_HIGH_ORDER]  = "high-order kernel page",
0772     [MF_MSG_SLAB]           = "kernel slab page",
0773     [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
0774     [MF_MSG_HUGE]           = "huge page",
0775     [MF_MSG_FREE_HUGE]      = "free huge page",
0776     [MF_MSG_UNMAP_FAILED]       = "unmapping failed page",
0777     [MF_MSG_DIRTY_SWAPCACHE]    = "dirty swapcache page",
0778     [MF_MSG_CLEAN_SWAPCACHE]    = "clean swapcache page",
0779     [MF_MSG_DIRTY_MLOCKED_LRU]  = "dirty mlocked LRU page",
0780     [MF_MSG_CLEAN_MLOCKED_LRU]  = "clean mlocked LRU page",
0781     [MF_MSG_DIRTY_UNEVICTABLE_LRU]  = "dirty unevictable LRU page",
0782     [MF_MSG_CLEAN_UNEVICTABLE_LRU]  = "clean unevictable LRU page",
0783     [MF_MSG_DIRTY_LRU]      = "dirty LRU page",
0784     [MF_MSG_CLEAN_LRU]      = "clean LRU page",
0785     [MF_MSG_TRUNCATED_LRU]      = "already truncated LRU page",
0786     [MF_MSG_BUDDY]          = "free buddy page",
0787     [MF_MSG_DAX]            = "dax page",
0788     [MF_MSG_UNSPLIT_THP]        = "unsplit thp",
0789     [MF_MSG_UNKNOWN]        = "unknown page",
0790 };
0791 
0792 /*
0793  * XXX: It is possible that a page is isolated from LRU cache,
0794  * and then kept in swap cache or failed to remove from page cache.
0795  * The page count will stop it from being freed by unpoison.
0796  * Stress tests should be aware of this memory leak problem.
0797  */
0798 static int delete_from_lru_cache(struct page *p)
0799 {
0800     if (!isolate_lru_page(p)) {
0801         /*
0802          * Clear sensitive page flags, so that the buddy system won't
0803          * complain when the page is unpoison-and-freed.
0804          */
0805         ClearPageActive(p);
0806         ClearPageUnevictable(p);
0807 
0808         /*
0809          * Poisoned page might never drop its ref count to 0 so we have
0810          * to uncharge it manually from its memcg.
0811          */
0812         mem_cgroup_uncharge(page_folio(p));
0813 
0814         /*
0815          * drop the page count elevated by isolate_lru_page()
0816          */
0817         put_page(p);
0818         return 0;
0819     }
0820     return -EIO;
0821 }
0822 
0823 static int truncate_error_page(struct page *p, unsigned long pfn,
0824                 struct address_space *mapping)
0825 {
0826     int ret = MF_FAILED;
0827 
0828     if (mapping->a_ops->error_remove_page) {
0829         int err = mapping->a_ops->error_remove_page(mapping, p);
0830 
0831         if (err != 0) {
0832             pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
0833         } else if (page_has_private(p) &&
0834                !try_to_release_page(p, GFP_NOIO)) {
0835             pr_info("%#lx: failed to release buffers\n", pfn);
0836         } else {
0837             ret = MF_RECOVERED;
0838         }
0839     } else {
0840         /*
0841          * If the file system doesn't support it just invalidate
0842          * This fails on dirty or anything with private pages
0843          */
0844         if (invalidate_inode_page(p))
0845             ret = MF_RECOVERED;
0846         else
0847             pr_info("%#lx: Failed to invalidate\n", pfn);
0848     }
0849 
0850     return ret;
0851 }
0852 
0853 struct page_state {
0854     unsigned long mask;
0855     unsigned long res;
0856     enum mf_action_page_type type;
0857 
0858     /* Callback ->action() has to unlock the relevant page inside it. */
0859     int (*action)(struct page_state *ps, struct page *p);
0860 };
0861 
0862 /*
0863  * Return true if page is still referenced by others, otherwise return
0864  * false.
0865  *
0866  * The extra_pins is true when one extra refcount is expected.
0867  */
0868 static bool has_extra_refcount(struct page_state *ps, struct page *p,
0869                    bool extra_pins)
0870 {
0871     int count = page_count(p) - 1;
0872 
0873     if (extra_pins)
0874         count -= 1;
0875 
0876     if (count > 0) {
0877         pr_err("%#lx: %s still referenced by %d users\n",
0878                page_to_pfn(p), action_page_types[ps->type], count);
0879         return true;
0880     }
0881 
0882     return false;
0883 }
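
/*
 * A worked example of the arithmetic above (illustrative; counts depend on
 * the exact page state): memory_failure() itself holds one reference while it
 * runs, hence the unconditional "- 1". For a shmem page that is deliberately
 * kept in the page cache (extra_pins == true, see me_pagecache_clean()), one
 * further reference is expected and subtracted. Any count still remaining
 * must belong to other users, so the page is reported as still referenced and
 * the callers downgrade the recovery result to MF_FAILED.
 */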
0884 
0885 /*
0886  * Error hit kernel page.
0887  * Do nothing, try to be lucky and not touch this instead. For a few cases we
0888  * could be more sophisticated.
0889  */
0890 static int me_kernel(struct page_state *ps, struct page *p)
0891 {
0892     unlock_page(p);
0893     return MF_IGNORED;
0894 }
0895 
0896 /*
0897  * Page in unknown state. Do nothing.
0898  */
0899 static int me_unknown(struct page_state *ps, struct page *p)
0900 {
0901     pr_err("%#lx: Unknown page state\n", page_to_pfn(p));
0902     unlock_page(p);
0903     return MF_FAILED;
0904 }
0905 
0906 /*
0907  * Clean (or cleaned) page cache page.
0908  */
0909 static int me_pagecache_clean(struct page_state *ps, struct page *p)
0910 {
0911     int ret;
0912     struct address_space *mapping;
0913     bool extra_pins;
0914 
0915     delete_from_lru_cache(p);
0916 
0917     /*
0918      * For anonymous pages we're done; the only reference left
0919      * should be the one m_f() holds.
0920      */
0921     if (PageAnon(p)) {
0922         ret = MF_RECOVERED;
0923         goto out;
0924     }
0925 
0926     /*
0927      * Now truncate the page in the page cache. This is really
0928      * more like a "temporary hole punch".
0929      * Don't do this for block devices when someone else
0930      * has a reference, because it could be file system metadata
0931      * and that's not safe to truncate.
0932      */
0933     mapping = page_mapping(p);
0934     if (!mapping) {
0935         /*
0936          * Page has been torn down in the meantime
0937          */
0938         ret = MF_FAILED;
0939         goto out;
0940     }
0941 
0942     /*
0943      * The shmem page is kept in page cache instead of being truncated,
0944      * so it is expected to have an extra refcount after error-handling.
0945      */
0946     extra_pins = shmem_mapping(mapping);
0947 
0948     /*
0949      * Truncation is a bit tricky. Enable it per file system for now.
0950      *
0951      * Open: to take i_rwsem or not for this? Right now we don't.
0952      */
0953     ret = truncate_error_page(p, page_to_pfn(p), mapping);
0954     if (has_extra_refcount(ps, p, extra_pins))
0955         ret = MF_FAILED;
0956 
0957 out:
0958     unlock_page(p);
0959 
0960     return ret;
0961 }
0962 
0963 /*
0964  * Dirty pagecache page
0965  * Issues: when the error hit a hole page the error is not properly
0966  * propagated.
0967  */
0968 static int me_pagecache_dirty(struct page_state *ps, struct page *p)
0969 {
0970     struct address_space *mapping = page_mapping(p);
0971 
0972     SetPageError(p);
0973     /* TBD: print more information about the file. */
0974     if (mapping) {
0975         /*
0976          * IO error will be reported by write(), fsync(), etc.
0977          * who check the mapping.
0978          * This way the application knows that something went
0979          * wrong with its dirty file data.
0980          *
0981          * There's one open issue:
0982          *
0983          * The EIO will be only reported on the next IO
0984          * operation and then cleared through the IO map.
0985          * Normally Linux has two mechanisms to pass IO errors:
0986          * first through the AS_EIO flag in the address space
0987          * and then through the PageError flag in the page.
0988          * Since we drop pages on memory failure handling the
0989          * only mechanism open to use is through AS_EIO.
0990          *
0991          * This has the disadvantage that it gets cleared on
0992          * the first operation that returns an error, while
0993          * the PageError bit is more sticky and only cleared
0994          * when the page is reread or dropped.  If an
0995          * application assumes it will always get an error on
0996          * fsync, but does other operations on the fd before,
0997          * and the page is dropped in between, then the error
0998          * will not be properly reported.
0999          *
1000          * This can already happen even without hwpoisoned
1001          * pages: first on metadata IO errors (which only
1002          * report through AS_EIO) or when the page is dropped
1003          * at the wrong time.
1004          *
1005          * So right now we assume that the application DTRT on
1006          * the first EIO, but we're not worse than other parts
1007          * of the kernel.
1008          */
1009         mapping_set_error(mapping, -EIO);
1010     }
1011 
1012     return me_pagecache_clean(ps, p);
1013 }
1014 
1015 /*
1016  * Clean and dirty swap cache.
1017  *
1018  * Dirty swap cache page is tricky to handle. The page could live both in page
1019  * cache and swap cache (i.e. the page is freshly swapped in). So it could be
1020  * referenced concurrently by 2 types of PTEs:
1021  * normal PTEs and swap PTEs. We try to handle them consistently by calling
1022  * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
1023  * and then
1024  *      - clear dirty bit to prevent IO
1025  *      - remove from LRU
1026  *      - but keep in the swap cache, so that when we return to it on
1027  *        a later page fault, we know the application is accessing
1028  *        corrupted data and shall be killed (we installed simple
1029  *        interception code in do_swap_page to catch it).
1030  *
1031  * Clean swap cache pages can be directly isolated. A later page fault will
1032  * bring in the known good data from disk.
1033  */
1034 static int me_swapcache_dirty(struct page_state *ps, struct page *p)
1035 {
1036     int ret;
1037     bool extra_pins = false;
1038 
1039     ClearPageDirty(p);
1040     /* Trigger EIO in shmem: */
1041     ClearPageUptodate(p);
1042 
1043     ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
1044     unlock_page(p);
1045 
1046     if (ret == MF_DELAYED)
1047         extra_pins = true;
1048 
1049     if (has_extra_refcount(ps, p, extra_pins))
1050         ret = MF_FAILED;
1051 
1052     return ret;
1053 }
1054 
1055 static int me_swapcache_clean(struct page_state *ps, struct page *p)
1056 {
1057     struct folio *folio = page_folio(p);
1058     int ret;
1059 
1060     delete_from_swap_cache(folio);
1061 
1062     ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
1063     folio_unlock(folio);
1064 
1065     if (has_extra_refcount(ps, p, false))
1066         ret = MF_FAILED;
1067 
1068     return ret;
1069 }
1070 
1071 /*
1072  * Huge pages. Needs work.
1073  * Issues:
1074  * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
1075  *   To narrow down kill region to one page, we need to break up pmd.
1076  */
1077 static int me_huge_page(struct page_state *ps, struct page *p)
1078 {
1079     int res;
1080     struct page *hpage = compound_head(p);
1081     struct address_space *mapping;
1082 
1083     if (!PageHuge(hpage))
1084         return MF_DELAYED;
1085 
1086     mapping = page_mapping(hpage);
1087     if (mapping) {
1088         res = truncate_error_page(hpage, page_to_pfn(p), mapping);
1089         unlock_page(hpage);
1090     } else {
1091         unlock_page(hpage);
1092         /*
1093          * migration entry prevents later access on error hugepage,
1094          * so we can free and dissolve it into buddy to save healthy
1095          * subpages.
1096          */
1097         put_page(hpage);
1098         if (__page_handle_poison(p) >= 0) {
1099             page_ref_inc(p);
1100             res = MF_RECOVERED;
1101         } else {
1102             res = MF_FAILED;
1103         }
1104     }
1105 
1106     if (has_extra_refcount(ps, p, false))
1107         res = MF_FAILED;
1108 
1109     return res;
1110 }
1111 
1112 /*
1113  * Various page states we can handle.
1114  *
1115  * A page state is defined by its current page->flags bits.
1116  * The table matches them in order and calls the right handler.
1117  *
1118  * This is quite tricky because we can access a page at any time
1119  * in its life cycle, so all accesses have to be extremely careful.
1120  *
1121  * This is not complete. More states could be added.
1122  * For any missing state don't attempt recovery.
1123  */
1124 
1125 #define dirty       (1UL << PG_dirty)
1126 #define sc      ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
1127 #define unevict     (1UL << PG_unevictable)
1128 #define mlock       (1UL << PG_mlocked)
1129 #define lru     (1UL << PG_lru)
1130 #define head        (1UL << PG_head)
1131 #define slab        (1UL << PG_slab)
1132 #define reserved    (1UL << PG_reserved)
1133 
1134 static struct page_state error_states[] = {
1135     { reserved, reserved,   MF_MSG_KERNEL,  me_kernel },
1136     /*
1137      * free pages are specially detected outside this table:
1138      * PG_buddy pages only make a small fraction of all free pages.
1139      */
1140 
1141     /*
1142      * Could in theory check if slab page is free or if we can drop
1143      * currently unused objects without touching them. But just
1144      * treat it as standard kernel for now.
1145      */
1146     { slab,     slab,       MF_MSG_SLAB,    me_kernel },
1147 
1148     { head,     head,       MF_MSG_HUGE,        me_huge_page },
1149 
1150     { sc|dirty, sc|dirty,   MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
1151     { sc|dirty, sc,     MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
1152 
1153     { mlock|dirty,  mlock|dirty,    MF_MSG_DIRTY_MLOCKED_LRU,   me_pagecache_dirty },
1154     { mlock|dirty,  mlock,      MF_MSG_CLEAN_MLOCKED_LRU,   me_pagecache_clean },
1155 
1156     { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU,   me_pagecache_dirty },
1157     { unevict|dirty, unevict,   MF_MSG_CLEAN_UNEVICTABLE_LRU,   me_pagecache_clean },
1158 
1159     { lru|dirty,    lru|dirty,  MF_MSG_DIRTY_LRU,   me_pagecache_dirty },
1160     { lru|dirty,    lru,        MF_MSG_CLEAN_LRU,   me_pagecache_clean },
1161 
1162     /*
1163      * Catchall entry: must be at end.
1164      */
1165     { 0,        0,      MF_MSG_UNKNOWN, me_unknown },
1166 };
1167 
1168 #undef dirty
1169 #undef sc
1170 #undef unevict
1171 #undef mlock
1172 #undef lru
1173 #undef head
1174 #undef slab
1175 #undef reserved
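
/*
 * A worked example of how this table is consulted (see identify_page_state()
 * below, which picks the first entry where (page->flags & mask) == res): a
 * dirty anonymous LRU page has PG_lru and PG_dirty set but neither PG_slab,
 * PG_head, PG_swapcache, PG_mlocked nor PG_unevictable, so it falls through
 * the earlier entries and matches { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU,
 * me_pagecache_dirty }. The same page without PG_dirty matches the following
 * { lru|dirty, lru } entry instead, because the mask still tests PG_dirty
 * while res requires it to be clear.
 */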
1176 
1177 /*
1178  * "Dirty/Clean" indication is not 100% accurate due to the possibility of
1179  * setting PG_dirty outside page lock. See also comment above set_page_dirty().
1180  */
1181 static void action_result(unsigned long pfn, enum mf_action_page_type type,
1182               enum mf_result result)
1183 {
1184     trace_memory_failure_event(pfn, type, result);
1185 
1186     num_poisoned_pages_inc();
1187     pr_err("%#lx: recovery action for %s: %s\n",
1188         pfn, action_page_types[type], action_name[result]);
1189 }
1190 
1191 static int page_action(struct page_state *ps, struct page *p,
1192             unsigned long pfn)
1193 {
1194     int result;
1195 
1196     /* page p should be unlocked after returning from ps->action().  */
1197     result = ps->action(ps, p);
1198 
1199     action_result(pfn, ps->type, result);
1200 
1201     /* Could do more checks here if page looks ok */
1202     /*
1203      * Could adjust zone counters here to correct for the missing page.
1204      */
1205 
1206     return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
1207 }
1208 
1209 static inline bool PageHWPoisonTakenOff(struct page *page)
1210 {
1211     return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON;
1212 }
1213 
1214 void SetPageHWPoisonTakenOff(struct page *page)
1215 {
1216     set_page_private(page, MAGIC_HWPOISON);
1217 }
1218 
1219 void ClearPageHWPoisonTakenOff(struct page *page)
1220 {
1221     if (PageHWPoison(page))
1222         set_page_private(page, 0);
1223 }
1224 
1225 /*
1226  * Return true if the page type of a given page is supported by the hwpoison
1227  * mechanism (while handling could fail), otherwise false.  This function
1228  * does not return true for hugetlb or device memory pages, so it's assumed
1229  * to be called only in the context where we never have such pages.
1230  */
1231 static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
1232 {
1233     /* Soft offline could migrate non-LRU movable pages */
1234     if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
1235         return true;
1236 
1237     return PageLRU(page) || is_free_buddy_page(page);
1238 }
1239 
1240 static int __get_hwpoison_page(struct page *page, unsigned long flags)
1241 {
1242     struct page *head = compound_head(page);
1243     int ret = 0;
1244     bool hugetlb = false;
1245 
1246     ret = get_hwpoison_huge_page(head, &hugetlb);
1247     if (hugetlb)
1248         return ret;
1249 
1250     /*
1251      * This check prevents calling get_page_unless_zero()
1252      * for any unsupported type of page in order to reduce the risk of
1253      * unexpected races caused by taking a page refcount.
1254      */
1255     if (!HWPoisonHandlable(head, flags))
1256         return -EBUSY;
1257 
1258     if (get_page_unless_zero(head)) {
1259         if (head == compound_head(page))
1260             return 1;
1261 
1262         pr_info("%#lx cannot catch tail\n", page_to_pfn(page));
1263         put_page(head);
1264     }
1265 
1266     return 0;
1267 }
1268 
1269 static int get_any_page(struct page *p, unsigned long flags)
1270 {
1271     int ret = 0, pass = 0;
1272     bool count_increased = false;
1273 
1274     if (flags & MF_COUNT_INCREASED)
1275         count_increased = true;
1276 
1277 try_again:
1278     if (!count_increased) {
1279         ret = __get_hwpoison_page(p, flags);
1280         if (!ret) {
1281             if (page_count(p)) {
1282                 /* We raced with an allocation, retry. */
1283                 if (pass++ < 3)
1284                     goto try_again;
1285                 ret = -EBUSY;
1286             } else if (!PageHuge(p) && !is_free_buddy_page(p)) {
1287                 /* We raced with put_page, retry. */
1288                 if (pass++ < 3)
1289                     goto try_again;
1290                 ret = -EIO;
1291             }
1292             goto out;
1293         } else if (ret == -EBUSY) {
1294             /*
1295              * We raced with (possibly temporary) unhandlable
1296              * page, retry.
1297              */
1298             if (pass++ < 3) {
1299                 shake_page(p);
1300                 goto try_again;
1301             }
1302             ret = -EIO;
1303             goto out;
1304         }
1305     }
1306 
1307     if (PageHuge(p) || HWPoisonHandlable(p, flags)) {
1308         ret = 1;
1309     } else {
1310         /*
1311          * A page we cannot handle. Check whether we can turn
1312          * it into something we can handle.
1313          */
1314         if (pass++ < 3) {
1315             put_page(p);
1316             shake_page(p);
1317             count_increased = false;
1318             goto try_again;
1319         }
1320         put_page(p);
1321         ret = -EIO;
1322     }
1323 out:
1324     if (ret == -EIO)
1325         pr_err("%#lx: unhandlable page.\n", page_to_pfn(p));
1326 
1327     return ret;
1328 }
1329 
1330 static int __get_unpoison_page(struct page *page)
1331 {
1332     struct page *head = compound_head(page);
1333     int ret = 0;
1334     bool hugetlb = false;
1335 
1336     ret = get_hwpoison_huge_page(head, &hugetlb);
1337     if (hugetlb)
1338         return ret;
1339 
1340     /*
1341      * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison,
1342      * but also isolated from the buddy freelist, so we need to identify
1343      * the state and cancel both operations to unpoison.
1344      */
1345     if (PageHWPoisonTakenOff(page))
1346         return -EHWPOISON;
1347 
1348     return get_page_unless_zero(page) ? 1 : 0;
1349 }
1350 
1351 /**
1352  * get_hwpoison_page() - Get refcount for memory error handling
1353  * @p:      Raw error page (hit by memory error)
1354  * @flags:  Flags controlling behavior of error handling
1355  *
1356  * get_hwpoison_page() takes a page refcount of an error page to handle memory
1357  * error on it, after checking that the error page is in a well-defined state
1358  * (defined as a page type on which we can successfully handle the memory
1359  * error, such as an LRU page or a hugetlb page).
1360  *
1361  * Memory error handling could be triggered at any time on any type of page,
1362  * so it's prone to race with typical memory management lifecycle (like
1363  * allocation and free).  So to avoid such races, get_hwpoison_page() takes
1364  * extra care for the error page's state (as done in __get_hwpoison_page()),
1365  * and has some retry logic in get_any_page().
1366  *
1367  * When called from unpoison_memory(), the caller should already ensure that
1368  * the given page has PG_hwpoison. So it's never reused for other page
1369  * allocations, and __get_unpoison_page() never races with them.
1370  *
1371  * Return: 0 on failure,
1372  *         1 on success for in-use pages in a well-defined state,
1373  *         -EIO for pages on which we can not handle memory errors,
1374  *         -EBUSY when get_hwpoison_page() has raced with page lifecycle
1375  *         operations like allocation and free,
1376  *         -EHWPOISON when the page is hwpoisoned and taken off from buddy.
1377  */
1378 static int get_hwpoison_page(struct page *p, unsigned long flags)
1379 {
1380     int ret;
1381 
1382     zone_pcp_disable(page_zone(p));
1383     if (flags & MF_UNPOISON)
1384         ret = __get_unpoison_page(p);
1385     else
1386         ret = get_any_page(p, flags);
1387     zone_pcp_enable(page_zone(p));
1388 
1389     return ret;
1390 }
1391 
1392 /*
1393  * Do all that is necessary to remove user space mappings. Unmap
1394  * the pages and send SIGBUS to the processes if the data was dirty.
1395  */
1396 static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
1397                   int flags, struct page *hpage)
1398 {
1399     struct folio *folio = page_folio(hpage);
1400     enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC;
1401     struct address_space *mapping;
1402     LIST_HEAD(tokill);
1403     bool unmap_success;
1404     int kill = 1, forcekill;
1405     bool mlocked = PageMlocked(hpage);
1406 
1407     /*
1408      * Here we are interested only in user-mapped pages, so skip any
1409      * other types of pages.
1410      */
1411     if (PageReserved(p) || PageSlab(p))
1412         return true;
1413     if (!(PageLRU(hpage) || PageHuge(p)))
1414         return true;
1415 
1416     /*
1417      * This check implies we don't kill processes if their pages
1418      * are in the swap cache early. Those are always late kills.
1419      */
1420     if (!page_mapped(hpage))
1421         return true;
1422 
1423     if (PageKsm(p)) {
1424         pr_err("%#lx: can't handle KSM pages.\n", pfn);
1425         return false;
1426     }
1427 
1428     if (PageSwapCache(p)) {
1429         pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
1430         ttu |= TTU_IGNORE_HWPOISON;
1431     }
1432 
1433     /*
1434      * Propagate the dirty bit from PTEs to struct page first, because we
1435      * need this to decide if we should kill or just drop the page.
1436      * XXX: the dirty test could be racy: set_page_dirty() may not always
1437      * be called inside page lock (it's recommended but not enforced).
1438      */
1439     mapping = page_mapping(hpage);
1440     if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
1441         mapping_can_writeback(mapping)) {
1442         if (page_mkclean(hpage)) {
1443             SetPageDirty(hpage);
1444         } else {
1445             kill = 0;
1446             ttu |= TTU_IGNORE_HWPOISON;
1447             pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
1448                 pfn);
1449         }
1450     }
1451 
1452     /*
1453      * First collect all the processes that have the page
1454      * mapped in dirty form.  This has to be done before try_to_unmap,
1455      * because ttu takes the rmap data structures down.
1456      *
1457      * Error handling: We ignore errors here because
1458      * there's nothing that can be done.
1459      */
1460     if (kill)
1461         collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
1462 
1463     if (PageHuge(hpage) && !PageAnon(hpage)) {
1464         /*
1465          * For hugetlb pages in shared mappings, try_to_unmap
1466          * could potentially call huge_pmd_unshare.  Because of
1467          * this, take semaphore in write mode here and set
1468          * TTU_RMAP_LOCKED to indicate we have taken the lock
1469          * at this higher level.
1470          */
1471         mapping = hugetlb_page_mapping_lock_write(hpage);
1472         if (mapping) {
1473             try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
1474             i_mmap_unlock_write(mapping);
1475         } else
1476             pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn);
1477     } else {
1478         try_to_unmap(folio, ttu);
1479     }
1480 
1481     unmap_success = !page_mapped(hpage);
1482     if (!unmap_success)
1483         pr_err("%#lx: failed to unmap page (mapcount=%d)\n",
1484                pfn, page_mapcount(hpage));
1485 
1486     /*
1487      * try_to_unmap() might put mlocked page in lru cache, so call
1488      * shake_page() again to ensure that it's flushed.
1489      */
1490     if (mlocked)
1491         shake_page(hpage);
1492 
1493     /*
1494      * Now that the dirty bit has been propagated to the
1495      * struct page and all unmaps done we can decide if
1496      * killing is needed or not.  Only kill when the page
1497      * was dirty or the process is not restartable,
1498      * otherwise the tokill list is merely
1499      * freed.  When there was a problem unmapping earlier
1500      * use a more forceful uncatchable kill to prevent
1501      * any accesses to the poisoned memory.
1502      */
1503     forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
1504     kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
1505 
1506     return unmap_success;
1507 }
1508 
1509 static int identify_page_state(unsigned long pfn, struct page *p,
1510                 unsigned long page_flags)
1511 {
1512     struct page_state *ps;
1513 
1514     /*
1515      * The first check uses the current page flags which may not have any
1516      * relevant information. The second check with the saved page flags is
1517      * carried out only if the first check can't determine the page status.
1518      */
1519     for (ps = error_states;; ps++)
1520         if ((p->flags & ps->mask) == ps->res)
1521             break;
1522 
1523     page_flags |= (p->flags & (1UL << PG_dirty));
1524 
1525     if (!ps->mask)
1526         for (ps = error_states;; ps++)
1527             if ((page_flags & ps->mask) == ps->res)
1528                 break;
1529     return page_action(ps, p, pfn);
1530 }
1531 
1532 static int try_to_split_thp_page(struct page *page, const char *msg)
1533 {
1534     lock_page(page);
1535     if (unlikely(split_huge_page(page))) {
1536         unsigned long pfn = page_to_pfn(page);
1537 
1538         unlock_page(page);
1539         pr_info("%s: %#lx: thp split failed\n", msg, pfn);
1540         put_page(page);
1541         return -EBUSY;
1542     }
1543     unlock_page(page);
1544 
1545     return 0;
1546 }
1547 
1548 static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
1549         struct address_space *mapping, pgoff_t index, int flags)
1550 {
1551     struct to_kill *tk;
1552     unsigned long size = 0;
1553 
1554     list_for_each_entry(tk, to_kill, nd)
1555         if (tk->size_shift)
1556             size = max(size, 1UL << tk->size_shift);
1557 
1558     if (size) {
1559         /*
1560          * Unmap the largest mapping to avoid breaking up device-dax
1561          * mappings which are constant size. The actual size of the
1562          * mapping being torn down is communicated in siginfo, see
1563          * kill_proc()
1564          */
1565         loff_t start = (index << PAGE_SHIFT) & ~(size - 1);
1566 
1567         unmap_mapping_range(mapping, start, size, 0);
1568     }
1569 
1570     kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
1571 }
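
/*
 * A worked example of the rounding above (illustrative numbers): with 4K
 * pages, index == 0x201 corresponds to byte offset 0x201000. If the largest
 * mapping collected was a 2MB PMD mapping (size_shift == 21, so size ==
 * 0x200000), start is masked down to 0x200000, and the whole constant-size
 * device-dax mapping is torn down in one go instead of being split by a
 * partial unmap.
 */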
1572 
1573 static int mf_generic_kill_procs(unsigned long long pfn, int flags,
1574         struct dev_pagemap *pgmap)
1575 {
1576     struct page *page = pfn_to_page(pfn);
1577     LIST_HEAD(to_kill);
1578     dax_entry_t cookie;
1579     int rc = 0;
1580 
1581     /*
1582      * Pages instantiated by device-dax (not filesystem-dax)
1583      * may be compound pages.
1584      */
1585     page = compound_head(page);
1586 
1587     /*
1588      * Prevent the inode from being freed while we are interrogating
1589      * the address_space, typically this would be handled by
1590      * lock_page(), but dax pages do not use the page lock. This
1591      * also prevents changes to the mapping of this pfn until
1592      * poison signaling is complete.
1593      */
1594     cookie = dax_lock_page(page);
1595     if (!cookie)
1596         return -EBUSY;
1597 
1598     if (hwpoison_filter(page)) {
1599         rc = -EOPNOTSUPP;
1600         goto unlock;
1601     }
1602 
1603     switch (pgmap->type) {
1604     case MEMORY_DEVICE_PRIVATE:
1605     case MEMORY_DEVICE_COHERENT:
1606         /*
1607          * TODO: Handle device pages which may need coordination
1608          * with device-side memory.
1609          */
1610         rc = -ENXIO;
1611         goto unlock;
1612     default:
1613         break;
1614     }
1615 
1616     /*
1617      * Use this flag as an indication that the dax page has been
1618      * remapped UC to prevent speculative consumption of poison.
1619      */
1620     SetPageHWPoison(page);
1621 
1622     /*
1623      * Unlike System-RAM there is no possibility to swap in a
1624      * different physical page at a given virtual address, so all
1625      * userspace consumption of ZONE_DEVICE memory necessitates
1626      * SIGBUS (i.e. MF_MUST_KILL)
1627      */
1628     flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
1629     collect_procs(page, &to_kill, true);
1630 
1631     unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags);
1632 unlock:
1633     dax_unlock_page(page, cookie);
1634     return rc;
1635 }
1636 
1637 #ifdef CONFIG_FS_DAX
1638 /**
1639  * mf_dax_kill_procs - Collect and kill processes who are using this file range
1640  * @mapping:    address_space of the file in use
1641  * @index:  start pgoff of the range within the file
1642  * @count:  length of the range, in units of PAGE_SIZE
1643  * @mf_flags:   memory failure flags
1644  */
1645 int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
1646         unsigned long count, int mf_flags)
1647 {
1648     LIST_HEAD(to_kill);
1649     dax_entry_t cookie;
1650     struct page *page;
1651     size_t end = index + count;
1652 
1653     mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
1654 
1655     for (; index < end; index++) {
1656         page = NULL;
1657         cookie = dax_lock_mapping_entry(mapping, index, &page);
1658         if (!cookie)
1659             return -EBUSY;
1660         if (!page)
1661             goto unlock;
1662 
1663         SetPageHWPoison(page);
1664 
1665         collect_procs_fsdax(page, mapping, index, &to_kill);
1666         unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
1667                 index, mf_flags);
1668 unlock:
1669         dax_unlock_mapping_entry(mapping, index, cookie);
1670     }
1671     return 0;
1672 }
1673 EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
1674 #endif /* CONFIG_FS_DAX */
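/*
 * Editorial sketch (not part of the source): mf_dax_kill_procs() is meant
 * to be reached from a filesystem's memory-failure notification path once
 * the failed device range has been translated into a file range.  The
 * wrapper below is hypothetical; mapping/pgoff/pgcnt are assumed to come
 * from the filesystem's own reverse mapping:
 *
 *	static int example_notify_file_range(struct address_space *mapping,
 *					     pgoff_t pgoff, unsigned long pgcnt,
 *					     int mf_flags)
 *	{
 *		// Signal/kill every process mapping the failed file range.
 *		return mf_dax_kill_procs(mapping, pgoff, pgcnt, mf_flags);
 *	}
 */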
1675 
1676 #ifdef CONFIG_HUGETLB_PAGE
1677 /*
1678  * Struct raw_hwp_page represents information about a "raw error page",
1679  * forming a singly linked list rooted at the ->private field of the
1680  * SUBPAGE_INDEX_HWPOISON-th tail page.
1681  */
1682 struct raw_hwp_page {
1683     struct llist_node node;
1684     struct page *page;
1685 };
1686 
1687 static inline struct llist_head *raw_hwp_list_head(struct page *hpage)
1688 {
1689     return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON);
1690 }
1691 
1692 static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag)
1693 {
1694     struct llist_head *head;
1695     struct llist_node *t, *tnode;
1696     unsigned long count = 0;
1697 
1698     head = raw_hwp_list_head(hpage);
1699     llist_for_each_safe(tnode, t, head->first) {
1700         struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
1701 
1702         if (move_flag)
1703             SetPageHWPoison(p->page);
1704         kfree(p);
1705         count++;
1706     }
1707     llist_del_all(head);
1708     return count;
1709 }
1710 
1711 static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
1712 {
1713     struct llist_head *head;
1714     struct raw_hwp_page *raw_hwp;
1715     struct llist_node *t, *tnode;
1716     int ret = TestSetPageHWPoison(hpage) ? -EHWPOISON : 0;
1717 
1718     /*
1719      * Once the hwpoisoned hugepage has lost its reliable raw error info,
1720      * there is little point in tracking additional errors precisely,
1721      * so skip adding further raw error entries.
1722      */
1723     if (HPageRawHwpUnreliable(hpage))
1724         return -EHWPOISON;
1725     head = raw_hwp_list_head(hpage);
1726     llist_for_each_safe(tnode, t, head->first) {
1727         struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
1728 
1729         if (p->page == page)
1730             return -EHWPOISON;
1731     }
1732 
1733     raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
1734     if (raw_hwp) {
1735         raw_hwp->page = page;
1736         llist_add(&raw_hwp->node, head);
1737         /* the first error event will be counted in action_result(). */
1738         if (ret)
1739             num_poisoned_pages_inc();
1740     } else {
1741         /*
1742          * Failed to save raw error info.  We no longer trace all
1743          * hwpoisoned subpages, and we must refuse to free/dissolve
1744          * this hwpoisoned hugepage.
1745          */
1746         SetHPageRawHwpUnreliable(hpage);
1747         /*
1748          * Once HPageRawHwpUnreliable is set, raw_hwp_page is not
1749          * used any more, so free it.
1750          */
1751         __free_raw_hwp_pages(hpage, false);
1752     }
1753     return ret;
1754 }
1755 
1756 static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag)
1757 {
1758     /*
1759      * HPageVmemmapOptimized hugepages can't be freed because struct
1760      * pages for tail pages are required but they don't exist.
1761      */
1762     if (move_flag && HPageVmemmapOptimized(hpage))
1763         return 0;
1764 
1765     /*
1766      * HPageRawHwpUnreliable hugepages shouldn't be unpoisoned by
1767      * definition.
1768      */
1769     if (HPageRawHwpUnreliable(hpage))
1770         return 0;
1771 
1772     return __free_raw_hwp_pages(hpage, move_flag);
1773 }
1774 
1775 void hugetlb_clear_page_hwpoison(struct page *hpage)
1776 {
1777     if (HPageRawHwpUnreliable(hpage))
1778         return;
1779     ClearPageHWPoison(hpage);
1780     free_raw_hwp_pages(hpage, true);
1781 }
1782 
1783 /*
1784  * Called from hugetlb code with hugetlb_lock held.
1785  *
1786  * Return values:
1787  *   0             - free hugepage
1788  *   1             - in-use hugepage
1789  *   2             - not a hugepage
1790  *   -EBUSY        - the hugepage is busy (try to retry)
1791  *   -EHWPOISON    - the hugepage is already hwpoisoned
1792  */
1793 int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
1794 {
1795     struct page *page = pfn_to_page(pfn);
1796     struct page *head = compound_head(page);
1797     int ret = 2;    /* fallback to normal page handling */
1798     bool count_increased = false;
1799 
1800     if (!PageHeadHuge(head))
1801         goto out;
1802 
1803     if (flags & MF_COUNT_INCREASED) {
1804         ret = 1;
1805         count_increased = true;
1806     } else if (HPageFreed(head)) {
1807         ret = 0;
1808     } else if (HPageMigratable(head)) {
1809         ret = get_page_unless_zero(head);
1810         if (ret)
1811             count_increased = true;
1812     } else {
1813         ret = -EBUSY;
1814         if (!(flags & MF_NO_RETRY))
1815             goto out;
1816     }
1817 
1818     if (hugetlb_set_page_hwpoison(head, page)) {
1819         ret = -EHWPOISON;
1820         goto out;
1821     }
1822 
1823     return ret;
1824 out:
1825     if (count_increased)
1826         put_page(head);
1827     return ret;
1828 }
1829 
1830 /*
1831  * Taking a refcount on a hugetlb page needs extra care about races with
1832  * basic operations like hugepage allocation/free/demotion.
1833  * So some of the prechecks for hwpoison (pinning, and testing/setting
1834  * PageHWPoison) should be done within a single hugetlb_lock section.
1835  */
1836 static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
1837 {
1838     int res;
1839     struct page *p = pfn_to_page(pfn);
1840     struct page *head;
1841     unsigned long page_flags;
1842 
1843     *hugetlb = 1;
1844 retry:
1845     res = get_huge_page_for_hwpoison(pfn, flags);
1846     if (res == 2) { /* fallback to normal page handling */
1847         *hugetlb = 0;
1848         return 0;
1849     } else if (res == -EHWPOISON) {
1850         pr_err("%#lx: already hardware poisoned\n", pfn);
1851         if (flags & MF_ACTION_REQUIRED) {
1852             head = compound_head(p);
1853             res = kill_accessing_process(current, page_to_pfn(head), flags);
1854         }
1855         return res;
1856     } else if (res == -EBUSY) {
1857         if (!(flags & MF_NO_RETRY)) {
1858             flags |= MF_NO_RETRY;
1859             goto retry;
1860         }
1861         action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
1862         return res;
1863     }
1864 
1865     head = compound_head(p);
1866     lock_page(head);
1867 
1868     if (hwpoison_filter(p)) {
1869         hugetlb_clear_page_hwpoison(head);
1870         res = -EOPNOTSUPP;
1871         goto out;
1872     }
1873 
1874     /*
1875      * Handle a free hugepage.  The possible race with hugepage allocation
1876      * or demotion is prevented by the PageHWPoison flag.
1877      */
1878     if (res == 0) {
1879         unlock_page(head);
1880         if (__page_handle_poison(p) >= 0) {
1881             page_ref_inc(p);
1882             res = MF_RECOVERED;
1883         } else {
1884             res = MF_FAILED;
1885         }
1886         action_result(pfn, MF_MSG_FREE_HUGE, res);
1887         return res == MF_RECOVERED ? 0 : -EBUSY;
1888     }
1889 
1890     page_flags = head->flags;
1891 
1892     if (!hwpoison_user_mappings(p, pfn, flags, head)) {
1893         action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
1894         res = -EBUSY;
1895         goto out;
1896     }
1897 
1898     return identify_page_state(pfn, p, page_flags);
1899 out:
1900     unlock_page(head);
1901     return res;
1902 }
1903 
1904 #else
1905 static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
1906 {
1907     return 0;
1908 }
1909 
1910 static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag)
1911 {
1912     return 0;
1913 }
1914 #endif  /* CONFIG_HUGETLB_PAGE */
1915 
1916 static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
1917         struct dev_pagemap *pgmap)
1918 {
1919     struct page *page = pfn_to_page(pfn);
1920     int rc = -ENXIO;
1921 
1922     if (flags & MF_COUNT_INCREASED)
1923         /*
1924          * Drop the extra refcount in case we come from madvise().
1925          */
1926         put_page(page);
1927 
1928     /* device metadata space is not recoverable */
1929     if (!pgmap_pfn_valid(pgmap, pfn))
1930         goto out;
1931 
1932     /*
1933      * Call driver's implementation to handle the memory failure, otherwise
1934      * fall back to generic handler.
1935      */
1936     if (pgmap_has_memory_failure(pgmap)) {
1937         rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
1938         /*
1939          * Fall back to generic handler too if operation is not
1940          * supported inside the driver/device/filesystem.
1941          */
1942         if (rc != -EOPNOTSUPP)
1943             goto out;
1944     }
1945 
1946     rc = mf_generic_kill_procs(pfn, flags, pgmap);
1947 out:
1948     /* drop pgmap ref acquired in caller */
1949     put_dev_pagemap(pgmap);
1950     action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
1951     return rc;
1952 }
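/*
 * Editorial sketch (assumption, not part of the source): the driver hook
 * consulted above is the ->memory_failure() member of dev_pagemap_ops,
 * matching the call pgmap->ops->memory_failure(pgmap, pfn, 1, flags).
 * A device driver with no better recovery strategy could simply return
 * -EOPNOTSUPP so that memory_failure_dev_pagemap() falls back to
 * mf_generic_kill_procs():
 *
 *	static int example_pgmap_memory_failure(struct dev_pagemap *pgmap,
 *			unsigned long pfn, unsigned long nr_pages, int mf_flags)
 *	{
 *		// No device-specific handling; use the generic fallback.
 *		return -EOPNOTSUPP;
 *	}
 *
 *	static const struct dev_pagemap_ops example_pgmap_ops = {
 *		.memory_failure = example_pgmap_memory_failure,
 *	};
 */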
1953 
1954 static DEFINE_MUTEX(mf_mutex);
1955 
1956 /**
1957  * memory_failure - Handle memory failure of a page.
1958  * @pfn: Page Number of the corrupted page
1959  * @flags: fine tune action taken
1960  *
1961  * This function is called by the low level machine check code
1962  * of an architecture when it detects hardware memory corruption
1963  * of a page. It tries its best to recover, which includes
1964  * dropping pages, killing processes etc.
1965  *
1966  * The function is primarily of use for corruptions that
1967  * happen outside the current execution context (e.g. when
1968  * detected by a background scrubber).
1969  *
1970  * Must run in process context (e.g. a work queue) with interrupts
1971  * enabled and no spinlocks held.
1972  *
1973  * Return: 0 if the memory error was successfully handled,
1974  *         -EOPNOTSUPP if hwpoison_filter() filtered the error event,
1975  *         < 0 (except -EOPNOTSUPP) on failure.
1976  */
1977 int memory_failure(unsigned long pfn, int flags)
1978 {
1979     struct page *p;
1980     struct page *hpage;
1981     struct dev_pagemap *pgmap;
1982     int res = 0;
1983     unsigned long page_flags;
1984     bool retry = true;
1985     int hugetlb = 0;
1986 
1987     if (!sysctl_memory_failure_recovery)
1988         panic("Memory failure on page %lx", pfn);
1989 
1990     mutex_lock(&mf_mutex);
1991 
1992     if (!(flags & MF_SW_SIMULATED))
1993         hw_memory_failure = true;
1994 
1995     p = pfn_to_online_page(pfn);
1996     if (!p) {
1997         res = arch_memory_failure(pfn, flags);
1998         if (res == 0)
1999             goto unlock_mutex;
2000 
2001         if (pfn_valid(pfn)) {
2002             pgmap = get_dev_pagemap(pfn, NULL);
2003             if (pgmap) {
2004                 res = memory_failure_dev_pagemap(pfn, flags,
2005                                  pgmap);
2006                 goto unlock_mutex;
2007             }
2008         }
2009         pr_err("%#lx: memory outside kernel control\n", pfn);
2010         res = -ENXIO;
2011         goto unlock_mutex;
2012     }
2013 
2014 try_again:
2015     res = try_memory_failure_hugetlb(pfn, flags, &hugetlb);
2016     if (hugetlb)
2017         goto unlock_mutex;
2018 
2019     if (TestSetPageHWPoison(p)) {
2020         pr_err("%#lx: already hardware poisoned\n", pfn);
2021         res = -EHWPOISON;
2022         if (flags & MF_ACTION_REQUIRED)
2023             res = kill_accessing_process(current, pfn, flags);
2024         if (flags & MF_COUNT_INCREASED)
2025             put_page(p);
2026         goto unlock_mutex;
2027     }
2028 
2029     hpage = compound_head(p);
2030 
2031     /*
2032      * There is nothing we need to, or can, do about count=0 pages.
2033      * 1) It's a free page, and therefore in safe hands:
2034      *    prep_new_page() will be the gatekeeper.
2035      * 2) It's part of a non-compound high-order page.
2036      *    That implies some kernel user: we cannot stop them from
2037      *    reading/writing the page; let's hope that the page has been
2038      *    used and will be freed some time later.
2039      * In fact it's dangerous to directly bump up the page count from 0,
2040      * as that may cause a page_ref_freeze()/page_ref_unfreeze() mismatch.
2041      */
2042     if (!(flags & MF_COUNT_INCREASED)) {
2043         res = get_hwpoison_page(p, flags);
2044         if (!res) {
2045             if (is_free_buddy_page(p)) {
2046                 if (take_page_off_buddy(p)) {
2047                     page_ref_inc(p);
2048                     res = MF_RECOVERED;
2049                 } else {
2050                     /* We lost the race, try again */
2051                     if (retry) {
2052                         ClearPageHWPoison(p);
2053                         retry = false;
2054                         goto try_again;
2055                     }
2056                     res = MF_FAILED;
2057                 }
2058                 action_result(pfn, MF_MSG_BUDDY, res);
2059                 res = res == MF_RECOVERED ? 0 : -EBUSY;
2060             } else {
2061                 action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
2062                 res = -EBUSY;
2063             }
2064             goto unlock_mutex;
2065         } else if (res < 0) {
2066             action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
2067             res = -EBUSY;
2068             goto unlock_mutex;
2069         }
2070     }
2071 
2072     if (PageTransHuge(hpage)) {
2073         /*
2074          * The flag must be set after the refcount is bumped,
2075          * otherwise it may race with THP split.
2076          * The flag can't be set in get_hwpoison_page() since
2077          * it is also called by soft offline, and only for
2078          * !MF_COUNT_INCREASED.  So here seems to be the best
2079          * place.
2080          *
2081          * We don't need to care about the above error handling paths
2082          * for get_hwpoison_page() since they handle either a free page
2083          * or an unhandlable page.  The refcount is bumped iff the
2084          * page is a valid handlable page.
2085          */
2086         SetPageHasHWPoisoned(hpage);
2087         if (try_to_split_thp_page(p, "Memory Failure") < 0) {
2088             action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
2089             res = -EBUSY;
2090             goto unlock_mutex;
2091         }
2092         VM_BUG_ON_PAGE(!page_count(p), p);
2093     }
2094 
2095     /*
2096      * We ignore non-LRU pages for good reasons.
2097      * - PG_locked is only well defined for LRU pages and a few others
2098      * - to avoid races with __SetPageLocked()
2099      * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
2100      * The check (unnecessarily) ignores LRU pages being isolated and
2101      * walked by the page reclaim code, however that's not a big loss.
2102      */
2103     shake_page(p);
2104 
2105     lock_page(p);
2106 
2107     /*
2108      * We only intend to deal with non-compound pages here.
2109      * However, the page could have become a compound page due to
2110      * a race window. If this happens, we can try again to hopefully
2111      * handle the page in the next round.
2112      */
2113     if (PageCompound(p)) {
2114         if (retry) {
2115             ClearPageHWPoison(p);
2116             unlock_page(p);
2117             put_page(p);
2118             flags &= ~MF_COUNT_INCREASED;
2119             retry = false;
2120             goto try_again;
2121         }
2122         action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
2123         res = -EBUSY;
2124         goto unlock_page;
2125     }
2126 
2127     /*
2128      * We use page flags to determine what action should be taken, but
2129      * the flags can be modified by the error containment action.  One
2130      * example is an mlocked page, where PG_mlocked is cleared by
2131      * page_remove_rmap() in try_to_unmap_one(). So to determine page status
2132      * correctly, we save a copy of the page flags at this time.
2133      */
2134     page_flags = p->flags;
2135 
2136     if (hwpoison_filter(p)) {
2137         TestClearPageHWPoison(p);
2138         unlock_page(p);
2139         put_page(p);
2140         res = -EOPNOTSUPP;
2141         goto unlock_mutex;
2142     }
2143 
2144     /*
2145      * __munlock_pagevec may clear a writeback page's LRU flag without
2146      * the page lock. We need to wait for writeback completion on this
2147      * page, or it may trigger a vfs BUG while evicting the inode.
2148      */
2149     if (!PageLRU(p) && !PageWriteback(p))
2150         goto identify_page_state;
2151 
2152     /*
2153      * It's very difficult to mess with pages currently under IO
2154      * and in many cases impossible, so we just avoid it here.
2155      */
2156     wait_on_page_writeback(p);
2157 
2158     /*
2159      * Now take care of user space mappings.
2160      * Abort on fail: __filemap_remove_folio() assumes unmapped page.
2161      */
2162     if (!hwpoison_user_mappings(p, pfn, flags, p)) {
2163         action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
2164         res = -EBUSY;
2165         goto unlock_page;
2166     }
2167 
2168     /*
2169      * Torn down by someone else?
2170      */
2171     if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
2172         action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
2173         res = -EBUSY;
2174         goto unlock_page;
2175     }
2176 
2177 identify_page_state:
2178     res = identify_page_state(pfn, p, page_flags);
2179     mutex_unlock(&mf_mutex);
2180     return res;
2181 unlock_page:
2182     unlock_page(p);
2183 unlock_mutex:
2184     mutex_unlock(&mf_mutex);
2185     return res;
2186 }
2187 EXPORT_SYMBOL_GPL(memory_failure);
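/*
 * Editorial example (not part of the source): memory_failure() is normally
 * reached from machine-check or platform error handling code, but a
 * privileged user-space test can exercise it on one of its own pages via
 * madvise(MADV_HWPOISON), which injects a software-simulated poison
 * (typically MF_COUNT_INCREASED | MF_SW_SIMULATED).  A minimal sketch,
 * assuming CONFIG_MEMORY_FAILURE and CAP_SYS_ADMIN:
 *
 *	#include <stdio.h>
 *	#include <unistd.h>
 *	#include <sys/mman.h>
 *
 *	int main(void)
 *	{
 *		long psz = sysconf(_SC_PAGESIZE);
 *		char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		if (p == MAP_FAILED)
 *			return 1;
 *		p[0] = 1;				// fault the page in
 *		if (madvise(p, psz, MADV_HWPOISON))	// ends up in memory_failure()
 *			perror("madvise(MADV_HWPOISON)");
 *		return 0;
 *	}
 */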
2188 
2189 #define MEMORY_FAILURE_FIFO_ORDER   4
2190 #define MEMORY_FAILURE_FIFO_SIZE    (1 << MEMORY_FAILURE_FIFO_ORDER)
2191 
2192 struct memory_failure_entry {
2193     unsigned long pfn;
2194     int flags;
2195 };
2196 
2197 struct memory_failure_cpu {
2198     DECLARE_KFIFO(fifo, struct memory_failure_entry,
2199               MEMORY_FAILURE_FIFO_SIZE);
2200     spinlock_t lock;
2201     struct work_struct work;
2202 };
2203 
2204 static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
2205 
2206 /**
2207  * memory_failure_queue - Schedule handling memory failure of a page.
2208  * @pfn: Page Number of the corrupted page
2209  * @flags: Flags for memory failure handling
2210  *
2211  * This function is called by the low level hardware error handler
2212  * when it detects hardware memory corruption of a page. It schedules
2213  * recovery of the error page, including dropping pages, killing
2214  * processes, etc.
2215  *
2216  * The function is primarily of use for corruptions that
2217  * happen outside the current execution context (e.g. when
2218  * detected by a background scrubber).
2219  *
2220  * Can run in IRQ context.
2221  */
2222 void memory_failure_queue(unsigned long pfn, int flags)
2223 {
2224     struct memory_failure_cpu *mf_cpu;
2225     unsigned long proc_flags;
2226     struct memory_failure_entry entry = {
2227         .pfn =      pfn,
2228         .flags =    flags,
2229     };
2230 
2231     mf_cpu = &get_cpu_var(memory_failure_cpu);
2232     spin_lock_irqsave(&mf_cpu->lock, proc_flags);
2233     if (kfifo_put(&mf_cpu->fifo, entry))
2234         schedule_work_on(smp_processor_id(), &mf_cpu->work);
2235     else
2236         pr_err("buffer overflow when queuing memory failure at %#lx\n",
2237                pfn);
2238     spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
2239     put_cpu_var(memory_failure_cpu);
2240 }
2241 EXPORT_SYMBOL_GPL(memory_failure_queue);
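/*
 * Editorial sketch (assumption, not part of the source): an error handler
 * running in interrupt context must not call memory_failure() directly
 * (it requires process context), so it queues the pfn and lets
 * memory_failure_work_func() below do the real work later:
 *
 *	static void example_report_corrupted_page(u64 physical_addr)
 *	{
 *		unsigned long pfn = physical_addr >> PAGE_SHIFT;
 *
 *		// Safe from IRQ context; handled later from a work queue.
 *		memory_failure_queue(pfn, 0);
 *	}
 */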
2242 
2243 static void memory_failure_work_func(struct work_struct *work)
2244 {
2245     struct memory_failure_cpu *mf_cpu;
2246     struct memory_failure_entry entry = { 0, };
2247     unsigned long proc_flags;
2248     int gotten;
2249 
2250     mf_cpu = container_of(work, struct memory_failure_cpu, work);
2251     for (;;) {
2252         spin_lock_irqsave(&mf_cpu->lock, proc_flags);
2253         gotten = kfifo_get(&mf_cpu->fifo, &entry);
2254         spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
2255         if (!gotten)
2256             break;
2257         if (entry.flags & MF_SOFT_OFFLINE)
2258             soft_offline_page(entry.pfn, entry.flags);
2259         else
2260             memory_failure(entry.pfn, entry.flags);
2261     }
2262 }
2263 
2264 /*
2265  * Process memory_failure work queued on the specified CPU.
2266  * Used to avoid return-to-userspace racing with the memory_failure workqueue.
2267  */
2268 void memory_failure_queue_kick(int cpu)
2269 {
2270     struct memory_failure_cpu *mf_cpu;
2271 
2272     mf_cpu = &per_cpu(memory_failure_cpu, cpu);
2273     cancel_work_sync(&mf_cpu->work);
2274     memory_failure_work_func(&mf_cpu->work);
2275 }
2276 
2277 static int __init memory_failure_init(void)
2278 {
2279     struct memory_failure_cpu *mf_cpu;
2280     int cpu;
2281 
2282     for_each_possible_cpu(cpu) {
2283         mf_cpu = &per_cpu(memory_failure_cpu, cpu);
2284         spin_lock_init(&mf_cpu->lock);
2285         INIT_KFIFO(mf_cpu->fifo);
2286         INIT_WORK(&mf_cpu->work, memory_failure_work_func);
2287     }
2288 
2289     return 0;
2290 }
2291 core_initcall(memory_failure_init);
2292 
2293 #undef pr_fmt
2294 #define pr_fmt(fmt) "" fmt
2295 #define unpoison_pr_info(fmt, pfn, rs)          \
2296 ({                          \
2297     if (__ratelimit(rs))                \
2298         pr_info(fmt, pfn);          \
2299 })
2300 
2301 /**
2302  * unpoison_memory - Unpoison a previously poisoned page
2303  * @pfn: Page number of the to be unpoisoned page
2304  *
2305  * Software-unpoison a page that has been poisoned by
2306  * memory_failure() earlier.
2307  *
2308  * This is only done at the software level, so it only works
2309  * for Linux-injected failures, not real hardware failures.
2310  *
2311  * Returns 0 for success, otherwise -errno.
2312  */
2313 int unpoison_memory(unsigned long pfn)
2314 {
2315     struct page *page;
2316     struct page *p;
2317     int ret = -EBUSY;
2318     int freeit = 0;
2319     unsigned long count = 1;
2320     static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
2321                     DEFAULT_RATELIMIT_BURST);
2322 
2323     if (!pfn_valid(pfn))
2324         return -ENXIO;
2325 
2326     p = pfn_to_page(pfn);
2327     page = compound_head(p);
2328 
2329     mutex_lock(&mf_mutex);
2330 
2331     if (hw_memory_failure) {
2332         unpoison_pr_info("Unpoison: Disabled after HW memory failure %#lx\n",
2333                  pfn, &unpoison_rs);
2334         ret = -EOPNOTSUPP;
2335         goto unlock_mutex;
2336     }
2337 
2338     if (!PageHWPoison(p)) {
2339         unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
2340                  pfn, &unpoison_rs);
2341         goto unlock_mutex;
2342     }
2343 
2344     if (page_count(page) > 1) {
2345         unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
2346                  pfn, &unpoison_rs);
2347         goto unlock_mutex;
2348     }
2349 
2350     if (page_mapped(page)) {
2351         unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
2352                  pfn, &unpoison_rs);
2353         goto unlock_mutex;
2354     }
2355 
2356     if (page_mapping(page)) {
2357         unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
2358                  pfn, &unpoison_rs);
2359         goto unlock_mutex;
2360     }
2361 
2362     if (PageSlab(page) || PageTable(page))
2363         goto unlock_mutex;
2364 
2365     ret = get_hwpoison_page(p, MF_UNPOISON);
2366     if (!ret) {
2367         if (PageHuge(p)) {
2368             count = free_raw_hwp_pages(page, false);
2369             if (count == 0) {
2370                 ret = -EBUSY;
2371                 goto unlock_mutex;
2372             }
2373         }
2374         ret = TestClearPageHWPoison(page) ? 0 : -EBUSY;
2375     } else if (ret < 0) {
2376         if (ret == -EHWPOISON) {
2377             ret = put_page_back_buddy(p) ? 0 : -EBUSY;
2378         } else
2379             unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
2380                      pfn, &unpoison_rs);
2381     } else {
2382         if (PageHuge(p)) {
2383             count = free_raw_hwp_pages(page, false);
2384             if (count == 0) {
2385                 ret = -EBUSY;
2386                 goto unlock_mutex;
2387             }
2388         }
2389         freeit = !!TestClearPageHWPoison(p);
2390 
2391         put_page(page);
2392         if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) {
2393             put_page(page);
2394             ret = 0;
2395         }
2396     }
2397 
2398 unlock_mutex:
2399     mutex_unlock(&mf_mutex);
2400     if (!ret || freeit) {
2401         num_poisoned_pages_sub(count);
2402         unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
2403                  page_to_pfn(p), &unpoison_rs);
2404     }
2405     return ret;
2406 }
2407 EXPORT_SYMBOL(unpoison_memory);
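/*
 * Editorial example (not part of the source): unpoison_memory() is reached
 * through the hwpoison-inject debugfs interface.  Assuming that module is
 * loaded and debugfs is mounted at /sys/kernel/debug, a software-injected
 * poison can be undone from user space roughly like this:
 *
 *	#include <stdio.h>
 *
 *	static int unpoison_pfn(unsigned long pfn)
 *	{
 *		FILE *f = fopen("/sys/kernel/debug/hwpoison/unpoison-pfn", "w");
 *
 *		if (!f)
 *			return -1;
 *		fprintf(f, "0x%lx\n", pfn);	// takes a pfn, not a physical address
 *		return fclose(f);
 *	}
 */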
2408 
2409 static bool isolate_page(struct page *page, struct list_head *pagelist)
2410 {
2411     bool isolated = false;
2412     bool lru = PageLRU(page);
2413 
2414     if (PageHuge(page)) {
2415         isolated = !isolate_hugetlb(page, pagelist);
2416     } else {
2417         if (lru)
2418             isolated = !isolate_lru_page(page);
2419         else
2420             isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE);
2421 
2422         if (isolated)
2423             list_add(&page->lru, pagelist);
2424     }
2425 
2426     if (isolated && lru)
2427         inc_node_page_state(page, NR_ISOLATED_ANON +
2428                     page_is_file_lru(page));
2429 
2430     /*
2431      * If we succeed in isolating the page, we grabbed another refcount on
2432      * the page, so we can safely drop the one we got from get_any_pages().
2433      * If we failed to isolate the page, it means that we cannot go further
2434      * and we will return an error, so drop the reference we got from
2435      * get_any_pages() as well.
2436      */
2437     put_page(page);
2438     return isolated;
2439 }
2440 
2441 /*
2442  * __soft_offline_page handles hugetlb-pages and non-hugetlb pages.
2443  * If the page is a non-dirty unmapped page-cache page, it is simply invalidated.
2444  * If the page is mapped, its contents are migrated elsewhere.
2445  */
2446 static int __soft_offline_page(struct page *page)
2447 {
2448     long ret = 0;
2449     unsigned long pfn = page_to_pfn(page);
2450     struct page *hpage = compound_head(page);
2451     char const *msg_page[] = {"page", "hugepage"};
2452     bool huge = PageHuge(page);
2453     LIST_HEAD(pagelist);
2454     struct migration_target_control mtc = {
2455         .nid = NUMA_NO_NODE,
2456         .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
2457     };
2458 
2459     lock_page(page);
2460     if (!PageHuge(page))
2461         wait_on_page_writeback(page);
2462     if (PageHWPoison(page)) {
2463         unlock_page(page);
2464         put_page(page);
2465         pr_info("soft offline: %#lx page already poisoned\n", pfn);
2466         return 0;
2467     }
2468 
2469     if (!PageHuge(page) && PageLRU(page) && !PageSwapCache(page))
2470         /*
2471          * Try to invalidate first. This should work for
2472          * non dirty unmapped page cache pages.
2473          */
2474         ret = invalidate_inode_page(page);
2475     unlock_page(page);
2476 
2477     if (ret) {
2478         pr_info("soft_offline: %#lx: invalidated\n", pfn);
2479         page_handle_poison(page, false, true);
2480         return 0;
2481     }
2482 
2483     if (isolate_page(hpage, &pagelist)) {
2484         ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
2485             (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL);
2486         if (!ret) {
2487             bool release = !huge;
2488 
2489             if (!page_handle_poison(page, huge, release))
2490                 ret = -EBUSY;
2491         } else {
2492             if (!list_empty(&pagelist))
2493                 putback_movable_pages(&pagelist);
2494 
2495             pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n",
2496                 pfn, msg_page[huge], ret, &page->flags);
2497             if (ret > 0)
2498                 ret = -EBUSY;
2499         }
2500     } else {
2501         pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %pGp\n",
2502             pfn, msg_page[huge], page_count(page), &page->flags);
2503         ret = -EBUSY;
2504     }
2505     return ret;
2506 }
2507 
2508 static int soft_offline_in_use_page(struct page *page)
2509 {
2510     struct page *hpage = compound_head(page);
2511 
2512     if (!PageHuge(page) && PageTransHuge(hpage))
2513         if (try_to_split_thp_page(page, "soft offline") < 0)
2514             return -EBUSY;
2515     return __soft_offline_page(page);
2516 }
2517 
2518 static int soft_offline_free_page(struct page *page)
2519 {
2520     int rc = 0;
2521 
2522     if (!page_handle_poison(page, true, false))
2523         rc = -EBUSY;
2524 
2525     return rc;
2526 }
2527 
2528 static void put_ref_page(struct page *page)
2529 {
2530     if (page)
2531         put_page(page);
2532 }
2533 
2534 /**
2535  * soft_offline_page - Soft offline a page.
2536  * @pfn: pfn to soft-offline
2537  * @flags: flags. Same as memory_failure().
2538  *
2539  * Returns 0 on success,
2540  *         -EOPNOTSUPP if hwpoison_filter() filtered the error event,
2541  *         a negated errno (< 0) otherwise.
2542  *
2543  * Soft offline a page, by migration or invalidation,
2544  * without killing anything. This is for the case when
2545  * a page is not corrupted yet (so it's still valid to access),
2546  * but has had a number of corrected errors and is better taken
2547  * out.
2548  *
2549  * The actual policy on when to do that is maintained by
2550  * user space.
2551  *
2552  * This should never impact any application or cause data loss,
2553  * however it might take some time.
2554  *
2555  * This is not a 100% solution for all memory, but tries to be
2556  * ``good enough'' for the majority of memory.
2557  */
2558 int soft_offline_page(unsigned long pfn, int flags)
2559 {
2560     int ret;
2561     bool try_again = true;
2562     struct page *page, *ref_page = NULL;
2563 
2564     WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED));
2565 
2566     if (!pfn_valid(pfn))
2567         return -ENXIO;
2568     if (flags & MF_COUNT_INCREASED)
2569         ref_page = pfn_to_page(pfn);
2570 
2571     /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
2572     page = pfn_to_online_page(pfn);
2573     if (!page) {
2574         put_ref_page(ref_page);
2575         return -EIO;
2576     }
2577 
2578     mutex_lock(&mf_mutex);
2579 
2580     if (PageHWPoison(page)) {
2581         pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
2582         put_ref_page(ref_page);
2583         mutex_unlock(&mf_mutex);
2584         return 0;
2585     }
2586 
2587 retry:
2588     get_online_mems();
2589     ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE);
2590     put_online_mems();
2591 
2592     if (hwpoison_filter(page)) {
2593         if (ret > 0)
2594             put_page(page);
2595         else
2596             put_ref_page(ref_page);
2597 
2598         mutex_unlock(&mf_mutex);
2599         return -EOPNOTSUPP;
2600     }
2601 
2602     if (ret > 0) {
2603         ret = soft_offline_in_use_page(page);
2604     } else if (ret == 0) {
2605         if (soft_offline_free_page(page) && try_again) {
2606             try_again = false;
2607             flags &= ~MF_COUNT_INCREASED;
2608             goto retry;
2609         }
2610     }
2611 
2612     mutex_unlock(&mf_mutex);
2613 
2614     return ret;
2615 }
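/*
 * Editorial example (not part of the source): soft offlining can also be
 * requested from user space, either by writing a physical address to
 * /sys/devices/system/memory/soft_offline_page or, for a page mapped by
 * the caller, via madvise(MADV_SOFT_OFFLINE) (privileged, and assuming
 * CONFIG_MEMORY_FAILURE).  A minimal sketch of the madvise route:
 *
 *	#include <unistd.h>
 *	#include <sys/mman.h>
 *
 *	static int soft_offline_my_page(void *addr)	// addr must be page-aligned
 *	{
 *		long psz = sysconf(_SC_PAGESIZE);
 *
 *		// Migrate or invalidate the backing page without killing anything.
 *		return madvise(addr, psz, MADV_SOFT_OFFLINE);
 *	}
 */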
2616 
2617 void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
2618 {
2619     int i;
2620 
2621     /*
2622      * A further optimization is to have a per-section refcounted
2623      * num_poisoned_pages.  But that would need more space per memmap, so
2624      * for now just do a quick global check to speed up this routine in the
2625      * absence of bad pages.
2626      */
2627     if (atomic_long_read(&num_poisoned_pages) == 0)
2628         return;
2629 
2630     for (i = 0; i < nr_pages; i++) {
2631         if (PageHWPoison(&memmap[i])) {
2632             num_poisoned_pages_dec();
2633             ClearPageHWPoison(&memmap[i]);
2634         }
2635     }
2636 }