// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/fault.c
 *
 * Copyright (C) 1995  Linus Torvalds
 * Copyright (C) 1995-2004 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/acpi.h>
#include <linux/bitfield.h>
#include <linux/extable.h>
#include <linux/kfence.h>
#include <linux/signal.h>
#include <linux/mm.h>
#include <linux/hardirq.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/highmem.h>
#include <linux/perf_event.h>
#include <linux/preempt.h>
#include <linux/hugetlb.h>

#include <asm/acpi.h>
#include <asm/bug.h>
#include <asm/cmpxchg.h>
#include <asm/cpufeature.h>
#include <asm/exception.h>
#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
#include <asm/esr.h>
#include <asm/kprobes.h>
#include <asm/mte.h>
#include <asm/processor.h>
#include <asm/sysreg.h>
#include <asm/system_misc.h>
#include <asm/tlbflush.h>
#include <asm/traps.h>

struct fault_info {
    int (*fn)(unsigned long far, unsigned long esr,
              struct pt_regs *regs);
    int sig;
    int code;
    const char *name;
};

static const struct fault_info fault_info[];
static struct fault_info debug_fault_info[];

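/* Look up the fault_info entry for the fault status code (FSC) carried in ESR_ELx. */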
static inline const struct fault_info *esr_to_fault_info(unsigned long esr)
{
    return fault_info + (esr & ESR_ELx_FSC);
}

static inline const struct fault_info *esr_to_debug_fault_info(unsigned long esr)
{
    return debug_fault_info + DBG_ESR_EVT(esr);
}

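/* Print the data-abort specific ISS fields (access size, SSE/SRT, SF/AR, CM/WnR). */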
static void data_abort_decode(unsigned long esr)
{
    pr_alert("Data abort info:\n");

    if (esr & ESR_ELx_ISV) {
        pr_alert("  Access size = %u byte(s)\n",
             1U << ((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT));
        pr_alert("  SSE = %lu, SRT = %lu\n",
             (esr & ESR_ELx_SSE) >> ESR_ELx_SSE_SHIFT,
             (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT);
        pr_alert("  SF = %lu, AR = %lu\n",
             (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT,
             (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT);
    } else {
        pr_alert("  ISV = 0, ISS = 0x%08lx\n", esr & ESR_ELx_ISS_MASK);
    }

    pr_alert("  CM = %lu, WnR = %lu\n",
         (esr & ESR_ELx_CM) >> ESR_ELx_CM_SHIFT,
         (esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT);
}

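/* Dump the generic memory abort information carried in ESR_ELx for this fault. */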
static void mem_abort_decode(unsigned long esr)
{
    pr_alert("Mem abort info:\n");

    pr_alert("  ESR = 0x%016lx\n", esr);
    pr_alert("  EC = 0x%02lx: %s, IL = %u bits\n",
         ESR_ELx_EC(esr), esr_get_class_string(esr),
         (esr & ESR_ELx_IL) ? 32 : 16);
    pr_alert("  SET = %lu, FnV = %lu\n",
         (esr & ESR_ELx_SET_MASK) >> ESR_ELx_SET_SHIFT,
         (esr & ESR_ELx_FnV) >> ESR_ELx_FnV_SHIFT);
    pr_alert("  EA = %lu, S1PTW = %lu\n",
         (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT,
         (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT);
    pr_alert("  FSC = 0x%02lx: %s\n", (esr & ESR_ELx_FSC),
         esr_to_fault_info(esr)->name);

    if (esr_is_data_abort(esr))
        data_abort_decode(esr);
}

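/* Return the physical address of @mm's pgd; init_mm's pgd lives in the kernel image. */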
static inline unsigned long mm_to_pgd_phys(struct mm_struct *mm)
{
    /* Either init_pg_dir or swapper_pg_dir */
    if (mm == &init_mm)
        return __pa_symbol(mm->pgd);

    return (unsigned long)virt_to_phys(mm->pgd);
}

/*
 * Dump out the page tables associated with 'addr' in the currently active mm.
 */
static void show_pte(unsigned long addr)
{
    struct mm_struct *mm;
    pgd_t *pgdp;
    pgd_t pgd;

    if (is_ttbr0_addr(addr)) {
        /* TTBR0 */
        mm = current->active_mm;
        if (mm == &init_mm) {
            pr_alert("[%016lx] user address but active_mm is swapper\n",
                 addr);
            return;
        }
    } else if (is_ttbr1_addr(addr)) {
        /* TTBR1 */
        mm = &init_mm;
    } else {
        pr_alert("[%016lx] address between user and kernel address ranges\n",
             addr);
        return;
    }

    pr_alert("%s pgtable: %luk pages, %llu-bit VAs, pgdp=%016lx\n",
         mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K,
         vabits_actual, mm_to_pgd_phys(mm));
    pgdp = pgd_offset(mm, addr);
    pgd = READ_ONCE(*pgdp);
    pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));

    do {
        p4d_t *p4dp, p4d;
        pud_t *pudp, pud;
        pmd_t *pmdp, pmd;
        pte_t *ptep, pte;

        if (pgd_none(pgd) || pgd_bad(pgd))
            break;

        p4dp = p4d_offset(pgdp, addr);
        p4d = READ_ONCE(*p4dp);
        pr_cont(", p4d=%016llx", p4d_val(p4d));
        if (p4d_none(p4d) || p4d_bad(p4d))
            break;

        pudp = pud_offset(p4dp, addr);
        pud = READ_ONCE(*pudp);
        pr_cont(", pud=%016llx", pud_val(pud));
        if (pud_none(pud) || pud_bad(pud))
            break;

        pmdp = pmd_offset(pudp, addr);
        pmd = READ_ONCE(*pmdp);
        pr_cont(", pmd=%016llx", pmd_val(pmd));
        if (pmd_none(pmd) || pmd_bad(pmd))
            break;

        ptep = pte_offset_map(pmdp, addr);
        pte = READ_ONCE(*ptep);
        pr_cont(", pte=%016llx", pte_val(pte));
        pte_unmap(ptep);
    } while(0);

    pr_cont("\n");
}

/*
 * This function sets the access flags (dirty, accessed), as well as write
 * permission, and only to a more permissive setting.
 *
 * It needs to cope with hardware update of the accessed/dirty state by other
 * agents in the system and can safely skip the __sync_icache_dcache() call as,
 * like set_pte_at(), the PTE is never changed from no-exec to exec here.
 *
 * Returns whether or not the PTE actually changed.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
              unsigned long address, pte_t *ptep,
              pte_t entry, int dirty)
{
    pteval_t old_pteval, pteval;
    pte_t pte = READ_ONCE(*ptep);

    if (pte_same(pte, entry))
        return 0;

    /* only preserve the access flags and write permission */
    pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY;

    /*
     * Setting the flags must be done atomically to avoid racing with the
     * hardware update of the access/dirty state. The PTE_RDONLY bit must
     * be set to the most permissive (lowest value) of *ptep and entry
     * (calculated as: a & b == ~(~a | ~b)).
     */
    pte_val(entry) ^= PTE_RDONLY;
    pteval = pte_val(pte);
    do {
        old_pteval = pteval;
        pteval ^= PTE_RDONLY;
        pteval |= pte_val(entry);
        pteval ^= PTE_RDONLY;
        pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
    } while (pteval != old_pteval);

    /* Invalidate a stale read-only entry */
    if (dirty)
        flush_tlb_page(vma, address);
    return 1;
}

static bool is_el1_instruction_abort(unsigned long esr)
{
    return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
}

static bool is_el1_data_abort(unsigned long esr)
{
    return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_CUR;
}

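/*
 * True if this EL1 abort is a permission fault, including a TTBR0 translation
 * fault taken with PSTATE.PAN set when the kernel uses TTBR0-based PAN
 * emulation.
 */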
static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr,
                       struct pt_regs *regs)
{
    unsigned long fsc_type = esr & ESR_ELx_FSC_TYPE;

    if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr))
        return false;

    if (fsc_type == ESR_ELx_FSC_PERM)
        return true;

    if (is_ttbr0_addr(addr) && system_uses_ttbr0_pan())
        return fsc_type == ESR_ELx_FSC_FAULT &&
            (regs->pstate & PSR_PAN_BIT);

    return false;
}

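/*
 * Re-walk the faulting address with AT S1E1R. If the translation now succeeds,
 * or fails with a different fault type, treat the reported EL1 translation
 * fault as stale and ignore it.
 */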
static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
                            unsigned long esr,
                            struct pt_regs *regs)
{
    unsigned long flags;
    u64 par, dfsc;

    if (!is_el1_data_abort(esr) ||
        (esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT)
        return false;

    local_irq_save(flags);
    asm volatile("at s1e1r, %0" :: "r" (addr));
    isb();
    par = read_sysreg_par();
    local_irq_restore(flags);

    /*
     * If we now have a valid translation, treat the translation fault as
     * spurious.
     */
    if (!(par & SYS_PAR_EL1_F))
        return true;

    /*
     * If we got a different type of fault from the AT instruction,
     * treat the translation fault as spurious.
     */
    dfsc = FIELD_GET(SYS_PAR_EL1_FST, par);
    return (dfsc & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT;
}

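/* Report an unrecoverable kernel fault: decode the abort, dump the page tables and die. */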
static void die_kernel_fault(const char *msg, unsigned long addr,
                 unsigned long esr, struct pt_regs *regs)
{
    bust_spinlocks(1);

    pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
         addr);

    kasan_non_canonical_hook(addr);

    mem_abort_decode(esr);

    show_pte(addr);
    die("Oops", regs, esr);
    bust_spinlocks(0);
    make_task_dead(SIGKILL);
}

#ifdef CONFIG_KASAN_HW_TAGS
static void report_tag_fault(unsigned long addr, unsigned long esr,
                 struct pt_regs *regs)
{
    /*
     * SAS bits aren't set for all faults reported in EL1, so we can't
     * find out access size.
     */
    bool is_write = !!(esr & ESR_ELx_WNR);
    kasan_report(addr, 0, is_write, regs->pc);
}
#else
/* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */
static inline void report_tag_fault(unsigned long addr, unsigned long esr,
                    struct pt_regs *regs) { }
#endif

static void do_tag_recovery(unsigned long addr, unsigned long esr,
               struct pt_regs *regs)
{

    report_tag_fault(addr, esr, regs);

    /*
     * Disable MTE Tag Checking on the local CPU for the current EL.
     * It will be done lazily on the other CPUs when they will hit a
     * tag fault.
     */
    sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF_MASK,
             SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF, NONE));
    isb();
}

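/* True if this EL1 data abort was a synchronous MTE tag check fault. */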
static bool is_el1_mte_sync_tag_check_fault(unsigned long esr)
{
    unsigned long fsc = esr & ESR_ELx_FSC;

    if (!is_el1_data_abort(esr))
        return false;

    if (fsc == ESR_ELx_FSC_MTE)
        return true;

    return false;
}

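/*
 * Handle a fault taken from kernel context: try an exception-table fixup,
 * filter out spurious translation faults and MTE tag-check faults, then
 * classify the access and die.
 */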
static void __do_kernel_fault(unsigned long addr, unsigned long esr,
                  struct pt_regs *regs)
{
    const char *msg;

    /*
     * Are we prepared to handle this kernel fault?
     * We are almost certainly not prepared to handle instruction faults.
     */
    if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
        return;

    if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
        "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
        return;

    if (is_el1_mte_sync_tag_check_fault(esr)) {
        do_tag_recovery(addr, esr, regs);

        return;
    }

    if (is_el1_permission_fault(addr, esr, regs)) {
        if (esr & ESR_ELx_WNR)
            msg = "write to read-only memory";
        else if (is_el1_instruction_abort(esr))
            msg = "execute from non-executable memory";
        else
            msg = "read from unreadable memory";
    } else if (addr < PAGE_SIZE) {
        msg = "NULL pointer dereference";
    } else {
        if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
            return;

        msg = "paging request";
    }

    die_kernel_fault(msg, addr, esr, regs);
}

static void set_thread_esr(unsigned long address, unsigned long esr)
{
    current->thread.fault_address = address;

    /*
     * If the faulting address is in the kernel, we must sanitize the ESR.
     * From userspace's point of view, kernel-only mappings don't exist
     * at all, so we report them as level 0 translation faults.
     * (This is not quite the way that "no mapping there at all" behaves:
     * an alignment fault not caused by the memory type would take
     * precedence over translation fault for a real access to empty
     * space. Unfortunately we can't easily distinguish "alignment fault
     * not caused by memory type" from "alignment fault caused by memory
     * type", so we ignore this wrinkle and just return the translation
     * fault.)
     */
    if (!is_ttbr0_addr(current->thread.fault_address)) {
        switch (ESR_ELx_EC(esr)) {
        case ESR_ELx_EC_DABT_LOW:
            /*
             * These bits provide only information about the
             * faulting instruction, which userspace knows already.
             * We explicitly clear bits which are architecturally
             * RES0 in case they are given meanings in future.
             * We always report the ESR as if the fault was taken
             * to EL1 and so ISV and the bits in ISS[23:14] are
             * clear. (In fact it always will be a fault to EL1.)
             */
            esr &= ESR_ELx_EC_MASK | ESR_ELx_IL |
                ESR_ELx_CM | ESR_ELx_WNR;
            esr |= ESR_ELx_FSC_FAULT;
            break;
        case ESR_ELx_EC_IABT_LOW:
            /*
             * Claim a level 0 translation fault.
             * All other bits are architecturally RES0 for faults
             * reported with that DFSC value, so we clear them.
             */
            esr &= ESR_ELx_EC_MASK | ESR_ELx_IL;
            esr |= ESR_ELx_FSC_FAULT;
            break;
        default:
            /*
             * This should never happen (entry.S only brings us
             * into this code for insn and data aborts from a lower
             * exception level). Fail safe by not providing an ESR
             * context record at all.
             */
            WARN(1, "ESR 0x%lx is not DABT or IABT from EL0\n", esr);
            esr = 0;
            break;
        }
    }

    current->thread.fault_code = esr;
}

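/* Deliver the appropriate signal for a user-mode fault, or die if we were in kernel mode. */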
static void do_bad_area(unsigned long far, unsigned long esr,
            struct pt_regs *regs)
{
    unsigned long addr = untagged_addr(far);

    /*
     * If we are in kernel mode at this point, we have no context to
     * handle this fault with.
     */
    if (user_mode(regs)) {
        const struct fault_info *inf = esr_to_fault_info(esr);

        set_thread_esr(addr, esr);
        arm64_force_sig_fault(inf->sig, inf->code, far, inf->name);
    } else {
        __do_kernel_fault(addr, esr, regs);
    }
}

#define VM_FAULT_BADMAP     0x010000
#define VM_FAULT_BADACCESS  0x020000

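/*
 * Find the VMA covering @addr and hand the fault to the core MM, returning
 * VM_FAULT_BADMAP/VM_FAULT_BADACCESS when no suitable mapping exists or its
 * permissions don't allow the access.
 */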
static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
                  unsigned int mm_flags, unsigned long vm_flags,
                  struct pt_regs *regs)
{
    struct vm_area_struct *vma = find_vma(mm, addr);

    if (unlikely(!vma))
        return VM_FAULT_BADMAP;

    /*
     * Ok, we have a good vm_area for this memory access, so we can handle
     * it.
     */
    if (unlikely(vma->vm_start > addr)) {
        if (!(vma->vm_flags & VM_GROWSDOWN))
            return VM_FAULT_BADMAP;
        if (expand_stack(vma, addr))
            return VM_FAULT_BADMAP;
    }

    /*
     * Check that the permissions on the VMA allow for the fault which
     * occurred.
     */
    if (!(vma->vm_flags & vm_flags))
        return VM_FAULT_BADACCESS;
    return handle_mm_fault(vma, addr, mm_flags, regs);
}

static bool is_el0_instruction_abort(unsigned long esr)
{
    return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
}

/*
 * Note: not valid for EL1 DC IVAC, but we never use that such that it
 * should fault. EL0 cannot issue DC IVAC (undef).
 */
static bool is_write_abort(unsigned long esr)
{
    return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
}

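/*
 * Main fault handler for aborts that can be resolved by the core MM: classify
 * the access, take mmap_lock, call __do_page_fault() and retry or signal as
 * required.
 */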
static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
                   struct pt_regs *regs)
{
    const struct fault_info *inf;
    struct mm_struct *mm = current->mm;
    vm_fault_t fault;
    unsigned long vm_flags;
    unsigned int mm_flags = FAULT_FLAG_DEFAULT;
    unsigned long addr = untagged_addr(far);

    if (kprobe_page_fault(regs, esr))
        return 0;

    /*
     * If we're in an interrupt or have no user context, we must not take
     * the fault.
     */
    if (faulthandler_disabled() || !mm)
        goto no_context;

    if (user_mode(regs))
        mm_flags |= FAULT_FLAG_USER;

    /*
     * vm_flags tells us what bits we must have in vma->vm_flags
     * for the fault to be benign, __do_page_fault() would check
     * vma->vm_flags & vm_flags and returns an error if the
     * intersection is empty
     */
    if (is_el0_instruction_abort(esr)) {
        /* It was exec fault */
        vm_flags = VM_EXEC;
        mm_flags |= FAULT_FLAG_INSTRUCTION;
    } else if (is_write_abort(esr)) {
        /* It was write fault */
        vm_flags = VM_WRITE;
        mm_flags |= FAULT_FLAG_WRITE;
    } else {
        /* It was read fault */
        vm_flags = VM_READ;
        /* Write implies read */
        vm_flags |= VM_WRITE;
        /* If EPAN is absent then exec implies read */
        if (!cpus_have_const_cap(ARM64_HAS_EPAN))
            vm_flags |= VM_EXEC;
    }

    if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
        if (is_el1_instruction_abort(esr))
            die_kernel_fault("execution of user memory",
                     addr, esr, regs);

        if (!search_exception_tables(regs->pc))
            die_kernel_fault("access to user memory outside uaccess routines",
                     addr, esr, regs);
    }

    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);

    /*
     * As per x86, we may deadlock here. However, since the kernel only
     * validly references user space from well defined areas of the code,
     * we can bug out early if this is from code which shouldn't.
     */
    if (!mmap_read_trylock(mm)) {
        if (!user_mode(regs) && !search_exception_tables(regs->pc))
            goto no_context;
retry:
        mmap_read_lock(mm);
    } else {
        /*
         * The above mmap_read_trylock() might have succeeded in which
         * case, we'll have missed the might_sleep() from down_read().
         */
        might_sleep();
#ifdef CONFIG_DEBUG_VM
        if (!user_mode(regs) && !search_exception_tables(regs->pc)) {
            mmap_read_unlock(mm);
            goto no_context;
        }
#endif
    }

    fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs);

    /* Quick path to respond to signals */
    if (fault_signal_pending(fault, regs)) {
        if (!user_mode(regs))
            goto no_context;
        return 0;
    }

    /* The fault is fully completed (including releasing mmap lock) */
    if (fault & VM_FAULT_COMPLETED)
        return 0;

    if (fault & VM_FAULT_RETRY) {
        mm_flags |= FAULT_FLAG_TRIED;
        goto retry;
    }
    mmap_read_unlock(mm);

    /*
     * Handle the "normal" (no error) case first.
     */
    if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
                  VM_FAULT_BADACCESS))))
        return 0;

    /*
     * If we are in kernel mode at this point, we have no context to
     * handle this fault with.
     */
    if (!user_mode(regs))
        goto no_context;

    if (fault & VM_FAULT_OOM) {
        /*
         * We ran out of memory, call the OOM killer, and return to
         * userspace (which will retry the fault, or kill us if we got
         * oom-killed).
         */
        pagefault_out_of_memory();
        return 0;
    }

    inf = esr_to_fault_info(esr);
    set_thread_esr(addr, esr);
    if (fault & VM_FAULT_SIGBUS) {
        /*
         * We had some memory, but were unable to successfully fix up
         * this page fault.
         */
        arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name);
    } else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
        unsigned int lsb;

        lsb = PAGE_SHIFT;
        if (fault & VM_FAULT_HWPOISON_LARGE)
            lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));

        arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
    } else {
        /*
         * Something tried to access memory that isn't in our memory
         * map.
         */
        arm64_force_sig_fault(SIGSEGV,
                      fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
                      far, inf->name);
    }

    return 0;

no_context:
    __do_kernel_fault(addr, esr, regs);
    return 0;
}

static int __kprobes do_translation_fault(unsigned long far,
                      unsigned long esr,
                      struct pt_regs *regs)
{
    unsigned long addr = untagged_addr(far);

    if (is_ttbr0_addr(addr))
        return do_page_fault(far, esr, regs);

    do_bad_area(far, esr, regs);
    return 0;
}

static int do_alignment_fault(unsigned long far, unsigned long esr,
                  struct pt_regs *regs)
{
    do_bad_area(far, esr, regs);
    return 0;
}

static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
    return 1; /* "fault" */
}

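/*
 * Synchronous External Abort: for user-mode faults, let firmware-first (APEI)
 * claim the error; otherwise report it to the faulting context.
 */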
static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
    const struct fault_info *inf;
    unsigned long siaddr;

    inf = esr_to_fault_info(esr);

    if (user_mode(regs) && apei_claim_sea(regs) == 0) {
        /*
         * APEI claimed this as a firmware-first notification.
         * Some processing deferred to task_work before ret_to_user().
         */
        return 0;
    }

    if (esr & ESR_ELx_FnV) {
        siaddr = 0;
    } else {
        /*
         * The architecture specifies that the tag bits of FAR_EL1 are
         * UNKNOWN for synchronous external aborts. Mask them out now
         * so that userspace doesn't see them.
         */
        siaddr  = untagged_addr(far);
    }
    arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);

    return 0;
}

static int do_tag_check_fault(unsigned long far, unsigned long esr,
                  struct pt_regs *regs)
{
    /*
     * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN
     * for tag check faults. Set them to corresponding bits in the untagged
     * address.
     */
    far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
    do_bad_area(far, esr, regs);
    return 0;
}

static const struct fault_info fault_info[] = {
    { do_bad,       SIGKILL, SI_KERNEL, "ttbr address size fault"   },
    { do_bad,       SIGKILL, SI_KERNEL, "level 1 address size fault"    },
    { do_bad,       SIGKILL, SI_KERNEL, "level 2 address size fault"    },
    { do_bad,       SIGKILL, SI_KERNEL, "level 3 address size fault"    },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 0 translation fault" },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 1 translation fault" },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 2 translation fault" },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 3 translation fault" },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 8"         },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 1 access flag fault" },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 2 access flag fault" },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 3 access flag fault" },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 12"            },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 1 permission fault"  },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 2 permission fault"  },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 3 permission fault"  },
    { do_sea,       SIGBUS,  BUS_OBJERR,    "synchronous external abort"    },
    { do_tag_check_fault,   SIGSEGV, SEGV_MTESERR,  "synchronous tag check fault"   },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 18"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 19"            },
    { do_sea,       SIGKILL, SI_KERNEL, "level 0 (translation table walk)"  },
    { do_sea,       SIGKILL, SI_KERNEL, "level 1 (translation table walk)"  },
    { do_sea,       SIGKILL, SI_KERNEL, "level 2 (translation table walk)"  },
    { do_sea,       SIGKILL, SI_KERNEL, "level 3 (translation table walk)"  },
    { do_sea,       SIGBUS,  BUS_OBJERR,    "synchronous parity or ECC error" },    // Reserved when RAS is implemented
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 25"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 26"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 27"            },
    { do_sea,       SIGKILL, SI_KERNEL, "level 0 synchronous parity error (translation table walk)" },  // Reserved when RAS is implemented
    { do_sea,       SIGKILL, SI_KERNEL, "level 1 synchronous parity error (translation table walk)" },  // Reserved when RAS is implemented
    { do_sea,       SIGKILL, SI_KERNEL, "level 2 synchronous parity error (translation table walk)" },  // Reserved when RAS is implemented
    { do_sea,       SIGKILL, SI_KERNEL, "level 3 synchronous parity error (translation table walk)" },  // Reserved when RAS is implemented
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 32"            },
    { do_alignment_fault,   SIGBUS,  BUS_ADRALN,    "alignment fault"       },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 34"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 35"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 36"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 37"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 38"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 39"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 40"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 41"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 42"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 43"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 44"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 45"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 46"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 47"            },
    { do_bad,       SIGKILL, SI_KERNEL, "TLB conflict abort"        },
    { do_bad,       SIGKILL, SI_KERNEL, "Unsupported atomic hardware update fault"  },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 50"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 51"            },
    { do_bad,       SIGKILL, SI_KERNEL, "implementation fault (lockdown abort)" },
    { do_bad,       SIGBUS,  BUS_OBJERR,    "implementation fault (unsupported exclusive)" },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 54"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 55"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 56"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 57"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 58"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 59"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 60"            },
    { do_bad,       SIGKILL, SI_KERNEL, "section domain fault"      },
    { do_bad,       SIGKILL, SI_KERNEL, "page domain fault"     },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 63"            },
};

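/*
 * Common entry point for synchronous memory aborts: dispatch on the fault
 * status code and escalate anything the handler could not deal with.
 */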
void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
    const struct fault_info *inf = esr_to_fault_info(esr);
    unsigned long addr = untagged_addr(far);

    if (!inf->fn(far, esr, regs))
        return;

    if (!user_mode(regs))
        die_kernel_fault(inf->name, addr, esr, regs);

    /*
     * At this point we have an unrecognized fault type whose tag bits may
     * have been defined as UNKNOWN. Therefore we only expose the untagged
     * address to the signal handler.
     */
    arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr);
}
NOKPROBE_SYMBOL(do_mem_abort);

void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs)
{
    arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN,
             addr, esr);
}
NOKPROBE_SYMBOL(do_sp_pc_abort);

int __init early_brk64(unsigned long addr, unsigned long esr,
               struct pt_regs *regs);

/*
 * __refdata because early_brk64 is __init, but the reference to it is
 * clobbered at arch_initcall time.
 * See traps.c and debug-monitors.c:debug_traps_init().
 */
static struct fault_info __refdata debug_fault_info[] = {
    { do_bad,   SIGTRAP,    TRAP_HWBKPT,    "hardware breakpoint"   },
    { do_bad,   SIGTRAP,    TRAP_HWBKPT,    "hardware single-step"  },
    { do_bad,   SIGTRAP,    TRAP_HWBKPT,    "hardware watchpoint"   },
    { do_bad,   SIGKILL,    SI_KERNEL,  "unknown 3"     },
    { do_bad,   SIGTRAP,    TRAP_BRKPT, "aarch32 BKPT"      },
    { do_bad,   SIGKILL,    SI_KERNEL,  "aarch32 vector catch"  },
    { early_brk64,  SIGTRAP,    TRAP_BRKPT, "aarch64 BRK"       },
    { do_bad,   SIGKILL,    SI_KERNEL,  "unknown 7"     },
};

void __init hook_debug_fault_code(int nr,
                  int (*fn)(unsigned long, unsigned long, struct pt_regs *),
                  int sig, int code, const char *name)
{
    BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info));

    debug_fault_info[nr].fn     = fn;
    debug_fault_info[nr].sig    = sig;
    debug_fault_info[nr].code   = code;
    debug_fault_info[nr].name   = name;
}

/*
 * In debug exception context, we explicitly disable preemption despite
 * having interrupts disabled.
 * This serves two purposes: it makes it much less likely that we would
 * accidentally schedule in exception context and it will force a warning
 * if we somehow manage to schedule by accident.
 */
static void debug_exception_enter(struct pt_regs *regs)
{
    preempt_disable();

    /* This code is a bit fragile.  Test it. */
    RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work");
}
NOKPROBE_SYMBOL(debug_exception_enter);

static void debug_exception_exit(struct pt_regs *regs)
{
    preempt_enable_no_resched();
}
NOKPROBE_SYMBOL(debug_exception_exit);

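/*
 * Entry point for debug exceptions (breakpoints, watchpoints, single-step,
 * BRK); the address argument is only meaningful for watchpoints.
 */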
void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr,
            struct pt_regs *regs)
{
    const struct fault_info *inf = esr_to_debug_fault_info(esr);
    unsigned long pc = instruction_pointer(regs);

    debug_exception_enter(regs);

    if (user_mode(regs) && !is_ttbr0_addr(pc))
        arm64_apply_bp_hardening();

    if (inf->fn(addr_if_watchpoint, esr, regs)) {
        arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr);
    }

    debug_exception_exit(regs);
}
NOKPROBE_SYMBOL(do_debug_exception);

/*
 * Used during anonymous page fault handling.
 */
struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
                        unsigned long vaddr)
{
    gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO;

    /*
     * If the page is mapped with PROT_MTE, initialise the tags at the
     * point of allocation and page zeroing as this is usually faster than
     * separate DC ZVA and STGM.
     */
    if (vma->vm_flags & VM_MTE)
        flags |= __GFP_ZEROTAGS;

    return alloc_page_vma(flags, vma, vaddr);
}

void tag_clear_highpage(struct page *page)
{
    mte_zero_clear_page_tags(page_address(page));
    set_bit(PG_mte_tagged, &page->flags);
}