#include <linux/sched.h> /* test_thread_flag(), ... */
#include <linux/sched/task_stack.h> /* task_stack_*(), ... */
#include <linux/kdebug.h> /* oops_begin/end, ... */
#include <linux/extable.h> /* search_exception_tables */
#include <linux/memblock.h> /* max_low_pfn */
#include <linux/kfence.h> /* kfence_handle_page_fault */
#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
#include <linux/hugetlb.h> /* hstate_index_to_shift */
#include <linux/prefetch.h> /* prefetchw */
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h>

#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
#include <asm/mmu_context.h> /* vma_pkey() */
#include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <asm/desc.h> /* store_idt(), ... */
#include <asm/cpu_entry_area.h> /* exception stack */
#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
#include <asm/kvm_para.h> /* kvm_handle_async_pf */
#include <asm/vdso.h> /* fixup_vdso_exception() */
#include <asm/irq_stack.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
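
/*
 * Page fault error code bits, as tested throughout this file (see
 * asm/trap_pf.h):
 *
 *   X86_PF_PROT  - fault was a protection violation (clear: not-present page)
 *   X86_PF_WRITE - access was a write
 *   X86_PF_USER  - access originated from user (CPL 3) mode
 *   X86_PF_RSVD  - a reserved bit was set in a paging-structure entry
 *   X86_PF_INSTR - fault was an instruction fetch
 *   X86_PF_PK    - fault was a protection-keys violation
 *   X86_PF_SGX   - fault was SGX-induced
 */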

/*
 * Returns -1 if the fault was handled by mmiotrace, 0 if mmiotrace is
 * disabled or did not claim the fault:
 */
static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
	if (unlikely(is_kmmio_active()))
		if (kmmio_handler(regs, addr) == 1)
			return -1;
	return 0;
}

/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on
 *   prefetch instructions.  Check for that here and ignore it.  This
 *   is AMD erratum #91.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
		      unsigned char opcode, int *prefetch)
{
	unsigned char instr_hi = opcode & 0xf0;
	unsigned char instr_lo = opcode & 0x0f;

	switch (instr_hi) {
	case 0x20:
	case 0x30:
		/*
		 * Values 0x26, 0x2E, 0x36 and 0x3E are valid x86
		 * segment-override prefixes:
		 */
		return ((instr_lo & 7) == 0x6);
#ifdef CONFIG_X86_64
	case 0x40:
		/*
		 * In 64-bit mode 0x40..0x4F are valid REX prefixes
		 */
		return (!user_mode(regs) || user_64bit_mode(regs));
#endif
	case 0x60:
		/* 0x64 thru 0x67 are valid prefixes in all modes. */
		return (instr_lo & 0xC) == 0x4;
	case 0xF0:
		/* 0xf0, 0xf2, and 0xf3 are valid prefixes in all modes. */
		return !instr_lo || (instr_lo>>1) == 1;
	case 0x00:
		/* Prefetch instruction is 0x0F0D or 0x0F18 */
		if (get_kernel_nofault(opcode, instr))
			return 0;

		*prefetch = (instr_lo == 0xF) &&
			(opcode == 0x0D || opcode == 0x18);
		return 0;
	default:
		return 0;
	}
}

static bool is_amd_k8_pre_npt(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
			c->x86_vendor == X86_VENDOR_AMD &&
			c->x86 == 0xf && c->x86_model < 0x40);
}

static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
	unsigned char *max_instr;
	unsigned char *instr;
	int prefetch = 0;

	/* Erratum #91 affects AMD K8, pre-NPT CPUs */
	if (!is_amd_k8_pre_npt())
		return 0;

	/*
	 * If this was an exec (instruction fetch) fault on an NX page,
	 * then do not ignore the fault:
	 */
	if (error_code & X86_PF_INSTR)
		return 0;

	instr = (void *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	/*
	 * Peeking at the faulting instruction may itself fault; disable
	 * page faults so that get_user()/get_kernel_nofault() simply
	 * fail instead of recursing into the fault handler.
	 */
	pagefault_disable();

	while (instr < max_instr) {
		unsigned char opcode;

		if (user_mode(regs)) {
			if (get_user(opcode, (unsigned char __user *) instr))
				break;
		} else {
			if (get_kernel_nofault(opcode, instr))
				break;
		}

		instr++;

		if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
			break;
	}

	pagefault_enable();
	return prefetch;
}
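
/*
 * pgd_list is the list of process page directories; pgd_lock protects
 * it.  arch_sync_kernel_mappings() below (32-bit only) walks this list
 * to copy new kernel mappings into each listed page table.
 */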
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	p4d_t *p4d, *p4d_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_p4d/set_pud.
	 */
	p4d = p4d_offset(pgd, address);
	p4d_k = p4d_offset(pgd_k, address);
	if (!p4d_present(*p4d_k))
		return NULL;

	pud = pud_offset(p4d, address);
	pud_k = pud_offset(p4d_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);

	if (pmd_present(*pmd) != pmd_present(*pmd_k))
		set_pmd(pmd, *pmd_k);

	if (!pmd_present(*pmd_k))
		return NULL;
	else
		BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));

	return pmd_k;
}

/*
 * Handle a fault on the vmalloc or module mapping area.
 *
 * This is needed because there is a race condition between the time
 * when the vmalloc mapping code updates the PMD to the point in time
 * where it synchronizes this update with the other page-tables in the
 * system.
 *
 * In this race window another thread/CPU can map an area on the same
 * PMD, finds it already present and does not synchronize it with the
 * rest of the system yet.  As a result v[mz]alloc might return areas
 * which are not mapped in every page-table in the system, causing an
 * unhandled page-fault when they are accessed.
 */
static noinline int vmalloc_fault(unsigned long address)
{
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3_pa();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;

	if (pmd_large(*pmd_k))
		return 0;

	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;

	return 0;
}
NOKPROBE_SYMBOL(vmalloc_fault);

void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start & PMD_MASK;
	     addr >= TASK_SIZE_MAX && addr < VMALLOC_END;
	     addr += PMD_SIZE) {
		struct page *page;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			spinlock_t *pgt_lock;

			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;

			spin_lock(pgt_lock);
			vmalloc_sync_one(page_address(page), addr);
			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

static bool low_pfn(unsigned long pfn)
{
	return pfn < max_low_pfn;
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3_pa());
	pgd_t *pgd = &base[pgd_index(address)];
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

#ifdef CONFIG_X86_PAE
	pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
		goto out;
#define pr_pde pr_cont
#else
#define pr_pde pr_info
#endif
	p4d = p4d_offset(pgd, address);
	pud = pud_offset(p4d, address);
	pmd = pmd_offset(pud, address);
	pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
#undef pr_pde

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already:
	 */
	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
	pr_cont("\n");
}

#else

#ifdef CONFIG_CPU_SUP_AMD
static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif

static int bad_address(void *p)
{
	unsigned long dummy;

	return get_kernel_nofault(dummy, (unsigned long *)p);
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3_pa());
	pgd_t *pgd = base + pgd_index(address);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (bad_address(pgd))
		goto bad;

	pr_info("PGD %lx ", pgd_val(*pgd));

	if (!pgd_present(*pgd))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (bad_address(p4d))
		goto bad;

	pr_cont("P4D %lx ", p4d_val(*p4d));
	if (!p4d_present(*p4d) || p4d_large(*p4d))
		goto out;

	pud = pud_offset(p4d, address);
	if (bad_address(pud))
		goto bad;

	pr_cont("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd))
		goto bad;

	pr_cont("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte))
		goto bad;

	pr_cont("PTE %lx", pte_val(*pte));
out:
	pr_cont("\n");
	return;
bad:
	pr_info("BAD\n");
}

#endif

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 *
 * A lot of BIOSes that didn't get tested properly miss this.
 *
 * The OS sees this as a page fault with the upper 32 bits of RIP
 * cleared.  Try to work around it here.
 *
 * Note we only handle faults in kernel here.
 * Does nothing on 32-bit.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
	    || boot_cpu_data.x86 != 0xf)
		return 0;

	if (user_mode(regs))
		return 0;

	if (address != regs->ip)
		return 0;

	if ((address >> 32) != 0)
		return 0;

	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		printk_once(errata93_warning);
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100:
 *
 * K8 in compat mode occasionally jumps to illegal addresses >4GB.  We
 * catch this in the page fault handler because these addresses are not
 * reachable.  Just detect this case and return.  Any code segment in
 * LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
		return 1;
#endif
	return 0;
}

/* Pentium F0 0F C7 C8 bug workaround: */
static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code,
		       unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) &&
	    idt_is_f00f_address(address)) {
		handle_invalid_op(regs);
		return 1;
	}
#endif
	return 0;
}
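
/*
 * Dump the GDT descriptor selected by @index (the LDTR or TR selector)
 * for the oops report: print its base and limit, or note that the
 * selector is NULL, out of bounds, or unreadable.
 */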
static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
{
	u32 offset = (index >> 3) * sizeof(struct desc_struct);
	unsigned long addr;
	struct ldttss_desc desc;

	if (index == 0) {
		pr_alert("%s: NULL\n", name);
		return;
	}

	if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
		pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
		return;
	}

	if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset),
				     sizeof(struct ldttss_desc))) {
		pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
			 name, index);
		return;
	}

	addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
#ifdef CONFIG_X86_64
	addr |= ((u64)desc.base3 << 32);
#endif
	pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
		 name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
}
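
/*
 * Print the detailed "BUG: unable to handle page fault" report for an
 * unhandled fault: the NX/SMEP diagnostics, the decoded error code,
 * descriptor table state if relevant, and the page table walk.
 */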
static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	if (!oops_may_print())
		return;

	if (error_code & X86_PF_INSTR) {
		unsigned int level;
		pgd_t *pgd;
		pte_t *pte;

		pgd = __va(read_cr3_pa());
		pgd += pgd_index(address);

		pte = lookup_address_in_pgd(pgd, address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
				from_kuid(&init_user_ns, current_uid()));
		if (pte && pte_present(*pte) && pte_exec(*pte) &&
		    (pgd_flags(*pgd) & _PAGE_USER) &&
		    (__read_cr4() & X86_CR4_SMEP))
			pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
				from_kuid(&init_user_ns, current_uid()));
	}

	if (address < PAGE_SIZE && !user_mode(regs))
		pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
			 (void *)address);
	else
		pr_alert("BUG: unable to handle page fault for address: %px\n",
			 (void *)address);

	pr_alert("#PF: %s %s in %s mode\n",
		 (error_code & X86_PF_USER)  ? "user" : "supervisor",
		 (error_code & X86_PF_INSTR) ? "instruction fetch" :
		 (error_code & X86_PF_WRITE) ? "write access" :
					       "read access",
		 user_mode(regs) ? "user" : "kernel");
	pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
		 !(error_code & X86_PF_PROT) ? "not-present page" :
		 (error_code & X86_PF_RSVD)  ? "reserved bit violation" :
		 (error_code & X86_PF_PK)    ? "protection keys violation" :
					       "permissions violation");

	if (!(error_code & X86_PF_USER) && user_mode(regs)) {
		struct desc_ptr idt, gdt;
		u16 ldtr, tr;

		/*
		 * This can happen for quite a few reasons.  The more
		 * obvious ones are faults accessing the GDT, or LDT.
		 * Perhaps surprisingly, if the CPU tries to deliver a
		 * benign or contributory exception from user code and
		 * gets a page fault during delivery, the page fault can
		 * be delivered as though it originated directly from
		 * user code.  This could happen due to wrong permissions
		 * on the IDT, GDT, LDT, TSS, or stack segment.
		 */
		store_idt(&idt);

		/* Usable even on Xen PV -- it's just slow. */
		native_store_gdt(&gdt);

		pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
			 idt.address, idt.size, gdt.address, gdt.size);

		store_ldt(ldtr);
		show_ldttss(&gdt, "LDTR", ldtr);

		store_tr(tr);
		show_ldttss(&gdt, "TR", tr);
	}

	dump_pagetable(address);
}

static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
	    unsigned long address)
{
	struct task_struct *tsk;
	unsigned long flags;
	int sig;

	flags = oops_begin();
	tsk = current;
	sig = SIGKILL;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);

	if (__die("Bad pagetable", regs, error_code))
		sig = 0;

	oops_end(flags, regs, sig);
}

static void sanitize_error_code(unsigned long address,
				unsigned long *error_code)
{
	/*
	 * To avoid leaking information about the kernel page
	 * table layout, pretend that user-mode accesses to
	 * kernel addresses are always protection faults.
	 *
	 * NB: This means that failed vsyscalls with vsyscall=none
	 * will have the PROT bit.  This doesn't leak any
	 * information and does not appear to cause any problems.
	 */
	if (address >= TASK_SIZE_MAX)
		*error_code |= X86_PF_PROT;
}
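
/*
 * Record the fault details (trap number, error code with X86_PF_USER
 * set, and the faulting address) in the task's thread struct for the
 * signal-delivery code.
 */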
static void set_signal_archinfo(unsigned long address,
				unsigned long error_code)
{
	struct task_struct *tsk = current;

	tsk->thread.trap_nr = X86_TRAP_PF;
	tsk->thread.error_code = error_code | X86_PF_USER;
	tsk->thread.cr2 = address;
}
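
/*
 * No way to recover from this fault: print the oops report and kill the
 * task (or panic).  Also handles the special cases of a fault on the
 * vmap'ed kernel stack guard page and of faults caused by buggy EFI
 * firmware.
 */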
static noinline void
page_fault_oops(struct pt_regs *regs, unsigned long error_code,
		unsigned long address)
{
#ifdef CONFIG_VMAP_STACK
	struct stack_info info;
#endif
	unsigned long flags;
	int sig;

	if (user_mode(regs)) {
		/*
		 * Implicit kernel access from user mode?  Skip the stack
		 * overflow and EFI special cases.
		 */
		goto oops;
	}

#ifdef CONFIG_VMAP_STACK
	/*
	 * Stack overflow?  During boot, we can fault near the initial
	 * stack in the direct map, but that's not an overflow -- check
	 * that we're in vmalloc space to avoid this.
	 */
	if (is_vmalloc_addr((void *)address) &&
	    get_stack_guard_info((void *)address, &info)) {
		/*
		 * We're likely to be running with very little stack space
		 * left.  It's plausible that we'd hit this condition but
		 * double-fault even before we get this far, in which case
		 * we're fine: the double-fault handler will deal with it.
		 *
		 * We don't want to make it all the way into the oops code
		 * and then double-fault, though, because we're likely to
		 * break the console driver and lose most of the stack dump.
		 */
		call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*),
			      handle_stack_overflow,
			      ASM_CALL_ARG3,
			      , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));

		unreachable();
	}
#endif

	/*
	 * Buggy firmware could access regions which might page fault.  If
	 * this happens, EFI has a special OOPS path that will try to
	 * avoid hanging the system.
	 */
	if (IS_ENABLED(CONFIG_EFI))
		efi_crash_gracefully_on_page_fault(address);

	/* Only not-present faults should be handled by KFENCE. */
	if (!(error_code & X86_PF_PROT) &&
	    kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
		return;

oops:
	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice:
	 */
	flags = oops_begin();

	show_fault_oops(regs, error_code, address);

	if (task_stack_end_corrupted(current))
		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

	sig = SIGKILL;
	if (__die("Oops", regs, error_code))
		sig = 0;

	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_DEFAULT "CR2: %016lx\n", address);

	oops_end(flags, regs, sig);
}
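
/*
 * A fault taken in kernel mode: try the exception-table fixup first
 * (optionally queueing a signal if the uaccess site requested one via
 * sig_on_uaccess_err), then check for a spurious PREFETCH fault, and
 * oops if nothing handled it.
 */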
static noinline void
kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
			 unsigned long address, int signal, int si_code,
			 u32 pkey)
{
	WARN_ON_ONCE(user_mode(regs));

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
		/*
		 * Any interrupt that takes a fault gets the fixup. This
		 * makes the recursive fault logic below only apply to
		 * faults from task context.
		 */
		if (in_interrupt())
			return;

		/*
		 * Per the above we're !in_interrupt(), aka. task context.
		 *
		 * In this case we need to make sure we're not recursively
		 * faulting through the emulate_vsyscall() logic.
		 */
		if (current->thread.sig_on_uaccess_err && signal) {
			sanitize_error_code(address, &error_code);

			set_signal_archinfo(address, error_code);

			if (si_code == SEGV_PKUERR) {
				force_sig_pkuerr((void __user *)address, pkey);
			} else {
				/* XXX: hwpoison faults will set the wrong code. */
				force_sig_fault(signal, si_code, (void __user *)address);
			}
		}

		/*
		 * Barring that, we can do the fixup and be happy.
		 */
		return;
	}

	/*
	 * AMD erratum #91 manifests as a spurious page fault on a PREFETCH
	 * instruction.
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	page_fault_oops(regs, error_code, address);
}

/*
 * Print out info about fatal segfaults, if the show_unhandled_signals
 * sysctl is set:
 */
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
		unsigned long address, struct task_struct *tsk)
{
	const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;

	if (!unhandled_signal(tsk, SIGSEGV))
		return;

	if (!printk_ratelimit())
		return;

	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
	       loglvl, tsk->comm, task_pid_nr(tsk), address,
	       (void *)regs->ip, (void *)regs->sp, error_code);

	print_vma_addr(KERN_CONT " in ", regs->ip);

	printk(KERN_CONT "\n");

	show_opcodes(regs, loglvl);
}

/*
 * The (legacy) vsyscall page is the one page in the kernel portion
 * of the address space that has user-accessible permissions.
 */
static bool is_vsyscall_vaddr(unsigned long vaddr)
{
	return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
}

static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		       unsigned long address, u32 pkey, int si_code)
{
	struct task_struct *tsk = current;

	if (!user_mode(regs)) {
		kernelmode_fixup_or_oops(regs, error_code, address,
					 SIGSEGV, si_code, pkey);
		return;
	}

	if (!(error_code & X86_PF_USER)) {
		/* Implicit user access to kernel memory -- just oops: */
		page_fault_oops(regs, error_code, address);
		return;
	}

	/*
	 * User mode accesses just cause a SIGSEGV.
	 * It's possible to have interrupts off here:
	 */
	local_irq_enable();

	/*
	 * Valid to do another page fault here because this one came
	 * from user space:
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	if (is_errata100(regs, address))
		return;

	sanitize_error_code(address, &error_code);

	if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
		return;

	if (likely(show_unhandled_signals))
		show_signal_msg(regs, error_code, address, tsk);

	set_signal_archinfo(address, error_code);

	if (si_code == SEGV_PKUERR)
		force_sig_pkuerr((void __user *)address, pkey);
	else
		force_sig_fault(SIGSEGV, si_code, (void __user *)address);

	local_irq_disable();
}

static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		     unsigned long address)
{
	__bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
}

static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address, u32 pkey, int si_code)
{
	struct mm_struct *mm = current->mm;
	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
	mmap_read_unlock(mm);

	__bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
}

static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	__bad_area(regs, error_code, address, 0, SEGV_MAPERR);
}

static inline bool bad_area_access_from_pkeys(unsigned long error_code,
		struct vm_area_struct *vma)
{
	/* This code is always called on the current mm */
	bool foreign = false;

	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
		return false;
	if (error_code & X86_PF_PK)
		return true;
	/* this checks permission keys on the VMA: */
	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
				       (error_code & X86_PF_INSTR), foreign))
		return true;
	return false;
}

static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
		      unsigned long address, struct vm_area_struct *vma)
{
	/*
	 * This OSPKE check is not strictly necessary at runtime.
	 * But, doing it this way allows compiler optimizations
	 * if pkeys are compiled out.
	 */
	if (bad_area_access_from_pkeys(error_code, vma)) {
		/*
		 * A protection key fault means that the PKRU value did not
		 * allow access to some PTE.  Userspace can figure out what
		 * PKRU was from the XSAVE state.  This function captures
		 * the pkey from the vma and passes it to userspace so
		 * userspace can discover which protection key was set on
		 * the PTE.
		 *
		 * If we get here, we know that the hardware signaled an
		 * X86_PF_PK fault and that there was a VMA once we got in
		 * the fault handler.  It does *not* guarantee that the VMA
		 * we find here was the one that we faulted on:
		 *
		 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
		 * 2. T1   : set PKRU to deny access to pkey=4, touches page
		 * 3. T1   : faults...
		 * 4. T2   : mprotect_key(foo, PAGE_SIZE, pkey=5);
		 * 5. T1   : enters fault handler, takes mmap_lock, etc...
		 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
		 *	     faulted on a pte with its pkey=4.
		 */
		u32 pkey = vma_pkey(vma);

		__bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
	} else {
		__bad_area(regs, error_code, address, 0, SEGV_ACCERR);
	}
}

static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
	  vm_fault_t fault)
{
	/* Kernel mode? Handle exceptions or die: */
	if (!user_mode(regs)) {
		kernelmode_fixup_or_oops(regs, error_code, address,
					 SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY);
		return;
	}

	/* User-space => ok to do another page fault: */
	if (is_prefetch(regs, error_code, address))
		return;

	sanitize_error_code(address, &error_code);

	if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
		return;

	set_signal_archinfo(address, error_code);

#ifdef CONFIG_MEMORY_FAILURE
	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
		struct task_struct *tsk = current;
		unsigned lsb = 0;

		pr_err(
	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
			tsk->comm, tsk->pid, address);
		if (fault & VM_FAULT_HWPOISON_LARGE)
			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
		if (fault & VM_FAULT_HWPOISON)
			lsb = PAGE_SHIFT;
		force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
		return;
	}
#endif
	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
}
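
/*
 * Check whether the access described by error_code is actually allowed
 * by the given page table entry.  If it is, the fault must have been
 * caused by a stale TLB entry and is spurious.
 */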
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
		return 0;

	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry.
 *
 * This allows us to lazily refresh the TLB when increasing the
 * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
 * eagerly is very expensive since that implies doing a full
 * cross-processor TLB flush, even if no stale TLB entries exist
 * on other processors.
 *
 * Spurious faults may only occur if the TLB contains an entry with
 * fewer permissions than the page table entry.  Non-present (P = 0)
 * and reserved bit (R = 1) faults are never spurious.
 *
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 *
 * Returns non-zero if a spurious fault was handled, zero otherwise.
 *
 * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
 * (Optional Invalidation).
 */
static noinline int
spurious_kernel_fault(unsigned long error_code, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int ret;

	/*
	 * Only writes to RO or instruction fetches from NX may cause
	 * spurious faults.
	 *
	 * These could be from user or supervisor accesses but the TLB
	 * is only lazily flushed after a kernel mapping protection
	 * change, so user accesses are not expected to cause spurious
	 * faults.
	 */
	if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
	    error_code != (X86_PF_INSTR | X86_PF_PROT))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		return 0;

	if (p4d_large(*p4d))
		return spurious_kernel_fault_check(error_code, (pte_t *) p4d);

	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud))
		return spurious_kernel_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return spurious_kernel_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	ret = spurious_kernel_fault_check(error_code, pte);
	if (!ret)
		return 0;

	/*
	 * Make sure we have permissions in PMD.
	 * If not, then there's a bug in the page tables:
	 */
	ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");

	return ret;
}
NOKPROBE_SYMBOL(spurious_kernel_fault);

int show_unhandled_signals = 1;

static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
	/* This is only called for the current mm, so: */
	bool foreign = false;

	/*
	 * Read or write was blocked by protection keys.  This is
	 * always an unconditional error and can never result in
	 * a follow-up action to resolve the fault, like a COW.
	 */
	if (error_code & X86_PF_PK)
		return 1;

	/*
	 * SGX hardware blocked the access.  The kernel cannot fix it
	 * up, so treat it as an access error and deliver SIGSEGV.
	 */
	if (unlikely(error_code & X86_PF_SGX))
		return 1;

	/*
	 * Make sure to check the VMA so that we do not perform
	 * faults just to hit a X86_PF_PK as soon as we fill in a
	 * page.
	 */
	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
				       (error_code & X86_PF_INSTR), foreign))
		return 1;

	if (error_code & X86_PF_WRITE) {
		/* write, present and write, not present: */
		if (unlikely(!(vma->vm_flags & VM_WRITE)))
			return 1;
		return 0;
	}

	/* read, present: */
	if (unlikely(error_code & X86_PF_PROT))
		return 1;

	/* read, not present: */
	if (unlikely(!vma_is_accessible(vma)))
		return 1;

	return 0;
}

bool fault_in_kernel_space(unsigned long address)
{
	/*
	 * On 64-bit systems, the vsyscall page is at an address above
	 * TASK_SIZE_MAX, but is not considered part of the kernel
	 * address space.
	 */
	if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
		return false;

	return address >= TASK_SIZE_MAX;
}

/*
 * Called for all faults where 'address' is part of the kernel address
 * space.  Might get called for faults that originate from *code* that
 * is in the kernel.
 */
static void
do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
		   unsigned long address)
{
	/*
	 * Protection keys exceptions only happen on user pages.  We
	 * have no user pages in the kernel portion of the address
	 * space, so do not expect them here.
	 */
	WARN_ON_ONCE(hw_error_code & X86_PF_PK);

#ifdef CONFIG_X86_32
	/*
	 * We can fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * Before doing this on-demand faulting, ensure that the
	 * fault is not any of the following:
	 * 1. A fault on a PTE with a reserved bit set.
	 * 2. A fault caused by a user-mode access.  (Do not demand-
	 *    fault kernel memory due to user-mode accesses).
	 * 3. A fault caused by a page-level protection violation.
	 *    (A demand fault would be on a non-present page which
	 *     would have X86_PF_PROT==0).
	 *
	 * This is only needed to close a race condition on x86-32 in
	 * the vmalloc mapping/unmapping code. See the comment before
	 * vmalloc_fault() for details. On x86-64 the race does not
	 * exist as the vmalloc mappings don't need to be synchronized
	 * there.
	 */
	if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
		if (vmalloc_fault(address) >= 0)
			return;
	}
#endif

	if (is_f00f_bug(regs, hw_error_code, address))
		return;

	/* Was the fault spurious, caused by lazy TLB invalidation? */
	if (spurious_kernel_fault(hw_error_code, address))
		return;

	/* kprobes don't want to hook the spurious faults: */
	if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
		return;

	/*
	 * Note, despite being a "bad area", there are quite a few
	 * acceptable reasons to get here, such as erratum fixups
	 * and handling kernel code that can fault, like get_user().
	 *
	 * Don't kick the oops_begin()/oops_end() machinery just yet.
	 */
	bad_area_nosemaphore(regs, hw_error_code, address);
}
NOKPROBE_SYMBOL(do_kern_addr_fault);

/*
 * Handle faults in the user portion of the address space.  Nothing in
 * here should check X86_PF_USER without a specific justification: for
 * almost all purposes, a normal kernel access to user memory (e.g.
 * get_user(), put_user()) should be treated the same as a user-mode
 * access.  The one exception is AC flag handling, which is special for
 * WRUSS per the x86 architecture.
 */
static inline
void do_user_addr_fault(struct pt_regs *regs,
			unsigned long error_code,
			unsigned long address)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct mm_struct *mm;
	vm_fault_t fault;
	unsigned int flags = FAULT_FLAG_DEFAULT;

	tsk = current;
	mm = tsk->mm;

	if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) {
		/*
		 * Whoops, this is kernel mode code trying to execute from
		 * user memory.  Unless this is AMD erratum #93, which
		 * corrupts RIP such that it looks like a user address,
		 * this is unrecoverable.  Don't even try to look up the
		 * VMA or look for extable entries.
		 */
		if (is_errata93(regs, address))
			return;

		page_fault_oops(regs, error_code, address);
		return;
	}

	/* kprobes don't want to hook the spurious faults: */
	if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
		return;

	/*
	 * Reserved bits are never expected to be set on
	 * entries in the user portion of the page tables.
	 */
	if (unlikely(error_code & X86_PF_RSVD))
		pgtable_bad(regs, error_code, address);

	/*
	 * If SMAP is on, check for invalid kernel (supervisor) access to
	 * user pages in the user address space: a supervisor access with
	 * EFLAGS.AC clear and X86_PF_USER clear is not allowed to touch
	 * user memory.
	 */
	if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
		     !(error_code & X86_PF_USER) &&
		     !(regs->flags & X86_EFLAGS_AC))) {
		/*
		 * No extable entry here.  This was a kernel access to an
		 * invalid pointer.  get_kernel_nofault() will not get here.
		 */
		page_fault_oops(regs, error_code, address);
		return;
	}

	/*
	 * If we're in an interrupt, have no user context or are running
	 * in a region with pagefaults disabled then we must not take
	 * the fault.
	 */
	if (unlikely(faulthandler_disabled() || !mm)) {
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	/*
	 * It's safe to allow irq's after cr2 has been saved and the
	 * vmalloc fault has been handled.
	 *
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet:
	 */
	if (user_mode(regs)) {
		local_irq_enable();
		flags |= FAULT_FLAG_USER;
	} else {
		if (regs->flags & X86_EFLAGS_IF)
			local_irq_enable();
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

	if (error_code & X86_PF_WRITE)
		flags |= FAULT_FLAG_WRITE;
	if (error_code & X86_PF_INSTR)
		flags |= FAULT_FLAG_INSTRUCTION;

#ifdef CONFIG_X86_64
	/*
	 * Faults in the vsyscall page might need emulation.  The
	 * vsyscall page is at an address above TASK_SIZE_MAX, but is
	 * considered to be part of the user address space.
	 *
	 * The vsyscall page does not have a "real" VMA, so do this
	 * emulation before we go searching for VMAs.
	 *
	 * PKRU never rejects instruction fetches, so we don't need
	 * to consider the PF_PK bit.
	 */
	if (is_vsyscall_vaddr(address)) {
		if (emulate_vsyscall(error_code, regs, address))
			return;
	}
#endif

	/*
	 * Kernel-mode access to the user address space should only occur
	 * on well-defined single instructions listed in the exception
	 * tables.  But, an erroneous kernel fault occurring outside one of
	 * those areas which also holds mmap_lock might deadlock attempting
	 * to validate the fault against the address space.
	 *
	 * Only do the expensive exception table search when we might be at
	 * risk of a deadlock.  This happens if we
	 * 1. Failed to acquire mmap_lock, and
	 * 2. The access did not originate in userspace.
	 */
	if (unlikely(!mmap_read_trylock(mm))) {
		if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
			/*
			 * Fault from code in kernel from
			 * which we do not expect faults.
			 */
			bad_area_nosemaphore(regs, error_code, address);
			return;
		}
retry:
		mmap_read_lock(mm);
	} else {
		/*
		 * The above down_read_trylock() might have succeeded in
		 * which case we'll have missed the might_sleep() from
		 * down_read():
		 */
		might_sleep();
	}

	vma = find_vma(mm, address);
	if (unlikely(!vma)) {
		bad_area(regs, error_code, address);
		return;
	}
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
		bad_area(regs, error_code, address);
		return;
	}
	if (unlikely(expand_stack(vma, address))) {
		bad_area(regs, error_code, address);
		return;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	if (unlikely(access_error(error_code, vma))) {
		bad_area_access_error(regs, error_code, address, vma);
		return;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
	 * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked.
	 *
	 * Note that handle_mm_fault() may also release the mmap_lock on
	 * its own (VM_FAULT_COMPLETED, or when a fatal signal is
	 * pending), so the paths below have to account for that.
	 */
	fault = handle_mm_fault(vma, address, flags, regs);

	if (fault_signal_pending(fault, regs)) {
		/*
		 * Quick path to respond to signals.  The core mm code
		 * has unlocked the mm for us if we get here.
		 */
		if (!user_mode(regs))
			kernelmode_fixup_or_oops(regs, error_code, address,
						 SIGBUS, BUS_ADRERR,
						 ARCH_DEFAULT_PKEY);
		return;
	}

	/* The fault is fully completed (including releasing mmap lock) */
	if (fault & VM_FAULT_COMPLETED)
		return;

	/*
	 * If we need to retry, the mmap_lock has already been released;
	 * retry with FAULT_FLAG_TRIED so the core mm code knows this is
	 * the second attempt.
	 */
	if (unlikely(fault & VM_FAULT_RETRY)) {
		flags |= FAULT_FLAG_TRIED;
		goto retry;
	}

	mmap_read_unlock(mm);
	if (likely(!(fault & VM_FAULT_ERROR)))
		return;

	if (fatal_signal_pending(current) && !user_mode(regs)) {
		kernelmode_fixup_or_oops(regs, error_code, address,
					 0, 0, ARCH_DEFAULT_PKEY);
		return;
	}

	if (fault & VM_FAULT_OOM) {
		/* Kernel mode? Handle exceptions or die: */
		if (!user_mode(regs)) {
			kernelmode_fixup_or_oops(regs, error_code, address,
						 SIGSEGV, SEGV_MAPERR,
						 ARCH_DEFAULT_PKEY);
			return;
		}

		/*
		 * We ran out of memory, call the OOM killer, and return
		 * to userspace (which will retry the fault, or kill us
		 * if we got oom-killed):
		 */
		pagefault_out_of_memory();
	} else {
		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
			     VM_FAULT_HWPOISON_LARGE))
			do_sigbus(regs, error_code, address, fault);
		else if (fault & VM_FAULT_SIGSEGV)
			bad_area_nosemaphore(regs, error_code, address);
		else
			BUG();
	}
}
NOKPROBE_SYMBOL(do_user_addr_fault);
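
/*
 * Emit the page_fault_user/page_fault_kernel tracepoints for this fault,
 * if page fault tracing is enabled.
 */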
static __always_inline void
trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
			 unsigned long address)
{
	if (!trace_pagefault_enabled())
		return;

	if (user_mode(regs))
		trace_page_fault_user(address, regs, error_code);
	else
		trace_page_fault_kernel(address, regs, error_code);
}
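
/*
 * Common page fault dispatch: hand MMIO-trace faults to kmmio first,
 * then route the fault to the kernel-address or user-address handler
 * based on the faulting address.
 */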
static __always_inline void
handle_page_fault(struct pt_regs *regs, unsigned long error_code,
		  unsigned long address)
{
	trace_page_fault_entries(regs, error_code, address);

	if (unlikely(kmmio_fault(regs, address)))
		return;

	/* Was the fault on kernel-controlled part of the address space? */
	if (unlikely(fault_in_kernel_space(address))) {
		do_kern_addr_fault(regs, error_code, address);
	} else {
		do_user_addr_fault(regs, error_code, address);
		/*
		 * User address page fault handling might have reenabled
		 * interrupts.  Fixing up all potential exit points of
		 * do_user_addr_fault() and its leaf functions is just not
		 * possible, so disable interrupts here before returning
		 * to the idtentry exit path.
		 */
		local_irq_disable();
	}
}

DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
{
	unsigned long address = read_cr2();
	irqentry_state_t state;

	prefetchw(&current->mm->mmap_lock);

	/*
	 * KVM uses the #PF vector to deliver 'page not present' events to
	 * guests (asynchronous page fault mechanism).  The event happens
	 * when a guest task accesses memory which is valid from the
	 * guest's point of view but not currently mapped by the host
	 * (e.g. swapped out).  The corresponding "page ready" event is
	 * delivered via an interrupt, not via #PF.
	 *
	 * We rely on the interrupted context being sane (valid RSP,
	 * relevant locks not held, etc.), which is fine as long as the
	 * interrupted context had IF=1.  We also rely on the KVM async pf
	 * type field and CR2 being read consistently, so that values from
	 * real and async page faults do not get mixed up.
	 *
	 * The async #PF handling code takes care of idtentry handling
	 * itself.
	 */
	if (kvm_handle_async_pf(regs, (u32)address))
		return;

	/*
	 * Entry handling for a valid #PF from kernel mode is slightly
	 * different: RCU is already watching and the irq-entry RCU hooks
	 * must not be invoked, because a kernel fault on a user space
	 * address might sleep.
	 *
	 * In case the fault hit an RCU idle region, the conditional entry
	 * code re-enables RCU to avoid subsequent wreckage, which helps
	 * debuggability.
	 */
	state = irqentry_enter(regs);

	instrumentation_begin();
	handle_page_fault(regs, error_code, address);
	instrumentation_end();

	irqentry_exit(regs, state);
}