// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  PowerPC version
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  Derived from "arch/i386/mm/fault.c"
 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Modified by Cort Dougan and Paul Mackerras.
 *
 *  Modified for PPC64 by Dave Engebretsen (engebret@ibm.com)
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/pagemap.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/highmem.h>
#include <linux/extable.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/perf_event.h>
#include <linux/ratelimit.h>
#include <linux/context_tracking.h>
#include <linux/hugetlb.h>
#include <linux/uaccess.h>
#include <linux/kfence.h>
#include <linux/pkeys.h>

#include <asm/firmware.h>
#include <asm/interrupt.h>
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/siginfo.h>
#include <asm/debug.h>
#include <asm/kup.h>
#include <asm/inst.h>

/*
 * do_page_fault error handling helpers
 */

static int
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code)
{
	/*
	 * If we are in kernel mode, bail out with a SEGV, this will
	 * be caught by the assembly which will restore the non-volatile
	 * registers before calling bad_page_fault()
	 */
	if (!user_mode(regs))
		return SIGSEGV;

	_exception(SIGSEGV, regs, si_code, address);

	return 0;
}

static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address)
{
	return __bad_area_nosemaphore(regs, address, SEGV_MAPERR);
}

static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code)
{
	struct mm_struct *mm = current->mm;

	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
	mmap_read_unlock(mm);

	return __bad_area_nosemaphore(regs, address, si_code);
}

static noinline int bad_area(struct pt_regs *regs, unsigned long address)
{
	return __bad_area(regs, address, SEGV_MAPERR);
}

static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address,
				    struct vm_area_struct *vma)
{
	struct mm_struct *mm = current->mm;
	int pkey;

	/*
	 * We don't try to fetch the pkey from page table because reading
	 * page table without locking doesn't guarantee stable pte value.
	 * Hence the pkey value that we return to userspace can be different
	 * from the pkey that actually caused access error.
	 *
	 * It does *not* guarantee that the VMA we find here
	 * was the one that we faulted on.
	 *
	 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
	 * 2. T1   : set AMR to deny access to pkey=4, touches page
	 * 3. T1   : faults...
	 * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
	 * 5. T1   : enters fault handler, takes mmap_lock, etc...
	 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
	 *	     faulted on a pte with its pkey=4.
	 */
	pkey = vma_pkey(vma);

	mmap_read_unlock(mm);

	/*
	 * If we are in kernel mode, bail out with a SEGV, this will
	 * be caught by the assembly which will restore the non-volatile
	 * registers before calling bad_page_fault()
	 */
	if (!user_mode(regs))
		return SIGSEGV;

	_exception_pkey(regs, address, pkey);

	return 0;
}

static noinline int bad_access(struct pt_regs *regs, unsigned long address)
{
	return __bad_area(regs, address, SEGV_ACCERR);
}

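/*
 * Deliver SIGBUS for a bus-error class fault. Hardware poisoned pages are
 * reported as BUS_MCEERR_AR so userspace can tell memory failure apart
 * from an ordinary bad access.
 */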
static int do_sigbus(struct pt_regs *regs, unsigned long address,
		     vm_fault_t fault)
{
	if (!user_mode(regs))
		return SIGBUS;

	current->thread.trap_nr = BUS_ADRERR;
#ifdef CONFIG_MEMORY_FAILURE
	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
		unsigned int lsb = 0; /* shift information */

		pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
			current->comm, current->pid, address);

		if (fault & VM_FAULT_HWPOISON_LARGE)
			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
		if (fault & VM_FAULT_HWPOISON)
			lsb = PAGE_SHIFT;

		force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
		return 0;
	}

#endif
	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
	return 0;
}

static int mm_fault_error(struct pt_regs *regs, unsigned long addr,
				vm_fault_t fault)
{
	/*
	 * Kernel page fault interrupted by SIGKILL. We have no reason to
	 * continue processing.
	 */
	if (fatal_signal_pending(current) && !user_mode(regs))
		return SIGKILL;

	/* Out of memory */
	if (fault & VM_FAULT_OOM) {
		/*
		 * We ran out of memory, or some other thing happened to us that
		 * made us unable to handle the page fault gracefully.
		 */
		if (!user_mode(regs))
			return SIGSEGV;
		pagefault_out_of_memory();
	} else {
		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
			     VM_FAULT_HWPOISON_LARGE))
			return do_sigbus(regs, addr, fault);
		else if (fault & VM_FAULT_SIGSEGV)
			return bad_area_nosemaphore(regs, addr);
		else
			BUG();
	}
	return 0;
}

/* Is this a bad kernel fault ? */
static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
			     unsigned long address, bool is_write)
{
	int is_exec = TRAP(regs) == INTERRUPT_INST_STORAGE;

	if (is_exec) {
		pr_crit_ratelimited("kernel tried to execute %s page (%lx) - exploit attempt? (uid: %d)\n",
				    address >= TASK_SIZE ? "exec-protected" : "user",
				    address,
				    from_kuid(&init_user_ns, current_uid()));

		// Kernel exec fault is always bad
		return true;
	}

	// Kernel fault on kernel address is always bad
	if (address >= TASK_SIZE)
		return true;

	// Read/write fault blocked by KUAP is bad, it can never succeed.
	if (bad_kuap_fault(regs, address, is_write)) {
		pr_crit_ratelimited("Kernel attempted to %s user page (%lx) - exploit attempt? (uid: %d)\n",
				    is_write ? "write" : "read", address,
				    from_kuid(&init_user_ns, current_uid()));

		// Fault on user outside of certain regions (eg. copy_tofrom_user()) is bad
		if (!search_exception_tables(regs->nip))
			return true;

		// Read/write fault in a valid region (the exception table search passed
		// above), but blocked by KUAP is bad, it can never succeed.
		return WARN(true, "Bug: %s fault blocked by KUAP!", is_write ? "Write" : "Read");
	}

	// What's left? Kernel fault on user and allowed by KUAP in the faulting context.
	return false;
}

static bool access_pkey_error(bool is_write, bool is_exec, bool is_pkey,
			      struct vm_area_struct *vma)
{
	/*
	 * Make sure to check the VMA so that we do not perform
	 * faults just to hit a pkey fault as soon as we fill in a
	 * page. Only called for current mm, hence foreign == 0
	 */
	if (!arch_vma_access_permitted(vma, is_write, is_exec, 0))
		return true;

	return false;
}

static bool access_error(bool is_write, bool is_exec, struct vm_area_struct *vma)
{
	/*
	 * Allow execution from readable areas if the MMU does not
	 * provide separate controls over reading and executing.
	 *
	 * Note: That code used to not be enabled for 4xx/BookE.
	 * It is now as I/D cache coherency for these is done at
	 * set_pte_at() time and I see no reason why the test
	 * below wouldn't be valid on those processors. This -may-
	 * break programs compiled with a really old ABI though.
	 */
	if (is_exec) {
		return !(vma->vm_flags & VM_EXEC) &&
			(cpu_has_feature(CPU_FTR_NOEXECUTE) ||
			 !(vma->vm_flags & (VM_READ | VM_WRITE)));
	}

	if (is_write) {
		if (unlikely(!(vma->vm_flags & VM_WRITE)))
			return true;
		return false;
	}

	if (unlikely(!vma_is_accessible(vma)))
		return true;
	/*
	 * We should ideally do the vma pkey access check here. But in the
	 * fault path, handle_mm_fault() also does the same check. To avoid
	 * these multiple checks, we skip it here and handle access error due
	 * to pkeys later.
	 */
	return false;
}

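/*
 * With Cooperative Memory Overcommit (CMO) on a shared-processor LPAR,
 * firmware expects the OS to count page-ins in the big-endian page_ins
 * field of the lppaca; preemption is disabled so the update stays on
 * this CPU's lppaca.
 */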
#ifdef CONFIG_PPC_SMLPAR
static inline void cmo_account_page_fault(void)
{
	if (firmware_has_feature(FW_FEATURE_CMO)) {
		u32 page_ins;

		preempt_disable();
		page_ins = be32_to_cpu(get_lppaca()->page_ins);
		page_ins += 1 << PAGE_FACTOR;
		get_lppaca()->page_ins = cpu_to_be32(page_ins);
		preempt_enable();
	}
}
#else
static inline void cmo_account_page_fault(void) { }
#endif /* CONFIG_PPC_SMLPAR */

static void sanity_check_fault(bool is_write, bool is_user,
			       unsigned long error_code, unsigned long address)
{
	/*
	 * Userspace trying to access kernel address, we get PROTFAULT for that.
	 */
	if (is_user && address >= TASK_SIZE) {
		if ((long)address == -1)
			return;

		pr_crit_ratelimited("%s[%d]: User access of kernel address (%lx) - exploit attempt? (uid: %d)\n",
				    current->comm, current->pid, address,
				    from_kuid(&init_user_ns, current_uid()));
		return;
	}

	if (!IS_ENABLED(CONFIG_PPC_BOOK3S))
		return;

	/*
	 * For hash translation mode, we should never get a
	 * PROTFAULT. Any update to pte to reduce access will result in us
	 * removing the hash page table entry, thus resulting in a DSISR_NOHPTE
	 * fault instead of DSISR_PROTFAULT.
	 *
	 * A pte update to relax the access will not result in a hash page table
	 * entry invalidate and hence can result in DSISR_PROTFAULT.
	 * ptep_set_access_flags() doesn't do a hpte flush. This is why we have
	 * the special !is_write in the below conditional.
	 *
	 * For platforms that don't support coherent icache but do support a
	 * per-page noexec bit, the D/I cache sync is driven via faults, but
	 * that is handled by the low level hash fault code
	 * (hash_page_do_lazy_icache()) and should never reach here.
	 *
	 * On radix, a PROTFAULT can be legitimate (e.g. autonuma marks page
	 * table entries inaccessible), and a write PROTFAULT is legitimate
	 * under hash too, so only warn about read faults when hash
	 * translation is active.
	 */
	if (radix_enabled() || is_write)
		return;

	WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
}

/*
 * Define the correct "is_write" bit in error_code based
 * on the processor family
 */
#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
#define page_fault_is_write(__err)	((__err) & ESR_DST)
#else
#define page_fault_is_write(__err)	((__err) & DSISR_ISSTORE)
#endif

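/*
 * error_code bits that flag a fault the handler can never service
 * (e.g. no-execute or guarded-storage violations); BookE defines none.
 */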
#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
#define page_fault_is_bad(__err)	(0)
#elif defined(CONFIG_PPC_8xx)
#define page_fault_is_bad(__err)	((__err) & DSISR_NOEXEC_OR_G)
#elif defined(CONFIG_PPC64)
#define page_fault_is_bad(__err)	((__err) & DSISR_BAD_FAULT_64S)
#else
#define page_fault_is_bad(__err)	((__err) & DSISR_BAD_FAULT_32S)
#endif

/*
 * For 600- and 800-family processors, the error_code parameter is DSISR
 * for a data fault, SRR1 for an instruction fault.
 * For 400-family processors the error_code parameter is ESR for a data fault,
 * 0 for an instruction fault.
 * For 64-bit processors, the error_code parameter is DSISR for a data access
 * fault, SRR1 & 0x08000000 for an instruction access fault.
 *
 * The return value is 0 if the fault was handled, or the signal
 * number if this is a kernel fault that can't be handled here.
 */
static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
			    unsigned long error_code)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	unsigned int flags = FAULT_FLAG_DEFAULT;
	int is_exec = TRAP(regs) == INTERRUPT_INST_STORAGE;
	int is_user = user_mode(regs);
	int is_write = page_fault_is_write(error_code);
	vm_fault_t fault, major = 0;
	bool kprobe_fault = kprobe_page_fault(regs, 11);

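	/* Give the debugger and kprobes a chance to handle the fault first */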
	if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
		return 0;

	if (unlikely(page_fault_is_bad(error_code))) {
		if (is_user) {
			_exception(SIGBUS, regs, BUS_OBJERR, address);
			return 0;
		}
		return SIGBUS;
	}

	/* Additional sanity check(s) */
	sanity_check_fault(is_write, is_user, error_code, address);

	/*
	 * The kernel should never take an execute fault nor should it
	 * take a page fault to a kernel address or a page fault to a user
	 * address outside of dedicated places
	 */
	if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address, is_write))) {
		if (kfence_handle_page_fault(address, is_write, regs))
			return 0;

		return SIGSEGV;
	}

	/*
	 * If we're in an interrupt, have no user context or are running
	 * in a region with pagefaults disabled then we must not take the fault
	 */
	if (unlikely(faulthandler_disabled() || !mm)) {
		if (is_user)
			printk_ratelimited(KERN_ERR "Page fault in user mode"
					   " with faulthandler_disabled()=%d"
					   " mm=%p\n",
					   faulthandler_disabled(), mm);
		return bad_area_nosemaphore(regs, address);
	}

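	/* Re-enable interrupts, but only if they were on in the interrupted context */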
	interrupt_cond_local_irq_enable(regs);

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

	/*
	 * We want to do this outside mmap_lock, because reading code around nip
	 * can result in fault, which will cause a deadlock when called with
	 * mmap_lock held
	 */
	if (is_user)
		flags |= FAULT_FLAG_USER;
	if (is_write)
		flags |= FAULT_FLAG_WRITE;
	if (is_exec)
		flags |= FAULT_FLAG_INSTRUCTION;

	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_lock
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (unlikely(!mmap_read_trylock(mm))) {
		if (!is_user && !search_exception_tables(regs->nip))
			return bad_area_nosemaphore(regs, address);

retry:
		mmap_read_lock(mm);
	} else {
		/*
		 * The above down_read_trylock() might have succeeded in
		 * which case we'll have missed the might_sleep() from
		 * down_read():
		 */
		might_sleep();
	}

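	/*
	 * find_vma() returns the first VMA ending above the address, so the
	 * fault may still be below vm_start; that is only acceptable for a
	 * stack area that can grow down.
	 */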
	vma = find_vma(mm, address);
	if (unlikely(!vma))
		return bad_area(regs, address);

	if (unlikely(vma->vm_start > address)) {
		if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
			return bad_area(regs, address);

		if (unlikely(expand_stack(vma, address)))
			return bad_area(regs, address);
	}

	if (unlikely(access_pkey_error(is_write, is_exec,
				       (error_code & DSISR_KEYFAULT), vma)))
		return bad_access_pkey(regs, address, vma);

	if (unlikely(access_error(is_write, is_exec, vma)))
		return bad_access(regs, address);

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(vma, address, flags, regs);

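	/*
	 * Remember if this was ever a major fault: when the fault is retried
	 * the flag may not be reported again, and the CMO accounting at "out"
	 * wants to know whether any attempt was major.
	 */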
	major |= fault & VM_FAULT_MAJOR;

	if (fault_signal_pending(fault, regs))
		return user_mode(regs) ? 0 : SIGBUS;

	/* The fault is fully completed (including releasing mmap lock) */
	if (fault & VM_FAULT_COMPLETED)
		goto out;

	/*
	 * Handle the retry right now, the mmap_lock has been released in that
	 * case.
	 */
	if (unlikely(fault & VM_FAULT_RETRY)) {
		flags |= FAULT_FLAG_TRIED;
		goto retry;
	}

	mmap_read_unlock(current->mm);

	if (unlikely(fault & VM_FAULT_ERROR))
		return mm_fault_error(regs, address, fault);

out:
	/*
	 * Major/minor page fault accounting.
	 */
	if (major)
		cmo_account_page_fault();

	return 0;
}
NOKPROBE_SYMBOL(___do_page_fault);

static __always_inline void __do_page_fault(struct pt_regs *regs)
{
	long err;

	err = ___do_page_fault(regs, regs->dar, regs->dsisr);
	if (unlikely(err))
		bad_page_fault(regs, err);
}

DEFINE_INTERRUPT_HANDLER(do_page_fault)
{
	__do_page_fault(regs);
}

#ifdef CONFIG_PPC_BOOK3S_64
/* Same as do_page_fault but interrupt entry has already run in do_hash_fault */
void hash__do_page_fault(struct pt_regs *regs)
{
	__do_page_fault(regs);
}
NOKPROBE_SYMBOL(hash__do_page_fault);
#endif

/*
 * bad_page_fault is called when we have a bad access from the kernel.
 * It is called from the DSI and ISI handlers in head.S and from some
 * of the procedures in traps.c.
 */
static void __bad_page_fault(struct pt_regs *regs, int sig)
{
	int is_write = page_fault_is_write(regs->dsisr);
	const char *msg;

	/* kernel has accessed a bad area */

	if (regs->dar < PAGE_SIZE)
		msg = "Kernel NULL pointer dereference";
	else
		msg = "Unable to handle kernel data access";

	switch (TRAP(regs)) {
	case INTERRUPT_DATA_STORAGE:
	case INTERRUPT_H_DATA_STORAGE:
		pr_alert("BUG: %s on %s at 0x%08lx\n", msg,
			 is_write ? "write" : "read", regs->dar);
		break;
	case INTERRUPT_DATA_SEGMENT:
		pr_alert("BUG: %s at 0x%08lx\n", msg, regs->dar);
		break;
	case INTERRUPT_INST_STORAGE:
	case INTERRUPT_INST_SEGMENT:
		pr_alert("BUG: Unable to handle kernel instruction fetch%s",
			 regs->nip < PAGE_SIZE ? " (NULL pointer?)\n" : "\n");
		break;
	case INTERRUPT_ALIGNMENT:
		pr_alert("BUG: Unable to handle kernel unaligned access at 0x%08lx\n",
			 regs->dar);
		break;
	default:
		pr_alert("BUG: Unable to handle unknown paging fault at 0x%08lx\n",
			 regs->dar);
		break;
	}
	printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
		regs->nip);

	if (task_stack_end_corrupted(current))
		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");

	die("Kernel access of bad area", regs, sig);
}

void bad_page_fault(struct pt_regs *regs, int sig)
{
	const struct exception_table_entry *entry;

	/* Are we prepared to handle this fault?  */
	entry = search_exception_tables(instruction_pointer(regs));
	if (entry)
		instruction_pointer_set(regs, extable_fixup(entry));
	else
		__bad_page_fault(regs, sig);
}

#ifdef CONFIG_PPC_BOOK3S_64
DEFINE_INTERRUPT_HANDLER(do_bad_page_fault_segv)
{
	bad_page_fault(regs, SIGSEGV);
}

/*
 * In radix mode, segment interrupts indicate that the EA is not addressable
 * by the page table geometry, i.e. the address is simply invalid, so this is
 * handled like a bad page fault.
 *
 * In hash mode this is reached when the low level SLB fault handler failed;
 * its error code arrives in regs->result: -EFAULT for a bogus effective
 * address, -EINVAL for a fault it cannot recover from.
 */
DEFINE_INTERRUPT_HANDLER(do_bad_segment_interrupt)
{
	int err = regs->result;

	if (err == -EFAULT) {
		if (user_mode(regs))
			_exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar);
		else
			bad_page_fault(regs, SIGSEGV);
	} else if (err == -EINVAL) {
		unrecoverable_exception(regs);
	} else {
		BUG();
	}
}
#endif