0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #include <linux/cpu.h>
0019 #include <linux/errno.h>
0020 #include <linux/sched.h>
0021 #include <linux/sched/task.h>
0022 #include <linux/sched/task_stack.h>
0023 #include <linux/fs.h>
0024 #include <linux/kernel.h>
0025 #include <linux/mm.h>
0026 #include <linux/elfcore.h>
0027 #include <linux/smp.h>
0028 #include <linux/slab.h>
0029 #include <linux/user.h>
0030 #include <linux/interrupt.h>
0031 #include <linux/delay.h>
0032 #include <linux/export.h>
0033 #include <linux/ptrace.h>
0034 #include <linux/notifier.h>
0035 #include <linux/kprobes.h>
0036 #include <linux/kdebug.h>
0037 #include <linux/prctl.h>
0038 #include <linux/uaccess.h>
0039 #include <linux/io.h>
0040 #include <linux/ftrace.h>
0041 #include <linux/syscalls.h>
0042
0043 #include <asm/processor.h>
0044 #include <asm/pkru.h>
0045 #include <asm/fpu/sched.h>
0046 #include <asm/mmu_context.h>
0047 #include <asm/prctl.h>
0048 #include <asm/desc.h>
0049 #include <asm/proto.h>
0050 #include <asm/ia32.h>
0051 #include <asm/debugreg.h>
0052 #include <asm/switch_to.h>
0053 #include <asm/xen/hypervisor.h>
0054 #include <asm/vdso.h>
0055 #include <asm/resctrl.h>
0056 #include <asm/unistd.h>
0057 #include <asm/fsgsbase.h>
0058 #ifdef CONFIG_IA32_EMULATION
0059
0060 #include <asm/unistd_32_ia32.h>
0061 #endif
0062
0063 #include "process.h"
0064
0065
/*
 * Dump the register state in @regs for oops/debug output, optionally
 * augmented with live segment, control and debug register state.
 *
 * @regs:    saved register state to print
 * @mode:    SHOW_REGS_SHORT - general purpose registers only
 *           SHOW_REGS_USER  - GPRs plus the user FS/GS base MSRs
 *           otherwise       - full dump: segments, CR0-CR4 and any
 *                             debug registers not in their reset state
 * @log_lvl: printk log-level prefix applied to every emitted line
 */
void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
		 const char *log_lvl)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, es;

	show_iret_regs(regs, log_lvl);

	/* orig_ax is -1 when not entered via a syscall; omit it then. */
	if (regs->orig_ax != -1)
		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
	else
		pr_cont("\n");

	printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
	       log_lvl, regs->ax, regs->bx, regs->cx);
	printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
	       log_lvl, regs->dx, regs->si, regs->di);
	printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
	       log_lvl, regs->bp, regs->r8, regs->r9);
	printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
	       log_lvl, regs->r10, regs->r11, regs->r12);
	printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
	       log_lvl, regs->r13, regs->r14, regs->r15);

	if (mode == SHOW_REGS_SHORT)
		return;

	if (mode == SHOW_REGS_USER) {
		/* Report only the user-visible FS/GS bases from the MSRs. */
		rdmsrl(MSR_FS_BASE, fs);
		rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
		printk("%sFS: %016lx GS: %016lx\n",
		       log_lvl, fs, shadowgs);
		return;
	}

	/* Read the live segment selectors off the CPU. */
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = __read_cr3();
	cr4 = __read_cr4();

	printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
	printk("%sCS: %04lx DS: %04x ES: %04x CR0: %016lx\n",
	       log_lvl, regs->cs, ds, es, cr0);
	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
	       log_lvl, cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);

	/* Only print the debug registers if they are in use. */
	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
	      (d6 == DR6_RESERVED) && (d7 == 0x400))) {
		printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
		       log_lvl, d0, d1, d2);
		printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
		       log_lvl, d3, d6, d7);
	}

	/* Protection-key state, only meaningful when OSPKE is enabled. */
	if (cpu_feature_enabled(X86_FEATURE_OSPKE))
		printk("%sPKRU: %08x\n", log_lvl, read_pkru());
}
0143
0144 void release_thread(struct task_struct *dead_task)
0145 {
0146 WARN_ON(dead_task->mm);
0147 }
0148
/*
 * Identifies which of the two user segment registers an operation
 * applies to.  Several helpers below (loadseg(), save_base_legacy(),
 * load_seg_legacy()) take one of these to share FS/GS code paths.
 */
enum which_selector {
	FS,
	GS
};
0153
0154
0155
0156
0157
0158
0159
0160
0161
/*
 * Read the inactive (user) GS base while running in the kernel.
 *
 * Out of line and noinstr so it cannot be probed or traced: between the
 * two SWAPGS instructions the kernel's GS base is not loaded, so any
 * instrumentation touching per-CPU data would use the wrong GS.
 *
 * On Xen PV, SWAPGS/RDGSBASE are not used; the base is read from
 * MSR_KERNEL_GS_BASE instead (instrumentation is safe around the MSR
 * access, hence the instrumentation_begin/end bracket).
 *
 * Must be called with interrupts disabled.
 */
static noinstr unsigned long __rdgsbase_inactive(void)
{
	unsigned long gsbase;

	lockdep_assert_irqs_disabled();

	if (!static_cpu_has(X86_FEATURE_XENPV)) {
		/* Temporarily swap in the user GS base, read it, swap back. */
		native_swapgs();
		gsbase = rdgsbase();
		native_swapgs();
	} else {
		instrumentation_begin();
		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
		instrumentation_end();
	}

	return gsbase;
}
0180
0181
0182
0183
0184
0185
0186
0187
0188
/*
 * Write the inactive (user) GS base while running in the kernel.
 *
 * Counterpart of __rdgsbase_inactive(): noinstr because the kernel GS
 * base is not loaded between the two SWAPGS instructions, so tracing or
 * probing here would access per-CPU data through the wrong GS.  On Xen
 * PV the base is written via MSR_KERNEL_GS_BASE instead.
 *
 * Must be called with interrupts disabled.
 */
static noinstr void __wrgsbase_inactive(unsigned long gsbase)
{
	lockdep_assert_irqs_disabled();

	if (!static_cpu_has(X86_FEATURE_XENPV)) {
		/* Swap in user GS, write the base, swap back. */
		native_swapgs();
		wrgsbase(gsbase);
		native_swapgs();
	} else {
		instrumentation_begin();
		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
		instrumentation_end();
	}
}
0203
0204
0205
0206
0207
0208
0209
/*
 * Save the FS or GS base of the outgoing task @prev_p on CPUs without
 * FSGSBASE, given the just-saved @selector for the register @which.
 */
static __always_inline void save_base_legacy(struct task_struct *prev_p,
					     unsigned short selector,
					     enum which_selector which)
{
	if (likely(selector == 0)) {
		/*
		 * Hot path (hit twice on nearly every context switch
		 * between 64-bit tasks): with a null selector the
		 * previously cached base in the thread struct is kept
		 * as-is, avoiding an expensive MSR read.
		 *
		 * NOTE(review): this presumes the cached fsbase/gsbase
		 * is still accurate whenever the selector is 0 -- the
		 * invariant maintained by load_seg_legacy() and the
		 * ARCH_SET_FS/GS paths below; confirm against those.
		 */
	} else {
		/*
		 * A nonzero selector means the base comes from a
		 * descriptor table entry, not from the base MSR, so the
		 * cached MSR-style base is meaningless.  Record zero;
		 * readers must consult the selector/descriptor instead
		 * (see x86_fsgsbase_read_task()).
		 */
		if (which == FS)
			prev_p->thread.fsbase = 0;
		else
			prev_p->thread.gsbase = 0;
	}
}
0248
/*
 * Save @task's FS/GS selectors and bases into its thread struct, so
 * they can be restored when the task is switched back in.
 */
static __always_inline void save_fsgs(struct task_struct *task)
{
	savesegment(fs, task->thread.fsindex);
	savesegment(gs, task->thread.gsindex);
	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
		/*
		 * With FSGSBASE, user space may have changed the bases
		 * directly, so no guesses can be made from the selectors;
		 * read the current values.  RDFSBASE reads the active FS
		 * base; the user GS base is inactive in the kernel and
		 * needs the SWAPGS dance in __rdgsbase_inactive().
		 */
		task->thread.fsbase = rdfsbase();
		task->thread.gsbase = __rdgsbase_inactive();
	} else {
		save_base_legacy(task, task->thread.fsindex, FS);
		save_base_legacy(task, task->thread.gsindex, GS);
	}
}
0266
0267
0268
0269
0270
/*
 * Save the current task's FS/GS selectors and bases into its thread
 * struct.  Exported for KVM, which needs the host segment state saved
 * before entering a guest.
 */
void current_save_fsgs(void)
{
	unsigned long flags;

	/* Interrupts need to be off for the FSGSBASE path in save_fsgs(). */
	local_irq_save(flags);
	save_fsgs(current);
	local_irq_restore(flags);
}
#if IS_ENABLED(CONFIG_KVM)
EXPORT_SYMBOL_GPL(current_save_fsgs);
#endif
0283
0284 static __always_inline void loadseg(enum which_selector which,
0285 unsigned short sel)
0286 {
0287 if (which == FS)
0288 loadsegment(fs, sel);
0289 else
0290 load_gs_index(sel);
0291 }
0292
/*
 * Restore the FS or GS selector and base of an incoming task on CPUs
 * without FSGSBASE, given the outgoing task's saved state for the same
 * register so redundant (slow) segment loads and MSR writes can be
 * skipped.
 */
static __always_inline void load_seg_legacy(unsigned short prev_index,
					    unsigned long prev_base,
					    unsigned short next_index,
					    unsigned long next_base,
					    enum which_selector which)
{
	if (likely(next_index <= 3)) {
		/*
		 * The next task is using a null selector or one of the
		 * first GDT entries (index 0, TI clear, any RPL): the
		 * base is then governed by the base MSR / register, not
		 * a descriptor.
		 */
		if (next_base == 0) {
			/*
			 * The next base is zero: loading the selector is
			 * enough (or nothing at all, below).
			 */
			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
				/*
				 * On CPUs where writing a null selector
				 * does not clear the base, force it to
				 * zero by loading a real selector first.
				 */
				loadseg(which, __USER_DS);
				loadseg(which, next_index);
			} else {
				/*
				 * The write can be skipped entirely only
				 * when both selectors are null AND the
				 * previous base was already zero -- a
				 * null selector load on these CPUs zeroes
				 * the base, so the register is already in
				 * the desired state.
				 */
				if (likely(prev_index | next_index | prev_base))
					loadseg(which, next_index);
			}
		} else {
			/* Nonzero base: load selector (if changed) then base. */
			if (prev_index != next_index)
				loadseg(which, next_index);
			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
			       next_base);
		}
	} else {
		/*
		 * Selector >= 4 (LDT or a higher GDT entry): the base
		 * comes from the descriptor, so loading the selector is
		 * all that is needed.
		 */
		loadseg(which, next_index);
	}
}
0342
0343
0344
0345
0346
0347
0348
/*
 * Switch the PKRU (protection keys) register on a context switch:
 * save the outgoing task's live value into @prev and install the
 * incoming task's value from @next.  No-op unless OSPKE is enabled.
 */
static __always_inline void x86_pkru_load(struct thread_struct *prev,
					  struct thread_struct *next)
{
	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
		return;

	/* Stash the prev task's value so it can be restored later. */
	prev->pkru = rdpkru();

	/*
	 * Only write the register when the value actually changes;
	 * the comparison uses the value just read above, so it also
	 * covers the case where the register was modified behind the
	 * cached copy's back.
	 */
	if (prev->pkru != next->pkru)
		wrpkru(next->pkru);
}
0365
/*
 * Restore the incoming task's FS/GS selectors and bases during a
 * context switch, choosing the FSGSBASE fast path or the legacy
 * selector/MSR path.
 */
static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
					      struct thread_struct *next)
{
	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
		/*
		 * Segment selector loads are slow; skip them when both
		 * the outgoing and incoming selectors are null.
		 */
		if (unlikely(prev->fsindex || next->fsindex))
			loadseg(FS, next->fsindex);
		if (unlikely(prev->gsindex || next->gsindex))
			loadseg(GS, next->gsindex);

		/*
		 * Bases are loaded unconditionally - after the selector
		 * loads above, which would clobber them.
		 */
		wrfsbase(next->fsbase);
		__wrgsbase_inactive(next->gsbase);
	} else {
		load_seg_legacy(prev->fsindex, prev->fsbase,
				next->fsindex, next->fsbase, FS);
		load_seg_legacy(prev->gsindex, prev->gsbase,
				next->gsindex, next->gsbase, GS);
	}
}
0386
/*
 * Look up the segment base that @selector would give @task, by reading
 * the corresponding descriptor from the task's TLS array (GDT-based
 * selectors) or its LDT.
 *
 * Returns 0 for selectors that reference no readable descriptor
 * (out-of-range index, non-TLS GDT entries, missing/short LDT).
 */
unsigned long x86_fsgsbase_read_task(struct task_struct *task,
				     unsigned short selector)
{
	unsigned short idx = selector >> 3;
	unsigned long base;

	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
		/* GDT selector. */
		if (unlikely(idx >= GDT_ENTRIES))
			return 0;

		/*
		 * Only the per-task TLS entries can be resolved here;
		 * any other GDT entry reports a zero base.
		 */
		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
			return 0;

		idx -= GDT_ENTRY_TLS_MIN;
		base = get_desc_base(&task->thread.tls_array[idx]);
	} else {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
		struct ldt_struct *ldt;

		/*
		 * LDT selector: take the mm context lock so the LDT
		 * cannot be freed or resized while it is being read.
		 */
		mutex_lock(&task->mm->context.lock);
		ldt = task->mm->context.ldt;
		if (unlikely(!ldt || idx >= ldt->nr_entries))
			base = 0;
		else
			base = get_desc_base(ldt->entries + idx);
		mutex_unlock(&task->mm->context.lock);
#else
		base = 0;
#endif
	}

	return base;
}
0429
0430 unsigned long x86_gsbase_read_cpu_inactive(void)
0431 {
0432 unsigned long gsbase;
0433
0434 if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
0435 unsigned long flags;
0436
0437 local_irq_save(flags);
0438 gsbase = __rdgsbase_inactive();
0439 local_irq_restore(flags);
0440 } else {
0441 rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
0442 }
0443
0444 return gsbase;
0445 }
0446
0447 void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
0448 {
0449 if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
0450 unsigned long flags;
0451
0452 local_irq_save(flags);
0453 __wrgsbase_inactive(gsbase);
0454 local_irq_restore(flags);
0455 } else {
0456 wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
0457 }
0458 }
0459
0460 unsigned long x86_fsbase_read_task(struct task_struct *task)
0461 {
0462 unsigned long fsbase;
0463
0464 if (task == current)
0465 fsbase = x86_fsbase_read_cpu();
0466 else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
0467 (task->thread.fsindex == 0))
0468 fsbase = task->thread.fsbase;
0469 else
0470 fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
0471
0472 return fsbase;
0473 }
0474
0475 unsigned long x86_gsbase_read_task(struct task_struct *task)
0476 {
0477 unsigned long gsbase;
0478
0479 if (task == current)
0480 gsbase = x86_gsbase_read_cpu_inactive();
0481 else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
0482 (task->thread.gsindex == 0))
0483 gsbase = task->thread.gsbase;
0484 else
0485 gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
0486
0487 return gsbase;
0488 }
0489
/*
 * Set the cached FS base of a non-running task.  Only updates the
 * thread struct; the value takes effect when the task is scheduled in.
 * Must not be used on current (the CPU register would go stale).
 */
void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
{
	WARN_ON_ONCE(task == current);

	task->thread.fsbase = fsbase;
}
0496
/*
 * Set the cached GS base of a non-running task.  Only updates the
 * thread struct; the value takes effect when the task is scheduled in.
 * Must not be used on current (the CPU register would go stale).
 */
void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
{
	WARN_ON_ONCE(task == current);

	task->thread.gsbase = gsbase;
}
0503
/*
 * Shared setup for starting a new user thread after exec: reset the
 * user segment registers and point the saved register frame at the new
 * entry point and stack.
 *
 * @regs:  must be the current task's pt_regs
 * @_cs/_ss/_ds: code, stack and data segment selectors for the new ABI
 */
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	WARN_ON_ONCE(regs != current_pt_regs());

	if (static_cpu_has(X86_BUG_NULL_SEG)) {
		/*
		 * Loading a null selector below won't clear the base on
		 * these CPUs; load a real selector first so it does.
		 */
		loadsegment(fs, __USER_DS);
		load_gs_index(__USER_DS);
	}

	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);

	regs->ip = new_ip;
	regs->sp = new_sp;
	regs->cs = _cs;
	regs->ss = _ss;
	/* Fresh flags: only interrupts enabled. */
	regs->flags = X86_EFLAGS_IF;
}
0528
/*
 * Start a 64-bit user thread at @new_ip with stack @new_sp.
 * Note: DS/ES are set to the null selector (last argument 0).
 */
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}
EXPORT_SYMBOL_GPL(start_thread);
0536
#ifdef CONFIG_COMPAT
/*
 * Start a compat (ia32 or x32) user thread.  x32 runs with the 64-bit
 * code segment; ia32 uses the 32-bit one.  Unlike start_thread(),
 * DS/ES are set to __USER_DS.
 */
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
{
	start_thread_common(regs, new_ip, new_sp,
			    x32 ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif
0545
0546
0547
0548
0549
0550
0551
0552
0553
0554
0555
/*
 * Switch tasks from @prev_p to @next_p: save the outgoing task's
 * segment/FPU state, install the incoming task's TLS, segments, bases,
 * PKRU and stack, and update the per-CPU current-task bookkeeping.
 * Returns @prev_p (per switch_to() convention).
 *
 * Statement order in here is significant; do not reorder casually.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	struct fpu *prev_fpu = &prev->fpu;
	int cpu = smp_processor_id();

	/* Switching while on an IRQ stack would be a bug. */
	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
		     this_cpu_read(hardirq_stack_inuse));

	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
		switch_fpu_prepare(prev_fpu, cpu);

	/*
	 * Save the outgoing task's FS/GS state before load_TLS() below
	 * can overwrite the GDT entries its selectors may refer to.
	 */
	save_fsgs(prev_p);

	/* Install the incoming task's TLS descriptors into this CPU's GDT. */
	load_TLS(next, cpu);

	/* Paravirt hook: leave lazy mode before touching real CPU state. */
	arch_end_context_switch(next_p);

	/*
	 * Switch DS and ES.  Segment loads are expensive, so each is
	 * skipped when both the outgoing and incoming selectors are
	 * null (the common case for 64-bit tasks).
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	x86_fsgsbase_load(prev, next);

	x86_pkru_load(prev, next);

	/* Switch the per-CPU notion of the current task and stack top. */
	this_cpu_write(current_task, next_p);
	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));

	switch_fpu_finish();

	/* Reload sp0 / task stack state for entry trampolines. */
	update_task_stack(next_p);

	switch_to_extra(prev_p, next_p);

	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
		/*
		 * On CPUs with this erratum, SYSRET can leave SS with
		 * stale attributes; reload __KERNEL_DS into SS if it is
		 * not already there so subsequent kernel stack accesses
		 * behave.  (Only the reload-when-different form is
		 * visible here; the erratum details live in the CPU
		 * documentation.)
		 */
		unsigned short ss_sel;
		savesegment(ss, ss_sel);
		if (ss_sel != __KERNEL_DS)
			loadsegment(ss, __KERNEL_DS);
	}

	/* Propagate resctrl (CLOSID/RMID) state for the incoming task. */
	resctrl_sched_in();

	return prev_p;
}
0662
/*
 * Configure the current task for a native 64-bit personality
 * (called during exec of a 64-bit binary).
 */
void set_personality_64bit(void)
{
	/* Make sure to be in 64-bit mode. */
	clear_thread_flag(TIF_ADDR32);

	/* Pretend to come from a 64-bit execve syscall. */
	task_pt_regs(current)->orig_ax = __NR_execve;
	current_thread_info()->status &= ~TS_COMPAT;
	if (current->mm)
		current->mm->context.flags = MM_CONTEXT_HAS_VSYSCALL;

	/*
	 * Clear READ_IMPLIES_EXEC unconditionally; 64-bit processes
	 * have always behaved this way even though it overrides any
	 * inherited personality setting.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}
0681
/*
 * Configure the current task for the x32 ABI (32-bit pointers on the
 * 64-bit syscall interface).  No-op unless CONFIG_X86_X32_ABI.
 */
static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32_ABI
	if (current->mm)
		current->mm->context.flags = 0;

	current->personality &= ~READ_IMPLIES_EXEC;

	/*
	 * Pretend to come from an x32 execve: orig_ax carries the
	 * __X32_SYSCALL_BIT, which is what in_32bit_syscall()-style
	 * checks key off during the remainder of exec.  TS_COMPAT is
	 * cleared because x32 is not the ia32 compat path.
	 */
	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
	current_thread_info()->status &= ~TS_COMPAT;
#endif
}
0701
/*
 * Configure the current task for the ia32 compat ABI.
 * No-op unless CONFIG_IA32_EMULATION.
 */
static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
	if (current->mm) {
		/* Mark the mm so uprobes use 32-bit decoding for it. */
		current->mm->context.flags = MM_CONTEXT_UPROBE_IA32;
	}

	current->personality |= force_personality32;

	/* Pretend to come from a 32-bit execve; enter compat mode. */
	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
	current_thread_info()->status |= TS_COMPAT;
#endif
}
0719
/*
 * Configure the current task for a 32-bit-address personality:
 * x32 when @x32 is true, classic ia32 otherwise.
 */
void set_personality_ia32(bool x32)
{
	/* Both ABIs use 32-bit addresses. */
	set_thread_flag(TIF_ADDR32);

	if (x32)
		__set_personality_x32();
	else
		__set_personality_ia32();
}
EXPORT_SYMBOL_GPL(set_personality_ia32);
0731
#ifdef CONFIG_CHECKPOINT_RESTORE
/*
 * Map the given vDSO @image at @addr (used by checkpoint/restore).
 * Returns the image size on success or the negative error from
 * map_vdso_once().
 */
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
	int err = map_vdso_once(image, addr);

	return err ? (long)err : (long)image->size;
}
#endif
0744
/*
 * Handle the 64-bit-only arch_prctl() options for @task:
 * get/set of the FS and GS bases, and vDSO mapping for C/R.
 * Returns 0 or a negative errno; -EINVAL for unknown options so the
 * caller can fall back to the common handler.
 */
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
	int ret = 0;

	switch (option) {
	case ARCH_SET_GS: {
		/* The base must be a canonical user address. */
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		/*
		 * Preemption is disabled so the selector reset, CPU base
		 * write and cached base update below cannot be torn
		 * apart by a context switch.
		 */
		preempt_disable();

		if (task == current) {
			/* Null the selector so the base MSR governs GS. */
			loadseg(GS, 0);
			x86_gsbase_write_cpu_inactive(arg2);

			/*
			 * Keep the cached value in sync; on non-FSGSBASE
			 * CPUs this is what save_base_legacy() will rely
			 * on at the next context switch.
			 */
			task->thread.gsbase = arg2;

		} else {
			task->thread.gsindex = 0;
			x86_gsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_SET_FS: {
		/* Same structure as ARCH_SET_GS, for FS. */
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		preempt_disable();

		if (task == current) {
			loadseg(FS, 0);
			x86_fsbase_write_cpu(arg2);

			/* Keep the cached value in sync (see ARCH_SET_GS). */
			task->thread.fsbase = arg2;
		} else {
			task->thread.fsindex = 0;
			x86_fsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_GET_FS: {
		unsigned long base = x86_fsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base = x86_gsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}

#ifdef CONFIG_CHECKPOINT_RESTORE
# ifdef CONFIG_X86_X32_ABI
	case ARCH_MAP_VDSO_X32:
		return prctl_map_vdso(&vdso_image_x32, arg2);
# endif
# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
	case ARCH_MAP_VDSO_32:
		return prctl_map_vdso(&vdso_image_32, arg2);
# endif
	case ARCH_MAP_VDSO_64:
		return prctl_map_vdso(&vdso_image_64, arg2);
#endif

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
0840
0841 SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
0842 {
0843 long ret;
0844
0845 ret = do_arch_prctl_64(current, option, arg2);
0846 if (ret == -EINVAL)
0847 ret = do_arch_prctl_common(option, arg2);
0848
0849 return ret;
0850 }
0851
#ifdef CONFIG_IA32_EMULATION
/*
 * Compat arch_prctl(2) entry point: ia32 tasks only get the
 * architecture-common options, never the 64-bit FS/GS base ones.
 */
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	return do_arch_prctl_common(option, arg2);
}
#endif
0858
0859 unsigned long KSTK_ESP(struct task_struct *task)
0860 {
0861 return task_pt_regs(task)->sp;
0862 }