// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *  Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *  Andi Kleen.
 *
 *  CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/syscalls.h>

#include <asm/processor.h>
#include <asm/pkru.h>
#include <asm/fpu/sched.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
#include <asm/resctrl.h>
#include <asm/unistd.h>
#include <asm/fsgsbase.h>
#ifdef CONFIG_IA32_EMULATION
/* Not included via unistd.h */
#include <asm/unistd_32_ia32.h>
#endif

#include "process.h"

/* Also prints some state that isn't saved in pt_regs */
void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
         const char *log_lvl)
{
    unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
    unsigned long d0, d1, d2, d3, d6, d7;
    unsigned int fsindex, gsindex;
    unsigned int ds, es;

    show_iret_regs(regs, log_lvl);

    if (regs->orig_ax != -1)
        pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
    else
        pr_cont("\n");

    printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
           log_lvl, regs->ax, regs->bx, regs->cx);
    printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
           log_lvl, regs->dx, regs->si, regs->di);
    printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
           log_lvl, regs->bp, regs->r8, regs->r9);
    printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
           log_lvl, regs->r10, regs->r11, regs->r12);
    printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
           log_lvl, regs->r13, regs->r14, regs->r15);

    if (mode == SHOW_REGS_SHORT)
        return;

    if (mode == SHOW_REGS_USER) {
        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
        printk("%sFS:  %016lx GS:  %016lx\n",
               log_lvl, fs, shadowgs);
        return;
    }

    asm("movl %%ds,%0" : "=r" (ds));
    asm("movl %%es,%0" : "=r" (es));
    asm("movl %%fs,%0" : "=r" (fsindex));
    asm("movl %%gs,%0" : "=r" (gsindex));

    rdmsrl(MSR_FS_BASE, fs);
    rdmsrl(MSR_GS_BASE, gs);
    rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

    cr0 = read_cr0();
    cr2 = read_cr2();
    cr3 = __read_cr3();
    cr4 = __read_cr4();

    printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
           log_lvl, fs, fsindex, gs, gsindex, shadowgs);
    printk("%sCS:  %04lx DS: %04x ES: %04x CR0: %016lx\n",
        log_lvl, regs->cs, ds, es, cr0);
    printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
        log_lvl, cr2, cr3, cr4);

    get_debugreg(d0, 0);
    get_debugreg(d1, 1);
    get_debugreg(d2, 2);
    get_debugreg(d3, 3);
    get_debugreg(d6, 6);
    get_debugreg(d7, 7);

    /* Only print out debug registers if they are in their non-default state. */
    if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
        (d6 == DR6_RESERVED) && (d7 == 0x400))) {
        printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
               log_lvl, d0, d1, d2);
        printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
               log_lvl, d3, d6, d7);
    }

    if (cpu_feature_enabled(X86_FEATURE_OSPKE))
        printk("%sPKRU: %08x\n", log_lvl, read_pkru());
}

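/* Nothing arch-specific to free here; just sanity-check that the mm is already gone. */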
void release_thread(struct task_struct *dead_task)
{
    WARN_ON(dead_task->mm);
}

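/* Selects which of the two user segment-base registers a helper operates on. */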
enum which_selector {
    FS,
    GS
};

/*
 * Out of line to be protected from kprobes and tracing. If this were
 * traced or probed, any access to a per-CPU variable would happen with
 * the wrong GS.
 *
 * It is not used on Xen paravirt. When paravirt support is needed, it
 * needs to be renamed with a native_ prefix.
 */
static noinstr unsigned long __rdgsbase_inactive(void)
{
    unsigned long gsbase;

    lockdep_assert_irqs_disabled();

    if (!static_cpu_has(X86_FEATURE_XENPV)) {
        native_swapgs();
        gsbase = rdgsbase();
        native_swapgs();
    } else {
        instrumentation_begin();
        rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
        instrumentation_end();
    }

    return gsbase;
}

/*
 * Out of line to be protected from kprobes and tracing. If this were
 * traced or probed, any access to a per-CPU variable would happen with
 * the wrong GS.
 *
 * It is not used on Xen paravirt. When paravirt support is needed, it
 * needs to be renamed with a native_ prefix.
 */
static noinstr void __wrgsbase_inactive(unsigned long gsbase)
{
    lockdep_assert_irqs_disabled();

    if (!static_cpu_has(X86_FEATURE_XENPV)) {
        native_swapgs();
        wrgsbase(gsbase);
        native_swapgs();
    } else {
        instrumentation_begin();
        wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
        instrumentation_end();
    }
}

/*
 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
 * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
 * It's forcibly inlined because it'll generate better code and this function
 * is hot.
 */
static __always_inline void save_base_legacy(struct task_struct *prev_p,
                         unsigned short selector,
                         enum which_selector which)
{
    if (likely(selector == 0)) {
        /*
         * On Intel (without X86_BUG_NULL_SEG), the segment base could
         * be the pre-existing saved base or it could be zero.  On AMD
         * (with X86_BUG_NULL_SEG), the segment base could be almost
         * anything.
         *
         * This branch is very hot (it's hit twice on almost every
         * context switch between 64-bit programs), and avoiding
         * the RDMSR helps a lot, so we just assume that whatever
         * value is already saved is correct.  This matches historical
         * Linux behavior, so it won't break existing applications.
         *
         * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
         * report that the base is zero, it needs to actually be zero:
         * see the corresponding logic in load_seg_legacy.
         */
    } else {
        /*
         * If the selector is 1, 2, or 3, then the base is zero on
         * !X86_BUG_NULL_SEG CPUs and could be anything on
         * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
         * has never attempted to preserve the base across context
         * switches.
         *
         * If selector > 3, then it refers to a real segment, and
         * saving the base isn't necessary.
         */
        if (which == FS)
            prev_p->thread.fsbase = 0;
        else
            prev_p->thread.gsbase = 0;
    }
}

static __always_inline void save_fsgs(struct task_struct *task)
{
    savesegment(fs, task->thread.fsindex);
    savesegment(gs, task->thread.gsindex);
    if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
        /*
         * If FSGSBASE is enabled, we can't make any useful guesses
         * about the base, and user code expects us to save the current
         * value.  Fortunately, reading the base directly is efficient.
         */
        task->thread.fsbase = rdfsbase();
        task->thread.gsbase = __rdgsbase_inactive();
    } else {
        save_base_legacy(task, task->thread.fsindex, FS);
        save_base_legacy(task, task->thread.gsindex, GS);
    }
}

/*
 * While a process is running, current->thread.fsbase and
 * current->thread.gsbase may not match the corresponding CPU registers
 * (see save_base_legacy()).
 */
void current_save_fsgs(void)
{
    unsigned long flags;

    /* Interrupts need to be off for FSGSBASE */
    local_irq_save(flags);
    save_fsgs(current);
    local_irq_restore(flags);
}
#if IS_ENABLED(CONFIG_KVM)
EXPORT_SYMBOL_GPL(current_save_fsgs);
#endif

static __always_inline void loadseg(enum which_selector which,
                    unsigned short sel)
{
    if (which == FS)
        loadsegment(fs, sel);
    else
        load_gs_index(sel);
}

static __always_inline void load_seg_legacy(unsigned short prev_index,
                        unsigned long prev_base,
                        unsigned short next_index,
                        unsigned long next_base,
                        enum which_selector which)
{
    if (likely(next_index <= 3)) {
        /*
         * The next task is using 64-bit TLS, is not using this
         * segment at all, or is having fun with arcane CPU features.
         */
        if (next_base == 0) {
            /*
             * Nasty case: on AMD CPUs, we need to forcibly zero
             * the base.
             */
            if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
                loadseg(which, __USER_DS);
                loadseg(which, next_index);
            } else {
                /*
                 * We could try to exhaustively detect cases
                 * under which we can skip the segment load,
                 * but there's really only one case that matters
                 * for performance: if both the previous and
                 * next states are fully zeroed, we can skip
                 * the load.
                 *
                 * (This assumes that prev_base == 0 has no
                 * false positives.  This is the case on
                 * Intel-style CPUs.)
                 */
                if (likely(prev_index | next_index | prev_base))
                    loadseg(which, next_index);
            }
        } else {
            if (prev_index != next_index)
                loadseg(which, next_index);
            wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
                   next_base);
        }
    } else {
        /*
         * The next task is using a real segment.  Loading the selector
         * is sufficient.
         */
        loadseg(which, next_index);
    }
}

/*
 * Store prev's PKRU value and load next's PKRU value if they differ. PKRU
 * is not XSTATE-managed on context switch because that would require a
 * lookup in the task's FPU xsave buffer and would require keeping that
 * buffer updated in various places.
 */
static __always_inline void x86_pkru_load(struct thread_struct *prev,
                      struct thread_struct *next)
{
    if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
        return;

    /* Stash the prev task's value: */
    prev->pkru = rdpkru();

    /*
     * PKRU writes are slightly expensive.  Avoid them when not
     * strictly necessary:
     */
    if (prev->pkru != next->pkru)
        wrpkru(next->pkru);
}

static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
                          struct thread_struct *next)
{
    if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
        /* Update the FS and GS selectors if they could have changed. */
        if (unlikely(prev->fsindex || next->fsindex))
            loadseg(FS, next->fsindex);
        if (unlikely(prev->gsindex || next->gsindex))
            loadseg(GS, next->gsindex);

        /* Update the bases. */
        wrfsbase(next->fsbase);
        __wrgsbase_inactive(next->gsbase);
    } else {
        load_seg_legacy(prev->fsindex, prev->fsbase,
                next->fsindex, next->fsbase, FS);
        load_seg_legacy(prev->gsindex, prev->gsbase,
                next->gsindex, next->gsbase, GS);
    }
}

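/*
 * Resolve the base address that a thread's FS or GS selector refers to.
 * The selector's table-indicator bit (bit 2) picks the LDT; otherwise the
 * GDT is used, where only the TLS entries can have a nonzero base.
 */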
unsigned long x86_fsgsbase_read_task(struct task_struct *task,
                     unsigned short selector)
{
    unsigned short idx = selector >> 3;
    unsigned long base;

    if (likely((selector & SEGMENT_TI_MASK) == 0)) {
        if (unlikely(idx >= GDT_ENTRIES))
            return 0;

        /*
         * There are no user segments in the GDT with nonzero bases
         * other than the TLS segments.
         */
        if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
            return 0;

        idx -= GDT_ENTRY_TLS_MIN;
        base = get_desc_base(&task->thread.tls_array[idx]);
    } else {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
        struct ldt_struct *ldt;

        /*
         * If performance here mattered, we could protect the LDT
         * with RCU.  This is a slow path, though, so we can just
         * take the mutex.
         */
        mutex_lock(&task->mm->context.lock);
        ldt = task->mm->context.ldt;
        if (unlikely(!ldt || idx >= ldt->nr_entries))
            base = 0;
        else
            base = get_desc_base(ldt->entries + idx);
        mutex_unlock(&task->mm->context.lock);
#else
        base = 0;
#endif
    }

    return base;
}

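/*
 * Read the inactive (user) GS base on this CPU: via SWAPGS + RDGSBASE with
 * interrupts off when FSGSBASE is available, otherwise from the
 * MSR_KERNEL_GS_BASE MSR.
 */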
unsigned long x86_gsbase_read_cpu_inactive(void)
{
    unsigned long gsbase;

    if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
        unsigned long flags;

        local_irq_save(flags);
        gsbase = __rdgsbase_inactive();
        local_irq_restore(flags);
    } else {
        rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
    }

    return gsbase;
}

void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
{
    if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
        unsigned long flags;

        local_irq_save(flags);
        __wrgsbase_inactive(gsbase);
        local_irq_restore(flags);
    } else {
        wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
    }
}

unsigned long x86_fsbase_read_task(struct task_struct *task)
{
    unsigned long fsbase;

    if (task == current)
        fsbase = x86_fsbase_read_cpu();
    else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
         (task->thread.fsindex == 0))
        fsbase = task->thread.fsbase;
    else
        fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);

    return fsbase;
}

unsigned long x86_gsbase_read_task(struct task_struct *task)
{
    unsigned long gsbase;

    if (task == current)
        gsbase = x86_gsbase_read_cpu_inactive();
    else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
         (task->thread.gsindex == 0))
        gsbase = task->thread.gsbase;
    else
        gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);

    return gsbase;
}

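/*
 * Writing a remote task's FS/GS base only updates the saved thread state;
 * hence the WARN if called on current, whose live CPU register would also
 * need updating.
 */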
void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
{
    WARN_ON_ONCE(task == current);

    task->thread.fsbase = fsbase;
}

void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
{
    WARN_ON_ONCE(task == current);

    task->thread.gsbase = gsbase;
}

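/*
 * Reset the segment registers and set up pt_regs so that the task returns
 * to user space at new_ip with stack new_sp, as part of starting a freshly
 * exec'd thread.
 */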
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
            unsigned long new_sp,
            unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
    WARN_ON_ONCE(regs != current_pt_regs());

    if (static_cpu_has(X86_BUG_NULL_SEG)) {
        /* Loading zero below won't clear the base. */
        loadsegment(fs, __USER_DS);
        load_gs_index(__USER_DS);
    }

    loadsegment(fs, 0);
    loadsegment(es, _ds);
    loadsegment(ds, _ds);
    load_gs_index(0);

    regs->ip        = new_ip;
    regs->sp        = new_sp;
    regs->cs        = _cs;
    regs->ss        = _ss;
    regs->flags     = X86_EFLAGS_IF;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
    start_thread_common(regs, new_ip, new_sp,
                __USER_CS, __USER_DS, 0);
}
EXPORT_SYMBOL_GPL(start_thread);

#ifdef CONFIG_COMPAT
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
{
    start_thread_common(regs, new_ip, new_sp,
                x32 ? __USER_CS : __USER32_CS,
                __USER_DS, __USER_DS);
}
#endif

/*
 *  switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer is not supported either.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
    struct thread_struct *prev = &prev_p->thread;
    struct thread_struct *next = &next_p->thread;
    struct fpu *prev_fpu = &prev->fpu;
    int cpu = smp_processor_id();

    WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
             this_cpu_read(hardirq_stack_inuse));

    if (!test_thread_flag(TIF_NEED_FPU_LOAD))
        switch_fpu_prepare(prev_fpu, cpu);

    /* We must save %fs and %gs before load_TLS() because
     * %fs and %gs may be cleared by load_TLS().
     *
     * (e.g. xen_load_tls())
     */
    save_fsgs(prev_p);

    /*
     * Load TLS before restoring any segments so that segment loads
     * reference the correct GDT entries.
     */
    load_TLS(next, cpu);

    /*
     * Leave lazy mode, flushing any hypercalls made here.  This
     * must be done after loading TLS entries in the GDT but before
     * loading segments that might reference them.
     */
    arch_end_context_switch(next_p);

    /* Switch DS and ES.
     *
     * Reading them only returns the selectors, but writing them (if
     * nonzero) loads the full descriptor from the GDT or LDT.  The
     * LDT for next is loaded in switch_mm, and the GDT is loaded
     * above.
     *
     * We therefore need to write new values to the segment
     * registers on every context switch unless both the new and old
     * values are zero.
     *
     * Note that we don't need to do anything for CS and SS, as
     * those are saved and restored as part of pt_regs.
     */
    savesegment(es, prev->es);
    if (unlikely(next->es | prev->es))
        loadsegment(es, next->es);

    savesegment(ds, prev->ds);
    if (unlikely(next->ds | prev->ds))
        loadsegment(ds, next->ds);

    x86_fsgsbase_load(prev, next);

    x86_pkru_load(prev, next);

    /*
     * Switch the PDA and FPU contexts.
     */
    this_cpu_write(current_task, next_p);
    this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));

    switch_fpu_finish();

    /* Reload sp0. */
    update_task_stack(next_p);

    switch_to_extra(prev_p, next_p);

    if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
        /*
         * AMD CPUs have a misfeature: SYSRET sets the SS selector but
         * does not update the cached descriptor.  As a result, if we
         * do SYSRET while SS is NULL, we'll end up in user mode with
         * SS apparently equal to __USER_DS but actually unusable.
         *
         * The straightforward workaround would be to fix it up just
         * before SYSRET, but that would slow down the system call
         * fast paths.  Instead, we ensure that SS is never NULL in
         * system call context.  We do this by replacing NULL SS
         * selectors at every context switch.  SYSCALL sets up a valid
         * SS, so the only way to get NULL is to re-enter the kernel
         * from CPL 3 through an interrupt.  Since that can't happen
         * in the same task as a running syscall, we are guaranteed to
         * context switch between every interrupt vector entry and a
         * subsequent SYSRET.
         *
         * We read SS first because SS reads are much faster than
         * writes.  Out of caution, we force SS to __KERNEL_DS even if
         * it previously had a different non-NULL value.
         */
        unsigned short ss_sel;
        savesegment(ss, ss_sel);
        if (ss_sel != __KERNEL_DS)
            loadsegment(ss, __KERNEL_DS);
    }

    /* Load the Intel cache allocation PQR MSR. */
    resctrl_sched_in();

    return prev_p;
}

void set_personality_64bit(void)
{
    /* inherit personality from parent */

    /* Make sure to be in 64bit mode */
    clear_thread_flag(TIF_ADDR32);
    /* Pretend that this comes from a 64bit execve */
    task_pt_regs(current)->orig_ax = __NR_execve;
    current_thread_info()->status &= ~TS_COMPAT;
    if (current->mm)
        current->mm->context.flags = MM_CONTEXT_HAS_VSYSCALL;

    /* TBD: overwrites user setup. Should have two bits.
       But 64bit processes have always behaved this way,
       so it's not too bad. The main problem is just that
       32bit children are affected again. */
    current->personality &= ~READ_IMPLIES_EXEC;
}

static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32_ABI
    if (current->mm)
        current->mm->context.flags = 0;

    current->personality &= ~READ_IMPLIES_EXEC;
    /*
     * in_32bit_syscall() uses the presence of the x32 syscall bit
     * flag to determine compat status.  The x86 mmap() code relies on
     * the syscall bitness so set x32 syscall bit right here to make
     * in_32bit_syscall() work during exec().
     *
     * Pretend to come from a x32 execve.
     */
    task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
    current_thread_info()->status &= ~TS_COMPAT;
#endif
}

static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
    if (current->mm) {
        /*
         * uprobes applied to this MM need to know this and
         * cannot use user_64bit_mode() at that time.
         */
        current->mm->context.flags = MM_CONTEXT_UPROBE_IA32;
    }

    current->personality |= force_personality32;
    /* Prepare the first "return" to user space */
    task_pt_regs(current)->orig_ax = __NR_ia32_execve;
    current_thread_info()->status |= TS_COMPAT;
#endif
}

void set_personality_ia32(bool x32)
{
    /* Make sure to be in 32bit mode */
    set_thread_flag(TIF_ADDR32);

    if (x32)
        __set_personality_x32();
    else
        __set_personality_ia32();
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

#ifdef CONFIG_CHECKPOINT_RESTORE
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
    int ret;

    ret = map_vdso_once(image, addr);
    if (ret)
        return ret;

    return (long)image->size;
}
#endif

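/*
 * do_arch_prctl_64() implements the 64-bit-only arch_prctl() operations.
 * From user space these are reached through the arch_prctl(2) syscall; a
 * minimal, purely illustrative use (variable names are hypothetical) is:
 *
 *     syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)tls_block);
 *     syscall(SYS_arch_prctl, ARCH_GET_GS, (unsigned long)&gsbase);
 *
 * where ARCH_SET_* takes the new base value directly and ARCH_GET_* takes
 * a pointer through which the current base is stored.
 */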
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
    int ret = 0;

    switch (option) {
    case ARCH_SET_GS: {
        if (unlikely(arg2 >= TASK_SIZE_MAX))
            return -EPERM;

        preempt_disable();
        /*
         * ARCH_SET_GS has always overwritten the index
         * and the base. Zero is the most sensible value
         * to put in the index, and is the only value that
         * makes any sense if FSGSBASE is unavailable.
         */
        if (task == current) {
            loadseg(GS, 0);
            x86_gsbase_write_cpu_inactive(arg2);

            /*
             * On non-FSGSBASE systems, save_base_legacy() expects
             * that we also fill in thread.gsbase.
             */
            task->thread.gsbase = arg2;

        } else {
            task->thread.gsindex = 0;
            x86_gsbase_write_task(task, arg2);
        }
        preempt_enable();
        break;
    }
    case ARCH_SET_FS: {
        /*
         * Not strictly needed for %fs, but do it for symmetry
         * with %gs
         */
        if (unlikely(arg2 >= TASK_SIZE_MAX))
            return -EPERM;

        preempt_disable();
        /*
         * Set the selector to 0 for the same reason
         * as %gs above.
         */
        if (task == current) {
            loadseg(FS, 0);
            x86_fsbase_write_cpu(arg2);

            /*
             * On non-FSGSBASE systems, save_base_legacy() expects
             * that we also fill in thread.fsbase.
             */
            task->thread.fsbase = arg2;
        } else {
            task->thread.fsindex = 0;
            x86_fsbase_write_task(task, arg2);
        }
        preempt_enable();
        break;
    }
    case ARCH_GET_FS: {
        unsigned long base = x86_fsbase_read_task(task);

        ret = put_user(base, (unsigned long __user *)arg2);
        break;
    }
    case ARCH_GET_GS: {
        unsigned long base = x86_gsbase_read_task(task);

        ret = put_user(base, (unsigned long __user *)arg2);
        break;
    }

#ifdef CONFIG_CHECKPOINT_RESTORE
# ifdef CONFIG_X86_X32_ABI
    case ARCH_MAP_VDSO_X32:
        return prctl_map_vdso(&vdso_image_x32, arg2);
# endif
# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
    case ARCH_MAP_VDSO_32:
        return prctl_map_vdso(&vdso_image_32, arg2);
# endif
    case ARCH_MAP_VDSO_64:
        return prctl_map_vdso(&vdso_image_64, arg2);
#endif

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}

SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
    long ret;

    ret = do_arch_prctl_64(current, option, arg2);
    if (ret == -EINVAL)
        ret = do_arch_prctl_common(option, arg2);

    return ret;
}

#ifdef CONFIG_IA32_EMULATION
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
    return do_arch_prctl_common(option, arg2);
}
#endif

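/* Return the task's saved user stack pointer (the ->sp slot in its pt_regs). */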
unsigned long KSTK_ESP(struct task_struct *task)
{
    return task_pt_regs(task)->sp;
}