// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>

#include "common.h"

/* Emit the tracepoint definitions for the syscall trace events below. */
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/* See comment for enter_from_user_mode() in entry-common.h */
static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
{
    arch_enter_from_user_mode(regs);
    lockdep_hardirqs_off(CALLER_ADDR0);

    CT_WARN_ON(ct_state() != CONTEXT_USER);
    user_exit_irqoff();

    instrumentation_begin();
    trace_hardirqs_off_finish();
    instrumentation_end();
}

void noinstr enter_from_user_mode(struct pt_regs *regs)
{
    __enter_from_user_mode(regs);
}

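/*
 * Note on the pattern above (background, stated here once): noinstr
 * functions live in a section that instrumentation (tracing, kprobes,
 * sanitizers) must not touch, because they run in contexts where RCU may
 * not be watching and lockdep state is not yet set up.
 * instrumentation_begin()/instrumentation_end() bracket the spots inside
 * such functions where calling instrumentable code is safe again;
 * objtool validates these annotations.
 */
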
/*
 * Report the syscall and its first four arguments to the audit
 * subsystem when an audit context is active.
 */
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
    if (unlikely(audit_context())) {
        unsigned long args[6];

        syscall_get_arguments(current, regs, args);
        audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
    }
}

static long syscall_trace_enter(struct pt_regs *regs, long syscall,
                unsigned long work)
{
    long ret = 0;

    /*
     * Handle Syscall User Dispatch. This must come first, since
     * the ABI here can be something that doesn't make sense for
     * other syscall_work features.
     */
    if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
        if (syscall_user_dispatch(regs))
            return -1L;
    }
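
    /*
     * Background (illustrative; the constants are the prctl(2) ABI and
     * are not used in this file): userspace enables Syscall User
     * Dispatch with
     *
     *    char sel = SYSCALL_DISPATCH_FILTER_ALLOW;
     *    prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON,
     *          start, len, &sel);
     *
     * While the selector is SYSCALL_DISPATCH_FILTER_BLOCK, a syscall
     * issued outside [start, start + len) is not executed; the task
     * gets SIGSYS instead, which is why syscall_user_dispatch()
     * returning true aborts the remaining entry work above.
     */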

    /* Handle ptrace */
    if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
        ret = ptrace_report_syscall_entry(regs);
        if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
            return -1L;
    }

    /* Do seccomp after ptrace, to catch any tracer changes. */
    if (work & SYSCALL_WORK_SECCOMP) {
        ret = __secure_computing(NULL);
        if (ret == -1L)
            return ret;
    }

    /* Either of the above might have changed the syscall number */
    syscall = syscall_get_nr(current, regs);

    if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
        trace_sys_enter(regs, syscall);

    syscall_enter_audit(regs, syscall);

    return ret ? : syscall;
}

static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
    unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

    if (work & SYSCALL_WORK_ENTER)
        syscall = syscall_trace_enter(regs, syscall, work);

    return syscall;
}

long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
    return __syscall_enter_from_user_work(regs, syscall);
}

noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
    long ret;

    __enter_from_user_mode(regs);

    instrumentation_begin();
    local_irq_enable();
    ret = __syscall_enter_from_user_work(regs, syscall);
    instrumentation_end();

    return ret;
}

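/*
 * Typical shape of an architecture's syscall path built on these
 * helpers (a sketch; invoke_syscall() and the bounds check stand in
 * for the arch specific dispatch):
 *
 *    nr = syscall_enter_from_user_mode(regs, nr);
 *    if (nr >= 0 && nr < NR_syscalls)
 *        invoke_syscall(regs, nr);
 *    syscall_exit_to_user_mode(regs);
 *
 * A negative number from syscall_enter_from_user_mode() means the
 * syscall was filtered, emulated or rolled back and must not be
 * executed.
 */
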
noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
    __enter_from_user_mode(regs);
    instrumentation_begin();
    local_irq_enable();
    instrumentation_end();
}

/* See comment for exit_to_user_mode() in entry-common.h */
static __always_inline void __exit_to_user_mode(void)
{
    instrumentation_begin();
    trace_hardirqs_on_prepare();
    lockdep_hardirqs_on_prepare();
    instrumentation_end();

    user_enter_irqoff();
    arch_exit_to_user_mode();
    lockdep_hardirqs_on(CALLER_ADDR0);
}

void noinstr exit_to_user_mode(void)
{
    __exit_to_user_mode();
}

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }

static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
                        unsigned long ti_work)
{
    /*
     * Before returning to user space, ensure that all pending work
     * items have been completed.
     */
    while (ti_work & EXIT_TO_USER_MODE_WORK) {

        local_irq_enable_exit_to_user(ti_work);

        if (ti_work & _TIF_NEED_RESCHED)
            schedule();

        if (ti_work & _TIF_UPROBE)
            uprobe_notify_resume(regs);

        if (ti_work & _TIF_PATCH_PENDING)
            klp_update_patch_state(current);

        if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
            arch_do_signal_or_restart(regs);

        if (ti_work & _TIF_NOTIFY_RESUME)
            resume_user_mode_work(regs);

        /* Architecture specific TIF work */
        arch_exit_to_user_mode_work(regs, ti_work);

        /*
         * Disable interrupts and reevaluate the work flags as they
         * might have changed while interrupts and preemption were
         * enabled above.
         */
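        /*
         * Example of the race this closes (a sketch): schedule() above
         * runs with interrupts enabled, so a wakeup or signal from
         * another CPU can set e.g. _TIF_SIGPENDING after the flags were
         * sampled. Rechecking with interrupts disabled ensures the task
         * cannot return to user space with actionable work pending.
         */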
        local_irq_disable_exit_to_user();

        /* Check if any of the above work has queued a deferred wakeup */
        tick_nohz_user_enter_prepare();

        ti_work = read_thread_flags();
    }

    /* Return the latest work state for arch_exit_to_user_mode() */
    return ti_work;
}

static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
    unsigned long ti_work = read_thread_flags();

    lockdep_assert_irqs_disabled();

    /* Flush pending rcuog wakeup before the last need_resched() check */
    tick_nohz_user_enter_prepare();

    if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
        ti_work = exit_to_user_mode_loop(regs, ti_work);

    arch_exit_to_user_mode_prepare(regs, ti_work);

    /* Ensure that the address limit is intact and no locks are held */
    addr_limit_user_check();
    kmap_assert_nomap();
    lockdep_assert_irqs_disabled();
    lockdep_sys_exit();
}

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). The syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static inline bool report_single_step(unsigned long work)
{
    if (work & SYSCALL_WORK_SYSCALL_EMU)
        return false;

    return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}
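
/*
 * The same logic as a table (illustrative):
 *
 *    SYSCALL_EMU   SYSCALL_EXIT_TRAP   report_single_step()
 *         0                0                 false
 *         0                1                 true
 *         1                x                 false
 */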

static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
    bool step;

    /*
     * If the syscall was rolled back due to syscall user dispatching,
     * then the tracers below are not invoked for the same reason as
     * the entry side was not invoked in syscall_trace_enter(): The ABI
     * of these syscalls is unknown.
     */
    if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
        if (unlikely(current->syscall_dispatch.on_dispatch)) {
            current->syscall_dispatch.on_dispatch = false;
            return;
        }
    }

    audit_syscall_exit(regs);

    if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
        trace_sys_exit(regs, syscall_get_return_value(current, regs));

    step = report_single_step(work);
    if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
        ptrace_report_syscall_exit(regs, step);
}

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
    unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
    unsigned long nr = syscall_get_nr(current, regs);

    CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

    if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
        if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
            local_irq_enable();
    }

    rseq_syscall(regs);

    /*
     * Do one-time syscall specific work. If these work items are
     * enabled, we want to run them exactly once per syscall exit with
     * interrupts enabled.
     */
    if (unlikely(work & SYSCALL_WORK_EXIT))
        syscall_exit_work(regs, work);
}

static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
    syscall_exit_to_user_mode_prepare(regs);
    local_irq_disable_exit_to_user();
    exit_to_user_mode_prepare(regs);
}

void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
    __syscall_exit_to_user_mode_work(regs);
}

__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
    instrumentation_begin();
    __syscall_exit_to_user_mode_work(regs);
    instrumentation_end();
    __exit_to_user_mode();
}

noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
    __enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
    instrumentation_begin();
    exit_to_user_mode_prepare(regs);
    instrumentation_end();
    __exit_to_user_mode();
}

noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
    irqentry_state_t ret = {
        .exit_rcu = false,
    };

    if (user_mode(regs)) {
        irqentry_enter_from_user_mode(regs);
        return ret;
    }

    /*
     * If this entry hit the idle task, invoke ct_irq_enter() whether
     * RCU is watching or not.
     *
     * Interrupts can nest when the first interrupt invokes softirq
     * processing on return, which enables interrupts.
     *
     * Scheduler ticks in the idle task can mark quiescent state and
     * terminate a grace period, if and only if the timer interrupt is
     * not nested into another interrupt.
     *
     * Checking for rcu_is_watching() here would prevent the nesting
     * interrupt from invoking ct_irq_enter(). If that nested interrupt
     * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
     * assume that it is the first interrupt and eventually claim
     * quiescent state and end grace periods prematurely.
     *
     * Unconditionally invoke ct_irq_enter() so RCU state stays
     * consistent.
     *
     * TINY_RCU does not support EQS, so let the compiler eliminate
     * this part when TINY_RCU is enabled.
     */
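    /*
     * A concrete sequence this guards against (a sketch):
     *
     *    idle task, RCU not watching
     *      -> IRQ A enters and calls ct_irq_enter()
     *      -> on return, IRQ A runs softirqs with interrupts enabled
     *        -> tick IRQ B nests; it too must call ct_irq_enter(),
     *           otherwise the tick would be treated as the outermost
     *           interrupt and could end a grace period too early.
     */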
    if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
        /*
         * If RCU is not watching then the same careful
         * sequence vs. lockdep and tracing is required
         * as in irqentry_enter_from_user_mode().
         */
        lockdep_hardirqs_off(CALLER_ADDR0);
        ct_irq_enter();
        instrumentation_begin();
        trace_hardirqs_off_finish();
        instrumentation_end();

        ret.exit_rcu = true;
        return ret;
    }

    /*
     * If RCU is watching then RCU only wants to check whether it needs
     * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
     * already contains a warning when RCU is not watching, so no point
     * in having another one here.
     */
    lockdep_hardirqs_off(CALLER_ADDR0);
    instrumentation_begin();
    rcu_irq_enter_check_tick();
    trace_hardirqs_off_finish();
    instrumentation_end();

    return ret;
}

void raw_irqentry_exit_cond_resched(void)
{
    if (!preempt_count()) {
        /* Sanity check RCU and thread stack */
        rcu_irq_exit_check_preempt();
        if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
            WARN_ON_ONCE(!on_thread_stack());
        if (need_resched())
            preempt_schedule_irq();
    }
}
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_irqentry_exit_cond_resched(void)
{
    if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
        return;
    raw_irqentry_exit_cond_resched();
}
#endif
#endif
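
/*
 * How callers reach the above (background; see <linux/entry-common.h>
 * for the exact mapping): with CONFIG_PREEMPT_DYNAMIC,
 * irqentry_exit_cond_resched() expands either to a static call
 * (HAVE_PREEMPT_DYNAMIC_CALL) that the preemption mode switch can
 * retarget, or to dynamic_irqentry_exit_cond_resched() guarded by the
 * static key above (HAVE_PREEMPT_DYNAMIC_KEY); without PREEMPT_DYNAMIC
 * it calls raw_irqentry_exit_cond_resched() directly.
 */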

noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
    lockdep_assert_irqs_disabled();

    /* Check whether this returns to user mode */
    if (user_mode(regs)) {
        irqentry_exit_to_user_mode(regs);
    } else if (!regs_irqs_disabled(regs)) {
        /*
         * If RCU was not watching on entry this needs to be done
         * carefully and needs the same ordering of lockdep/tracing
         * and RCU as the return to user mode path.
         */
        if (state.exit_rcu) {
            instrumentation_begin();
            /* Tell the tracer that IRET will enable interrupts */
            trace_hardirqs_on_prepare();
            lockdep_hardirqs_on_prepare();
            instrumentation_end();
            ct_irq_exit();
            lockdep_hardirqs_on(CALLER_ADDR0);
            return;
        }

        instrumentation_begin();
        if (IS_ENABLED(CONFIG_PREEMPTION))
            irqentry_exit_cond_resched();

        /* Covers both tracing and lockdep */
        trace_hardirqs_on();
        instrumentation_end();
    } else {
        /*
         * IRQ flags state is correct already. Just tell RCU if it
         * was not watching on entry.
         */
        if (state.exit_rcu)
            ct_irq_exit();
    }
}

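/*
 * NMIs can hit while lockdep considers hardirqs either enabled or
 * disabled, so irqentry_nmi_enter() snapshots lockdep_hardirqs_enabled()
 * and irqentry_nmi_exit() restores that view on the way out; the actual
 * hardware interrupt state is left alone.
 */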
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
    irqentry_state_t irq_state;

    irq_state.lockdep = lockdep_hardirqs_enabled();

    __nmi_enter();
    lockdep_hardirqs_off(CALLER_ADDR0);
    lockdep_hardirq_enter();
    ct_nmi_enter();

    instrumentation_begin();
    trace_hardirqs_off_finish();
    ftrace_nmi_enter();
    instrumentation_end();

    return irq_state;
}

void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
    instrumentation_begin();
    ftrace_nmi_exit();
    if (irq_state.lockdep) {
        trace_hardirqs_on_prepare();
        lockdep_hardirqs_on_prepare();
    }
    instrumentation_end();

    ct_nmi_exit();
    lockdep_hardirq_exit();
    if (irq_state.lockdep)
        lockdep_hardirqs_on(CALLER_ADDR0);
    __nmi_exit();
}