0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  *  Copyright (C) 1991, 1992  Linus Torvalds
0004  *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
0005  *  Copyright (C) 2011  Don Zickus Red Hat, Inc.
0006  *
0007  *  Pentium III FXSR, SSE support
0008  *  Gareth Hughes <gareth@valinux.com>, May 2000
0009  */
0010 
0011 /*
0012  * Handle hardware traps and faults.
0013  */
0014 #include <linux/spinlock.h>
0015 #include <linux/kprobes.h>
0016 #include <linux/kdebug.h>
0017 #include <linux/sched/debug.h>
0018 #include <linux/nmi.h>
0019 #include <linux/debugfs.h>
0020 #include <linux/delay.h>
0021 #include <linux/hardirq.h>
0022 #include <linux/ratelimit.h>
0023 #include <linux/slab.h>
0024 #include <linux/export.h>
0025 #include <linux/atomic.h>
0026 #include <linux/sched/clock.h>
0027 
0028 #include <asm/cpu_entry_area.h>
0029 #include <asm/traps.h>
0030 #include <asm/mach_traps.h>
0031 #include <asm/nmi.h>
0032 #include <asm/x86_init.h>
0033 #include <asm/reboot.h>
0034 #include <asm/cache.h>
0035 #include <asm/nospec-branch.h>
0036 #include <asm/sev.h>
0037 
0038 #define CREATE_TRACE_POINTS
0039 #include <trace/events/nmi.h>
0040 
0041 struct nmi_desc {
0042     raw_spinlock_t lock;
0043     struct list_head head;
0044 };
0045 
0046 static struct nmi_desc nmi_desc[NMI_MAX] = 
0047 {
0048     {
0049         .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
0050         .head = LIST_HEAD_INIT(nmi_desc[0].head),
0051     },
0052     {
0053         .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
0054         .head = LIST_HEAD_INIT(nmi_desc[1].head),
0055     },
0056     {
0057         .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
0058         .head = LIST_HEAD_INIT(nmi_desc[2].head),
0059     },
0060     {
0061         .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
0062         .head = LIST_HEAD_INIT(nmi_desc[3].head),
0063     },
0064 
0065 };
0066 
0067 struct nmi_stats {
0068     unsigned int normal;
0069     unsigned int unknown;
0070     unsigned int external;
0071     unsigned int swallow;
0072 };
0073 
0074 static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
0075 
0076 static int ignore_nmis __read_mostly;
0077 
0078 int unknown_nmi_panic;
0079 /*
0080  * Prevent the NMI reason port (0x61) from being accessed
0081  * simultaneously; it can only be used from the NMI handler.
0082  */
0083 static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
0084 
0085 static int __init setup_unknown_nmi_panic(char *str)
0086 {
0087     unknown_nmi_panic = 1;
0088     return 1;
0089 }
0090 __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
0091 
0092 #define nmi_to_desc(type) (&nmi_desc[type])
0093 
0094 static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
0095 
0096 static int __init nmi_warning_debugfs(void)
0097 {
0098     debugfs_create_u64("nmi_longest_ns", 0644,
0099             arch_debugfs_dir, &nmi_longest_ns);
0100     return 0;
0101 }
0102 fs_initcall(nmi_warning_debugfs);
0103 
0104 static void nmi_check_duration(struct nmiaction *action, u64 duration)
0105 {
0106     int remainder_ns, decimal_msecs;
0107 
0108     if (duration < nmi_longest_ns || duration < action->max_duration)
0109         return;
0110 
0111     action->max_duration = duration;
0112 
0113     remainder_ns = do_div(duration, (1000 * 1000));
0114     decimal_msecs = remainder_ns / 1000;
0115 
0116     printk_ratelimited(KERN_INFO
0117         "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
0118         action->handler, duration, decimal_msecs);
0119 }
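/*
 * Worked example (editor's illustration, not part of this file): the split
 * performed above turns a nanosecond duration into an "X.YYY msecs" string.
 * Plain C division stands in for the kernel's do_div(), which divides in
 * place and returns the remainder.
 */
#include <stdio.h>

int main(void)
{
    unsigned long long duration = 2345678ULL;       /* ns              */
    int remainder_ns = duration % (1000 * 1000);     /* 345678          */
    int decimal_msecs = remainder_ns / 1000;         /* 345             */

    duration /= (1000 * 1000);                       /* 2               */
    printf("%llu.%03d msecs\n", duration, decimal_msecs); /* 2.345 msecs */
    return 0;
}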
0120 
0121 static int nmi_handle(unsigned int type, struct pt_regs *regs)
0122 {
0123     struct nmi_desc *desc = nmi_to_desc(type);
0124     struct nmiaction *a;
0125     int handled = 0;
0126 
0127     rcu_read_lock();
0128 
0129     /*
0130      * NMIs are edge-triggered, which means if you have enough
0131      * of them concurrently, you can lose some because only one
0132      * can be latched at any given time.  Walk the whole list
0133      * to handle those situations.
0134      */
0135     list_for_each_entry_rcu(a, &desc->head, list) {
0136         int thishandled;
0137         u64 delta;
0138 
0139         delta = sched_clock();
0140         thishandled = a->handler(type, regs);
0141         handled += thishandled;
0142         delta = sched_clock() - delta;
0143         trace_nmi_handler(a->handler, (int)delta, thishandled);
0144 
0145         nmi_check_duration(a, delta);
0146     }
0147 
0148     rcu_read_unlock();
0149 
0150     /* return total number of NMI events handled */
0151     return handled;
0152 }
0153 NOKPROBE_SYMBOL(nmi_handle);
0154 
0155 int __register_nmi_handler(unsigned int type, struct nmiaction *action)
0156 {
0157     struct nmi_desc *desc = nmi_to_desc(type);
0158     unsigned long flags;
0159 
0160     if (WARN_ON_ONCE(!action->handler || !list_empty(&action->list)))
0161         return -EINVAL;
0162 
0163     raw_spin_lock_irqsave(&desc->lock, flags);
0164 
0165     /*
0166      * Indicate if there are multiple registrations on the
0167      * internal NMI handler call chains (SERR and IO_CHECK).
0168      */
0169     WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
0170     WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
0171 
0172     /*
0173      * Some handlers need to be executed first, otherwise a fake
0174      * event confuses some handlers (kdump uses this flag).
0175      */
0176     if (action->flags & NMI_FLAG_FIRST)
0177         list_add_rcu(&action->list, &desc->head);
0178     else
0179         list_add_tail_rcu(&action->list, &desc->head);
0180 
0181     raw_spin_unlock_irqrestore(&desc->lock, flags);
0182     return 0;
0183 }
0184 EXPORT_SYMBOL(__register_nmi_handler);
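/*
 * Editor's sketch (not part of this file): how a module would typically
 * hook the NMI_LOCAL chain through the register_nmi_handler() wrapper
 * from <asm/nmi.h>, which fills in a struct nmiaction and calls
 * __register_nmi_handler() above.  mydev_nmi_pending(), mydev_ack_nmi()
 * and the "mydev" name are made up for illustration.
 */
#include <linux/init.h>
#include <asm/nmi.h>

static int mydev_nmi_handler(unsigned int type, struct pt_regs *regs)
{
    if (!mydev_nmi_pending())
        return NMI_DONE;    /* 0: not ours, let other handlers run */

    mydev_ack_nmi();
    return NMI_HANDLED;     /* 1: one event handled; nmi_handle() sums these */
}

static int __init mydev_init(void)
{
    /* flags = 0; NMI_FLAG_FIRST would add us at the head of the list */
    return register_nmi_handler(NMI_LOCAL, mydev_nmi_handler, 0, "mydev");
}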
0185 
0186 void unregister_nmi_handler(unsigned int type, const char *name)
0187 {
0188     struct nmi_desc *desc = nmi_to_desc(type);
0189     struct nmiaction *n, *found = NULL;
0190     unsigned long flags;
0191 
0192     raw_spin_lock_irqsave(&desc->lock, flags);
0193 
0194     list_for_each_entry_rcu(n, &desc->head, list) {
0195         /*
0196          * the name passed in to describe the nmi handler
0197          * is used as the lookup key
0198          */
0199         if (!strcmp(n->name, name)) {
0200             WARN(in_nmi(),
0201                 "Trying to free NMI (%s) from NMI context!\n", n->name);
0202             list_del_rcu(&n->list);
0203             found = n;
0204             break;
0205         }
0206     }
0207 
0208     raw_spin_unlock_irqrestore(&desc->lock, flags);
0209     if (found) {
0210         synchronize_rcu();
0211         INIT_LIST_HEAD(&found->list);
0212     }
0213 }
0214 EXPORT_SYMBOL_GPL(unregister_nmi_handler);
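/*
 * Editor's sketch, continuing the hypothetical "mydev" example above:
 * teardown uses the same name as the lookup key.  Once this returns, the
 * synchronize_rcu() in unregister_nmi_handler() guarantees no CPU is
 * still running mydev_nmi_handler().
 */
static void __exit mydev_exit(void)
{
    unregister_nmi_handler(NMI_LOCAL, "mydev");
}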
0215 
0216 static void
0217 pci_serr_error(unsigned char reason, struct pt_regs *regs)
0218 {
0219     /* check to see if anyone registered against these types of errors */
0220     if (nmi_handle(NMI_SERR, regs))
0221         return;
0222 
0223     pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
0224          reason, smp_processor_id());
0225 
0226     if (panic_on_unrecovered_nmi)
0227         nmi_panic(regs, "NMI: Not continuing");
0228 
0229     pr_emerg("Dazed and confused, but trying to continue\n");
0230 
0231     /* Clear and disable the PCI SERR error line. */
0232     reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
0233     outb(reason, NMI_REASON_PORT);
0234 }
0235 NOKPROBE_SYMBOL(pci_serr_error);
0236 
0237 static void
0238 io_check_error(unsigned char reason, struct pt_regs *regs)
0239 {
0240     unsigned long i;
0241 
0242     /* check to see if anyone registered against these types of errors */
0243     if (nmi_handle(NMI_IO_CHECK, regs))
0244         return;
0245 
0246     pr_emerg(
0247     "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
0248          reason, smp_processor_id());
0249     show_regs(regs);
0250 
0251     if (panic_on_io_nmi) {
0252         nmi_panic(regs, "NMI IOCK error: Not continuing");
0253 
0254         /*
0255          * If we end up here, it means we have received an NMI while
0256          * processing panic(). Simply return without delaying and
0257          * re-enabling NMIs.
0258          */
0259         return;
0260     }
0261 
0262     /* Re-enable the IOCK line, wait for a few seconds */
0263     reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
0264     outb(reason, NMI_REASON_PORT);
0265 
0266     i = 20000;
0267     while (--i) {
0268         touch_nmi_watchdog();
0269         udelay(100);
0270     }
0271 
0272     reason &= ~NMI_REASON_CLEAR_IOCHK;
0273     outb(reason, NMI_REASON_PORT);
0274 }
0275 NOKPROBE_SYMBOL(io_check_error);
0276 
0277 static void
0278 unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
0279 {
0280     int handled;
0281 
0282     /*
0283      * Back-to-back NMIs are dealt with one level up.
0284      * Of course this makes having multiple 'unknown' handlers useless
0285      * as only the first one is ever run (unless it can actually determine
0286      * if it caused the NMI).
0287      */
0288     handled = nmi_handle(NMI_UNKNOWN, regs);
0289     if (handled) {
0290         __this_cpu_add(nmi_stats.unknown, handled);
0291         return;
0292     }
0293 
0294     __this_cpu_add(nmi_stats.unknown, 1);
0295 
0296     pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
0297          reason, smp_processor_id());
0298 
0299     if (unknown_nmi_panic || panic_on_unrecovered_nmi)
0300         nmi_panic(regs, "NMI: Not continuing");
0301 
0302     pr_emerg("Dazed and confused, but trying to continue\n");
0303 }
0304 NOKPROBE_SYMBOL(unknown_nmi_error);
0305 
0306 static DEFINE_PER_CPU(bool, swallow_nmi);
0307 static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
0308 
0309 static noinstr void default_do_nmi(struct pt_regs *regs)
0310 {
0311     unsigned char reason = 0;
0312     int handled;
0313     bool b2b = false;
0314 
0315     /*
0316      * CPU-specific NMI must be processed before non-CPU-specific
0317      * NMI, otherwise we may lose it, because the CPU-specific
0318      * NMI can not be detected/processed on other CPUs.
0319      */
0320 
0321     /*
0322      * Back-to-back NMIs are interesting because they can either
0323      * be two NMIs or more than two NMIs (anything over two is dropped
0324      * due to NMIs being edge-triggered).  If this is the second half
0325      * of a back-to-back NMI, assume we dropped things and process
0326      * more handlers.  Otherwise, reset the 'swallow' NMI behaviour.
0327      */
0328     if (regs->ip == __this_cpu_read(last_nmi_rip))
0329         b2b = true;
0330     else
0331         __this_cpu_write(swallow_nmi, false);
0332 
0333     __this_cpu_write(last_nmi_rip, regs->ip);
0334 
0335     instrumentation_begin();
0336 
0337     handled = nmi_handle(NMI_LOCAL, regs);
0338     __this_cpu_add(nmi_stats.normal, handled);
0339     if (handled) {
0340         /*
0341          * There are cases when an NMI handler handles multiple
0342          * events in the current NMI.  One of these events may
0343          * be queued for the next NMI.  Because the event is
0344          * already handled, the next NMI will result in an unknown
0345          * NMI.  Instead, let's flag this for a potential NMI to
0346          * swallow.
0347          */
0348         if (handled > 1)
0349             __this_cpu_write(swallow_nmi, true);
0350         goto out;
0351     }
0352 
0353     /*
0354      * Non-CPU-specific NMI: NMI sources can be processed on any CPU.
0355      *
0356      * Another CPU may be processing panic routines while holding
0357      * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping,
0358      * and if so, call its callback directly.  If there is no CPU preparing
0359      * crash dump, we simply loop here.
0360      */
0361     while (!raw_spin_trylock(&nmi_reason_lock)) {
0362         run_crash_ipi_callback(regs);
0363         cpu_relax();
0364     }
0365 
0366     reason = x86_platform.get_nmi_reason();
0367 
0368     if (reason & NMI_REASON_MASK) {
0369         if (reason & NMI_REASON_SERR)
0370             pci_serr_error(reason, regs);
0371         else if (reason & NMI_REASON_IOCHK)
0372             io_check_error(reason, regs);
0373 #ifdef CONFIG_X86_32
0374         /*
0375          * Reassert NMI in case it became active
0376          * meanwhile as it's edge-triggered:
0377          */
0378         reassert_nmi();
0379 #endif
0380         __this_cpu_add(nmi_stats.external, 1);
0381         raw_spin_unlock(&nmi_reason_lock);
0382         goto out;
0383     }
0384     raw_spin_unlock(&nmi_reason_lock);
0385 
0386     /*
0387      * Only one NMI can be latched at a time.  To handle
0388      * this we may process multiple nmi handlers at once to
0389      * cover the case where an NMI is dropped.  The downside
0390      * to this approach is we may process an NMI prematurely,
0391      * while its real NMI is sitting latched.  This will cause
0392      * an unknown NMI on the next run of the NMI processing.
0393      *
0394      * We tried to flag that condition above, by setting the
0395      * swallow_nmi flag when we process more than one event.
0396      * This condition is also only present on the second half
0397      * of a back-to-back NMI, so we flag that condition too.
0398      *
0399      * If both are true, we assume we already processed this
0400      * NMI previously and we swallow it.  Otherwise we reset
0401      * the logic.
0402      *
0403      * There are scenarios where we may accidentally swallow
0404      * a 'real' unknown NMI.  For example, while processing
0405      * a perf NMI another perf NMI comes in along with a
0406      * 'real' unknown NMI.  These two NMIs get combined into
0407      * one (as described above).  When the next NMI gets
0408      * processed, it will be flagged by perf as handled, but
0409      * no one will know that there was a 'real' unknown NMI sent
0410      * also.  As a result it gets swallowed.  Or if the first
0411      * perf NMI returns two events handled then the second
0412      * NMI will get eaten by the logic below, again losing a
0413      * 'real' unknown NMI.  But this is the best we can do
0414      * for now.
0415      */
0416     if (b2b && __this_cpu_read(swallow_nmi))
0417         __this_cpu_add(nmi_stats.swallow, 1);
0418     else
0419         unknown_nmi_error(reason, regs);
0420 
0421 out:
0422     instrumentation_end();
0423 }
0424 
0425 /*
0426  * An NMI can page fault or hit a breakpoint, which will cause it to lose
0427  * its NMI context with the CPU when the breakpoint or page fault does an IRET.
0428  *
0429  * As a result, NMIs can nest if NMIs get unmasked due to an IRET during
0430  * NMI processing.  On x86_64, the asm glue protects us from nested NMIs
0431  * if the outer NMI came from kernel mode, but we can still nest if the
0432  * outer NMI came from user mode.
0433  *
0434  * To handle these nested NMIs, we have three states:
0435  *
0436  *  1) not running
0437  *  2) executing
0438  *  3) latched
0439  *
0440  * When no NMI is in progress, it is in the "not running" state.
0441  * When an NMI comes in, it goes into the "executing" state.
0442  * Normally, if another NMI is triggered, it does not interrupt
0443  * the running NMI and the HW will simply latch it so that when
0444  * the first NMI finishes, it will restart the second NMI.
0445  * (Note, the latch is binary, thus multiple NMIs triggering,
0446  *  when one is running, are ignored. Only one NMI is restarted.)
0447  *
0448  * If an NMI executes an iret, another NMI can preempt it. We do not
0449  * want to allow this new NMI to run, but we want to execute it when the
0450  * first one finishes.  We set the state to "latched", and the exit of
0451  * the first NMI will perform a dec_return; if the result is zero
0452  * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
0453  * dec_return would have set the state to NMI_EXECUTING (what we want it
0454  * to be when we are running). In this case, we simply jump back to
0455  * rerun the NMI handler again, and restart the 'latched' NMI.
0456  *
0457  * No trap (breakpoint or page fault) should be hit before nmi_restart,
0458  * thus there is no race between the first check of state for NOT_RUNNING
0459  * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
0460  * at this point.
0461  *
0462  * In case the NMI takes a page fault, we need to save off the CR2
0463  * because the NMI could have preempted another page fault and corrupted
0464  * the CR2 that is about to be read. As nested NMIs must be restarted
0465  * and they can not take breakpoints or page faults, the update of the
0466  * CR2 must be done before converting the nmi state back to NOT_RUNNING.
0467  * Otherwise, there would be a race of another nested NMI coming in
0468  * after setting state to NOT_RUNNING but before updating the nmi_cr2.
0469  */
0470 enum nmi_states {
0471     NMI_NOT_RUNNING = 0,
0472     NMI_EXECUTING,
0473     NMI_LATCHED,
0474 };
0475 static DEFINE_PER_CPU(enum nmi_states, nmi_state);
0476 static DEFINE_PER_CPU(unsigned long, nmi_cr2);
0477 static DEFINE_PER_CPU(unsigned long, nmi_dr7);
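/*
 * Editor's illustration (userspace C, not part of this file): the
 * three-state protocol described above, modelled with a single C11 atomic
 * instead of a per-CPU variable.  nmi_event() and handle_one_nmi() are
 * invented names; real NMIs are delivered by hardware, and the LATCHED
 * transition happens from a nested entry, not from ordinary code.
 */
#include <stdatomic.h>
#include <stdio.h>

enum { NOT_RUNNING = 0, EXECUTING = 1, LATCHED = 2 };

static _Atomic int nmi_state = NOT_RUNNING;

static void handle_one_nmi(void)
{
    puts("handling one NMI");
}

static void nmi_event(void)
{
    /* A nested event only latches; the running instance will replay it. */
    if (atomic_load(&nmi_state) != NOT_RUNNING) {
        atomic_store(&nmi_state, LATCHED);
        return;
    }
    atomic_store(&nmi_state, EXECUTING);

    do {
        handle_one_nmi();
        /*
         * The dec_return trick: EXECUTING (1) drops to NOT_RUNNING (0)
         * and we are done; LATCHED (2) drops back to EXECUTING (1) and
         * we rerun the handler for the event that arrived meanwhile.
         */
    } while (atomic_fetch_sub(&nmi_state, 1) - 1 != NOT_RUNNING);
}

int main(void)
{
    nmi_event();    /* no nesting here, so the handler runs exactly once */
    return 0;
}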
0478 
0479 DEFINE_IDTENTRY_RAW(exc_nmi)
0480 {
0481     irqentry_state_t irq_state;
0482 
0483     /*
0484      * Re-enable NMIs right here when running as an SEV-ES guest. This might
0485      * cause nested NMIs, but those can be handled safely.
0486      */
0487     sev_es_nmi_complete();
0488 
0489     if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
0490         return;
0491 
0492     if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
0493         this_cpu_write(nmi_state, NMI_LATCHED);
0494         return;
0495     }
0496     this_cpu_write(nmi_state, NMI_EXECUTING);
0497     this_cpu_write(nmi_cr2, read_cr2());
0498 nmi_restart:
0499 
0500     /*
0501      * Needs to happen before DR7 is accessed, because the hypervisor can
0502      * intercept DR7 reads/writes, turning those into #VC exceptions.
0503      */
0504     sev_es_ist_enter(regs);
0505 
0506     this_cpu_write(nmi_dr7, local_db_save());
0507 
0508     irq_state = irqentry_nmi_enter(regs);
0509 
0510     inc_irq_stat(__nmi_count);
0511 
0512     if (!ignore_nmis)
0513         default_do_nmi(regs);
0514 
0515     irqentry_nmi_exit(regs, irq_state);
0516 
0517     local_db_restore(this_cpu_read(nmi_dr7));
0518 
0519     sev_es_ist_exit();
0520 
0521     if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
0522         write_cr2(this_cpu_read(nmi_cr2));
0523     if (this_cpu_dec_return(nmi_state))
0524         goto nmi_restart;
0525 
0526     if (user_mode(regs))
0527         mds_user_clear_cpu_buffers();
0528 }
0529 
0530 #if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
0531 DEFINE_IDTENTRY_RAW(exc_nmi_noist)
0532 {
0533     exc_nmi(regs);
0534 }
0535 #endif
0536 #if IS_MODULE(CONFIG_KVM_INTEL)
0537 EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
0538 #endif
0539 
0540 void stop_nmi(void)
0541 {
0542     ignore_nmis++;
0543 }
0544 
0545 void restart_nmi(void)
0546 {
0547     ignore_nmis--;
0548 }
0549 
0550 /* reset the back-to-back NMI logic */
0551 void local_touch_nmi(void)
0552 {
0553     __this_cpu_write(last_nmi_rip, 0);
0554 }
0555 EXPORT_SYMBOL_GPL(local_touch_nmi);