// SPDX-License-Identifier: GPL-2.0
/*
 * Detect hard lockups on a system
 *
 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
 *
 * Note: Most of this code is borrowed heavily from the original softlockup
 * detector, so thanks to Ingo for the initial implementation.
 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
 * to those contributors as well.
 */

#define pr_fmt(fmt) "NMI watchdog: " fmt

#include <linux/nmi.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/sched/debug.h>

#include <asm/irq_regs.h>
#include <linux/perf_event.h>

static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
static DEFINE_PER_CPU(struct perf_event *, dead_event);
static struct cpumask dead_events_mask;

static unsigned long hardlockup_allcpu_dumped;
static atomic_t watchdog_cpus = ATOMIC_INIT(0);

notrace void arch_touch_nmi_watchdog(void)
{
    /*
     * Using __raw here because some code paths have
     * preemption enabled.  If preemption is enabled
     * then interrupts should be enabled too, in which
     * case we shouldn't have to worry about the watchdog
     * going off.
     */
    raw_cpu_write(watchdog_nmi_touch, true);
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);
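
/*
 * Minimal usage sketch (illustrative only, assuming the generic
 * touch_nmi_watchdog() wrapper from <linux/nmi.h>, which calls
 * arch_touch_nmi_watchdog() together with its softlockup counterpart):
 * code that legitimately monopolizes a CPU, e.g. a long polling loop in
 * a crash or debug path, pets the watchdog so the next NMI sample is
 * discarded instead of being reported as a hard lockup.
 * poll_hw_ready() below is a stand-in for whatever condition such a
 * loop is actually waiting on.
 *
 *    while (!poll_hw_ready()) {
 *        touch_nmi_watchdog();
 *        cpu_relax();
 *    }
 */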

#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP
static DEFINE_PER_CPU(ktime_t, last_timestamp);
static DEFINE_PER_CPU(unsigned int, nmi_rearmed);
static ktime_t watchdog_hrtimer_sample_threshold __read_mostly;

void watchdog_update_hrtimer_threshold(u64 period)
{
    /*
     * The hrtimer runs with a period of (watchdog_threshold * 2) / 5
     *
     * So it runs effectively with 2.5 times the rate of the NMI
     * watchdog. That means the hrtimer should fire 2-3 times before
     * the NMI watchdog expires. The NMI watchdog on x86 is based on
     * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles
     * might run way faster than expected and the NMI fires in a
     * smaller period than the one deduced from the nominal CPU
     * frequency. Depending on the Turbo-Mode factor this might be fast
     * enough to get the NMI period smaller than the hrtimer watchdog
     * period and trigger false positives.
     *
     * The sample threshold is used to check in the NMI handler whether
     * the minimum time between two NMI samples has elapsed. That
     * prevents false positives.
     *
     * Set this to 4/5 of the actual watchdog threshold period so the
     * hrtimer is guaranteed to fire at least once within the real
     * watchdog threshold.
     */
    watchdog_hrtimer_sample_threshold = period * 2;
}
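
/*
 * Worked example, assuming the default watchdog_thresh of 10 seconds and
 * the period handed in by set_sample_period() in kernel/watchdog.c
 * (roughly 2 * watchdog_thresh / 5):
 *
 *    hrtimer period:   period    = 2 * 10 s / 5 = 4 s
 *    sample threshold: threshold = period * 2   = 8 s  (4/5 of 10 s)
 *
 * The NMI is programmed for roughly 10 s worth of nominal CPU cycles, so
 * even with some Turbo-Mode speed-up, two NMI samples that arrive less
 * than 8 s apart are treated as premature and filtered out by
 * watchdog_check_timestamp() below.
 */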

static bool watchdog_check_timestamp(void)
{
    ktime_t delta, now = ktime_get_mono_fast_ns();

    delta = now - __this_cpu_read(last_timestamp);
    if (delta < watchdog_hrtimer_sample_threshold) {
        /*
         * If ktime is jiffies based, a stalled timer would prevent
         * jiffies from being incremented and the filter would look
         * at a stale timestamp and never trigger.
         */
        if (__this_cpu_inc_return(nmi_rearmed) < 10)
            return false;
    }
    __this_cpu_write(nmi_rearmed, 0);
    __this_cpu_write(last_timestamp, now);
    return true;
}
#else
static inline bool watchdog_check_timestamp(void)
{
    return true;
}
#endif

static struct perf_event_attr wd_hw_attr = {
    .type       = PERF_TYPE_HARDWARE,
    .config     = PERF_COUNT_HW_CPU_CYCLES,
    .size       = sizeof(struct perf_event_attr),
    .pinned     = 1,
    .disabled   = 1,
};
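
/*
 * The attribute above requests a pinned hardware cycle counter that is
 * created in the disabled state: "pinned" means the event must stay on
 * the PMU at all times rather than being multiplexed with other events.
 * The missing .sample_period is filled in later by
 * hardlockup_detector_event_create() from hw_nmi_get_sample_period().
 */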

/* Callback function for perf event subsystem */
static void watchdog_overflow_callback(struct perf_event *event,
                       struct perf_sample_data *data,
                       struct pt_regs *regs)
{
    /* Ensure the watchdog never gets throttled */
    event->hw.interrupts = 0;

    if (__this_cpu_read(watchdog_nmi_touch) == true) {
        __this_cpu_write(watchdog_nmi_touch, false);
        return;
    }

    if (!watchdog_check_timestamp())
        return;

    /*
     * Check for a hardlockup. This is done by making sure the
     * hrtimer interrupt count is still incrementing. The timer
     * interrupt should have fired multiple times before this
     * counter overflowed; if it hasn't, that is a good indication
     * the CPU is stuck.
     */
    if (is_hardlockup()) {
        int this_cpu = smp_processor_id();

        /* only print hardlockups once */
        if (__this_cpu_read(hard_watchdog_warn) == true)
            return;

        pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
             this_cpu);
        print_modules();
        print_irqtrace_events(current);
        if (regs)
            show_regs(regs);
        else
            dump_stack();

        /*
         * Perform the all-CPU dump only once to avoid multiple
         * hardlockups generating interleaved traces.
         */
        if (sysctl_hardlockup_all_cpu_backtrace &&
                !test_and_set_bit(0, &hardlockup_allcpu_dumped))
            trigger_allbutself_cpu_backtrace();

        if (hardlockup_panic)
            nmi_panic(regs, "Hard LOCKUP");

        __this_cpu_write(hard_watchdog_warn, true);
        return;
    }

    __this_cpu_write(hard_watchdog_warn, false);
    return;
}
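
/*
 * The callback above runs from the PMU overflow interrupt, an NMI on x86,
 * so it fires even when the stuck CPU has normal interrupts disabled.
 * is_hardlockup() itself lives in the core watchdog code (kernel/watchdog.c)
 * and checks whether the per-CPU hrtimer interrupt count has advanced since
 * the previous NMI sample.
 */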

static int hardlockup_detector_event_create(void)
{
    unsigned int cpu = smp_processor_id();
    struct perf_event_attr *wd_attr;
    struct perf_event *evt;

    wd_attr = &wd_hw_attr;
    wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);

    /* Try to register using hardware perf events */
    evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
                           watchdog_overflow_callback, NULL);
    if (IS_ERR(evt)) {
        pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
             PTR_ERR(evt));
        return PTR_ERR(evt);
    }
    this_cpu_write(watchdog_ev, evt);
    return 0;
}
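
/*
 * perf_event_create_kernel_counter() is the in-kernel counterpart of the
 * perf_event_open() syscall: the call above binds the counter to this CPU
 * (task == NULL), and watchdog_overflow_callback() is invoked from the
 * overflow interrupt each time sample_period CPU cycles have elapsed. The
 * return value is either a valid event or an ERR_PTR() value, hence the
 * IS_ERR()/PTR_ERR() handling.
 */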

/**
 * hardlockup_detector_perf_enable - Enable the local event
 */
void hardlockup_detector_perf_enable(void)
{
    if (hardlockup_detector_event_create())
        return;

    /* use original value for check */
    if (!atomic_fetch_inc(&watchdog_cpus))
        pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");

    perf_event_enable(this_cpu_read(watchdog_ev));
}

/**
 * hardlockup_detector_perf_disable - Disable the local event
 */
void hardlockup_detector_perf_disable(void)
{
    struct perf_event *event = this_cpu_read(watchdog_ev);

    if (event) {
        perf_event_disable(event);
        this_cpu_write(watchdog_ev, NULL);
        this_cpu_write(dead_event, event);
        cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
        atomic_dec(&watchdog_cpus);
    }
}
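
/*
 * The disabled event is only parked in the dead_event slot here; the actual
 * perf_event_release_kernel() call is deferred to
 * hardlockup_detector_perf_cleanup(). Presumably this is because this path
 * runs from the CPU hotplug machinery, where releasing a perf event could
 * deadlock against the hotplug locking, so the release happens later from
 * the serialized cleanup stage instead.
 */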

/**
 * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
 *
 * Called from lockup_detector_cleanup(). Serialized by the caller.
 */
void hardlockup_detector_perf_cleanup(void)
{
    int cpu;

    for_each_cpu(cpu, &dead_events_mask) {
        struct perf_event *event = per_cpu(dead_event, cpu);

        /*
         * Required because for_each_cpu() unconditionally reports
         * CPU0 as set on UP kernels. Sigh.
         */
        if (event)
            perf_event_release_kernel(event);
        per_cpu(dead_event, cpu) = NULL;
    }
    cpumask_clear(&dead_events_mask);
}

/**
 * hardlockup_detector_perf_stop - Globally stop watchdog events
 *
 * Special interface for x86 to handle the perf HT bug.
 */
void __init hardlockup_detector_perf_stop(void)
{
    int cpu;

    lockdep_assert_cpus_held();

    for_each_online_cpu(cpu) {
        struct perf_event *event = per_cpu(watchdog_ev, cpu);

        if (event)
            perf_event_disable(event);
    }
}

/**
 * hardlockup_detector_perf_restart - Globally restart watchdog events
 *
 * Special interface for x86 to handle the perf HT bug.
 */
void __init hardlockup_detector_perf_restart(void)
{
    int cpu;

    lockdep_assert_cpus_held();

    if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
        return;

    for_each_online_cpu(cpu) {
        struct perf_event *event = per_cpu(watchdog_ev, cpu);

        if (event)
            perf_event_enable(event);
    }
}

/**
 * hardlockup_detector_perf_init - Probe whether NMI event is available at all
 */
int __init hardlockup_detector_perf_init(void)
{
    int ret = hardlockup_detector_event_create();

    if (ret) {
        pr_info("Perf NMI watchdog permanently disabled\n");
    } else {
        perf_event_release_kernel(this_cpu_read(watchdog_ev));
        this_cpu_write(watchdog_ev, NULL);
    }
    return ret;
}
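
/*
 * Note that hardlockup_detector_perf_init() only probes availability: it
 * creates a single event on the boot CPU and, on success, releases it again
 * immediately. The per-CPU watchdog events used at runtime are created later,
 * when the core watchdog code in kernel/watchdog.c calls
 * hardlockup_detector_perf_enable() on each CPU as it is brought online
 * (this ordering comes from the generic lockup-detector code, not from
 * anything in this file).
 */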