// SPDX-License-Identifier: GPL-2.0
/*
 * Watchdog support on powerpc systems.
 *
 * Copyright 2017, IBM Corporation.
 *
 * This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c
 */

#define pr_fmt(fmt) "watchdog: " fmt

#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/export.h>
#include <linux/kprobes.h>
#include <linux/hardirq.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/sched/debug.h>
#include <linux/delay.h>
#include <linux/processor.h>
#include <linux/smp.h>

#include <asm/interrupt.h>
#include <asm/paca.h>
#include <asm/nmi.h>

/*
 * The powerpc watchdog ensures that each CPU is able to service timers.
 * The watchdog sets up a simple timer on each CPU to run once per timer
 * period, and updates a per-cpu timestamp and a "pending" cpumask. This is
 * the heartbeat.
 *
 * Then there are two systems to check that the heartbeat is still running:
 * the local soft-NMI, and the SMP checker.
 *
 * The soft-NMI checker can detect lockups on the local CPU. When interrupts
 * are disabled with local_irq_disable(), platforms that use soft-masking
 * can leave hardware interrupts enabled and handle them with a masked
 * interrupt handler. The masked handler can send the timer interrupt to the
 * watchdog's soft_nmi_interrupt(), which appears to Linux as an NMI
 * interrupt, and can be used to detect CPUs stuck with IRQs disabled.
 *
 * The soft-NMI checker will compare the heartbeat timestamp for this CPU
 * with the current time, and take action if the difference exceeds the
 * watchdog threshold.
 *
 * The limitation of the soft-NMI watchdog is that it does not work when
 * interrupts are hard disabled or otherwise not being serviced. This is
 * solved by also having an SMP watchdog where all CPUs check all other
 * CPUs' heartbeats.
 *
 * The SMP checker can detect lockups on other CPUs. A global "pending"
 * cpumask is kept, containing all CPUs which enable the watchdog. Each
 * CPU clears its pending bit in its heartbeat timer. When the bitmask
 * becomes empty, the last CPU to clear its pending bit updates a global
 * timestamp and refills the pending bitmask.
 *
 * In the heartbeat timer, if any CPU notices that the global timestamp has
 * not been updated for a period exceeding the watchdog threshold, then it
 * means the heartbeat of the CPU(s) with their bit still set in the pending
 * mask has stopped, and action is taken.
 *
 * Some platforms implement true NMI IPIs, which can be used by the SMP
 * watchdog to detect an unresponsive CPU and pull it out of its stuck
 * state with the NMI IPI, to get crash/debug data from it. This way the
 * SMP watchdog can detect hardware-interrupts-off lockups.
 */
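
/*
 * As a worked example (a sketch, assuming the default watchdog_thresh of
 * 10 seconds and the arithmetic in watchdog_calc_timeouts() below): the
 * heartbeat timer fires on each CPU every 10 * 2/5 = 4 seconds, the
 * soft-NMI checker fires if a CPU's own heartbeat is more than 10 seconds
 * stale, and the SMP checker fires if the global heartbeat is more than
 * 10 * 3/2 = 15 seconds stale.
 */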

static cpumask_t wd_cpus_enabled __read_mostly;

static u64 wd_panic_timeout_tb __read_mostly; /* timebase ticks until panic */
static u64 wd_smp_panic_timeout_tb __read_mostly; /* panic other CPUs */

static u64 wd_timer_period_ms __read_mostly;  /* interval between heartbeats */

static DEFINE_PER_CPU(struct hrtimer, wd_hrtimer);
static DEFINE_PER_CPU(u64, wd_timer_tb);

/* SMP checker bits */
static unsigned long __wd_smp_lock;
static unsigned long __wd_reporting;
static unsigned long __wd_nmi_output;
static cpumask_t wd_smp_cpus_pending;
static cpumask_t wd_smp_cpus_stuck;
static u64 wd_smp_last_reset_tb;

#ifdef CONFIG_PPC_PSERIES
static u64 wd_timeout_pct;
#endif

/*
 * Try to take the exclusive watchdog action / NMI IPI / printing lock.
 * wd_smp_lock must be held. If this fails, we should return and wait
 * for the watchdog to kick in again (or another CPU to trigger it).
 *
 * Importantly, if hardlockup_panic is set, wd_try_report failure should
 * not delay the panic, because whichever other CPU is reporting will
 * call panic.
 */
static bool wd_try_report(void)
{
    if (__wd_reporting)
        return false;
    __wd_reporting = 1;
    return true;
}

/* End printing after successful wd_try_report. wd_smp_lock not required. */
static void wd_end_reporting(void)
{
    smp_mb(); /* End printing "critical section" */
    WARN_ON_ONCE(__wd_reporting == 0);
    WRITE_ONCE(__wd_reporting, 0);
}

static inline void wd_smp_lock(unsigned long *flags)
{
    /*
     * Avoid locking layers if possible.
     * This may be called from low level interrupt handlers at some
     * point in the future.
     */
    raw_local_irq_save(*flags);
    hard_irq_disable(); /* Make it soft-NMI safe */
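    /*
     * Test-and-test-and-set: if the lock is held, spin on a plain read
     * (with irqs restored) until it looks free, and only then retry the
     * atomic op. This avoids bouncing the lock cacheline and spinning
     * with irqs hard-disabled.
     */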
    while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) {
        raw_local_irq_restore(*flags);
        spin_until_cond(!test_bit(0, &__wd_smp_lock));
        raw_local_irq_save(*flags);
        hard_irq_disable();
    }
}

static inline void wd_smp_unlock(unsigned long *flags)
{
    clear_bit_unlock(0, &__wd_smp_lock);
    raw_local_irq_restore(*flags);
}

static void wd_lockup_ipi(struct pt_regs *regs)
{
    int cpu = raw_smp_processor_id();
    u64 tb = get_tb();

    pr_emerg("CPU %d Hard LOCKUP\n", cpu);
    pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
         cpu, tb, per_cpu(wd_timer_tb, cpu),
         tb_to_ns(tb - per_cpu(wd_timer_tb, cpu)) / 1000000);
    print_modules();
    print_irqtrace_events(current);
    if (regs)
        show_regs(regs);
    else
        dump_stack();

    /*
     * __wd_nmi_output must be set after we printk from NMI context.
     *
     * printk from NMI context defers printing to the console to irq_work.
     * If that NMI was taken in some code that is hard-locked, then irqs
     * are disabled so irq_work will never fire. That can result in the
     * hard lockup messages being delayed (indefinitely, until something
     * else kicks the console drivers).
     *
     * Setting __wd_nmi_output will cause another CPU to notice and kick
     * the console drivers for us.
     *
     * xchg is not needed here (it could be a smp_mb and store), but xchg
     * gives the memory ordering and atomicity required.
     */
    xchg(&__wd_nmi_output, 1);

    /* Do not panic from here because that can recurse into NMI IPI layer */
}

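/*
 * Mark a CPU stuck: move it from the pending mask to the stuck mask. If
 * that empties the pending mask, reset the SMP heartbeat and refill the
 * mask (minus stuck CPUs), and return true so the caller knows the mask
 * it may be iterating over has been refilled.
 */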
static bool set_cpu_stuck(int cpu)
{
    cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
    cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
    /*
     * See wd_smp_clear_cpu_pending()
     */
    smp_mb();
    if (cpumask_empty(&wd_smp_cpus_pending)) {
        wd_smp_last_reset_tb = get_tb();
        cpumask_andnot(&wd_smp_cpus_pending,
                &wd_cpus_enabled,
                &wd_smp_cpus_stuck);
        return true;
    }
    return false;
}

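/*
 * Run from a healthy CPU's heartbeat timer when the global SMP heartbeat
 * has gone stale: mark the CPUs that failed to clear their pending bits
 * as stuck, try to pull crash/debug data out of them with an NMI IPI (or
 * a full backtrace if sysctl_hardlockup_all_cpu_backtrace is set), then
 * panic if hardlockup_panic is set.
 */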
static void watchdog_smp_panic(int cpu)
{
    static cpumask_t wd_smp_cpus_ipi; // protected by reporting
    unsigned long flags;
    u64 tb, last_reset;
    int c;

    wd_smp_lock(&flags);
    /* Double check some things under lock */
    tb = get_tb();
    last_reset = wd_smp_last_reset_tb;
    if ((s64)(tb - last_reset) < (s64)wd_smp_panic_timeout_tb)
        goto out;
    if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
        goto out;
    if (!wd_try_report())
        goto out;
    for_each_online_cpu(c) {
        if (!cpumask_test_cpu(c, &wd_smp_cpus_pending))
            continue;
        if (c == cpu)
            continue; // should not happen

        __cpumask_set_cpu(c, &wd_smp_cpus_ipi);
        if (set_cpu_stuck(c))
            break;
    }
    if (cpumask_empty(&wd_smp_cpus_ipi)) {
        wd_end_reporting();
        goto out;
    }
    wd_smp_unlock(&flags);

    pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
         cpu, cpumask_pr_args(&wd_smp_cpus_ipi));
    pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n",
         cpu, tb, last_reset, tb_to_ns(tb - last_reset) / 1000000);

    if (!sysctl_hardlockup_all_cpu_backtrace) {
        /*
         * Try to trigger the stuck CPUs, unless we are going to
         * get a backtrace on all of them anyway.
         */
        for_each_cpu(c, &wd_smp_cpus_ipi) {
            smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
            __cpumask_clear_cpu(c, &wd_smp_cpus_ipi);
        }
    } else {
        trigger_allbutself_cpu_backtrace();
        cpumask_clear(&wd_smp_cpus_ipi);
    }

    if (hardlockup_panic)
        nmi_panic(NULL, "Hard LOCKUP");

    wd_end_reporting();

    return;

out:
    wd_smp_unlock(&flags);
}

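/*
 * Clear this CPU's bit in the pending mask, un-sticking the CPU first if
 * another CPU had declared it stuck. The last CPU to clear its bit resets
 * the global heartbeat timestamp and refills the pending mask.
 */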
static void wd_smp_clear_cpu_pending(int cpu)
{
    if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
        if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
            struct pt_regs *regs = get_irq_regs();
            unsigned long flags;

            pr_emerg("CPU %d became unstuck TB:%lld\n",
                 cpu, get_tb());
            print_irqtrace_events(current);
            if (regs)
                show_regs(regs);
            else
                dump_stack();

            wd_smp_lock(&flags);
            cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
            wd_smp_unlock(&flags);
        } else {
            /*
             * The last CPU to clear pending should have reset the
             * watchdog so we generally should not find it empty
             * here if our CPU was clear. However it could happen
             * due to a rare race with another CPU taking the
             * last CPU out of the mask concurrently.
             *
             * We can't add a warning for it. But just in case
             * there is a problem with the watchdog that is causing
             * the mask to not be reset, try to kick it along here.
             */
            if (unlikely(cpumask_empty(&wd_smp_cpus_pending)))
                goto none_pending;
        }
        return;
    }

    /*
     * All other updates to wd_smp_cpus_pending are performed under
     * wd_smp_lock. All of them are atomic except the case where the
     * mask becomes empty and is reset. This will not happen here because
     * cpu was tested to be in the bitmap (above), and a CPU only clears
     * its own bit. _Except_ in the case where another CPU has detected a
     * hard lockup on our CPU and takes us out of the pending mask. So in
     * normal operation there will be no race here, no problem.
     *
     * In the lockup case, this atomic clear-bit vs a store that refills
     * other bits in the accessed word will not be a problem. The bit clear
     * is atomic so it will not cause the store to get lost, and the store
     * will never set this bit so it will not overwrite the bit clear. The
     * only way for a stuck CPU to return to the pending bitmap is to
     * become unstuck itself.
     */
    cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);

    /*
     * Order the store to clear pending with the load(s) that check that
     * all words in the pending mask are empty. This orders with the same
     * barrier on another CPU. This prevents two CPUs clearing the last 2
     * pending bits, but neither seeing the other's store when checking
     * if the mask is empty, and missing an empty mask, which ends with
     * a false positive.
     */
    smp_mb();
    if (cpumask_empty(&wd_smp_cpus_pending)) {
        unsigned long flags;

none_pending:
        /*
         * Double check under lock because more than one CPU could see
         * a clear mask with the lockless check after clearing their
         * pending bits.
         */
        wd_smp_lock(&flags);
        if (cpumask_empty(&wd_smp_cpus_pending)) {
            wd_smp_last_reset_tb = get_tb();
            cpumask_andnot(&wd_smp_cpus_pending,
                    &wd_cpus_enabled,
                    &wd_smp_cpus_stuck);
        }
        wd_smp_unlock(&flags);
    }
}

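/*
 * The per-CPU heartbeat, run from each CPU's watchdog hrtimer: record the
 * current timebase, clear our bit in the pending mask, and check whether
 * the global SMP heartbeat has gone stale.
 */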
static void watchdog_timer_interrupt(int cpu)
{
    u64 tb = get_tb();

    per_cpu(wd_timer_tb, cpu) = tb;

    wd_smp_clear_cpu_pending(cpu);

    if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
        watchdog_smp_panic(cpu);

    if (__wd_nmi_output && xchg(&__wd_nmi_output, 0)) {
        /*
         * Something has called printk from NMI context. It might be
         * stuck, so this triggers a flush that will get that
         * printk output to the console.
         *
         * See wd_lockup_ipi.
         */
        printk_trigger_flush();
    }
}

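/*
 * The local soft-NMI checker. On soft-masking platforms the masked
 * decrementer interrupt is diverted here, so this can run even while the
 * CPU has interrupts soft-disabled and catch it stuck with irqs off past
 * the watchdog threshold.
 */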
DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
{
    unsigned long flags;
    int cpu = raw_smp_processor_id();
    u64 tb;

    /* should only arrive from kernel, with irqs disabled */
    WARN_ON_ONCE(!arch_irq_disabled_regs(regs));

    if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
        return 0;

    __this_cpu_inc(irq_stat.soft_nmi_irqs);

    tb = get_tb();
    if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) {
        /*
         * Taking wd_smp_lock here means it is a soft-NMI lock, which
         * means we can't take any regular or irqsafe spin locks while
         * holding this lock. This is why timers can't printk while
         * holding the lock.
         */
        wd_smp_lock(&flags);
        if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) {
            wd_smp_unlock(&flags);
            return 0;
        }
        if (!wd_try_report()) {
            wd_smp_unlock(&flags);
            /* Couldn't report, try again in 100ms */
            mtspr(SPRN_DEC, 100 * tb_ticks_per_usec * 1000);
            return 0;
        }

        set_cpu_stuck(cpu);

        wd_smp_unlock(&flags);

        pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n",
             cpu, (void *)regs->nip);
        pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
             cpu, tb, per_cpu(wd_timer_tb, cpu),
             tb_to_ns(tb - per_cpu(wd_timer_tb, cpu)) / 1000000);
        print_modules();
        print_irqtrace_events(current);
        show_regs(regs);

        xchg(&__wd_nmi_output, 1); // see wd_lockup_ipi

        if (sysctl_hardlockup_all_cpu_backtrace)
            trigger_allbutself_cpu_backtrace();

        if (hardlockup_panic)
            nmi_panic(regs, "Hard LOCKUP");

        wd_end_reporting();
    }
    /*
     * We are okay to change DEC in soft_nmi_interrupt because the masked
     * handler has marked a DEC as pending, so the timer interrupt will be
     * replayed as soon as local irqs are enabled again.
     */
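    /*
     * 0x7fffffff is the largest timeout a conventional signed 32-bit
     * decrementer can be programmed with (our reading of the guard), so
     * DEC is only rearmed with the full watchdog timeout when it fits.
     */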
    if (wd_panic_timeout_tb < 0x7fffffff)
        mtspr(SPRN_DEC, wd_panic_timeout_tb);

    return 0;
}

static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
    int cpu = smp_processor_id();

    if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
        return HRTIMER_NORESTART;

    if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
        return HRTIMER_NORESTART;

    watchdog_timer_interrupt(cpu);

    hrtimer_forward_now(hrtimer, ms_to_ktime(wd_timer_period_ms));

    return HRTIMER_RESTART;
}

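/*
 * Called (via touch_nmi_watchdog()) by code that legitimately runs for a
 * long time with interrupts disabled, to keep the watchdog quiet. The
 * heartbeat is refreshed at most once per timer period, so frequent
 * callers do not pay for wd_smp_clear_cpu_pending() on every touch.
 */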
void arch_touch_nmi_watchdog(void)
{
    unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
    int cpu = smp_processor_id();
    u64 tb;

    if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
        return;

    tb = get_tb();
    if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
        per_cpu(wd_timer_tb, cpu) = tb;
        wd_smp_clear_cpu_pending(cpu);
    }
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);

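/*
 * Runs on the CPU being brought into the watchdog (via
 * smp_call_function_single() below), since the heartbeat hrtimer is
 * per-cpu and pinned there. The first CPU to be enabled also primes the
 * global SMP heartbeat.
 */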
static void start_watchdog(void *arg)
{
    struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer);
    int cpu = smp_processor_id();
    unsigned long flags;

    if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) {
        WARN_ON(1);
        return;
    }

    if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
        return;

    if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
        return;

    wd_smp_lock(&flags);
    cpumask_set_cpu(cpu, &wd_cpus_enabled);
    if (cpumask_weight(&wd_cpus_enabled) == 1) {
        cpumask_set_cpu(cpu, &wd_smp_cpus_pending);
        wd_smp_last_reset_tb = get_tb();
    }
    wd_smp_unlock(&flags);

    *this_cpu_ptr(&wd_timer_tb) = get_tb();

    hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    hrtimer->function = watchdog_timer_fn;
    hrtimer_start(hrtimer, ms_to_ktime(wd_timer_period_ms),
              HRTIMER_MODE_REL_PINNED);
}

static int start_watchdog_on_cpu(unsigned int cpu)
{
    return smp_call_function_single(cpu, start_watchdog, NULL, true);
}

static void stop_watchdog(void *arg)
{
    struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer);
    int cpu = smp_processor_id();
    unsigned long flags;

    if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
        return; /* Can happen in CPU unplug case */

    hrtimer_cancel(hrtimer);

    wd_smp_lock(&flags);
    cpumask_clear_cpu(cpu, &wd_cpus_enabled);
    wd_smp_unlock(&flags);

    wd_smp_clear_cpu_pending(cpu);
}

static int stop_watchdog_on_cpu(unsigned int cpu)
{
    return smp_call_function_single(cpu, stop_watchdog, NULL, true);
}

static void watchdog_calc_timeouts(void)
{
    u64 threshold = watchdog_thresh;

#ifdef CONFIG_PPC_PSERIES
    threshold += (READ_ONCE(wd_timeout_pct) * threshold) / 100;
#endif

    wd_panic_timeout_tb = threshold * ppc_tb_freq;

    /* Have the SMP detector trigger a bit later */
    wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2;

    /* 2/5 is the factor that the perf based detector uses */
    wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5;
}

void watchdog_nmi_stop(void)
{
    int cpu;

    for_each_cpu(cpu, &wd_cpus_enabled)
        stop_watchdog_on_cpu(cpu);
}

void watchdog_nmi_start(void)
{
    int cpu;

    watchdog_calc_timeouts();
    for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
        start_watchdog_on_cpu(cpu);
}

/*
 * Invoked from core watchdog init.
 */
int __init watchdog_nmi_probe(void)
{
    int err;

    err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
                    "powerpc/watchdog:online",
                    start_watchdog_on_cpu,
                    stop_watchdog_on_cpu);
    if (err < 0) {
        pr_warn("could not be initialized\n");
        return err;
    }
    return 0;
}

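/*
 * Stretch the watchdog timeouts by pct percent. Presumably intended for
 * pseries paths such as live partition migration, where the partition can
 * be suspended long enough to trip the watchdog; the new threshold takes
 * effect when lockup_detector_reconfigure() re-runs
 * watchdog_calc_timeouts().
 */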
#ifdef CONFIG_PPC_PSERIES
void watchdog_nmi_set_timeout_pct(u64 pct)
{
    pr_info("Set the NMI watchdog timeout factor to %llu%%\n", pct);
    WRITE_ONCE(wd_timeout_pct, pct);
    lockup_detector_reconfigure();
}
#endif