0001
0002
0003
0004
0005
0006
0007
0008
0009
0010 #define pr_fmt(fmt) "watchdog: " fmt
0011
0012 #include <linux/kernel.h>
0013 #include <linux/param.h>
0014 #include <linux/init.h>
0015 #include <linux/percpu.h>
0016 #include <linux/cpu.h>
0017 #include <linux/nmi.h>
0018 #include <linux/module.h>
0019 #include <linux/export.h>
0020 #include <linux/kprobes.h>
0021 #include <linux/hardirq.h>
0022 #include <linux/reboot.h>
0023 #include <linux/slab.h>
0024 #include <linux/kdebug.h>
0025 #include <linux/sched/debug.h>
0026 #include <linux/delay.h>
0027 #include <linux/processor.h>
0028 #include <linux/smp.h>
0029
0030 #include <asm/interrupt.h>
0031 #include <asm/paca.h>
0032 #include <asm/nmi.h>
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
/* CPUs that currently have the per-CPU watchdog heartbeat timer running */
static cpumask_t wd_cpus_enabled __read_mostly;

/* Timebase ticks of silence before a CPU self-reports a hard lockup */
static u64 wd_panic_timeout_tb __read_mostly;
/* Timebase ticks before other CPUs report a non-heartbeating CPU */
static u64 wd_smp_panic_timeout_tb __read_mostly;

/* Heartbeat timer period (derived from watchdog_thresh, see watchdog_calc_timeouts()) */
static u64 wd_timer_period_ms __read_mostly;

static DEFINE_PER_CPU(struct hrtimer, wd_hrtimer);
static DEFINE_PER_CPU(u64, wd_timer_tb);	/* last heartbeat, in timebase ticks */

/*
 * SMP checker state. Most updates happen under __wd_smp_lock, but some
 * paths are deliberately lockless (see wd_smp_clear_cpu_pending()).
 */
static unsigned long __wd_smp_lock;		/* bit 0 is the lock bit */
static unsigned long __wd_reporting;		/* exclusive report-in-progress flag */
static unsigned long __wd_nmi_output;		/* NMI output produced; printk flush wanted */
static cpumask_t wd_smp_cpus_pending;		/* CPUs yet to heartbeat this interval */
static cpumask_t wd_smp_cpus_stuck;		/* CPUs already reported as locked up */
static u64 wd_smp_last_reset_tb;		/* timebase at start of current interval */

#ifdef CONFIG_PPC_PSERIES
/* Extra timeout percentage, set via watchdog_nmi_set_timeout_pct() */
static u64 wd_timeout_pct;
#endif
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107 static bool wd_try_report(void)
0108 {
0109 if (__wd_reporting)
0110 return false;
0111 __wd_reporting = 1;
0112 return true;
0113 }
0114
0115
/*
 * Release the exclusive reporting flag taken by wd_try_report().
 * Callers release wd_smp_lock before calling this, hence the explicit
 * barrier and WRITE_ONCE rather than relying on lock ordering.
 */
static void wd_end_reporting(void)
{
	/* Make the report output visible before others may start a new report */
	smp_mb();
	WARN_ON_ONCE(__wd_reporting == 0);
	WRITE_ONCE(__wd_reporting, 0);
}
0122
/*
 * Take the global watchdog lock (bit 0 of __wd_smp_lock) with interrupts
 * saved and hard-disabled. A bespoke bitlock is used instead of the
 * generic locking layers because this is also taken from the soft-NMI
 * handler (soft_nmi_interrupt()).
 */
static inline void wd_smp_lock(unsigned long *flags)
{
	/*
	 * hard_irq_disable() in addition to raw_local_irq_save(): with
	 * only soft-disable, the decrementer soft-NMI could still fire
	 * and try to take this same lock, deadlocking.
	 */
	raw_local_irq_save(*flags);
	hard_irq_disable();
	while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) {
		/* Spin with interrupts re-enabled while the holder finishes */
		raw_local_irq_restore(*flags);
		spin_until_cond(!test_bit(0, &__wd_smp_lock));
		raw_local_irq_save(*flags);
		hard_irq_disable();
	}
}
0139
/* Drop __wd_smp_lock and restore the irq state saved by wd_smp_lock() */
static inline void wd_smp_unlock(unsigned long *flags)
{
	clear_bit_unlock(0, &__wd_smp_lock);
	raw_local_irq_restore(*flags);
}
0145
/*
 * NMI IPI handler run on a CPU that another CPU detected as hard locked
 * up (sent by watchdog_smp_panic()). Dumps this CPU's state to the
 * console. Any panic decision is left to the detecting CPU.
 */
static void wd_lockup_ipi(struct pt_regs *regs)
{
	int cpu = raw_smp_processor_id();
	u64 tb = get_tb();

	pr_emerg("CPU %d Hard LOCKUP\n", cpu);
	pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
		 cpu, tb, per_cpu(wd_timer_tb, cpu),
		 tb_to_ns(tb - per_cpu(wd_timer_tb, cpu)) / 1000000);
	print_modules();
	print_irqtrace_events(current);
	if (regs)
		show_regs(regs);
	else
		dump_stack();

	/*
	 * Flag that NMI-context output was produced so that
	 * watchdog_timer_interrupt() triggers a printk flush from a safe
	 * context. xchg (rather than a plain store) is used for its
	 * implied full barrier: the output above is ordered before the
	 * flag becomes visible.
	 */
	xchg(&__wd_nmi_output, 1);
}
0181
/*
 * Mark @cpu stuck: move it from the pending mask to the stuck mask so it
 * is no longer waited on for heartbeats. Both callers hold wd_smp_lock.
 *
 * Returns true if @cpu was the last pending CPU, in which case a fresh
 * heartbeat interval is started covering all enabled, non-stuck CPUs.
 */
static bool set_cpu_stuck(int cpu)
{
	cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);

	/*
	 * Order the mask updates above against the emptiness check below;
	 * other CPUs clear their own pending bits locklessly in
	 * wd_smp_clear_cpu_pending().
	 */
	smp_mb();
	if (cpumask_empty(&wd_smp_cpus_pending)) {
		wd_smp_last_reset_tb = get_tb();
		cpumask_andnot(&wd_smp_cpus_pending,
			       &wd_cpus_enabled,
			       &wd_smp_cpus_stuck);
		return true;
	}
	return false;
}
0199
/*
 * Called from a CPU's heartbeat when the SMP heartbeat interval has
 * expired, i.e. some other CPU(s) have not checked in within
 * wd_smp_panic_timeout_tb. Reports the stuck CPUs, collects their state
 * via NMI IPI (or an all-CPU backtrace), and optionally panics.
 */
static void watchdog_smp_panic(int cpu)
{
	/*
	 * static so a potentially large NR_CPUS mask is not on the stack;
	 * it is only written after wd_try_report() succeeds, which
	 * serializes reporters.
	 */
	static cpumask_t wd_smp_cpus_ipi;
	unsigned long flags;
	u64 tb, last_reset;
	int c;

	wd_smp_lock(&flags);

	/* Re-check under the lock; another CPU may have reset the interval */
	tb = get_tb();
	last_reset = wd_smp_last_reset_tb;
	if ((s64)(tb - last_reset) < (s64)wd_smp_panic_timeout_tb)
		goto out;
	/* If we ourselves haven't heartbeated, don't accuse the others */
	if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
		goto out;
	/* Only one CPU reports at a time */
	if (!wd_try_report())
		goto out;
	for_each_online_cpu(c) {
		if (!cpumask_test_cpu(c, &wd_smp_cpus_pending))
			continue;
		if (c == cpu)
			continue;
		/* Queue the CPU for an NMI IPI and take it out of pending */
		__cpumask_set_cpu(c, &wd_smp_cpus_ipi);
		if (set_cpu_stuck(c))
			/*
			 * Pending was emptied and refilled for a new
			 * interval; stop so freshly-pending CPUs are not
			 * marked stuck too.
			 */
			break;
	}
	if (cpumask_empty(&wd_smp_cpus_ipi)) {
		wd_end_reporting();
		goto out;
	}
	wd_smp_unlock(&flags);

	pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
		 cpu, cpumask_pr_args(&wd_smp_cpus_ipi));
	pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n",
		 cpu, tb, last_reset, tb_to_ns(tb - last_reset) / 1000000);

	if (!sysctl_hardlockup_all_cpu_backtrace) {
		/*
		 * Have each stuck CPU dump its own state via NMI IPI
		 * (wd_lockup_ipi), with a 1000000 delay argument each
		 * (presumably microseconds, i.e. 1s - per the
		 * smp_send_nmi_ipi() API).
		 */
		for_each_cpu(c, &wd_smp_cpus_ipi) {
			smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
			__cpumask_clear_cpu(c, &wd_smp_cpus_ipi);
		}
	} else {
		/* sysctl asks for backtraces from all CPUs, not just stuck ones */
		trigger_allbutself_cpu_backtrace();
		cpumask_clear(&wd_smp_cpus_ipi);
	}

	if (hardlockup_panic)
		nmi_panic(NULL, "Hard LOCKUP");

	wd_end_reporting();

	return;

out:
	wd_smp_unlock(&flags);
}
0262
/*
 * Record a heartbeat for @cpu: clear its bit in the pending mask and, if
 * it was the last pending CPU, start a new SMP heartbeat interval. Also
 * handles a previously-stuck CPU coming back to life.
 *
 * The common path is lockless; wd_smp_lock is only taken on the slow
 * paths (unstuck recovery, refilling the pending mask).
 */
static void wd_smp_clear_cpu_pending(int cpu)
{
	if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
		if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
			struct pt_regs *regs = get_irq_regs();
			unsigned long flags;

			/* A CPU reported as locked up is heartbeating again */
			pr_emerg("CPU %d became unstuck TB:%lld\n",
				 cpu, get_tb());
			print_irqtrace_events(current);
			if (regs)
				show_regs(regs);
			else
				dump_stack();

			wd_smp_lock(&flags);
			cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
			wd_smp_unlock(&flags);
		} else {
			/*
			 * Our bit is already clear and we are not stuck.
			 * If the whole mask is observed empty here, the CPU
			 * that cleared the last bit may not have managed to
			 * refill it for the next interval yet, so take the
			 * slow path and (re)start the interval under the
			 * lock. NOTE(review): subtle lockless recovery -
			 * relies on the locked re-check at none_pending.
			 */
			if (unlikely(cpumask_empty(&wd_smp_cpus_pending)))
				goto none_pending;
		}
		return;
	}

	/*
	 * Lockless fast path: clear only our own bit. Concurrent clears
	 * of different bits are safe; the emptiness check below decides
	 * who refills the mask.
	 */
	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);

	/*
	 * Order our clear against the full-mask read below (pairing with
	 * the barrier in set_cpu_stuck() and with other CPUs' clears) so
	 * that when the mask drains, at least one CPU observes it empty
	 * and performs the refill.
	 */
	smp_mb();
	if (cpumask_empty(&wd_smp_cpus_pending)) {
		unsigned long flags;

none_pending:
		/*
		 * Re-check under the lock: another CPU may have already
		 * refilled the mask between the lockless check above and
		 * acquiring the lock.
		 */
		wd_smp_lock(&flags);
		if (cpumask_empty(&wd_smp_cpus_pending)) {
			wd_smp_last_reset_tb = get_tb();
			cpumask_andnot(&wd_smp_cpus_pending,
				       &wd_cpus_enabled,
				       &wd_smp_cpus_stuck);
		}
		wd_smp_unlock(&flags);
	}
}
0345
/*
 * Per-CPU heartbeat: record the current timebase, clear this CPU from
 * the pending mask, and check whether the SMP heartbeat interval has
 * expired (i.e. some other CPU appears stuck).
 */
static void watchdog_timer_interrupt(int cpu)
{
	u64 tb = get_tb();

	per_cpu(wd_timer_tb, cpu) = tb;

	wd_smp_clear_cpu_pending(cpu);

	if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
		watchdog_smp_panic(cpu);

	if (__wd_nmi_output && xchg(&__wd_nmi_output, 0)) {
		/*
		 * An NMI path (wd_lockup_ipi() or soft_nmi_interrupt())
		 * produced console output; flush printk from this safe,
		 * non-NMI context. The plain read before the xchg keeps
		 * the atomic op off the common (flag clear) path.
		 */
		printk_trigger_flush();
	}
}
0368
/*
 * Soft-NMI handler, driven by the decrementer (see the SPRN_DEC re-arm
 * below): fires when a CPU has had interrupts disabled too long, and
 * self-reports this CPU's hard lockup.
 */
DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
{
	unsigned long flags;
	int cpu = raw_smp_processor_id();
	u64 tb;

	/* This NMI should only be taken with interrupts (soft-)disabled */
	WARN_ON_ONCE(!arch_irq_disabled_regs(regs));

	if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
		return 0;

	__this_cpu_inc(irq_stat.soft_nmi_irqs);

	tb = get_tb();
	if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) {
		/*
		 * Our heartbeat has not been updated for a full panic
		 * timeout: treat this as a self-detected hard lockup.
		 */
		wd_smp_lock(&flags);
		if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) {
			/* Already reported as stuck by another CPU */
			wd_smp_unlock(&flags);
			return 0;
		}
		if (!wd_try_report()) {
			wd_smp_unlock(&flags);
			/* Another CPU is reporting; re-check in ~100ms via DEC */
			mtspr(SPRN_DEC, 100 * tb_ticks_per_usec * 1000);
			return 0;
		}

		set_cpu_stuck(cpu);

		wd_smp_unlock(&flags);

		pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n",
			 cpu, (void *)regs->nip);
		pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
			 cpu, tb, per_cpu(wd_timer_tb, cpu),
			 tb_to_ns(tb - per_cpu(wd_timer_tb, cpu)) / 1000000);
		print_modules();
		print_irqtrace_events(current);
		show_regs(regs);

		/* Ask watchdog_timer_interrupt() to flush printk; xchg = full barrier */
		xchg(&__wd_nmi_output, 1);

		if (sysctl_hardlockup_all_cpu_backtrace)
			trigger_allbutself_cpu_backtrace();

		if (hardlockup_panic)
			nmi_panic(regs, "Hard LOCKUP");

		wd_end_reporting();
	}

	/*
	 * Re-arm the decrementer for the next check, but only if the
	 * timeout fits in DEC's 31 usable bits.
	 */
	if (wd_panic_timeout_tb < 0x7fffffff)
		mtspr(SPRN_DEC, wd_panic_timeout_tb);

	return 0;
}
0436
0437 static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
0438 {
0439 int cpu = smp_processor_id();
0440
0441 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
0442 return HRTIMER_NORESTART;
0443
0444 if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
0445 return HRTIMER_NORESTART;
0446
0447 watchdog_timer_interrupt(cpu);
0448
0449 hrtimer_forward_now(hrtimer, ms_to_ktime(wd_timer_period_ms));
0450
0451 return HRTIMER_RESTART;
0452 }
0453
0454 void arch_touch_nmi_watchdog(void)
0455 {
0456 unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
0457 int cpu = smp_processor_id();
0458 u64 tb;
0459
0460 if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
0461 return;
0462
0463 tb = get_tb();
0464 if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
0465 per_cpu(wd_timer_tb, cpu) = tb;
0466 wd_smp_clear_cpu_pending(cpu);
0467 }
0468 }
0469 EXPORT_SYMBOL(arch_touch_nmi_watchdog);
0470
/*
 * Enable the watchdog on the current CPU: add it to the enabled (and,
 * if first, pending) masks and start its heartbeat hrtimer. Runs on the
 * target CPU via smp_call_function_single() (see start_watchdog_on_cpu()).
 */
static void start_watchdog(void *arg)
{
	struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer);
	int cpu = smp_processor_id();
	unsigned long flags;

	if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) {
		/* Double-start would corrupt the mask bookkeeping */
		WARN_ON(1);
		return;
	}

	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
		return;

	if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
		return;

	wd_smp_lock(&flags);
	cpumask_set_cpu(cpu, &wd_cpus_enabled);
	if (cpumask_weight(&wd_cpus_enabled) == 1) {
		/* First CPU enabled: seed the SMP heartbeat interval */
		cpumask_set_cpu(cpu, &wd_smp_cpus_pending);
		wd_smp_last_reset_tb = get_tb();
	}
	wd_smp_unlock(&flags);

	/* Record a current heartbeat before the timer/soft-NMI can check it */
	*this_cpu_ptr(&wd_timer_tb) = get_tb();

	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer->function = watchdog_timer_fn;
	hrtimer_start(hrtimer, ms_to_ktime(wd_timer_period_ms),
		      HRTIMER_MODE_REL_PINNED);
}
0503
/* Hotplug/online helper: run start_watchdog() on @cpu and wait for it */
static int start_watchdog_on_cpu(unsigned int cpu)
{
	return smp_call_function_single(cpu, start_watchdog, NULL, true);
}
0508
/*
 * Disable the watchdog on the current CPU: cancel its heartbeat hrtimer
 * and drop it from the enabled mask. Runs on the target CPU via
 * smp_call_function_single() (see stop_watchdog_on_cpu()).
 */
static void stop_watchdog(void *arg)
{
	struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer);
	int cpu = smp_processor_id();
	unsigned long flags;

	if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
		return;	/* already stopped */

	hrtimer_cancel(hrtimer);

	wd_smp_lock(&flags);
	cpumask_clear_cpu(cpu, &wd_cpus_enabled);
	wd_smp_unlock(&flags);

	/* Don't leave our pending bit set - other CPUs would wait on it forever */
	wd_smp_clear_cpu_pending(cpu);
}
0526
/* Hotplug/offline helper: run stop_watchdog() on @cpu and wait for it */
static int stop_watchdog_on_cpu(unsigned int cpu)
{
	return smp_call_function_single(cpu, stop_watchdog, NULL, true);
}
0531
/*
 * Derive the watchdog timeouts from watchdog_thresh (in seconds, given
 * the multiplication by ppc_tb_freq below).
 */
static void watchdog_calc_timeouts(void)
{
	u64 threshold = watchdog_thresh;

#ifdef CONFIG_PPC_PSERIES
	/* Lengthen the timeout by wd_timeout_pct percent (see watchdog_nmi_set_timeout_pct()) */
	threshold += (READ_ONCE(wd_timeout_pct) * threshold) / 100;
#endif

	wd_panic_timeout_tb = threshold * ppc_tb_freq;

	/* SMP detection waits 1.5x the self-detection timeout */
	wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2;

	/* Heartbeat period is 2/5 of the threshold, converted to ms */
	wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5;
}
0548
/* Core lockup-detector hook: stop the watchdog on every enabled CPU */
void watchdog_nmi_stop(void)
{
	int cpu;

	for_each_cpu(cpu, &wd_cpus_enabled)
		stop_watchdog_on_cpu(cpu);
}
0556
/*
 * Core lockup-detector hook: (re)compute the timeouts, then start the
 * watchdog on every online CPU that is in the watchdog cpumask.
 */
void watchdog_nmi_start(void)
{
	int cpu;

	watchdog_calc_timeouts();
	for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
		start_watchdog_on_cpu(cpu);
}
0565
0566
0567
0568
0569 int __init watchdog_nmi_probe(void)
0570 {
0571 int err;
0572
0573 err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
0574 "powerpc/watchdog:online",
0575 start_watchdog_on_cpu,
0576 stop_watchdog_on_cpu);
0577 if (err < 0) {
0578 pr_warn("could not be initialized");
0579 return err;
0580 }
0581 return 0;
0582 }
0583
0584 #ifdef CONFIG_PPC_PSERIES
/*
 * Lengthen the NMI watchdog timeout by @pct percent. The new value is
 * read by watchdog_calc_timeouts(); lockup_detector_reconfigure()
 * presumably re-runs that path so the change takes effect.
 */
void watchdog_nmi_set_timeout_pct(u64 pct)
{
	pr_info("Set the NMI watchdog timeout factor to %llu%%\n", pct);
	WRITE_ONCE(wd_timeout_pct, pct);
	lockup_detector_reconfigure();
}
0591 #endif