// SPDX-License-Identifier: GPL-2.0
/* Pseudo NMI support on sparc64 systems.
 *
 * Copyright (C) 2009 David S. Miller <davem@davemloft.net>
 *
 * The NMI watchdog support and infrastructure is based almost
 * entirely upon the x86 NMI support code.
 */
#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/nmi.h>
#include <linux/export.h>
#include <linux/kprobes.h>
#include <linux/kernel_stat.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/delay.h>
#include <linux/smp.h>

#include <asm/perf_event.h>
#include <asm/ptrace.h>
#include <asm/pcr.h>

#include "kstack.h"

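/* Set via the "nmi_watchdog=panic" boot option (see setup_nmi_watchdog()
 * below): panic on a detected hard lockup instead of just warning.
 */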
static int panic_on_timeout;
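/* nmi_active:
 *  >0: the NMI watchdog is active, but can be disabled
 *  <0: the NMI watchdog has not been set up, and cannot be enabled
 *   0: the NMI watchdog is disabled, but can be enabled
 */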
atomic_t nmi_active = ATOMIC_INIT(0);
EXPORT_SYMBOL(nmi_active);
static int nmi_init_done;
static unsigned int nmi_hz = HZ;
static DEFINE_PER_CPU(short, wd_enabled);
static int endflag __initdata;

static DEFINE_PER_CPU(unsigned int, last_irq_sum);
static DEFINE_PER_CPU(long, alert_counter);
static DEFINE_PER_CPU(int, nmi_touch);

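/* Ask the watchdog on every CPU to skip its next lockup check.  Called
 * from code paths that legitimately keep a CPU busy with interrupts
 * disabled for a long time.
 */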
void arch_touch_nmi_watchdog(void)
{
	if (atomic_read(&nmi_active)) {
		int cpu;

		for_each_present_cpu(cpu) {
			if (per_cpu(nmi_touch, cpu) != 1)
				per_cpu(nmi_touch, cpu) = 1;
		}
	}
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);

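/* Report a hard lockup: give DIE_NMIWATCHDOG notifiers a chance to
 * handle it, then panic or WARN depending on the configuration.
 */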
static void die_nmi(const char *str, struct pt_regs *regs, int do_panic)
{
	int this_cpu = smp_processor_id();

	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0,
		       pt_regs_trap_type(regs), SIGINT) == NOTIFY_STOP)
		return;

	if (do_panic || panic_on_oops)
		panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
	else
		WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
}

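/* Per-CPU pseudo-NMI (performance counter overflow) handler.  A CPU is
 * considered locked up when its timer interrupt count (irq0_irqs) has
 * not advanced for 30 seconds worth of NMI ticks and nothing touched
 * the watchdog in the meantime.
 */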
notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs)
{
	unsigned int sum, touched = 0;
	void *orig_sp;

	clear_softint(1 << irq);

	local_cpu_data().__nmi_count++;

	nmi_enter();

	orig_sp = set_hardirq_stack();

	if (notify_die(DIE_NMI, "nmi", regs, 0,
		       pt_regs_trap_type(regs), SIGINT) == NOTIFY_STOP)
		touched = 1;
	else
		pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);

	sum = local_cpu_data().irq0_irqs;
	if (__this_cpu_read(nmi_touch)) {
		__this_cpu_write(nmi_touch, 0);
		touched = 1;
	}
	if (!touched && __this_cpu_read(last_irq_sum) == sum) {
		__this_cpu_inc(alert_counter);
		if (__this_cpu_read(alert_counter) == 30 * nmi_hz)
			die_nmi("BUG: NMI Watchdog detected LOCKUP",
				regs, panic_on_timeout);
	} else {
		__this_cpu_write(last_irq_sum, sum);
		__this_cpu_write(alert_counter, 0);
	}
	if (__this_cpu_read(wd_enabled)) {
		pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));
		pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
	}

	restore_hardirq_stack(orig_sp);

	nmi_exit();
}

static inline unsigned int get_nmi_count(int cpu)
{
	return cpu_data(cpu).__nmi_count;
}

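/* Keep a CPU spinning (simulating a lockup) until check_nmi_watchdog()
 * sets endflag, so the self-test can verify that NMIs still fire.
 */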
static __init void nmi_cpu_busy(void *data)
{
	while (endflag == 0)
		mb();
}

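/* Warn about a CPU whose NMI count did not advance during the
 * self-test, and disable the watchdog on it.
 */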
static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
{
	printk(KERN_CONT "\n");

	printk(KERN_WARNING
		"WARNING: CPU#%d: NMI appears to be stuck (%u->%u)!\n",
		cpu, prev_nmi_count[cpu], get_nmi_count(cpu));

	printk(KERN_WARNING
		"Please report this to bugzilla.kernel.org,\n");
	printk(KERN_WARNING
		"and attach the output of the 'dmesg' command.\n");

	per_cpu(wd_enabled, cpu) = 0;
	atomic_dec(&nmi_active);
}

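/* Disable the watchdog on the calling CPU; run on every CPU via
 * on_each_cpu() or on one CPU via smp_call_function_single().
 */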
void stop_nmi_watchdog(void *unused)
{
	if (!__this_cpu_read(wd_enabled))
		return;
	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
	__this_cpu_write(wd_enabled, 0);
	atomic_dec(&nmi_active);
}

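/* Boot-time self-test: keep the other CPUs spinning in nmi_cpu_busy()
 * for about 20 watchdog periods and check that every CPU's NMI count
 * advanced; watchdogs that made no progress are reported and disabled.
 */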
static int __init check_nmi_watchdog(void)
{
	unsigned int *prev_nmi_count;
	int cpu, err;

	if (!atomic_read(&nmi_active))
		return 0;

	prev_nmi_count = kmalloc_array(nr_cpu_ids, sizeof(unsigned int),
				       GFP_KERNEL);
	if (!prev_nmi_count) {
		err = -ENOMEM;
		goto error;
	}

	printk(KERN_INFO "Testing NMI watchdog ... ");

	smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);

	for_each_possible_cpu(cpu)
		prev_nmi_count[cpu] = get_nmi_count(cpu);
	local_irq_enable();
	mdelay((20 * 1000) / nmi_hz); /* wait 20 watchdog ticks */

	for_each_online_cpu(cpu) {
		if (!per_cpu(wd_enabled, cpu))
			continue;
		if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
			report_broken_nmi(cpu, prev_nmi_count);
	}
	endflag = 1;
	if (!atomic_read(&nmi_active)) {
		kfree(prev_nmi_count);
		atomic_set(&nmi_active, -1);
		err = -ENODEV;
		goto error;
	}
	printk(KERN_CONT "OK.\n");

	/* Now that we know it works, reduce the NMI frequency to
	 * something more reasonable.
	 */
	nmi_hz = 1;

	kfree(prev_nmi_count);
	return 0;
error:
	on_each_cpu(stop_nmi_watchdog, NULL, 1);
	return err;
}

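/* Enable the watchdog on the calling CPU: program the performance
 * counter to overflow nmi_hz times per second and enable the NMI.
 */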
void start_nmi_watchdog(void *unused)
{
	if (__this_cpu_read(wd_enabled))
		return;

	__this_cpu_write(wd_enabled, 1);
	atomic_inc(&nmi_active);

	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
	pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));

	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
}

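/* Reprogram the performance counter on the calling CPU for the current
 * nmi_hz; helper for nmi_adjust_hz() below.
 */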
static void nmi_adjust_hz_one(void *unused)
{
	if (!__this_cpu_read(wd_enabled))
		return;

	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
	pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));

	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
}

void nmi_adjust_hz(unsigned int new_hz)
{
	nmi_hz = new_hz;
	on_each_cpu(nmi_adjust_hz_one, NULL, 1);
}
EXPORT_SYMBOL_GPL(nmi_adjust_hz);

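/* Stop the watchdog on every CPU when the system reboots or halts. */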
static int nmi_shutdown(struct notifier_block *nb, unsigned long cmd, void *p)
{
	on_each_cpu(stop_nmi_watchdog, NULL, 1);
	return 0;
}

static struct notifier_block nmi_reboot_notifier = {
	.notifier_call = nmi_shutdown,
};

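/* Bring the watchdog up on all CPUs, run the self-test, and register
 * the reboot notifier; on failure the watchdog is torn down again.
 */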
int __init nmi_init(void)
{
	int err;

	on_each_cpu(start_nmi_watchdog, NULL, 1);

	err = check_nmi_watchdog();
	if (!err) {
		err = register_reboot_notifier(&nmi_reboot_notifier);
		if (err) {
			on_each_cpu(stop_nmi_watchdog, NULL, 1);
			atomic_set(&nmi_active, -1);
		}
	}

	nmi_init_done = 1;

	return err;
}

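/* Parse the "nmi_watchdog=" kernel parameter; only "panic" is
 * recognized.
 */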
static int __init setup_nmi_watchdog(char *str)
{
	if (!strncmp(str, "panic", 5))
		panic_on_timeout = 1;

	return 0;
}
__setup("nmi_watchdog=", setup_nmi_watchdog);

/*
 * sparc specific NMI watchdog enable function.
 * Enables watchdog if it is not enabled already.
 */
int watchdog_nmi_enable(unsigned int cpu)
{
	if (atomic_read(&nmi_active) == -1) {
		pr_warn("NMI watchdog cannot be enabled or disabled\n");
		return -1;
	}

	/*
	 * The watchdog thread can start up even before nmi_init() has
	 * run.  Just return in that case and let nmi_init() finish the
	 * setup first before enabling the NMI watchdog.
	 */
	if (!nmi_init_done)
		return 0;

	smp_call_function_single(cpu, start_nmi_watchdog, NULL, 1);

	return 0;
}

/*
 * sparc specific NMI watchdog disable function.
 * Disables watchdog if it is not disabled already.
 */
void watchdog_nmi_disable(unsigned int cpu)
{
	if (atomic_read(&nmi_active) == -1)
		pr_warn_once("NMI watchdog cannot be enabled or disabled\n");
	else
		smp_call_function_single(cpu, stop_nmi_watchdog, NULL, 1);
}