// SPDX-License-Identifier: GPL-2.0-only
/*
 * Machine check handler.
 *
 * Core x86 MCE/MCA code: collects, grades, logs and reports machine check
 * events, and implements the broadcast #MC rendezvous protocol.
 */

0012 #include <linux/thread_info.h>
0013 #include <linux/capability.h>
0014 #include <linux/miscdevice.h>
0015 #include <linux/ratelimit.h>
0016 #include <linux/rcupdate.h>
0017 #include <linux/kobject.h>
0018 #include <linux/uaccess.h>
0019 #include <linux/kdebug.h>
0020 #include <linux/kernel.h>
0021 #include <linux/percpu.h>
0022 #include <linux/string.h>
0023 #include <linux/device.h>
0024 #include <linux/syscore_ops.h>
0025 #include <linux/delay.h>
0026 #include <linux/ctype.h>
0027 #include <linux/sched.h>
0028 #include <linux/sysfs.h>
0029 #include <linux/types.h>
0030 #include <linux/slab.h>
0031 #include <linux/init.h>
0032 #include <linux/kmod.h>
0033 #include <linux/poll.h>
0034 #include <linux/nmi.h>
0035 #include <linux/cpu.h>
0036 #include <linux/ras.h>
0037 #include <linux/smp.h>
0038 #include <linux/fs.h>
0039 #include <linux/mm.h>
0040 #include <linux/debugfs.h>
0041 #include <linux/irq_work.h>
0042 #include <linux/export.h>
0043 #include <linux/set_memory.h>
0044 #include <linux/sync_core.h>
0045 #include <linux/task_work.h>
0046 #include <linux/hardirq.h>
0047
0048 #include <asm/intel-family.h>
0049 #include <asm/processor.h>
0050 #include <asm/traps.h>
0051 #include <asm/tlbflush.h>
0052 #include <asm/mce.h>
0053 #include <asm/msr.h>
0054 #include <asm/reboot.h>
0055
0056 #include "internal.h"
0057
0058
0059 static DEFINE_MUTEX(mce_sysfs_mutex);
0060
0061 #define CREATE_TRACE_POINTS
0062 #include <trace/events/mce.h>
0063
0064 #define SPINUNIT 100
0065
0066 DEFINE_PER_CPU(unsigned, mce_exception_count);
0067
0068 DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
0069
0070 struct mce_bank {
0071 u64 ctl;
0072
0073 __u64 init : 1,
0074 __reserved_1 : 63;
0075 };
0076 static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
0077
0078 #define ATTR_LEN 16
0079
0080 struct mce_bank_dev {
0081 struct device_attribute attr;
0082 char attrname[ATTR_LEN];
0083 u8 bank;
0084 };
0085 static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];
0086
0087 struct mce_vendor_flags mce_flags __read_mostly;
0088
0089 struct mca_config mca_cfg __read_mostly = {
0090 .bootlog = -1,
0091 .monarch_timeout = -1
0092 };
0093
0094 static DEFINE_PER_CPU(struct mce, mces_seen);
0095 static unsigned long mce_need_notify;
0096
0097
0098
0099
0100
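/*
 * MCA banks polled by the period polling timer for corrected events.
 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 */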
0101 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
0102 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
0103 };
0104
/*
 * Banks for which corrected error (CE) reporting has been disabled via
 * mce_disable_bank(), i.e. excluded from polling and CMCI.
 */
0112 mce_banks_t mce_banks_ce_disabled;
0113
0114 static struct work_struct mce_work;
0115 static struct irq_work mce_irq_work;
0116
/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
0121 BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
0122
0123
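/* Do initial initialization of a struct mce */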
0124 void mce_setup(struct mce *m)
0125 {
0126 memset(m, 0, sizeof(struct mce));
0127 m->cpu = m->extcpu = smp_processor_id();
0128
0129 m->time = __ktime_get_real_seconds();
0130 m->cpuvendor = boot_cpu_data.x86_vendor;
0131 m->cpuid = cpuid_eax(1);
0132 m->socketid = cpu_data(m->extcpu).phys_proc_id;
0133 m->apicid = cpu_data(m->extcpu).initial_apicid;
0134 m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
0135 m->ppin = cpu_data(m->extcpu).ppin;
0136 m->microcode = boot_cpu_data.microcode;
0137 }
0138
0139 DEFINE_PER_CPU(struct mce, injectm);
0140 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
0141
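/* Add an MCE record to the event genpool and kick irq_work to process it. */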
0142 void mce_log(struct mce *m)
0143 {
0144 if (!mce_gen_pool_add(m))
0145 irq_work_queue(&mce_irq_work);
0146 }
0147 EXPORT_SYMBOL_GPL(mce_log);
0148
0149 void mce_register_decode_chain(struct notifier_block *nb)
0150 {
0151 if (WARN_ON(nb->priority < MCE_PRIO_LOWEST ||
0152 nb->priority > MCE_PRIO_HIGHEST))
0153 return;
0154
0155 blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
0156 }
0157 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
0158
0159 void mce_unregister_decode_chain(struct notifier_block *nb)
0160 {
0161 blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
0162 }
0163 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
0164
0165 static void __print_mce(struct mce *m)
0166 {
0167 pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
0168 m->extcpu,
0169 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
0170 m->mcgstatus, m->bank, m->status);
0171
0172 if (m->ip) {
0173 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
0174 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
0175 m->cs, m->ip);
0176
0177 if (m->cs == __KERNEL_CS)
0178 pr_cont("{%pS}", (void *)(unsigned long)m->ip);
0179 pr_cont("\n");
0180 }
0181
0182 pr_emerg(HW_ERR "TSC %llx ", m->tsc);
0183 if (m->addr)
0184 pr_cont("ADDR %llx ", m->addr);
0185 if (m->misc)
0186 pr_cont("MISC %llx ", m->misc);
0187 if (m->ppin)
0188 pr_cont("PPIN %llx ", m->ppin);
0189
0190 if (mce_flags.smca) {
0191 if (m->synd)
0192 pr_cont("SYND %llx ", m->synd);
0193 if (m->ipid)
0194 pr_cont("IPID %llx ", m->ipid);
0195 }
0196
0197 pr_cont("\n");
0198
0199
0200
0201
0202
0203 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
0204 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
0205 m->microcode);
0206 }
0207
0208 static void print_mce(struct mce *m)
0209 {
0210 __print_mce(m);
0211
0212 if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
0213 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
0214 }
0215
0216 #define PANIC_TIMEOUT 5
0217
0218 static atomic_t mce_panicked;
0219
0220 static int fake_panic;
0221 static atomic_t mce_fake_panicked;
0222
0223
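/* Wait for the CPU that is actually panicking to take this CPU down too. */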
0224 static void wait_for_panic(void)
0225 {
0226 long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
0227
0228 preempt_disable();
0229 local_irq_enable();
0230 while (timeout-- > 0)
0231 udelay(1);
0232 if (panic_timeout == 0)
0233 panic_timeout = mca_cfg.panic_timeout;
	panic("Panicking machine check CPU died");
0235 }
0236
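/*
 * Print all queued MCE records, write them to the APEI persistent log and
 * panic. With fake_panic (set via debugfs) only the printing is done.
 */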
0237 static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
0238 {
0239 struct llist_node *pending;
0240 struct mce_evt_llist *l;
0241 int apei_err = 0;
0242
0243
0244
0245
0246
0247 instrumentation_begin();
0248
0249 if (!fake_panic) {
0250
0251
0252
0253 if (atomic_inc_return(&mce_panicked) > 1)
0254 wait_for_panic();
0255 barrier();
0256
0257 bust_spinlocks(1);
0258 console_verbose();
0259 } else {
0260
0261 if (atomic_inc_return(&mce_fake_panicked) > 1)
0262 goto out;
0263 }
0264 pending = mce_gen_pool_prepare_records();
0265
0266 llist_for_each_entry(l, pending, llnode) {
0267 struct mce *m = &l->mce;
0268 if (!(m->status & MCI_STATUS_UC)) {
0269 print_mce(m);
0270 if (!apei_err)
0271 apei_err = apei_write_mce(m);
0272 }
0273 }
0274
0275 llist_for_each_entry(l, pending, llnode) {
0276 struct mce *m = &l->mce;
0277 if (!(m->status & MCI_STATUS_UC))
0278 continue;
0279 if (!final || mce_cmp(m, final)) {
0280 print_mce(m);
0281 if (!apei_err)
0282 apei_err = apei_write_mce(m);
0283 }
0284 }
0285 if (final) {
0286 print_mce(final);
0287 if (!apei_err)
0288 apei_err = apei_write_mce(final);
0289 }
0290 if (exp)
0291 pr_emerg(HW_ERR "Machine check: %s\n", exp);
0292 if (!fake_panic) {
0293 if (panic_timeout == 0)
0294 panic_timeout = mca_cfg.panic_timeout;
0295 panic(msg);
0296 } else
0297 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
0298
0299 out:
0300 instrumentation_end();
0301 }
0302
0303
0304
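/* Support code for software error injection via the per-CPU injectm record. */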
0305 static int msr_to_offset(u32 msr)
0306 {
0307 unsigned bank = __this_cpu_read(injectm.bank);
0308
0309 if (msr == mca_cfg.rip_msr)
0310 return offsetof(struct mce, ip);
0311 if (msr == mca_msr_reg(bank, MCA_STATUS))
0312 return offsetof(struct mce, status);
0313 if (msr == mca_msr_reg(bank, MCA_ADDR))
0314 return offsetof(struct mce, addr);
0315 if (msr == mca_msr_reg(bank, MCA_MISC))
0316 return offsetof(struct mce, misc);
0317 if (msr == MSR_IA32_MCG_STATUS)
0318 return offsetof(struct mce, mcgstatus);
0319 return -1;
0320 }
0321
0322 void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
0323 {
0324 if (wrmsr) {
0325 pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
0326 (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
0327 regs->ip, (void *)regs->ip);
0328 } else {
0329 pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
0330 (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
0331 }
0332
0333 show_stack_regs(regs);
0334
0335 panic("MCA architectural violation!\n");
0336
0337 while (true)
0338 cpu_relax();
0339 }
0340
0341
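/* MSR access wrappers used for error injection */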
0342 noinstr u64 mce_rdmsrl(u32 msr)
0343 {
0344 DECLARE_ARGS(val, low, high);
0345
0346 if (__this_cpu_read(injectm.finished)) {
0347 int offset;
0348 u64 ret;
0349
0350 instrumentation_begin();
0351
0352 offset = msr_to_offset(msr);
0353 if (offset < 0)
0354 ret = 0;
0355 else
0356 ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
0357
0358 instrumentation_end();
0359
0360 return ret;
0361 }
0362
0363
0364
0365
0366
0367
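	/*
	 * RDMSR on MCA MSRs should not fault. If it does anyway, the exception
	 * table entry below routes the fault to ex_handler_msr_mce(), which
	 * reports the access and panics.
	 */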
0368 asm volatile("1: rdmsr\n"
0369 "2:\n"
0370 _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE)
0371 : EAX_EDX_RET(val, low, high) : "c" (msr));
0372
0373
0374 return EAX_EDX_VAL(val, low, high);
0375 }
0376
0377 static noinstr void mce_wrmsrl(u32 msr, u64 v)
0378 {
0379 u32 low, high;
0380
0381 if (__this_cpu_read(injectm.finished)) {
0382 int offset;
0383
0384 instrumentation_begin();
0385
0386 offset = msr_to_offset(msr);
0387 if (offset >= 0)
0388 *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
0389
0390 instrumentation_end();
0391
0392 return;
0393 }
0394
0395 low = (u32)v;
0396 high = (u32)(v >> 32);
0397
0398
0399 asm volatile("1: wrmsr\n"
0400 "2:\n"
0401 _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE)
0402 : : "c" (msr), "a"(low), "d" (high) : "memory");
0403 }
0404
0405
0406
0407
0408
0409
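/*
 * Collect all global (w.r.t. this processor) status about this machine
 * check into one place.
 */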
0410 static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
0411 {
0412
0413
0414
0415
0416 instrumentation_begin();
0417 mce_setup(m);
0418 instrumentation_end();
0419
0420 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
0421 if (regs) {
0422
0423
0424
0425
0426 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
0427 m->ip = regs->ip;
0428 m->cs = regs->cs;
0429
0430
0431
0432
0433
0434
0435 if (v8086_mode(regs))
0436 m->cs |= 3;
0437 }
0438
0439 if (mca_cfg.rip_msr)
0440 m->ip = mce_rdmsrl(mca_cfg.rip_msr);
0441 }
0442 }
0443
0444 int mce_available(struct cpuinfo_x86 *c)
0445 {
0446 if (mca_cfg.disabled)
0447 return 0;
0448 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
0449 }
0450
0451 static void mce_schedule_work(void)
0452 {
0453 if (!mce_gen_pool_empty())
0454 schedule_work(&mce_work);
0455 }
0456
0457 static void mce_irq_work_cb(struct irq_work *entry)
0458 {
0459 mce_schedule_work();
0460 }
0461
0462
0463
0464
0465
0466
0467
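/*
 * Check if the address reported by the CPU is in a format we can parse.
 * Only physical addresses with at least page granularity are considered
 * usable for memory error handling.
 */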
0468 int mce_usable_address(struct mce *m)
0469 {
0470 if (!(m->status & MCI_STATUS_ADDRV))
0471 return 0;
0472
0473
0474 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
0475 boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
0476 return 1;
0477
0478 if (!(m->status & MCI_STATUS_MISCV))
0479 return 0;
0480
0481 if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
0482 return 0;
0483
0484 if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
0485 return 0;
0486
0487 return 1;
0488 }
0489 EXPORT_SYMBOL_GPL(mce_usable_address);
0490
0491 bool mce_is_memory_error(struct mce *m)
0492 {
0493 switch (m->cpuvendor) {
0494 case X86_VENDOR_AMD:
0495 case X86_VENDOR_HYGON:
0496 return amd_mce_is_memory_error(m);
0497
0498 case X86_VENDOR_INTEL:
0499 case X86_VENDOR_ZHAOXIN:
		/*
		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
		 *
		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
		 * indicating a memory error. Bit 8 is used for indicating a
		 * cache hierarchy error. The combination of bit 2 and bit 3
		 * is used for indicating a `generic' cache hierarchy error.
		 * But we can't just blindly check the above bits, because if
		 * bit 11 is set, then it is a bus/interconnect error - and
		 * either way the above bits just give more detail on what
		 * bus/interconnect error happened. Note that bit 12 can be
		 * ignored, as it's the "filter" bit.
		 */
0513 return (m->status & 0xef80) == BIT(7) ||
0514 (m->status & 0xef00) == BIT(8) ||
0515 (m->status & 0xeffc) == 0xc;
0516
0517 default:
0518 return false;
0519 }
0520 }
0521 EXPORT_SYMBOL_GPL(mce_is_memory_error);
0522
0523 static bool whole_page(struct mce *m)
0524 {
0525 if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
0526 return true;
0527
0528 return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
0529 }
0530
0531 bool mce_is_correctable(struct mce *m)
0532 {
0533 if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
0534 return false;
0535
0536 if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
0537 return false;
0538
0539 if (m->status & MCI_STATUS_UC)
0540 return false;
0541
0542 return true;
0543 }
0544 EXPORT_SYMBOL_GPL(mce_is_correctable);
0545
0546 static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
0547 void *data)
0548 {
0549 struct mce *m = (struct mce *)data;
0550
0551 if (!m)
0552 return NOTIFY_DONE;
0553
0554
0555 trace_mce_record(m);
0556
0557 set_bit(0, &mce_need_notify);
0558
0559 mce_notify_irq();
0560
0561 return NOTIFY_DONE;
0562 }
0563
0564 static struct notifier_block early_nb = {
0565 .notifier_call = mce_early_notifier,
0566 .priority = MCE_PRIO_EARLY,
0567 };
0568
0569 static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
0570 void *data)
0571 {
0572 struct mce *mce = (struct mce *)data;
0573 unsigned long pfn;
0574
0575 if (!mce || !mce_usable_address(mce))
0576 return NOTIFY_DONE;
0577
0578 if (mce->severity != MCE_AO_SEVERITY &&
0579 mce->severity != MCE_DEFERRED_SEVERITY)
0580 return NOTIFY_DONE;
0581
0582 pfn = mce->addr >> PAGE_SHIFT;
0583 if (!memory_failure(pfn, 0)) {
0584 set_mce_nospec(pfn);
0585 mce->kflags |= MCE_HANDLED_UC;
0586 }
0587
0588 return NOTIFY_OK;
0589 }
0590
0591 static struct notifier_block mce_uc_nb = {
0592 .notifier_call = uc_decode_notifier,
0593 .priority = MCE_PRIO_UC,
0594 };
0595
0596 static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
0597 void *data)
0598 {
0599 struct mce *m = (struct mce *)data;
0600
0601 if (!m)
0602 return NOTIFY_DONE;
0603
0604 if (mca_cfg.print_all || !m->kflags)
0605 __print_mce(m);
0606
0607 return NOTIFY_DONE;
0608 }
0609
0610 static struct notifier_block mce_default_nb = {
0611 .notifier_call = mce_default_notifier,
0612
0613 .priority = MCE_PRIO_LOWEST,
0614 };
0615
0616
0617
0618
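/* Read MCi_MISC, MCi_ADDR and, on SMCA systems, the IPID/SYND registers. */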
0619 static noinstr void mce_read_aux(struct mce *m, int i)
0620 {
0621 if (m->status & MCI_STATUS_MISCV)
0622 m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));
0623
0624 if (m->status & MCI_STATUS_ADDRV) {
0625 m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR));
0626
0627
0628
0629
0630 if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
0631 u8 shift = MCI_MISC_ADDR_LSB(m->misc);
0632 m->addr >>= shift;
0633 m->addr <<= shift;
0634 }
0635
0636
0637
0638
0639
0640 if (mce_flags.smca) {
0641 u8 lsb = (m->addr >> 56) & 0x3f;
0642
0643 m->addr &= GENMASK_ULL(55, lsb);
0644 }
0645 }
0646
0647 if (mce_flags.smca) {
0648 m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
0649
0650 if (m->status & MCI_STATUS_SYNDV)
0651 m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
0652 }
0653 }
0654
0655 DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through the normal MCE logging/decode chain.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends panicking for fatal unsignalled errors here.
 * However, that would require reimplementing the Monarch handling and would
 * break the exclusion between the exception handler and the poll handler,
 * so it is skipped for now. Such cases should have RIPV cleared, so the CPU
 * keeps re-entering the handler until the error is fully signalled.
 */
0672 bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
0673 {
0674 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
0675 bool error_seen = false;
0676 struct mce m;
0677 int i;
0678
0679 this_cpu_inc(mce_poll_count);
0680
0681 mce_gather_info(&m, NULL);
0682
0683 if (flags & MCP_TIMESTAMP)
0684 m.tsc = rdtsc();
0685
0686 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
0687 if (!mce_banks[i].ctl || !test_bit(i, *b))
0688 continue;
0689
0690 m.misc = 0;
0691 m.addr = 0;
0692 m.bank = i;
0693
0694 barrier();
0695 m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
0696
0697
0698 if (!(m.status & MCI_STATUS_VAL))
0699 continue;
0700
0701
0702
0703
0704
0705 if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
0706 goto log_it;
0707
0708
0709
0710
0711
0712
0713
0714 if (!mca_cfg.ser) {
0715 if (m.status & MCI_STATUS_UC)
0716 continue;
0717 goto log_it;
0718 }
0719
0720
0721 if (!(m.status & MCI_STATUS_EN))
0722 goto log_it;
0723
0724
0725
0726
0727
0728 if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
0729 goto log_it;
0730
0731
0732
0733
0734
0735
0736 continue;
0737
0738 log_it:
0739 error_seen = true;
0740
0741 if (flags & MCP_DONTLOG)
0742 goto clear_it;
0743
0744 mce_read_aux(&m, i);
0745 m.severity = mce_severity(&m, NULL, NULL, false);
0746
0747
0748
0749
0750
0751 if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
0752 goto clear_it;
0753
0754 if (flags & MCP_QUEUE_LOG)
0755 mce_gen_pool_add(&m);
0756 else
0757 mce_log(&m);
0758
0759 clear_it:
0760
0761
0762
0763 mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
0764 }
0765
0766
0767
0768
0769
0770
0771 sync_core();
0772
0773 return error_seen;
0774 }
0775 EXPORT_SYMBOL_GPL(machine_check_poll);
0776
/*
 * During IFU recovery, Sandy Bridge -EP4S processors set the RIPV and
 * EIPV bits in MCG_STATUS to zero on the affected logical processor
 * (SDM Vol 3B Table 15-20). That confuses both the kernel/user mode
 * determination and the severity grading. Pretend that EIPV was set and
 * take the ip/cs values from the pt_regs that mce_gather_info() saved
 * earlier.
 */
0785 static __always_inline void
0786 quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
0787 {
0788 if (bank != 0)
0789 return;
0790 if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
0791 return;
0792 if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
0793 MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
0794 MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
0795 MCACOD)) !=
0796 (MCI_STATUS_UC|MCI_STATUS_EN|
0797 MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
0798 MCI_STATUS_AR|MCACOD_INSTR))
0799 return;
0800
0801 m->mcgstatus |= MCG_STATUS_EIPV;
0802 m->ip = regs->ip;
0803 m->cs = regs->cs;
0804 }
0805
/*
 * Quirk for Intel Skylake-based CPUs: an erratum in the fast string copy
 * instructions ("REP; MOVS*") can raise a spurious, seemingly unrecoverable
 * machine check in bank 1 even though the data was copied correctly.
 * Work around it by disabling fast string copy in IA32_MISC_ENABLE, clearing
 * the bank 1 status and returning from the machine check handler.
 */
0821 static noinstr bool quirk_skylake_repmov(void)
0822 {
0823 u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
0824 u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE);
0825 u64 mc1_status;
0826
0827
0828
0829
0830
0831 if (!(mcgstatus & MCG_STATUS_LMCES) ||
0832 !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
0833 return false;
0834
0835 mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
0836
0837
0838 if ((mc1_status &
0839 (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
0840 MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
0841 MCI_STATUS_AR | MCI_STATUS_S)) ==
0842 (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
0843 MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
0844 MCI_STATUS_AR | MCI_STATUS_S)) {
0845 misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
0846 mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
0847 mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
0848
0849 instrumentation_begin();
0850 pr_err_once("Erratum detected, disable fast string copy instructions.\n");
0851 instrumentation_end();
0852
0853 return true;
0854 }
0855
0856 return false;
0857 }
0858
/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
0863 static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
0864 struct pt_regs *regs)
0865 {
0866 char *tmp = *msg;
0867 int i;
0868
0869 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
0870 m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
0871 if (!(m->status & MCI_STATUS_VAL))
0872 continue;
0873
0874 arch___set_bit(i, validp);
0875 if (mce_flags.snb_ifu_quirk)
0876 quirk_sandybridge_ifu(i, m, regs);
0877
0878 m->bank = i;
0879 if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
0880 mce_read_aux(m, i);
0881 *msg = tmp;
0882 return 1;
0883 }
0884 }
0885 return 0;
0886 }
0887
0888
0889
0890
0891
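/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal its number.
 */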
0892 static atomic_t mce_executing;
0893
0894
0895
0896
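/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */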
0897 static atomic_t mce_callin;
0898
0899
0900
0901
0902
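/*
 * Track which CPUs entered the MCA broadcast synchronization and which not in
 * order to print holdouts.
 */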
0903 static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
0904
0905
0906
0907
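/*
 * Check if a timeout waiting for other CPUs happened.
 */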
0908 static noinstr int mce_timed_out(u64 *t, const char *msg)
0909 {
0910 int ret = 0;
0911
0912
0913 instrumentation_begin();
0914
0915
0916
0917
0918
0919
0920
0921 rmb();
0922 if (atomic_read(&mce_panicked))
0923 wait_for_panic();
0924 if (!mca_cfg.monarch_timeout)
0925 goto out;
0926 if ((s64)*t < SPINUNIT) {
0927 if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
0928 pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
0929 cpumask_pr_args(&mce_missing_cpus));
0930 mce_panic(msg, NULL, NULL);
0931
0932 ret = 1;
0933 goto out;
0934 }
0935 *t -= SPINUNIT;
0936
0937 out:
0938 touch_nmi_watchdog();
0939
0940 instrumentation_end();
0941
0942 return ret;
0943 }
0944
/*
 * The Monarch's reign
 *
 * The Monarch is the CPU that entered the machine check handler first.
 * It waits for the other CPUs to raise the exception too and then grades
 * their results. If any event has a high enough severity, panic the system.
 *
 * The other CPUs entering the handler are controlled by the Monarch and
 * are called Subjects.
 *
 * This prevents data corruption in the unrecoverable case and makes sure
 * the errors seen on all CPUs are examined.
 *
 * It also detects a machine check event not seen by any CPU (e.g. an
 * external agent asserted the #MC pin); in that case panic too.
 */
0969 static void mce_reign(void)
0970 {
0971 int cpu;
0972 struct mce *m = NULL;
0973 int global_worst = 0;
0974 char *msg = NULL;
0975
0976
0977
0978
0979
0980
0981 for_each_possible_cpu(cpu) {
0982 struct mce *mtmp = &per_cpu(mces_seen, cpu);
0983
0984 if (mtmp->severity > global_worst) {
0985 global_worst = mtmp->severity;
0986 m = &per_cpu(mces_seen, cpu);
0987 }
0988 }
0989
0990
0991
0992
0993
0994
0995 if (m && global_worst >= MCE_PANIC_SEVERITY) {
0996
0997 mce_severity(m, NULL, &msg, true);
0998 mce_panic("Fatal machine check", m, msg);
0999 }
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011 if (global_worst <= MCE_KEEP_SEVERITY)
1012 mce_panic("Fatal machine check from unknown source", NULL, NULL);
1013
1014
1015
1016
1017
1018 for_each_possible_cpu(cpu)
1019 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
1020 }
1021
1022 static atomic_t global_nwo;
1023
1024
1025
1026
1027
1028
1029
1030
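/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 */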
1031 static noinstr int mce_start(int *no_way_out)
1032 {
1033 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
1034 int order, ret = -1;
1035
1036 if (!timeout)
1037 return ret;
1038
1039 arch_atomic_add(*no_way_out, &global_nwo);
1040
1041
1042
1043
1044 order = arch_atomic_inc_return(&mce_callin);
1045 arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
1046
1047
1048 instrumentation_begin();
1049
1050
1051
1052
1053 while (arch_atomic_read(&mce_callin) != num_online_cpus()) {
1054 if (mce_timed_out(&timeout,
1055 "Timeout: Not all CPUs entered broadcast exception handler")) {
1056 arch_atomic_set(&global_nwo, 0);
1057 goto out;
1058 }
1059 ndelay(SPINUNIT);
1060 }
1061
1062
1063
1064
1065 smp_rmb();
1066
1067 if (order == 1) {
1068
1069
1070
1071 arch_atomic_set(&mce_executing, 1);
1072 } else {
1073
1074
1075
1076
1077
1078
1079 while (arch_atomic_read(&mce_executing) < order) {
1080 if (mce_timed_out(&timeout,
1081 "Timeout: Subject CPUs unable to finish machine check processing")) {
1082 arch_atomic_set(&global_nwo, 0);
1083 goto out;
1084 }
1085 ndelay(SPINUNIT);
1086 }
1087 }
1088
1089
1090
1091
1092 *no_way_out = arch_atomic_read(&global_nwo);
1093
1094 ret = order;
1095
1096 out:
1097 instrumentation_end();
1098
1099 return ret;
1100 }
1101
1102
1103
1104
1105
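/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */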
1106 static noinstr int mce_end(int order)
1107 {
1108 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
1109 int ret = -1;
1110
1111
1112 instrumentation_begin();
1113
1114 if (!timeout)
1115 goto reset;
1116 if (order < 0)
1117 goto reset;
1118
1119
1120
1121
1122 atomic_inc(&mce_executing);
1123
1124 if (order == 1) {
1125
1126
1127
1128
1129 while (atomic_read(&mce_executing) <= num_online_cpus()) {
1130 if (mce_timed_out(&timeout,
1131 "Timeout: Monarch CPU unable to finish machine check processing"))
1132 goto reset;
1133 ndelay(SPINUNIT);
1134 }
1135
1136 mce_reign();
1137 barrier();
1138 ret = 0;
1139 } else {
1140
1141
1142
1143 while (atomic_read(&mce_executing) != 0) {
1144 if (mce_timed_out(&timeout,
1145 "Timeout: Monarch CPU did not finish machine check processing"))
1146 goto reset;
1147 ndelay(SPINUNIT);
1148 }
1149
1150
1151
1152
1153 ret = 0;
1154 goto out;
1155 }
1156
1157
1158
1159
1160 reset:
1161 atomic_set(&global_nwo, 0);
1162 atomic_set(&mce_callin, 0);
1163 cpumask_setall(&mce_missing_cpus);
1164 barrier();
1165
1166
1167
1168
1169 atomic_set(&mce_executing, 0);
1170
1171 out:
1172 instrumentation_end();
1173
1174 return ret;
1175 }
1176
1177 static __always_inline void mce_clear_state(unsigned long *toclear)
1178 {
1179 int i;
1180
1181 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1182 if (arch_test_bit(i, toclear))
1183 mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
1184 }
1185 }
1186
/*
 * Cases where we avoid the rendezvous handler timeout:
 * 1) If this CPU is offline.
 *
 * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
 *    skip those CPUs which remain looping in the first kernel - see
 *    crash_nmi_callback().
 *
 * Note: there still is a small window between kexec-ing and the new,
 * kdump kernel establishing a new #MC handler where a broadcasted MCE
 * might not get handled properly.
 */
1199 static noinstr bool mce_check_crashing_cpu(void)
1200 {
1201 unsigned int cpu = smp_processor_id();
1202
1203 if (arch_cpu_is_offline(cpu) ||
1204 (crashing_cpu != -1 && crashing_cpu != cpu)) {
1205 u64 mcgstatus;
1206
1207 mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS);
1208
1209 if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
1210 if (mcgstatus & MCG_STATUS_LMCES)
1211 return false;
1212 }
1213
1214 if (mcgstatus & MCG_STATUS_RIPV) {
1215 __wrmsr(MSR_IA32_MCG_STATUS, 0, 0);
1216 return true;
1217 }
1218 }
1219 return false;
1220 }
1221
1222 static __always_inline int
1223 __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
1224 unsigned long *toclear, unsigned long *valid_banks, int no_way_out,
1225 int *worst)
1226 {
1227 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1228 struct mca_config *cfg = &mca_cfg;
1229 int severity, i, taint = 0;
1230
1231 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1232 arch___clear_bit(i, toclear);
1233 if (!arch_test_bit(i, valid_banks))
1234 continue;
1235
1236 if (!mce_banks[i].ctl)
1237 continue;
1238
1239 m->misc = 0;
1240 m->addr = 0;
1241 m->bank = i;
1242
1243 m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
1244 if (!(m->status & MCI_STATUS_VAL))
1245 continue;
1246
1247
1248
1249
1250
1251 if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1252 !no_way_out)
1253 continue;
1254
1255
1256 taint++;
1257
1258 severity = mce_severity(m, regs, NULL, true);
1259
1260
1261
1262
1263
1264 if ((severity == MCE_KEEP_SEVERITY ||
1265 severity == MCE_UCNA_SEVERITY) && !no_way_out)
1266 continue;
1267
1268 arch___set_bit(i, toclear);
1269
1270
1271 if (severity == MCE_NO_SEVERITY)
1272 continue;
1273
1274 mce_read_aux(m, i);
1275
1276
1277 m->severity = severity;
1278
1279
1280
1281
1282
1283 instrumentation_begin();
1284 mce_log(m);
1285 instrumentation_end();
1286
1287 if (severity > *worst) {
1288 *final = *m;
1289 *worst = severity;
1290 }
1291 }
1292
1293
1294 *m = *final;
1295
1296 return taint;
1297 }
1298
1299 static void kill_me_now(struct callback_head *ch)
1300 {
1301 struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me);
1302
1303 p->mce_count = 0;
1304 force_sig(SIGBUS);
1305 }
1306
1307 static void kill_me_maybe(struct callback_head *cb)
1308 {
1309 struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
1310 int flags = MF_ACTION_REQUIRED;
1311 int ret;
1312
1313 p->mce_count = 0;
1314 pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
1315
1316 if (!p->mce_ripv)
1317 flags |= MF_MUST_KILL;
1318
1319 ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
1320 if (!ret) {
1321 set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
1322 sync_core();
1323 return;
1324 }
1325
1326
1327
1328
1329
1330
1331
1332
1333 if (ret == -EHWPOISON || ret == -EOPNOTSUPP)
1334 return;
1335
1336 pr_err("Memory error not recovered");
1337 kill_me_now(cb);
1338 }
1339
1340 static void kill_me_never(struct callback_head *cb)
1341 {
1342 struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
1343
1344 p->mce_count = 0;
1345 pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
1346 if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
1347 set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
1348 }
1349
1350 static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
1351 {
1352 int count = ++current->mce_count;
1353
1354
1355 if (count == 1) {
1356 current->mce_addr = m->addr;
1357 current->mce_kflags = m->kflags;
1358 current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
1359 current->mce_whole_page = whole_page(m);
1360 current->mce_kill_me.func = func;
1361 }
1362
1363
1364 if (count > 10)
1365 mce_panic("Too many consecutive machine checks while accessing user data", m, msg);
1366
1367
1368 if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
1369 mce_panic("Consecutive machine checks to different user pages", m, msg);
1370
1371
1372 if (count > 1)
1373 return;
1374
	task_work_add(current, &current->mce_kill_me, TWA_RESUME);
1376 }
1377
1378
1379 static noinstr void unexpected_machine_check(struct pt_regs *regs)
1380 {
1381 instrumentation_begin();
1382 pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1383 smp_processor_id());
1384 instrumentation_end();
1385 }
1386
/*
 * The actual machine check handler. This only handles real exceptions when
 * something got corrupted coming in through int 18.
 *
 * This is executed in #MC context, not subject to normal locking rules.
 * This implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so always be careful when synchronizing with others.
 */
1413 noinstr void do_machine_check(struct pt_regs *regs)
1414 {
1415 int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
1416 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
1417 DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
1418 struct mce m, *final;
1419 char *msg = NULL;
1420
1421 if (unlikely(mce_flags.p5))
1422 return pentium_machine_check(regs);
1423 else if (unlikely(mce_flags.winchip))
1424 return winchip_machine_check(regs);
1425 else if (unlikely(!mca_cfg.initialized))
1426 return unexpected_machine_check(regs);
1427
1428 if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
1429 goto clear;
1430
1431
1432
1433
1434
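	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */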
1435 order = -1;
1436
1437
1438
1439
1440
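	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.
	 */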
1441 no_way_out = 0;
1442
1443
1444
1445
1446
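	/*
	 * If kill_current_task is not set, there might be a way to recover from
	 * this error.
	 */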
1447 kill_current_task = 0;
1448
1449
1450
1451
1452
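	/*
	 * MCEs are always local on AMD. The same is determined by
	 * MCG_STATUS_LMCES on Intel.
	 */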
1453 lmce = 1;
1454
1455 this_cpu_inc(mce_exception_count);
1456
1457 mce_gather_info(&m, regs);
1458 m.tsc = rdtsc();
1459
1460 final = this_cpu_ptr(&mces_seen);
1461 *final = m;
1462
1463 no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1464
1465 barrier();
1466
1467
1468
1469
1470
1471
1472 if (!(m.mcgstatus & MCG_STATUS_RIPV))
1473 kill_current_task = 1;
1474
1475
1476
1477
1478 if (m.cpuvendor == X86_VENDOR_INTEL ||
1479 m.cpuvendor == X86_VENDOR_ZHAOXIN)
1480 lmce = m.mcgstatus & MCG_STATUS_LMCES;
1481
1482
1483
1484
1485
1486
1487
1488
1489 if (lmce) {
1490 if (no_way_out)
1491 mce_panic("Fatal local machine check", &m, msg);
1492 } else {
1493 order = mce_start(&no_way_out);
1494 }
1495
1496 taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
1497
1498 if (!no_way_out)
1499 mce_clear_state(toclear);
1500
1501
1502
1503
1504
1505 if (!lmce) {
1506 if (mce_end(order) < 0) {
1507 if (!no_way_out)
1508 no_way_out = worst >= MCE_PANIC_SEVERITY;
1509
1510 if (no_way_out)
1511 mce_panic("Fatal machine check on current CPU", &m, msg);
1512 }
1513 } else {
1514
1515
1516
1517
1518
1519
1520
1521
1522 if (worst >= MCE_PANIC_SEVERITY) {
1523 mce_severity(&m, regs, &msg, true);
1524 mce_panic("Local fatal machine check!", &m, msg);
1525 }
1526 }
1527
1528
1529
1530
1531
1532
1533 instrumentation_begin();
1534
1535 if (taint)
1536 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1537
1538 if (worst != MCE_AR_SEVERITY && !kill_current_task)
1539 goto out;
1540
1541
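	/* Fault was in user mode and we need to take some action */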
1542 if ((m.cs & 3) == 3) {
1543
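		/* If this triggers there is no way to recover. Die hard. */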
1544 BUG_ON(!on_thread_stack() || !user_mode(regs));
1545
1546 if (kill_current_task)
1547 queue_task_work(&m, msg, kill_me_now);
1548 else
1549 queue_task_work(&m, msg, kill_me_maybe);
1550
1551 } else {
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561 if (m.kflags & MCE_IN_KERNEL_RECOV) {
1562 if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
1563 mce_panic("Failed kernel mode recovery", &m, msg);
1564 }
1565
1566 if (m.kflags & MCE_IN_KERNEL_COPYIN)
1567 queue_task_work(&m, msg, kill_me_never);
1568 }
1569
1570 out:
1571 instrumentation_end();
1572
1573 clear:
1574 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1575 }
1576 EXPORT_SYMBOL_GPL(do_machine_check);
1577
1578 #ifndef CONFIG_MEMORY_FAILURE
1579 int memory_failure(unsigned long pfn, int flags)
1580 {
1581
1582 BUG_ON(flags & MF_ACTION_REQUIRED);
1583 pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1584 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1585 pfn);
1586
1587 return 0;
1588 }
1589 #endif
1590
/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
1596 static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
1597
1598 static DEFINE_PER_CPU(unsigned long, mce_next_interval);
1599 static DEFINE_PER_CPU(struct timer_list, mce_timer);
1600
1601 static unsigned long mce_adjust_timer_default(unsigned long interval)
1602 {
1603 return interval;
1604 }
1605
1606 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1607
1608 static void __start_timer(struct timer_list *t, unsigned long interval)
1609 {
1610 unsigned long when = jiffies + interval;
1611 unsigned long flags;
1612
1613 local_irq_save(flags);
1614
1615 if (!timer_pending(t) || time_before(when, t->expires))
1616 mod_timer(t, round_jiffies(when));
1617
1618 local_irq_restore(flags);
1619 }
1620
1621 static void mce_timer_fn(struct timer_list *t)
1622 {
1623 struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
1624 unsigned long iv;
1625
1626 WARN_ON(cpu_t != t);
1627
1628 iv = __this_cpu_read(mce_next_interval);
1629
1630 if (mce_available(this_cpu_ptr(&cpu_info))) {
1631 machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
1632
1633 if (mce_intel_cmci_poll()) {
1634 iv = mce_adjust_timer(iv);
1635 goto done;
1636 }
1637 }
1638
1639
1640
1641
1642
1643 if (mce_notify_irq())
1644 iv = max(iv / 2, (unsigned long) HZ/100);
1645 else
1646 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1647
1648 done:
1649 __this_cpu_write(mce_next_interval, iv);
1650 __start_timer(t, iv);
1651 }
1652
1653
1654
1655
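/*
 * Ensure that the timer is firing in @interval from now.
 */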
1656 void mce_timer_kick(unsigned long interval)
1657 {
1658 struct timer_list *t = this_cpu_ptr(&mce_timer);
1659 unsigned long iv = __this_cpu_read(mce_next_interval);
1660
1661 __start_timer(t, interval);
1662
1663 if (interval < iv)
1664 __this_cpu_write(mce_next_interval, interval);
1665 }
1666
1667
1668 static void mce_timer_delete_all(void)
1669 {
1670 int cpu;
1671
1672 for_each_online_cpu(cpu)
1673 del_timer_sync(&per_cpu(mce_timer, cpu));
1674 }
1675
1676
1677
1678
1679
1680
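/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */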
1681 int mce_notify_irq(void)
1682 {
1683
1684 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1685
1686 if (test_and_clear_bit(0, &mce_need_notify)) {
1687 mce_work_trigger();
1688
1689 if (__ratelimit(&ratelimit))
1690 pr_info(HW_ERR "Machine check events logged\n");
1691
1692 return 1;
1693 }
1694 return 0;
1695 }
1696 EXPORT_SYMBOL_GPL(mce_notify_irq);
1697
1698 static void __mcheck_cpu_mce_banks_init(void)
1699 {
1700 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1701 u8 n_banks = this_cpu_read(mce_num_banks);
1702 int i;
1703
1704 for (i = 0; i < n_banks; i++) {
1705 struct mce_bank *b = &mce_banks[i];
1706
1707
1708
1709
1710
1711
1712 b->ctl = -1ULL;
1713 b->init = true;
1714 }
1715 }
1716
1717
1718
1719
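/*
 * Initialize Machine Checks for a CPU.
 */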
1720 static void __mcheck_cpu_cap_init(void)
1721 {
1722 u64 cap;
1723 u8 b;
1724
1725 rdmsrl(MSR_IA32_MCG_CAP, cap);
1726
1727 b = cap & MCG_BANKCNT_MASK;
1728
1729 if (b > MAX_NR_BANKS) {
1730 pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
1731 smp_processor_id(), MAX_NR_BANKS, b);
1732 b = MAX_NR_BANKS;
1733 }
1734
1735 this_cpu_write(mce_num_banks, b);
1736
1737 __mcheck_cpu_mce_banks_init();
1738
1739
1740 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1741 mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1742
1743 if (cap & MCG_SER_P)
1744 mca_cfg.ser = 1;
1745 }
1746
1747 static void __mcheck_cpu_init_generic(void)
1748 {
1749 enum mcp_flags m_fl = 0;
1750 mce_banks_t all_banks;
1751 u64 cap;
1752
1753 if (!mca_cfg.bootlog)
1754 m_fl = MCP_DONTLOG;
1755
1756
1757
1758
1759
1760
1761 bitmap_fill(all_banks, MAX_NR_BANKS);
1762 machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks);
1763
1764 cr4_set_bits(X86_CR4_MCE);
1765
1766 rdmsrl(MSR_IA32_MCG_CAP, cap);
1767 if (cap & MCG_CTL_P)
1768 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1769 }
1770
1771 static void __mcheck_cpu_init_clear_banks(void)
1772 {
1773 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1774 int i;
1775
1776 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1777 struct mce_bank *b = &mce_banks[i];
1778
1779 if (!b->init)
1780 continue;
1781 wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
1782 wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
1783 }
1784 }
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796 static void __mcheck_cpu_check_banks(void)
1797 {
1798 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1799 u64 msrval;
1800 int i;
1801
1802 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1803 struct mce_bank *b = &mce_banks[i];
1804
1805 if (!b->init)
1806 continue;
1807
1808 rdmsrl(mca_msr_reg(i, MCA_CTL), msrval);
1809 b->init = !!msrval;
1810 }
1811 }
1812
1813
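/* Add per CPU specific workarounds here */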
1814 static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1815 {
1816 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1817 struct mca_config *cfg = &mca_cfg;
1818
1819 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1820 pr_info("unknown CPU type - not enabling MCE support\n");
1821 return -EOPNOTSUPP;
1822 }
1823
1824
1825 if (c->x86_vendor == X86_VENDOR_AMD) {
1826 if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
1827
1828
1829
1830
1831
1832 clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1833 }
1834 if (c->x86 < 0x11 && cfg->bootlog < 0) {
1835
1836
1837
1838
1839 cfg->bootlog = 0;
1840 }
1841
1842
1843
1844
1845 if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
1846 mce_banks[0].ctl = 0;
1847
1848
1849
1850
1851
1852 if (c->x86 == 0x15 && c->x86_model <= 0xf)
1853 mce_flags.overflow_recov = 1;
1854
1855 }
1856
1857 if (c->x86_vendor == X86_VENDOR_INTEL) {
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867 if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
1868 mce_banks[0].init = false;
1869
1870
1871
1872
1873
1874 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1875 cfg->monarch_timeout < 0)
1876 cfg->monarch_timeout = USEC_PER_SEC;
1877
1878
1879
1880
1881
1882 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1883 cfg->bootlog = 0;
1884
1885 if (c->x86 == 6 && c->x86_model == 45)
1886 mce_flags.snb_ifu_quirk = 1;
1887
1888
1889
1890
1891
1892 if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
1893 mce_flags.skx_repmov_quirk = 1;
1894 }
1895
1896 if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
1897
1898
1899
1900
1901 if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
1902 if (cfg->monarch_timeout < 0)
1903 cfg->monarch_timeout = USEC_PER_SEC;
1904 }
1905 }
1906
1907 if (cfg->monarch_timeout < 0)
1908 cfg->monarch_timeout = 0;
1909 if (cfg->bootlog != 0)
1910 cfg->panic_timeout = 30;
1911
1912 return 0;
1913 }
1914
1915 static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1916 {
1917 if (c->x86 != 5)
1918 return 0;
1919
1920 switch (c->x86_vendor) {
1921 case X86_VENDOR_INTEL:
1922 intel_p5_mcheck_init(c);
1923 mce_flags.p5 = 1;
1924 return 1;
1925 case X86_VENDOR_CENTAUR:
1926 winchip_mcheck_init(c);
1927 mce_flags.winchip = 1;
1928 return 1;
1929 default:
1930 return 0;
1931 }
1932
1933 return 0;
1934 }
1935
1936
1937
1938
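/*
 * Init basic CPU features needed for early decoding of MCEs.
 */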
1939 static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
1940 {
1941 if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
1942 mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
1943 mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
1944 mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
1945 mce_flags.amd_threshold = 1;
1946 }
1947 }
1948
1949 static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
1950 {
1951 struct mca_config *cfg = &mca_cfg;
1952
1953
1954
1955
1956
1957 if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
1958 c->x86 > 6) {
1959 if (cfg->monarch_timeout < 0)
1960 cfg->monarch_timeout = USEC_PER_SEC;
1961 }
1962 }
1963
1964 static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
1965 {
1966 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977 if ((c->x86 == 7 && c->x86_model == 0x1b) ||
1978 (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
1979 if (this_cpu_read(mce_num_banks) > 8)
1980 mce_banks[8].ctl = 0;
1981 }
1982
1983 intel_init_cmci();
1984 intel_init_lmce();
1985 mce_adjust_timer = cmci_intel_adjust_timer;
1986 }
1987
1988 static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
1989 {
1990 intel_clear_lmce();
1991 }
1992
1993 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1994 {
1995 switch (c->x86_vendor) {
1996 case X86_VENDOR_INTEL:
1997 mce_intel_feature_init(c);
1998 mce_adjust_timer = cmci_intel_adjust_timer;
1999 break;
2000
2001 case X86_VENDOR_AMD: {
2002 mce_amd_feature_init(c);
2003 break;
2004 }
2005
2006 case X86_VENDOR_HYGON:
2007 mce_hygon_feature_init(c);
2008 break;
2009
2010 case X86_VENDOR_CENTAUR:
2011 mce_centaur_feature_init(c);
2012 break;
2013
2014 case X86_VENDOR_ZHAOXIN:
2015 mce_zhaoxin_feature_init(c);
2016 break;
2017
2018 default:
2019 break;
2020 }
2021 }
2022
2023 static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
2024 {
2025 switch (c->x86_vendor) {
2026 case X86_VENDOR_INTEL:
2027 mce_intel_feature_clear(c);
2028 break;
2029
2030 case X86_VENDOR_ZHAOXIN:
2031 mce_zhaoxin_feature_clear(c);
2032 break;
2033
2034 default:
2035 break;
2036 }
2037 }
2038
2039 static void mce_start_timer(struct timer_list *t)
2040 {
2041 unsigned long iv = check_interval * HZ;
2042
2043 if (mca_cfg.ignore_ce || !iv)
2044 return;
2045
2046 this_cpu_write(mce_next_interval, iv);
2047 __start_timer(t, iv);
2048 }
2049
2050 static void __mcheck_cpu_setup_timer(void)
2051 {
2052 struct timer_list *t = this_cpu_ptr(&mce_timer);
2053
2054 timer_setup(t, mce_timer_fn, TIMER_PINNED);
2055 }
2056
2057 static void __mcheck_cpu_init_timer(void)
2058 {
2059 struct timer_list *t = this_cpu_ptr(&mce_timer);
2060
2061 timer_setup(t, mce_timer_fn, TIMER_PINNED);
2062 mce_start_timer(t);
2063 }
2064
2065 bool filter_mce(struct mce *m)
2066 {
2067 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
2068 return amd_filter_mce(m);
2069 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2070 return intel_filter_mce(m);
2071
2072 return false;
2073 }
2074
2075 static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
2076 {
2077 irqentry_state_t irq_state;
2078
2079 WARN_ON_ONCE(user_mode(regs));
2080
2081
2082
2083
2084
2085 if (mca_cfg.initialized && mce_check_crashing_cpu())
2086 return;
2087
2088 irq_state = irqentry_nmi_enter(regs);
2089
2090 do_machine_check(regs);
2091
2092 irqentry_nmi_exit(regs, irq_state);
2093 }
2094
2095 static __always_inline void exc_machine_check_user(struct pt_regs *regs)
2096 {
2097 irqentry_enter_from_user_mode(regs);
2098
2099 do_machine_check(regs);
2100
2101 irqentry_exit_to_user_mode(regs);
2102 }
2103
2104 #ifdef CONFIG_X86_64
2105
2106 DEFINE_IDTENTRY_MCE(exc_machine_check)
2107 {
2108 unsigned long dr7;
2109
2110 dr7 = local_db_save();
2111 exc_machine_check_kernel(regs);
2112 local_db_restore(dr7);
2113 }
2114
2115
2116 DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
2117 {
2118 unsigned long dr7;
2119
2120 dr7 = local_db_save();
2121 exc_machine_check_user(regs);
2122 local_db_restore(dr7);
2123 }
2124 #else
2125
2126 DEFINE_IDTENTRY_RAW(exc_machine_check)
2127 {
2128 unsigned long dr7;
2129
2130 dr7 = local_db_save();
2131 if (user_mode(regs))
2132 exc_machine_check_user(regs);
2133 else
2134 exc_machine_check_kernel(regs);
2135 local_db_restore(dr7);
2136 }
2137 #endif
2138
2139
2140
2141
2142
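/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */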
2143 void mcheck_cpu_init(struct cpuinfo_x86 *c)
2144 {
2145 if (mca_cfg.disabled)
2146 return;
2147
2148 if (__mcheck_cpu_ancient_init(c))
2149 return;
2150
2151 if (!mce_available(c))
2152 return;
2153
2154 __mcheck_cpu_cap_init();
2155
2156 if (__mcheck_cpu_apply_quirks(c) < 0) {
2157 mca_cfg.disabled = 1;
2158 return;
2159 }
2160
2161 if (mce_gen_pool_init()) {
2162 mca_cfg.disabled = 1;
2163 pr_emerg("Couldn't allocate MCE records pool!\n");
2164 return;
2165 }
2166
2167 mca_cfg.initialized = 1;
2168
2169 __mcheck_cpu_init_early(c);
2170 __mcheck_cpu_init_generic();
2171 __mcheck_cpu_init_vendor(c);
2172 __mcheck_cpu_init_clear_banks();
2173 __mcheck_cpu_check_banks();
2174 __mcheck_cpu_setup_timer();
2175 }
2176
2177
2178
2179
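/*
 * Called for each booted CPU to clear some machine checks opt-ins
 */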
2180 void mcheck_cpu_clear(struct cpuinfo_x86 *c)
2181 {
2182 if (mca_cfg.disabled)
2183 return;
2184
2185 if (!mce_available(c))
2186 return;
2187
2188
2189
2190
2191
2192 __mcheck_cpu_clear_vendor(c);
2193
2194 }
2195
2196 static void __mce_disable_bank(void *arg)
2197 {
2198 int bank = *((int *)arg);
2199 __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
2200 cmci_disable_bank(bank);
2201 }
2202
2203 void mce_disable_bank(int bank)
2204 {
2205 if (bank >= this_cpu_read(mce_num_banks)) {
2206 pr_warn(FW_BUG
2207 "Ignoring request to disable invalid MCA bank %d.\n",
2208 bank);
2209 return;
2210 }
2211 set_bit(bank, mce_banks_ce_disabled);
2212 on_each_cpu(__mce_disable_bank, &bank, 1);
2213 }
2214
/*
 * mce=off			Disables machine check
 * mce=no_cmci			Disables CMCI
 * mce=no_lmce			Disables LMCE
 * mce=dont_log_ce		Clears corrected events silently, no log created for CEs.
 * mce=print_all		Print all machine check logs to console
 * mce=ignore_ce		Disables polling and CMCI, corrected events are not cleared.
 * mce=bootlog			Log MCEs from before booting (nobootlog to disable).
 * mce=bios_cmci_threshold	Don't program the CMCI threshold
 * mce=recovery			Force-enable copy_mc_fragile()
 * mce=<number>			Set the monarch timeout in usecs (mca_cfg.monarch_timeout)
 */
2231 static int __init mcheck_enable(char *str)
2232 {
2233 struct mca_config *cfg = &mca_cfg;
2234
2235 if (*str == 0) {
2236 enable_p5_mce();
2237 return 1;
2238 }
2239 if (*str == '=')
2240 str++;
2241 if (!strcmp(str, "off"))
2242 cfg->disabled = 1;
2243 else if (!strcmp(str, "no_cmci"))
2244 cfg->cmci_disabled = true;
2245 else if (!strcmp(str, "no_lmce"))
2246 cfg->lmce_disabled = 1;
2247 else if (!strcmp(str, "dont_log_ce"))
2248 cfg->dont_log_ce = true;
2249 else if (!strcmp(str, "print_all"))
2250 cfg->print_all = true;
2251 else if (!strcmp(str, "ignore_ce"))
2252 cfg->ignore_ce = true;
2253 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
2254 cfg->bootlog = (str[0] == 'b');
2255 else if (!strcmp(str, "bios_cmci_threshold"))
2256 cfg->bios_cmci_threshold = 1;
2257 else if (!strcmp(str, "recovery"))
2258 cfg->recovery = 1;
2259 else if (isdigit(str[0]))
2260 get_option(&str, &(cfg->monarch_timeout));
2261 else {
2262 pr_info("mce argument %s ignored. Please use /sys\n", str);
2263 return 0;
2264 }
2265 return 1;
2266 }
2267 __setup("mce", mcheck_enable);
2268
2269 int __init mcheck_init(void)
2270 {
2271 mce_register_decode_chain(&early_nb);
2272 mce_register_decode_chain(&mce_uc_nb);
2273 mce_register_decode_chain(&mce_default_nb);
2274
2275 INIT_WORK(&mce_work, mce_gen_pool_process);
2276 init_irq_work(&mce_irq_work, mce_irq_work_cb);
2277
2278 return 0;
2279 }
2280
2281
2282
/*
 * mce_syscore: PM support
 *
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
2289 static void mce_disable_error_reporting(void)
2290 {
2291 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
2292 int i;
2293
2294 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
2295 struct mce_bank *b = &mce_banks[i];
2296
2297 if (b->init)
2298 wrmsrl(mca_msr_reg(i, MCA_CTL), 0);
2299 }
2301 }
2302
2303 static void vendor_disable_error_reporting(void)
2304 {
2305
2306
2307
2308
2309
2310
2311
2312 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
2313 boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
2314 boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
2315 boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
2316 return;
2317
2318 mce_disable_error_reporting();
2319 }
2320
2321 static int mce_syscore_suspend(void)
2322 {
2323 vendor_disable_error_reporting();
2324 return 0;
2325 }
2326
2327 static void mce_syscore_shutdown(void)
2328 {
2329 vendor_disable_error_reporting();
2330 }
2331
2332
2333
2334
2335
2336
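/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug.
 */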
2337 static void mce_syscore_resume(void)
2338 {
2339 __mcheck_cpu_init_generic();
2340 __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
2341 __mcheck_cpu_init_clear_banks();
2342 }
2343
2344 static struct syscore_ops mce_syscore_ops = {
2345 .suspend = mce_syscore_suspend,
2346 .shutdown = mce_syscore_shutdown,
2347 .resume = mce_syscore_resume,
2348 };
2349
2350
2351
2352
2353
2354 static void mce_cpu_restart(void *data)
2355 {
2356 if (!mce_available(raw_cpu_ptr(&cpu_info)))
2357 return;
2358 __mcheck_cpu_init_generic();
2359 __mcheck_cpu_init_clear_banks();
2360 __mcheck_cpu_init_timer();
2361 }
2362
2363
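/* Reinit MCEs after user configuration changes */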
2364 static void mce_restart(void)
2365 {
2366 mce_timer_delete_all();
2367 on_each_cpu(mce_cpu_restart, NULL, 1);
2368 }
2369
2370
2371 static void mce_disable_cmci(void *data)
2372 {
2373 if (!mce_available(raw_cpu_ptr(&cpu_info)))
2374 return;
2375 cmci_clear();
2376 }
2377
2378 static void mce_enable_ce(void *all)
2379 {
2380 if (!mce_available(raw_cpu_ptr(&cpu_info)))
2381 return;
2382 cmci_reenable();
2383 cmci_recheck();
2384 if (all)
2385 __mcheck_cpu_init_timer();
2386 }
2387
2388 static struct bus_type mce_subsys = {
2389 .name = "machinecheck",
2390 .dev_name = "machinecheck",
2391 };
2392
2393 DEFINE_PER_CPU(struct device *, mce_device);
2394
2395 static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
2396 {
2397 return container_of(attr, struct mce_bank_dev, attr);
2398 }
2399
2400 static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2401 char *buf)
2402 {
2403 u8 bank = attr_to_bank(attr)->bank;
2404 struct mce_bank *b;
2405
2406 if (bank >= per_cpu(mce_num_banks, s->id))
2407 return -EINVAL;
2408
2409 b = &per_cpu(mce_banks_array, s->id)[bank];
2410
2411 if (!b->init)
2412 return -ENODEV;
2413
2414 return sprintf(buf, "%llx\n", b->ctl);
2415 }
2416
2417 static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2418 const char *buf, size_t size)
2419 {
2420 u8 bank = attr_to_bank(attr)->bank;
2421 struct mce_bank *b;
2422 u64 new;
2423
2424 if (kstrtou64(buf, 0, &new) < 0)
2425 return -EINVAL;
2426
2427 if (bank >= per_cpu(mce_num_banks, s->id))
2428 return -EINVAL;
2429
2430 b = &per_cpu(mce_banks_array, s->id)[bank];
2431
2432 if (!b->init)
2433 return -ENODEV;
2434
2435 b->ctl = new;
2436 mce_restart();
2437
2438 return size;
2439 }
2440
2441 static ssize_t set_ignore_ce(struct device *s,
2442 struct device_attribute *attr,
2443 const char *buf, size_t size)
2444 {
2445 u64 new;
2446
2447 if (kstrtou64(buf, 0, &new) < 0)
2448 return -EINVAL;
2449
2450 mutex_lock(&mce_sysfs_mutex);
2451 if (mca_cfg.ignore_ce ^ !!new) {
2452 if (new) {
2453
2454 mce_timer_delete_all();
2455 on_each_cpu(mce_disable_cmci, NULL, 1);
2456 mca_cfg.ignore_ce = true;
2457 } else {
2458
2459 mca_cfg.ignore_ce = false;
2460 on_each_cpu(mce_enable_ce, (void *)1, 1);
2461 }
2462 }
2463 mutex_unlock(&mce_sysfs_mutex);
2464
2465 return size;
2466 }
2467
2468 static ssize_t set_cmci_disabled(struct device *s,
2469 struct device_attribute *attr,
2470 const char *buf, size_t size)
2471 {
2472 u64 new;
2473
2474 if (kstrtou64(buf, 0, &new) < 0)
2475 return -EINVAL;
2476
2477 mutex_lock(&mce_sysfs_mutex);
2478 if (mca_cfg.cmci_disabled ^ !!new) {
2479 if (new) {
2480
2481 on_each_cpu(mce_disable_cmci, NULL, 1);
2482 mca_cfg.cmci_disabled = true;
2483 } else {
2484
2485 mca_cfg.cmci_disabled = false;
2486 on_each_cpu(mce_enable_ce, NULL, 1);
2487 }
2488 }
2489 mutex_unlock(&mce_sysfs_mutex);
2490
2491 return size;
2492 }
2493
2494 static ssize_t store_int_with_restart(struct device *s,
2495 struct device_attribute *attr,
2496 const char *buf, size_t size)
2497 {
2498 unsigned long old_check_interval = check_interval;
2499 ssize_t ret = device_store_ulong(s, attr, buf, size);
2500
2501 if (check_interval == old_check_interval)
2502 return ret;
2503
2504 mutex_lock(&mce_sysfs_mutex);
2505 mce_restart();
2506 mutex_unlock(&mce_sysfs_mutex);
2507
2508 return ret;
2509 }
2510
2511 static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2512 static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
2513 static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);
2514
2515 static struct dev_ext_attribute dev_attr_check_interval = {
2516 __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2517 &check_interval
2518 };
2519
2520 static struct dev_ext_attribute dev_attr_ignore_ce = {
2521 __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2522 &mca_cfg.ignore_ce
2523 };
2524
2525 static struct dev_ext_attribute dev_attr_cmci_disabled = {
2526 __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2527 &mca_cfg.cmci_disabled
2528 };
2529
2530 static struct device_attribute *mce_device_attrs[] = {
2531 &dev_attr_check_interval.attr,
2532 #ifdef CONFIG_X86_MCELOG_LEGACY
2533 &dev_attr_trigger,
2534 #endif
2535 &dev_attr_monarch_timeout.attr,
2536 &dev_attr_dont_log_ce.attr,
2537 &dev_attr_print_all.attr,
2538 &dev_attr_ignore_ce.attr,
2539 &dev_attr_cmci_disabled.attr,
2540 NULL
2541 };
2542
2543 static cpumask_var_t mce_device_initialized;
2544
2545 static void mce_device_release(struct device *dev)
2546 {
2547 kfree(dev);
2548 }
2549
2550
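/* Per-CPU sysfs device init. All CPUs share the same set of bank attributes. */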
2551 static int mce_device_create(unsigned int cpu)
2552 {
2553 struct device *dev;
2554 int err;
2555 int i, j;
2556
2557 if (!mce_available(&boot_cpu_data))
2558 return -EIO;
2559
2560 dev = per_cpu(mce_device, cpu);
2561 if (dev)
2562 return 0;
2563
2564 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
2565 if (!dev)
2566 return -ENOMEM;
2567 dev->id = cpu;
2568 dev->bus = &mce_subsys;
2569 dev->release = &mce_device_release;
2570
2571 err = device_register(dev);
2572 if (err) {
2573 put_device(dev);
2574 return err;
2575 }
2576
2577 for (i = 0; mce_device_attrs[i]; i++) {
2578 err = device_create_file(dev, mce_device_attrs[i]);
2579 if (err)
2580 goto error;
2581 }
2582 for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) {
2583 err = device_create_file(dev, &mce_bank_devs[j].attr);
2584 if (err)
2585 goto error2;
2586 }
2587 cpumask_set_cpu(cpu, mce_device_initialized);
2588 per_cpu(mce_device, cpu) = dev;
2589
2590 return 0;
2591 error2:
2592 while (--j >= 0)
2593 device_remove_file(dev, &mce_bank_devs[j].attr);
2594 error:
2595 while (--i >= 0)
2596 device_remove_file(dev, mce_device_attrs[i]);
2597
2598 device_unregister(dev);
2599
2600 return err;
2601 }
2602
2603 static void mce_device_remove(unsigned int cpu)
2604 {
2605 struct device *dev = per_cpu(mce_device, cpu);
2606 int i;
2607
2608 if (!cpumask_test_cpu(cpu, mce_device_initialized))
2609 return;
2610
2611 for (i = 0; mce_device_attrs[i]; i++)
2612 device_remove_file(dev, mce_device_attrs[i]);
2613
2614 for (i = 0; i < per_cpu(mce_num_banks, cpu); i++)
2615 device_remove_file(dev, &mce_bank_devs[i].attr);
2616
2617 device_unregister(dev);
2618 cpumask_clear_cpu(cpu, mce_device_initialized);
2619 per_cpu(mce_device, cpu) = NULL;
2620 }
2621
2622
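/* Make sure there are no machine checks on offlined CPUs. */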
2623 static void mce_disable_cpu(void)
2624 {
2625 if (!mce_available(raw_cpu_ptr(&cpu_info)))
2626 return;
2627
2628 if (!cpuhp_tasks_frozen)
2629 cmci_clear();
2630
2631 vendor_disable_error_reporting();
2632 }
2633
2634 static void mce_reenable_cpu(void)
2635 {
2636 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
2637 int i;
2638
2639 if (!mce_available(raw_cpu_ptr(&cpu_info)))
2640 return;
2641
2642 if (!cpuhp_tasks_frozen)
2643 cmci_reenable();
2644 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
2645 struct mce_bank *b = &mce_banks[i];
2646
2647 if (b->init)
2648 wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
2649 }
2650 }
2651
2652 static int mce_cpu_dead(unsigned int cpu)
2653 {
2654 mce_intel_hcpu_update(cpu);
2655
2656
2657 if (!cpuhp_tasks_frozen)
2658 cmci_rediscover();
2659 return 0;
2660 }
2661
2662 static int mce_cpu_online(unsigned int cpu)
2663 {
2664 struct timer_list *t = this_cpu_ptr(&mce_timer);
2665 int ret;
2666
2667 mce_device_create(cpu);
2668
2669 ret = mce_threshold_create_device(cpu);
2670 if (ret) {
2671 mce_device_remove(cpu);
2672 return ret;
2673 }
2674 mce_reenable_cpu();
2675 mce_start_timer(t);
2676 return 0;
2677 }
2678
2679 static int mce_cpu_pre_down(unsigned int cpu)
2680 {
2681 struct timer_list *t = this_cpu_ptr(&mce_timer);
2682
2683 mce_disable_cpu();
2684 del_timer_sync(t);
2685 mce_threshold_remove_device(cpu);
2686 mce_device_remove(cpu);
2687 return 0;
2688 }
2689
2690 static __init void mce_init_banks(void)
2691 {
2692 int i;
2693
2694 for (i = 0; i < MAX_NR_BANKS; i++) {
2695 struct mce_bank_dev *b = &mce_bank_devs[i];
2696 struct device_attribute *a = &b->attr;
2697
2698 b->bank = i;
2699
2700 sysfs_attr_init(&a->attr);
2701 a->attr.name = b->attrname;
2702 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2703
2704 a->attr.mode = 0644;
2705 a->show = show_bank;
2706 a->store = set_bank;
2707 }
2708 }
2709
2710
2711
2712
2713
2714
2715
2716
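/*
 * Register the sysfs "machinecheck" subsystem, per-CPU devices, CPU hotplug
 * callbacks and syscore (suspend/resume) operations.
 */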
2717 static __init int mcheck_init_device(void)
2718 {
2719 int err;
2720
2721
2722
2723
2724
2725 MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
2726
2727 if (!mce_available(&boot_cpu_data)) {
2728 err = -EIO;
2729 goto err_out;
2730 }
2731
2732 if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2733 err = -ENOMEM;
2734 goto err_out;
2735 }
2736
2737 mce_init_banks();
2738
2739 err = subsys_system_register(&mce_subsys, NULL);
2740 if (err)
2741 goto err_out_mem;
2742
2743 err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
2744 mce_cpu_dead);
2745 if (err)
2746 goto err_out_mem;
2747
2748
2749
2750
2751
2752 err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
2753 mce_cpu_online, mce_cpu_pre_down);
2754 if (err < 0)
2755 goto err_out_online;
2756
2757 register_syscore_ops(&mce_syscore_ops);
2758
2759 return 0;
2760
2761 err_out_online:
2762 cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
2763
2764 err_out_mem:
2765 free_cpumask_var(mce_device_initialized);
2766
2767 err_out:
2768 pr_err("Unable to init MCE device (rc: %d)\n", err);
2769
2770 return err;
2771 }
2772 device_initcall_sync(mcheck_init_device);
2773
2774
2775
2776
2777 static int __init mcheck_disable(char *str)
2778 {
2779 mca_cfg.disabled = 1;
2780 return 1;
2781 }
2782 __setup("nomce", mcheck_disable);
2783
2784 #ifdef CONFIG_DEBUG_FS
2785 struct dentry *mce_get_debugfs_dir(void)
2786 {
2787 static struct dentry *dmce;
2788
2789 if (!dmce)
2790 dmce = debugfs_create_dir("mce", NULL);
2791
2792 return dmce;
2793 }
2794
2795 static void mce_reset(void)
2796 {
2797 atomic_set(&mce_fake_panicked, 0);
2798 atomic_set(&mce_executing, 0);
2799 atomic_set(&mce_callin, 0);
2800 atomic_set(&global_nwo, 0);
2801 cpumask_setall(&mce_missing_cpus);
2802 }
2803
2804 static int fake_panic_get(void *data, u64 *val)
2805 {
2806 *val = fake_panic;
2807 return 0;
2808 }
2809
2810 static int fake_panic_set(void *data, u64 val)
2811 {
2812 mce_reset();
2813 fake_panic = val;
2814 return 0;
2815 }
2816
2817 DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
2818 "%llu\n");
2819
2820 static void __init mcheck_debugfs_init(void)
2821 {
2822 struct dentry *dmce;
2823
2824 dmce = mce_get_debugfs_dir();
2825 debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL,
2826 &fake_panic_fops);
2827 }
2828 #else
2829 static void __init mcheck_debugfs_init(void) { }
2830 #endif
2831
2832 static int __init mcheck_late_init(void)
2833 {
2834 if (mca_cfg.recovery)
2835 enable_copy_mc_fragile();
2836
2837 mcheck_debugfs_init();
2838
2839
2840
2841
2842
2843 mce_schedule_work();
2844
2845 return 0;
2846 }
2847 late_initcall(mcheck_late_init);