0001
0002
0003
0004
0005
0006
0007
0008
0009 #undef DEBUG
0010
0011 #include <linux/kernel.h>
0012 #include <linux/init.h>
0013 #include <linux/of.h>
0014 #include <linux/mm.h>
0015 #include <linux/slab.h>
0016
0017 #include <asm/opal.h>
0018 #include <asm/cputable.h>
0019 #include <asm/machdep.h>
0020
0021 #include "powernv.h"
0022
/*
 * Set to 1 by opal_hmi_handler_init() once the HMI notifier has been
 * registered, so that a repeated call does not register it again.
 */
static int opal_hmi_handler_nb_init;
/* Queue entry holding a private copy of one HMI event awaiting processing. */
struct OpalHmiEvtNode {
	struct list_head list;		/* link in opal_hmi_evt_list */
	struct OpalHMIEvent hmi_evt;	/* event copied out of the OPAL message */
};
0028
/* One row of a checkstop decode table: a reason mask plus its description. */
struct xstop_reason {
	uint32_t xstop_reason;		/* mask tested against the event's reason word */
	const char *unit_failed;	/* name printed in the "[Unit: ...]" field */
	const char *description;	/* text printed after the unit name */
};
0034
/*
 * Pending HMI events: entries are added by opal_handle_hmi_event() and
 * removed by hmi_event_handler().
 */
static LIST_HEAD(opal_hmi_evt_list);
/* Taken around every access to opal_hmi_evt_list. */
static DEFINE_SPINLOCK(opal_hmi_evt_lock);
0037
0038 static void print_core_checkstop_reason(const char *level,
0039 struct OpalHMIEvent *hmi_evt)
0040 {
0041 int i;
0042 static const struct xstop_reason xstop_reason[] = {
0043 { CORE_CHECKSTOP_IFU_REGFILE, "IFU",
0044 "RegFile core check stop" },
0045 { CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
0046 { CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
0047 "Core checkstop during recovery" },
0048 { CORE_CHECKSTOP_ISU_REGFILE, "ISU",
0049 "RegFile core check stop (mapper error)" },
0050 { CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
0051 { CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
0052 { CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
0053 { CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
0054 "Recovery in maintenance mode" },
0055 { CORE_CHECKSTOP_LSU_REGFILE, "LSU",
0056 "RegFile core check stop" },
0057 { CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
0058 "Forward Progress Error" },
0059 { CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
0060 { CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
0061 { CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
0062 "Hypervisor Resource error - core check stop" },
0063 { CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
0064 "Hang Recovery Failed (core check stop)" },
0065 { CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
0066 "Ambiguous Hang Detected (unknown source)" },
0067 { CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
0068 "Debug Trigger Error inject" },
0069 { CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
0070 "Hypervisor check stop via SPRC/SPRD" },
0071 };
0072
0073
0074 if (!hmi_evt->u.xstop_error.xstop_reason) {
0075 printk("%s Unknown Core check stop.\n", level);
0076 return;
0077 }
0078
0079 printk("%s CPU PIR: %08x\n", level,
0080 be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
0081 for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
0082 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
0083 xstop_reason[i].xstop_reason)
0084 printk("%s [Unit: %-3s] %s\n", level,
0085 xstop_reason[i].unit_failed,
0086 xstop_reason[i].description);
0087 }
0088
0089 static void print_nx_checkstop_reason(const char *level,
0090 struct OpalHMIEvent *hmi_evt)
0091 {
0092 int i;
0093 static const struct xstop_reason xstop_reason[] = {
0094 { NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
0095 "SHM invalid state error" },
0096 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
0097 "DMA invalid state error bit 15" },
0098 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
0099 "DMA invalid state error bit 16" },
0100 { NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
0101 "Channel 0 invalid state error" },
0102 { NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
0103 "Channel 1 invalid state error" },
0104 { NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
0105 "Channel 2 invalid state error" },
0106 { NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
0107 "Channel 3 invalid state error" },
0108 { NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
0109 "Channel 4 invalid state error" },
0110 { NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
0111 "Channel 5 invalid state error" },
0112 { NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
0113 "Channel 6 invalid state error" },
0114 { NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
0115 "Channel 7 invalid state error" },
0116 { NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
0117 "UE error on CRB(CSB address, CCB)" },
0118 { NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
0119 "SUE error on CRB(CSB address, CCB)" },
0120 { NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
0121 "CRB Kill ISN received while holding ISN with UE error" },
0122 };
0123
0124
0125 if (!hmi_evt->u.xstop_error.xstop_reason) {
0126 printk("%s Unknown NX check stop.\n", level);
0127 return;
0128 }
0129
0130 printk("%s NX checkstop on CHIP ID: %x\n", level,
0131 be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
0132 for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
0133 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
0134 xstop_reason[i].xstop_reason)
0135 printk("%s [Unit: %-3s] %s\n", level,
0136 xstop_reason[i].unit_failed,
0137 xstop_reason[i].description);
0138 }
0139
0140 static void print_npu_checkstop_reason(const char *level,
0141 struct OpalHMIEvent *hmi_evt)
0142 {
0143 uint8_t reason, reason_count, i;
0144
0145
0146
0147
0148
0149 if (!hmi_evt->u.xstop_error.xstop_reason) {
0150 printk("%s NPU checkstop on chip %x\n", level,
0151 be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
0152 return;
0153 }
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165 reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) /
0166 sizeof(reason);
0167 for (i = 0; i < reason_count; i++) {
0168 reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 0xFF;
0169 if (reason)
0170 printk("%s NPU checkstop on chip %x: FIR%d bit %d is set\n",
0171 level,
0172 be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id),
0173 reason >> 6, reason & 0x3F);
0174 }
0175 }
0176
0177 static void print_checkstop_reason(const char *level,
0178 struct OpalHMIEvent *hmi_evt)
0179 {
0180 uint8_t type = hmi_evt->u.xstop_error.xstop_type;
0181 switch (type) {
0182 case CHECKSTOP_TYPE_CORE:
0183 print_core_checkstop_reason(level, hmi_evt);
0184 break;
0185 case CHECKSTOP_TYPE_NX:
0186 print_nx_checkstop_reason(level, hmi_evt);
0187 break;
0188 case CHECKSTOP_TYPE_NPU:
0189 print_npu_checkstop_reason(level, hmi_evt);
0190 break;
0191 default:
0192 printk("%s Unknown Malfunction Alert of type %d\n",
0193 level, type);
0194 break;
0195 }
0196 }
0197
0198 static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
0199 {
0200 const char *level, *sevstr, *error_info;
0201 static const char *hmi_error_types[] = {
0202 "Malfunction Alert",
0203 "Processor Recovery done",
0204 "Processor recovery occurred again",
0205 "Processor recovery occurred for masked error",
0206 "Timer facility experienced an error",
0207 "TFMR SPR is corrupted",
0208 "UPS (Uninterrupted Power System) Overflow indication",
0209 "An XSCOM operation failure",
0210 "An XSCOM operation completed",
0211 "SCOM has set a reserved FIR bit to cause recovery",
0212 "Debug trigger has set a reserved FIR bit to cause recovery",
0213 "A hypervisor resource error occurred",
0214 "CAPP recovery process is in progress",
0215 };
0216 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
0217 DEFAULT_RATELIMIT_BURST);
0218
0219
0220 if (hmi_evt->version < OpalHMIEvt_V1) {
0221 pr_err("HMI Interrupt, Unknown event version %d !\n",
0222 hmi_evt->version);
0223 return;
0224 }
0225 switch (hmi_evt->severity) {
0226 case OpalHMI_SEV_NO_ERROR:
0227 level = KERN_INFO;
0228 sevstr = "Harmless";
0229 break;
0230 case OpalHMI_SEV_WARNING:
0231 level = KERN_WARNING;
0232 sevstr = "";
0233 break;
0234 case OpalHMI_SEV_ERROR_SYNC:
0235 level = KERN_ERR;
0236 sevstr = "Severe";
0237 break;
0238 case OpalHMI_SEV_FATAL:
0239 default:
0240 level = KERN_ERR;
0241 sevstr = "Fatal";
0242 break;
0243 }
0244
0245 if (hmi_evt->severity != OpalHMI_SEV_NO_ERROR || __ratelimit(&rs)) {
0246 printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
0247 level, sevstr,
0248 hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
0249 "Recovered" : "Not recovered");
0250 error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
0251 hmi_error_types[hmi_evt->type]
0252 : "Unknown";
0253 printk("%s Error detail: %s\n", level, error_info);
0254 printk("%s HMER: %016llx\n", level,
0255 be64_to_cpu(hmi_evt->hmer));
0256 if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
0257 (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
0258 printk("%s TFMR: %016llx\n", level,
0259 be64_to_cpu(hmi_evt->tfmr));
0260 }
0261
0262 if (hmi_evt->version < OpalHMIEvt_V2)
0263 return;
0264
0265
0266 if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
0267 print_checkstop_reason(level, hmi_evt);
0268 }
0269
0270 static void hmi_event_handler(struct work_struct *work)
0271 {
0272 unsigned long flags;
0273 struct OpalHMIEvent *hmi_evt;
0274 struct OpalHmiEvtNode *msg_node;
0275 uint8_t disposition;
0276 struct opal_msg msg;
0277 int unrecoverable = 0;
0278
0279 spin_lock_irqsave(&opal_hmi_evt_lock, flags);
0280 while (!list_empty(&opal_hmi_evt_list)) {
0281 msg_node = list_entry(opal_hmi_evt_list.next,
0282 struct OpalHmiEvtNode, list);
0283 list_del(&msg_node->list);
0284 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
0285
0286 hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
0287 print_hmi_event_info(hmi_evt);
0288 disposition = hmi_evt->disposition;
0289 kfree(msg_node);
0290
0291
0292
0293
0294
0295
0296
0297 if (disposition != OpalHMI_DISPOSITION_RECOVERED)
0298 unrecoverable = 1;
0299
0300 spin_lock_irqsave(&opal_hmi_evt_lock, flags);
0301 }
0302 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
0303
0304 if (unrecoverable) {
0305
0306 while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
0307 u32 type;
0308
0309 type = be32_to_cpu(msg.msg_type);
0310
0311
0312 if (type != OPAL_MSG_HMI_EVT)
0313 continue;
0314
0315
0316 hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
0317 print_hmi_event_info(hmi_evt);
0318 }
0319
0320 pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");
0321 }
0322 }
0323
/* Work item running hmi_event_handler(); scheduled by opal_handle_hmi_event(). */
static DECLARE_WORK(hmi_event_work, hmi_event_handler);
0325
0326
0327
0328
0329 static int opal_handle_hmi_event(struct notifier_block *nb,
0330 unsigned long msg_type, void *msg)
0331 {
0332 unsigned long flags;
0333 struct OpalHMIEvent *hmi_evt;
0334 struct opal_msg *hmi_msg = msg;
0335 struct OpalHmiEvtNode *msg_node;
0336
0337
0338 if (msg_type != OPAL_MSG_HMI_EVT)
0339 return 0;
0340
0341
0342 hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
0343
0344
0345 msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
0346 if (!msg_node) {
0347 pr_err("HMI: out of memory, Opal message event not handled\n");
0348 return -ENOMEM;
0349 }
0350 memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(*hmi_evt));
0351
0352 spin_lock_irqsave(&opal_hmi_evt_lock, flags);
0353 list_add(&msg_node->list, &opal_hmi_evt_list);
0354 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
0355
0356 schedule_work(&hmi_event_work);
0357 return 0;
0358 }
0359
/*
 * Notifier block that opal_hmi_handler_init() registers for
 * OPAL_MSG_HMI_EVT messages; it routes them to opal_handle_hmi_event().
 */
static struct notifier_block opal_hmi_handler_nb = {
	.notifier_call	= opal_handle_hmi_event,
	.next		= NULL,
	.priority	= 0,
};
0365
0366 int __init opal_hmi_handler_init(void)
0367 {
0368 int ret;
0369
0370 if (!opal_hmi_handler_nb_init) {
0371 ret = opal_message_notifier_register(
0372 OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
0373 if (ret) {
0374 pr_err("%s: Can't register OPAL event notifier (%d)\n",
0375 __func__, ret);
0376 return ret;
0377 }
0378 opal_hmi_handler_nb_init = 1;
0379 }
0380 return 0;
0381 }