Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /*
0003  * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
0004  *
0005  * Copyright 2014 IBM Corporation
0006  * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
0007  */
0008 
0009 #undef DEBUG
0010 
0011 #include <linux/kernel.h>
0012 #include <linux/init.h>
0013 #include <linux/of.h>
0014 #include <linux/mm.h>
0015 #include <linux/slab.h>
0016 
0017 #include <asm/opal.h>
0018 #include <asm/cputable.h>
0019 #include <asm/machdep.h>
0020 
0021 #include "powernv.h"
0022 
0023 static int opal_hmi_handler_nb_init;
0024 struct OpalHmiEvtNode {
0025     struct list_head list;
0026     struct OpalHMIEvent hmi_evt;
0027 };
0028 
0029 struct xstop_reason {
0030     uint32_t xstop_reason;
0031     const char *unit_failed;
0032     const char *description;
0033 };
0034 
0035 static LIST_HEAD(opal_hmi_evt_list);
0036 static DEFINE_SPINLOCK(opal_hmi_evt_lock);
0037 
0038 static void print_core_checkstop_reason(const char *level,
0039                     struct OpalHMIEvent *hmi_evt)
0040 {
0041     int i;
0042     static const struct xstop_reason xstop_reason[] = {
0043         { CORE_CHECKSTOP_IFU_REGFILE, "IFU",
0044                 "RegFile core check stop" },
0045         { CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
0046         { CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
0047                 "Core checkstop during recovery" },
0048         { CORE_CHECKSTOP_ISU_REGFILE, "ISU",
0049                 "RegFile core check stop (mapper error)" },
0050         { CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
0051         { CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
0052         { CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
0053         { CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
0054                 "Recovery in maintenance mode" },
0055         { CORE_CHECKSTOP_LSU_REGFILE, "LSU",
0056                 "RegFile core check stop" },
0057         { CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
0058                 "Forward Progress Error" },
0059         { CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
0060         { CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
0061         { CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
0062                 "Hypervisor Resource error - core check stop" },
0063         { CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
0064                 "Hang Recovery Failed (core check stop)" },
0065         { CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
0066                 "Ambiguous Hang Detected (unknown source)" },
0067         { CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
0068                 "Debug Trigger Error inject" },
0069         { CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
0070                 "Hypervisor check stop via SPRC/SPRD" },
0071     };
0072 
0073     /* Validity check */
0074     if (!hmi_evt->u.xstop_error.xstop_reason) {
0075         printk("%s  Unknown Core check stop.\n", level);
0076         return;
0077     }
0078 
0079     printk("%s  CPU PIR: %08x\n", level,
0080             be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
0081     for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
0082         if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
0083                     xstop_reason[i].xstop_reason)
0084             printk("%s  [Unit: %-3s] %s\n", level,
0085                     xstop_reason[i].unit_failed,
0086                     xstop_reason[i].description);
0087 }
0088 
0089 static void print_nx_checkstop_reason(const char *level,
0090                     struct OpalHMIEvent *hmi_evt)
0091 {
0092     int i;
0093     static const struct xstop_reason xstop_reason[] = {
0094         { NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
0095                     "SHM invalid state error" },
0096         { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
0097                     "DMA invalid state error bit 15" },
0098         { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
0099                     "DMA invalid state error bit 16" },
0100         { NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
0101                     "Channel 0 invalid state error" },
0102         { NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
0103                     "Channel 1 invalid state error" },
0104         { NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
0105                     "Channel 2 invalid state error" },
0106         { NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
0107                     "Channel 3 invalid state error" },
0108         { NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
0109                     "Channel 4 invalid state error" },
0110         { NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
0111                     "Channel 5 invalid state error" },
0112         { NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
0113                     "Channel 6 invalid state error" },
0114         { NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
0115                     "Channel 7 invalid state error" },
0116         { NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
0117                     "UE error on CRB(CSB address, CCB)" },
0118         { NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
0119                     "SUE error on CRB(CSB address, CCB)" },
0120         { NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
0121         "CRB Kill ISN received while holding ISN with UE error" },
0122     };
0123 
0124     /* Validity check */
0125     if (!hmi_evt->u.xstop_error.xstop_reason) {
0126         printk("%s  Unknown NX check stop.\n", level);
0127         return;
0128     }
0129 
0130     printk("%s  NX checkstop on CHIP ID: %x\n", level,
0131             be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
0132     for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
0133         if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
0134                     xstop_reason[i].xstop_reason)
0135             printk("%s  [Unit: %-3s] %s\n", level,
0136                     xstop_reason[i].unit_failed,
0137                     xstop_reason[i].description);
0138 }
0139 
0140 static void print_npu_checkstop_reason(const char *level,
0141                     struct OpalHMIEvent *hmi_evt)
0142 {
0143     uint8_t reason, reason_count, i;
0144 
0145     /*
0146      * We may not have a checkstop reason on some combination of
0147      * hardware and/or skiboot version
0148      */
0149     if (!hmi_evt->u.xstop_error.xstop_reason) {
0150         printk("%s  NPU checkstop on chip %x\n", level,
0151             be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
0152         return;
0153     }
0154 
0155     /*
0156      * NPU2 has 3 FIRs. Reason encoded on a byte as:
0157      *   2 bits for the FIR number
0158      *   6 bits for the bit number
0159      * It may be possible to find several reasons.
0160      *
0161      * We don't display a specific message per FIR bit as there
0162      * are too many and most are meaningless without the workbook
0163      * and/or hw team help anyway.
0164      */
0165     reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) /
0166         sizeof(reason);
0167     for (i = 0; i < reason_count; i++) {
0168         reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 0xFF;
0169         if (reason)
0170             printk("%s  NPU checkstop on chip %x: FIR%d bit %d is set\n",
0171                 level,
0172                 be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id),
0173                 reason >> 6, reason & 0x3F);
0174     }
0175 }
0176 
0177 static void print_checkstop_reason(const char *level,
0178                     struct OpalHMIEvent *hmi_evt)
0179 {
0180     uint8_t type = hmi_evt->u.xstop_error.xstop_type;
0181     switch (type) {
0182     case CHECKSTOP_TYPE_CORE:
0183         print_core_checkstop_reason(level, hmi_evt);
0184         break;
0185     case CHECKSTOP_TYPE_NX:
0186         print_nx_checkstop_reason(level, hmi_evt);
0187         break;
0188     case CHECKSTOP_TYPE_NPU:
0189         print_npu_checkstop_reason(level, hmi_evt);
0190         break;
0191     default:
0192         printk("%s  Unknown Malfunction Alert of type %d\n",
0193                level, type);
0194         break;
0195     }
0196 }
0197 
0198 static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
0199 {
0200     const char *level, *sevstr, *error_info;
0201     static const char *hmi_error_types[] = {
0202         "Malfunction Alert",
0203         "Processor Recovery done",
0204         "Processor recovery occurred again",
0205         "Processor recovery occurred for masked error",
0206         "Timer facility experienced an error",
0207         "TFMR SPR is corrupted",
0208         "UPS (Uninterrupted Power System) Overflow indication",
0209         "An XSCOM operation failure",
0210         "An XSCOM operation completed",
0211         "SCOM has set a reserved FIR bit to cause recovery",
0212         "Debug trigger has set a reserved FIR bit to cause recovery",
0213         "A hypervisor resource error occurred",
0214         "CAPP recovery process is in progress",
0215     };
0216     static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
0217                       DEFAULT_RATELIMIT_BURST);
0218 
0219     /* Print things out */
0220     if (hmi_evt->version < OpalHMIEvt_V1) {
0221         pr_err("HMI Interrupt, Unknown event version %d !\n",
0222             hmi_evt->version);
0223         return;
0224     }
0225     switch (hmi_evt->severity) {
0226     case OpalHMI_SEV_NO_ERROR:
0227         level = KERN_INFO;
0228         sevstr = "Harmless";
0229         break;
0230     case OpalHMI_SEV_WARNING:
0231         level = KERN_WARNING;
0232         sevstr = "";
0233         break;
0234     case OpalHMI_SEV_ERROR_SYNC:
0235         level = KERN_ERR;
0236         sevstr = "Severe";
0237         break;
0238     case OpalHMI_SEV_FATAL:
0239     default:
0240         level = KERN_ERR;
0241         sevstr = "Fatal";
0242         break;
0243     }
0244 
0245     if (hmi_evt->severity != OpalHMI_SEV_NO_ERROR || __ratelimit(&rs)) {
0246         printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
0247             level, sevstr,
0248             hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
0249             "Recovered" : "Not recovered");
0250         error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
0251                 hmi_error_types[hmi_evt->type]
0252                 : "Unknown";
0253         printk("%s Error detail: %s\n", level, error_info);
0254         printk("%s  HMER: %016llx\n", level,
0255                     be64_to_cpu(hmi_evt->hmer));
0256         if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
0257             (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
0258             printk("%s  TFMR: %016llx\n", level,
0259                         be64_to_cpu(hmi_evt->tfmr));
0260     }
0261 
0262     if (hmi_evt->version < OpalHMIEvt_V2)
0263         return;
0264 
0265     /* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
0266     if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
0267         print_checkstop_reason(level, hmi_evt);
0268 }
0269 
0270 static void hmi_event_handler(struct work_struct *work)
0271 {
0272     unsigned long flags;
0273     struct OpalHMIEvent *hmi_evt;
0274     struct OpalHmiEvtNode *msg_node;
0275     uint8_t disposition;
0276     struct opal_msg msg;
0277     int unrecoverable = 0;
0278 
0279     spin_lock_irqsave(&opal_hmi_evt_lock, flags);
0280     while (!list_empty(&opal_hmi_evt_list)) {
0281         msg_node = list_entry(opal_hmi_evt_list.next,
0282                        struct OpalHmiEvtNode, list);
0283         list_del(&msg_node->list);
0284         spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
0285 
0286         hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
0287         print_hmi_event_info(hmi_evt);
0288         disposition = hmi_evt->disposition;
0289         kfree(msg_node);
0290 
0291         /*
0292          * Check if HMI event has been recovered or not. If not
0293          * then kernel can't continue, we need to panic.
0294          * But before we do that, display all the HMI event
0295          * available on the list and set unrecoverable flag to 1.
0296          */
0297         if (disposition != OpalHMI_DISPOSITION_RECOVERED)
0298             unrecoverable = 1;
0299 
0300         spin_lock_irqsave(&opal_hmi_evt_lock, flags);
0301     }
0302     spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
0303 
0304     if (unrecoverable) {
0305         /* Pull all HMI events from OPAL before we panic. */
0306         while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
0307             u32 type;
0308 
0309             type = be32_to_cpu(msg.msg_type);
0310 
0311             /* skip if not HMI event */
0312             if (type != OPAL_MSG_HMI_EVT)
0313                 continue;
0314 
0315             /* HMI event info starts from param[0] */
0316             hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
0317             print_hmi_event_info(hmi_evt);
0318         }
0319 
0320         pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");
0321     }
0322 }
0323 
0324 static DECLARE_WORK(hmi_event_work, hmi_event_handler);
0325 /*
0326  * opal_handle_hmi_event - notifier handler that queues up HMI events
0327  * to be preocessed later.
0328  */
0329 static int opal_handle_hmi_event(struct notifier_block *nb,
0330               unsigned long msg_type, void *msg)
0331 {
0332     unsigned long flags;
0333     struct OpalHMIEvent *hmi_evt;
0334     struct opal_msg *hmi_msg = msg;
0335     struct OpalHmiEvtNode *msg_node;
0336 
0337     /* Sanity Checks */
0338     if (msg_type != OPAL_MSG_HMI_EVT)
0339         return 0;
0340 
0341     /* HMI event info starts from param[0] */
0342     hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
0343 
0344     /* Delay the logging of HMI events to workqueue. */
0345     msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
0346     if (!msg_node) {
0347         pr_err("HMI: out of memory, Opal message event not handled\n");
0348         return -ENOMEM;
0349     }
0350     memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(*hmi_evt));
0351 
0352     spin_lock_irqsave(&opal_hmi_evt_lock, flags);
0353     list_add(&msg_node->list, &opal_hmi_evt_list);
0354     spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
0355 
0356     schedule_work(&hmi_event_work);
0357     return 0;
0358 }
0359 
0360 static struct notifier_block opal_hmi_handler_nb = {
0361     .notifier_call  = opal_handle_hmi_event,
0362     .next       = NULL,
0363     .priority   = 0,
0364 };
0365 
0366 int __init opal_hmi_handler_init(void)
0367 {
0368     int ret;
0369 
0370     if (!opal_hmi_handler_nb_init) {
0371         ret = opal_message_notifier_register(
0372                 OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
0373         if (ret) {
0374             pr_err("%s: Can't register OPAL event notifier (%d)\n",
0375                    __func__, ret);
0376             return ret;
0377         }
0378         opal_hmi_handler_nb_init = 1;
0379     }
0380     return 0;
0381 }