Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 #undef TRACE_SYSTEM /* discard any earlier definition before it is redefined on the next line */
0003 #define TRACE_SYSTEM ras /* presumably the trace system (group) name for the events below -- confirm against trace/define_trace.h */
0004 #define TRACE_INCLUDE_FILE ras_event /* presumably the header name trace/define_trace.h re-includes -- confirm */
0005 
0006 #if !defined(_TRACE_HW_EVENT_MC_H) || defined(TRACE_HEADER_MULTI_READ)
0007 #define _TRACE_HW_EVENT_MC_H
0008 
0009 #include <linux/tracepoint.h>
0010 #include <linux/edac.h>
0011 #include <linux/ktime.h>
0012 #include <linux/pci.h>
0013 #include <linux/aer.h>
0014 #include <linux/cper.h>
0015 #include <linux/mm.h>
0016 
0017 /*
0018  * MCE Extended Error Log trace event
0019  *
0020  * These events are generated when hardware detects a corrected or
0021  * uncorrected event.
0022  */
0023 
0024 /* memory trace event
      *
      * extlog_mem_event records one CPER memory error section (mem) together
      * with the error sequence number, FRU id/text and severity passed in by
      * the caller. Optional CPER values are copied only when the matching
      * CPER_MEM_VALID_* bit is set in mem->validation_bits; otherwise the
      * field is filled with all ones. The event is only defined when
      * CONFIG_ACPI_EXTLOG or CONFIG_ACPI_EXTLOG_MODULE is defined.
      */
0025 
0026 #if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE)
0027 TRACE_EVENT(extlog_mem_event,
0028     TP_PROTO(struct cper_sec_mem_err *mem,
0029          u32 err_seq,
0030          const guid_t *fru_id,
0031          const char *fru_text,
0032          u8 sev),
0033 
0034     TP_ARGS(mem, err_seq, fru_id, fru_text, sev),
0035 
0036     TP_STRUCT__entry(
0037         __field(u32, err_seq)
0038         __field(u8, etype) /* mem->error_type, or 0xff when not flagged valid */
0039         __field(u8, sev)
0040         __field(u64, pa) /* mem->physical_addr, or ~0ull when not flagged valid */
0041         __field(u8, pa_mask_lsb) /* __ffs64() of the address mask, or 0xff when not flagged valid */
0042         __field_struct(guid_t, fru_id) /* stored by value (copied from *fru_id) */
0043         __string(fru_text, fru_text)
0044         __field_struct(struct cper_mem_err_compact, data) /* filled by cper_mem_err_pack() */
0045     ),
0046 
0047     TP_fast_assign(
0048         __entry->err_seq = err_seq;
0049         if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE)
0050             __entry->etype = mem->error_type;
0051         else
0052             __entry->etype = ~0;
0053         __entry->sev = sev;
0054         if (mem->validation_bits & CPER_MEM_VALID_PA)
0055             __entry->pa = mem->physical_addr;
0056         else
0057             __entry->pa = ~0ull;
0058 
             /*
              * NOTE(review): a zero physical_addr_mask with the valid bit set
              * would be handed to __ffs64() -- confirm that case cannot occur
              * or is well defined.
              */
0059         if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
0060             __entry->pa_mask_lsb = (u8)__ffs64(mem->physical_addr_mask);
0061         else
0062             __entry->pa_mask_lsb = ~0;
0063         __entry->fru_id = *fru_id;
0064         __assign_str(fru_text, fru_text);
0065         cper_mem_err_pack(mem, &__entry->data);
0066     ),
0067 
         /* fru_text is limited to 20 characters on output by the %.20s conversion. */
0068     TP_printk("{%d} %s error: %s physical addr: %016llx (mask lsb: %x) %sFRU: %pUl %.20s",
0069           __entry->err_seq,
0070           cper_severity_str(__entry->sev),
0071           cper_mem_err_type_str(__entry->etype),
0072           __entry->pa,
0073           __entry->pa_mask_lsb,
0074           cper_mem_err_unpack(p, &__entry->data), /* p is not declared in this file; presumably supplied by the TRACE_EVENT print machinery -- confirm */
0075           &__entry->fru_id,
0076           __get_str(fru_text))
0077 );
0078 #endif
0079 
0080 /*
0081  * Hardware Events Report
0082  *
0083  * These events are generated when hardware detects a corrected or
0084  * uncorrected event, and are meant to replace the current API to report
0085  * errors defined on both EDAC and MCE subsystems.
0086  *
0087  * FIXME: Add events for handling memory errors originated from the
0088  *        MCE subsystem.
0089  */
0090 
0091 /*
0092  * Hardware-independent Memory Controller specific events
0093  */
0094 
0095 /*
0096  * Default error mechanisms for Memory Controller errors (CE and UE)
      *
      * mc_event stores the caller-supplied description of a memory controller
      * error: error type, free-form message, label, error count, controller
      * index, a three-level location (top/middle/lower layer), address, grain
      * (as a bit count), syndrome and a driver-specific detail string.
      *
      * On output the word "error" is pluralised when error_count > 1, and a
      * single space is emitted in front of msg and driver_detail only when the
      * respective string is non-empty.
0097  */
0098 TRACE_EVENT(mc_event,
0099 
0100     TP_PROTO(const unsigned int err_type,
0101          const char *error_msg,
0102          const char *label,
0103          const int error_count,
0104          const u8 mc_index,
0105          const s8 top_layer,
0106          const s8 mid_layer,
0107          const s8 low_layer,
0108          unsigned long address,
0109          const u8 grain_bits,
0110          unsigned long syndrome,
0111          const char *driver_detail),
0112 
0113     TP_ARGS(err_type, error_msg, label, error_count, mc_index,
0114         top_layer, mid_layer, low_layer, address, grain_bits,
0115         syndrome, driver_detail),
0116 
0117     TP_STRUCT__entry(
0118         __field(    unsigned int,   error_type      )
0119         __string(   msg,        error_msg       )
0120         __string(   label,      label           )
0121         __field(    u16,        error_count     ) /* narrower than the int argument it is assigned from */
0122         __field(    u8,     mc_index        )
0123         __field(    s8,     top_layer       )
0124         __field(    s8,     middle_layer        )
0125         __field(    s8,     lower_layer     )
0126         __field(    long,       address         ) /* signed here, unsigned long in TP_PROTO */
0127         __field(    u8,     grain_bits      )
0128         __field(    long,       syndrome        ) /* signed here, unsigned long in TP_PROTO */
0129         __string(   driver_detail,  driver_detail       )
0130     ),
0131 
0132     TP_fast_assign(
0133         __entry->error_type     = err_type;
0134         __assign_str(msg, error_msg);
0135         __assign_str(label, label);
0136         __entry->error_count        = error_count;
0137         __entry->mc_index       = mc_index;
0138         __entry->top_layer      = top_layer;
0139         __entry->middle_layer       = mid_layer;
0140         __entry->lower_layer        = low_layer;
0141         __entry->address        = address;
0142         __entry->grain_bits     = grain_bits;
0143         __entry->syndrome       = syndrome;
0144         __assign_str(driver_detail, driver_detail);
0145     ),
0146 
0147     TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)",
0148           __entry->error_count,
0149           mc_event_error_type(__entry->error_type), /* not defined in this file; presumably from <linux/edac.h> -- verify */
0150           __entry->error_count > 1 ? "s" : "",
0151           __get_str(msg)[0] ? " " : "",
0152           __get_str(msg),
0153           __get_str(label),
0154           __entry->mc_index,
0155           __entry->top_layer,
0156           __entry->middle_layer,
0157           __entry->lower_layer,
0158           __entry->address,
0159           1 << __entry->grain_bits, /* NOTE(review): int shift; grain_bits >= 31 would overflow -- confirm callers bound it */
0160           __entry->syndrome,
0161           __get_str(driver_detail)[0] ? " " : "",
0162           __get_str(driver_detail))
0163 );
0164 
0165 /*
0166  * ARM Processor Events Report
0167  *
0168  * This event is generated when hardware detects an ARM processor error
0169  * has occurred. UEFI 2.6 spec section N.2.4.4.
      *
      * Values are taken from proc only when the matching CPER_ARM_VALID_* bit
      * is set in proc->validation_bits:
      *   - affinity falls back to all ones,
      *   - mpidr falls back to 0 (not all ones),
      *   - running_state and psci_state are both gated by
      *     CPER_ARM_VALID_RUNNING_STATE and fall back to all ones.
      * midr is copied unconditionally.
0170  */
0171 TRACE_EVENT(arm_event,
0172 
0173     TP_PROTO(const struct cper_sec_proc_arm *proc),
0174 
0175     TP_ARGS(proc),
0176 
0177     TP_STRUCT__entry(
0178         __field(u64, mpidr)
0179         __field(u64, midr)
0180         __field(u32, running_state)
0181         __field(u32, psci_state)
0182         __field(u8, affinity)
0183     ),
0184 
0185     TP_fast_assign(
0186         if (proc->validation_bits & CPER_ARM_VALID_AFFINITY_LEVEL)
0187             __entry->affinity = proc->affinity_level;
0188         else
0189             __entry->affinity = ~0;
0190         if (proc->validation_bits & CPER_ARM_VALID_MPIDR)
0191             __entry->mpidr = proc->mpidr;
0192         else
0193             __entry->mpidr = 0ULL;
0194         __entry->midr = proc->midr; /* no validation bit is checked for midr */
0195         if (proc->validation_bits & CPER_ARM_VALID_RUNNING_STATE) {
0196             __entry->running_state = proc->running_state;
0197             __entry->psci_state = proc->psci_state;
0198         } else {
0199             __entry->running_state = ~0;
0200             __entry->psci_state = ~0;
0201         }
0202     ),
0203 
0204     TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
0205           "running state: %d; PSCI state: %d",
0206           __entry->affinity, __entry->mpidr, __entry->midr,
0207           __entry->running_state, __entry->psci_state)
0208 );
0209 
0210 /*
0211  * Non-Standard Section Report
0212  *
0213  * This event is generated when hardware detected a hardware
0214  * error event, which may be of non-standard section as defined
0215  * in UEFI spec appendix "Common Platform Error Record", or may
0216  * be of sections for which TRACE_EVENT is not defined.
0217  *
      * The section type and FRU id are stored as raw UUID_SIZE-byte copies,
      * and the section payload (err, len bytes) is copied verbatim into a
      * dynamically sized array that is printed as hex.
0218  */
0219 TRACE_EVENT(non_standard_event,
0220 
0221     TP_PROTO(const guid_t *sec_type,
0222          const guid_t *fru_id,
0223          const char *fru_text,
0224          const u8 sev,
0225          const u8 *err,
0226          const u32 len),
0227 
0228     TP_ARGS(sec_type, fru_id, fru_text, sev, err, len),
0229 
0230     TP_STRUCT__entry(
0231         __array(char, sec_type, UUID_SIZE)
0232         __array(char, fru_id, UUID_SIZE)
0233         __string(fru_text, fru_text)
0234         __field(u8, sev)
0235         __field(u32, len) /* number of payload bytes held in buf */
0236         __dynamic_array(u8, buf, len) /* sized per event from the len argument */
0237     ),
0238 
0239     TP_fast_assign(
0240         memcpy(__entry->sec_type, sec_type, UUID_SIZE);
0241         memcpy(__entry->fru_id, fru_id, UUID_SIZE);
0242         __assign_str(fru_text, fru_text);
0243         __entry->sev = sev;
0244         __entry->len = len;
0245         memcpy(__get_dynamic_array(buf), err, len); /* assumes err points to at least len readable bytes -- caller's responsibility */
0246     ),
0247 
0248     TP_printk("severity: %d; sec type:%pU; FRU: %pU %s; data len:%d; raw data:%s",
0249           __entry->sev, __entry->sec_type,
0250           __entry->fru_id, __get_str(fru_text),
0251           __entry->len,
0252           __print_hex(__get_dynamic_array(buf), __entry->len))
0253 );
0254 
0255 /*
0256  * PCIe AER Trace event
0257  *
0258  * These events are generated when hardware detects a corrected or
0259  * uncorrected event on a PCIe device. The event report has
0260  * the following structure:
0261  *
0262  * char * dev_name -    The name of the slot where the device resides
0263  *          ([domain:]bus:device.function).
0264  * u32 status -     Either the correctable or uncorrectable register
0265  *          indicating what error or errors have been seen
0266  * u8 severity -    error severity 0:NONFATAL 1:FATAL 2:CORRECTED
      * u8 tlp_header_valid - Non-zero when tlp_header holds the TLP header log
      * u32 tlp_header[4] -  The four TLP header log dwords (dw0..dw3); printed
      *          as "Not available" when tlp_header_valid is zero
0267  */
0268 
0269 #define aer_correctable_errors /* {status bit, name} pairs passed to __print_flags() for correctable errors */ \
0270     {PCI_ERR_COR_RCVR,  "Receiver Error"},      \
0271     {PCI_ERR_COR_BAD_TLP,   "Bad TLP"},         \
0272     {PCI_ERR_COR_BAD_DLLP,  "Bad DLLP"},            \
0273     {PCI_ERR_COR_REP_ROLL,  "REPLAY_NUM Rollover"}, /* PCIe term is REPLAY_NUM; was misspelled "RELAY_NUM" */ \
0274     {PCI_ERR_COR_REP_TIMER, "Replay Timer Timeout"},    \
0275     {PCI_ERR_COR_ADV_NFAT,  "Advisory Non-Fatal Error"},    \
0276     {PCI_ERR_COR_INTERNAL,  "Corrected Internal Error"},    \
0277     {PCI_ERR_COR_LOG_OVER,  "Header Log Overflow"}
0278 
0279 #define aer_uncorrectable_errors /* {status bit, name} pairs passed to __print_flags() for uncorrectable errors */ \
0280     {PCI_ERR_UNC_UND,   "Undefined"},           \
0281     {PCI_ERR_UNC_DLP,   "Data Link Protocol Error"},    \
0282     {PCI_ERR_UNC_SURPDN,    "Surprise Down Error"},     \
0283     {PCI_ERR_UNC_POISON_TLP,"Poisoned TLP"},        \
0284     {PCI_ERR_UNC_FCP,   "Flow Control Protocol Error"}, \
0285     {PCI_ERR_UNC_COMP_TIME, "Completion Timeout"},      \
0286     {PCI_ERR_UNC_COMP_ABORT,"Completer Abort"},     \
0287     {PCI_ERR_UNC_UNX_COMP,  "Unexpected Completion"},   \
0288     {PCI_ERR_UNC_RX_OVER,   "Receiver Overflow"},       \
0289     {PCI_ERR_UNC_MALF_TLP,  "Malformed TLP"},       \
0290     {PCI_ERR_UNC_ECRC,  "ECRC Error"},          \
0291     {PCI_ERR_UNC_UNSUP, "Unsupported Request Error"},   \
0292     {PCI_ERR_UNC_ACSV,  "ACS Violation"},       \
0293     {PCI_ERR_UNC_INTN,  "Uncorrectable Internal Error"},\
0294     {PCI_ERR_UNC_MCBTLP,    "MC Blocked TLP"},      \
0295     {PCI_ERR_UNC_ATOMEG,    "AtomicOp Egress Blocked"}, \
0296     {PCI_ERR_UNC_TLPPRE,    "TLP Prefix Blocked Error"}
0297 
0298 TRACE_EVENT(aer_event,
0299     TP_PROTO(const char *dev_name,
0300          const u32 status,
0301          const u8 severity,
0302          const u8 tlp_header_valid,
0303          struct aer_header_log_regs *tlp),
0304 
0305     TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp),
0306 
0307     TP_STRUCT__entry(
0308         __string(   dev_name,   dev_name    )
0309         __field(    u32,        status      )
0310         __field(    u8,     severity    )
0311         __field(    u8,         tlp_header_valid)
0312         __array(    u32,        tlp_header, 4   )
0313     ),
0314 
0315     TP_fast_assign(
0316         __assign_str(dev_name, dev_name);
0317         __entry->status     = status;
0318         __entry->severity   = severity;
0319         __entry->tlp_header_valid = tlp_header_valid;
0320         if (tlp_header_valid) {
0321             __entry->tlp_header[0] = tlp->dw0;
0322             __entry->tlp_header[1] = tlp->dw1;
0323             __entry->tlp_header[2] = tlp->dw2;
0324             __entry->tlp_header[3] = tlp->dw3;
0325         }
0326     ),
0327 
0328     TP_printk("%s PCIe Bus Error: severity=%s, %s, TLP Header=%s\n",
0329         __get_str(dev_name),
0330         __entry->severity == AER_CORRECTABLE ? "Corrected" :
0331             __entry->severity == AER_FATAL ?
0332             "Fatal" : "Uncorrected, non-fatal",
0333         __entry->severity == AER_CORRECTABLE ?
0334         __print_flags(__entry->status, "|", aer_correctable_errors) :
0335         __print_flags(__entry->status, "|", aer_uncorrectable_errors),
0336         __entry->tlp_header_valid ?
0337             __print_array(__entry->tlp_header, 4, 4) :
0338             "Not available")
0339 );
0340 
0341 /*
0342  * memory-failure recovery action result event
0343  *
0344  * unsigned long pfn -  Page Frame Number of the corrupted page
0345  * int type -   Page types of the corrupted page
0346  * int result   -   Result of recovery action
0347  */
0348 
0349 #ifdef CONFIG_MEMORY_FAILURE
0350 #define MF_ACTION_RESULT /* (enum value, display string) list; expands through whatever EM()/EMe() are defined as at the point of use, EMe() marking the last entry */ \
0351     EM ( MF_IGNORED, "Ignored" )    \
0352     EM ( MF_FAILED,  "Failed" ) \
0353     EM ( MF_DELAYED, "Delayed" )    \
0354     EMe ( MF_RECOVERED, "Recovered" )
0355 
0356 #define MF_PAGE_TYPE /* (enum value, display string) list of page types; expands through EM()/EMe() like MF_ACTION_RESULT, EMe() marking the last entry */ \
0357     EM ( MF_MSG_KERNEL, "reserved kernel page" )            \
0358     EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" )   \
0359     EM ( MF_MSG_SLAB, "kernel slab page" )              \
0360     EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \
0361     EM ( MF_MSG_HUGE, "huge page" )                 \
0362     EM ( MF_MSG_FREE_HUGE, "free huge page" )           \
0363     EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" )     \
0364     EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" )       \
0365     EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" )       \
0366     EM ( MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page" )   \
0367     EM ( MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page" )   \
0368     EM ( MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page" )   \
0369     EM ( MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page" )   \
0370     EM ( MF_MSG_DIRTY_LRU, "dirty LRU page" )           \
0371     EM ( MF_MSG_CLEAN_LRU, "clean LRU page" )           \
0372     EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" )   \
0373     EM ( MF_MSG_BUDDY, "free buddy page" )              \
0374     EM ( MF_MSG_DAX, "dax page" )                   \
0375     EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" )            \
0376     EMe ( MF_MSG_UNKNOWN, "unknown page" )
0377 
0378 /*
0379  * First define the enums in MF_ACTION_RESULT and MF_PAGE_TYPE to be
0380  * exported to userspace via TRACE_DEFINE_ENUM().
0381  */
0382 #undef EM /* clear any earlier EM()/EMe() so they can be defined for the enum-export pass */
0383 #undef EMe
0384 #define EM(a, b) TRACE_DEFINE_ENUM(a); /* pass 1: only the enum value is used, the string is ignored */
0385 #define EMe(a, b)   TRACE_DEFINE_ENUM(a); /* same expansion as EM() in this pass */
0386 
0387 MF_ACTION_RESULT /* expands to one TRACE_DEFINE_ENUM() per listed result value */
0388 MF_PAGE_TYPE /* expands to one TRACE_DEFINE_ENUM() per listed page type */
0389 
0390 /*
0391  * Now redefine the EM() and EMe() macros to map the enums to the strings
0392  * that will be printed in the output.
0393  */
0394 #undef EM
0395 #undef EMe
0396 #define EM(a, b)        { a, b }, /* pass 2: table entry followed by a comma */
0397 #define EMe(a, b)   { a, b } /* last table entry: no trailing comma */
0398 
0399 TRACE_EVENT(memory_failure_event,
         /*
          * Records the outcome of a memory-failure recovery action.
          *
          * pfn    - page frame number, printed in hex with a 0x prefix (%#lx)
          * type   - page type; printed via __print_symbolic() using the
          *          MF_PAGE_TYPE table
          * result - recovery result; printed via __print_symbolic() using the
          *          MF_ACTION_RESULT table
          *
          * Both tables expand here through the EM()/EMe() definitions that
          * produce { value, string } entries.
          */
0400     TP_PROTO(unsigned long pfn,
0401          int type,
0402          int result),
0403 
0404     TP_ARGS(pfn, type, result),
0405 
0406     TP_STRUCT__entry(
0407         __field(unsigned long, pfn)
0408         __field(int, type)
0409         __field(int, result)
0410     ),
0411 
0412     TP_fast_assign(
0413         __entry->pfn    = pfn;
0414         __entry->type   = type;
0415         __entry->result = result;
0416     ),
0417 
0418     TP_printk("pfn %#lx: recovery action for %s: %s",
0419         __entry->pfn,
0420         __print_symbolic(__entry->type, MF_PAGE_TYPE),
0421         __print_symbolic(__entry->result, MF_ACTION_RESULT)
0422     )
0423 );
0424 #endif /* CONFIG_MEMORY_FAILURE */
0425 #endif /* _TRACE_HW_EVENT_MC_H */
0426 
0427 /* This part must be outside protection */
0428 #include <trace/define_trace.h>