0001
0002
0003
0004
0005
0006
0007
0008
0009
0010 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
0011
0012 #include <acpi/ghes.h>
0013 #include <linux/edac.h>
0014 #include <linux/dmi.h>
0015 #include "edac_module.h"
0016 #include <ras/ras_event.h>
0017
0018 #define OTHER_DETAIL_LEN 400
0019
struct ghes_pvt {
	struct mem_ctl_info *mci;

	/* Buffers for the error handling routine */
	char other_detail[OTHER_DETAIL_LEN];
	char msg[80];
};
0027
static refcount_t ghes_refcount = REFCOUNT_INIT(0);

/*
 * Pointer to the driver's private data; set on register, cleared on
 * unregister. Accesses must be protected by ghes_lock — the IRQ-time
 * error reporter reads it under that lock.
 */
static struct ghes_pvt *ghes_pvt;

/*
 * DIMM inventory collected from the SMBIOS/DMI tables by
 * ghes_scan_system(); the dimms array is grown on demand and freed
 * once the data has been copied into the EDAC mem_ctl_info.
 */
static struct ghes_hw_desc {
	int num_dimms;
	struct dimm_info *dimms;
} ghes_hw;

/* GHES registration mutex: serializes register/unregister instances */
static DEFINE_MUTEX(ghes_reg_mutex);

/*
 * Sync with other, potentially concurrent callers of
 * ghes_edac_report_mem_error(), and with registration/unregistration
 * of ghes_pvt.
 */
static DEFINE_SPINLOCK(ghes_lock);

/* "ghes_edac.force_load=1" skips the platform allow-list check */
static bool __read_mostly force_load;
module_param(force_load, bool, 0);

/* true once ghes_scan_system() has walked the DMI tables */
static bool system_scanned;
0062
/* Memory Device — SMBIOS structure type 17; must mirror the raw table layout */
struct memdev_dmi_entry {
	u8 type;
	u8 length;
	u16 handle;
	u16 phys_mem_array_handle;
	u16 mem_err_info_handle;
	u16 total_width;
	u16 data_width;
	u16 size;
	u8 form_factor;
	u8 device_set;
	u8 device_locator;
	u8 bank_locator;
	u8 memory_type;
	u16 type_detail;
	u16 speed;
	u8 manufacturer;
	u8 serial_number;
	u8 asset_tag;
	u8 part_number;
	u8 attributes;
	u32 extended_size;
	u16 conf_mem_clk_speed;
} __attribute__((__packed__));
0087
0088 static struct dimm_info *find_dimm_by_handle(struct mem_ctl_info *mci, u16 handle)
0089 {
0090 struct dimm_info *dimm;
0091
0092 mci_for_each_dimm(mci, dimm) {
0093 if (dimm->smbios_handle == handle)
0094 return dimm;
0095 }
0096
0097 return NULL;
0098 }
0099
0100 static void dimm_setup_label(struct dimm_info *dimm, u16 handle)
0101 {
0102 const char *bank = NULL, *device = NULL;
0103
0104 dmi_memdev_name(handle, &bank, &device);
0105
0106
0107
0108
0109
0110 snprintf(dimm->label, sizeof(dimm->label), "%s%s%s",
0111 (bank && *bank) ? bank : "",
0112 (bank && *bank && device && *device) ? " " : "",
0113 (device && *device) ? device : "");
0114 }
0115
/*
 * Populate @dimm (size, memory type, ECC mode, label, SMBIOS handle)
 * from an SMBIOS type 17 Memory Device @entry.
 */
static void assign_dmi_dimm_info(struct dimm_info *dimm, struct memdev_dmi_entry *entry)
{
	/* BIT(7) "synchronous" + BIT(13) "registered" => registered SDRAM */
	u16 rdr_mask = BIT(7) | BIT(13);

	if (entry->size == 0xffff) {
		/* 0xffff: size unknown — assume a 32 MB module */
		pr_info("Can't get DIMM%i size\n", dimm->idx);
		dimm->nr_pages = MiB_TO_PAGES(32);
	} else if (entry->size == 0x7fff) {
		/* 0x7fff: actual size (in MB) lives in extended_size */
		dimm->nr_pages = MiB_TO_PAGES(entry->extended_size);
	} else {
		/*
		 * BIT(15) set means the size field is in KiB units.
		 * NOTE(review): the "<< 10" scales the KiB value *up* by
		 * 1024 before MiB_TO_PAGES; dividing (">> 10") would be the
		 * KiB->MiB conversion — verify against SMBIOS spec 7.18.5
		 * before relying on sizes from this path.
		 */
		if (entry->size & BIT(15))
			dimm->nr_pages = MiB_TO_PAGES((entry->size & 0x7fff) << 10);
		else
			dimm->nr_pages = MiB_TO_PAGES(entry->size);
	}

	/*
	 * Map the SMBIOS memory_type code to an EDAC mem_type;
	 * type_detail BIT(13) = registered, BIT(12) = non-volatile.
	 */
	switch (entry->memory_type) {
	case 0x12:	/* DDR */
		if (entry->type_detail & BIT(13))
			dimm->mtype = MEM_RDDR;
		else
			dimm->mtype = MEM_DDR;
		break;
	case 0x13:	/* DDR2 */
		if (entry->type_detail & BIT(13))
			dimm->mtype = MEM_RDDR2;
		else
			dimm->mtype = MEM_DDR2;
		break;
	case 0x14:	/* FB-DIMM */
		dimm->mtype = MEM_FB_DDR2;
		break;
	case 0x18:	/* DDR3 */
		if (entry->type_detail & BIT(12))
			dimm->mtype = MEM_NVDIMM;
		else if (entry->type_detail & BIT(13))
			dimm->mtype = MEM_RDDR3;
		else
			dimm->mtype = MEM_DDR3;
		break;
	case 0x1a:	/* DDR4 */
		if (entry->type_detail & BIT(12))
			dimm->mtype = MEM_NVDIMM;
		else if (entry->type_detail & BIT(13))
			dimm->mtype = MEM_RDDR4;
		else
			dimm->mtype = MEM_DDR4;
		break;
	default:
		/* Fall back to type_detail bits alone */
		if (entry->type_detail & BIT(6))
			dimm->mtype = MEM_RMBS;
		else if ((entry->type_detail & rdr_mask) == rdr_mask)
			dimm->mtype = MEM_RDR;
		else if (entry->type_detail & BIT(7))
			dimm->mtype = MEM_SDR;
		else if (entry->type_detail & BIT(9))
			dimm->mtype = MEM_EDO;
		else
			dimm->mtype = MEM_UNKNOWN;
	}

	/*
	 * Extra width beyond the data width implies check bits are
	 * present; treat that as SECDED ECC, otherwise no ECC.
	 */
	if (entry->total_width == entry->data_width)
		dimm->edac_mode = EDAC_NONE;
	else
		dimm->edac_mode = EDAC_SECDED;

	dimm->dtype = DEV_UNKNOWN;
	dimm->grain = 128;		/* Likely, worst case */

	dimm_setup_label(dimm, entry->handle);

	if (dimm->nr_pages) {
		edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
			 dimm->idx, edac_mem_types[dimm->mtype],
			 PAGES_TO_MiB(dimm->nr_pages),
			 (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : "");
		edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n",
			 entry->memory_type, entry->type_detail,
			 entry->total_width, entry->data_width);
	}

	/* Remember the handle so error reports can be matched back to us */
	dimm->smbios_handle = entry->handle;
}
0203
0204 static void enumerate_dimms(const struct dmi_header *dh, void *arg)
0205 {
0206 struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
0207 struct ghes_hw_desc *hw = (struct ghes_hw_desc *)arg;
0208 struct dimm_info *d;
0209
0210 if (dh->type != DMI_ENTRY_MEM_DEVICE)
0211 return;
0212
0213
0214 if (!hw->num_dimms || !(hw->num_dimms % 16)) {
0215 struct dimm_info *new;
0216
0217 new = krealloc_array(hw->dimms, hw->num_dimms + 16,
0218 sizeof(struct dimm_info), GFP_KERNEL);
0219 if (!new) {
0220 WARN_ON_ONCE(1);
0221 return;
0222 }
0223
0224 hw->dimms = new;
0225 }
0226
0227 d = &hw->dimms[hw->num_dimms];
0228 d->idx = hw->num_dimms;
0229
0230 assign_dmi_dimm_info(d, entry);
0231
0232 hw->num_dimms++;
0233 }
0234
0235 static void ghes_scan_system(void)
0236 {
0237 if (system_scanned)
0238 return;
0239
0240 dmi_walk(enumerate_dimms, &ghes_hw);
0241
0242 system_scanned = true;
0243 }
0244
0245 static int print_mem_error_other_detail(const struct cper_sec_mem_err *mem, char *msg,
0246 const char *location, unsigned int len)
0247 {
0248 u32 n;
0249
0250 if (!msg)
0251 return 0;
0252
0253 n = 0;
0254 len -= 1;
0255
0256 n += scnprintf(msg + n, len - n, "APEI location: %s ", location);
0257
0258 if (!(mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS))
0259 goto out;
0260
0261 n += scnprintf(msg + n, len - n, "status(0x%016llx): ", mem->error_status);
0262 n += scnprintf(msg + n, len - n, "%s ", cper_mem_err_status_str(mem->error_status));
0263
0264 out:
0265 msg[n] = '\0';
0266
0267 return n;
0268 }
0269
/*
 * Translate a CPER memory error record delivered by GHES into an EDAC
 * error report: map severity, address, grain, location and DIMM label
 * into mci->error_desc and hand it to edac_raw_mc_handle_error().
 * Runs in IRQ context under ghes_lock; a no-op until registration has
 * published ghes_pvt.
 */
void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
{
	struct cper_mem_err_compact cmem;
	struct edac_raw_error_desc *e;
	struct mem_ctl_info *mci;
	struct ghes_pvt *pvt;
	unsigned long flags;
	char *p;

	/*
	 * The spinlock below must not be taken from NMI context; GHES is
	 * expected to have deferred processing to IRQ context by now.
	 */
	if (WARN_ON_ONCE(in_nmi()))
		return;

	spin_lock_irqsave(&ghes_lock, flags);

	pvt = ghes_pvt;
	if (!pvt)
		goto unlock;

	mci = pvt->mci;
	e = &mci->error_desc;

	/* Clean the error report buffer and set the invariant fields */
	memset(e, 0, sizeof (*e));
	e->error_count = 1;
	e->grain = 1;
	e->msg = pvt->msg;
	e->other_detail = pvt->other_detail;
	e->top_layer = -1;
	e->mid_layer = -1;
	e->low_layer = -1;
	*pvt->other_detail = '\0';
	*pvt->msg = '\0';

	/* Map the GHES severity to an EDAC event type */
	switch (sev) {
	case GHES_SEV_CORRECTED:
		e->type = HW_EVENT_ERR_CORRECTED;
		break;
	case GHES_SEV_RECOVERABLE:
		e->type = HW_EVENT_ERR_UNCORRECTED;
		break;
	case GHES_SEV_PANIC:
		e->type = HW_EVENT_ERR_FATAL;
		break;
	default:
	case GHES_SEV_NO:
		e->type = HW_EVENT_ERR_INFO;
	}

	edac_dbg(1, "error validation_bits: 0x%08llx\n",
		 (long long)mem_err->validation_bits);

	/* Error type, mapped on e->msg */
	if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
		u8 etype = mem_err->error_type;

		p = pvt->msg;
		p += snprintf(p, sizeof(pvt->msg), "%s", cper_mem_err_type_str(etype));
	} else {
		strcpy(pvt->msg, "unknown error");
	}

	/* Error address */
	if (mem_err->validation_bits & CPER_MEM_VALID_PA) {
		e->page_frame_number = PHYS_PFN(mem_err->physical_addr);
		e->offset_in_page = offset_in_page(mem_err->physical_addr);
	}

	/* Error grain: granularity implied by the physical address mask */
	if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
		e->grain = ~mem_err->physical_addr_mask + 1;

	/* Memory error location, mapped on e->location */
	p = e->location;
	cper_mem_err_pack(mem_err, &cmem);
	p += cper_mem_err_location(&cmem, p);

	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
		struct dimm_info *dimm;

		p += cper_dimm_err_location(&cmem, p);
		/* Resolve the SMBIOS handle to the matching EDAC DIMM */
		dimm = find_dimm_by_handle(mci, mem_err->mem_dev_handle);
		if (dimm) {
			e->top_layer = dimm->idx;
			strcpy(e->label, dimm->label);
		}
	}
	/* Drop the trailing space left by the cper_*_location() helpers */
	if (p > e->location)
		*(p - 1) = '\0';

	if (!*e->label)
		strcpy(e->label, "unknown memory");

	/* All other fields are mapped on e->other_detail */
	p = pvt->other_detail;
	p += print_mem_error_other_detail(mem_err, p, e->location, OTHER_DETAIL_LEN);
	if (p > pvt->other_detail)
		*(p - 1) = '\0';

	edac_raw_mc_handle_error(e);

unlock:
	spin_unlock_irqrestore(&ghes_lock, flags);
}
0378
0379
0380
0381
/* Platforms known to work with this driver; others need force_load */
static struct acpi_platform_list plat_list[] = {
	{"HPE ", "Server ", 0, ACPI_SIG_FADT, all_versions},
	{ } /* End */
};
0386
/*
 * Register the single GHES-backed EDAC memory controller for @dev.
 * First caller builds the DIMM inventory from DMI and allocates the
 * mem_ctl_info; subsequent callers only bump ghes_refcount. Returns 0
 * on success, -ENODEV when the platform isn't allow-listed (and
 * force_load is off) or EDAC registration fails, -ENOMEM on
 * allocation failure.
 */
int ghes_edac_register(struct ghes *ghes, struct device *dev)
{
	bool fake = false;
	struct mem_ctl_info *mci;
	struct ghes_pvt *pvt;
	struct edac_mc_layer layers[1];
	unsigned long flags;
	int idx = -1;
	int rc = 0;

	if (IS_ENABLED(CONFIG_X86)) {
		/* Check if safe to enable on this system */
		idx = acpi_match_platform_list(plat_list);
		if (!force_load && idx < 0)
			return -ENODEV;
	} else {
		/* Non-x86: always enabled; fake force_load so unregister runs */
		force_load = true;
		idx = 0;
	}

	/* Finish another registration/unregistration instance first */
	mutex_lock(&ghes_reg_mutex);

	/*
	 * There is only one logical memory controller; if it already
	 * exists, just take a reference on it.
	 */
	if (refcount_inc_not_zero(&ghes_refcount))
		goto unlock;

	ghes_scan_system();

	/* No DIMMs enumerated: BIOS gave us nothing — fake one DIMM */
	if (!ghes_hw.num_dimms) {
		fake = true;
		ghes_hw.num_dimms = 1;
	}

	/* Single flat layer holding all DIMMs */
	layers[0].type = EDAC_MC_LAYER_ALL_MEM;
	layers[0].size = ghes_hw.num_dimms;
	layers[0].is_virt_csrow = true;

	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(struct ghes_pvt));
	if (!mci) {
		pr_info("Can't allocate memory for EDAC data\n");
		rc = -ENOMEM;
		goto unlock;
	}

	pvt = mci->pvt_info;
	pvt->mci = mci;

	mci->pdev = dev;
	mci->mtype_cap = MEM_FLAG_EMPTY;
	mci->edac_ctl_cap = EDAC_FLAG_NONE;
	mci->edac_cap = EDAC_FLAG_NONE;
	mci->mod_name = "ghes_edac.c";
	mci->ctl_name = "ghes_edac";
	mci->dev_name = "ghes";

	if (fake) {
		pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n");
		pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n");
		pr_info("work on such system. Use this driver with caution\n");
	} else if (idx < 0) {
		/* force_load on a non-allow-listed platform: warn loudly */
		pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n");
		pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n");
		pr_info("So, the end result of using this driver varies from vendor to vendor.\n");
		pr_info("If you find incorrect reports, please contact your hardware vendor\n");
		pr_info("to correct its BIOS.\n");
		pr_info("This system has %d DIMM sockets.\n", ghes_hw.num_dimms);
	}

	if (!fake) {
		/* Copy the scanned DMI inventory into the EDAC DIMMs */
		struct dimm_info *src, *dst;
		int i = 0;

		mci_for_each_dimm(mci, dst) {
			src = &ghes_hw.dimms[i];

			dst->idx = src->idx;
			dst->smbios_handle = src->smbios_handle;
			dst->nr_pages = src->nr_pages;
			dst->mtype = src->mtype;
			dst->edac_mode = src->edac_mode;
			dst->dtype = src->dtype;
			dst->grain = src->grain;

			/*
			 * If no src->label, preserve the default label
			 * assigned by the EDAC core.
			 */
			if (strlen(src->label))
				memcpy(dst->label, src->label, sizeof(src->label));

			i++;
		}

	} else {
		/* Single fake DIMM with placeholder attributes */
		struct dimm_info *dimm = edac_get_dimm(mci, 0, 0, 0);

		dimm->nr_pages = 1;
		dimm->grain = 128;
		dimm->mtype = MEM_UNKNOWN;
		dimm->dtype = DEV_UNKNOWN;
		dimm->edac_mode = EDAC_SECDED;
	}

	rc = edac_mc_add_mc(mci);
	if (rc < 0) {
		pr_info("Can't register with the EDAC core\n");
		edac_mc_free(mci);
		rc = -ENODEV;
		goto unlock;
	}

	/* Publish pvt to the IRQ-time error reporter */
	spin_lock_irqsave(&ghes_lock, flags);
	ghes_pvt = pvt;
	spin_unlock_irqrestore(&ghes_lock, flags);

	/* Only set on success — pairs with refcount_inc_not_zero() above */
	refcount_set(&ghes_refcount, 1);

unlock:

	/* Scanned inventory was copied (or failed); not needed anymore */
	kfree(ghes_hw.dimms);
	ghes_hw.dimms = NULL;

	mutex_unlock(&ghes_reg_mutex);

	return rc;
}
0519
/*
 * Drop a reference on the GHES EDAC memory controller; the last
 * reference unpublishes ghes_pvt (synchronizing with the IRQ-time
 * error reporter via ghes_lock) and tears down the mem_ctl_info.
 */
void ghes_edac_unregister(struct ghes *ghes)
{
	struct mem_ctl_info *mci;
	unsigned long flags;

	/* Registration only proceeds with force_load set (or forced on) */
	if (!force_load)
		return;

	mutex_lock(&ghes_reg_mutex);

	/* Allow a fresh DMI scan on the next registration */
	system_scanned = false;
	memset(&ghes_hw, 0, sizeof(struct ghes_hw_desc));

	if (!refcount_dec_and_test(&ghes_refcount))
		goto unlock;

	/*
	 * Clear ghes_pvt under ghes_lock so an in-flight
	 * ghes_edac_report_mem_error() either sees the old value or NULL.
	 */
	spin_lock_irqsave(&ghes_lock, flags);
	mci = ghes_pvt ? ghes_pvt->mci : NULL;
	ghes_pvt = NULL;
	spin_unlock_irqrestore(&ghes_lock, flags);

	if (!mci)
		goto unlock;

	mci = edac_mc_del_mc(mci->pdev);
	if (mci)
		edac_mc_free(mci);

unlock:
	mutex_unlock(&ghes_reg_mutex);
}