0024 #include <linux/debugfs.h>
0025 #include <linux/list.h>
0026 #include <linux/module.h>
0027 #include <linux/uaccess.h>
0028 #include <linux/reboot.h>
0029 #include <linux/syscalls.h>
0030 #include <linux/pm_runtime.h>
0031
0032 #include "amdgpu.h"
0033 #include "amdgpu_ras.h"
0034 #include "amdgpu_atomfirmware.h"
0035 #include "amdgpu_xgmi.h"
0036 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
0037 #include "atom.h"
0038 #include "amdgpu_reset.h"
0039
0040 #ifdef CONFIG_X86_MCE_AMD
0041 #include <asm/mce.h>
0042
0043 static bool notifier_registered;
0044 #endif
0045 static const char *RAS_FS_NAME = "ras";
0046
0047 const char *ras_error_string[] = {
0048 "none",
0049 "parity",
0050 "single_correctable",
0051 "multi_uncorrectable",
0052 "poison",
0053 };
0054
0055 const char *ras_block_string[] = {
0056 "umc",
0057 "sdma",
0058 "gfx",
0059 "mmhub",
0060 "athub",
0061 "pcie_bif",
0062 "hdp",
0063 "xgmi_wafl",
0064 "df",
0065 "smn",
0066 "sem",
0067 "mp0",
0068 "mp1",
0069 "fuse",
0070 "mca",
0071 "vcn",
0072 "jpeg",
0073 };
0074
0075 const char *ras_mca_block_string[] = {
0076 "mca_mp0",
0077 "mca_mp1",
0078 "mca_mpio",
0079 "mca_iohc",
0080 };
0081
struct amdgpu_ras_block_list {
	/* ras block link */
	struct list_head node;

	struct amdgpu_ras_block_object *ras_obj;
};
0088
0089 const char *get_ras_block_str(struct ras_common_if *ras_block)
0090 {
0091 if (!ras_block)
0092 return "NULL";
0093
0094 if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT)
0095 return "OUT OF RANGE";
0096
0097 if (ras_block->block == AMDGPU_RAS_BLOCK__MCA)
0098 return ras_mca_block_string[ras_block->sub_block_index];
0099
0100 return ras_block_string[ras_block->block];
0101 }
0102
0103 #define ras_block_str(_BLOCK_) \
0104 (((_BLOCK_) < ARRAY_SIZE(ras_block_string)) ? ras_block_string[_BLOCK_] : "Out Of Range")
0105
0106 #define ras_err_str(i) (ras_error_string[ffs(i)])
0107
0108 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
0109
0110
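/* error injection and page retirement addresses must fit in 52 bits */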
0111 #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
0112
0113
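/*
 * When deriving the default bad page threshold, one retired page is
 * allowed per this much VRAM (see amdgpu_ras_validate_threshold()).
 */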
0114 #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
0115
0116 enum amdgpu_ras_retire_page_reservation {
0117 AMDGPU_RAS_RETIRE_PAGE_RESERVED,
0118 AMDGPU_RAS_RETIRE_PAGE_PENDING,
0119 AMDGPU_RAS_RETIRE_PAGE_FAULT,
0120 };
0121
0122 atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
0123
0124 static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
0125 uint64_t addr);
0126 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
0127 uint64_t addr);
0128 #ifdef CONFIG_X86_MCE_AMD
0129 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
0130 struct mce_notifier_adev_list {
0131 struct amdgpu_device *devs[MAX_GPU_INSTANCE];
0132 int num_gpu;
0133 };
0134 static struct mce_notifier_adev_list mce_adev_list;
0135 #endif
0136
0137 void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
0138 {
0139 if (adev && amdgpu_ras_get_context(adev))
0140 amdgpu_ras_get_context(adev)->error_query_ready = ready;
0141 }
0142
0143 static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
0144 {
0145 if (adev && amdgpu_ras_get_context(adev))
0146 return amdgpu_ras_get_context(adev)->error_query_ready;
0147
0148 return false;
0149 }
0150
0151 static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
0152 {
0153 struct ras_err_data err_data = {0, 0, 0, NULL};
0154 struct eeprom_table_record err_rec;
0155
0156 if ((address >= adev->gmc.mc_vram_size) ||
0157 (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
0158 dev_warn(adev->dev,
0159 "RAS WARN: input address 0x%llx is invalid.\n",
0160 address);
0161 return -EINVAL;
0162 }
0163
0164 if (amdgpu_ras_check_bad_page(adev, address)) {
0165 dev_warn(adev->dev,
0166 "RAS WARN: 0x%llx has already been marked as bad page!\n",
0167 address);
0168 return 0;
0169 }
0170
0171 memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
0172 err_data.err_addr = &err_rec;
0173 amdgpu_umc_fill_error_record(&err_data, address,
0174 (address >> AMDGPU_GPU_PAGE_SHIFT), 0, 0);
0175
0176 if (amdgpu_bad_page_threshold != 0) {
0177 amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
0178 err_data.err_addr_cnt);
0179 amdgpu_ras_save_bad_pages(adev);
0180 }
0181
0182 dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
0183 dev_warn(adev->dev, "Clear EEPROM:\n");
0184 dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");
0185
0186 return 0;
0187 }
0188
0189 static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
0190 size_t size, loff_t *pos)
0191 {
0192 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
0193 struct ras_query_if info = {
0194 .head = obj->head,
0195 };
0196 ssize_t s;
0197 char val[128];
0198
0199 if (amdgpu_ras_query_error_status(obj->adev, &info))
0200 return -EINVAL;
0201
0202
0203 if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
0204 obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
0205 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
0206 dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
0207 }
0208
0209 s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
0210 "ue", info.ue_count,
0211 "ce", info.ce_count);
0212 if (*pos >= s)
0213 return 0;
0214
0215 s -= *pos;
0216 s = min_t(u64, s, size);
0217
0218
0219 if (copy_to_user(buf, &val[*pos], s))
0220 return -EINVAL;
0221
0222 *pos += s;
0223
0224 return s;
0225 }
0226
0227 static const struct file_operations amdgpu_ras_debugfs_ops = {
0228 .owner = THIS_MODULE,
0229 .read = amdgpu_ras_debugfs_read,
0230 .write = NULL,
0231 .llseek = default_llseek
0232 };
0233
0234 static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
0235 {
0236 int i;
0237
0238 for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
0239 *block_id = i;
0240 if (strcmp(name, ras_block_string[i]) == 0)
0241 return 0;
0242 }
0243 return -EINVAL;
0244 }
0245
0246 static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
0247 const char __user *buf, size_t size,
0248 loff_t *pos, struct ras_debug_if *data)
0249 {
0250 ssize_t s = min_t(u64, 64, size);
0251 char str[65];
0252 char block_name[33];
0253 char err[9] = "ue";
0254 int op = -1;
0255 int block_id;
0256 uint32_t sub_block;
0257 u64 address, value;
0258
0259 if (*pos)
0260 return -EINVAL;
0261 *pos = size;
0262
0263 memset(str, 0, sizeof(str));
0264 memset(data, 0, sizeof(*data));
0265
0266 if (copy_from_user(str, buf, s))
0267 return -EINVAL;
0268
0269 if (sscanf(str, "disable %32s", block_name) == 1)
0270 op = 0;
0271 else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
0272 op = 1;
0273 else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
0274 op = 2;
0275 else if (strstr(str, "retire_page") != NULL)
0276 op = 3;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but commands are not matched */
		return -EINVAL;
0280
0281 if (op != -1) {
0282 if (op == 3) {
0283 if (sscanf(str, "%*s 0x%llx", &address) != 1 &&
0284 sscanf(str, "%*s %llu", &address) != 1)
0285 return -EINVAL;
0286
0287 data->op = op;
0288 data->inject.address = address;
0289
0290 return 0;
0291 }
0292
0293 if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
0294 return -EINVAL;
0295
0296 data->head.block = block_id;
0297
0298 if (!memcmp("ue", err, 2))
0299 data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
0300 else if (!memcmp("ce", err, 2))
0301 data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
0302 else
0303 return -EINVAL;
0304
0305 data->op = op;
0306
0307 if (op == 2) {
0308 if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
0309 &sub_block, &address, &value) != 3 &&
0310 sscanf(str, "%*s %*s %*s %u %llu %llu",
0311 &sub_block, &address, &value) != 3)
0312 return -EINVAL;
0313 data->head.sub_block_index = sub_block;
0314 data->inject.address = address;
0315 data->inject.value = value;
0316 }
0317 } else {
0318 if (size < sizeof(*data))
0319 return -EINVAL;
0320
0321 if (copy_from_user(data, buf, sizeof(*data)))
0322 return -EINVAL;
0323 }
0324
0325 return 0;
0326 }
0327
/**
 * DOC: AMDGPU RAS debugfs control interface
 *
 * The control node is /sys/kernel/debug/dri/<N>/ras/ras_ctrl. A write of a
 * raw struct ras_debug_if is accepted, as is one of the following command
 * strings (parsed by amdgpu_ras_debugfs_ctrl_parse_data()):
 *
 *   disable <block>
 *   enable <block> <error_type>
 *   inject <block> <error_type> <sub_block> <address> <value>
 *   retire_page <address>
 *
 * <block> is one of the names in ras_block_string[] (umc, gfx, mmhub, ...),
 * <error_type> is "ue" (multi_uncorrectable) or "ce" (single_correctable),
 * and <sub_block>, <address> and <value> may be given in decimal or in
 * 0x-prefixed hexadecimal.
 *
 * "retire_page" reserves the page containing <address> as a bad page and,
 * when the bad page threshold is enabled, records it in the RAS EEPROM
 * table; it is meant for testing only (see amdgpu_reserve_page_direct()).
 */

0407 static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
0408 const char __user *buf,
0409 size_t size, loff_t *pos)
0410 {
0411 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
0412 struct ras_debug_if data;
0413 int ret = 0;
0414
0415 if (!amdgpu_ras_get_error_query_ready(adev)) {
0416 dev_warn(adev->dev, "RAS WARN: error injection "
0417 "currently inaccessible\n");
0418 return size;
0419 }
0420
0421 ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
0422 if (ret)
0423 return ret;
0424
0425 if (data.op == 3) {
0426 ret = amdgpu_reserve_page_direct(adev, data.inject.address);
0427 if (!ret)
0428 return size;
0429 else
0430 return ret;
0431 }
0432
0433 if (!amdgpu_ras_is_supported(adev, data.head.block))
0434 return -EINVAL;
0435
0436 switch (data.op) {
0437 case 0:
0438 ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
0439 break;
0440 case 1:
0441 ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
0442 break;
0443 case 2:
0444 if ((data.inject.address >= adev->gmc.mc_vram_size) ||
0445 (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
0446 dev_warn(adev->dev, "RAS WARN: input address "
0447 "0x%llx is invalid.",
0448 data.inject.address);
0449 ret = -EINVAL;
0450 break;
0451 }
0452
0453
0454 if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
0455 amdgpu_ras_check_bad_page(adev, data.inject.address)) {
0456 dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
0457 "already been marked as bad!\n",
0458 data.inject.address);
0459 break;
0460 }
0461
0462
0463 ret = amdgpu_ras_error_inject(adev, &data.inject);
0464 break;
0465 default:
0466 ret = -EINVAL;
0467 break;
0468 }
0469
0470 if (ret)
0471 return ret;
0472
0473 return size;
0474 }
0475
/**
 * DOC: AMDGPU RAS debugfs EEPROM table reset interface
 *
 * Writing to /sys/kernel/debug/dri/<N>/ras/ras_eeprom_reset clears the bad
 * page table stored in the RAS EEPROM and restores the default RAS flags,
 * e.g. after error injection testing:
 *
 *   echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset
 */

0492 static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
0493 const char __user *buf,
0494 size_t size, loff_t *pos)
0495 {
0496 struct amdgpu_device *adev =
0497 (struct amdgpu_device *)file_inode(f)->i_private;
0498 int ret;
0499
0500 ret = amdgpu_ras_eeprom_reset_table(
0501 &(amdgpu_ras_get_context(adev)->eeprom_control));
0502
0503 if (!ret) {
0504
0505
0506 amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
0507 return size;
0508 } else {
0509 return ret;
0510 }
0511 }
0512
0513 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
0514 .owner = THIS_MODULE,
0515 .read = NULL,
0516 .write = amdgpu_ras_debugfs_ctrl_write,
0517 .llseek = default_llseek
0518 };
0519
0520 static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
0521 .owner = THIS_MODULE,
0522 .read = NULL,
0523 .write = amdgpu_ras_debugfs_eeprom_write,
0524 .llseek = default_llseek
0525 };
0526
/**
 * DOC: AMDGPU RAS sysfs Error Count Interface
 *
 * Each RAS-capable IP block exposes a <block>_err_count node in the
 * device's "ras" sysfs group. Reading it returns the accumulated error
 * counters for that block in the format:
 *
 *   ue: <count>
 *   ce: <count>
 *
 * where "ue" is the uncorrectable (multi_uncorrectable) error count and
 * "ce" the correctable (single_correctable) one.
 */

0548 static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
0549 struct device_attribute *attr, char *buf)
0550 {
0551 struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
0552 struct ras_query_if info = {
0553 .head = obj->head,
0554 };
0555
0556 if (!amdgpu_ras_get_error_query_ready(obj->adev))
0557 return sysfs_emit(buf, "Query currently inaccessible\n");
0558
0559 if (amdgpu_ras_query_error_status(obj->adev, &info))
0560 return -EINVAL;
0561
0562 if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
0563 obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
0564 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
0565 dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
0566 }
0567
0568 return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
0569 "ce", info.ce_count);
0570 }
0571
0572
0573
0574 #define get_obj(obj) do { (obj)->use++; } while (0)
0575 #define alive_obj(obj) ((obj)->use)
0576
0577 static inline void put_obj(struct ras_manager *obj)
0578 {
0579 if (obj && (--obj->use == 0))
0580 list_del(&obj->node);
0581 if (obj && (obj->use < 0))
		DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", get_ras_block_str(&obj->head));
0583 }
0584
0585
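/*
 * Create and track a ras_manager object for @head; returns NULL when RAS
 * is disabled, the block/sub-block is out of range, or the object is
 * already in use.
 */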
0586 static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
0587 struct ras_common_if *head)
0588 {
0589 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
0590 struct ras_manager *obj;
0591
0592 if (!adev->ras_enabled || !con)
0593 return NULL;
0594
0595 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
0596 return NULL;
0597
0598 if (head->block == AMDGPU_RAS_BLOCK__MCA) {
0599 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
0600 return NULL;
0601
0602 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
0603 } else
0604 obj = &con->objs[head->block];
0605
0606
0607 if (alive_obj(obj))
0608 return NULL;
0609
0610 obj->head = *head;
0611 obj->adev = adev;
0612 list_add(&obj->node, &con->head);
0613 get_obj(obj);
0614
0615 return obj;
0616 }
0617
0618
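/*
 * Return the ras_manager matching @head, or the first alive object when
 * @head is NULL; returns NULL if nothing matches.
 */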
0619 struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
0620 struct ras_common_if *head)
0621 {
0622 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
0623 struct ras_manager *obj;
0624 int i;
0625
0626 if (!adev->ras_enabled || !con)
0627 return NULL;
0628
0629 if (head) {
0630 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
0631 return NULL;
0632
0633 if (head->block == AMDGPU_RAS_BLOCK__MCA) {
0634 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
0635 return NULL;
0636
0637 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
0638 } else
0639 obj = &con->objs[head->block];
0640
0641 if (alive_obj(obj))
0642 return obj;
0643 } else {
0644 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
0645 obj = &con->objs[i];
0646 if (alive_obj(obj))
0647 return obj;
0648 }
0649 }
0650
0651 return NULL;
0652 }
0653
0654
0655
0656 static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
0657 struct ras_common_if *head)
0658 {
0659 return adev->ras_hw_enabled & BIT(head->block);
0660 }
0661
0662 static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
0663 struct ras_common_if *head)
0664 {
0665 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
0666
0667 return con->features & BIT(head->block);
0668 }
0669
0670
0671
0672
0673
0674 static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
0675 struct ras_common_if *head, int enable)
0676 {
0677 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
0678 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	/*
	 * If the hardware does not support RAS for this block there is
	 * nothing to enable; skip object creation and report success.
	 */
0686 if (!amdgpu_ras_is_feature_allowed(adev, head))
0687 return 0;
0688
0689 if (enable) {
0690 if (!obj) {
0691 obj = amdgpu_ras_create_obj(adev, head);
0692 if (!obj)
0693 return -EINVAL;
0694 } else {
0695
0696 get_obj(obj);
0697 }
0698 con->features |= BIT(head->block);
0699 } else {
0700 if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
0701 con->features &= ~BIT(head->block);
0702 put_obj(obj);
0703 }
0704 }
0705
0706 return 0;
0707 }
0708
0709
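/*
 * Wrapper of psp_ras_enable_features(): for the GFX block (bare metal, no
 * pending fatal interrupt) ask the RAS TA to enable/disable the feature,
 * then mirror the new state in software via __amdgpu_ras_feature_enable().
 */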
0710 int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
0711 struct ras_common_if *head, bool enable)
0712 {
0713 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
0714 union ta_ras_cmd_input *info;
0715 int ret;
0716
0717 if (!con)
0718 return -EINVAL;
0719
0720 if (head->block == AMDGPU_RAS_BLOCK__GFX) {
0721 info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
0722 if (!info)
0723 return -ENOMEM;
0724
0725 if (!enable) {
0726 info->disable_features = (struct ta_ras_disable_features_input) {
0727 .block_id = amdgpu_ras_block_to_ta(head->block),
0728 .error_type = amdgpu_ras_error_to_ta(head->type),
0729 };
0730 } else {
0731 info->enable_features = (struct ta_ras_enable_features_input) {
0732 .block_id = amdgpu_ras_block_to_ta(head->block),
0733 .error_type = amdgpu_ras_error_to_ta(head->type),
0734 };
0735 }
0736 }
0737
0738
0739 WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
0740
0741
0742 if (head->block == AMDGPU_RAS_BLOCK__GFX &&
0743 !amdgpu_sriov_vf(adev) &&
0744 !amdgpu_ras_intr_triggered()) {
0745 ret = psp_ras_enable_features(&adev->psp, info, enable);
0746 if (ret) {
0747 dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
0748 enable ? "enable":"disable",
0749 get_ras_block_str(head),
0750 amdgpu_ras_is_poison_mode_supported(adev), ret);
0751 goto out;
0752 }
0753 }
0754
0755
0756 __amdgpu_ras_feature_enable(adev, head, enable);
0757 ret = 0;
0758 out:
0759 if (head->block == AMDGPU_RAS_BLOCK__GFX)
0760 kfree(info);
0761 return ret;
0762 }
0763
0764
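/*
 * Used at boot when AMDGPU_RAS_FLAG_INIT_BY_VBIOS is set: if the firmware
 * rejects the enable request with -EINVAL, fall back to setting up the
 * software object only; for disable, enable the software state first so
 * the disable request can be issued, then clear it again.
 */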
0765 int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
0766 struct ras_common_if *head, bool enable)
0767 {
0768 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
0769 int ret;
0770
0771 if (!con)
0772 return -EINVAL;
0773
0774 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
0775 if (enable) {
0776
0777
0778
0779
0780
0781
0782 ret = amdgpu_ras_feature_enable(adev, head, 1);
0783
0784
0785
0786
0787 if (ret == -EINVAL) {
0788 ret = __amdgpu_ras_feature_enable(adev, head, 1);
0789 if (!ret)
0790 dev_info(adev->dev,
0791 "RAS INFO: %s setup object\n",
0792 get_ras_block_str(head));
0793 }
0794 } else {
0795
0796 ret = __amdgpu_ras_feature_enable(adev, head, 1);
0797 if (ret)
0798 return ret;
0799
0800
0801 if (head->block == AMDGPU_RAS_BLOCK__GFX)
0802 con->features |= BIT(head->block);
0803
0804 ret = amdgpu_ras_feature_enable(adev, head, 0);
0805
0806
0807 if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX)
0808 con->features &= ~BIT(head->block);
0809 }
0810 } else
0811 ret = amdgpu_ras_feature_enable(adev, head, enable);
0812
0813 return ret;
0814 }
0815
0816 static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
0817 bool bypass)
0818 {
0819 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
0820 struct ras_manager *obj, *tmp;
0821
0822 list_for_each_entry_safe(obj, tmp, &con->head, node) {
0823
0824
0825
0826 if (bypass) {
0827 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
0828 break;
0829 } else {
0830 if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
0831 break;
0832 }
0833 }
0834
0835 return con->features;
0836 }
0837
0838 static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
0839 bool bypass)
0840 {
0841 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
0842 int i;
0843 const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE;
0844
0845 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
0846 struct ras_common_if head = {
0847 .block = i,
0848 .type = default_ras_type,
0849 .sub_block_index = 0,
0850 };
0851
0852 if (i == AMDGPU_RAS_BLOCK__MCA)
0853 continue;
0854
0855 if (bypass) {
0856
0857
0858
0859
0860 if (__amdgpu_ras_feature_enable(adev, &head, 1))
0861 break;
0862 } else {
0863 if (amdgpu_ras_feature_enable(adev, &head, 1))
0864 break;
0865 }
0866 }
0867
0868 for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
0869 struct ras_common_if head = {
0870 .block = AMDGPU_RAS_BLOCK__MCA,
0871 .type = default_ras_type,
0872 .sub_block_index = i,
0873 };
0874
0875 if (bypass) {
0876
0877
0878
0879
0880 if (__amdgpu_ras_feature_enable(adev, &head, 1))
0881 break;
0882 } else {
0883 if (amdgpu_ras_feature_enable(adev, &head, 1))
0884 break;
0885 }
0886 }
0887
0888 return con->features;
0889 }
0890
0891
0892 static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj,
0893 enum amdgpu_ras_block block)
0894 {
0895 if (!block_obj)
0896 return -EINVAL;
0897
0898 if (block_obj->ras_comm.block == block)
0899 return 0;
0900
0901 return -EINVAL;
0902 }
0903
0904 static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
0905 enum amdgpu_ras_block block, uint32_t sub_block_index)
0906 {
0907 struct amdgpu_ras_block_list *node, *tmp;
0908 struct amdgpu_ras_block_object *obj;
0909
0910 if (block >= AMDGPU_RAS_BLOCK__LAST)
0911 return NULL;
0912
0913 if (!amdgpu_ras_is_supported(adev, block))
0914 return NULL;
0915
0916 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
0917 if (!node->ras_obj) {
0918 dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
0919 continue;
0920 }
0921
0922 obj = node->ras_obj;
0923 if (obj->ras_block_match) {
0924 if (obj->ras_block_match(obj, block, sub_block_index) == 0)
0925 return obj;
0926 } else {
0927 if (amdgpu_ras_block_match_default(obj, block) == 0)
0928 return obj;
0929 }
0930 }
0931
0932 return NULL;
0933 }
0934
0935 static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
0936 {
0937 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
0938 int ret = 0;
0939
0940
0941
0942
0943
0944 ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc));
0945 if (ret == -EOPNOTSUPP) {
0946 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
0947 adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
0948 adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data);
0949
0950
0951
0952
0953 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
0954 adev->umc.ras->ras_block.hw_ops->query_ras_error_address)
0955 adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data);
0956 } else if (!ret) {
0957 if (adev->umc.ras &&
0958 adev->umc.ras->ecc_info_query_ras_error_count)
0959 adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data);
0960
0961 if (adev->umc.ras &&
0962 adev->umc.ras->ecc_info_query_ras_error_address)
0963 adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data);
0964 }
0965 }
0966
0967
0968 int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
0969 struct ras_query_if *info)
0970 {
0971 struct amdgpu_ras_block_object *block_obj = NULL;
0972 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
0973 struct ras_err_data err_data = {0, 0, 0, NULL};
0974
0975 if (!obj)
0976 return -EINVAL;
0977
0978 if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
0979 amdgpu_ras_get_ecc_info(adev, &err_data);
0980 } else {
0981 block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
0982 if (!block_obj || !block_obj->hw_ops) {
0983 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
0984 get_ras_block_str(&info->head));
0985 return -EINVAL;
0986 }
0987
0988 if (block_obj->hw_ops->query_ras_error_count)
0989 block_obj->hw_ops->query_ras_error_count(adev, &err_data);
0990
0991 if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
0992 (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
0993 (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
0994 if (block_obj->hw_ops->query_ras_error_status)
0995 block_obj->hw_ops->query_ras_error_status(adev);
0996 }
0997 }
0998
0999 obj->err_data.ue_count += err_data.ue_count;
1000 obj->err_data.ce_count += err_data.ce_count;
1001
1002 info->ue_count = obj->err_data.ue_count;
1003 info->ce_count = obj->err_data.ce_count;
1004
1005 if (err_data.ce_count) {
1006 if (adev->smuio.funcs &&
1007 adev->smuio.funcs->get_socket_id &&
1008 adev->smuio.funcs->get_die_id) {
1009 dev_info(adev->dev, "socket: %d, die: %d "
1010 "%ld correctable hardware errors "
1011 "detected in %s block, no user "
1012 "action is needed.\n",
1013 adev->smuio.funcs->get_socket_id(adev),
1014 adev->smuio.funcs->get_die_id(adev),
1015 obj->err_data.ce_count,
1016 get_ras_block_str(&info->head));
1017 } else {
1018 dev_info(adev->dev, "%ld correctable hardware errors "
1019 "detected in %s block, no user "
1020 "action is needed.\n",
1021 obj->err_data.ce_count,
1022 get_ras_block_str(&info->head));
1023 }
1024 }
1025 if (err_data.ue_count) {
1026 if (adev->smuio.funcs &&
1027 adev->smuio.funcs->get_socket_id &&
1028 adev->smuio.funcs->get_die_id) {
1029 dev_info(adev->dev, "socket: %d, die: %d "
1030 "%ld uncorrectable hardware errors "
1031 "detected in %s block\n",
1032 adev->smuio.funcs->get_socket_id(adev),
1033 adev->smuio.funcs->get_die_id(adev),
1034 obj->err_data.ue_count,
1035 get_ras_block_str(&info->head));
1036 } else {
1037 dev_info(adev->dev, "%ld uncorrectable hardware errors "
1038 "detected in %s block\n",
1039 obj->err_data.ue_count,
1040 get_ras_block_str(&info->head));
1041 }
1042 }
1043
1044 return 0;
1045 }
1046
1047 int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
1048 enum amdgpu_ras_block block)
1049 {
1050 struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
1051
1052 if (!amdgpu_ras_is_supported(adev, block))
1053 return -EINVAL;
1054
1055 if (!block_obj || !block_obj->hw_ops) {
1056 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
1057 ras_block_str(block));
1058 return -EINVAL;
1059 }
1060
1061 if (block_obj->hw_ops->reset_ras_error_count)
1062 block_obj->hw_ops->reset_ras_error_count(adev);
1063
1064 if ((block == AMDGPU_RAS_BLOCK__GFX) ||
1065 (block == AMDGPU_RAS_BLOCK__MMHUB)) {
1066 if (block_obj->hw_ops->reset_ras_error_status)
1067 block_obj->hw_ops->reset_ras_error_status(adev);
1068 }
1069
1070 return 0;
1071 }
1072
1073
1074 int amdgpu_ras_error_inject(struct amdgpu_device *adev,
1075 struct ras_inject_if *info)
1076 {
1077 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1078 struct ta_ras_trigger_error_input block_info = {
1079 .block_id = amdgpu_ras_block_to_ta(info->head.block),
1080 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
1081 .sub_block_index = info->head.sub_block_index,
1082 .address = info->address,
1083 .value = info->value,
1084 };
1085 int ret = -EINVAL;
1086 struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev,
1087 info->head.block,
1088 info->head.sub_block_index);
1089
1090 if (!obj)
1091 return -EINVAL;
1092
1093 if (!block_obj || !block_obj->hw_ops) {
1094 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
1095 get_ras_block_str(&info->head));
1096 return -EINVAL;
1097 }
1098
1099
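	/* Convert to an XGMI-relative physical address on multi-node hives */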
1100 if (adev->gmc.xgmi.num_physical_nodes > 1) {
1101 block_info.address =
1102 amdgpu_xgmi_get_relative_phy_addr(adev,
1103 block_info.address);
1104 }
1105
1106 if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
1107 if (block_obj->hw_ops->ras_error_inject)
1108 ret = block_obj->hw_ops->ras_error_inject(adev, info);
1109 } else {
1110
1111 if (block_obj->hw_ops->ras_error_inject)
1112 ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
1113 else
1114 ret = psp_ras_trigger_error(&adev->psp, &block_info);
1115 }
1116
1117 if (ret)
1118 dev_err(adev->dev, "ras inject %s failed %d\n",
1119 get_ras_block_str(&info->head), ret);
1120
1121 return ret;
1122 }
1123
/**
 * amdgpu_ras_query_error_count -- Get error counts of all IPs
 * @adev: pointer to the AMD GPU device
 * @ce_count: returns the total correctable error count, may be NULL
 * @ue_count: returns the total uncorrectable error count, may be NULL
 *
 * Walks every registered RAS object, queries its error status and sums the
 * counters. Returns 0 on success and -EOPNOTSUPP when the device does not
 * support RAS.
 */
1135 int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
1136 unsigned long *ce_count,
1137 unsigned long *ue_count)
1138 {
1139 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1140 struct ras_manager *obj;
1141 unsigned long ce, ue;
1142
1143 if (!adev->ras_enabled || !con)
1144 return -EOPNOTSUPP;
1145
1146
1147
1148 if (!ce_count && !ue_count)
1149 return 0;
1150
1151 ce = 0;
1152 ue = 0;
1153 list_for_each_entry(obj, &con->head, node) {
1154 struct ras_query_if info = {
1155 .head = obj->head,
1156 };
1157 int res;
1158
1159 res = amdgpu_ras_query_error_status(adev, &info);
1160 if (res)
1161 return res;
1162
1163 if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
1164 adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
1165 if (amdgpu_ras_reset_error_status(adev, info.head.block))
1166 dev_warn(adev->dev, "Failed to reset error counter and error status");
1167 }
1168
1169 ce += info.ce_count;
1170 ue += info.ue_count;
1171 }
1172
1173 if (ce_count)
1174 *ce_count = ce;
1175
1176 if (ue_count)
1177 *ue_count = ue;
1178
1179 return 0;
1180 }
1181

/* sysfs begin */
1186 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1187 struct ras_badpage **bps, unsigned int *count);
1188
1189 static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
1190 {
1191 switch (flags) {
1192 case AMDGPU_RAS_RETIRE_PAGE_RESERVED:
1193 return "R";
1194 case AMDGPU_RAS_RETIRE_PAGE_PENDING:
1195 return "P";
1196 case AMDGPU_RAS_RETIRE_PAGE_FAULT:
1197 default:
1198 return "F";
1199 }
1200 }
1201
/**
 * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
 *
 * The gpu_vram_bad_pages node lists the retired VRAM pages, one per line:
 *
 *   0x<gpu pfn> : 0x<page size> : <flags>
 *
 * where <flags> is one of:
 *
 *   R: reserved, the page has been taken out of the usable VRAM pool
 *   P: pending,  the page is marked bad but not yet reserved
 *   F: failed,   the page could not be reserved
 */

1232 static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
1233 struct kobject *kobj, struct bin_attribute *attr,
1234 char *buf, loff_t ppos, size_t count)
1235 {
1236 struct amdgpu_ras *con =
1237 container_of(attr, struct amdgpu_ras, badpages_attr);
1238 struct amdgpu_device *adev = con->adev;
1239 const unsigned int element_size =
1240 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
1241 unsigned int start = div64_ul(ppos + element_size - 1, element_size);
1242 unsigned int end = div64_ul(ppos + count - 1, element_size);
1243 ssize_t s = 0;
1244 struct ras_badpage *bps = NULL;
1245 unsigned int bps_count = 0;
1246
1247 memset(buf, 0, count);
1248
1249 if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
1250 return 0;
1251
1252 for (; start < end && start < bps_count; start++)
1253 s += scnprintf(&buf[s], element_size + 1,
1254 "0x%08x : 0x%08x : %1s\n",
1255 bps[start].bp,
1256 bps[start].size,
1257 amdgpu_ras_badpage_flags_str(bps[start].flags));
1258
1259 kfree(bps);
1260
1261 return s;
1262 }
1263
1264 static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
1265 struct device_attribute *attr, char *buf)
1266 {
1267 struct amdgpu_ras *con =
1268 container_of(attr, struct amdgpu_ras, features_attr);
1269
1270 return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
1271 }
1272
1273 static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
1274 {
1275 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1276
1277 sysfs_remove_file_from_group(&adev->dev->kobj,
1278 &con->badpages_attr.attr,
1279 RAS_FS_NAME);
1280 }
1281
1282 static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
1283 {
1284 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1285 struct attribute *attrs[] = {
1286 &con->features_attr.attr,
1287 NULL
1288 };
1289 struct attribute_group group = {
1290 .name = RAS_FS_NAME,
1291 .attrs = attrs,
1292 };
1293
1294 sysfs_remove_group(&adev->dev->kobj, &group);
1295
1296 return 0;
1297 }
1298
1299 int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
1300 struct ras_common_if *head)
1301 {
1302 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1303
1304 if (!obj || obj->attr_inuse)
1305 return -EINVAL;
1306
1307 get_obj(obj);
1308
1309 snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name),
1310 "%s_err_count", head->name);
1311
1312 obj->sysfs_attr = (struct device_attribute){
1313 .attr = {
1314 .name = obj->fs_data.sysfs_name,
1315 .mode = S_IRUGO,
1316 },
1317 .show = amdgpu_ras_sysfs_read,
1318 };
1319 sysfs_attr_init(&obj->sysfs_attr.attr);
1320
1321 if (sysfs_add_file_to_group(&adev->dev->kobj,
1322 &obj->sysfs_attr.attr,
1323 RAS_FS_NAME)) {
1324 put_obj(obj);
1325 return -EINVAL;
1326 }
1327
1328 obj->attr_inuse = 1;
1329
1330 return 0;
1331 }
1332
1333 int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
1334 struct ras_common_if *head)
1335 {
1336 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1337
1338 if (!obj || !obj->attr_inuse)
1339 return -EINVAL;
1340
1341 sysfs_remove_file_from_group(&adev->dev->kobj,
1342 &obj->sysfs_attr.attr,
1343 RAS_FS_NAME);
1344 obj->attr_inuse = 0;
1345 put_obj(obj);
1346
1347 return 0;
1348 }
1349
1350 static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
1351 {
1352 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1353 struct ras_manager *obj, *tmp;
1354
1355 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1356 amdgpu_ras_sysfs_remove(adev, &obj->head);
1357 }
1358
1359 if (amdgpu_bad_page_threshold != 0)
1360 amdgpu_ras_sysfs_remove_bad_page_node(adev);
1361
1362 amdgpu_ras_sysfs_remove_feature_node(adev);
1363
1364 return 0;
1365 }
1366
/* sysfs end */

/**
 * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
 *
 * The boolean debugfs node /sys/kernel/debug/dri/<N>/ras/auto_reboot, when
 * set, asks the driver to reboot the system rather than attempt GPU
 * recovery once an unrecoverable RAS error interrupt is raised.
 */

/* debugfs begin */
1387 static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
1388 {
1389 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1390 struct drm_minor *minor = adev_to_drm(adev)->primary;
1391 struct dentry *dir;
1392
1393 dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
1394 debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev,
1395 &amdgpu_ras_debugfs_ctrl_ops);
1396 debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev,
1397 &amdgpu_ras_debugfs_eeprom_ops);
1398 debugfs_create_u32("bad_page_cnt_threshold", 0444, dir,
1399 &con->bad_page_cnt_threshold);
1400 debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
1401 debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
1402 debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev,
1403 &amdgpu_ras_debugfs_eeprom_size_ops);
1404 con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table",
1405 S_IRUGO, dir, adev,
1406 &amdgpu_ras_debugfs_eeprom_table_ops);
1407 amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control);

	/*
	 * After an uncorrectable error the driver normally schedules GPU
	 * recovery; this switch instead reboots the system automatically
	 * when the fatal error interrupt fires.
	 */
1417 debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot);

	/*
	 * User can set this to skip harvesting (querying and clearing) the
	 * RAS IPs' error counters during recovery, see amdgpu_ras_do_recovery().
	 */
1423 debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir,
1424 &con->disable_ras_err_cnt_harvest);
1425 return dir;
1426 }
1427
1428 static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
1429 struct ras_fs_if *head,
1430 struct dentry *dir)
1431 {
1432 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
1433
1434 if (!obj || !dir)
1435 return;
1436
1437 get_obj(obj);
1438
1439 memcpy(obj->fs_data.debugfs_name,
1440 head->debugfs_name,
1441 sizeof(obj->fs_data.debugfs_name));
1442
1443 debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir,
1444 obj, &amdgpu_ras_debugfs_ops);
1445 }
1446
1447 void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
1448 {
1449 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1450 struct dentry *dir;
1451 struct ras_manager *obj;
1452 struct ras_fs_if fs_info;
1453
1454
1455
1456
1457
1458 if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con)
1459 return;
1460
1461 dir = amdgpu_ras_debugfs_create_ctrl_node(adev);
1462
1463 list_for_each_entry(obj, &con->head, node) {
1464 if (amdgpu_ras_is_supported(adev, obj->head.block) &&
1465 (obj->attr_inuse == 1)) {
1466 sprintf(fs_info.debugfs_name, "%s_err_inject",
1467 get_ras_block_str(&obj->head));
1468 fs_info.head = obj->head;
1469 amdgpu_ras_debugfs_create(adev, &fs_info, dir);
1470 }
1471 }
1472 }

/* debugfs end */

/* ras fs */
1477 static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
1478 amdgpu_ras_sysfs_badpages_read, NULL, 0);
1479 static DEVICE_ATTR(features, S_IRUGO,
1480 amdgpu_ras_sysfs_features_read, NULL);
1481 static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
1482 {
1483 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1484 struct attribute_group group = {
1485 .name = RAS_FS_NAME,
1486 };
1487 struct attribute *attrs[] = {
1488 &con->features_attr.attr,
1489 NULL
1490 };
1491 struct bin_attribute *bin_attrs[] = {
1492 NULL,
1493 NULL,
1494 };
1495 int r;
1496
1497
1498 con->features_attr = dev_attr_features;
1499 group.attrs = attrs;
1500 sysfs_attr_init(attrs[0]);
1501
1502 if (amdgpu_bad_page_threshold != 0) {
1503
1504 bin_attr_gpu_vram_bad_pages.private = NULL;
1505 con->badpages_attr = bin_attr_gpu_vram_bad_pages;
1506 bin_attrs[0] = &con->badpages_attr;
1507 group.bin_attrs = bin_attrs;
1508 sysfs_bin_attr_init(bin_attrs[0]);
1509 }
1510
1511 r = sysfs_create_group(&adev->dev->kobj, &group);
1512 if (r)
1513 dev_err(adev->dev, "Failed to create RAS sysfs group!");
1514
1515 return 0;
1516 }
1517
1518 static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
1519 {
1520 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1521 struct ras_manager *con_obj, *ip_obj, *tmp;
1522
1523 if (IS_ENABLED(CONFIG_DEBUG_FS)) {
1524 list_for_each_entry_safe(con_obj, tmp, &con->head, node) {
1525 ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head);
1526 if (ip_obj)
1527 put_obj(ip_obj);
1528 }
1529 }
1530
1531 amdgpu_ras_sysfs_remove_all(adev);
1532 return 0;
1533 }
1534
/* ras fs end */

/* ih begin */

/*
 * For hardware that cannot route the RAS controller and ATHUB error-event
 * interrupts through the BIF ring, poll the NBIO status registers here and
 * handle a pending fatal interrupt if one is found.
 */
1543 void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
1544 {
	/* Fatal error events are handled on the host side under SR-IOV */
1546 if (amdgpu_sriov_vf(adev) ||
1547 !amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF))
1548 return;
1549
1550 if (adev->nbio.ras &&
1551 adev->nbio.ras->handle_ras_controller_intr_no_bifring)
1552 adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev);
1553
1554 if (adev->nbio.ras &&
1555 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring)
1556 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev);
1557 }
1558
1559 static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
1560 struct amdgpu_iv_entry *entry)
1561 {
1562 bool poison_stat = false;
1563 struct amdgpu_device *adev = obj->adev;
1564 struct ras_err_data err_data = {0, 0, 0, NULL};
1565 struct amdgpu_ras_block_object *block_obj =
1566 amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
1567
1568 if (!block_obj || !block_obj->hw_ops)
1569 return;
1570
1571
1572
1573
1574
1575 if (block_obj->hw_ops->query_poison_status) {
1576 poison_stat = block_obj->hw_ops->query_poison_status(adev);
1577 if (!poison_stat) {
1578
1579 dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
1580 block_obj->ras_comm.name);
1581
1582 return;
1583 }
1584 }
1585
1586 if (!adev->gmc.xgmi.connected_to_cpu)
1587 amdgpu_umc_poison_handler(adev, &err_data, false);
1588
1589 if (block_obj->hw_ops->handle_poison_consumption)
1590 poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
1591
1592
1593 if (poison_stat) {
1594 dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
1595 block_obj->ras_comm.name);
1596 amdgpu_ras_reset_gpu(adev);
1597 }
1598 }
1599
1600 static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
1601 struct amdgpu_iv_entry *entry)
1602 {
1603 dev_info(obj->adev->dev,
1604 "Poison is created, no user action is needed.\n");
1605 }
1606
1607 static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
1608 struct amdgpu_iv_entry *entry)
1609 {
1610 struct ras_ih_data *data = &obj->ih_data;
1611 struct ras_err_data err_data = {0, 0, 0, NULL};
1612 int ret;
1613
1614 if (!data->cb)
1615 return;
1616
1617
1618
1619
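	/* Let the IP block's callback decode the IV entry and fill err_data */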
1620 ret = data->cb(obj->adev, &err_data, entry);
1621
1622
1623
1624
1625
1626 if (ret == AMDGPU_RAS_SUCCESS) {
1627
1628
1629
1630 obj->err_data.ue_count += err_data.ue_count;
1631 obj->err_data.ce_count += err_data.ce_count;
1632 }
1633 }
1634
1635 static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
1636 {
1637 struct ras_ih_data *data = &obj->ih_data;
1638 struct amdgpu_iv_entry entry;
1639
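	/*
	 * Drain the per-object IH ring: entries are produced by
	 * amdgpu_ras_interrupt_dispatch() and consumed here in work-queue
	 * context.
	 */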
1640 while (data->rptr != data->wptr) {
1641 rmb();
1642 memcpy(&entry, &data->ring[data->rptr],
1643 data->element_size);
1644
1645 wmb();
1646 data->rptr = (data->aligned_element_size +
1647 data->rptr) % data->ring_size;
1648
1649 if (amdgpu_ras_is_poison_mode_supported(obj->adev)) {
1650 if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
1651 amdgpu_ras_interrupt_poison_creation_handler(obj, &entry);
1652 else
1653 amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry);
1654 } else {
1655 if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
1656 amdgpu_ras_interrupt_umc_handler(obj, &entry);
1657 else
1658 dev_warn(obj->adev->dev,
1659 "No RAS interrupt handler for non-UMC block with poison disabled.\n");
1660 }
1661 }
1662 }
1663
1664 static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
1665 {
1666 struct ras_ih_data *data =
1667 container_of(work, struct ras_ih_data, ih_work);
1668 struct ras_manager *obj =
1669 container_of(data, struct ras_manager, ih_data);
1670
1671 amdgpu_ras_interrupt_handler(obj);
1672 }
1673
1674 int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
1675 struct ras_dispatch_if *info)
1676 {
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;
1685
1686
1687 memcpy(&data->ring[data->wptr], info->entry,
1688 data->element_size);
1689
1690 wmb();
1691 data->wptr = (data->aligned_element_size +
1692 data->wptr) % data->ring_size;
1693
1694 schedule_work(&data->ih_work);
1695
1696 return 0;
1697 }
1698
1699 int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
1700 struct ras_common_if *head)
1701 {
1702 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1703 struct ras_ih_data *data;
1704
1705 if (!obj)
1706 return -EINVAL;
1707
1708 data = &obj->ih_data;
1709 if (data->inuse == 0)
1710 return 0;
1711
1712 cancel_work_sync(&data->ih_work);
1713
1714 kfree(data->ring);
1715 memset(data, 0, sizeof(*data));
1716 put_obj(obj);
1717
1718 return 0;
1719 }
1720
1721 int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
1722 struct ras_common_if *head)
1723 {
1724 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1725 struct ras_ih_data *data;
1726 struct amdgpu_ras_block_object *ras_obj;
1727
1728 if (!obj) {
1729
1730 obj = amdgpu_ras_create_obj(adev, head);
1731 if (!obj)
1732 return -EINVAL;
1733 } else
1734 get_obj(obj);
1735
1736 ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm);
1737
1738 data = &obj->ih_data;
1739
1740 *data = (struct ras_ih_data) {
1741 .inuse = 0,
1742 .cb = ras_obj->ras_cb,
1743 .element_size = sizeof(struct amdgpu_iv_entry),
1744 .rptr = 0,
1745 .wptr = 0,
1746 };
1747
1748 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);
1749
1750 data->aligned_element_size = ALIGN(data->element_size, 8);
1751
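	/* the ring buffers up to 64 IV entries */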
1752 data->ring_size = 64 * data->aligned_element_size;
1753 data->ring = kmalloc(data->ring_size, GFP_KERNEL);
1754 if (!data->ring) {
1755 put_obj(obj);
1756 return -ENOMEM;
1757 }
1758
1759
1760 data->inuse = 1;
1761
1762 return 0;
1763 }
1764
1765 static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
1766 {
1767 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1768 struct ras_manager *obj, *tmp;
1769
1770 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1771 amdgpu_ras_interrupt_remove_handler(adev, &obj->head);
1772 }
1773
1774 return 0;
1775 }
1776
1777
1778
1779 static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
1780 {
1781 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1782 struct ras_manager *obj;
1783
1784 if (!adev->ras_enabled || !con)
1785 return;
1786
1787 list_for_each_entry(obj, &con->head, node) {
1788 struct ras_query_if info = {
1789 .head = obj->head,
1790 };
1791
1792
1793
1794
1795
1796
1797
1798 if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
1799 continue;
1800
1801
1802
1803
1804
1805
1806
1807 if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) &&
1808 (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)))
1809 continue;
1810
1811 amdgpu_ras_query_error_status(adev, &info);
1812
1813 if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
1814 adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4) &&
1815 adev->ip_versions[MP0_HWIP][0] != IP_VERSION(13, 0, 0)) {
1816 if (amdgpu_ras_reset_error_status(adev, info.head.block))
1817 dev_warn(adev->dev, "Failed to reset error counter and error status");
1818 }
1819 }
1820 }
1821
1822
1823 static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
1824 struct ras_query_if *info)
1825 {
1826 struct amdgpu_ras_block_object *block_obj;
1827
1828
1829
1830
1831 if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) &&
1832 (info->head.block != AMDGPU_RAS_BLOCK__MMHUB))
1833 return;
1834
1835 block_obj = amdgpu_ras_get_ras_block(adev,
1836 info->head.block,
1837 info->head.sub_block_index);
1838
1839 if (!block_obj || !block_obj->hw_ops) {
1840 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
1841 get_ras_block_str(&info->head));
1842 return;
1843 }
1844
1845 if (block_obj->hw_ops->query_ras_error_status)
1846 block_obj->hw_ops->query_ras_error_status(adev);
1847
1848 }
1849
1850 static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
1851 {
1852 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1853 struct ras_manager *obj;
1854
1855 if (!adev->ras_enabled || !con)
1856 return;
1857
1858 list_for_each_entry(obj, &con->head, node) {
1859 struct ras_query_if info = {
1860 .head = obj->head,
1861 };
1862
1863 amdgpu_ras_error_status_query(adev, &info);
1864 }
1865 }
1866
/* recovery begin */

/* Return 0 on success; the caller must free *bps. */
1872 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1873 struct ras_badpage **bps, unsigned int *count)
1874 {
1875 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1876 struct ras_err_handler_data *data;
1877 int i = 0;
1878 int ret = 0, status;
1879
1880 if (!con || !con->eh_data || !bps || !count)
1881 return -EINVAL;
1882
1883 mutex_lock(&con->recovery_lock);
1884 data = con->eh_data;
1885 if (!data || data->count == 0) {
1886 *bps = NULL;
1887 ret = -EINVAL;
1888 goto out;
1889 }
1890
1891 *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
1892 if (!*bps) {
1893 ret = -ENOMEM;
1894 goto out;
1895 }
1896
1897 for (; i < data->count; i++) {
1898 (*bps)[i] = (struct ras_badpage){
1899 .bp = data->bps[i].retired_page,
1900 .size = AMDGPU_GPU_PAGE_SIZE,
1901 .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
1902 };
1903 status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
1904 data->bps[i].retired_page);
1905 if (status == -EBUSY)
1906 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
1907 else if (status == -ENOENT)
1908 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
1909 }
1910
1911 *count = data->count;
1912 out:
1913 mutex_unlock(&con->recovery_lock);
1914 return ret;
1915 }
1916
1917 static void amdgpu_ras_do_recovery(struct work_struct *work)
1918 {
1919 struct amdgpu_ras *ras =
1920 container_of(work, struct amdgpu_ras, recovery_work);
1921 struct amdgpu_device *remote_adev = NULL;
1922 struct amdgpu_device *adev = ras->adev;
1923 struct list_head device_list, *device_list_handle = NULL;
1924
1925 if (!ras->disable_ras_err_cnt_harvest) {
1926 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
1927
1928
1929 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
1930 device_list_handle = &hive->device_list;
1931 } else {
1932 INIT_LIST_HEAD(&device_list);
1933 list_add_tail(&adev->gmc.xgmi.head, &device_list);
1934 device_list_handle = &device_list;
1935 }
1936
1937 list_for_each_entry(remote_adev,
1938 device_list_handle, gmc.xgmi.head) {
1939 amdgpu_ras_query_err_status(remote_adev);
1940 amdgpu_ras_log_on_err_counter(remote_adev);
1941 }
1942
1943 amdgpu_put_xgmi_hive(hive);
1944 }
1945
1946 if (amdgpu_device_should_recover_gpu(ras->adev)) {
1947 struct amdgpu_reset_context reset_context;
1948 memset(&reset_context, 0, sizeof(reset_context));
1949
1950 reset_context.method = AMD_RESET_METHOD_NONE;
1951 reset_context.reset_req_dev = adev;
1952 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
1953
1954 amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
1955 }
1956 atomic_set(&ras->in_recovery, 0);
1957 }
1958
1959
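/* Grow the bad page (bps) array in 512-entry steps, preserving existing records. */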
1960 static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
1961 struct ras_err_handler_data *data, int pages)
1962 {
1963 unsigned int old_space = data->count + data->space_left;
1964 unsigned int new_space = old_space + pages;
1965 unsigned int align_space = ALIGN(new_space, 512);
1966 void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
1967
1968 if (!bps) {
1969 return -ENOMEM;
1970 }
1971
1972 if (data->bps) {
1973 memcpy(bps, data->bps,
1974 data->count * sizeof(*data->bps));
1975 kfree(data->bps);
1976 }
1977
1978 data->bps = bps;
1979 data->space_left += align_space - old_space;
1980 return 0;
1981 }
1982
1983
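/*
 * Record new bad VRAM pages: skip already-known pages, grow the array as
 * needed, reserve each page in the VRAM manager and append it to eh_data.
 */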
1984 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
1985 struct eeprom_table_record *bps, int pages)
1986 {
1987 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1988 struct ras_err_handler_data *data;
1989 int ret = 0;
1990 uint32_t i;
1991
1992 if (!con || !con->eh_data || !bps || pages <= 0)
1993 return 0;
1994
1995 mutex_lock(&con->recovery_lock);
1996 data = con->eh_data;
1997 if (!data)
1998 goto out;
1999
2000 for (i = 0; i < pages; i++) {
2001 if (amdgpu_ras_check_bad_page_unlock(con,
2002 bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
2003 continue;
2004
2005 if (!data->space_left &&
2006 amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
2007 ret = -ENOMEM;
2008 goto out;
2009 }
2010
2011 amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
2012 bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
2013 AMDGPU_GPU_PAGE_SIZE);
2014
2015 memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
2016 data->count++;
2017 data->space_left--;
2018 }
2019 out:
2020 mutex_unlock(&con->recovery_lock);
2021
2022 return ret;
2023 }
2024
2025
2026
2027
2028
2029 int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
2030 {
2031 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2032 struct ras_err_handler_data *data;
2033 struct amdgpu_ras_eeprom_control *control;
2034 int save_count;
2035
2036 if (!con || !con->eh_data)
2037 return 0;
2038
2039 mutex_lock(&con->recovery_lock);
2040 control = &con->eeprom_control;
2041 data = con->eh_data;
2042 save_count = data->count - control->ras_num_recs;
2043 mutex_unlock(&con->recovery_lock);
2044
2045 if (save_count > 0) {
2046 if (amdgpu_ras_eeprom_append(control,
2047 &data->bps[control->ras_num_recs],
2048 save_count)) {
2049 dev_err(adev->dev, "Failed to save EEPROM table data!");
2050 return -EIO;
2051 }
2052
2053 dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
2054 }
2055
2056 return 0;
2057 }
2058
2059
2060
2061
2062
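/*
 * Read previously retired pages from the RAS EEPROM table and re-register
 * them as bad pages.
 */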
2063 static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
2064 {
2065 struct amdgpu_ras_eeprom_control *control =
2066 &adev->psp.ras_context.ras->eeprom_control;
2067 struct eeprom_table_record *bps;
2068 int ret;
2069
2070
2071 if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
2072 return 0;
2073
2074 bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL);
2075 if (!bps)
2076 return -ENOMEM;
2077
2078 ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs);
2079 if (ret)
2080 dev_err(adev->dev, "Failed to load EEPROM table records!");
2081 else
2082 ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs);
2083
2084 kfree(bps);
2085 return ret;
2086 }
2087
2088 static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
2089 uint64_t addr)
2090 {
2091 struct ras_err_handler_data *data = con->eh_data;
2092 int i;
2093
2094 addr >>= AMDGPU_GPU_PAGE_SHIFT;
2095 for (i = 0; i < data->count; i++)
2096 if (addr == data->bps[i].retired_page)
2097 return true;
2098
2099 return false;
2100 }
2101
2102
2103
2104
2105
2106
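/*
 * Check whether the page containing @addr has already been retired; takes
 * the recovery lock around the lookup.
 */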
2107 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
2108 uint64_t addr)
2109 {
2110 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2111 bool ret = false;
2112
2113 if (!con || !con->eh_data)
2114 return ret;
2115
2116 mutex_lock(&con->recovery_lock);
2117 ret = amdgpu_ras_check_bad_page_unlock(con, addr);
2118 mutex_unlock(&con->recovery_lock);
2119 return ret;
2120 }
2121
2122 static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
2123 uint32_t max_count)
2124 {
2125 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2126
	/*
	 * Choose the bad page count threshold:
	 *
	 * amdgpu_bad_page_threshold < 0 (default): derive it from the VRAM
	 * size, allowing one retired page per RAS_BAD_PAGE_COVER (100MB),
	 * capped by the EEPROM record capacity passed in as max_count.
	 *
	 * amdgpu_bad_page_threshold >= 0: use the module parameter directly,
	 * also capped by max_count (0 disables bad page retirement).
	 */
2146 if (amdgpu_bad_page_threshold < 0) {
2147 u64 val = adev->gmc.mc_vram_size;
2148
2149 do_div(val, RAS_BAD_PAGE_COVER);
2150 con->bad_page_cnt_threshold = min(lower_32_bits(val),
2151 max_count);
2152 } else {
2153 con->bad_page_cnt_threshold = min_t(int, max_count,
2154 amdgpu_bad_page_threshold);
2155 }
2156 }
2157
2158 int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
2159 {
2160 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2161 struct ras_err_handler_data **data;
2162 u32 max_eeprom_records_count = 0;
2163 bool exc_err_limit = false;
2164 int ret;
2165
2166 if (!con || amdgpu_sriov_vf(adev))
2167 return 0;
2168
2169
2170
2171
2172
2173
2174 con->adev = adev;
2175
2176 if (!adev->ras_enabled)
2177 return 0;
2178
2179 data = &con->eh_data;
2180 *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
2181 if (!*data) {
2182 ret = -ENOMEM;
2183 goto out;
2184 }
2185
2186 mutex_init(&con->recovery_lock);
2187 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
2188 atomic_set(&con->in_recovery, 0);
2189 con->eeprom_control.bad_channel_bitmap = 0;
2190
2191 max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
2192 amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
2193
2194
2195
2196
2197
2198 if (adev->gmc.xgmi.pending_reset)
2199 return 0;
2200 ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
2201
2202
2203
2204
2205 if (exc_err_limit || ret)
2206 goto free;
2207
2208 if (con->eeprom_control.ras_num_recs) {
2209 ret = amdgpu_ras_load_bad_pages(adev);
2210 if (ret)
2211 goto free;
2212
2213 amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
2214
2215 if (con->update_channel_flag == true) {
2216 amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
2217 con->update_channel_flag = false;
2218 }
2219 }
2220
2221 #ifdef CONFIG_X86_MCE_AMD
2222 if ((adev->asic_type == CHIP_ALDEBARAN) &&
2223 (adev->gmc.xgmi.connected_to_cpu))
2224 amdgpu_register_bad_pages_mca_notifier(adev);
2225 #endif
2226 return 0;
2227
2228 free:
2229 kfree((*data)->bps);
2230 kfree(*data);
2231 con->eh_data = NULL;
2232 out:
2233 dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret);
2234
2235
2236
2237
2238
2239 if (!exc_err_limit)
2240 ret = 0;
2241 else
2242 ret = -EINVAL;
2243
2244 return ret;
2245 }
2246
2247 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
2248 {
2249 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2250 struct ras_err_handler_data *data = con->eh_data;
2251
2252
2253 if (!data)
2254 return 0;
2255
2256 cancel_work_sync(&con->recovery_work);
2257
2258 mutex_lock(&con->recovery_lock);
2259 con->eh_data = NULL;
2260 kfree(data->bps);
2261 kfree(data);
2262 mutex_unlock(&con->recovery_lock);
2263
2264 return 0;
2265 }
2266
2267
2268 static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
2269 {
2270 return adev->asic_type == CHIP_VEGA10 ||
2271 adev->asic_type == CHIP_VEGA20 ||
2272 adev->asic_type == CHIP_ARCTURUS ||
2273 adev->asic_type == CHIP_ALDEBARAN ||
2274 adev->asic_type == CHIP_SIENNA_CICHLID;
2275 }
2276
2277
2278
2279
2280
2281
2282 static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
2283 {
2284 struct atom_context *ctx = adev->mode_info.atom_context;
2285
2286 if (!ctx)
2287 return;
2288
2289 if (strnstr(ctx->vbios_version, "D16406",
2290 sizeof(ctx->vbios_version)) ||
2291 strnstr(ctx->vbios_version, "D36002",
2292 sizeof(ctx->vbios_version)))
2293 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
2294 }
2295
/*
 * Check the hardware's RAS capability and save it in adev->ras_hw_enabled:
 * query the ATOM firmware for MEM/SRAM ECC support, or assume a fixed set
 * of blocks when the GPU is connected to the CPU. The software mask
 * adev->ras_enabled is then derived from the amdgpu_ras_enable and
 * amdgpu_ras_mask module parameters.
 */
2305 static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
2306 {
2307 adev->ras_hw_enabled = adev->ras_enabled = 0;
2308
2309 if (!adev->is_atom_fw ||
2310 !amdgpu_ras_asic_supported(adev))
2311 return;
2312
2313
2314 if (amdgpu_sriov_vf(adev) &&
2315 adev->ip_versions[MP1_HWIP][0] != IP_VERSION(13, 0, 2))
2316 return;
2317
2318 if (!adev->gmc.xgmi.connected_to_cpu) {
2319 if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
2320 dev_info(adev->dev, "MEM ECC is active.\n");
2321 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
2322 1 << AMDGPU_RAS_BLOCK__DF);
2323 } else {
			dev_info(adev->dev, "MEM ECC is not present.\n");
2325 }
2326
2327 if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
2328 dev_info(adev->dev, "SRAM ECC is active.\n");
2329 if (!amdgpu_sriov_vf(adev)) {
2330 adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
2331 1 << AMDGPU_RAS_BLOCK__DF);
2332
2333 if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0))
2334 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
2335 1 << AMDGPU_RAS_BLOCK__JPEG);
2336 else
2337 adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
2338 1 << AMDGPU_RAS_BLOCK__JPEG);
2339 } else {
2340 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
2341 1 << AMDGPU_RAS_BLOCK__SDMA |
2342 1 << AMDGPU_RAS_BLOCK__GFX);
2343 }
2344 } else {
2345 dev_info(adev->dev, "SRAM ECC is not present.\n");
2346 }
2347 } else {
2348 /* when the GPU is connected to the CPU, the driver only manages RAS
2349  * for a small set of IP blocks */
2350 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX |
2351 1 << AMDGPU_RAS_BLOCK__SDMA |
2352 1 << AMDGPU_RAS_BLOCK__MMHUB);
2353 }
2354
2355 amdgpu_ras_get_quirks(adev);
2356
2357 /* keep ras_hw_enabled within the supported RAS block mask */
2358 adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;
2359
2360 adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
2361 adev->ras_hw_enabled & amdgpu_ras_mask;
2362 }
2363
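/*
 * Delayed work that refreshes the cached CE/UE counters; the device is
 * woken through runtime PM before querying and released afterwards.
 */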
2364 static void amdgpu_ras_counte_dw(struct work_struct *work)
2365 {
2366 struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
2367 ras_counte_delay_work.work);
2368 struct amdgpu_device *adev = con->adev;
2369 struct drm_device *dev = adev_to_drm(adev);
2370 unsigned long ce_count, ue_count;
2371 int res;
2372
2373 res = pm_runtime_get_sync(dev->dev);
2374 if (res < 0)
2375 goto Out;
2376
2377
2378 /* refresh the cached correctable/uncorrectable error counts */
2379 if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
2380 atomic_set(&con->ras_ce_count, ce_count);
2381 atomic_set(&con->ras_ue_count, ue_count);
2382 }
2383
2384 pm_runtime_mark_last_busy(dev->dev);
2385 Out:
2386 pm_runtime_put_autosuspend(dev->dev);
2387 }
2388
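/*
 * amdgpu_ras_init - early RAS initialization
 *
 * Allocates the per-device RAS context, probes the hardware RAS
 * capability, sets up the NBIO RAS interrupts, determines poison-mode
 * support and creates the RAS fs nodes.
 */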
2389 int amdgpu_ras_init(struct amdgpu_device *adev)
2390 {
2391 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2392 int r;
2393 bool df_poison, umc_poison;
2394
2395 if (con)
2396 return 0;
2397
2398 con = kmalloc(sizeof(struct amdgpu_ras) +
2399 sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +
2400 sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT,
2401 GFP_KERNEL|__GFP_ZERO);
2402 if (!con)
2403 return -ENOMEM;
2404
2405 con->adev = adev;
2406 INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
2407 atomic_set(&con->ras_ce_count, 0);
2408 atomic_set(&con->ras_ue_count, 0);
2409
2410 con->objs = (struct ras_manager *)(con + 1);
2411
2412 amdgpu_ras_set_context(adev, con);
2413
2414 amdgpu_ras_check_supported(adev);
2415
2416 if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) {
2417
2418 /* VEGA20 Gaming parts keep the GFX block RAS feature set so a RAS
2419  * disable command can be sent to the RAS TA during late init. */
2420 if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) {
2421 con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);
2422
2423 return 0;
2424 }
2425
2426 r = 0;
2427 goto release_con;
2428 }
2429
2430 con->update_channel_flag = false;
2431 con->features = 0;
2432 INIT_LIST_HEAD(&con->head);
2433
2434 con->flags = RAS_DEFAULT_FLAGS;
2435
2436
2437 /* Initialize the NBIO RAS object ahead of the other RAS blocks so its
2438  * fatal-error interrupts can be enabled as early as possible. */
2439 switch (adev->asic_type) {
2440 case CHIP_VEGA20:
2441 case CHIP_ARCTURUS:
2442 case CHIP_ALDEBARAN:
2443 if (!adev->gmc.xgmi.connected_to_cpu) {
2444 adev->nbio.ras = &nbio_v7_4_ras;
2445 amdgpu_ras_register_ras_block(adev, &adev->nbio.ras->ras_block);
2446 adev->nbio.ras_if = &adev->nbio.ras->ras_block.ras_comm;
2447 }
2448 break;
2449 default:
2450 /* nbio ras is not available on other ASICs */
2451 break;
2452 }
2453
2454 if (adev->nbio.ras &&
2455 adev->nbio.ras->init_ras_controller_interrupt) {
2456 r = adev->nbio.ras->init_ras_controller_interrupt(adev);
2457 if (r)
2458 goto release_con;
2459 }
2460
2461 if (adev->nbio.ras &&
2462 adev->nbio.ras->init_ras_err_event_athub_interrupt) {
2463 r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev);
2464 if (r)
2465 goto release_con;
2466 }
2467
2468
2469 if (adev->gmc.xgmi.connected_to_cpu) {
2470 /* poison mode is enabled by default when the GPU is connected to the CPU */
2471 con->poison_supported = true;
2472 }
2473 else if (adev->df.funcs &&
2474 adev->df.funcs->query_ras_poison_mode &&
2475 adev->umc.ras &&
2476 adev->umc.ras->query_ras_poison_mode) {
2477 df_poison =
2478 adev->df.funcs->query_ras_poison_mode(adev);
2479 umc_poison =
2480 adev->umc.ras->query_ras_poison_mode(adev);
2481
2482 if (df_poison && umc_poison)
2483 con->poison_supported = true;
2484 else if (df_poison != umc_poison)
2485 dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
2486 df_poison, umc_poison);
2487 }
2488
2489 if (amdgpu_ras_fs_init(adev)) {
2490 r = -EINVAL;
2491 goto release_con;
2492 }
2493
2494 dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
2495 "hardware ability[%x] ras_mask[%x]\n",
2496 adev->ras_hw_enabled, adev->ras_enabled);
2497
2498 return 0;
2499 release_con:
2500 amdgpu_ras_set_context(adev, NULL);
2501 kfree(con);
2502
2503 return r;
2504 }
2505
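/* Whether EDC error counts persist across reset and need to be harvested;
 * currently only the CPU-connected (XGMI) case reports support.
 */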
2506 int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
2507 {
2508 if (adev->gmc.xgmi.connected_to_cpu)
2509 return 1;
2510 return 0;
2511 }
2512
2513 static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
2514 struct ras_common_if *ras_block)
2515 {
2516 struct ras_query_if info = {
2517 .head = *ras_block,
2518 };
2519
2520 if (!amdgpu_persistent_edc_harvesting_supported(adev))
2521 return 0;
2522
2523 if (amdgpu_ras_query_error_status(adev, &info) != 0)
2524 DRM_WARN("RAS init harvest failure");
2525
2526 if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
2527 DRM_WARN("RAS init harvest reset failure");
2528
2529 return 0;
2530 }
2531
2532 bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev)
2533 {
2534 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2535
2536 if (!con)
2537 return false;
2538
2539 return con->poison_supported;
2540 }
2541
2542
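/*
 * amdgpu_ras_block_late_init - common late init for a single IP RAS block
 *
 * Enables the RAS feature for the block, harvests persistent EDC errors,
 * installs the interrupt handler when the block has a ras_cb or poison
 * handling, and creates the per-block sysfs node.  Blocks without their
 * own ras_late_init hook reach this through
 * amdgpu_ras_block_late_init_default().
 */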
2543 int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
2544 struct ras_common_if *ras_block)
2545 {
2546 struct amdgpu_ras_block_object *ras_obj = NULL;
2547 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2548 unsigned long ue_count, ce_count;
2549 int r;
2550
2551 /* disable the RAS feature for this IP block if it is not supported */
2552 if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
2553 amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
2554 return 0;
2555 }
2556
2557 r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
2558 if (r) {
2559 if (adev->in_suspend || amdgpu_in_reset(adev)) {
2560
2561 /* when resuming or in reset, fall through to cleanup and disable the feature instead of failing */
2562 goto cleanup;
2563 } else
2564 return r;
2565 }
2566
2567 /* query and clear any persistent EDC errors recorded across reset */
2568 amdgpu_persistent_edc_harvesting(adev, ras_block);
2569
2570 /* in the resume path the sysfs node and interrupt handler already exist */
2571 if (adev->in_suspend || amdgpu_in_reset(adev))
2572 return 0;
2573
2574 ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
2575 if (ras_obj->ras_cb || (ras_obj->hw_ops &&
2576 (ras_obj->hw_ops->query_poison_status ||
2577 ras_obj->hw_ops->handle_poison_consumption))) {
2578 r = amdgpu_ras_interrupt_add_handler(adev, ras_block);
2579 if (r)
2580 goto cleanup;
2581 }
2582
2583 r = amdgpu_ras_sysfs_create(adev, ras_block);
2584 if (r)
2585 goto interrupt;
2586
2587
2588 /* cache the error counts present right after init */
2589 if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
2590 atomic_set(&con->ras_ce_count, ce_count);
2591 atomic_set(&con->ras_ue_count, ue_count);
2592 }
2593
2594 return 0;
2595
2596 interrupt:
2597 if (ras_obj->ras_cb)
2598 amdgpu_ras_interrupt_remove_handler(adev, ras_block);
2599 cleanup:
2600 amdgpu_ras_feature_enable(adev, ras_block, 0);
2601 return r;
2602 }
2603
2604 static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
2605 struct ras_common_if *ras_block)
2606 {
2607 return amdgpu_ras_block_late_init(adev, ras_block);
2608 }
2609
2610
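/* Undo amdgpu_ras_block_late_init(): remove the sysfs node and, if the
 * block registered a ras_cb, its interrupt handler.
 */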
2611 void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
2612 struct ras_common_if *ras_block)
2613 {
2614 struct amdgpu_ras_block_object *ras_obj;
2615 if (!ras_block)
2616 return;
2617
2618 amdgpu_ras_sysfs_remove(adev, ras_block);
2619
2620 ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
2621 if (ras_obj->ras_cb)
2622 amdgpu_ras_interrupt_remove_handler(adev, ras_block);
2623 }
2624
2625 static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
2626 struct ras_common_if *ras_block)
2627 {
2628 return amdgpu_ras_block_late_fini(adev, ras_block);
2629 }
2630
2631
2632 /* Runs after IP late init as a dependency; called on resume, GPU reset
2633  * and boot-up. */
2634 void amdgpu_ras_resume(struct amdgpu_device *adev)
2635 {
2636 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2637 struct ras_manager *obj, *tmp;
2638
2639 if (!adev->ras_enabled || !con) {
2640 /* RAS is not enabled: release any context created for VEGA20 Gaming */
2641 amdgpu_release_ras_context(adev);
2642
2643 return;
2644 }
2645
2646 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
2647
2648 /*
2649  * Enable RAS on every block reported by the VBIOS; blocks that are not
2650  * actually supported are disabled again in the loop below.
2651  */
2652 amdgpu_ras_enable_all_features(adev, 1);
2653
2654
2655 /* Boot parameters or unimplemented IPs may exclude some of the blocks
2656  * enabled above; disable those on their behalf here.
2657  */
2658 list_for_each_entry_safe(obj, tmp, &con->head, node) {
2659 if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
2660 amdgpu_ras_feature_enable(adev, &obj->head, 0);
2661
2662 WARN_ON(alive_obj(obj));
2663 }
2664 }
2665 }
2666 }
2667
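/* Disable all RAS features before entering suspend */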
2668 void amdgpu_ras_suspend(struct amdgpu_device *adev)
2669 {
2670 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2671
2672 if (!adev->ras_enabled || !con)
2673 return;
2674
2675 amdgpu_ras_disable_all_features(adev, 0);
2676
2677 if (con->features)
2678 amdgpu_ras_disable_all_features(adev, 1);
2679 }
2680
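/*
 * amdgpu_ras_late_init - run late init for every registered RAS block
 *
 * Walks adev->ras_list and calls each block's ras_late_init hook, falling
 * back to the default late-init path when a block does not provide one.
 */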
2681 int amdgpu_ras_late_init(struct amdgpu_device *adev)
2682 {
2683 struct amdgpu_ras_block_list *node, *tmp;
2684 struct amdgpu_ras_block_object *obj;
2685 int r;
2686
2687 /* the SRIOV guest side does not need to run RAS late init */
2688 if (amdgpu_sriov_vf(adev))
2689 return 0;
2690
2691 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
2692 if (!node->ras_obj) {
2693 dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
2694 continue;
2695 }
2696
2697 obj = node->ras_obj;
2698 if (obj->ras_late_init) {
2699 r = obj->ras_late_init(adev, &obj->ras_comm);
2700 if (r) {
2701 dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n",
2702 obj->ras_comm.name, r);
2703 return r;
2704 }
2705 } else
2706 amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
2707 }
2708
2709 return 0;
2710 }
2711
2712
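/* Called before IP hw/sw fini: disable all RAS features and tear down
 * bad-page recovery.
 */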
2713 int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
2714 {
2715 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2716
2717 if (!adev->ras_enabled || !con)
2718 return 0;
2719
2720
2721
2722 amdgpu_ras_disable_all_features(adev, 0);
2723 amdgpu_ras_recovery_fini(adev);
2724 return 0;
2725 }
2726
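/* Final RAS teardown: run each block's ras_fini (or the default late
 * fini), free the ras_list nodes, remove the fs nodes and interrupt
 * handlers, and release the RAS context.
 */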
2727 int amdgpu_ras_fini(struct amdgpu_device *adev)
2728 {
2729 struct amdgpu_ras_block_list *ras_node, *tmp;
2730 struct amdgpu_ras_block_object *obj = NULL;
2731 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2732
2733 if (!adev->ras_enabled || !con)
2734 return 0;
2735
2736 list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
2737 if (ras_node->ras_obj) {
2738 obj = ras_node->ras_obj;
2739 if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
2740 obj->ras_fini)
2741 obj->ras_fini(adev, &obj->ras_comm);
2742 else
2743 amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
2744 }
2745
2746 /* remove the block from ras_list and free the list node */
2747 list_del(&ras_node->node);
2748 kfree(ras_node);
2749 }
2750
2751 amdgpu_ras_fs_fini(adev);
2752 amdgpu_ras_interrupt_remove_all(adev);
2753
2754 WARN(con->features, "Feature mask is not cleared");
2755
2756 if (con->features)
2757 amdgpu_ras_disable_all_features(adev, 1);
2758
2759 cancel_delayed_work_sync(&con->ras_counte_delay_work);
2760
2761 amdgpu_ras_set_context(adev, NULL);
2762 kfree(con);
2763
2764 return 0;
2765 }
2766
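/* Global RAS fatal-error entry point: on the first ERREVENT_ATHUB
 * interrupt, schedule a GPU reset to recover.
 */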
2767 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
2768 {
2769 amdgpu_ras_check_supported(adev);
2770 if (!adev->ras_hw_enabled)
2771 return;
2772
2773 if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
2774 dev_info(adev->dev, "uncorrectable hardware error "
2775 "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
2776
2777 amdgpu_ras_reset_gpu(adev);
2778 }
2779 }
2780
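/* On Vega20 with older SMU firmware, a triggered RAS fatal error can only
 * be handled by an emergency restart when BACO reset is not available.
 */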
2781 bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
2782 {
2783 if (adev->asic_type == CHIP_VEGA20 &&
2784 adev->pm.fw_version <= 0x283400) {
2785 return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) &&
2786 amdgpu_ras_intr_triggered();
2787 }
2788
2789 return false;
2790 }
2791
2792 void amdgpu_release_ras_context(struct amdgpu_device *adev)
2793 {
2794 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2795
2796 if (!con)
2797 return;
2798
2799 if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) {
2800 con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX);
2801 amdgpu_ras_set_context(adev, NULL);
2802 kfree(con);
2803 }
2804 }
2805
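/*
 * On AMD x86 hosts, the UMCs of CPU-connected GPUs report uncorrectable
 * DRAM errors through the MCA/MCE machinery; the notifier below turns
 * those MCEs into retired (bad) pages on the affected GPU.
 */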
2806 #ifdef CONFIG_X86_MCE_AMD
2807 static struct amdgpu_device *find_adev(uint32_t node_id)
2808 {
2809 int i;
2810 struct amdgpu_device *adev = NULL;
2811
2812 for (i = 0; i < mce_adev_list.num_gpu; i++) {
2813 adev = mce_adev_list.devs[i];
2814
2815 if (adev && adev->gmc.xgmi.connected_to_cpu &&
2816 adev->gmc.xgmi.physical_node_id == node_id)
2817 break;
2818 adev = NULL;
2819 }
2820
2821 return adev;
2822 }
2823
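/* Field extraction from the MCA_IPID register of a GPU UMC bank: bits
 * [47:44] carry the (offset) GPU id, bits [23:21] the UMC instance, and
 * the channel index is assembled from bits [13:12] and bit 20.
 */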
2824 #define GET_MCA_IPID_GPUID(m) (((m) >> 44) & 0xF)
2825 #define GET_UMC_INST(m) (((m) >> 21) & 0x7)
2826 #define GET_CHAN_INDEX(m) ((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
2827 #define GPU_ID_OFFSET 8
2828
2829 static int amdgpu_bad_page_notifier(struct notifier_block *nb,
2830 unsigned long val, void *data)
2831 {
2832 struct mce *m = (struct mce *)data;
2833 struct amdgpu_device *adev = NULL;
2834 uint32_t gpu_id = 0;
2835 uint32_t umc_inst = 0;
2836 uint32_t ch_inst, channel_index = 0;
2837 struct ras_err_data err_data = {0, 0, 0, NULL};
2838 struct eeprom_table_record err_rec;
2839 uint64_t retired_page;
2840
2841
2842 /*
2843  * Only handle errors reported by a GPU UMC bank (SMCA_UMC_V2) whose
2844  * extended error code (DramECC) is 0; ignore everything else.
2845  */
2846 if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) &&
2847 (XEC(m->status, 0x3f) == 0x0)))
2848 return NOTIFY_DONE;
2849
2850
2851 /* correctable errors need no bad-page retirement */
2852
2853 if (mce_is_correctable(m))
2854 return NOTIFY_OK;
2855
2856
2857 /* the GPU id reported in MCA_IPID is offset by GPU_ID_OFFSET */
2858
2859 gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET;
2860
2861 adev = find_adev(gpu_id);
2862 if (!adev) {
2863 DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__,
2864 gpu_id);
2865 return NOTIFY_DONE;
2866 }
2867
2868
2869 /* Extract the UMC instance and channel index that were encoded in the
2870  * MCA_IPID register.
2871  */
2872 umc_inst = GET_UMC_INST(m->ipid);
2873 ch_inst = GET_CHAN_INDEX(m->ipid);
2874
2875 dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
2876 umc_inst, ch_inst);
2877
2878
2879 /* translate the UMC channel address into the retired physical page address */
2880
2881 channel_index =
2882 adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num
2883 + ch_inst];
2884
2885 retired_page = ADDR_OF_8KB_BLOCK(m->addr) |
2886 ADDR_OF_256B_BLOCK(channel_index) |
2887 OFFSET_IN_256B_BLOCK(m->addr);
2888
2889 memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
2890 err_data.err_addr = &err_rec;
2891 amdgpu_umc_fill_error_record(&err_data, m->addr,
2892 retired_page, channel_index, umc_inst);
2893
2894 if (amdgpu_bad_page_threshold != 0) {
2895 amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
2896 err_data.err_addr_cnt);
2897 amdgpu_ras_save_bad_pages(adev);
2898 }
2899
2900 return NOTIFY_OK;
2901 }
2902
2903 static struct notifier_block amdgpu_bad_page_nb = {
2904 .notifier_call = amdgpu_bad_page_notifier,
2905 .priority = MCE_PRIO_UC,
2906 };
2907
2908 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
2909 {
2910
2911 /*
2912  * Record this device in mce_adev_list so the MCE decoder callback can
2913  * map the GPU id found in an MCE back to the owning amdgpu device via
2914  * find_adev().
2915  */
2916
2917
2918 mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;
2919
2920
2921 /* register the MCE decode-chain notifier only once, shared by all GPUs */
2922
2923
2924 if (!notifier_registered) {
2925 mce_register_decode_chain(&amdgpu_bad_page_nb);
2926 notifier_registered = true;
2927 }
2928 }
2929 #endif
2930
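/* Accessors for the per-device RAS context stored in the PSP RAS context */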
2931 struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev)
2932 {
2933 if (!adev)
2934 return NULL;
2935
2936 return adev->psp.ras_context.ras;
2937 }
2938
2939 int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con)
2940 {
2941 if (!adev)
2942 return -EINVAL;
2943
2944 adev->psp.ras_context.ras = ras_con;
2945 return 0;
2946 }
2947
2948 /* check whether RAS is enabled for a given block (e.g. sdma, gfx) on this device */
2949 int amdgpu_ras_is_supported(struct amdgpu_device *adev,
2950 unsigned int block)
2951 {
2952 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
2953
2954 if (block >= AMDGPU_RAS_BLOCK_COUNT)
2955 return 0;
2956 return ras && (adev->ras_enabled & (1 << block));
2957 }
2958
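/* Schedule RAS error recovery (a full GPU reset) unless one is already in
 * flight.
 */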
2959 int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
2960 {
2961 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
2962
2963 if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
2964 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
2965 return 0;
2966 }
2967
2968
2969
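/*
 * amdgpu_ras_register_ras_block - add an IP block's RAS object to the
 * device-wide ras_list walked by amdgpu_ras_late_init()/amdgpu_ras_fini().
 *
 * Illustrative usage, as done for NBIO in amdgpu_ras_init() above:
 *
 *	adev->nbio.ras = &nbio_v7_4_ras;
 *	amdgpu_ras_register_ras_block(adev, &adev->nbio.ras->ras_block);
 */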
2970 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
2971 struct amdgpu_ras_block_object *ras_block_obj)
2972 {
2973 struct amdgpu_ras_block_list *ras_node;
2974 if (!adev || !ras_block_obj)
2975 return -EINVAL;
2976
2977 if (!amdgpu_ras_asic_supported(adev))
2978 return 0;
2979
2980 ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
2981 if (!ras_node)
2982 return -ENOMEM;
2983
2984 INIT_LIST_HEAD(&ras_node->node);
2985 ras_node->ras_obj = ras_block_obj;
2986 list_add_tail(&ras_node->node, &adev->ras_list);
2987
2988 return 0;
2989 }