#include "amdgpu.h"

static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry,
		bool reset)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ret = 0;

	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
	ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
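	/* fall back to querying the umc hardware directly when the dpm
	 * ecc info interface is not supported
	 */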
	if (ret == -EOPNOTSUPP) {
		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
		    adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
			adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status);

		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
		    adev->umc.ras->ras_block.hw_ops->query_ras_error_address &&
		    adev->umc.max_ras_err_cnt_per_query) {
			err_data->err_addr =
				kcalloc(adev->umc.max_ras_err_cnt_per_query,
					sizeof(struct eeprom_table_record), GFP_KERNEL);

			/* still call query_ras_error_address to clear error status
			 * even if the allocation fails
			 */
			if (!err_data->err_addr)
				dev_warn(adev->dev, "Failed to alloc memory for umc error address record!\n");

			/* umc query_ras_error_address is also responsible for clearing
			 * error status
			 */
			adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, ras_error_status);
		}
	} else if (!ret) {
		if (adev->umc.ras &&
		    adev->umc.ras->ecc_info_query_ras_error_count)
			adev->umc.ras->ecc_info_query_ras_error_count(adev, ras_error_status);

		if (adev->umc.ras &&
		    adev->umc.ras->ecc_info_query_ras_error_address &&
		    adev->umc.max_ras_err_cnt_per_query) {
			err_data->err_addr =
				kcalloc(adev->umc.max_ras_err_cnt_per_query,
					sizeof(struct eeprom_table_record), GFP_KERNEL);

			/* still call ecc_info_query_ras_error_address to clear error
			 * status even if the allocation fails
			 */
			if (!err_data->err_addr)
				dev_warn(adev->dev, "Failed to alloc memory for umc error address record!\n");

			/* the address query is also responsible for clearing error status */
			adev->umc.ras->ecc_info_query_ras_error_address(adev, ras_error_status);
		}
	}
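
	/* only uncorrectable errors lead to bad page retirement and, when
	 * requested, a gpu reset
	 */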
	if (err_data->ue_count) {
		dev_info(adev->dev, "%ld uncorrectable hardware errors detected in UMC block\n",
			 err_data->ue_count);

		if ((amdgpu_bad_page_threshold != 0) &&
		    err_data->err_addr_cnt) {
			amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
						 err_data->err_addr_cnt);
			amdgpu_ras_save_bad_pages(adev);

			amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);

			if (con->update_channel_flag) {
				amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
				con->update_channel_flag = false;
			}
		}

		if (reset)
			amdgpu_ras_reset_gpu(adev);
	}

	/* kfree(NULL) is a no-op when no address buffer was allocated */
	kfree(err_data->err_addr);
	return AMDGPU_RAS_SUCCESS;
}

int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
		void *ras_error_status,
		bool reset)
{
	int ret;
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	struct ras_common_if head = {
		.block = AMDGPU_RAS_BLOCK__UMC,
	};
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);

	ret = amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);

	/* fold the freshly queried counts into the ras manager statistics
	 * for the umc block
	 */
	if (ret == AMDGPU_RAS_SUCCESS && obj) {
		obj->err_data.ue_count += err_data->ue_count;
		obj->err_data.ce_count += err_data->ce_count;
	}

	return ret;
}

int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry)
{
	return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
}

int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
	int r;

	r = amdgpu_ras_block_late_init(adev, ras_block);
	if (r)
		return r;

	if (amdgpu_ras_is_supported(adev, ras_block->block)) {
		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
		if (r)
			goto late_fini;
	}

	/* ras init of specific umc version */
	if (adev->umc.ras &&
	    adev->umc.ras->err_cnt_init)
		adev->umc.ras->err_cnt_init(adev);

	return 0;

late_fini:
	amdgpu_ras_block_late_fini(adev, ras_block);
	return r;
}

int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
		struct amdgpu_irq_src *source,
		struct amdgpu_iv_entry *entry)
{
	struct ras_common_if *ras_if = adev->umc.ras_if;
	struct ras_dispatch_if ih_data = {
		.entry = entry,
	};

	if (!ras_if)
		return 0;

	ih_data.head = *ras_if;

	amdgpu_ras_interrupt_dispatch(adev, &ih_data);
	return 0;
}

void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
		uint64_t err_addr,
		uint64_t retired_page,
		uint32_t channel_index,
		uint32_t umc_inst)
{
	struct eeprom_table_record *err_rec =
		&err_data->err_addr[err_data->err_addr_cnt];

	err_rec->address = err_addr;
	/* page frame address is saved */
	err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
	err_rec->ts = (uint64_t)ktime_get_real_seconds();
	err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
	err_rec->cu = 0;
	err_rec->mem_channel = channel_index;
	err_rec->mcumc_id = umc_inst;

	err_data->err_addr_cnt++;
}