/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "umc_v8_7.h"
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"
#include "amdgpu.h"

#include "rsmu/rsmu_0_0_2_offset.h"
#include "rsmu/rsmu_0_0_2_sh_mask.h"
#include "umc/umc_8_7_0_offset.h"
#include "umc/umc_8_7_0_sh_mask.h"

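/* register address stride between two UMC instances */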
#define UMC_8_INST_DIST	0x40000

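/* map (umc instance, channel instance) to the physical channel index */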
const uint32_t
	umc_v8_7_channel_idx_tbl[UMC_V8_7_UMC_INSTANCE_NUM][UMC_V8_7_CHANNEL_INSTANCE_NUM] = {
		{2, 11},  {4, 13},
		{1, 8},   {7, 14},
		{10, 3},  {12, 5},
		{9, 0},   {15, 6}
};

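/* compute the register offset of a given channel within a given UMC instance */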
static inline uint32_t get_umc_v8_7_reg_offset(struct amdgpu_device *adev,
					       uint32_t umc_inst,
					       uint32_t ch_inst)
{
	return adev->umc.channel_offs * ch_inst + UMC_8_INST_DIST * umc_inst;
}

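/*
 * The ecc_info_* helpers below operate on the ECC status table cached in
 * the amdgpu_ras context (ras->umc_ecc) instead of reading the UMC
 * registers directly.
 */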
static void umc_v8_7_ecc_info_query_correctable_error_count(struct amdgpu_device *adev,
					uint32_t umc_inst, uint32_t ch_inst,
					unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t eccinfo_table_idx;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;

	/* check for an SRAM correctable error
	 * MCUMC_STATUS is a 64 bit register
	 */
	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}

static void umc_v8_7_ecc_info_query_uncorrectable_error_count(struct amdgpu_device *adev,
					uint32_t umc_inst, uint32_t ch_inst,
					unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t eccinfo_table_idx;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;

	/* check MCUMC_STATUS for an uncorrectable or deferred error */
	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
		*error_count += 1;
}

static void umc_v8_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
					void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;

	/* accumulate correctable and uncorrectable error counts
	 * across all UMC instances and channels
	 */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_v8_7_ecc_info_query_correctable_error_count(adev,
						umc_inst, ch_inst,
						&(err_data->ce_count));
		umc_v8_7_ecc_info_query_uncorrectable_error_count(adev,
						umc_inst, ch_inst,
						&(err_data->ue_count));
	}
}

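/*
 * Look up the error address of one channel in the cached ECC table and,
 * for uncorrectable errors, record the retired page.
 */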
static void umc_v8_7_ecc_info_query_error_address(struct amdgpu_device *adev,
					struct ras_err_data *err_data,
					uint32_t ch_inst,
					uint32_t umc_inst)
{
	uint64_t mc_umc_status, err_addr, retired_page;
	uint32_t channel_index;
	uint32_t eccinfo_table_idx;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
	channel_index =
		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;

	if (mc_umc_status == 0)
		return;

	if (!err_data->err_addr)
		return;

	/* calculate the error address if a ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr;
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

		/* translate the umc channel address to a soc physical address,
		 * which is composed of three parts
		 */
		retired_page = ADDR_OF_4KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* only ue error information is saved currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1)
			amdgpu_umc_fill_error_record(err_data, err_addr,
					retired_page, channel_index, umc_inst);
	}
}

static void umc_v8_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
					void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;

	/* query the error address on every UMC instance and channel */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_v8_7_ecc_info_query_error_address(adev,
						err_data,
						ch_inst,
						umc_inst);
	}
}

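/* reset the GECC error counters of both chip selects on one channel */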
static void umc_v8_7_clear_error_count_per_channel(struct amdgpu_device *adev,
					uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_addr;
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;

	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);

	/* select the lower chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
					umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear the lower chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V8_7_CE_CNT_INIT);

	/* select the higher chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
					umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear the higher chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V8_7_CE_CNT_INIT);
}

static void umc_v8_7_clear_error_count(struct amdgpu_device *adev)
{
	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v8_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);

		umc_v8_7_clear_error_count_per_channel(adev,
						       umc_reg_offset);
	}
}

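/*
 * Read the GECC error counters of both chip selects and check the MCA
 * status register for an SRAM correctable error.
 */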
static void umc_v8_7_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset,
						   unsigned long *error_count)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	/* UMC 8_7 registers */
	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);
	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
		 UMC_V8_7_CE_CNT_INIT);

	/* select the higher chip and check the error count */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
		 UMC_V8_7_CE_CNT_INIT);

	/* check for an SRAM correctable error
	 * MCUMC_STATUS is a 64 bit register
	 */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 6 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}

static void umc_v8_7_query_uncorrectable_error_count(struct amdgpu_device *adev,
						     uint32_t umc_reg_offset,
						     unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);

	/* check MCUMC_STATUS for an uncorrectable or deferred error */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
		*error_count += 1;
}

static void umc_v8_7_query_ras_error_count(struct amdgpu_device *adev,
					   void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v8_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);

		umc_v8_7_query_correctable_error_count(adev,
						       umc_reg_offset,
						       &(err_data->ce_count));
		umc_v8_7_query_uncorrectable_error_count(adev,
							 umc_reg_offset,
							 &(err_data->ue_count));
	}

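	/* clear the counters so the next query starts from a clean state */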
	umc_v8_7_clear_error_count(adev);
}

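/*
 * Read the MCA address register of one channel, mask off the low bits
 * below the reported LSB, compute the retired page address and record it
 * for uncorrectable errors.
 */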
static void umc_v8_7_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t umc_reg_offset,
					 uint32_t ch_inst,
					 uint32_t umc_inst)
{
	uint32_t lsb, mc_umc_status_addr;
	uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0;
	uint32_t channel_index = adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
	mc_umc_addrt0 =
		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_ADDRT0);

	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

	if (mc_umc_status == 0)
		return;

	if (!err_data->err_addr) {
		/* clear umc status */
		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
		return;
	}

	/* calculate the error address if a ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);

		/* the lowest lsb bits should be ignored */
		lsb = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, LSB);
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
		err_addr &= ~((0x1ULL << lsb) - 1);

		/* translate the umc channel address to a soc physical address,
		 * which is composed of three parts
		 */
		retired_page = ADDR_OF_4KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* only ue error information is saved currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1)
			amdgpu_umc_fill_error_record(err_data, err_addr,
					retired_page, channel_index, umc_inst);
	}

	/* clear umc status */
	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}

static void umc_v8_7_query_ras_error_address(struct amdgpu_device *adev,
					     void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v8_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);

		umc_v8_7_query_error_address(adev,
					     err_data,
					     umc_reg_offset,
					     ch_inst,
					     umc_inst);
	}
}

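/* initialize the GECC error counters of both chip selects on one channel */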
static void umc_v8_7_err_cnt_init_per_channel(struct amdgpu_device *adev,
					      uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt_addr;

	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);

	/* select the lower chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 0);
	/* set ce error interrupt type to APIC based interrupt */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrInt, 0x1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	/* set the error count to its initial value */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_7_CE_CNT_INIT);

	/* select the higher chip and do the same */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_7_CE_CNT_INIT);
}

static void umc_v8_7_err_cnt_init(struct amdgpu_device *adev)
{
	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v8_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);

		umc_v8_7_err_cnt_init_per_channel(adev, umc_reg_offset);
	}
}

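/* hardware callbacks used by the common amdgpu RAS code to query UMC errors */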
const struct amdgpu_ras_block_hw_ops umc_v8_7_ras_hw_ops = {
	.query_ras_error_count = umc_v8_7_query_ras_error_count,
	.query_ras_error_address = umc_v8_7_query_ras_error_address,
};

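/* UMC v8.7 RAS block, including the ecc_info table based query paths */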
struct amdgpu_umc_ras umc_v8_7_ras = {
	.ras_block = {
		.hw_ops = &umc_v8_7_ras_hw_ops,
	},
	.err_cnt_init = umc_v8_7_err_cnt_init,
	.ecc_info_query_ras_error_count = umc_v8_7_ecc_info_query_ras_error_count,
	.ecc_info_query_ras_error_address = umc_v8_7_ecc_info_query_ras_error_address,
};