/*
 * Copyright 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "umc_v6_7.h"
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"
#include "amdgpu.h"

#include "umc/umc_6_7_0_offset.h"
#include "umc/umc_6_7_0_sh_mask.h"

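/* Mappings from a (umc instance, channel instance) pair to the normalized
 * channel index used for address translation.  Presumably one of the two
 * layouts is installed in adev->umc.channel_idx_tbl during UMC init; which
 * of the two applies to a given die is decided outside this file.
 */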
const uint32_t
    umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
        {28, 20, 24, 16, 12, 4, 8, 0},
        {6, 30, 2, 26, 22, 14, 18, 10},
        {19, 11, 15, 7, 3, 27, 31, 23},
        {9, 1, 5, 29, 25, 17, 21, 13}
};
const uint32_t
    umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
        {19, 11, 15, 7, 3, 27, 31, 23},
        {9, 1, 5, 29, 25, 17, 21, 13},
        {28, 20, 24, 16, 12, 4, 8, 0},
        {6, 30, 2, 26, 22, 14, 18, 10},
};

static inline uint32_t get_umc_v6_7_reg_offset(struct amdgpu_device *adev,
                          uint32_t umc_inst,
                          uint32_t ch_inst)
{
    uint32_t index = umc_inst * adev->umc.channel_inst_num + ch_inst;

    /* adjust umc and channel index offset,
     * the register address is not linear on each umc instance */
    umc_inst = index / 4;
    ch_inst = index % 4;

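    /* Worked example: with channel_inst_num == 8, umc_inst == 1 and
     * ch_inst == 2 give index == 10, which lands on register
     * instance 10 / 4 == 2, channel 10 % 4 == 2 within that instance.
     */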
    return adev->umc.channel_offs * ch_inst + UMC_V6_7_INST_DIST * umc_inst;
}

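/* Look up the normalized channel index for a (umc_inst, ch_inst) pair in
 * the flat per-die table installed in adev->umc.channel_idx_tbl.
 */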
static inline uint32_t get_umc_v6_7_channel_index(struct amdgpu_device *adev,
                          uint32_t umc_inst,
                          uint32_t ch_inst)
{
    return adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
}

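/* Dump the raw MCA STATUS/IPID/SYND/MISC0 values for one channel.  The
 * SOC15 register offsets are dword offsets, hence the "* 4" to form the
 * byte address passed to RREG64_PCIE.
 */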
static void umc_v6_7_query_error_status_helper(struct amdgpu_device *adev,
                          uint64_t mc_umc_status, uint32_t umc_reg_offset)
{
    uint32_t mc_umc_addr;
    uint64_t reg_value;

    if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1)
        dev_info(adev->dev, "Deferred error, no user action is needed.\n");

    if (mc_umc_status)
        dev_info(adev->dev, "MCA STATUS 0x%llx, umc_reg_offset 0x%x\n", mc_umc_status, umc_reg_offset);

    /* print the IPID register value */
    mc_umc_addr =
        SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_IPIDT0);
    reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
    if (reg_value)
        dev_info(adev->dev, "MCA IPID 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);

    /* print the SYND register value */
    mc_umc_addr =
        SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_SYNDT0);
    reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
    if (reg_value)
        dev_info(adev->dev, "MCA SYND 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);

    /* print the MISC0 register value */
    mc_umc_addr =
        SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_MISC0T0);
    reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
    if (reg_value)
        dev_info(adev->dev, "MCA MISC0 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);
}

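/* The ecc_info_* variants below read the cached ECC status in ras->umc_ecc
 * (one entry per channel, indexed the same way as the register path)
 * instead of touching UMC registers directly; the cache is filled in
 * outside this file.
 */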
static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device *adev,
                           uint32_t umc_inst, uint32_t ch_inst,
                           unsigned long *error_count)
{
    uint64_t mc_umc_status;
    uint32_t eccinfo_table_idx;
    uint32_t umc_reg_offset;
    struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

    umc_reg_offset = get_umc_v6_7_reg_offset(adev,
                        umc_inst, ch_inst);

    eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
    /* check for SRAM correctable error;
     * MCUMC_STATUS is a 64 bit register */
    mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
    if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
        REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) {
        *error_count += 1;

        umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);

        if (ras->umc_ecc.record_ce_addr_supported) {
            uint64_t err_addr, soc_pa;
            uint32_t channel_index =
                adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

            err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr;
            err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
            /* translate umc channel address to soc pa, 3 parts are included */
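            /* In terms of the usual amdgpu_umc.h helpers (an assumption,
             * not restated here): OFFSET_IN_256B_BLOCK() keeps bits [7:0],
             * ADDR_OF_256B_BLOCK() places the normalized channel index just
             * above them (32 channels x 256B = one 8KB stripe), and
             * ADDR_OF_8KB_BLOCK() shifts the remaining normalized-address
             * bits up past the channel bits.
             */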
            soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
                    ADDR_OF_256B_BLOCK(channel_index) |
                    OFFSET_IN_256B_BLOCK(err_addr);

            /* The umc channel bits are not original values, they are hashed */
            SET_CHANNEL_HASH(channel_index, soc_pa);

            dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
        }
    }
}

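/* An error counts as uncorrectable if the status is valid and any of
 * Deferred, UECC, PCC, UC or TCC is raised.
 */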
static void umc_v6_7_ecc_info_querry_uncorrectable_error_count(struct amdgpu_device *adev,
                              uint32_t umc_inst, uint32_t ch_inst,
                              unsigned long *error_count)
{
    uint64_t mc_umc_status;
    uint32_t eccinfo_table_idx;
    uint32_t umc_reg_offset;
    struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

    umc_reg_offset = get_umc_v6_7_reg_offset(adev,
                        umc_inst, ch_inst);

    eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
    /* check the MCUMC_STATUS */
    mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
    if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
        (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
        REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
        REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
        REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
        REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) {
        *error_count += 1;

        umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
    }
}

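/* LOOP_UMC_INST_AND_CH (defined in amdgpu_umc.h) walks every
 * (umc_inst, ch_inst) pair, i.e. adev->umc.umc_inst_num x
 * adev->umc.channel_inst_num channels in total.
 */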
static void umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
                       void *ras_error_status)
{
    struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

    uint32_t umc_inst        = 0;
    uint32_t ch_inst         = 0;

    /* TODO: driver needs to toggle DF Cstate to ensure
     * safe access of UMC registers. Will add the protection
     * when firmware interface is ready */
    LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
        umc_v6_7_ecc_info_query_correctable_error_count(adev,
                              umc_inst, ch_inst,
                              &(err_data->ce_count));
        umc_v6_7_ecc_info_querry_uncorrectable_error_count(adev,
                              umc_inst, ch_inst,
                              &(err_data->ue_count));
    }
}

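/* Page retirement below fans one bad address out to several pages: column
 * bits [C4 C3 C2] and row bit R14 are ambiguous in the normalized address,
 * so every [C4 C3 C2] combination is retired, each together with its
 * R14-flipped counterpart.
 */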
static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev,
                     struct ras_err_data *err_data,
                     uint32_t ch_inst,
                     uint32_t umc_inst)
{
    uint64_t mc_umc_status, err_addr, soc_pa, retired_page, column;
    uint32_t channel_index;
    uint32_t eccinfo_table_idx;
    struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

    eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
    channel_index =
        adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

    mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;

    if (mc_umc_status == 0)
        return;

    if (!err_data->err_addr)
        return;

    /* calculate error address if ue/ce error is detected */
    if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
        (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
        REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

        err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr;
        err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

        /* translate umc channel address to soc pa, 3 parts are included */
        soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
                ADDR_OF_256B_BLOCK(channel_index) |
                OFFSET_IN_256B_BLOCK(err_addr);

        /* The umc channel bits are not original values, they are hashed */
        SET_CHANNEL_HASH(channel_index, soc_pa);

        /* clear [C4 C3 C2] in soc physical address */
        soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);

        /* we only save ue error information currently, ce is skipped */
        if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
                == 1) {
            /* loop for all possibilities of [C4 C3 C2] */
            for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) {
                retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT);
                dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
                amdgpu_umc_fill_error_record(err_data, err_addr,
                    retired_page, channel_index, umc_inst);

                /* flip R14 bit */
                retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT);
                dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
                amdgpu_umc_fill_error_record(err_data, err_addr,
                    retired_page, channel_index, umc_inst);
            }
        }
    }
}

static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
                         void *ras_error_status)
{
    struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

    uint32_t umc_inst        = 0;
    uint32_t ch_inst         = 0;

    /* TODO: driver needs to toggle DF Cstate to ensure
     * safe access of UMC registers. Will add the protection
     * when firmware interface is ready */
    LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
        umc_v6_7_ecc_info_query_error_address(adev,
                         err_data,
                         ch_inst,
                         umc_inst);
    }
}

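/* The register path counts CEs by reading the EccErrCnt counter for both
 * chip selects and subtracting UMC_V6_7_CE_CNT_INIT, the value the counter
 * is (re)armed with in umc_v6_7_reset_error_count_per_channel() below.
 */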
static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
                           uint32_t umc_reg_offset,
                           unsigned long *error_count,
                           uint32_t ch_inst,
                           uint32_t umc_inst)
{
    uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
    uint32_t ecc_err_cnt, ecc_err_cnt_addr;
    uint64_t mc_umc_status;
    uint32_t mc_umc_status_addr;

    /* UMC 6_7_0 registers */
    ecc_err_cnt_sel_addr =
        SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCntSel);
    ecc_err_cnt_addr =
        SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCnt);
    mc_umc_status_addr =
        SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

    /* select the lower chip and check the error count */
    ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
    ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
                    EccErrCntCsSel, 0);
    WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

    ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
    *error_count +=
        (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
         UMC_V6_7_CE_CNT_INIT);

    /* select the higher chip and check the error counter */
    ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
                    EccErrCntCsSel, 1);
    WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

    ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
    *error_count +=
        (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
         UMC_V6_7_CE_CNT_INIT);

    /* check for SRAM correctable error;
     * MCUMC_STATUS is a 64 bit register */
    mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
    if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
        REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) {
        *error_count += 1;

        umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);

        {
            uint64_t err_addr, soc_pa;
            uint32_t mc_umc_addrt0;
            uint32_t channel_index;

            mc_umc_addrt0 =
                SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);

            channel_index =
                adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

            err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
            err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

            /* translate umc channel address to soc pa, 3 parts are included */
            soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
                    ADDR_OF_256B_BLOCK(channel_index) |
                    OFFSET_IN_256B_BLOCK(err_addr);

            /* The umc channel bits are not original values, they are hashed */
            SET_CHANNEL_HASH(channel_index, soc_pa);

            dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
        }
    }
}

static void umc_v6_7_querry_uncorrectable_error_count(struct amdgpu_device *adev,
                              uint32_t umc_reg_offset,
                              unsigned long *error_count)
{
    uint64_t mc_umc_status;
    uint32_t mc_umc_status_addr;

    mc_umc_status_addr =
        SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

    /* check the MCUMC_STATUS */
    mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
    if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
        (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
        REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
        REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
        REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
        REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) {
        *error_count += 1;

        umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
    }
}

static void umc_v6_7_reset_error_count_per_channel(struct amdgpu_device *adev,
                           uint32_t umc_reg_offset)
{
    uint32_t ecc_err_cnt_addr;
    uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;

    ecc_err_cnt_sel_addr =
        SOC15_REG_OFFSET(UMC, 0,
                regUMCCH0_0_EccErrCntSel);
    ecc_err_cnt_addr =
        SOC15_REG_OFFSET(UMC, 0,
                regUMCCH0_0_EccErrCnt);

    /* select the lower chip */
    ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
                       umc_reg_offset) * 4);
    ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
                    UMCCH0_0_EccErrCntSel,
                    EccErrCntCsSel, 0);
    WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
            ecc_err_cnt_sel);

    /* clear lower chip error count */
    WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
            UMC_V6_7_CE_CNT_INIT);

    /* select the higher chip */
    ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
                    umc_reg_offset) * 4);
    ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
                    UMCCH0_0_EccErrCntSel,
                    EccErrCntCsSel, 1);
    WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
            ecc_err_cnt_sel);

    /* clear higher chip error count */
    WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
            UMC_V6_7_CE_CNT_INIT);
}

static void umc_v6_7_reset_error_count(struct amdgpu_device *adev)
{
    uint32_t umc_inst        = 0;
    uint32_t ch_inst         = 0;
    uint32_t umc_reg_offset  = 0;

    LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
        umc_reg_offset = get_umc_v6_7_reg_offset(adev,
                             umc_inst,
                             ch_inst);

        umc_v6_7_reset_error_count_per_channel(adev,
                               umc_reg_offset);
    }
}

static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
                       void *ras_error_status)
{
    struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

    uint32_t umc_inst        = 0;
    uint32_t ch_inst         = 0;
    uint32_t umc_reg_offset  = 0;

    /* TODO: driver needs to toggle DF Cstate to ensure
     * safe access of UMC registers. Will add the protection
     * when firmware interface is ready */
    LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
        umc_reg_offset = get_umc_v6_7_reg_offset(adev,
                             umc_inst,
                             ch_inst);
        umc_v6_7_query_correctable_error_count(adev,
                               umc_reg_offset,
                               &(err_data->ce_count),
                               ch_inst, umc_inst);
        umc_v6_7_querry_uncorrectable_error_count(adev,
                              umc_reg_offset,
                              &(err_data->ue_count));
    }

    umc_v6_7_reset_error_count(adev);
}

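/* Same retirement logic as the ecc_info variant above, but reading
 * MCA_UMC_UMC0_MCUMC_STATUST0/ADDRT0 directly; the status register is
 * cleared once it has been harvested.
 */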
static void umc_v6_7_query_error_address(struct amdgpu_device *adev,
                     struct ras_err_data *err_data,
                     uint32_t umc_reg_offset,
                     uint32_t ch_inst,
                     uint32_t umc_inst)
{
    uint32_t mc_umc_status_addr;
    uint32_t channel_index;
    uint64_t mc_umc_status, mc_umc_addrt0;
    uint64_t err_addr, soc_pa, retired_page, column;

    mc_umc_status_addr =
        SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
    mc_umc_addrt0 =
        SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);

    mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

    if (mc_umc_status == 0)
        return;

    if (!err_data->err_addr) {
        /* clear umc status */
        WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
        return;
    }

    channel_index =
        adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

    /* calculate error address if ue/ce error is detected */
    if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
        (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
        REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

        err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
        err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

        /* translate umc channel address to soc pa, 3 parts are included */
        soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
                ADDR_OF_256B_BLOCK(channel_index) |
                OFFSET_IN_256B_BLOCK(err_addr);

        /* The umc channel bits are not original values, they are hashed */
        SET_CHANNEL_HASH(channel_index, soc_pa);

        /* clear [C4 C3 C2] in soc physical address */
        soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);

        /* we only save ue error information currently, ce is skipped */
        if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
                == 1) {
            /* loop for all possibilities of [C4 C3 C2] */
            for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) {
                retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT);
                dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
                amdgpu_umc_fill_error_record(err_data, err_addr,
                    retired_page, channel_index, umc_inst);

                /* flip R14 bit */
                retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT);
                dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
                amdgpu_umc_fill_error_record(err_data, err_addr,
                    retired_page, channel_index, umc_inst);
            }
        }
    }

    /* clear umc status */
    WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}

static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev,
                         void *ras_error_status)
{
    struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

    uint32_t umc_inst        = 0;
    uint32_t ch_inst         = 0;
    uint32_t umc_reg_offset  = 0;

    /* TODO: driver needs to toggle DF Cstate to ensure
     * safe access of UMC registers. Will add the protection
     * when firmware interface is ready */
    LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
        umc_reg_offset = get_umc_v6_7_reg_offset(adev,
                             umc_inst,
                             ch_inst);
        umc_v6_7_query_error_address(adev,
                         err_data,
                         umc_reg_offset,
                         ch_inst,
                         umc_inst);
    }
}

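/* UCFatalEn tells whether uncorrectable errors are escalated to fatal
 * errors; poison mode is the opposite of fatal error mode, hence the
 * negation in umc_v6_7_query_ras_poison_mode() below.
 */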
static uint32_t umc_v6_7_query_ras_poison_mode_per_channel(
                        struct amdgpu_device *adev,
                        uint32_t umc_reg_offset)
{
    uint32_t ecc_ctrl_addr, ecc_ctrl;

    ecc_ctrl_addr =
        SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccCtrl);
    ecc_ctrl = RREG32_PCIE((ecc_ctrl_addr +
                    umc_reg_offset) * 4);

    return REG_GET_FIELD(ecc_ctrl, UMCCH0_0_EccCtrl, UCFatalEn);
}

static bool umc_v6_7_query_ras_poison_mode(struct amdgpu_device *adev)
{
    uint32_t umc_reg_offset  = 0;

    /* Fatal error reporting enabled in umc instance0 channel0 is
     * treated as fatal error mode for the whole device
     */
    umc_reg_offset = get_umc_v6_7_reg_offset(adev, 0, 0);
    return !umc_v6_7_query_ras_poison_mode_per_channel(adev, umc_reg_offset);
}

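/* hw_ops exposes the direct register path; the ecc_info_* callbacks are
 * the alternative path that works from the cached ECC table.
 */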
const struct amdgpu_ras_block_hw_ops umc_v6_7_ras_hw_ops = {
    .query_ras_error_count = umc_v6_7_query_ras_error_count,
    .query_ras_error_address = umc_v6_7_query_ras_error_address,
};

struct amdgpu_umc_ras umc_v6_7_ras = {
    .ras_block = {
        .hw_ops = &umc_v6_7_ras_hw_ops,
    },
    .query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
    .ecc_info_query_ras_error_count = umc_v6_7_ecc_info_query_ras_error_count,
    .ecc_info_query_ras_error_address = umc_v6_7_ecc_info_query_ras_error_address,
};