Back to home page

OSCL-LXR

 
 

    


0001 /*
0002  * Copyright 2022 Advanced Micro Devices, Inc.
0003  *
0004  * Permission is hereby granted, free of charge, to any person obtaining a
0005  * copy of this software and associated documentation files (the "Software"),
0006  * to deal in the Software without restriction, including without limitation
0007  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
0008  * and/or sell copies of the Software, and to permit persons to whom the
0009  * Software is furnished to do so, subject to the following conditions:
0010  *
0011  * The above copyright notice and this permission notice shall be included in
0012  * all copies or substantial portions of the Software.
0013  *
0014  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
0015  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
0016  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
0017  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
0018  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
0019  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
0020  * OTHER DEALINGS IN THE SOFTWARE.
0021  *
0022  */
#include <linux/errno.h>

#include "umc_v8_10.h"
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"
#include "amdgpu.h"
#include "umc/umc_8_10_0_offset.h"
#include "umc/umc_8_10_0_sh_mask.h"
0029 
0030 #define UMC_8_NODE_DIST   0x800000
0031 #define UMC_8_INST_DIST   0x4000
0032 
/*
 * One entry of the channel-count -> column-bit lookup used by
 * umc_v8_10_get_col_bit() when translating normal addresses in
 * swizzle mode.
 */
struct channelnum_map_colbit {
    uint32_t channel_num;   /* total number of UMC channels on the board */
    uint32_t col_bit;       /* column bit position used for that layout */
};
0037 
/*
 * Lookup table mapping the total UMC channel count to the column bit
 * used by the swizzle-mode address translation. Channel counts not
 * listed here have no known mapping (umc_v8_10_get_col_bit() then
 * returns U32_MAX).
 */
const struct channelnum_map_colbit umc_v8_10_channelnum_map_colbit_table[] = {
    {24, 13},
    {20, 13},
    {16, 12},
    {14, 12},
    {12, 12},
    {10, 12},
    {6,  11},
};
0047 
/*
 * Physical channel index table, indexed as
 * [node_inst][umc_inst][ch_inst]. The flattened form of this indexing
 * is applied in umc_v8_10_query_error_address() via
 * adev->umc.channel_idx_tbl to recover the board-level channel index
 * for a given node/umc/channel instance triple.
 */
const uint32_t
    umc_v8_10_channel_idx_tbl[]
                [UMC_V8_10_UMC_INSTANCE_NUM]
                [UMC_V8_10_CHANNEL_INSTANCE_NUM] = {
       {{16, 18}, {17, 19}},
       {{15, 11}, {3,   7}},
       {{1,   5}, {13,  9}},
       {{23, 21}, {22, 20}},
       {{0,   4}, {12,  8}},
       {{14, 10}, {2,   6}}
    };
0059 
0060 static inline uint32_t get_umc_v8_10_reg_offset(struct amdgpu_device *adev,
0061                         uint32_t node_inst,
0062                         uint32_t umc_inst,
0063                         uint32_t ch_inst)
0064 {
0065     return adev->umc.channel_offs * ch_inst + UMC_8_INST_DIST * umc_inst +
0066         UMC_8_NODE_DIST * node_inst;
0067 }
0068 
0069 static void umc_v8_10_clear_error_count_per_channel(struct amdgpu_device *adev,
0070                     uint32_t umc_reg_offset)
0071 {
0072     uint32_t ecc_err_cnt_addr;
0073 
0074     ecc_err_cnt_addr =
0075         SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_GeccErrCnt);
0076 
0077     /* clear error count */
0078     WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
0079             UMC_V8_10_CE_CNT_INIT);
0080 }
0081 
0082 static void umc_v8_10_clear_error_count(struct amdgpu_device *adev)
0083 {
0084     uint32_t node_inst       = 0;
0085     uint32_t umc_inst        = 0;
0086     uint32_t ch_inst         = 0;
0087     uint32_t umc_reg_offset  = 0;
0088 
0089     LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) {
0090         umc_reg_offset = get_umc_v8_10_reg_offset(adev,
0091                         node_inst,
0092                         umc_inst,
0093                         ch_inst);
0094 
0095         umc_v8_10_clear_error_count_per_channel(adev,
0096                         umc_reg_offset);
0097     }
0098 }
0099 
/*
 * umc_v8_10_query_correctable_error_count - accumulate correctable (CE)
 * errors reported by one UMC channel.
 * @adev: amdgpu device handle
 * @umc_reg_offset: channel register offset from get_umc_v8_10_reg_offset()
 * @error_count: running CE total; incremented, never reset here (counters
 *               are cleared separately by umc_v8_10_clear_error_count())
 */
static void umc_v8_10_query_correctable_error_count(struct amdgpu_device *adev,
                           uint32_t umc_reg_offset,
                           unsigned long *error_count)
{
    uint32_t ecc_err_cnt, ecc_err_cnt_addr;
    uint64_t mc_umc_status;
    uint32_t mc_umc_status_addr;

    /* UMC 8_10 registers */
    ecc_err_cnt_addr =
        SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_GeccErrCnt);
    mc_umc_status_addr =
        SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

    /*
     * The counter is primed with UMC_V8_10_CE_CNT_INIT (see
     * umc_v8_10_err_cnt_init_per_channel()), so subtract the initial
     * value to get the number of errors recorded since init/clear.
     */
    ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
    *error_count +=
        (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
         UMC_V8_10_CE_CNT_INIT);

    /* Check for SRAM correctable error, MCUMC_STATUS is a 64 bit register */
    mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
    if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
        REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
        *error_count += 1;
}
0125 
0126 static void umc_v8_10_query_uncorrectable_error_count(struct amdgpu_device *adev,
0127                               uint32_t umc_reg_offset,
0128                               unsigned long *error_count)
0129 {
0130     uint64_t mc_umc_status;
0131     uint32_t mc_umc_status_addr;
0132 
0133     mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
0134 
0135     /* Check the MCUMC_STATUS. */
0136     mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
0137     if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
0138         (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
0139         REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
0140         REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
0141         REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
0142         REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
0143         *error_count += 1;
0144 }
0145 
0146 static void umc_v8_10_query_ras_error_count(struct amdgpu_device *adev,
0147                        void *ras_error_status)
0148 {
0149     struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
0150 
0151     uint32_t node_inst       = 0;
0152     uint32_t umc_inst        = 0;
0153     uint32_t ch_inst         = 0;
0154     uint32_t umc_reg_offset  = 0;
0155 
0156     LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) {
0157         umc_reg_offset = get_umc_v8_10_reg_offset(adev,
0158                         node_inst,
0159                         umc_inst,
0160                         ch_inst);
0161 
0162         umc_v8_10_query_correctable_error_count(adev,
0163                         umc_reg_offset,
0164                         &(err_data->ce_count));
0165         umc_v8_10_query_uncorrectable_error_count(adev,
0166                         umc_reg_offset,
0167                         &(err_data->ue_count));
0168     }
0169 
0170     umc_v8_10_clear_error_count(adev);
0171 }
0172 
0173 static uint32_t umc_v8_10_get_col_bit(uint32_t channel_num)
0174 {
0175     uint32_t t = 0;
0176 
0177     for (t = 0; t < ARRAY_SIZE(umc_v8_10_channelnum_map_colbit_table); t++)
0178         if (channel_num == umc_v8_10_channelnum_map_colbit_table[t].channel_num)
0179             return umc_v8_10_channelnum_map_colbit_table[t].col_bit;
0180 
0181     /* Failed to get col_bit. */
0182     return U32_MAX;
0183 }
0184 
0185 /*
0186  * Mapping normal address to soc physical address in swizzle mode.
0187  */
0188 static int umc_v8_10_swizzle_mode_na_to_pa(struct amdgpu_device *adev,
0189                     uint32_t channel_idx,
0190                     uint64_t na, uint64_t *soc_pa)
0191 {
0192     uint32_t channel_num = UMC_V8_10_TOTAL_CHANNEL_NUM(adev);
0193     uint32_t col_bit = umc_v8_10_get_col_bit(channel_num);
0194     uint64_t tmp_addr;
0195 
0196     if (col_bit == U32_MAX)
0197         return -1;
0198 
0199     tmp_addr = SWIZZLE_MODE_TMP_ADDR(na, channel_num, channel_idx);
0200     *soc_pa = SWIZZLE_MODE_ADDR_HI(tmp_addr, col_bit) |
0201         SWIZZLE_MODE_ADDR_MID(na, col_bit) |
0202         SWIZZLE_MODE_ADDR_LOW(tmp_addr, col_bit) |
0203         SWIZZLE_MODE_ADDR_LSB(na);
0204 
0205     return 0;
0206 }
0207 
/*
 * umc_v8_10_query_error_address - extract and record the faulting
 * address(es) for one UMC channel, then clear its status register.
 * @adev: amdgpu device handle
 * @err_data: RAS error record sink; if err_data->err_addr is NULL the
 *            status is only cleared and nothing is recorded
 * @umc_reg_offset: channel register offset from get_umc_v8_10_reg_offset()
 * @node_inst/@ch_inst/@umc_inst: instance triple used to resolve the
 *            board-level channel index via adev->umc.channel_idx_tbl
 *
 * NOTE: the read/clear ordering of MCUMC_STATUST0 and MCUMC_ADDRT0 is
 * deliberate; the status register is cleared only after the address has
 * been captured.
 */
static void umc_v8_10_query_error_address(struct amdgpu_device *adev,
                     struct ras_err_data *err_data,
                     uint32_t umc_reg_offset,
                     uint32_t node_inst,
                     uint32_t ch_inst,
                     uint32_t umc_inst)
{
    uint64_t mc_umc_status_addr;
    uint64_t mc_umc_status, err_addr;
    uint32_t channel_index;

    mc_umc_status_addr =
        SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
    mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

    /* All-zero status means no error was logged on this channel. */
    if (mc_umc_status == 0)
        return;

    if (!err_data->err_addr) {
        /* clear umc status */
        WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
        return;
    }

    /* Flatten [node][umc][ch] into the channel index table lookup. */
    channel_index =
        adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num *
                    adev->umc.channel_inst_num +
                    umc_inst * adev->umc.channel_inst_num +
                    ch_inst];

    /* calculate error address if ue/ce error is detected */
    if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
        REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 1 &&
        (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
         REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {
        uint32_t addr_lsb;
        uint64_t mc_umc_addrt0;

        mc_umc_addrt0 = SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
        err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
        err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

        /* the lowest lsb bits should be ignored */
        addr_lsb = REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrLsb);

        err_addr &= ~((0x1ULL << addr_lsb) - 1);

        /* we only save ue error information currently, ce is skipped */
        if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) {
            /* Base address with the [C6 C5] column bits masked off. */
            uint64_t na_err_addr_base = err_addr & ~(0x3ULL << UMC_V8_10_NA_C5_BIT);
            uint64_t na_err_addr, retired_page_addr;
            uint32_t col = 0;
            int ret = 0;

            /* loop for all possibilities of [C6 C5] in normal address. */
            for (col = 0; col < UMC_V8_10_NA_COL_2BITS_POWER_OF_2_NUM; col++) {
                na_err_addr = na_err_addr_base | (col << UMC_V8_10_NA_C5_BIT);

                /* Mapping normal error address to retired soc physical address. */
                ret = umc_v8_10_swizzle_mode_na_to_pa(adev, channel_index,
                                na_err_addr, &retired_page_addr);
                if (ret) {
                    dev_err(adev->dev, "Failed to map pa from umc na.\n");
                    break;
                }
                dev_info(adev->dev, "Error Address(PA): 0x%llx\n",
                    retired_page_addr);
                amdgpu_umc_fill_error_record(err_data, na_err_addr,
                        retired_page_addr, channel_index, umc_inst);
            }
        }
    }

    /* clear umc status */
    WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}
0284 
0285 static void umc_v8_10_query_ras_error_address(struct amdgpu_device *adev,
0286                          void *ras_error_status)
0287 {
0288     struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
0289     uint32_t node_inst       = 0;
0290     uint32_t umc_inst        = 0;
0291     uint32_t ch_inst         = 0;
0292     uint32_t umc_reg_offset  = 0;
0293 
0294     LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) {
0295         umc_reg_offset = get_umc_v8_10_reg_offset(adev,
0296                         node_inst,
0297                         umc_inst,
0298                         ch_inst);
0299 
0300         umc_v8_10_query_error_address(adev,
0301                     err_data,
0302                     umc_reg_offset,
0303                     node_inst,
0304                     ch_inst,
0305                     umc_inst);
0306     }
0307 }
0308 
0309 static void umc_v8_10_err_cnt_init_per_channel(struct amdgpu_device *adev,
0310                           uint32_t umc_reg_offset)
0311 {
0312     uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
0313     uint32_t ecc_err_cnt_addr;
0314 
0315     ecc_err_cnt_sel_addr =
0316         SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_GeccErrCntSel);
0317     ecc_err_cnt_addr =
0318         SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_GeccErrCnt);
0319 
0320     ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
0321 
0322     /* set ce error interrupt type to APIC based interrupt */
0323     ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
0324                     GeccErrInt, 0x1);
0325     WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
0326     /* set error count to initial value */
0327     WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_10_CE_CNT_INIT);
0328 }
0329 
0330 static void umc_v8_10_err_cnt_init(struct amdgpu_device *adev)
0331 {
0332     uint32_t node_inst       = 0;
0333     uint32_t umc_inst        = 0;
0334     uint32_t ch_inst         = 0;
0335     uint32_t umc_reg_offset  = 0;
0336 
0337     LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) {
0338         umc_reg_offset = get_umc_v8_10_reg_offset(adev,
0339                         node_inst,
0340                         umc_inst,
0341                         ch_inst);
0342 
0343         umc_v8_10_err_cnt_init_per_channel(adev, umc_reg_offset);
0344     }
0345 }
0346 
/* Hardware query entry points exported to the common amdgpu RAS core. */
const struct amdgpu_ras_block_hw_ops umc_v8_10_ras_hw_ops = {
    .query_ras_error_count = umc_v8_10_query_ras_error_count,
    .query_ras_error_address = umc_v8_10_query_ras_error_address,
};
0351 
/* UMC v8.10 RAS block descriptor registered with the amdgpu RAS framework. */
struct amdgpu_umc_ras umc_v8_10_ras = {
    .ras_block = {
        .hw_ops = &umc_v8_10_ras_hw_ops,
    },
    .err_cnt_init = umc_v8_10_err_cnt_init,
};