Back to home page

OSCL-LXR

 
 

    


0001 /*
0002  * Copyright 2020 Advanced Micro Devices, Inc.
0003  *
0004  * Permission is hereby granted, free of charge, to any person obtaining a
0005  * copy of this software and associated documentation files (the "Software"),
0006  * to deal in the Software without restriction, including without limitation
0007  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
0008  * and/or sell copies of the Software, and to permit persons to whom the
0009  * Software is furnished to do so, subject to the following conditions:
0010  *
0011  * The above copyright notice and this permission notice shall be included in
0012  * all copies or substantial portions of the Software.
0013  *
0014  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
0015  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
0016  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
0017  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
0018  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
0019  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
0020  * OTHER DEALINGS IN THE SOFTWARE.
0021  *
0022  */
0023 #include "amdgpu.h"
0024 #include "sdma/sdma_4_4_0_offset.h"
0025 #include "sdma/sdma_4_4_0_sh_mask.h"
0026 #include "soc15.h"
0027 #include "amdgpu_ras.h"
0028 
/*
 * Distance of each additional SDMA instance's register block from the
 * SDMA0 register block. SDMA0 itself needs no offset.
 */
#define SDMA1_REG_OFFSET    0x600
#define SDMA2_REG_OFFSET    0x1cda0
#define SDMA3_REG_OFFSET    0x1d1a0
#define SDMA4_REG_OFFSET    0x1d5a0
0033 
0034 /* helper function that allow only use sdma0 register offset
0035  * to calculate register offset for all the sdma instances */
0036 static uint32_t sdma_v4_4_get_reg_offset(struct amdgpu_device *adev,
0037                      uint32_t instance,
0038                      uint32_t offset)
0039 {
0040     uint32_t sdma_base = adev->reg_offset[SDMA0_HWIP][0][0];
0041 
0042     switch (instance) {
0043     case 0:
0044         return (sdma_base + offset);
0045     case 1:
0046         return (sdma_base + SDMA1_REG_OFFSET + offset);
0047     case 2:
0048         return (sdma_base + SDMA2_REG_OFFSET + offset);
0049     case 3:
0050         return (sdma_base + SDMA3_REG_OFFSET + offset);
0051     case 4:
0052         return (sdma_base + SDMA4_REG_OFFSET + offset);
0053     default:
0054         break;
0055     }
0056     return 0;
0057 }
0058 
0059 static const struct soc15_ras_field_entry sdma_v4_4_ras_fields[] = {
0060     { "SDMA_MBANK_DATA_BUF0_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
0061     SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF0_SED),
0062     0, 0,
0063     },
0064     { "SDMA_MBANK_DATA_BUF1_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
0065     SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF1_SED),
0066     0, 0,
0067     },
0068     { "SDMA_MBANK_DATA_BUF2_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
0069     SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF2_SED),
0070     0, 0,
0071     },
0072     { "SDMA_MBANK_DATA_BUF3_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
0073     SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF3_SED),
0074     0, 0,
0075     },
0076     { "SDMA_MBANK_DATA_BUF4_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
0077     SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF4_SED),
0078     0, 0,
0079     },
0080     { "SDMA_MBANK_DATA_BUF5_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
0081     SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF5_SED),
0082     0, 0,
0083     },
0084     { "SDMA_MBANK_DATA_BUF6_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
0085     SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF6_SED),
0086     0, 0,
0087     },
0088     { "SDMA_MBANK_DATA_BUF7_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
0089     SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF7_SED),
0090     0, 0,
0091     },
0092     { "SDMA_MBANK_DATA_BUF8_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
0093     SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF8_SED),
0094     0, 0,
0095     },
0096     { "SDMA_MBANK_DATA_BUF9_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
0097     SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF9_SED),
0098     0, 0,
0099     },
0100     { "SDMA_MBANK_DATA_BUF10_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
0101     SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF10_SED),
0102     0, 0,
0103     },
0104     { "SDMA_MBANK_DATA_BUF11_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
0105     SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF11_SED),
0106     0, 0,
0107     },
0108     { "SDMA_MBANK_DATA_BUF12_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
0109     SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF12_SED),
0110     0, 0,
0111     },
0112     { "SDMA_MBANK_DATA_BUF13_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
0113     SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF13_SED),
0114     0, 0,
0115     },
0116     { "SDMA_MBANK_DATA_BUF14_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
0117     SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF14_SED),
0118     0, 0,
0119     },
0120     { "SDMA_MBANK_DATA_BUF15_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
0121     SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF15_SED),
0122     0, 0,
0123     },
0124     { "SDMA_UCODE_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
0125     SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_UCODE_BUF_SED),
0126     0, 0,
0127     },
0128     { "SDMA_RB_CMD_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
0129     SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_RB_CMD_BUF_SED),
0130     0, 0,
0131     },
0132     { "SDMA_IB_CMD_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
0133     SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_IB_CMD_BUF_SED),
0134     0, 0,
0135     },
0136     { "SDMA_UTCL1_RD_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
0137     SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_UTCL1_RD_FIFO_SED),
0138     0, 0,
0139     },
0140     { "SDMA_UTCL1_RDBST_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
0141     SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_UTCL1_RDBST_FIFO_SED),
0142     0, 0,
0143     },
0144     { "SDMA_DATA_LUT_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
0145     SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_DATA_LUT_FIFO_SED),
0146     0, 0,
0147     },
0148     { "SDMA_SPLIT_DATA_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
0149     SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_SPLIT_DATA_BUF_SED),
0150     0, 0,
0151     },
0152     { "SDMA_MC_WR_ADDR_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
0153     SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_MC_WR_ADDR_FIFO_SED),
0154     0, 0,
0155     },
0156     { "SDMA_MC_RDRET_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
0157     SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_MC_WR_ADDR_FIFO_SED),
0158     0, 0,
0159     },
0160 };
0161 
0162 static void sdma_v4_4_get_ras_error_count(struct amdgpu_device *adev,
0163                       uint32_t reg_offset,
0164                       uint32_t value,
0165                       uint32_t instance,
0166                       uint32_t *sec_count)
0167 {
0168     uint32_t i;
0169     uint32_t sec_cnt;
0170 
0171     /* double bits error (multiple bits) error detection is not supported */
0172     for (i = 0; i < ARRAY_SIZE(sdma_v4_4_ras_fields); i++) {
0173         if (sdma_v4_4_ras_fields[i].reg_offset != reg_offset)
0174             continue;
0175 
0176         /* the SDMA_EDC_COUNTER register in each sdma instance
0177          * shares the same sed shift_mask
0178          * */
0179         sec_cnt = (value &
0180             sdma_v4_4_ras_fields[i].sec_count_mask) >>
0181             sdma_v4_4_ras_fields[i].sec_count_shift;
0182         if (sec_cnt) {
0183             dev_info(adev->dev, "Detected %s in SDMA%d, SED %d\n",
0184                  sdma_v4_4_ras_fields[i].name,
0185                  instance, sec_cnt);
0186             *sec_count += sec_cnt;
0187         }
0188     }
0189 }
0190 
0191 static int sdma_v4_4_query_ras_error_count_by_instance(struct amdgpu_device *adev,
0192                        uint32_t instance,
0193                        void *ras_error_status)
0194 {
0195     struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
0196     uint32_t sec_count = 0;
0197     uint32_t reg_value = 0;
0198     uint32_t reg_offset = 0;
0199 
0200     reg_offset = sdma_v4_4_get_reg_offset(adev, instance, regSDMA0_EDC_COUNTER);
0201     reg_value = RREG32(reg_offset);
0202     /* double bit error is not supported */
0203     if (reg_value)
0204         sdma_v4_4_get_ras_error_count(adev, regSDMA0_EDC_COUNTER, reg_value,
0205                           instance, &sec_count);
0206 
0207     reg_offset = sdma_v4_4_get_reg_offset(adev, instance, regSDMA0_EDC_COUNTER2);
0208     reg_value = RREG32(reg_offset);
0209     /* double bit error is not supported */
0210     if (reg_value)
0211         sdma_v4_4_get_ras_error_count(adev, regSDMA0_EDC_COUNTER2, reg_value,
0212                           instance, &sec_count);
0213 
0214     /*
0215      * err_data->ue_count should be initialized to 0
0216      * before calling into this function
0217      *
0218      * SDMA RAS supports single bit uncorrectable error detection.
0219      * So, increment uncorrectable error count.
0220      */
0221     err_data->ue_count += sec_count;
0222 
0223     /*
0224      * SDMA RAS does not support correctable errors.
0225      * Set ce count to 0.
0226      */
0227     err_data->ce_count = 0;
0228 
0229     return 0;
0230 };
0231 
0232 static void sdma_v4_4_reset_ras_error_count(struct amdgpu_device *adev)
0233 {
0234     int i;
0235     uint32_t reg_offset;
0236 
0237     /* write 0 to EDC_COUNTER reg to clear sdma edc counters */
0238     if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
0239         for (i = 0; i < adev->sdma.num_instances; i++) {
0240             reg_offset = sdma_v4_4_get_reg_offset(adev, i, regSDMA0_EDC_COUNTER);
0241             WREG32(reg_offset, 0);
0242             reg_offset = sdma_v4_4_get_reg_offset(adev, i, regSDMA0_EDC_COUNTER2);
0243             WREG32(reg_offset, 0);
0244         }
0245     }
0246 }
0247 
0248 static void sdma_v4_4_query_ras_error_count(struct amdgpu_device *adev,  void *ras_error_status)
0249 {
0250     int i = 0;
0251 
0252     for (i = 0; i < adev->sdma.num_instances; i++) {
0253         if (sdma_v4_4_query_ras_error_count_by_instance(adev, i, ras_error_status)) {
0254             dev_err(adev->dev, "Query ras error count failed in SDMA%d\n", i);
0255             return;
0256         }
0257     }
0258 
0259 }
0260 
0261 const struct amdgpu_ras_block_hw_ops sdma_v4_4_ras_hw_ops = {
0262     .query_ras_error_count = sdma_v4_4_query_ras_error_count,
0263     .reset_ras_error_count = sdma_v4_4_reset_ras_error_count,
0264 };
0265 
0266 struct amdgpu_sdma_ras sdma_v4_4_ras = {
0267     .ras_block = {
0268         .hw_ops = &sdma_v4_4_ras_hw_ops,
0269     },
0270 };