0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024 #include "amdgpu_ras_eeprom.h"
0025 #include "amdgpu.h"
0026 #include "amdgpu_ras.h"
0027 #include <linux/bits.h>
0028 #include "atom.h"
0029 #include "amdgpu_eeprom.h"
0030 #include "amdgpu_atomfirmware.h"
0031 #include <linux/debugfs.h>
0032 #include <linux/uaccess.h>
0033
0034 #include "amdgpu_reset.h"
0035
/* EEPROM memory addresses (device byte offsets) at which the RAS table
 * lives; selected per ASIC in __get_eeprom_i2c_addr().
 */
#define EEPROM_I2C_MADDR_VEGA20 0x0
#define EEPROM_I2C_MADDR_ARCTURUS 0x40000
#define EEPROM_I2C_MADDR_ARCTURUS_D342 0x0
#define EEPROM_I2C_MADDR_SIENNA_CICHLID 0x0
#define EEPROM_I2C_MADDR_ALDEBARAN 0x0

/* Fixed on-EEPROM sizes, in bytes, of the table header and of a single
 * bad-page record.
 */
#define RAS_TABLE_HEADER_SIZE 20
#define RAS_TABLE_RECORD_SIZE 24

/* Valid table signature ("AMDR" in ASCII) and table layout version. */
#define RAS_TABLE_HDR_VAL 0x414d4452
#define RAS_TABLE_VER 0x00010000

/* Signature ("BADG" in ASCII) written in place of RAS_TABLE_HDR_VAL
 * once the stored bad-page count reaches the configured threshold.
 */
#define RAS_TABLE_HDR_BAD 0x42414447

/* Total EEPROM space reserved for the RAS table and the derived
 * layout: header at offset 0, record area immediately after it.
 */
#define RAS_TBL_SIZE_BYTES (256 * 1024)
#define RAS_TABLE_START 0
#define RAS_HDR_START RAS_TABLE_START
#define RAS_RECORD_START (RAS_HDR_START + RAS_TABLE_HEADER_SIZE)
#define RAS_MAX_RECORD_COUNT ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
			      / RAS_TABLE_RECORD_SIZE)

/* Convert a record index (_N) to a byte offset in the EEPROM, and
 * back.  The record area is used as a ring buffer, see
 * amdgpu_ras_eeprom_append_table().
 */
#define RAS_INDEX_TO_OFFSET(_C, _N) ((_C)->ras_record_offset + \
				     (_N) * RAS_TABLE_RECORD_SIZE)

#define RAS_OFFSET_TO_INDEX(_C, _O) (((_O) - \
				      (_C)->ras_record_offset) / RAS_TABLE_RECORD_SIZE)

/* Map a relative record index (_I) (0 == oldest stored record) to the
 * absolute ring-buffer index, counting from the first-record index
 * ras_fri.
 */
#define RAS_RI_TO_AI(_C, _I) (((_I) + (_C)->ras_fri) % \
			      (_C)->ras_max_record_count)

/* Number of records currently stored, as encoded in the header's
 * tbl_size field (header size plus record bytes).
 */
#define RAS_NUM_RECS(_tbl_hdr) (((_tbl_hdr)->tbl_size - \
				 RAS_TABLE_HEADER_SIZE) / RAS_TABLE_RECORD_SIZE)

/* The eeprom_control structure is embedded in struct amdgpu_ras;
 * recover the owning amdgpu_device from it.
 */
#define to_amdgpu_device(x) (container_of(x, struct amdgpu_ras, eeprom_control))->adev
0089
0090 static bool __is_ras_eeprom_supported(struct amdgpu_device *adev)
0091 {
0092 return adev->asic_type == CHIP_VEGA20 ||
0093 adev->asic_type == CHIP_ARCTURUS ||
0094 adev->asic_type == CHIP_SIENNA_CICHLID ||
0095 adev->asic_type == CHIP_ALDEBARAN;
0096 }
0097
0098 static bool __get_eeprom_i2c_addr_arct(struct amdgpu_device *adev,
0099 struct amdgpu_ras_eeprom_control *control)
0100 {
0101 struct atom_context *atom_ctx = adev->mode_info.atom_context;
0102
0103 if (!control || !atom_ctx)
0104 return false;
0105
0106 if (strnstr(atom_ctx->vbios_version,
0107 "D342",
0108 sizeof(atom_ctx->vbios_version)))
0109 control->i2c_address = EEPROM_I2C_MADDR_ARCTURUS_D342;
0110 else
0111 control->i2c_address = EEPROM_I2C_MADDR_ARCTURUS;
0112
0113 return true;
0114 }
0115
/* Determine the EEPROM address at which the RAS table lives.  Prefer
 * the address reported by ATOM firmware; fall back to per-ASIC
 * constants.  Returns true if control->i2c_address was set, false
 * otherwise.
 */
static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev,
				  struct amdgpu_ras_eeprom_control *control)
{
	u8 i2c_addr;

	if (!control)
		return false;

	if (amdgpu_atomfirmware_ras_rom_addr(adev, &i2c_addr)) {
		/* Normalize the firmware-provided byte: drop the high
		 * nibble and the least-significant bit, then shift the
		 * remaining 3 bits into bits 18:16 of the 19-bit
		 * EEPROM address.
		 * NOTE(review): presumably the dropped bits are the
		 * i2c device-type identifier and R/W bit -- confirm
		 * against the atomfirmware/amdgpu_eeprom address
		 * format documentation.
		 */
		i2c_addr = (i2c_addr & 0x0F) >> 1;
		control->i2c_address = ((u32) i2c_addr) << 16;

		return true;
	}

	switch (adev->asic_type) {
	case CHIP_VEGA20:
		control->i2c_address = EEPROM_I2C_MADDR_VEGA20;
		break;

	case CHIP_ARCTURUS:
		/* Arcturus needs a VBIOS-version check, see helper. */
		return __get_eeprom_i2c_addr_arct(adev, control);

	case CHIP_SIENNA_CICHLID:
		control->i2c_address = EEPROM_I2C_MADDR_SIENNA_CICHLID;
		break;

	case CHIP_ALDEBARAN:
		control->i2c_address = EEPROM_I2C_MADDR_ALDEBARAN;
		break;

	default:
		/* Unsupported ASIC: no known EEPROM address. */
		return false;
	}

	return true;
}
0161
0162 static void
0163 __encode_table_header_to_buf(struct amdgpu_ras_eeprom_table_header *hdr,
0164 unsigned char *buf)
0165 {
0166 u32 *pp = (uint32_t *)buf;
0167
0168 pp[0] = cpu_to_le32(hdr->header);
0169 pp[1] = cpu_to_le32(hdr->version);
0170 pp[2] = cpu_to_le32(hdr->first_rec_offset);
0171 pp[3] = cpu_to_le32(hdr->tbl_size);
0172 pp[4] = cpu_to_le32(hdr->checksum);
0173 }
0174
0175 static void
0176 __decode_table_header_from_buf(struct amdgpu_ras_eeprom_table_header *hdr,
0177 unsigned char *buf)
0178 {
0179 u32 *pp = (uint32_t *)buf;
0180
0181 hdr->header = le32_to_cpu(pp[0]);
0182 hdr->version = le32_to_cpu(pp[1]);
0183 hdr->first_rec_offset = le32_to_cpu(pp[2]);
0184 hdr->tbl_size = le32_to_cpu(pp[3]);
0185 hdr->checksum = le32_to_cpu(pp[4]);
0186 }
0187
/* Encode and write the cached table header to the EEPROM.
 * Returns 0 on success, -errno on failure.
 */
static int __write_table_header(struct amdgpu_ras_eeprom_control *control)
{
	u8 buf[RAS_TABLE_HEADER_SIZE];
	struct amdgpu_device *adev = to_amdgpu_device(control);
	int res;

	memset(buf, 0, sizeof(buf));
	__encode_table_header_to_buf(&control->tbl_hdr, buf);

	/* NOTE(review): reset_domain->sem is held for read around the
	 * i2c transfer -- presumably to keep the bus access from
	 * racing with a GPU reset; confirm with reset-domain locking
	 * rules.
	 */
	down_read(&adev->reset_domain->sem);
	res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus,
				  control->i2c_address +
				  control->ras_header_offset,
				  buf, RAS_TABLE_HEADER_SIZE);
	up_read(&adev->reset_domain->sem);

	if (res < 0) {
		DRM_ERROR("Failed to write EEPROM table header:%d", res);
	} else if (res < RAS_TABLE_HEADER_SIZE) {
		/* amdgpu_eeprom_write() returned a byte count shorter
		 * than requested -- treat a partial write as I/O error.
		 */
		DRM_ERROR("Short write:%d out of %d\n",
			  res, RAS_TABLE_HEADER_SIZE);
		res = -EIO;
	} else {
		res = 0;
	}

	return res;
}
0217
0218 static u8 __calc_hdr_byte_sum(const struct amdgpu_ras_eeprom_control *control)
0219 {
0220 int ii;
0221 u8 *pp, csum;
0222 size_t sz;
0223
0224
0225 sz = sizeof(control->tbl_hdr) - sizeof(control->tbl_hdr.checksum);
0226 pp = (u8 *) &control->tbl_hdr;
0227 csum = 0;
0228 for (ii = 0; ii < sz; ii++, pp++)
0229 csum += *pp;
0230
0231 return csum;
0232 }
0233
/* Replace the table signature with @header and write the header back,
 * adjusting the stored checksum incrementally so that all table bytes
 * still sum to zero -- without re-reading the records from EEPROM.
 * Used to flip between RAS_TABLE_HDR_VAL and RAS_TABLE_HDR_BAD.
 * Returns 0 on success, -errno on failure.
 */
static int amdgpu_ras_eeprom_correct_header_tag(
	struct amdgpu_ras_eeprom_control *control,
	uint32_t header)
{
	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
	u8 *hh;
	int res;
	u8 csum;

	/* The table sums to zero, so the sum of everything except the
	 * checksum byte is -checksum.
	 */
	csum = -hdr->checksum;

	/* Remove the bytes of the old signature from the sum... */
	hh = (void *) &hdr->header;
	csum -= (hh[0] + hh[1] + hh[2] + hh[3]);
	/* ...and add the bytes of the new signature. */
	hh = (void *) &header;
	csum += hh[0] + hh[1] + hh[2] + hh[3];
	/* The new checksum is the negative of the new partial sum. */
	csum = -csum;
	mutex_lock(&control->ras_tbl_mutex);
	hdr->header = header;
	hdr->checksum = csum;
	res = __write_table_header(control);
	mutex_unlock(&control->ras_tbl_mutex);

	return res;
}
0258
0259
0260
0261
0262
0263
0264
0265
/**
 * amdgpu_ras_eeprom_reset_table -- reset the RAS EEPROM table
 * @control: pointer to control structure
 *
 * Reset the header of the RAS EEPROM table: mark it valid and empty
 * (no records), recompute the checksum, and write the header to the
 * EEPROM.  Also clears the cached record count, first-record index and
 * bad-channel bitmap, reports the cleared counts via the DPM
 * bad-pages/bad-channel hooks, and refreshes the debugfs table size.
 * Returns 0 on success, -errno on failure.
 */
int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	u8 csum;
	int res;

	mutex_lock(&control->ras_tbl_mutex);

	hdr->header = RAS_TABLE_HDR_VAL;
	hdr->version = RAS_TABLE_VER;
	hdr->first_rec_offset = RAS_RECORD_START;
	hdr->tbl_size = RAS_TABLE_HEADER_SIZE;	/* header only, no records */

	/* Choose the checksum so that all header bytes sum to zero. */
	csum = __calc_hdr_byte_sum(control);
	csum = -csum;
	hdr->checksum = csum;
	res = __write_table_header(control);

	control->ras_num_recs = 0;
	control->ras_fri = 0;

	amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);

	control->bad_channel_bitmap = 0;
	amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
	con->update_channel_flag = false;

	amdgpu_ras_debugfs_set_ret_size(control);

	mutex_unlock(&control->ras_tbl_mutex);

	return res;
}
0301
0302 static void
0303 __encode_table_record_to_buf(struct amdgpu_ras_eeprom_control *control,
0304 struct eeprom_table_record *record,
0305 unsigned char *buf)
0306 {
0307 __le64 tmp = 0;
0308 int i = 0;
0309
0310
0311 buf[i++] = record->err_type;
0312
0313 buf[i++] = record->bank;
0314
0315 tmp = cpu_to_le64(record->ts);
0316 memcpy(buf + i, &tmp, 8);
0317 i += 8;
0318
0319 tmp = cpu_to_le64((record->offset & 0xffffffffffff));
0320 memcpy(buf + i, &tmp, 6);
0321 i += 6;
0322
0323 buf[i++] = record->mem_channel;
0324 buf[i++] = record->mcumc_id;
0325
0326 tmp = cpu_to_le64((record->retired_page & 0xffffffffffff));
0327 memcpy(buf + i, &tmp, 6);
0328 }
0329
0330 static void
0331 __decode_table_record_from_buf(struct amdgpu_ras_eeprom_control *control,
0332 struct eeprom_table_record *record,
0333 unsigned char *buf)
0334 {
0335 __le64 tmp = 0;
0336 int i = 0;
0337
0338
0339 record->err_type = buf[i++];
0340
0341 record->bank = buf[i++];
0342
0343 memcpy(&tmp, buf + i, 8);
0344 record->ts = le64_to_cpu(tmp);
0345 i += 8;
0346
0347 memcpy(&tmp, buf + i, 6);
0348 record->offset = (le64_to_cpu(tmp) & 0xffffffffffff);
0349 i += 6;
0350
0351 record->mem_channel = buf[i++];
0352 record->mcumc_id = buf[i++];
0353
0354 memcpy(&tmp, buf + i, 6);
0355 record->retired_page = (le64_to_cpu(tmp) & 0xffffffffffff);
0356 }
0357
0358 bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
0359 {
0360 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
0361
0362 if (!__is_ras_eeprom_supported(adev))
0363 return false;
0364
0365
0366 if (!con)
0367 return false;
0368 else
0369 if (!(con->features & BIT(AMDGPU_RAS_BLOCK__UMC)))
0370 return false;
0371
0372 if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
0373 dev_warn(adev->dev, "This GPU is in BAD status.");
0374 dev_warn(adev->dev, "Please retire it or set a larger "
0375 "threshold value when reloading driver.\n");
0376 return true;
0377 }
0378
0379 return false;
0380 }
0381
0382
0383
0384
0385
0386
0387
0388
0389
0390
0391
/* Write a contiguous run of @num encoded records from @buf to the
 * EEPROM, starting at ring index @fri.  Callers split wrap-around
 * writes into two calls; this helper never wraps.
 * Returns 0 on success, -errno on failure.
 */
static int __amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
				     u8 *buf, const u32 fri, const u32 num)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	u32 buf_size;
	int res;

	/* Hold the reset-domain semaphore for read across the i2c
	 * transfer (see __write_table_header()).
	 */
	down_read(&adev->reset_domain->sem);
	buf_size = num * RAS_TABLE_RECORD_SIZE;
	res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus,
				  control->i2c_address +
				  RAS_INDEX_TO_OFFSET(control, fri),
				  buf, buf_size);
	up_read(&adev->reset_domain->sem);
	if (res < 0) {
		DRM_ERROR("Writing %d EEPROM table records error:%d",
			  num, res);
	} else if (res < buf_size) {
		/* Partial write: fewer bytes transferred than asked
		 * for -- report and fail with -EIO.
		 */
		DRM_ERROR("Wrote %d records out of %d",
			  res / RAS_TABLE_RECORD_SIZE, num);
		res = -EIO;
	} else {
		res = 0;
	}

	return res;
}
0422
/* Append @num records to the record area, treating it as a ring buffer
 * of control->ras_max_record_count slots.  Updates the cached
 * first-record index (ras_fri), record count (ras_num_recs) and
 * bad-channel bitmap; does NOT update the on-EEPROM header (the caller
 * follows up with amdgpu_ras_eeprom_update_header()).
 * Called with control->ras_tbl_mutex held, see
 * amdgpu_ras_eeprom_append().
 * Returns 0 on success, -errno on failure.
 */
static int
amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
			       struct eeprom_table_record *record,
			       const u32 num)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
	u32 a, b, i;
	u8 *buf, *pp;
	int res;

	buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/* Encode all records into one staging buffer, and note any
	 * memory channel not yet present in the bad-channel bitmap.
	 */
	pp = buf;
	for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
		__encode_table_record_to_buf(control, &record[i], pp);

		if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
			control->bad_channel_bitmap |= 1 << record[i].mem_channel;
			con->update_channel_flag = true;
		}
	}

	/* a -- ring slot of the first record to write (one past the
	 *      current end of data);
	 * b -- slot of the last record to write.
	 * Both are computed before reduction modulo the table size
	 * N = ras_max_record_count, giving three cases:
	 * 1) b <  N: the run fits without wrapping;
	 * 2) a <  N <= b: the run wraps once -- write the tail
	 *    [a, N-1] then the head [0, b mod N]; if the head
	 *    overwrote the oldest records, advance ras_fri;
	 * 3) N <= a: the table was already full, so reduce a and b
	 *    modulo N first; the reduced run may itself be contiguous
	 *    (a <= b) or wrap once more (a > b).  In every sub-case
	 *    ras_fri is advanced past the slots that were overwritten.
	 */
	a = control->ras_fri + control->ras_num_recs;
	b = a + num - 1;
	if (b < control->ras_max_record_count) {
		res = __amdgpu_ras_eeprom_write(control, buf, a, num);
	} else if (a < control->ras_max_record_count) {
		u32 g0, g1;

		/* g0 records fit before the end; g1 wrap to the start. */
		g0 = control->ras_max_record_count - a;
		g1 = b % control->ras_max_record_count + 1;
		res = __amdgpu_ras_eeprom_write(control, buf, a, g0);
		if (res)
			goto Out;
		res = __amdgpu_ras_eeprom_write(control,
						buf + g0 * RAS_TABLE_RECORD_SIZE,
						0, g1);
		if (res)
			goto Out;
		if (g1 > control->ras_fri)
			control->ras_fri = g1 % control->ras_max_record_count;
	} else {
		a %= control->ras_max_record_count;
		b %= control->ras_max_record_count;

		if (a <= b) {
			/* Contiguous run after reduction. */
			res = __amdgpu_ras_eeprom_write(control, buf, a, num);
			if (res)
				goto Out;
			if (b >= control->ras_fri)
				control->ras_fri = (b + 1) % control->ras_max_record_count;
		} else {
			u32 g0, g1;

			/* The reduced run wraps: tail [a, N-1] first,
			 * then head [0, b].
			 */
			g0 = control->ras_max_record_count - a;
			g1 = b + 1;
			res = __amdgpu_ras_eeprom_write(control, buf, a, g0);
			if (res)
				goto Out;
			res = __amdgpu_ras_eeprom_write(control,
							buf + g0 * RAS_TABLE_RECORD_SIZE,
							0, g1);
			if (res)
				goto Out;
			control->ras_fri = g1 % control->ras_max_record_count;
		}
	}
	/* Number of valid records is the ring distance from ras_fri to
	 * b, inclusive.
	 */
	control->ras_num_recs = 1 + (control->ras_max_record_count + b
				     - control->ras_fri)
				% control->ras_max_record_count;
Out:
	kfree(buf);
	return res;
}
0534
/* Rebuild and write the on-EEPROM table header after records have
 * been appended: refresh first_rec_offset/tbl_size, switch the
 * signature to RAS_TABLE_HDR_BAD if the bad-page threshold has been
 * reached, and recompute the checksum over the header plus all
 * records, which are read back from the EEPROM for that purpose.
 * Called with control->ras_tbl_mutex held, see
 * amdgpu_ras_eeprom_append().
 * Returns 0 on success, -errno on failure.
 */
static int
amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	u8 *buf, *pp, csum;
	u32 buf_size;
	int res;

	/* Mark the table BAD once the record count reaches the
	 * configured threshold (threshold 0 disables the check).
	 */
	if (amdgpu_bad_page_threshold != 0 &&
	    control->ras_num_recs >= ras->bad_page_cnt_threshold) {
		dev_warn(adev->dev,
			 "Saved bad pages %d reaches threshold value %d\n",
			 control->ras_num_recs, ras->bad_page_cnt_threshold);
		control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
	}

	control->tbl_hdr.version = RAS_TABLE_VER;
	control->tbl_hdr.first_rec_offset = RAS_INDEX_TO_OFFSET(control, control->ras_fri);
	control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
	/* Checksum must be zero while summing, see below. */
	control->tbl_hdr.checksum = 0;

	buf_size = control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
	buf = kcalloc(control->ras_num_recs, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
	if (!buf) {
		DRM_ERROR("allocating memory for table of size %d bytes failed\n",
			  control->tbl_hdr.tbl_size);
		res = -ENOMEM;
		goto Out;
	}

	/* Read back the whole record area so the checksum covers what
	 * is actually stored on the device.
	 */
	down_read(&adev->reset_domain->sem);
	res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
				 control->i2c_address +
				 control->ras_record_offset,
				 buf, buf_size);
	up_read(&adev->reset_domain->sem);
	if (res < 0) {
		DRM_ERROR("EEPROM failed reading records:%d\n",
			  res);
		goto Out;
	} else if (res < buf_size) {
		DRM_ERROR("EEPROM read %d out of %d bytes\n",
			  res, buf_size);
		res = -EIO;
		goto Out;
	}

	/* Checksum = -(sum of record bytes + sum of header bytes with
	 * the checksum field zeroed), so the full table sums to zero.
	 */
	csum = 0;
	for (pp = buf; pp < buf + buf_size; pp++)
		csum += *pp;

	csum += __calc_hdr_byte_sum(control);

	csum = -csum;
	control->tbl_hdr.checksum = csum;
	res = __write_table_header(control);
Out:
	kfree(buf);
	return res;
}
0600
0601
0602
0603
0604
0605
0606
0607
0608
0609
0610
0611
0612
0613
0614 int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
0615 struct eeprom_table_record *record,
0616 const u32 num)
0617 {
0618 struct amdgpu_device *adev = to_amdgpu_device(control);
0619 int res;
0620
0621 if (!__is_ras_eeprom_supported(adev))
0622 return 0;
0623
0624 if (num == 0) {
0625 DRM_ERROR("will not append 0 records\n");
0626 return -EINVAL;
0627 } else if (num > control->ras_max_record_count) {
0628 DRM_ERROR("cannot append %d records than the size of table %d\n",
0629 num, control->ras_max_record_count);
0630 return -EINVAL;
0631 }
0632
0633 mutex_lock(&control->ras_tbl_mutex);
0634
0635 res = amdgpu_ras_eeprom_append_table(control, record, num);
0636 if (!res)
0637 res = amdgpu_ras_eeprom_update_header(control);
0638 if (!res)
0639 amdgpu_ras_debugfs_set_ret_size(control);
0640
0641 mutex_unlock(&control->ras_tbl_mutex);
0642 return res;
0643 }
0644
0645
0646
0647
0648
0649
0650
0651
0652
0653
0654
/* Read a contiguous run of @num encoded records into @buf, starting
 * at ring index @fri.  Callers split wrap-around reads into two calls;
 * this helper never wraps.
 * Returns 0 on success, -errno on failure.
 */
static int __amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
				    u8 *buf, const u32 fri, const u32 num)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	u32 buf_size;
	int res;

	/* Hold the reset-domain semaphore for read across the i2c
	 * transfer (see __write_table_header()).
	 */
	down_read(&adev->reset_domain->sem);
	buf_size = num * RAS_TABLE_RECORD_SIZE;
	res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
				 control->i2c_address +
				 RAS_INDEX_TO_OFFSET(control, fri),
				 buf, buf_size);
	up_read(&adev->reset_domain->sem);
	if (res < 0) {
		DRM_ERROR("Reading %d EEPROM table records error:%d",
			  num, res);
	} else if (res < buf_size) {
		/* Partial read -- report and fail with -EIO. */
		DRM_ERROR("Read %d records out of %d",
			  res / RAS_TABLE_RECORD_SIZE, num);
		res = -EIO;
	} else {
		res = 0;
	}

	return res;
}
0685
0686
0687
0688
0689
0690
0691
0692
0693
0694
0695
0696
/**
 * amdgpu_ras_eeprom_read -- read records from the RAS EEPROM table
 * @control: pointer to control structure
 * @record: array to fill in; must hold at least @num entries
 * @num: number of records to read; must be 1..ras_num_recs
 *
 * Read the @num oldest records, starting at the first-record index
 * (ras_fri) and wrapping around the ring buffer if necessary, decode
 * them into @record, and fold any newly seen memory channels into the
 * bad-channel bitmap.  Serialized by control->ras_tbl_mutex.
 *
 * Returns 0 on success (or trivially when RAS EEPROM is unsupported),
 * -EINVAL on bad @num, -ENOMEM or -errno on failure.
 */
int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
			   struct eeprom_table_record *record,
			   const u32 num)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int i, res;
	u8 *buf, *pp;
	u32 g0, g1;

	if (!__is_ras_eeprom_supported(adev))
		return 0;

	if (num == 0) {
		DRM_ERROR("will not read 0 records\n");
		return -EINVAL;
	} else if (num > control->ras_num_recs) {
		DRM_ERROR("too many records to read:%d available:%d\n",
			  num, control->ras_num_recs);
		return -EINVAL;
	}

	buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/* Split the read into up to two contiguous segments:
	 *   g0 -- records read from ras_fri up to (at most) the end of
	 *         the table,
	 *   g1 -- records that wrap around to index 0 (0 if the whole
	 *         run fits before the end).
	 * The first assignments below are scratch values used to test
	 * whether ras_fri + num - 1 runs past the table end.
	 */
	g0 = control->ras_fri + num - 1;
	g1 = g0 % control->ras_max_record_count;
	if (g0 < control->ras_max_record_count) {
		g0 = num;
		g1 = 0;
	} else {
		g0 = control->ras_max_record_count - control->ras_fri;
		g1 += 1;
	}

	mutex_lock(&control->ras_tbl_mutex);
	res = __amdgpu_ras_eeprom_read(control, buf, control->ras_fri, g0);
	if (res)
		goto Out;
	if (g1) {
		res = __amdgpu_ras_eeprom_read(control,
					       buf + g0 * RAS_TABLE_RECORD_SIZE,
					       0, g1);
		if (res)
			goto Out;
	}

	res = 0;

	/* Decode the staged bytes and track any newly seen bad memory
	 * channels, mirroring amdgpu_ras_eeprom_append_table().
	 */
	pp = buf;
	for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
		__decode_table_record_from_buf(control, &record[i], pp);

		if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
			control->bad_channel_bitmap |= 1 << record[i].mem_channel;
			con->update_channel_flag = true;
		}
	}
Out:
	kfree(buf);
	mutex_unlock(&control->ras_tbl_mutex);

	return res;
}
0785
/* Maximum number of bad-page records the RAS EEPROM table can hold. */
uint32_t amdgpu_ras_eeprom_max_record_count(void)
{
	return RAS_MAX_RECORD_COUNT;
}
0790
/* debugfs read handler reporting the RAS table capacity as a short
 * text line ("<bytes> bytes or <count> records"), or "Not supported".
 * Implements standard partial-read semantics via @pos.
 */
static ssize_t
amdgpu_ras_debugfs_eeprom_size_read(struct file *f, char __user *buf,
				    size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
	/* NOTE(review): u8 buffer passed to snprintf/copy_to_user;
	 * presumably fine in-kernel but a char buffer would avoid a
	 * pointer-sign warning -- confirm build flags.
	 */
	u8 data[50];
	int res;

	if (!size)
		return size;

	if (!ras || !control) {
		res = snprintf(data, sizeof(data), "Not supported\n");
	} else {
		res = snprintf(data, sizeof(data), "%d bytes or %d records\n",
			       RAS_TBL_SIZE_BYTES, control->ras_max_record_count);
	}

	/* EOF once the caller has read past the formatted text. */
	if (*pos >= res)
		return 0;

	res -= *pos;
	res = min_t(size_t, res, size);

	if (copy_to_user(buf, &data[*pos], res))
		return -EFAULT;

	*pos += res;

	return res;
}
0824
/* Read-only debugfs file exposing the RAS table capacity. */
const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_eeprom_size_read,
	.write = NULL,
	.llseek = default_llseek,
};
0831
/* Fixed-width formats for the debugfs table dump.  The *_fmt_size
 * macros give the exact formatted line length (excluding the NUL),
 * which lets the read handler compute file offsets arithmetically:
 * tbl_hdr_fmt_size: five "0x%08X" fields (10 chars each) + 4 spaces
 * + newline; rec_hdr_fmt_size: sum of each record column's width + 1
 * separator.
 */
static const char *tbl_hdr_str = " Signature Version FirstOffs Size Checksum\n";
static const char *tbl_hdr_fmt = "0x%08X 0x%08X 0x%08X 0x%08X 0x%08X\n";
#define tbl_hdr_fmt_size (5 * (2+8) + 4 + 1)
static const char *rec_hdr_str = "Index Offset ErrType Bank/CU TimeStamp Offs/Addr MemChl MCUMCID RetiredPage\n";
static const char *rec_hdr_fmt = "%5d 0x%05X %7s 0x%02X 0x%016llX 0x%012llX 0x%02X 0x%02X 0x%012llX\n";
#define rec_hdr_fmt_size (5 + 1 + 7 + 1 + 7 + 1 + 7 + 1 + 18 + 1 + 14 + 1 + 6 + 1 + 7 + 1 + 14 + 1)

/* Human-readable names for eeprom_table_record::err_type values. */
static const char *record_err_type_str[AMDGPU_RAS_EEPROM_ERR_COUNT] = {
	"ignore",
	"re",
	"ue",
};
0844
0845 static loff_t amdgpu_ras_debugfs_table_size(struct amdgpu_ras_eeprom_control *control)
0846 {
0847 return strlen(tbl_hdr_str) + tbl_hdr_fmt_size +
0848 strlen(rec_hdr_str) + rec_hdr_fmt_size * control->ras_num_recs;
0849 }
0850
0851 void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control)
0852 {
0853 struct amdgpu_ras *ras = container_of(control, struct amdgpu_ras,
0854 eeprom_control);
0855 struct dentry *de = ras->de_ras_eeprom_table;
0856
0857 if (de)
0858 d_inode(de)->i_size = amdgpu_ras_debugfs_table_size(control);
0859 }
0860
/* debugfs read handler producing a formatted dump of the RAS table.
 * The virtual file has four consecutive sections whose byte offsets
 * are computed from the fixed format sizes above:
 *   1) tbl_hdr_str, 2) one tbl_hdr_fmt line with the header values,
 *   3) rec_hdr_str, 4) one rec_hdr_fmt line per record, read from
 *      EEPROM one record at a time.
 * Supports reads starting at an arbitrary *pos inside any section.
 * Returns the number of bytes copied, or -errno.
 */
static ssize_t amdgpu_ras_debugfs_table_read(struct file *f, char __user *buf,
					     size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_eeprom_control *control = &ras->eeprom_control;
	const size_t orig_size = size;
	int res = -EFAULT;
	size_t data_len;

	mutex_lock(&control->ras_tbl_mutex);

	/* Section 1: the table-header column labels. */
	data_len = strlen(tbl_hdr_str);
	if (*pos < data_len) {
		data_len -= *pos;
		data_len = min_t(size_t, data_len, size);
		if (copy_to_user(buf, &tbl_hdr_str[*pos], data_len))
			goto Out;
		buf += data_len;
		size -= data_len;
		*pos += data_len;
	}

	/* Section 2: the formatted header values.  lpos is the offset
	 * of *pos within this section.
	 */
	data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size;
	if (*pos < data_len && size > 0) {
		u8 data[tbl_hdr_fmt_size + 1];
		loff_t lpos;

		snprintf(data, sizeof(data), tbl_hdr_fmt,
			 control->tbl_hdr.header,
			 control->tbl_hdr.version,
			 control->tbl_hdr.first_rec_offset,
			 control->tbl_hdr.tbl_size,
			 control->tbl_hdr.checksum);

		data_len -= *pos;
		data_len = min_t(size_t, data_len, size);
		lpos = *pos - strlen(tbl_hdr_str);
		if (copy_to_user(buf, &data[lpos], data_len))
			goto Out;
		buf += data_len;
		size -= data_len;
		*pos += data_len;
	}

	/* Section 3: the record column labels. */
	data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size + strlen(rec_hdr_str);
	if (*pos < data_len && size > 0) {
		loff_t lpos;

		data_len -= *pos;
		data_len = min_t(size_t, data_len, size);
		lpos = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size;
		if (copy_to_user(buf, &rec_hdr_str[lpos], data_len))
			goto Out;
		buf += data_len;
		size -= data_len;
		*pos += data_len;
	}

	/* Section 4: one fixed-width line per record.  s is the
	 * relative record index at *pos, r the byte offset within that
	 * record's formatted line (non-zero only for the first line).
	 */
	data_len = amdgpu_ras_debugfs_table_size(control);
	if (*pos < data_len && size > 0) {
		u8 dare[RAS_TABLE_RECORD_SIZE];
		u8 data[rec_hdr_fmt_size + 1];
		struct eeprom_table_record record;
		int s, r;

		s = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
			strlen(rec_hdr_str);
		s = s / rec_hdr_fmt_size;
		r = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
			strlen(rec_hdr_str);
		r = r % rec_hdr_fmt_size;

		for ( ; size > 0 && s < control->ras_num_recs; s++) {
			u32 ai = RAS_RI_TO_AI(control, s);

			/* Read and decode the record at absolute ring
			 * index ai, then format and copy out as much
			 * of its line as the caller's buffer allows.
			 */
			res = __amdgpu_ras_eeprom_read(control, dare, ai, 1);
			if (res)
				goto Out;
			__decode_table_record_from_buf(control, &record, dare);
			snprintf(data, sizeof(data), rec_hdr_fmt,
				 s,
				 RAS_INDEX_TO_OFFSET(control, ai),
				 record_err_type_str[record.err_type],
				 record.bank,
				 record.ts,
				 record.offset,
				 record.mem_channel,
				 record.mcumc_id,
				 record.retired_page);

			data_len = min_t(size_t, rec_hdr_fmt_size - r, size);
			if (copy_to_user(buf, &data[r], data_len)) {
				res = -EFAULT;
				goto Out;
			}
			buf += data_len;
			size -= data_len;
			*pos += data_len;
			r = 0;
		}
	}
	res = 0;
Out:
	mutex_unlock(&control->ras_tbl_mutex);
	return res < 0 ? res : orig_size - size;
}
0974
/* debugfs read entry point for the RAS table dump: emit "Not
 * supported" when no RAS context exists, otherwise delegate to
 * amdgpu_ras_debugfs_table_read().
 */
static ssize_t
amdgpu_ras_debugfs_eeprom_table_read(struct file *f, char __user *buf,
				     size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
	u8 data[81];
	int res;

	if (!size)
		return size;

	if (!ras || !control) {
		res = snprintf(data, sizeof(data), "Not supported\n");
		/* Standard partial-read bookkeeping over the static
		 * message.
		 */
		if (*pos >= res)
			return 0;

		res -= *pos;
		res = min_t(size_t, res, size);

		if (copy_to_user(buf, &data[*pos], res))
			return -EFAULT;

		*pos += res;

		return res;
	} else {
		return amdgpu_ras_debugfs_table_read(f, buf, size, pos);
	}
}
1006
/* Read-only debugfs file exposing the formatted RAS table contents. */
const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_eeprom_table_read,
	.write = NULL,
	.llseek = default_llseek,
};
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
/* Read the header plus all ras_num_recs records from the EEPROM and
 * sum their bytes.  A well-formed table sums to zero, so the return
 * value is 0 when the checksum verifies, a positive byte-sum when it
 * does not, or -errno on read/allocation failure.
 */
static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	int buf_size, res;
	u8 csum, *buf, *pp;

	buf_size = RAS_TABLE_HEADER_SIZE +
		control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
	buf = kzalloc(buf_size, GFP_KERNEL);
	if (!buf) {
		DRM_ERROR("Out of memory checking RAS table checksum.\n");
		return -ENOMEM;
	}

	res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
				 control->i2c_address +
				 control->ras_header_offset,
				 buf, buf_size);
	if (res < buf_size) {
		DRM_ERROR("Partial read for checksum, res:%d\n", res);
		/* On a short (but non-negative) read, report -EIO;
		 * propagate a negative i2c error as-is.
		 */
		if (res >= 0)
			res = -EIO;
		goto Out;
	}

	csum = 0;
	for (pp = buf; pp < buf + buf_size; pp++)
		csum += *pp;
Out:
	kfree(buf);
	return res < 0 ? res : csum;
}
1058
/**
 * amdgpu_ras_eeprom_init -- initialize RAS EEPROM control
 * @control: pointer to control structure
 * @exceed_err_limit: set true when the stored bad-page count exceeds
 *                    the threshold and the GPU should not initialize
 *
 * Locate the RAS table on the EEPROM, read and decode its header, and
 * populate @control (record count, first-record index).  Depending on
 * the header signature:
 *   - RAS_TABLE_HDR_VAL: existing table -- verify the checksum and
 *     warn when usage nears the bad-page threshold;
 *   - RAS_TABLE_HDR_BAD with a non-zero threshold: either restore the
 *     valid signature (if the current threshold now allows the stored
 *     records) or flag *exceed_err_limit (unless the module parameter
 *     amdgpu_bad_page_threshold is -2, which overrides the check);
 *   - anything else: create a fresh, empty table.
 *
 * Returns 0 on success, -errno on failure.
 */
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
			   bool *exceed_err_limit)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int res;

	*exceed_err_limit = false;

	if (!__is_ras_eeprom_supported(adev))
		return 0;

	/* An i2c bus with a transfer implementation is required. */
	if (!adev->pm.ras_eeprom_i2c_bus || !adev->pm.ras_eeprom_i2c_bus->algo)
		return -ENOENT;

	if (!__get_eeprom_i2c_addr(adev, control))
		return -EINVAL;

	control->ras_header_offset = RAS_HDR_START;
	control->ras_record_offset = RAS_RECORD_START;
	control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
	mutex_init(&control->ras_tbl_mutex);

	/* Read and decode the on-EEPROM table header. */
	res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
				 control->i2c_address + control->ras_header_offset,
				 buf, RAS_TABLE_HEADER_SIZE);
	if (res < RAS_TABLE_HEADER_SIZE) {
		DRM_ERROR("Failed to read EEPROM table header, res:%d", res);
		return res >= 0 ? -EIO : res;
	}

	__decode_table_header_from_buf(hdr, buf);

	control->ras_num_recs = RAS_NUM_RECS(hdr);
	control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);

	if (hdr->header == RAS_TABLE_HDR_VAL) {
		DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
				 control->ras_num_recs);
		res = __verify_ras_table_checksum(control);
		if (res)
			DRM_ERROR("RAS table incorrect checksum or error:%d\n",
				  res);

		/* Warn once usage reaches 90% of the threshold
		 * (compared as 10*recs >= 9*threshold to avoid
		 * fractions).
		 */
		if (10 * control->ras_num_recs >= 9 * ras->bad_page_cnt_threshold)
			dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
				 control->ras_num_recs,
				 ras->bad_page_cnt_threshold);
	} else if (hdr->header == RAS_TABLE_HDR_BAD &&
		   amdgpu_bad_page_threshold != 0) {
		res = __verify_ras_table_checksum(control);
		if (res)
			DRM_ERROR("RAS Table incorrect checksum or error:%d\n",
				  res);
		if (ras->bad_page_cnt_threshold > control->ras_num_recs) {
			/* The current threshold now accommodates the
			 * stored records: restore the valid signature
			 * so the table can be used again.
			 */
			dev_info(adev->dev,
				 "records:%d threshold:%d, resetting "
				 "RAS table header signature",
				 control->ras_num_recs,
				 ras->bad_page_cnt_threshold);
			res = amdgpu_ras_eeprom_correct_header_tag(control,
								   RAS_TABLE_HDR_VAL);
		} else {
			dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
				control->ras_num_recs, ras->bad_page_cnt_threshold);
			if (amdgpu_bad_page_threshold == -2) {
				/* Explicit user override: continue
				 * despite the exceeded threshold.
				 */
				dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -2.");
				res = 0;
			} else {
				*exceed_err_limit = true;
				dev_err(adev->dev,
					"RAS records:%d exceed threshold:%d, "
					"GPU will not be initialized. Replace this GPU or increase the threshold",
					control->ras_num_recs, ras->bad_page_cnt_threshold);
			}
		}
	} else {
		/* No recognizable table on the EEPROM. */
		DRM_INFO("Creating a new EEPROM table");

		res = amdgpu_ras_eeprom_reset_table(control);
	}

	return res < 0 ? res : 0;
}