#include <linux/list.h>
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "soc15.h"
#include "df/df_3_6_offset.h"
#include "xgmi/xgmi_4_0_0_smn.h"
#include "xgmi/xgmi_4_0_0_sh_mask.h"
#include "wafl/wafl2_4_0_0_smn.h"
#include "wafl/wafl2_4_0_0_sh_mask.h"

#include "amdgpu_reset.h"

#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
#define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210

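/* Serializes hive creation/lookup and all changes to the global hive list. */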
static DEFINE_MUTEX(xgmi_mutex);

#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE		4

static LIST_HEAD(xgmi_hive_list);

static const int xgmi_pcs_err_status_reg_vg20[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
};

static const int wafl_pcs_err_status_reg_vg20[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

static const int xgmi_pcs_err_status_reg_arct[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000,
};

static const int wafl_pcs_err_status_reg_arct[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = {
	smnPCS_XGMI3X16_PCS_ERROR_STATUS,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000
};

static const int wafl_pcs_err_status_reg_aldebaran[] = {
	smnPCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000
};

static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
	{"XGMI PCS DataLossErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
	{"XGMI PCS TrainingErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)},
	{"XGMI PCS CRCErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
	{"XGMI PCS BERExceededErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)},
	{"XGMI PCS TxMetaDataErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"XGMI PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"XGMI PCS DataParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)},
	{"XGMI PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"XGMI PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"XGMI PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"XGMI PCS DeskewErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)},
	{"XGMI PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"XGMI PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"XGMI PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"XGMI PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"XGMI PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"XGMI PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"XGMI PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};

static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
	{"WAFL PCS DataLossErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)},
	{"WAFL PCS TrainingErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)},
	{"WAFL PCS CRCErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)},
	{"WAFL PCS BERExceededErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)},
	{"WAFL PCS TxMetaDataErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"WAFL PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"WAFL PCS DataParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)},
	{"WAFL PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"WAFL PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"WAFL PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"WAFL PCS DeskewErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)},
	{"WAFL PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"WAFL PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"WAFL PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"WAFL PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"WAFL PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"WAFL PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"WAFL PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};
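/*
 * Sysfs layout for an XGMI hive: every member GPU exposes an xgmi_device_id
 * and xgmi_error file in its device directory, and the hive itself is
 * represented by an xgmi_hive_info kobject carrying the xgmi_hive_id
 * attribute plus one node%d link per member device.
 */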
static struct attribute amdgpu_xgmi_hive_id = {
	.name = "xgmi_hive_id",
	.mode = S_IRUGO
};

static struct attribute *amdgpu_xgmi_hive_attrs[] = {
	&amdgpu_xgmi_hive_id,
	NULL
};
ATTRIBUTE_GROUPS(amdgpu_xgmi_hive);

static ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj,
				      struct attribute *attr, char *buf)
{
	struct amdgpu_hive_info *hive = container_of(
		kobj, struct amdgpu_hive_info, kobj);

	if (attr == &amdgpu_xgmi_hive_id)
		return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);

	return 0;
}

static void amdgpu_xgmi_hive_release(struct kobject *kobj)
{
	struct amdgpu_hive_info *hive = container_of(
		kobj, struct amdgpu_hive_info, kobj);

	amdgpu_reset_put_reset_domain(hive->reset_domain);
	hive->reset_domain = NULL;

	mutex_destroy(&hive->hive_lock);
	kfree(hive);
}

static const struct sysfs_ops amdgpu_xgmi_hive_ops = {
	.show = amdgpu_xgmi_show_attrs,
};

struct kobj_type amdgpu_xgmi_hive_type = {
	.release = amdgpu_xgmi_hive_release,
	.sysfs_ops = &amdgpu_xgmi_hive_ops,
	.default_groups = amdgpu_xgmi_hive_groups,
};

static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%llu\n", adev->gmc.xgmi.node_id);
}

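/*
 * AMDGPU_XGMI_SET_FICAA() builds the Data Fabric indirect-access (FICAA) word
 * used below with df.funcs->get_fica()/set_fica() to read and clear the xGMI
 * error counters.
 */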
#define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801)
static ssize_t amdgpu_xgmi_show_error(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
	uint64_t fica_out;
	unsigned int error_count = 0;

	ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
	ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);

	if ((!adev->df.funcs) ||
	    (!adev->df.funcs->get_fica) ||
	    (!adev->df.funcs->set_fica))
		return -EINVAL;

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
	if (fica_out != 0x1f)
		pr_err("xGMI error counters not enabled!\n");

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);

	if ((fica_out & 0xffff) == 2)
		error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);

	adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);

	return sysfs_emit(buf, "%u\n", error_count);
}

static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);

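/*
 * Per-device sysfs plumbing: each GPU gets xgmi_device_id and xgmi_error
 * files, a link to its hive's xgmi_hive_info kobject, and a node%d back-link
 * from the hive to the device.
 */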
static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
					  struct amdgpu_hive_info *hive)
{
	int ret = 0;
	char node[10] = { 0 };

	/* Create xgmi device id file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
		return ret;
	}

	/* Create xgmi error file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
	if (ret)
		pr_err("failed to create xgmi_error\n");

	/* Create sysfs link to hive info folder on the first device */
	if (hive->kobj.parent != (&adev->dev->kobj)) {
		ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj,
					"xgmi_hive_info");
		if (ret) {
			dev_err(adev->dev, "XGMI: Failed to create link to hive info");
			goto remove_file;
		}
	}

	/* Create sysfs link from the hive folder to this device */
	sprintf(node, "node%d", atomic_read(&hive->number_devices));
	ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create link from hive info");
		goto remove_link;
	}

	goto success;

remove_link:
	sysfs_remove_link(&adev->dev->kobj, adev_to_drm(adev)->unique);

remove_file:
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);

success:
	return ret;
}

static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
					   struct amdgpu_hive_info *hive)
{
	char node[10];
	memset(node, 0, sizeof(node));

	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
	device_remove_file(adev->dev, &dev_attr_xgmi_error);

	if (hive->kobj.parent != (&adev->dev->kobj))
		sysfs_remove_link(&adev->dev->kobj, "xgmi_hive_info");

	sprintf(node, "node%d", atomic_read(&hive->number_devices));
	sysfs_remove_link(&hive->kobj, node);
}

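/*
 * Look up the hive this device belongs to, creating and registering it on
 * first use.  The returned hive carries an extra kobject reference that the
 * caller must drop with amdgpu_put_xgmi_hive().
 */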
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive = NULL;
	int ret;

	if (!adev->gmc.xgmi.hive_id)
		return NULL;

	if (adev->hive) {
		kobject_get(&adev->hive->kobj);
		return adev->hive;
	}

	mutex_lock(&xgmi_mutex);

	list_for_each_entry(hive, &xgmi_hive_list, node) {
		if (hive->hive_id == adev->gmc.xgmi.hive_id)
			goto pro_end;
	}

	hive = kzalloc(sizeof(*hive), GFP_KERNEL);
	if (!hive) {
		dev_err(adev->dev, "XGMI: allocation failed\n");
		hive = NULL;
		goto pro_end;
	}

	ret = kobject_init_and_add(&hive->kobj,
				   &amdgpu_xgmi_hive_type,
				   &adev->dev->kobj,
				   "%s", "xgmi_hive_info");
	if (ret) {
		dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n");
		/* kobject_put() invokes the release callback, which frees the hive */
		kobject_put(&hive->kobj);
		hive = NULL;
		goto pro_end;
	}

	/*
	 * Use a hive-wide reset domain: create one unless the device's reset
	 * domain is already of the XGMI_HIVE type, in which case it is shared.
	 */
	if (adev->reset_domain->type != XGMI_HIVE) {
		hive->reset_domain = amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
		if (!hive->reset_domain) {
			dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n");
			ret = -ENOMEM;
			kobject_put(&hive->kobj);
			hive = NULL;
			goto pro_end;
		}
	} else {
		amdgpu_reset_get_reset_domain(adev->reset_domain);
		hive->reset_domain = adev->reset_domain;
	}

	hive->hive_id = adev->gmc.xgmi.hive_id;
	INIT_LIST_HEAD(&hive->device_list);
	INIT_LIST_HEAD(&hive->node);
	mutex_init(&hive->hive_lock);
	atomic_set(&hive->number_devices, 0);
	task_barrier_init(&hive->tb);
	hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
	hive->hi_req_gpu = NULL;

	/*
	 * The hive comes up in the high pstate on Vega20, so seed the
	 * high-request count accordingly; it drops to low after boot.
	 */
	hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
	list_add_tail(&hive->node, &xgmi_hive_list);

pro_end:
	if (hive)
		kobject_get(&hive->kobj);
	mutex_unlock(&xgmi_mutex);
	return hive;
}

void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive)
{
	if (hive)
		kobject_put(&hive->kobj);
}

int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{
	int ret = 0;
	struct amdgpu_hive_info *hive;
	struct amdgpu_device *request_adev;
	bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
	bool init_low;

	hive = amdgpu_get_xgmi_hive(adev);
	if (!hive)
		return 0;

	request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev;
	init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
	amdgpu_put_xgmi_hive(hive);
	/* Pstate switching is currently disabled; bail out before the request logic below. */
	return 0;

	if (!hive || adev->asic_type != CHIP_VEGA20)
		return 0;

	mutex_lock(&hive->hive_lock);

	if (is_hi_req)
		hive->hi_req_count++;
	else
		hive->hi_req_count--;

	/*
	 * A single peer requesting the high pstate raises the hive, but all
	 * peers must drop their high requests before the hive goes low.
	 */
	if (hive->pstate == pstate ||
	    (!is_hi_req && hive->hi_req_count && !init_low))
		goto out;

	dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate);

	ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate);
	if (ret) {
		dev_err(request_adev->dev,
			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
			request_adev->gmc.xgmi.node_id,
			request_adev->gmc.xgmi.hive_id, ret);
		goto out;
	}

	if (init_low)
		hive->pstate = hive->hi_req_count ?
					hive->pstate : AMDGPU_XGMI_PSTATE_MIN;
	else {
		hive->pstate = pstate;
		hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ?
							adev : NULL;
	}
out:
	mutex_unlock(&hive->hive_lock);
	return ret;
}

int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
{
	int ret;

	/* Each PSP needs to be given the latest topology */
	ret = psp_xgmi_set_topology_info(&adev->psp,
					 atomic_read(&hive->number_devices),
					 &adev->psp.xgmi_context.top_info);
	if (ret)
		dev_err(adev->dev,
			"XGMI: Set topology failure on device %llx, hive %llx, ret %d",
			adev->gmc.xgmi.node_id,
			adev->gmc.xgmi.hive_id, ret);

	return ret;
}
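/*
 * psp_xgmi_node_info.num_hops packs more than the hop count; only the low
 * three bits hold the actual number of hops, so mask off the rest.
 */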
int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
			       struct amdgpu_device *peer_adev)
{
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	uint8_t num_hops_mask = 0x7;
	int i;

	for (i = 0; i < top->num_nodes; ++i)
		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
			return top->nodes[i].num_hops & num_hops_mask;
	return -EINVAL;
}

int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
			      struct amdgpu_device *peer_adev)
{
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	int i;

	for (i = 0; i < top->num_nodes; ++i)
		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
			return top->nodes[i].num_links;
	return -EINVAL;
}

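/*
 * Re-initialize the PSP XGMI session on every device in the hive with the
 * requested extended-data setting.  Callers hold hive->hive_lock.
 */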
static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive,
							   bool set_extended_data)
{
	struct amdgpu_device *tmp_adev;
	int ret;

	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		ret = psp_xgmi_initialize(&tmp_adev->psp, set_extended_data, false);
		if (ret) {
			dev_err(tmp_adev->dev,
				"XGMI: Failed to initialize xgmi session for data partition %i\n",
				set_extended_data);
			return ret;
		}
	}

	return 0;
}

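/*
 * Register a device with its hive: resolve the hive and node IDs through PSP
 * (or fall back to fixed IDs when PSP cannot be used), exchange topology info
 * with every peer, and publish the sysfs entries.
 */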
int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{
	struct psp_xgmi_topology_info *top_info;
	struct amdgpu_hive_info *hive;
	struct amdgpu_xgmi *entry;
	struct amdgpu_device *tmp_adev = NULL;

	int count = 0, ret = 0;

	if (!adev->gmc.xgmi.supported)
		return 0;

	if (!adev->gmc.xgmi.pending_reset &&
	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		ret = psp_xgmi_initialize(&adev->psp, false, true);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to initialize xgmi session\n");
			return ret;
		}

		ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get hive id\n");
			return ret;
		}

		ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get node id\n");
			return ret;
		}
	} else {
		adev->gmc.xgmi.hive_id = 16;
		adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
	}

	hive = amdgpu_get_xgmi_hive(adev);
	if (!hive) {
		ret = -EINVAL;
		dev_err(adev->dev,
			"XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
			adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
		goto exit;
	}
	mutex_lock(&hive->hive_lock);

	top_info = &adev->psp.xgmi_context.top_info;

	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
	list_for_each_entry(entry, &hive->device_list, head)
		top_info->nodes[count++].node_id = entry->node_id;
	top_info->num_nodes = count;
	atomic_set(&hive->number_devices, count);

	task_barrier_add_task(&hive->tb);

	if (!adev->gmc.xgmi.pending_reset &&
	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			/* update the node list for the other devices in the hive */
			if (tmp_adev != adev) {
				top_info = &tmp_adev->psp.xgmi_context.top_info;
				top_info->nodes[count - 1].node_id =
					adev->gmc.xgmi.node_id;
				top_info->num_nodes = count;
			}
			ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
			if (ret)
				goto exit_unlock;
		}

		/* get the latest topology info for each device from psp */
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
					&tmp_adev->psp.xgmi_context.top_info, false);
			if (ret) {
				dev_err(tmp_adev->dev,
					"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
					tmp_adev->gmc.xgmi.node_id,
					tmp_adev->gmc.xgmi.hive_id, ret);
				goto exit_unlock;
			}
		}

		/* get the topology again for hives that support extended data */
		if (adev->psp.xgmi_context.supports_extended_data) {

			/* initialize the hive to get extended data */
			ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true);
			if (ret)
				goto exit_unlock;

			/* get the extended data */
			list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
				ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
						&tmp_adev->psp.xgmi_context.top_info, true);
				if (ret) {
					dev_err(tmp_adev->dev,
						"XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d",
						tmp_adev->gmc.xgmi.node_id,
						tmp_adev->gmc.xgmi.hive_id, ret);
					goto exit_unlock;
				}
			}

			/* re-initialize the hive for non-extended data */
			ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false);
			if (ret)
				goto exit_unlock;
		}
	}

	if (!ret && !adev->gmc.xgmi.pending_reset)
		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);

exit_unlock:
	mutex_unlock(&hive->hive_lock);
exit:
	if (!ret) {
		adev->hive = hive;
		dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
			 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
	} else {
		amdgpu_put_xgmi_hive(hive);
		dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
			adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
			ret);
	}

	return ret;
}

int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive = adev->hive;

	if (!adev->gmc.xgmi.supported)
		return -EINVAL;

	if (!hive)
		return -EINVAL;

	mutex_lock(&hive->hive_lock);
	task_barrier_rem_task(&hive->tb);
	amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
	if (hive->hi_req_gpu == adev)
		hive->hi_req_gpu = NULL;
	list_del(&adev->gmc.xgmi.head);
	mutex_unlock(&hive->hive_lock);

	amdgpu_put_xgmi_hive(hive);
	adev->hive = NULL;

	if (atomic_dec_return(&hive->number_devices) == 0) {
		/* Remove the hive from the global hive list */
		mutex_lock(&xgmi_mutex);
		list_del(&hive->node);
		mutex_unlock(&xgmi_mutex);

		amdgpu_put_xgmi_hive(hive);
	}

	return 0;
}

static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
	if (!adev->gmc.xgmi.supported ||
	    adev->gmc.xgmi.num_physical_nodes == 0)
		return 0;

	adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);

	return amdgpu_ras_block_late_init(adev, ras_block);
}

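/*
 * Translate a node-local physical address into the hive-wide address space by
 * adding this node's segment offset.
 */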
uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
					   uint64_t addr)
{
	struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi;
	return (addr + xgmi->physical_node_id * xgmi->node_segment_size);
}

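/* Clear a PCS error status register by writing all ones followed by zero. */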
static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
{
	WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
	WREG32_PCIE(pcs_status_reg, 0);
}

static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
{
	uint32_t i;

	switch (adev->asic_type) {
	case CHIP_ARCTURUS:
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
			pcs_clear_status(adev,
					 xgmi_pcs_err_status_reg_arct[i]);
		break;
	case CHIP_VEGA20:
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
			pcs_clear_status(adev,
					 xgmi_pcs_err_status_reg_vg20[i]);
		break;
	case CHIP_ALDEBARAN:
		for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++)
			pcs_clear_status(adev,
					 xgmi3x16_pcs_err_status_reg_aldebaran[i]);
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_aldebaran); i++)
			pcs_clear_status(adev,
					 wafl_pcs_err_status_reg_aldebaran[i]);
		break;
	default:
		break;
	}
}

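/*
 * Decode a raw PCS error status value into per-field error counts using the
 * xGMI or WAFL field tables above; every reported field is counted as
 * uncorrectable and accumulated into *ue_count.
 */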
static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
					      uint32_t value,
					      uint32_t *ue_count,
					      uint32_t *ce_count,
					      bool is_xgmi_pcs)
{
	int i;
	int ue_cnt;

	if (is_xgmi_pcs) {
		/* query xgmi pcs error status; only ue is supported */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i++) {
			ue_cnt = (value &
				  xgmi_pcs_ras_fields[i].pcs_err_mask) >>
				 xgmi_pcs_ras_fields[i].pcs_err_shift;
			if (ue_cnt) {
				dev_info(adev->dev, "%s detected\n",
					 xgmi_pcs_ras_fields[i].err_name);
				*ue_count += ue_cnt;
			}
		}
	} else {
		/* query wafl pcs error status; only ue is supported */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) {
			ue_cnt = (value &
				  wafl_pcs_ras_fields[i].pcs_err_mask) >>
				 wafl_pcs_ras_fields[i].pcs_err_shift;
			if (ue_cnt) {
				dev_info(adev->dev, "%s detected\n",
					 wafl_pcs_ras_fields[i].err_name);
				*ue_count += ue_cnt;
			}
		}
	}

	return 0;
}

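/*
 * Read every PCS error status register defined for the current ASIC, fold the
 * decoded counts into the RAS error data, then clear the registers.
 */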
static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
					      void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	int i;
	uint32_t data;
	uint32_t ue_cnt = 0, ce_cnt = 0;

	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
		return;

	err_data->ue_count = 0;
	err_data->ce_count = 0;

	switch (adev->asic_type) {
	case CHIP_ARCTURUS:
		/* check xgmi pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) {
			data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, true);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) {
			data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, false);
		}
		break;
	case CHIP_VEGA20:
		/* check xgmi pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) {
			data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, true);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
			data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, false);
		}
		break;
	case CHIP_ALDEBARAN:
		/* check xgmi3x16 pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) {
			data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, true);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_aldebaran); i++) {
			data = RREG32_PCIE(wafl_pcs_err_status_reg_aldebaran[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, false);
		}
		break;
	default:
		dev_warn(adev->dev, "XGMI RAS error query not supported");
		break;
	}

	adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);

	err_data->ue_count += ue_cnt;
	err_data->ce_count += ce_cnt;
}

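/*
 * Error injection goes through the PSP RAS TA; DF C-states and XGMI power
 * down are disallowed around the trigger call and restored afterwards.
 */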
static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *inject_if)
{
	int ret = 0;
	struct ta_ras_trigger_error_input *block_info =
		(struct ta_ras_trigger_error_input *)inject_if;

	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
		dev_warn(adev->dev, "Failed to disallow df cstate");

	if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
		dev_warn(adev->dev, "Failed to disallow XGMI power down");

	ret = psp_ras_trigger_error(&adev->psp, block_info);

	if (amdgpu_ras_intr_triggered())
		return ret;

	if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
		dev_warn(adev->dev, "Failed to allow XGMI power down");

	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
		dev_warn(adev->dev, "Failed to allow df cstate");

	return ret;
}

struct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = {
	.query_ras_error_count = amdgpu_xgmi_query_ras_error_count,
	.reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count,
	.ras_error_inject = amdgpu_ras_error_inject_xgmi,
};

struct amdgpu_xgmi_ras xgmi_ras = {
	.ras_block = {
		.ras_comm = {
			.name = "xgmi_wafl",
			.block = AMDGPU_RAS_BLOCK__XGMI_WAFL,
			.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
		},
		.hw_ops = &xgmi_ras_hw_ops,
		.ras_late_init = amdgpu_xgmi_ras_late_init,
	},
};