/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include <linux/list.h>
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "soc15.h"
#include "df/df_3_6_offset.h"
#include "xgmi/xgmi_4_0_0_smn.h"
#include "xgmi/xgmi_4_0_0_sh_mask.h"
#include "wafl/wafl2_4_0_0_smn.h"
#include "wafl/wafl2_4_0_0_sh_mask.h"

#include "amdgpu_reset.h"

#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
#define smnPCS_GOPX1_PCS_ERROR_STATUS    0x12200210

static DEFINE_MUTEX(xgmi_mutex);

#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE     4

static LIST_HEAD(xgmi_hive_list);

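/*
 * Per-link PCS error status registers. Each link instance appears to
 * occupy its own SMN aperture, hence the fixed 0x100000 stride between
 * the entries below.
 */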
static const int xgmi_pcs_err_status_reg_vg20[] = {
    smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
    smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
};

static const int wafl_pcs_err_status_reg_vg20[] = {
    smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
    smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

static const int xgmi_pcs_err_status_reg_arct[] = {
    smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
    smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
    smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000,
    smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000,
    smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000,
    smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000,
};

/* same as vg20 */
static const int wafl_pcs_err_status_reg_arct[] = {
    smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
    smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = {
    smnPCS_XGMI3X16_PCS_ERROR_STATUS,
    smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000,
    smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000,
    smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000,
    smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000,
    smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000,
    smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000,
    smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000
};

static const int walf_pcs_err_status_reg_aldebaran[] = {
    smnPCS_GOPX1_PCS_ERROR_STATUS,
    smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000
};

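/*
 * Decode tables for the PCS error status registers. Each entry pairs a
 * printable error name with the field's mask and shift;
 * SOC15_REG_FIELD() expands to the <reg>__<field> _MASK and __SHIFT
 * constants from the sh_mask headers included above.
 */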
static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
    {"XGMI PCS DataLossErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
    {"XGMI PCS TrainingErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)},
    {"XGMI PCS CRCErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
    {"XGMI PCS BERExceededErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)},
    {"XGMI PCS TxMetaDataErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)},
    {"XGMI PCS ReplayBufParityErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)},
    {"XGMI PCS DataParityErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)},
    {"XGMI PCS ReplayFifoOverflowErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
    {"XGMI PCS ReplayFifoUnderflowErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
    {"XGMI PCS ElasticFifoOverflowErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
    {"XGMI PCS DeskewErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)},
    {"XGMI PCS DataStartupLimitErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)},
    {"XGMI PCS FCInitTimeoutErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
    {"XGMI PCS RecoveryTimeoutErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
    {"XGMI PCS ReadySerialTimeoutErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
    {"XGMI PCS ReadySerialAttemptErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
    {"XGMI PCS RecoveryAttemptErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
    {"XGMI PCS RecoveryRelockAttemptErr",
     SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};

static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
    {"WAFL PCS DataLossErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)},
    {"WAFL PCS TrainingErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)},
    {"WAFL PCS CRCErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)},
    {"WAFL PCS BERExceededErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)},
    {"WAFL PCS TxMetaDataErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)},
    {"WAFL PCS ReplayBufParityErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)},
    {"WAFL PCS DataParityErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)},
    {"WAFL PCS ReplayFifoOverflowErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
    {"WAFL PCS ReplayFifoUnderflowErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
    {"WAFL PCS ElasticFifoOverflowErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
    {"WAFL PCS DeskewErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)},
    {"WAFL PCS DataStartupLimitErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)},
    {"WAFL PCS FCInitTimeoutErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)},
    {"WAFL PCS RecoveryTimeoutErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
    {"WAFL PCS ReadySerialTimeoutErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
    {"WAFL PCS ReadySerialAttemptErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
    {"WAFL PCS RecoveryAttemptErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)},
    {"WAFL PCS RecoveryRelockAttemptErr",
     SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};

/**
 * DOC: AMDGPU XGMI Support
 *
 * XGMI is a high speed interconnect that joins multiple GPU cards
 * into a homogeneous memory space that is organized by a collective
 * hive ID and individual node IDs, both of which are 64-bit numbers.
 *
 * The file xgmi_device_id contains the unique per GPU device ID and
 * is stored in the /sys/class/drm/card${cardno}/device/ directory.
 *
 * Inside the device directory a sub-directory 'xgmi_hive_info' is
 * created which contains the hive ID and the list of nodes.
 *
 * The hive ID is stored in:
 *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
 *
 * The node information is stored in numbered directories:
 *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
 *
 * Each device has its own xgmi_hive_info directory with a mirror
 * set of node sub-directories.
 *
 * The XGMI memory space is built by contiguously adding the
 * power-of-two padded VRAM space from each node to each other.
 */
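
/*
 * Example of walking the layout above (hypothetical two-GPU hive; the
 * IDs shown are illustrative only):
 *
 *   $ cat /sys/class/drm/card0/device/xgmi_device_id
 *   1127573541
 *   $ cat /sys/class/drm/card0/device/xgmi_hive_info/xgmi_hive_id
 *   1127573540
 *   $ ls /sys/class/drm/card0/device/xgmi_hive_info/
 *   node0  node1  xgmi_hive_id
 *   $ cat /sys/class/drm/card0/device/xgmi_hive_info/node1/xgmi_device_id
 *   1127573542
 */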

static struct attribute amdgpu_xgmi_hive_id = {
    .name = "xgmi_hive_id",
    .mode = S_IRUGO
};

static struct attribute *amdgpu_xgmi_hive_attrs[] = {
    &amdgpu_xgmi_hive_id,
    NULL
};
ATTRIBUTE_GROUPS(amdgpu_xgmi_hive);

static ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj,
    struct attribute *attr, char *buf)
{
    struct amdgpu_hive_info *hive = container_of(
        kobj, struct amdgpu_hive_info, kobj);

    if (attr == &amdgpu_xgmi_hive_id)
        return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);

    return 0;
}

static void amdgpu_xgmi_hive_release(struct kobject *kobj)
{
    struct amdgpu_hive_info *hive = container_of(
        kobj, struct amdgpu_hive_info, kobj);

    amdgpu_reset_put_reset_domain(hive->reset_domain);
    hive->reset_domain = NULL;

    mutex_destroy(&hive->hive_lock);
    kfree(hive);
}

static const struct sysfs_ops amdgpu_xgmi_hive_ops = {
    .show = amdgpu_xgmi_show_attrs,
};

struct kobj_type amdgpu_xgmi_hive_type = {
    .release = amdgpu_xgmi_hive_release,
    .sysfs_ops = &amdgpu_xgmi_hive_ops,
    .default_groups = amdgpu_xgmi_hive_groups,
};

static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
                     struct device_attribute *attr,
                     char *buf)
{
    struct drm_device *ddev = dev_get_drvdata(dev);
    struct amdgpu_device *adev = drm_to_adev(ddev);

    return sysfs_emit(buf, "%llu\n", adev->gmc.xgmi.node_id);
}

#define AMDGPU_XGMI_SET_FICAA(o)    ((o) | 0x456801)
static ssize_t amdgpu_xgmi_show_error(struct device *dev,
                      struct device_attribute *attr,
                      char *buf)
{
    struct drm_device *ddev = dev_get_drvdata(dev);
    struct amdgpu_device *adev = drm_to_adev(ddev);
    uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
    uint64_t fica_out;
    unsigned int error_count = 0;

    ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
    ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);

    if ((!adev->df.funcs) ||
        (!adev->df.funcs->get_fica) ||
        (!adev->df.funcs->set_fica))
        return -EINVAL;

    fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
    if (fica_out != 0x1f)
        pr_err("xGMI error counters not enabled!\n");

    fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);

    /* bits 63:62 of the status output each flag one error; sum them */
    if ((fica_out & 0xffff) == 2)
        error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);

    /* write back zero to clear the count for the next read */
    adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);

    return sysfs_emit(buf, "%u\n", error_count);
}

static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);

static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
                     struct amdgpu_hive_info *hive)
{
    int ret = 0;
    char node[10] = { 0 };

    /* Create xgmi device id file */
    ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
    if (ret) {
        dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
        return ret;
    }

    /* Create xgmi error file */
    ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
    if (ret)
        pr_err("failed to create xgmi_error\n");

    /* Create sysfs link to hive info folder on the first device */
    if (hive->kobj.parent != (&adev->dev->kobj)) {
        ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj,
                    "xgmi_hive_info");
        if (ret) {
            dev_err(adev->dev, "XGMI: Failed to create link to hive info");
            goto remove_file;
        }
    }

    sprintf(node, "node%d", atomic_read(&hive->number_devices));
    /* Create sysfs link from the hive folder back to this device */
    ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node);
    if (ret) {
        dev_err(adev->dev, "XGMI: Failed to create link from hive info");
        goto remove_link;
    }

    goto success;

remove_link:
    sysfs_remove_link(&adev->dev->kobj, "xgmi_hive_info");

remove_file:
    device_remove_file(adev->dev, &dev_attr_xgmi_device_id);

success:
    return ret;
}

static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
                      struct amdgpu_hive_info *hive)
{
    char node[10] = { 0 };

    device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
    device_remove_file(adev->dev, &dev_attr_xgmi_error);

    if (hive->kobj.parent != (&adev->dev->kobj))
        sysfs_remove_link(&adev->dev->kobj, "xgmi_hive_info");

    sprintf(node, "node%d", atomic_read(&hive->number_devices));
    sysfs_remove_link(&hive->kobj, node);
}

struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
{
    struct amdgpu_hive_info *hive = NULL;
    int ret;

    if (!adev->gmc.xgmi.hive_id)
        return NULL;

    if (adev->hive) {
        kobject_get(&adev->hive->kobj);
        return adev->hive;
    }

    mutex_lock(&xgmi_mutex);

    list_for_each_entry(hive, &xgmi_hive_list, node) {
        if (hive->hive_id == adev->gmc.xgmi.hive_id)
            goto pro_end;
    }

    hive = kzalloc(sizeof(*hive), GFP_KERNEL);
    if (!hive) {
        dev_err(adev->dev, "XGMI: allocation failed\n");
        hive = NULL;
        goto pro_end;
    }

    /* initialize a new hive since none exists yet */
    ret = kobject_init_and_add(&hive->kobj,
            &amdgpu_xgmi_hive_type,
            &adev->dev->kobj,
            "%s", "xgmi_hive_info");
    if (ret) {
        dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n");
        kobject_put(&hive->kobj);
        kfree(hive);
        hive = NULL;
        goto pro_end;
    }

    /*
     * Avoid recreating the reset domain when the hive is reconstructed
     * while resetting the devices in the XGMI hive during probe for SRIOV.
     * See https://www.spinics.net/lists/amd-gfx/msg58836.html
     */
    if (adev->reset_domain->type != XGMI_HIVE) {
        hive->reset_domain = amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
        if (!hive->reset_domain) {
            dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n");
            ret = -ENOMEM;
            kobject_put(&hive->kobj);
            kfree(hive);
            hive = NULL;
            goto pro_end;
        }
    } else {
        amdgpu_reset_get_reset_domain(adev->reset_domain);
        hive->reset_domain = adev->reset_domain;
    }

    hive->hive_id = adev->gmc.xgmi.hive_id;
    INIT_LIST_HEAD(&hive->device_list);
    INIT_LIST_HEAD(&hive->node);
    mutex_init(&hive->hive_lock);
    atomic_set(&hive->number_devices, 0);
    task_barrier_init(&hive->tb);
    hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
    hive->hi_req_gpu = NULL;

    /*
     * The hive pstate on boot is high in vega20, so we have to go to
     * low pstate after boot.
     */
    hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
    list_add_tail(&hive->node, &xgmi_hive_list);

pro_end:
    if (hive)
        kobject_get(&hive->kobj);
    mutex_unlock(&xgmi_mutex);
    return hive;
}

void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive)
{
    if (hive)
        kobject_put(&hive->kobj);
}

int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{
    int ret = 0;
    struct amdgpu_hive_info *hive;
    struct amdgpu_device *request_adev;
    bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
    bool init_low;

    hive = amdgpu_get_xgmi_hive(adev);
    if (!hive)
        return 0;

    request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev;
    init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
    amdgpu_put_xgmi_hive(hive);
    /* fw bug so temporarily disable pstate switching */
    return 0;

    if (!hive || adev->asic_type != CHIP_VEGA20)
        return 0;

    mutex_lock(&hive->hive_lock);

    if (is_hi_req)
        hive->hi_req_count++;
    else
        hive->hi_req_count--;

    /*
     * Vega20 only needs a single peer to request pstate high for the
     * hive to go high, but all peers must request pstate low for the
     * hive to go low.
     */
    if (hive->pstate == pstate ||
            (!is_hi_req && hive->hi_req_count && !init_low))
        goto out;

    dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate);

    ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate);
    if (ret) {
        dev_err(request_adev->dev,
            "XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
            request_adev->gmc.xgmi.node_id,
            request_adev->gmc.xgmi.hive_id, ret);
        goto out;
    }

    if (init_low)
        hive->pstate = hive->hi_req_count ?
                    hive->pstate : AMDGPU_XGMI_PSTATE_MIN;
    else {
        hive->pstate = pstate;
        hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ?
                            adev : NULL;
    }
out:
    mutex_unlock(&hive->hive_lock);
    return ret;
}

int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
{
    int ret;

    /* Each psp needs to set the latest topology */
    ret = psp_xgmi_set_topology_info(&adev->psp,
                     atomic_read(&hive->number_devices),
                     &adev->psp.xgmi_context.top_info);
    if (ret)
        dev_err(adev->dev,
            "XGMI: Set topology failure on device %llx, hive %llx, ret %d",
            adev->gmc.xgmi.node_id,
            adev->gmc.xgmi.hive_id, ret);

    return ret;
}

/*
 * NOTE psp_xgmi_node_info.num_hops layout is as follows:
 * num_hops[7:6] = link type (0 = xGMI2, 1 = xGMI3, 2/3 = reserved)
 * num_hops[5:3] = reserved
 * num_hops[2:0] = number of hops
 */
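/*
 * For example (illustrative value only), a raw num_hops of 0x41
 * (binary 01_000_001) decodes to link type 1 (xGMI3) with one hop to
 * the peer; the helper below masks off everything but the hop count.
 */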
int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
        struct amdgpu_device *peer_adev)
{
    struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
    uint8_t num_hops_mask = 0x7;
    int i;

    for (i = 0; i < top->num_nodes; ++i)
        if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
            return top->nodes[i].num_hops & num_hops_mask;
    return -EINVAL;
}

int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
        struct amdgpu_device *peer_adev)
{
    struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
    int i;

    for (i = 0; i < top->num_nodes; ++i)
        if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
            return top->nodes[i].num_links;
    return -EINVAL;
}

/*
 * Devices that support extended data require the entire hive to initialize
 * with the shared memory buffer flag set.
 *
 * Hive locks and conditions apply - see amdgpu_xgmi_add_device
 */
static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive,
                            bool set_extended_data)
{
    struct amdgpu_device *tmp_adev;
    int ret;

    list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
        ret = psp_xgmi_initialize(&tmp_adev->psp, set_extended_data, false);
        if (ret) {
            dev_err(tmp_adev->dev,
                "XGMI: Failed to initialize xgmi session for data partition %i\n",
                set_extended_data);
            return ret;
        }
    }

    return 0;
}

int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{
    struct psp_xgmi_topology_info *top_info;
    struct amdgpu_hive_info *hive;
    struct amdgpu_xgmi *entry;
    struct amdgpu_device *tmp_adev = NULL;
    int count = 0, ret = 0;

    if (!adev->gmc.xgmi.supported)
        return 0;

    if (!adev->gmc.xgmi.pending_reset &&
        amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
        ret = psp_xgmi_initialize(&adev->psp, false, true);
        if (ret) {
            dev_err(adev->dev,
                "XGMI: Failed to initialize xgmi session\n");
            return ret;
        }

        ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
        if (ret) {
            dev_err(adev->dev,
                "XGMI: Failed to get hive id\n");
            return ret;
        }

        ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
        if (ret) {
            dev_err(adev->dev,
                "XGMI: Failed to get node id\n");
            return ret;
        }
    } else {
        adev->gmc.xgmi.hive_id = 16;
        adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
    }

    hive = amdgpu_get_xgmi_hive(adev);
    if (!hive) {
        ret = -EINVAL;
        dev_err(adev->dev,
            "XGMI: node 0x%llx, cannot match hive 0x%llx in the hive list.\n",
            adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
        goto exit;
    }
    mutex_lock(&hive->hive_lock);

    top_info = &adev->psp.xgmi_context.top_info;

    list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
    list_for_each_entry(entry, &hive->device_list, head)
        top_info->nodes[count++].node_id = entry->node_id;
    top_info->num_nodes = count;
    atomic_set(&hive->number_devices, count);

    task_barrier_add_task(&hive->tb);

    if (!adev->gmc.xgmi.pending_reset &&
        amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
        list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
            /* update the node list for the other devices in the hive */
            if (tmp_adev != adev) {
                top_info = &tmp_adev->psp.xgmi_context.top_info;
                top_info->nodes[count - 1].node_id =
                    adev->gmc.xgmi.node_id;
                top_info->num_nodes = count;
            }
            ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
            if (ret)
                goto exit_unlock;
        }

        /* get the latest topology info for each device from psp */
        list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
            ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
                    &tmp_adev->psp.xgmi_context.top_info, false);
            if (ret) {
                dev_err(tmp_adev->dev,
                    "XGMI: Get topology failure on device %llx, hive %llx, ret %d",
                    tmp_adev->gmc.xgmi.node_id,
                    tmp_adev->gmc.xgmi.hive_id, ret);
                /* TODO: continue with some nodes failed, or disable the whole hive? */
                goto exit_unlock;
            }
        }

        /* get topology again for hives that support extended data */
        if (adev->psp.xgmi_context.supports_extended_data) {
            /* initialize the hive to get extended data. */
            ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true);
            if (ret)
                goto exit_unlock;

            /* get the extended data. */
            list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
                ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
                        &tmp_adev->psp.xgmi_context.top_info, true);
                if (ret) {
                    dev_err(tmp_adev->dev,
                        "XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d",
                        tmp_adev->gmc.xgmi.node_id,
                        tmp_adev->gmc.xgmi.hive_id, ret);
                    goto exit_unlock;
                }
            }

            /* initialize the hive to get non-extended data for the next round. */
            ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false);
            if (ret)
                goto exit_unlock;
        }
    }

    if (!ret && !adev->gmc.xgmi.pending_reset)
        ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);

exit_unlock:
    mutex_unlock(&hive->hive_lock);
exit:
    if (!ret) {
        adev->hive = hive;
        dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
             adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
    } else {
        amdgpu_put_xgmi_hive(hive);
        dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
            adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
            ret);
    }

    return ret;
}

int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
{
    struct amdgpu_hive_info *hive = adev->hive;

    if (!adev->gmc.xgmi.supported)
        return -EINVAL;

    if (!hive)
        return -EINVAL;

    mutex_lock(&hive->hive_lock);
    task_barrier_rem_task(&hive->tb);
    amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
    if (hive->hi_req_gpu == adev)
        hive->hi_req_gpu = NULL;
    list_del(&adev->gmc.xgmi.head);
    mutex_unlock(&hive->hive_lock);

    amdgpu_put_xgmi_hive(hive);
    adev->hive = NULL;

    if (atomic_dec_return(&hive->number_devices) == 0) {
        /* Remove the hive from global hive list */
        mutex_lock(&xgmi_mutex);
        list_del(&hive->node);
        mutex_unlock(&xgmi_mutex);

        amdgpu_put_xgmi_hive(hive);
    }

    return 0;
}

static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
    if (!adev->gmc.xgmi.supported ||
        adev->gmc.xgmi.num_physical_nodes == 0)
        return 0;

    adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);

    return amdgpu_ras_block_late_init(adev, ras_block);
}
uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
                       uint64_t addr)
{
    struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi;

    return (addr + xgmi->physical_node_id * xgmi->node_segment_size);
}

static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
{
    WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
    WREG32_PCIE(pcs_status_reg, 0);
}

static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
{
    uint32_t i;

    switch (adev->asic_type) {
    case CHIP_ARCTURUS:
        for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
            pcs_clear_status(adev,
                     xgmi_pcs_err_status_reg_arct[i]);
        break;
    case CHIP_VEGA20:
        for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
            pcs_clear_status(adev,
                     xgmi_pcs_err_status_reg_vg20[i]);
        break;
    case CHIP_ALDEBARAN:
        for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++)
            pcs_clear_status(adev,
                     xgmi3x16_pcs_err_status_reg_aldebaran[i]);
        for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++)
            pcs_clear_status(adev,
                     walf_pcs_err_status_reg_aldebaran[i]);
        break;
    default:
        break;
    }
}

static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
                          uint32_t value,
                          uint32_t *ue_count,
                          uint32_t *ce_count,
                          bool is_xgmi_pcs)
{
    int i;
    int ue_cnt;

    if (is_xgmi_pcs) {
        /* query xgmi pcs error status,
         * only ue is supported */
        for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i++) {
            ue_cnt = (value &
                  xgmi_pcs_ras_fields[i].pcs_err_mask) >>
                  xgmi_pcs_ras_fields[i].pcs_err_shift;
            if (ue_cnt) {
                dev_info(adev->dev, "%s detected\n",
                     xgmi_pcs_ras_fields[i].err_name);
                *ue_count += ue_cnt;
            }
        }
    } else {
        /* query wafl pcs error status,
         * only ue is supported */
        for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) {
            ue_cnt = (value &
                  wafl_pcs_ras_fields[i].pcs_err_mask) >>
                  wafl_pcs_ras_fields[i].pcs_err_shift;
            if (ue_cnt) {
                dev_info(adev->dev, "%s detected\n",
                     wafl_pcs_ras_fields[i].err_name);
                *ue_count += ue_cnt;
            }
        }
    }

    return 0;
}

static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
                         void *ras_error_status)
{
    struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
    int i;
    uint32_t data;
    uint32_t ue_cnt = 0, ce_cnt = 0;

    if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
        return;

    err_data->ue_count = 0;
    err_data->ce_count = 0;

    switch (adev->asic_type) {
    case CHIP_ARCTURUS:
        /* check xgmi pcs error */
        for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) {
            data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]);
            if (data)
                amdgpu_xgmi_query_pcs_error_status(adev,
                        data, &ue_cnt, &ce_cnt, true);
        }
        /* check wafl pcs error */
        for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) {
            data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]);
            if (data)
                amdgpu_xgmi_query_pcs_error_status(adev,
                        data, &ue_cnt, &ce_cnt, false);
        }
        break;
    case CHIP_VEGA20:
        /* check xgmi pcs error */
        for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) {
            data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]);
            if (data)
                amdgpu_xgmi_query_pcs_error_status(adev,
                        data, &ue_cnt, &ce_cnt, true);
        }
        /* check wafl pcs error */
        for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
            data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
            if (data)
                amdgpu_xgmi_query_pcs_error_status(adev,
                        data, &ue_cnt, &ce_cnt, false);
        }
        break;
    case CHIP_ALDEBARAN:
        /* check xgmi3x16 pcs error */
        for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) {
            data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]);
            if (data)
                amdgpu_xgmi_query_pcs_error_status(adev,
                        data, &ue_cnt, &ce_cnt, true);
        }
        /* check wafl pcs error */
        for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) {
            data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]);
            if (data)
                amdgpu_xgmi_query_pcs_error_status(adev,
                        data, &ue_cnt, &ce_cnt, false);
        }
        break;
    default:
        dev_warn(adev->dev, "XGMI RAS error query not supported");
        break;
    }

    adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);

    err_data->ue_count += ue_cnt;
    err_data->ce_count += ce_cnt;
}

/* Trigger XGMI/WAFL error */
static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *inject_if)
{
    int ret = 0;
    struct ta_ras_trigger_error_input *block_info =
                (struct ta_ras_trigger_error_input *)inject_if;

    if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
        dev_warn(adev->dev, "Failed to disallow df cstate");

    if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
        dev_warn(adev->dev, "Failed to disallow XGMI power down");

    ret = psp_ras_trigger_error(&adev->psp, block_info);

    if (amdgpu_ras_intr_triggered())
        return ret;

    if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
        dev_warn(adev->dev, "Failed to allow XGMI power down");

    if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
        dev_warn(adev->dev, "Failed to allow df cstate");

    return ret;
}

struct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = {
    .query_ras_error_count = amdgpu_xgmi_query_ras_error_count,
    .reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count,
    .ras_error_inject = amdgpu_ras_error_inject_xgmi,
};

struct amdgpu_xgmi_ras xgmi_ras = {
    .ras_block = {
        .ras_comm = {
            .name = "xgmi_wafl",
            .block = AMDGPU_RAS_BLOCK__XGMI_WAFL,
            .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
        },
        .hw_ops = &xgmi_ras_hw_ops,
        .ras_late_init = amdgpu_xgmi_ras_late_init,
    },
};