Back to home page

OSCL-LXR

 
 

    


0001 /*
0002  * Copyright 2021 Advanced Micro Devices, Inc.
0003  *
0004  * Permission is hereby granted, free of charge, to any person obtaining a
0005  * copy of this software and associated documentation files (the "Software"),
0006  * to deal in the Software without restriction, including without limitation
0007  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
0008  * and/or sell copies of the Software, and to permit persons to whom the
0009  * Software is furnished to do so, subject to the following conditions:
0010  *
0011  * The above copyright notice and this permission notice shall be included in
0012  * all copies or substantial portions of the Software.
0013  *
0014  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
0015  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
0016  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
0017  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
0018  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
0019  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
0020  * OTHER DEALINGS IN THE SOFTWARE.
0021  *
0022  */
0023 
0024 #include "aldebaran.h"
0025 #include "amdgpu_reset.h"
0026 #include "amdgpu_amdkfd.h"
0027 #include "amdgpu_dpm.h"
0028 #include "amdgpu_job.h"
0029 #include "amdgpu_ring.h"
0030 #include "amdgpu_ras.h"
0031 #include "amdgpu_psp.h"
0032 #include "amdgpu_xgmi.h"
0033 
0034 static bool aldebaran_is_mode2_default(struct amdgpu_reset_control *reset_ctl)
0035 {
0036     struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
0037 
0038     if ((adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&
0039          adev->gmc.xgmi.connected_to_cpu))
0040         return true;
0041 
0042     return false;
0043 }
0044 
0045 static struct amdgpu_reset_handler *
0046 aldebaran_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
0047                 struct amdgpu_reset_context *reset_context)
0048 {
0049     struct amdgpu_reset_handler *handler;
0050     struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
0051 
0052     if (reset_context->method != AMD_RESET_METHOD_NONE) {
0053         dev_dbg(adev->dev, "Getting reset handler for method %d\n",
0054             reset_context->method);
0055         list_for_each_entry(handler, &reset_ctl->reset_handlers,
0056                      handler_list) {
0057             if (handler->reset_method == reset_context->method)
0058                 return handler;
0059         }
0060     }
0061 
0062     if (aldebaran_is_mode2_default(reset_ctl)) {
0063         list_for_each_entry(handler, &reset_ctl->reset_handlers,
0064                      handler_list) {
0065             if (handler->reset_method == AMD_RESET_METHOD_MODE2) {
0066                 reset_context->method = AMD_RESET_METHOD_MODE2;
0067                 return handler;
0068             }
0069         }
0070     }
0071 
0072     dev_dbg(adev->dev, "Reset handler not found!\n");
0073 
0074     return NULL;
0075 }
0076 
0077 static int aldebaran_mode2_suspend_ip(struct amdgpu_device *adev)
0078 {
0079     int r, i;
0080 
0081     amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
0082     amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
0083 
0084     for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
0085         if (!(adev->ip_blocks[i].version->type ==
0086                   AMD_IP_BLOCK_TYPE_GFX ||
0087               adev->ip_blocks[i].version->type ==
0088                   AMD_IP_BLOCK_TYPE_SDMA))
0089             continue;
0090 
0091         r = adev->ip_blocks[i].version->funcs->suspend(adev);
0092 
0093         if (r) {
0094             dev_err(adev->dev,
0095                 "suspend of IP block <%s> failed %d\n",
0096                 adev->ip_blocks[i].version->funcs->name, r);
0097             return r;
0098         }
0099 
0100         adev->ip_blocks[i].status.hw = false;
0101     }
0102 
0103     return r;
0104 }
0105 
0106 static int
0107 aldebaran_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
0108                   struct amdgpu_reset_context *reset_context)
0109 {
0110     int r = 0;
0111     struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
0112 
0113     dev_dbg(adev->dev, "Aldebaran prepare hw context\n");
0114     /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
0115     if (!amdgpu_sriov_vf(adev))
0116         r = aldebaran_mode2_suspend_ip(adev);
0117 
0118     return r;
0119 }
0120 
0121 static void aldebaran_async_reset(struct work_struct *work)
0122 {
0123     struct amdgpu_reset_handler *handler;
0124     struct amdgpu_reset_control *reset_ctl =
0125         container_of(work, struct amdgpu_reset_control, reset_work);
0126     struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
0127 
0128     list_for_each_entry(handler, &reset_ctl->reset_handlers,
0129                  handler_list) {
0130         if (handler->reset_method == reset_ctl->active_reset) {
0131             dev_dbg(adev->dev, "Resetting device\n");
0132             handler->do_reset(adev);
0133             break;
0134         }
0135     }
0136 }
0137 
0138 static int aldebaran_mode2_reset(struct amdgpu_device *adev)
0139 {
0140     /* disable BM */
0141     pci_clear_master(adev->pdev);
0142     adev->asic_reset_res = amdgpu_dpm_mode2_reset(adev);
0143     return adev->asic_reset_res;
0144 }
0145 
/*
 * Perform a mode-2 reset on every device in the context's reset list.
 *
 * Locking: each device's reset_lock is taken in the first loop and held
 * across the whole operation; the final loop releases all of them and
 * clears active_reset, so partial-failure paths still unlock everything.
 *
 * XGMI hives are reset in parallel: each node's reset is queued on
 * system_unbound_wq (the work calls aldebaran_async_reset, which
 * dispatches to do_reset based on active_reset set below), then all
 * works are flushed and per-device results collected.
 *
 * Returns 0 on success, -EINVAL on a bad context, -EALREADY if a reset
 * work was already pending, or the first device's reset error.
 */
static int
aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
                  struct amdgpu_reset_context *reset_context)
{
    struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
    struct list_head *reset_device_list = reset_context->reset_device_list;
    struct amdgpu_device *tmp_adev = NULL;
    int r = 0;

    dev_dbg(adev->dev, "aldebaran perform hw reset\n");

    if (reset_device_list == NULL)
        return -EINVAL;

    /* Aldebaran (MP1 v13.0.2) must be reset as part of its XGMI hive. */
    if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&
        reset_context->hive == NULL) {
        /* Wrong context, return error */
        return -EINVAL;
    }

    /* Lock every node and mark mode-2 active before launching any reset. */
    list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
        mutex_lock(&tmp_adev->reset_cntl->reset_lock);
        tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_MODE2;
    }
    /*
     * Mode2 reset doesn't need any sync between nodes in XGMI hive, instead launch
     * them together so that they can be completed asynchronously on multiple nodes
     */
    list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
        /* For XGMI run all resets in parallel to speed up the process */
        if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
            /* queue_work returns false if the work was already queued */
            if (!queue_work(system_unbound_wq,
                    &tmp_adev->reset_cntl->reset_work))
                r = -EALREADY;
        } else
            r = aldebaran_mode2_reset(tmp_adev);
        if (r) {
            dev_err(tmp_adev->dev,
                "ASIC reset failed with error, %d for drm dev, %s",
                r, adev_to_drm(tmp_adev)->unique);
            break;
        }
    }

    /* For XGMI wait for all resets to complete before proceed */
    if (!r) {
        list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
            if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
                flush_work(&tmp_adev->reset_cntl->reset_work);
                /* per-node result recorded by aldebaran_mode2_reset() */
                r = tmp_adev->asic_reset_res;
                if (r)
                    break;
            }
        }
    }

    /* Always unlock every node and clear the active method. */
    list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
        mutex_unlock(&tmp_adev->reset_cntl->reset_lock);
        tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_NONE;
    }

    return r;
}
0209 
/*
 * Bring the hardware back up after a mode-2 reset.
 *
 * The sequence is order-sensitive:
 *   1. collect the GFX/SDMA/RLC ucodes that must be reloaded,
 *   2. resume the COMMON (NBIF) block,
 *   3. re-init GFXHUB and re-enable its GART,
 *   4. reload the collected firmware through PSP,
 *   5. resume RLC (firmware needs RLC alive to finish the reset),
 *   6. wait for the SMU's reset-complete event,
 *   7. resume GFX/SDMA blocks, then run their (and COMMON's) late_init,
 *   8. re-enable RAS error queries and restore CG/PG gating.
 *
 * Returns 0 on success or the first error encountered.
 */
static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)
{
    struct amdgpu_firmware_info *ucode_list[AMDGPU_UCODE_ID_MAXIMUM];
    struct amdgpu_firmware_info *ucode;
    struct amdgpu_ip_block *cmn_block;
    int ucode_count = 0;
    int i, r;

    dev_dbg(adev->dev, "Reloading ucodes after reset\n");
    /* Gather the loaded SDMA/MEC/RLC firmware images for PSP reload. */
    for (i = 0; i < adev->firmware.max_ucodes; i++) {
        ucode = &adev->firmware.ucode[i];
        if (!ucode->fw)
            continue;
        switch (ucode->ucode_id) {
        case AMDGPU_UCODE_ID_SDMA0:
        case AMDGPU_UCODE_ID_SDMA1:
        case AMDGPU_UCODE_ID_SDMA2:
        case AMDGPU_UCODE_ID_SDMA3:
        case AMDGPU_UCODE_ID_SDMA4:
        case AMDGPU_UCODE_ID_SDMA5:
        case AMDGPU_UCODE_ID_SDMA6:
        case AMDGPU_UCODE_ID_SDMA7:
        case AMDGPU_UCODE_ID_CP_MEC1:
        case AMDGPU_UCODE_ID_CP_MEC1_JT:
        case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_CNTL:
        case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM:
        case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM:
        case AMDGPU_UCODE_ID_RLC_G:
            ucode_list[ucode_count++] = ucode;
            break;
        default:
            break;
        }
    }

    /* Reinit NBIF block */
    cmn_block =
        amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_COMMON);
    if (unlikely(!cmn_block)) {
        dev_err(adev->dev, "Failed to get BIF handle\n");
        return -EINVAL;
    }
    r = cmn_block->version->funcs->resume(adev);
    if (r)
        return r;

    /* Reinit GFXHUB */
    adev->gfxhub.funcs->init(adev);
    r = adev->gfxhub.funcs->gart_enable(adev);
    if (r) {
        dev_err(adev->dev, "GFXHUB gart reenable failed after reset\n");
        return r;
    }

    /* Reload GFX firmware */
    r = psp_load_fw_list(&adev->psp, ucode_list, ucode_count);
    if (r) {
        dev_err(adev->dev, "GFX ucode load failed after reset\n");
        return r;
    }

    /* Resume RLC, FW needs RLC alive to complete reset process */
    adev->gfx.rlc.funcs->resume(adev);

    /* Wait for FW reset event complete */
    r = amdgpu_dpm_wait_for_event(adev, SMU_EVENT_RESET_COMPLETE, 0);
    if (r) {
        dev_err(adev->dev,
            "Failed to get response from firmware after reset\n");
        return r;
    }

    /* Resume the GFX and SDMA blocks suspended before the reset. */
    for (i = 0; i < adev->num_ip_blocks; i++) {
        if (!(adev->ip_blocks[i].version->type ==
                  AMD_IP_BLOCK_TYPE_GFX ||
              adev->ip_blocks[i].version->type ==
                  AMD_IP_BLOCK_TYPE_SDMA))
            continue;
        r = adev->ip_blocks[i].version->funcs->resume(adev);
        if (r) {
            dev_err(adev->dev,
                "resume of IP block <%s> failed %d\n",
                adev->ip_blocks[i].version->funcs->name, r);
            return r;
        }

        adev->ip_blocks[i].status.hw = true;
    }

    /* Run late_init for GFX, SDMA and COMMON once they are resumed. */
    for (i = 0; i < adev->num_ip_blocks; i++) {
        if (!(adev->ip_blocks[i].version->type ==
                  AMD_IP_BLOCK_TYPE_GFX ||
              adev->ip_blocks[i].version->type ==
                  AMD_IP_BLOCK_TYPE_SDMA ||
              adev->ip_blocks[i].version->type ==
                  AMD_IP_BLOCK_TYPE_COMMON))
            continue;

        if (adev->ip_blocks[i].version->funcs->late_init) {
            r = adev->ip_blocks[i].version->funcs->late_init(
                (void *)adev);
            if (r) {
                dev_err(adev->dev,
                    "late_init of IP block <%s> failed %d after reset\n",
                    adev->ip_blocks[i].version->funcs->name,
                    r);
                return r;
            }
        }
        adev->ip_blocks[i].status.late_initialized = true;
    }

    /* RAS queries were unsafe during reset; re-enable them now. */
    amdgpu_ras_set_error_query_ready(adev, true);

    /* Restore the gating ungated in aldebaran_mode2_suspend_ip(). */
    amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
    amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

    return r;
}
0329 
0330 static int
0331 aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
0332                   struct amdgpu_reset_context *reset_context)
0333 {
0334     struct list_head *reset_device_list = reset_context->reset_device_list;
0335     struct amdgpu_device *tmp_adev = NULL;
0336     int r;
0337 
0338     if (reset_device_list == NULL)
0339         return -EINVAL;
0340 
0341     if (reset_context->reset_req_dev->ip_versions[MP1_HWIP][0] ==
0342             IP_VERSION(13, 0, 2) &&
0343         reset_context->hive == NULL) {
0344         /* Wrong context, return error */
0345         return -EINVAL;
0346     }
0347 
0348     list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
0349         dev_info(tmp_adev->dev,
0350              "GPU reset succeeded, trying to resume\n");
0351         r = aldebaran_mode2_restore_ip(tmp_adev);
0352         if (r)
0353             goto end;
0354 
0355         /*
0356          * Add this ASIC as tracked as reset was already
0357          * complete successfully.
0358          */
0359         amdgpu_register_gpu_instance(tmp_adev);
0360 
0361         /* Resume RAS */
0362         amdgpu_ras_resume(tmp_adev);
0363 
0364         /* Update PSP FW topology after reset */
0365         if (reset_context->hive &&
0366             tmp_adev->gmc.xgmi.num_physical_nodes > 1)
0367             r = amdgpu_xgmi_update_topology(reset_context->hive,
0368                             tmp_adev);
0369 
0370         if (!r) {
0371             amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
0372 
0373             r = amdgpu_ib_ring_tests(tmp_adev);
0374             if (r) {
0375                 dev_err(tmp_adev->dev,
0376                     "ib ring test failed (%d).\n", r);
0377                 r = -EAGAIN;
0378                 tmp_adev->asic_reset_res = r;
0379                 goto end;
0380             }
0381         }
0382     }
0383 
0384 end:
0385     return r;
0386 }
0387 
/*
 * Mode-2 reset handler registered with the reset control in
 * aldebaran_reset_init().  prepare_env/restore_env are unused here;
 * do_reset is invoked from the async reset work.
 */
static struct amdgpu_reset_handler aldebaran_mode2_handler = {
    .reset_method       = AMD_RESET_METHOD_MODE2,
    .prepare_env        = NULL,
    .prepare_hwcontext  = aldebaran_mode2_prepare_hwcontext,
    .perform_reset      = aldebaran_mode2_perform_reset,
    .restore_hwcontext  = aldebaran_mode2_restore_hwcontext,
    .restore_env        = NULL,
    .do_reset       = aldebaran_mode2_reset,
};
0397 
0398 int aldebaran_reset_init(struct amdgpu_device *adev)
0399 {
0400     struct amdgpu_reset_control *reset_ctl;
0401 
0402     reset_ctl = kzalloc(sizeof(*reset_ctl), GFP_KERNEL);
0403     if (!reset_ctl)
0404         return -ENOMEM;
0405 
0406     reset_ctl->handle = adev;
0407     reset_ctl->async_reset = aldebaran_async_reset;
0408     reset_ctl->active_reset = AMD_RESET_METHOD_NONE;
0409     reset_ctl->get_reset_handler = aldebaran_get_reset_handler;
0410 
0411     INIT_LIST_HEAD(&reset_ctl->reset_handlers);
0412     INIT_WORK(&reset_ctl->reset_work, reset_ctl->async_reset);
0413     /* Only mode2 is handled through reset control now */
0414     amdgpu_reset_add_handler(reset_ctl, &aldebaran_mode2_handler);
0415 
0416     adev->reset_cntl = reset_ctl;
0417 
0418     return 0;
0419 }
0420 
0421 int aldebaran_reset_fini(struct amdgpu_device *adev)
0422 {
0423     kfree(adev->reset_cntl);
0424     adev->reset_cntl = NULL;
0425     return 0;
0426 }