0001
0002
0003
0004
0005
0006
0007
0008
/*
 * Format-string prefix for the pr_*() messages emitted by this file.
 * NOTE(review): per the usual kernel pr_fmt convention this must stay above
 * the #include lines so the printk helpers see it - confirm if includes move.
 */
0009 #define pr_fmt(fmt) "habanalabs: " fmt
0010
0011 #include "habanalabs.h"
0012
0013 #include <linux/pci.h>
0014 #include <linux/aer.h>
0015 #include <linux/module.h>
0016
0017 #define HL_DRIVER_AUTHOR "HabanaLabs Kernel Driver Team"
0018
0019 #define HL_DRIVER_DESC "Driver for HabanaLabs's AI Accelerators"
0020
0021 MODULE_AUTHOR(HL_DRIVER_AUTHOR);
0022 MODULE_DESCRIPTION(HL_DRIVER_DESC);
0023 MODULE_LICENSE("GPL v2");
0024
/*
 * Lookup table from minor number to struct hl_device (both the main and the
 * control minor of a device point at the same hl_device), and the mutex
 * taken around every idr_find()/idr_alloc()/idr_remove() on it.
 */
static DEFINE_MUTEX(hl_devs_idr_lock);
static DEFINE_IDR(hl_devs_idr);

/* Character-device major number; filled in by hl_init() */
static int hl_major;

/* Device class created by hl_init() and handed to hl_device_init() on probe */
static struct class *hl_class;
0029
0030 static int timeout_locked = 30;
0031 static int reset_on_lockup = 1;
0032 static int memory_scrub;
0033 static ulong boot_error_status_mask = ULONG_MAX;
0034
0035 module_param(timeout_locked, int, 0444);
0036 MODULE_PARM_DESC(timeout_locked,
0037 "Device lockup timeout in seconds (0 = disabled, default 30s)");
0038
0039 module_param(reset_on_lockup, int, 0444);
0040 MODULE_PARM_DESC(reset_on_lockup,
0041 "Do device reset on lockup (0 = no, 1 = yes, default yes)");
0042
0043 module_param(memory_scrub, int, 0444);
0044 MODULE_PARM_DESC(memory_scrub,
0045 "Scrub device memory in various states (0 = no, 1 = yes, default no)");
0046
0047 module_param(boot_error_status_mask, ulong, 0444);
0048 MODULE_PARM_DESC(boot_error_status_mask,
0049 "Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
0050
0051 #define PCI_VENDOR_ID_HABANALABS 0x1da3
0052
0053 #define PCI_IDS_GOYA 0x0001
0054 #define PCI_IDS_GAUDI 0x1000
0055 #define PCI_IDS_GAUDI_SEC 0x1010
0056
0057 #define PCI_IDS_GAUDI2 0x1020
0058 #define PCI_IDS_GAUDI2_SEC 0x1030
0059
0060 static const struct pci_device_id ids[] = {
0061 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
0062 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
0063 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
0064 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI2), },
0065 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI2_SEC), },
0066 { 0, }
0067 };
0068 MODULE_DEVICE_TABLE(pci, ids);
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078 static enum hl_asic_type get_asic_type(u16 device)
0079 {
0080 enum hl_asic_type asic_type;
0081
0082 switch (device) {
0083 case PCI_IDS_GOYA:
0084 asic_type = ASIC_GOYA;
0085 break;
0086 case PCI_IDS_GAUDI:
0087 asic_type = ASIC_GAUDI;
0088 break;
0089 case PCI_IDS_GAUDI_SEC:
0090 asic_type = ASIC_GAUDI_SEC;
0091 break;
0092 case PCI_IDS_GAUDI2:
0093 asic_type = ASIC_GAUDI2;
0094 break;
0095 case PCI_IDS_GAUDI2_SEC:
0096 asic_type = ASIC_GAUDI2_SEC;
0097 break;
0098 default:
0099 asic_type = ASIC_INVALID;
0100 break;
0101 }
0102
0103 return asic_type;
0104 }
0105
0106 static bool is_asic_secured(enum hl_asic_type asic_type)
0107 {
0108 switch (asic_type) {
0109 case ASIC_GAUDI_SEC:
0110 case ASIC_GAUDI2_SEC:
0111 return true;
0112 default:
0113 return false;
0114 }
0115 }
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125 int hl_device_open(struct inode *inode, struct file *filp)
0126 {
0127 enum hl_device_status status;
0128 struct hl_device *hdev;
0129 struct hl_fpriv *hpriv;
0130 int rc;
0131
0132 mutex_lock(&hl_devs_idr_lock);
0133 hdev = idr_find(&hl_devs_idr, iminor(inode));
0134 mutex_unlock(&hl_devs_idr_lock);
0135
0136 if (!hdev) {
0137 pr_err("Couldn't find device %d:%d\n",
0138 imajor(inode), iminor(inode));
0139 return -ENXIO;
0140 }
0141
0142 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
0143 if (!hpriv)
0144 return -ENOMEM;
0145
0146 hpriv->hdev = hdev;
0147 filp->private_data = hpriv;
0148 hpriv->filp = filp;
0149
0150 mutex_init(&hpriv->notifier_event.lock);
0151 mutex_init(&hpriv->restore_phase_mutex);
0152 mutex_init(&hpriv->ctx_lock);
0153 kref_init(&hpriv->refcount);
0154 nonseekable_open(inode, filp);
0155
0156 hl_ctx_mgr_init(&hpriv->ctx_mgr);
0157 hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr);
0158
0159 hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
0160
0161 mutex_lock(&hdev->fpriv_list_lock);
0162
0163 if (!hl_device_operational(hdev, &status)) {
0164 dev_err_ratelimited(hdev->dev,
0165 "Can't open %s because it is %s\n",
0166 dev_name(hdev->dev), hdev->status[status]);
0167
0168 if (status == HL_DEVICE_STATUS_IN_RESET ||
0169 status == HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE)
0170 rc = -EAGAIN;
0171 else
0172 rc = -EPERM;
0173
0174 goto out_err;
0175 }
0176
0177 if (hdev->is_in_dram_scrub) {
0178 dev_dbg_ratelimited(hdev->dev,
0179 "Can't open %s during dram scrub\n",
0180 dev_name(hdev->dev));
0181 rc = -EAGAIN;
0182 goto out_err;
0183 }
0184
0185 if (hdev->compute_ctx_in_release) {
0186 dev_dbg_ratelimited(hdev->dev,
0187 "Can't open %s because another user is still releasing it\n",
0188 dev_name(hdev->dev));
0189 rc = -EAGAIN;
0190 goto out_err;
0191 }
0192
0193 if (hdev->is_compute_ctx_active) {
0194 dev_dbg_ratelimited(hdev->dev,
0195 "Can't open %s because another user is working on it\n",
0196 dev_name(hdev->dev));
0197 rc = -EBUSY;
0198 goto out_err;
0199 }
0200
0201 rc = hl_ctx_create(hdev, hpriv);
0202 if (rc) {
0203 dev_err(hdev->dev, "Failed to create context %d\n", rc);
0204 goto out_err;
0205 }
0206
0207 list_add(&hpriv->dev_node, &hdev->fpriv_list);
0208 mutex_unlock(&hdev->fpriv_list_lock);
0209
0210 hl_debugfs_add_file(hpriv);
0211
0212 atomic_set(&hdev->last_error.cs_timeout.write_enable, 1);
0213 atomic_set(&hdev->last_error.razwi.write_enable, 1);
0214 hdev->last_error.undef_opcode.write_enable = true;
0215
0216 hdev->open_counter++;
0217 hdev->last_successful_open_jif = jiffies;
0218 hdev->last_successful_open_ktime = ktime_get();
0219
0220 return 0;
0221
0222 out_err:
0223 mutex_unlock(&hdev->fpriv_list_lock);
0224 hl_mem_mgr_fini(&hpriv->mem_mgr);
0225 hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
0226 filp->private_data = NULL;
0227 mutex_destroy(&hpriv->ctx_lock);
0228 mutex_destroy(&hpriv->restore_phase_mutex);
0229 mutex_destroy(&hpriv->notifier_event.lock);
0230 put_pid(hpriv->taskpid);
0231
0232 kfree(hpriv);
0233
0234 return rc;
0235 }
0236
0237 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
0238 {
0239 struct hl_device *hdev;
0240 struct hl_fpriv *hpriv;
0241 int rc;
0242
0243 mutex_lock(&hl_devs_idr_lock);
0244 hdev = idr_find(&hl_devs_idr, iminor(inode));
0245 mutex_unlock(&hl_devs_idr_lock);
0246
0247 if (!hdev) {
0248 pr_err("Couldn't find device %d:%d\n",
0249 imajor(inode), iminor(inode));
0250 return -ENXIO;
0251 }
0252
0253 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
0254 if (!hpriv)
0255 return -ENOMEM;
0256
0257
0258
0259
0260 hpriv->hdev = hdev;
0261 filp->private_data = hpriv;
0262 hpriv->filp = filp;
0263
0264 mutex_init(&hpriv->notifier_event.lock);
0265 nonseekable_open(inode, filp);
0266
0267 hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
0268
0269 mutex_lock(&hdev->fpriv_ctrl_list_lock);
0270
0271 if (!hl_device_operational(hdev, NULL)) {
0272 dev_err_ratelimited(hdev->dev_ctrl,
0273 "Can't open %s because it is disabled or in reset\n",
0274 dev_name(hdev->dev_ctrl));
0275 rc = -EPERM;
0276 goto out_err;
0277 }
0278
0279 list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list);
0280 mutex_unlock(&hdev->fpriv_ctrl_list_lock);
0281
0282 return 0;
0283
0284 out_err:
0285 mutex_unlock(&hdev->fpriv_ctrl_list_lock);
0286 filp->private_data = NULL;
0287 put_pid(hpriv->taskpid);
0288
0289 kfree(hpriv);
0290
0291 return rc;
0292 }
0293
0294 static void set_driver_behavior_per_device(struct hl_device *hdev)
0295 {
0296 hdev->nic_ports_mask = 0;
0297 hdev->fw_components = FW_TYPE_ALL_TYPES;
0298 hdev->mmu_enable = MMU_EN_ALL;
0299 hdev->cpu_queues_enable = 1;
0300 hdev->pldm = 0;
0301 hdev->hard_reset_on_fw_events = 1;
0302 hdev->bmc_enable = 1;
0303 hdev->reset_on_preboot_fail = 1;
0304 hdev->heartbeat = 1;
0305 }
0306
0307 static void copy_kernel_module_params_to_device(struct hl_device *hdev)
0308 {
0309 hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
0310
0311 hdev->major = hl_major;
0312 hdev->memory_scrub = memory_scrub;
0313 hdev->reset_on_lockup = reset_on_lockup;
0314 hdev->boot_error_status_mask = boot_error_status_mask;
0315 }
0316
0317 static void fixup_device_params_per_asic(struct hl_device *hdev)
0318 {
0319 switch (hdev->asic_type) {
0320 case ASIC_GOYA:
0321 case ASIC_GAUDI:
0322 case ASIC_GAUDI_SEC:
0323 hdev->reset_upon_device_release = 0;
0324 break;
0325
0326 default:
0327 hdev->reset_upon_device_release = 1;
0328 break;
0329 }
0330 }
0331
0332 static int fixup_device_params(struct hl_device *hdev)
0333 {
0334 int tmp_timeout;
0335
0336 tmp_timeout = timeout_locked;
0337
0338 hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
0339 hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
0340
0341 if (tmp_timeout)
0342 hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * 1000);
0343 else
0344 hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
0345
0346 hdev->stop_on_err = true;
0347 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
0348 hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
0349
0350
0351 hdev->disabled = true;
0352
0353 if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) &&
0354 (hdev->fw_components & ~FW_TYPE_PREBOOT_CPU)) {
0355 pr_err("Preboot must be set along with other components");
0356 return -EINVAL;
0357 }
0358
0359
0360 if (!hdev->cpu_queues_enable)
0361 hdev->heartbeat = 0;
0362
0363 fixup_device_params_per_asic(hdev);
0364
0365 return 0;
0366 }
0367
0368
0369
0370
0371
0372
0373
0374
0375
0376
0377
0378 static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
0379 {
0380 int main_id, ctrl_id = 0, rc = 0;
0381 struct hl_device *hdev;
0382
0383 *dev = NULL;
0384
0385 hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
0386 if (!hdev)
0387 return -ENOMEM;
0388
0389
0390 hdev->pdev = pdev;
0391
0392
0393 strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX);
0394 strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX);
0395 strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX);
0396 strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
0397 strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
0398 "in device creation", HL_STR_MAX);
0399 strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE],
0400 "in reset after device release", HL_STR_MAX);
0401
0402
0403
0404
0405
0406 hdev->asic_type = get_asic_type(pdev->device);
0407 if (hdev->asic_type == ASIC_INVALID) {
0408 dev_err(&pdev->dev, "Unsupported ASIC\n");
0409 rc = -ENODEV;
0410 goto free_hdev;
0411 }
0412
0413 copy_kernel_module_params_to_device(hdev);
0414
0415 set_driver_behavior_per_device(hdev);
0416
0417 fixup_device_params(hdev);
0418
0419 mutex_lock(&hl_devs_idr_lock);
0420
0421
0422
0423
0424 main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL);
0425
0426 if (main_id >= 0)
0427 ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
0428 main_id + 2, GFP_KERNEL);
0429
0430 mutex_unlock(&hl_devs_idr_lock);
0431
0432 if ((main_id < 0) || (ctrl_id < 0)) {
0433 if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
0434 pr_err("too many devices in the system\n");
0435
0436 if (main_id >= 0) {
0437 mutex_lock(&hl_devs_idr_lock);
0438 idr_remove(&hl_devs_idr, main_id);
0439 mutex_unlock(&hl_devs_idr_lock);
0440 }
0441
0442 rc = -EBUSY;
0443 goto free_hdev;
0444 }
0445
0446 hdev->id = main_id;
0447 hdev->id_control = ctrl_id;
0448
0449 *dev = hdev;
0450
0451 return 0;
0452
0453 free_hdev:
0454 kfree(hdev);
0455 return rc;
0456 }
0457
0458
0459
0460
0461
0462
0463
0464 static void destroy_hdev(struct hl_device *hdev)
0465 {
0466
0467 mutex_lock(&hl_devs_idr_lock);
0468 idr_remove(&hl_devs_idr, hdev->id);
0469 idr_remove(&hl_devs_idr, hdev->id_control);
0470 mutex_unlock(&hl_devs_idr_lock);
0471
0472 kfree(hdev);
0473 }
0474
/*
 * hl_pmops_suspend - power-management suspend callback
 *
 * @dev: generic device whose driver data is the hl_device
 *
 * Returns the result of hl_device_suspend(), or 0 (after logging an error)
 * when no driver data is attached to @dev.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to suspend PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_suspend(hdev);

	pr_err("device pointer is NULL in suspend\n");
	return 0;
}
0488
/*
 * hl_pmops_resume - power-management resume callback
 *
 * @dev: generic device whose driver data is the hl_device
 *
 * Returns the result of hl_device_resume(), or 0 (after logging an error)
 * when no driver data is attached to @dev.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to resume PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_resume(hdev);

	pr_err("device pointer is NULL in resume\n");
	return 0;
}
0502
0503
0504
0505
0506
0507
0508
0509
0510
0511
0512
0513 static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
0514 {
0515 struct hl_device *hdev;
0516 int rc;
0517
0518 dev_info(&pdev->dev, HL_NAME
0519 " device found [%04x:%04x] (rev %x)\n",
0520 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
0521
0522 rc = create_hdev(&hdev, pdev);
0523 if (rc)
0524 return rc;
0525
0526 pci_set_drvdata(pdev, hdev);
0527
0528 pci_enable_pcie_error_reporting(pdev);
0529
0530 rc = hl_device_init(hdev, hl_class);
0531 if (rc) {
0532 dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
0533 rc = -ENODEV;
0534 goto disable_device;
0535 }
0536
0537 return 0;
0538
0539 disable_device:
0540 pci_disable_pcie_error_reporting(pdev);
0541 pci_set_drvdata(pdev, NULL);
0542 destroy_hdev(hdev);
0543
0544 return rc;
0545 }
0546
0547
0548
0549
0550
0551
0552
0553
0554 static void hl_pci_remove(struct pci_dev *pdev)
0555 {
0556 struct hl_device *hdev;
0557
0558 hdev = pci_get_drvdata(pdev);
0559 if (!hdev)
0560 return;
0561
0562 hl_device_fini(hdev);
0563 pci_disable_pcie_error_reporting(pdev);
0564 pci_set_drvdata(pdev, NULL);
0565 destroy_hdev(hdev);
0566 }
0567
0568
0569
0570
0571
0572
0573
0574
0575
0576
0577 static pci_ers_result_t
0578 hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
0579 {
0580 struct hl_device *hdev = pci_get_drvdata(pdev);
0581 enum pci_ers_result result;
0582
0583 switch (state) {
0584 case pci_channel_io_normal:
0585 return PCI_ERS_RESULT_CAN_RECOVER;
0586
0587 case pci_channel_io_frozen:
0588 dev_warn(hdev->dev, "frozen state error detected\n");
0589 result = PCI_ERS_RESULT_NEED_RESET;
0590 break;
0591
0592 case pci_channel_io_perm_failure:
0593 dev_warn(hdev->dev, "failure state error detected\n");
0594 result = PCI_ERS_RESULT_DISCONNECT;
0595 break;
0596
0597 default:
0598 result = PCI_ERS_RESULT_NONE;
0599 }
0600
0601 hdev->asic_funcs->halt_engines(hdev, true, false);
0602
0603 return result;
0604 }
0605
0606
0607
0608
0609
0610
0611
0612 static void hl_pci_err_resume(struct pci_dev *pdev)
0613 {
0614 struct hl_device *hdev = pci_get_drvdata(pdev);
0615
0616 dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
0617 hl_device_resume(hdev);
0618 }
0619
0620
0621
0622
0623
0624
0625
0626
/*
 * hl_pci_err_slot_reset - PCI error-recovery "slot reset" callback
 *
 * @pdev: PCI device whose slot was reset (not used)
 *
 * Performs no work of its own and always reports
 * PCI_ERS_RESULT_RECOVERED.
 */
0627 static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
0628 {
0629 return PCI_ERS_RESULT_RECOVERED;
0630 }
0631
0632 static const struct dev_pm_ops hl_pm_ops = {
0633 .suspend = hl_pmops_suspend,
0634 .resume = hl_pmops_resume,
0635 };
0636
0637 static const struct pci_error_handlers hl_pci_err_handler = {
0638 .error_detected = hl_pci_err_detected,
0639 .slot_reset = hl_pci_err_slot_reset,
0640 .resume = hl_pci_err_resume,
0641 };
0642
0643 static struct pci_driver hl_pci_driver = {
0644 .name = HL_NAME,
0645 .id_table = ids,
0646 .probe = hl_pci_probe,
0647 .remove = hl_pci_remove,
0648 .shutdown = hl_pci_remove,
0649 .driver = {
0650 .name = HL_NAME,
0651 .pm = &hl_pm_ops,
0652 .probe_type = PROBE_PREFER_ASYNCHRONOUS,
0653 },
0654 .err_handler = &hl_pci_err_handler,
0655 };
0656
0657
0658
0659
0660 static int __init hl_init(void)
0661 {
0662 int rc;
0663 dev_t dev;
0664
0665 pr_info("loading driver\n");
0666
0667 rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
0668 if (rc < 0) {
0669 pr_err("unable to get major\n");
0670 return rc;
0671 }
0672
0673 hl_major = MAJOR(dev);
0674
0675 hl_class = class_create(THIS_MODULE, HL_NAME);
0676 if (IS_ERR(hl_class)) {
0677 pr_err("failed to allocate class\n");
0678 rc = PTR_ERR(hl_class);
0679 goto remove_major;
0680 }
0681
0682 hl_debugfs_init();
0683
0684 rc = pci_register_driver(&hl_pci_driver);
0685 if (rc) {
0686 pr_err("failed to register pci device\n");
0687 goto remove_debugfs;
0688 }
0689
0690 pr_debug("driver loaded\n");
0691
0692 return 0;
0693
0694 remove_debugfs:
0695 hl_debugfs_fini();
0696 class_destroy(hl_class);
0697 remove_major:
0698 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
0699 return rc;
0700 }
0701
0702
0703
0704
0705 static void __exit hl_exit(void)
0706 {
0707 pci_unregister_driver(&hl_pci_driver);
0708
0709
0710
0711
0712
0713
0714 hl_debugfs_fini();
0715
0716 class_destroy(hl_class);
0717 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
0718
0719 idr_destroy(&hl_devs_idr);
0720
0721 pr_debug("driver removed\n");
0722 }
0723
/* Register hl_init() and hl_exit() as the module's load and unload hooks */
0724 module_init(hl_init);
0725 module_exit(hl_exit);