0001
0002
0003
0004
0005
0006
0007
0008
0009 #include <linux/delay.h>
0010 #include <linux/interrupt.h>
0011 #include <linux/irq.h>
0012 #include <linux/module.h>
0013 #include <linux/pci.h>
0014 #include <linux/pci_hotplug.h>
0015 #include <asm/eeh.h>
0016 #include <asm/eeh_event.h>
0017 #include <asm/ppc-pci.h>
0018 #include <asm/pci-bridge.h>
0019 #include <asm/rtas.h>
0020
/*
 * Bookkeeping handed to eeh_rmv_device() as its userdata while devices are
 * being removed ahead of a reset.
 */
struct eeh_rmv_data {
	/* eeh_devs of removed VFs, linked through their rmv_entry member */
	struct list_head removed_vf_list;
	/* incremented by eeh_rmv_device() for every device it removes */
	int removed_dev_count;
};
0025
0026 static int eeh_result_priority(enum pci_ers_result result)
0027 {
0028 switch (result) {
0029 case PCI_ERS_RESULT_NONE:
0030 return 1;
0031 case PCI_ERS_RESULT_NO_AER_DRIVER:
0032 return 2;
0033 case PCI_ERS_RESULT_RECOVERED:
0034 return 3;
0035 case PCI_ERS_RESULT_CAN_RECOVER:
0036 return 4;
0037 case PCI_ERS_RESULT_DISCONNECT:
0038 return 5;
0039 case PCI_ERS_RESULT_NEED_RESET:
0040 return 6;
0041 default:
0042 WARN_ONCE(1, "Unknown pci_ers_result value: %d\n", (int)result);
0043 return 0;
0044 }
0045 };
0046
0047 static const char *pci_ers_result_name(enum pci_ers_result result)
0048 {
0049 switch (result) {
0050 case PCI_ERS_RESULT_NONE:
0051 return "none";
0052 case PCI_ERS_RESULT_CAN_RECOVER:
0053 return "can recover";
0054 case PCI_ERS_RESULT_NEED_RESET:
0055 return "need reset";
0056 case PCI_ERS_RESULT_DISCONNECT:
0057 return "disconnect";
0058 case PCI_ERS_RESULT_RECOVERED:
0059 return "recovered";
0060 case PCI_ERS_RESULT_NO_AER_DRIVER:
0061 return "no AER driver";
0062 default:
0063 WARN_ONCE(1, "Unknown result type: %d\n", (int)result);
0064 return "unknown";
0065 }
0066 };
0067
0068 static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old,
0069 enum pci_ers_result new)
0070 {
0071 if (eeh_result_priority(new) > eeh_result_priority(old))
0072 return new;
0073 return old;
0074 }
0075
0076 static bool eeh_dev_removed(struct eeh_dev *edev)
0077 {
0078 return !edev || (edev->mode & EEH_DEV_REMOVED);
0079 }
0080
0081 static bool eeh_edev_actionable(struct eeh_dev *edev)
0082 {
0083 if (!edev->pdev)
0084 return false;
0085 if (edev->pdev->error_state == pci_channel_io_perm_failure)
0086 return false;
0087 if (eeh_dev_removed(edev))
0088 return false;
0089 if (eeh_pe_passed(edev->pe))
0090 return false;
0091
0092 return true;
0093 }
0094
0095
0096
0097
0098
0099
0100
0101
0102
0103
0104 static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
0105 {
0106 if (!pdev || !pdev->dev.driver)
0107 return NULL;
0108
0109 if (!try_module_get(pdev->dev.driver->owner))
0110 return NULL;
0111
0112 return to_pci_driver(pdev->dev.driver);
0113 }
0114
0115
0116
0117
0118
0119
0120
0121
0122 static inline void eeh_pcid_put(struct pci_dev *pdev)
0123 {
0124 if (!pdev || !pdev->dev.driver)
0125 return;
0126
0127 module_put(pdev->dev.driver->owner);
0128 }
0129
0130
0131
0132
0133
0134
0135
0136
0137
0138
0139
0140 static void eeh_disable_irq(struct eeh_dev *edev)
0141 {
0142
0143
0144
0145
0146 if (edev->pdev->msi_enabled || edev->pdev->msix_enabled)
0147 return;
0148
0149 if (!irq_has_action(edev->pdev->irq))
0150 return;
0151
0152 edev->mode |= EEH_DEV_IRQ_DISABLED;
0153 disable_irq_nosync(edev->pdev->irq);
0154 }
0155
0156
0157
0158
0159
0160
0161
0162
0163 static void eeh_enable_irq(struct eeh_dev *edev)
0164 {
0165 if ((edev->mode) & EEH_DEV_IRQ_DISABLED) {
0166 edev->mode &= ~EEH_DEV_IRQ_DISABLED;
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187 if (irqd_irq_disabled(irq_get_irq_data(edev->pdev->irq)))
0188 enable_irq(edev->pdev->irq);
0189 }
0190 }
0191
0192 static void eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
0193 {
0194 struct pci_dev *pdev;
0195
0196 if (!edev)
0197 return;
0198
0199
0200
0201
0202
0203
0204
0205
0206 if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED))
0207 return;
0208
0209 pdev = eeh_dev_to_pci_dev(edev);
0210 if (!pdev)
0211 return;
0212
0213 pci_save_state(pdev);
0214 }
0215
0216 static void eeh_set_channel_state(struct eeh_pe *root, pci_channel_state_t s)
0217 {
0218 struct eeh_pe *pe;
0219 struct eeh_dev *edev, *tmp;
0220
0221 eeh_for_each_pe(root, pe)
0222 eeh_pe_for_each_dev(pe, edev, tmp)
0223 if (eeh_edev_actionable(edev))
0224 edev->pdev->error_state = s;
0225 }
0226
0227 static void eeh_set_irq_state(struct eeh_pe *root, bool enable)
0228 {
0229 struct eeh_pe *pe;
0230 struct eeh_dev *edev, *tmp;
0231
0232 eeh_for_each_pe(root, pe) {
0233 eeh_pe_for_each_dev(pe, edev, tmp) {
0234 if (!eeh_edev_actionable(edev))
0235 continue;
0236
0237 if (!eeh_pcid_get(edev->pdev))
0238 continue;
0239
0240 if (enable)
0241 eeh_enable_irq(edev);
0242 else
0243 eeh_disable_irq(edev);
0244
0245 eeh_pcid_put(edev->pdev);
0246 }
0247 }
0248 }
0249
0250 typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *,
0251 struct pci_dev *,
0252 struct pci_driver *);
0253 static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
0254 enum pci_ers_result *result)
0255 {
0256 struct pci_dev *pdev;
0257 struct pci_driver *driver;
0258 enum pci_ers_result new_result;
0259
0260 pci_lock_rescan_remove();
0261 pdev = edev->pdev;
0262 if (pdev)
0263 get_device(&pdev->dev);
0264 pci_unlock_rescan_remove();
0265 if (!pdev) {
0266 eeh_edev_info(edev, "no device");
0267 return;
0268 }
0269 device_lock(&pdev->dev);
0270 if (eeh_edev_actionable(edev)) {
0271 driver = eeh_pcid_get(pdev);
0272
0273 if (!driver)
0274 eeh_edev_info(edev, "no driver");
0275 else if (!driver->err_handler)
0276 eeh_edev_info(edev, "driver not EEH aware");
0277 else if (edev->mode & EEH_DEV_NO_HANDLER)
0278 eeh_edev_info(edev, "driver bound too late");
0279 else {
0280 new_result = fn(edev, pdev, driver);
0281 eeh_edev_info(edev, "%s driver reports: '%s'",
0282 driver->name,
0283 pci_ers_result_name(new_result));
0284 if (result)
0285 *result = pci_ers_merge_result(*result,
0286 new_result);
0287 }
0288 if (driver)
0289 eeh_pcid_put(pdev);
0290 } else {
0291 eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!pdev,
0292 !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe));
0293 }
0294 device_unlock(&pdev->dev);
0295 if (edev->pdev != pdev)
0296 eeh_edev_warn(edev, "Device changed during processing!\n");
0297 put_device(&pdev->dev);
0298 }
0299
0300 static void eeh_pe_report(const char *name, struct eeh_pe *root,
0301 eeh_report_fn fn, enum pci_ers_result *result)
0302 {
0303 struct eeh_pe *pe;
0304 struct eeh_dev *edev, *tmp;
0305
0306 pr_info("EEH: Beginning: '%s'\n", name);
0307 eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp)
0308 eeh_pe_report_edev(edev, fn, result);
0309 if (result)
0310 pr_info("EEH: Finished:'%s' with aggregate recovery state:'%s'\n",
0311 name, pci_ers_result_name(*result));
0312 else
0313 pr_info("EEH: Finished:'%s'", name);
0314 }
0315
0316
0317
0318
0319
0320
0321
0322
0323 static enum pci_ers_result eeh_report_error(struct eeh_dev *edev,
0324 struct pci_dev *pdev,
0325 struct pci_driver *driver)
0326 {
0327 enum pci_ers_result rc;
0328
0329 if (!driver->err_handler->error_detected)
0330 return PCI_ERS_RESULT_NONE;
0331
0332 eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)",
0333 driver->name);
0334 rc = driver->err_handler->error_detected(pdev, pci_channel_io_frozen);
0335
0336 edev->in_error = true;
0337 pci_uevent_ers(pdev, PCI_ERS_RESULT_NONE);
0338 return rc;
0339 }
0340
0341
0342
0343
0344
0345
0346
0347
0348
0349 static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev,
0350 struct pci_dev *pdev,
0351 struct pci_driver *driver)
0352 {
0353 if (!driver->err_handler->mmio_enabled)
0354 return PCI_ERS_RESULT_NONE;
0355 eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name);
0356 return driver->err_handler->mmio_enabled(pdev);
0357 }
0358
0359
0360
0361
0362
0363
0364
0365
0366
0367
0368
0369 static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev,
0370 struct pci_dev *pdev,
0371 struct pci_driver *driver)
0372 {
0373 if (!driver->err_handler->slot_reset || !edev->in_error)
0374 return PCI_ERS_RESULT_NONE;
0375 eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name);
0376 return driver->err_handler->slot_reset(pdev);
0377 }
0378
0379 static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
0380 {
0381 struct pci_dev *pdev;
0382
0383 if (!edev)
0384 return;
0385
0386
0387
0388
0389
0390
0391
0392 if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) {
0393 if (list_is_last(&edev->entry, &edev->pe->edevs))
0394 eeh_pe_restore_bars(edev->pe);
0395
0396 return;
0397 }
0398
0399 pdev = eeh_dev_to_pci_dev(edev);
0400 if (!pdev)
0401 return;
0402
0403 pci_restore_state(pdev);
0404 }
0405
0406
0407
0408
0409
0410
0411
0412
0413
0414
0415 static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev,
0416 struct pci_dev *pdev,
0417 struct pci_driver *driver)
0418 {
0419 if (!driver->err_handler->resume || !edev->in_error)
0420 return PCI_ERS_RESULT_NONE;
0421
0422 eeh_edev_info(edev, "Invoking %s->resume()", driver->name);
0423 driver->err_handler->resume(pdev);
0424
0425 pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED);
0426 #ifdef CONFIG_PCI_IOV
0427 if (eeh_ops->notify_resume)
0428 eeh_ops->notify_resume(edev);
0429 #endif
0430 return PCI_ERS_RESULT_NONE;
0431 }
0432
0433
0434
0435
0436
0437
0438
0439
0440
0441 static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev,
0442 struct pci_dev *pdev,
0443 struct pci_driver *driver)
0444 {
0445 enum pci_ers_result rc;
0446
0447 if (!driver->err_handler->error_detected)
0448 return PCI_ERS_RESULT_NONE;
0449
0450 eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)",
0451 driver->name);
0452 rc = driver->err_handler->error_detected(pdev,
0453 pci_channel_io_perm_failure);
0454
0455 pci_uevent_ers(pdev, PCI_ERS_RESULT_DISCONNECT);
0456 return rc;
0457 }
0458
0459 static void *eeh_add_virt_device(struct eeh_dev *edev)
0460 {
0461 struct pci_driver *driver;
0462 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
0463
0464 if (!(edev->physfn)) {
0465 eeh_edev_warn(edev, "Not for VF\n");
0466 return NULL;
0467 }
0468
0469 driver = eeh_pcid_get(dev);
0470 if (driver) {
0471 if (driver->err_handler) {
0472 eeh_pcid_put(dev);
0473 return NULL;
0474 }
0475 eeh_pcid_put(dev);
0476 }
0477
0478 #ifdef CONFIG_PCI_IOV
0479 pci_iov_add_virtfn(edev->physfn, edev->vf_index);
0480 #endif
0481 return NULL;
0482 }
0483
0484 static void eeh_rmv_device(struct eeh_dev *edev, void *userdata)
0485 {
0486 struct pci_driver *driver;
0487 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
0488 struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata;
0489
0490
0491
0492
0493
0494
0495
0496
0497 if (!eeh_edev_actionable(edev) ||
0498 (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE))
0499 return;
0500
0501 if (rmv_data) {
0502 driver = eeh_pcid_get(dev);
0503 if (driver) {
0504 if (driver->err_handler &&
0505 driver->err_handler->error_detected &&
0506 driver->err_handler->slot_reset) {
0507 eeh_pcid_put(dev);
0508 return;
0509 }
0510 eeh_pcid_put(dev);
0511 }
0512 }
0513
0514
0515 pr_info("EEH: Removing %s without EEH sensitive driver\n",
0516 pci_name(dev));
0517 edev->mode |= EEH_DEV_DISCONNECTED;
0518 if (rmv_data)
0519 rmv_data->removed_dev_count++;
0520
0521 if (edev->physfn) {
0522 #ifdef CONFIG_PCI_IOV
0523 pci_iov_remove_virtfn(edev->physfn, edev->vf_index);
0524 edev->pdev = NULL;
0525 #endif
0526 if (rmv_data)
0527 list_add(&edev->rmv_entry, &rmv_data->removed_vf_list);
0528 } else {
0529 pci_lock_rescan_remove();
0530 pci_stop_and_remove_bus_device(dev);
0531 pci_unlock_rescan_remove();
0532 }
0533 }
0534
0535 static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata)
0536 {
0537 struct eeh_dev *edev, *tmp;
0538
0539 eeh_pe_for_each_dev(pe, edev, tmp) {
0540 if (!(edev->mode & EEH_DEV_DISCONNECTED))
0541 continue;
0542
0543 edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED);
0544 eeh_pe_tree_remove(edev);
0545 }
0546
0547 return NULL;
0548 }
0549
0550
0551
0552
0553
0554
0555
0556
0557 static int eeh_clear_pe_frozen_state(struct eeh_pe *root, bool include_passed)
0558 {
0559 struct eeh_pe *pe;
0560 int i;
0561
0562 eeh_for_each_pe(root, pe) {
0563 if (include_passed || !eeh_pe_passed(pe)) {
0564 for (i = 0; i < 3; i++)
0565 if (!eeh_unfreeze_pe(pe))
0566 break;
0567 if (i >= 3)
0568 return -EIO;
0569 }
0570 }
0571 eeh_pe_state_clear(root, EEH_PE_ISOLATED, include_passed);
0572 return 0;
0573 }
0574
0575 int eeh_pe_reset_and_recover(struct eeh_pe *pe)
0576 {
0577 int ret;
0578
0579
0580 if (pe->state & EEH_PE_RECOVERING)
0581 return 0;
0582
0583
0584 eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
0585
0586
0587 eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL);
0588
0589
0590 ret = eeh_pe_reset_full(pe, true);
0591 if (ret) {
0592 eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
0593 return ret;
0594 }
0595
0596
0597 ret = eeh_clear_pe_frozen_state(pe, true);
0598 if (ret) {
0599 eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
0600 return ret;
0601 }
0602
0603
0604 eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL);
0605
0606
0607 eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
0608
0609 return 0;
0610 }
0611
0612
0613
0614
0615
0616
0617
0618
0619
0620
0621
0622
0623 static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
0624 struct eeh_rmv_data *rmv_data,
0625 bool driver_eeh_aware)
0626 {
0627 time64_t tstamp;
0628 int cnt, rc;
0629 struct eeh_dev *edev;
0630 struct eeh_pe *tmp_pe;
0631 bool any_passed = false;
0632
0633 eeh_for_each_pe(pe, tmp_pe)
0634 any_passed |= eeh_pe_passed(tmp_pe);
0635
0636
0637 cnt = pe->freeze_count;
0638 tstamp = pe->tstamp;
0639
0640
0641
0642
0643
0644
0645
0646 eeh_pe_state_mark(pe, EEH_PE_KEEP);
0647 if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) {
0648 eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data);
0649 } else {
0650 pci_lock_rescan_remove();
0651 pci_hp_remove_devices(bus);
0652 pci_unlock_rescan_remove();
0653 }
0654
0655
0656
0657
0658
0659
0660
0661
0662
0663
0664 rc = eeh_pe_reset_full(pe, false);
0665 if (rc)
0666 return rc;
0667
0668 pci_lock_rescan_remove();
0669
0670
0671 eeh_ops->configure_bridge(pe);
0672 eeh_pe_restore_bars(pe);
0673
0674
0675 rc = eeh_clear_pe_frozen_state(pe, false);
0676 if (rc) {
0677 pci_unlock_rescan_remove();
0678 return rc;
0679 }
0680
0681
0682
0683
0684
0685
0686
0687 if (!driver_eeh_aware || rmv_data->removed_dev_count) {
0688 pr_info("EEH: Sleep 5s ahead of %s hotplug\n",
0689 (driver_eeh_aware ? "partial" : "complete"));
0690 ssleep(5);
0691
0692
0693
0694
0695
0696
0697 edev = list_first_entry(&pe->edevs, struct eeh_dev, entry);
0698 eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
0699 if (pe->type & EEH_PE_VF) {
0700 eeh_add_virt_device(edev);
0701 } else {
0702 if (!driver_eeh_aware)
0703 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
0704 pci_hp_add_devices(bus);
0705 }
0706 }
0707 eeh_pe_state_clear(pe, EEH_PE_KEEP, true);
0708
0709 pe->tstamp = tstamp;
0710 pe->freeze_count = cnt;
0711
0712 pci_unlock_rescan_remove();
0713 return 0;
0714 }
0715
0716
0717
0718
/*
 * Limit passed to eeh_wait_state() by eeh_handle_normal_event(), which
 * multiplies it by 1000 first -- presumably seconds converted to
 * milliseconds; confirm against eeh_wait_state().
 */
#define MAX_WAIT_FOR_RECOVERY 300
0720
0721
0722
0723
0724
0725
0726
0727
0728
0729 static void eeh_pe_cleanup(struct eeh_pe *pe)
0730 {
0731 struct eeh_pe *child_pe, *tmp;
0732
0733 list_for_each_entry_safe(child_pe, tmp, &pe->child_list, child)
0734 eeh_pe_cleanup(child_pe);
0735
0736 if (pe->state & EEH_PE_KEEP)
0737 return;
0738
0739 if (!(pe->state & EEH_PE_INVALID))
0740 return;
0741
0742 if (list_empty(&pe->edevs) && list_empty(&pe->child_list)) {
0743 list_del(&pe->child);
0744 kfree(pe);
0745 }
0746 }
0747
0748
0749
0750
0751
0752
0753
0754
0755
0756
0757
0758
0759
0760 static bool eeh_slot_presence_check(struct pci_dev *pdev)
0761 {
0762 const struct hotplug_slot_ops *ops;
0763 struct pci_slot *slot;
0764 u8 state;
0765 int rc;
0766
0767 if (!pdev)
0768 return false;
0769
0770 if (pdev->error_state == pci_channel_io_perm_failure)
0771 return false;
0772
0773 slot = pdev->slot;
0774 if (!slot || !slot->hotplug)
0775 return true;
0776
0777 ops = slot->hotplug->ops;
0778 if (!ops || !ops->get_adapter_status)
0779 return true;
0780
0781
0782 if (ops->set_attention_status)
0783 ops->set_attention_status(slot->hotplug, 1);
0784
0785 rc = ops->get_adapter_status(slot->hotplug, &state);
0786 if (rc)
0787 return true;
0788
0789 return !!state;
0790 }
0791
0792 static void eeh_clear_slot_attention(struct pci_dev *pdev)
0793 {
0794 const struct hotplug_slot_ops *ops;
0795 struct pci_slot *slot;
0796
0797 if (!pdev)
0798 return;
0799
0800 if (pdev->error_state == pci_channel_io_perm_failure)
0801 return;
0802
0803 slot = pdev->slot;
0804 if (!slot || !slot->hotplug)
0805 return;
0806
0807 ops = slot->hotplug->ops;
0808 if (!ops || !ops->set_attention_status)
0809 return;
0810
0811 ops->set_attention_status(slot->hotplug, 0);
0812 }
0813
0814
0815
0816
0817
0818
0819
0820
0821
0822
0823
0824
0825
0826
0827
0828
0829
0830
0831
0832
0833
0834
0835 void eeh_handle_normal_event(struct eeh_pe *pe)
0836 {
0837 struct pci_bus *bus;
0838 struct eeh_dev *edev, *tmp;
0839 struct eeh_pe *tmp_pe;
0840 int rc = 0;
0841 enum pci_ers_result result = PCI_ERS_RESULT_NONE;
0842 struct eeh_rmv_data rmv_data =
0843 {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0};
0844 int devices = 0;
0845
0846 bus = eeh_pe_bus_get(pe);
0847 if (!bus) {
0848 pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
0849 __func__, pe->phb->global_number, pe->addr);
0850 return;
0851 }
0852
0853
0854
0855
0856
0857
0858
0859
0860
0861
0862 eeh_for_each_pe(pe, tmp_pe)
0863 eeh_pe_for_each_dev(tmp_pe, edev, tmp)
0864 if (eeh_slot_presence_check(edev->pdev))
0865 devices++;
0866
0867 if (!devices) {
0868 pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n",
0869 pe->phb->global_number, pe->addr);
0870 goto out;
0871 }
0872
0873
0874 if (pe->type & EEH_PE_PHB) {
0875 pr_err("EEH: Recovering PHB#%x, location: %s\n",
0876 pe->phb->global_number, eeh_pe_loc_get(pe));
0877 } else {
0878 struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb);
0879
0880 pr_err("EEH: Recovering PHB#%x-PE#%x\n",
0881 pe->phb->global_number, pe->addr);
0882 pr_err("EEH: PE location: %s, PHB location: %s\n",
0883 eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
0884 }
0885
0886 #ifdef CONFIG_STACKTRACE
0887
0888
0889
0890
0891 if (pe->trace_entries) {
0892 void **ptrs = (void **) pe->stack_trace;
0893 int i;
0894
0895 pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
0896 pe->phb->global_number, pe->addr);
0897
0898
0899 pr_err("EEH: Call Trace:\n");
0900 for (i = 0; i < pe->trace_entries; i++)
0901 pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]);
0902
0903 pe->trace_entries = 0;
0904 }
0905 #endif
0906
0907 eeh_for_each_pe(pe, tmp_pe)
0908 eeh_pe_for_each_dev(tmp_pe, edev, tmp)
0909 edev->mode &= ~EEH_DEV_NO_HANDLER;
0910
0911 eeh_pe_update_time_stamp(pe);
0912 pe->freeze_count++;
0913 if (pe->freeze_count > eeh_max_freezes) {
0914 pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n",
0915 pe->phb->global_number, pe->addr,
0916 pe->freeze_count);
0917
0918 goto recover_failed;
0919 }
0920
0921
0922
0923
0924
0925
0926
0927
0928
0929
0930
0931 pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
0932 pe->freeze_count, eeh_max_freezes);
0933 pr_info("EEH: Notify device drivers to shutdown\n");
0934 eeh_set_channel_state(pe, pci_channel_io_frozen);
0935 eeh_set_irq_state(pe, false);
0936 eeh_pe_report("error_detected(IO frozen)", pe,
0937 eeh_report_error, &result);
0938 if (result == PCI_ERS_RESULT_DISCONNECT)
0939 goto recover_failed;
0940
0941
0942
0943
0944
0945 if ((pe->type & EEH_PE_PHB) && result != PCI_ERS_RESULT_NONE)
0946 result = PCI_ERS_RESULT_NEED_RESET;
0947
0948
0949
0950
0951 rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY * 1000);
0952 if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
0953 pr_warn("EEH: Permanent failure\n");
0954 goto recover_failed;
0955 }
0956
0957
0958
0959
0960
0961 pr_info("EEH: Collect temporary log\n");
0962 eeh_slot_error_detail(pe, EEH_LOG_TEMP);
0963
0964
0965
0966
0967
0968 if (result == PCI_ERS_RESULT_NONE) {
0969 pr_info("EEH: Reset with hotplug activity\n");
0970 rc = eeh_reset_device(pe, bus, NULL, false);
0971 if (rc) {
0972 pr_warn("%s: Unable to reset, err=%d\n", __func__, rc);
0973 goto recover_failed;
0974 }
0975 }
0976
0977
0978 if (result == PCI_ERS_RESULT_CAN_RECOVER) {
0979 pr_info("EEH: Enable I/O for affected devices\n");
0980 rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
0981 if (rc < 0)
0982 goto recover_failed;
0983
0984 if (rc) {
0985 result = PCI_ERS_RESULT_NEED_RESET;
0986 } else {
0987 pr_info("EEH: Notify device drivers to resume I/O\n");
0988 eeh_pe_report("mmio_enabled", pe,
0989 eeh_report_mmio_enabled, &result);
0990 }
0991 }
0992 if (result == PCI_ERS_RESULT_CAN_RECOVER) {
0993 pr_info("EEH: Enabled DMA for affected devices\n");
0994 rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
0995 if (rc < 0)
0996 goto recover_failed;
0997
0998 if (rc) {
0999 result = PCI_ERS_RESULT_NEED_RESET;
1000 } else {
1001
1002
1003
1004
1005
1006 eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true);
1007 result = PCI_ERS_RESULT_RECOVERED;
1008 }
1009 }
1010
1011
1012 if (result == PCI_ERS_RESULT_NEED_RESET) {
1013 pr_info("EEH: Reset without hotplug activity\n");
1014 rc = eeh_reset_device(pe, bus, &rmv_data, true);
1015 if (rc) {
1016 pr_warn("%s: Cannot reset, err=%d\n", __func__, rc);
1017 goto recover_failed;
1018 }
1019
1020 result = PCI_ERS_RESULT_NONE;
1021 eeh_set_channel_state(pe, pci_channel_io_normal);
1022 eeh_set_irq_state(pe, true);
1023 eeh_pe_report("slot_reset", pe, eeh_report_reset,
1024 &result);
1025 }
1026
1027 if ((result == PCI_ERS_RESULT_RECOVERED) ||
1028 (result == PCI_ERS_RESULT_NONE)) {
1029
1030
1031
1032
1033 list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list,
1034 rmv_entry) {
1035 eeh_add_virt_device(edev);
1036 list_del(&edev->rmv_entry);
1037 }
1038
1039
1040 pr_info("EEH: Notify device driver to resume\n");
1041 eeh_set_channel_state(pe, pci_channel_io_normal);
1042 eeh_set_irq_state(pe, true);
1043 eeh_pe_report("resume", pe, eeh_report_resume, NULL);
1044 eeh_for_each_pe(pe, tmp_pe) {
1045 eeh_pe_for_each_dev(tmp_pe, edev, tmp) {
1046 edev->mode &= ~EEH_DEV_NO_HANDLER;
1047 edev->in_error = false;
1048 }
1049 }
1050
1051 pr_info("EEH: Recovery successful.\n");
1052 goto out;
1053 }
1054
1055 recover_failed:
1056
1057
1058
1059
1060
1061 pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
1062 "Please try reseating or replacing it\n",
1063 pe->phb->global_number, pe->addr);
1064
1065 eeh_slot_error_detail(pe, EEH_LOG_PERM);
1066
1067
1068 eeh_set_channel_state(pe, pci_channel_io_perm_failure);
1069 eeh_set_irq_state(pe, false);
1070 eeh_pe_report("error_detected(permanent failure)", pe,
1071 eeh_report_failure, NULL);
1072
1073
1074 eeh_pe_state_mark(pe, EEH_PE_REMOVED);
1075
1076
1077
1078
1079
1080
1081 if (pe->type & EEH_PE_VF) {
1082 eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
1083 eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
1084 } else {
1085 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
1086 eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
1087
1088 pci_lock_rescan_remove();
1089 pci_hp_remove_devices(bus);
1090 pci_unlock_rescan_remove();
1091
1092 return;
1093 }
1094
1095 out:
1096
1097
1098
1099
1100 eeh_pe_cleanup(pe);
1101
1102
1103 eeh_for_each_pe(pe, tmp_pe)
1104 eeh_pe_for_each_dev(tmp_pe, edev, tmp)
1105 eeh_clear_slot_attention(edev->pdev);
1106
1107 eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
1108 }
1109
1110
1111
1112
1113
1114
1115
1116
1117 void eeh_handle_special_event(void)
1118 {
1119 struct eeh_pe *pe, *phb_pe, *tmp_pe;
1120 struct eeh_dev *edev, *tmp_edev;
1121 struct pci_bus *bus;
1122 struct pci_controller *hose;
1123 unsigned long flags;
1124 int rc;
1125
1126
1127 do {
1128 rc = eeh_ops->next_error(&pe);
1129
1130 switch (rc) {
1131 case EEH_NEXT_ERR_DEAD_IOC:
1132
1133 eeh_serialize_lock(&flags);
1134
1135
1136 eeh_remove_event(NULL, true);
1137
1138 list_for_each_entry(hose, &hose_list, list_node) {
1139 phb_pe = eeh_phb_pe_get(hose);
1140 if (!phb_pe) continue;
1141
1142 eeh_pe_mark_isolated(phb_pe);
1143 }
1144
1145 eeh_serialize_unlock(flags);
1146
1147 break;
1148 case EEH_NEXT_ERR_FROZEN_PE:
1149 case EEH_NEXT_ERR_FENCED_PHB:
1150 case EEH_NEXT_ERR_DEAD_PHB:
1151
1152 eeh_serialize_lock(&flags);
1153
1154
1155 eeh_remove_event(pe, true);
1156
1157 if (rc != EEH_NEXT_ERR_DEAD_PHB)
1158 eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
1159 eeh_pe_mark_isolated(pe);
1160
1161 eeh_serialize_unlock(flags);
1162
1163 break;
1164 case EEH_NEXT_ERR_NONE:
1165 return;
1166 default:
1167 pr_warn("%s: Invalid value %d from next_error()\n",
1168 __func__, rc);
1169 return;
1170 }
1171
1172
1173
1174
1175
1176
1177 if (rc == EEH_NEXT_ERR_FROZEN_PE ||
1178 rc == EEH_NEXT_ERR_FENCED_PHB) {
1179 eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
1180 eeh_handle_normal_event(pe);
1181 } else {
1182 eeh_for_each_pe(pe, tmp_pe)
1183 eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
1184 edev->mode &= ~EEH_DEV_NO_HANDLER;
1185
1186
1187 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
1188 eeh_set_channel_state(pe, pci_channel_io_perm_failure);
1189 eeh_pe_report(
1190 "error_detected(permanent failure)", pe,
1191 eeh_report_failure, NULL);
1192
1193 pci_lock_rescan_remove();
1194 list_for_each_entry(hose, &hose_list, list_node) {
1195 phb_pe = eeh_phb_pe_get(hose);
1196 if (!phb_pe ||
1197 !(phb_pe->state & EEH_PE_ISOLATED) ||
1198 (phb_pe->state & EEH_PE_RECOVERING))
1199 continue;
1200
1201 bus = eeh_pe_bus_get(phb_pe);
1202 if (!bus) {
1203 pr_err("%s: Cannot find PCI bus for "
1204 "PHB#%x-PE#%x\n",
1205 __func__,
1206 pe->phb->global_number,
1207 pe->addr);
1208 break;
1209 }
1210 pci_hp_remove_devices(bus);
1211 }
1212 pci_unlock_rescan_remove();
1213 }
1214
1215
1216
1217
1218
1219 if (rc == EEH_NEXT_ERR_DEAD_IOC)
1220 break;
1221 } while (rc != EEH_NEXT_ERR_NONE);
1222 }