0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033 #include <linux/jiffies.h>
0034 #include <linux/module.h>
0035 #include <linux/timer.h>
0036 #include <linux/workqueue.h>
0037
0038 #include "mthca_dev.h"
0039
0040 enum {
0041 MTHCA_CATAS_POLL_INTERVAL = 5 * HZ,
0042
0043 MTHCA_CATAS_TYPE_INTERNAL = 0,
0044 MTHCA_CATAS_TYPE_UPLINK = 3,
0045 MTHCA_CATAS_TYPE_DDR = 4,
0046 MTHCA_CATAS_TYPE_PARITY = 5,
0047 };
0048
0049 static DEFINE_SPINLOCK(catas_lock);
0050
0051 static LIST_HEAD(catas_list);
0052 static struct workqueue_struct *catas_wq;
0053 static struct work_struct catas_work;
0054
0055 static int catas_reset_disable;
0056 module_param_named(catas_reset_disable, catas_reset_disable, int, 0644);
0057 MODULE_PARM_DESC(catas_reset_disable, "disable reset on catastrophic event if nonzero");
0058
0059 static void catas_reset(struct work_struct *work)
0060 {
0061 struct mthca_dev *dev, *tmpdev;
0062 LIST_HEAD(tlist);
0063 int ret;
0064
0065 mutex_lock(&mthca_device_mutex);
0066
0067 spin_lock_irq(&catas_lock);
0068 list_splice_init(&catas_list, &tlist);
0069 spin_unlock_irq(&catas_lock);
0070
0071 list_for_each_entry_safe(dev, tmpdev, &tlist, catas_err.list) {
0072 struct pci_dev *pdev = dev->pdev;
0073 ret = __mthca_restart_one(dev->pdev);
0074
0075 if (ret)
0076 printk(KERN_ERR "mthca %s: Reset failed (%d)\n",
0077 pci_name(pdev), ret);
0078 else {
0079 struct mthca_dev *d = pci_get_drvdata(pdev);
0080 mthca_dbg(d, "Reset succeeded\n");
0081 }
0082 }
0083
0084 mutex_unlock(&mthca_device_mutex);
0085 }
0086
0087 static void handle_catas(struct mthca_dev *dev)
0088 {
0089 struct ib_event event;
0090 unsigned long flags;
0091 const char *type;
0092 int i;
0093
0094 event.device = &dev->ib_dev;
0095 event.event = IB_EVENT_DEVICE_FATAL;
0096 event.element.port_num = 0;
0097 dev->active = false;
0098
0099 ib_dispatch_event(&event);
0100
0101 switch (swab32(readl(dev->catas_err.map)) >> 24) {
0102 case MTHCA_CATAS_TYPE_INTERNAL:
0103 type = "internal error";
0104 break;
0105 case MTHCA_CATAS_TYPE_UPLINK:
0106 type = "uplink bus error";
0107 break;
0108 case MTHCA_CATAS_TYPE_DDR:
0109 type = "DDR data error";
0110 break;
0111 case MTHCA_CATAS_TYPE_PARITY:
0112 type = "internal parity error";
0113 break;
0114 default:
0115 type = "unknown error";
0116 break;
0117 }
0118
0119 mthca_err(dev, "Catastrophic error detected: %s\n", type);
0120 for (i = 0; i < dev->catas_err.size; ++i)
0121 mthca_err(dev, " buf[%02x]: %08x\n",
0122 i, swab32(readl(dev->catas_err.map + i)));
0123
0124 if (catas_reset_disable)
0125 return;
0126
0127 spin_lock_irqsave(&catas_lock, flags);
0128 list_add(&dev->catas_err.list, &catas_list);
0129 queue_work(catas_wq, &catas_work);
0130 spin_unlock_irqrestore(&catas_lock, flags);
0131 }
0132
0133 static void poll_catas(struct timer_list *t)
0134 {
0135 struct mthca_dev *dev = from_timer(dev, t, catas_err.timer);
0136 int i;
0137
0138 for (i = 0; i < dev->catas_err.size; ++i)
0139 if (readl(dev->catas_err.map + i)) {
0140 handle_catas(dev);
0141 return;
0142 }
0143
0144 mod_timer(&dev->catas_err.timer,
0145 round_jiffies(jiffies + MTHCA_CATAS_POLL_INTERVAL));
0146 }
0147
0148 void mthca_start_catas_poll(struct mthca_dev *dev)
0149 {
0150 phys_addr_t addr;
0151
0152 timer_setup(&dev->catas_err.timer, poll_catas, 0);
0153 dev->catas_err.map = NULL;
0154
0155 addr = pci_resource_start(dev->pdev, 0) +
0156 ((pci_resource_len(dev->pdev, 0) - 1) &
0157 dev->catas_err.addr);
0158
0159 dev->catas_err.map = ioremap(addr, dev->catas_err.size * 4);
0160 if (!dev->catas_err.map) {
0161 mthca_warn(dev, "couldn't map catastrophic error region "
0162 "at 0x%llx/0x%x\n", (unsigned long long) addr,
0163 dev->catas_err.size * 4);
0164 return;
0165 }
0166
0167 dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL;
0168 INIT_LIST_HEAD(&dev->catas_err.list);
0169 add_timer(&dev->catas_err.timer);
0170 }
0171
0172 void mthca_stop_catas_poll(struct mthca_dev *dev)
0173 {
0174 del_timer_sync(&dev->catas_err.timer);
0175
0176 if (dev->catas_err.map)
0177 iounmap(dev->catas_err.map);
0178
0179 spin_lock_irq(&catas_lock);
0180 list_del(&dev->catas_err.list);
0181 spin_unlock_irq(&catas_lock);
0182 }
0183
0184 int __init mthca_catas_init(void)
0185 {
0186 INIT_WORK(&catas_work, catas_reset);
0187
0188 catas_wq = alloc_ordered_workqueue("mthca_catas", WQ_MEM_RECLAIM);
0189 if (!catas_wq)
0190 return -ENOMEM;
0191
0192 return 0;
0193 }
0194
0195 void mthca_catas_cleanup(void)
0196 {
0197 destroy_workqueue(catas_wq);
0198 }