Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /**
0003  * IBM Accelerator Family 'GenWQE'
0004  *
0005  * (C) Copyright IBM Corp. 2013
0006  *
0007  * Author: Frank Haverkamp <haver@linux.vnet.ibm.com>
0008  * Author: Joerg-Stephan Vogt <jsvogt@de.ibm.com>
0009  * Author: Michael Jung <mijung@gmx.net>
0010  * Author: Michael Ruettger <michael@ibmra.de>
0011  */
0012 
0013 /*
0014  * Module initialization and PCIe setup. Card health monitoring and
0015  * recovery functionality. Character device creation and deletion are
0016  * controlled from here.
0017  */
0018 
0019 #include <linux/types.h>
0020 #include <linux/pci.h>
0021 #include <linux/err.h>
0022 #include <linux/aer.h>
0023 #include <linux/string.h>
0024 #include <linux/sched.h>
0025 #include <linux/wait.h>
0026 #include <linux/delay.h>
0027 #include <linux/dma-mapping.h>
0028 #include <linux/module.h>
0029 #include <linux/notifier.h>
0030 #include <linux/device.h>
0031 #include <linux/log2.h>
0032 
0033 #include "card_base.h"
0034 #include "card_ddcb.h"
0035 
/* Module metadata: authors, description, version and license. */
MODULE_AUTHOR("Frank Haverkamp <haver@linux.vnet.ibm.com>");
MODULE_AUTHOR("Michael Ruettger <michael@ibmra.de>");
MODULE_AUTHOR("Joerg-Stephan Vogt <jsvogt@de.ibm.com>");
MODULE_AUTHOR("Michael Jung <mijung@gmx.net>");

MODULE_DESCRIPTION("GenWQE Card");
MODULE_VERSION(DRV_VERSION);
MODULE_LICENSE("GPL");

/* Name passed to the PCI core when the memory regions are requested. */
static char genwqe_driver_name[] = GENWQE_DEVNAME;
/* Driver-wide handles; genwqe_dev_alloc() copies them into each card. */
static struct class *class_genwqe;
static struct dentry *debugfs_genwqe;
/* Card descriptor slots, indexed by card_idx; NULL marks a free slot. */
static struct genwqe_dev *genwqe_devices[GENWQE_CARD_NO_MAX];
0049 
/*
 * PCI structure for identifying device by PCI vendor and device ID.
 *
 * Every entry matches on the complete class code (class_mask = ~0) and
 * carries no driver_data. The entries differ in the device ID and in
 * the subsystem vendor/device/class-code combination.
 */
static const struct pci_device_id genwqe_device_table[] = {
    /* Regular GenWQE5 image */
    { .vendor      = PCI_VENDOR_ID_IBM,
      .device      = PCI_DEVICE_GENWQE,
      .subvendor   = PCI_SUBVENDOR_ID_IBM,
      .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5,
      .class       = (PCI_CLASSCODE_GENWQE5 << 8),
      .class_mask  = ~0,
      .driver_data = 0 },

    /* Initial SR-IOV bring-up image */
    { .vendor      = PCI_VENDOR_ID_IBM,
      .device      = PCI_DEVICE_GENWQE,
      .subvendor   = PCI_SUBVENDOR_ID_IBM_SRIOV,
      .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5_SRIOV,
      .class       = (PCI_CLASSCODE_GENWQE5_SRIOV << 8),
      .class_mask  = ~0,
      .driver_data = 0 },

    /* Virtual function of the initial SR-IOV bring-up image */
    { .vendor      = PCI_VENDOR_ID_IBM,  /* VF Vendor ID */
      .device      = 0x0000,  /* VF Device ID */
      .subvendor   = PCI_SUBVENDOR_ID_IBM_SRIOV,
      .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5_SRIOV,
      .class       = (PCI_CLASSCODE_GENWQE5_SRIOV << 8),
      .class_mask  = ~0,
      .driver_data = 0 },

    /* Fixed up image */
    { .vendor      = PCI_VENDOR_ID_IBM,
      .device      = PCI_DEVICE_GENWQE,
      .subvendor   = PCI_SUBVENDOR_ID_IBM_SRIOV,
      .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5,
      .class       = (PCI_CLASSCODE_GENWQE5_SRIOV << 8),
      .class_mask  = ~0,
      .driver_data = 0 },

    /* Virtual function of the fixed up image */
    { .vendor      = PCI_VENDOR_ID_IBM,  /* VF Vendor ID */
      .device      = 0x0000,  /* VF Device ID */
      .subvendor   = PCI_SUBVENDOR_ID_IBM_SRIOV,
      .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5,
      .class       = (PCI_CLASSCODE_GENWQE5_SRIOV << 8),
      .class_mask  = ~0,
      .driver_data = 0 },

    /* Even one more ... */
    { .vendor      = PCI_VENDOR_ID_IBM,
      .device      = PCI_DEVICE_GENWQE,
      .subvendor   = PCI_SUBVENDOR_ID_IBM,
      .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5_NEW,
      .class       = (PCI_CLASSCODE_GENWQE5 << 8),
      .class_mask  = ~0,
      .driver_data = 0 },

    { 0, }          /* 0 terminated list. */
};

/* Export the ID table so the module can be matched to the hardware. */
MODULE_DEVICE_TABLE(pci, genwqe_device_table);
0107 
0108 /**
0109  * genwqe_dev_alloc() - Create and prepare a new card descriptor
0110  *
0111  * Return: Pointer to card descriptor, or ERR_PTR(err) on error
0112  */
0113 static struct genwqe_dev *genwqe_dev_alloc(void)
0114 {
0115     unsigned int i = 0, j;
0116     struct genwqe_dev *cd;
0117 
0118     for (i = 0; i < GENWQE_CARD_NO_MAX; i++) {
0119         if (genwqe_devices[i] == NULL)
0120             break;
0121     }
0122     if (i >= GENWQE_CARD_NO_MAX)
0123         return ERR_PTR(-ENODEV);
0124 
0125     cd = kzalloc(sizeof(struct genwqe_dev), GFP_KERNEL);
0126     if (!cd)
0127         return ERR_PTR(-ENOMEM);
0128 
0129     cd->card_idx = i;
0130     cd->class_genwqe = class_genwqe;
0131     cd->debugfs_genwqe = debugfs_genwqe;
0132 
0133     /*
0134      * This comes from kernel config option and can be overritten via
0135      * debugfs.
0136      */
0137     cd->use_platform_recovery = CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY;
0138 
0139     init_waitqueue_head(&cd->queue_waitq);
0140 
0141     spin_lock_init(&cd->file_lock);
0142     INIT_LIST_HEAD(&cd->file_list);
0143 
0144     cd->card_state = GENWQE_CARD_UNUSED;
0145     spin_lock_init(&cd->print_lock);
0146 
0147     cd->ddcb_software_timeout = GENWQE_DDCB_SOFTWARE_TIMEOUT;
0148     cd->kill_timeout = GENWQE_KILL_TIMEOUT;
0149 
0150     for (j = 0; j < GENWQE_MAX_VFS; j++)
0151         cd->vf_jobtimeout_msec[j] = GENWQE_VF_JOBTIMEOUT_MSEC;
0152 
0153     genwqe_devices[i] = cd;
0154     return cd;
0155 }
0156 
0157 static void genwqe_dev_free(struct genwqe_dev *cd)
0158 {
0159     if (!cd)
0160         return;
0161 
0162     genwqe_devices[cd->card_idx] = NULL;
0163     kfree(cd);
0164 }
0165 
0166 /**
0167  * genwqe_bus_reset() - Card recovery
0168  * @cd: GenWQE device information
0169  *
0170  * pci_reset_function() will recover the device and ensure that the
0171  * registers are accessible again when it completes with success. If
0172  * not, the card will stay dead and registers will be unaccessible
0173  * still.
0174  */
0175 static int genwqe_bus_reset(struct genwqe_dev *cd)
0176 {
0177     int rc = 0;
0178     struct pci_dev *pci_dev = cd->pci_dev;
0179     void __iomem *mmio;
0180 
0181     if (cd->err_inject & GENWQE_INJECT_BUS_RESET_FAILURE)
0182         return -EIO;
0183 
0184     mmio = cd->mmio;
0185     cd->mmio = NULL;
0186     pci_iounmap(pci_dev, mmio);
0187 
0188     pci_release_mem_regions(pci_dev);
0189 
0190     /*
0191      * Firmware/BIOS might change memory mapping during bus reset.
0192      * Settings like enable bus-mastering, ... are backuped and
0193      * restored by the pci_reset_function().
0194      */
0195     dev_dbg(&pci_dev->dev, "[%s] pci_reset function ...\n", __func__);
0196     rc = pci_reset_function(pci_dev);
0197     if (rc) {
0198         dev_err(&pci_dev->dev,
0199             "[%s] err: failed reset func (rc %d)\n", __func__, rc);
0200         return rc;
0201     }
0202     dev_dbg(&pci_dev->dev, "[%s] done with rc=%d\n", __func__, rc);
0203 
0204     /*
0205      * Here is the right spot to clear the register read
0206      * failure. pci_bus_reset() does this job in real systems.
0207      */
0208     cd->err_inject &= ~(GENWQE_INJECT_HARDWARE_FAILURE |
0209                 GENWQE_INJECT_GFIR_FATAL |
0210                 GENWQE_INJECT_GFIR_INFO);
0211 
0212     rc = pci_request_mem_regions(pci_dev, genwqe_driver_name);
0213     if (rc) {
0214         dev_err(&pci_dev->dev,
0215             "[%s] err: request bars failed (%d)\n", __func__, rc);
0216         return -EIO;
0217     }
0218 
0219     cd->mmio = pci_iomap(pci_dev, 0, 0);
0220     if (cd->mmio == NULL) {
0221         dev_err(&pci_dev->dev,
0222             "[%s] err: mapping BAR0 failed\n", __func__);
0223         return -ENOMEM;
0224     }
0225     return 0;
0226 }
0227 
/*
 * Hardware circumvention section. Certain bitstreams in our test-lab
 * had different kinds of problems. Here is where we adjust those
 * bitstreams to function well with this version of our device driver.
 *
 * These circumventions are applied to the physical function only.
 * The magical numbers below are identifying development/manufacturing
 * versions of the bitstream used on the card.
 *
 * Turn off error reporting for old/manufacturing images.
 */
0239 
0240 bool genwqe_need_err_masking(struct genwqe_dev *cd)
0241 {
0242     return (cd->slu_unitcfg & 0xFFFF0ull) < 0x32170ull;
0243 }
0244 
0245 static void genwqe_tweak_hardware(struct genwqe_dev *cd)
0246 {
0247     struct pci_dev *pci_dev = cd->pci_dev;
0248 
0249     /* Mask FIRs for development images */
0250     if (((cd->slu_unitcfg & 0xFFFF0ull) >= 0x32000ull) &&
0251         ((cd->slu_unitcfg & 0xFFFF0ull) <= 0x33250ull)) {
0252         dev_warn(&pci_dev->dev,
0253              "FIRs masked due to bitstream %016llx.%016llx\n",
0254              cd->slu_unitcfg, cd->app_unitcfg);
0255 
0256         __genwqe_writeq(cd, IO_APP_SEC_LEM_DEBUG_OVR,
0257                 0xFFFFFFFFFFFFFFFFull);
0258 
0259         __genwqe_writeq(cd, IO_APP_ERR_ACT_MASK,
0260                 0x0000000000000000ull);
0261     }
0262 }
0263 
0264 /**
0265  * genwqe_recovery_on_fatal_gfir_required() - Version depended actions
0266  * @cd: GenWQE device information
0267  *
0268  * Bitstreams older than 2013-02-17 have a bug where fatal GFIRs must
0269  * be ignored. This is e.g. true for the bitstream we gave to the card
0270  * manufacturer, but also for some old bitstreams we released to our
0271  * test-lab.
0272  */
0273 int genwqe_recovery_on_fatal_gfir_required(struct genwqe_dev *cd)
0274 {
0275     return (cd->slu_unitcfg & 0xFFFF0ull) >= 0x32170ull;
0276 }
0277 
0278 int genwqe_flash_readback_fails(struct genwqe_dev *cd)
0279 {
0280     return (cd->slu_unitcfg & 0xFFFF0ull) < 0x32170ull;
0281 }
0282 
0283 /**
0284  * genwqe_T_psec() - Calculate PF/VF timeout register content
0285  * @cd: GenWQE device information
0286  *
0287  * Note: From a design perspective it turned out to be a bad idea to
0288  * use codes here to specifiy the frequency/speed values. An old
0289  * driver cannot understand new codes and is therefore always a
0290  * problem. Better is to measure out the value or put the
0291  * speed/frequency directly into a register which is always a valid
0292  * value for old as well as for new software.
0293  */
0294 /* T = 1/f */
0295 static int genwqe_T_psec(struct genwqe_dev *cd)
0296 {
0297     u16 speed;  /* 1/f -> 250,  200,  166,  175 */
0298     static const int T[] = { 4000, 5000, 6000, 5714 };
0299 
0300     speed = (u16)((cd->slu_unitcfg >> 28) & 0x0full);
0301     if (speed >= ARRAY_SIZE(T))
0302         return -1;  /* illegal value */
0303 
0304     return T[speed];
0305 }
0306 
0307 /**
0308  * genwqe_setup_pf_jtimer() - Setup PF hardware timeouts for DDCB execution
0309  * @cd: GenWQE device information
0310  *
0311  * Do this _after_ card_reset() is called. Otherwise the values will
0312  * vanish. The settings need to be done when the queues are inactive.
0313  *
0314  * The max. timeout value is 2^(10+x) * T (6ns for 166MHz) * 15/16.
0315  * The min. timeout value is 2^(10+x) * T (6ns for 166MHz) * 14/16.
0316  */
0317 static bool genwqe_setup_pf_jtimer(struct genwqe_dev *cd)
0318 {
0319     u32 T = genwqe_T_psec(cd);
0320     u64 x;
0321 
0322     if (GENWQE_PF_JOBTIMEOUT_MSEC == 0)
0323         return false;
0324 
0325     /* PF: large value needed, flash update 2sec per block */
0326     x = ilog2(GENWQE_PF_JOBTIMEOUT_MSEC *
0327           16000000000uL/(T * 15)) - 10;
0328 
0329     genwqe_write_vreg(cd, IO_SLC_VF_APPJOB_TIMEOUT,
0330               0xff00 | (x & 0xff), 0);
0331     return true;
0332 }
0333 
0334 /**
0335  * genwqe_setup_vf_jtimer() - Setup VF hardware timeouts for DDCB execution
0336  * @cd: GenWQE device information
0337  */
0338 static bool genwqe_setup_vf_jtimer(struct genwqe_dev *cd)
0339 {
0340     struct pci_dev *pci_dev = cd->pci_dev;
0341     unsigned int vf;
0342     u32 T = genwqe_T_psec(cd);
0343     u64 x;
0344     int totalvfs;
0345 
0346     totalvfs = pci_sriov_get_totalvfs(pci_dev);
0347     if (totalvfs <= 0)
0348         return false;
0349 
0350     for (vf = 0; vf < totalvfs; vf++) {
0351 
0352         if (cd->vf_jobtimeout_msec[vf] == 0)
0353             continue;
0354 
0355         x = ilog2(cd->vf_jobtimeout_msec[vf] *
0356               16000000000uL/(T * 15)) - 10;
0357 
0358         genwqe_write_vreg(cd, IO_SLC_VF_APPJOB_TIMEOUT,
0359                   0xff00 | (x & 0xff), vf + 1);
0360     }
0361     return true;
0362 }
0363 
0364 static int genwqe_ffdc_buffs_alloc(struct genwqe_dev *cd)
0365 {
0366     unsigned int type, e = 0;
0367 
0368     for (type = 0; type < GENWQE_DBG_UNITS; type++) {
0369         switch (type) {
0370         case GENWQE_DBG_UNIT0:
0371             e = genwqe_ffdc_buff_size(cd, 0);
0372             break;
0373         case GENWQE_DBG_UNIT1:
0374             e = genwqe_ffdc_buff_size(cd, 1);
0375             break;
0376         case GENWQE_DBG_UNIT2:
0377             e = genwqe_ffdc_buff_size(cd, 2);
0378             break;
0379         case GENWQE_DBG_REGS:
0380             e = GENWQE_FFDC_REGS;
0381             break;
0382         }
0383 
0384         /* currently support only the debug units mentioned here */
0385         cd->ffdc[type].entries = e;
0386         cd->ffdc[type].regs =
0387             kmalloc_array(e, sizeof(struct genwqe_reg),
0388                       GFP_KERNEL);
0389         /*
0390          * regs == NULL is ok, the using code treats this as no regs,
0391          * Printing warning is ok in this case.
0392          */
0393     }
0394     return 0;
0395 }
0396 
0397 static void genwqe_ffdc_buffs_free(struct genwqe_dev *cd)
0398 {
0399     unsigned int type;
0400 
0401     for (type = 0; type < GENWQE_DBG_UNITS; type++) {
0402         kfree(cd->ffdc[type].regs);
0403         cd->ffdc[type].regs = NULL;
0404     }
0405 }
0406 
0407 static int genwqe_read_ids(struct genwqe_dev *cd)
0408 {
0409     int err = 0;
0410     int slu_id;
0411     struct pci_dev *pci_dev = cd->pci_dev;
0412 
0413     cd->slu_unitcfg = __genwqe_readq(cd, IO_SLU_UNITCFG);
0414     if (cd->slu_unitcfg == IO_ILLEGAL_VALUE) {
0415         dev_err(&pci_dev->dev,
0416             "err: SLUID=%016llx\n", cd->slu_unitcfg);
0417         err = -EIO;
0418         goto out_err;
0419     }
0420 
0421     slu_id = genwqe_get_slu_id(cd);
0422     if (slu_id < GENWQE_SLU_ARCH_REQ || slu_id == 0xff) {
0423         dev_err(&pci_dev->dev,
0424             "err: incompatible SLU Architecture %u\n", slu_id);
0425         err = -ENOENT;
0426         goto out_err;
0427     }
0428 
0429     cd->app_unitcfg = __genwqe_readq(cd, IO_APP_UNITCFG);
0430     if (cd->app_unitcfg == IO_ILLEGAL_VALUE) {
0431         dev_err(&pci_dev->dev,
0432             "err: APPID=%016llx\n", cd->app_unitcfg);
0433         err = -EIO;
0434         goto out_err;
0435     }
0436     genwqe_read_app_id(cd, cd->app_name, sizeof(cd->app_name));
0437 
0438     /*
0439      * Is access to all registers possible? If we are a VF the
0440      * answer is obvious. If we run fully virtualized, we need to
0441      * check if we can access all registers. If we do not have
0442      * full access we will cause an UR and some informational FIRs
0443      * in the PF, but that should not harm.
0444      */
0445     if (pci_dev->is_virtfn)
0446         cd->is_privileged = 0;
0447     else
0448         cd->is_privileged = (__genwqe_readq(cd, IO_SLU_BITSTREAM)
0449                      != IO_ILLEGAL_VALUE);
0450 
0451  out_err:
0452     return err;
0453 }
0454 
/**
 * genwqe_start() - Bring the card into operation
 * @cd: GenWQE device information
 *
 * Reads the unit IDs and, for privileged functions, allocates the FFDC
 * buffers and fills them with the current register contents. If the
 * card is in GENWQE_CARD_FATAL_ERROR state the chip is reloaded through
 * a bus reset. Afterwards the service layer is set up, the hardware
 * tweaks and job timers are applied (privileged functions only) and
 * the character device is created.
 *
 * Return: 0 on success; the genwqe_read_ids() error code if the first
 *         ID read fails; -EIO for every later failure
 */
static int genwqe_start(struct genwqe_dev *cd)
{
    int err;
    struct pci_dev *pci_dev = cd->pci_dev;

    err = genwqe_read_ids(cd);
    if (err)
        return err;

    if (genwqe_is_privileged(cd)) {
        /* do this after the tweaks. alloc fail is acceptable */
        genwqe_ffdc_buffs_alloc(cd);
        /* traps are stopped while the registers are collected */
        genwqe_stop_traps(cd);

        /* Collect registers e.g. FIRs, UNITIDs, traces ... */
        genwqe_read_ffdc_regs(cd, cd->ffdc[GENWQE_DBG_REGS].regs,
                      cd->ffdc[GENWQE_DBG_REGS].entries, 0);

        genwqe_ffdc_buff_read(cd, GENWQE_DBG_UNIT0,
                      cd->ffdc[GENWQE_DBG_UNIT0].regs,
                      cd->ffdc[GENWQE_DBG_UNIT0].entries);

        genwqe_ffdc_buff_read(cd, GENWQE_DBG_UNIT1,
                      cd->ffdc[GENWQE_DBG_UNIT1].regs,
                      cd->ffdc[GENWQE_DBG_UNIT1].entries);

        genwqe_ffdc_buff_read(cd, GENWQE_DBG_UNIT2,
                      cd->ffdc[GENWQE_DBG_UNIT2].regs,
                      cd->ffdc[GENWQE_DBG_UNIT2].entries);

        genwqe_start_traps(cd);

        if (cd->card_state == GENWQE_CARD_FATAL_ERROR) {
            dev_warn(&pci_dev->dev,
                 "[%s] chip reload/recovery!\n", __func__);

            /*
             * Stealth Mode: Reload chip on either hot
             * reset or PERST.
             */
            cd->softreset = 0x7Cull;
            __genwqe_writeq(cd, IO_SLC_CFGREG_SOFTRESET,
                       cd->softreset);

            err = genwqe_bus_reset(cd);
            if (err != 0) {
                dev_err(&pci_dev->dev,
                    "[%s] err: bus reset failed!\n",
                    __func__);
                goto out;
            }

            /*
             * Re-read the IDs because
             * it could happen that the bitstream load
             * failed!
             */
            err = genwqe_read_ids(cd);
            if (err)
                goto out;
        }
    }

    err = genwqe_setup_service_layer(cd);  /* does a reset to the card */
    if (err != 0) {
        dev_err(&pci_dev->dev,
            "[%s] err: could not setup servicelayer!\n", __func__);
        err = -ENODEV;
        goto out;
    }

    if (genwqe_is_privileged(cd)) {  /* code is running _after_ reset */
        genwqe_tweak_hardware(cd);

        /* return values are ignored: a missing timeout is not fatal */
        genwqe_setup_pf_jtimer(cd);
        genwqe_setup_vf_jtimer(cd);
    }

    err = genwqe_device_create(cd);
    if (err < 0) {
        dev_err(&pci_dev->dev,
            "err: chdev init failed! (err=%d)\n", err);
        goto out_release_service_layer;
    }
    return 0;

 out_release_service_layer:
    genwqe_release_service_layer(cd);
 out:
    /* undo genwqe_ffdc_buffs_alloc() from above */
    if (genwqe_is_privileged(cd))
        genwqe_ffdc_buffs_free(cd);
    /*
     * NOTE(review): the specific value in err (including the -ENODEV
     * set above) is discarded here; callers always see -EIO.
     */
    return -EIO;
}
0548 
0549 /**
0550  * genwqe_stop() - Stop card operation
0551  * @cd: GenWQE device information
0552  *
0553  * Recovery notes:
0554  *   As long as genwqe_thread runs we might access registers during
0555  *   error data capture. Same is with the genwqe_health_thread.
0556  *   When genwqe_bus_reset() fails this function might called two times:
0557  *   first by the genwqe_health_thread() and later by genwqe_remove() to
0558  *   unbind the device. We must be able to survive that.
0559  *
0560  * This function must be robust enough to be called twice.
0561  */
0562 static int genwqe_stop(struct genwqe_dev *cd)
0563 {
0564     genwqe_finish_queue(cd);        /* no register access */
0565     genwqe_device_remove(cd);       /* device removed, procs killed */
0566     genwqe_release_service_layer(cd);   /* here genwqe_thread is stopped */
0567 
0568     if (genwqe_is_privileged(cd)) {
0569         pci_disable_sriov(cd->pci_dev); /* access pci config space */
0570         genwqe_ffdc_buffs_free(cd);
0571     }
0572 
0573     return 0;
0574 }
0575 
0576 /**
0577  * genwqe_recover_card() - Try to recover the card if it is possible
0578  * @cd: GenWQE device information
0579  * @fatal_err: Indicate whether to attempt soft reset
0580  *
0581  * If fatal_err is set no register access is possible anymore. It is
0582  * likely that genwqe_start fails in that situation. Proper error
0583  * handling is required in this case.
0584  *
0585  * genwqe_bus_reset() will cause the pci code to call genwqe_remove()
0586  * and later genwqe_probe() for all virtual functions.
0587  */
0588 static int genwqe_recover_card(struct genwqe_dev *cd, int fatal_err)
0589 {
0590     int rc;
0591     struct pci_dev *pci_dev = cd->pci_dev;
0592 
0593     genwqe_stop(cd);
0594 
0595     /*
0596      * Make sure chip is not reloaded to maintain FFDC. Write SLU
0597      * Reset Register, CPLDReset field to 0.
0598      */
0599     if (!fatal_err) {
0600         cd->softreset = 0x70ull;
0601         __genwqe_writeq(cd, IO_SLC_CFGREG_SOFTRESET, cd->softreset);
0602     }
0603 
0604     rc = genwqe_bus_reset(cd);
0605     if (rc != 0) {
0606         dev_err(&pci_dev->dev,
0607             "[%s] err: card recovery impossible!\n", __func__);
0608         return rc;
0609     }
0610 
0611     rc = genwqe_start(cd);
0612     if (rc < 0) {
0613         dev_err(&pci_dev->dev,
0614             "[%s] err: failed to launch device!\n", __func__);
0615         return rc;
0616     }
0617     return 0;
0618 }
0619 
0620 static int genwqe_health_check_cond(struct genwqe_dev *cd, u64 *gfir)
0621 {
0622     *gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
0623     return (*gfir & GFIR_ERR_TRIGGER) &&
0624         genwqe_recovery_on_fatal_gfir_required(cd);
0625 }
0626 
/**
 * genwqe_fir_checking() - Check the fault isolation registers of the card
 * @cd: GenWQE device information
 *
 * Whether this code works can be tried out with help of the genwqe_poke
 * tool:
 *   sudo ./tools/genwqe_poke 0x8 0xfefefefefef
 *
 * Now the relevant FIRs/sFIRs should be printed out and the driver should
 * invoke recovery (devices are removed and readded).
 *
 * The function walks all units, prints the primary and secondary
 * FIR/FEC registers that are set and, as long as no fatal GFIR bit was
 * pending at the start of the pass, clears them. If a fatal GFIR bit
 * shows up while doing so, the whole pass is repeated (up to 16 times).
 *
 * Return: 0 if the GFIR reads zero; IO_ILLEGAL_VALUE if a register read
 *         returns IO_ILLEGAL_VALUE or the pass limit is exceeded;
 *         otherwise the GFIR_ERR_TRIGGER bits sampled at the start of
 *         the last pass
 */
static u64 genwqe_fir_checking(struct genwqe_dev *cd)
{
    int j, iterations = 0;
    u64 mask, fir, fec, uid, gfir, gfir_masked, sfir, sfec;
    u32 fir_addr, fir_clr_addr, fec_addr, sfir_addr, sfec_addr;
    struct pci_dev *pci_dev = cd->pci_dev;

    /* start of one pass; re-entered via goto when a fatal GFIR appears */
 healthMonitor:
    iterations++;
    if (iterations > 16) {
        dev_err(&pci_dev->dev, "* exit looping after %d times\n",
            iterations);
        goto fatal_error;
    }

    gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
    if (gfir != 0x0)
        dev_err(&pci_dev->dev, "* 0x%08x 0x%016llx\n",
                    IO_SLC_CFGREG_GFIR, gfir);
    if (gfir == IO_ILLEGAL_VALUE)
        goto fatal_error;

    /*
     * Returning early when no GFIR bit is on prevents continuous
     * printouts, e.g. for the following bug:
     *   FIR set without a 2ndary FIR/FIR cannot be cleared
     * Comment out the following if to get the prints:
     */
    if (gfir == 0)
        return 0;

    gfir_masked = gfir & GFIR_ERR_TRIGGER;  /* fatal errors */

    for (uid = 0; uid < GENWQE_MAX_UNITS; uid++) { /* 0..2 in zEDC */

        /* read the primary FIR (pfir) */
        fir_addr = (uid << 24) + 0x08;
        fir = __genwqe_readq(cd, fir_addr);
        if (fir == 0x0)
            continue;  /* no error in this unit */

        dev_err(&pci_dev->dev, "* 0x%08x 0x%016llx\n", fir_addr, fir);
        if (fir == IO_ILLEGAL_VALUE)
            goto fatal_error;

        /* read primary FEC */
        fec_addr = (uid << 24) + 0x18;
        fec = __genwqe_readq(cd, fec_addr);

        dev_err(&pci_dev->dev, "* 0x%08x 0x%016llx\n", fec_addr, fec);
        if (fec == IO_ILLEGAL_VALUE)
            goto fatal_error;

        /* each primary FIR bit j has its own secondary FIR/FEC pair */
        for (j = 0, mask = 1ULL; j < 64; j++, mask <<= 1) {

            /* secondary fir empty, skip it */
            if ((fir & mask) == 0x0)
                continue;

            sfir_addr = (uid << 24) + 0x100 + 0x08 * j;
            sfir = __genwqe_readq(cd, sfir_addr);

            if (sfir == IO_ILLEGAL_VALUE)
                goto fatal_error;
            dev_err(&pci_dev->dev,
                "* 0x%08x 0x%016llx\n", sfir_addr, sfir);

            sfec_addr = (uid << 24) + 0x300 + 0x08 * j;
            sfec = __genwqe_readq(cd, sfec_addr);

            if (sfec == IO_ILLEGAL_VALUE)
                goto fatal_error;
            dev_err(&pci_dev->dev,
                "* 0x%08x 0x%016llx\n", sfec_addr, sfec);

            gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
            if (gfir == IO_ILLEGAL_VALUE)
                goto fatal_error;

            /* gfir turned on during routine! get out and
               start over. */
            if ((gfir_masked == 0x0) &&
                (gfir & GFIR_ERR_TRIGGER)) {
                goto healthMonitor;
            }

            /* do not clear if we entered with a fatal gfir */
            if (gfir_masked == 0x0) {

                /* NEW clear by mask the logged bits */
                sfir_addr = (uid << 24) + 0x100 + 0x08 * j;
                __genwqe_writeq(cd, sfir_addr, sfir);

                dev_dbg(&pci_dev->dev,
                    "[HM] Clearing  2ndary FIR 0x%08x with 0x%016llx\n",
                    sfir_addr, sfir);

                /*
                 * note, these cannot be error-Firs
                 * since gfir_masked is 0 after sfir
                 * was read. Also, it is safe to do
                 * this write if sfir=0. Still need to
                 * clear the primary. This just means
                 * there is no secondary FIR.
                 */

                /* clear by mask the logged bit. */
                fir_clr_addr = (uid << 24) + 0x10;
                __genwqe_writeq(cd, fir_clr_addr, mask);

                dev_dbg(&pci_dev->dev,
                    "[HM] Clearing primary FIR 0x%08x with 0x%016llx\n",
                    fir_clr_addr, mask);
            }
        }
    }
    gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
    if (gfir == IO_ILLEGAL_VALUE)
        goto fatal_error;

    if ((gfir_masked == 0x0) && (gfir & GFIR_ERR_TRIGGER)) {
        /*
         * Check once more that it didn't go on after all the
         * FIRS were cleared.
         */
        dev_dbg(&pci_dev->dev, "ACK! Another FIR! Recursing %d!\n",
            iterations);
        goto healthMonitor;
    }
    return gfir_masked;

    /* registers unreadable or pass limit exceeded */
 fatal_error:
    return IO_ILLEGAL_VALUE;
}
0771 
0772 /**
0773  * genwqe_pci_fundamental_reset() - trigger a PCIe fundamental reset on the slot
0774  * @pci_dev:    PCI device information struct
0775  *
0776  * Note: pci_set_pcie_reset_state() is not implemented on all archs, so this
0777  * reset method will not work in all cases.
0778  *
0779  * Return: 0 on success or error code from pci_set_pcie_reset_state()
0780  */
0781 static int genwqe_pci_fundamental_reset(struct pci_dev *pci_dev)
0782 {
0783     int rc;
0784 
0785     /*
0786      * lock pci config space access from userspace,
0787      * save state and issue PCIe fundamental reset
0788      */
0789     pci_cfg_access_lock(pci_dev);
0790     pci_save_state(pci_dev);
0791     rc = pci_set_pcie_reset_state(pci_dev, pcie_warm_reset);
0792     if (!rc) {
0793         /* keep PCIe reset asserted for 250ms */
0794         msleep(250);
0795         pci_set_pcie_reset_state(pci_dev, pcie_deassert_reset);
0796         /* Wait for 2s to reload flash and train the link */
0797         msleep(2000);
0798     }
0799     pci_restore_state(pci_dev);
0800     pci_cfg_access_unlock(pci_dev);
0801     return rc;
0802 }
0803 
0804 
0805 static int genwqe_platform_recovery(struct genwqe_dev *cd)
0806 {
0807     struct pci_dev *pci_dev = cd->pci_dev;
0808     int rc;
0809 
0810     dev_info(&pci_dev->dev,
0811          "[%s] resetting card for error recovery\n", __func__);
0812 
0813     /* Clear out error injection flags */
0814     cd->err_inject &= ~(GENWQE_INJECT_HARDWARE_FAILURE |
0815                 GENWQE_INJECT_GFIR_FATAL |
0816                 GENWQE_INJECT_GFIR_INFO);
0817 
0818     genwqe_stop(cd);
0819 
0820     /* Try recoverying the card with fundamental reset */
0821     rc = genwqe_pci_fundamental_reset(pci_dev);
0822     if (!rc) {
0823         rc = genwqe_start(cd);
0824         if (!rc)
0825             dev_info(&pci_dev->dev,
0826                  "[%s] card recovered\n", __func__);
0827         else
0828             dev_err(&pci_dev->dev,
0829                 "[%s] err: cannot start card services! (err=%d)\n",
0830                 __func__, rc);
0831     } else {
0832         dev_err(&pci_dev->dev,
0833             "[%s] card reset failed\n", __func__);
0834     }
0835 
0836     return rc;
0837 }
0838 
0839 /**
0840  * genwqe_reload_bistream() - reload card bitstream
0841  * @cd: GenWQE device information
0842  *
0843  * Set the appropriate register and call fundamental reset to reaload the card
0844  * bitstream.
0845  *
0846  * Return: 0 on success, error code otherwise
0847  */
0848 static int genwqe_reload_bistream(struct genwqe_dev *cd)
0849 {
0850     struct pci_dev *pci_dev = cd->pci_dev;
0851     int rc;
0852 
0853     dev_info(&pci_dev->dev,
0854          "[%s] resetting card for bitstream reload\n",
0855          __func__);
0856 
0857     genwqe_stop(cd);
0858 
0859     /*
0860      * Cause a CPLD reprogram with the 'next_bitstream'
0861      * partition on PCIe hot or fundamental reset
0862      */
0863     __genwqe_writeq(cd, IO_SLC_CFGREG_SOFTRESET,
0864             (cd->softreset & 0xcull) | 0x70ull);
0865 
0866     rc = genwqe_pci_fundamental_reset(pci_dev);
0867     if (rc) {
0868         /*
0869          * A fundamental reset failure can be caused
0870          * by lack of support on the arch, so we just
0871          * log the error and try to start the card
0872          * again.
0873          */
0874         dev_err(&pci_dev->dev,
0875             "[%s] err: failed to reset card for bitstream reload\n",
0876             __func__);
0877     }
0878 
0879     rc = genwqe_start(cd);
0880     if (rc) {
0881         dev_err(&pci_dev->dev,
0882             "[%s] err: cannot start card services! (err=%d)\n",
0883             __func__, rc);
0884         return rc;
0885     }
0886     dev_info(&pci_dev->dev,
0887          "[%s] card reloaded\n", __func__);
0888     return 0;
0889 }
0890 
0891 
0892 /**
0893  * genwqe_health_thread() - Health checking thread
0894  * @data: GenWQE device information
0895  *
0896  * This thread is only started for the PF of the card.
0897  *
0898  * This thread monitors the health of the card. A critical situation
0899  * is when we read registers which contain -1 (IO_ILLEGAL_VALUE). In
0900  * this case we need to be recovered from outside. Writing to
0901  * registers will very likely not work either.
0902  *
0903  * This thread must only exit if kthread_should_stop() becomes true.
0904  *
0905  * Condition for the health-thread to trigger:
0906  *   a) when a kthread_stop() request comes in or
0907  *   b) a critical GFIR occured
0908  *
0909  * Informational GFIRs are checked and potentially printed in
0910  * GENWQE_HEALTH_CHECK_INTERVAL seconds.
0911  */
0912 static int genwqe_health_thread(void *data)
0913 {
0914     int rc, should_stop = 0;
0915     struct genwqe_dev *cd = data;
0916     struct pci_dev *pci_dev = cd->pci_dev;
0917     u64 gfir, gfir_masked, slu_unitcfg, app_unitcfg;
0918 
0919  health_thread_begin:
0920     while (!kthread_should_stop()) {
0921         rc = wait_event_interruptible_timeout(cd->health_waitq,
0922              (genwqe_health_check_cond(cd, &gfir) ||
0923               (should_stop = kthread_should_stop())),
0924                 GENWQE_HEALTH_CHECK_INTERVAL * HZ);
0925 
0926         if (should_stop)
0927             break;
0928 
0929         if (gfir == IO_ILLEGAL_VALUE) {
0930             dev_err(&pci_dev->dev,
0931                 "[%s] GFIR=%016llx\n", __func__, gfir);
0932             goto fatal_error;
0933         }
0934 
0935         slu_unitcfg = __genwqe_readq(cd, IO_SLU_UNITCFG);
0936         if (slu_unitcfg == IO_ILLEGAL_VALUE) {
0937             dev_err(&pci_dev->dev,
0938                 "[%s] SLU_UNITCFG=%016llx\n",
0939                 __func__, slu_unitcfg);
0940             goto fatal_error;
0941         }
0942 
0943         app_unitcfg = __genwqe_readq(cd, IO_APP_UNITCFG);
0944         if (app_unitcfg == IO_ILLEGAL_VALUE) {
0945             dev_err(&pci_dev->dev,
0946                 "[%s] APP_UNITCFG=%016llx\n",
0947                 __func__, app_unitcfg);
0948             goto fatal_error;
0949         }
0950 
0951         gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
0952         if (gfir == IO_ILLEGAL_VALUE) {
0953             dev_err(&pci_dev->dev,
0954                 "[%s] %s: GFIR=%016llx\n", __func__,
0955                 (gfir & GFIR_ERR_TRIGGER) ? "err" : "info",
0956                 gfir);
0957             goto fatal_error;
0958         }
0959 
0960         gfir_masked = genwqe_fir_checking(cd);
0961         if (gfir_masked == IO_ILLEGAL_VALUE)
0962             goto fatal_error;
0963 
0964         /*
0965          * GFIR ErrorTrigger bits set => reset the card!
0966          * Never do this for old/manufacturing images!
0967          */
0968         if ((gfir_masked) && !cd->skip_recovery &&
0969             genwqe_recovery_on_fatal_gfir_required(cd)) {
0970 
0971             cd->card_state = GENWQE_CARD_FATAL_ERROR;
0972 
0973             rc = genwqe_recover_card(cd, 0);
0974             if (rc < 0) {
0975                 /* FIXME Card is unusable and needs unbind! */
0976                 goto fatal_error;
0977             }
0978         }
0979 
0980         if (cd->card_state == GENWQE_CARD_RELOAD_BITSTREAM) {
0981             /* Userspace requested card bitstream reload */
0982             rc = genwqe_reload_bistream(cd);
0983             if (rc)
0984                 goto fatal_error;
0985         }
0986 
0987         cd->last_gfir = gfir;
0988         cond_resched();
0989     }
0990 
0991     return 0;
0992 
0993  fatal_error:
0994     if (cd->use_platform_recovery) {
0995         /*
0996          * Since we use raw accessors, EEH errors won't be detected
0997          * by the platform until we do a non-raw MMIO or config space
0998          * read
0999          */
1000         readq(cd->mmio + IO_SLC_CFGREG_GFIR);
1001 
1002         /* We do nothing if the card is going over PCI recovery */
1003         if (pci_channel_offline(pci_dev))
1004             return -EIO;
1005 
1006         /*
1007          * If it's supported by the platform, we try a fundamental reset
1008          * to recover from a fatal error. Otherwise, we continue to wait
1009          * for an external recovery procedure to take care of it.
1010          */
1011         rc = genwqe_platform_recovery(cd);
1012         if (!rc)
1013             goto health_thread_begin;
1014     }
1015 
1016     dev_err(&pci_dev->dev,
1017         "[%s] card unusable. Please trigger unbind!\n", __func__);
1018 
1019     /* Bring down logical devices to inform user space via udev remove. */
1020     cd->card_state = GENWQE_CARD_FATAL_ERROR;
1021     genwqe_stop(cd);
1022 
1023     /* genwqe_bus_reset failed(). Now wait for genwqe_remove(). */
1024     while (!kthread_should_stop())
1025         cond_resched();
1026 
1027     return -EIO;
1028 }
1029 
1030 static int genwqe_health_check_start(struct genwqe_dev *cd)
1031 {
1032     int rc;
1033 
1034     if (GENWQE_HEALTH_CHECK_INTERVAL <= 0)
1035         return 0;   /* valid for disabling the service */
1036 
1037     /* moved before request_irq() */
1038     /* init_waitqueue_head(&cd->health_waitq); */
1039 
1040     cd->health_thread = kthread_run(genwqe_health_thread, cd,
1041                     GENWQE_DEVNAME "%d_health",
1042                     cd->card_idx);
1043     if (IS_ERR(cd->health_thread)) {
1044         rc = PTR_ERR(cd->health_thread);
1045         cd->health_thread = NULL;
1046         return rc;
1047     }
1048     return 0;
1049 }
1050 
1051 static int genwqe_health_thread_running(struct genwqe_dev *cd)
1052 {
1053     return cd->health_thread != NULL;
1054 }
1055 
1056 static int genwqe_health_check_stop(struct genwqe_dev *cd)
1057 {
1058     if (!genwqe_health_thread_running(cd))
1059         return -EIO;
1060 
1061     kthread_stop(cd->health_thread);
1062     cd->health_thread = NULL;
1063     return 0;
1064 }
1065 
1066 /**
1067  * genwqe_pci_setup() - Allocate PCIe related resources for our card
1068  * @cd: GenWQE device information
1069  */
1070 static int genwqe_pci_setup(struct genwqe_dev *cd)
1071 {
1072     int err;
1073     struct pci_dev *pci_dev = cd->pci_dev;
1074 
1075     err = pci_enable_device_mem(pci_dev);
1076     if (err) {
1077         dev_err(&pci_dev->dev,
1078             "err: failed to enable pci memory (err=%d)\n", err);
1079         goto err_out;
1080     }
1081 
1082     /* Reserve PCI I/O and memory resources */
1083     err = pci_request_mem_regions(pci_dev, genwqe_driver_name);
1084     if (err) {
1085         dev_err(&pci_dev->dev,
1086             "[%s] err: request bars failed (%d)\n", __func__, err);
1087         err = -EIO;
1088         goto err_disable_device;
1089     }
1090 
1091     /* check for 64-bit DMA address supported (DAC) */
1092     /* check for 32-bit DMA address supported (SAC) */
1093     if (dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64)) &&
1094         dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(32))) {
1095         dev_err(&pci_dev->dev,
1096             "err: neither DMA32 nor DMA64 supported\n");
1097         err = -EIO;
1098         goto out_release_resources;
1099     }
1100 
1101     pci_set_master(pci_dev);
1102     pci_enable_pcie_error_reporting(pci_dev);
1103 
1104     /* EEH recovery requires PCIe fundamental reset */
1105     pci_dev->needs_freset = 1;
1106 
1107     /* request complete BAR-0 space (length = 0) */
1108     cd->mmio_len = pci_resource_len(pci_dev, 0);
1109     cd->mmio = pci_iomap(pci_dev, 0, 0);
1110     if (cd->mmio == NULL) {
1111         dev_err(&pci_dev->dev,
1112             "[%s] err: mapping BAR0 failed\n", __func__);
1113         err = -ENOMEM;
1114         goto out_release_resources;
1115     }
1116 
1117     cd->num_vfs = pci_sriov_get_totalvfs(pci_dev);
1118     if (cd->num_vfs < 0)
1119         cd->num_vfs = 0;
1120 
1121     err = genwqe_read_ids(cd);
1122     if (err)
1123         goto out_iounmap;
1124 
1125     return 0;
1126 
1127  out_iounmap:
1128     pci_iounmap(pci_dev, cd->mmio);
1129  out_release_resources:
1130     pci_release_mem_regions(pci_dev);
1131  err_disable_device:
1132     pci_disable_device(pci_dev);
1133  err_out:
1134     return err;
1135 }
1136 
1137 /**
1138  * genwqe_pci_remove() - Free PCIe related resources for our card
1139  * @cd: GenWQE device information
1140  */
1141 static void genwqe_pci_remove(struct genwqe_dev *cd)
1142 {
1143     struct pci_dev *pci_dev = cd->pci_dev;
1144 
1145     if (cd->mmio)
1146         pci_iounmap(pci_dev, cd->mmio);
1147 
1148     pci_release_mem_regions(pci_dev);
1149     pci_disable_device(pci_dev);
1150 }
1151 
1152 /**
1153  * genwqe_probe() - Device initialization
1154  * @pci_dev:    PCI device information struct
1155  * @id:     PCI device ID
1156  *
1157  * Callable for multiple cards. This function is called on bind.
1158  *
1159  * Return: 0 if succeeded, < 0 when failed
1160  */
1161 static int genwqe_probe(struct pci_dev *pci_dev,
1162             const struct pci_device_id *id)
1163 {
1164     int err;
1165     struct genwqe_dev *cd;
1166 
1167     genwqe_init_crc32();
1168 
1169     cd = genwqe_dev_alloc();
1170     if (IS_ERR(cd)) {
1171         dev_err(&pci_dev->dev, "err: could not alloc mem (err=%d)!\n",
1172             (int)PTR_ERR(cd));
1173         return PTR_ERR(cd);
1174     }
1175 
1176     dev_set_drvdata(&pci_dev->dev, cd);
1177     cd->pci_dev = pci_dev;
1178 
1179     err = genwqe_pci_setup(cd);
1180     if (err < 0) {
1181         dev_err(&pci_dev->dev,
1182             "err: problems with PCI setup (err=%d)\n", err);
1183         goto out_free_dev;
1184     }
1185 
1186     err = genwqe_start(cd);
1187     if (err < 0) {
1188         dev_err(&pci_dev->dev,
1189             "err: cannot start card services! (err=%d)\n", err);
1190         goto out_pci_remove;
1191     }
1192 
1193     if (genwqe_is_privileged(cd)) {
1194         err = genwqe_health_check_start(cd);
1195         if (err < 0) {
1196             dev_err(&pci_dev->dev,
1197                 "err: cannot start health checking! (err=%d)\n",
1198                 err);
1199             goto out_stop_services;
1200         }
1201     }
1202     return 0;
1203 
1204  out_stop_services:
1205     genwqe_stop(cd);
1206  out_pci_remove:
1207     genwqe_pci_remove(cd);
1208  out_free_dev:
1209     genwqe_dev_free(cd);
1210     return err;
1211 }
1212 
1213 /**
1214  * genwqe_remove() - Called when device is removed (hot-plugable)
1215  * @pci_dev:    PCI device information struct
1216  *
1217  * Or when driver is unloaded respecitively when unbind is done.
1218  */
1219 static void genwqe_remove(struct pci_dev *pci_dev)
1220 {
1221     struct genwqe_dev *cd = dev_get_drvdata(&pci_dev->dev);
1222 
1223     genwqe_health_check_stop(cd);
1224 
1225     /*
1226      * genwqe_stop() must survive if it is called twice
1227      * sequentially. This happens when the health thread calls it
1228      * and fails on genwqe_bus_reset().
1229      */
1230     genwqe_stop(cd);
1231     genwqe_pci_remove(cd);
1232     genwqe_dev_free(cd);
1233 }
1234 
1235 /**
1236  * genwqe_err_error_detected() - Error detection callback
1237  * @pci_dev:    PCI device information struct
1238  * @state:  PCI channel state
1239  *
1240  * This callback is called by the PCI subsystem whenever a PCI bus
1241  * error is detected.
1242  */
1243 static pci_ers_result_t genwqe_err_error_detected(struct pci_dev *pci_dev,
1244                          pci_channel_state_t state)
1245 {
1246     struct genwqe_dev *cd;
1247 
1248     dev_err(&pci_dev->dev, "[%s] state=%d\n", __func__, state);
1249 
1250     cd = dev_get_drvdata(&pci_dev->dev);
1251     if (cd == NULL)
1252         return PCI_ERS_RESULT_DISCONNECT;
1253 
1254     /* Stop the card */
1255     genwqe_health_check_stop(cd);
1256     genwqe_stop(cd);
1257 
1258     /*
1259      * On permanent failure, the PCI code will call device remove
1260      * after the return of this function.
1261      * genwqe_stop() can be called twice.
1262      */
1263     if (state == pci_channel_io_perm_failure) {
1264         return PCI_ERS_RESULT_DISCONNECT;
1265     } else {
1266         genwqe_pci_remove(cd);
1267         return PCI_ERS_RESULT_NEED_RESET;
1268     }
1269 }
1270 
1271 static pci_ers_result_t genwqe_err_slot_reset(struct pci_dev *pci_dev)
1272 {
1273     int rc;
1274     struct genwqe_dev *cd = dev_get_drvdata(&pci_dev->dev);
1275 
1276     rc = genwqe_pci_setup(cd);
1277     if (!rc) {
1278         return PCI_ERS_RESULT_RECOVERED;
1279     } else {
1280         dev_err(&pci_dev->dev,
1281             "err: problems with PCI setup (err=%d)\n", rc);
1282         return PCI_ERS_RESULT_DISCONNECT;
1283     }
1284 }
1285 
1286 static pci_ers_result_t genwqe_err_result_none(struct pci_dev *dev)
1287 {
1288     return PCI_ERS_RESULT_NONE;
1289 }
1290 
1291 static void genwqe_err_resume(struct pci_dev *pci_dev)
1292 {
1293     int rc;
1294     struct genwqe_dev *cd = dev_get_drvdata(&pci_dev->dev);
1295 
1296     rc = genwqe_start(cd);
1297     if (!rc) {
1298         rc = genwqe_health_check_start(cd);
1299         if (rc)
1300             dev_err(&pci_dev->dev,
1301                 "err: cannot start health checking! (err=%d)\n",
1302                 rc);
1303     } else {
1304         dev_err(&pci_dev->dev,
1305             "err: cannot start card services! (err=%d)\n", rc);
1306     }
1307 }
1308 
1309 static int genwqe_sriov_configure(struct pci_dev *dev, int numvfs)
1310 {
1311     int rc;
1312     struct genwqe_dev *cd = dev_get_drvdata(&dev->dev);
1313 
1314     if (numvfs > 0) {
1315         genwqe_setup_vf_jtimer(cd);
1316         rc = pci_enable_sriov(dev, numvfs);
1317         if (rc < 0)
1318             return rc;
1319         return numvfs;
1320     }
1321     if (numvfs == 0) {
1322         pci_disable_sriov(dev);
1323         return 0;
1324     }
1325     return 0;
1326 }
1327 
1328 static const struct pci_error_handlers genwqe_err_handler = {
1329     .error_detected = genwqe_err_error_detected,
1330     .mmio_enabled   = genwqe_err_result_none,
1331     .slot_reset = genwqe_err_slot_reset,
1332     .resume     = genwqe_err_resume,
1333 };
1334 
1335 static struct pci_driver genwqe_driver = {
1336     .name     = genwqe_driver_name,
1337     .id_table = genwqe_device_table,
1338     .probe    = genwqe_probe,
1339     .remove   = genwqe_remove,
1340     .sriov_configure = genwqe_sriov_configure,
1341     .err_handler = &genwqe_err_handler,
1342 };
1343 
1344 /**
1345  * genwqe_devnode() - Set default access mode for genwqe devices.
1346  * @dev:    Pointer to device (unused)
1347  * @mode:   Carrier to pass-back given mode (permissions)
1348  *
1349  * Default mode should be rw for everybody. Do not change default
1350  * device name.
1351  */
1352 static char *genwqe_devnode(struct device *dev, umode_t *mode)
1353 {
1354     if (mode)
1355         *mode = 0666;
1356     return NULL;
1357 }
1358 
1359 /**
1360  * genwqe_init_module() - Driver registration and initialization
1361  */
1362 static int __init genwqe_init_module(void)
1363 {
1364     int rc;
1365 
1366     class_genwqe = class_create(THIS_MODULE, GENWQE_DEVNAME);
1367     if (IS_ERR(class_genwqe)) {
1368         pr_err("[%s] create class failed\n", __func__);
1369         return -ENOMEM;
1370     }
1371 
1372     class_genwqe->devnode = genwqe_devnode;
1373 
1374     debugfs_genwqe = debugfs_create_dir(GENWQE_DEVNAME, NULL);
1375 
1376     rc = pci_register_driver(&genwqe_driver);
1377     if (rc != 0) {
1378         pr_err("[%s] pci_reg_driver (rc=%d)\n", __func__, rc);
1379         goto err_out0;
1380     }
1381 
1382     return rc;
1383 
1384  err_out0:
1385     debugfs_remove(debugfs_genwqe);
1386     class_destroy(class_genwqe);
1387     return rc;
1388 }
1389 
1390 /**
1391  * genwqe_exit_module() - Driver exit
1392  */
1393 static void __exit genwqe_exit_module(void)
1394 {
1395     pci_unregister_driver(&genwqe_driver);
1396     debugfs_remove(debugfs_genwqe);
1397     class_destroy(class_genwqe);
1398 }
1399 
/* Module entry and exit points */
module_init(genwqe_init_module);
module_exit(genwqe_exit_module);