/*
 * Copyright (c) 2012, 2013 Intel Corporation.  All rights reserved.
 * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
 * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/pci.h>
#include <linux/netdevice.h>
#include <linux/vmalloc.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/printk.h>
#ifdef CONFIG_INFINIBAND_QIB_DCA
#include <linux/dca.h>
#endif
#include <rdma/rdma_vt.h>

#include "qib.h"
#include "qib_common.h"
#include "qib_mad.h"
#ifdef CONFIG_DEBUG_FS
#include "qib_debugfs.h"
#include "qib_verbs.h"
#endif

#undef pr_fmt
#define pr_fmt(fmt) QIB_DRV_NAME ": " fmt

/*
 * Minimum number of buffers we want to have per context, beyond those
 * the driver itself uses.
 */
#define QIB_MIN_USER_CTXT_BUFCNT 7

#define QLOGIC_IB_R_SOFTWARE_MASK 0xFF
#define QLOGIC_IB_R_SOFTWARE_SHIFT 24
#define QLOGIC_IB_R_EMULATOR_MASK (1ULL<<62)

/*
 * Number of ctxts we are configured to use (to allow for more pio
 * buffers per ctxt, etc.)  Zero means use chip value.
 */
ushort qib_cfgctxts;
module_param_named(cfgctxts, qib_cfgctxts, ushort, S_IRUGO);
MODULE_PARM_DESC(cfgctxts, "Set max number of contexts to use");

unsigned qib_numa_aware;
module_param_named(numa_aware, qib_numa_aware, uint, S_IRUGO);
MODULE_PARM_DESC(numa_aware,
    "0 -> PSM allocation close to HCA, 1 -> PSM allocation local to process");

/*
 * If set, do not write to any registers unless unavoidable; a hack to
 * allow checking for deranged default register values.
 */
ushort qib_mini_init;
module_param_named(mini_init, qib_mini_init, ushort, S_IRUGO);
MODULE_PARM_DESC(mini_init, "If set, do minimal diag init");

unsigned qib_n_krcv_queues;
module_param_named(krcvqs, qib_n_krcv_queues, uint, S_IRUGO);
MODULE_PARM_DESC(krcvqs, "number of kernel receive queues per IB port");

unsigned qib_cc_table_size;
module_param_named(cc_table_size, qib_cc_table_size, uint, S_IRUGO);
MODULE_PARM_DESC(cc_table_size, "Congestion control table entries 0 (CCA disabled - default), min = 128, max = 1984");

static void verify_interrupt(struct timer_list *);

DEFINE_XARRAY_FLAGS(qib_dev_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
u32 qib_cpulist_count;
unsigned long *qib_cpulist;

/* set number of contexts we'll actually use */
void qib_set_ctxtcnt(struct qib_devdata *dd)
{
    if (!qib_cfgctxts) {
        dd->cfgctxts = dd->first_user_ctxt + num_online_cpus();
        if (dd->cfgctxts > dd->ctxtcnt)
            dd->cfgctxts = dd->ctxtcnt;
    } else if (qib_cfgctxts < dd->num_pports)
        dd->cfgctxts = dd->ctxtcnt;
    else if (qib_cfgctxts <= dd->ctxtcnt)
        dd->cfgctxts = qib_cfgctxts;
    else
        dd->cfgctxts = dd->ctxtcnt;
    dd->freectxts = (dd->first_user_ctxt > dd->cfgctxts) ? 0 :
        dd->cfgctxts - dd->first_user_ctxt;
}
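
/*
 * A minimal worked example of the clamping above, using hypothetical
 * values (not taken from any real chip): with first_user_ctxt = 2,
 * ctxtcnt = 18, 16 online CPUs, and the cfgctxts module parameter left
 * at 0, cfgctxts becomes 2 + 16 = 18, capped at ctxtcnt (18), leaving
 * freectxts = 18 - 2 = 16 user contexts.
 */
#if 0	/* illustrative sketch only; mirrors the qib_cfgctxts == 0 case */
static u32 example_default_cfgctxts(u32 first_user_ctxt, u32 ctxtcnt,
				    u32 online_cpus)
{
	u32 cfg = first_user_ctxt + online_cpus;	/* assumed inputs */

	return cfg > ctxtcnt ? ctxtcnt : cfg;	/* never above chip count */
}
#endif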

/*
 * Common code for creating the receive context array.
 */
int qib_create_ctxts(struct qib_devdata *dd)
{
    unsigned i;
    int local_node_id = pcibus_to_node(dd->pcidev->bus);

    if (local_node_id < 0)
        local_node_id = numa_node_id();
    dd->assigned_node_id = local_node_id;

    /*
     * Allocate full ctxtcnt array, rather than just cfgctxts, because
     * cleanup iterates across all possible ctxts.
     */
    dd->rcd = kcalloc(dd->ctxtcnt, sizeof(*dd->rcd), GFP_KERNEL);
    if (!dd->rcd)
        return -ENOMEM;

    /* create (one or more) kctxt */
    for (i = 0; i < dd->first_user_ctxt; ++i) {
        struct qib_pportdata *ppd;
        struct qib_ctxtdata *rcd;

        if (dd->skip_kctxt_mask & (1 << i))
            continue;

        ppd = dd->pport + (i % dd->num_pports);

        rcd = qib_create_ctxtdata(ppd, i, dd->assigned_node_id);
        if (!rcd) {
            qib_dev_err(dd,
                "Unable to allocate ctxtdata for Kernel ctxt, failing\n");
            kfree(dd->rcd);
            dd->rcd = NULL;
            return -ENOMEM;
        }
        rcd->pkeys[0] = QIB_DEFAULT_P_KEY;
        rcd->seq_cnt = 1;
    }
    return 0;
}
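
/*
 * The (i % dd->num_pports) above distributes kernel contexts round-robin
 * across ports.  A hypothetical example, assuming first_user_ctxt = 4
 * and num_pports = 2: kernel ctxts 0 and 2 land on port 0, and ctxts 1
 * and 3 on port 1.
 */
#if 0	/* illustrative sketch only; values below are assumptions */
static void example_kctxt_distribution(void)
{
	unsigned i, num_pports = 2, first_user_ctxt = 4;

	for (i = 0; i < first_user_ctxt; ++i)
		pr_info("kctxt %u -> port %u\n", i, i % num_pports);
}
#endif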

/*
 * Common code for user and kernel context setup.
 */
struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt,
    int node_id)
{
    struct qib_devdata *dd = ppd->dd;
    struct qib_ctxtdata *rcd;

    rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, node_id);
    if (rcd) {
        INIT_LIST_HEAD(&rcd->qp_wait_list);
        rcd->node_id = node_id;
        rcd->ppd = ppd;
        rcd->dd = dd;
        rcd->cnt = 1;
        rcd->ctxt = ctxt;
        dd->rcd[ctxt] = rcd;
#ifdef CONFIG_DEBUG_FS
        if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */
            rcd->opstats = kzalloc_node(sizeof(*rcd->opstats),
                GFP_KERNEL, node_id);
            if (!rcd->opstats) {
                kfree(rcd);
                qib_dev_err(dd,
                    "Unable to allocate per ctxt stats buffer\n");
                return NULL;
            }
        }
#endif
        dd->f_init_ctxt(rcd);

        /*
         * To avoid wasting a lot of memory, we allocate 32KB chunks
         * of physically contiguous memory, advance through it until
         * used up, and then allocate more.  Of course, we then need
         * memory to store those extra pointers.  32KB seems to be the
         * most that is "safe" under memory pressure (creating large
         * files and then copying them over NFS while doing lots of
         * MPI jobs).  The OOM killer can get invoked even though we
         * say we can sleep, and this can cause significant system
         * problems.
         */
        rcd->rcvegrbuf_size = 0x8000;
        rcd->rcvegrbufs_perchunk =
            rcd->rcvegrbuf_size / dd->rcvegrbufsize;
        rcd->rcvegrbuf_chunks = (rcd->rcvegrcnt +
            rcd->rcvegrbufs_perchunk - 1) /
            rcd->rcvegrbufs_perchunk;
        rcd->rcvegrbufs_perchunk_shift =
            ilog2(rcd->rcvegrbufs_perchunk);
    }
    return rcd;
}
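
/*
 * A worked example of the chunk geometry above, assuming a hypothetical
 * 4096-byte eager buffer size and rcvegrcnt = 100: each 32KB (0x8000)
 * chunk holds 32768 / 4096 = 8 buffers, so (100 + 8 - 1) / 8 = 13
 * chunks are needed (the last only partly used), and
 * rcvegrbufs_perchunk_shift = ilog2(8) = 3.
 */
#if 0	/* illustrative sketch only; buffer size and count are assumed */
static void example_egrbuf_geometry(void)
{
	u32 egrbufsize = 4096, egrcnt = 100;		 /* assumed values */
	u32 perchunk = 0x8000 / egrbufsize;		 /* 8 */
	u32 chunks = (egrcnt + perchunk - 1) / perchunk; /* ceil -> 13 */
	u32 shift = ilog2(perchunk);			 /* 3 */

	pr_info("%u bufs/chunk, %u chunks, shift %u\n",
		perchunk, chunks, shift);
}
#endif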

/*
 * Common code for initializing the physical port structure.
 */
int qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd,
            u8 hw_pidx, u8 port)
{
    int size;

    ppd->dd = dd;
    ppd->hw_pidx = hw_pidx;
    ppd->port = port; /* IB port number, not index */

    spin_lock_init(&ppd->sdma_lock);
    spin_lock_init(&ppd->lflags_lock);
    spin_lock_init(&ppd->cc_shadow_lock);
    init_waitqueue_head(&ppd->state_wait);

    timer_setup(&ppd->symerr_clear_timer, qib_clear_symerror_on_linkup, 0);

    ppd->qib_wq = NULL;
    ppd->ibport_data.pmastats =
        alloc_percpu(struct qib_pma_counters);
    if (!ppd->ibport_data.pmastats)
        return -ENOMEM;
    ppd->ibport_data.rvp.rc_acks = alloc_percpu(u64);
    ppd->ibport_data.rvp.rc_qacks = alloc_percpu(u64);
    ppd->ibport_data.rvp.rc_delayed_comp = alloc_percpu(u64);
    if (!(ppd->ibport_data.rvp.rc_acks) ||
        !(ppd->ibport_data.rvp.rc_qacks) ||
        !(ppd->ibport_data.rvp.rc_delayed_comp))
        return -ENOMEM;

    if (qib_cc_table_size < IB_CCT_MIN_ENTRIES)
        goto bail;

    ppd->cc_supported_table_entries = min(max_t(int, qib_cc_table_size,
        IB_CCT_MIN_ENTRIES), IB_CCT_ENTRIES*IB_CC_TABLE_CAP_DEFAULT);

    ppd->cc_max_table_entries =
        ppd->cc_supported_table_entries/IB_CCT_ENTRIES;

    size = IB_CC_TABLE_CAP_DEFAULT * sizeof(struct ib_cc_table_entry)
        * IB_CCT_ENTRIES;
    ppd->ccti_entries = kzalloc(size, GFP_KERNEL);
    if (!ppd->ccti_entries)
        goto bail;

    size = IB_CC_CCS_ENTRIES * sizeof(struct ib_cc_congestion_entry);
    ppd->congestion_entries = kzalloc(size, GFP_KERNEL);
    if (!ppd->congestion_entries)
        goto bail_1;

    size = sizeof(struct cc_table_shadow);
    ppd->ccti_entries_shadow = kzalloc(size, GFP_KERNEL);
    if (!ppd->ccti_entries_shadow)
        goto bail_2;

    size = sizeof(struct ib_cc_congestion_setting_attr);
    ppd->congestion_entries_shadow = kzalloc(size, GFP_KERNEL);
    if (!ppd->congestion_entries_shadow)
        goto bail_3;

    return 0;

bail_3:
    kfree(ppd->ccti_entries_shadow);
    ppd->ccti_entries_shadow = NULL;
bail_2:
    kfree(ppd->congestion_entries);
    ppd->congestion_entries = NULL;
bail_1:
    kfree(ppd->ccti_entries);
    ppd->ccti_entries = NULL;
bail:
    /* User is intentionally disabling the congestion control agent */
    if (!qib_cc_table_size)
        return 0;

    if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) {
        qib_cc_table_size = 0;
        qib_dev_err(dd,
         "Congestion Control table size %d less than minimum %d for port %d\n",
         qib_cc_table_size, IB_CCT_MIN_ENTRIES, port);
    }

    qib_dev_err(dd, "Congestion Control Agent disabled for port %d\n",
        port);
    return 0;
}
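
/*
 * A worked example of the congestion-control sizing above.  Per the
 * cc_table_size module parameter description, entries are clamped to
 * [128, 1984], where 1984 is IB_CCT_ENTRIES * IB_CC_TABLE_CAP_DEFAULT.
 * With a hypothetical cc_table_size = 200, and assuming for
 * illustration that IB_CCT_ENTRIES is 64:
 * cc_supported_table_entries = min(max(200, 128), 1984) = 200 and
 * cc_max_table_entries = 200 / 64 = 3.
 */
#if 0	/* illustrative sketch only; the constants here are assumptions */
static void example_cc_sizing(void)
{
	int cc_table_size = 200, cct_entries = 64, cap = 1984;
	int supported = min(max(cc_table_size, 128), cap);

	pr_info("supported %d, max tables %d\n",
		supported, supported / cct_entries);
}
#endif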

static int init_pioavailregs(struct qib_devdata *dd)
{
    int ret, pidx;
    u64 *status_page;

    dd->pioavailregs_dma = dma_alloc_coherent(
        &dd->pcidev->dev, PAGE_SIZE, &dd->pioavailregs_phys,
        GFP_KERNEL);
    if (!dd->pioavailregs_dma) {
        qib_dev_err(dd,
            "failed to allocate PIOavail reg area in memory\n");
        ret = -ENOMEM;
        goto done;
    }

    /*
     * We really want L2 cache aligned, but for current CPUs of
     * interest, they are the same.
     */
    status_page = (u64 *)
        ((char *) dd->pioavailregs_dma +
         ((2 * L1_CACHE_BYTES +
           dd->pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES));
    /* device status comes first, for backwards compatibility */
    dd->devstatusp = status_page;
    *status_page++ = 0;
    for (pidx = 0; pidx < dd->num_pports; ++pidx) {
        dd->pport[pidx].statusp = status_page;
        *status_page++ = 0;
    }

    /*
     * Setup buffer to hold freeze and other messages, accessible to
     * apps, following statusp.  This is per-unit, not per port.
     */
    dd->freezemsg = (char *) status_page;
    *dd->freezemsg = 0;
    /* length of msg buffer is "whatever is left" */
    ret = (char *) status_page - (char *) dd->pioavailregs_dma;
    dd->freezelen = PAGE_SIZE - ret;

    ret = 0;

done:
    return ret;
}

/**
 * init_shadow_tids - allocate the shadow TID array
 * @dd: the qlogic_ib device
 *
 * Allocate the shadow TID array, so we can qib_munlock previous
 * entries.  It may make more sense to move the pageshadow to the
 * ctxt data structure, so we only allocate memory for ctxts actually
 * in use, since we are at 8k per ctxt now.
 * We don't want failures here to prevent use of the driver/chip,
 * so no return value.
 */
static void init_shadow_tids(struct qib_devdata *dd)
{
    struct page **pages;
    dma_addr_t *addrs;

    pages = vzalloc(array_size(sizeof(struct page *),
                   dd->cfgctxts * dd->rcvtidcnt));
    if (!pages)
        goto bail;

    addrs = vzalloc(array_size(sizeof(dma_addr_t),
                   dd->cfgctxts * dd->rcvtidcnt));
    if (!addrs)
        goto bail_free;

    dd->pageshadow = pages;
    dd->physshadow = addrs;
    return;

bail_free:
    vfree(pages);
bail:
    dd->pageshadow = NULL;
}

/*
 * Do initialization for device that is only needed on
 * first detect, not on resets.
 */
static int loadtime_init(struct qib_devdata *dd)
{
    int ret = 0;

    if (((dd->revision >> QLOGIC_IB_R_SOFTWARE_SHIFT) &
         QLOGIC_IB_R_SOFTWARE_MASK) != QIB_CHIP_SWVERSION) {
        qib_dev_err(dd,
            "Driver only handles version %d, chip swversion is %d (%llx), failing\n",
            QIB_CHIP_SWVERSION,
            (int)(dd->revision >>
                QLOGIC_IB_R_SOFTWARE_SHIFT) &
                QLOGIC_IB_R_SOFTWARE_MASK,
            (unsigned long long) dd->revision);
        ret = -ENOSYS;
        goto done;
    }

    if (dd->revision & QLOGIC_IB_R_EMULATOR_MASK)
        qib_devinfo(dd->pcidev, "%s", dd->boardversion);

    spin_lock_init(&dd->pioavail_lock);
    spin_lock_init(&dd->sendctrl_lock);
    spin_lock_init(&dd->uctxt_lock);
    spin_lock_init(&dd->qib_diag_trans_lock);
    spin_lock_init(&dd->eep_st_lock);
    mutex_init(&dd->eep_lock);

    if (qib_mini_init)
        goto done;

    ret = init_pioavailregs(dd);
    init_shadow_tids(dd);

    qib_get_eeprom_info(dd);

    /* setup time (don't start yet) to verify we got interrupt */
    timer_setup(&dd->intrchk_timer, verify_interrupt, 0);
done:
    return ret;
}
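
/*
 * The version check above extracts an 8-bit field from the revision
 * register: bits [31:24], per QLOGIC_IB_R_SOFTWARE_SHIFT and
 * QLOGIC_IB_R_SOFTWARE_MASK.  A hypothetical example: revision =
 * 0x02000000ULL gives (revision >> 24) & 0xFF = 2, which must equal
 * QIB_CHIP_SWVERSION.
 */
#if 0	/* illustrative sketch only */
static int example_swversion_ok(u64 revision, int expected)
{
	return (int)((revision >> QLOGIC_IB_R_SOFTWARE_SHIFT) &
		     QLOGIC_IB_R_SOFTWARE_MASK) == expected;
}
#endif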

/**
 * init_after_reset - re-initialize after a reset
 * @dd: the qlogic_ib device
 *
 * Sanity-check at least some of the values after reset, and
 * ensure no receive or transmit (explicitly, in case the reset
 * failed).
 */
static int init_after_reset(struct qib_devdata *dd)
{
    int i;

    /*
     * Ensure chip does no sends or receives, tail updates, or
     * pioavail updates while we re-initialize.  This is mostly
     * for the driver data structures, not chip registers.
     */
    for (i = 0; i < dd->num_pports; ++i) {
        /*
         * ctxt == -1 means "all contexts". Only really safe for
         * _dis_abling things, as here.
         */
        dd->f_rcvctrl(dd->pport + i, QIB_RCVCTRL_CTXT_DIS |
                  QIB_RCVCTRL_INTRAVAIL_DIS |
                  QIB_RCVCTRL_TAILUPD_DIS, -1);
        /* Redundant across ports for some, but no big deal.  */
        dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_DIS |
            QIB_SENDCTRL_AVAIL_DIS);
    }

    return 0;
}

static void enable_chip(struct qib_devdata *dd)
{
    u64 rcvmask;
    int i;

    /*
     * Enable PIO send, and update of PIOavail regs to memory.
     */
    for (i = 0; i < dd->num_pports; ++i)
        dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_ENB |
            QIB_SENDCTRL_AVAIL_ENB);
    /*
     * Enable kernel ctxts' receive and receive interrupt.
     * Other ctxts done as user opens and inits them.
     */
    rcvmask = QIB_RCVCTRL_CTXT_ENB | QIB_RCVCTRL_INTRAVAIL_ENB;
    rcvmask |= (dd->flags & QIB_NODMA_RTAIL) ?
          QIB_RCVCTRL_TAILUPD_DIS : QIB_RCVCTRL_TAILUPD_ENB;
    for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
        struct qib_ctxtdata *rcd = dd->rcd[i];

        if (rcd)
            dd->f_rcvctrl(rcd->ppd, rcvmask, i);
    }
}

static void verify_interrupt(struct timer_list *t)
{
    struct qib_devdata *dd = from_timer(dd, t, intrchk_timer);
    u64 int_counter;

    if (!dd)
        return; /* being torn down */

    /*
     * If we don't have a lid or any interrupts, let the user know and
     * don't bother checking again.
     */
    int_counter = qib_int_counter(dd) - dd->z_int_counter;
    if (int_counter == 0) {
        if (!dd->f_intr_fallback(dd))
            dev_err(&dd->pcidev->dev,
                "No interrupts detected, not usable.\n");
        else /* re-arm the timer to see if fallback works */
            mod_timer(&dd->intrchk_timer, jiffies + HZ/2);
    }
}

static void init_piobuf_state(struct qib_devdata *dd)
{
    int i, pidx;
    u32 uctxts;

    /*
     * Ensure all buffers are free, and FIFOs empty.  Buffers
     * are common, so only do this once, for port 0.
     *
     * This runs after enable and qib_chg_pioavailkernel() so we can
     * safely enable pioavail updates and PIOENABLE.  After this,
     * packets are ready and able to go out.
     */
    dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_ALL);
    for (pidx = 0; pidx < dd->num_pports; ++pidx)
        dd->f_sendctrl(dd->pport + pidx, QIB_SENDCTRL_FLUSH);

    /*
     * If not all send buffers divide evenly among contexts, give one
     * extra buffer to each of the lower-numbered contexts.  pbufsctxt
     * and lastctxt_piobuf are calculated in chip-specific code because
     * it may cause some chip-specific adjustments to be made.
     */
    uctxts = dd->cfgctxts - dd->first_user_ctxt;
    dd->ctxts_extrabuf = dd->pbufsctxt ?
        dd->lastctxt_piobuf - (dd->pbufsctxt * uctxts) : 0;

    /*
     * Set up the shadow copies of the piobufavail registers,
     * which we compare against the chip registers for now, and
     * the in memory DMA'ed copies of the registers.
     * By now pioavail updates to memory should have occurred, so
     * copy them into our working/shadow registers; this is in
     * case something went wrong with abort, but mostly to get the
     * initial values of the generation bit correct.
     */
    for (i = 0; i < dd->pioavregs; i++) {
        __le64 tmp;

        tmp = dd->pioavailregs_dma[i];
        /*
         * Don't need to worry about pioavailkernel here
         * because we will call qib_chg_pioavailkernel() later
         * in initialization, to busy out buffers as needed.
         */
        dd->pioavailshadow[i] = le64_to_cpu(tmp);
    }
    while (i < ARRAY_SIZE(dd->pioavailshadow))
        dd->pioavailshadow[i++] = 0; /* for debugging sanity */

    /* after pioavailshadow is setup */
    qib_chg_pioavailkernel(dd, 0, dd->piobcnt2k + dd->piobcnt4k,
                   TXCHK_CHG_TYPE_KERN, NULL);
    dd->f_initvl15_bufs(dd);
}
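
/*
 * A worked example of the extra-buffer math above, with hypothetical
 * numbers: if lastctxt_piobuf leaves 130 buffers for 4 user contexts
 * and pbufsctxt = 32, then ctxts_extrabuf = 130 - (32 * 4) = 2, i.e.
 * the two lowest-numbered user contexts each get one buffer more than
 * the rest.
 */
#if 0	/* illustrative sketch only */
static u32 example_extrabuf(u32 lastctxt_piobuf, u32 pbufsctxt, u32 uctxts)
{
	/* e.g. 130 - 32 * 4 = 2 leftover buffers */
	return pbufsctxt ? lastctxt_piobuf - pbufsctxt * uctxts : 0;
}
#endif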

/**
 * qib_create_workqueues - create per port workqueues
 * @dd: the qlogic_ib device
 */
static int qib_create_workqueues(struct qib_devdata *dd)
{
    int pidx;
    struct qib_pportdata *ppd;

    for (pidx = 0; pidx < dd->num_pports; ++pidx) {
        ppd = dd->pport + pidx;
        if (!ppd->qib_wq) {
            char wq_name[8]; /* "qib" (3) + unit (2) + "_" (1) + port (1) + NUL (1) */

            snprintf(wq_name, sizeof(wq_name), "qib%d_%d",
                dd->unit, pidx);
            ppd->qib_wq = alloc_ordered_workqueue(wq_name,
                                  WQ_MEM_RECLAIM);
            if (!ppd->qib_wq)
                goto wq_error;
        }
    }
    return 0;
wq_error:
    pr_err("alloc_ordered_workqueue failed for port %d\n",
        pidx + 1);
    for (pidx = 0; pidx < dd->num_pports; ++pidx) {
        ppd = dd->pport + pidx;
        if (ppd->qib_wq) {
            destroy_workqueue(ppd->qib_wq);
            ppd->qib_wq = NULL;
        }
    }
    return -ENOMEM;
}
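
/*
 * A note on the 8-byte wq_name above: "qib" plus a unit number of up to
 * two digits, "_", a single-digit port index, and the NUL fit exactly.
 * snprintf() would truncate safely if a hypothetical unit ever needed
 * three digits; a sketch of that sizing check:
 */
#if 0	/* illustrative sketch only */
static void example_wq_name(int unit, int port_idx)
{
	char wq_name[8];

	if (snprintf(wq_name, sizeof(wq_name), "qib%d_%d",
		     unit, port_idx) >= (int)sizeof(wq_name))
		pr_warn("wq name truncated for unit %d\n", unit);
}
#endif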

static void qib_free_pportdata(struct qib_pportdata *ppd)
{
    free_percpu(ppd->ibport_data.pmastats);
    free_percpu(ppd->ibport_data.rvp.rc_acks);
    free_percpu(ppd->ibport_data.rvp.rc_qacks);
    free_percpu(ppd->ibport_data.rvp.rc_delayed_comp);
    ppd->ibport_data.pmastats = NULL;
}

/**
 * qib_init - do the actual initialization sequence on the chip
 * @dd: the qlogic_ib device
 * @reinit: reinitializing, so don't allocate new memory
 *
 * Do the actual initialization sequence on the chip.  This is done
 * both from the init routine called from the PCI infrastructure, and
 * when we reset the chip, or detect that it was reset internally,
 * or it's administratively re-enabled.
 *
 * Memory allocation here and in called routines is only done in
 * the first case (reinit == 0).  We have to be careful, because even
 * without memory allocation, we need to re-write all the chip registers,
 * TIDs, etc. after the reset or enable has completed.
 */
int qib_init(struct qib_devdata *dd, int reinit)
{
    int ret = 0, pidx, lastfail = 0;
    u32 portok = 0;
    unsigned i;
    struct qib_ctxtdata *rcd;
    struct qib_pportdata *ppd;
    unsigned long flags;

    /* Set linkstate to unknown, so we can watch for a transition. */
    for (pidx = 0; pidx < dd->num_pports; ++pidx) {
        ppd = dd->pport + pidx;
        spin_lock_irqsave(&ppd->lflags_lock, flags);
        ppd->lflags &= ~(QIBL_LINKACTIVE | QIBL_LINKARMED |
                 QIBL_LINKDOWN | QIBL_LINKINIT |
                 QIBL_LINKV);
        spin_unlock_irqrestore(&ppd->lflags_lock, flags);
    }

    if (reinit)
        ret = init_after_reset(dd);
    else
        ret = loadtime_init(dd);
    if (ret)
        goto done;

    /* Bypass most chip-init, to get to device creation */
    if (qib_mini_init)
        return 0;

    ret = dd->f_late_initreg(dd);
    if (ret)
        goto done;

    /* dd->rcd can be NULL if early init failed */
    for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
        /*
         * Set up the (kernel) rcvhdr queue and egr TIDs.  If doing
         * re-init, the simplest way to handle this is to free
         * existing, and re-allocate.
         * Need to re-create rest of ctxt 0 ctxtdata as well.
         */
        rcd = dd->rcd[i];
        if (!rcd)
            continue;

        lastfail = qib_create_rcvhdrq(dd, rcd);
        if (!lastfail)
            lastfail = qib_setup_eagerbufs(rcd);
        if (lastfail)
            qib_dev_err(dd,
                "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
    }

    for (pidx = 0; pidx < dd->num_pports; ++pidx) {
        int mtu;

        if (lastfail)
            ret = lastfail;
        ppd = dd->pport + pidx;
        mtu = ib_mtu_enum_to_int(qib_ibmtu);
        if (mtu == -1) {
            mtu = QIB_DEFAULT_MTU;
            qib_ibmtu = 0; /* don't leave invalid value */
        }
        /* set max we can ever have for this driver load */
        ppd->init_ibmaxlen = min(mtu > 2048 ?
                     dd->piosize4k : dd->piosize2k,
                     dd->rcvegrbufsize +
                     (dd->rcvhdrentsize << 2));
        /*
         * Have to initialize ibmaxlen, but this will normally
         * change immediately in qib_set_mtu().
         */
        ppd->ibmaxlen = ppd->init_ibmaxlen;
        qib_set_mtu(ppd, mtu);

        spin_lock_irqsave(&ppd->lflags_lock, flags);
        ppd->lflags |= QIBL_IB_LINK_DISABLED;
        spin_unlock_irqrestore(&ppd->lflags_lock, flags);

        lastfail = dd->f_bringup_serdes(ppd);
        if (lastfail) {
            qib_devinfo(dd->pcidev,
                 "Failed to bringup IB port %u\n", ppd->port);
            lastfail = -ENETDOWN;
            continue;
        }

        portok++;
    }

    if (!portok) {
        /* none of the ports initialized */
        if (!ret && lastfail)
            ret = lastfail;
        else if (!ret)
            ret = -ENETDOWN;
        /* but continue on, so we can debug cause */
    }

    enable_chip(dd);

    init_piobuf_state(dd);

done:
    if (!ret) {
        /* chip is OK for user apps; mark it as initialized */
        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
            ppd = dd->pport + pidx;
            /*
             * Set status even if port serdes is not initialized
             * so that diags will work.
             */
            *ppd->statusp |= QIB_STATUS_CHIP_PRESENT |
                QIB_STATUS_INITTED;
            if (!ppd->link_speed_enabled)
                continue;
            if (dd->flags & QIB_HAS_SEND_DMA)
                ret = qib_setup_sdma(ppd);
            timer_setup(&ppd->hol_timer, qib_hol_event, 0);
            ppd->hol_state = QIB_HOL_UP;
        }

        /* now we can enable all interrupts from the chip */
        dd->f_set_intr_state(dd, 1);

        /*
         * Setup to verify we get an interrupt, and fallback
         * to an alternate if necessary and possible.
         */
        mod_timer(&dd->intrchk_timer, jiffies + HZ/2);
        /* start stats retrieval timer */
        mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER);
    }

    /* if ret is non-zero, we probably should do some cleanup here... */
    return ret;
}

/*
 * These next two routines are placeholders in case we don't have per-arch
 * code for controlling write combining.  If explicit control of write
 * combining is not available, performance will probably be awful.
 */

int __attribute__((weak)) qib_enable_wc(struct qib_devdata *dd)
{
    return -EOPNOTSUPP;
}

void __attribute__((weak)) qib_disable_wc(struct qib_devdata *dd)
{
}

struct qib_devdata *qib_lookup(int unit)
{
    return xa_load(&qib_dev_table, unit);
}

/*
 * Stop the timers during unit shutdown, or after an error late
 * in initialization.
 */
static void qib_stop_timers(struct qib_devdata *dd)
{
    struct qib_pportdata *ppd;
    int pidx;

    if (dd->stats_timer.function)
        del_timer_sync(&dd->stats_timer);
    if (dd->intrchk_timer.function)
        del_timer_sync(&dd->intrchk_timer);
    for (pidx = 0; pidx < dd->num_pports; ++pidx) {
        ppd = dd->pport + pidx;
        if (ppd->hol_timer.function)
            del_timer_sync(&ppd->hol_timer);
        if (ppd->led_override_timer.function) {
            del_timer_sync(&ppd->led_override_timer);
            atomic_set(&ppd->led_override_timer_active, 0);
        }
        if (ppd->symerr_clear_timer.function)
            del_timer_sync(&ppd->symerr_clear_timer);
    }
}

/**
 * qib_shutdown_device - shut down a device
 * @dd: the qlogic_ib device
 *
 * This is called to make the device quiet when we are about to
 * unload the driver, and also when the device is administratively
 * disabled.  It does not free any data structures.
 * Everything it does has to be set up again by qib_init(dd, 1).
 */
static void qib_shutdown_device(struct qib_devdata *dd)
{
    struct qib_pportdata *ppd;
    unsigned pidx;

    if (dd->flags & QIB_SHUTDOWN)
        return;
    dd->flags |= QIB_SHUTDOWN;

    for (pidx = 0; pidx < dd->num_pports; ++pidx) {
        ppd = dd->pport + pidx;

        spin_lock_irq(&ppd->lflags_lock);
        ppd->lflags &= ~(QIBL_LINKDOWN | QIBL_LINKINIT |
                 QIBL_LINKARMED | QIBL_LINKACTIVE |
                 QIBL_LINKV);
        spin_unlock_irq(&ppd->lflags_lock);
        *ppd->statusp &= ~(QIB_STATUS_IB_CONF | QIB_STATUS_IB_READY);
    }
    dd->flags &= ~QIB_INITTED;

    /* mask interrupts, but not errors */
    dd->f_set_intr_state(dd, 0);

    for (pidx = 0; pidx < dd->num_pports; ++pidx) {
        ppd = dd->pport + pidx;
        dd->f_rcvctrl(ppd, QIB_RCVCTRL_TAILUPD_DIS |
                   QIB_RCVCTRL_CTXT_DIS |
                   QIB_RCVCTRL_INTRAVAIL_DIS |
                   QIB_RCVCTRL_PKEY_ENB, -1);
        /*
         * Gracefully stop all sends allowing any in progress to
         * trickle out first.
         */
        dd->f_sendctrl(ppd, QIB_SENDCTRL_CLEAR);
    }

    /*
     * Enough for anything that's going to trickle out to have actually
     * done so.
     */
    udelay(20);

    for (pidx = 0; pidx < dd->num_pports; ++pidx) {
        ppd = dd->pport + pidx;
        dd->f_setextled(ppd, 0); /* make sure LEDs are off */

        if (dd->flags & QIB_HAS_SEND_DMA)
            qib_teardown_sdma(ppd);

        dd->f_sendctrl(ppd, QIB_SENDCTRL_AVAIL_DIS |
                    QIB_SENDCTRL_SEND_DIS);
        /*
         * Clear SerdesEnable.
         * We can't count on interrupts since we are stopping.
         */
        dd->f_quiet_serdes(ppd);

        if (ppd->qib_wq) {
            destroy_workqueue(ppd->qib_wq);
            ppd->qib_wq = NULL;
        }
        qib_free_pportdata(ppd);
    }

}

/**
 * qib_free_ctxtdata - free a context's allocated data
 * @dd: the qlogic_ib device
 * @rcd: the ctxtdata structure
 *
 * free up any allocated data for a context
 * This should not touch anything that would affect a simultaneous
 * re-allocation of context data, because it is called after qib_mutex
 * is released (and can be called from reinit as well).
 * It should never change any chip state, or global driver state.
 */
void qib_free_ctxtdata(struct qib_devdata *dd, struct qib_ctxtdata *rcd)
{
    if (!rcd)
        return;

    if (rcd->rcvhdrq) {
        dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size,
                  rcd->rcvhdrq, rcd->rcvhdrq_phys);
        rcd->rcvhdrq = NULL;
        if (rcd->rcvhdrtail_kvaddr) {
            dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
                      rcd->rcvhdrtail_kvaddr,
                      rcd->rcvhdrqtailaddr_phys);
            rcd->rcvhdrtail_kvaddr = NULL;
        }
    }
    if (rcd->rcvegrbuf) {
        unsigned e;

        for (e = 0; e < rcd->rcvegrbuf_chunks; e++) {
            void *base = rcd->rcvegrbuf[e];
            size_t size = rcd->rcvegrbuf_size;

            dma_free_coherent(&dd->pcidev->dev, size,
                      base, rcd->rcvegrbuf_phys[e]);
        }
        kfree(rcd->rcvegrbuf);
        rcd->rcvegrbuf = NULL;
        kfree(rcd->rcvegrbuf_phys);
        rcd->rcvegrbuf_phys = NULL;
        rcd->rcvegrbuf_chunks = 0;
    }

    kfree(rcd->tid_pg_list);
    vfree(rcd->user_event_mask);
    vfree(rcd->subctxt_uregbase);
    vfree(rcd->subctxt_rcvegrbuf);
    vfree(rcd->subctxt_rcvhdr_base);
#ifdef CONFIG_DEBUG_FS
    kfree(rcd->opstats);
    rcd->opstats = NULL;
#endif
    kfree(rcd);
}

/*
 * Perform a PIO buffer bandwidth write test, to verify proper system
 * configuration.  Even when all the setup calls work, occasionally
 * BIOS or other issues can prevent write combining from working, or
 * can cause other bandwidth problems to the chip.
 *
 * This test simply writes the same buffer over and over again, and
 * measures close to the peak bandwidth to the chip (not testing
 * data bandwidth to the wire).   On chips that use an address-based
 * trigger to send packets to the wire, this is easy.  On chips that
 * use a count to trigger, we want to make sure that the packet doesn't
 * go out on the wire, or trigger flow control checks.
 */
static void qib_verify_pioperf(struct qib_devdata *dd)
{
    u32 pbnum, cnt, lcnt;
    u32 __iomem *piobuf;
    u32 *addr;
    u64 msecs, emsecs;

    piobuf = dd->f_getsendbuf(dd->pport, 0ULL, &pbnum);
    if (!piobuf) {
        qib_devinfo(dd->pcidev,
             "No PIObufs for checking perf, skipping\n");
        return;
    }

    /*
     * Enough to give us a reasonable test, less than piobuf size, and
     * likely multiple of store buffer length.
     */
    cnt = 1024;

    addr = vmalloc(cnt);
    if (!addr)
        goto done;

    preempt_disable();  /* we want reasonably accurate elapsed time */
    msecs = 1 + jiffies_to_msecs(jiffies);
    for (lcnt = 0; lcnt < 10000U; lcnt++) {
        /* wait until we cross msec boundary */
        if (jiffies_to_msecs(jiffies) >= msecs)
            break;
        udelay(1);
    }

    dd->f_set_armlaunch(dd, 0);

    /*
     * length 0, no dwords actually sent
     */
    writeq(0, piobuf);
    qib_flush_wc();

    /*
     * This is only roughly accurate, since even with preempt we
     * still take interrupts that could take a while.   Running for
     * >= 5 msec seems to get us "close enough" to accurate values.
     */
    msecs = jiffies_to_msecs(jiffies);
    for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) {
        qib_pio_copy(piobuf + 64, addr, cnt >> 2);
        emsecs = jiffies_to_msecs(jiffies) - msecs;
    }

    /* 1 GiB/sec, slightly over IB SDR line rate */
    if (lcnt < (emsecs * 1024U))
        qib_dev_err(dd,
                "Performance problem: bandwidth to PIO buffers is only %u MiB/sec\n",
                lcnt / (u32) emsecs);

    preempt_enable();

    vfree(addr);

done:
    /* disarm piobuf, so it's available again */
    dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(pbnum));
    qib_sendbuf_done(dd, pbnum);
    dd->f_set_armlaunch(dd, 1);
}
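
/*
 * The pass/fail math above: each loop iteration copies cnt = 1024 bytes
 * (1 KiB), so lcnt iterations in emsecs milliseconds must satisfy
 * lcnt >= emsecs * 1024, i.e. 1024 one-KiB copies per millisecond,
 * roughly 1 GiB/s and slightly above IB SDR line rate.  On failure the
 * message quotes lcnt / emsecs (KiB per ms, approximately MiB/s).
 */
#if 0	/* illustrative sketch only; restates the check above */
static void example_pio_bw_check(u32 lcnt, u64 emsecs)
{
	/* 1 KiB per copy: copies-per-ms is approximately MiB/s */
	if (lcnt < (emsecs * 1024U))
		pr_err("only %u MiB/s\n", lcnt / (u32) emsecs);
}
#endif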

void qib_free_devdata(struct qib_devdata *dd)
{
    unsigned long flags;

    xa_lock_irqsave(&qib_dev_table, flags);
    __xa_erase(&qib_dev_table, dd->unit);
    xa_unlock_irqrestore(&qib_dev_table, flags);

#ifdef CONFIG_DEBUG_FS
    qib_dbg_ibdev_exit(&dd->verbs_dev);
#endif
    free_percpu(dd->int_counter);
    rvt_dealloc_device(&dd->verbs_dev.rdi);
}

u64 qib_int_counter(struct qib_devdata *dd)
{
    int cpu;
    u64 int_counter = 0;

    for_each_possible_cpu(cpu)
        int_counter += *per_cpu_ptr(dd->int_counter, cpu);
    return int_counter;
}

u64 qib_sps_ints(void)
{
    unsigned long index, flags;
    struct qib_devdata *dd;
    u64 sps_ints = 0;

    xa_lock_irqsave(&qib_dev_table, flags);
    xa_for_each(&qib_dev_table, index, dd) {
        sps_ints += qib_int_counter(dd);
    }
    xa_unlock_irqrestore(&qib_dev_table, flags);
    return sps_ints;
}

/*
 * Allocate our primary per-unit data structure.  Must be done via the
 * verbs allocator, because the verbs cleanup process both cleans up
 * and frees the data structure.
 * "extra" is for chip-specific data.
 */
struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra)
{
    struct qib_devdata *dd;
    int ret, nports;

    /* extra is sizeof(struct qib_pportdata) times the number of ports */
    nports = extra / sizeof(struct qib_pportdata);
    dd = (struct qib_devdata *)rvt_alloc_device(sizeof(*dd) + extra,
                            nports);
    if (!dd)
        return ERR_PTR(-ENOMEM);

    ret = xa_alloc_irq(&qib_dev_table, &dd->unit, dd, xa_limit_32b,
            GFP_KERNEL);
    if (ret < 0) {
        qib_early_err(&pdev->dev,
                  "Could not allocate unit ID: error %d\n", -ret);
        goto bail;
    }
    rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s%d", "qib", dd->unit);

    dd->int_counter = alloc_percpu(u64);
    if (!dd->int_counter) {
        ret = -ENOMEM;
        qib_early_err(&pdev->dev,
                  "Could not allocate per-cpu int_counter\n");
        goto bail;
    }

    if (!qib_cpulist_count) {
        u32 count = num_online_cpus();

        qib_cpulist = bitmap_zalloc(count, GFP_KERNEL);
        if (qib_cpulist)
            qib_cpulist_count = count;
    }
#ifdef CONFIG_DEBUG_FS
    qib_dbg_ibdev_init(&dd->verbs_dev);
#endif
    return dd;
bail:
    if (!list_empty(&dd->list))
        list_del_init(&dd->list);
    rvt_dealloc_device(&dd->verbs_dev.rdi);
    return ERR_PTR(ret);
}

/*
 * Called from freeze mode handlers, and from PCI error
 * reporting code.  Should be paranoid about state of
 * system and data structures.
 */
void qib_disable_after_error(struct qib_devdata *dd)
{
    if (dd->flags & QIB_INITTED) {
        u32 pidx;

        dd->flags &= ~QIB_INITTED;
        if (dd->pport)
            for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                struct qib_pportdata *ppd;

                ppd = dd->pport + pidx;
                if (dd->flags & QIB_PRESENT) {
                    qib_set_linkstate(ppd,
                        QIB_IB_LINKDOWN_DISABLE);
                    dd->f_setextled(ppd, 0);
                }
                *ppd->statusp &= ~QIB_STATUS_IB_READY;
            }
    }

    /*
     * Mark as having had an error for driver, and also
     * for /sys and status word mapped to user programs.
     * This marks unit as not usable, until reset.
     */
    if (dd->devstatusp)
        *dd->devstatusp |= QIB_STATUS_HWERROR;
}

static void qib_remove_one(struct pci_dev *);
static int qib_init_one(struct pci_dev *, const struct pci_device_id *);
static void qib_shutdown_one(struct pci_dev *);

#define DRIVER_LOAD_MSG "Intel " QIB_DRV_NAME " loaded: "
#define PFX QIB_DRV_NAME ": "

static const struct pci_device_id qib_pci_tbl[] = {
    { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_QLOGIC_IB_6120) },
    { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7220) },
    { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7322) },
    { 0, }
};

MODULE_DEVICE_TABLE(pci, qib_pci_tbl);

static struct pci_driver qib_driver = {
    .name = QIB_DRV_NAME,
    .probe = qib_init_one,
    .remove = qib_remove_one,
    .shutdown = qib_shutdown_one,
    .id_table = qib_pci_tbl,
    .err_handler = &qib_pci_err_handler,
};

#ifdef CONFIG_INFINIBAND_QIB_DCA

static int qib_notify_dca(struct notifier_block *, unsigned long, void *);
static struct notifier_block dca_notifier = {
    .notifier_call  = qib_notify_dca,
    .next           = NULL,
    .priority       = 0
};

static int qib_notify_dca_device(struct device *device, void *data)
{
    struct qib_devdata *dd = dev_get_drvdata(device);
    unsigned long event = *(unsigned long *)data;

    return dd->f_notify_dca(dd, event);
}

static int qib_notify_dca(struct notifier_block *nb, unsigned long event,
                      void *p)
{
    int rval;

    rval = driver_for_each_device(&qib_driver.driver, NULL,
                      &event, qib_notify_dca_device);
    return rval ? NOTIFY_BAD : NOTIFY_DONE;
}

#endif

/*
 * Do all the generic driver unit- and chip-independent memory
 * allocation and initialization.
 */
static int __init qib_ib_init(void)
{
    int ret;

    ret = qib_dev_init();
    if (ret)
        goto bail;

    /*
     * These must be called before the driver is registered with
     * the PCI subsystem.
     */
#ifdef CONFIG_INFINIBAND_QIB_DCA
    dca_register_notify(&dca_notifier);
#endif
#ifdef CONFIG_DEBUG_FS
    qib_dbg_init();
#endif
    ret = pci_register_driver(&qib_driver);
    if (ret < 0) {
        pr_err("Unable to register driver: error %d\n", -ret);
        goto bail_dev;
    }

    /* not fatal if it doesn't work */
    if (qib_init_qibfs())
        pr_err("Unable to register ipathfs\n");
    goto bail; /* all OK */

bail_dev:
#ifdef CONFIG_INFINIBAND_QIB_DCA
    dca_unregister_notify(&dca_notifier);
#endif
#ifdef CONFIG_DEBUG_FS
    qib_dbg_exit();
#endif
    qib_dev_cleanup();
bail:
    return ret;
}

module_init(qib_ib_init);

/*
 * Do the non-unit driver cleanup, memory free, etc. at unload.
 */
static void __exit qib_ib_cleanup(void)
{
    int ret;

    ret = qib_exit_qibfs();
    if (ret)
        pr_err(
            "Unable to cleanup counter filesystem: error %d\n",
            -ret);

#ifdef CONFIG_INFINIBAND_QIB_DCA
    dca_unregister_notify(&dca_notifier);
#endif
    pci_unregister_driver(&qib_driver);
#ifdef CONFIG_DEBUG_FS
    qib_dbg_exit();
#endif

    qib_cpulist_count = 0;
    bitmap_free(qib_cpulist);

    WARN_ON(!xa_empty(&qib_dev_table));
    qib_dev_cleanup();
}

module_exit(qib_ib_cleanup);

/* this can only be called after a successful initialization */
static void cleanup_device_data(struct qib_devdata *dd)
{
    int ctxt;
    int pidx;
    struct qib_ctxtdata **tmp;
    unsigned long flags;

    /* users can't do anything more with chip */
    for (pidx = 0; pidx < dd->num_pports; ++pidx) {
        if (dd->pport[pidx].statusp)
            *dd->pport[pidx].statusp &= ~QIB_STATUS_CHIP_PRESENT;

        spin_lock(&dd->pport[pidx].cc_shadow_lock);

        kfree(dd->pport[pidx].congestion_entries);
        dd->pport[pidx].congestion_entries = NULL;
        kfree(dd->pport[pidx].ccti_entries);
        dd->pport[pidx].ccti_entries = NULL;
        kfree(dd->pport[pidx].ccti_entries_shadow);
        dd->pport[pidx].ccti_entries_shadow = NULL;
        kfree(dd->pport[pidx].congestion_entries_shadow);
        dd->pport[pidx].congestion_entries_shadow = NULL;

        spin_unlock(&dd->pport[pidx].cc_shadow_lock);
    }

    qib_disable_wc(dd);

    if (dd->pioavailregs_dma) {
        dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
                  (void *) dd->pioavailregs_dma,
                  dd->pioavailregs_phys);
        dd->pioavailregs_dma = NULL;
    }

    if (dd->pageshadow) {
        struct page **tmpp = dd->pageshadow;
        dma_addr_t *tmpd = dd->physshadow;
        int i;

        for (ctxt = 0; ctxt < dd->cfgctxts; ctxt++) {
            int ctxt_tidbase = ctxt * dd->rcvtidcnt;
            int maxtid = ctxt_tidbase + dd->rcvtidcnt;

            for (i = ctxt_tidbase; i < maxtid; i++) {
                if (!tmpp[i])
                    continue;
                dma_unmap_page(&dd->pcidev->dev, tmpd[i],
                           PAGE_SIZE, DMA_FROM_DEVICE);
                qib_release_user_pages(&tmpp[i], 1);
                tmpp[i] = NULL;
            }
        }

        dd->pageshadow = NULL;
        vfree(tmpp);
        dd->physshadow = NULL;
        vfree(tmpd);
    }

    /*
     * Free any resources still in use (usually just kernel contexts)
     * at unload; we do for ctxtcnt, because that's what we allocate.
     * We acquire lock to be really paranoid that rcd isn't being
     * accessed from some interrupt-related code (that should not happen,
     * but best to be sure).
     */
    spin_lock_irqsave(&dd->uctxt_lock, flags);
    tmp = dd->rcd;
    dd->rcd = NULL;
    spin_unlock_irqrestore(&dd->uctxt_lock, flags);
    for (ctxt = 0; tmp && ctxt < dd->ctxtcnt; ctxt++) {
        struct qib_ctxtdata *rcd = tmp[ctxt];

        tmp[ctxt] = NULL; /* debugging paranoia */
        qib_free_ctxtdata(dd, rcd);
    }
    kfree(tmp);
}

/*
 * Clean up on unit shutdown, or error during unit load after
 * successful initialization.
 */
static void qib_postinit_cleanup(struct qib_devdata *dd)
{
    /*
     * Clean up chip-specific stuff.
     * We check for NULL here, because it's outside
     * the kregbase check, and we need to call it
     * after the free_irq.  Thus it's possible that
     * the function pointers were never initialized.
     */
    if (dd->f_cleanup)
        dd->f_cleanup(dd);

    qib_pcie_ddcleanup(dd);

    cleanup_device_data(dd);

    qib_free_devdata(dd);
}

static int qib_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
    int ret, j, pidx, initfail;
    struct qib_devdata *dd = NULL;

    ret = qib_pcie_init(pdev, ent);
    if (ret)
        goto bail;

    /*
     * Do device-specific initialization, function table setup, dd
     * allocation, etc.
     */
    switch (ent->device) {
    case PCI_DEVICE_ID_QLOGIC_IB_6120:
#ifdef CONFIG_PCI_MSI
        dd = qib_init_iba6120_funcs(pdev, ent);
#else
        qib_early_err(&pdev->dev,
            "Intel PCIE device 0x%x cannot work if CONFIG_PCI_MSI is not enabled\n",
            ent->device);
        dd = ERR_PTR(-ENODEV);
#endif
        break;

    case PCI_DEVICE_ID_QLOGIC_IB_7220:
        dd = qib_init_iba7220_funcs(pdev, ent);
        break;

    case PCI_DEVICE_ID_QLOGIC_IB_7322:
        dd = qib_init_iba7322_funcs(pdev, ent);
        break;

    default:
        qib_early_err(&pdev->dev,
            "Failing on unknown Intel deviceid 0x%x\n",
            ent->device);
        ret = -ENODEV;
    }

    if (IS_ERR(dd))
        ret = PTR_ERR(dd);
    if (ret)
        goto bail; /* error already printed */

    ret = qib_create_workqueues(dd);
    if (ret)
        goto bail;

    /* do the generic initialization */
    initfail = qib_init(dd, 0);

    ret = qib_register_ib_device(dd);

    /*
     * Now ready for use.  This should be cleared whenever we
     * detect a reset, or initiate one.  On earlier failure,
     * we still create devices, so diags, etc., can be used
     * to determine the cause of the problem.
     */
    if (!qib_mini_init && !initfail && !ret)
        dd->flags |= QIB_INITTED;

    j = qib_device_create(dd);
    if (j)
        qib_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
    j = qibfs_add(dd);
    if (j)
        qib_dev_err(dd, "Failed filesystem setup for counters: %d\n",
                -j);

    if (qib_mini_init || initfail || ret) {
        qib_stop_timers(dd);
        flush_workqueue(ib_wq);
        for (pidx = 0; pidx < dd->num_pports; ++pidx)
            dd->f_quiet_serdes(dd->pport + pidx);
        if (qib_mini_init)
            goto bail;
        if (!j) {
            (void) qibfs_remove(dd);
            qib_device_remove(dd);
        }
        if (!ret)
            qib_unregister_ib_device(dd);
        qib_postinit_cleanup(dd);
        if (initfail)
            ret = initfail;
        goto bail;
    }

    ret = qib_enable_wc(dd);
    if (ret) {
        qib_dev_err(dd,
            "Write combining not enabled (err %d): performance may be poor\n",
            -ret);
        ret = 0;
    }

    qib_verify_pioperf(dd);
bail:
    return ret;
}

static void qib_remove_one(struct pci_dev *pdev)
{
    struct qib_devdata *dd = pci_get_drvdata(pdev);
    int ret;

    /* unregister from IB core */
    qib_unregister_ib_device(dd);

    /*
     * Disable the IB link, disable interrupts on the device,
     * clear dma engines, etc.
     */
    if (!qib_mini_init)
        qib_shutdown_device(dd);

    qib_stop_timers(dd);

    /* wait until all of our (qsfp) queue_work() calls complete */
    flush_workqueue(ib_wq);

    ret = qibfs_remove(dd);
    if (ret)
        qib_dev_err(dd, "Failed counters filesystem cleanup: %d\n",
                -ret);

    qib_device_remove(dd);

    qib_postinit_cleanup(dd);
}

static void qib_shutdown_one(struct pci_dev *pdev)
{
    struct qib_devdata *dd = pci_get_drvdata(pdev);

    qib_shutdown_device(dd);
}

/**
 * qib_create_rcvhdrq - create a receive header queue
 * @dd: the qlogic_ib device
 * @rcd: the context data
 *
 * This must be contiguous memory (from an i/o perspective), and must be
 * DMA'able (which means for some systems, it will go through an IOMMU,
 * or be forced into a low address range).
 */
int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd)
{
    unsigned amt;
    int old_node_id;

    if (!rcd->rcvhdrq) {
        dma_addr_t phys_hdrqtail;
        gfp_t gfp_flags;

        amt = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize *
                sizeof(u32), PAGE_SIZE);
        gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
            GFP_USER : GFP_KERNEL;

        old_node_id = dev_to_node(&dd->pcidev->dev);
        set_dev_node(&dd->pcidev->dev, rcd->node_id);
        rcd->rcvhdrq = dma_alloc_coherent(
            &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
            gfp_flags | __GFP_COMP);
        set_dev_node(&dd->pcidev->dev, old_node_id);

        if (!rcd->rcvhdrq) {
            qib_dev_err(dd,
                "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
                amt, rcd->ctxt);
            goto bail;
        }

        if (rcd->ctxt >= dd->first_user_ctxt) {
            rcd->user_event_mask = vmalloc_user(PAGE_SIZE);
            if (!rcd->user_event_mask)
                goto bail_free_hdrq;
        }

        if (!(dd->flags & QIB_NODMA_RTAIL)) {
            set_dev_node(&dd->pcidev->dev, rcd->node_id);
            rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(
                &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
                gfp_flags);
            set_dev_node(&dd->pcidev->dev, old_node_id);
            if (!rcd->rcvhdrtail_kvaddr)
                goto bail_free;
            rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
        }

        rcd->rcvhdrq_size = amt;
    }

    /* clear for security and sanity on each use */
    memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size);
    if (rcd->rcvhdrtail_kvaddr)
        memset(rcd->rcvhdrtail_kvaddr, 0, PAGE_SIZE);
    return 0;

bail_free:
    qib_dev_err(dd,
        "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
        rcd->ctxt);
    vfree(rcd->user_event_mask);
    rcd->user_event_mask = NULL;
bail_free_hdrq:
    dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
              rcd->rcvhdrq_phys);
    rcd->rcvhdrq = NULL;
bail:
    return -ENOMEM;
}
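
/*
 * A worked example of the rcvhdrq sizing above, with hypothetical chip
 * parameters rcvhdrcnt = 1024 and rcvhdrentsize = 8 (32-bit words per
 * entry): 1024 * 8 * sizeof(u32) = 32768 bytes, already a multiple of
 * a 4 KiB PAGE_SIZE, so ALIGN() leaves amt = 32768 (8 pages).
 */
#if 0	/* illustrative sketch only; parameters are assumptions */
static size_t example_rcvhdrq_bytes(u32 rcvhdrcnt, u32 rcvhdrentsize)
{
	return ALIGN(rcvhdrcnt * rcvhdrentsize * sizeof(u32), PAGE_SIZE);
}
#endif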

/**
 * qib_setup_eagerbufs - allocate eager buffers, both kernel and user contexts.
 * @rcd: the context we are setting up.
 *
 * Allocate the eager TID buffers and program them into the chip.
 * They are no longer completely contiguous, we do multiple allocation
 * calls.  Otherwise we get the OOM code involved, by asking for too
 * much per call, with disastrous results on some kernels.
 */
int qib_setup_eagerbufs(struct qib_ctxtdata *rcd)
{
    struct qib_devdata *dd = rcd->dd;
    unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff;
    size_t size;
    gfp_t gfp_flags;
    int old_node_id;

    /*
     * GFP_USER, but without GFP_FS, so the buffer cache can be
     * coalesced (we hope); otherwise, even at order 4, heavy
     * filesystem activity makes these allocations fail.  We can
     * also use compound pages.
     */
    gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;

    egrcnt = rcd->rcvegrcnt;
    egroff = rcd->rcvegr_tid_base;
    egrsize = dd->rcvegrbufsize;

    chunk = rcd->rcvegrbuf_chunks;
    egrperchunk = rcd->rcvegrbufs_perchunk;
    size = rcd->rcvegrbuf_size;
    if (!rcd->rcvegrbuf) {
        rcd->rcvegrbuf =
            kcalloc_node(chunk, sizeof(rcd->rcvegrbuf[0]),
                     GFP_KERNEL, rcd->node_id);
        if (!rcd->rcvegrbuf)
            goto bail;
    }
    if (!rcd->rcvegrbuf_phys) {
        rcd->rcvegrbuf_phys =
            kmalloc_array_node(chunk,
                       sizeof(rcd->rcvegrbuf_phys[0]),
                       GFP_KERNEL, rcd->node_id);
        if (!rcd->rcvegrbuf_phys)
            goto bail_rcvegrbuf;
    }
    for (e = 0; e < rcd->rcvegrbuf_chunks; e++) {
        if (rcd->rcvegrbuf[e])
            continue;

        old_node_id = dev_to_node(&dd->pcidev->dev);
        set_dev_node(&dd->pcidev->dev, rcd->node_id);
        rcd->rcvegrbuf[e] =
            dma_alloc_coherent(&dd->pcidev->dev, size,
                       &rcd->rcvegrbuf_phys[e],
                       gfp_flags);
        set_dev_node(&dd->pcidev->dev, old_node_id);
        if (!rcd->rcvegrbuf[e])
            goto bail_rcvegrbuf_phys;
    }

    rcd->rcvegr_phys = rcd->rcvegrbuf_phys[0];

    for (e = chunk = 0; chunk < rcd->rcvegrbuf_chunks; chunk++) {
        dma_addr_t pa = rcd->rcvegrbuf_phys[chunk];
        unsigned i;

        /* clear for security and sanity on each use */
        memset(rcd->rcvegrbuf[chunk], 0, size);

        for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) {
            dd->f_put_tid(dd, e + egroff +
                      (u64 __iomem *)
                      ((char __iomem *)
                       dd->kregbase +
                       dd->rcvegrbase),
                      RCVHQ_RCV_TYPE_EAGER, pa);
            pa += egrsize;
        }
        cond_resched(); /* don't hog the cpu */
    }

    return 0;

bail_rcvegrbuf_phys:
    for (e = 0; e < rcd->rcvegrbuf_chunks && rcd->rcvegrbuf[e]; e++)
        dma_free_coherent(&dd->pcidev->dev, size,
                  rcd->rcvegrbuf[e], rcd->rcvegrbuf_phys[e]);
    kfree(rcd->rcvegrbuf_phys);
    rcd->rcvegrbuf_phys = NULL;
bail_rcvegrbuf:
    kfree(rcd->rcvegrbuf);
    rcd->rcvegrbuf = NULL;
bail:
    return -ENOMEM;
}

/*
 * Note: Changes to this routine should be mirrored
 * for the diagnostics routine qib_remap_ioaddr32().
 * There is also related code for VL15 buffers in qib_init_7322_variables().
 * The teardown code that unmaps is in qib_pcie_ddcleanup().
 */
int init_chip_wc_pat(struct qib_devdata *dd, u32 vl15buflen)
{
    u64 __iomem *qib_kregbase = NULL;
    void __iomem *qib_piobase = NULL;
    u64 __iomem *qib_userbase = NULL;
    u64 qib_kreglen;
    u64 qib_pio2koffset = dd->piobufbase & 0xffffffff;
    u64 qib_pio4koffset = dd->piobufbase >> 32;
    u64 qib_pio2klen = dd->piobcnt2k * dd->palign;
    u64 qib_pio4klen = dd->piobcnt4k * dd->align4k;
    u64 qib_physaddr = dd->physaddr;
    u64 qib_piolen;
    u64 qib_userlen = 0;

    /*
     * Free the old mapping because the kernel will try to reuse the
     * old mapping and not create a new mapping with the
     * write combining attribute.
     */
    iounmap(dd->kregbase);
    dd->kregbase = NULL;

    /*
     * Assumes chip address space looks like:
     *  - kregs + sregs + cregs + uregs (in any order)
     *  - piobufs (2K and 4K bufs in either order)
     * or:
     *  - kregs + sregs + cregs (in any order)
     *  - piobufs (2K and 4K bufs in either order)
     *  - uregs
     */
    if (dd->piobcnt4k == 0) {
        qib_kreglen = qib_pio2koffset;
        qib_piolen = qib_pio2klen;
    } else if (qib_pio2koffset < qib_pio4koffset) {
        qib_kreglen = qib_pio2koffset;
        qib_piolen = qib_pio4koffset + qib_pio4klen - qib_kreglen;
    } else {
        qib_kreglen = qib_pio4koffset;
        qib_piolen = qib_pio2koffset + qib_pio2klen - qib_kreglen;
    }
    qib_piolen += vl15buflen;
    /* Map just the configured ports (not all hw ports) */
    if (dd->uregbase > qib_kreglen)
        qib_userlen = dd->ureg_align * dd->cfgctxts;

    /* Sanity checks passed, now create the new mappings */
    qib_kregbase = ioremap(qib_physaddr, qib_kreglen);
    if (!qib_kregbase)
        goto bail;

    qib_piobase = ioremap_wc(qib_physaddr + qib_kreglen, qib_piolen);
    if (!qib_piobase)
        goto bail_kregbase;

    if (qib_userlen) {
        qib_userbase = ioremap(qib_physaddr + dd->uregbase,
                           qib_userlen);
        if (!qib_userbase)
            goto bail_piobase;
    }

    dd->kregbase = qib_kregbase;
    dd->kregend = (u64 __iomem *)
        ((char __iomem *) qib_kregbase + qib_kreglen);
    dd->piobase = qib_piobase;
    dd->pio2kbase = (void __iomem *)
        (((char __iomem *) dd->piobase) +
         qib_pio2koffset - qib_kreglen);
    if (dd->piobcnt4k)
        dd->pio4kbase = (void __iomem *)
            (((char __iomem *) dd->piobase) +
             qib_pio4koffset - qib_kreglen);
    if (qib_userlen)
        /* ureg will now be accessed relative to dd->userbase */
        dd->userbase = qib_userbase;
    return 0;

bail_piobase:
    iounmap(qib_piobase);
bail_kregbase:
    iounmap(qib_kregbase);
bail:
    return -ENOMEM;
}
1798 }