0001 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
0002 /*
0003  * Copyright(c) 2015 - 2020 Intel Corporation.
0004  * Copyright(c) 2021 Cornelis Networks.
0005  */
0006 
0007 #include <linux/pci.h>
0008 #include <linux/netdevice.h>
0009 #include <linux/vmalloc.h>
0010 #include <linux/delay.h>
0011 #include <linux/xarray.h>
0012 #include <linux/module.h>
0013 #include <linux/printk.h>
0014 #include <linux/hrtimer.h>
0015 #include <linux/bitmap.h>
0016 #include <linux/numa.h>
0017 #include <rdma/rdma_vt.h>
0018 
0019 #include "hfi.h"
0020 #include "device.h"
0021 #include "common.h"
0022 #include "trace.h"
0023 #include "mad.h"
0024 #include "sdma.h"
0025 #include "debugfs.h"
0026 #include "verbs.h"
0027 #include "aspm.h"
0028 #include "affinity.h"
0029 #include "vnic.h"
0030 #include "exp_rcv.h"
0031 #include "netdev.h"
0032 
0033 #undef pr_fmt
0034 #define pr_fmt(fmt) DRIVER_NAME ": " fmt
0035 
0036 /*
0037  * minimum buffers we want to have available per user context, after the driver's own needs are met
0038  */
0039 #define HFI1_MIN_USER_CTXT_BUFCNT 7
0040 
0041 #define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
0042 #define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
0043 
0044 #define NUM_IB_PORTS 1
0045 
0046 /*
0047  * Number of user receive contexts we are configured to use (to allow for more
0048  * pio buffers per ctxt, etc.)  Zero means use one user context per CPU.
0049  */
0050 int num_user_contexts = -1;
0051 module_param_named(num_user_contexts, num_user_contexts, int, 0444);
0052 MODULE_PARM_DESC(
0053     num_user_contexts, "Set max number of user contexts to use (default: -1 will use the real (non-HT) CPU count)");
0054 
0055 uint krcvqs[RXE_NUM_DATA_VL];
0056 int krcvqsset;
0057 module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO);
0058 MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL");
0059 
0060 /* computed based on above array */
0061 unsigned long n_krcvqs;
0062 
0063 static unsigned hfi1_rcvarr_split = 25;
0064 module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
0065 MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");
0066 
0067 static uint eager_buffer_size = (8 << 20); /* 8MB */
0068 module_param(eager_buffer_size, uint, S_IRUGO);
0069 MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 8MB");
0070 
0071 static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */
0072 module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
0073 MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");
0074 
0075 static uint hfi1_hdrq_entsize = 32;
0076 module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, 0444);
0077 MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)");
0078 
0079 unsigned int user_credit_return_threshold = 33; /* default is 33% */
0080 module_param(user_credit_return_threshold, uint, S_IRUGO);
0081 MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits pass this many blocks (in percent of allocated blocks, 0 is off)");
0082 
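     /*
      * hfi1_dev_table maps a unit number to its hfi1_devdata.  Entries are
      * allocated with xa_alloc_irq() in hfi1_alloc_devdata(), looked up via
      * hfi1_lookup(), and erased under xa_lock_irqsave() in
      * hfi1_free_devdata(), hence the XA_FLAGS_LOCK_IRQ flag.
      */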
0083 DEFINE_XARRAY_FLAGS(hfi1_dev_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
0084 
0085 static int hfi1_create_kctxt(struct hfi1_devdata *dd,
0086                  struct hfi1_pportdata *ppd)
0087 {
0088     struct hfi1_ctxtdata *rcd;
0089     int ret;
0090 
0091     /* Control context has to be always 0 */
0092     BUILD_BUG_ON(HFI1_CTRL_CTXT != 0);
0093 
0094     ret = hfi1_create_ctxtdata(ppd, dd->node, &rcd);
0095     if (ret < 0) {
0096         dd_dev_err(dd, "Kernel receive context allocation failed\n");
0097         return ret;
0098     }
0099 
0100     /*
0101      * Set up the kernel context flags here and now because they use
0102      * default values for all receive side memories.  User contexts will
0103      * be handled as they are created.
0104      */
0105     rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
0106         HFI1_CAP_KGET(NODROP_RHQ_FULL) |
0107         HFI1_CAP_KGET(NODROP_EGR_FULL) |
0108         HFI1_CAP_KGET(DMA_RTAIL);
0109 
0110     /* Control context must use DMA_RTAIL */
0111     if (rcd->ctxt == HFI1_CTRL_CTXT)
0112         rcd->flags |= HFI1_CAP_DMA_RTAIL;
0113     rcd->fast_handler = get_dma_rtail_setting(rcd) ?
0114                 handle_receive_interrupt_dma_rtail :
0115                 handle_receive_interrupt_nodma_rtail;
0116 
0117     hfi1_set_seq_cnt(rcd, 1);
0118 
0119     rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
0120     if (!rcd->sc) {
0121         dd_dev_err(dd, "Kernel send context allocation failed\n");
0122         return -ENOMEM;
0123     }
0124     hfi1_init_ctxt(rcd->sc);
0125 
0126     return 0;
0127 }
0128 
0129 /*
0130  * Create the receive context array and one or more kernel contexts
0131  */
0132 int hfi1_create_kctxts(struct hfi1_devdata *dd)
0133 {
0134     u16 i;
0135     int ret;
0136 
0137     dd->rcd = kcalloc_node(dd->num_rcv_contexts, sizeof(*dd->rcd),
0138                    GFP_KERNEL, dd->node);
0139     if (!dd->rcd)
0140         return -ENOMEM;
0141 
0142     for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) {
0143         ret = hfi1_create_kctxt(dd, dd->pport);
0144         if (ret)
0145             goto bail;
0146     }
0147 
0148     return 0;
0149 bail:
0150     for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i)
0151         hfi1_free_ctxt(dd->rcd[i]);
0152 
0153     /* All the contexts should be freed, free the array */
0154     kfree(dd->rcd);
0155     dd->rcd = NULL;
0156     return ret;
0157 }
0158 
0159 /*
0160  * Helper routines for the receive context reference count (rcd and uctxt).
0161  */
0162 static void hfi1_rcd_init(struct hfi1_ctxtdata *rcd)
0163 {
0164     kref_init(&rcd->kref);
0165 }
0166 
0167 /**
0168  * hfi1_rcd_free - clean up when the reference count reaches zero.
0169  * @kref: pointer to an initialized rcd data structure
0170  *
0171  */
0172 static void hfi1_rcd_free(struct kref *kref)
0173 {
0174     unsigned long flags;
0175     struct hfi1_ctxtdata *rcd =
0176         container_of(kref, struct hfi1_ctxtdata, kref);
0177 
0178     spin_lock_irqsave(&rcd->dd->uctxt_lock, flags);
0179     rcd->dd->rcd[rcd->ctxt] = NULL;
0180     spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags);
0181 
0182     hfi1_free_ctxtdata(rcd->dd, rcd);
0183 
0184     kfree(rcd);
0185 }
0186 
0187 /**
0188  * hfi1_rcd_put - decrement reference for rcd
0189  * @rcd: pointer to an initialized rcd data structure
0190  *
0191  * Use this to put a reference after the init.
0192  */
0193 int hfi1_rcd_put(struct hfi1_ctxtdata *rcd)
0194 {
0195     if (rcd)
0196         return kref_put(&rcd->kref, hfi1_rcd_free);
0197 
0198     return 0;
0199 }
0200 
0201 /**
0202  * hfi1_rcd_get - increment reference for rcd
0203  * @rcd: pointer to an initialized rcd data structure
0204  *
0205  * Use this to get a reference after the init.
0206  *
0207  * Return: reflects kref_get_unless_zero(), which returns non-zero on a
0208  * successful increment, otherwise 0.
0209  */
0210 int hfi1_rcd_get(struct hfi1_ctxtdata *rcd)
0211 {
0212     return kref_get_unless_zero(&rcd->kref);
0213 }
0214 
0215 /**
0216  * allocate_rcd_index - allocate an rcd index from the rcd array
0217  * @dd: pointer to a valid devdata structure
0218  * @rcd: rcd data structure to assign
0219  * @index: pointer to index that is allocated
0220  *
0221  * Find an empty index in the rcd array, and assign the given rcd to it.
0222  * If the array is full, -EBUSY is returned.
0223  *
0224  */
0225 static int allocate_rcd_index(struct hfi1_devdata *dd,
0226                   struct hfi1_ctxtdata *rcd, u16 *index)
0227 {
0228     unsigned long flags;
0229     u16 ctxt;
0230 
0231     spin_lock_irqsave(&dd->uctxt_lock, flags);
0232     for (ctxt = 0; ctxt < dd->num_rcv_contexts; ctxt++)
0233         if (!dd->rcd[ctxt])
0234             break;
0235 
0236     if (ctxt < dd->num_rcv_contexts) {
0237         rcd->ctxt = ctxt;
0238         dd->rcd[ctxt] = rcd;
0239         hfi1_rcd_init(rcd);
0240     }
0241     spin_unlock_irqrestore(&dd->uctxt_lock, flags);
0242 
0243     if (ctxt >= dd->num_rcv_contexts)
0244         return -EBUSY;
0245 
0246     *index = ctxt;
0247 
0248     return 0;
0249 }
0250 
0251 /**
0252  * hfi1_rcd_get_by_index_safe - validate the ctxt index before accessing the
0253  * array
0254  * @dd: pointer to a valid devdata structure
0255  * @ctxt: the index of a possible rcd
0256  *
0257  * This is a wrapper for hfi1_rcd_get_by_index() to validate that the given
0258  * ctxt index is valid.
0259  *
0260  * The caller is responsible for making the _put().
0261  *
0262  */
0263 struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd,
0264                          u16 ctxt)
0265 {
0266     if (ctxt < dd->num_rcv_contexts)
0267         return hfi1_rcd_get_by_index(dd, ctxt);
0268 
0269     return NULL;
0270 }
0271 
0272 /**
0273  * hfi1_rcd_get_by_index - get by index
0274  * @dd: pointer to a valid devdata structure
0275  * @ctxt: the index of a possible rcd
0276  *
0277  * We need to protect access to the rcd array.  If access is needed to
0278  * one or more indices, take the protecting spinlock and then increment the
0279  * kref.
0280  *
0281  * The caller is responsible for making the _put().
0282  *
0283  */
0284 struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt)
0285 {
0286     unsigned long flags;
0287     struct hfi1_ctxtdata *rcd = NULL;
0288 
0289     spin_lock_irqsave(&dd->uctxt_lock, flags);
0290     if (dd->rcd[ctxt]) {
0291         rcd = dd->rcd[ctxt];
0292         if (!hfi1_rcd_get(rcd))
0293             rcd = NULL;
0294     }
0295     spin_unlock_irqrestore(&dd->uctxt_lock, flags);
0296 
0297     return rcd;
0298 }
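     /*
      * Typical use of the reference helpers above (see init_after_reset()
      * and enable_chip() below for in-tree examples):
      *
      *     rcd = hfi1_rcd_get_by_index(dd, i);
      *     if (rcd) {
      *             ... use rcd ...
      *             hfi1_rcd_put(rcd);
      *     }
      */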
0299 
0300 /*
0301  * Common code for user and kernel context create and setup.
0302  * NOTE: the initial kref is done here (hfi1_rcd_init()).
0303  */
0304 int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
0305              struct hfi1_ctxtdata **context)
0306 {
0307     struct hfi1_devdata *dd = ppd->dd;
0308     struct hfi1_ctxtdata *rcd;
0309     unsigned kctxt_ngroups = 0;
0310     u32 base;
0311 
0312     if (dd->rcv_entries.nctxt_extra >
0313         dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt)
0314         kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
0315              (dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt));
0316     rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, numa);
0317     if (rcd) {
0318         u32 rcvtids, max_entries;
0319         u16 ctxt;
0320         int ret;
0321 
0322         ret = allocate_rcd_index(dd, rcd, &ctxt);
0323         if (ret) {
0324             *context = NULL;
0325             kfree(rcd);
0326             return ret;
0327         }
0328 
0329         INIT_LIST_HEAD(&rcd->qp_wait_list);
0330         hfi1_exp_tid_group_init(rcd);
0331         rcd->ppd = ppd;
0332         rcd->dd = dd;
0333         rcd->numa_id = numa;
0334         rcd->rcv_array_groups = dd->rcv_entries.ngroups;
0335         rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;
0336         rcd->slow_handler = handle_receive_interrupt;
0337         rcd->do_interrupt = rcd->slow_handler;
0338         rcd->msix_intr = CCE_NUM_MSIX_VECTORS;
0339 
0340         mutex_init(&rcd->exp_mutex);
0341         spin_lock_init(&rcd->exp_lock);
0342         INIT_LIST_HEAD(&rcd->flow_queue.queue_head);
0343         INIT_LIST_HEAD(&rcd->rarr_queue.queue_head);
0344 
0345         hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt);
0346 
0347         /*
0348          * Calculate the context's RcvArray entry starting point.
0349          * We do this here because we have to take into account all
0350  * the RcvArray entries that previous contexts would have
0351          * taken and we have to account for any extra groups assigned
0352          * to the static (kernel) or dynamic (vnic/user) contexts.
0353          */
0354         if (ctxt < dd->first_dyn_alloc_ctxt) {
0355             if (ctxt < kctxt_ngroups) {
0356                 base = ctxt * (dd->rcv_entries.ngroups + 1);
0357                 rcd->rcv_array_groups++;
0358             } else {
0359                 base = kctxt_ngroups +
0360                     (ctxt * dd->rcv_entries.ngroups);
0361             }
0362         } else {
0363             u16 ct = ctxt - dd->first_dyn_alloc_ctxt;
0364 
0365             base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
0366                 kctxt_ngroups);
0367             if (ct < dd->rcv_entries.nctxt_extra) {
0368                 base += ct * (dd->rcv_entries.ngroups + 1);
0369                 rcd->rcv_array_groups++;
0370             } else {
0371                 base += dd->rcv_entries.nctxt_extra +
0372                     (ct * dd->rcv_entries.ngroups);
0373             }
0374         }
0375         rcd->eager_base = base * dd->rcv_entries.group_size;
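             /*
              * Worked example of the calculation above (illustrative values
              * only): with ngroups = 2, group_size = 8 and kctxt_ngroups = 0,
              * kernel context 2 gets base = 0 + 2 * 2 = 4 groups, so
              * eager_base = 4 * 8 = 32 RcvArray entries.
              */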
0376 
0377         rcd->rcvhdrq_cnt = rcvhdrcnt;
0378         rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
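             /*
              * rcvhdrqentsize is counted in 32-bit words (see the
              * hdrq_entsize module parameter above).  The 64-bit RHF sits at
              * the tail of each entry, so rhf_offset below works out to
              * entsize - 2 dwords, e.g. 30 for the default 128B entries.
              */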
0379         rcd->rhf_offset =
0380             rcd->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
0381         /*
0382          * Simple Eager buffer allocation: we have already pre-allocated
0383          * the number of RcvArray entry groups. Each ctxtdata structure
0384          * holds the number of groups for that context.
0385          *
0386          * To follow CSR requirements and maintain cacheline alignment,
0387          * make sure all sizes and bases are multiples of group_size.
0388          *
0389          * The expected-receive (TID) entry count is what is left after
0390          * assigning the Eager entries.
0391          */
0392         max_entries = rcd->rcv_array_groups *
0393             dd->rcv_entries.group_size;
0394         rcvtids = ((max_entries * hfi1_rcvarr_split) / 100);
0395         rcd->egrbufs.count = round_down(rcvtids,
0396                         dd->rcv_entries.group_size);
0397         if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
0398             dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
0399                    rcd->ctxt);
0400             rcd->egrbufs.count = MAX_EAGER_ENTRIES;
0401         }
0402         hfi1_cdbg(PROC,
0403               "ctxt%u: max Eager buffer RcvArray entries: %u\n",
0404               rcd->ctxt, rcd->egrbufs.count);
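             /*
              * Example with the default rcvarr_split of 25: a context with
              * rcv_array_groups = 64 and group_size = 8 (illustrative values)
              * has max_entries = 512, so rcvtids = 512 * 25 / 100 = 128 and
              * egrbufs.count rounds down to 128, already a multiple of the
              * group size.
              */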
0405 
0406         /*
0407          * Allocate array that will hold the eager buffer accounting
0408          * data.
0409          * This will allocate the maximum possible buffer count based
0410          * on the value of the RcvArray split parameter.
0411          * The resulting value will be rounded down to the closest
0412          * multiple of dd->rcv_entries.group_size.
0413          */
0414         rcd->egrbufs.buffers =
0415             kcalloc_node(rcd->egrbufs.count,
0416                      sizeof(*rcd->egrbufs.buffers),
0417                      GFP_KERNEL, numa);
0418         if (!rcd->egrbufs.buffers)
0419             goto bail;
0420         rcd->egrbufs.rcvtids =
0421             kcalloc_node(rcd->egrbufs.count,
0422                      sizeof(*rcd->egrbufs.rcvtids),
0423                      GFP_KERNEL, numa);
0424         if (!rcd->egrbufs.rcvtids)
0425             goto bail;
0426         rcd->egrbufs.size = eager_buffer_size;
0427         /*
0428          * The size of the buffers programmed into the RcvArray
0429          * entries needs to be big enough to handle the highest
0430          * MTU supported.
0431          */
0432         if (rcd->egrbufs.size < hfi1_max_mtu) {
0433             rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
0434             hfi1_cdbg(PROC,
0435                   "ctxt%u: eager bufs size too small. Adjusting to %u\n",
0436                     rcd->ctxt, rcd->egrbufs.size);
0437         }
0438         rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;
0439 
0440         /* Applicable only for statically created kernel contexts */
0441         if (ctxt < dd->first_dyn_alloc_ctxt) {
0442             rcd->opstats = kzalloc_node(sizeof(*rcd->opstats),
0443                             GFP_KERNEL, numa);
0444             if (!rcd->opstats)
0445                 goto bail;
0446 
0447             /* Initialize TID flow generations for the context */
0448             hfi1_kern_init_ctxt_generations(rcd);
0449         }
0450 
0451         *context = rcd;
0452         return 0;
0453     }
0454 
0455 bail:
0456     *context = NULL;
0457     hfi1_free_ctxt(rcd);
0458     return -ENOMEM;
0459 }
0460 
0461 /**
0462  * hfi1_free_ctxt - free context
0463  * @rcd: pointer to an initialized rcd data structure
0464  *
0465  * This wrapper is the free function that matches hfi1_create_ctxtdata().
0466  * When a context is done being used (kernel or user), this function is called
0467  * for the "final" put to match the kref init from hfi1_create_ctxtdata().
0468  * Other users of the context do a get/put sequence to make sure that the
0469  * structure isn't removed while in use.
0470  */
0471 void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd)
0472 {
0473     hfi1_rcd_put(rcd);
0474 }
0475 
0476 /*
0477  * Select the largest ccti value over all SLs to determine the intra-
0478  * packet gap for the link.
0479  *
0480  * called with cca_timer_lock held (to protect access to cca_timer
0481  * array), and rcu_read_lock() (to protect access to cc_state).
0482  */
0483 void set_link_ipg(struct hfi1_pportdata *ppd)
0484 {
0485     struct hfi1_devdata *dd = ppd->dd;
0486     struct cc_state *cc_state;
0487     int i;
0488     u16 cce, ccti_limit, max_ccti = 0;
0489     u16 shift, mult;
0490     u64 src;
0491     u32 current_egress_rate; /* Mbits/sec */
0492     u64 max_pkt_time;
0493     /*
0494      * max_pkt_time is the maximum packet egress time in units
0495      * of the fabric clock period 1/(805 MHz).
0496      */
0497 
0498     cc_state = get_cc_state(ppd);
0499 
0500     if (!cc_state)
0501         /*
0502          * This should _never_ happen - rcu_read_lock() is held,
0503          * and set_link_ipg() should not be called if cc_state
0504          * is NULL.
0505          */
0506         return;
0507 
0508     for (i = 0; i < OPA_MAX_SLS; i++) {
0509         u16 ccti = ppd->cca_timer[i].ccti;
0510 
0511         if (ccti > max_ccti)
0512             max_ccti = ccti;
0513     }
0514 
0515     ccti_limit = cc_state->cct.ccti_limit;
0516     if (max_ccti > ccti_limit)
0517         max_ccti = ccti_limit;
0518 
0519     cce = cc_state->cct.entries[max_ccti].entry;
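         /*
          * Each congestion control table entry packs a 2-bit shift in bits
          * 15:14 and a 14-bit multiplier in bits 13:0; the static rate
          * control value programmed below is (max_pkt_time >> shift) * mult.
          */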
0520     shift = (cce & 0xc000) >> 14;
0521     mult = (cce & 0x3fff);
0522 
0523     current_egress_rate = active_egress_rate(ppd);
0524 
0525     max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate);
0526 
0527     src = (max_pkt_time >> shift) * mult;
0528 
0529     src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
0530     src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;
0531 
0532     write_csr(dd, SEND_STATIC_RATE_CONTROL, src);
0533 }
0534 
0535 static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
0536 {
0537     struct cca_timer *cca_timer;
0538     struct hfi1_pportdata *ppd;
0539     int sl;
0540     u16 ccti_timer, ccti_min;
0541     struct cc_state *cc_state;
0542     unsigned long flags;
0543     enum hrtimer_restart ret = HRTIMER_NORESTART;
0544 
0545     cca_timer = container_of(t, struct cca_timer, hrtimer);
0546     ppd = cca_timer->ppd;
0547     sl = cca_timer->sl;
0548 
0549     rcu_read_lock();
0550 
0551     cc_state = get_cc_state(ppd);
0552 
0553     if (!cc_state) {
0554         rcu_read_unlock();
0555         return HRTIMER_NORESTART;
0556     }
0557 
0558     /*
0559      * 1) decrement ccti for SL
0560      * 2) calculate IPG for link (set_link_ipg())
0561      * 3) restart timer, unless ccti is at min value
0562      */
0563 
0564     ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
0565     ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
0566 
0567     spin_lock_irqsave(&ppd->cca_timer_lock, flags);
0568 
0569     if (cca_timer->ccti > ccti_min) {
0570         cca_timer->ccti--;
0571         set_link_ipg(ppd);
0572     }
0573 
0574     if (cca_timer->ccti > ccti_min) {
0575         unsigned long nsec = 1024 * ccti_timer;
0576         /* ccti_timer is in units of 1.024 usec */
0577         hrtimer_forward_now(t, ns_to_ktime(nsec));
0578         ret = HRTIMER_RESTART;
0579     }
0580 
0581     spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
0582     rcu_read_unlock();
0583     return ret;
0584 }
0585 
0586 /*
0587  * Common code for initializing the physical port structure.
0588  */
0589 void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
0590              struct hfi1_devdata *dd, u8 hw_pidx, u32 port)
0591 {
0592     int i;
0593     uint default_pkey_idx;
0594     struct cc_state *cc_state;
0595 
0596     ppd->dd = dd;
0597     ppd->hw_pidx = hw_pidx;
0598     ppd->port = port; /* IB port number, not index */
0599     ppd->prev_link_width = LINK_WIDTH_DEFAULT;
0600     /*
0601      * There are C_VL_COUNT number of PortVLXmitWait counters.
0602      * Adding 1 to C_VL_COUNT to include the PortXmitWait counter.
0603      */
0604     for (i = 0; i < C_VL_COUNT + 1; i++) {
0605         ppd->port_vl_xmit_wait_last[i] = 0;
0606         ppd->vl_xmit_flit_cnt[i] = 0;
0607     }
0608 
0609     default_pkey_idx = 1;
0610 
0611     ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
0612     ppd->part_enforce |= HFI1_PART_ENFORCE_IN;
0613     ppd->pkeys[0] = 0x8001;
0614 
0615     INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
0616     INIT_WORK(&ppd->link_up_work, handle_link_up);
0617     INIT_WORK(&ppd->link_down_work, handle_link_down);
0618     INIT_WORK(&ppd->freeze_work, handle_freeze);
0619     INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
0620     INIT_WORK(&ppd->sma_message_work, handle_sma_message);
0621     INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
0622     INIT_DELAYED_WORK(&ppd->start_link_work, handle_start_link);
0623     INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work);
0624     INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);
0625 
0626     mutex_init(&ppd->hls_lock);
0627     spin_lock_init(&ppd->qsfp_info.qsfp_lock);
0628 
0629     ppd->qsfp_info.ppd = ppd;
0630     ppd->sm_trap_qp = 0x0;
0631     ppd->sa_qp = 0x1;
0632 
0633     ppd->hfi1_wq = NULL;
0634 
0635     spin_lock_init(&ppd->cca_timer_lock);
0636 
0637     for (i = 0; i < OPA_MAX_SLS; i++) {
0638         hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC,
0639                  HRTIMER_MODE_REL);
0640         ppd->cca_timer[i].ppd = ppd;
0641         ppd->cca_timer[i].sl = i;
0642         ppd->cca_timer[i].ccti = 0;
0643         ppd->cca_timer[i].hrtimer.function = cca_timer_fn;
0644     }
0645 
0646     ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;
0647 
0648     spin_lock_init(&ppd->cc_state_lock);
0649     spin_lock_init(&ppd->cc_log_lock);
0650     cc_state = kzalloc(sizeof(*cc_state), GFP_KERNEL);
0651     RCU_INIT_POINTER(ppd->cc_state, cc_state);
0652     if (!cc_state)
0653         goto bail;
0654     return;
0655 
0656 bail:
0657     dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port);
0658 }
0659 
0660 /*
0661  * Do initialization for device that is only needed on
0662  * first detect, not on resets.
0663  */
0664 static int loadtime_init(struct hfi1_devdata *dd)
0665 {
0666     return 0;
0667 }
0668 
0669 /**
0670  * init_after_reset - re-initialize after a reset
0671  * @dd: the hfi1_ib device
0672  *
0673  * sanity check at least some of the values after reset, and
0674  * ensure there is no receive or transmit activity (explicitly, in
0675  * case the reset failed).
0676  */
0677 static int init_after_reset(struct hfi1_devdata *dd)
0678 {
0679     int i;
0680     struct hfi1_ctxtdata *rcd;
0681     /*
0682      * Ensure chip does no sends or receives, tail updates, or
0683      * pioavail updates while we re-initialize.  This is mostly
0684      * for the driver data structures, not chip registers.
0685      */
0686     for (i = 0; i < dd->num_rcv_contexts; i++) {
0687         rcd = hfi1_rcd_get_by_index(dd, i);
0688         hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
0689                  HFI1_RCVCTRL_INTRAVAIL_DIS |
0690                  HFI1_RCVCTRL_TAILUPD_DIS, rcd);
0691         hfi1_rcd_put(rcd);
0692     }
0693     pio_send_control(dd, PSC_GLOBAL_DISABLE);
0694     for (i = 0; i < dd->num_send_contexts; i++)
0695         sc_disable(dd->send_contexts[i].sc);
0696 
0697     return 0;
0698 }
0699 
0700 static void enable_chip(struct hfi1_devdata *dd)
0701 {
0702     struct hfi1_ctxtdata *rcd;
0703     u32 rcvmask;
0704     u16 i;
0705 
0706     /* enable PIO send */
0707     pio_send_control(dd, PSC_GLOBAL_ENABLE);
0708 
0709     /*
0710      * Enable kernel ctxts' receive and receive interrupt.
0711      * Other ctxts done as user opens and initializes them.
0712      */
0713     for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) {
0714         rcd = hfi1_rcd_get_by_index(dd, i);
0715         if (!rcd)
0716             continue;
0717         rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
0718         rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ?
0719             HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
0720         if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
0721             rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
0722         if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_RHQ_FULL))
0723             rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
0724         if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL))
0725             rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
0726         if (HFI1_CAP_IS_KSET(TID_RDMA))
0727             rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB;
0728         hfi1_rcvctrl(dd, rcvmask, rcd);
0729         sc_enable(rcd->sc);
0730         hfi1_rcd_put(rcd);
0731     }
0732 }
0733 
0734 /**
0735  * create_workqueues - create per port workqueues
0736  * @dd: the hfi1_ib device
0737  */
0738 static int create_workqueues(struct hfi1_devdata *dd)
0739 {
0740     int pidx;
0741     struct hfi1_pportdata *ppd;
0742 
0743     for (pidx = 0; pidx < dd->num_pports; ++pidx) {
0744         ppd = dd->pport + pidx;
0745         if (!ppd->hfi1_wq) {
0746             ppd->hfi1_wq =
0747                 alloc_workqueue(
0748                     "hfi%d_%d",
0749                     WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
0750                     WQ_MEM_RECLAIM,
0751                     HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES,
0752                     dd->unit, pidx);
0753             if (!ppd->hfi1_wq)
0754                 goto wq_error;
0755         }
0756         if (!ppd->link_wq) {
0757             /*
0758              * Make the link workqueue single-threaded to enforce
0759              * serialization.
0760              */
0761             ppd->link_wq =
0762                 alloc_workqueue(
0763                     "hfi_link_%d_%d",
0764                     WQ_SYSFS | WQ_MEM_RECLAIM | WQ_UNBOUND,
0765                     1, /* max_active */
0766                     dd->unit, pidx);
0767             if (!ppd->link_wq)
0768                 goto wq_error;
0769         }
0770     }
0771     return 0;
0772 wq_error:
0773     pr_err("alloc_workqueue failed for port %d\n", pidx + 1);
0774     for (pidx = 0; pidx < dd->num_pports; ++pidx) {
0775         ppd = dd->pport + pidx;
0776         if (ppd->hfi1_wq) {
0777             destroy_workqueue(ppd->hfi1_wq);
0778             ppd->hfi1_wq = NULL;
0779         }
0780         if (ppd->link_wq) {
0781             destroy_workqueue(ppd->link_wq);
0782             ppd->link_wq = NULL;
0783         }
0784     }
0785     return -ENOMEM;
0786 }
0787 
0788 /**
0789  * destroy_workqueues - destroy per port workqueues
0790  * @dd: the hfi1_ib device
0791  */
0792 static void destroy_workqueues(struct hfi1_devdata *dd)
0793 {
0794     int pidx;
0795     struct hfi1_pportdata *ppd;
0796 
0797     for (pidx = 0; pidx < dd->num_pports; ++pidx) {
0798         ppd = dd->pport + pidx;
0799 
0800         if (ppd->hfi1_wq) {
0801             destroy_workqueue(ppd->hfi1_wq);
0802             ppd->hfi1_wq = NULL;
0803         }
0804         if (ppd->link_wq) {
0805             destroy_workqueue(ppd->link_wq);
0806             ppd->link_wq = NULL;
0807         }
0808     }
0809 }
0810 
0811 /**
0812  * enable_general_intr() - Enable the IRQs that will be handled by the
0813  * general interrupt handler.
0814  * @dd: valid devdata
0815  *
0816  */
0817 static void enable_general_intr(struct hfi1_devdata *dd)
0818 {
0819     set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, true);
0820     set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, true);
0821     set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, true);
0822     set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, true);
0823     set_intr_bits(dd, TCRIT_INT, TCRIT_INT, true);
0824     set_intr_bits(dd, IS_DC_START, IS_DC_END, true);
0825     set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, true);
0826 }
0827 
0828 /**
0829  * hfi1_init - do the actual initialization sequence on the chip
0830  * @dd: the hfi1_ib device
0831  * @reinit: re-initializing, so don't allocate new memory
0832  *
0833  * Do the actual initialization sequence on the chip.  This is done
0834  * both from the init routine called from the PCI infrastructure, and
0835  * when we reset the chip, or detect that it was reset internally,
0836  * or it's administratively re-enabled.
0837  *
0838  * Memory allocation here and in called routines is only done in
0839  * the first case (reinit == 0).  We have to be careful, because even
0840  * without memory allocation, we need to re-write all the chip registers,
0841  * TIDs, etc. after the reset or enable has completed.
0842  */
0843 int hfi1_init(struct hfi1_devdata *dd, int reinit)
0844 {
0845     int ret = 0, pidx, lastfail = 0;
0846     unsigned long len;
0847     u16 i;
0848     struct hfi1_ctxtdata *rcd;
0849     struct hfi1_pportdata *ppd;
0850 
0851     /* Set up send low level handlers */
0852     dd->process_pio_send = hfi1_verbs_send_pio;
0853     dd->process_dma_send = hfi1_verbs_send_dma;
0854     dd->pio_inline_send = pio_copy;
0855     dd->process_vnic_dma_send = hfi1_vnic_send_dma;
0856 
0857     if (is_ax(dd)) {
0858         atomic_set(&dd->drop_packet, DROP_PACKET_ON);
0859         dd->do_drop = true;
0860     } else {
0861         atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
0862         dd->do_drop = false;
0863     }
0864 
0865     /* make sure the link is not "up" */
0866     for (pidx = 0; pidx < dd->num_pports; ++pidx) {
0867         ppd = dd->pport + pidx;
0868         ppd->linkup = 0;
0869     }
0870 
0871     if (reinit)
0872         ret = init_after_reset(dd);
0873     else
0874         ret = loadtime_init(dd);
0875     if (ret)
0876         goto done;
0877 
0878     /* dd->rcd can be NULL if early initialization failed */
0879     for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) {
0880         /*
0881          * Set up the (kernel) rcvhdr queue and egr TIDs.  If doing
0882          * re-init, the simplest way to handle this is to free
0883          * existing, and re-allocate.
0884          * Need to re-create rest of ctxt 0 ctxtdata as well.
0885          */
0886         rcd = hfi1_rcd_get_by_index(dd, i);
0887         if (!rcd)
0888             continue;
0889 
0890         lastfail = hfi1_create_rcvhdrq(dd, rcd);
0891         if (!lastfail)
0892             lastfail = hfi1_setup_eagerbufs(rcd);
0893         if (!lastfail)
0894             lastfail = hfi1_kern_exp_rcv_init(rcd, reinit);
0895         if (lastfail) {
0896             dd_dev_err(dd,
0897                    "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
0898             ret = lastfail;
0899         }
0900         /* enable IRQ */
0901         hfi1_rcd_put(rcd);
0902     }
0903 
0904     /* Allocate enough memory for user event notification. */
0905     len = PAGE_ALIGN(chip_rcv_contexts(dd) * HFI1_MAX_SHARED_CTXTS *
0906              sizeof(*dd->events));
0907     dd->events = vmalloc_user(len);
0908     if (!dd->events)
0909         dd_dev_err(dd, "Failed to allocate user events page\n");
0910     /*
0911      * Allocate a page for device and port status.
0912      * Page will be shared amongst all user processes.
0913      */
0914     dd->status = vmalloc_user(PAGE_SIZE);
0915     if (!dd->status)
0916         dd_dev_err(dd, "Failed to allocate dev status page\n");
0917     for (pidx = 0; pidx < dd->num_pports; ++pidx) {
0918         ppd = dd->pport + pidx;
0919         if (dd->status)
0920             /* Currently, we only have one port */
0921             ppd->statusp = &dd->status->port;
0922 
0923         set_mtu(ppd);
0924     }
0925 
0926     /* enable chip even if we have an error, so we can debug cause */
0927     enable_chip(dd);
0928 
0929 done:
0930     /*
0931      * Set status even if port serdes is not initialized
0932      * so that diags will work.
0933      */
0934     if (dd->status)
0935         dd->status->dev |= HFI1_STATUS_CHIP_PRESENT |
0936             HFI1_STATUS_INITTED;
0937     if (!ret) {
0938         /* enable all interrupts from the chip */
0939         enable_general_intr(dd);
0940         init_qsfp_int(dd);
0941 
0942         /* chip is OK for user apps; mark it as initialized */
0943         for (pidx = 0; pidx < dd->num_pports; ++pidx) {
0944             ppd = dd->pport + pidx;
0945 
0946             /*
0947              * start the serdes - must be after interrupts are
0948              * enabled so we are notified when the link goes up
0949              */
0950             lastfail = bringup_serdes(ppd);
0951             if (lastfail)
0952                 dd_dev_info(dd,
0953                         "Failed to bring up port %u\n",
0954                         ppd->port);
0955 
0956             /*
0957              * Set status even if port serdes is not initialized
0958              * so that diags will work.
0959              */
0960             if (ppd->statusp)
0961                 *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT |
0962                             HFI1_STATUS_INITTED;
0963             if (!ppd->link_speed_enabled)
0964                 continue;
0965         }
0966     }
0967 
0968     /* if ret is non-zero, we probably should do some cleanup here... */
0969     return ret;
0970 }
0971 
0972 struct hfi1_devdata *hfi1_lookup(int unit)
0973 {
0974     return xa_load(&hfi1_dev_table, unit);
0975 }
0976 
0977 /*
0978  * Stop the timers during unit shutdown, or after an error late
0979  * in initialization.
0980  */
0981 static void stop_timers(struct hfi1_devdata *dd)
0982 {
0983     struct hfi1_pportdata *ppd;
0984     int pidx;
0985 
0986     for (pidx = 0; pidx < dd->num_pports; ++pidx) {
0987         ppd = dd->pport + pidx;
0988         if (ppd->led_override_timer.function) {
0989             del_timer_sync(&ppd->led_override_timer);
0990             atomic_set(&ppd->led_override_timer_active, 0);
0991         }
0992     }
0993 }
0994 
0995 /**
0996  * shutdown_device - shut down a device
0997  * @dd: the hfi1_ib device
0998  *
0999  * This is called to make the device quiet when we are about to
1000  * unload the driver, and also when the device is administratively
1001  * disabled.  It does not free any data structures.
1002  * Everything it does has to be set up again by hfi1_init(dd, 1).
1003  */
1004 static void shutdown_device(struct hfi1_devdata *dd)
1005 {
1006     struct hfi1_pportdata *ppd;
1007     struct hfi1_ctxtdata *rcd;
1008     unsigned pidx;
1009     int i;
1010 
1011     if (dd->flags & HFI1_SHUTDOWN)
1012         return;
1013     dd->flags |= HFI1_SHUTDOWN;
1014 
1015     for (pidx = 0; pidx < dd->num_pports; ++pidx) {
1016         ppd = dd->pport + pidx;
1017 
1018         ppd->linkup = 0;
1019         if (ppd->statusp)
1020             *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
1021                        HFI1_STATUS_IB_READY);
1022     }
1023     dd->flags &= ~HFI1_INITTED;
1024 
1025     /* mask and clean up interrupts */
1026     set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false);
1027     msix_clean_up_interrupts(dd);
1028 
1029     for (pidx = 0; pidx < dd->num_pports; ++pidx) {
1030         ppd = dd->pport + pidx;
1031         for (i = 0; i < dd->num_rcv_contexts; i++) {
1032             rcd = hfi1_rcd_get_by_index(dd, i);
1033             hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
1034                      HFI1_RCVCTRL_CTXT_DIS |
1035                      HFI1_RCVCTRL_INTRAVAIL_DIS |
1036                      HFI1_RCVCTRL_PKEY_DIS |
1037                      HFI1_RCVCTRL_ONE_PKT_EGR_DIS, rcd);
1038             hfi1_rcd_put(rcd);
1039         }
1040         /*
1041          * Gracefully stop all sends allowing any in progress to
1042          * trickle out first.
1043          */
1044         for (i = 0; i < dd->num_send_contexts; i++)
1045             sc_flush(dd->send_contexts[i].sc);
1046     }
1047 
1048     /*
1049      * Enough for anything that's going to trickle out to have actually
1050      * done so.
1051      */
1052     udelay(20);
1053 
1054     for (pidx = 0; pidx < dd->num_pports; ++pidx) {
1055         ppd = dd->pport + pidx;
1056 
1057         /* disable all contexts */
1058         for (i = 0; i < dd->num_send_contexts; i++)
1059             sc_disable(dd->send_contexts[i].sc);
1060         /* disable the send device */
1061         pio_send_control(dd, PSC_GLOBAL_DISABLE);
1062 
1063         shutdown_led_override(ppd);
1064 
1065         /*
1066          * Clear SerdesEnable.
1067          * We can't count on interrupts since we are stopping.
1068          */
1069         hfi1_quiet_serdes(ppd);
1070         if (ppd->hfi1_wq)
1071             flush_workqueue(ppd->hfi1_wq);
1072         if (ppd->link_wq)
1073             flush_workqueue(ppd->link_wq);
1074     }
1075     sdma_exit(dd);
1076 }
1077 
1078 /**
1079  * hfi1_free_ctxtdata - free a context's allocated data
1080  * @dd: the hfi1_ib device
1081  * @rcd: the ctxtdata structure
1082  *
1083  * free up any allocated data for a context.
1084  * It should never change any chip state, or global driver state.
1085  */
1086 void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
1087 {
1088     u32 e;
1089 
1090     if (!rcd)
1091         return;
1092 
1093     if (rcd->rcvhdrq) {
1094         dma_free_coherent(&dd->pcidev->dev, rcvhdrq_size(rcd),
1095                   rcd->rcvhdrq, rcd->rcvhdrq_dma);
1096         rcd->rcvhdrq = NULL;
1097         if (hfi1_rcvhdrtail_kvaddr(rcd)) {
1098             dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
1099                       (void *)hfi1_rcvhdrtail_kvaddr(rcd),
1100                       rcd->rcvhdrqtailaddr_dma);
1101             rcd->rcvhdrtail_kvaddr = NULL;
1102         }
1103     }
1104 
1105     /* all the RcvArray entries should have been cleared by now */
1106     kfree(rcd->egrbufs.rcvtids);
1107     rcd->egrbufs.rcvtids = NULL;
1108 
1109     for (e = 0; e < rcd->egrbufs.alloced; e++) {
1110         if (rcd->egrbufs.buffers[e].addr)
1111             dma_free_coherent(&dd->pcidev->dev,
1112                       rcd->egrbufs.buffers[e].len,
1113                       rcd->egrbufs.buffers[e].addr,
1114                       rcd->egrbufs.buffers[e].dma);
1115     }
1116     kfree(rcd->egrbufs.buffers);
1117     rcd->egrbufs.alloced = 0;
1118     rcd->egrbufs.buffers = NULL;
1119 
1120     sc_free(rcd->sc);
1121     rcd->sc = NULL;
1122 
1123     vfree(rcd->subctxt_uregbase);
1124     vfree(rcd->subctxt_rcvegrbuf);
1125     vfree(rcd->subctxt_rcvhdr_base);
1126     kfree(rcd->opstats);
1127 
1128     rcd->subctxt_uregbase = NULL;
1129     rcd->subctxt_rcvegrbuf = NULL;
1130     rcd->subctxt_rcvhdr_base = NULL;
1131     rcd->opstats = NULL;
1132 }
1133 
1134 /*
1135  * Release our hold on the shared asic data.  If we are the last one,
1136  * return the structure to be finalized outside the lock.  Must be
1137  * holding hfi1_dev_table lock.
1138  */
1139 static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd)
1140 {
1141     struct hfi1_asic_data *ad;
1142     int other;
1143 
1144     if (!dd->asic_data)
1145         return NULL;
1146     dd->asic_data->dds[dd->hfi1_id] = NULL;
1147     other = dd->hfi1_id ? 0 : 1;
1148     ad = dd->asic_data;
1149     dd->asic_data = NULL;
1150     /* return NULL if the other dd still has a link */
1151     return ad->dds[other] ? NULL : ad;
1152 }
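     /*
      * asic_data is shared between the (up to) two devices that sit on the
      * same ASIC (dds[0] and dds[1], indexed by hfi1_id).  The structure is
      * only handed back for finalization once the peer has dropped its link,
      * and finalize_asic_data() then runs outside the hfi1_dev_table lock
      * (see hfi1_free_devdata() below).
      */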
1153 
1154 static void finalize_asic_data(struct hfi1_devdata *dd,
1155                    struct hfi1_asic_data *ad)
1156 {
1157     clean_up_i2c(dd, ad);
1158     kfree(ad);
1159 }
1160 
1161 /**
1162  * hfi1_free_devdata - cleans up and frees per-unit data structure
1163  * @dd: pointer to a valid devdata structure
1164  *
1165  * It cleans up and frees all data structures set up by
1166  * hfi1_alloc_devdata().
1167  */
1168 void hfi1_free_devdata(struct hfi1_devdata *dd)
1169 {
1170     struct hfi1_asic_data *ad;
1171     unsigned long flags;
1172 
1173     xa_lock_irqsave(&hfi1_dev_table, flags);
1174     __xa_erase(&hfi1_dev_table, dd->unit);
1175     ad = release_asic_data(dd);
1176     xa_unlock_irqrestore(&hfi1_dev_table, flags);
1177 
1178     finalize_asic_data(dd, ad);
1179     free_platform_config(dd);
1180     rcu_barrier(); /* wait for rcu callbacks to complete */
1181     free_percpu(dd->int_counter);
1182     free_percpu(dd->rcv_limit);
1183     free_percpu(dd->send_schedule);
1184     free_percpu(dd->tx_opstats);
1185     dd->int_counter   = NULL;
1186     dd->rcv_limit     = NULL;
1187     dd->send_schedule = NULL;
1188     dd->tx_opstats    = NULL;
1189     kfree(dd->comp_vect);
1190     dd->comp_vect = NULL;
1191     if (dd->rcvhdrtail_dummy_kvaddr)
1192         dma_free_coherent(&dd->pcidev->dev, sizeof(u64),
1193                   (void *)dd->rcvhdrtail_dummy_kvaddr,
1194                   dd->rcvhdrtail_dummy_dma);
1195     dd->rcvhdrtail_dummy_kvaddr = NULL;
1196     sdma_clean(dd, dd->num_sdma);
1197     rvt_dealloc_device(&dd->verbs_dev.rdi);
1198 }
1199 
1200 /**
1201  * hfi1_alloc_devdata - Allocate our primary per-unit data structure.
1202  * @pdev: Valid PCI device
1203  * @extra: How many bytes to alloc past the default
1204  *
1205  * Must be done via verbs allocator, because the verbs cleanup process
1206  * does both the cleanup and the free of the data structure.
1207  * "extra" is for chip-specific data.
1208  */
1209 static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev,
1210                            size_t extra)
1211 {
1212     struct hfi1_devdata *dd;
1213     int ret, nports;
1214 
1215     /* extra is sizeof(struct hfi1_pportdata) * number of ports */
1216     nports = extra / sizeof(struct hfi1_pportdata);
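         /*
          * For example, init_one() passes extra = NUM_IB_PORTS *
          * sizeof(struct hfi1_pportdata), so nports recovers NUM_IB_PORTS.
          */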
1217 
1218     dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra,
1219                              nports);
1220     if (!dd)
1221         return ERR_PTR(-ENOMEM);
1222     dd->num_pports = nports;
1223     dd->pport = (struct hfi1_pportdata *)(dd + 1);
1224     dd->pcidev = pdev;
1225     pci_set_drvdata(pdev, dd);
1226 
1227     ret = xa_alloc_irq(&hfi1_dev_table, &dd->unit, dd, xa_limit_32b,
1228             GFP_KERNEL);
1229     if (ret < 0) {
1230         dev_err(&pdev->dev,
1231             "Could not allocate unit ID: error %d\n", -ret);
1232         goto bail;
1233     }
1234     rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit);
1235     /*
1236      * If the BIOS does not have the NUMA node information set, select
1237      * NUMA 0 so we get consistent performance.
1238      */
1239     dd->node = pcibus_to_node(pdev->bus);
1240     if (dd->node == NUMA_NO_NODE) {
1241         dd_dev_err(dd, "Invalid PCI NUMA node. Performance may be affected\n");
1242         dd->node = 0;
1243     }
1244 
1245     /*
1246      * Initialize all locks for the device. This needs to be as early as
1247      * possible so locks are usable.
1248      */
1249     spin_lock_init(&dd->sc_lock);
1250     spin_lock_init(&dd->sendctrl_lock);
1251     spin_lock_init(&dd->rcvctrl_lock);
1252     spin_lock_init(&dd->uctxt_lock);
1253     spin_lock_init(&dd->hfi1_diag_trans_lock);
1254     spin_lock_init(&dd->sc_init_lock);
1255     spin_lock_init(&dd->dc8051_memlock);
1256     seqlock_init(&dd->sc2vl_lock);
1257     spin_lock_init(&dd->sde_map_lock);
1258     spin_lock_init(&dd->pio_map_lock);
1259     mutex_init(&dd->dc8051_lock);
1260     init_waitqueue_head(&dd->event_queue);
1261     spin_lock_init(&dd->irq_src_lock);
1262 
1263     dd->int_counter = alloc_percpu(u64);
1264     if (!dd->int_counter) {
1265         ret = -ENOMEM;
1266         goto bail;
1267     }
1268 
1269     dd->rcv_limit = alloc_percpu(u64);
1270     if (!dd->rcv_limit) {
1271         ret = -ENOMEM;
1272         goto bail;
1273     }
1274 
1275     dd->send_schedule = alloc_percpu(u64);
1276     if (!dd->send_schedule) {
1277         ret = -ENOMEM;
1278         goto bail;
1279     }
1280 
1281     dd->tx_opstats = alloc_percpu(struct hfi1_opcode_stats_perctx);
1282     if (!dd->tx_opstats) {
1283         ret = -ENOMEM;
1284         goto bail;
1285     }
1286 
1287     dd->comp_vect = kzalloc(sizeof(*dd->comp_vect), GFP_KERNEL);
1288     if (!dd->comp_vect) {
1289         ret = -ENOMEM;
1290         goto bail;
1291     }
1292 
1293     /* allocate dummy tail memory for all receive contexts */
1294     dd->rcvhdrtail_dummy_kvaddr =
1295         dma_alloc_coherent(&dd->pcidev->dev, sizeof(u64),
1296                    &dd->rcvhdrtail_dummy_dma, GFP_KERNEL);
1297     if (!dd->rcvhdrtail_dummy_kvaddr) {
1298         ret = -ENOMEM;
1299         goto bail;
1300     }
1301 
1302     atomic_set(&dd->ipoib_rsm_usr_num, 0);
1303     return dd;
1304 
1305 bail:
1306     hfi1_free_devdata(dd);
1307     return ERR_PTR(ret);
1308 }
1309 
1310 /*
1311  * Called from freeze mode handlers, and from PCI error
1312  * reporting code.  Should be paranoid about state of
1313  * system and data structures.
1314  */
1315 void hfi1_disable_after_error(struct hfi1_devdata *dd)
1316 {
1317     if (dd->flags & HFI1_INITTED) {
1318         u32 pidx;
1319 
1320         dd->flags &= ~HFI1_INITTED;
1321         if (dd->pport)
1322             for (pidx = 0; pidx < dd->num_pports; ++pidx) {
1323                 struct hfi1_pportdata *ppd;
1324 
1325                 ppd = dd->pport + pidx;
1326                 if (dd->flags & HFI1_PRESENT)
1327                     set_link_state(ppd, HLS_DN_DISABLE);
1328 
1329                 if (ppd->statusp)
1330                     *ppd->statusp &= ~HFI1_STATUS_IB_READY;
1331             }
1332     }
1333 
1334     /*
1335      * Mark as having had an error for driver, and also
1336      * for /sys and status word mapped to user programs.
1337      * This marks unit as not usable, until reset.
1338      */
1339     if (dd->status)
1340         dd->status->dev |= HFI1_STATUS_HWERROR;
1341 }
1342 
1343 static void remove_one(struct pci_dev *);
1344 static int init_one(struct pci_dev *, const struct pci_device_id *);
1345 static void shutdown_one(struct pci_dev *);
1346 
1347 #define DRIVER_LOAD_MSG "Cornelis " DRIVER_NAME " loaded: "
1348 #define PFX DRIVER_NAME ": "
1349 
1350 const struct pci_device_id hfi1_pci_tbl[] = {
1351     { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
1352     { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
1353     { 0, }
1354 };
1355 
1356 MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);
1357 
1358 static struct pci_driver hfi1_pci_driver = {
1359     .name = DRIVER_NAME,
1360     .probe = init_one,
1361     .remove = remove_one,
1362     .shutdown = shutdown_one,
1363     .id_table = hfi1_pci_tbl,
1364     .err_handler = &hfi1_pci_err_handler,
1365 };
1366 
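     /*
      * For example, loading the module with krcvqs=2,2,2 gives krcvqsset = 3
      * and n_krcvqs = 2 + 2 + 2 = 6 non-control kernel receive queues.
      */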
1367 static void __init compute_krcvqs(void)
1368 {
1369     int i;
1370 
1371     for (i = 0; i < krcvqsset; i++)
1372         n_krcvqs += krcvqs[i];
1373 }
1374 
1375 /*
1376  * Do all the generic driver unit- and chip-independent memory
1377  * allocation and initialization.
1378  */
1379 static int __init hfi1_mod_init(void)
1380 {
1381     int ret;
1382 
1383     ret = dev_init();
1384     if (ret)
1385         goto bail;
1386 
1387     ret = node_affinity_init();
1388     if (ret)
1389         goto bail;
1390 
1391     /* validate max MTU before any devices start */
1392     if (!valid_opa_max_mtu(hfi1_max_mtu)) {
1393         pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
1394                hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
1395         hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
1396     }
1397     /* valid CUs run from 1-128 in powers of 2 */
1398     if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu))
1399         hfi1_cu = 1;
1400     /* valid credit return threshold is 0-100, variable is unsigned */
1401     if (user_credit_return_threshold > 100)
1402         user_credit_return_threshold = 100;
1403 
1404     compute_krcvqs();
1405     /*
1406      * sanitize the receive interrupt count; the interrupt timeout must
1407      * wait until after the hardware type is known
1408      */
1409     if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
1410         rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
1411     /* reject invalid combinations */
1412     if (rcv_intr_count == 0 && rcv_intr_timeout == 0) {
1413         pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
1414         rcv_intr_count = 1;
1415     }
1416     if (rcv_intr_count > 1 && rcv_intr_timeout == 0) {
1417         /*
1418          * Avoid indefinite packet delivery by requiring a timeout
1419          * if count is > 1.
1420          */
1421         pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
1422         rcv_intr_timeout = 1;
1423     }
1424     if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) {
1425         /*
1426          * The dynamic algorithm expects a non-zero timeout
1427          * and a count > 1.
1428          */
1429         pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
1430         rcv_intr_dynamic = 0;
1431     }
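         /*
          * Net effect of the checks above:
          *   count == 0 && timeout == 0  ->  count forced to 1
          *   count  > 1 && timeout == 0  ->  timeout forced to 1
          *   dynamic mitigation needs count > 1 && timeout > 0,
          *   otherwise it is turned off.
          */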
1432 
1433     /* sanitize link CRC options */
1434     link_crc_mask &= SUPPORTED_CRCS;
1435 
1436     ret = opfn_init();
1437     if (ret < 0) {
1438         pr_err("Failed to allocate opfn_wq");
1439         goto bail_dev;
1440     }
1441 
1442     /*
1443      * These must be called before the driver is registered with
1444      * the PCI subsystem.
1445      */
1446     hfi1_dbg_init();
1447     ret = pci_register_driver(&hfi1_pci_driver);
1448     if (ret < 0) {
1449         pr_err("Unable to register driver: error %d\n", -ret);
1450         goto bail_dev;
1451     }
1452     goto bail; /* all OK */
1453 
1454 bail_dev:
1455     hfi1_dbg_exit();
1456     dev_cleanup();
1457 bail:
1458     return ret;
1459 }
1460 
1461 module_init(hfi1_mod_init);
1462 
1463 /*
1464  * Do the non-unit driver cleanup, memory free, etc. at unload.
1465  */
1466 static void __exit hfi1_mod_cleanup(void)
1467 {
1468     pci_unregister_driver(&hfi1_pci_driver);
1469     opfn_exit();
1470     node_affinity_destroy_all();
1471     hfi1_dbg_exit();
1472 
1473     WARN_ON(!xa_empty(&hfi1_dev_table));
1474     dispose_firmware(); /* asymmetric with obtain_firmware() */
1475     dev_cleanup();
1476 }
1477 
1478 module_exit(hfi1_mod_cleanup);
1479 
1480 /* this can only be called after a successful initialization */
1481 static void cleanup_device_data(struct hfi1_devdata *dd)
1482 {
1483     int ctxt;
1484     int pidx;
1485 
1486     /* users can't do anything more with chip */
1487     for (pidx = 0; pidx < dd->num_pports; ++pidx) {
1488         struct hfi1_pportdata *ppd = &dd->pport[pidx];
1489         struct cc_state *cc_state;
1490         int i;
1491 
1492         if (ppd->statusp)
1493             *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;
1494 
1495         for (i = 0; i < OPA_MAX_SLS; i++)
1496             hrtimer_cancel(&ppd->cca_timer[i].hrtimer);
1497 
1498         spin_lock(&ppd->cc_state_lock);
1499         cc_state = get_cc_state_protected(ppd);
1500         RCU_INIT_POINTER(ppd->cc_state, NULL);
1501         spin_unlock(&ppd->cc_state_lock);
1502 
1503         if (cc_state)
1504             kfree_rcu(cc_state, rcu);
1505     }
1506 
1507     free_credit_return(dd);
1508 
1509     /*
1510      * Free any resources still in use (usually just kernel contexts)
1511      * at unload; we iterate over num_rcv_contexts, because that's what we allocate.
1512      */
1513     for (ctxt = 0; dd->rcd && ctxt < dd->num_rcv_contexts; ctxt++) {
1514         struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
1515 
1516         if (rcd) {
1517             hfi1_free_ctxt_rcv_groups(rcd);
1518             hfi1_free_ctxt(rcd);
1519         }
1520     }
1521 
1522     kfree(dd->rcd);
1523     dd->rcd = NULL;
1524 
1525     free_pio_map(dd);
1526     /* must follow rcv context free - need to remove rcv's hooks */
1527     for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
1528         sc_free(dd->send_contexts[ctxt].sc);
1529     dd->num_send_contexts = 0;
1530     kfree(dd->send_contexts);
1531     dd->send_contexts = NULL;
1532     kfree(dd->hw_to_sw);
1533     dd->hw_to_sw = NULL;
1534     kfree(dd->boardname);
1535     vfree(dd->events);
1536     vfree(dd->status);
1537 }
1538 
1539 /*
1540  * Clean up on unit shutdown, or error during unit load after
1541  * successful initialization.
1542  */
1543 static void postinit_cleanup(struct hfi1_devdata *dd)
1544 {
1545     hfi1_start_cleanup(dd);
1546     hfi1_comp_vectors_clean_up(dd);
1547     hfi1_dev_affinity_clean_up(dd);
1548 
1549     hfi1_pcie_ddcleanup(dd);
1550     hfi1_pcie_cleanup(dd->pcidev);
1551 
1552     cleanup_device_data(dd);
1553 
1554     hfi1_free_devdata(dd);
1555 }
1556 
1557 static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
1558 {
1559     int ret = 0, j, pidx, initfail;
1560     struct hfi1_devdata *dd;
1561     struct hfi1_pportdata *ppd;
1562 
1563     /* First, lock the non-writable module parameters */
1564     HFI1_CAP_LOCK();
1565 
1566     /* Validate dev ids */
1567     if (!(ent->device == PCI_DEVICE_ID_INTEL0 ||
1568           ent->device == PCI_DEVICE_ID_INTEL1)) {
1569         dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n",
1570             ent->device);
1571         ret = -ENODEV;
1572         goto bail;
1573     }
1574 
1575     /* Allocate the dd so we can get to work */
1576     dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
1577                 sizeof(struct hfi1_pportdata));
1578     if (IS_ERR(dd)) {
1579         ret = PTR_ERR(dd);
1580         goto bail;
1581     }
1582 
1583     /* Validate some global module parameters */
1584     ret = hfi1_validate_rcvhdrcnt(dd, rcvhdrcnt);
1585     if (ret)
1586         goto bail;
1587 
1588     /* use the encoding function as a sanitization check */
1589     if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
1590         dd_dev_err(dd, "Invalid HdrQ Entry size %u\n",
1591                hfi1_hdrq_entsize);
1592         ret = -EINVAL;
1593         goto bail;
1594     }
1595 
1596     /* The receive eager buffer size must be set before the receive
1597      * contexts are created.
1598      *
1599      * Set the eager buffer size.  Validate that it falls in a range
1600      * allowed by the hardware - all powers of 2 between the min and
1601      * max.  The maximum valid MTU is within the eager buffer range
1602      * so we do not need to cap the max_mtu by an eager buffer size
1603      * setting.
1604      */
1605     if (eager_buffer_size) {
1606         if (!is_power_of_2(eager_buffer_size))
1607             eager_buffer_size =
1608                 roundup_pow_of_two(eager_buffer_size);
1609         eager_buffer_size =
1610             clamp_val(eager_buffer_size,
1611                   MIN_EAGER_BUFFER * 8,
1612                   MAX_EAGER_BUFFER_TOTAL);
1613         dd_dev_info(dd, "Eager buffer size %u\n",
1614                 eager_buffer_size);
1615     } else {
1616         dd_dev_err(dd, "Invalid Eager buffer size of 0\n");
1617         ret = -EINVAL;
1618         goto bail;
1619     }
1620 
1621     /* restrict value of hfi1_rcvarr_split */
1622     hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);
1623 
1624     ret = hfi1_pcie_init(dd);
1625     if (ret)
1626         goto bail;
1627 
1628     /*
1629      * Do device-specific initialization, function table setup, dd
1630      * allocation, etc.
1631      */
1632     ret = hfi1_init_dd(dd);
1633     if (ret)
1634         goto clean_bail; /* error already printed */
1635 
1636     ret = create_workqueues(dd);
1637     if (ret)
1638         goto clean_bail;
1639 
1640     /* do the generic initialization */
1641     initfail = hfi1_init(dd, 0);
1642 
1643     ret = hfi1_register_ib_device(dd);
1644 
1645     /*
1646      * Now ready for use.  This flag should be cleared whenever we
1647      * detect a reset, or initiate one.  If there was an earlier
1648      * failure, we still create the devices, so diags, etc. can be
1649      * used to determine the cause of the problem.
1650      */
1651     if (!initfail && !ret) {
1652         dd->flags |= HFI1_INITTED;
1653         /* create debugfs files after init and ib register */
1654         hfi1_dbg_ibdev_init(&dd->verbs_dev);
1655     }
1656 
1657     j = hfi1_device_create(dd);
1658     if (j)
1659         dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
1660 
1661     if (initfail || ret) {
1662         msix_clean_up_interrupts(dd);
1663         stop_timers(dd);
1664         flush_workqueue(ib_wq);
1665         for (pidx = 0; pidx < dd->num_pports; ++pidx) {
1666             hfi1_quiet_serdes(dd->pport + pidx);
1667             ppd = dd->pport + pidx;
1668             if (ppd->hfi1_wq) {
1669                 destroy_workqueue(ppd->hfi1_wq);
1670                 ppd->hfi1_wq = NULL;
1671             }
1672             if (ppd->link_wq) {
1673                 destroy_workqueue(ppd->link_wq);
1674                 ppd->link_wq = NULL;
1675             }
1676         }
1677         if (!j)
1678             hfi1_device_remove(dd);
1679         if (!ret)
1680             hfi1_unregister_ib_device(dd);
1681         postinit_cleanup(dd);
1682         if (initfail)
1683             ret = initfail;
1684         goto bail;  /* everything already cleaned */
1685     }
1686 
1687     sdma_start(dd);
1688 
1689     return 0;
1690 
1691 clean_bail:
1692     hfi1_pcie_cleanup(pdev);
1693 bail:
1694     return ret;
1695 }
1696 
1697 static void wait_for_clients(struct hfi1_devdata *dd)
1698 {
1699     /*
1700      * Drop the device's initial reference and complete the device if
1701      * there are no clients, or wait for active clients to finish.
1702      */
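         /*
          * dd->user_refcount is presumably held once on behalf of the
          * device itself and once per open user client; dropping the
          * device's reference here lets user_comp complete as soon as
          * the last client (if any) releases its reference.
          */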
1703     if (refcount_dec_and_test(&dd->user_refcount))
1704         complete(&dd->user_comp);
1705 
1706     wait_for_completion(&dd->user_comp);
1707 }
1708 
1709 static void remove_one(struct pci_dev *pdev)
1710 {
1711     struct hfi1_devdata *dd = pci_get_drvdata(pdev);
1712 
1713     /* close debugfs files before ib unregister */
1714     hfi1_dbg_ibdev_exit(&dd->verbs_dev);
1715 
1716     /* remove the /dev hfi1 interface */
1717     hfi1_device_remove(dd);
1718 
1719     /* wait for existing user space clients to finish */
1720     wait_for_clients(dd);
1721 
1722     /* unregister from IB core */
1723     hfi1_unregister_ib_device(dd);
1724 
1725     /* free netdev data */
1726     hfi1_free_rx(dd);
1727 
1728     /*
1729      * Disable the IB link, disable interrupts on the device,
1730      * clear dma engines, etc.
1731      */
1732     shutdown_device(dd);
1733     destroy_workqueues(dd);
1734 
1735     stop_timers(dd);
1736 
1737     /* wait until all of our (qsfp) queue_work() calls complete */
1738     flush_workqueue(ib_wq);
1739 
1740     postinit_cleanup(dd);
1741 }
1742 
1743 static void shutdown_one(struct pci_dev *pdev)
1744 {
1745     struct hfi1_devdata *dd = pci_get_drvdata(pdev);
1746 
1747     shutdown_device(dd);
1748 }
1749 
1750 /**
1751  * hfi1_create_rcvhdrq - create a receive header queue
1752  * @dd: the hfi1_ib device
1753  * @rcd: the context data
1754  *
1755  * This must be contiguous memory (from an i/o perspective), and must be
1756  * DMA'able (which means for some systems, it will go through an IOMMU,
1757  * or be forced into a low address range).
1758  */
1759 int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
1760 {
1761     unsigned amt;
1762 
1763     if (!rcd->rcvhdrq) {
1764         gfp_t gfp_flags;
1765 
1766         amt = rcvhdrq_size(rcd);
1767 
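             /*
              * Contexts below first_dyn_alloc_ctxt are driver-owned
              * kernel contexts; these (and VNIC contexts) allocate
              * with GFP_KERNEL, while dynamically allocated user
              * contexts use GFP_USER.
              */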
1768         if (rcd->ctxt < dd->first_dyn_alloc_ctxt || rcd->is_vnic)
1769             gfp_flags = GFP_KERNEL;
1770         else
1771             gfp_flags = GFP_USER;
1772         rcd->rcvhdrq = dma_alloc_coherent(&dd->pcidev->dev, amt,
1773                           &rcd->rcvhdrq_dma,
1774                           gfp_flags | __GFP_COMP);
1775 
1776         if (!rcd->rcvhdrq) {
1777             dd_dev_err(dd,
1778                    "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
1779                    amt, rcd->ctxt);
1780             goto bail;
1781         }
1782 
1783         if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ||
1784             HFI1_CAP_UGET_MASK(rcd->flags, DMA_RTAIL)) {
1785             rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(&dd->pcidev->dev,
1786                                     PAGE_SIZE,
1787                                     &rcd->rcvhdrqtailaddr_dma,
1788                                     gfp_flags);
1789             if (!rcd->rcvhdrtail_kvaddr)
1790                 goto bail_free;
1791         }
1792     }
1793 
1794     set_hdrq_regs(rcd->dd, rcd->ctxt, rcd->rcvhdrqentsize,
1795               rcd->rcvhdrq_cnt);
1796 
1797     return 0;
1798 
1799 bail_free:
1800     dd_dev_err(dd,
1801            "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
1802            rcd->ctxt);
1803     dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
1804               rcd->rcvhdrq_dma);
1805     rcd->rcvhdrq = NULL;
1806 bail:
1807     return -ENOMEM;
1808 }
1809 
1810 /**
1811  * hfi1_setup_eagerbufs - allocate eager buffers, both kernel and user
1812  * contexts.
1813  * @rcd: the context we are setting up.
1814  *
1815  * Allocate the eager TID buffers and program them into the chip.
1816  * They are no longer completely contiguous; we do multiple allocation
1817  * calls.  Otherwise we would get the OOM code involved by asking for
1818  * too much per call, with disastrous results on some kernels.
1819  */
1820 int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
1821 {
1822     struct hfi1_devdata *dd = rcd->dd;
1823     u32 max_entries, egrtop, alloced_bytes = 0;
1824     gfp_t gfp_flags;
1825     u16 order, idx = 0;
1826     int ret = 0;
1827     u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);
1828 
1829     /*
1830      * GFP_USER, but without GFP_FS, so the buffer cache can be
1831      * coalesced (we hope); otherwise, even at order 4, heavy
1832      * filesystem activity makes these allocations fail.  Compound
1833      * pages (__GFP_COMP) are used as well.
1834      */
1835     gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
1836 
1837     /*
1838      * The minimum size of the eager buffers is one group of MTU-sized
1839      * buffers.
1840      * The global eager_buffer_size parameter is checked against the
1841      * theoretical lower limit of the value. Here, we check against the
1842      * MTU.
1843      */
1844     if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
1845         rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
1846     /*
1847      * If using one-pkt-per-egr-buffer, lower the eager buffer
1848      * size to the max MTU (page-aligned).
1849      */
1850     if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
1851         rcd->egrbufs.rcvtid_size = round_mtu;
1852 
1853     /*
1854      * Eager buffer sizes of 1MB or less require smaller TID sizes
1855      * to satisfy the "multiple of 8 RcvArray entries" requirement.
1856      */
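         /*
          * For example, a 1MB total with a 16KB round_mtu yields an
          * rcvtid_size of max(16KB, 128KB) = 128KB.
          */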
1857     if (rcd->egrbufs.size <= (1 << 20))
1858         rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
1859             rounddown_pow_of_two(rcd->egrbufs.size / 8));
1860 
1861     while (alloced_bytes < rcd->egrbufs.size &&
1862            rcd->egrbufs.alloced < rcd->egrbufs.count) {
1863         rcd->egrbufs.buffers[idx].addr =
1864             dma_alloc_coherent(&dd->pcidev->dev,
1865                        rcd->egrbufs.rcvtid_size,
1866                        &rcd->egrbufs.buffers[idx].dma,
1867                        gfp_flags);
1868         if (rcd->egrbufs.buffers[idx].addr) {
1869             rcd->egrbufs.buffers[idx].len =
1870                 rcd->egrbufs.rcvtid_size;
1871             rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
1872                 rcd->egrbufs.buffers[idx].addr;
1873             rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].dma =
1874                 rcd->egrbufs.buffers[idx].dma;
1875             rcd->egrbufs.alloced++;
1876             alloced_bytes += rcd->egrbufs.rcvtid_size;
1877             idx++;
1878         } else {
1879             u32 new_size, i, j;
1880             u64 offset = 0;
1881 
1882             /*
1883              * Fail the eager buffer allocation if:
1884              *   - we are already using the lowest acceptable size
1885              *   - we are using one-pkt-per-egr-buffer (this implies
1886              *     that we are accepting only one size)
1887              */
1888             if (rcd->egrbufs.rcvtid_size == round_mtu ||
1889                 !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
1890                 dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
1891                        rcd->ctxt);
1892                 ret = -ENOMEM;
1893                 goto bail_rcvegrbuf_phys;
1894             }
1895 
1896             new_size = rcd->egrbufs.rcvtid_size / 2;
1897 
1898             /*
1899              * If the first attempt to allocate memory failed, don't
1900              * fail everything but continue with the next lower
1901              * size.
1902              */
1903             if (idx == 0) {
1904                 rcd->egrbufs.rcvtid_size = new_size;
1905                 continue;
1906             }
1907 
1908             /*
1909              * Re-partition already allocated buffers to a smaller
1910              * size.
1911              */
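                 /*
                  * Each already-allocated buffer of the old, larger
                  * size is carved into new_size-byte rcvtid entries at
                  * offsets 0, new_size, 2 * new_size, ... until the
                  * buffer's original length is used up.
                  */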
1912             rcd->egrbufs.alloced = 0;
1913             for (i = 0, j = 0, offset = 0; j < idx; i++) {
1914                 if (i >= rcd->egrbufs.count)
1915                     break;
1916                 rcd->egrbufs.rcvtids[i].dma =
1917                     rcd->egrbufs.buffers[j].dma + offset;
1918                 rcd->egrbufs.rcvtids[i].addr =
1919                     rcd->egrbufs.buffers[j].addr + offset;
1920                 rcd->egrbufs.alloced++;
1921                 if ((rcd->egrbufs.buffers[j].dma + offset +
1922                      new_size) ==
1923                     (rcd->egrbufs.buffers[j].dma +
1924                      rcd->egrbufs.buffers[j].len)) {
1925                     j++;
1926                     offset = 0;
1927                 } else {
1928                     offset += new_size;
1929                 }
1930             }
1931             rcd->egrbufs.rcvtid_size = new_size;
1932         }
1933     }
1934     rcd->egrbufs.numbufs = idx;
1935     rcd->egrbufs.size = alloced_bytes;
1936 
1937     hfi1_cdbg(PROC,
1938           "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %uKB\n",
1939           rcd->ctxt, rcd->egrbufs.alloced,
1940           rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024);
1941 
1942     /*
1943      * Set the context's rcv array head update threshold to the closest
1944      * power of 2 (so we can use a mask instead of modulo) below half
1945      * the allocated entries.
1946      */
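         /*
          * For example, 2048 allocated entries give a threshold of
          * 1024, while 1500 entries give 512.
          */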
1947     rcd->egrbufs.threshold =
1948         rounddown_pow_of_two(rcd->egrbufs.alloced / 2);
1949     /*
1950      * Compute the expected RcvArray entry base. This is done after
1951      * allocating the eager buffers in order to maximize the
1952      * expected RcvArray entries for the context.
1953      */
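         /*
          * egrtop is the eager entry count rounded up to a whole
          * RcvArray group; whatever groups remain in the context are
          * available for expected (TID) receive entries.
          */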
1954     max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
1955     egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
1956     rcd->expected_count = max_entries - egrtop;
1957     if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
1958         rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;
1959 
1960     rcd->expected_base = rcd->eager_base + egrtop;
1961     hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n",
1962           rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
1963           rcd->eager_base, rcd->expected_base);
1964 
1965     if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
1966         hfi1_cdbg(PROC,
1967               "ctxt%u: current Eager buffer size is invalid %u\n",
1968               rcd->ctxt, rcd->egrbufs.rcvtid_size);
1969         ret = -EINVAL;
1970         goto bail_rcvegrbuf_phys;
1971     }
1972 
1973     for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
1974         hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
1975                  rcd->egrbufs.rcvtids[idx].dma, order);
1976         cond_resched();
1977     }
1978 
1979     return 0;
1980 
1981 bail_rcvegrbuf_phys:
1982     for (idx = 0; idx < rcd->egrbufs.alloced &&
1983          rcd->egrbufs.buffers[idx].addr;
1984          idx++) {
1985         dma_free_coherent(&dd->pcidev->dev,
1986                   rcd->egrbufs.buffers[idx].len,
1987                   rcd->egrbufs.buffers[idx].addr,
1988                   rcd->egrbufs.buffers[idx].dma);
1989         rcd->egrbufs.buffers[idx].addr = NULL;
1990         rcd->egrbufs.buffers[idx].dma = 0;
1991         rcd->egrbufs.buffers[idx].len = 0;
1992     }
1993 
1994     return ret;
1995 }