// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2015 HGST, a Western Digital Company.
 */
#include <linux/err.h>
#include <linux/slab.h>
#include <rdma/ib_verbs.h>

#include "core_priv.h"

#include <trace/events/rdma_core.h>
/* Max size for shared CQ, may require tuning */
#define IB_MAX_SHARED_CQ_SZ     4096U

/* # of WCs to poll for with a single call to ib_poll_cq */
#define IB_POLL_BATCH           16
#define IB_POLL_BATCH_DIRECT        8

/* # of WCs to iterate over before yielding */
#define IB_POLL_BUDGET_IRQ      256
#define IB_POLL_BUDGET_WORKQUEUE    65536

#define IB_POLL_FLAGS \
    (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)

static const struct dim_cq_moder
rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
    {1,   0, 1,  0},
    {1,   0, 4,  0},
    {2,   0, 4,  0},
    {2,   0, 8,  0},
    {4,   0, 8,  0},
    {16,  0, 8,  0},
    {16,  0, 16, 0},
    {32,  0, 16, 0},
    {32,  0, 32, 0},
};

static void ib_cq_rdma_dim_work(struct work_struct *w)
{
    struct dim *dim = container_of(w, struct dim, work);
    struct ib_cq *cq = dim->priv;

    u16 usec = rdma_dim_prof[dim->profile_ix].usec;
    u16 comps = rdma_dim_prof[dim->profile_ix].comps;

    dim->state = DIM_START_MEASURE;

    trace_cq_modify(cq, comps, usec);
    cq->device->ops.modify_cq(cq, comps, usec);
}

static void rdma_dim_init(struct ib_cq *cq)
{
    struct dim *dim;

    if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim ||
        cq->poll_ctx == IB_POLL_DIRECT)
        return;

    dim = kzalloc(sizeof(struct dim), GFP_KERNEL);
    if (!dim)
        return;

    dim->state = DIM_START_MEASURE;
    dim->tune_state = DIM_GOING_RIGHT;
    dim->profile_ix = RDMA_DIM_START_PROFILE;
    dim->priv = cq;
    cq->dim = dim;

    INIT_WORK(&dim->work, ib_cq_rdma_dim_work);
}

static void rdma_dim_destroy(struct ib_cq *cq)
{
    if (!cq->dim)
        return;

    cancel_work_sync(&cq->dim->work);
    kfree(cq->dim);
}

static int __poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
{
    int rc;

    rc = ib_poll_cq(cq, num_entries, wc);
    trace_cq_poll(cq, num_entries, rc);
    return rc;
}

static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
               int batch)
{
    int i, n, completed = 0;

    trace_cq_process(cq);

    /*
     * budget might be (-1) if the caller does not
     * want to bound this call, thus we need unsigned
     * minimum here.
     */
    while ((n = __poll_cq(cq, min_t(u32, batch,
                    budget - completed), wcs)) > 0) {
        for (i = 0; i < n; i++) {
            struct ib_wc *wc = &wcs[i];

            if (wc->wr_cqe)
                wc->wr_cqe->done(cq, wc);
            else
                WARN_ON_ONCE(wc->status == IB_WC_SUCCESS);
        }

        completed += n;

        if (n != batch || (budget != -1 && completed >= budget))
            break;
    }

    return completed;
}

/**
 * ib_process_cq_direct - process a CQ in caller context
 * @cq:     CQ to process
 * @budget: number of CQEs to poll for
 *
 * This function is used to process all outstanding CQ entries.
 * It does not offload CQ processing to a different context and does
 * not ask for completion interrupts from the HCA.
 * Using direct processing on a CQ whose poll context is not IB_POLL_DIRECT
 * may trigger concurrent processing.
 *
 * Note: do not pass -1 as %budget unless it is guaranteed that the number
 * of completions that will be processed is small.
 */
int ib_process_cq_direct(struct ib_cq *cq, int budget)
{
    struct ib_wc wcs[IB_POLL_BATCH_DIRECT];

    return __ib_process_cq(cq, budget, wcs, IB_POLL_BATCH_DIRECT);
}
EXPORT_SYMBOL(ib_process_cq_direct);
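
/*
 * Example (illustrative sketch, not part of the upstream file): one way a ULP
 * might reap completions from an IB_POLL_DIRECT CQ in its own context.  The
 * names my_req and my_done are hypothetical and error handling is trimmed.
 *
 *     struct my_req {
 *         struct ib_cqe cqe;          // embeds the completion callback
 *         struct completion done;
 *     };
 *
 *     static void my_done(struct ib_cq *cq, struct ib_wc *wc)
 *     {
 *         struct my_req *req = container_of(wc->wr_cqe, struct my_req, cqe);
 *
 *         if (wc->status != IB_WC_SUCCESS)
 *             pr_err("request failed: %s\n", ib_wc_status_msg(wc->status));
 *         complete(&req->done);
 *     }
 *
 *     // After posting a WR with wr.wr_cqe = &req->cqe and req->cqe.done =
 *     // my_done, poll the CQ directly with a bounded budget until it is done:
 *     while (!try_wait_for_completion(&req->done))
 *         ib_process_cq_direct(cq, IB_POLL_BATCH_DIRECT);
 */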

static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
{
    WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq);
}

static int ib_poll_handler(struct irq_poll *iop, int budget)
{
    struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
    struct dim *dim = cq->dim;
    int completed;

    completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
    if (completed < budget) {
        irq_poll_complete(&cq->iop);
        if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
            trace_cq_reschedule(cq);
            irq_poll_sched(&cq->iop);
        }
    }

    if (dim)
        rdma_dim(dim, completed);

    return completed;
}

static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
{
    trace_cq_schedule(cq);
    irq_poll_sched(&cq->iop);
}

static void ib_cq_poll_work(struct work_struct *work)
{
    struct ib_cq *cq = container_of(work, struct ib_cq, work);
    int completed;

    completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc,
                    IB_POLL_BATCH);
    if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
        ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
        queue_work(cq->comp_wq, &cq->work);
    else if (cq->dim)
        rdma_dim(cq->dim, completed);
}

static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
{
    trace_cq_schedule(cq);
    queue_work(cq->comp_wq, &cq->work);
}

/**
 * __ib_alloc_cq - allocate a completion queue
 * @dev:        device to allocate the CQ for
 * @private:        driver private data, accessible from cq->cq_context
 * @nr_cqe:     number of CQEs to allocate
 * @comp_vector:    HCA completion vectors for this CQ
 * @poll_ctx:       context to poll the CQ from.
 * @caller:     module owner name.
 *
 * This is the proper interface to allocate a CQ for in-kernel users. A
 * CQ allocated with this interface will automatically be polled from the
 * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id
 * to use this CQ abstraction.
 */
struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe,
                int comp_vector, enum ib_poll_context poll_ctx,
                const char *caller)
{
    struct ib_cq_init_attr cq_attr = {
        .cqe        = nr_cqe,
        .comp_vector    = comp_vector,
    };
    struct ib_cq *cq;
    int ret = -ENOMEM;

    cq = rdma_zalloc_drv_obj(dev, ib_cq);
    if (!cq)
        return ERR_PTR(ret);

    cq->device = dev;
    cq->cq_context = private;
    cq->poll_ctx = poll_ctx;
    atomic_set(&cq->usecnt, 0);
    cq->comp_vector = comp_vector;

    cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
    if (!cq->wc)
        goto out_free_cq;

    rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
    rdma_restrack_set_name(&cq->res, caller);

    ret = dev->ops.create_cq(cq, &cq_attr, NULL);
    if (ret)
        goto out_free_wc;

    rdma_dim_init(cq);

    switch (cq->poll_ctx) {
    case IB_POLL_DIRECT:
        cq->comp_handler = ib_cq_completion_direct;
        break;
    case IB_POLL_SOFTIRQ:
        cq->comp_handler = ib_cq_completion_softirq;

        irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
        ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
        break;
    case IB_POLL_WORKQUEUE:
    case IB_POLL_UNBOUND_WORKQUEUE:
        cq->comp_handler = ib_cq_completion_workqueue;
        INIT_WORK(&cq->work, ib_cq_poll_work);
        ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
        cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ?
                ib_comp_wq : ib_comp_unbound_wq;
        break;
    default:
        ret = -EINVAL;
        goto out_destroy_cq;
    }

    rdma_restrack_add(&cq->res);
    trace_cq_alloc(cq, nr_cqe, comp_vector, poll_ctx);
    return cq;

out_destroy_cq:
    rdma_dim_destroy(cq);
    cq->device->ops.destroy_cq(cq, NULL);
out_free_wc:
    rdma_restrack_put(&cq->res);
    kfree(cq->wc);
out_free_cq:
    kfree(cq);
    trace_cq_alloc_error(nr_cqe, comp_vector, poll_ctx, ret);
    return ERR_PTR(ret);
}
EXPORT_SYMBOL(__ib_alloc_cq);
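
/*
 * Example (illustrative sketch, not part of the upstream file): in-kernel
 * users normally call the ib_alloc_cq() helper from <rdma/ib_verbs.h>, which
 * forwards to __ib_alloc_cq() and fills in @caller with KBUILD_MODNAME.  A
 * rough usage pattern, with my_ctx, req and my_send_done as hypothetical
 * names:
 *
 *     cq = ib_alloc_cq(dev, my_ctx, 256, 0, IB_POLL_SOFTIRQ);
 *     if (IS_ERR(cq))
 *         return PTR_ERR(cq);
 *
 *     // Every work request must carry an ib_cqe instead of a wr_id:
 *     req->cqe.done = my_send_done;
 *     wr.wr_cqe = &req->cqe;
 *     ret = ib_post_send(qp, &wr, NULL);
 *
 *     // When no QP references the CQ any more, release it with:
 *     ib_free_cq(cq);
 */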

/**
 * __ib_alloc_cq_any - allocate a completion queue
 * @dev:        device to allocate the CQ for
 * @private:        driver private data, accessible from cq->cq_context
 * @nr_cqe:     number of CQEs to allocate
 * @poll_ctx:       context to poll the CQ from
 * @caller:     module owner name
 *
 * Attempt to spread ULP Completion Queues over each device's interrupt
 * vectors. A simple best-effort mechanism is used.
 */
struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private,
                int nr_cqe, enum ib_poll_context poll_ctx,
                const char *caller)
{
    static atomic_t counter;
    int comp_vector = 0;

    if (dev->num_comp_vectors > 1)
        comp_vector =
            atomic_inc_return(&counter) %
            min_t(int, dev->num_comp_vectors, num_online_cpus());

    return __ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx,
                 caller);
}
EXPORT_SYMBOL(__ib_alloc_cq_any);
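
/*
 * Example (illustrative sketch, not part of the upstream file): callers with
 * no affinity preference can let the core spread CQs over vectors via the
 * ib_alloc_cq_any() helper from <rdma/ib_verbs.h>:
 *
 *     cq = ib_alloc_cq_any(dev, NULL, 128, IB_POLL_WORKQUEUE);
 *     if (IS_ERR(cq))
 *         return PTR_ERR(cq);
 */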

/**
 * ib_free_cq - free a completion queue
 * @cq:     completion queue to free.
 */
void ib_free_cq(struct ib_cq *cq)
{
    int ret;

    if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
        return;
    if (WARN_ON_ONCE(cq->cqe_used))
        return;

    switch (cq->poll_ctx) {
    case IB_POLL_DIRECT:
        break;
    case IB_POLL_SOFTIRQ:
        irq_poll_disable(&cq->iop);
        break;
    case IB_POLL_WORKQUEUE:
    case IB_POLL_UNBOUND_WORKQUEUE:
        cancel_work_sync(&cq->work);
        break;
    default:
        WARN_ON_ONCE(1);
    }

    rdma_dim_destroy(cq);
    trace_cq_free(cq);
    ret = cq->device->ops.destroy_cq(cq, NULL);
    WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail");
    rdma_restrack_del(&cq->res);
    kfree(cq->wc);
    kfree(cq);
}
EXPORT_SYMBOL(ib_free_cq);

void ib_cq_pool_cleanup(struct ib_device *dev)
{
    struct ib_cq *cq, *n;
    unsigned int i;

    for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) {
        list_for_each_entry_safe(cq, n, &dev->cq_pools[i],
                     pool_entry) {
            WARN_ON(cq->cqe_used);
            list_del(&cq->pool_entry);
            cq->shared = false;
            ib_free_cq(cq);
        }
    }
}

static int ib_alloc_cqs(struct ib_device *dev, unsigned int nr_cqes,
            enum ib_poll_context poll_ctx)
{
    LIST_HEAD(tmp_list);
    unsigned int nr_cqs, i;
    struct ib_cq *cq, *n;
    int ret;

    if (poll_ctx > IB_POLL_LAST_POOL_TYPE) {
        WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE);
        return -EINVAL;
    }

    /*
     * Allocate at least as many CQEs as requested, and otherwise
     * a reasonable batch size so that we can share CQs between
     * multiple users instead of allocating a larger number of CQs.
     */
    nr_cqes = min_t(unsigned int, dev->attrs.max_cqe,
            max(nr_cqes, IB_MAX_SHARED_CQ_SZ));
    nr_cqs = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus());
    for (i = 0; i < nr_cqs; i++) {
        cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx);
        if (IS_ERR(cq)) {
            ret = PTR_ERR(cq);
            goto out_free_cqs;
        }
        cq->shared = true;
        list_add_tail(&cq->pool_entry, &tmp_list);
    }

    spin_lock_irq(&dev->cq_pools_lock);
    list_splice(&tmp_list, &dev->cq_pools[poll_ctx]);
    spin_unlock_irq(&dev->cq_pools_lock);

    return 0;

out_free_cqs:
    list_for_each_entry_safe(cq, n, &tmp_list, pool_entry) {
        cq->shared = false;
        ib_free_cq(cq);
    }
    return ret;
}

/**
 * ib_cq_pool_get() - Find the least used completion queue that matches
 *   a given cpu hint (or least used for wild card affinity) and fits
 *   nr_cqe.
 * @dev: rdma device
 * @nr_cqe: number of needed cqe entries
 * @comp_vector_hint: completion vector hint (-1) for the driver to assign
 *   a comp vector based on internal counter
 * @poll_ctx: cq polling context
 *
 * Finds a cq that satisfies the @comp_vector_hint and @nr_cqe requirements
 * and claims entries in it for us.  If no suitable cq is available, allocates
 * a new cq with the requirements and adds it to the device pool.
 * IB_POLL_DIRECT cannot be used for shared cqs so it is not a valid value
 * for @poll_ctx.
 */
struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe,
                 int comp_vector_hint,
                 enum ib_poll_context poll_ctx)
{
    static unsigned int default_comp_vector;
    unsigned int vector, num_comp_vectors;
    struct ib_cq *cq, *found = NULL;
    int ret;

    if (poll_ctx > IB_POLL_LAST_POOL_TYPE) {
        WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE);
        return ERR_PTR(-EINVAL);
    }

    num_comp_vectors =
        min_t(unsigned int, dev->num_comp_vectors, num_online_cpus());
    /* Project the affinity to the device completion vector range */
    if (comp_vector_hint < 0) {
        comp_vector_hint =
            (READ_ONCE(default_comp_vector) + 1) % num_comp_vectors;
        WRITE_ONCE(default_comp_vector, comp_vector_hint);
    }
    vector = comp_vector_hint % num_comp_vectors;

    /*
     * Find the least used CQ with correct affinity and
     * enough free CQ entries
     */
    while (!found) {
        spin_lock_irq(&dev->cq_pools_lock);
        list_for_each_entry(cq, &dev->cq_pools[poll_ctx],
                    pool_entry) {
            /*
             * Check to see if we have found a CQ with the
             * correct completion vector
             */
            if (vector != cq->comp_vector)
                continue;
            if (cq->cqe_used + nr_cqe > cq->cqe)
                continue;
            found = cq;
            break;
        }

        if (found) {
            found->cqe_used += nr_cqe;
            spin_unlock_irq(&dev->cq_pools_lock);

            return found;
        }
        spin_unlock_irq(&dev->cq_pools_lock);

        /*
         * Didn't find a match or ran out of CQs in the device
         * pool, allocate a new array of CQs.
         */
        ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx);
        if (ret)
            return ERR_PTR(ret);
    }

    return found;
}
EXPORT_SYMBOL(ib_cq_pool_get);
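
/*
 * Example (illustrative sketch, not part of the upstream file): a ULP that
 * needs a CQ for a short-lived QP can borrow one from the shared pool and
 * later return the entries it reserved.  send_wr and recv_wr stand in for
 * whatever CQE budget the caller actually needs:
 *
 *     cq = ib_cq_pool_get(dev, send_wr + recv_wr, -1, IB_POLL_WORKQUEUE);
 *     if (IS_ERR(cq))
 *         return PTR_ERR(cq);
 *
 *     // ... attach the CQ to a QP and reap completions through wr_cqe ...
 *
 *     ib_cq_pool_put(cq, send_wr + recv_wr);
 */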

/**
 * ib_cq_pool_put - Return a CQ taken from a shared pool.
 * @cq: The CQ to return.
 * @nr_cqe: The max number of cqes that the user had requested.
 */
void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe)
{
    if (WARN_ON_ONCE(nr_cqe > cq->cqe_used))
        return;

    spin_lock_irq(&cq->device->cq_pools_lock);
    cq->cqe_used -= nr_cqe;
    spin_unlock_irq(&cq->device->cq_pools_lock);
}
EXPORT_SYMBOL(ib_cq_pool_put);