#include <linux/err.h>
#include <linux/slab.h>
#include <rdma/ib_verbs.h>

#include "core_priv.h"

#include <trace/events/rdma_core.h>

/* Max size for shared CQ, may require tuning */
#define IB_MAX_SHARED_CQ_SZ		4096U

/* Number of WCs to poll for with a single call to ib_poll_cq() */
#define IB_POLL_BATCH			16
#define IB_POLL_BATCH_DIRECT		8

/* Number of WCs to iterate over before yielding */
#define IB_POLL_BUDGET_IRQ		256
#define IB_POLL_BUDGET_WORKQUEUE	65536

#define IB_POLL_FLAGS \
	(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)

static const struct dim_cq_moder
rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
	{1,   0, 1,  0},
	{1,   0, 4,  0},
	{2,   0, 4,  0},
	{2,   0, 8,  0},
	{4,   0, 8,  0},
	{16,  0, 8,  0},
	{16,  0, 16, 0},
	{32,  0, 16, 0},
	{32,  0, 32, 0},
};

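/*
 * Apply the CQ moderation profile chosen by the RDMA DIM algorithm: program
 * the CQ's completion count and timeout via the driver's modify_cq(), then
 * restart the measurement window.
 */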
static void ib_cq_rdma_dim_work(struct work_struct *w)
{
	struct dim *dim = container_of(w, struct dim, work);
	struct ib_cq *cq = dim->priv;

	u16 usec = rdma_dim_prof[dim->profile_ix].usec;
	u16 comps = rdma_dim_prof[dim->profile_ix].comps;

	dim->state = DIM_START_MEASURE;

	trace_cq_modify(cq, comps, usec);
	cq->device->ops.modify_cq(cq, comps, usec);
}

static void rdma_dim_init(struct ib_cq *cq)
{
	struct dim *dim;

	if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim ||
	    cq->poll_ctx == IB_POLL_DIRECT)
		return;

	dim = kzalloc(sizeof(struct dim), GFP_KERNEL);
	if (!dim)
		return;

	dim->state = DIM_START_MEASURE;
	dim->tune_state = DIM_GOING_RIGHT;
	dim->profile_ix = RDMA_DIM_START_PROFILE;
	dim->priv = cq;
	cq->dim = dim;

	INIT_WORK(&dim->work, ib_cq_rdma_dim_work);
}

static void rdma_dim_destroy(struct ib_cq *cq)
{
	if (!cq->dim)
		return;

	cancel_work_sync(&cq->dim->work);
	kfree(cq->dim);
}

static int __poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
{
	int rc;

	rc = ib_poll_cq(cq, num_entries, wc);
	trace_cq_poll(cq, num_entries, rc);
	return rc;
}

static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
			   int batch)
{
	int i, n, completed = 0;

	trace_cq_process(cq);

	/*
	 * budget may be (-1) when the caller does not want to bound this
	 * call, so the minimum against the remaining budget must be taken
	 * as unsigned.
	 */
	while ((n = __poll_cq(cq, min_t(u32, batch,
					budget - completed), wcs)) > 0) {
		for (i = 0; i < n; i++) {
			struct ib_wc *wc = &wcs[i];

			if (wc->wr_cqe)
				wc->wr_cqe->done(cq, wc);
			else
				WARN_ON_ONCE(wc->status == IB_WC_SUCCESS);
		}

		completed += n;

		if (n != batch || (budget != -1 && completed >= budget))
			break;
	}

	return completed;
}

/**
 * ib_process_cq_direct - process a CQ in caller context
 * @cq:		CQ to process
 * @budget:	number of CQEs to poll for
 *
 * This function is used to process all outstanding CQ entries.
 * It does not offload CQ processing to a separate context and does not ask
 * for completion interrupts from the HCA.
 *
 * Using direct processing on a CQ with a non IB_POLL_DIRECT type may trigger
 * concurrent processing.
 *
 * Note: do not pass -1 as %budget unless it is guaranteed that the number
 * of completions that will be processed is small.
 */
int ib_process_cq_direct(struct ib_cq *cq, int budget)
{
	struct ib_wc wcs[IB_POLL_BATCH_DIRECT];

	return __ib_process_cq(cq, budget, wcs, IB_POLL_BATCH_DIRECT);
}
EXPORT_SYMBOL(ib_process_cq_direct);
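
/*
 * Usage sketch (illustrative only, not taken from an in-tree caller;
 * "my_drain_cq" and MY_BUDGET are hypothetical): a ULP that owns an
 * IB_POLL_DIRECT CQ can drain it from its own context with a bounded
 * budget, looping while completions keep coming back:
 *
 *	#define MY_BUDGET 64
 *
 *	static void my_drain_cq(struct ib_cq *cq)
 *	{
 *		while (ib_process_cq_direct(cq, MY_BUDGET) > 0)
 *			;
 *	}
 */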

static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
{
	WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq);
}

static int ib_poll_handler(struct irq_poll *iop, int budget)
{
	struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
	struct dim *dim = cq->dim;
	int completed;

	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
	if (completed < budget) {
		irq_poll_complete(&cq->iop);
		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
			trace_cq_reschedule(cq);
			irq_poll_sched(&cq->iop);
		}
	}

	if (dim)
		rdma_dim(dim, completed);

	return completed;
}

static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
{
	trace_cq_schedule(cq);
	irq_poll_sched(&cq->iop);
}

static void ib_cq_poll_work(struct work_struct *work)
{
	struct ib_cq *cq = container_of(work, struct ib_cq, work);
	int completed;

	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc,
				    IB_POLL_BATCH);
	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
		queue_work(cq->comp_wq, &cq->work);
	else if (cq->dim)
		rdma_dim(cq->dim, completed);
}

static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
{
	trace_cq_schedule(cq);
	queue_work(cq->comp_wq, &cq->work);
}

/**
 * __ib_alloc_cq - allocate a completion queue
 * @dev:		device to allocate the CQ for
 * @private:		driver private data, accessible from cq->cq_context
 * @nr_cqe:		number of CQEs to allocate
 * @comp_vector:	HCA completion vector for this CQ
 * @poll_ctx:		context to poll the CQ from
 * @caller:		module owner name
 *
 * This is the proper interface to allocate a CQ for in-kernel users. A
 * CQ allocated with this interface will automatically be polled from the
 * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id
 * to use this CQ abstraction.
 */
struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe,
			    int comp_vector, enum ib_poll_context poll_ctx,
			    const char *caller)
{
	struct ib_cq_init_attr cq_attr = {
		.cqe		= nr_cqe,
		.comp_vector	= comp_vector,
	};
	struct ib_cq *cq;
	int ret = -ENOMEM;

	cq = rdma_zalloc_drv_obj(dev, ib_cq);
	if (!cq)
		return ERR_PTR(ret);

	cq->device = dev;
	cq->cq_context = private;
	cq->poll_ctx = poll_ctx;
	atomic_set(&cq->usecnt, 0);
	cq->comp_vector = comp_vector;

	cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
	if (!cq->wc)
		goto out_free_cq;

	rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
	rdma_restrack_set_name(&cq->res, caller);

	ret = dev->ops.create_cq(cq, &cq_attr, NULL);
	if (ret)
		goto out_free_wc;

	rdma_dim_init(cq);

	switch (cq->poll_ctx) {
	case IB_POLL_DIRECT:
		cq->comp_handler = ib_cq_completion_direct;
		break;
	case IB_POLL_SOFTIRQ:
		cq->comp_handler = ib_cq_completion_softirq;

		irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
		break;
	case IB_POLL_WORKQUEUE:
	case IB_POLL_UNBOUND_WORKQUEUE:
		cq->comp_handler = ib_cq_completion_workqueue;
		INIT_WORK(&cq->work, ib_cq_poll_work);
		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
		cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ?
				ib_comp_wq : ib_comp_unbound_wq;
		break;
	default:
		ret = -EINVAL;
		goto out_destroy_cq;
	}

	rdma_restrack_add(&cq->res);
	trace_cq_alloc(cq, nr_cqe, comp_vector, poll_ctx);
	return cq;

out_destroy_cq:
	rdma_dim_destroy(cq);
	cq->device->ops.destroy_cq(cq, NULL);
out_free_wc:
	rdma_restrack_put(&cq->res);
	kfree(cq->wc);
out_free_cq:
	kfree(cq);
	trace_cq_alloc_error(nr_cqe, comp_vector, poll_ctx, ret);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(__ib_alloc_cq);
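
/*
 * Usage sketch (illustrative only; "my_ctx", "my_send_done" and the sizes
 * are hypothetical): in-kernel users normally call this through the
 * ib_alloc_cq() wrapper and receive completions through ib_cqe::done
 * rather than wr_id:
 *
 *	struct my_ctx {
 *		struct ib_cqe cqe;
 *	};
 *
 *	static void my_send_done(struct ib_cq *cq, struct ib_wc *wc)
 *	{
 *		struct my_ctx *ctx = container_of(wc->wr_cqe,
 *						  struct my_ctx, cqe);
 *		...
 *	}
 *
 *	cq = ib_alloc_cq(dev, my_private, 128, 0, IB_POLL_WORKQUEUE);
 *	if (IS_ERR(cq))
 *		return PTR_ERR(cq);
 *	ctx->cqe.done = my_send_done;
 *
 * and the send WR is then posted with wr.wr_cqe = &ctx->cqe.
 */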

/**
 * __ib_alloc_cq_any - allocate a completion queue
 * @dev:		device to allocate the CQ for
 * @private:		driver private data, accessible from cq->cq_context
 * @nr_cqe:		number of CQEs to allocate
 * @poll_ctx:		context to poll the CQ from
 * @caller:		module owner name
 *
 * Attempt to spread ULP Completion Queues over each device's interrupt
 * vectors. A simple best-effort mechanism is used.
 */
struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private,
				int nr_cqe, enum ib_poll_context poll_ctx,
				const char *caller)
{
	static atomic_t counter;
	int comp_vector = 0;

	if (dev->num_comp_vectors > 1)
		comp_vector =
			atomic_inc_return(&counter) %
			min_t(int, dev->num_comp_vectors, num_online_cpus());

	return __ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx,
			     caller);
}
EXPORT_SYMBOL(__ib_alloc_cq_any);
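
/*
 * Usage sketch (illustrative only; "my_private" is hypothetical): ULPs that
 * do not care about completion vector placement can let the core spread CQs
 * for them via the ib_alloc_cq_any() wrapper. With 4 usable completion
 * vectors, successive calls land on vectors 1, 2, 3, 0, 1, ...
 *
 *	cq = ib_alloc_cq_any(dev, my_private, 256, IB_POLL_SOFTIRQ);
 *	if (IS_ERR(cq))
 *		return PTR_ERR(cq);
 */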

/**
 * ib_free_cq - free a completion queue
 * @cq:		completion queue to free.
 */
void ib_free_cq(struct ib_cq *cq)
{
	int ret;

	if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
		return;
	if (WARN_ON_ONCE(cq->cqe_used))
		return;

	switch (cq->poll_ctx) {
	case IB_POLL_DIRECT:
		break;
	case IB_POLL_SOFTIRQ:
		irq_poll_disable(&cq->iop);
		break;
	case IB_POLL_WORKQUEUE:
	case IB_POLL_UNBOUND_WORKQUEUE:
		cancel_work_sync(&cq->work);
		break;
	default:
		WARN_ON_ONCE(1);
	}

	rdma_dim_destroy(cq);
	trace_cq_free(cq);
	ret = cq->device->ops.destroy_cq(cq, NULL);
	WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail");
	rdma_restrack_del(&cq->res);
	kfree(cq->wc);
	kfree(cq);
}
EXPORT_SYMBOL(ib_free_cq);
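
/*
 * Usage sketch (illustrative only; "my_qp" and "my_cq" are hypothetical):
 * callers must make sure no completions can still arrive before freeing a
 * CQ, typically by draining and destroying the QPs attached to it first:
 *
 *	ib_drain_qp(my_qp);
 *	ib_destroy_qp(my_qp);
 *	ib_free_cq(my_cq);
 */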

void ib_cq_pool_cleanup(struct ib_device *dev)
{
	struct ib_cq *cq, *n;
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) {
		list_for_each_entry_safe(cq, n, &dev->cq_pools[i],
					 pool_entry) {
			WARN_ON(cq->cqe_used);
			list_del(&cq->pool_entry);
			cq->shared = false;
			ib_free_cq(cq);
		}
	}
}

static int ib_alloc_cqs(struct ib_device *dev, unsigned int nr_cqes,
			enum ib_poll_context poll_ctx)
{
	LIST_HEAD(tmp_list);
	unsigned int nr_cqs, i;
	struct ib_cq *cq, *n;
	int ret;

	if (poll_ctx > IB_POLL_LAST_POOL_TYPE) {
		WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE);
		return -EINVAL;
	}

	/*
	 * Allocate at least as many CQEs as requested, and otherwise a
	 * reasonable batch size, so that CQs can be shared by multiple
	 * users instead of allocating a larger number of CQs.
	 */
	nr_cqes = min_t(unsigned int, dev->attrs.max_cqe,
			max(nr_cqes, IB_MAX_SHARED_CQ_SZ));
	nr_cqs = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus());
	for (i = 0; i < nr_cqs; i++) {
		cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx);
		if (IS_ERR(cq)) {
			ret = PTR_ERR(cq);
			goto out_free_cqs;
		}
		cq->shared = true;
		list_add_tail(&cq->pool_entry, &tmp_list);
	}

	spin_lock_irq(&dev->cq_pools_lock);
	list_splice(&tmp_list, &dev->cq_pools[poll_ctx]);
	spin_unlock_irq(&dev->cq_pools_lock);

	return 0;

out_free_cqs:
	list_for_each_entry_safe(cq, n, &tmp_list, pool_entry) {
		cq->shared = false;
		ib_free_cq(cq);
	}
	return ret;
}

/**
 * ib_cq_pool_get() - Find a shared completion queue that matches a given
 *   completion vector hint and has room for @nr_cqe entries.
 * @dev: rdma device
 * @nr_cqe: number of needed cqe entries
 * @comp_vector_hint: completion vector hint (-1) for the driver to assign
 *   a comp vector based on an internal counter
 * @poll_ctx: cq polling context
 *
 * Finds a cq that satisfies @comp_vector_hint and @nr_cqe requirements and
 * claims entries in it for the caller. In case there is no available cq,
 * allocate a new cq with the requirements and add it to the device pool.
 * IB_POLL_DIRECT cannot be used for shared cqs so it is not a valid value
 * for @poll_ctx.
 */
struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe,
			     int comp_vector_hint,
			     enum ib_poll_context poll_ctx)
{
	static unsigned int default_comp_vector;
	unsigned int vector, num_comp_vectors;
	struct ib_cq *cq, *found = NULL;
	int ret;

	if (poll_ctx > IB_POLL_LAST_POOL_TYPE) {
		WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE);
		return ERR_PTR(-EINVAL);
	}

	num_comp_vectors =
		min_t(unsigned int, dev->num_comp_vectors, num_online_cpus());
	/* Project the affinity to the device completion vector range */
	if (comp_vector_hint < 0) {
		comp_vector_hint =
			(READ_ONCE(default_comp_vector) + 1) % num_comp_vectors;
		WRITE_ONCE(default_comp_vector, comp_vector_hint);
	}
	vector = comp_vector_hint % num_comp_vectors;

	/*
	 * Find a CQ bound to the requested completion vector with enough
	 * free CQ entries.
	 */
	while (!found) {
		spin_lock_irq(&dev->cq_pools_lock);
		list_for_each_entry(cq, &dev->cq_pools[poll_ctx],
				    pool_entry) {
			/*
			 * Check to see if we have found a CQ with the
			 * correct completion vector and enough room.
			 */
			if (vector != cq->comp_vector)
				continue;
			if (cq->cqe_used + nr_cqe > cq->cqe)
				continue;
			found = cq;
			break;
		}

		if (found) {
			found->cqe_used += nr_cqe;
			spin_unlock_irq(&dev->cq_pools_lock);

			return found;
		}
		spin_unlock_irq(&dev->cq_pools_lock);

		/*
		 * Didn't find a match or ran out of CQs in the device
		 * pool, allocate a new batch of CQs and retry.
		 */
		ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx);
		if (ret)
			return ERR_PTR(ret);
	}

	return found;
}
EXPORT_SYMBOL(ib_cq_pool_get);
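
/*
 * Usage sketch (illustrative only; "my_nr_cqe" and "my_vector" are
 * hypothetical): a ULP claims CQEs from the shared pool for its QP and
 * returns exactly the same number when it is done with the CQ:
 *
 *	cq = ib_cq_pool_get(dev, my_nr_cqe, my_vector, IB_POLL_WORKQUEUE);
 *	if (IS_ERR(cq))
 *		return PTR_ERR(cq);
 *	... attach the QP's send or recv completions to cq ...
 *	ib_cq_pool_put(cq, my_nr_cqe);
 */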

/**
 * ib_cq_pool_put - Return a CQ taken from a shared pool.
 * @cq: The CQ to return.
 * @nr_cqe: The max number of cqes that the user had requested.
 */
void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe)
{
	if (WARN_ON_ONCE(nr_cqe > cq->cqe_used))
		return;

	spin_lock_irq(&cq->device->cq_pools_lock);
	cq->cqe_used -= nr_cqe;
	spin_unlock_irq(&cq->device->cq_pools_lock);
}
EXPORT_SYMBOL(ib_cq_pool_put);