#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <rdma/rdma_cm.h>

#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"

static struct kmem_cache *rds_ib_incoming_slab;
static struct kmem_cache *rds_ib_frag_slab;
static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
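
/*
 * Initialize the constant parts of every receive work request at connection
 * setup time.  The first SGE of each WR always points at that slot's
 * rds_header DMA address; the second SGE covers one RDS_FRAG_SIZE data
 * fragment whose address is filled in when the ring entry is refilled.
 */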
void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
{
        struct rds_ib_recv_work *recv;
        u32 i;

        for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
                struct ib_sge *sge;

                recv->r_ibinc = NULL;
                recv->r_frag = NULL;

                recv->r_wr.next = NULL;
                recv->r_wr.wr_id = i;
                recv->r_wr.sg_list = recv->r_sge;
                recv->r_wr.num_sge = RDS_IB_RECV_SGE;

                sge = &recv->r_sge[0];
                sge->addr = ic->i_recv_hdrs_dma[i];
                sge->length = sizeof(struct rds_header);
                sge->lkey = ic->i_pd->local_dma_lkey;

                sge = &recv->r_sge[1];
                sge->addr = 0;
                sge->length = RDS_FRAG_SIZE;
                sge->lkey = ic->i_pd->local_dma_lkey;
        }
}
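
/*
 * Splice an entire list, passed by its first entry rather than by a list
 * head, onto the tail of 'to'.  The cached free lists below are chained
 * through their entries with no dedicated head, so the last entry has to
 * be re-added explicitly after the splice.
 */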
static void list_splice_entire_tail(struct list_head *from,
                                    struct list_head *to)
{
        struct list_head *from_last = from->prev;

        list_splice_tail(from_last, to);
        list_add_tail(from_last, to);
}

static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
{
        struct list_head *tmp;

        tmp = xchg(&cache->xfer, NULL);
        if (tmp) {
                if (cache->ready)
                        list_splice_entire_tail(tmp, cache->ready);
                else
                        cache->ready = tmp;
        }
}

static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache, gfp_t gfp)
{
        struct rds_ib_cache_head *head;
        int cpu;

        cache->percpu = alloc_percpu_gfp(struct rds_ib_cache_head, gfp);
        if (!cache->percpu)
                return -ENOMEM;

        for_each_possible_cpu(cpu) {
                head = per_cpu_ptr(cache->percpu, cpu);
                head->first = NULL;
                head->count = 0;
        }
        cache->xfer = NULL;
        cache->ready = NULL;

        return 0;
}

int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic, gfp_t gfp)
{
        int ret;

        ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs, gfp);
        if (!ret) {
                ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags, gfp);
                if (ret)
                        free_percpu(ic->i_cache_incs.percpu);
        }

        return ret;
}

static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
                                          struct list_head *caller_list)
{
        struct rds_ib_cache_head *head;
        int cpu;

        for_each_possible_cpu(cpu) {
                head = per_cpu_ptr(cache->percpu, cpu);
                if (head->first) {
                        list_splice_entire_tail(head->first, caller_list);
                        head->first = NULL;
                }
        }

        if (cache->ready) {
                list_splice_entire_tail(cache->ready, caller_list);
                cache->ready = NULL;
        }
}

void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
{
        struct rds_ib_incoming *inc;
        struct rds_ib_incoming *inc_tmp;
        struct rds_page_frag *frag;
        struct rds_page_frag *frag_tmp;
        LIST_HEAD(list);

        rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
        rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
        free_percpu(ic->i_cache_incs.percpu);

        list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
                list_del(&inc->ii_cache_entry);
                WARN_ON(!list_empty(&inc->ii_frags));
                kmem_cache_free(rds_ib_incoming_slab, inc);
                atomic_dec(&rds_ib_allocation);
        }

        rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
        rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
        free_percpu(ic->i_cache_frags.percpu);

        list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
                list_del(&frag->f_cache_entry);
                WARN_ON(!list_empty(&frag->f_item));
                kmem_cache_free(rds_ib_frag_slab, frag);
        }
}

static void rds_ib_recv_cache_put(struct list_head *new_item,
                                  struct rds_ib_refill_cache *cache);
static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
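
/*
 * Return a receive fragment to the per-connection cache instead of freeing
 * the page, so a later refill can reuse it without a fresh allocation.
 */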
static void rds_ib_frag_free(struct rds_ib_connection *ic,
                             struct rds_page_frag *frag)
{
        rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));

        rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
        atomic_add(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
        rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE);
}
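
/*
 * Free an incoming message: put each fragment back in the fragment cache
 * and then return the rds_ib_incoming itself to the inc cache.
 */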
void rds_ib_inc_free(struct rds_incoming *inc)
{
        struct rds_ib_incoming *ibinc;
        struct rds_page_frag *frag;
        struct rds_page_frag *pos;
        struct rds_ib_connection *ic = inc->i_conn->c_transport_data;

        ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);

        list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
                list_del_init(&frag->f_item);
                rds_ib_frag_free(ic, frag);
        }
        BUG_ON(!list_empty(&ibinc->ii_frags));

        rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
        rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
}

static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
                                  struct rds_ib_recv_work *recv)
{
        if (recv->r_ibinc) {
                rds_inc_put(&recv->r_ibinc->ii_inc);
                recv->r_ibinc = NULL;
        }
        if (recv->r_frag) {
                ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1,
                                DMA_FROM_DEVICE);
                rds_ib_frag_free(ic, recv->r_frag);
                recv->r_frag = NULL;
        }
}

void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
{
        u32 i;

        for (i = 0; i < ic->i_recv_ring.w_nr; i++)
                rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
}
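
/*
 * Get an rds_ib_incoming, preferring the connection's cache and falling
 * back to the slab.  Slab allocations are bounded by
 * rds_ib_sysctl_max_recv_allocation via the global rds_ib_allocation count.
 */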
static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
                                                     gfp_t slab_mask)
{
        struct rds_ib_incoming *ibinc;
        struct list_head *cache_item;
        int avail_allocs;

        cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
        if (cache_item) {
                ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
        } else {
                avail_allocs = atomic_add_unless(&rds_ib_allocation,
                                                 1, rds_ib_sysctl_max_recv_allocation);
                if (!avail_allocs) {
                        rds_ib_stats_inc(s_ib_rx_alloc_limit);
                        return NULL;
                }
                ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
                if (!ibinc) {
                        atomic_dec(&rds_ib_allocation);
                        return NULL;
                }
                rds_ib_stats_inc(s_ib_rx_total_incs);
        }
        INIT_LIST_HEAD(&ibinc->ii_frags);
        rds_inc_init(&ibinc->ii_inc, ic->conn, &ic->conn->c_faddr);

        return ibinc;
}

static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
                                                    gfp_t slab_mask, gfp_t page_mask)
{
        struct rds_page_frag *frag;
        struct list_head *cache_item;
        int ret;

        cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
        if (cache_item) {
                frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
                atomic_sub(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
                /* A fragment leaves the cache here, so count it as removed. */
                rds_ib_stats_add(s_ib_recv_removed_from_cache, RDS_FRAG_SIZE);
        } else {
                frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
                if (!frag)
                        return NULL;

                sg_init_table(&frag->f_sg, 1);
                ret = rds_page_remainder_alloc(&frag->f_sg,
                                               RDS_FRAG_SIZE, page_mask);
                if (ret) {
                        kmem_cache_free(rds_ib_frag_slab, frag);
                        return NULL;
                }
                rds_ib_stats_inc(s_ib_rx_total_frags);
        }

        INIT_LIST_HEAD(&frag->f_item);

        return frag;
}
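
/*
 * Refill a single ring entry: make sure an incoming message struct and a
 * data fragment are attached, DMA-map the fragment, and point the data SGE
 * at it.  The header SGE was set up in rds_ib_recv_init_ring().
 */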
static int rds_ib_recv_refill_one(struct rds_connection *conn,
                                  struct rds_ib_recv_work *recv, gfp_t gfp)
{
        struct rds_ib_connection *ic = conn->c_transport_data;
        struct ib_sge *sge;
        int ret = -ENOMEM;
        gfp_t slab_mask = gfp;
        gfp_t page_mask = gfp;

        if (gfp & __GFP_DIRECT_RECLAIM) {
                slab_mask = GFP_KERNEL;
                page_mask = GFP_HIGHUSER;
        }

        if (!ic->i_cache_incs.ready)
                rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
        if (!ic->i_cache_frags.ready)
                rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);

        /* The inc is only taken from a recv when that recv starts a new
         * message, so a slot may still hold an unused one from before. */
        if (!recv->r_ibinc) {
                recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
                if (!recv->r_ibinc)
                        goto out;
        }

        WARN_ON(recv->r_frag);
        recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
        if (!recv->r_frag)
                goto out;

        ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
                            1, DMA_FROM_DEVICE);
        WARN_ON(ret != 1);

        sge = &recv->r_sge[0];
        sge->addr = ic->i_recv_hdrs_dma[recv - ic->i_recvs];
        sge->length = sizeof(struct rds_header);

        sge = &recv->r_sge[1];
        sge->addr = sg_dma_address(&recv->r_frag->f_sg);
        sge->length = sg_dma_len(&recv->r_frag->f_sg);

        ret = 0;
out:
        return ret;
}

static int acquire_refill(struct rds_connection *conn)
{
        return test_and_set_bit(RDS_RECV_REFILL, &conn->c_flags) == 0;
}

static void release_refill(struct rds_connection *conn)
{
        clear_bit(RDS_RECV_REFILL, &conn->c_flags);
        smp_mb__after_atomic();

        /* Wake anyone waiting for the refill bit to clear (e.g. connection
         * shutdown).  waitqueue_active() keeps the common path cheap when
         * nobody is waiting. */
        if (waitqueue_active(&conn->c_waitq))
                wake_up_all(&conn->c_waitq);
}
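
/*
 * Allocate and post receive work requests until the ring is full, the
 * connection drops, or allocation fails.  'prefill' is set when posting
 * initial receives before the connection is up; gfp decides whether the
 * caller may sleep for allocations.
 */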
void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
{
        struct rds_ib_connection *ic = conn->c_transport_data;
        struct rds_ib_recv_work *recv;
        unsigned int posted = 0;
        int ret = 0;
        bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM);
        bool must_wake = false;
        u32 pos;

        /* The RDS_RECV_REFILL flag keeps concurrent refills (worker thread
         * and completion path) from racing with each other. */
        if (!acquire_refill(conn))
                return;

        while ((prefill || rds_conn_up(conn)) &&
               rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
                if (pos >= ic->i_recv_ring.w_nr) {
                        printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
                               pos);
                        break;
                }

                recv = &ic->i_recvs[pos];
                ret = rds_ib_recv_refill_one(conn, recv, gfp);
                if (ret) {
                        must_wake = true;
                        break;
                }

                rdsdebug("recv %p ibinc %p page %p addr %lu\n", recv,
                         recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
                         (long)sg_dma_address(&recv->r_frag->f_sg));

                ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, NULL);
                if (ret) {
                        rds_ib_conn_error(conn, "recv post on "
                               "%pI6c returned %d, disconnecting and "
                               "reconnecting\n", &conn->c_faddr,
                               ret);
                        break;
                }

                posted++;

                /* Don't hog the CPU: back off after a large batch and let
                 * the worker thread finish the refill. */
                if ((posted > 128 && need_resched()) || posted > 8192) {
                        must_wake = true;
                        break;
                }
        }

        /* We're doing flow control - update the window. */
        if (ic->i_flowctl && posted)
                rds_ib_advertise_credits(conn, posted);

        if (ret)
                rds_ib_ring_unalloc(&ic->i_recv_ring, 1);

        release_refill(conn);

        /* If the ring is running low or empty (which risks RNR timeouts on
         * the sender), or we had to bail out early, have the worker thread
         * finish the refill. */
        if (rds_conn_up(conn) &&
            (must_wake ||
            (can_wait && rds_ib_ring_low(&ic->i_recv_ring)) ||
            rds_ib_ring_empty(&ic->i_recv_ring))) {
                queue_delayed_work(rds_wq, &conn->c_recv_w, 1);
        }
        if (can_wait)
                cond_resched();
}
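
/*
 * Receive allocations (incs and frags) are recycled through per-connection
 * caches.  Freed items first go on a per-cpu list; once that list reaches
 * RDS_IB_RECYCLE_BATCH_COUNT entries it is pushed onto the lockless 'xfer'
 * pointer with xchg/cmpxchg, and the refill path later moves it to 'ready'.
 * The lists are headless: they are anchored by a bare pointer that is NULL
 * when the list is empty.
 */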
static void rds_ib_recv_cache_put(struct list_head *new_item,
                                  struct rds_ib_refill_cache *cache)
{
        unsigned long flags;
        struct list_head *old, *chpfirst;

        local_irq_save(flags);

        chpfirst = __this_cpu_read(cache->percpu->first);
        if (!chpfirst)
                INIT_LIST_HEAD(new_item);
        else
                list_add_tail(new_item, chpfirst);

        __this_cpu_write(cache->percpu->first, new_item);
        __this_cpu_inc(cache->percpu->count);

        if (__this_cpu_read(cache->percpu->count) < RDS_IB_RECYCLE_BATCH_COUNT)
                goto end;

        /*
         * The batch is full: hand the whole per-cpu list over to cache->xfer.
         * If another CPU already parked a list there, pull it back, splice it
         * onto ours, and retry until the cmpxchg installs our list.
         */
        do {
                old = xchg(&cache->xfer, NULL);
                if (old)
                        list_splice_entire_tail(old, chpfirst);
                old = cmpxchg(&cache->xfer, NULL, chpfirst);
        } while (old);

        __this_cpu_write(cache->percpu->first, NULL);
        __this_cpu_write(cache->percpu->count, 0);
end:
        local_irq_restore(flags);
}

static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
{
        struct list_head *head = cache->ready;

        if (head) {
                if (!list_empty(head)) {
                        cache->ready = head->next;
                        list_del_init(head);
                } else
                        cache->ready = NULL;
        }

        return head;
}

int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
{
        struct rds_ib_incoming *ibinc;
        struct rds_page_frag *frag;
        unsigned long to_copy;
        unsigned long frag_off = 0;
        int copied = 0;
        int ret;
        u32 len;

        ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
        frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
        len = be32_to_cpu(inc->i_hdr.h_len);

        while (iov_iter_count(to) && copied < len) {
                if (frag_off == RDS_FRAG_SIZE) {
                        frag = list_entry(frag->f_item.next,
                                          struct rds_page_frag, f_item);
                        frag_off = 0;
                }
                to_copy = min_t(unsigned long, iov_iter_count(to),
                                RDS_FRAG_SIZE - frag_off);
                to_copy = min_t(unsigned long, to_copy, len - copied);

                rds_stats_add(s_copy_to_user, to_copy);
                ret = copy_page_to_iter(sg_page(&frag->f_sg),
                                        frag->f_sg.offset + frag_off,
                                        to_copy,
                                        to);
                if (ret != to_copy)
                        return -EFAULT;

                frag_off += to_copy;
                copied += to_copy;
        }

        return copied;
}
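
/*
 * Set up the send work request used for explicit ACKs.  The single SGE
 * points at the connection's preallocated ACK header buffer.
 */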
void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
{
        struct ib_send_wr *wr = &ic->i_ack_wr;
        struct ib_sge *sge = &ic->i_ack_sge;

        sge->addr = ic->i_ack_dma;
        sge->length = sizeof(struct rds_header);
        sge->lkey = ic->i_pd->local_dma_lkey;

        wr->sg_list = sge;
        wr->num_sge = 1;
        wr->opcode = IB_WR_SEND;
        wr->wr_id = RDS_IB_ACK_WR_ID;
        wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
}
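
/*
 * The next sequence number to acknowledge is kept in ic->i_ack_next together
 * with the IB_ACK_REQUESTED flag.  On architectures without atomic64
 * operations a spinlock protects i_ack_next; otherwise it is an atomic64_t,
 * with memory barriers ordering it against the flag updates.
 */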
#ifndef KERNEL_HAS_ATOMIC64
void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required)
{
        unsigned long flags;

        spin_lock_irqsave(&ic->i_ack_lock, flags);
        ic->i_ack_next = seq;
        if (ack_required)
                set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
        spin_unlock_irqrestore(&ic->i_ack_lock, flags);
}

static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
{
        unsigned long flags;
        u64 seq;

        clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);

        spin_lock_irqsave(&ic->i_ack_lock, flags);
        seq = ic->i_ack_next;
        spin_unlock_irqrestore(&ic->i_ack_lock, flags);

        return seq;
}
#else
void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required)
{
        atomic64_set(&ic->i_ack_next, seq);
        if (ack_required) {
                smp_mb__before_atomic();
                set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
        }
}

static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
{
        clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
        smp_mb__after_atomic();

        return atomic64_read(&ic->i_ack_next);
}
#endif
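
/*
 * Build an ACK-only header in the preallocated ACK buffer and post it with
 * the dedicated ACK work request, piggybacking any credits we can advertise.
 */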
static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits)
{
        struct rds_header *hdr = ic->i_ack;
        u64 seq;
        int ret;

        seq = rds_ib_get_ack(ic);

        rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);

        ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev, ic->i_ack_dma,
                                   sizeof(*hdr), DMA_TO_DEVICE);
        rds_message_populate_header(hdr, 0, 0, 0);
        hdr->h_ack = cpu_to_be64(seq);
        hdr->h_credit = adv_credits;
        rds_message_make_checksum(hdr);
        ib_dma_sync_single_for_device(ic->rds_ibdev->dev, ic->i_ack_dma,
                                      sizeof(*hdr), DMA_TO_DEVICE);

        ic->i_ack_queued = jiffies;

        ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, NULL);
        if (unlikely(ret)) {
                /* Failed to send: release the in-flight slot and force
                 * another ACK attempt. */
                clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
                set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);

                rds_ib_stats_inc(s_ib_ack_send_failure);

                rds_ib_conn_error(ic->conn, "sending ack failed\n");
        } else
                rds_ib_stats_inc(s_ib_ack_sent);
}
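
/*
 * Attempt to send an ACK if one was requested.  Only one ACK may be in
 * flight at a time (IB_ACK_IN_FLIGHT); if one already is, the request stays
 * pending and is retried when the in-flight ACK completes.  Sending also
 * requires a send credit when flow control is active.
 */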
void rds_ib_attempt_ack(struct rds_ib_connection *ic)
{
        unsigned int adv_credits;

        if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
                return;

        if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
                rds_ib_stats_inc(s_ib_ack_send_delayed);
                return;
        }

        /* Can we get a send credit? */
        if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
                rds_ib_stats_inc(s_ib_tx_throttle);
                clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
                return;
        }

        clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
        rds_ib_send_ack(ic, adv_credits);
}

/*
 * Called from the send completion handler when the adapter reports that
 * the ACK frame has been sent.
 */
void rds_ib_ack_send_complete(struct rds_ib_connection *ic)
{
        clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
        rds_ib_attempt_ack(ic);
}

/*
 * Called by the transmit code when it wants to piggyback an ACK on an
 * outgoing data frame.
 */
u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
{
        if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
                rds_ib_stats_inc(s_ib_ack_send_piggybacked);
        return rds_ib_get_ack(ic);
}
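
/*
 * A congestion-map update arrives as a normal RDS message whose payload is
 * the full bitmap.  Copy it fragment by fragment into the connection's
 * congestion map pages and note which ports became uncongested so waiters
 * can be woken.
 */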
static void rds_ib_cong_recv(struct rds_connection *conn,
                             struct rds_ib_incoming *ibinc)
{
        struct rds_cong_map *map;
        unsigned int map_off;
        unsigned int map_page;
        struct rds_page_frag *frag;
        unsigned long frag_off;
        unsigned long to_copy;
        unsigned long copied;
        __le64 uncongested = 0;
        void *addr;

        /* catch completely corrupt packets */
        if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
                return;

        map = conn->c_fcong;
        map_page = 0;
        map_off = 0;

        frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
        frag_off = 0;

        copied = 0;

        while (copied < RDS_CONG_MAP_BYTES) {
                __le64 *src, *dst;
                unsigned int k;

                to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
                BUG_ON(to_copy & 7); /* must be 64-bit aligned */

                addr = kmap_atomic(sg_page(&frag->f_sg));

                src = addr + frag->f_sg.offset + frag_off;
                dst = (void *)map->m_page_addrs[map_page] + map_off;
                for (k = 0; k < to_copy; k += 8) {
                        /* Record ports that became uncongested, i.e. bits
                         * that changed from 1 to 0. */
                        uncongested |= ~(*src) & *dst;
                        *dst++ = *src++;
                }
                kunmap_atomic(addr);

                copied += to_copy;

                map_off += to_copy;
                if (map_off == PAGE_SIZE) {
                        map_off = 0;
                        map_page++;
                }

                frag_off += to_copy;
                if (frag_off == RDS_FRAG_SIZE) {
                        frag = list_entry(frag->f_item.next,
                                          struct rds_page_frag, f_item);
                        frag_off = 0;
                }
        }

        /* the congestion map is in little-endian order */
        rds_cong_map_updated(map, le64_to_cpu(uncongested));
}
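
/*
 * Process one completed receive: validate and checksum the header, harvest
 * ACK and credit information, and attach the data fragment to the message
 * being reassembled.  When the last fragment of a message arrives it is
 * handed up to the generic receive path (or to the congestion code for
 * bitmap updates).
 */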
static void rds_ib_process_recv(struct rds_connection *conn,
                                struct rds_ib_recv_work *recv, u32 data_len,
                                struct rds_ib_ack_state *state)
{
        struct rds_ib_connection *ic = conn->c_transport_data;
        struct rds_ib_incoming *ibinc = ic->i_ibinc;
        struct rds_header *ihdr, *hdr;
        dma_addr_t dma_addr = ic->i_recv_hdrs_dma[recv - ic->i_recvs];

        rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
                 data_len);

        if (data_len < sizeof(struct rds_header)) {
                rds_ib_conn_error(conn, "incoming message "
                       "from %pI6c didn't include a "
                       "header, disconnecting and "
                       "reconnecting\n",
                       &conn->c_faddr);
                return;
        }
        data_len -= sizeof(struct rds_header);

        ihdr = ic->i_recv_hdrs[recv - ic->i_recvs];

        ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev, dma_addr,
                                   sizeof(*ihdr), DMA_FROM_DEVICE);
        /* Validate the checksum. */
        if (!rds_message_verify_checksum(ihdr)) {
                rds_ib_conn_error(conn, "incoming message "
                       "from %pI6c has corrupted header - "
                       "forcing a reconnect\n",
                       &conn->c_faddr);
                rds_stats_inc(s_recv_drop_bad_checksum);
                goto done;
        }

        /* Process the ACK sequence which comes with every packet. */
        state->ack_recv = be64_to_cpu(ihdr->h_ack);
        state->ack_recv_valid = 1;

        /* Process the credits update if there was one. */
        if (ihdr->h_credit)
                rds_ib_send_add_credits(conn, ihdr->h_credit);

        if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
                /* This is an ACK-only packet.  The fragment carries no data,
                 * so give it straight back to the cache for reuse. */
                rds_ib_stats_inc(s_ib_ack_received);

                rds_ib_frag_free(ic, recv->r_frag);
                recv->r_frag = NULL;
                goto done;
        }

        /*
         * If the connection has no message under reassembly then this
         * fragment starts a new one: take the inc from the ring entry, copy
         * the header into it, and hang upcoming fragments off its list.
         */
        if (!ibinc) {
                ibinc = recv->r_ibinc;
                recv->r_ibinc = NULL;
                ic->i_ibinc = ibinc;

                hdr = &ibinc->ii_inc.i_hdr;
                ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = local_clock();
                memcpy(hdr, ihdr, sizeof(*hdr));
                ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
                ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_START] = local_clock();

                rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
                         ic->i_recv_data_rem, hdr->h_flags);
        } else {
                hdr = &ibinc->ii_inc.i_hdr;
                /* We can't just memcmp the headers: fragments of a single
                 * message may carry different ACK values. */
                if (hdr->h_sequence != ihdr->h_sequence ||
                    hdr->h_len != ihdr->h_len ||
                    hdr->h_sport != ihdr->h_sport ||
                    hdr->h_dport != ihdr->h_dport) {
                        rds_ib_conn_error(conn,
                                "fragment header mismatch; forcing reconnect\n");
                        goto done;
                }
        }

        list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags);
        recv->r_frag = NULL;

        if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
                ic->i_recv_data_rem -= RDS_FRAG_SIZE;
        else {
                ic->i_recv_data_rem = 0;
                ic->i_ibinc = NULL;

                if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) {
                        rds_ib_cong_recv(conn, ibinc);
                } else {
                        rds_recv_incoming(conn, &conn->c_faddr, &conn->c_laddr,
                                          &ibinc->ii_inc, GFP_ATOMIC);
                        state->ack_next = be64_to_cpu(hdr->h_sequence);
                        state->ack_next_valid = 1;
                }

                /* Evaluate the ACK_REQUIRED flag only after the complete
                 * message has been received. */
                if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
                        rds_stats_inc(s_recv_ack_required);
                        state->ack_required = 1;
                }

                rds_inc_put(&ibinc->ii_inc);
        }
done:
        ib_dma_sync_single_for_device(ic->rds_ibdev->dev, dma_addr,
                                      sizeof(*ihdr), DMA_FROM_DEVICE);
}
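
/*
 * Handle one receive completion from the CQ: unmap the fragment, process
 * the message on success, release the ring slot, and kick off another
 * refill if the ring is getting low.
 */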
void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
                             struct ib_wc *wc,
                             struct rds_ib_ack_state *state)
{
        struct rds_connection *conn = ic->conn;
        struct rds_ib_recv_work *recv;

        rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
                 (unsigned long long)wc->wr_id, wc->status,
                 ib_wc_status_msg(wc->status), wc->byte_len,
                 be32_to_cpu(wc->ex.imm_data));

        rds_ib_stats_inc(s_ib_rx_cq_event);
        recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
        ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1,
                        DMA_FROM_DEVICE);

        if (wc->status == IB_WC_SUCCESS) {
                rds_ib_process_recv(conn, recv, wc->byte_len, state);
        } else {
                /* Errors are expected while the qp is drained during
                 * shutdown; only complain if the connection is supposed to
                 * be up or still connecting. */
                if (rds_conn_up(conn) || rds_conn_connecting(conn))
                        rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n",
                                          &conn->c_laddr, &conn->c_faddr,
                                          conn->c_tos, wc->status,
                                          ib_wc_status_msg(wc->status),
                                          wc->vendor_err);
        }

        /* rds_ib_process_recv() doesn't always consume the frag, and we may
         * not have called it at all on error.  The pages are already
         * unmapped, and rds_ib_ring_free() below tells the refill path it
         * will not find an allocated frag here, so free any frag still
         * attached to this ring entry. */
        if (recv->r_frag) {
                rds_ib_frag_free(ic, recv->r_frag);
                recv->r_frag = NULL;
        }
        rds_ib_ring_free(&ic->i_recv_ring, 1);

        /* If we ever end up with a completely empty receive ring the sender
         * will see RNR timeouts, so refill eagerly. */
        if (rds_ib_ring_empty(&ic->i_recv_ring))
                rds_ib_stats_inc(s_ib_rx_ring_empty);

        if (rds_ib_ring_low(&ic->i_recv_ring)) {
                rds_ib_recv_refill(conn, 0, GFP_NOWAIT | __GFP_NOWARN);
                rds_ib_stats_inc(s_ib_rx_refill_from_cq);
        }
}

int rds_ib_recv_path(struct rds_conn_path *cp)
{
        struct rds_connection *conn = cp->cp_conn;
        struct rds_ib_connection *ic = conn->c_transport_data;

        rdsdebug("conn %p\n", conn);
        if (rds_conn_up(conn)) {
                rds_ib_attempt_ack(ic);
                rds_ib_recv_refill(conn, 0, GFP_KERNEL);
                rds_ib_stats_inc(s_ib_rx_refill_from_thread);
        }

        return 0;
}
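
/*
 * Module init for the receive path: size the global allocation limit from
 * total memory (roughly a third of RAM worth of fragments) and create the
 * slabs.  The incoming slab whitelists the usercopy region embedded in each
 * rds_incoming so it can be copied to user space.
 */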
int rds_ib_recv_init(void)
{
        struct sysinfo si;
        int ret = -ENOMEM;

        si_meminfo(&si);
        rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;

        rds_ib_incoming_slab =
                kmem_cache_create_usercopy("rds_ib_incoming",
                                           sizeof(struct rds_ib_incoming),
                                           0, SLAB_HWCACHE_ALIGN,
                                           offsetof(struct rds_ib_incoming,
                                                    ii_inc.i_usercopy),
                                           sizeof(struct rds_inc_usercopy),
                                           NULL);
        if (!rds_ib_incoming_slab)
                goto out;

        rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
                                             sizeof(struct rds_page_frag),
                                             0, SLAB_HWCACHE_ALIGN, NULL);
        if (!rds_ib_frag_slab) {
                kmem_cache_destroy(rds_ib_incoming_slab);
                rds_ib_incoming_slab = NULL;
        } else
                ret = 0;
out:
        return ret;
}

void rds_ib_recv_exit(void)
{
        WARN_ON(atomic_read(&rds_ib_allocation));

        kmem_cache_destroy(rds_ib_incoming_slab);
        kmem_cache_destroy(rds_ib_frag_slab);
}