0001 /*
0002  * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
0003  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
0004  *
0005  * This software is available to you under a choice of one of two
0006  * licenses.  You may choose to be licensed under the terms of the GNU
0007  * General Public License (GPL) Version 2, available from the file
0008  * COPYING in the main directory of this source tree, or the
0009  * OpenIB.org BSD license below:
0010  *
0011  *     Redistribution and use in source and binary forms, with or
0012  *     without modification, are permitted provided that the following
0013  *     conditions are met:
0014  *
0015  *      - Redistributions of source code must retain the above
0016  *        copyright notice, this list of conditions and the following
0017  *        disclaimer.
0018  *
0019  *      - Redistributions in binary form must reproduce the above
0020  *        copyright notice, this list of conditions and the following
0021  *        disclaimer in the documentation and/or other materials
0022  *        provided with the distribution.
0023  *
0024  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
0025  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
0026  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
0027  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
0028  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
0029  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
0030  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
0031  * SOFTWARE.
0032  */
0033 
0034 #include <linux/io.h>
0035 
0036 #include "qib.h"
0037 
0038 /* cut down ridiculously long IB macro names */
0039 #define OP(x) IB_OPCODE_RC_##x
0040 
0041 
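     /*
      * Recompute the SGE state for a request being resent from the
      * middle: the 24-bit PSN delta from the start of the WQE times the
      * path MTU gives the byte offset already covered by earlier packets.
      */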
0042 static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
0043                u32 psn, u32 pmtu)
0044 {
0045     u32 len;
0046 
0047     len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
0048     return rvt_restart_sge(ss, wqe, len);
0049 }
0050 
0051 /**
0052  * qib_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
0053  * @dev: the device for this QP
0054  * @qp: a pointer to the QP
0055  * @ohdr: a pointer to the IB header being constructed
0056  * @pmtu: the path MTU
0057  *
0058  * Return 1 if constructed; otherwise, return 0.
0059  * Note that we are in the responder's side of the QP context.
0060  * Note the QP s_lock must be held.
0061  */
0062 static int qib_make_rc_ack(struct qib_ibdev *dev, struct rvt_qp *qp,
0063                struct ib_other_headers *ohdr, u32 pmtu)
0064 {
0065     struct rvt_ack_entry *e;
0066     u32 hwords;
0067     u32 len;
0068     u32 bth0;
0069     u32 bth2;
0070 
0071     /* Don't send an ACK if we aren't supposed to. */
0072     if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
0073         goto bail;
0074 
0075     /* header size in 32-bit words LRH+BTH = (8+12)/4. */
0076     hwords = 5;
0077 
0078     switch (qp->s_ack_state) {
0079     case OP(RDMA_READ_RESPONSE_LAST):
0080     case OP(RDMA_READ_RESPONSE_ONLY):
0081         e = &qp->s_ack_queue[qp->s_tail_ack_queue];
0082         if (e->rdma_sge.mr) {
0083             rvt_put_mr(e->rdma_sge.mr);
0084             e->rdma_sge.mr = NULL;
0085         }
0086         fallthrough;
0087     case OP(ATOMIC_ACKNOWLEDGE):
0088         /*
0089          * We can increment the tail pointer now that the last
0090          * response has been sent instead of only being
0091          * constructed.
0092          */
0093         if (++qp->s_tail_ack_queue > QIB_MAX_RDMA_ATOMIC)
0094             qp->s_tail_ack_queue = 0;
0095         fallthrough;
0096     case OP(SEND_ONLY):
0097     case OP(ACKNOWLEDGE):
0098         /* Check for no next entry in the queue. */
0099         if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
0100             if (qp->s_flags & RVT_S_ACK_PENDING)
0101                 goto normal;
0102             goto bail;
0103         }
0104 
0105         e = &qp->s_ack_queue[qp->s_tail_ack_queue];
0106         if (e->opcode == OP(RDMA_READ_REQUEST)) {
0107             /*
0108              * If an RDMA read response is being resent and
0109              * we haven't seen the duplicate request yet,
0110              * then stop sending the remaining responses the
0111              * responder has seen until the requester resends it.
0112              */
0113             len = e->rdma_sge.sge_length;
0114             if (len && !e->rdma_sge.mr) {
0115                 qp->s_tail_ack_queue = qp->r_head_ack_queue;
0116                 goto bail;
0117             }
0118             /* Copy SGE state in case we need to resend */
0119             qp->s_rdma_mr = e->rdma_sge.mr;
0120             if (qp->s_rdma_mr)
0121                 rvt_get_mr(qp->s_rdma_mr);
0122             qp->s_ack_rdma_sge.sge = e->rdma_sge;
0123             qp->s_ack_rdma_sge.num_sge = 1;
0124             qp->s_cur_sge = &qp->s_ack_rdma_sge;
0125             if (len > pmtu) {
0126                 len = pmtu;
0127                 qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
0128             } else {
0129                 qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
0130                 e->sent = 1;
0131             }
0132             ohdr->u.aeth = rvt_compute_aeth(qp);
0133             hwords++;
0134             qp->s_ack_rdma_psn = e->psn;
0135             bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
0136         } else {
0137             /* COMPARE_SWAP or FETCH_ADD */
0138             qp->s_cur_sge = NULL;
0139             len = 0;
0140             qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
0141             ohdr->u.at.aeth = rvt_compute_aeth(qp);
0142             ib_u64_put(e->atomic_data, &ohdr->u.at.atomic_ack_eth);
0143             hwords += sizeof(ohdr->u.at) / sizeof(u32);
0144             bth2 = e->psn & QIB_PSN_MASK;
0145             e->sent = 1;
0146         }
0147         bth0 = qp->s_ack_state << 24;
0148         break;
0149 
0150     case OP(RDMA_READ_RESPONSE_FIRST):
0151         qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
0152         fallthrough;
0153     case OP(RDMA_READ_RESPONSE_MIDDLE):
0154         qp->s_cur_sge = &qp->s_ack_rdma_sge;
0155         qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr;
0156         if (qp->s_rdma_mr)
0157             rvt_get_mr(qp->s_rdma_mr);
0158         len = qp->s_ack_rdma_sge.sge.sge_length;
0159         if (len > pmtu)
0160             len = pmtu;
0161         else {
0162             ohdr->u.aeth = rvt_compute_aeth(qp);
0163             hwords++;
0164             qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
0165             e = &qp->s_ack_queue[qp->s_tail_ack_queue];
0166             e->sent = 1;
0167         }
0168         bth0 = qp->s_ack_state << 24;
0169         bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
0170         break;
0171 
0172     default:
0173 normal:
0174         /*
0175          * Send a regular ACK.
0176          * Set the s_ack_state so we wait until after sending
0177          * the ACK before setting s_ack_state to ACKNOWLEDGE
0178          * (see above).
0179          */
0180         qp->s_ack_state = OP(SEND_ONLY);
0181         qp->s_flags &= ~RVT_S_ACK_PENDING;
0182         qp->s_cur_sge = NULL;
0183         if (qp->s_nak_state)
0184             ohdr->u.aeth =
0185                 cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
0186                         (qp->s_nak_state <<
0187                          IB_AETH_CREDIT_SHIFT));
0188         else
0189             ohdr->u.aeth = rvt_compute_aeth(qp);
0190         hwords++;
0191         len = 0;
0192         bth0 = OP(ACKNOWLEDGE) << 24;
0193         bth2 = qp->s_ack_psn & QIB_PSN_MASK;
0194     }
0195     qp->s_rdma_ack_cnt++;
0196     qp->s_hdrwords = hwords;
0197     qp->s_cur_size = len;
0198     qib_make_ruc_header(qp, ohdr, bth0, bth2);
0199     return 1;
0200 
0201 bail:
0202     qp->s_ack_state = OP(ACKNOWLEDGE);
0203     qp->s_flags &= ~(RVT_S_RESP_PENDING | RVT_S_ACK_PENDING);
0204     return 0;
0205 }
0206 
0207 /**
0208  * qib_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
0209  * @qp: a pointer to the QP
0210  * @flags: unused
0211  *
0212  * Assumes the s_lock is held.
0213  *
0214  * Return 1 if constructed; otherwise, return 0.
0215  */
0216 int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags)
0217 {
0218     struct qib_qp_priv *priv = qp->priv;
0219     struct qib_ibdev *dev = to_idev(qp->ibqp.device);
0220     struct ib_other_headers *ohdr;
0221     struct rvt_sge_state *ss;
0222     struct rvt_swqe *wqe;
0223     u32 hwords;
0224     u32 len;
0225     u32 bth0;
0226     u32 bth2;
0227     u32 pmtu = qp->pmtu;
0228     char newreq;
0229     int ret = 0;
0230     int delta;
0231 
0232     ohdr = &priv->s_hdr->u.oth;
0233     if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)
0234         ohdr = &priv->s_hdr->u.l.oth;
0235 
0236     /* Sending responses has higher priority than sending requests. */
0237     if ((qp->s_flags & RVT_S_RESP_PENDING) &&
0238         qib_make_rc_ack(dev, qp, ohdr, pmtu))
0239         goto done;
0240 
0241     if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
0242         if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
0243             goto bail;
0244         /* We are in the error state, flush the work request. */
0245         if (qp->s_last == READ_ONCE(qp->s_head))
0246             goto bail;
0247         /* If DMAs are in progress, we can't flush immediately. */
0248         if (atomic_read(&priv->s_dma_busy)) {
0249             qp->s_flags |= RVT_S_WAIT_DMA;
0250             goto bail;
0251         }
0252         wqe = rvt_get_swqe_ptr(qp, qp->s_last);
0253         rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
0254             IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
0255         /* will get called again */
0256         goto done;
0257     }
0258 
0259     if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
0260         goto bail;
0261 
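         /*
          * If the next PSN to send falls within the range of packets
          * still being transmitted (i.e. this is a resend), wait for
          * those sends to finish; otherwise reset the sending window
          * to start at the current PSN.
          */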
0262     if (qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) {
0263         if (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
0264             qp->s_flags |= RVT_S_WAIT_PSN;
0265             goto bail;
0266         }
0267         qp->s_sending_psn = qp->s_psn;
0268         qp->s_sending_hpsn = qp->s_psn - 1;
0269     }
0270 
0271     /* header size in 32-bit words LRH+BTH = (8+12)/4. */
0272     hwords = 5;
0273     bth0 = 0;
0274 
0275     /* Send a request. */
0276     wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
0277     switch (qp->s_state) {
0278     default:
0279         if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
0280             goto bail;
0281         /*
0282          * Resend an old request or start a new one.
0283          *
0284          * We keep track of the current SWQE so that
0285          * we don't reset the "furthest progress" state
0286          * if we need to back up.
0287          */
0288         newreq = 0;
0289         if (qp->s_cur == qp->s_tail) {
0290             /* Check if send work queue is empty. */
0291             if (qp->s_tail == READ_ONCE(qp->s_head))
0292                 goto bail;
0293             /*
0294              * If a fence is requested, wait for previous
0295              * RDMA read and atomic operations to finish.
0296              */
0297             if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
0298                 qp->s_num_rd_atomic) {
0299                 qp->s_flags |= RVT_S_WAIT_FENCE;
0300                 goto bail;
0301             }
0302             newreq = 1;
0303             qp->s_psn = wqe->psn;
0304         }
0305         /*
0306          * Note that we have to be careful not to modify the
0307          * original work request since we may need to resend
0308          * it.
0309          */
0310         len = wqe->length;
0311         ss = &qp->s_sge;
0312         bth2 = qp->s_psn & QIB_PSN_MASK;
0313         switch (wqe->wr.opcode) {
0314         case IB_WR_SEND:
0315         case IB_WR_SEND_WITH_IMM:
0316             /* If no credit, return. */
0317             if (!rvt_rc_credit_avail(qp, wqe))
0318                 goto bail;
0319             if (len > pmtu) {
0320                 qp->s_state = OP(SEND_FIRST);
0321                 len = pmtu;
0322                 break;
0323             }
0324             if (wqe->wr.opcode == IB_WR_SEND)
0325                 qp->s_state = OP(SEND_ONLY);
0326             else {
0327                 qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
0328                 /* Immediate data comes after the BTH */
0329                 ohdr->u.imm_data = wqe->wr.ex.imm_data;
0330                 hwords += 1;
0331             }
0332             if (wqe->wr.send_flags & IB_SEND_SOLICITED)
0333                 bth0 |= IB_BTH_SOLICITED;
0334             bth2 |= IB_BTH_REQ_ACK;
0335             if (++qp->s_cur == qp->s_size)
0336                 qp->s_cur = 0;
0337             break;
0338 
0339         case IB_WR_RDMA_WRITE:
0340             if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
0341                 qp->s_lsn++;
0342             goto no_flow_control;
0343         case IB_WR_RDMA_WRITE_WITH_IMM:
0344             /* If no credit, return. */
0345             if (!rvt_rc_credit_avail(qp, wqe))
0346                 goto bail;
0347 no_flow_control:
0348             ohdr->u.rc.reth.vaddr =
0349                 cpu_to_be64(wqe->rdma_wr.remote_addr);
0350             ohdr->u.rc.reth.rkey =
0351                 cpu_to_be32(wqe->rdma_wr.rkey);
0352             ohdr->u.rc.reth.length = cpu_to_be32(len);
0353             hwords += sizeof(struct ib_reth) / sizeof(u32);
0354             if (len > pmtu) {
0355                 qp->s_state = OP(RDMA_WRITE_FIRST);
0356                 len = pmtu;
0357                 break;
0358             }
0359             if (wqe->rdma_wr.wr.opcode == IB_WR_RDMA_WRITE)
0360                 qp->s_state = OP(RDMA_WRITE_ONLY);
0361             else {
0362                 qp->s_state = OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
0363                 /* Immediate data comes after RETH */
0364                 ohdr->u.rc.imm_data =
0365                     wqe->rdma_wr.wr.ex.imm_data;
0366                 hwords += 1;
0367                 if (wqe->rdma_wr.wr.send_flags & IB_SEND_SOLICITED)
0368                     bth0 |= IB_BTH_SOLICITED;
0369             }
0370             bth2 |= IB_BTH_REQ_ACK;
0371             if (++qp->s_cur == qp->s_size)
0372                 qp->s_cur = 0;
0373             break;
0374 
0375         case IB_WR_RDMA_READ:
0376             /*
0377              * Don't allow more operations to be started
0378              * than the QP limits allow.
0379              */
0380             if (newreq) {
0381                 if (qp->s_num_rd_atomic >=
0382                     qp->s_max_rd_atomic) {
0383                     qp->s_flags |= RVT_S_WAIT_RDMAR;
0384                     goto bail;
0385                 }
0386                 qp->s_num_rd_atomic++;
0387                 if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
0388                     qp->s_lsn++;
0389             }
0390 
0391             ohdr->u.rc.reth.vaddr =
0392                 cpu_to_be64(wqe->rdma_wr.remote_addr);
0393             ohdr->u.rc.reth.rkey =
0394                 cpu_to_be32(wqe->rdma_wr.rkey);
0395             ohdr->u.rc.reth.length = cpu_to_be32(len);
0396             qp->s_state = OP(RDMA_READ_REQUEST);
0397             hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
0398             ss = NULL;
0399             len = 0;
0400             bth2 |= IB_BTH_REQ_ACK;
0401             if (++qp->s_cur == qp->s_size)
0402                 qp->s_cur = 0;
0403             break;
0404 
0405         case IB_WR_ATOMIC_CMP_AND_SWP:
0406         case IB_WR_ATOMIC_FETCH_AND_ADD:
0407             /*
0408              * Don't allow more operations to be started
0409              * than the QP limits allow.
0410              */
0411             if (newreq) {
0412                 if (qp->s_num_rd_atomic >=
0413                     qp->s_max_rd_atomic) {
0414                     qp->s_flags |= RVT_S_WAIT_RDMAR;
0415                     goto bail;
0416                 }
0417                 qp->s_num_rd_atomic++;
0418                 if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
0419                     qp->s_lsn++;
0420             }
0421             if (wqe->atomic_wr.wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
0422                 qp->s_state = OP(COMPARE_SWAP);
0423                 put_ib_ateth_swap(wqe->atomic_wr.swap,
0424                           &ohdr->u.atomic_eth);
0425                 put_ib_ateth_compare(wqe->atomic_wr.compare_add,
0426                              &ohdr->u.atomic_eth);
0427             } else {
0428                 qp->s_state = OP(FETCH_ADD);
0429                 put_ib_ateth_swap(wqe->atomic_wr.compare_add,
0430                           &ohdr->u.atomic_eth);
0431                 put_ib_ateth_compare(0, &ohdr->u.atomic_eth);
0432             }
0433             put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr,
0434                        &ohdr->u.atomic_eth);
0435             ohdr->u.atomic_eth.rkey = cpu_to_be32(
0436                 wqe->atomic_wr.rkey);
0437             hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
0438             ss = NULL;
0439             len = 0;
0440             bth2 |= IB_BTH_REQ_ACK;
0441             if (++qp->s_cur == qp->s_size)
0442                 qp->s_cur = 0;
0443             break;
0444 
0445         default:
0446             goto bail;
0447         }
0448         qp->s_sge.sge = wqe->sg_list[0];
0449         qp->s_sge.sg_list = wqe->sg_list + 1;
0450         qp->s_sge.num_sge = wqe->wr.num_sge;
0451         qp->s_sge.total_len = wqe->length;
0452         qp->s_len = wqe->length;
0453         if (newreq) {
0454             qp->s_tail++;
0455             if (qp->s_tail >= qp->s_size)
0456                 qp->s_tail = 0;
0457         }
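             /*
              * An RDMA read request is a single packet, but the read
              * responses consume PSNs through wqe->lpsn, so skip past
              * the whole range; other requests advance by one PSN.
              */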
0458         if (wqe->wr.opcode == IB_WR_RDMA_READ)
0459             qp->s_psn = wqe->lpsn + 1;
0460         else
0461             qp->s_psn++;
0462         break;
0463 
0464     case OP(RDMA_READ_RESPONSE_FIRST):
0465         /*
0466          * qp->s_state is normally set to the opcode of the
0467          * last packet constructed for new requests and therefore
0468          * is never set to RDMA read response.
0469          * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
0470          * thread to indicate a SEND needs to be restarted from an
0471          * earlier PSN without interfering with the sending thread.
0472          * See qib_restart_rc().
0473          */
0474         qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
0475         fallthrough;
0476     case OP(SEND_FIRST):
0477         qp->s_state = OP(SEND_MIDDLE);
0478         fallthrough;
0479     case OP(SEND_MIDDLE):
0480         bth2 = qp->s_psn++ & QIB_PSN_MASK;
0481         ss = &qp->s_sge;
0482         len = qp->s_len;
0483         if (len > pmtu) {
0484             len = pmtu;
0485             break;
0486         }
0487         if (wqe->wr.opcode == IB_WR_SEND)
0488             qp->s_state = OP(SEND_LAST);
0489         else {
0490             qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
0491             /* Immediate data comes after the BTH */
0492             ohdr->u.imm_data = wqe->wr.ex.imm_data;
0493             hwords += 1;
0494         }
0495         if (wqe->wr.send_flags & IB_SEND_SOLICITED)
0496             bth0 |= IB_BTH_SOLICITED;
0497         bth2 |= IB_BTH_REQ_ACK;
0498         qp->s_cur++;
0499         if (qp->s_cur >= qp->s_size)
0500             qp->s_cur = 0;
0501         break;
0502 
0503     case OP(RDMA_READ_RESPONSE_LAST):
0504         /*
0505          * qp->s_state is normally set to the opcode of the
0506          * last packet constructed for new requests and therefore
0507          * is never set to RDMA read response.
0508          * RDMA_READ_RESPONSE_LAST is used by the ACK processing
0509          * thread to indicate an RDMA write needs to be restarted from
0510          * an earlier PSN without interfering with the sending thread.
0511          * See qib_restart_rc().
0512          */
0513         qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
0514         fallthrough;
0515     case OP(RDMA_WRITE_FIRST):
0516         qp->s_state = OP(RDMA_WRITE_MIDDLE);
0517         fallthrough;
0518     case OP(RDMA_WRITE_MIDDLE):
0519         bth2 = qp->s_psn++ & QIB_PSN_MASK;
0520         ss = &qp->s_sge;
0521         len = qp->s_len;
0522         if (len > pmtu) {
0523             len = pmtu;
0524             break;
0525         }
0526         if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
0527             qp->s_state = OP(RDMA_WRITE_LAST);
0528         else {
0529             qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
0530             /* Immediate data comes after the BTH */
0531             ohdr->u.imm_data = wqe->wr.ex.imm_data;
0532             hwords += 1;
0533             if (wqe->wr.send_flags & IB_SEND_SOLICITED)
0534                 bth0 |= IB_BTH_SOLICITED;
0535         }
0536         bth2 |= IB_BTH_REQ_ACK;
0537         qp->s_cur++;
0538         if (qp->s_cur >= qp->s_size)
0539             qp->s_cur = 0;
0540         break;
0541 
0542     case OP(RDMA_READ_RESPONSE_MIDDLE):
0543         /*
0544          * qp->s_state is normally set to the opcode of the
0545          * last packet constructed for new requests and therefore
0546          * is never set to RDMA read response.
0547          * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
0548          * thread to indicate an RDMA read needs to be restarted from
0549          * an earlier PSN without interfering with the sending thread.
0550          * See qib_restart_rc().
0551          */
0552         len = ((qp->s_psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
0553         ohdr->u.rc.reth.vaddr =
0554             cpu_to_be64(wqe->rdma_wr.remote_addr + len);
0555         ohdr->u.rc.reth.rkey =
0556             cpu_to_be32(wqe->rdma_wr.rkey);
0557         ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
0558         qp->s_state = OP(RDMA_READ_REQUEST);
0559         hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
0560         bth2 = (qp->s_psn & QIB_PSN_MASK) | IB_BTH_REQ_ACK;
0561         qp->s_psn = wqe->lpsn + 1;
0562         ss = NULL;
0563         len = 0;
0564         qp->s_cur++;
0565         if (qp->s_cur == qp->s_size)
0566             qp->s_cur = 0;
0567         break;
0568     }
0569     qp->s_sending_hpsn = bth2;
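         /*
          * delta is the PSN offset from the start of the WQE,
          * sign-extended from 24 bits; ask for an ACK every
          * QIB_PSN_CREDIT packets within a long request.
          */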
0570     delta = (((int) bth2 - (int) wqe->psn) << 8) >> 8;
0571     if (delta && delta % QIB_PSN_CREDIT == 0)
0572         bth2 |= IB_BTH_REQ_ACK;
0573     if (qp->s_flags & RVT_S_SEND_ONE) {
0574         qp->s_flags &= ~RVT_S_SEND_ONE;
0575         qp->s_flags |= RVT_S_WAIT_ACK;
0576         bth2 |= IB_BTH_REQ_ACK;
0577     }
0578     qp->s_len -= len;
0579     qp->s_hdrwords = hwords;
0580     qp->s_cur_sge = ss;
0581     qp->s_cur_size = len;
0582     qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), bth2);
0583 done:
0584     return 1;
0585 bail:
0586     qp->s_flags &= ~RVT_S_BUSY;
0587     return ret;
0588 }
0589 
0590 /**
0591  * qib_send_rc_ack - Construct an ACK packet and send it
0592  * @qp: a pointer to the QP
0593  *
0594  * This is called from qib_rc_rcv() and qib_kreceive().
0595  * Note that RDMA reads and atomics are handled in the
0596  * send side QP state and tasklet.
0597  */
0598 void qib_send_rc_ack(struct rvt_qp *qp)
0599 {
0600     struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device);
0601     struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
0602     struct qib_pportdata *ppd = ppd_from_ibp(ibp);
0603     u64 pbc;
0604     u16 lrh0;
0605     u32 bth0;
0606     u32 hwords;
0607     u32 pbufn;
0608     u32 __iomem *piobuf;
0609     struct ib_header hdr;
0610     struct ib_other_headers *ohdr;
0611     u32 control;
0612     unsigned long flags;
0613 
0614     spin_lock_irqsave(&qp->s_lock, flags);
0615 
0616     if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
0617         goto unlock;
0618 
0619     /* Don't send ACK or NAK if an RDMA read or atomic is pending. */
0620     if ((qp->s_flags & RVT_S_RESP_PENDING) || qp->s_rdma_ack_cnt)
0621         goto queue_ack;
0622 
0623     /* Construct the header with s_lock held so APM doesn't change it. */
0624     ohdr = &hdr.u.oth;
0625     lrh0 = QIB_LRH_BTH;
0626     /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
0627     hwords = 6;
0628     if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) &
0629              IB_AH_GRH)) {
0630         hwords += qib_make_grh(ibp, &hdr.u.l.grh,
0631                        rdma_ah_read_grh(&qp->remote_ah_attr),
0632                        hwords, 0);
0633         ohdr = &hdr.u.l.oth;
0634         lrh0 = QIB_LRH_GRH;
0635     }
0636     /* read pkey_index w/o lock (it's atomic) */
0637     bth0 = qib_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
0638     if (qp->s_mig_state == IB_MIG_MIGRATED)
0639         bth0 |= IB_BTH_MIG_REQ;
0640     if (qp->r_nak_state)
0641         ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
0642                         (qp->r_nak_state <<
0643                          IB_AETH_CREDIT_SHIFT));
0644     else
0645         ohdr->u.aeth = rvt_compute_aeth(qp);
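         /* LRH dword 0: VL in bits 15:12, SL in bits 7:4; LNH was set above. */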
0646     lrh0 |= ibp->sl_to_vl[rdma_ah_get_sl(&qp->remote_ah_attr)] << 12 |
0647         rdma_ah_get_sl(&qp->remote_ah_attr) << 4;
0648     hdr.lrh[0] = cpu_to_be16(lrh0);
0649     hdr.lrh[1] = cpu_to_be16(rdma_ah_get_dlid(&qp->remote_ah_attr));
0650     hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
0651     hdr.lrh[3] = cpu_to_be16(ppd->lid |
0652                  rdma_ah_get_path_bits(&qp->remote_ah_attr));
0653     ohdr->bth[0] = cpu_to_be32(bth0);
0654     ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
0655     ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & QIB_PSN_MASK);
0656 
0657     spin_unlock_irqrestore(&qp->s_lock, flags);
0658 
0659     /* Don't try to send ACKs if the link isn't ACTIVE */
0660     if (!(ppd->lflags & QIBL_LINKACTIVE))
0661         goto done;
0662 
0663     control = dd->f_setpbc_control(ppd, hwords + SIZE_OF_CRC,
0664                        qp->s_srate, lrh0 >> 12);
0665     /* length is + 1 for the control dword */
0666     pbc = ((u64) control << 32) | (hwords + 1);
0667 
0668     piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn);
0669     if (!piobuf) {
0670         /*
0671          * We are out of PIO buffers at the moment.
0672          * Pass responsibility for sending the ACK to the
0673          * send tasklet so that when a PIO buffer becomes
0674          * available, the ACK is sent ahead of other outgoing
0675          * packets.
0676          */
0677         spin_lock_irqsave(&qp->s_lock, flags);
0678         goto queue_ack;
0679     }
0680 
0681     /*
0682      * Write the pbc.
0683      * We have to flush after the PBC for correctness on some CPUs,
0684      * or the WC buffer can be written out of order.
0685      */
0686     writeq(pbc, piobuf);
0687 
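         /*
          * With write-combining flushes required, copy all but the last
          * header dword, flush, then write the final dword so the
          * earlier dwords reach the chip first.
          */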
0688     if (dd->flags & QIB_PIO_FLUSH_WC) {
0689         u32 *hdrp = (u32 *) &hdr;
0690 
0691         qib_flush_wc();
0692         qib_pio_copy(piobuf + 2, hdrp, hwords - 1);
0693         qib_flush_wc();
0694         __raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
0695     } else
0696         qib_pio_copy(piobuf + 2, (u32 *) &hdr, hwords);
0697 
0698     if (dd->flags & QIB_USE_SPCL_TRIG) {
0699         u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023;
0700 
0701         qib_flush_wc();
0702         __raw_writel(0xaebecede, piobuf + spcl_off);
0703     }
0704 
0705     qib_flush_wc();
0706     qib_sendbuf_done(dd, pbufn);
0707 
0708     this_cpu_inc(ibp->pmastats->n_unicast_xmit);
0709     goto done;
0710 
0711 queue_ack:
0712     if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
0713         this_cpu_inc(*ibp->rvp.rc_qacks);
0714         qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING;
0715         qp->s_nak_state = qp->r_nak_state;
0716         qp->s_ack_psn = qp->r_ack_psn;
0717 
0718         /* Schedule the send tasklet. */
0719         qib_schedule_send(qp);
0720     }
0721 unlock:
0722     spin_unlock_irqrestore(&qp->s_lock, flags);
0723 done:
0724     return;
0725 }
0726 
0727 /**
0728  * reset_psn - reset the QP state to send starting from PSN
0729  * @qp: the QP
0730  * @psn: the packet sequence number to restart at
0731  *
0732  * This is called from qib_rc_rcv() to process an incoming RC ACK
0733  * for the given QP.
0734  * Called at interrupt level with the QP s_lock held.
0735  */
0736 static void reset_psn(struct rvt_qp *qp, u32 psn)
0737 {
0738     u32 n = qp->s_acked;
0739     struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
0740     u32 opcode;
0741 
0742     qp->s_cur = n;
0743 
0744     /*
0745      * If we are starting the request from the beginning,
0746      * let the normal send code handle initialization.
0747      */
0748     if (qib_cmp24(psn, wqe->psn) <= 0) {
0749         qp->s_state = OP(SEND_LAST);
0750         goto done;
0751     }
0752 
0753     /* Find the work request opcode corresponding to the given PSN. */
0754     opcode = wqe->wr.opcode;
0755     for (;;) {
0756         int diff;
0757 
0758         if (++n == qp->s_size)
0759             n = 0;
0760         if (n == qp->s_tail)
0761             break;
0762         wqe = rvt_get_swqe_ptr(qp, n);
0763         diff = qib_cmp24(psn, wqe->psn);
0764         if (diff < 0)
0765             break;
0766         qp->s_cur = n;
0767         /*
0768          * If we are starting the request from the beginning,
0769          * let the normal send code handle initialization.
0770          */
0771         if (diff == 0) {
0772             qp->s_state = OP(SEND_LAST);
0773             goto done;
0774         }
0775         opcode = wqe->wr.opcode;
0776     }
0777 
0778     /*
0779      * Set the state to restart in the middle of a request.
0780      * Don't change the s_sge, s_cur_sge, or s_cur_size.
0781      * See qib_make_rc_req().
0782      */
0783     switch (opcode) {
0784     case IB_WR_SEND:
0785     case IB_WR_SEND_WITH_IMM:
0786         qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
0787         break;
0788 
0789     case IB_WR_RDMA_WRITE:
0790     case IB_WR_RDMA_WRITE_WITH_IMM:
0791         qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
0792         break;
0793 
0794     case IB_WR_RDMA_READ:
0795         qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
0796         break;
0797 
0798     default:
0799         /*
0800          * This case shouldn't happen since there is only
0801          * one PSN per request.
0802          */
0803         qp->s_state = OP(SEND_LAST);
0804     }
0805 done:
0806     qp->s_psn = psn;
0807     /*
0808      * Set RVT_S_WAIT_PSN as qib_rc_complete() may start the timer
0809      * asynchronously before the send tasklet can get scheduled.
0810      * Doing it in qib_make_rc_req() is too late.
0811      */
0812     if ((qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
0813         (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
0814         qp->s_flags |= RVT_S_WAIT_PSN;
0815 }
0816 
0817 /*
0818  * Back up requester to resend the last un-ACKed request.
0819  * The QP r_lock and s_lock should be held and interrupts disabled.
0820  */
0821 void qib_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
0822 {
0823     struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
0824     struct qib_ibport *ibp;
0825 
0826     if (qp->s_retry == 0) {
0827         if (qp->s_mig_state == IB_MIG_ARMED) {
0828             qib_migrate_qp(qp);
0829             qp->s_retry = qp->s_retry_cnt;
0830         } else if (qp->s_last == qp->s_acked) {
0831             rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
0832             rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
0833             return;
0834         } else /* XXX need to handle delayed completion */
0835             return;
0836     } else
0837         qp->s_retry--;
0838 
0839     ibp = to_iport(qp->ibqp.device, qp->port_num);
0840     if (wqe->wr.opcode == IB_WR_RDMA_READ)
0841         ibp->rvp.n_rc_resends++;
0842     else
0843         ibp->rvp.n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;
0844 
0845     qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
0846              RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
0847              RVT_S_WAIT_ACK);
0848     if (wait)
0849         qp->s_flags |= RVT_S_SEND_ONE;
0850     reset_psn(qp, psn);
0851 }
0852 
0853 /*
0854  * Set qp->s_sending_psn to the next PSN after the given one.
0855  * This would be psn+1 except when RDMA reads are present.
0856  */
0857 static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
0858 {
0859     struct rvt_swqe *wqe;
0860     u32 n = qp->s_last;
0861 
0862     /* Find the work request corresponding to the given PSN. */
0863     for (;;) {
0864         wqe = rvt_get_swqe_ptr(qp, n);
0865         if (qib_cmp24(psn, wqe->lpsn) <= 0) {
0866             if (wqe->wr.opcode == IB_WR_RDMA_READ)
0867                 qp->s_sending_psn = wqe->lpsn + 1;
0868             else
0869                 qp->s_sending_psn = psn + 1;
0870             break;
0871         }
0872         if (++n == qp->s_size)
0873             n = 0;
0874         if (n == qp->s_tail)
0875             break;
0876     }
0877 }
0878 
0879 /*
0880  * This should be called with the QP s_lock held and interrupts disabled.
0881  */
0882 void qib_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr)
0883 {
0884     struct ib_other_headers *ohdr;
0885     struct rvt_swqe *wqe;
0886     u32 opcode;
0887     u32 psn;
0888 
0889     if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
0890         return;
0891 
0892     /* Find out where the BTH is */
0893     if ((be16_to_cpu(hdr->lrh[0]) & 3) == QIB_LRH_BTH)
0894         ohdr = &hdr->u.oth;
0895     else
0896         ohdr = &hdr->u.l.oth;
0897 
0898     opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
0899     if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
0900         opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
0901         WARN_ON(!qp->s_rdma_ack_cnt);
0902         qp->s_rdma_ack_cnt--;
0903         return;
0904     }
0905 
0906     psn = be32_to_cpu(ohdr->bth[2]);
0907     reset_sending_psn(qp, psn);
0908 
0909     /*
0910      * Start timer after a packet requesting an ACK has been sent and
0911      * there are still requests that haven't been acked.
0912      */
0913     if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
0914         !(qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
0915         (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
0916         rvt_add_retry_timer(qp);
0917 
0918     while (qp->s_last != qp->s_acked) {
0919         wqe = rvt_get_swqe_ptr(qp, qp->s_last);
0920         if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 &&
0921             qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
0922             break;
0923         rvt_qp_complete_swqe(qp,
0924                      wqe,
0925                      ib_qib_wc_opcode[wqe->wr.opcode],
0926                      IB_WC_SUCCESS);
0927     }
0928     /*
0929      * If we were waiting for sends to complete before resending,
0930      * and they are now complete, restart sending.
0931      */
0932     if (qp->s_flags & RVT_S_WAIT_PSN &&
0933         qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
0934         qp->s_flags &= ~RVT_S_WAIT_PSN;
0935         qp->s_sending_psn = qp->s_psn;
0936         qp->s_sending_hpsn = qp->s_psn - 1;
0937         qib_schedule_send(qp);
0938     }
0939 }
0940 
0941 static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
0942 {
0943     qp->s_last_psn = psn;
0944 }
0945 
0946 /*
0947  * Generate a SWQE completion.
0948  * This is similar to qib_send_complete but has to check to be sure
0949  * that the SGEs are not being referenced if the SWQE is being resent.
0950  */
0951 static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
0952                      struct rvt_swqe *wqe,
0953                      struct qib_ibport *ibp)
0954 {
0955     /*
0956      * Don't decrement refcount and don't generate a
0957      * completion if the SWQE is being resent until the send
0958      * is finished.
0959      */
0960     if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 ||
0961         qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0)
0962         rvt_qp_complete_swqe(qp,
0963                      wqe,
0964                      ib_qib_wc_opcode[wqe->wr.opcode],
0965                      IB_WC_SUCCESS);
0966     else
0967         this_cpu_inc(*ibp->rvp.rc_delayed_comp);
0968 
0969     qp->s_retry = qp->s_retry_cnt;
0970     update_last_psn(qp, wqe->lpsn);
0971 
0972     /*
0973      * If we are completing a request which is in the process of
0974      * being resent, we can stop resending it since we know the
0975      * responder has already seen it.
0976      */
0977     if (qp->s_acked == qp->s_cur) {
0978         if (++qp->s_cur >= qp->s_size)
0979             qp->s_cur = 0;
0980         qp->s_acked = qp->s_cur;
0981         wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
0982         if (qp->s_acked != qp->s_tail) {
0983             qp->s_state = OP(SEND_LAST);
0984             qp->s_psn = wqe->psn;
0985         }
0986     } else {
0987         if (++qp->s_acked >= qp->s_size)
0988             qp->s_acked = 0;
0989         if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
0990             qp->s_draining = 0;
0991         wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
0992     }
0993     return wqe;
0994 }
0995 
0996 /*
0997  * do_rc_ack - process an incoming RC ACK
0998  * @qp: the QP the ACK came in on
0999  * @psn: the packet sequence number of the ACK
1000  * @opcode: the opcode of the request that resulted in the ACK
1001  *
1002  * This is called from qib_rc_rcv_resp() to process an incoming RC ACK
1003  * for the given QP.
1004  * Called at interrupt level with the QP s_lock held.
1005  * Returns 1 if OK, 0 if current operation should be aborted (NAK).
1006  */
1007 static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
1008              u64 val, struct qib_ctxtdata *rcd)
1009 {
1010     struct qib_ibport *ibp;
1011     enum ib_wc_status status;
1012     struct rvt_swqe *wqe;
1013     int ret = 0;
1014     u32 ack_psn;
1015     int diff;
1016 
1017     /*
1018      * Note that NAKs implicitly ACK outstanding SEND and RDMA write
1019      * requests and implicitly NAK RDMA read and atomic requests issued
1020      * before the NAK'ed request.  The MSN won't include the NAK'ed
1021      * request but will include an ACK'ed request(s).
1022      */
1023     ack_psn = psn;
1024     if (aeth >> IB_AETH_NAK_SHIFT)
1025         ack_psn--;
1026     wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1027     ibp = to_iport(qp->ibqp.device, qp->port_num);
1028 
1029     /*
1030      * The MSN might be for a later WQE than the PSN indicates so
1031      * only complete WQEs that the PSN finishes.
1032      */
1033     while ((diff = qib_cmp24(ack_psn, wqe->lpsn)) >= 0) {
1034         /*
1035          * RDMA_READ_RESPONSE_ONLY is a special case since
1036          * we want to generate completion events for everything
1037          * before the RDMA read, copy the data, then generate
1038          * the completion for the read.
1039          */
1040         if (wqe->wr.opcode == IB_WR_RDMA_READ &&
1041             opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
1042             diff == 0) {
1043             ret = 1;
1044             goto bail;
1045         }
1046         /*
1047          * If this request is a RDMA read or atomic, and the ACK is
1048          * for a later operation, this ACK NAKs the RDMA read or
1049          * atomic.  In other words, only a RDMA_READ_LAST or ONLY
1050          * can ACK a RDMA read and likewise for atomic ops.  Note
1051          * that the NAK case can only happen if relaxed ordering is
1052          * used and requests are sent after an RDMA read or atomic
1053          * is sent but before the response is received.
1054          */
1055         if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
1056              (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
1057             ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1058               wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
1059              (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
1060             /* Retry this request. */
1061             if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
1062                 qp->r_flags |= RVT_R_RDMAR_SEQ;
1063                 qib_restart_rc(qp, qp->s_last_psn + 1, 0);
1064                 if (list_empty(&qp->rspwait)) {
1065                     qp->r_flags |= RVT_R_RSP_SEND;
1066                     rvt_get_qp(qp);
1067                     list_add_tail(&qp->rspwait,
1068                               &rcd->qp_wait_list);
1069                 }
1070             }
1071             /*
1072              * No need to process the ACK/NAK since we are
1073              * restarting an earlier request.
1074              */
1075             goto bail;
1076         }
1077         if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1078             wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
1079             u64 *vaddr = wqe->sg_list[0].vaddr;
1080             *vaddr = val;
1081         }
1082         if (qp->s_num_rd_atomic &&
1083             (wqe->wr.opcode == IB_WR_RDMA_READ ||
1084              wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1085              wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
1086             qp->s_num_rd_atomic--;
1087             /* Restart sending task if fence is complete */
1088             if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
1089                 !qp->s_num_rd_atomic) {
1090                 qp->s_flags &= ~(RVT_S_WAIT_FENCE |
1091                          RVT_S_WAIT_ACK);
1092                 qib_schedule_send(qp);
1093             } else if (qp->s_flags & RVT_S_WAIT_RDMAR) {
1094                 qp->s_flags &= ~(RVT_S_WAIT_RDMAR |
1095                          RVT_S_WAIT_ACK);
1096                 qib_schedule_send(qp);
1097             }
1098         }
1099         wqe = do_rc_completion(qp, wqe, ibp);
1100         if (qp->s_acked == qp->s_tail)
1101             break;
1102     }
1103 
1104     switch (aeth >> IB_AETH_NAK_SHIFT) {
1105     case 0:         /* ACK */
1106         this_cpu_inc(*ibp->rvp.rc_acks);
1107         if (qp->s_acked != qp->s_tail) {
1108             /*
1109              * We are expecting more ACKs so
1110              * reset the retransmit timer.
1111              */
1112             rvt_mod_retry_timer(qp);
1113             /*
1114              * We can stop resending the earlier packets and
1115              * continue with the next packet the receiver wants.
1116              */
1117             if (qib_cmp24(qp->s_psn, psn) <= 0)
1118                 reset_psn(qp, psn + 1);
1119         } else {
1120             /* No more acks - kill all timers */
1121             rvt_stop_rc_timers(qp);
1122             if (qib_cmp24(qp->s_psn, psn) <= 0) {
1123                 qp->s_state = OP(SEND_LAST);
1124                 qp->s_psn = psn + 1;
1125             }
1126         }
1127         if (qp->s_flags & RVT_S_WAIT_ACK) {
1128             qp->s_flags &= ~RVT_S_WAIT_ACK;
1129             qib_schedule_send(qp);
1130         }
1131         rvt_get_credit(qp, aeth);
1132         qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1133         qp->s_retry = qp->s_retry_cnt;
1134         update_last_psn(qp, psn);
1135         return 1;
1136 
1137     case 1:         /* RNR NAK */
1138         ibp->rvp.n_rnr_naks++;
1139         if (qp->s_acked == qp->s_tail)
1140             goto bail;
1141         if (qp->s_flags & RVT_S_WAIT_RNR)
1142             goto bail;
1143         if (qp->s_rnr_retry == 0) {
1144             status = IB_WC_RNR_RETRY_EXC_ERR;
1145             goto class_b;
1146         }
1147         if (qp->s_rnr_retry_cnt < 7)
1148             qp->s_rnr_retry--;
1149 
1150         /* The last valid PSN is the previous PSN. */
1151         update_last_psn(qp, psn - 1);
1152 
1153         ibp->rvp.n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;
1154 
1155         reset_psn(qp, psn);
1156 
1157         qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
1158         rvt_stop_rc_timers(qp);
1159         rvt_add_rnr_timer(qp, aeth);
1160         return 0;
1161 
1162     case 3:         /* NAK */
1163         if (qp->s_acked == qp->s_tail)
1164             goto bail;
1165         /* The last valid PSN is the previous PSN. */
1166         update_last_psn(qp, psn - 1);
1167         switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
1168             IB_AETH_CREDIT_MASK) {
1169         case 0: /* PSN sequence error */
1170             ibp->rvp.n_seq_naks++;
1171             /*
1172              * Back up to the responder's expected PSN.
1173              * Note that we might get a NAK in the middle of an
1174              * RDMA READ response which terminates the RDMA
1175              * READ.
1176              */
1177             qib_restart_rc(qp, psn, 0);
1178             qib_schedule_send(qp);
1179             break;
1180 
1181         case 1: /* Invalid Request */
1182             status = IB_WC_REM_INV_REQ_ERR;
1183             ibp->rvp.n_other_naks++;
1184             goto class_b;
1185 
1186         case 2: /* Remote Access Error */
1187             status = IB_WC_REM_ACCESS_ERR;
1188             ibp->rvp.n_other_naks++;
1189             goto class_b;
1190 
1191         case 3: /* Remote Operation Error */
1192             status = IB_WC_REM_OP_ERR;
1193             ibp->rvp.n_other_naks++;
1194 class_b:
1195             if (qp->s_last == qp->s_acked) {
1196                 rvt_send_complete(qp, wqe, status);
1197                 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1198             }
1199             break;
1200 
1201         default:
1202             /* Ignore other reserved NAK error codes */
1203             goto reserved;
1204         }
1205         qp->s_retry = qp->s_retry_cnt;
1206         qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1207         goto bail;
1208 
1209     default:                /* 2: reserved */
1210 reserved:
1211         /* Ignore reserved NAK codes. */
1212         goto bail;
1213     }
1214 
1215 bail:
1216     rvt_stop_rc_timers(qp);
1217     return ret;
1218 }
1219 
1220 /*
1221  * We have seen an out of sequence RDMA read middle or last packet.
1222  * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
1223  */
1224 static void rdma_seq_err(struct rvt_qp *qp, struct qib_ibport *ibp, u32 psn,
1225              struct qib_ctxtdata *rcd)
1226 {
1227     struct rvt_swqe *wqe;
1228 
1229     /* Remove QP from retry timer */
1230     rvt_stop_rc_timers(qp);
1231 
1232     wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1233 
1234     while (qib_cmp24(psn, wqe->lpsn) > 0) {
1235         if (wqe->wr.opcode == IB_WR_RDMA_READ ||
1236             wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1237             wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
1238             break;
1239         wqe = do_rc_completion(qp, wqe, ibp);
1240     }
1241 
1242     ibp->rvp.n_rdma_seq++;
1243     qp->r_flags |= RVT_R_RDMAR_SEQ;
1244     qib_restart_rc(qp, qp->s_last_psn + 1, 0);
1245     if (list_empty(&qp->rspwait)) {
1246         qp->r_flags |= RVT_R_RSP_SEND;
1247         rvt_get_qp(qp);
1248         list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1249     }
1250 }
1251 
1252 /**
1253  * qib_rc_rcv_resp - process an incoming RC response packet
1254  * @ibp: the port this packet came in on
1255  * @ohdr: the other headers for this packet
1256  * @data: the packet data
1257  * @tlen: the packet length
1258  * @qp: the QP for this packet
1259  * @opcode: the opcode for this packet
1260  * @psn: the packet sequence number for this packet
1261  * @hdrsize: the header length
1262  * @pmtu: the path MTU
1263  * @rcd: the context pointer
1264  *
1265  * This is called from qib_rc_rcv() to process an incoming RC response
1266  * packet for the given QP.
1267  * Called at interrupt level.
1268  */
1269 static void qib_rc_rcv_resp(struct qib_ibport *ibp,
1270                 struct ib_other_headers *ohdr,
1271                 void *data, u32 tlen,
1272                 struct rvt_qp *qp,
1273                 u32 opcode,
1274                 u32 psn, u32 hdrsize, u32 pmtu,
1275                 struct qib_ctxtdata *rcd)
1276 {
1277     struct rvt_swqe *wqe;
1278     struct qib_pportdata *ppd = ppd_from_ibp(ibp);
1279     enum ib_wc_status status;
1280     unsigned long flags;
1281     int diff;
1282     u32 pad;
1283     u32 aeth;
1284     u64 val;
1285 
1286     if (opcode != OP(RDMA_READ_RESPONSE_MIDDLE)) {
1287         /*
1288          * If ACK'd PSN on SDMA busy list try to make progress to
1289          * reclaim SDMA credits.
1290          */
1291         if ((qib_cmp24(psn, qp->s_sending_psn) >= 0) &&
1292             (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) {
1293 
1294             /*
1295              * If send tasklet not running attempt to progress
1296              * SDMA queue.
1297              */
1298             if (!(qp->s_flags & RVT_S_BUSY)) {
1299                 /* Acquire SDMA Lock */
1300                 spin_lock_irqsave(&ppd->sdma_lock, flags);
1301                 /* Invoke sdma make progress */
1302                 qib_sdma_make_progress(ppd);
1303                 /* Release SDMA Lock */
1304                 spin_unlock_irqrestore(&ppd->sdma_lock, flags);
1305             }
1306         }
1307     }
1308 
1309     spin_lock_irqsave(&qp->s_lock, flags);
1310     if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
1311         goto ack_done;
1312 
1313     /* Ignore invalid responses. */
1314     if (qib_cmp24(psn, READ_ONCE(qp->s_next_psn)) >= 0)
1315         goto ack_done;
1316 
1317     /* Ignore duplicate responses. */
1318     diff = qib_cmp24(psn, qp->s_last_psn);
1319     if (unlikely(diff <= 0)) {
1320         /* Update credits for "ghost" ACKs */
1321         if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
1322             aeth = be32_to_cpu(ohdr->u.aeth);
1323             if ((aeth >> IB_AETH_NAK_SHIFT) == 0)
1324                 rvt_get_credit(qp, aeth);
1325         }
1326         goto ack_done;
1327     }
1328 
1329     /*
1330      * Skip everything other than the PSN we expect, if we are waiting
1331      * for a reply to a restarted RDMA read or atomic op.
1332      */
1333     if (qp->r_flags & RVT_R_RDMAR_SEQ) {
1334         if (qib_cmp24(psn, qp->s_last_psn + 1) != 0)
1335             goto ack_done;
1336         qp->r_flags &= ~RVT_R_RDMAR_SEQ;
1337     }
1338 
1339     if (unlikely(qp->s_acked == qp->s_tail))
1340         goto ack_done;
1341     wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1342     status = IB_WC_SUCCESS;
1343 
1344     switch (opcode) {
1345     case OP(ACKNOWLEDGE):
1346     case OP(ATOMIC_ACKNOWLEDGE):
1347     case OP(RDMA_READ_RESPONSE_FIRST):
1348         aeth = be32_to_cpu(ohdr->u.aeth);
1349         if (opcode == OP(ATOMIC_ACKNOWLEDGE))
1350             val = ib_u64_get(&ohdr->u.at.atomic_ack_eth);
1351         else
1352             val = 0;
1353         if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
1354             opcode != OP(RDMA_READ_RESPONSE_FIRST))
1355             goto ack_done;
1356         hdrsize += 4;
1357         wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1358         if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1359             goto ack_op_err;
1360         /*
1361          * If this is a response to a resent RDMA read, we
1362          * have to be careful to copy the data to the right
1363          * location.
1364          */
1365         qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1366                           wqe, psn, pmtu);
1367         goto read_middle;
1368 
1369     case OP(RDMA_READ_RESPONSE_MIDDLE):
1370         /* no AETH, no ACK */
1371         if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
1372             goto ack_seq_err;
1373         if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1374             goto ack_op_err;
1375 read_middle:
1376         if (unlikely(tlen != (hdrsize + pmtu + 4)))
1377             goto ack_len_err;
1378         if (unlikely(pmtu >= qp->s_rdma_read_len))
1379             goto ack_len_err;
1380 
1381         /*
1382          * We got a response so update the timeout.
1383          * 4.096 usec. * (1 << qp->timeout)
1384          */
1385         rvt_mod_retry_timer(qp);
1386         if (qp->s_flags & RVT_S_WAIT_ACK) {
1387             qp->s_flags &= ~RVT_S_WAIT_ACK;
1388             qib_schedule_send(qp);
1389         }
1390 
1391         if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
1392             qp->s_retry = qp->s_retry_cnt;
1393 
1394         /*
1395          * Update the RDMA receive state but do the copy w/o
1396          * holding the locks and blocking interrupts.
1397          */
1398         qp->s_rdma_read_len -= pmtu;
1399         update_last_psn(qp, psn);
1400         spin_unlock_irqrestore(&qp->s_lock, flags);
1401         rvt_copy_sge(qp, &qp->s_rdma_read_sge,
1402                  data, pmtu, false, false);
1403         goto bail;
1404 
1405     case OP(RDMA_READ_RESPONSE_ONLY):
1406         aeth = be32_to_cpu(ohdr->u.aeth);
1407         if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
1408             goto ack_done;
1409         /* Get the number of bytes the message was padded by. */
1410         pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1411         /*
1412          * Check that the data size is >= 0 && <= pmtu.
1413          * Remember to account for the AETH header (4) and
1414          * ICRC (4).
1415          */
1416         if (unlikely(tlen < (hdrsize + pad + 8)))
1417             goto ack_len_err;
1418         /*
1419          * If this is a response to a resent RDMA read, we
1420          * have to be careful to copy the data to the right
1421          * location.
1422          */
1423         wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1424         qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1425                           wqe, psn, pmtu);
1426         goto read_last;
1427 
1428     case OP(RDMA_READ_RESPONSE_LAST):
1429         /* ACKs READ req. */
1430         if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
1431             goto ack_seq_err;
1432         if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1433             goto ack_op_err;
1434         /* Get the number of bytes the message was padded by. */
1435         pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1436         /*
1437          * Check that the data size is >= 1 && <= pmtu.
1438          * Remember to account for the AETH header (4) and
1439          * ICRC (4).
1440          */
1441         if (unlikely(tlen <= (hdrsize + pad + 8)))
1442             goto ack_len_err;
1443 read_last:
1444         tlen -= hdrsize + pad + 8;
1445         if (unlikely(tlen != qp->s_rdma_read_len))
1446             goto ack_len_err;
1447         aeth = be32_to_cpu(ohdr->u.aeth);
1448         rvt_copy_sge(qp, &qp->s_rdma_read_sge,
1449                  data, tlen, false, false);
1450         WARN_ON(qp->s_rdma_read_sge.num_sge);
1451         (void) do_rc_ack(qp, aeth, psn,
1452                  OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
1453         goto ack_done;
1454     }
1455 
1456 ack_op_err:
1457     status = IB_WC_LOC_QP_OP_ERR;
1458     goto ack_err;
1459 
1460 ack_seq_err:
1461     rdma_seq_err(qp, ibp, psn, rcd);
1462     goto ack_done;
1463 
1464 ack_len_err:
1465     status = IB_WC_LOC_LEN_ERR;
1466 ack_err:
1467     if (qp->s_last == qp->s_acked) {
1468         rvt_send_complete(qp, wqe, status);
1469         rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1470     }
1471 ack_done:
1472     spin_unlock_irqrestore(&qp->s_lock, flags);
1473 bail:
1474     return;
1475 }
1476 
1477 /**
1478  * qib_rc_rcv_error - process an incoming duplicate or error RC packet
1479  * @ohdr: the other headers for this packet
1480  * @data: the packet data
1481  * @qp: the QP for this packet
1482  * @opcode: the opcode for this packet
1483  * @psn: the packet sequence number for this packet
1484  * @diff: the difference between the PSN and the expected PSN
1485  * @rcd: the context pointer
1486  *
1487  * This is called from qib_rc_rcv() to process an unexpected
1488  * incoming RC packet for the given QP.
1489  * Called at interrupt level.
1490  * Return 1 if no more processing is needed; otherwise return 0 to
1491  * schedule a response to be sent.
1492  */
1493 static int qib_rc_rcv_error(struct ib_other_headers *ohdr,
1494                 void *data,
1495                 struct rvt_qp *qp,
1496                 u32 opcode,
1497                 u32 psn,
1498                 int diff,
1499                 struct qib_ctxtdata *rcd)
1500 {
1501     struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1502     struct rvt_ack_entry *e;
1503     unsigned long flags;
1504     u8 i, prev;
1505     int old_req;
1506 
1507     if (diff > 0) {
1508         /*
1509          * Packet sequence error.
1510          * A NAK will ACK earlier sends and RDMA writes.
1511          * Don't queue the NAK if we already sent one.
1512          */
1513         if (!qp->r_nak_state) {
1514             ibp->rvp.n_rc_seqnak++;
1515             qp->r_nak_state = IB_NAK_PSN_ERROR;
1516             /* Use the expected PSN. */
1517             qp->r_ack_psn = qp->r_psn;
1518             /*
1519              * Wait to send the sequence NAK until all packets
1520              * in the receive queue have been processed.
1521              * Otherwise, we end up propagating congestion.
1522              */
1523             if (list_empty(&qp->rspwait)) {
1524                 qp->r_flags |= RVT_R_RSP_NAK;
1525                 rvt_get_qp(qp);
1526                 list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1527             }
1528         }
1529         goto done;
1530     }
1531 
1532     /*
1533      * Handle a duplicate request.  Don't re-execute SEND, RDMA
1534      * write or atomic op.  Don't NAK errors, just silently drop
1535      * the duplicate request.  Note that r_sge, r_len, and
1536      * r_rcv_len may be in use so don't modify them.
1537      *
1538      * We are supposed to ACK the earliest duplicate PSN but we
1539      * can coalesce an outstanding duplicate ACK.  We have to
1540      * send the earliest so that RDMA reads can be restarted at
1541      * the requester's expected PSN.
1542      *
1543      * First, find where this duplicate PSN falls within the
1544      * ACKs previously sent.
1545      * old_req is true if there is an older response that is scheduled
1546      * to be sent before sending this one.
1547      */
1548     e = NULL;
1549     old_req = 1;
1550     ibp->rvp.n_rc_dupreq++;
1551 
1552     spin_lock_irqsave(&qp->s_lock, flags);
1553 
1554     for (i = qp->r_head_ack_queue; ; i = prev) {
1555         if (i == qp->s_tail_ack_queue)
1556             old_req = 0;
1557         if (i)
1558             prev = i - 1;
1559         else
1560             prev = QIB_MAX_RDMA_ATOMIC;
1561         if (prev == qp->r_head_ack_queue) {
1562             e = NULL;
1563             break;
1564         }
1565         e = &qp->s_ack_queue[prev];
1566         if (!e->opcode) {
1567             e = NULL;
1568             break;
1569         }
1570         if (qib_cmp24(psn, e->psn) >= 0) {
1571             if (prev == qp->s_tail_ack_queue &&
1572                 qib_cmp24(psn, e->lpsn) <= 0)
1573                 old_req = 0;
1574             break;
1575         }
1576     }
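    /*
     * The loop above walks backward from r_head_ack_queue (the newest
     * queued response) toward s_tail_ack_queue (the one currently being
     * sent), wrapping through QIB_MAX_RDMA_ATOMIC since s_ack_queue is a
     * circular buffer.  It stops at the first entry whose starting PSN is
     * at or before the duplicate PSN; e then points at the response that
     * covers the duplicate, or is NULL if no queued entry does.
     */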
1577     switch (opcode) {
1578     case OP(RDMA_READ_REQUEST): {
1579         struct ib_reth *reth;
1580         u32 offset;
1581         u32 len;
1582 
1583         /*
1584          * If we didn't find the RDMA read request in the ack queue,
1585          * we can ignore this request.
1586          */
1587         if (!e || e->opcode != OP(RDMA_READ_REQUEST))
1588             goto unlock_done;
1589         /* RETH comes after BTH */
1590         reth = &ohdr->u.rc.reth;
1591         /*
1592          * Address range must be a subset of the original
1593          * request and start on pmtu boundaries.
1594          * We reuse the old ack_queue slot since the requester
1595          * should not back up and request an earlier PSN for the
1596          * same request.
1597          */
1598         offset = ((psn - e->psn) & QIB_PSN_MASK) *
1599             qp->pmtu;
1600         len = be32_to_cpu(reth->length);
1601         if (unlikely(offset + len != e->rdma_sge.sge_length))
1602             goto unlock_done;
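            /*
             * Illustrative example: if the original read began at
             * e->psn = 100 and the duplicate asks for PSN 103 with a
             * 2048-byte pmtu, offset is 3 * 2048 = 6144, and the requested
             * length must cover exactly the remaining
             * e->rdma_sge.sge_length - 6144 bytes or the request is
             * silently dropped.
             */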
1603         if (e->rdma_sge.mr) {
1604             rvt_put_mr(e->rdma_sge.mr);
1605             e->rdma_sge.mr = NULL;
1606         }
1607         if (len != 0) {
1608             u32 rkey = be32_to_cpu(reth->rkey);
1609             u64 vaddr = be64_to_cpu(reth->vaddr);
1610             int ok;
1611 
1612             ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
1613                      IB_ACCESS_REMOTE_READ);
1614             if (unlikely(!ok))
1615                 goto unlock_done;
1616         } else {
1617             e->rdma_sge.vaddr = NULL;
1618             e->rdma_sge.length = 0;
1619             e->rdma_sge.sge_length = 0;
1620         }
1621         e->psn = psn;
1622         if (old_req)
1623             goto unlock_done;
1624         qp->s_tail_ack_queue = prev;
1625         break;
1626     }
1627 
1628     case OP(COMPARE_SWAP):
1629     case OP(FETCH_ADD): {
1630         /*
1631          * If we didn't find the atomic request in the ack queue
1632          * or the send tasklet is already backed up to send an
1633          * earlier entry, we can ignore this request.
1634          */
1635         if (!e || e->opcode != (u8) opcode || old_req)
1636             goto unlock_done;
1637         qp->s_tail_ack_queue = prev;
1638         break;
1639     }
1640 
1641     default:
1642         /*
1643          * Ignore this operation if it doesn't request an ACK
1644          * or if an earlier RDMA read or atomic is going to be resent.
1645          */
1646         if (!(psn & IB_BTH_REQ_ACK) || old_req)
1647             goto unlock_done;
1648         /*
1649          * Resend the most recent ACK if this request is
1650          * after all the previous RDMA reads and atomics.
1651          */
1652         if (i == qp->r_head_ack_queue) {
1653             spin_unlock_irqrestore(&qp->s_lock, flags);
1654             qp->r_nak_state = 0;
1655             qp->r_ack_psn = qp->r_psn - 1;
1656             goto send_ack;
1657         }
1658         /*
1659          * Try to send a simple ACK to work around a Mellanox bug
1660          * which doesn't accept an RDMA read response or atomic
1661          * response as an ACK for earlier SENDs or RDMA writes.
1662          */
1663         if (!(qp->s_flags & RVT_S_RESP_PENDING)) {
1664             spin_unlock_irqrestore(&qp->s_lock, flags);
1665             qp->r_nak_state = 0;
1666             qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
1667             goto send_ack;
1668         }
1669         /*
1670          * Resend the RDMA read or atomic op which
1671          * ACKs this duplicate request.
1672          */
1673         qp->s_tail_ack_queue = i;
1674         break;
1675     }
1676     qp->s_ack_state = OP(ACKNOWLEDGE);
1677     qp->s_flags |= RVT_S_RESP_PENDING;
1678     qp->r_nak_state = 0;
1679     qib_schedule_send(qp);
1680 
1681 unlock_done:
1682     spin_unlock_irqrestore(&qp->s_lock, flags);
1683 done:
1684     return 1;
1685 
1686 send_ack:
1687     return 0;
1688 }
1689 
1690 static inline void qib_update_ack_queue(struct rvt_qp *qp, unsigned n)
1691 {
1692     unsigned next;
1693 
1694     next = n + 1;
1695     if (next > QIB_MAX_RDMA_ATOMIC)
1696         next = 0;
1697     qp->s_tail_ack_queue = next;
1698     qp->s_ack_state = OP(ACKNOWLEDGE);
1699 }
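/*
 * s_ack_queue has QIB_MAX_RDMA_ATOMIC + 1 slots; the spare slot presumably
 * keeps r_head_ack_queue distinguishable from s_tail_ack_queue when the
 * maximum number of RDMA read/atomic responses is outstanding (see the
 * "> not >=" note in qib_rc_rcv() below).
 */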
1700 
1701 /**
1702  * qib_rc_rcv - process an incoming RC packet
1703  * @rcd: the context pointer
1704  * @hdr: the header of this packet
1705  * @has_grh: true if the header has a GRH
1706  * @data: the packet data
1707  * @tlen: the packet length
1708  * @qp: the QP for this packet
1709  *
1710  * This is called from qib_qp_rcv() to process an incoming RC packet
1711  * for the given QP.
1712  * Called at interrupt level.
1713  */
1714 void qib_rc_rcv(struct qib_ctxtdata *rcd, struct ib_header *hdr,
1715         int has_grh, void *data, u32 tlen, struct rvt_qp *qp)
1716 {
1717     struct qib_ibport *ibp = &rcd->ppd->ibport_data;
1718     struct ib_other_headers *ohdr;
1719     u32 opcode;
1720     u32 hdrsize;
1721     u32 psn;
1722     u32 pad;
1723     struct ib_wc wc;
1724     u32 pmtu = qp->pmtu;
1725     int diff;
1726     struct ib_reth *reth;
1727     unsigned long flags;
1728     int ret;
1729 
1730     /* Check for GRH */
1731     if (!has_grh) {
1732         ohdr = &hdr->u.oth;
1733         hdrsize = 8 + 12;       /* LRH + BTH */
1734     } else {
1735         ohdr = &hdr->u.l.oth;
1736         hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
1737     }
1738 
1739     opcode = be32_to_cpu(ohdr->bth[0]);
1740     if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
1741         return;
1742 
1743     psn = be32_to_cpu(ohdr->bth[2]);
1744     opcode >>= 24;
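    /*
     * bth[0] in host order: opcode in bits 31:24 (hence the shift above),
     * solicited-event bit 23, migration bit 22, pad count in bits 21:20
     * (extracted later as (bth[0] >> 20) & 3), header version in bits
     * 19:16, and the P_Key in bits 15:0.
     */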
1745 
1746     /*
1747      * Process responses (ACKs) before anything else.  Note that the
1748      * packet sequence number will be for something in the send work
1749      * queue rather than the expected receive packet sequence number.
1750      * In other words, this QP is the requester.
1751      */
1752     if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1753         opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
1754         qib_rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
1755                 hdrsize, pmtu, rcd);
1756         return;
1757     }
1758 
1759     /* Compute 24 bits worth of difference. */
1760     diff = qib_cmp24(psn, qp->r_psn);
1761     if (unlikely(diff)) {
1762         if (qib_rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
1763             return;
1764         goto send_ack;
1765     }
1766 
1767     /* Check for opcode sequence errors. */
1768     switch (qp->r_state) {
1769     case OP(SEND_FIRST):
1770     case OP(SEND_MIDDLE):
1771         if (opcode == OP(SEND_MIDDLE) ||
1772             opcode == OP(SEND_LAST) ||
1773             opcode == OP(SEND_LAST_WITH_IMMEDIATE))
1774             break;
1775         goto nack_inv;
1776 
1777     case OP(RDMA_WRITE_FIRST):
1778     case OP(RDMA_WRITE_MIDDLE):
1779         if (opcode == OP(RDMA_WRITE_MIDDLE) ||
1780             opcode == OP(RDMA_WRITE_LAST) ||
1781             opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
1782             break;
1783         goto nack_inv;
1784 
1785     default:
1786         if (opcode == OP(SEND_MIDDLE) ||
1787             opcode == OP(SEND_LAST) ||
1788             opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
1789             opcode == OP(RDMA_WRITE_MIDDLE) ||
1790             opcode == OP(RDMA_WRITE_LAST) ||
1791             opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
1792             goto nack_inv;
1793         /*
1794          * Note that it is up to the requester to not send a new
1795          * RDMA read or atomic operation before receiving an ACK
1796          * for the previous operation.
1797          */
1798         break;
1799     }
1800 
1801     if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
1802         rvt_comm_est(qp);
1803 
1804     /* OK, process the packet. */
1805     switch (opcode) {
1806     case OP(SEND_FIRST):
1807         ret = rvt_get_rwqe(qp, false);
1808         if (ret < 0)
1809             goto nack_op_err;
1810         if (!ret)
1811             goto rnr_nak;
1812         qp->r_rcv_len = 0;
1813         fallthrough;
1814     case OP(SEND_MIDDLE):
1815     case OP(RDMA_WRITE_MIDDLE):
1816 send_middle:
1817         /* Check the length: exactly one PMTU of payload, within the posted rwqe length. */
1818         if (unlikely(tlen != (hdrsize + pmtu + 4)))
1819             goto nack_inv;
1820         qp->r_rcv_len += pmtu;
1821         if (unlikely(qp->r_rcv_len > qp->r_len))
1822             goto nack_inv;
1823         rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false);
1824         break;
1825 
1826     case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
1827         /* consume RWQE */
1828         ret = rvt_get_rwqe(qp, true);
1829         if (ret < 0)
1830             goto nack_op_err;
1831         if (!ret)
1832             goto rnr_nak;
1833         goto send_last_imm;
1834 
1835     case OP(SEND_ONLY):
1836     case OP(SEND_ONLY_WITH_IMMEDIATE):
1837         ret = rvt_get_rwqe(qp, false);
1838         if (ret < 0)
1839             goto nack_op_err;
1840         if (!ret)
1841             goto rnr_nak;
1842         qp->r_rcv_len = 0;
1843         if (opcode == OP(SEND_ONLY))
1844             goto no_immediate_data;
1845         fallthrough;    /* for SEND_ONLY_WITH_IMMEDIATE */
1846     case OP(SEND_LAST_WITH_IMMEDIATE):
1847 send_last_imm:
1848         wc.ex.imm_data = ohdr->u.imm_data;
1849         hdrsize += 4;
1850         wc.wc_flags = IB_WC_WITH_IMM;
1851         goto send_last;
1852     case OP(SEND_LAST):
1853     case OP(RDMA_WRITE_LAST):
1854 no_immediate_data:
1855         wc.wc_flags = 0;
1856         wc.ex.imm_data = 0;
1857 send_last:
1858         /* Get the number of bytes the message was padded by. */
1859         pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1860         /* Check for invalid length. */
1861         /* XXX LAST len should be >= 1 */
1862         if (unlikely(tlen < (hdrsize + pad + 4)))
1863             goto nack_inv;
1864         /* Don't count the CRC. */
1865         tlen -= (hdrsize + pad + 4);
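        /*
         * Unlike the read-response path, request packets carry no AETH,
         * so only the 4-byte ICRC (plus any padding) trails the payload
         * here, hence "+ 4" rather than "+ 8".
         */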
1866         wc.byte_len = tlen + qp->r_rcv_len;
1867         if (unlikely(wc.byte_len > qp->r_len))
1868             goto nack_inv;
1869         rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false);
1870         rvt_put_ss(&qp->r_sge);
1871         qp->r_msn++;
1872         if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
1873             break;
1874         wc.wr_id = qp->r_wr_id;
1875         wc.status = IB_WC_SUCCESS;
1876         if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
1877             opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
1878             wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
1879         else
1880             wc.opcode = IB_WC_RECV;
1881         wc.qp = &qp->ibqp;
1882         wc.src_qp = qp->remote_qpn;
1883         wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr);
1884         wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
1885         /* zero fields that are N/A */
1886         wc.vendor_err = 0;
1887         wc.pkey_index = 0;
1888         wc.dlid_path_bits = 0;
1889         wc.port_num = 0;
1890         /* Signal completion event if the solicited bit is set. */
1891         rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
1892         break;
1893 
1894     case OP(RDMA_WRITE_FIRST):
1895     case OP(RDMA_WRITE_ONLY):
1896     case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
1897         if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
1898             goto nack_inv;
1899         /* consume RWQE */
1900         reth = &ohdr->u.rc.reth;
1901         hdrsize += sizeof(*reth);
1902         qp->r_len = be32_to_cpu(reth->length);
1903         qp->r_rcv_len = 0;
1904         qp->r_sge.sg_list = NULL;
1905         if (qp->r_len != 0) {
1906             u32 rkey = be32_to_cpu(reth->rkey);
1907             u64 vaddr = be64_to_cpu(reth->vaddr);
1908             int ok;
1909 
1910             /* Check rkey & NAK */
1911             ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
1912                      rkey, IB_ACCESS_REMOTE_WRITE);
1913             if (unlikely(!ok))
1914                 goto nack_acc;
1915             qp->r_sge.num_sge = 1;
1916         } else {
1917             qp->r_sge.num_sge = 0;
1918             qp->r_sge.sge.mr = NULL;
1919             qp->r_sge.sge.vaddr = NULL;
1920             qp->r_sge.sge.length = 0;
1921             qp->r_sge.sge.sge_length = 0;
1922         }
1923         if (opcode == OP(RDMA_WRITE_FIRST))
1924             goto send_middle;
1925         else if (opcode == OP(RDMA_WRITE_ONLY))
1926             goto no_immediate_data;
1927         ret = rvt_get_rwqe(qp, true);
1928         if (ret < 0)
1929             goto nack_op_err;
1930         if (!ret) {
1931             rvt_put_ss(&qp->r_sge);
1932             goto rnr_nak;
1933         }
1934         wc.ex.imm_data = ohdr->u.rc.imm_data;
1935         hdrsize += 4;
1936         wc.wc_flags = IB_WC_WITH_IMM;
1937         goto send_last;
1938 
1939     case OP(RDMA_READ_REQUEST): {
1940         struct rvt_ack_entry *e;
1941         u32 len;
1942         u8 next;
1943 
1944         if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
1945             goto nack_inv;
1946         next = qp->r_head_ack_queue + 1;
1947         /* s_ack_queue is size QIB_MAX_RDMA_ATOMIC+1 so use > not >= */
1948         if (next > QIB_MAX_RDMA_ATOMIC)
1949             next = 0;
1950         spin_lock_irqsave(&qp->s_lock, flags);
1951         if (unlikely(next == qp->s_tail_ack_queue)) {
1952             if (!qp->s_ack_queue[next].sent)
1953                 goto nack_inv_unlck;
1954             qib_update_ack_queue(qp, next);
1955         }
1956         e = &qp->s_ack_queue[qp->r_head_ack_queue];
1957         if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
1958             rvt_put_mr(e->rdma_sge.mr);
1959             e->rdma_sge.mr = NULL;
1960         }
1961         reth = &ohdr->u.rc.reth;
1962         len = be32_to_cpu(reth->length);
1963         if (len) {
1964             u32 rkey = be32_to_cpu(reth->rkey);
1965             u64 vaddr = be64_to_cpu(reth->vaddr);
1966             int ok;
1967 
1968             /* Check rkey & NAK */
1969             ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr,
1970                      rkey, IB_ACCESS_REMOTE_READ);
1971             if (unlikely(!ok))
1972                 goto nack_acc_unlck;
1973             /*
1974              * Update the next expected PSN.  We add 1 later
1975              * below, so only add the remainder here.
1976              */
1977             qp->r_psn += rvt_div_mtu(qp, len - 1);
1978         } else {
1979             e->rdma_sge.mr = NULL;
1980             e->rdma_sge.vaddr = NULL;
1981             e->rdma_sge.length = 0;
1982             e->rdma_sge.sge_length = 0;
1983         }
1984         e->opcode = opcode;
1985         e->sent = 0;
1986         e->psn = psn;
1987         e->lpsn = qp->r_psn;
1988         /*
1989          * We need to increment the MSN here instead of when we
1990          * finish sending the result since a duplicate request would
1991          * increment it more than once.
1992          */
1993         qp->r_msn++;
1994         qp->r_psn++;
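        /*
         * Together with the rvt_div_mtu() adjustment above, this reserves
         * one PSN per read-response packet.  Illustrative numbers: an
         * 8192-byte read with a 2048-byte pmtu advances r_psn by
         * ((8192 - 1) >> 11) + 1 = 4, one PSN for each response packet.
         */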
1995         qp->r_state = opcode;
1996         qp->r_nak_state = 0;
1997         qp->r_head_ack_queue = next;
1998 
1999         /* Schedule the send tasklet. */
2000         qp->s_flags |= RVT_S_RESP_PENDING;
2001         qib_schedule_send(qp);
2002 
2003         goto sunlock;
2004     }
2005 
2006     case OP(COMPARE_SWAP):
2007     case OP(FETCH_ADD): {
2008         struct ib_atomic_eth *ateth;
2009         struct rvt_ack_entry *e;
2010         u64 vaddr;
2011         atomic64_t *maddr;
2012         u64 sdata;
2013         u32 rkey;
2014         u8 next;
2015 
2016         if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
2017             goto nack_inv;
2018         next = qp->r_head_ack_queue + 1;
2019         if (next > QIB_MAX_RDMA_ATOMIC)
2020             next = 0;
2021         spin_lock_irqsave(&qp->s_lock, flags);
2022         if (unlikely(next == qp->s_tail_ack_queue)) {
2023             if (!qp->s_ack_queue[next].sent)
2024                 goto nack_inv_unlck;
2025             qib_update_ack_queue(qp, next);
2026         }
2027         e = &qp->s_ack_queue[qp->r_head_ack_queue];
2028         if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
2029             rvt_put_mr(e->rdma_sge.mr);
2030             e->rdma_sge.mr = NULL;
2031         }
2032         ateth = &ohdr->u.atomic_eth;
2033         vaddr = get_ib_ateth_vaddr(ateth);
2034         if (unlikely(vaddr & (sizeof(u64) - 1)))
2035             goto nack_inv_unlck;
2036         rkey = be32_to_cpu(ateth->rkey);
2037         /* Check rkey & NAK */
2038         if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
2039                       vaddr, rkey,
2040                       IB_ACCESS_REMOTE_ATOMIC)))
2041             goto nack_acc_unlck;
2042         /* Perform atomic OP and save result. */
2043         maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
2044         sdata = get_ib_ateth_swap(ateth);
2045         e->atomic_data = (opcode == OP(FETCH_ADD)) ?
2046             (u64) atomic64_add_return(sdata, maddr) - sdata :
2047             (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
2048                       get_ib_ateth_compare(ateth),
2049                       sdata);
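        /*
         * Both branches leave the pre-operation contents of the target in
         * e->atomic_data: atomic64_add_return() returns the post-add value,
         * so sdata is subtracted back out, while cmpxchg() returns the old
         * value whether or not the swap took place.  That prior value is
         * what the ATOMIC_ACKNOWLEDGE reports back to the requester.
         */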
2050         rvt_put_mr(qp->r_sge.sge.mr);
2051         qp->r_sge.num_sge = 0;
2052         e->opcode = opcode;
2053         e->sent = 0;
2054         e->psn = psn;
2055         e->lpsn = psn;
2056         qp->r_msn++;
2057         qp->r_psn++;
2058         qp->r_state = opcode;
2059         qp->r_nak_state = 0;
2060         qp->r_head_ack_queue = next;
2061 
2062         /* Schedule the send tasklet. */
2063         qp->s_flags |= RVT_S_RESP_PENDING;
2064         qib_schedule_send(qp);
2065 
2066         goto sunlock;
2067     }
2068 
2069     default:
2070         /* NAK unknown opcodes. */
2071         goto nack_inv;
2072     }
2073     qp->r_psn++;
2074     qp->r_state = opcode;
2075     qp->r_ack_psn = psn;
2076     qp->r_nak_state = 0;
2077     /* Send an ACK if requested or required (AckReq bit, bit 31 of bth[2]). */
2078     if (psn & (1 << 31))
2079         goto send_ack;
2080     return;
2081 
2082 rnr_nak:
2083     qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
2084     qp->r_ack_psn = qp->r_psn;
2085     /* Queue RNR NAK for later */
2086     if (list_empty(&qp->rspwait)) {
2087         qp->r_flags |= RVT_R_RSP_NAK;
2088         rvt_get_qp(qp);
2089         list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2090     }
2091     return;
2092 
2093 nack_op_err:
2094     rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2095     qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
2096     qp->r_ack_psn = qp->r_psn;
2097     /* Queue NAK for later */
2098     if (list_empty(&qp->rspwait)) {
2099         qp->r_flags |= RVT_R_RSP_NAK;
2100         rvt_get_qp(qp);
2101         list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2102     }
2103     return;
2104 
2105 nack_inv_unlck:
2106     spin_unlock_irqrestore(&qp->s_lock, flags);
2107 nack_inv:
2108     rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2109     qp->r_nak_state = IB_NAK_INVALID_REQUEST;
2110     qp->r_ack_psn = qp->r_psn;
2111     /* Queue NAK for later */
2112     if (list_empty(&qp->rspwait)) {
2113         qp->r_flags |= RVT_R_RSP_NAK;
2114         rvt_get_qp(qp);
2115         list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2116     }
2117     return;
2118 
2119 nack_acc_unlck:
2120     spin_unlock_irqrestore(&qp->s_lock, flags);
2121 nack_acc:
2122     rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
2123     qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
2124     qp->r_ack_psn = qp->r_psn;
2125 send_ack:
2126     qib_send_rc_ack(qp);
2127     return;
2128 
2129 sunlock:
2130     spin_unlock_irqrestore(&qp->s_lock, flags);
2131 }