Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
0002 /*
0003  * Copyright(c) 2015 - 2018 Intel Corporation.
0004  */
0005 
0006 #include <linux/io.h>
0007 #include <rdma/rdma_vt.h>
0008 #include <rdma/rdmavt_qp.h>
0009 
0010 #include "hfi.h"
0011 #include "qp.h"
0012 #include "rc.h"
0013 #include "verbs_txreq.h"
0014 #include "trace.h"
0015 
0016 struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
0017                       u8 *prev_ack, bool *scheduled)
0018     __must_hold(&qp->s_lock)
0019 {
0020     struct rvt_ack_entry *e = NULL;
0021     u8 i, p;
0022     bool s = true;
0023 
0024     for (i = qp->r_head_ack_queue; ; i = p) {
0025         if (i == qp->s_tail_ack_queue)
0026             s = false;
0027         if (i)
0028             p = i - 1;
0029         else
0030             p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
0031         if (p == qp->r_head_ack_queue) {
0032             e = NULL;
0033             break;
0034         }
0035         e = &qp->s_ack_queue[p];
0036         if (!e->opcode) {
0037             e = NULL;
0038             break;
0039         }
0040         if (cmp_psn(psn, e->psn) >= 0) {
0041             if (p == qp->s_tail_ack_queue &&
0042                 cmp_psn(psn, e->lpsn) <= 0)
0043                 s = false;
0044             break;
0045         }
0046     }
0047     if (prev)
0048         *prev = p;
0049     if (prev_ack)
0050         *prev_ack = i;
0051     if (scheduled)
0052         *scheduled = s;
0053     return e;
0054 }
0055 
0056 /**
0057  * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
0058  * @dev: the device for this QP
0059  * @qp: a pointer to the QP
0060  * @ohdr: a pointer to the IB header being constructed
0061  * @ps: the xmit packet state
0062  *
0063  * Return 1 if constructed; otherwise, return 0.
0064  * Note that we are in the responder's side of the QP context.
0065  * Note the QP s_lock must be held.
0066  */
0067 static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
0068                struct ib_other_headers *ohdr,
0069                struct hfi1_pkt_state *ps)
0070 {
0071     struct rvt_ack_entry *e;
0072     u32 hwords, hdrlen;
0073     u32 len = 0;
0074     u32 bth0 = 0, bth2 = 0;
0075     u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
0076     int middle = 0;
0077     u32 pmtu = qp->pmtu;
0078     struct hfi1_qp_priv *qpriv = qp->priv;
0079     bool last_pkt;
0080     u32 delta;
0081     u8 next = qp->s_tail_ack_queue;
0082     struct tid_rdma_request *req;
0083 
0084     trace_hfi1_rsp_make_rc_ack(qp, 0);
0085     lockdep_assert_held(&qp->s_lock);
0086     /* Don't send an ACK if we aren't supposed to. */
0087     if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
0088         goto bail;
0089 
0090     if (qpriv->hdr_type == HFI1_PKT_TYPE_9B)
0091         /* header size in 32-bit words LRH+BTH = (8+12)/4. */
0092         hwords = 5;
0093     else
0094         /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */
0095         hwords = 7;
0096 
0097     switch (qp->s_ack_state) {
0098     case OP(RDMA_READ_RESPONSE_LAST):
0099     case OP(RDMA_READ_RESPONSE_ONLY):
0100         e = &qp->s_ack_queue[qp->s_tail_ack_queue];
0101         release_rdma_sge_mr(e);
0102         fallthrough;
0103     case OP(ATOMIC_ACKNOWLEDGE):
0104         /*
0105          * We can increment the tail pointer now that the last
0106          * response has been sent instead of only being
0107          * constructed.
0108          */
0109         if (++next > rvt_size_atomic(&dev->rdi))
0110             next = 0;
0111         /*
0112          * Only advance the s_acked_ack_queue pointer if there
0113          * have been no TID RDMA requests.
0114          */
0115         e = &qp->s_ack_queue[qp->s_tail_ack_queue];
0116         if (e->opcode != TID_OP(WRITE_REQ) &&
0117             qp->s_acked_ack_queue == qp->s_tail_ack_queue)
0118             qp->s_acked_ack_queue = next;
0119         qp->s_tail_ack_queue = next;
0120         trace_hfi1_rsp_make_rc_ack(qp, e->psn);
0121         fallthrough;
0122     case OP(SEND_ONLY):
0123     case OP(ACKNOWLEDGE):
0124         /* Check for no next entry in the queue. */
0125         if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
0126             if (qp->s_flags & RVT_S_ACK_PENDING)
0127                 goto normal;
0128             goto bail;
0129         }
0130 
0131         e = &qp->s_ack_queue[qp->s_tail_ack_queue];
0132         /* Check for tid write fence */
0133         if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) ||
0134             hfi1_tid_rdma_ack_interlock(qp, e)) {
0135             iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB);
0136             goto bail;
0137         }
0138         if (e->opcode == OP(RDMA_READ_REQUEST)) {
0139             /*
0140              * If a RDMA read response is being resent and
0141              * we haven't seen the duplicate request yet,
0142              * then stop sending the remaining responses the
0143              * responder has seen until the requester re-sends it.
0144              */
0145             len = e->rdma_sge.sge_length;
0146             if (len && !e->rdma_sge.mr) {
0147                 if (qp->s_acked_ack_queue ==
0148                     qp->s_tail_ack_queue)
0149                     qp->s_acked_ack_queue =
0150                         qp->r_head_ack_queue;
0151                 qp->s_tail_ack_queue = qp->r_head_ack_queue;
0152                 goto bail;
0153             }
0154             /* Copy SGE state in case we need to resend */
0155             ps->s_txreq->mr = e->rdma_sge.mr;
0156             if (ps->s_txreq->mr)
0157                 rvt_get_mr(ps->s_txreq->mr);
0158             qp->s_ack_rdma_sge.sge = e->rdma_sge;
0159             qp->s_ack_rdma_sge.num_sge = 1;
0160             ps->s_txreq->ss = &qp->s_ack_rdma_sge;
0161             if (len > pmtu) {
0162                 len = pmtu;
0163                 qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
0164             } else {
0165                 qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
0166                 e->sent = 1;
0167             }
0168             ohdr->u.aeth = rvt_compute_aeth(qp);
0169             hwords++;
0170             qp->s_ack_rdma_psn = e->psn;
0171             bth2 = mask_psn(qp->s_ack_rdma_psn++);
0172         } else if (e->opcode == TID_OP(WRITE_REQ)) {
0173             /*
0174              * If a TID RDMA WRITE RESP is being resent, we have to
0175              * wait for the actual request. All requests that are to
0176              * be resent will have their state set to
0177              * TID_REQUEST_RESEND. When the new request arrives, the
0178              * state will be changed to TID_REQUEST_RESEND_ACTIVE.
0179              */
0180             req = ack_to_tid_req(e);
0181             if (req->state == TID_REQUEST_RESEND ||
0182                 req->state == TID_REQUEST_INIT_RESEND)
0183                 goto bail;
0184             qp->s_ack_state = TID_OP(WRITE_RESP);
0185             qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg);
0186             goto write_resp;
0187         } else if (e->opcode == TID_OP(READ_REQ)) {
0188             /*
0189              * If a TID RDMA read response is being resent and
0190              * we haven't seen the duplicate request yet,
0191              * then stop sending the remaining responses the
0192              * responder has seen until the requester re-sends it.
0193              */
0194             len = e->rdma_sge.sge_length;
0195             if (len && !e->rdma_sge.mr) {
0196                 if (qp->s_acked_ack_queue ==
0197                     qp->s_tail_ack_queue)
0198                     qp->s_acked_ack_queue =
0199                         qp->r_head_ack_queue;
0200                 qp->s_tail_ack_queue = qp->r_head_ack_queue;
0201                 goto bail;
0202             }
0203             /* Copy SGE state in case we need to resend */
0204             ps->s_txreq->mr = e->rdma_sge.mr;
0205             if (ps->s_txreq->mr)
0206                 rvt_get_mr(ps->s_txreq->mr);
0207             qp->s_ack_rdma_sge.sge = e->rdma_sge;
0208             qp->s_ack_rdma_sge.num_sge = 1;
0209             qp->s_ack_state = TID_OP(READ_RESP);
0210             goto read_resp;
0211         } else {
0212             /* COMPARE_SWAP or FETCH_ADD */
0213             ps->s_txreq->ss = NULL;
0214             len = 0;
0215             qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
0216             ohdr->u.at.aeth = rvt_compute_aeth(qp);
0217             ib_u64_put(e->atomic_data, &ohdr->u.at.atomic_ack_eth);
0218             hwords += sizeof(ohdr->u.at) / sizeof(u32);
0219             bth2 = mask_psn(e->psn);
0220             e->sent = 1;
0221         }
0222         trace_hfi1_tid_write_rsp_make_rc_ack(qp);
0223         bth0 = qp->s_ack_state << 24;
0224         break;
0225 
0226     case OP(RDMA_READ_RESPONSE_FIRST):
0227         qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
0228         fallthrough;
0229     case OP(RDMA_READ_RESPONSE_MIDDLE):
0230         ps->s_txreq->ss = &qp->s_ack_rdma_sge;
0231         ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
0232         if (ps->s_txreq->mr)
0233             rvt_get_mr(ps->s_txreq->mr);
0234         len = qp->s_ack_rdma_sge.sge.sge_length;
0235         if (len > pmtu) {
0236             len = pmtu;
0237             middle = HFI1_CAP_IS_KSET(SDMA_AHG);
0238         } else {
0239             ohdr->u.aeth = rvt_compute_aeth(qp);
0240             hwords++;
0241             qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
0242             e = &qp->s_ack_queue[qp->s_tail_ack_queue];
0243             e->sent = 1;
0244         }
0245         bth0 = qp->s_ack_state << 24;
0246         bth2 = mask_psn(qp->s_ack_rdma_psn++);
0247         break;
0248 
0249     case TID_OP(WRITE_RESP):
0250 write_resp:
0251         /*
0252          * 1. Check if RVT_S_ACK_PENDING is set. If yes,
0253          *    goto normal.
0254          * 2. Attempt to allocate TID resources.
0255          * 3. Remove RVT_S_RESP_PENDING flags from s_flags
0256          * 4. If resources not available:
0257          *    4.1 Set RVT_S_WAIT_TID_SPACE
0258          *    4.2 Queue QP on RCD TID queue
0259          *    4.3 Put QP on iowait list.
0260          *    4.4 Build IB RNR NAK with appropriate timeout value
0261          *    4.5 Return indication progress made.
0262          * 5. If resources are available:
0263          *    5.1 Program HW flow CSRs
0264          *    5.2 Build TID RDMA WRITE RESP packet
0265          *    5.3 If more resources needed, do 2.1 - 2.3.
0266          *    5.4 Wake up next QP on RCD TID queue.
0267          *    5.5 Return indication progress made.
0268          */
0269 
0270         e = &qp->s_ack_queue[qp->s_tail_ack_queue];
0271         req = ack_to_tid_req(e);
0272 
0273         /*
0274          * Send scheduled RNR NAK's. RNR NAK's need to be sent at
0275          * segment boundaries, not at request boundaries. Don't change
0276          * s_ack_state because we are still in the middle of a request
0277          */
0278         if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND &&
0279             qp->s_tail_ack_queue == qpriv->r_tid_alloc &&
0280             req->cur_seg == req->alloc_seg) {
0281             qpriv->rnr_nak_state = TID_RNR_NAK_SENT;
0282             goto normal_no_state;
0283         }
0284 
0285         bth2 = mask_psn(qp->s_ack_rdma_psn);
0286         hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1,
0287                             bth2, &len,
0288                             &ps->s_txreq->ss);
0289         if (!hdrlen)
0290             return 0;
0291 
0292         hwords += hdrlen;
0293         bth0 = qp->s_ack_state << 24;
0294         qp->s_ack_rdma_psn++;
0295         trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn,
0296                              e->lpsn, req);
0297         if (req->cur_seg != req->total_segs)
0298             break;
0299 
0300         e->sent = 1;
0301         /* Do not free e->rdma_sge until all data are received */
0302         qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
0303         break;
0304 
0305     case TID_OP(READ_RESP):
0306 read_resp:
0307         e = &qp->s_ack_queue[qp->s_tail_ack_queue];
0308         ps->s_txreq->ss = &qp->s_ack_rdma_sge;
0309         delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0,
0310                               &bth1, &bth2, &len,
0311                               &last_pkt);
0312         if (delta == 0)
0313             goto error_qp;
0314         hwords += delta;
0315         if (last_pkt) {
0316             e->sent = 1;
0317             /*
0318              * Increment qp->s_tail_ack_queue through s_ack_state
0319              * transition.
0320              */
0321             qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
0322         }
0323         break;
0324     case TID_OP(READ_REQ):
0325         goto bail;
0326 
0327     default:
0328 normal:
0329         /*
0330          * Send a regular ACK.
0331          * Set the s_ack_state so we wait until after sending
0332          * the ACK before setting s_ack_state to ACKNOWLEDGE
0333          * (see above).
0334          */
0335         qp->s_ack_state = OP(SEND_ONLY);
0336 normal_no_state:
0337         if (qp->s_nak_state)
0338             ohdr->u.aeth =
0339                 cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
0340                         (qp->s_nak_state <<
0341                          IB_AETH_CREDIT_SHIFT));
0342         else
0343             ohdr->u.aeth = rvt_compute_aeth(qp);
0344         hwords++;
0345         len = 0;
0346         bth0 = OP(ACKNOWLEDGE) << 24;
0347         bth2 = mask_psn(qp->s_ack_psn);
0348         qp->s_flags &= ~RVT_S_ACK_PENDING;
0349         ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
0350         ps->s_txreq->ss = NULL;
0351     }
0352     qp->s_rdma_ack_cnt++;
0353     ps->s_txreq->sde = qpriv->s_sde;
0354     ps->s_txreq->s_cur_size = len;
0355     ps->s_txreq->hdr_dwords = hwords;
0356     hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
0357     return 1;
0358 error_qp:
0359     spin_unlock_irqrestore(&qp->s_lock, ps->flags);
0360     spin_lock_irqsave(&qp->r_lock, ps->flags);
0361     spin_lock(&qp->s_lock);
0362     rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
0363     spin_unlock(&qp->s_lock);
0364     spin_unlock_irqrestore(&qp->r_lock, ps->flags);
0365     spin_lock_irqsave(&qp->s_lock, ps->flags);
0366 bail:
0367     qp->s_ack_state = OP(ACKNOWLEDGE);
0368     /*
0369      * Ensure s_rdma_ack_cnt changes are committed prior to resetting
0370      * RVT_S_RESP_PENDING
0371      */
0372     smp_wmb();
0373     qp->s_flags &= ~(RVT_S_RESP_PENDING
0374                 | RVT_S_ACK_PENDING
0375                 | HFI1_S_AHG_VALID);
0376     return 0;
0377 }
0378 
0379 /**
0380  * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
0381  * @qp: a pointer to the QP
0382  * @ps: the current packet state
0383  *
0384  * Assumes s_lock is held.
0385  *
0386  * Return 1 if constructed; otherwise, return 0.
0387  */
0388 int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
0389 {
0390     struct hfi1_qp_priv *priv = qp->priv;
0391     struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
0392     struct ib_other_headers *ohdr;
0393     struct rvt_sge_state *ss = NULL;
0394     struct rvt_swqe *wqe;
0395     struct hfi1_swqe_priv *wpriv;
0396     struct tid_rdma_request *req = NULL;
0397     /* header size in 32-bit words LRH+BTH = (8+12)/4. */
0398     u32 hwords = 5;
0399     u32 len = 0;
0400     u32 bth0 = 0, bth2 = 0;
0401     u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
0402     u32 pmtu = qp->pmtu;
0403     char newreq;
0404     int middle = 0;
0405     int delta;
0406     struct tid_rdma_flow *flow = NULL;
0407     struct tid_rdma_params *remote;
0408 
0409     trace_hfi1_sender_make_rc_req(qp);
0410     lockdep_assert_held(&qp->s_lock);
0411     ps->s_txreq = get_txreq(ps->dev, qp);
0412     if (!ps->s_txreq)
0413         goto bail_no_tx;
0414 
0415     if (priv->hdr_type == HFI1_PKT_TYPE_9B) {
0416         /* header size in 32-bit words LRH+BTH = (8+12)/4. */
0417         hwords = 5;
0418         if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)
0419             ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth;
0420         else
0421             ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
0422     } else {
0423         /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */
0424         hwords = 7;
0425         if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) &&
0426             (hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))))
0427             ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth;
0428         else
0429             ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth;
0430     }
0431 
0432     /* Sending responses has higher priority over sending requests. */
0433     if ((qp->s_flags & RVT_S_RESP_PENDING) &&
0434         make_rc_ack(dev, qp, ohdr, ps))
0435         return 1;
0436 
0437     if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
0438         if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
0439             goto bail;
0440         /* We are in the error state, flush the work request. */
0441         if (qp->s_last == READ_ONCE(qp->s_head))
0442             goto bail;
0443         /* If DMAs are in progress, we can't flush immediately. */
0444         if (iowait_sdma_pending(&priv->s_iowait)) {
0445             qp->s_flags |= RVT_S_WAIT_DMA;
0446             goto bail;
0447         }
0448         clear_ahg(qp);
0449         wqe = rvt_get_swqe_ptr(qp, qp->s_last);
0450         hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
0451                      IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
0452         /* will get called again */
0453         goto done_free_tx;
0454     }
0455 
0456     if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT))
0457         goto bail;
0458 
0459     if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
0460         if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
0461             qp->s_flags |= RVT_S_WAIT_PSN;
0462             goto bail;
0463         }
0464         qp->s_sending_psn = qp->s_psn;
0465         qp->s_sending_hpsn = qp->s_psn - 1;
0466     }
0467 
0468     /* Send a request. */
0469     wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
0470 check_s_state:
0471     switch (qp->s_state) {
0472     default:
0473         if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
0474             goto bail;
0475         /*
0476          * Resend an old request or start a new one.
0477          *
0478          * We keep track of the current SWQE so that
0479          * we don't reset the "furthest progress" state
0480          * if we need to back up.
0481          */
0482         newreq = 0;
0483         if (qp->s_cur == qp->s_tail) {
0484             /* Check if send work queue is empty. */
0485             if (qp->s_tail == READ_ONCE(qp->s_head)) {
0486                 clear_ahg(qp);
0487                 goto bail;
0488             }
0489             /*
0490              * If a fence is requested, wait for previous
0491              * RDMA read and atomic operations to finish.
0492              * However, there is no need to guard against
0493              * TID RDMA READ after TID RDMA READ.
0494              */
0495             if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
0496                 qp->s_num_rd_atomic &&
0497                 (wqe->wr.opcode != IB_WR_TID_RDMA_READ ||
0498                  priv->pending_tid_r_segs < qp->s_num_rd_atomic)) {
0499                 qp->s_flags |= RVT_S_WAIT_FENCE;
0500                 goto bail;
0501             }
0502             /*
0503              * Local operations are processed immediately
0504              * after all prior requests have completed
0505              */
0506             if (wqe->wr.opcode == IB_WR_REG_MR ||
0507                 wqe->wr.opcode == IB_WR_LOCAL_INV) {
0508                 int local_ops = 0;
0509                 int err = 0;
0510 
0511                 if (qp->s_last != qp->s_cur)
0512                     goto bail;
0513                 if (++qp->s_cur == qp->s_size)
0514                     qp->s_cur = 0;
0515                 if (++qp->s_tail == qp->s_size)
0516                     qp->s_tail = 0;
0517                 if (!(wqe->wr.send_flags &
0518                       RVT_SEND_COMPLETION_ONLY)) {
0519                     err = rvt_invalidate_rkey(
0520                         qp,
0521                         wqe->wr.ex.invalidate_rkey);
0522                     local_ops = 1;
0523                 }
0524                 rvt_send_complete(qp, wqe,
0525                           err ? IB_WC_LOC_PROT_ERR
0526                               : IB_WC_SUCCESS);
0527                 if (local_ops)
0528                     atomic_dec(&qp->local_ops_pending);
0529                 goto done_free_tx;
0530             }
0531 
0532             newreq = 1;
0533             qp->s_psn = wqe->psn;
0534         }
0535         /*
0536          * Note that we have to be careful not to modify the
0537          * original work request since we may need to resend
0538          * it.
0539          */
0540         len = wqe->length;
0541         ss = &qp->s_sge;
0542         bth2 = mask_psn(qp->s_psn);
0543 
0544         /*
0545          * Interlock between various IB requests and TID RDMA
0546          * if necessary.
0547          */
0548         if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) ||
0549             hfi1_tid_rdma_wqe_interlock(qp, wqe))
0550             goto bail;
0551 
0552         switch (wqe->wr.opcode) {
0553         case IB_WR_SEND:
0554         case IB_WR_SEND_WITH_IMM:
0555         case IB_WR_SEND_WITH_INV:
0556             /* If no credit, return. */
0557             if (!rvt_rc_credit_avail(qp, wqe))
0558                 goto bail;
0559             if (len > pmtu) {
0560                 qp->s_state = OP(SEND_FIRST);
0561                 len = pmtu;
0562                 break;
0563             }
0564             if (wqe->wr.opcode == IB_WR_SEND) {
0565                 qp->s_state = OP(SEND_ONLY);
0566             } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
0567                 qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
0568                 /* Immediate data comes after the BTH */
0569                 ohdr->u.imm_data = wqe->wr.ex.imm_data;
0570                 hwords += 1;
0571             } else {
0572                 qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE);
0573                 /* Invalidate rkey comes after the BTH */
0574                 ohdr->u.ieth = cpu_to_be32(
0575                         wqe->wr.ex.invalidate_rkey);
0576                 hwords += 1;
0577             }
0578             if (wqe->wr.send_flags & IB_SEND_SOLICITED)
0579                 bth0 |= IB_BTH_SOLICITED;
0580             bth2 |= IB_BTH_REQ_ACK;
0581             if (++qp->s_cur == qp->s_size)
0582                 qp->s_cur = 0;
0583             break;
0584 
0585         case IB_WR_RDMA_WRITE:
0586             if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
0587                 qp->s_lsn++;
0588             goto no_flow_control;
0589         case IB_WR_RDMA_WRITE_WITH_IMM:
0590             /* If no credit, return. */
0591             if (!rvt_rc_credit_avail(qp, wqe))
0592                 goto bail;
0593 no_flow_control:
0594             put_ib_reth_vaddr(
0595                 wqe->rdma_wr.remote_addr,
0596                 &ohdr->u.rc.reth);
0597             ohdr->u.rc.reth.rkey =
0598                 cpu_to_be32(wqe->rdma_wr.rkey);
0599             ohdr->u.rc.reth.length = cpu_to_be32(len);
0600             hwords += sizeof(struct ib_reth) / sizeof(u32);
0601             if (len > pmtu) {
0602                 qp->s_state = OP(RDMA_WRITE_FIRST);
0603                 len = pmtu;
0604                 break;
0605             }
0606             if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
0607                 qp->s_state = OP(RDMA_WRITE_ONLY);
0608             } else {
0609                 qp->s_state =
0610                     OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
0611                 /* Immediate data comes after RETH */
0612                 ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
0613                 hwords += 1;
0614                 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
0615                     bth0 |= IB_BTH_SOLICITED;
0616             }
0617             bth2 |= IB_BTH_REQ_ACK;
0618             if (++qp->s_cur == qp->s_size)
0619                 qp->s_cur = 0;
0620             break;
0621 
0622         case IB_WR_TID_RDMA_WRITE:
0623             if (newreq) {
0624                 /*
0625                  * Limit the number of TID RDMA WRITE requests.
0626                  */
0627                 if (atomic_read(&priv->n_tid_requests) >=
0628                     HFI1_TID_RDMA_WRITE_CNT)
0629                     goto bail;
0630 
0631                 if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
0632                     qp->s_lsn++;
0633             }
0634 
0635             hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr,
0636                                 &bth1, &bth2,
0637                                 &len);
0638             ss = NULL;
0639             if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) {
0640                 priv->s_tid_cur = qp->s_cur;
0641                 if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) {
0642                     priv->s_tid_tail = qp->s_cur;
0643                     priv->s_state = TID_OP(WRITE_RESP);
0644                 }
0645             } else if (priv->s_tid_cur == priv->s_tid_head) {
0646                 struct rvt_swqe *__w;
0647                 struct tid_rdma_request *__r;
0648 
0649                 __w = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
0650                 __r = wqe_to_tid_req(__w);
0651 
0652                 /*
0653                  * The s_tid_cur pointer is advanced to s_cur if
0654                  * any of the following conditions about the WQE
0655                  * to which s_ti_cur currently points to are
0656                  * satisfied:
0657                  *   1. The request is not a TID RDMA WRITE
0658                  *      request,
0659                  *   2. The request is in the INACTIVE or
0660                  *      COMPLETE states (TID RDMA READ requests
0661                  *      stay at INACTIVE and TID RDMA WRITE
0662                  *      transition to COMPLETE when done),
0663                  *   3. The request is in the ACTIVE or SYNC
0664                  *      state and the number of completed
0665                  *      segments is equal to the total segment
0666                  *      count.
0667                  *      (If ACTIVE, the request is waiting for
0668                  *       ACKs. If SYNC, the request has not
0669                  *       received any responses because it's
0670                  *       waiting on a sync point.)
0671                  */
0672                 if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE ||
0673                     __r->state == TID_REQUEST_INACTIVE ||
0674                     __r->state == TID_REQUEST_COMPLETE ||
0675                     ((__r->state == TID_REQUEST_ACTIVE ||
0676                       __r->state == TID_REQUEST_SYNC) &&
0677                      __r->comp_seg == __r->total_segs)) {
0678                     if (priv->s_tid_tail ==
0679                         priv->s_tid_cur &&
0680                         priv->s_state ==
0681                         TID_OP(WRITE_DATA_LAST)) {
0682                         priv->s_tid_tail = qp->s_cur;
0683                         priv->s_state =
0684                             TID_OP(WRITE_RESP);
0685                     }
0686                     priv->s_tid_cur = qp->s_cur;
0687                 }
0688                 /*
0689                  * A corner case: when the last TID RDMA WRITE
0690                  * request was completed, s_tid_head,
0691                  * s_tid_cur, and s_tid_tail all point to the
0692                  * same location. Other requests are posted and
0693                  * s_cur wraps around to the same location,
0694                  * where a new TID RDMA WRITE is posted. In
0695                  * this case, none of the indices need to be
0696                  * updated. However, the priv->s_state should.
0697                  */
0698                 if (priv->s_tid_tail == qp->s_cur &&
0699                     priv->s_state == TID_OP(WRITE_DATA_LAST))
0700                     priv->s_state = TID_OP(WRITE_RESP);
0701             }
0702             req = wqe_to_tid_req(wqe);
0703             if (newreq) {
0704                 priv->s_tid_head = qp->s_cur;
0705                 priv->pending_tid_w_resp += req->total_segs;
0706                 atomic_inc(&priv->n_tid_requests);
0707                 atomic_dec(&priv->n_requests);
0708             } else {
0709                 req->state = TID_REQUEST_RESEND;
0710                 req->comp_seg = delta_psn(bth2, wqe->psn);
0711                 /*
0712                  * Pull back any segments since we are going
0713                  * to re-receive them.
0714                  */
0715                 req->setup_head = req->clear_tail;
0716                 priv->pending_tid_w_resp +=
0717                     delta_psn(wqe->lpsn, bth2) + 1;
0718             }
0719 
0720             trace_hfi1_tid_write_sender_make_req(qp, newreq);
0721             trace_hfi1_tid_req_make_req_write(qp, newreq,
0722                               wqe->wr.opcode,
0723                               wqe->psn, wqe->lpsn,
0724                               req);
0725             if (++qp->s_cur == qp->s_size)
0726                 qp->s_cur = 0;
0727             break;
0728 
0729         case IB_WR_RDMA_READ:
0730             /*
0731              * Don't allow more operations to be started
0732              * than the QP limits allow.
0733              */
0734             if (qp->s_num_rd_atomic >=
0735                 qp->s_max_rd_atomic) {
0736                 qp->s_flags |= RVT_S_WAIT_RDMAR;
0737                 goto bail;
0738             }
0739             qp->s_num_rd_atomic++;
0740             if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
0741                 qp->s_lsn++;
0742             put_ib_reth_vaddr(
0743                 wqe->rdma_wr.remote_addr,
0744                 &ohdr->u.rc.reth);
0745             ohdr->u.rc.reth.rkey =
0746                 cpu_to_be32(wqe->rdma_wr.rkey);
0747             ohdr->u.rc.reth.length = cpu_to_be32(len);
0748             qp->s_state = OP(RDMA_READ_REQUEST);
0749             hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
0750             ss = NULL;
0751             len = 0;
0752             bth2 |= IB_BTH_REQ_ACK;
0753             if (++qp->s_cur == qp->s_size)
0754                 qp->s_cur = 0;
0755             break;
0756 
0757         case IB_WR_TID_RDMA_READ:
0758             trace_hfi1_tid_read_sender_make_req(qp, newreq);
0759             wpriv = wqe->priv;
0760             req = wqe_to_tid_req(wqe);
0761             trace_hfi1_tid_req_make_req_read(qp, newreq,
0762                              wqe->wr.opcode,
0763                              wqe->psn, wqe->lpsn,
0764                              req);
0765             delta = cmp_psn(qp->s_psn, wqe->psn);
0766 
0767             /*
0768              * Don't allow more operations to be started
0769              * than the QP limits allow. We could get here under
0770              * three conditions; (1) It's a new request; (2) We are
0771              * sending the second or later segment of a request,
0772              * but the qp->s_state is set to OP(RDMA_READ_REQUEST)
0773              * when the last segment of a previous request is
0774              * received just before this; (3) We are re-sending a
0775              * request.
0776              */
0777             if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
0778                 qp->s_flags |= RVT_S_WAIT_RDMAR;
0779                 goto bail;
0780             }
0781             if (newreq) {
0782                 struct tid_rdma_flow *flow =
0783                     &req->flows[req->setup_head];
0784 
0785                 /*
0786                  * Set up s_sge as it is needed for TID
0787                  * allocation. However, if the pages have been
0788                  * walked and mapped, skip it. An earlier try
0789                  * has failed to allocate the TID entries.
0790                  */
0791                 if (!flow->npagesets) {
0792                     qp->s_sge.sge = wqe->sg_list[0];
0793                     qp->s_sge.sg_list = wqe->sg_list + 1;
0794                     qp->s_sge.num_sge = wqe->wr.num_sge;
0795                     qp->s_sge.total_len = wqe->length;
0796                     qp->s_len = wqe->length;
0797                     req->isge = 0;
0798                     req->clear_tail = req->setup_head;
0799                     req->flow_idx = req->setup_head;
0800                     req->state = TID_REQUEST_ACTIVE;
0801                 }
0802             } else if (delta == 0) {
0803                 /* Re-send a request */
0804                 req->cur_seg = 0;
0805                 req->comp_seg = 0;
0806                 req->ack_pending = 0;
0807                 req->flow_idx = req->clear_tail;
0808                 req->state = TID_REQUEST_RESEND;
0809             }
0810             req->s_next_psn = qp->s_psn;
0811             /* Read one segment at a time */
0812             len = min_t(u32, req->seg_len,
0813                     wqe->length - req->seg_len * req->cur_seg);
0814             delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr,
0815                                  &bth1, &bth2,
0816                                  &len);
0817             if (delta <= 0) {
0818                 /* Wait for TID space */
0819                 goto bail;
0820             }
0821             if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
0822                 qp->s_lsn++;
0823             hwords += delta;
0824             ss = &wpriv->ss;
0825             /* Check if this is the last segment */
0826             if (req->cur_seg >= req->total_segs &&
0827                 ++qp->s_cur == qp->s_size)
0828                 qp->s_cur = 0;
0829             break;
0830 
0831         case IB_WR_ATOMIC_CMP_AND_SWP:
0832         case IB_WR_ATOMIC_FETCH_AND_ADD:
0833             /*
0834              * Don't allow more operations to be started
0835              * than the QP limits allow.
0836              */
0837             if (qp->s_num_rd_atomic >=
0838                 qp->s_max_rd_atomic) {
0839                 qp->s_flags |= RVT_S_WAIT_RDMAR;
0840                 goto bail;
0841             }
0842             qp->s_num_rd_atomic++;
0843             fallthrough;
0844         case IB_WR_OPFN:
0845             if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
0846                 qp->s_lsn++;
0847             if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
0848                 wqe->wr.opcode == IB_WR_OPFN) {
0849                 qp->s_state = OP(COMPARE_SWAP);
0850                 put_ib_ateth_swap(wqe->atomic_wr.swap,
0851                           &ohdr->u.atomic_eth);
0852                 put_ib_ateth_compare(wqe->atomic_wr.compare_add,
0853                              &ohdr->u.atomic_eth);
0854             } else {
0855                 qp->s_state = OP(FETCH_ADD);
0856                 put_ib_ateth_swap(wqe->atomic_wr.compare_add,
0857                           &ohdr->u.atomic_eth);
0858                 put_ib_ateth_compare(0, &ohdr->u.atomic_eth);
0859             }
0860             put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr,
0861                        &ohdr->u.atomic_eth);
0862             ohdr->u.atomic_eth.rkey = cpu_to_be32(
0863                 wqe->atomic_wr.rkey);
0864             hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
0865             ss = NULL;
0866             len = 0;
0867             bth2 |= IB_BTH_REQ_ACK;
0868             if (++qp->s_cur == qp->s_size)
0869                 qp->s_cur = 0;
0870             break;
0871 
0872         default:
0873             goto bail;
0874         }
0875         if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) {
0876             qp->s_sge.sge = wqe->sg_list[0];
0877             qp->s_sge.sg_list = wqe->sg_list + 1;
0878             qp->s_sge.num_sge = wqe->wr.num_sge;
0879             qp->s_sge.total_len = wqe->length;
0880             qp->s_len = wqe->length;
0881         }
0882         if (newreq) {
0883             qp->s_tail++;
0884             if (qp->s_tail >= qp->s_size)
0885                 qp->s_tail = 0;
0886         }
0887         if (wqe->wr.opcode == IB_WR_RDMA_READ ||
0888             wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
0889             qp->s_psn = wqe->lpsn + 1;
0890         else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
0891             qp->s_psn = req->s_next_psn;
0892         else
0893             qp->s_psn++;
0894         break;
0895 
0896     case OP(RDMA_READ_RESPONSE_FIRST):
0897         /*
0898          * qp->s_state is normally set to the opcode of the
0899          * last packet constructed for new requests and therefore
0900          * is never set to RDMA read response.
0901          * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
0902          * thread to indicate a SEND needs to be restarted from an
0903          * earlier PSN without interfering with the sending thread.
0904          * See restart_rc().
0905          */
0906         qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
0907         fallthrough;
0908     case OP(SEND_FIRST):
0909         qp->s_state = OP(SEND_MIDDLE);
0910         fallthrough;
0911     case OP(SEND_MIDDLE):
0912         bth2 = mask_psn(qp->s_psn++);
0913         ss = &qp->s_sge;
0914         len = qp->s_len;
0915         if (len > pmtu) {
0916             len = pmtu;
0917             middle = HFI1_CAP_IS_KSET(SDMA_AHG);
0918             break;
0919         }
0920         if (wqe->wr.opcode == IB_WR_SEND) {
0921             qp->s_state = OP(SEND_LAST);
0922         } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
0923             qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
0924             /* Immediate data comes after the BTH */
0925             ohdr->u.imm_data = wqe->wr.ex.imm_data;
0926             hwords += 1;
0927         } else {
0928             qp->s_state = OP(SEND_LAST_WITH_INVALIDATE);
0929             /* invalidate data comes after the BTH */
0930             ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey);
0931             hwords += 1;
0932         }
0933         if (wqe->wr.send_flags & IB_SEND_SOLICITED)
0934             bth0 |= IB_BTH_SOLICITED;
0935         bth2 |= IB_BTH_REQ_ACK;
0936         qp->s_cur++;
0937         if (qp->s_cur >= qp->s_size)
0938             qp->s_cur = 0;
0939         break;
0940 
0941     case OP(RDMA_READ_RESPONSE_LAST):
0942         /*
0943          * qp->s_state is normally set to the opcode of the
0944          * last packet constructed for new requests and therefore
0945          * is never set to RDMA read response.
0946          * RDMA_READ_RESPONSE_LAST is used by the ACK processing
0947          * thread to indicate a RDMA write needs to be restarted from
0948          * an earlier PSN without interfering with the sending thread.
0949          * See restart_rc().
0950          */
0951         qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
0952         fallthrough;
0953     case OP(RDMA_WRITE_FIRST):
0954         qp->s_state = OP(RDMA_WRITE_MIDDLE);
0955         fallthrough;
0956     case OP(RDMA_WRITE_MIDDLE):
0957         bth2 = mask_psn(qp->s_psn++);
0958         ss = &qp->s_sge;
0959         len = qp->s_len;
0960         if (len > pmtu) {
0961             len = pmtu;
0962             middle = HFI1_CAP_IS_KSET(SDMA_AHG);
0963             break;
0964         }
0965         if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
0966             qp->s_state = OP(RDMA_WRITE_LAST);
0967         } else {
0968             qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
0969             /* Immediate data comes after the BTH */
0970             ohdr->u.imm_data = wqe->wr.ex.imm_data;
0971             hwords += 1;
0972             if (wqe->wr.send_flags & IB_SEND_SOLICITED)
0973                 bth0 |= IB_BTH_SOLICITED;
0974         }
0975         bth2 |= IB_BTH_REQ_ACK;
0976         qp->s_cur++;
0977         if (qp->s_cur >= qp->s_size)
0978             qp->s_cur = 0;
0979         break;
0980 
0981     case OP(RDMA_READ_RESPONSE_MIDDLE):
0982         /*
0983          * qp->s_state is normally set to the opcode of the
0984          * last packet constructed for new requests and therefore
0985          * is never set to RDMA read response.
0986          * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
0987          * thread to indicate a RDMA read needs to be restarted from
0988          * an earlier PSN without interfering with the sending thread.
0989          * See restart_rc().
0990          */
0991         len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
0992         put_ib_reth_vaddr(
0993             wqe->rdma_wr.remote_addr + len,
0994             &ohdr->u.rc.reth);
0995         ohdr->u.rc.reth.rkey =
0996             cpu_to_be32(wqe->rdma_wr.rkey);
0997         ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
0998         qp->s_state = OP(RDMA_READ_REQUEST);
0999         hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
1000         bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
1001         qp->s_psn = wqe->lpsn + 1;
1002         ss = NULL;
1003         len = 0;
1004         qp->s_cur++;
1005         if (qp->s_cur == qp->s_size)
1006             qp->s_cur = 0;
1007         break;
1008 
1009     case TID_OP(WRITE_RESP):
1010         /*
1011          * This value for s_state is used for restarting a TID RDMA
1012          * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE
1013          * for more).
1014          */
1015         req = wqe_to_tid_req(wqe);
1016         req->state = TID_REQUEST_RESEND;
1017         rcu_read_lock();
1018         remote = rcu_dereference(priv->tid_rdma.remote);
1019         req->comp_seg = delta_psn(qp->s_psn, wqe->psn);
1020         len = wqe->length - (req->comp_seg * remote->max_len);
1021         rcu_read_unlock();
1022 
1023         bth2 = mask_psn(qp->s_psn);
1024         hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1,
1025                             &bth2, &len);
1026         qp->s_psn = wqe->lpsn + 1;
1027         ss = NULL;
1028         qp->s_state = TID_OP(WRITE_REQ);
1029         priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1;
1030         priv->s_tid_cur = qp->s_cur;
1031         if (++qp->s_cur == qp->s_size)
1032             qp->s_cur = 0;
1033         trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode,
1034                           wqe->psn, wqe->lpsn, req);
1035         break;
1036 
1037     case TID_OP(READ_RESP):
1038         if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
1039             goto bail;
1040         /* This is used to restart a TID read request */
1041         req = wqe_to_tid_req(wqe);
1042         wpriv = wqe->priv;
1043         /*
1044          * Back down. The field qp->s_psn has been set to the psn with
1045          * which the request should be restart. It's OK to use division
1046          * as this is on the retry path.
1047          */
1048         req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps;
1049 
1050         /*
1051          * The following function need to be redefined to return the
1052          * status to make sure that we find the flow. At the same
1053          * time, we can use the req->state change to check if the
1054          * call succeeds or not.
1055          */
1056         req->state = TID_REQUEST_RESEND;
1057         hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
1058         if (req->state != TID_REQUEST_ACTIVE) {
1059             /*
1060              * Failed to find the flow. Release all allocated tid
1061              * resources.
1062              */
1063             hfi1_kern_exp_rcv_clear_all(req);
1064             hfi1_kern_clear_hw_flow(priv->rcd, qp);
1065 
1066             hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR);
1067             goto bail;
1068         }
1069         req->state = TID_REQUEST_RESEND;
1070         len = min_t(u32, req->seg_len,
1071                 wqe->length - req->seg_len * req->cur_seg);
1072         flow = &req->flows[req->flow_idx];
1073         len -= flow->sent;
1074         req->s_next_psn = flow->flow_state.ib_lpsn + 1;
1075         delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1,
1076                             &bth2, &len);
1077         if (delta <= 0) {
1078             /* Wait for TID space */
1079             goto bail;
1080         }
1081         hwords += delta;
1082         ss = &wpriv->ss;
1083         /* Check if this is the last segment */
1084         if (req->cur_seg >= req->total_segs &&
1085             ++qp->s_cur == qp->s_size)
1086             qp->s_cur = 0;
1087         qp->s_psn = req->s_next_psn;
1088         trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
1089                          wqe->psn, wqe->lpsn, req);
1090         break;
1091     case TID_OP(READ_REQ):
1092         req = wqe_to_tid_req(wqe);
1093         delta = cmp_psn(qp->s_psn, wqe->psn);
1094         /*
1095          * If the current WR is not TID RDMA READ, or this is the start
1096          * of a new request, we need to change the qp->s_state so that
1097          * the request can be set up properly.
1098          */
1099         if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 ||
1100             qp->s_cur == qp->s_tail) {
1101             qp->s_state = OP(RDMA_READ_REQUEST);
1102             if (delta == 0 || qp->s_cur == qp->s_tail)
1103                 goto check_s_state;
1104             else
1105                 goto bail;
1106         }
1107 
1108         /* Rate limiting */
1109         if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
1110             qp->s_flags |= RVT_S_WAIT_RDMAR;
1111             goto bail;
1112         }
1113 
1114         wpriv = wqe->priv;
1115         /* Read one segment at a time */
1116         len = min_t(u32, req->seg_len,
1117                 wqe->length - req->seg_len * req->cur_seg);
1118         delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1,
1119                              &bth2, &len);
1120         if (delta <= 0) {
1121             /* Wait for TID space */
1122             goto bail;
1123         }
1124         hwords += delta;
1125         ss = &wpriv->ss;
1126         /* Check if this is the last segment */
1127         if (req->cur_seg >= req->total_segs &&
1128             ++qp->s_cur == qp->s_size)
1129             qp->s_cur = 0;
1130         qp->s_psn = req->s_next_psn;
1131         trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
1132                          wqe->psn, wqe->lpsn, req);
1133         break;
1134     }
1135     qp->s_sending_hpsn = bth2;
1136     delta = delta_psn(bth2, wqe->psn);
1137     if (delta && delta % HFI1_PSN_CREDIT == 0 &&
1138         wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
1139         bth2 |= IB_BTH_REQ_ACK;
1140     if (qp->s_flags & RVT_S_SEND_ONE) {
1141         qp->s_flags &= ~RVT_S_SEND_ONE;
1142         qp->s_flags |= RVT_S_WAIT_ACK;
1143         bth2 |= IB_BTH_REQ_ACK;
1144     }
1145     qp->s_len -= len;
1146     ps->s_txreq->hdr_dwords = hwords;
1147     ps->s_txreq->sde = priv->s_sde;
1148     ps->s_txreq->ss = ss;
1149     ps->s_txreq->s_cur_size = len;
1150     hfi1_make_ruc_header(
1151         qp,
1152         ohdr,
1153         bth0 | (qp->s_state << 24),
1154         bth1,
1155         bth2,
1156         middle,
1157         ps);
1158     return 1;
1159 
1160 done_free_tx:
1161     hfi1_put_txreq(ps->s_txreq);
1162     ps->s_txreq = NULL;
1163     return 1;
1164 
1165 bail:
1166     hfi1_put_txreq(ps->s_txreq);
1167 
1168 bail_no_tx:
1169     ps->s_txreq = NULL;
1170     qp->s_flags &= ~RVT_S_BUSY;
1171     /*
1172      * If we didn't get a txreq, the QP will be woken up later to try
1173      * again. Set the flags to indicate which work item to wake
1174      * up.
1175      */
1176     iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
1177     return 0;
1178 }
1179 
1180 static inline void hfi1_make_bth_aeth(struct rvt_qp *qp,
1181                       struct ib_other_headers *ohdr,
1182                       u32 bth0, u32 bth1)
1183 {
1184     if (qp->r_nak_state)
1185         ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
1186                         (qp->r_nak_state <<
1187                          IB_AETH_CREDIT_SHIFT));
1188     else
1189         ohdr->u.aeth = rvt_compute_aeth(qp);
1190 
1191     ohdr->bth[0] = cpu_to_be32(bth0);
1192     ohdr->bth[1] = cpu_to_be32(bth1 | qp->remote_qpn);
1193     ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));
1194 }
1195 
1196 static inline void hfi1_queue_rc_ack(struct hfi1_packet *packet, bool is_fecn)
1197 {
1198     struct rvt_qp *qp = packet->qp;
1199     struct hfi1_ibport *ibp;
1200     unsigned long flags;
1201 
1202     spin_lock_irqsave(&qp->s_lock, flags);
1203     if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
1204         goto unlock;
1205     ibp = rcd_to_iport(packet->rcd);
1206     this_cpu_inc(*ibp->rvp.rc_qacks);
1207     qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING;
1208     qp->s_nak_state = qp->r_nak_state;
1209     qp->s_ack_psn = qp->r_ack_psn;
1210     if (is_fecn)
1211         qp->s_flags |= RVT_S_ECN;
1212 
1213     /* Schedule the send tasklet. */
1214     hfi1_schedule_send(qp);
1215 unlock:
1216     spin_unlock_irqrestore(&qp->s_lock, flags);
1217 }
1218 
1219 static inline void hfi1_make_rc_ack_9B(struct hfi1_packet *packet,
1220                        struct hfi1_opa_header *opa_hdr,
1221                        u8 sc5, bool is_fecn,
1222                        u64 *pbc_flags, u32 *hwords,
1223                        u32 *nwords)
1224 {
1225     struct rvt_qp *qp = packet->qp;
1226     struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
1227     struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1228     struct ib_header *hdr = &opa_hdr->ibh;
1229     struct ib_other_headers *ohdr;
1230     u16 lrh0 = HFI1_LRH_BTH;
1231     u16 pkey;
1232     u32 bth0, bth1;
1233 
1234     opa_hdr->hdr_type = HFI1_PKT_TYPE_9B;
1235     ohdr = &hdr->u.oth;
1236     /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
1237     *hwords = 6;
1238 
1239     if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) {
1240         *hwords += hfi1_make_grh(ibp, &hdr->u.l.grh,
1241                      rdma_ah_read_grh(&qp->remote_ah_attr),
1242                      *hwords - 2, SIZE_OF_CRC);
1243         ohdr = &hdr->u.l.oth;
1244         lrh0 = HFI1_LRH_GRH;
1245     }
1246     /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
1247     *pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);
1248 
1249     /* read pkey_index w/o lock (its atomic) */
1250     pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
1251 
1252     lrh0 |= (sc5 & IB_SC_MASK) << IB_SC_SHIFT |
1253         (rdma_ah_get_sl(&qp->remote_ah_attr) & IB_SL_MASK) <<
1254             IB_SL_SHIFT;
1255 
1256     hfi1_make_ib_hdr(hdr, lrh0, *hwords + SIZE_OF_CRC,
1257              opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 9B),
1258              ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr));
1259 
1260     bth0 = pkey | (OP(ACKNOWLEDGE) << 24);
1261     if (qp->s_mig_state == IB_MIG_MIGRATED)
1262         bth0 |= IB_BTH_MIG_REQ;
1263     bth1 = (!!is_fecn) << IB_BECN_SHIFT;
1264     /*
1265      * Inline ACKs go out without the use of the Verbs send engine, so
1266      * we need to set the STL Verbs Extended bit here
1267      */
1268     bth1 |= HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT;
1269     hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
1270 }
1271 
1272 static inline void hfi1_make_rc_ack_16B(struct hfi1_packet *packet,
1273                     struct hfi1_opa_header *opa_hdr,
1274                     u8 sc5, bool is_fecn,
1275                     u64 *pbc_flags, u32 *hwords,
1276                     u32 *nwords)
1277 {
1278     struct rvt_qp *qp = packet->qp;
1279     struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
1280     struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1281     struct hfi1_16b_header *hdr = &opa_hdr->opah;
1282     struct ib_other_headers *ohdr;
1283     u32 bth0, bth1 = 0;
1284     u16 len, pkey;
1285     bool becn = is_fecn;
1286     u8 l4 = OPA_16B_L4_IB_LOCAL;
1287     u8 extra_bytes;
1288 
1289     opa_hdr->hdr_type = HFI1_PKT_TYPE_16B;
1290     ohdr = &hdr->u.oth;
1291     /* header size in 32-bit words 16B LRH+BTH+AETH = (16+12+4)/4 */
1292     *hwords = 8;
1293     extra_bytes = hfi1_get_16b_padding(*hwords << 2, 0);
1294     *nwords = SIZE_OF_CRC + ((extra_bytes + SIZE_OF_LT) >> 2);
1295 
1296     if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) &&
1297         hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))) {
1298         *hwords += hfi1_make_grh(ibp, &hdr->u.l.grh,
1299                      rdma_ah_read_grh(&qp->remote_ah_attr),
1300                      *hwords - 4, *nwords);
1301         ohdr = &hdr->u.l.oth;
1302         l4 = OPA_16B_L4_IB_GLOBAL;
1303     }
1304     *pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC;
1305 
1306     /* read pkey_index w/o lock (its atomic) */
1307     pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
1308 
1309     /* Convert dwords to flits */
1310     len = (*hwords + *nwords) >> 1;
1311 
1312     hfi1_make_16b_hdr(hdr, ppd->lid |
1313               (rdma_ah_get_path_bits(&qp->remote_ah_attr) &
1314               ((1 << ppd->lmc) - 1)),
1315               opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr),
1316                       16B), len, pkey, becn, 0, l4, sc5);
1317 
1318     bth0 = pkey | (OP(ACKNOWLEDGE) << 24);
1319     bth0 |= extra_bytes << 20;
1320     if (qp->s_mig_state == IB_MIG_MIGRATED)
1321         bth1 = OPA_BTH_MIG_REQ;
1322     hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
1323 }
1324 
1325 typedef void (*hfi1_make_rc_ack)(struct hfi1_packet *packet,
1326                  struct hfi1_opa_header *opa_hdr,
1327                  u8 sc5, bool is_fecn,
1328                  u64 *pbc_flags, u32 *hwords,
1329                  u32 *nwords);
1330 
1331 /* We support only two types - 9B and 16B for now */
1332 static const hfi1_make_rc_ack hfi1_make_rc_ack_tbl[2] = {
1333     [HFI1_PKT_TYPE_9B] = &hfi1_make_rc_ack_9B,
1334     [HFI1_PKT_TYPE_16B] = &hfi1_make_rc_ack_16B
1335 };
1336 
1337 /*
1338  * hfi1_send_rc_ack - Construct an ACK packet and send it
1339  *
1340  * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
1341  * Note that RDMA reads and atomics are handled in the
1342  * send side QP state and send engine.
1343  */
1344 void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn)
1345 {
1346     struct hfi1_ctxtdata *rcd = packet->rcd;
1347     struct rvt_qp *qp = packet->qp;
1348     struct hfi1_ibport *ibp = rcd_to_iport(rcd);
1349     struct hfi1_qp_priv *priv = qp->priv;
1350     struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1351     u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)];
1352     u64 pbc, pbc_flags = 0;
1353     u32 hwords = 0;
1354     u32 nwords = 0;
1355     u32 plen;
1356     struct pio_buf *pbuf;
1357     struct hfi1_opa_header opa_hdr;
1358 
1359     /* clear the defer count */
1360     qp->r_adefered = 0;
1361 
1362     /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
1363     if (qp->s_flags & RVT_S_RESP_PENDING) {
1364         hfi1_queue_rc_ack(packet, is_fecn);
1365         return;
1366     }
1367 
1368     /* Ensure s_rdma_ack_cnt changes are committed */
1369     if (qp->s_rdma_ack_cnt) {
1370         hfi1_queue_rc_ack(packet, is_fecn);
1371         return;
1372     }
1373 
1374     /* Don't try to send ACKs if the link isn't ACTIVE */
1375     if (driver_lstate(ppd) != IB_PORT_ACTIVE)
1376         return;
1377 
1378     /* Make the appropriate header */
1379     hfi1_make_rc_ack_tbl[priv->hdr_type](packet, &opa_hdr, sc5, is_fecn,
1380                          &pbc_flags, &hwords, &nwords);
1381 
1382     plen = 2 /* PBC */ + hwords + nwords;
1383     pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps,
1384              sc_to_vlt(ppd->dd, sc5), plen);
1385     pbuf = sc_buffer_alloc(rcd->sc, plen, NULL, NULL);
1386     if (IS_ERR_OR_NULL(pbuf)) {
1387         /*
1388          * We have no room to send at the moment.  Pass
1389          * responsibility for sending the ACK to the send engine
1390          * so that when enough buffer space becomes available,
1391          * the ACK is sent ahead of other outgoing packets.
1392          */
1393         hfi1_queue_rc_ack(packet, is_fecn);
1394         return;
1395     }
1396     trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
1397                    &opa_hdr, ib_is_sc5(sc5));
1398 
1399     /* write the pbc and data */
1400     ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
1401                  (priv->hdr_type == HFI1_PKT_TYPE_9B ?
1402                  (void *)&opa_hdr.ibh :
1403                  (void *)&opa_hdr.opah), hwords);
1404     return;
1405 }
1406 
1407 /**
1408  * update_num_rd_atomic - update the qp->s_num_rd_atomic
1409  * @qp: the QP
1410  * @psn: the packet sequence number to restart at
1411  * @wqe: the wqe
1412  *
1413  * This is called from reset_psn() to update qp->s_num_rd_atomic
1414  * for the current wqe.
1415  * Called at interrupt level with the QP s_lock held.
1416  */
1417 static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn,
1418                  struct rvt_swqe *wqe)
1419 {
1420     u32 opcode = wqe->wr.opcode;
1421 
1422     if (opcode == IB_WR_RDMA_READ ||
1423         opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1424         opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
1425         qp->s_num_rd_atomic++;
1426     } else if (opcode == IB_WR_TID_RDMA_READ) {
1427         struct tid_rdma_request *req = wqe_to_tid_req(wqe);
1428         struct hfi1_qp_priv *priv = qp->priv;
1429 
1430         if (cmp_psn(psn, wqe->lpsn) <= 0) {
1431             u32 cur_seg;
1432 
1433             cur_seg = (psn - wqe->psn) / priv->pkts_ps;
1434             req->ack_pending = cur_seg - req->comp_seg;
1435             priv->pending_tid_r_segs += req->ack_pending;
1436             qp->s_num_rd_atomic += req->ack_pending;
1437             trace_hfi1_tid_req_update_num_rd_atomic(qp, 0,
1438                                 wqe->wr.opcode,
1439                                 wqe->psn,
1440                                 wqe->lpsn,
1441                                 req);
1442         } else {
1443             priv->pending_tid_r_segs += req->total_segs;
1444             qp->s_num_rd_atomic += req->total_segs;
1445         }
1446     }
1447 }
1448 
1449 /**
1450  * reset_psn - reset the QP state to send starting from PSN
1451  * @qp: the QP
1452  * @psn: the packet sequence number to restart at
1453  *
1454  * This is called from hfi1_rc_rcv() to process an incoming RC ACK
1455  * for the given QP.
1456  * Called at interrupt level with the QP s_lock held.
1457  */
1458 static void reset_psn(struct rvt_qp *qp, u32 psn)
1459 {
1460     u32 n = qp->s_acked;
1461     struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
1462     u32 opcode;
1463     struct hfi1_qp_priv *priv = qp->priv;
1464 
1465     lockdep_assert_held(&qp->s_lock);
1466     qp->s_cur = n;
1467     priv->pending_tid_r_segs = 0;
1468     priv->pending_tid_w_resp = 0;
1469     qp->s_num_rd_atomic = 0;
1470 
1471     /*
1472      * If we are starting the request from the beginning,
1473      * let the normal send code handle initialization.
1474      */
1475     if (cmp_psn(psn, wqe->psn) <= 0) {
1476         qp->s_state = OP(SEND_LAST);
1477         goto done;
1478     }
1479     update_num_rd_atomic(qp, psn, wqe);
1480 
1481     /* Find the work request opcode corresponding to the given PSN. */
1482     for (;;) {
1483         int diff;
1484 
1485         if (++n == qp->s_size)
1486             n = 0;
1487         if (n == qp->s_tail)
1488             break;
1489         wqe = rvt_get_swqe_ptr(qp, n);
1490         diff = cmp_psn(psn, wqe->psn);
1491         if (diff < 0) {
1492             /* Point wqe back to the previous one*/
1493             wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
1494             break;
1495         }
1496         qp->s_cur = n;
1497         /*
1498          * If we are starting the request from the beginning,
1499          * let the normal send code handle initialization.
1500          */
1501         if (diff == 0) {
1502             qp->s_state = OP(SEND_LAST);
1503             goto done;
1504         }
1505 
1506         update_num_rd_atomic(qp, psn, wqe);
1507     }
1508     opcode = wqe->wr.opcode;
1509 
1510     /*
1511      * Set the state to restart in the middle of a request.
1512      * Don't change the s_sge, s_cur_sge, or s_cur_size.
1513      * See hfi1_make_rc_req().
1514      */
1515     switch (opcode) {
1516     case IB_WR_SEND:
1517     case IB_WR_SEND_WITH_IMM:
1518         qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
1519         break;
1520 
1521     case IB_WR_RDMA_WRITE:
1522     case IB_WR_RDMA_WRITE_WITH_IMM:
1523         qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
1524         break;
1525 
1526     case IB_WR_TID_RDMA_WRITE:
1527         qp->s_state = TID_OP(WRITE_RESP);
1528         break;
1529 
1530     case IB_WR_RDMA_READ:
1531         qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
1532         break;
1533 
1534     case IB_WR_TID_RDMA_READ:
1535         qp->s_state = TID_OP(READ_RESP);
1536         break;
1537 
1538     default:
1539         /*
1540          * This case shouldn't happen since its only
1541          * one PSN per req.
1542          */
1543         qp->s_state = OP(SEND_LAST);
1544     }
1545 done:
1546     priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
1547     qp->s_psn = psn;
1548     /*
1549      * Set RVT_S_WAIT_PSN as rc_complete() may start the timer
1550      * asynchronously before the send engine can get scheduled.
1551      * Doing it in hfi1_make_rc_req() is too late.
1552      */
1553     if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
1554         (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
1555         qp->s_flags |= RVT_S_WAIT_PSN;
1556     qp->s_flags &= ~HFI1_S_AHG_VALID;
1557     trace_hfi1_sender_reset_psn(qp);
1558 }
1559 
1560 /*
1561  * Back up requester to resend the last un-ACKed request.
1562  * The QP r_lock and s_lock should be held and interrupts disabled.
1563  */
1564 void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
1565 {
1566     struct hfi1_qp_priv *priv = qp->priv;
1567     struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1568     struct hfi1_ibport *ibp;
1569 
1570     lockdep_assert_held(&qp->r_lock);
1571     lockdep_assert_held(&qp->s_lock);
1572     trace_hfi1_sender_restart_rc(qp);
1573     if (qp->s_retry == 0) {
1574         if (qp->s_mig_state == IB_MIG_ARMED) {
1575             hfi1_migrate_qp(qp);
1576             qp->s_retry = qp->s_retry_cnt;
1577         } else if (qp->s_last == qp->s_acked) {
1578             /*
1579              * We need special handling for the OPFN request WQEs as
1580              * they are not allowed to generate real user errors
1581              */
1582             if (wqe->wr.opcode == IB_WR_OPFN) {
1583                 struct hfi1_ibport *ibp =
1584                     to_iport(qp->ibqp.device, qp->port_num);
1585                 /*
1586                  * Call opfn_conn_reply() with capcode and
1587                  * remaining data as 0 to close out the
1588                  * current request
1589                  */
1590                 opfn_conn_reply(qp, priv->opfn.curr);
1591                 wqe = do_rc_completion(qp, wqe, ibp);
1592                 qp->s_flags &= ~RVT_S_WAIT_ACK;
1593             } else {
1594                 trace_hfi1_tid_write_sender_restart_rc(qp, 0);
1595                 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
1596                     struct tid_rdma_request *req;
1597 
1598                     req = wqe_to_tid_req(wqe);
1599                     hfi1_kern_exp_rcv_clear_all(req);
1600                     hfi1_kern_clear_hw_flow(priv->rcd, qp);
1601                 }
1602 
1603                 hfi1_trdma_send_complete(qp, wqe,
1604                              IB_WC_RETRY_EXC_ERR);
1605                 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1606             }
1607             return;
1608         } else { /* need to handle delayed completion */
1609             return;
1610         }
1611     } else {
1612         qp->s_retry--;
1613     }
1614 
1615     ibp = to_iport(qp->ibqp.device, qp->port_num);
1616     if (wqe->wr.opcode == IB_WR_RDMA_READ ||
1617         wqe->wr.opcode == IB_WR_TID_RDMA_READ)
1618         ibp->rvp.n_rc_resends++;
1619     else
1620         ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
1621 
1622     qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
1623              RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
1624              RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP);
1625     if (wait)
1626         qp->s_flags |= RVT_S_SEND_ONE;
1627     reset_psn(qp, psn);
1628 }
1629 
1630 /*
1631  * Set qp->s_sending_psn to the next PSN after the given one.
1632  * This would be psn+1 except when RDMA reads or TID RDMA ops
1633  * are present.
1634  */
1635 static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
1636 {
1637     struct rvt_swqe *wqe;
1638     u32 n = qp->s_last;
1639 
1640     lockdep_assert_held(&qp->s_lock);
1641     /* Find the work request corresponding to the given PSN. */
1642     for (;;) {
1643         wqe = rvt_get_swqe_ptr(qp, n);
1644         if (cmp_psn(psn, wqe->lpsn) <= 0) {
1645             if (wqe->wr.opcode == IB_WR_RDMA_READ ||
1646                 wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
1647                 wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
1648                 qp->s_sending_psn = wqe->lpsn + 1;
1649             else
1650                 qp->s_sending_psn = psn + 1;
1651             break;
1652         }
1653         if (++n == qp->s_size)
1654             n = 0;
1655         if (n == qp->s_tail)
1656             break;
1657     }
1658 }
1659 
1660 /**
1661  * hfi1_rc_verbs_aborted - handle abort status
1662  * @qp: the QP
1663  * @opah: the opa header
1664  *
1665  * This code modifies both ACK bit in BTH[2]
1666  * and the s_flags to go into send one mode.
1667  *
1668  * This serves to throttle the send engine to only
1669  * send a single packet in the likely case the
1670  * a link has gone down.
1671  */
1672 void hfi1_rc_verbs_aborted(struct rvt_qp *qp, struct hfi1_opa_header *opah)
1673 {
1674     struct ib_other_headers *ohdr = hfi1_get_rc_ohdr(opah);
1675     u8 opcode = ib_bth_get_opcode(ohdr);
1676     u32 psn;
1677 
1678     /* ignore responses */
1679     if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1680          opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
1681         opcode == TID_OP(READ_RESP) ||
1682         opcode == TID_OP(WRITE_RESP))
1683         return;
1684 
1685     psn = ib_bth_get_psn(ohdr) | IB_BTH_REQ_ACK;
1686     ohdr->bth[2] = cpu_to_be32(psn);
1687     qp->s_flags |= RVT_S_SEND_ONE;
1688 }
1689 
1690 /*
1691  * This should be called with the QP s_lock held and interrupts disabled.
1692  */
1693 void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
1694 {
1695     struct ib_other_headers *ohdr;
1696     struct hfi1_qp_priv *priv = qp->priv;
1697     struct rvt_swqe *wqe;
1698     u32 opcode, head, tail;
1699     u32 psn;
1700     struct tid_rdma_request *req;
1701 
1702     lockdep_assert_held(&qp->s_lock);
1703     if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
1704         return;
1705 
1706     ohdr = hfi1_get_rc_ohdr(opah);
1707     opcode = ib_bth_get_opcode(ohdr);
1708     if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1709          opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
1710         opcode == TID_OP(READ_RESP) ||
1711         opcode == TID_OP(WRITE_RESP)) {
1712         WARN_ON(!qp->s_rdma_ack_cnt);
1713         qp->s_rdma_ack_cnt--;
1714         return;
1715     }
1716 
1717     psn = ib_bth_get_psn(ohdr);
1718     /*
1719      * Don't attempt to reset the sending PSN for packets in the
1720      * KDETH PSN space since the PSN does not match anything.
1721      */
1722     if (opcode != TID_OP(WRITE_DATA) &&
1723         opcode != TID_OP(WRITE_DATA_LAST) &&
1724         opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC))
1725         reset_sending_psn(qp, psn);
1726 
1727     /* Handle TID RDMA WRITE packets differently */
1728     if (opcode >= TID_OP(WRITE_REQ) &&
1729         opcode <= TID_OP(WRITE_DATA_LAST)) {
1730         head = priv->s_tid_head;
1731         tail = priv->s_tid_cur;
1732         /*
1733          * s_tid_cur is set to s_tid_head in the case, where
1734          * a new TID RDMA request is being started and all
1735          * previous ones have been completed.
1736          * Therefore, we need to do a secondary check in order
1737          * to properly determine whether we should start the
1738          * RC timer.
1739          */
1740         wqe = rvt_get_swqe_ptr(qp, tail);
1741         req = wqe_to_tid_req(wqe);
1742         if (head == tail && req->comp_seg < req->total_segs) {
1743             if (tail == 0)
1744                 tail = qp->s_size - 1;
1745             else
1746                 tail -= 1;
1747         }
1748     } else {
1749         head = qp->s_tail;
1750         tail = qp->s_acked;
1751     }
1752 
1753     /*
1754      * Start timer after a packet requesting an ACK has been sent and
1755      * there are still requests that haven't been acked.
1756      */
1757     if ((psn & IB_BTH_REQ_ACK) && tail != head &&
1758         opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) &&
1759         opcode != TID_OP(RESYNC) &&
1760         !(qp->s_flags &
1761           (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
1762         (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
1763         if (opcode == TID_OP(READ_REQ))
1764             rvt_add_retry_timer_ext(qp, priv->timeout_shift);
1765         else
1766             rvt_add_retry_timer(qp);
1767     }
1768 
1769     /* Start TID RDMA ACK timer */
1770     if ((opcode == TID_OP(WRITE_DATA) ||
1771          opcode == TID_OP(WRITE_DATA_LAST) ||
1772          opcode == TID_OP(RESYNC)) &&
1773         (psn & IB_BTH_REQ_ACK) &&
1774         !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) &&
1775         (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
1776         /*
1777          * The TID RDMA ACK packet could be received before this
1778          * function is called. Therefore, add the timer only if TID
1779          * RDMA ACK packets are actually pending.
1780          */
1781         wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1782         req = wqe_to_tid_req(wqe);
1783         if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
1784             req->ack_seg < req->cur_seg)
1785             hfi1_add_tid_retry_timer(qp);
1786     }
1787 
1788     while (qp->s_last != qp->s_acked) {
1789         wqe = rvt_get_swqe_ptr(qp, qp->s_last);
1790         if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
1791             cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
1792             break;
1793         trdma_clean_swqe(qp, wqe);
1794         trace_hfi1_qp_send_completion(qp, wqe, qp->s_last);
1795         rvt_qp_complete_swqe(qp,
1796                      wqe,
1797                      ib_hfi1_wc_opcode[wqe->wr.opcode],
1798                      IB_WC_SUCCESS);
1799     }
1800     /*
1801      * If we were waiting for sends to complete before re-sending,
1802      * and they are now complete, restart sending.
1803      */
1804     trace_hfi1_sendcomplete(qp, psn);
1805     if (qp->s_flags & RVT_S_WAIT_PSN &&
1806         cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1807         qp->s_flags &= ~RVT_S_WAIT_PSN;
1808         qp->s_sending_psn = qp->s_psn;
1809         qp->s_sending_hpsn = qp->s_psn - 1;
1810         hfi1_schedule_send(qp);
1811     }
1812 }
1813 
1814 static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
1815 {
1816     qp->s_last_psn = psn;
1817 }
1818 
1819 /*
1820  * Generate a SWQE completion.
1821  * This is similar to hfi1_send_complete but has to check to be sure
1822  * that the SGEs are not being referenced if the SWQE is being resent.
1823  */
1824 struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
1825                   struct rvt_swqe *wqe,
1826                   struct hfi1_ibport *ibp)
1827 {
1828     struct hfi1_qp_priv *priv = qp->priv;
1829 
1830     lockdep_assert_held(&qp->s_lock);
1831     /*
1832      * Don't decrement refcount and don't generate a
1833      * completion if the SWQE is being resent until the send
1834      * is finished.
1835      */
1836     trace_hfi1_rc_completion(qp, wqe->lpsn);
1837     if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
1838         cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1839         trdma_clean_swqe(qp, wqe);
1840         trace_hfi1_qp_send_completion(qp, wqe, qp->s_last);
1841         rvt_qp_complete_swqe(qp,
1842                      wqe,
1843                      ib_hfi1_wc_opcode[wqe->wr.opcode],
1844                      IB_WC_SUCCESS);
1845     } else {
1846         struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1847 
1848         this_cpu_inc(*ibp->rvp.rc_delayed_comp);
1849         /*
1850          * If send progress not running attempt to progress
1851          * SDMA queue.
1852          */
1853         if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
1854             struct sdma_engine *engine;
1855             u8 sl = rdma_ah_get_sl(&qp->remote_ah_attr);
1856             u8 sc5;
1857 
1858             /* For now use sc to find engine */
1859             sc5 = ibp->sl_to_sc[sl];
1860             engine = qp_to_sdma_engine(qp, sc5);
1861             sdma_engine_progress_schedule(engine);
1862         }
1863     }
1864 
1865     qp->s_retry = qp->s_retry_cnt;
1866     /*
1867      * Don't update the last PSN if the request being completed is
1868      * a TID RDMA WRITE request.
1869      * Completion of the TID RDMA WRITE requests are done by the
1870      * TID RDMA ACKs and as such could be for a request that has
1871      * already been ACKed as far as the IB state machine is
1872      * concerned.
1873      */
1874     if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
1875         update_last_psn(qp, wqe->lpsn);
1876 
1877     /*
1878      * If we are completing a request which is in the process of
1879      * being resent, we can stop re-sending it since we know the
1880      * responder has already seen it.
1881      */
1882     if (qp->s_acked == qp->s_cur) {
1883         if (++qp->s_cur >= qp->s_size)
1884             qp->s_cur = 0;
1885         qp->s_acked = qp->s_cur;
1886         wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
1887         if (qp->s_acked != qp->s_tail) {
1888             qp->s_state = OP(SEND_LAST);
1889             qp->s_psn = wqe->psn;
1890         }
1891     } else {
1892         if (++qp->s_acked >= qp->s_size)
1893             qp->s_acked = 0;
1894         if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
1895             qp->s_draining = 0;
1896         wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1897     }
1898     if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) {
1899         priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
1900         hfi1_schedule_send(qp);
1901     }
1902     return wqe;
1903 }
1904 
1905 static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd)
1906 {
1907     /* Retry this request. */
1908     if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
1909         qp->r_flags |= RVT_R_RDMAR_SEQ;
1910         hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
1911         if (list_empty(&qp->rspwait)) {
1912             qp->r_flags |= RVT_R_RSP_SEND;
1913             rvt_get_qp(qp);
1914             list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1915         }
1916     }
1917 }
1918 
1919 /**
1920  * update_qp_retry_state - Update qp retry state.
1921  * @qp: the QP
1922  * @psn: the packet sequence number of the TID RDMA WRITE RESP.
1923  * @spsn:  The start psn for the given TID RDMA WRITE swqe.
1924  * @lpsn:  The last psn for the given TID RDMA WRITE swqe.
1925  *
1926  * This function is called to update the qp retry state upon
1927  * receiving a TID WRITE RESP after the qp is scheduled to retry
1928  * a request.
1929  */
1930 static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn,
1931                   u32 lpsn)
1932 {
1933     struct hfi1_qp_priv *qpriv = qp->priv;
1934 
1935     qp->s_psn = psn + 1;
1936     /*
1937      * If this is the first TID RDMA WRITE RESP packet for the current
1938      * request, change the s_state so that the retry will be processed
1939      * correctly. Similarly, if this is the last TID RDMA WRITE RESP
1940      * packet, change the s_state and advance the s_cur.
1941      */
1942     if (cmp_psn(psn, lpsn) >= 0) {
1943         qp->s_cur = qpriv->s_tid_cur + 1;
1944         if (qp->s_cur >= qp->s_size)
1945             qp->s_cur = 0;
1946         qp->s_state = TID_OP(WRITE_REQ);
1947     } else  if (!cmp_psn(psn, spsn)) {
1948         qp->s_cur = qpriv->s_tid_cur;
1949         qp->s_state = TID_OP(WRITE_RESP);
1950     }
1951 }
1952 
1953 /*
1954  * do_rc_ack - process an incoming RC ACK
1955  * @qp: the QP the ACK came in on
1956  * @psn: the packet sequence number of the ACK
1957  * @opcode: the opcode of the request that resulted in the ACK
1958  *
1959  * This is called from rc_rcv_resp() to process an incoming RC ACK
1960  * for the given QP.
1961  * May be called at interrupt level, with the QP s_lock held.
1962  * Returns 1 if OK, 0 if current operation should be aborted (NAK).
1963  */
1964 int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
1965           u64 val, struct hfi1_ctxtdata *rcd)
1966 {
1967     struct hfi1_ibport *ibp;
1968     enum ib_wc_status status;
1969     struct hfi1_qp_priv *qpriv = qp->priv;
1970     struct rvt_swqe *wqe;
1971     int ret = 0;
1972     u32 ack_psn;
1973     int diff;
1974     struct rvt_dev_info *rdi;
1975 
1976     lockdep_assert_held(&qp->s_lock);
1977     /*
1978      * Note that NAKs implicitly ACK outstanding SEND and RDMA write
1979      * requests and implicitly NAK RDMA read and atomic requests issued
1980      * before the NAK'ed request.  The MSN won't include the NAK'ed
1981      * request but will include an ACK'ed request(s).
1982      */
1983     ack_psn = psn;
1984     if (aeth >> IB_AETH_NAK_SHIFT)
1985         ack_psn--;
1986     wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1987     ibp = rcd_to_iport(rcd);
1988 
1989     /*
1990      * The MSN might be for a later WQE than the PSN indicates so
1991      * only complete WQEs that the PSN finishes.
1992      */
1993     while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
1994         /*
1995          * RDMA_READ_RESPONSE_ONLY is a special case since
1996          * we want to generate completion events for everything
1997          * before the RDMA read, copy the data, then generate
1998          * the completion for the read.
1999          */
2000         if (wqe->wr.opcode == IB_WR_RDMA_READ &&
2001             opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
2002             diff == 0) {
2003             ret = 1;
2004             goto bail_stop;
2005         }
2006         /*
2007          * If this request is a RDMA read or atomic, and the ACK is
2008          * for a later operation, this ACK NAKs the RDMA read or
2009          * atomic.  In other words, only a RDMA_READ_LAST or ONLY
2010          * can ACK a RDMA read and likewise for atomic ops.  Note
2011          * that the NAK case can only happen if relaxed ordering is
2012          * used and requests are sent after an RDMA read or atomic
2013          * is sent but before the response is received.
2014          */
2015         if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
2016              (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
2017             (wqe->wr.opcode == IB_WR_TID_RDMA_READ &&
2018              (opcode != TID_OP(READ_RESP) || diff != 0)) ||
2019             ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
2020               wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
2021              (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) ||
2022             (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
2023              (delta_psn(psn, qp->s_last_psn) != 1))) {
2024             set_restart_qp(qp, rcd);
2025             /*
2026              * No need to process the ACK/NAK since we are
2027              * restarting an earlier request.
2028              */
2029             goto bail_stop;
2030         }
2031         if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
2032             wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
2033             u64 *vaddr = wqe->sg_list[0].vaddr;
2034             *vaddr = val;
2035         }
2036         if (wqe->wr.opcode == IB_WR_OPFN)
2037             opfn_conn_reply(qp, val);
2038 
2039         if (qp->s_num_rd_atomic &&
2040             (wqe->wr.opcode == IB_WR_RDMA_READ ||
2041              wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
2042              wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
2043             qp->s_num_rd_atomic--;
2044             /* Restart sending task if fence is complete */
2045             if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
2046                 !qp->s_num_rd_atomic) {
2047                 qp->s_flags &= ~(RVT_S_WAIT_FENCE |
2048                          RVT_S_WAIT_ACK);
2049                 hfi1_schedule_send(qp);
2050             } else if (qp->s_flags & RVT_S_WAIT_RDMAR) {
2051                 qp->s_flags &= ~(RVT_S_WAIT_RDMAR |
2052                          RVT_S_WAIT_ACK);
2053                 hfi1_schedule_send(qp);
2054             }
2055         }
2056 
2057         /*
2058          * TID RDMA WRITE requests will be completed by the TID RDMA
2059          * ACK packet handler (see tid_rdma.c).
2060          */
2061         if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
2062             break;
2063 
2064         wqe = do_rc_completion(qp, wqe, ibp);
2065         if (qp->s_acked == qp->s_tail)
2066             break;
2067     }
2068 
2069     trace_hfi1_rc_ack_do(qp, aeth, psn, wqe);
2070     trace_hfi1_sender_do_rc_ack(qp);
2071     switch (aeth >> IB_AETH_NAK_SHIFT) {
2072     case 0:         /* ACK */
2073         this_cpu_inc(*ibp->rvp.rc_acks);
2074         if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
2075             if (wqe_to_tid_req(wqe)->ack_pending)
2076                 rvt_mod_retry_timer_ext(qp,
2077                             qpriv->timeout_shift);
2078             else
2079                 rvt_stop_rc_timers(qp);
2080         } else if (qp->s_acked != qp->s_tail) {
2081             struct rvt_swqe *__w = NULL;
2082 
2083             if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID)
2084                 __w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
2085 
2086             /*
2087              * Stop timers if we've received all of the TID RDMA
2088              * WRITE * responses.
2089              */
2090             if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE &&
2091                 opcode == TID_OP(WRITE_RESP)) {
2092                 /*
2093                  * Normally, the loop above would correctly
2094                  * process all WQEs from s_acked onward and
2095                  * either complete them or check for correct
2096                  * PSN sequencing.
2097                  * However, for TID RDMA, due to pipelining,
2098                  * the response may not be for the request at
2099                  * s_acked so the above look would just be
2100                  * skipped. This does not allow for checking
2101                  * the PSN sequencing. It has to be done
2102                  * separately.
2103                  */
2104                 if (cmp_psn(psn, qp->s_last_psn + 1)) {
2105                     set_restart_qp(qp, rcd);
2106                     goto bail_stop;
2107                 }
2108                 /*
2109                  * If the psn is being resent, stop the
2110                  * resending.
2111                  */
2112                 if (qp->s_cur != qp->s_tail &&
2113                     cmp_psn(qp->s_psn, psn) <= 0)
2114                     update_qp_retry_state(qp, psn,
2115                                   __w->psn,
2116                                   __w->lpsn);
2117                 else if (--qpriv->pending_tid_w_resp)
2118                     rvt_mod_retry_timer(qp);
2119                 else
2120                     rvt_stop_rc_timers(qp);
2121             } else {
2122                 /*
2123                  * We are expecting more ACKs so
2124                  * mod the retry timer.
2125                  */
2126                 rvt_mod_retry_timer(qp);
2127                 /*
2128                  * We can stop re-sending the earlier packets
2129                  * and continue with the next packet the
2130                  * receiver wants.
2131                  */
2132                 if (cmp_psn(qp->s_psn, psn) <= 0)
2133                     reset_psn(qp, psn + 1);
2134             }
2135         } else {
2136             /* No more acks - kill all timers */
2137             rvt_stop_rc_timers(qp);
2138             if (cmp_psn(qp->s_psn, psn) <= 0) {
2139                 qp->s_state = OP(SEND_LAST);
2140                 qp->s_psn = psn + 1;
2141             }
2142         }
2143         if (qp->s_flags & RVT_S_WAIT_ACK) {
2144             qp->s_flags &= ~RVT_S_WAIT_ACK;
2145             hfi1_schedule_send(qp);
2146         }
2147         rvt_get_credit(qp, aeth);
2148         qp->s_rnr_retry = qp->s_rnr_retry_cnt;
2149         qp->s_retry = qp->s_retry_cnt;
2150         /*
2151          * If the current request is a TID RDMA WRITE request and the
2152          * response is not a TID RDMA WRITE RESP packet, s_last_psn
2153          * can't be advanced.
2154          */
2155         if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
2156             opcode != TID_OP(WRITE_RESP) &&
2157             cmp_psn(psn, wqe->psn) >= 0)
2158             return 1;
2159         update_last_psn(qp, psn);
2160         return 1;
2161 
2162     case 1:         /* RNR NAK */
2163         ibp->rvp.n_rnr_naks++;
2164         if (qp->s_acked == qp->s_tail)
2165             goto bail_stop;
2166         if (qp->s_flags & RVT_S_WAIT_RNR)
2167             goto bail_stop;
2168         rdi = ib_to_rvt(qp->ibqp.device);
2169         if (!(rdi->post_parms[wqe->wr.opcode].flags &
2170                RVT_OPERATION_IGN_RNR_CNT)) {
2171             if (qp->s_rnr_retry == 0) {
2172                 status = IB_WC_RNR_RETRY_EXC_ERR;
2173                 goto class_b;
2174             }
2175             if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0)
2176                 qp->s_rnr_retry--;
2177         }
2178 
2179         /*
2180          * The last valid PSN is the previous PSN. For TID RDMA WRITE
2181          * request, s_last_psn should be incremented only when a TID
2182          * RDMA WRITE RESP is received to avoid skipping lost TID RDMA
2183          * WRITE RESP packets.
2184          */
2185         if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
2186             reset_psn(qp, qp->s_last_psn + 1);
2187         } else {
2188             update_last_psn(qp, psn - 1);
2189             reset_psn(qp, psn);
2190         }
2191 
2192         ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
2193         qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
2194         rvt_stop_rc_timers(qp);
2195         rvt_add_rnr_timer(qp, aeth);
2196         return 0;
2197 
2198     case 3:         /* NAK */
2199         if (qp->s_acked == qp->s_tail)
2200             goto bail_stop;
2201         /* The last valid PSN is the previous PSN. */
2202         update_last_psn(qp, psn - 1);
2203         switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
2204             IB_AETH_CREDIT_MASK) {
2205         case 0: /* PSN sequence error */
2206             ibp->rvp.n_seq_naks++;
2207             /*
2208              * Back up to the responder's expected PSN.
2209              * Note that we might get a NAK in the middle of an
2210              * RDMA READ response which terminates the RDMA
2211              * READ.
2212              */
2213             hfi1_restart_rc(qp, psn, 0);
2214             hfi1_schedule_send(qp);
2215             break;
2216 
2217         case 1: /* Invalid Request */
2218             status = IB_WC_REM_INV_REQ_ERR;
2219             ibp->rvp.n_other_naks++;
2220             goto class_b;
2221 
2222         case 2: /* Remote Access Error */
2223             status = IB_WC_REM_ACCESS_ERR;
2224             ibp->rvp.n_other_naks++;
2225             goto class_b;
2226 
2227         case 3: /* Remote Operation Error */
2228             status = IB_WC_REM_OP_ERR;
2229             ibp->rvp.n_other_naks++;
2230 class_b:
2231             if (qp->s_last == qp->s_acked) {
2232                 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
2233                     hfi1_kern_read_tid_flow_free(qp);
2234 
2235                 hfi1_trdma_send_complete(qp, wqe, status);
2236                 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
2237             }
2238             break;
2239 
2240         default:
2241             /* Ignore other reserved NAK error codes */
2242             goto reserved;
2243         }
2244         qp->s_retry = qp->s_retry_cnt;
2245         qp->s_rnr_retry = qp->s_rnr_retry_cnt;
2246         goto bail_stop;
2247 
2248     default:                /* 2: reserved */
2249 reserved:
2250         /* Ignore reserved NAK codes. */
2251         goto bail_stop;
2252     }
2253     /* cannot be reached  */
2254 bail_stop:
2255     rvt_stop_rc_timers(qp);
2256     return ret;
2257 }
2258 
2259 /*
2260  * We have seen an out of sequence RDMA read middle or last packet.
2261  * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
2262  */
2263 static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
2264              struct hfi1_ctxtdata *rcd)
2265 {
2266     struct rvt_swqe *wqe;
2267 
2268     lockdep_assert_held(&qp->s_lock);
2269     /* Remove QP from retry timer */
2270     rvt_stop_rc_timers(qp);
2271 
2272     wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
2273 
2274     while (cmp_psn(psn, wqe->lpsn) > 0) {
2275         if (wqe->wr.opcode == IB_WR_RDMA_READ ||
2276             wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
2277             wqe->wr.opcode == IB_WR_TID_RDMA_WRITE ||
2278             wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
2279             wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
2280             break;
2281         wqe = do_rc_completion(qp, wqe, ibp);
2282     }
2283 
2284     ibp->rvp.n_rdma_seq++;
2285     qp->r_flags |= RVT_R_RDMAR_SEQ;
2286     hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
2287     if (list_empty(&qp->rspwait)) {
2288         qp->r_flags |= RVT_R_RSP_SEND;
2289         rvt_get_qp(qp);
2290         list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2291     }
2292 }
2293 
2294 /**
2295  * rc_rcv_resp - process an incoming RC response packet
2296  * @packet: data packet information
2297  *
2298  * This is called from hfi1_rc_rcv() to process an incoming RC response
2299  * packet for the given QP.
2300  * Called at interrupt level.
2301  */
2302 static void rc_rcv_resp(struct hfi1_packet *packet)
2303 {
2304     struct hfi1_ctxtdata *rcd = packet->rcd;
2305     void *data = packet->payload;
2306     u32 tlen = packet->tlen;
2307     struct rvt_qp *qp = packet->qp;
2308     struct hfi1_ibport *ibp;
2309     struct ib_other_headers *ohdr = packet->ohdr;
2310     struct rvt_swqe *wqe;
2311     enum ib_wc_status status;
2312     unsigned long flags;
2313     int diff;
2314     u64 val;
2315     u32 aeth;
2316     u32 psn = ib_bth_get_psn(packet->ohdr);
2317     u32 pmtu = qp->pmtu;
2318     u16 hdrsize = packet->hlen;
2319     u8 opcode = packet->opcode;
2320     u8 pad = packet->pad;
2321     u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);
2322 
2323     spin_lock_irqsave(&qp->s_lock, flags);
2324     trace_hfi1_ack(qp, psn);
2325 
2326     /* Ignore invalid responses. */
2327     if (cmp_psn(psn, READ_ONCE(qp->s_next_psn)) >= 0)
2328         goto ack_done;
2329 
2330     /* Ignore duplicate responses. */
2331     diff = cmp_psn(psn, qp->s_last_psn);
2332     if (unlikely(diff <= 0)) {
2333         /* Update credits for "ghost" ACKs */
2334         if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
2335             aeth = be32_to_cpu(ohdr->u.aeth);
2336             if ((aeth >> IB_AETH_NAK_SHIFT) == 0)
2337                 rvt_get_credit(qp, aeth);
2338         }
2339         goto ack_done;
2340     }
2341 
2342     /*
2343      * Skip everything other than the PSN we expect, if we are waiting
2344      * for a reply to a restarted RDMA read or atomic op.
2345      */
2346     if (qp->r_flags & RVT_R_RDMAR_SEQ) {
2347         if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
2348             goto ack_done;
2349         qp->r_flags &= ~RVT_R_RDMAR_SEQ;
2350     }
2351 
2352     if (unlikely(qp->s_acked == qp->s_tail))
2353         goto ack_done;
2354     wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
2355     status = IB_WC_SUCCESS;
2356 
2357     switch (opcode) {
2358     case OP(ACKNOWLEDGE):
2359     case OP(ATOMIC_ACKNOWLEDGE):
2360     case OP(RDMA_READ_RESPONSE_FIRST):
2361         aeth = be32_to_cpu(ohdr->u.aeth);
2362         if (opcode == OP(ATOMIC_ACKNOWLEDGE))
2363             val = ib_u64_get(&ohdr->u.at.atomic_ack_eth);
2364         else
2365             val = 0;
2366         if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
2367             opcode != OP(RDMA_READ_RESPONSE_FIRST))
2368             goto ack_done;
2369         wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
2370         if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
2371             goto ack_op_err;
2372         /*
2373          * If this is a response to a resent RDMA read, we
2374          * have to be careful to copy the data to the right
2375          * location.
2376          */
2377         qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
2378                           wqe, psn, pmtu);
2379         goto read_middle;
2380 
2381     case OP(RDMA_READ_RESPONSE_MIDDLE):
2382         /* no AETH, no ACK */
2383         if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
2384             goto ack_seq_err;
2385         if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
2386             goto ack_op_err;
2387 read_middle:
2388         if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
2389             goto ack_len_err;
2390         if (unlikely(pmtu >= qp->s_rdma_read_len))
2391             goto ack_len_err;
2392 
2393         /*
2394          * We got a response so update the timeout.
2395          * 4.096 usec. * (1 << qp->timeout)
2396          */
2397         rvt_mod_retry_timer(qp);
2398         if (qp->s_flags & RVT_S_WAIT_ACK) {
2399             qp->s_flags &= ~RVT_S_WAIT_ACK;
2400             hfi1_schedule_send(qp);
2401         }
2402 
2403         if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
2404             qp->s_retry = qp->s_retry_cnt;
2405 
2406         /*
2407          * Update the RDMA receive state but do the copy w/o
2408          * holding the locks and blocking interrupts.
2409          */
2410         qp->s_rdma_read_len -= pmtu;
2411         update_last_psn(qp, psn);
2412         spin_unlock_irqrestore(&qp->s_lock, flags);
2413         rvt_copy_sge(qp, &qp->s_rdma_read_sge,
2414                  data, pmtu, false, false);
2415         goto bail;
2416 
2417     case OP(RDMA_READ_RESPONSE_ONLY):
2418         aeth = be32_to_cpu(ohdr->u.aeth);
2419         if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
2420             goto ack_done;
2421         /*
2422          * Check that the data size is >= 0 && <= pmtu.
2423          * Remember to account for ICRC (4).
2424          */
2425         if (unlikely(tlen < (hdrsize + extra_bytes)))
2426             goto ack_len_err;
2427         /*
2428          * If this is a response to a resent RDMA read, we
2429          * have to be careful to copy the data to the right
2430          * location.
2431          */
2432         wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
2433         qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
2434                           wqe, psn, pmtu);
2435         goto read_last;
2436 
2437     case OP(RDMA_READ_RESPONSE_LAST):
2438         /* ACKs READ req. */
2439         if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
2440             goto ack_seq_err;
2441         if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
2442             goto ack_op_err;
2443         /*
2444          * Check that the data size is >= 1 && <= pmtu.
2445          * Remember to account for ICRC (4).
2446          */
2447         if (unlikely(tlen <= (hdrsize + extra_bytes)))
2448             goto ack_len_err;
2449 read_last:
2450         tlen -= hdrsize + extra_bytes;
2451         if (unlikely(tlen != qp->s_rdma_read_len))
2452             goto ack_len_err;
2453         aeth = be32_to_cpu(ohdr->u.aeth);
2454         rvt_copy_sge(qp, &qp->s_rdma_read_sge,
2455                  data, tlen, false, false);
2456         WARN_ON(qp->s_rdma_read_sge.num_sge);
2457         (void)do_rc_ack(qp, aeth, psn,
2458                  OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
2459         goto ack_done;
2460     }
2461 
2462 ack_op_err:
2463     status = IB_WC_LOC_QP_OP_ERR;
2464     goto ack_err;
2465 
2466 ack_seq_err:
2467     ibp = rcd_to_iport(rcd);
2468     rdma_seq_err(qp, ibp, psn, rcd);
2469     goto ack_done;
2470 
2471 ack_len_err:
2472     status = IB_WC_LOC_LEN_ERR;
2473 ack_err:
2474     if (qp->s_last == qp->s_acked) {
2475         rvt_send_complete(qp, wqe, status);
2476         rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
2477     }
2478 ack_done:
2479     spin_unlock_irqrestore(&qp->s_lock, flags);
2480 bail:
2481     return;
2482 }
2483 
2484 static inline void rc_cancel_ack(struct rvt_qp *qp)
2485 {
2486     qp->r_adefered = 0;
2487     if (list_empty(&qp->rspwait))
2488         return;
2489     list_del_init(&qp->rspwait);
2490     qp->r_flags &= ~RVT_R_RSP_NAK;
2491     rvt_put_qp(qp);
2492 }
2493 
2494 /**
2495  * rc_rcv_error - process an incoming duplicate or error RC packet
2496  * @ohdr: the other headers for this packet
2497  * @data: the packet data
2498  * @qp: the QP for this packet
2499  * @opcode: the opcode for this packet
2500  * @psn: the packet sequence number for this packet
2501  * @diff: the difference between the PSN and the expected PSN
2502  * @rcd: the receive context
2503  *
2504  * This is called from hfi1_rc_rcv() to process an unexpected
2505  * incoming RC packet for the given QP.
2506  * Called at interrupt level.
2507  * Return 1 if no more processing is needed; otherwise return 0 to
2508  * schedule a response to be sent.
2509  */
2510 static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
2511                  struct rvt_qp *qp, u32 opcode, u32 psn,
2512                  int diff, struct hfi1_ctxtdata *rcd)
2513 {
2514     struct hfi1_ibport *ibp = rcd_to_iport(rcd);
2515     struct rvt_ack_entry *e;
2516     unsigned long flags;
2517     u8 prev;
2518     u8 mra; /* most recent ACK */
2519     bool old_req;
2520 
2521     trace_hfi1_rcv_error(qp, psn);
2522     if (diff > 0) {
2523         /*
2524          * Packet sequence error.
2525          * A NAK will ACK earlier sends and RDMA writes.
2526          * Don't queue the NAK if we already sent one.
2527          */
2528         if (!qp->r_nak_state) {
2529             ibp->rvp.n_rc_seqnak++;
2530             qp->r_nak_state = IB_NAK_PSN_ERROR;
2531             /* Use the expected PSN. */
2532             qp->r_ack_psn = qp->r_psn;
2533             /*
2534              * Wait to send the sequence NAK until all packets
2535              * in the receive queue have been processed.
2536              * Otherwise, we end up propagating congestion.
2537              */
2538             rc_defered_ack(rcd, qp);
2539         }
2540         goto done;
2541     }
2542 
2543     /*
2544      * Handle a duplicate request.  Don't re-execute SEND, RDMA
2545      * write or atomic op.  Don't NAK errors, just silently drop
2546      * the duplicate request.  Note that r_sge, r_len, and
2547      * r_rcv_len may be in use so don't modify them.
2548      *
2549      * We are supposed to ACK the earliest duplicate PSN but we
2550      * can coalesce an outstanding duplicate ACK.  We have to
2551      * send the earliest so that RDMA reads can be restarted at
2552      * the requester's expected PSN.
2553      *
2554      * First, find where this duplicate PSN falls within the
2555      * ACKs previously sent.
2556      * old_req is true if there is an older response that is scheduled
2557      * to be sent before sending this one.
2558      */
2559     e = NULL;
2560     old_req = true;
2561     ibp->rvp.n_rc_dupreq++;
2562 
2563     spin_lock_irqsave(&qp->s_lock, flags);
2564 
2565     e = find_prev_entry(qp, psn, &prev, &mra, &old_req);
2566 
2567     switch (opcode) {
2568     case OP(RDMA_READ_REQUEST): {
2569         struct ib_reth *reth;
2570         u32 offset;
2571         u32 len;
2572 
2573         /*
2574          * If we didn't find the RDMA read request in the ack queue,
2575          * we can ignore this request.
2576          */
2577         if (!e || e->opcode != OP(RDMA_READ_REQUEST))
2578             goto unlock_done;
2579         /* RETH comes after BTH */
2580         reth = &ohdr->u.rc.reth;
2581         /*
2582          * Address range must be a subset of the original
2583          * request and start on pmtu boundaries.
2584          * We reuse the old ack_queue slot since the requester
2585          * should not back up and request an earlier PSN for the
2586          * same request.
2587          */
2588         offset = delta_psn(psn, e->psn) * qp->pmtu;
2589         len = be32_to_cpu(reth->length);
2590         if (unlikely(offset + len != e->rdma_sge.sge_length))
2591             goto unlock_done;
2592         release_rdma_sge_mr(e);
2593         if (len != 0) {
2594             u32 rkey = be32_to_cpu(reth->rkey);
2595             u64 vaddr = get_ib_reth_vaddr(reth);
2596             int ok;
2597 
2598             ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
2599                      IB_ACCESS_REMOTE_READ);
2600             if (unlikely(!ok))
2601                 goto unlock_done;
2602         } else {
2603             e->rdma_sge.vaddr = NULL;
2604             e->rdma_sge.length = 0;
2605             e->rdma_sge.sge_length = 0;
2606         }
2607         e->psn = psn;
2608         if (old_req)
2609             goto unlock_done;
2610         if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
2611             qp->s_acked_ack_queue = prev;
2612         qp->s_tail_ack_queue = prev;
2613         break;
2614     }
2615 
2616     case OP(COMPARE_SWAP):
2617     case OP(FETCH_ADD): {
2618         /*
2619          * If we didn't find the atomic request in the ack queue
2620          * or the send engine is already backed up to send an
2621          * earlier entry, we can ignore this request.
2622          */
2623         if (!e || e->opcode != (u8)opcode || old_req)
2624             goto unlock_done;
2625         if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
2626             qp->s_acked_ack_queue = prev;
2627         qp->s_tail_ack_queue = prev;
2628         break;
2629     }
2630 
2631     default:
2632         /*
2633          * Ignore this operation if it doesn't request an ACK
2634          * or an earlier RDMA read or atomic is going to be resent.
2635          */
2636         if (!(psn & IB_BTH_REQ_ACK) || old_req)
2637             goto unlock_done;
2638         /*
2639          * Resend the most recent ACK if this request is
2640          * after all the previous RDMA reads and atomics.
2641          */
2642         if (mra == qp->r_head_ack_queue) {
2643             spin_unlock_irqrestore(&qp->s_lock, flags);
2644             qp->r_nak_state = 0;
2645             qp->r_ack_psn = qp->r_psn - 1;
2646             goto send_ack;
2647         }
2648 
2649         /*
2650          * Resend the RDMA read or atomic op which
2651          * ACKs this duplicate request.
2652          */
2653         if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
2654             qp->s_acked_ack_queue = mra;
2655         qp->s_tail_ack_queue = mra;
2656         break;
2657     }
2658     qp->s_ack_state = OP(ACKNOWLEDGE);
2659     qp->s_flags |= RVT_S_RESP_PENDING;
2660     qp->r_nak_state = 0;
2661     hfi1_schedule_send(qp);
2662 
2663 unlock_done:
2664     spin_unlock_irqrestore(&qp->s_lock, flags);
2665 done:
2666     return 1;
2667 
2668 send_ack:
2669     return 0;
2670 }
2671 
2672 static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
2673               u32 lqpn, u32 rqpn, u8 svc_type)
2674 {
2675     struct opa_hfi1_cong_log_event_internal *cc_event;
2676     unsigned long flags;
2677 
2678     if (sl >= OPA_MAX_SLS)
2679         return;
2680 
2681     spin_lock_irqsave(&ppd->cc_log_lock, flags);
2682 
2683     ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8);
2684     ppd->threshold_event_counter++;
2685 
2686     cc_event = &ppd->cc_events[ppd->cc_log_idx++];
2687     if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
2688         ppd->cc_log_idx = 0;
2689     cc_event->lqpn = lqpn & RVT_QPN_MASK;
2690     cc_event->rqpn = rqpn & RVT_QPN_MASK;
2691     cc_event->sl = sl;
2692     cc_event->svc_type = svc_type;
2693     cc_event->rlid = rlid;
2694     /* keep timestamp in units of 1.024 usec */
2695     cc_event->timestamp = ktime_get_ns() / 1024;
2696 
2697     spin_unlock_irqrestore(&ppd->cc_log_lock, flags);
2698 }
2699 
2700 void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn,
2701           u32 rqpn, u8 svc_type)
2702 {
2703     struct cca_timer *cca_timer;
2704     u16 ccti, ccti_incr, ccti_timer, ccti_limit;
2705     u8 trigger_threshold;
2706     struct cc_state *cc_state;
2707     unsigned long flags;
2708 
2709     if (sl >= OPA_MAX_SLS)
2710         return;
2711 
2712     cc_state = get_cc_state(ppd);
2713 
2714     if (!cc_state)
2715         return;
2716 
2717     /*
2718      * 1) increase CCTI (for this SL)
2719      * 2) select IPG (i.e., call set_link_ipg())
2720      * 3) start timer
2721      */
2722     ccti_limit = cc_state->cct.ccti_limit;
2723     ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase;
2724     ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
2725     trigger_threshold =
2726         cc_state->cong_setting.entries[sl].trigger_threshold;
2727 
2728     spin_lock_irqsave(&ppd->cca_timer_lock, flags);
2729 
2730     cca_timer = &ppd->cca_timer[sl];
2731     if (cca_timer->ccti < ccti_limit) {
2732         if (cca_timer->ccti + ccti_incr <= ccti_limit)
2733             cca_timer->ccti += ccti_incr;
2734         else
2735             cca_timer->ccti = ccti_limit;
2736         set_link_ipg(ppd);
2737     }
2738 
2739     ccti = cca_timer->ccti;
2740 
2741     if (!hrtimer_active(&cca_timer->hrtimer)) {
2742         /* ccti_timer is in units of 1.024 usec */
2743         unsigned long nsec = 1024 * ccti_timer;
2744 
2745         hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec),
2746                   HRTIMER_MODE_REL_PINNED);
2747     }
2748 
2749     spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
2750 
2751     if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
2752         log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
2753 }
2754 
2755 /**
2756  * hfi1_rc_rcv - process an incoming RC packet
2757  * @packet: data packet information
2758  *
2759  * This is called from qp_rcv() to process an incoming RC packet
2760  * for the given QP.
2761  * May be called at interrupt level.
2762  */
2763 void hfi1_rc_rcv(struct hfi1_packet *packet)
2764 {
2765     struct hfi1_ctxtdata *rcd = packet->rcd;
2766     void *data = packet->payload;
2767     u32 tlen = packet->tlen;
2768     struct rvt_qp *qp = packet->qp;
2769     struct hfi1_qp_priv *qpriv = qp->priv;
2770     struct hfi1_ibport *ibp = rcd_to_iport(rcd);
2771     struct ib_other_headers *ohdr = packet->ohdr;
2772     u32 opcode = packet->opcode;
2773     u32 hdrsize = packet->hlen;
2774     u32 psn = ib_bth_get_psn(packet->ohdr);
2775     u32 pad = packet->pad;
2776     struct ib_wc wc;
2777     u32 pmtu = qp->pmtu;
2778     int diff;
2779     struct ib_reth *reth;
2780     unsigned long flags;
2781     int ret;
2782     bool copy_last = false, fecn;
2783     u32 rkey;
2784     u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);
2785 
2786     lockdep_assert_held(&qp->r_lock);
2787 
2788     if (hfi1_ruc_check_hdr(ibp, packet))
2789         return;
2790 
2791     fecn = process_ecn(qp, packet);
2792     opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1]));
2793 
2794     /*
2795      * Process responses (ACKs) before anything else.  Note that the
2796      * packet sequence number will be for something in the send work
2797      * queue rather than the expected receive packet sequence number.
2798      * In other words, this QP is the requester.
2799      */
2800     if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
2801         opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
2802         rc_rcv_resp(packet);
2803         return;
2804     }
2805 
2806     /* Compute 24 bits worth of difference. */
2807     diff = delta_psn(psn, qp->r_psn);
2808     if (unlikely(diff)) {
2809         if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
2810             return;
2811         goto send_ack;
2812     }
2813 
2814     /* Check for opcode sequence errors. */
2815     switch (qp->r_state) {
2816     case OP(SEND_FIRST):
2817     case OP(SEND_MIDDLE):
2818         if (opcode == OP(SEND_MIDDLE) ||
2819             opcode == OP(SEND_LAST) ||
2820             opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
2821             opcode == OP(SEND_LAST_WITH_INVALIDATE))
2822             break;
2823         goto nack_inv;
2824 
2825     case OP(RDMA_WRITE_FIRST):
2826     case OP(RDMA_WRITE_MIDDLE):
2827         if (opcode == OP(RDMA_WRITE_MIDDLE) ||
2828             opcode == OP(RDMA_WRITE_LAST) ||
2829             opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
2830             break;
2831         goto nack_inv;
2832 
2833     default:
2834         if (opcode == OP(SEND_MIDDLE) ||
2835             opcode == OP(SEND_LAST) ||
2836             opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
2837             opcode == OP(SEND_LAST_WITH_INVALIDATE) ||
2838             opcode == OP(RDMA_WRITE_MIDDLE) ||
2839             opcode == OP(RDMA_WRITE_LAST) ||
2840             opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
2841             goto nack_inv;
2842         /*
2843          * Note that it is up to the requester to not send a new
2844          * RDMA read or atomic operation before receiving an ACK
2845          * for the previous operation.
2846          */
2847         break;
2848     }
2849 
2850     if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
2851         rvt_comm_est(qp);
2852 
2853     /* OK, process the packet. */
2854     switch (opcode) {
2855     case OP(SEND_FIRST):
2856         ret = rvt_get_rwqe(qp, false);
2857         if (ret < 0)
2858             goto nack_op_err;
2859         if (!ret)
2860             goto rnr_nak;
2861         qp->r_rcv_len = 0;
2862         fallthrough;
2863     case OP(SEND_MIDDLE):
2864     case OP(RDMA_WRITE_MIDDLE):
2865 send_middle:
2866         /* Check for invalid length PMTU or posted rwqe len. */
2867         /*
2868          * There will be no padding for 9B packet but 16B packets
2869          * will come in with some padding since we always add
2870          * CRC and LT bytes which will need to be flit aligned
2871          */
2872         if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
2873             goto nack_inv;
2874         qp->r_rcv_len += pmtu;
2875         if (unlikely(qp->r_rcv_len > qp->r_len))
2876             goto nack_inv;
2877         rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false);
2878         break;
2879 
2880     case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
2881         /* consume RWQE */
2882         ret = rvt_get_rwqe(qp, true);
2883         if (ret < 0)
2884             goto nack_op_err;
2885         if (!ret)
2886             goto rnr_nak;
2887         goto send_last_imm;
2888 
2889     case OP(SEND_ONLY):
2890     case OP(SEND_ONLY_WITH_IMMEDIATE):
2891     case OP(SEND_ONLY_WITH_INVALIDATE):
2892         ret = rvt_get_rwqe(qp, false);
2893         if (ret < 0)
2894             goto nack_op_err;
2895         if (!ret)
2896             goto rnr_nak;
2897         qp->r_rcv_len = 0;
2898         if (opcode == OP(SEND_ONLY))
2899             goto no_immediate_data;
2900         if (opcode == OP(SEND_ONLY_WITH_INVALIDATE))
2901             goto send_last_inv;
2902         fallthrough;    /* for SEND_ONLY_WITH_IMMEDIATE */
2903     case OP(SEND_LAST_WITH_IMMEDIATE):
2904 send_last_imm:
2905         wc.ex.imm_data = ohdr->u.imm_data;
2906         wc.wc_flags = IB_WC_WITH_IMM;
2907         goto send_last;
2908     case OP(SEND_LAST_WITH_INVALIDATE):
2909 send_last_inv:
2910         rkey = be32_to_cpu(ohdr->u.ieth);
2911         if (rvt_invalidate_rkey(qp, rkey))
2912             goto no_immediate_data;
2913         wc.ex.invalidate_rkey = rkey;
2914         wc.wc_flags = IB_WC_WITH_INVALIDATE;
2915         goto send_last;
2916     case OP(RDMA_WRITE_LAST):
2917         copy_last = rvt_is_user_qp(qp);
2918         fallthrough;
2919     case OP(SEND_LAST):
2920 no_immediate_data:
2921         wc.wc_flags = 0;
2922         wc.ex.imm_data = 0;
2923 send_last:
2924         /* Check for invalid length. */
2925         /* LAST len should be >= 1 */
2926         if (unlikely(tlen < (hdrsize + extra_bytes)))
2927             goto nack_inv;
2928         /* Don't count the CRC(and padding and LT byte for 16B). */
2929         tlen -= (hdrsize + extra_bytes);
2930         wc.byte_len = tlen + qp->r_rcv_len;
2931         if (unlikely(wc.byte_len > qp->r_len))
2932             goto nack_inv;
2933         rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, copy_last);
2934         rvt_put_ss(&qp->r_sge);
2935         qp->r_msn++;
2936         if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
2937             break;
2938         wc.wr_id = qp->r_wr_id;
2939         wc.status = IB_WC_SUCCESS;
2940         if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
2941             opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
2942             wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
2943         else
2944             wc.opcode = IB_WC_RECV;
2945         wc.qp = &qp->ibqp;
2946         wc.src_qp = qp->remote_qpn;
2947         wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX;
2948         /*
2949          * It seems that IB mandates the presence of an SL in a
2950          * work completion only for the UD transport (see section
2951          * 11.4.2 of IBTA Vol. 1).
2952          *
2953          * However, the way the SL is chosen below is consistent
2954          * with the way that IB/qib works and is trying avoid
2955          * introducing incompatibilities.
2956          *
2957          * See also OPA Vol. 1, section 9.7.6, and table 9-17.
2958          */
2959         wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
2960         /* zero fields that are N/A */
2961         wc.vendor_err = 0;
2962         wc.pkey_index = 0;
2963         wc.dlid_path_bits = 0;
2964         wc.port_num = 0;
2965         /* Signal completion event if the solicited bit is set. */
2966         rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
2967         break;
2968 
2969     case OP(RDMA_WRITE_ONLY):
2970         copy_last = rvt_is_user_qp(qp);
2971         fallthrough;
2972     case OP(RDMA_WRITE_FIRST):
2973     case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
2974         if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
2975             goto nack_inv;
2976         /* consume RWQE */
2977         reth = &ohdr->u.rc.reth;
2978         qp->r_len = be32_to_cpu(reth->length);
2979         qp->r_rcv_len = 0;
2980         qp->r_sge.sg_list = NULL;
2981         if (qp->r_len != 0) {
2982             u32 rkey = be32_to_cpu(reth->rkey);
2983             u64 vaddr = get_ib_reth_vaddr(reth);
2984             int ok;
2985 
2986             /* Check rkey & NAK */
2987             ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
2988                      rkey, IB_ACCESS_REMOTE_WRITE);
2989             if (unlikely(!ok))
2990                 goto nack_acc;
2991             qp->r_sge.num_sge = 1;
2992         } else {
2993             qp->r_sge.num_sge = 0;
2994             qp->r_sge.sge.mr = NULL;
2995             qp->r_sge.sge.vaddr = NULL;
2996             qp->r_sge.sge.length = 0;
2997             qp->r_sge.sge.sge_length = 0;
2998         }
2999         if (opcode == OP(RDMA_WRITE_FIRST))
3000             goto send_middle;
3001         else if (opcode == OP(RDMA_WRITE_ONLY))
3002             goto no_immediate_data;
3003         ret = rvt_get_rwqe(qp, true);
3004         if (ret < 0)
3005             goto nack_op_err;
3006         if (!ret) {
3007             /* peer will send again */
3008             rvt_put_ss(&qp->r_sge);
3009             goto rnr_nak;
3010         }
3011         wc.ex.imm_data = ohdr->u.rc.imm_data;
3012         wc.wc_flags = IB_WC_WITH_IMM;
3013         goto send_last;
3014 
3015     case OP(RDMA_READ_REQUEST): {
3016         struct rvt_ack_entry *e;
3017         u32 len;
3018         u8 next;
3019 
3020         if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
3021             goto nack_inv;
3022         next = qp->r_head_ack_queue + 1;
3023         /* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */
3024         if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
3025             next = 0;
3026         spin_lock_irqsave(&qp->s_lock, flags);
3027         if (unlikely(next == qp->s_acked_ack_queue)) {
3028             if (!qp->s_ack_queue[next].sent)
3029                 goto nack_inv_unlck;
3030             update_ack_queue(qp, next);
3031         }
3032         e = &qp->s_ack_queue[qp->r_head_ack_queue];
3033         release_rdma_sge_mr(e);
3034         reth = &ohdr->u.rc.reth;
3035         len = be32_to_cpu(reth->length);
3036         if (len) {
3037             u32 rkey = be32_to_cpu(reth->rkey);
3038             u64 vaddr = get_ib_reth_vaddr(reth);
3039             int ok;
3040 
3041             /* Check rkey & NAK */
3042             ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr,
3043                      rkey, IB_ACCESS_REMOTE_READ);
3044             if (unlikely(!ok))
3045                 goto nack_acc_unlck;
3046             /*
3047              * Update the next expected PSN.  We add 1 later
3048              * below, so only add the remainder here.
3049              */
3050             qp->r_psn += rvt_div_mtu(qp, len - 1);
3051         } else {
3052             e->rdma_sge.mr = NULL;
3053             e->rdma_sge.vaddr = NULL;
3054             e->rdma_sge.length = 0;
3055             e->rdma_sge.sge_length = 0;
3056         }
3057         e->opcode = opcode;
3058         e->sent = 0;
3059         e->psn = psn;
3060         e->lpsn = qp->r_psn;
3061         /*
3062          * We need to increment the MSN here instead of when we
3063          * finish sending the result since a duplicate request would
3064          * increment it more than once.
3065          */
3066         qp->r_msn++;
3067         qp->r_psn++;
3068         qp->r_state = opcode;
3069         qp->r_nak_state = 0;
3070         qp->r_head_ack_queue = next;
3071         qpriv->r_tid_alloc = qp->r_head_ack_queue;
3072 
3073         /* Schedule the send engine. */
3074         qp->s_flags |= RVT_S_RESP_PENDING;
3075         if (fecn)
3076             qp->s_flags |= RVT_S_ECN;
3077         hfi1_schedule_send(qp);
3078 
3079         spin_unlock_irqrestore(&qp->s_lock, flags);
3080         return;
3081     }
3082 
3083     case OP(COMPARE_SWAP):
3084     case OP(FETCH_ADD): {
3085         struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth;
3086         u64 vaddr = get_ib_ateth_vaddr(ateth);
3087         bool opfn = opcode == OP(COMPARE_SWAP) &&
3088             vaddr == HFI1_VERBS_E_ATOMIC_VADDR;
3089         struct rvt_ack_entry *e;
3090         atomic64_t *maddr;
3091         u64 sdata;
3092         u32 rkey;
3093         u8 next;
3094 
3095         if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
3096                  !opfn))
3097             goto nack_inv;
3098         next = qp->r_head_ack_queue + 1;
3099         if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
3100             next = 0;
3101         spin_lock_irqsave(&qp->s_lock, flags);
3102         if (unlikely(next == qp->s_acked_ack_queue)) {
3103             if (!qp->s_ack_queue[next].sent)
3104                 goto nack_inv_unlck;
3105             update_ack_queue(qp, next);
3106         }
3107         e = &qp->s_ack_queue[qp->r_head_ack_queue];
3108         release_rdma_sge_mr(e);
3109         /* Process OPFN special virtual address */
3110         if (opfn) {
3111             opfn_conn_response(qp, e, ateth);
3112             goto ack;
3113         }
3114         if (unlikely(vaddr & (sizeof(u64) - 1)))
3115             goto nack_inv_unlck;
3116         rkey = be32_to_cpu(ateth->rkey);
3117         /* Check rkey & NAK */
3118         if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
3119                       vaddr, rkey,
3120                       IB_ACCESS_REMOTE_ATOMIC)))
3121             goto nack_acc_unlck;
3122         /* Perform atomic OP and save result. */
3123         maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
3124         sdata = get_ib_ateth_swap(ateth);
3125         e->atomic_data = (opcode == OP(FETCH_ADD)) ?
3126             (u64)atomic64_add_return(sdata, maddr) - sdata :
3127             (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
3128                       get_ib_ateth_compare(ateth),
3129                       sdata);
3130         rvt_put_mr(qp->r_sge.sge.mr);
3131         qp->r_sge.num_sge = 0;
3132 ack:
3133         e->opcode = opcode;
3134         e->sent = 0;
3135         e->psn = psn;
3136         e->lpsn = psn;
3137         qp->r_msn++;
3138         qp->r_psn++;
3139         qp->r_state = opcode;
3140         qp->r_nak_state = 0;
3141         qp->r_head_ack_queue = next;
3142         qpriv->r_tid_alloc = qp->r_head_ack_queue;
3143 
3144         /* Schedule the send engine. */
3145         qp->s_flags |= RVT_S_RESP_PENDING;
3146         if (fecn)
3147             qp->s_flags |= RVT_S_ECN;
3148         hfi1_schedule_send(qp);
3149 
3150         spin_unlock_irqrestore(&qp->s_lock, flags);
3151         return;
3152     }
3153 
3154     default:
3155         /* NAK unknown opcodes. */
3156         goto nack_inv;
3157     }
3158     qp->r_psn++;
3159     qp->r_state = opcode;
3160     qp->r_ack_psn = psn;
3161     qp->r_nak_state = 0;
3162     /* Send an ACK if requested or required. */
3163     if (psn & IB_BTH_REQ_ACK || fecn) {
3164         if (packet->numpkt == 0 || fecn ||
3165             qp->r_adefered >= HFI1_PSN_CREDIT) {
3166             rc_cancel_ack(qp);
3167             goto send_ack;
3168         }
3169         qp->r_adefered++;
3170         rc_defered_ack(rcd, qp);
3171     }
3172     return;
3173 
3174 rnr_nak:
3175     qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK;
3176     qp->r_ack_psn = qp->r_psn;
3177     /* Queue RNR NAK for later */
3178     rc_defered_ack(rcd, qp);
3179     return;
3180 
3181 nack_op_err:
3182     rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
3183     qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
3184     qp->r_ack_psn = qp->r_psn;
3185     /* Queue NAK for later */
3186     rc_defered_ack(rcd, qp);
3187     return;
3188 
3189 nack_inv_unlck:
3190     spin_unlock_irqrestore(&qp->s_lock, flags);
3191 nack_inv:
3192     rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
3193     qp->r_nak_state = IB_NAK_INVALID_REQUEST;
3194     qp->r_ack_psn = qp->r_psn;
3195     /* Queue NAK for later */
3196     rc_defered_ack(rcd, qp);
3197     return;
3198 
3199 nack_acc_unlck:
3200     spin_unlock_irqrestore(&qp->s_lock, flags);
3201 nack_acc:
3202     rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
3203     qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
3204     qp->r_ack_psn = qp->r_psn;
3205 send_ack:
3206     hfi1_send_rc_ack(packet, fecn);
3207 }
3208 
3209 void hfi1_rc_hdrerr(
3210     struct hfi1_ctxtdata *rcd,
3211     struct hfi1_packet *packet,
3212     struct rvt_qp *qp)
3213 {
3214     struct hfi1_ibport *ibp = rcd_to_iport(rcd);
3215     int diff;
3216     u32 opcode;
3217     u32 psn;
3218 
3219     if (hfi1_ruc_check_hdr(ibp, packet))
3220         return;
3221 
3222     psn = ib_bth_get_psn(packet->ohdr);
3223     opcode = ib_bth_get_opcode(packet->ohdr);
3224 
3225     /* Only deal with RDMA Writes for now */
3226     if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
3227         diff = delta_psn(psn, qp->r_psn);
3228         if (!qp->r_nak_state && diff >= 0) {
3229             ibp->rvp.n_rc_seqnak++;
3230             qp->r_nak_state = IB_NAK_PSN_ERROR;
3231             /* Use the expected PSN. */
3232             qp->r_ack_psn = qp->r_psn;
3233             /*
3234              * Wait to send the sequence
3235              * NAK until all packets
3236              * in the receive queue have
3237              * been processed.
3238              * Otherwise, we end up
3239              * propagating congestion.
3240              */
3241             rc_defered_ack(rcd, qp);
3242         } /* Out of sequence NAK */
3243     } /* QP Request NAKs */
3244 }