0001 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
0002 /*
0003  * Copyright(c) 2020 - Cornelis Networks, Inc.
0004  * Copyright(c) 2015 - 2018 Intel Corporation.
0005  */
0006 
0007 #include <linux/mm.h>
0008 #include <linux/types.h>
0009 #include <linux/device.h>
0010 #include <linux/dmapool.h>
0011 #include <linux/slab.h>
0012 #include <linux/list.h>
0013 #include <linux/highmem.h>
0014 #include <linux/io.h>
0015 #include <linux/uio.h>
0016 #include <linux/rbtree.h>
0017 #include <linux/spinlock.h>
0018 #include <linux/delay.h>
0019 #include <linux/kthread.h>
0020 #include <linux/mmu_context.h>
0021 #include <linux/module.h>
0022 #include <linux/vmalloc.h>
0023 #include <linux/string.h>
0024 
0025 #include "hfi.h"
0026 #include "sdma.h"
0027 #include "mmu_rb.h"
0028 #include "user_sdma.h"
0029 #include "verbs.h"  /* for the headers */
0030 #include "common.h" /* for struct hfi1_tid_info */
0031 #include "trace.h"
0032 
0033 static uint hfi1_sdma_comp_ring_size = 128;
0034 module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
0035 MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
0036 
0037 static unsigned initial_pkt_count = 8;
0038 
0039 static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
0040 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
0041 static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
0042 static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
0043 static int pin_vector_pages(struct user_sdma_request *req,
0044                 struct user_sdma_iovec *iovec);
0045 static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
0046                    unsigned start, unsigned npages);
0047 static int check_header_template(struct user_sdma_request *req,
0048                  struct hfi1_pkt_header *hdr, u32 lrhlen,
0049                  u32 datalen);
0050 static int set_txreq_header(struct user_sdma_request *req,
0051                 struct user_sdma_txreq *tx, u32 datalen);
0052 static int set_txreq_header_ahg(struct user_sdma_request *req,
0053                 struct user_sdma_txreq *tx, u32 len);
0054 static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
0055                   struct hfi1_user_sdma_comp_q *cq,
0056                   u16 idx, enum hfi1_sdma_comp_state state,
0057                   int ret);
0058 static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
0059 static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
0060 
0061 static int defer_packet_queue(
0062     struct sdma_engine *sde,
0063     struct iowait_work *wait,
0064     struct sdma_txreq *txreq,
0065     uint seq,
0066     bool pkts_sent);
0067 static void activate_packet_queue(struct iowait *wait, int reason);
0068 static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
0069                unsigned long len);
0070 static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode);
0071 static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
0072              void *arg2, bool *stop);
0073 static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
0074 static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode);
0075 
0076 static struct mmu_rb_ops sdma_rb_ops = {
0077     .filter = sdma_rb_filter,
0078     .insert = sdma_rb_insert,
0079     .evict = sdma_rb_evict,
0080     .remove = sdma_rb_remove,
0081     .invalidate = sdma_rb_invalidate
0082 };
0083 
0084 static int defer_packet_queue(
0085     struct sdma_engine *sde,
0086     struct iowait_work *wait,
0087     struct sdma_txreq *txreq,
0088     uint seq,
0089     bool pkts_sent)
0090 {
0091     struct hfi1_user_sdma_pkt_q *pq =
0092         container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
0093 
0094     write_seqlock(&sde->waitlock);
0095     trace_hfi1_usdma_defer(pq, sde, &pq->busy);
0096     if (sdma_progress(sde, seq, txreq))
0097         goto eagain;
0098     /*
0099      * We are assuming that if the list is enqueued somewhere, it
0100      * is to the dmawait list since that is the only place where
0101      * it is supposed to be enqueued.
0102      */
0103     xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
0104     if (list_empty(&pq->busy.list)) {
0105         pq->busy.lock = &sde->waitlock;
0106         iowait_get_priority(&pq->busy);
0107         iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
0108     }
0109     write_sequnlock(&sde->waitlock);
0110     return -EBUSY;
0111 eagain:
0112     write_sequnlock(&sde->waitlock);
0113     return -EAGAIN;
0114 }
0115 
0116 static void activate_packet_queue(struct iowait *wait, int reason)
0117 {
0118     struct hfi1_user_sdma_pkt_q *pq =
0119         container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
0120 
0121     trace_hfi1_usdma_activate(pq, wait, reason);
0122     xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
0123     wake_up(&wait->wait_dma);
0124 }
0125 
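/*
 * Illustrative sketch, not part of the driver: the two callbacks above
 * implement a small DEFERRED/ACTIVE hand-off.  defer_packet_queue()
 * publishes SDMA_PKT_Q_DEFERRED with xchg() and parks the iowait on the
 * engine's dmawait list, while activate_packet_queue() publishes
 * SDMA_PKT_Q_ACTIVE and wakes the submitter blocked in
 * hfi1_user_sdma_process_request().  The helpers below show the same
 * publish-then-wake pattern on a stand-alone object (the struct and
 * function names are hypothetical).
 */
struct usdma_example_q {
    unsigned long state;            /* holds an SDMA_PKT_Q_* value */
    wait_queue_head_t wait;
};

static void usdma_example_defer(struct usdma_example_q *q)
{
    /* atomically record that the queue is waiting for the engine */
    xchg(&q->state, SDMA_PKT_Q_DEFERRED);
}

static void usdma_example_activate(struct usdma_example_q *q)
{
    /* publish the new state, then wake anyone sleeping on it */
    xchg(&q->state, SDMA_PKT_Q_ACTIVE);
    wake_up(&q->wait);
}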
0126 int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
0127                 struct hfi1_filedata *fd)
0128 {
0129     int ret = -ENOMEM;
0130     char buf[64];
0131     struct hfi1_devdata *dd;
0132     struct hfi1_user_sdma_comp_q *cq;
0133     struct hfi1_user_sdma_pkt_q *pq;
0134 
0135     if (!uctxt || !fd)
0136         return -EBADF;
0137 
0138     if (!hfi1_sdma_comp_ring_size)
0139         return -EINVAL;
0140 
0141     dd = uctxt->dd;
0142 
0143     pq = kzalloc(sizeof(*pq), GFP_KERNEL);
0144     if (!pq)
0145         return -ENOMEM;
0146     pq->dd = dd;
0147     pq->ctxt = uctxt->ctxt;
0148     pq->subctxt = fd->subctxt;
0149     pq->n_max_reqs = hfi1_sdma_comp_ring_size;
0150     atomic_set(&pq->n_reqs, 0);
0151     init_waitqueue_head(&pq->wait);
0152     atomic_set(&pq->n_locked, 0);
0153 
0154     iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
0155             activate_packet_queue, NULL, NULL);
0156     pq->reqidx = 0;
0157 
0158     pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
0159                sizeof(*pq->reqs),
0160                GFP_KERNEL);
0161     if (!pq->reqs)
0162         goto pq_reqs_nomem;
0163 
0164     pq->req_in_use = bitmap_zalloc(hfi1_sdma_comp_ring_size, GFP_KERNEL);
0165     if (!pq->req_in_use)
0166         goto pq_reqs_no_in_use;
0167 
0168     snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
0169          fd->subctxt);
0170     pq->txreq_cache = kmem_cache_create(buf,
0171                         sizeof(struct user_sdma_txreq),
0172                         L1_CACHE_BYTES,
0173                         SLAB_HWCACHE_ALIGN,
0174                         NULL);
0175     if (!pq->txreq_cache) {
0176         dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
0177                uctxt->ctxt);
0178         goto pq_txreq_nomem;
0179     }
0180 
0181     cq = kzalloc(sizeof(*cq), GFP_KERNEL);
0182     if (!cq)
0183         goto cq_nomem;
0184 
0185     cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
0186                  * hfi1_sdma_comp_ring_size));
0187     if (!cq->comps)
0188         goto cq_comps_nomem;
0189 
0190     cq->nentries = hfi1_sdma_comp_ring_size;
0191 
0192     ret = hfi1_mmu_rb_register(pq, &sdma_rb_ops, dd->pport->hfi1_wq,
0193                    &pq->handler);
0194     if (ret) {
0195         dd_dev_err(dd, "Failed to register with MMU %d", ret);
0196         goto pq_mmu_fail;
0197     }
0198 
0199     rcu_assign_pointer(fd->pq, pq);
0200     fd->cq = cq;
0201 
0202     return 0;
0203 
0204 pq_mmu_fail:
0205     vfree(cq->comps);
0206 cq_comps_nomem:
0207     kfree(cq);
0208 cq_nomem:
0209     kmem_cache_destroy(pq->txreq_cache);
0210 pq_txreq_nomem:
0211     bitmap_free(pq->req_in_use);
0212 pq_reqs_no_in_use:
0213     kfree(pq->reqs);
0214 pq_reqs_nomem:
0215     kfree(pq);
0216 
0217     return ret;
0218 }
0219 
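/*
 * Illustrative sketch, not part of the driver: the error path of
 * hfi1_user_sdma_alloc_queues() above unwinds in the exact reverse order
 * of allocation, so each label releases only what was successfully set up
 * before the failure and nothing is leaked or double-freed.  A minimal
 * version of the same pattern (function and names are hypothetical):
 */
static int usdma_example_alloc_pair(void **a, void **b)
{
    *a = kzalloc(64, GFP_KERNEL);
    if (!*a)
        return -ENOMEM;

    *b = kzalloc(64, GFP_KERNEL);
    if (!*b)
        goto free_a;            /* undo only the first allocation */

    return 0;

free_a:
    kfree(*a);
    *a = NULL;
    return -ENOMEM;
}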
0220 static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
0221 {
0222     unsigned long flags;
0223     seqlock_t *lock = pq->busy.lock;
0224 
0225     if (!lock)
0226         return;
0227     write_seqlock_irqsave(lock, flags);
0228     if (!list_empty(&pq->busy.list)) {
0229         list_del_init(&pq->busy.list);
0230         pq->busy.lock = NULL;
0231     }
0232     write_sequnlock_irqrestore(lock, flags);
0233 }
0234 
0235 int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
0236                    struct hfi1_ctxtdata *uctxt)
0237 {
0238     struct hfi1_user_sdma_pkt_q *pq;
0239 
0240     trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);
0241 
0242     spin_lock(&fd->pq_rcu_lock);
0243     pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
0244                     lockdep_is_held(&fd->pq_rcu_lock));
0245     if (pq) {
0246         rcu_assign_pointer(fd->pq, NULL);
0247         spin_unlock(&fd->pq_rcu_lock);
0248         synchronize_srcu(&fd->pq_srcu);
0249         /* at this point there can be no more new requests */
0250         if (pq->handler)
0251             hfi1_mmu_rb_unregister(pq->handler);
0252         iowait_sdma_drain(&pq->busy);
0253         /* Wait until all requests have been freed. */
0254         wait_event_interruptible(
0255             pq->wait,
0256             !atomic_read(&pq->n_reqs));
0257         kfree(pq->reqs);
0258         bitmap_free(pq->req_in_use);
0259         kmem_cache_destroy(pq->txreq_cache);
0260         flush_pq_iowait(pq);
0261         kfree(pq);
0262     } else {
0263         spin_unlock(&fd->pq_rcu_lock);
0264     }
0265     if (fd->cq) {
0266         vfree(fd->cq->comps);
0267         kfree(fd->cq);
0268         fd->cq = NULL;
0269     }
0270     return 0;
0271 }
0272 
0273 static u8 dlid_to_selector(u16 dlid)
0274 {
0275     static u8 mapping[256];
0276     static int initialized;
0277     static u8 next;
0278     int hash;
0279 
0280     if (!initialized) {
0281         memset(mapping, 0xFF, 256);
0282         initialized = 1;
0283     }
0284 
0285     hash = ((dlid >> 8) ^ dlid) & 0xFF;
0286     if (mapping[hash] == 0xFF) {
0287         mapping[hash] = next;
0288         next = (next + 1) & 0x7F;
0289     }
0290 
0291     return mapping[hash];
0292 }
0293 
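/*
 * Illustrative sketch, not part of the driver: dlid_to_selector() folds
 * the 16-bit DLID into an 8-bit table index by XORing its two bytes and
 * then hands out selector values lazily, so DLIDs that hash to the same
 * slot share a selector.  For example, DLID 0x1234 folds to
 * 0x12 ^ 0x34 = 0x26.  The fold on its own:
 */
static u8 usdma_example_fold_dlid(u16 dlid)
{
    return ((dlid >> 8) ^ dlid) & 0xFF;
}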
0294 /**
0295  * hfi1_user_sdma_process_request() - Process and start a user sdma request
0296  * @fd: valid file descriptor
0297  * @iovec: array of io vectors to process
0298  * @dim: overall iovec array size
0299  * @count: number of io vector array entries processed
0300  */
0301 int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
0302                    struct iovec *iovec, unsigned long dim,
0303                    unsigned long *count)
0304 {
0305     int ret = 0, i;
0306     struct hfi1_ctxtdata *uctxt = fd->uctxt;
0307     struct hfi1_user_sdma_pkt_q *pq =
0308         srcu_dereference(fd->pq, &fd->pq_srcu);
0309     struct hfi1_user_sdma_comp_q *cq = fd->cq;
0310     struct hfi1_devdata *dd = pq->dd;
0311     unsigned long idx = 0;
0312     u8 pcount = initial_pkt_count;
0313     struct sdma_req_info info;
0314     struct user_sdma_request *req;
0315     u8 opcode, sc, vl;
0316     u16 pkey;
0317     u32 slid;
0318     u16 dlid;
0319     u32 selector;
0320 
0321     if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
0322         hfi1_cdbg(
0323            SDMA,
0324            "[%u:%u:%u] First vector not big enough for header %lu/%lu",
0325            dd->unit, uctxt->ctxt, fd->subctxt,
0326            iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
0327         return -EINVAL;
0328     }
0329     ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
0330     if (ret) {
0331         hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
0332               dd->unit, uctxt->ctxt, fd->subctxt, ret);
0333         return -EFAULT;
0334     }
0335 
0336     trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
0337                      (u16 *)&info);
0338     if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
0339         hfi1_cdbg(SDMA,
0340               "[%u:%u:%u:%u] Invalid comp index",
0341               dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
0342         return -EINVAL;
0343     }
0344 
0345     /*
0346      * Sanity check the header io vector count.  Need at least 1 vector
0347      * (header) and cannot be larger than the actual io vector count.
0348      */
0349     if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
0350         hfi1_cdbg(SDMA,
0351               "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
0352               dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
0353               req_iovcnt(info.ctrl), dim);
0354         return -EINVAL;
0355     }
0356 
0357     if (!info.fragsize) {
0358         hfi1_cdbg(SDMA,
0359               "[%u:%u:%u:%u] Request does not specify fragsize",
0360               dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
0361         return -EINVAL;
0362     }
0363 
0364     /* Try to claim the request. */
0365     if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
0366         hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
0367               dd->unit, uctxt->ctxt, fd->subctxt,
0368               info.comp_idx);
0369         return -EBADSLT;
0370     }
0371     /*
0372      * All safety checks have been done and this request has been claimed.
0373      */
0374     trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
0375                          info.comp_idx);
0376     req = pq->reqs + info.comp_idx;
0377     req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
0378     req->data_len  = 0;
0379     req->pq = pq;
0380     req->cq = cq;
0381     req->ahg_idx = -1;
0382     req->iov_idx = 0;
0383     req->sent = 0;
0384     req->seqnum = 0;
0385     req->seqcomp = 0;
0386     req->seqsubmitted = 0;
0387     req->tids = NULL;
0388     req->has_error = 0;
0389     INIT_LIST_HEAD(&req->txps);
0390 
0391     memcpy(&req->info, &info, sizeof(info));
0392 
0393     /* The request is initialized, count it */
0394     atomic_inc(&pq->n_reqs);
0395 
0396     if (req_opcode(info.ctrl) == EXPECTED) {
0397         /* expected must have a TID info and at least one data vector */
0398         if (req->data_iovs < 2) {
0399             SDMA_DBG(req,
0400                  "Not enough vectors for expected request");
0401             ret = -EINVAL;
0402             goto free_req;
0403         }
0404         req->data_iovs--;
0405     }
0406 
0407     if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
0408         SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
0409              MAX_VECTORS_PER_REQ);
0410         ret = -EINVAL;
0411         goto free_req;
0412     }
0413     /* Copy the header from the user buffer */
0414     ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
0415                  sizeof(req->hdr));
0416     if (ret) {
0417         SDMA_DBG(req, "Failed to copy header template (%d)", ret);
0418         ret = -EFAULT;
0419         goto free_req;
0420     }
0421 
0422     /* If Static rate control is not enabled, sanitize the header. */
0423     if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
0424         req->hdr.pbc[2] = 0;
0425 
0426     /* Validate the opcode. Do not trust packets from user space blindly. */
0427     opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
0428     if ((opcode & USER_OPCODE_CHECK_MASK) !=
0429          USER_OPCODE_CHECK_VAL) {
0430         SDMA_DBG(req, "Invalid opcode (%d)", opcode);
0431         ret = -EINVAL;
0432         goto free_req;
0433     }
0434     /*
0435      * Validate the vl. Do not trust packets from user space blindly.
0436      * VL comes from PBC, SC comes from LRH, and the VL needs to
0437      * match the SC look up.
0438      */
0439     vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
0440     sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
0441           (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
0442     if (vl >= dd->pport->vls_operational ||
0443         vl != sc_to_vlt(dd, sc)) {
0444         SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
0445         ret = -EINVAL;
0446         goto free_req;
0447     }
0448 
0449     /* Checking P_KEY for requests from user-space */
0450     pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
0451     slid = be16_to_cpu(req->hdr.lrh[3]);
0452     if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
0453         ret = -EINVAL;
0454         goto free_req;
0455     }
0456 
0457     /*
0458      * Also check the LRH.lnh. If it says the next header is a GRH, then
0459      * the RXE parsing will be off and will land in the middle of the KDETH
0460      * or miss it entirely.
0461      */
0462     if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
0463         SDMA_DBG(req, "User tried to pass in a GRH");
0464         ret = -EINVAL;
0465         goto free_req;
0466     }
0467 
0468     req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
0469     /*
0470      * Calculate the initial TID offset based on the values of
0471      * KDETH.OFFSET and KDETH.OM that are passed in.
0472      */
0473     req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
0474         (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
0475          KDETH_OM_LARGE : KDETH_OM_SMALL);
0476     trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
0477                            info.comp_idx, req->tidoffset);
0478     idx++;
0479 
0480     /* Save all the IO vector structures */
0481     for (i = 0; i < req->data_iovs; i++) {
0482         req->iovs[i].offset = 0;
0483         INIT_LIST_HEAD(&req->iovs[i].list);
0484         memcpy(&req->iovs[i].iov,
0485                iovec + idx++,
0486                sizeof(req->iovs[i].iov));
0487         ret = pin_vector_pages(req, &req->iovs[i]);
0488         if (ret) {
0489             req->data_iovs = i;
0490             goto free_req;
0491         }
0492         req->data_len += req->iovs[i].iov.iov_len;
0493     }
0494     trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
0495                      info.comp_idx, req->data_len);
0496     if (pcount > req->info.npkts)
0497         pcount = req->info.npkts;
0498     /*
0499      * Copy any TID info
0500      * User space will provide the TID info only when the
0501      * request type is EXPECTED. This is true even if there is
0502      * only one packet in the request and the header is already
0503      * set up. The reason for the singular TID case is that the
0504      * driver needs to perform safety checks.
0505      */
0506     if (req_opcode(req->info.ctrl) == EXPECTED) {
0507         u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
0508         u32 *tmp;
0509 
0510         if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
0511             ret = -EINVAL;
0512             goto free_req;
0513         }
0514 
0515         /*
0516          * We have to copy all of the tids because they may vary
0517          * in size and, therefore, the TID count might not be
0518          * equal to the pkt count. However, there is no way to
0519          * tell at this point.
0520          */
0521         tmp = memdup_user(iovec[idx].iov_base,
0522                   ntids * sizeof(*req->tids));
0523         if (IS_ERR(tmp)) {
0524             ret = PTR_ERR(tmp);
0525             SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
0526                  ntids, ret);
0527             goto free_req;
0528         }
0529         req->tids = tmp;
0530         req->n_tids = ntids;
0531         req->tididx = 0;
0532         idx++;
0533     }
0534 
0535     dlid = be16_to_cpu(req->hdr.lrh[1]);
0536     selector = dlid_to_selector(dlid);
0537     selector += uctxt->ctxt + fd->subctxt;
0538     req->sde = sdma_select_user_engine(dd, selector, vl);
0539 
0540     if (!req->sde || !sdma_running(req->sde)) {
0541         ret = -ECOMM;
0542         goto free_req;
0543     }
0544 
0545     /* We don't need an AHG entry if the request contains only one packet */
0546     if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
0547         req->ahg_idx = sdma_ahg_alloc(req->sde);
0548 
0549     set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
0550     pq->state = SDMA_PKT_Q_ACTIVE;
0551 
0552     /*
0553      * This is a somewhat blocking send implementation.
0554      * The driver will block the caller until all packets of the
0555      * request have been submitted to the SDMA engine. However, it
0556      * will not wait for send completions.
0557      */
0558     while (req->seqsubmitted != req->info.npkts) {
0559         ret = user_sdma_send_pkts(req, pcount);
0560         if (ret < 0) {
0561             int we_ret;
0562 
0563             if (ret != -EBUSY)
0564                 goto free_req;
0565             we_ret = wait_event_interruptible_timeout(
0566                 pq->busy.wait_dma,
0567                 pq->state == SDMA_PKT_Q_ACTIVE,
0568                 msecs_to_jiffies(
0569                     SDMA_IOWAIT_TIMEOUT));
0570             trace_hfi1_usdma_we(pq, we_ret);
0571             if (we_ret <= 0)
0572                 flush_pq_iowait(pq);
0573         }
0574     }
0575     *count += idx;
0576     return 0;
0577 free_req:
0578     /*
0579      * If seqsubmitted == npkts, the completion routine controls the
0580      * final state.  If seqsubmitted < npkts, wait for any
0581      * outstanding packets to finish before cleaning up.
0582      */
0583     if (req->seqsubmitted < req->info.npkts) {
0584         if (req->seqsubmitted)
0585             wait_event(pq->busy.wait_dma,
0586                    (req->seqcomp == req->seqsubmitted - 1));
0587         user_sdma_free_request(req, true);
0588         pq_update(pq);
0589         set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
0590     }
0591     return ret;
0592 }
0593 
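/*
 * Illustrative sketch, not part of the driver:
 * hfi1_user_sdma_process_request() above claims the completion-ring slot
 * named by info.comp_idx with test_and_set_bit() on pq->req_in_use, which
 * both reserves the slot and rejects a second submitter racing for the
 * same index (-EBADSLT); user_sdma_free_request() later releases it with
 * clear_bit().  The claim/release pattern in isolation (names and the
 * ring size below are hypothetical):
 */
#define USDMA_EXAMPLE_SLOTS 128

static DECLARE_BITMAP(usdma_example_in_use, USDMA_EXAMPLE_SLOTS);

static int usdma_example_claim_slot(u16 idx)
{
    if (idx >= USDMA_EXAMPLE_SLOTS)
        return -EINVAL;
    if (test_and_set_bit(idx, usdma_example_in_use))
        return -EBADSLT;        /* already owned by another request */
    return 0;
}

static void usdma_example_release_slot(u16 idx)
{
    clear_bit(idx, usdma_example_in_use);
}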
0594 static inline u32 compute_data_length(struct user_sdma_request *req,
0595                       struct user_sdma_txreq *tx)
0596 {
0597     /*
0598      * Determine the proper size of the packet data.
0599      * The size of the data of the first packet is in the header
0600      * template. However, it includes the header and ICRC, which need
0601      * to be subtracted.
0602      * The minimum representable packet data length in a header is 4 bytes,
0603      * therefore, when the requested data length is less than 4 bytes,
0604      * there's only one packet, and the packet data length equals the
0605      * request data length.
0606      * The size of the remaining packets is the minimum of the frag
0607      * size (MTU) or remaining data in the request.
0608      */
0609     u32 len;
0610 
0611     if (!req->seqnum) {
0612         if (req->data_len < sizeof(u32))
0613             len = req->data_len;
0614         else
0615             len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
0616                    (sizeof(tx->hdr) - 4));
0617     } else if (req_opcode(req->info.ctrl) == EXPECTED) {
0618         u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
0619             PAGE_SIZE;
0620         /*
0621          * Get the data length based on the remaining space in the
0622          * TID pair.
0623          */
0624         len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
0625         /* If we've filled up the TID pair, move to the next one. */
0626         if (unlikely(!len) && ++req->tididx < req->n_tids &&
0627             req->tids[req->tididx]) {
0628             tidlen = EXP_TID_GET(req->tids[req->tididx],
0629                          LEN) * PAGE_SIZE;
0630             req->tidoffset = 0;
0631             len = min_t(u32, tidlen, req->info.fragsize);
0632         }
0633         /*
0634          * Since the TID pairs map entire pages, make sure that we
0635          * are not going to try to send more data than we have
0636          * remaining.
0637          */
0638         len = min(len, req->data_len - req->sent);
0639     } else {
0640         len = min(req->data_len - req->sent, (u32)req->info.fragsize);
0641     }
0642     trace_hfi1_sdma_user_compute_length(req->pq->dd,
0643                         req->pq->ctxt,
0644                         req->pq->subctxt,
0645                         req->info.comp_idx,
0646                         len);
0647     return len;
0648 }
0649 
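/*
 * Illustrative sketch, not part of the driver: for the first packet the
 * payload size is recovered from the user-supplied header.  LRH.PktLen
 * (hdr.lrh[2]) counts 32-bit words covering everything after the PBC plus
 * the 4-byte ICRC, so converting it to bytes and removing the header
 * (minus the 8-byte PBC) and the ICRC leaves the payload:
 *
 *      payload = (PktLen << 2) - (sizeof(hdr) - sizeof(hdr.pbc)) - 4
 *              = (PktLen << 2) - (sizeof(hdr) - 4)
 *
 * which is the expression compute_data_length() uses above.  The helper
 * assumes only the 8-byte PBC:
 */
static u32 usdma_example_first_pkt_payload(const struct hfi1_pkt_header *hdr)
{
    u32 lrh_bytes = be16_to_cpu(hdr->lrh[2]) << 2;

    return lrh_bytes - (sizeof(*hdr) - 4);
}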
0650 static inline u32 pad_len(u32 len)
0651 {
0652     if (len & (sizeof(u32) - 1))
0653         len += sizeof(u32) - (len & (sizeof(u32) - 1));
0654     return len;
0655 }
0656 
0657 static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
0658 {
0659     /* (Size of complete header - size of PBC) + 4B ICRC + data length */
0660     return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
0661 }
0662 
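/*
 * Illustrative sketch, not part of the driver: pad_len() rounds the
 * payload up to the next 4-byte boundary (9 -> 12, 12 -> 12), and
 * get_lrh_len() then adds the header without the PBC (which the LRH
 * length does not cover) plus the 4-byte ICRC.  The kernel's ALIGN()
 * macro expresses the same rounding:
 */
static u32 usdma_example_pad_len(u32 len)
{
    return ALIGN(len, sizeof(u32)); /* equivalent to pad_len() above */
}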
0663 static int user_sdma_txadd_ahg(struct user_sdma_request *req,
0664                    struct user_sdma_txreq *tx,
0665                    u32 datalen)
0666 {
0667     int ret;
0668     u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
0669     u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
0670     struct hfi1_user_sdma_pkt_q *pq = req->pq;
0671 
0672     /*
0673      * Copy the request header into the tx header
0674      * because the HW needs a cacheline-aligned
0675      * address.
0676      * This copy could be optimized out if the hdr
0677      * member of user_sdma_request were also
0678      * cacheline aligned.
0679      */
0680     memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
0681     if (PBC2LRH(pbclen) != lrhlen) {
0682         pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
0683         tx->hdr.pbc[0] = cpu_to_le16(pbclen);
0684     }
0685     ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
0686     if (ret)
0687         return ret;
0688     ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
0689                   sizeof(tx->hdr) + datalen, req->ahg_idx,
0690                   0, NULL, 0, user_sdma_txreq_cb);
0691     if (ret)
0692         return ret;
0693     ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
0694     if (ret)
0695         sdma_txclean(pq->dd, &tx->txreq);
0696     return ret;
0697 }
0698 
0699 static int user_sdma_txadd(struct user_sdma_request *req,
0700                struct user_sdma_txreq *tx,
0701                struct user_sdma_iovec *iovec, u32 datalen,
0702                u32 *queued_ptr, u32 *data_sent_ptr,
0703                u64 *iov_offset_ptr)
0704 {
0705     int ret;
0706     unsigned int pageidx, len;
0707     unsigned long base, offset;
0708     u64 iov_offset = *iov_offset_ptr;
0709     u32 queued = *queued_ptr, data_sent = *data_sent_ptr;
0710     struct hfi1_user_sdma_pkt_q *pq = req->pq;
0711 
0712     base = (unsigned long)iovec->iov.iov_base;
0713     offset = offset_in_page(base + iovec->offset + iov_offset);
0714     pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >>
0715            PAGE_SHIFT);
0716     len = offset + req->info.fragsize > PAGE_SIZE ?
0717         PAGE_SIZE - offset : req->info.fragsize;
0718     len = min((datalen - queued), len);
0719     ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx],
0720                   offset, len);
0721     if (ret) {
0722         SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret);
0723         return ret;
0724     }
0725     iov_offset += len;
0726     queued += len;
0727     data_sent += len;
0728     if (unlikely(queued < datalen && pageidx == iovec->npages &&
0729              req->iov_idx < req->data_iovs - 1)) {
0730         iovec->offset += iov_offset;
0731         iovec = &req->iovs[++req->iov_idx];
0732         iov_offset = 0;
0733     }
0734 
0735     *queued_ptr = queued;
0736     *data_sent_ptr = data_sent;
0737     *iov_offset_ptr = iov_offset;
0738     return ret;
0739 }
0740 
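/*
 * Illustrative sketch, not part of the driver: user_sdma_txadd() above
 * turns a byte position within the user iovec into a (page index, offset
 * within page) pair so that each fragment can be handed to the DMA engine
 * one pinned page at a time.  The same arithmetic in isolation:
 */
static void usdma_example_locate(unsigned long base, unsigned long pos,
                                 unsigned int *pageidx, unsigned int *pageoff)
{
    unsigned long addr = base + pos;

    /* byte offset inside the page that contains addr */
    *pageoff = offset_in_page(addr);
    /* index of that page relative to the page containing base */
    *pageidx = (addr - (base & PAGE_MASK)) >> PAGE_SHIFT;
}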
0741 static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
0742 {
0743     int ret = 0;
0744     u16 count;
0745     unsigned npkts = 0;
0746     struct user_sdma_txreq *tx = NULL;
0747     struct hfi1_user_sdma_pkt_q *pq = NULL;
0748     struct user_sdma_iovec *iovec = NULL;
0749 
0750     if (!req->pq)
0751         return -EINVAL;
0752 
0753     pq = req->pq;
0754 
0755     /* If tx completion has reported an error, we are done. */
0756     if (READ_ONCE(req->has_error))
0757         return -EFAULT;
0758 
0759     /*
0760      * Check if we might have sent the entire request already
0761      */
0762     if (unlikely(req->seqnum == req->info.npkts)) {
0763         if (!list_empty(&req->txps))
0764             goto dosend;
0765         return ret;
0766     }
0767 
0768     if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
0769         maxpkts = req->info.npkts - req->seqnum;
0770 
0771     while (npkts < maxpkts) {
0772         u32 datalen = 0, queued = 0, data_sent = 0;
0773         u64 iov_offset = 0;
0774 
0775         /*
0776          * Check whether any of the completions have come back
0777          * with errors. If so, we are not going to process any
0778          * more packets from this request.
0779          */
0780         if (READ_ONCE(req->has_error))
0781             return -EFAULT;
0782 
0783         tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
0784         if (!tx)
0785             return -ENOMEM;
0786 
0787         tx->flags = 0;
0788         tx->req = req;
0789         INIT_LIST_HEAD(&tx->list);
0790 
0791         /*
0792          * For the last packet set the ACK request
0793          * and disable header suppression.
0794          */
0795         if (req->seqnum == req->info.npkts - 1)
0796             tx->flags |= (TXREQ_FLAGS_REQ_ACK |
0797                       TXREQ_FLAGS_REQ_DISABLE_SH);
0798 
0799         /*
0800          * Calculate the payload size - this is min of the fragment
0801          * (MTU) size or the remaining bytes in the request but only
0802          * if we have payload data.
0803          */
0804         if (req->data_len) {
0805             iovec = &req->iovs[req->iov_idx];
0806             if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
0807                 if (++req->iov_idx == req->data_iovs) {
0808                     ret = -EFAULT;
0809                     goto free_tx;
0810                 }
0811                 iovec = &req->iovs[req->iov_idx];
0812                 WARN_ON(iovec->offset);
0813             }
0814 
0815             datalen = compute_data_length(req, tx);
0816 
0817             /*
0818              * Disable header suppression for the payload <= 8DWS.
0819              * If there is an uncorrectable error in the receive
0820              * data FIFO when the received payload size is less than
0821              * or equal to 8DWS, then RxDmaDataFifoRdUncErr is not
0822              * reported.  Instead, RHF.EccErr is set if the header
0823              * is not suppressed.
0824              */
0825             if (!datalen) {
0826                 SDMA_DBG(req,
0827                      "Request has data but pkt len is 0");
0828                 ret = -EFAULT;
0829                 goto free_tx;
0830             } else if (datalen <= 32) {
0831                 tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
0832             }
0833         }
0834 
0835         if (req->ahg_idx >= 0) {
0836             if (!req->seqnum) {
0837                 ret = user_sdma_txadd_ahg(req, tx, datalen);
0838                 if (ret)
0839                     goto free_tx;
0840             } else {
0841                 int changes;
0842 
0843                 changes = set_txreq_header_ahg(req, tx,
0844                                    datalen);
0845                 if (changes < 0) {
0846                     ret = changes;
0847                     goto free_tx;
0848                 }
0849             }
0850         } else {
0851             ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
0852                       datalen, user_sdma_txreq_cb);
0853             if (ret)
0854                 goto free_tx;
0855             /*
0856              * Modify the header for this packet. This only needs
0857              * to be done if we are not going to use AHG. Otherwise,
0858              * the HW will do it based on the changes we gave it
0859              * during sdma_txinit_ahg().
0860              */
0861             ret = set_txreq_header(req, tx, datalen);
0862             if (ret)
0863                 goto free_txreq;
0864         }
0865 
0866         /*
0867          * If the request contains any data vectors, add up to
0868          * fragsize bytes to the descriptor.
0869          */
0870         while (queued < datalen &&
0871                (req->sent + data_sent) < req->data_len) {
0872             ret = user_sdma_txadd(req, tx, iovec, datalen,
0873                           &queued, &data_sent, &iov_offset);
0874             if (ret)
0875                 goto free_txreq;
0876         }
0877         /*
0878          * The txreq was submitted successfully so we can update
0879          * the counters.
0880          */
0881         req->koffset += datalen;
0882         if (req_opcode(req->info.ctrl) == EXPECTED)
0883             req->tidoffset += datalen;
0884         req->sent += data_sent;
0885         if (req->data_len)
0886             iovec->offset += iov_offset;
0887         list_add_tail(&tx->txreq.list, &req->txps);
0888         /*
0889          * It is important to increment this here as it is used to
0890          * generate the BTH.PSN and, therefore, can't be bulk-updated
0891          * outside of the loop.
0892          */
0893         tx->seqnum = req->seqnum++;
0894         npkts++;
0895     }
0896 dosend:
0897     ret = sdma_send_txlist(req->sde,
0898                    iowait_get_ib_work(&pq->busy),
0899                    &req->txps, &count);
0900     req->seqsubmitted += count;
0901     if (req->seqsubmitted == req->info.npkts) {
0902         /*
0903          * The txreq has already been submitted to the HW queue
0904          * so we can free the AHG entry now. Corruption will not
0905          * happen due to the sequential manner in which
0906          * descriptors are processed.
0907          */
0908         if (req->ahg_idx >= 0)
0909             sdma_ahg_free(req->sde, req->ahg_idx);
0910     }
0911     return ret;
0912 
0913 free_txreq:
0914     sdma_txclean(pq->dd, &tx->txreq);
0915 free_tx:
0916     kmem_cache_free(pq->txreq_cache, tx);
0917     return ret;
0918 }
0919 
0920 static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
0921 {
0922     struct evict_data evict_data;
0923 
0924     evict_data.cleared = 0;
0925     evict_data.target = npages;
0926     hfi1_mmu_rb_evict(pq->handler, &evict_data);
0927     return evict_data.cleared;
0928 }
0929 
0930 static int pin_sdma_pages(struct user_sdma_request *req,
0931               struct user_sdma_iovec *iovec,
0932               struct sdma_mmu_node *node,
0933               int npages)
0934 {
0935     int pinned, cleared;
0936     struct page **pages;
0937     struct hfi1_user_sdma_pkt_q *pq = req->pq;
0938 
0939     pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
0940     if (!pages)
0941         return -ENOMEM;
0942     memcpy(pages, node->pages, node->npages * sizeof(*pages));
0943 
0944     npages -= node->npages;
0945 retry:
0946     if (!hfi1_can_pin_pages(pq->dd, current->mm,
0947                 atomic_read(&pq->n_locked), npages)) {
0948         cleared = sdma_cache_evict(pq, npages);
0949         if (cleared >= npages)
0950             goto retry;
0951     }
0952     pinned = hfi1_acquire_user_pages(current->mm,
0953                      ((unsigned long)iovec->iov.iov_base +
0954                      (node->npages * PAGE_SIZE)), npages, 0,
0955                      pages + node->npages);
0956     if (pinned < 0) {
0957         kfree(pages);
0958         return pinned;
0959     }
0960     if (pinned != npages) {
0961         unpin_vector_pages(current->mm, pages, node->npages, pinned);
0962         return -EFAULT;
0963     }
0964     kfree(node->pages);
0965     node->rb.len = iovec->iov.iov_len;
0966     node->pages = pages;
0967     atomic_add(pinned, &pq->n_locked);
0968     return pinned;
0969 }
0970 
0971 static void unpin_sdma_pages(struct sdma_mmu_node *node)
0972 {
0973     if (node->npages) {
0974         unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0,
0975                    node->npages);
0976         atomic_sub(node->npages, &node->pq->n_locked);
0977     }
0978 }
0979 
0980 static int pin_vector_pages(struct user_sdma_request *req,
0981                 struct user_sdma_iovec *iovec)
0982 {
0983     int ret = 0, pinned, npages;
0984     struct hfi1_user_sdma_pkt_q *pq = req->pq;
0985     struct sdma_mmu_node *node = NULL;
0986     struct mmu_rb_node *rb_node;
0987     struct iovec *iov;
0988     bool extracted;
0989 
0990     extracted =
0991         hfi1_mmu_rb_remove_unless_exact(pq->handler,
0992                         (unsigned long)
0993                         iovec->iov.iov_base,
0994                         iovec->iov.iov_len, &rb_node);
0995     if (rb_node) {
0996         node = container_of(rb_node, struct sdma_mmu_node, rb);
0997         if (!extracted) {
0998             atomic_inc(&node->refcount);
0999             iovec->pages = node->pages;
1000             iovec->npages = node->npages;
1001             iovec->node = node;
1002             return 0;
1003         }
1004     }
1005 
1006     if (!node) {
1007         node = kzalloc(sizeof(*node), GFP_KERNEL);
1008         if (!node)
1009             return -ENOMEM;
1010 
1011         node->rb.addr = (unsigned long)iovec->iov.iov_base;
1012         node->pq = pq;
1013         atomic_set(&node->refcount, 0);
1014     }
1015 
1016     iov = &iovec->iov;
1017     npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len);
1018     if (node->npages < npages) {
1019         pinned = pin_sdma_pages(req, iovec, node, npages);
1020         if (pinned < 0) {
1021             ret = pinned;
1022             goto bail;
1023         }
1024         node->npages += pinned;
1025         npages = node->npages;
1026     }
1027     iovec->pages = node->pages;
1028     iovec->npages = npages;
1029     iovec->node = node;
1030 
1031     ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
1032     if (ret) {
1033         iovec->node = NULL;
1034         goto bail;
1035     }
1036     return 0;
1037 bail:
1038     unpin_sdma_pages(node);
1039     kfree(node);
1040     return ret;
1041 }
1042 
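/*
 * Illustrative sketch, not part of the driver: pin_vector_pages() above
 * sizes its pinning with num_user_pages(), i.e. the number of PAGE_SIZE
 * pages the user buffer touches once the start offset within the first
 * page is accounted for.  A 100-byte buffer that begins 4000 bytes into a
 * 4 KiB page spans two pages, for instance.  Equivalent arithmetic:
 */
static unsigned int usdma_example_span_pages(unsigned long addr, size_t len)
{
    unsigned long first = addr & PAGE_MASK;
    unsigned long last = PAGE_ALIGN(addr + len);

    return (last - first) >> PAGE_SHIFT;
}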
1043 static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
1044                    unsigned start, unsigned npages)
1045 {
1046     hfi1_release_user_pages(mm, pages + start, npages, false);
1047     kfree(pages);
1048 }
1049 
1050 static int check_header_template(struct user_sdma_request *req,
1051                  struct hfi1_pkt_header *hdr, u32 lrhlen,
1052                  u32 datalen)
1053 {
1054     /*
1055      * Perform safety checks for any type of packet:
1056      *    - transfer size is a multiple of 64 bytes
1057      *    - packet length is a multiple of 4 bytes
1058      *    - packet length is not larger than the MTU size
1059      *
1060      * These checks are only done for the first packet of the
1061      * transfer since the header is "given" to us by user space.
1062      * For the remainder of the packets we compute the values.
1063      */
1064     if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
1065         lrhlen > get_lrh_len(*hdr, req->info.fragsize))
1066         return -EINVAL;
1067 
1068     if (req_opcode(req->info.ctrl) == EXPECTED) {
1069         /*
1070          * The header is checked only on the first packet. Furthermore,
1071          * we ensure that at least one TID entry is copied when the
1072          * request is submitted. Therefore, we don't have to verify that
1073          * tididx points to something sane.
1074          */
1075         u32 tidval = req->tids[req->tididx],
1076             tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
1077             tididx = EXP_TID_GET(tidval, IDX),
1078             tidctrl = EXP_TID_GET(tidval, CTRL),
1079             tidoff;
1080         __le32 kval = hdr->kdeth.ver_tid_offset;
1081 
1082         tidoff = KDETH_GET(kval, OFFSET) *
1083               (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
1084                KDETH_OM_LARGE : KDETH_OM_SMALL);
1085         /*
1086          * Expected receive packets have the following
1087          * additional checks:
1088          *     - offset is not larger than the TID size
1089          *     - TIDCtrl values match between header and TID array
1090          *     - TID indexes match between header and TID array
1091          */
1092         if ((tidoff + datalen > tidlen) ||
1093             KDETH_GET(kval, TIDCTRL) != tidctrl ||
1094             KDETH_GET(kval, TID) != tididx)
1095             return -EINVAL;
1096     }
1097     return 0;
1098 }
1099 
1100 /*
1101  * Correctly set the BTH.PSN field based on type of
1102  * transfer - eager packets can just increment the PSN but
1103  * expected packets encode generation and sequence in the
1104  * BTH.PSN field so just incrementing will result in errors.
1105  */
1106 static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
1107 {
1108     u32 val = be32_to_cpu(bthpsn),
1109         mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
1110             0xffffffull),
1111         psn = val & mask;
1112     if (expct)
1113         psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
1114             ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
1115     else
1116         psn = psn + frags;
1117     return psn & mask;
1118 }
1119 
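/*
 * Illustrative sketch, not part of the driver: for eager traffic the PSN
 * simply advances by the packet index and wraps at the 24-bit (or 31-bit
 * extended) PSN mask.  For expected traffic only the low
 * HFI1_KDETH_BTH_SEQ_MASK bits carry a sequence number; the bits above
 * them hold the generation and must be preserved, so the sequence portion
 * wraps on its own.  With a hypothetical 11-bit sequence mask (0x7ff),
 * base PSN 0x1237fe advanced by 3 becomes 0x123001: the upper bits
 * survive while the low bits wrap.  Parameterized form of the logic
 * above:
 */
static u32 usdma_example_next_psn(u32 psn, u32 frags, bool expected,
                                  u32 psn_mask, u32 seq_mask)
{
    if (expected)
        psn = (psn & ~seq_mask) | ((psn + frags) & seq_mask);
    else
        psn += frags;
    return psn & psn_mask;
}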
1120 static int set_txreq_header(struct user_sdma_request *req,
1121                 struct user_sdma_txreq *tx, u32 datalen)
1122 {
1123     struct hfi1_user_sdma_pkt_q *pq = req->pq;
1124     struct hfi1_pkt_header *hdr = &tx->hdr;
1125     u8 omfactor; /* KDETH.OM */
1126     u16 pbclen;
1127     int ret;
1128     u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
1129 
1130     /* Copy the header template to the request before modification */
1131     memcpy(hdr, &req->hdr, sizeof(*hdr));
1132 
1133     /*
1134      * Check if the PBC and LRH length are mismatched. If so
1135      * adjust both in the header.
1136      */
1137     pbclen = le16_to_cpu(hdr->pbc[0]);
1138     if (PBC2LRH(pbclen) != lrhlen) {
1139         pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
1140         hdr->pbc[0] = cpu_to_le16(pbclen);
1141         hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
1142         /*
1143          * Third packet
1144          * This is the first packet in the sequence that has
1145          * a "static" size that can be used for the rest of
1146          * the packets (besides the last one).
1147          */
1148         if (unlikely(req->seqnum == 2)) {
1149             /*
1150              * From this point on the lengths in both the
1151              * PBC and LRH are the same until the last
1152              * packet.
1153              * Adjust the template so we don't have to update
1154              * every packet
1155              */
1156             req->hdr.pbc[0] = hdr->pbc[0];
1157             req->hdr.lrh[2] = hdr->lrh[2];
1158         }
1159     }
1160     /*
1161      * We only have to modify the header if this is not the
1162      * first packet in the request. Otherwise, we use the
1163      * header given to us.
1164      */
1165     if (unlikely(!req->seqnum)) {
1166         ret = check_header_template(req, hdr, lrhlen, datalen);
1167         if (ret)
1168             return ret;
1169         goto done;
1170     }
1171 
1172     hdr->bth[2] = cpu_to_be32(
1173         set_pkt_bth_psn(hdr->bth[2],
1174                 (req_opcode(req->info.ctrl) == EXPECTED),
1175                 req->seqnum));
1176 
1177     /* Set ACK request on last packet */
1178     if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
1179         hdr->bth[2] |= cpu_to_be32(1UL << 31);
1180 
1181     /* Set the new offset */
1182     hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
1183     /* Expected packets have to fill in the new TID information */
1184     if (req_opcode(req->info.ctrl) == EXPECTED) {
1185         tidval = req->tids[req->tididx];
1186         /*
1187          * If the offset puts us at the end of the current TID,
1188          * advance everything.
1189          */
1190         if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1191                      PAGE_SIZE)) {
1192             req->tidoffset = 0;
1193             /*
1194              * Since we don't copy all the TIDs at once, we have
1195              * to check again.
1196              */
1197             if (++req->tididx > req->n_tids - 1 ||
1198                 !req->tids[req->tididx]) {
1199                 return -EINVAL;
1200             }
1201             tidval = req->tids[req->tididx];
1202         }
1203         omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
1204             KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
1205             KDETH_OM_SMALL_SHIFT;
1206         /* Set KDETH.TIDCtrl based on value for this TID. */
1207         KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
1208               EXP_TID_GET(tidval, CTRL));
1209         /* Set KDETH.TID based on value for this TID */
1210         KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
1211               EXP_TID_GET(tidval, IDX));
1212         /* Clear KDETH.SH when DISABLE_SH flag is set */
1213         if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
1214             KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
1215         /*
1216          * Set the KDETH.OFFSET and KDETH.OM based on size of
1217          * transfer.
1218          */
1219         trace_hfi1_sdma_user_tid_info(
1220             pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
1221             req->tidoffset, req->tidoffset >> omfactor,
1222             omfactor != KDETH_OM_SMALL_SHIFT);
1223         KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
1224               req->tidoffset >> omfactor);
1225         KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
1226               omfactor != KDETH_OM_SMALL_SHIFT);
1227     }
1228 done:
1229     trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
1230                     req->info.comp_idx, hdr, tidval);
1231     return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
1232 }
1233 
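/*
 * Illustrative sketch, not part of the driver: KDETH.OFFSET is a short
 * field, so it is expressed in units selected by KDETH.OM -- small
 * (4-byte) units for small TID buffers and large (64-byte) units once the
 * TID is big enough that small units could overflow the field.
 * set_txreq_header() above therefore derives the shift from the TID
 * length and stores tidoffset >> omfactor.  The unit sizes quoted here
 * are assumptions; treat the helper as a sketch:
 */
static u32 usdma_example_kdeth_offset(u32 tidoffset, u32 tidlen_bytes,
                                      u32 om_max_size)
{
    u8 shift = tidlen_bytes >= om_max_size ? KDETH_OM_LARGE_SHIFT :
                                             KDETH_OM_SMALL_SHIFT;

    return tidoffset >> shift;  /* value placed in KDETH.OFFSET */
}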
1234 static int set_txreq_header_ahg(struct user_sdma_request *req,
1235                 struct user_sdma_txreq *tx, u32 datalen)
1236 {
1237     u32 ahg[AHG_KDETH_ARRAY_SIZE];
1238     int idx = 0;
1239     u8 omfactor; /* KDETH.OM */
1240     struct hfi1_user_sdma_pkt_q *pq = req->pq;
1241     struct hfi1_pkt_header *hdr = &req->hdr;
1242     u16 pbclen = le16_to_cpu(hdr->pbc[0]);
1243     u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
1244     size_t array_size = ARRAY_SIZE(ahg);
1245 
1246     if (PBC2LRH(pbclen) != lrhlen) {
1247         /* PBC.PbcLengthDWs */
1248         idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
1249                      (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
1250         if (idx < 0)
1251             return idx;
1252         /* LRH.PktLen (we need the full 16 bits due to byte swap) */
1253         idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
1254                      (__force u16)cpu_to_be16(lrhlen >> 2));
1255         if (idx < 0)
1256             return idx;
1257     }
1258 
1259     /*
1260      * Do the common updates
1261      */
1262     /* BTH.PSN and BTH.A */
1263     val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
1264         (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
1265     if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
1266         val32 |= 1UL << 31;
1267     idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
1268                  (__force u16)cpu_to_be16(val32 >> 16));
1269     if (idx < 0)
1270         return idx;
1271     idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
1272                  (__force u16)cpu_to_be16(val32 & 0xffff));
1273     if (idx < 0)
1274         return idx;
1275     /* KDETH.Offset */
1276     idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
1277                  (__force u16)cpu_to_le16(req->koffset & 0xffff));
1278     if (idx < 0)
1279         return idx;
1280     idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
1281                  (__force u16)cpu_to_le16(req->koffset >> 16));
1282     if (idx < 0)
1283         return idx;
1284     if (req_opcode(req->info.ctrl) == EXPECTED) {
1285         __le16 val;
1286 
1287         tidval = req->tids[req->tididx];
1288 
1289         /*
1290          * If the offset puts us at the end of the current TID,
1291          * advance everything.
1292          */
1293         if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1294                      PAGE_SIZE)) {
1295             req->tidoffset = 0;
1296             /*
1297              * Since we don't copy all the TIDs, all at once,
1298              * we have to check again.
1299              */
1300             if (++req->tididx > req->n_tids - 1 ||
1301                 !req->tids[req->tididx])
1302                 return -EINVAL;
1303             tidval = req->tids[req->tididx];
1304         }
1305         omfactor = ((EXP_TID_GET(tidval, LEN) *
1306                   PAGE_SIZE) >=
1307                  KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
1308                  KDETH_OM_SMALL_SHIFT;
1309         /* KDETH.OM and KDETH.OFFSET (TID) */
1310         idx = ahg_header_set(
1311                 ahg, idx, array_size, 7, 0, 16,
1312                 ((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
1313                 ((req->tidoffset >> omfactor)
1314                 & 0x7fff)));
1315         if (idx < 0)
1316             return idx;
1317         /* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
1318         val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
1319                    (EXP_TID_GET(tidval, IDX) & 0x3ff));
1320 
1321         if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
1322             val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1323                               INTR) <<
1324                         AHG_KDETH_INTR_SHIFT));
1325         } else {
1326             val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
1327                    cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
1328                    cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1329                               INTR) <<
1330                          AHG_KDETH_INTR_SHIFT));
1331         }
1332 
1333         idx = ahg_header_set(ahg, idx, array_size,
1334                      7, 16, 14, (__force u16)val);
1335         if (idx < 0)
1336             return idx;
1337     }
1338 
1339     trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
1340                     req->info.comp_idx, req->sde->this_idx,
1341                     req->ahg_idx, ahg, idx, tidval);
1342     sdma_txinit_ahg(&tx->txreq,
1343             SDMA_TXREQ_F_USE_AHG,
1344             datalen, req->ahg_idx, idx,
1345             ahg, sizeof(req->hdr),
1346             user_sdma_txreq_cb);
1347 
1348     return idx;
1349 }
1350 
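/*
 * Illustrative sketch, not part of the driver: set_txreq_header_ahg()
 * above does not rewrite the whole header.  It builds a short list of
 * (header dword, bit offset, width, value) updates with ahg_header_set()
 * and passes the list to sdma_txinit_ahg(), letting the hardware patch a
 * previously stored header; a negative index signals that the scratch
 * array is full.  A generic version of that append-with-bounds-check
 * pattern (struct and helper are hypothetical):
 */
struct usdma_example_update {
    u8 dword;       /* which 32-bit word of the header to patch */
    u8 bit;         /* starting bit within that word */
    u8 width;       /* number of bits to replace */
    u16 value;      /* new field contents */
};

static int usdma_example_add_update(struct usdma_example_update *list,
                                    int idx, size_t size,
                                    u8 dword, u8 bit, u8 width, u16 value)
{
    if (idx < 0 || idx >= (int)size)
        return -ENOSPC; /* no room left for another update */
    list[idx].dword = dword;
    list[idx].bit = bit;
    list[idx].width = width;
    list[idx].value = value;
    return idx + 1;
}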
1351 /**
1352  * user_sdma_txreq_cb() - SDMA tx request completion callback.
1353  * @txreq: valid sdma tx request
1354  * @status: success/failure of request
1355  *
1356  * Called when the SDMA progress state machine gets notification that
1357  * the SDMA descriptors for this tx request have been processed by the
1358  * DMA engine. Called in interrupt context.
1359  * Only do work on completed sequences.
1360  */
1361 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
1362 {
1363     struct user_sdma_txreq *tx =
1364         container_of(txreq, struct user_sdma_txreq, txreq);
1365     struct user_sdma_request *req;
1366     struct hfi1_user_sdma_pkt_q *pq;
1367     struct hfi1_user_sdma_comp_q *cq;
1368     enum hfi1_sdma_comp_state state = COMPLETE;
1369 
1370     if (!tx->req)
1371         return;
1372 
1373     req = tx->req;
1374     pq = req->pq;
1375     cq = req->cq;
1376 
1377     if (status != SDMA_TXREQ_S_OK) {
1378         SDMA_DBG(req, "SDMA completion with error %d",
1379              status);
1380         WRITE_ONCE(req->has_error, 1);
1381         state = ERROR;
1382     }
1383 
1384     req->seqcomp = tx->seqnum;
1385     kmem_cache_free(pq->txreq_cache, tx);
1386 
1387     /* If this is not the last packet's completion, we're done for now */
1388     if (req->seqcomp != req->info.npkts - 1)
1389         return;
1390 
1391     user_sdma_free_request(req, false);
1392     set_comp_state(pq, cq, req->info.comp_idx, state, status);
1393     pq_update(pq);
1394 }
1395 
1396 static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
1397 {
1398     if (atomic_dec_and_test(&pq->n_reqs))
1399         wake_up(&pq->wait);
1400 }
1401 
1402 static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
1403 {
1404     int i;
1405 
1406     if (!list_empty(&req->txps)) {
1407         struct sdma_txreq *t, *p;
1408 
1409         list_for_each_entry_safe(t, p, &req->txps, list) {
1410             struct user_sdma_txreq *tx =
1411                 container_of(t, struct user_sdma_txreq, txreq);
1412             list_del_init(&t->list);
1413             sdma_txclean(req->pq->dd, t);
1414             kmem_cache_free(req->pq->txreq_cache, tx);
1415         }
1416     }
1417 
1418     for (i = 0; i < req->data_iovs; i++) {
1419         struct sdma_mmu_node *node = req->iovs[i].node;
1420 
1421         if (!node)
1422             continue;
1423 
1424         req->iovs[i].node = NULL;
1425 
1426         if (unpin)
1427             hfi1_mmu_rb_remove(req->pq->handler,
1428                        &node->rb);
1429         else
1430             atomic_dec(&node->refcount);
1431     }
1432 
1433     kfree(req->tids);
1434     clear_bit(req->info.comp_idx, req->pq->req_in_use);
1435 }
1436 
1437 static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
1438                   struct hfi1_user_sdma_comp_q *cq,
1439                   u16 idx, enum hfi1_sdma_comp_state state,
1440                   int ret)
1441 {
1442     if (state == ERROR)
1443         cq->comps[idx].errcode = -ret;
1444     smp_wmb(); /* make sure errcode is visible first */
1445     cq->comps[idx].status = state;
1446     trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
1447                     idx, state, ret);
1448 }
1449 
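/*
 * Illustrative sketch, not part of the driver: set_comp_state() above
 * stores the error code first and only then the status word that user
 * space polls, with smp_wmb() ordering the two stores.  A consumer has to
 * mirror that with a read barrier: read the status, then the error code.
 * A conceptual reader over the same comps[] layout:
 */
static int usdma_example_poll_comp(struct hfi1_user_sdma_comp_q *cq, u16 idx)
{
    u32 status = READ_ONCE(cq->comps[idx].status);

    if (status != COMPLETE && status != ERROR)
        return -EAGAIN;         /* still queued or free */
    smp_rmb();                  /* pairs with smp_wmb() in set_comp_state() */
    return status == ERROR ? -(int)cq->comps[idx].errcode : 0;
}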
1450 static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
1451                unsigned long len)
1452 {
1453     return (bool)(node->addr == addr);
1454 }
1455 
1456 static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
1457 {
1458     struct sdma_mmu_node *node =
1459         container_of(mnode, struct sdma_mmu_node, rb);
1460 
1461     atomic_inc(&node->refcount);
1462     return 0;
1463 }
1464 
1465 /*
1466  * Return 1 to remove the node from the rb tree and call the remove op.
1467  *
1468  * Called with the rb tree lock held.
1469  */
1470 static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
1471              void *evict_arg, bool *stop)
1472 {
1473     struct sdma_mmu_node *node =
1474         container_of(mnode, struct sdma_mmu_node, rb);
1475     struct evict_data *evict_data = evict_arg;
1476 
1477     /* is this node still being used? */
1478     if (atomic_read(&node->refcount))
1479         return 0; /* keep this node */
1480 
1481     /* this node will be evicted, add its pages to our count */
1482     evict_data->cleared += node->npages;
1483 
1484     /* have enough pages been cleared? */
1485     if (evict_data->cleared >= evict_data->target)
1486         *stop = true;
1487 
1488     return 1; /* remove this node */
1489 }
1490 
1491 static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
1492 {
1493     struct sdma_mmu_node *node =
1494         container_of(mnode, struct sdma_mmu_node, rb);
1495 
1496     unpin_sdma_pages(node);
1497     kfree(node);
1498 }
1499 
1500 static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
1501 {
1502     struct sdma_mmu_node *node =
1503         container_of(mnode, struct sdma_mmu_node, rb);
1504 
1505     if (!atomic_read(&node->refcount))
1506         return 1;
1507     return 0;
1508 }