// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
/*
 * Copyright(c) 2020 Cornelis Networks, Inc.
 * Copyright(c) 2015-2018 Intel Corporation.
 */
#include <asm/page.h>
#include <linux/string.h>

#include "mmu_rb.h"
#include "user_exp_rcv.h"
#include "trace.h"

static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
                struct exp_tid_set *set,
                struct hfi1_filedata *fd);
static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
static int set_rcvarray_entry(struct hfi1_filedata *fd,
                  struct tid_user_buf *tbuf,
                  u32 rcventry, struct tid_group *grp,
                  u16 pageidx, unsigned int npages);
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
                    struct tid_rb_node *tnode);
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
                  const struct mmu_notifier_range *range,
                  unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
                struct tid_group *grp,
                unsigned int start, u16 count,
                u32 *tidlist, unsigned int *tididx,
                unsigned int *pmapped);
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
                  struct tid_group **grp);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);

static const struct mmu_interval_notifier_ops tid_mn_ops = {
    .invalidate = tid_rb_invalidate,
};

/*
 * Initialize context and file private data needed for Expected
 * receive caching. This needs to be done after the context has
 * been configured with the eager/expected RcvEntry counts.
 */
int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
               struct hfi1_ctxtdata *uctxt)
{
    int ret = 0;

    fd->entry_to_rb = kcalloc(uctxt->expected_count,
                  sizeof(struct rb_node *),
                  GFP_KERNEL);
    if (!fd->entry_to_rb)
        return -ENOMEM;

    if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
        fd->invalid_tid_idx = 0;
        fd->invalid_tids = kcalloc(uctxt->expected_count,
                       sizeof(*fd->invalid_tids),
                       GFP_KERNEL);
        if (!fd->invalid_tids) {
            kfree(fd->entry_to_rb);
            fd->entry_to_rb = NULL;
            return -ENOMEM;
        }
        fd->use_mn = true;
    }

    /*
     * PSM does not have a good way to separate, count, and
     * effectively enforce a limit on RcvArray entries used by
     * subctxts (when context sharing is used) when TID caching
     * is enabled. To help with that, we calculate a per-process
     * RcvArray entry share and enforce that.
     * If TID caching is not in use, PSM deals with usage on its
     * own. In that case, we allow any subctxt to take all of the
     * entries.
     *
     * Make sure that we set the tid counts only after successful
     * init.
     */
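    /*
     * Illustrative split (hypothetical numbers): with
     * expected_count = 2048 and subctxt_cnt = 3, the base share is
     * 2048 / 3 = 682 with a remainder of 2, so subctxts 0 and 1 get
     * a tid_limit of 683 and subctxt 2 gets 682.
     */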
    spin_lock(&fd->tid_lock);
    if (uctxt->subctxt_cnt && fd->use_mn) {
        u16 remainder;

        fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
        remainder = uctxt->expected_count % uctxt->subctxt_cnt;
        if (remainder && fd->subctxt < remainder)
            fd->tid_limit++;
    } else {
        fd->tid_limit = uctxt->expected_count;
    }
    spin_unlock(&fd->tid_lock);

    return ret;
}

void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
{
    struct hfi1_ctxtdata *uctxt = fd->uctxt;

    mutex_lock(&uctxt->exp_mutex);
    if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
        unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
    if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
        unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
    mutex_unlock(&uctxt->exp_mutex);

    kfree(fd->invalid_tids);
    fd->invalid_tids = NULL;

    kfree(fd->entry_to_rb);
    fd->entry_to_rb = NULL;
}

/*
 * Release pinned receive buffer pages.
 *
 * @mapped: true if the pages have been DMA mapped. false otherwise.
 * @idx: Index of the first page to unpin.
 * @npages: Number of pages to unpin.
 *
 * If the pages have been DMA mapped (indicated by the mapped parameter), their
 * info will be passed via a struct tid_rb_node. If they haven't been mapped,
 * their info will be passed via a struct tid_user_buf.
 */
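/*
 * In this file, the two flavours correspond to the two callers:
 * clear_tid_node() passes a tid_rb_node with mapped = true once the
 * RcvArray entry has been invalidated, while hfi1_user_exp_rcv_setup()
 * passes the tid_user_buf with mapped = false for pages that were pinned
 * but never programmed.
 */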
static void unpin_rcv_pages(struct hfi1_filedata *fd,
                struct tid_user_buf *tidbuf,
                struct tid_rb_node *node,
                unsigned int idx,
                unsigned int npages,
                bool mapped)
{
    struct page **pages;
    struct hfi1_devdata *dd = fd->uctxt->dd;
    struct mm_struct *mm;

    if (mapped) {
        dma_unmap_single(&dd->pcidev->dev, node->dma_addr,
                 node->npages * PAGE_SIZE, DMA_FROM_DEVICE);
        pages = &node->pages[idx];
        mm = mm_from_tid_node(node);
    } else {
        pages = &tidbuf->pages[idx];
        mm = current->mm;
    }
    hfi1_release_user_pages(mm, pages, npages, mapped);
    fd->tid_n_pinned -= npages;
}

/*
 * Pin receive buffer pages.
 */
static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
{
    int pinned;
    unsigned int npages;
    unsigned long vaddr = tidbuf->vaddr;
    struct page **pages = NULL;
    struct hfi1_devdata *dd = fd->uctxt->dd;

    /* Get the number of pages the user buffer spans */
    npages = num_user_pages(vaddr, tidbuf->length);
    if (!npages)
        return -EINVAL;

    if (npages > fd->uctxt->expected_count) {
        dd_dev_err(dd, "Expected buffer too big\n");
        return -EINVAL;
    }

    /* Allocate the array of struct page pointers needed for pinning */
    pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
    if (!pages)
        return -ENOMEM;

    /*
     * Pin all the pages of the user buffer. If we can't pin all the
     * pages, accept the amount pinned so far and program only that.
     * User space knows how to deal with partially programmed buffers.
     */
    if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
        kfree(pages);
        return -ENOMEM;
    }

    pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
    if (pinned <= 0) {
        kfree(pages);
        return pinned;
    }
    tidbuf->pages = pages;
    tidbuf->npages = npages;
    fd->tid_n_pinned += pinned;
    return pinned;
}

/*
 * RcvArray entry allocation for Expected Receives is done by the
 * following algorithm:
 *
 * The context keeps 3 lists of groups of RcvArray entries:
 *   1. List of empty groups - tid_group_list
 *      This list is created during user context creation and
 *      contains elements which describe sets (of 8) of empty
 *      RcvArray entries.
 *   2. List of partially used groups - tid_used_list
 *      This list contains sets of RcvArray entries which are
 *      not completely used up. Another mapping request could
 *      use some or all of the remaining entries.
 *   3. List of full groups - tid_full_list
 *      This is the list where sets that are completely used
 *      up go.
 *
 * An attempt to optimize the usage of RcvArray entries is
 * made by finding all sets of physically contiguous pages in a
 * user's buffer.
 * These physically contiguous sets are further split into
 * sizes supported by the receive engine of the HFI. The
 * resulting sets of pages are stored in struct tid_pageset,
 * which describes the sets as:
 *    * .count - number of pages in this set
 *    * .idx - starting index into struct page ** array
 *             of this set
 *
 * From this point on, the algorithm deals with the page sets
 * described above. The number of pagesets is divided by the
 * RcvArray group size to produce the number of full groups
 * needed.
 *
 * Groups from the 3 lists are manipulated using the following
 * rules:
 *   1. For each set of 8 pagesets, a complete group from
 *      tid_group_list is taken, programmed, and moved to
 *      the tid_full_list.
 *   2. For all remaining pagesets:
 *      2.1 If the tid_used_list is empty and the tid_group_list
 *          is empty, stop processing pagesets and return only
 *          what has been programmed up to this point.
 *      2.2 If the tid_used_list is empty and the tid_group_list
 *          is not empty, move a group from tid_group_list to
 *          tid_used_list.
 *      2.3 For each group in tid_used_list, program as many
 *          pagesets as can fit into the group. If the group
 *          becomes fully used, move it to tid_full_list.
 */
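/*
 * Worked example (purely illustrative numbers): with a group size of 8
 * and 21 pagesets, two complete groups (16 pagesets) are popped from
 * tid_group_list, programmed, and moved to tid_full_list; the remaining
 * 5 pagesets are then placed into partially used groups from
 * tid_used_list, pulling a fresh group from tid_group_list if needed.
 */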
int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
                struct hfi1_tid_info *tinfo)
{
    int ret = 0, need_group = 0, pinned;
    struct hfi1_ctxtdata *uctxt = fd->uctxt;
    struct hfi1_devdata *dd = uctxt->dd;
    unsigned int ngroups, pageidx = 0, pageset_count,
        tididx = 0, mapped, mapped_pages = 0;
    u32 *tidlist = NULL;
    struct tid_user_buf *tidbuf;

    if (!PAGE_ALIGNED(tinfo->vaddr))
        return -EINVAL;

    tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
    if (!tidbuf)
        return -ENOMEM;

    tidbuf->vaddr = tinfo->vaddr;
    tidbuf->length = tinfo->length;
    tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
                GFP_KERNEL);
    if (!tidbuf->psets) {
        kfree(tidbuf);
        return -ENOMEM;
    }

    pinned = pin_rcv_pages(fd, tidbuf);
    if (pinned <= 0) {
        kfree(tidbuf->psets);
        kfree(tidbuf);
        return pinned;
    }

    /* Find sets of physically contiguous pages */
    tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);

    /*
     * We don't need to access this under a lock since tid_used is per
     * process and the same process cannot be in hfi1_user_exp_rcv_clear()
     * and hfi1_user_exp_rcv_setup() at the same time.
     */
    spin_lock(&fd->tid_lock);
    if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
        pageset_count = fd->tid_limit - fd->tid_used;
    else
        pageset_count = tidbuf->n_psets;
    spin_unlock(&fd->tid_lock);

    if (!pageset_count)
        goto bail;

    ngroups = pageset_count / dd->rcv_entries.group_size;
    tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
    if (!tidlist) {
        ret = -ENOMEM;
        goto nomem;
    }

    tididx = 0;

    /*
     * From this point on, we are going to be using shared (between master
     * and subcontexts) context resources. We need to take the lock.
     */
    mutex_lock(&uctxt->exp_mutex);
    /*
     * The first step is to program the RcvArray entries which are complete
     * groups.
     */
    while (ngroups && uctxt->tid_group_list.count) {
        struct tid_group *grp =
            tid_group_pop(&uctxt->tid_group_list);

        ret = program_rcvarray(fd, tidbuf, grp,
                       pageidx, dd->rcv_entries.group_size,
                       tidlist, &tididx, &mapped);
        /*
         * If there was a failure to program the RcvArray
         * entries for the entire group, reset the grp fields
         * and add the grp back to the free group list.
         */
        if (ret <= 0) {
            tid_group_add_tail(grp, &uctxt->tid_group_list);
            hfi1_cdbg(TID,
                  "Failed to program RcvArray group %d", ret);
            goto unlock;
        }

        tid_group_add_tail(grp, &uctxt->tid_full_list);
        ngroups--;
        pageidx += ret;
        mapped_pages += mapped;
    }

    while (pageidx < pageset_count) {
        struct tid_group *grp, *ptr;
        /*
         * If we don't have any partially used tid groups, check
         * if we have empty groups. If so, take one from there and
         * put in the partially used list.
         */
        if (!uctxt->tid_used_list.count || need_group) {
            if (!uctxt->tid_group_list.count)
                goto unlock;

            grp = tid_group_pop(&uctxt->tid_group_list);
            tid_group_add_tail(grp, &uctxt->tid_used_list);
            need_group = 0;
        }
        /*
         * There is an optimization opportunity here - instead of
         * fitting as many page sets as we can, check for a group
         * later on in the list that could fit all of them.
         */
        list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
                     list) {
            unsigned use = min_t(unsigned, pageset_count - pageidx,
                         grp->size - grp->used);

            ret = program_rcvarray(fd, tidbuf, grp,
                           pageidx, use, tidlist,
                           &tididx, &mapped);
            if (ret < 0) {
                hfi1_cdbg(TID,
                      "Failed to program RcvArray entries %d",
                      ret);
                goto unlock;
            } else if (ret > 0) {
                if (grp->used == grp->size)
                    tid_group_move(grp,
                               &uctxt->tid_used_list,
                               &uctxt->tid_full_list);
                pageidx += ret;
                mapped_pages += mapped;
                need_group = 0;
                /* Check if we are done so we break out early */
                if (pageidx >= pageset_count)
                    break;
            } else if (WARN_ON(ret == 0)) {
                /*
                 * If ret is 0, we did not program any entries
                 * into this group, which can only happen if
                 * we've screwed up the accounting somewhere.
                 * Warn and try to continue.
                 */
                need_group = 1;
            }
        }
    }
unlock:
    mutex_unlock(&uctxt->exp_mutex);
nomem:
    hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
          mapped_pages, ret);
    if (tididx) {
        spin_lock(&fd->tid_lock);
        fd->tid_used += tididx;
        spin_unlock(&fd->tid_lock);
        tinfo->tidcnt = tididx;
        tinfo->length = mapped_pages * PAGE_SIZE;

        if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
                 tidlist, sizeof(tidlist[0]) * tididx)) {
            /*
             * On failure to copy to the user level, we need to undo
             * everything done so far so we don't leak resources.
             */
            tinfo->tidlist = (unsigned long)&tidlist;
            hfi1_user_exp_rcv_clear(fd, tinfo);
            tinfo->tidlist = 0;
            ret = -EFAULT;
            goto bail;
        }
    }

    /*
     * If not everything was mapped (due to insufficient RcvArray entries,
     * for example), unpin all unmapped pages so we can pin them next time.
     */
    if (mapped_pages != pinned)
        unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages,
                (pinned - mapped_pages), false);
bail:
    kfree(tidbuf->psets);
    kfree(tidlist);
    kfree(tidbuf->pages);
    kfree(tidbuf);
    return ret > 0 ? 0 : ret;
}

int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
                struct hfi1_tid_info *tinfo)
{
    int ret = 0;
    struct hfi1_ctxtdata *uctxt = fd->uctxt;
    u32 *tidinfo;
    unsigned tididx;

    if (unlikely(tinfo->tidcnt > fd->tid_used))
        return -EINVAL;

    tidinfo = memdup_user(u64_to_user_ptr(tinfo->tidlist),
                  sizeof(tidinfo[0]) * tinfo->tidcnt);
    if (IS_ERR(tidinfo))
        return PTR_ERR(tidinfo);

    mutex_lock(&uctxt->exp_mutex);
    for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
        ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
        if (ret) {
            hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
                  ret);
            break;
        }
    }
    spin_lock(&fd->tid_lock);
    fd->tid_used -= tididx;
    spin_unlock(&fd->tid_lock);
    tinfo->tidcnt = tididx;
    mutex_unlock(&uctxt->exp_mutex);

    kfree(tidinfo);
    return ret;
}

int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
                  struct hfi1_tid_info *tinfo)
{
    struct hfi1_ctxtdata *uctxt = fd->uctxt;
    unsigned long *ev = uctxt->dd->events +
        (uctxt_offset(uctxt) + fd->subctxt);
    u32 *array;
    int ret = 0;

    /*
     * copy_to_user() can sleep, which will leave the invalid_lock
     * locked and cause the MMU notifier to be blocked on the lock
     * for a long time.
     * Copy the data to a local buffer so we can release the lock.
     */
    array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
    if (!array)
        return -EFAULT;

    spin_lock(&fd->invalid_lock);
    if (fd->invalid_tid_idx) {
        memcpy(array, fd->invalid_tids, sizeof(*array) *
               fd->invalid_tid_idx);
        memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
               fd->invalid_tid_idx);
        tinfo->tidcnt = fd->invalid_tid_idx;
        fd->invalid_tid_idx = 0;
        /*
         * Reset the user flag while still holding the lock.
         * Otherwise, PSM can miss events.
         */
        clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
    } else {
        tinfo->tidcnt = 0;
    }
    spin_unlock(&fd->invalid_lock);

    if (tinfo->tidcnt) {
        if (copy_to_user((void __user *)tinfo->tidlist,
                 array, sizeof(*array) * tinfo->tidcnt))
            ret = -EFAULT;
    }
    kfree(array);

    return ret;
}

static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
{
    unsigned pagecount, pageidx, setcount = 0, i;
    unsigned long pfn, this_pfn;
    struct page **pages = tidbuf->pages;
    struct tid_pageset *list = tidbuf->psets;

    if (!npages)
        return 0;

    /*
     * Look for sets of physically contiguous pages in the user buffer.
     * This will allow us to optimize Expected RcvArray entry usage by
     * using the bigger supported sizes.
     */
    pfn = page_to_pfn(pages[0]);
    for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
        this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;

        /*
         * If the pfn's are not sequential, pages are not physically
         * contiguous.
         */
        if (this_pfn != ++pfn) {
            /*
             * At this point we have to loop over the set of
             * physically contiguous pages and break them down into
             * sizes supported by the HW.
             * There are two main constraints:
             *     1. The max buffer size is MAX_EXPECTED_BUFFER.
             *        If the total set size is bigger than that
             *        program only a MAX_EXPECTED_BUFFER chunk.
             *     2. The buffer size has to be a power of two. If
             *        it is not, round down to the closest power of
             *        2 and program that size.
             */
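            /*
             * For example (assuming 4 KiB pages and
             * MAX_EXPECTED_BUFFER >= 32 KiB), a run of 11
             * contiguous pages (44 KiB) is split into pagesets of
             * 8, 2, and 1 pages (32 KiB, 8 KiB, 4 KiB).
             */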
            while (pagecount) {
                int maxpages = pagecount;
                u32 bufsize = pagecount * PAGE_SIZE;

                if (bufsize > MAX_EXPECTED_BUFFER)
                    maxpages =
                        MAX_EXPECTED_BUFFER >>
                        PAGE_SHIFT;
                else if (!is_power_of_2(bufsize))
                    maxpages =
                        rounddown_pow_of_two(bufsize) >>
                        PAGE_SHIFT;

                list[setcount].idx = pageidx;
                list[setcount].count = maxpages;
                pagecount -= maxpages;
                pageidx += maxpages;
                setcount++;
            }
            pageidx = i;
            pagecount = 1;
            pfn = this_pfn;
        } else {
            pagecount++;
        }
    }
    return setcount;
}

/**
 * program_rcvarray() - program an RcvArray group with receive buffers
 * @fd: filedata pointer
 * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
 *    virtual address, buffer length, page pointers, pagesets (array of
 *    struct tid_pageset holding information on physically contiguous
 *    chunks from the user buffer), and other fields.
 * @grp: RcvArray group
 * @start: starting index into sets array
 * @count: number of struct tid_pageset's to program
 * @tidlist: the array of u32 elements where the information about the
 *           programmed RcvArray entries is to be encoded.
 * @tididx: starting offset into tidlist
 * @pmapped: (output parameter) number of pages programmed into the RcvArray
 *           entries.
 *
 * This function will program up to 'count' number of RcvArray entries from the
 * group 'grp'. To make best use of write-combining writes, the function will
 * perform writes to the unused RcvArray entries which will be ignored by the
 * HW. Each RcvArray entry will be programmed with a physically contiguous
 * buffer chunk from the user's virtual buffer.
 *
 * Return:
 * -EINVAL if the requested count is larger than the size of the group,
 * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 * number of RcvArray entries programmed.
 */
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
                struct tid_group *grp,
                unsigned int start, u16 count,
                u32 *tidlist, unsigned int *tididx,
                unsigned int *pmapped)
{
    struct hfi1_ctxtdata *uctxt = fd->uctxt;
    struct hfi1_devdata *dd = uctxt->dd;
    u16 idx;
    u32 tidinfo = 0, rcventry, useidx = 0;
    int mapped = 0;

    /* Count should never be larger than the group size */
    if (count > grp->size)
        return -EINVAL;

    /* Find the first unused entry in the group */
    for (idx = 0; idx < grp->size; idx++) {
        if (!(grp->map & (1 << idx))) {
            useidx = idx;
            break;
        }
        rcv_array_wc_fill(dd, grp->base + idx);
    }

    idx = 0;
    while (idx < count) {
        u16 npages, pageidx, setidx = start + idx;
        int ret = 0;

        /*
         * If this entry in the group is used, move to the next one.
         * If we go past the end of the group, exit the loop.
         */
        if (useidx >= grp->size) {
            break;
        } else if (grp->map & (1 << useidx)) {
            rcv_array_wc_fill(dd, grp->base + useidx);
            useidx++;
            continue;
        }

        rcventry = grp->base + useidx;
        npages = tbuf->psets[setidx].count;
        pageidx = tbuf->psets[setidx].idx;

        ret = set_rcvarray_entry(fd, tbuf,
                     rcventry, grp, pageidx,
                     npages);
        if (ret)
            return ret;
        mapped += npages;

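        /*
         * Each tidlist entry packs the location of the programmed
         * RcvArray entry (the IDX/CTRL fields produced by
         * rcventry2tidinfo()) together with the pageset size in the
         * LEN field; user space later hands these values back to
         * hfi1_user_exp_rcv_clear(), where unprogram_rcvarray()
         * decodes them.
         */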
        tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
            EXP_TID_SET(LEN, npages);
        tidlist[(*tididx)++] = tidinfo;
        grp->used++;
        grp->map |= 1 << useidx++;
        idx++;
    }

    /* Fill the rest of the group with "blank" writes */
    for (; useidx < grp->size; useidx++)
        rcv_array_wc_fill(dd, grp->base + useidx);
    *pmapped = mapped;
    return idx;
}

static int set_rcvarray_entry(struct hfi1_filedata *fd,
                  struct tid_user_buf *tbuf,
                  u32 rcventry, struct tid_group *grp,
                  u16 pageidx, unsigned int npages)
{
    int ret;
    struct hfi1_ctxtdata *uctxt = fd->uctxt;
    struct tid_rb_node *node;
    struct hfi1_devdata *dd = uctxt->dd;
    dma_addr_t phys;
    struct page **pages = tbuf->pages + pageidx;

    /*
     * Allocate the node first so we can handle a potential
     * failure before we've programmed anything.
     */
    node = kzalloc(struct_size(node, pages, npages), GFP_KERNEL);
    if (!node)
        return -ENOMEM;

    phys = dma_map_single(&dd->pcidev->dev, __va(page_to_phys(pages[0])),
                  npages * PAGE_SIZE, DMA_FROM_DEVICE);
    if (dma_mapping_error(&dd->pcidev->dev, phys)) {
        dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
               phys);
        kfree(node);
        return -EFAULT;
    }

    node->fdata = fd;
    node->phys = page_to_phys(pages[0]);
    node->npages = npages;
    node->rcventry = rcventry;
    node->dma_addr = phys;
    node->grp = grp;
    node->freed = false;
    memcpy(node->pages, pages, flex_array_size(node, pages, npages));

    if (fd->use_mn) {
        ret = mmu_interval_notifier_insert(
            &node->notifier, current->mm,
            tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
            &tid_mn_ops);
        if (ret)
            goto out_unmap;
        /*
         * FIXME: This is in the wrong order, the notifier should be
         * established before the pages are pinned by pin_rcv_pages.
         */
        mmu_interval_read_begin(&node->notifier);
    }
    fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;

    hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
    trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
                   node->notifier.interval_tree.start, node->phys,
                   phys);
    return 0;

out_unmap:
    hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
          node->rcventry, node->notifier.interval_tree.start,
          node->phys, ret);
    dma_unmap_single(&dd->pcidev->dev, phys, npages * PAGE_SIZE,
             DMA_FROM_DEVICE);
    kfree(node);
    return -EFAULT;
}

static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
                  struct tid_group **grp)
{
    struct hfi1_ctxtdata *uctxt = fd->uctxt;
    struct hfi1_devdata *dd = uctxt->dd;
    struct tid_rb_node *node;
    u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
    u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;

    if (tididx >= uctxt->expected_count) {
        dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
               tididx, uctxt->ctxt);
        return -EINVAL;
    }

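    /*
     * IDX identifies a pair of RcvArray entries within the context's
     * expected region and CTRL (1 or 2) selects one entry of that pair,
     * giving an entry offset of (IDX << 1) + (tidctrl - 1). A CTRL
     * value of 3 does not name a single entry and is rejected here.
     */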
    if (tidctrl == 0x3)
        return -EINVAL;

    rcventry = tididx + (tidctrl - 1);

    node = fd->entry_to_rb[rcventry];
    if (!node || node->rcventry != (uctxt->expected_base + rcventry))
        return -EBADF;

    if (grp)
        *grp = node->grp;

    if (fd->use_mn)
        mmu_interval_notifier_remove(&node->notifier);
    cacheless_tid_rb_remove(fd, node);

    return 0;
}

static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
    struct hfi1_ctxtdata *uctxt = fd->uctxt;
    struct hfi1_devdata *dd = uctxt->dd;

    trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
                 node->npages,
                 node->notifier.interval_tree.start, node->phys,
                 node->dma_addr);

    /*
     * Make sure device has seen the write before we unpin the
     * pages.
     */
    hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);

    unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);

    node->grp->used--;
    node->grp->map &= ~(1 << (node->rcventry - node->grp->base));

    if (node->grp->used == node->grp->size - 1)
        tid_group_move(node->grp, &uctxt->tid_full_list,
                   &uctxt->tid_used_list);
    else if (!node->grp->used)
        tid_group_move(node->grp, &uctxt->tid_used_list,
                   &uctxt->tid_group_list);
    kfree(node);
}

/*
 * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
 * clearing nodes in the non-cached case.
 */
static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
                struct exp_tid_set *set,
                struct hfi1_filedata *fd)
{
    struct tid_group *grp, *ptr;
    int i;

    list_for_each_entry_safe(grp, ptr, &set->list, list) {
        list_del_init(&grp->list);

        for (i = 0; i < grp->size; i++) {
            if (grp->map & (1 << i)) {
                u16 rcventry = grp->base + i;
                struct tid_rb_node *node;

                node = fd->entry_to_rb[rcventry -
                              uctxt->expected_base];
                if (!node || node->rcventry != rcventry)
                    continue;

                if (fd->use_mn)
                    mmu_interval_notifier_remove(
                        &node->notifier);
                cacheless_tid_rb_remove(fd, node);
            }
        }
    }
}

static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
                  const struct mmu_notifier_range *range,
                  unsigned long cur_seq)
{
    struct tid_rb_node *node =
        container_of(mni, struct tid_rb_node, notifier);
    struct hfi1_filedata *fdata = node->fdata;
    struct hfi1_ctxtdata *uctxt = fdata->uctxt;

    if (node->freed)
        return true;

    trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
                 node->notifier.interval_tree.start,
                 node->rcventry, node->npages, node->dma_addr);
    node->freed = true;

    spin_lock(&fdata->invalid_lock);
    if (fdata->invalid_tid_idx < uctxt->expected_count) {
        fdata->invalid_tids[fdata->invalid_tid_idx] =
            rcventry2tidinfo(node->rcventry - uctxt->expected_base);
        fdata->invalid_tids[fdata->invalid_tid_idx] |=
            EXP_TID_SET(LEN, node->npages);
        if (!fdata->invalid_tid_idx) {
            unsigned long *ev;

            /*
             * hfi1_set_uevent_bits() sets a user event flag
             * for all processes. Because calling into the
             * driver to process TID cache invalidations is
             * expensive and TID cache invalidations are
             * handled on a per-process basis, we can
             * optimize this to set the flag only for the
             * process in question.
             */
            ev = uctxt->dd->events +
                (uctxt_offset(uctxt) + fdata->subctxt);
            set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
        }
        fdata->invalid_tid_idx++;
    }
    spin_unlock(&fdata->invalid_lock);
    return true;
}

static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
                    struct tid_rb_node *tnode)
{
    u32 base = fdata->uctxt->expected_base;

    fdata->entry_to_rb[tnode->rcventry - base] = NULL;
    clear_tid_node(fdata, tnode);
}