/*
 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pid.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hugetlb.h>
#include <linux/interval_tree.h>
#include <linux/hmm.h>
#include <linux/pagemap.h>

#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

#include "uverbs.h"

static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
                   const struct mmu_interval_notifier_ops *ops)
{
    int ret;

    umem_odp->umem.is_odp = 1;
    mutex_init(&umem_odp->umem_mutex);

    if (!umem_odp->is_implicit_odp) {
        size_t page_size = 1UL << umem_odp->page_shift;
        unsigned long start;
        unsigned long end;
        size_t ndmas, npfns;

        start = ALIGN_DOWN(umem_odp->umem.address, page_size);
        if (check_add_overflow(umem_odp->umem.address,
                       (unsigned long)umem_odp->umem.length,
                       &end))
            return -EOVERFLOW;
        end = ALIGN(end, page_size);
        if (unlikely(end < page_size))
            return -EOVERFLOW;

        ndmas = (end - start) >> umem_odp->page_shift;
        if (!ndmas)
            return -EINVAL;

        npfns = (end - start) >> PAGE_SHIFT;
        umem_odp->pfn_list = kvcalloc(
            npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL);
        if (!umem_odp->pfn_list)
            return -ENOMEM;

        umem_odp->dma_list = kvcalloc(
            ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL);
        if (!umem_odp->dma_list) {
            ret = -ENOMEM;
            goto out_pfn_list;
        }

        ret = mmu_interval_notifier_insert(&umem_odp->notifier,
                           umem_odp->umem.owning_mm,
                           start, end - start, ops);
        if (ret)
            goto out_dma_list;
    }

    return 0;

out_dma_list:
    kvfree(umem_odp->dma_list);
out_pfn_list:
    kvfree(umem_odp->pfn_list);
    return ret;
}
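
/*
 * Illustrative sketch (not part of this file): for an explicit ODP umem the
 * pfn_list is sized in units of PAGE_SHIFT while the dma_list is sized in
 * units of the (possibly larger) umem page_shift.  For example, assuming a
 * 2MiB-aligned 4MiB range with page_shift == 21 and a 4KiB PAGE_SHIFT,
 * ib_init_umem_odp() allocates 1024 pfn entries but only 2 dma entries.
 * The hypothetical helper below merely restates that arithmetic.
 */
static inline void example_odp_list_sizes(unsigned long start,
                      unsigned long end,
                      unsigned int page_shift,
                      size_t *npfns, size_t *ndmas)
{
    /* One hmm pfn per CPU page in the aligned range. */
    *npfns = (end - start) >> PAGE_SHIFT;
    /* One dma_list entry per device-visible page of 1 << page_shift. */
    *ndmas = (end - start) >> page_shift;
}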

/**
 * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem
 *
 * Implicit ODP umems do not have a VA range and do not have any page lists.
 * They exist only to hold the per_mm reference to help the driver create
 * child umems.
 *
 * @device: IB device to create the UMEM on
 * @access: ib_reg_mr access flags
 */
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
                           int access)
{
    struct ib_umem *umem;
    struct ib_umem_odp *umem_odp;
    int ret;

    if (access & IB_ACCESS_HUGETLB)
        return ERR_PTR(-EINVAL);

    umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
    if (!umem_odp)
        return ERR_PTR(-ENOMEM);
    umem = &umem_odp->umem;
    umem->ibdev = device;
    umem->writable = ib_access_writable(access);
    umem->owning_mm = current->mm;
    umem_odp->is_implicit_odp = 1;
    umem_odp->page_shift = PAGE_SHIFT;

    umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
    ret = ib_init_umem_odp(umem_odp, NULL);
    if (ret) {
        put_pid(umem_odp->tgid);
        kfree(umem_odp);
        return ERR_PTR(ret);
    }
    return umem_odp;
}
EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);

/**
 * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit
 *                           parent ODP umem
 *
 * @root: The parent umem enclosing the child. This must be allocated using
 *        ib_umem_odp_alloc_implicit()
 * @addr: The starting userspace VA
 * @size: The length of the userspace VA
 * @ops: MMU interval ops, currently only @invalidate
 */
struct ib_umem_odp *
ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
            size_t size,
            const struct mmu_interval_notifier_ops *ops)
{
    /*
     * The caller must ensure that root cannot be freed during this call.
     */
    struct ib_umem_odp *odp_data;
    struct ib_umem *umem;
    int ret;

    if (WARN_ON(!root->is_implicit_odp))
        return ERR_PTR(-EINVAL);

    odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
    if (!odp_data)
        return ERR_PTR(-ENOMEM);
    umem = &odp_data->umem;
    umem->ibdev = root->umem.ibdev;
    umem->length     = size;
    umem->address    = addr;
    umem->writable   = root->umem.writable;
    umem->owning_mm  = root->umem.owning_mm;
    odp_data->page_shift = PAGE_SHIFT;
    odp_data->notifier.ops = ops;

    /*
     * An mmget must be held when registering a notifier; the owning_mm
     * only has an mmgrab on it at this point.
     */
    if (!mmget_not_zero(umem->owning_mm)) {
        ret = -EFAULT;
        goto out_free;
    }

    odp_data->tgid = get_pid(root->tgid);
    ret = ib_init_umem_odp(odp_data, ops);
    if (ret)
        goto out_tgid;
    mmput(umem->owning_mm);
    return odp_data;

out_tgid:
    put_pid(odp_data->tgid);
    mmput(umem->owning_mm);
out_free:
    kfree(odp_data);
    return ERR_PTR(ret);
}
EXPORT_SYMBOL(ib_umem_odp_alloc_child);
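
/*
 * Illustrative sketch (not part of this file): a driver exposing an implicit
 * (whole address space) ODP MR typically allocates one parent umem up front
 * and then creates child umems lazily, one per faulting VA range.  The
 * function below is hypothetical and only demonstrates the calling
 * convention; the access flags, notifier ops and device programming step are
 * placeholders supplied by the imagined caller.
 */
static inline int example_implicit_odp_setup(struct ib_device *device,
                         int access,
                         unsigned long fault_addr,
                         size_t fault_len,
                         const struct mmu_interval_notifier_ops *ops)
{
    struct ib_umem_odp *parent, *child;

    /* The parent has no VA range or page lists; it only anchors mm/tgid. */
    parent = ib_umem_odp_alloc_implicit(device, access);
    if (IS_ERR(parent))
        return PTR_ERR(parent);

    /* A child covers one faulting VA range and gets its own notifier. */
    child = ib_umem_odp_alloc_child(parent, fault_addr, fault_len, ops);
    if (IS_ERR(child)) {
        ib_umem_odp_release(parent);
        return PTR_ERR(child);
    }

    /* ... fault pages into the child and program the device here ... */

    ib_umem_odp_release(child);
    ib_umem_odp_release(parent);
    return 0;
}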

/**
 * ib_umem_odp_get - Create a umem_odp for a userspace va
 *
 * @device: IB device struct to create the UMEM on
 * @addr: userspace virtual address to start at
 * @size: length of region to pin
 * @access: IB_ACCESS_xxx flags for memory being pinned
 * @ops: MMU interval ops, currently only @invalidate
 *
 * The driver should use this when the access flags indicate ODP memory. It
 * avoids pinning; instead it stores the mm for future page fault handling in
 * conjunction with MMU notifiers.
 */
struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
                    unsigned long addr, size_t size, int access,
                    const struct mmu_interval_notifier_ops *ops)
{
    struct ib_umem_odp *umem_odp;
    int ret;

    if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
        return ERR_PTR(-EINVAL);

    umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
    if (!umem_odp)
        return ERR_PTR(-ENOMEM);

    umem_odp->umem.ibdev = device;
    umem_odp->umem.length = size;
    umem_odp->umem.address = addr;
    umem_odp->umem.writable = ib_access_writable(access);
    umem_odp->umem.owning_mm = current->mm;
    umem_odp->notifier.ops = ops;

    umem_odp->page_shift = PAGE_SHIFT;
#ifdef CONFIG_HUGETLB_PAGE
    if (access & IB_ACCESS_HUGETLB)
        umem_odp->page_shift = HPAGE_SHIFT;
#endif

    umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
    ret = ib_init_umem_odp(umem_odp, ops);
    if (ret)
        goto err_put_pid;
    return umem_odp;

err_put_pid:
    put_pid(umem_odp->tgid);
    kfree(umem_odp);
    return ERR_PTR(ret);
}
EXPORT_SYMBOL(ib_umem_odp_get);

void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
    /*
     * Ensure that no more pages are mapped in the umem.
     *
     * It is the driver's responsibility to ensure, before calling us,
     * that the hardware will not attempt to access the MR any more.
     */
    if (!umem_odp->is_implicit_odp) {
        mutex_lock(&umem_odp->umem_mutex);
        ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
                        ib_umem_end(umem_odp));
        mutex_unlock(&umem_odp->umem_mutex);
        mmu_interval_notifier_remove(&umem_odp->notifier);
        kvfree(umem_odp->dma_list);
        kvfree(umem_odp->pfn_list);
    }
    put_pid(umem_odp->tgid);
    kfree(umem_odp);
}
EXPORT_SYMBOL(ib_umem_odp_release);
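
/*
 * Illustrative sketch (not part of this file): the expected lifetime of an
 * explicit ODP umem as seen from a hypothetical driver registration path.
 * The access flags, the notifier ops and the quiescing step are placeholders
 * supplied by the imagined caller.
 */
static inline int example_odp_mr_lifetime(struct ib_device *device,
                      unsigned long addr, size_t size,
                      const struct mmu_interval_notifier_ops *ops)
{
    struct ib_umem_odp *umem_odp;

    /* Registration: no pages are pinned, only the VA range is recorded. */
    umem_odp = ib_umem_odp_get(device, addr, size,
                   IB_ACCESS_ON_DEMAND | IB_ACCESS_LOCAL_WRITE,
                   ops);
    if (IS_ERR(umem_odp))
        return PTR_ERR(umem_odp);

    /* ... fault pages on demand and program the device page tables ... */

    /*
     * Deregistration: the device must already be quiesced so that it can
     * no longer access the MR before the umem is torn down.
     */
    ib_umem_odp_release(umem_odp);
    return 0;
}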

/*
 * Map for DMA and insert a single page into the on-demand paging page tables.
 *
 * @umem_odp: the umem to insert the page into.
 * @dma_index: index in the umem to add the dma to.
 * @page: the page struct to map and add.
 * @access_mask: access permissions needed for this page.
 *
 * The function returns -EFAULT if the DMA mapping operation fails.
 */
static int ib_umem_odp_map_dma_single_page(
        struct ib_umem_odp *umem_odp,
        unsigned int dma_index,
        struct page *page,
        u64 access_mask)
{
    struct ib_device *dev = umem_odp->umem.ibdev;
    dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];

    if (*dma_addr) {
        /*
         * If the page is already dma mapped it means it went through
         * a non-invalidating transition, like read-only to writable.
         * Resync the flags.
         */
        *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
        return 0;
    }

    *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
                    DMA_BIDIRECTIONAL);
    if (ib_dma_mapping_error(dev, *dma_addr)) {
        *dma_addr = 0;
        return -EFAULT;
    }
    umem_odp->npages++;
    *dma_addr |= access_mask;
    return 0;
}
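
/*
 * Illustrative sketch (not part of this file): each dma_list entry packs the
 * DMA address of one device page together with ODP_READ_ALLOWED_BIT /
 * ODP_WRITE_ALLOWED_BIT in its low bits, which is why the map and unmap
 * paths mask with ODP_DMA_ADDR_MASK.  The hypothetical helper below only
 * decodes such an entry.
 */
static inline void example_decode_dma_entry(dma_addr_t entry,
                        dma_addr_t *addr,
                        bool *readable, bool *writable)
{
    *addr = entry & ODP_DMA_ADDR_MASK;  /* DMA address bits */
    *readable = !!(entry & ODP_READ_ALLOWED_BIT);
    *writable = !!(entry & ODP_WRITE_ALLOWED_BIT);
}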

/**
 * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it.
 *
 * Maps the range passed in the argument to DMA addresses.
 * The DMA addresses of the mapped pages are updated in umem_odp->dma_list.
 * Upon success the ODP MR will be locked to let the caller complete its
 * device page table update.
 *
 * Returns the number of pages mapped on success, or a negative error code
 * on failure.
 * @umem_odp: the umem to map and pin
 * @user_virt: the address from which we need to map.
 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
 *        bigger due to alignment, and may also be smaller in case of an error
 *        pinning or mapping a page. The actual number of pages mapped is
 *        given by the return value.
 * @access_mask: bit mask of the requested access permissions for the given
 *               range.
 * @fault: is faulting required for the given range
 */
int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
                 u64 bcnt, u64 access_mask, bool fault)
            __acquires(&umem_odp->umem_mutex)
{
    struct task_struct *owning_process  = NULL;
    struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
    int pfn_index, dma_index, ret = 0, start_idx;
    unsigned int page_shift, hmm_order, pfn_start_idx;
    unsigned long num_pfns, current_seq;
    struct hmm_range range = {};
    unsigned long timeout;

    if (access_mask == 0)
        return -EINVAL;

    if (user_virt < ib_umem_start(umem_odp) ||
        user_virt + bcnt > ib_umem_end(umem_odp))
        return -EFAULT;

    page_shift = umem_odp->page_shift;

    /*
     * owning_process is allowed to be NULL; this means the mm is somehow
     * outliving the originating process. Presumably mmget_not_zero will
     * fail in this case.
     */
    owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID);
    if (!owning_process || !mmget_not_zero(owning_mm)) {
        ret = -EINVAL;
        goto out_put_task;
    }

    range.notifier = &umem_odp->notifier;
    range.start = ALIGN_DOWN(user_virt, 1UL << page_shift);
    range.end = ALIGN(user_virt + bcnt, 1UL << page_shift);
    pfn_start_idx = (range.start - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
    num_pfns = (range.end - range.start) >> PAGE_SHIFT;
    if (fault) {
        range.default_flags = HMM_PFN_REQ_FAULT;

        if (access_mask & ODP_WRITE_ALLOWED_BIT)
            range.default_flags |= HMM_PFN_REQ_WRITE;
    }

    range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]);
    timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);

retry:
    current_seq = range.notifier_seq =
        mmu_interval_read_begin(&umem_odp->notifier);

    mmap_read_lock(owning_mm);
    ret = hmm_range_fault(&range);
    mmap_read_unlock(owning_mm);
    if (unlikely(ret)) {
        if (ret == -EBUSY && !time_after(jiffies, timeout))
            goto retry;
        goto out_put_mm;
    }

    start_idx = (range.start - ib_umem_start(umem_odp)) >> page_shift;
    dma_index = start_idx;

    mutex_lock(&umem_odp->umem_mutex);
    if (mmu_interval_read_retry(&umem_odp->notifier, current_seq)) {
        mutex_unlock(&umem_odp->umem_mutex);
        goto retry;
    }

    for (pfn_index = 0; pfn_index < num_pfns;
        pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {

        if (fault) {
            /*
             * Since we asked for hmm_range_fault() to populate
             * pages it shouldn't return an error entry on success.
             */
            WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
            WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
        } else {
            if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
                WARN_ON(umem_odp->dma_list[dma_index]);
                continue;
            }
            access_mask = ODP_READ_ALLOWED_BIT;
            if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
                access_mask |= ODP_WRITE_ALLOWED_BIT;
        }

        hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
        /* If a hugepage was detected and ODP wasn't set for it, the umem
         * page_shift will be used; the opposite case is an error.
         */
        if (hmm_order + PAGE_SHIFT < page_shift) {
            ret = -EINVAL;
            ibdev_dbg(umem_odp->umem.ibdev,
                  "%s: unexpected hmm_order %u, page_shift %u\n",
                  __func__, hmm_order, page_shift);
            break;
        }

        ret = ib_umem_odp_map_dma_single_page(
                umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
                access_mask);
        if (ret < 0) {
            ibdev_dbg(umem_odp->umem.ibdev,
                  "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
            break;
        }
    }
    /* On success the lock stays held so the caller can update the device. */
    if (!ret)
        ret = dma_index - start_idx;
    else
        mutex_unlock(&umem_odp->umem_mutex);

out_put_mm:
    mmput_async(owning_mm);
out_put_task:
    if (owning_process)
        put_task_struct(owning_process);
    return ret;
}
EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
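
/*
 * Illustrative sketch (not part of this file): a driver page-fault handler
 * would typically call ib_umem_odp_map_dma_and_lock(), push the refreshed
 * dma_list entries into its device page tables while umem_mutex is still
 * held, and only then drop the lock.  example_program_device_pagetable() in
 * the comment below is a stand-in for the driver-specific step.
 */
static inline int example_odp_fault(struct ib_umem_odp *umem_odp,
                    u64 user_virt, u64 bcnt)
{
    int npages;

    npages = ib_umem_odp_map_dma_and_lock(umem_odp, user_virt, bcnt,
                          ODP_READ_ALLOWED_BIT |
                          ODP_WRITE_ALLOWED_BIT,
                          true);
    if (npages < 0)
        return npages;

    /*
     * umem_mutex is held here; a real driver would now write the mapped
     * entries from umem_odp->dma_list into its hardware page tables, e.g.
     * example_program_device_pagetable(umem_odp, user_virt, npages);
     */

    mutex_unlock(&umem_odp->umem_mutex);
    return npages;
}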

void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
                 u64 bound)
{
    dma_addr_t dma_addr;
    dma_addr_t dma;
    int idx;
    u64 addr;
    struct ib_device *dev = umem_odp->umem.ibdev;

    lockdep_assert_held(&umem_odp->umem_mutex);

    virt = max_t(u64, virt, ib_umem_start(umem_odp));
    bound = min_t(u64, bound, ib_umem_end(umem_odp));
    for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
        idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
        dma = umem_odp->dma_list[idx];

        /* A non-zero dma_list entry means the page was DMA mapped. */
        if (dma) {
            unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
            struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);

            dma_addr = dma & ODP_DMA_ADDR_MASK;
            ib_dma_unmap_page(dev, dma_addr,
                      BIT(umem_odp->page_shift),
                      DMA_BIDIRECTIONAL);
            if (dma & ODP_WRITE_ALLOWED_BIT) {
                struct page *head_page = compound_head(page);
                /*
                 * set_page_dirty prefers being called with
                 * the page lock. However, MMU notifiers are
                 * called sometimes with and sometimes without
                 * the lock. We rely on the umem_mutex instead
                 * to prevent other mmu notifiers from
                 * continuing and allowing the page mapping to
                 * be removed.
                 */
                set_page_dirty(head_page);
            }
            umem_odp->dma_list[idx] = 0;
            umem_odp->npages--;
        }
    }
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
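
/*
 * Illustrative sketch (not part of this file): the .invalidate callback a
 * driver registers through the mmu_interval_notifier_ops passed to
 * ib_umem_odp_get() is expected to take umem_mutex, bump the notifier
 * sequence, tear down its own device mappings for the overlapping range and
 * then call ib_umem_odp_unmap_dma_pages().  The device-side teardown is only
 * hinted at in a comment; everything here is an assumption about a
 * hypothetical driver, not code from this file.
 */
static inline bool example_odp_invalidate(struct mmu_interval_notifier *mni,
                      const struct mmu_notifier_range *range,
                      unsigned long cur_seq)
{
    struct ib_umem_odp *umem_odp =
        container_of(mni, struct ib_umem_odp, notifier);
    u64 start, end;

    if (!mmu_notifier_range_blockable(range))
        return false;

    mutex_lock(&umem_odp->umem_mutex);
    mmu_interval_set_seq(mni, cur_seq);

    /* Clamp the invalidated range to the umem's VA range. */
    start = max_t(u64, ib_umem_start(umem_odp), range->start);
    end = min_t(u64, ib_umem_end(umem_odp), range->end);

    /* ... a real driver zaps its device page tables for [start, end) ... */

    ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
    mutex_unlock(&umem_odp->umem_mutex);
    return true;
}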