// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/mm.h>
#include "vfio.h"

#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DRIVER_VERSION  "0.1"
#define DRIVER_AUTHOR   "aik@ozlabs.ru"
#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
        struct iommu_group *iommu_group);

/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

struct tce_iommu_group {
    struct list_head next;
    struct iommu_group *grp;
};

/*
 * A container needs to remember which preregistered region it has
 * referenced to do proper cleanup at the userspace process exit.
 */
struct tce_iommu_prereg {
    struct list_head next;
    struct mm_iommu_table_group_mem_t *mem;
};

/*
 * The container descriptor keeps the attached groups in group_list;
 * as required by the API, the container is not supplied with an IOMMU
 * group at the moment of initialization, so groups are attached later.
 */
struct tce_container {
    struct mutex lock;
    bool enabled;
    bool v2;
    bool def_window_pending;
    unsigned long locked_pages;
    struct mm_struct *mm;
    struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
    struct list_head group_list;
    struct list_head prereg_list;
};

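/*
 * Remember the mm of the task that configures the container so that later
 * callers can be checked against it; the reference taken by mmgrab() here
 * is dropped with mmdrop() in tce_iommu_release().
 */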
static long tce_iommu_mm_set(struct tce_container *container)
{
    if (container->mm) {
        if (container->mm == current->mm)
            return 0;
        return -EPERM;
    }
    BUG_ON(!current->mm);
    container->mm = current->mm;
    mmgrab(container->mm);

    return 0;
}

static long tce_iommu_prereg_free(struct tce_container *container,
        struct tce_iommu_prereg *tcemem)
{
    long ret;

    ret = mm_iommu_put(container->mm, tcemem->mem);
    if (ret)
        return ret;

    list_del(&tcemem->next);
    kfree(tcemem);

    return 0;
}

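/*
 * Backend for VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: find the preregistered
 * region matching [vaddr, vaddr + size) on the container's prereg_list and
 * drop the reference this container holds on it.
 */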
static long tce_iommu_unregister_pages(struct tce_container *container,
        __u64 vaddr, __u64 size)
{
    struct mm_iommu_table_group_mem_t *mem;
    struct tce_iommu_prereg *tcemem;
    bool found = false;
    long ret;

    if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
        return -EINVAL;

    mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
    if (!mem)
        return -ENOENT;

    list_for_each_entry(tcemem, &container->prereg_list, next) {
        if (tcemem->mem == mem) {
            found = true;
            break;
        }
    }

    if (!found)
        ret = -ENOENT;
    else
        ret = tce_iommu_prereg_free(container, tcemem);

    mm_iommu_put(container->mm, mem);

    return ret;
}

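/*
 * Backend for VFIO_IOMMU_SPAPR_REGISTER_MEMORY: preregister the userspace
 * range [vaddr, vaddr + size) with mm_iommu_new() (or reuse an existing
 * registration), track it on prereg_list and mark the container enabled.
 */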
static long tce_iommu_register_pages(struct tce_container *container,
        __u64 vaddr, __u64 size)
{
    long ret = 0;
    struct mm_iommu_table_group_mem_t *mem = NULL;
    struct tce_iommu_prereg *tcemem;
    unsigned long entries = size >> PAGE_SHIFT;

    if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
            ((vaddr + size) < vaddr))
        return -EINVAL;

    mem = mm_iommu_get(container->mm, vaddr, entries);
    if (mem) {
        list_for_each_entry(tcemem, &container->prereg_list, next) {
            if (tcemem->mem == mem) {
                ret = -EBUSY;
                goto put_exit;
            }
        }
    } else {
        ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
        if (ret)
            return ret;
    }

    tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
    if (!tcemem) {
        ret = -ENOMEM;
        goto put_exit;
    }

    tcemem->mem = mem;
    list_add(&tcemem->next, &container->prereg_list);

    container->enabled = true;

    return 0;

put_exit:
    mm_iommu_put(container->mm, mem);
    return ret;
}

static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
        unsigned int it_page_shift)
{
    struct page *page;
    unsigned long size = 0;

    if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
        return size == (1UL << it_page_shift);

    page = pfn_to_page(hpa >> PAGE_SHIFT);
    /*
     * Check that the TCE table granularity is not bigger than the size of
     * a page we just found. Otherwise the hardware can get access to
     * a bigger memory chunk than it should.
     */
    return page_shift(compound_head(page)) >= it_page_shift;
}

static inline bool tce_groups_attached(struct tce_container *container)
{
    return !list_empty(&container->group_list);
}

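/*
 * Find the DMA window covering the bus address @ioba: returns the index
 * into container->tables[] and sets *ptbl, or -1 if no window matches.
 */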
static long tce_iommu_find_table(struct tce_container *container,
        phys_addr_t ioba, struct iommu_table **ptbl)
{
    long i;

    for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
        struct iommu_table *tbl = container->tables[i];

        if (tbl) {
            unsigned long entry = ioba >> tbl->it_page_shift;
            unsigned long start = tbl->it_offset;
            unsigned long end = start + tbl->it_size;

            if ((start <= entry) && (entry < end)) {
                *ptbl = tbl;
                return i;
            }
        }
    }

    return -1;
}

static int tce_iommu_find_free_table(struct tce_container *container)
{
    int i;

    for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
        if (!container->tables[i])
            return i;
    }

    return -ENOSPC;
}

static int tce_iommu_enable(struct tce_container *container)
{
    int ret = 0;
    unsigned long locked;
    struct iommu_table_group *table_group;
    struct tce_iommu_group *tcegrp;

    if (container->enabled)
        return -EBUSY;

    /*
     * When userspace pages are mapped into the IOMMU, they are effectively
     * locked memory, so, theoretically, we need to update the accounting
     * of locked pages on each map and unmap.  For powerpc, the map/unmap
     * paths can be very hot, though, and the accounting would kill
     * performance, especially since it would be difficult, if not
     * impossible, to handle the accounting in real mode only.
     *
     * To address that, rather than precisely accounting every page, we
     * instead account for a worst case on locked memory when the iommu is
     * enabled and disabled.  The worst case upper bound on locked memory
     * is the size of the whole iommu window, which is usually relatively
     * small (compared to total memory sizes) on POWER hardware.
     *
     * Also, we don't have a nice way to fail on H_PUT_TCE due to ulimits;
     * that would effectively kill the guest at random points, so it is
     * much better to enforce the limit based on the maximum that the guest
     * can map.
     *
     * Unfortunately, at the moment this counts whole tables, no matter how
     * much memory the guest has. For example, for a 4GB guest and 4 IOMMU
     * groups, each with a 2GB DMA window, 8GB will be counted here. The
     * reason for this is that we cannot tell here how much RAM the guest
     * uses, as this information is only available from KVM and VFIO is
     * KVM agnostic.
     *
     * So we do not allow enabling a container without a group attached,
     * as there is no way to know how much we should increment
     * the locked_vm counter.
     */
    if (!tce_groups_attached(container))
        return -ENODEV;

    tcegrp = list_first_entry(&container->group_list,
            struct tce_iommu_group, next);
    table_group = iommu_group_get_iommudata(tcegrp->grp);
    if (!table_group)
        return -ENODEV;

    if (!table_group->tce32_size)
        return -EPERM;

    ret = tce_iommu_mm_set(container);
    if (ret)
        return ret;

    locked = table_group->tce32_size >> PAGE_SHIFT;
    ret = account_locked_vm(container->mm, locked, true);
    if (ret)
        return ret;

    container->locked_pages = locked;

    container->enabled = true;

    return ret;
}

static void tce_iommu_disable(struct tce_container *container)
{
    if (!container->enabled)
        return;

    container->enabled = false;

    BUG_ON(!container->mm);
    account_locked_vm(container->mm, container->locked_pages, false);
}

static void *tce_iommu_open(unsigned long arg)
{
    struct tce_container *container;

    if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
        pr_err("tce_vfio: Wrong IOMMU type\n");
        return ERR_PTR(-EINVAL);
    }

    container = kzalloc(sizeof(*container), GFP_KERNEL);
    if (!container)
        return ERR_PTR(-ENOMEM);

    mutex_init(&container->lock);
    INIT_LIST_HEAD_RCU(&container->group_list);
    INIT_LIST_HEAD_RCU(&container->prereg_list);

    container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

    return container;
}

static int tce_iommu_clear(struct tce_container *container,
        struct iommu_table *tbl,
        unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct tce_container *container,
        struct iommu_table *tbl);

static void tce_iommu_release(void *iommu_data)
{
    struct tce_container *container = iommu_data;
    struct tce_iommu_group *tcegrp;
    struct tce_iommu_prereg *tcemem, *tmtmp;
    long i;

    while (tce_groups_attached(container)) {
        tcegrp = list_first_entry(&container->group_list,
                struct tce_iommu_group, next);
        tce_iommu_detach_group(iommu_data, tcegrp->grp);
    }

    /*
     * If VFIO created a table, it was not disposed
     * by tce_iommu_detach_group() so do it now.
     */
    for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
        struct iommu_table *tbl = container->tables[i];

        if (!tbl)
            continue;

        tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
        tce_iommu_free_table(container, tbl);
    }

    list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
        WARN_ON(tce_iommu_prereg_free(container, tcemem));

    tce_iommu_disable(container);
    if (container->mm)
        mmdrop(container->mm);
    mutex_destroy(&container->lock);

    kfree(container);
}

static void tce_iommu_unuse_page(unsigned long hpa)
{
    struct page *page;

    page = pfn_to_page(hpa >> PAGE_SHIFT);
    unpin_user_page(page);
}

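/*
 * Translate a userspace address from a TCE into a host physical address
 * using the preregistered memory of the container's mm; fails if the
 * address has not been preregistered.
 */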
static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
        unsigned long tce, unsigned long shift,
        unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
    long ret = 0;
    struct mm_iommu_table_group_mem_t *mem;

    mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
    if (!mem)
        return -EINVAL;

    ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
    if (ret)
        return -EINVAL;

    *pmem = mem;

    return 0;
}

static void tce_iommu_unuse_page_v2(struct tce_container *container,
        struct iommu_table *tbl, unsigned long entry)
{
    struct mm_iommu_table_group_mem_t *mem = NULL;
    int ret;
    unsigned long hpa = 0;
    __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);

    if (!pua)
        return;

    ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
            tbl->it_page_shift, &hpa, &mem);
    if (ret)
        pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
                __func__, be64_to_cpu(*pua), entry, ret);
    if (mem)
        mm_iommu_mapped_dec(mem);

    *pua = cpu_to_be64(0);
}

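/*
 * Clear @pages TCE entries starting at @entry: each entry is exchanged
 * with an empty one, and the page it referenced is either unpinned (v1)
 * or has its preregistered region's mapped counter decremented (v2).
 */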
static int tce_iommu_clear(struct tce_container *container,
        struct iommu_table *tbl,
        unsigned long entry, unsigned long pages)
{
    unsigned long oldhpa;
    long ret;
    enum dma_data_direction direction;
    unsigned long lastentry = entry + pages, firstentry = entry;

    for ( ; entry < lastentry; ++entry) {
        if (tbl->it_indirect_levels && tbl->it_userspace) {
            /*
             * For multilevel tables, we can take a shortcut here
             * and skip some TCEs as we know that the userspace
             * addresses cache is a mirror of the real TCE table
             * and if it is missing some indirect levels, then
             * the hardware table does not have them allocated
             * either and therefore does not require updating.
             */
            __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
                    entry);
            if (!pua) {
                /* align to level_size which is a power of two */
                entry |= tbl->it_level_size - 1;
                continue;
            }
        }

        cond_resched();

        direction = DMA_NONE;
        oldhpa = 0;
        ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
                &direction);
        if (ret)
            continue;

        if (direction == DMA_NONE)
            continue;

        if (container->v2) {
            tce_iommu_unuse_page_v2(container, tbl, entry);
            continue;
        }

        tce_iommu_unuse_page(oldhpa);
    }

    iommu_tce_kill(tbl, firstentry, pages);

    return 0;
}

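/*
 * Pin the single userspace page backing @tce and return its host physical
 * address; the pin is dropped again by tce_iommu_unuse_page().
 */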
static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
    struct page *page = NULL;
    enum dma_data_direction direction = iommu_tce_direction(tce);

    if (pin_user_pages_fast(tce & PAGE_MASK, 1,
            direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
            &page) != 1)
        return -EFAULT;

    *hpa = __pa((unsigned long) page_address(page));

    return 0;
}

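/*
 * v1 mapping path: for each IOMMU page, pin the userspace page, check it is
 * large enough to back a TCE of it_page_shift, and program the entry.
 * On failure, entries programmed so far are cleared again.
 */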
static long tce_iommu_build(struct tce_container *container,
        struct iommu_table *tbl,
        unsigned long entry, unsigned long tce, unsigned long pages,
        enum dma_data_direction direction)
{
    long i, ret = 0;
    unsigned long hpa;
    enum dma_data_direction dirtmp;

    for (i = 0; i < pages; ++i) {
        unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

        ret = tce_iommu_use_page(tce, &hpa);
        if (ret)
            break;

        if (!tce_page_is_contained(container->mm, hpa,
                tbl->it_page_shift)) {
            ret = -EPERM;
            break;
        }

        hpa |= offset;
        dirtmp = direction;
        ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
                &hpa, &dirtmp);
        if (ret) {
            tce_iommu_unuse_page(hpa);
            pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
                    __func__, entry << tbl->it_page_shift,
                    tce, ret);
            break;
        }

        if (dirtmp != DMA_NONE)
            tce_iommu_unuse_page(hpa);

        tce += IOMMU_PAGE_SIZE(tbl);
    }

    if (ret)
        tce_iommu_clear(container, tbl, entry, i);
    else
        iommu_tce_kill(tbl, entry, pages);

    return ret;
}

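/*
 * v2 mapping path: userspace addresses must have been preregistered, so no
 * pinning happens here. Each TCE is translated via the preregistered memory
 * list, the region's mapped counter is incremented and the userspace address
 * is cached in the table so it can be found again at unmap time.
 */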
static long tce_iommu_build_v2(struct tce_container *container,
        struct iommu_table *tbl,
        unsigned long entry, unsigned long tce, unsigned long pages,
        enum dma_data_direction direction)
{
    long i, ret = 0;
    unsigned long hpa;
    enum dma_data_direction dirtmp;

    for (i = 0; i < pages; ++i) {
        struct mm_iommu_table_group_mem_t *mem = NULL;
        __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);

        ret = tce_iommu_prereg_ua_to_hpa(container,
                tce, tbl->it_page_shift, &hpa, &mem);
        if (ret)
            break;

        if (!tce_page_is_contained(container->mm, hpa,
                tbl->it_page_shift)) {
            ret = -EPERM;
            break;
        }

        /* Preserve offset within IOMMU page */
        hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
        dirtmp = direction;

        /* The registered region is being unregistered */
        if (mm_iommu_mapped_inc(mem))
            break;

        ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
                &hpa, &dirtmp);
        if (ret) {
            /* dirtmp cannot be DMA_NONE here */
            tce_iommu_unuse_page_v2(container, tbl, entry + i);
            pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
                    __func__, entry << tbl->it_page_shift,
                    tce, ret);
            break;
        }

        if (dirtmp != DMA_NONE)
            tce_iommu_unuse_page_v2(container, tbl, entry + i);

        *pua = cpu_to_be64(tce);

        tce += IOMMU_PAGE_SIZE(tbl);
    }

    if (ret)
        tce_iommu_clear(container, tbl, entry, i);
    else
        iommu_tce_kill(tbl, entry, pages);

    return ret;
}

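/*
 * Ask the platform for a new TCE table: the expected table size is charged
 * against locked_vm first, then ops->create_table() allocates the table.
 */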
static long tce_iommu_create_table(struct tce_container *container,
            struct iommu_table_group *table_group,
            int num,
            __u32 page_shift,
            __u64 window_size,
            __u32 levels,
            struct iommu_table **ptbl)
{
    long ret, table_size;

    table_size = table_group->ops->get_table_size(page_shift, window_size,
            levels);
    if (!table_size)
        return -EINVAL;

    ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
    if (ret)
        return ret;

    ret = table_group->ops->create_table(table_group, num,
            page_shift, window_size, levels, ptbl);

    WARN_ON(!ret && !(*ptbl)->it_ops->free);
    WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));

    return ret;
}

static void tce_iommu_free_table(struct tce_container *container,
        struct iommu_table *tbl)
{
    unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

    iommu_tce_table_put(tbl);
    account_locked_vm(container->mm, pages, false);
}

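/*
 * Backend for VFIO_IOMMU_SPAPR_TCE_CREATE: allocate a table in a free slot,
 * program it into every attached group and return the bus address of the
 * new window via @start_addr.
 */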
static long tce_iommu_create_window(struct tce_container *container,
        __u32 page_shift, __u64 window_size, __u32 levels,
        __u64 *start_addr)
{
    struct tce_iommu_group *tcegrp;
    struct iommu_table_group *table_group;
    struct iommu_table *tbl = NULL;
    long ret, num;

    num = tce_iommu_find_free_table(container);
    if (num < 0)
        return num;

    /* Get the first group for ops::create_table */
    tcegrp = list_first_entry(&container->group_list,
            struct tce_iommu_group, next);
    table_group = iommu_group_get_iommudata(tcegrp->grp);
    if (!table_group)
        return -EFAULT;

    if (!(table_group->pgsizes & (1ULL << page_shift)))
        return -EINVAL;

    if (!table_group->ops->set_window || !table_group->ops->unset_window ||
            !table_group->ops->get_table_size ||
            !table_group->ops->create_table)
        return -EPERM;

    /* Create TCE table */
    ret = tce_iommu_create_table(container, table_group, num,
            page_shift, window_size, levels, &tbl);
    if (ret)
        return ret;

    BUG_ON(!tbl->it_ops->free);

    /*
     * Program the table to every group.
     * Groups have been tested for compatibility at the attach time.
     */
    list_for_each_entry(tcegrp, &container->group_list, next) {
        table_group = iommu_group_get_iommudata(tcegrp->grp);

        ret = table_group->ops->set_window(table_group, num, tbl);
        if (ret)
            goto unset_exit;
    }

    container->tables[num] = tbl;

    /* Return start address assigned by platform in create_table() */
    *start_addr = tbl->it_offset << tbl->it_page_shift;

    return 0;

unset_exit:
    list_for_each_entry(tcegrp, &container->group_list, next) {
        table_group = iommu_group_get_iommudata(tcegrp->grp);
        table_group->ops->unset_window(table_group, num);
    }
    tce_iommu_free_table(container, tbl);

    return ret;
}

static long tce_iommu_remove_window(struct tce_container *container,
        __u64 start_addr)
{
    struct iommu_table_group *table_group = NULL;
    struct iommu_table *tbl;
    struct tce_iommu_group *tcegrp;
    int num;

    num = tce_iommu_find_table(container, start_addr, &tbl);
    if (num < 0)
        return -EINVAL;

    BUG_ON(!tbl->it_size);

    /* Detach groups from IOMMUs */
    list_for_each_entry(tcegrp, &container->group_list, next) {
        table_group = iommu_group_get_iommudata(tcegrp->grp);

        /*
         * SPAPR TCE IOMMU exposes the default DMA window to
         * the guest via dma32_window_start/size of
         * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
         * userspace to remove this window and some do not, so
         * here we check for the platform capability.
         */
        if (!table_group->ops || !table_group->ops->unset_window)
            return -EPERM;

        table_group->ops->unset_window(table_group, num);
    }

    /* Free table */
    tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
    tce_iommu_free_table(container, tbl);
    container->tables[num] = NULL;

    return 0;
}

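/*
 * If creation of the default 32-bit DMA window is still pending (it is
 * marked pending when a DDW-capable group is attached to an empty
 * container), create it now: 4K IOMMU pages, a single level and the
 * platform-reported tce32 size.
 */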
static long tce_iommu_create_default_window(struct tce_container *container)
{
    long ret;
    __u64 start_addr = 0;
    struct tce_iommu_group *tcegrp;
    struct iommu_table_group *table_group;

    if (!container->def_window_pending)
        return 0;

    if (!tce_groups_attached(container))
        return -ENODEV;

    tcegrp = list_first_entry(&container->group_list,
            struct tce_iommu_group, next);
    table_group = iommu_group_get_iommudata(tcegrp->grp);
    if (!table_group)
        return -ENODEV;

    ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
            table_group->tce32_size, 1, &start_addr);
    WARN_ON_ONCE(!ret && start_addr);

    if (!ret)
        container->def_window_pending = false;

    return ret;
}

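/*
 * The ioctl backend for the container fd. As a minimal sketch of how
 * userspace typically drives this interface (container/group plumbing such
 * as VFIO_SET_IOMMU is handled by the VFIO core, not here), a v2 user would:
 *
 *     ioctl(container, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU);
 *     ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *     ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
 *     ioctl(container, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);  (optional,
 *                     a default window is created lazily on the first map)
 *     ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 *     ...
 *     ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);
 *     ioctl(container, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
 *
 * while a v1 user skips the register/unregister and TCE_CREATE steps and
 * brackets its mappings with VFIO_IOMMU_ENABLE/VFIO_IOMMU_DISABLE instead.
 * The struct arguments are abbreviated placeholders.
 */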
static long tce_iommu_ioctl(void *iommu_data,
                 unsigned int cmd, unsigned long arg)
{
    struct tce_container *container = iommu_data;
    unsigned long minsz, ddwsz;
    long ret;

    switch (cmd) {
    case VFIO_CHECK_EXTENSION:
        switch (arg) {
        case VFIO_SPAPR_TCE_IOMMU:
        case VFIO_SPAPR_TCE_v2_IOMMU:
            ret = 1;
            break;
        default:
            ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
            break;
        }

        return (ret < 0) ? 0 : ret;
    }

    /*
     * Sanity check to prevent one userspace process from manipulating
     * another process's mm.
     */
    BUG_ON(!container);
    if (container->mm && container->mm != current->mm)
        return -EPERM;

    switch (cmd) {
    case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
        struct vfio_iommu_spapr_tce_info info;
        struct tce_iommu_group *tcegrp;
        struct iommu_table_group *table_group;

        if (!tce_groups_attached(container))
            return -ENXIO;

        tcegrp = list_first_entry(&container->group_list,
                struct tce_iommu_group, next);
        table_group = iommu_group_get_iommudata(tcegrp->grp);

        if (!table_group)
            return -ENXIO;

        minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
                dma32_window_size);

        if (copy_from_user(&info, (void __user *)arg, minsz))
            return -EFAULT;

        if (info.argsz < minsz)
            return -EINVAL;

        info.dma32_window_start = table_group->tce32_start;
        info.dma32_window_size = table_group->tce32_size;
        info.flags = 0;
        memset(&info.ddw, 0, sizeof(info.ddw));

        if (table_group->max_dynamic_windows_supported &&
                container->v2) {
            info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
            info.ddw.pgsizes = table_group->pgsizes;
            info.ddw.max_dynamic_windows_supported =
                table_group->max_dynamic_windows_supported;
            info.ddw.levels = table_group->max_levels;
        }

        ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

        if (info.argsz >= ddwsz)
            minsz = ddwsz;

        if (copy_to_user((void __user *)arg, &info, minsz))
            return -EFAULT;

        return 0;
    }
    case VFIO_IOMMU_MAP_DMA: {
        struct vfio_iommu_type1_dma_map param;
        struct iommu_table *tbl = NULL;
        long num;
        enum dma_data_direction direction;

        if (!container->enabled)
            return -EPERM;

        minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

        if (copy_from_user(&param, (void __user *)arg, minsz))
            return -EFAULT;

        if (param.argsz < minsz)
            return -EINVAL;

        if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
                VFIO_DMA_MAP_FLAG_WRITE))
            return -EINVAL;

        ret = tce_iommu_create_default_window(container);
        if (ret)
            return ret;

        num = tce_iommu_find_table(container, param.iova, &tbl);
        if (num < 0)
            return -ENXIO;

        if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
                (param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
            return -EINVAL;

        /* iova is checked by the IOMMU API */
        if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
            if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
                direction = DMA_BIDIRECTIONAL;
            else
                direction = DMA_TO_DEVICE;
        } else {
            if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
                direction = DMA_FROM_DEVICE;
            else
                return -EINVAL;
        }

        ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
        if (ret)
            return ret;

        if (container->v2)
            ret = tce_iommu_build_v2(container, tbl,
                    param.iova >> tbl->it_page_shift,
                    param.vaddr,
                    param.size >> tbl->it_page_shift,
                    direction);
        else
            ret = tce_iommu_build(container, tbl,
                    param.iova >> tbl->it_page_shift,
                    param.vaddr,
                    param.size >> tbl->it_page_shift,
                    direction);

        iommu_flush_tce(tbl);

        return ret;
    }
    case VFIO_IOMMU_UNMAP_DMA: {
        struct vfio_iommu_type1_dma_unmap param;
        struct iommu_table *tbl = NULL;
        long num;

        if (!container->enabled)
            return -EPERM;

        minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
                size);

        if (copy_from_user(&param, (void __user *)arg, minsz))
            return -EFAULT;

        if (param.argsz < minsz)
            return -EINVAL;

        /* No flag is supported now */
        if (param.flags)
            return -EINVAL;

        ret = tce_iommu_create_default_window(container);
        if (ret)
            return ret;

        num = tce_iommu_find_table(container, param.iova, &tbl);
        if (num < 0)
            return -ENXIO;

        if (param.size & ~IOMMU_PAGE_MASK(tbl))
            return -EINVAL;

        ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
                param.size >> tbl->it_page_shift);
        if (ret)
            return ret;

        ret = tce_iommu_clear(container, tbl,
                param.iova >> tbl->it_page_shift,
                param.size >> tbl->it_page_shift);
        iommu_flush_tce(tbl);

        return ret;
    }
    case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
        struct vfio_iommu_spapr_register_memory param;

        if (!container->v2)
            break;

        minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
                size);

        ret = tce_iommu_mm_set(container);
        if (ret)
            return ret;

        if (copy_from_user(&param, (void __user *)arg, minsz))
            return -EFAULT;

        if (param.argsz < minsz)
            return -EINVAL;

        /* No flag is supported now */
        if (param.flags)
            return -EINVAL;

        mutex_lock(&container->lock);
        ret = tce_iommu_register_pages(container, param.vaddr,
                param.size);
        mutex_unlock(&container->lock);

        return ret;
    }
    case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
        struct vfio_iommu_spapr_register_memory param;

        if (!container->v2)
            break;

        if (!container->mm)
            return -EPERM;

        minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
                size);

        if (copy_from_user(&param, (void __user *)arg, minsz))
            return -EFAULT;

        if (param.argsz < minsz)
            return -EINVAL;

        /* No flag is supported now */
        if (param.flags)
            return -EINVAL;

        mutex_lock(&container->lock);
        ret = tce_iommu_unregister_pages(container, param.vaddr,
                param.size);
        mutex_unlock(&container->lock);

        return ret;
    }
    case VFIO_IOMMU_ENABLE:
        if (container->v2)
            break;

        mutex_lock(&container->lock);
        ret = tce_iommu_enable(container);
        mutex_unlock(&container->lock);
        return ret;


    case VFIO_IOMMU_DISABLE:
        if (container->v2)
            break;

        mutex_lock(&container->lock);
        tce_iommu_disable(container);
        mutex_unlock(&container->lock);
        return 0;

    case VFIO_EEH_PE_OP: {
        struct tce_iommu_group *tcegrp;

        ret = 0;
        list_for_each_entry(tcegrp, &container->group_list, next) {
            ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
                    cmd, arg);
            if (ret)
                return ret;
        }
        return ret;
    }

    case VFIO_IOMMU_SPAPR_TCE_CREATE: {
        struct vfio_iommu_spapr_tce_create create;

        if (!container->v2)
            break;

        ret = tce_iommu_mm_set(container);
        if (ret)
            return ret;

        if (!tce_groups_attached(container))
            return -ENXIO;

        minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
                start_addr);

        if (copy_from_user(&create, (void __user *)arg, minsz))
            return -EFAULT;

        if (create.argsz < minsz)
            return -EINVAL;

        if (create.flags)
            return -EINVAL;

        mutex_lock(&container->lock);

        ret = tce_iommu_create_default_window(container);
        if (!ret)
            ret = tce_iommu_create_window(container,
                    create.page_shift,
                    create.window_size, create.levels,
                    &create.start_addr);

        mutex_unlock(&container->lock);

        if (!ret && copy_to_user((void __user *)arg, &create, minsz))
            ret = -EFAULT;

        return ret;
    }
    case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
        struct vfio_iommu_spapr_tce_remove remove;

        if (!container->v2)
            break;

        ret = tce_iommu_mm_set(container);
        if (ret)
            return ret;

        if (!tce_groups_attached(container))
            return -ENXIO;

        minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
                start_addr);

        if (copy_from_user(&remove, (void __user *)arg, minsz))
            return -EFAULT;

        if (remove.argsz < minsz)
            return -EINVAL;

        if (remove.flags)
            return -EINVAL;

        if (container->def_window_pending && !remove.start_addr) {
            container->def_window_pending = false;
            return 0;
        }

        mutex_lock(&container->lock);

        ret = tce_iommu_remove_window(container, remove.start_addr);

        mutex_unlock(&container->lock);

        return ret;
    }
    }

    return -ENOTTY;
}

static void tce_iommu_release_ownership(struct tce_container *container,
        struct iommu_table_group *table_group)
{
    int i;

    for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
        struct iommu_table *tbl = container->tables[i];

        if (!tbl)
            continue;

        tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
        if (tbl->it_map)
            iommu_release_ownership(tbl);

        container->tables[i] = NULL;
    }
}

static int tce_iommu_take_ownership(struct tce_container *container,
        struct iommu_table_group *table_group)
{
    int i, j, rc = 0;

    for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
        struct iommu_table *tbl = table_group->tables[i];

        if (!tbl || !tbl->it_map)
            continue;

        rc = iommu_take_ownership(tbl);
        if (rc) {
            for (j = 0; j < i; ++j)
                iommu_release_ownership(
                        table_group->tables[j]);

            return rc;
        }
    }

    for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
        container->tables[i] = table_group->tables[i];

    return 0;
}

static void tce_iommu_release_ownership_ddw(struct tce_container *container,
        struct iommu_table_group *table_group)
{
    long i;

    if (!table_group->ops->unset_window) {
        WARN_ON_ONCE(1);
        return;
    }

    for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
        if (container->tables[i])
            table_group->ops->unset_window(table_group, i);

    table_group->ops->release_ownership(table_group);
}

static long tce_iommu_take_ownership_ddw(struct tce_container *container,
        struct iommu_table_group *table_group)
{
    long i, ret = 0;

    if (!table_group->ops->create_table || !table_group->ops->set_window ||
            !table_group->ops->release_ownership) {
        WARN_ON_ONCE(1);
        return -EFAULT;
    }

    table_group->ops->take_ownership(table_group);

    /* Set all windows to the new group */
    for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
        struct iommu_table *tbl = container->tables[i];

        if (!tbl)
            continue;

        ret = table_group->ops->set_window(table_group, i, tbl);
        if (ret)
            goto release_exit;
    }

    return 0;

release_exit:
    for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
        table_group->ops->unset_window(table_group, i);

    table_group->ops->release_ownership(table_group);

    return ret;
}

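/*
 * Attach an IOMMU group to the container. Additional groups must be
 * compatible with the ones already attached (same create_table callback).
 * Groups without take/release_ownership ops are only accepted by v1
 * containers; DDW-capable groups are only accepted by v2 containers.
 */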
static int tce_iommu_attach_group(void *iommu_data,
        struct iommu_group *iommu_group, enum vfio_group_type type)
{
    int ret = 0;
    struct tce_container *container = iommu_data;
    struct iommu_table_group *table_group;
    struct tce_iommu_group *tcegrp = NULL;

    if (type == VFIO_EMULATED_IOMMU)
        return -EINVAL;

    mutex_lock(&container->lock);

    /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
            iommu_group_id(iommu_group), iommu_group); */
    table_group = iommu_group_get_iommudata(iommu_group);
    if (!table_group) {
        ret = -ENODEV;
        goto unlock_exit;
    }

    if (tce_groups_attached(container) && (!table_group->ops ||
            !table_group->ops->take_ownership ||
            !table_group->ops->release_ownership)) {
        ret = -EBUSY;
        goto unlock_exit;
    }

    /*
     * Check if new group has the same iommu_table_group_ops
     * (i.e. compatible)
     */
    list_for_each_entry(tcegrp, &container->group_list, next) {
        struct iommu_table_group *table_group_tmp;

        if (tcegrp->grp == iommu_group) {
            pr_warn("tce_vfio: Group %d is already attached\n",
                    iommu_group_id(iommu_group));
            ret = -EBUSY;
            goto unlock_exit;
        }
        table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
        if (table_group_tmp->ops->create_table !=
                table_group->ops->create_table) {
            pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
                    iommu_group_id(iommu_group),
                    iommu_group_id(tcegrp->grp));
            ret = -EPERM;
            goto unlock_exit;
        }
    }

    tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
    if (!tcegrp) {
        ret = -ENOMEM;
        goto unlock_exit;
    }

    if (!table_group->ops || !table_group->ops->take_ownership ||
            !table_group->ops->release_ownership) {
        if (container->v2) {
            ret = -EPERM;
            goto free_exit;
        }
        ret = tce_iommu_take_ownership(container, table_group);
    } else {
        if (!container->v2) {
            ret = -EPERM;
            goto free_exit;
        }
        ret = tce_iommu_take_ownership_ddw(container, table_group);
        if (!tce_groups_attached(container) && !container->tables[0])
            container->def_window_pending = true;
    }

    if (!ret) {
        tcegrp->grp = iommu_group;
        list_add(&tcegrp->next, &container->group_list);
    }

free_exit:
    if (ret && tcegrp)
        kfree(tcegrp);

unlock_exit:
    mutex_unlock(&container->lock);

    return ret;
}

static void tce_iommu_detach_group(void *iommu_data,
        struct iommu_group *iommu_group)
{
    struct tce_container *container = iommu_data;
    struct iommu_table_group *table_group;
    bool found = false;
    struct tce_iommu_group *tcegrp;

    mutex_lock(&container->lock);

    list_for_each_entry(tcegrp, &container->group_list, next) {
        if (tcegrp->grp == iommu_group) {
            found = true;
            break;
        }
    }

    if (!found) {
        pr_warn("tce_vfio: detaching unattached group #%u\n",
                iommu_group_id(iommu_group));
        goto unlock_exit;
    }

    list_del(&tcegrp->next);
    kfree(tcegrp);

    table_group = iommu_group_get_iommudata(iommu_group);
    BUG_ON(!table_group);

    if (!table_group->ops || !table_group->ops->release_ownership)
        tce_iommu_release_ownership(container, table_group);
    else
        tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
    mutex_unlock(&container->lock);
}

static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
    .name       = "iommu-vfio-powerpc",
    .owner      = THIS_MODULE,
    .open       = tce_iommu_open,
    .release    = tce_iommu_release,
    .ioctl      = tce_iommu_ioctl,
    .attach_group   = tce_iommu_attach_group,
    .detach_group   = tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
    return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
    vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);