Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * VFIO core
0004  *
0005  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
0006  *     Author: Alex Williamson <alex.williamson@redhat.com>
0007  *
0008  * Derived from original vfio:
0009  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
0010  * Author: Tom Lyon, pugs@cisco.com
0011  */
0012 
0013 #include <linux/cdev.h>
0014 #include <linux/compat.h>
0015 #include <linux/device.h>
0016 #include <linux/file.h>
0017 #include <linux/anon_inodes.h>
0018 #include <linux/fs.h>
0019 #include <linux/idr.h>
0020 #include <linux/iommu.h>
0021 #include <linux/list.h>
0022 #include <linux/miscdevice.h>
0023 #include <linux/module.h>
0024 #include <linux/mutex.h>
0025 #include <linux/pci.h>
0026 #include <linux/rwsem.h>
0027 #include <linux/sched.h>
0028 #include <linux/slab.h>
0029 #include <linux/stat.h>
0030 #include <linux/string.h>
0031 #include <linux/uaccess.h>
0032 #include <linux/vfio.h>
0033 #include <linux/wait.h>
0034 #include <linux/sched/signal.h>
0035 #include "vfio.h"
0036 
0037 #define DRIVER_VERSION  "0.3"
0038 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
0039 #define DRIVER_DESC "VFIO - User Level meta-driver"
0040 
0041 static struct vfio {
0042     struct class            *class;
0043     struct list_head        iommu_drivers_list;
0044     struct mutex            iommu_drivers_lock;
0045     struct list_head        group_list;
0046     struct mutex            group_lock; /* locks group_list */
0047     struct ida          group_ida;
0048     dev_t               group_devt;
0049 } vfio;
0050 
0051 struct vfio_iommu_driver {
0052     const struct vfio_iommu_driver_ops  *ops;
0053     struct list_head            vfio_next;
0054 };
0055 
0056 struct vfio_container {
0057     struct kref         kref;
0058     struct list_head        group_list;
0059     struct rw_semaphore     group_lock;
0060     struct vfio_iommu_driver    *iommu_driver;
0061     void                *iommu_data;
0062     bool                noiommu;
0063 };
0064 
0065 struct vfio_group {
0066     struct device           dev;
0067     struct cdev         cdev;
0068     refcount_t          users;
0069     unsigned int            container_users;
0070     struct iommu_group      *iommu_group;
0071     struct vfio_container       *container;
0072     struct list_head        device_list;
0073     struct mutex            device_lock;
0074     struct list_head        vfio_next;
0075     struct list_head        container_next;
0076     enum vfio_group_type        type;
0077     unsigned int            dev_counter;
0078     struct rw_semaphore     group_rwsem;
0079     struct kvm          *kvm;
0080     struct file         *opened_file;
0081     struct blocking_notifier_head   notifier;
0082 };
0083 
0084 #ifdef CONFIG_VFIO_NOIOMMU
0085 static bool noiommu __read_mostly;
0086 module_param_named(enable_unsafe_noiommu_mode,
0087            noiommu, bool, S_IRUGO | S_IWUSR);
0088 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
0089 #endif
0090 
0091 static DEFINE_XARRAY(vfio_device_set_xa);
0092 static const struct file_operations vfio_group_fops;
0093 
0094 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
0095 {
0096     unsigned long idx = (unsigned long)set_id;
0097     struct vfio_device_set *new_dev_set;
0098     struct vfio_device_set *dev_set;
0099 
0100     if (WARN_ON(!set_id))
0101         return -EINVAL;
0102 
0103     /*
0104      * Atomically acquire a singleton object in the xarray for this set_id
0105      */
0106     xa_lock(&vfio_device_set_xa);
0107     dev_set = xa_load(&vfio_device_set_xa, idx);
0108     if (dev_set)
0109         goto found_get_ref;
0110     xa_unlock(&vfio_device_set_xa);
0111 
0112     new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
0113     if (!new_dev_set)
0114         return -ENOMEM;
0115     mutex_init(&new_dev_set->lock);
0116     INIT_LIST_HEAD(&new_dev_set->device_list);
0117     new_dev_set->set_id = set_id;
0118 
0119     xa_lock(&vfio_device_set_xa);
0120     dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
0121                    GFP_KERNEL);
0122     if (!dev_set) {
0123         dev_set = new_dev_set;
0124         goto found_get_ref;
0125     }
0126 
0127     kfree(new_dev_set);
0128     if (xa_is_err(dev_set)) {
0129         xa_unlock(&vfio_device_set_xa);
0130         return xa_err(dev_set);
0131     }
0132 
0133 found_get_ref:
0134     dev_set->device_count++;
0135     xa_unlock(&vfio_device_set_xa);
0136     mutex_lock(&dev_set->lock);
0137     device->dev_set = dev_set;
0138     list_add_tail(&device->dev_set_list, &dev_set->device_list);
0139     mutex_unlock(&dev_set->lock);
0140     return 0;
0141 }
0142 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
0143 
0144 static void vfio_release_device_set(struct vfio_device *device)
0145 {
0146     struct vfio_device_set *dev_set = device->dev_set;
0147 
0148     if (!dev_set)
0149         return;
0150 
0151     mutex_lock(&dev_set->lock);
0152     list_del(&device->dev_set_list);
0153     mutex_unlock(&dev_set->lock);
0154 
0155     xa_lock(&vfio_device_set_xa);
0156     if (!--dev_set->device_count) {
0157         __xa_erase(&vfio_device_set_xa,
0158                (unsigned long)dev_set->set_id);
0159         mutex_destroy(&dev_set->lock);
0160         kfree(dev_set);
0161     }
0162     xa_unlock(&vfio_device_set_xa);
0163 }
0164 
0165 #ifdef CONFIG_VFIO_NOIOMMU
0166 static void *vfio_noiommu_open(unsigned long arg)
0167 {
0168     if (arg != VFIO_NOIOMMU_IOMMU)
0169         return ERR_PTR(-EINVAL);
0170     if (!capable(CAP_SYS_RAWIO))
0171         return ERR_PTR(-EPERM);
0172 
0173     return NULL;
0174 }
0175 
0176 static void vfio_noiommu_release(void *iommu_data)
0177 {
0178 }
0179 
0180 static long vfio_noiommu_ioctl(void *iommu_data,
0181                    unsigned int cmd, unsigned long arg)
0182 {
0183     if (cmd == VFIO_CHECK_EXTENSION)
0184         return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
0185 
0186     return -ENOTTY;
0187 }
0188 
0189 static int vfio_noiommu_attach_group(void *iommu_data,
0190         struct iommu_group *iommu_group, enum vfio_group_type type)
0191 {
0192     return 0;
0193 }
0194 
0195 static void vfio_noiommu_detach_group(void *iommu_data,
0196                       struct iommu_group *iommu_group)
0197 {
0198 }
0199 
0200 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
0201     .name = "vfio-noiommu",
0202     .owner = THIS_MODULE,
0203     .open = vfio_noiommu_open,
0204     .release = vfio_noiommu_release,
0205     .ioctl = vfio_noiommu_ioctl,
0206     .attach_group = vfio_noiommu_attach_group,
0207     .detach_group = vfio_noiommu_detach_group,
0208 };
0209 
0210 /*
0211  * Only noiommu containers can use vfio-noiommu and noiommu containers can only
0212  * use vfio-noiommu.
0213  */
0214 static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
0215         const struct vfio_iommu_driver *driver)
0216 {
0217     return container->noiommu == (driver->ops == &vfio_noiommu_ops);
0218 }
0219 #else
0220 static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
0221         const struct vfio_iommu_driver *driver)
0222 {
0223     return true;
0224 }
0225 #endif /* CONFIG_VFIO_NOIOMMU */
0226 
0227 /*
0228  * IOMMU driver registration
0229  */
0230 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
0231 {
0232     struct vfio_iommu_driver *driver, *tmp;
0233 
0234     if (WARN_ON(!ops->register_device != !ops->unregister_device))
0235         return -EINVAL;
0236 
0237     driver = kzalloc(sizeof(*driver), GFP_KERNEL);
0238     if (!driver)
0239         return -ENOMEM;
0240 
0241     driver->ops = ops;
0242 
0243     mutex_lock(&vfio.iommu_drivers_lock);
0244 
0245     /* Check for duplicates */
0246     list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
0247         if (tmp->ops == ops) {
0248             mutex_unlock(&vfio.iommu_drivers_lock);
0249             kfree(driver);
0250             return -EINVAL;
0251         }
0252     }
0253 
0254     list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
0255 
0256     mutex_unlock(&vfio.iommu_drivers_lock);
0257 
0258     return 0;
0259 }
0260 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
0261 
0262 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
0263 {
0264     struct vfio_iommu_driver *driver;
0265 
0266     mutex_lock(&vfio.iommu_drivers_lock);
0267     list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
0268         if (driver->ops == ops) {
0269             list_del(&driver->vfio_next);
0270             mutex_unlock(&vfio.iommu_drivers_lock);
0271             kfree(driver);
0272             return;
0273         }
0274     }
0275     mutex_unlock(&vfio.iommu_drivers_lock);
0276 }
0277 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
0278 
0279 static void vfio_group_get(struct vfio_group *group);
0280 
0281 /*
0282  * Container objects - containers are created when /dev/vfio/vfio is
0283  * opened, but their lifecycle extends until the last user is done, so
0284  * it's freed via kref.  Must support container/group/device being
0285  * closed in any order.
0286  */
0287 static void vfio_container_get(struct vfio_container *container)
0288 {
0289     kref_get(&container->kref);
0290 }
0291 
0292 static void vfio_container_release(struct kref *kref)
0293 {
0294     struct vfio_container *container;
0295     container = container_of(kref, struct vfio_container, kref);
0296 
0297     kfree(container);
0298 }
0299 
0300 static void vfio_container_put(struct vfio_container *container)
0301 {
0302     kref_put(&container->kref, vfio_container_release);
0303 }
0304 
0305 /*
0306  * Group objects - create, release, get, put, search
0307  */
0308 static struct vfio_group *
0309 __vfio_group_get_from_iommu(struct iommu_group *iommu_group)
0310 {
0311     struct vfio_group *group;
0312 
0313     list_for_each_entry(group, &vfio.group_list, vfio_next) {
0314         if (group->iommu_group == iommu_group) {
0315             vfio_group_get(group);
0316             return group;
0317         }
0318     }
0319     return NULL;
0320 }
0321 
0322 static struct vfio_group *
0323 vfio_group_get_from_iommu(struct iommu_group *iommu_group)
0324 {
0325     struct vfio_group *group;
0326 
0327     mutex_lock(&vfio.group_lock);
0328     group = __vfio_group_get_from_iommu(iommu_group);
0329     mutex_unlock(&vfio.group_lock);
0330     return group;
0331 }
0332 
0333 static void vfio_group_release(struct device *dev)
0334 {
0335     struct vfio_group *group = container_of(dev, struct vfio_group, dev);
0336 
0337     mutex_destroy(&group->device_lock);
0338     iommu_group_put(group->iommu_group);
0339     ida_free(&vfio.group_ida, MINOR(group->dev.devt));
0340     kfree(group);
0341 }
0342 
0343 static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
0344                        enum vfio_group_type type)
0345 {
0346     struct vfio_group *group;
0347     int minor;
0348 
0349     group = kzalloc(sizeof(*group), GFP_KERNEL);
0350     if (!group)
0351         return ERR_PTR(-ENOMEM);
0352 
0353     minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
0354     if (minor < 0) {
0355         kfree(group);
0356         return ERR_PTR(minor);
0357     }
0358 
0359     device_initialize(&group->dev);
0360     group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
0361     group->dev.class = vfio.class;
0362     group->dev.release = vfio_group_release;
0363     cdev_init(&group->cdev, &vfio_group_fops);
0364     group->cdev.owner = THIS_MODULE;
0365 
0366     refcount_set(&group->users, 1);
0367     init_rwsem(&group->group_rwsem);
0368     INIT_LIST_HEAD(&group->device_list);
0369     mutex_init(&group->device_lock);
0370     group->iommu_group = iommu_group;
0371     /* put in vfio_group_release() */
0372     iommu_group_ref_get(iommu_group);
0373     group->type = type;
0374     BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
0375 
0376     return group;
0377 }
0378 
0379 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
0380         enum vfio_group_type type)
0381 {
0382     struct vfio_group *group;
0383     struct vfio_group *ret;
0384     int err;
0385 
0386     group = vfio_group_alloc(iommu_group, type);
0387     if (IS_ERR(group))
0388         return group;
0389 
0390     err = dev_set_name(&group->dev, "%s%d",
0391                group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
0392                iommu_group_id(iommu_group));
0393     if (err) {
0394         ret = ERR_PTR(err);
0395         goto err_put;
0396     }
0397 
0398     mutex_lock(&vfio.group_lock);
0399 
0400     /* Did we race creating this group? */
0401     ret = __vfio_group_get_from_iommu(iommu_group);
0402     if (ret)
0403         goto err_unlock;
0404 
0405     err = cdev_device_add(&group->cdev, &group->dev);
0406     if (err) {
0407         ret = ERR_PTR(err);
0408         goto err_unlock;
0409     }
0410 
0411     list_add(&group->vfio_next, &vfio.group_list);
0412 
0413     mutex_unlock(&vfio.group_lock);
0414     return group;
0415 
0416 err_unlock:
0417     mutex_unlock(&vfio.group_lock);
0418 err_put:
0419     put_device(&group->dev);
0420     return ret;
0421 }
0422 
0423 static void vfio_group_put(struct vfio_group *group)
0424 {
0425     if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
0426         return;
0427 
0428     /*
0429      * These data structures all have paired operations that can only be
0430      * undone when the caller holds a live reference on the group. Since all
0431      * pairs must be undone these WARN_ON's indicate some caller did not
0432      * properly hold the group reference.
0433      */
0434     WARN_ON(!list_empty(&group->device_list));
0435     WARN_ON(group->container || group->container_users);
0436     WARN_ON(group->notifier.head);
0437 
0438     list_del(&group->vfio_next);
0439     cdev_device_del(&group->cdev, &group->dev);
0440     mutex_unlock(&vfio.group_lock);
0441 
0442     put_device(&group->dev);
0443 }
0444 
0445 static void vfio_group_get(struct vfio_group *group)
0446 {
0447     refcount_inc(&group->users);
0448 }
0449 
0450 /*
0451  * Device objects - create, release, get, put, search
0452  */
0453 /* Device reference always implies a group reference */
0454 static void vfio_device_put(struct vfio_device *device)
0455 {
0456     if (refcount_dec_and_test(&device->refcount))
0457         complete(&device->comp);
0458 }
0459 
0460 static bool vfio_device_try_get(struct vfio_device *device)
0461 {
0462     return refcount_inc_not_zero(&device->refcount);
0463 }
0464 
0465 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
0466                          struct device *dev)
0467 {
0468     struct vfio_device *device;
0469 
0470     mutex_lock(&group->device_lock);
0471     list_for_each_entry(device, &group->device_list, group_next) {
0472         if (device->dev == dev && vfio_device_try_get(device)) {
0473             mutex_unlock(&group->device_lock);
0474             return device;
0475         }
0476     }
0477     mutex_unlock(&group->device_lock);
0478     return NULL;
0479 }
0480 
0481 /*
0482  * VFIO driver API
0483  */
0484 void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
0485              const struct vfio_device_ops *ops)
0486 {
0487     init_completion(&device->comp);
0488     device->dev = dev;
0489     device->ops = ops;
0490 }
0491 EXPORT_SYMBOL_GPL(vfio_init_group_dev);
0492 
0493 void vfio_uninit_group_dev(struct vfio_device *device)
0494 {
0495     vfio_release_device_set(device);
0496 }
0497 EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);
0498 
0499 static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
0500         enum vfio_group_type type)
0501 {
0502     struct iommu_group *iommu_group;
0503     struct vfio_group *group;
0504     int ret;
0505 
0506     iommu_group = iommu_group_alloc();
0507     if (IS_ERR(iommu_group))
0508         return ERR_CAST(iommu_group);
0509 
0510     ret = iommu_group_set_name(iommu_group, "vfio-noiommu");
0511     if (ret)
0512         goto out_put_group;
0513     ret = iommu_group_add_device(iommu_group, dev);
0514     if (ret)
0515         goto out_put_group;
0516 
0517     group = vfio_create_group(iommu_group, type);
0518     if (IS_ERR(group)) {
0519         ret = PTR_ERR(group);
0520         goto out_remove_device;
0521     }
0522     iommu_group_put(iommu_group);
0523     return group;
0524 
0525 out_remove_device:
0526     iommu_group_remove_device(dev);
0527 out_put_group:
0528     iommu_group_put(iommu_group);
0529     return ERR_PTR(ret);
0530 }
0531 
0532 static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
0533 {
0534     struct iommu_group *iommu_group;
0535     struct vfio_group *group;
0536 
0537     iommu_group = iommu_group_get(dev);
0538 #ifdef CONFIG_VFIO_NOIOMMU
0539     if (!iommu_group && noiommu) {
0540         /*
0541          * With noiommu enabled, create an IOMMU group for devices that
0542          * don't already have one, implying no IOMMU hardware/driver
0543          * exists.  Taint the kernel because we're about to give a DMA
0544          * capable device to a user without IOMMU protection.
0545          */
0546         group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
0547         if (!IS_ERR(group)) {
0548             add_taint(TAINT_USER, LOCKDEP_STILL_OK);
0549             dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
0550         }
0551         return group;
0552     }
0553 #endif
0554     if (!iommu_group)
0555         return ERR_PTR(-EINVAL);
0556 
0557     /*
0558      * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
0559      * restore cache coherency. It has to be checked here because it is only
0560      * valid for cases where we are using iommu groups.
0561      */
0562     if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) {
0563         iommu_group_put(iommu_group);
0564         return ERR_PTR(-EINVAL);
0565     }
0566 
0567     group = vfio_group_get_from_iommu(iommu_group);
0568     if (!group)
0569         group = vfio_create_group(iommu_group, VFIO_IOMMU);
0570 
0571     /* The vfio_group holds a reference to the iommu_group */
0572     iommu_group_put(iommu_group);
0573     return group;
0574 }
0575 
0576 static int __vfio_register_dev(struct vfio_device *device,
0577         struct vfio_group *group)
0578 {
0579     struct vfio_device *existing_device;
0580 
0581     if (IS_ERR(group))
0582         return PTR_ERR(group);
0583 
0584     /*
0585      * If the driver doesn't specify a set then the device is added to a
0586      * singleton set just for itself.
0587      */
0588     if (!device->dev_set)
0589         vfio_assign_device_set(device, device);
0590 
0591     existing_device = vfio_group_get_device(group, device->dev);
0592     if (existing_device) {
0593         dev_WARN(device->dev, "Device already exists on group %d\n",
0594              iommu_group_id(group->iommu_group));
0595         vfio_device_put(existing_device);
0596         if (group->type == VFIO_NO_IOMMU ||
0597             group->type == VFIO_EMULATED_IOMMU)
0598             iommu_group_remove_device(device->dev);
0599         vfio_group_put(group);
0600         return -EBUSY;
0601     }
0602 
0603     /* Our reference on group is moved to the device */
0604     device->group = group;
0605 
0606     /* Refcounting can't start until the driver calls register */
0607     refcount_set(&device->refcount, 1);
0608 
0609     mutex_lock(&group->device_lock);
0610     list_add(&device->group_next, &group->device_list);
0611     group->dev_counter++;
0612     mutex_unlock(&group->device_lock);
0613 
0614     return 0;
0615 }
0616 
0617 int vfio_register_group_dev(struct vfio_device *device)
0618 {
0619     return __vfio_register_dev(device,
0620         vfio_group_find_or_alloc(device->dev));
0621 }
0622 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
0623 
0624 /*
0625  * Register a virtual device without IOMMU backing.  The user of this
0626  * device must not be able to directly trigger unmediated DMA.
0627  */
0628 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
0629 {
0630     return __vfio_register_dev(device,
0631         vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU));
0632 }
0633 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
0634 
0635 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
0636                              char *buf)
0637 {
0638     struct vfio_device *it, *device = ERR_PTR(-ENODEV);
0639 
0640     mutex_lock(&group->device_lock);
0641     list_for_each_entry(it, &group->device_list, group_next) {
0642         int ret;
0643 
0644         if (it->ops->match) {
0645             ret = it->ops->match(it, buf);
0646             if (ret < 0) {
0647                 device = ERR_PTR(ret);
0648                 break;
0649             }
0650         } else {
0651             ret = !strcmp(dev_name(it->dev), buf);
0652         }
0653 
0654         if (ret && vfio_device_try_get(it)) {
0655             device = it;
0656             break;
0657         }
0658     }
0659     mutex_unlock(&group->device_lock);
0660 
0661     return device;
0662 }
0663 
0664 /*
0665  * Decrement the device reference count and wait for the device to be
0666  * removed.  Open file descriptors for the device... */
0667 void vfio_unregister_group_dev(struct vfio_device *device)
0668 {
0669     struct vfio_group *group = device->group;
0670     unsigned int i = 0;
0671     bool interrupted = false;
0672     long rc;
0673 
0674     vfio_device_put(device);
0675     rc = try_wait_for_completion(&device->comp);
0676     while (rc <= 0) {
0677         if (device->ops->request)
0678             device->ops->request(device, i++);
0679 
0680         if (interrupted) {
0681             rc = wait_for_completion_timeout(&device->comp,
0682                              HZ * 10);
0683         } else {
0684             rc = wait_for_completion_interruptible_timeout(
0685                 &device->comp, HZ * 10);
0686             if (rc < 0) {
0687                 interrupted = true;
0688                 dev_warn(device->dev,
0689                      "Device is currently in use, task"
0690                      " \"%s\" (%d) "
0691                      "blocked until device is released",
0692                      current->comm, task_pid_nr(current));
0693             }
0694         }
0695     }
0696 
0697     mutex_lock(&group->device_lock);
0698     list_del(&device->group_next);
0699     group->dev_counter--;
0700     mutex_unlock(&group->device_lock);
0701 
0702     if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
0703         iommu_group_remove_device(device->dev);
0704 
0705     /* Matches the get in vfio_register_group_dev() */
0706     vfio_group_put(group);
0707 }
0708 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
0709 
0710 /*
0711  * VFIO base fd, /dev/vfio/vfio
0712  */
0713 static long vfio_ioctl_check_extension(struct vfio_container *container,
0714                        unsigned long arg)
0715 {
0716     struct vfio_iommu_driver *driver;
0717     long ret = 0;
0718 
0719     down_read(&container->group_lock);
0720 
0721     driver = container->iommu_driver;
0722 
0723     switch (arg) {
0724         /* No base extensions yet */
0725     default:
0726         /*
0727          * If no driver is set, poll all registered drivers for
0728          * extensions and return the first positive result.  If
0729          * a driver is already set, further queries will be passed
0730          * only to that driver.
0731          */
0732         if (!driver) {
0733             mutex_lock(&vfio.iommu_drivers_lock);
0734             list_for_each_entry(driver, &vfio.iommu_drivers_list,
0735                         vfio_next) {
0736 
0737                 if (!list_empty(&container->group_list) &&
0738                     !vfio_iommu_driver_allowed(container,
0739                                    driver))
0740                     continue;
0741                 if (!try_module_get(driver->ops->owner))
0742                     continue;
0743 
0744                 ret = driver->ops->ioctl(NULL,
0745                              VFIO_CHECK_EXTENSION,
0746                              arg);
0747                 module_put(driver->ops->owner);
0748                 if (ret > 0)
0749                     break;
0750             }
0751             mutex_unlock(&vfio.iommu_drivers_lock);
0752         } else
0753             ret = driver->ops->ioctl(container->iommu_data,
0754                          VFIO_CHECK_EXTENSION, arg);
0755     }
0756 
0757     up_read(&container->group_lock);
0758 
0759     return ret;
0760 }
0761 
0762 /* hold write lock on container->group_lock */
0763 static int __vfio_container_attach_groups(struct vfio_container *container,
0764                       struct vfio_iommu_driver *driver,
0765                       void *data)
0766 {
0767     struct vfio_group *group;
0768     int ret = -ENODEV;
0769 
0770     list_for_each_entry(group, &container->group_list, container_next) {
0771         ret = driver->ops->attach_group(data, group->iommu_group,
0772                         group->type);
0773         if (ret)
0774             goto unwind;
0775     }
0776 
0777     return ret;
0778 
0779 unwind:
0780     list_for_each_entry_continue_reverse(group, &container->group_list,
0781                          container_next) {
0782         driver->ops->detach_group(data, group->iommu_group);
0783     }
0784 
0785     return ret;
0786 }
0787 
0788 static long vfio_ioctl_set_iommu(struct vfio_container *container,
0789                  unsigned long arg)
0790 {
0791     struct vfio_iommu_driver *driver;
0792     long ret = -ENODEV;
0793 
0794     down_write(&container->group_lock);
0795 
0796     /*
0797      * The container is designed to be an unprivileged interface while
0798      * the group can be assigned to specific users.  Therefore, only by
0799      * adding a group to a container does the user get the privilege of
0800      * enabling the iommu, which may allocate finite resources.  There
0801      * is no unset_iommu, but by removing all the groups from a container,
0802      * the container is deprivileged and returns to an unset state.
0803      */
0804     if (list_empty(&container->group_list) || container->iommu_driver) {
0805         up_write(&container->group_lock);
0806         return -EINVAL;
0807     }
0808 
0809     mutex_lock(&vfio.iommu_drivers_lock);
0810     list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
0811         void *data;
0812 
0813         if (!vfio_iommu_driver_allowed(container, driver))
0814             continue;
0815         if (!try_module_get(driver->ops->owner))
0816             continue;
0817 
0818         /*
0819          * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
0820          * so test which iommu driver reported support for this
0821          * extension and call open on them.  We also pass them the
0822          * magic, allowing a single driver to support multiple
0823          * interfaces if they'd like.
0824          */
0825         if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
0826             module_put(driver->ops->owner);
0827             continue;
0828         }
0829 
0830         data = driver->ops->open(arg);
0831         if (IS_ERR(data)) {
0832             ret = PTR_ERR(data);
0833             module_put(driver->ops->owner);
0834             continue;
0835         }
0836 
0837         ret = __vfio_container_attach_groups(container, driver, data);
0838         if (ret) {
0839             driver->ops->release(data);
0840             module_put(driver->ops->owner);
0841             continue;
0842         }
0843 
0844         container->iommu_driver = driver;
0845         container->iommu_data = data;
0846         break;
0847     }
0848 
0849     mutex_unlock(&vfio.iommu_drivers_lock);
0850     up_write(&container->group_lock);
0851 
0852     return ret;
0853 }
0854 
0855 static long vfio_fops_unl_ioctl(struct file *filep,
0856                 unsigned int cmd, unsigned long arg)
0857 {
0858     struct vfio_container *container = filep->private_data;
0859     struct vfio_iommu_driver *driver;
0860     void *data;
0861     long ret = -EINVAL;
0862 
0863     if (!container)
0864         return ret;
0865 
0866     switch (cmd) {
0867     case VFIO_GET_API_VERSION:
0868         ret = VFIO_API_VERSION;
0869         break;
0870     case VFIO_CHECK_EXTENSION:
0871         ret = vfio_ioctl_check_extension(container, arg);
0872         break;
0873     case VFIO_SET_IOMMU:
0874         ret = vfio_ioctl_set_iommu(container, arg);
0875         break;
0876     default:
0877         driver = container->iommu_driver;
0878         data = container->iommu_data;
0879 
0880         if (driver) /* passthrough all unrecognized ioctls */
0881             ret = driver->ops->ioctl(data, cmd, arg);
0882     }
0883 
0884     return ret;
0885 }
0886 
0887 static int vfio_fops_open(struct inode *inode, struct file *filep)
0888 {
0889     struct vfio_container *container;
0890 
0891     container = kzalloc(sizeof(*container), GFP_KERNEL);
0892     if (!container)
0893         return -ENOMEM;
0894 
0895     INIT_LIST_HEAD(&container->group_list);
0896     init_rwsem(&container->group_lock);
0897     kref_init(&container->kref);
0898 
0899     filep->private_data = container;
0900 
0901     return 0;
0902 }
0903 
0904 static int vfio_fops_release(struct inode *inode, struct file *filep)
0905 {
0906     struct vfio_container *container = filep->private_data;
0907     struct vfio_iommu_driver *driver = container->iommu_driver;
0908 
0909     if (driver && driver->ops->notify)
0910         driver->ops->notify(container->iommu_data,
0911                     VFIO_IOMMU_CONTAINER_CLOSE);
0912 
0913     filep->private_data = NULL;
0914 
0915     vfio_container_put(container);
0916 
0917     return 0;
0918 }
0919 
0920 static const struct file_operations vfio_fops = {
0921     .owner      = THIS_MODULE,
0922     .open       = vfio_fops_open,
0923     .release    = vfio_fops_release,
0924     .unlocked_ioctl = vfio_fops_unl_ioctl,
0925     .compat_ioctl   = compat_ptr_ioctl,
0926 };
0927 
0928 /*
0929  * VFIO Group fd, /dev/vfio/$GROUP
0930  */
0931 static void __vfio_group_unset_container(struct vfio_group *group)
0932 {
0933     struct vfio_container *container = group->container;
0934     struct vfio_iommu_driver *driver;
0935 
0936     lockdep_assert_held_write(&group->group_rwsem);
0937 
0938     down_write(&container->group_lock);
0939 
0940     driver = container->iommu_driver;
0941     if (driver)
0942         driver->ops->detach_group(container->iommu_data,
0943                       group->iommu_group);
0944 
0945     if (group->type == VFIO_IOMMU)
0946         iommu_group_release_dma_owner(group->iommu_group);
0947 
0948     group->container = NULL;
0949     group->container_users = 0;
0950     list_del(&group->container_next);
0951 
0952     /* Detaching the last group deprivileges a container, remove iommu */
0953     if (driver && list_empty(&container->group_list)) {
0954         driver->ops->release(container->iommu_data);
0955         module_put(driver->ops->owner);
0956         container->iommu_driver = NULL;
0957         container->iommu_data = NULL;
0958     }
0959 
0960     up_write(&container->group_lock);
0961 
0962     vfio_container_put(container);
0963 }
0964 
0965 /*
0966  * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
0967  * if there was no container to unset.  Since the ioctl is called on
0968  * the group, we know that still exists, therefore the only valid
0969  * transition here is 1->0.
0970  */
0971 static int vfio_group_unset_container(struct vfio_group *group)
0972 {
0973     lockdep_assert_held_write(&group->group_rwsem);
0974 
0975     if (!group->container)
0976         return -EINVAL;
0977     if (group->container_users != 1)
0978         return -EBUSY;
0979     __vfio_group_unset_container(group);
0980     return 0;
0981 }
0982 
0983 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
0984 {
0985     struct fd f;
0986     struct vfio_container *container;
0987     struct vfio_iommu_driver *driver;
0988     int ret = 0;
0989 
0990     lockdep_assert_held_write(&group->group_rwsem);
0991 
0992     if (group->container || WARN_ON(group->container_users))
0993         return -EINVAL;
0994 
0995     if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
0996         return -EPERM;
0997 
0998     f = fdget(container_fd);
0999     if (!f.file)
1000         return -EBADF;
1001 
1002     /* Sanity check, is this really our fd? */
1003     if (f.file->f_op != &vfio_fops) {
1004         fdput(f);
1005         return -EINVAL;
1006     }
1007 
1008     container = f.file->private_data;
1009     WARN_ON(!container); /* fget ensures we don't race vfio_release */
1010 
1011     down_write(&container->group_lock);
1012 
1013     /* Real groups and fake groups cannot mix */
1014     if (!list_empty(&container->group_list) &&
1015         container->noiommu != (group->type == VFIO_NO_IOMMU)) {
1016         ret = -EPERM;
1017         goto unlock_out;
1018     }
1019 
1020     if (group->type == VFIO_IOMMU) {
1021         ret = iommu_group_claim_dma_owner(group->iommu_group, f.file);
1022         if (ret)
1023             goto unlock_out;
1024     }
1025 
1026     driver = container->iommu_driver;
1027     if (driver) {
1028         ret = driver->ops->attach_group(container->iommu_data,
1029                         group->iommu_group,
1030                         group->type);
1031         if (ret) {
1032             if (group->type == VFIO_IOMMU)
1033                 iommu_group_release_dma_owner(
1034                     group->iommu_group);
1035             goto unlock_out;
1036         }
1037     }
1038 
1039     group->container = container;
1040     group->container_users = 1;
1041     container->noiommu = (group->type == VFIO_NO_IOMMU);
1042     list_add(&group->container_next, &container->group_list);
1043 
1044     /* Get a reference on the container and mark a user within the group */
1045     vfio_container_get(container);
1046 
1047 unlock_out:
1048     up_write(&container->group_lock);
1049     fdput(f);
1050     return ret;
1051 }
1052 
1053 static const struct file_operations vfio_device_fops;
1054 
1055 /* true if the vfio_device has open_device() called but not close_device() */
1056 static bool vfio_assert_device_open(struct vfio_device *device)
1057 {
1058     return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
1059 }
1060 
1061 static int vfio_device_assign_container(struct vfio_device *device)
1062 {
1063     struct vfio_group *group = device->group;
1064 
1065     lockdep_assert_held_write(&group->group_rwsem);
1066 
1067     if (!group->container || !group->container->iommu_driver ||
1068         WARN_ON(!group->container_users))
1069         return -EINVAL;
1070 
1071     if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
1072         return -EPERM;
1073 
1074     get_file(group->opened_file);
1075     group->container_users++;
1076     return 0;
1077 }
1078 
1079 static void vfio_device_unassign_container(struct vfio_device *device)
1080 {
1081     down_write(&device->group->group_rwsem);
1082     WARN_ON(device->group->container_users <= 1);
1083     device->group->container_users--;
1084     fput(device->group->opened_file);
1085     up_write(&device->group->group_rwsem);
1086 }
1087 
1088 static struct file *vfio_device_open(struct vfio_device *device)
1089 {
1090     struct vfio_iommu_driver *iommu_driver;
1091     struct file *filep;
1092     int ret;
1093 
1094     down_write(&device->group->group_rwsem);
1095     ret = vfio_device_assign_container(device);
1096     up_write(&device->group->group_rwsem);
1097     if (ret)
1098         return ERR_PTR(ret);
1099 
1100     if (!try_module_get(device->dev->driver->owner)) {
1101         ret = -ENODEV;
1102         goto err_unassign_container;
1103     }
1104 
1105     mutex_lock(&device->dev_set->lock);
1106     device->open_count++;
1107     if (device->open_count == 1) {
1108         /*
1109          * Here we pass the KVM pointer with the group under the read
1110          * lock.  If the device driver will use it, it must obtain a
1111          * reference and release it during close_device.
1112          */
1113         down_read(&device->group->group_rwsem);
1114         device->kvm = device->group->kvm;
1115 
1116         if (device->ops->open_device) {
1117             ret = device->ops->open_device(device);
1118             if (ret)
1119                 goto err_undo_count;
1120         }
1121 
1122         iommu_driver = device->group->container->iommu_driver;
1123         if (iommu_driver && iommu_driver->ops->register_device)
1124             iommu_driver->ops->register_device(
1125                 device->group->container->iommu_data, device);
1126 
1127         up_read(&device->group->group_rwsem);
1128     }
1129     mutex_unlock(&device->dev_set->lock);
1130 
1131     /*
1132      * We can't use anon_inode_getfd() because we need to modify
1133      * the f_mode flags directly to allow more than just ioctls
1134      */
1135     filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1136                    device, O_RDWR);
1137     if (IS_ERR(filep)) {
1138         ret = PTR_ERR(filep);
1139         goto err_close_device;
1140     }
1141 
1142     /*
1143      * TODO: add an anon_inode interface to do this.
1144      * Appears to be missing by lack of need rather than
1145      * explicitly prevented.  Now there's need.
1146      */
1147     filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);
1148 
1149     if (device->group->type == VFIO_NO_IOMMU)
1150         dev_warn(device->dev, "vfio-noiommu device opened by user "
1151              "(%s:%d)\n", current->comm, task_pid_nr(current));
1152     /*
1153      * On success the ref of device is moved to the file and
1154      * put in vfio_device_fops_release()
1155      */
1156     return filep;
1157 
1158 err_close_device:
1159     mutex_lock(&device->dev_set->lock);
1160     down_read(&device->group->group_rwsem);
1161     if (device->open_count == 1 && device->ops->close_device) {
1162         device->ops->close_device(device);
1163 
1164         iommu_driver = device->group->container->iommu_driver;
1165         if (iommu_driver && iommu_driver->ops->unregister_device)
1166             iommu_driver->ops->unregister_device(
1167                 device->group->container->iommu_data, device);
1168     }
1169 err_undo_count:
1170     up_read(&device->group->group_rwsem);
1171     device->open_count--;
1172     if (device->open_count == 0 && device->kvm)
1173         device->kvm = NULL;
1174     mutex_unlock(&device->dev_set->lock);
1175     module_put(device->dev->driver->owner);
1176 err_unassign_container:
1177     vfio_device_unassign_container(device);
1178     return ERR_PTR(ret);
1179 }
1180 
1181 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1182 {
1183     struct vfio_device *device;
1184     struct file *filep;
1185     int fdno;
1186     int ret;
1187 
1188     device = vfio_device_get_from_name(group, buf);
1189     if (IS_ERR(device))
1190         return PTR_ERR(device);
1191 
1192     fdno = get_unused_fd_flags(O_CLOEXEC);
1193     if (fdno < 0) {
1194         ret = fdno;
1195         goto err_put_device;
1196     }
1197 
1198     filep = vfio_device_open(device);
1199     if (IS_ERR(filep)) {
1200         ret = PTR_ERR(filep);
1201         goto err_put_fdno;
1202     }
1203 
1204     fd_install(fdno, filep);
1205     return fdno;
1206 
1207 err_put_fdno:
1208     put_unused_fd(fdno);
1209 err_put_device:
1210     vfio_device_put(device);
1211     return ret;
1212 }
1213 
1214 static long vfio_group_fops_unl_ioctl(struct file *filep,
1215                       unsigned int cmd, unsigned long arg)
1216 {
1217     struct vfio_group *group = filep->private_data;
1218     long ret = -ENOTTY;
1219 
1220     switch (cmd) {
1221     case VFIO_GROUP_GET_STATUS:
1222     {
1223         struct vfio_group_status status;
1224         unsigned long minsz;
1225 
1226         minsz = offsetofend(struct vfio_group_status, flags);
1227 
1228         if (copy_from_user(&status, (void __user *)arg, minsz))
1229             return -EFAULT;
1230 
1231         if (status.argsz < minsz)
1232             return -EINVAL;
1233 
1234         status.flags = 0;
1235 
1236         down_read(&group->group_rwsem);
1237         if (group->container)
1238             status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
1239                     VFIO_GROUP_FLAGS_VIABLE;
1240         else if (!iommu_group_dma_owner_claimed(group->iommu_group))
1241             status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1242         up_read(&group->group_rwsem);
1243 
1244         if (copy_to_user((void __user *)arg, &status, minsz))
1245             return -EFAULT;
1246 
1247         ret = 0;
1248         break;
1249     }
1250     case VFIO_GROUP_SET_CONTAINER:
1251     {
1252         int fd;
1253 
1254         if (get_user(fd, (int __user *)arg))
1255             return -EFAULT;
1256 
1257         if (fd < 0)
1258             return -EINVAL;
1259 
1260         down_write(&group->group_rwsem);
1261         ret = vfio_group_set_container(group, fd);
1262         up_write(&group->group_rwsem);
1263         break;
1264     }
1265     case VFIO_GROUP_UNSET_CONTAINER:
1266         down_write(&group->group_rwsem);
1267         ret = vfio_group_unset_container(group);
1268         up_write(&group->group_rwsem);
1269         break;
1270     case VFIO_GROUP_GET_DEVICE_FD:
1271     {
1272         char *buf;
1273 
1274         buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1275         if (IS_ERR(buf))
1276             return PTR_ERR(buf);
1277 
1278         ret = vfio_group_get_device_fd(group, buf);
1279         kfree(buf);
1280         break;
1281     }
1282     }
1283 
1284     return ret;
1285 }
1286 
1287 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1288 {
1289     struct vfio_group *group =
1290         container_of(inode->i_cdev, struct vfio_group, cdev);
1291     int ret;
1292 
1293     down_write(&group->group_rwsem);
1294 
1295     /* users can be zero if this races with vfio_group_put() */
1296     if (!refcount_inc_not_zero(&group->users)) {
1297         ret = -ENODEV;
1298         goto err_unlock;
1299     }
1300 
1301     if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
1302         ret = -EPERM;
1303         goto err_put;
1304     }
1305 
1306     /*
1307      * Do we need multiple instances of the group open?  Seems not.
1308      */
1309     if (group->opened_file) {
1310         ret = -EBUSY;
1311         goto err_put;
1312     }
1313     group->opened_file = filep;
1314     filep->private_data = group;
1315 
1316     up_write(&group->group_rwsem);
1317     return 0;
1318 err_put:
1319     vfio_group_put(group);
1320 err_unlock:
1321     up_write(&group->group_rwsem);
1322     return ret;
1323 }
1324 
1325 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1326 {
1327     struct vfio_group *group = filep->private_data;
1328 
1329     filep->private_data = NULL;
1330 
1331     down_write(&group->group_rwsem);
1332     /*
1333      * Device FDs hold a group file reference, therefore the group release
1334      * is only called when there are no open devices.
1335      */
1336     WARN_ON(group->notifier.head);
1337     if (group->container) {
1338         WARN_ON(group->container_users != 1);
1339         __vfio_group_unset_container(group);
1340     }
1341     group->opened_file = NULL;
1342     up_write(&group->group_rwsem);
1343 
1344     vfio_group_put(group);
1345 
1346     return 0;
1347 }
1348 
1349 static const struct file_operations vfio_group_fops = {
1350     .owner      = THIS_MODULE,
1351     .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1352     .compat_ioctl   = compat_ptr_ioctl,
1353     .open       = vfio_group_fops_open,
1354     .release    = vfio_group_fops_release,
1355 };
1356 
1357 /*
1358  * VFIO Device fd
1359  */
1360 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1361 {
1362     struct vfio_device *device = filep->private_data;
1363     struct vfio_iommu_driver *iommu_driver;
1364 
1365     mutex_lock(&device->dev_set->lock);
1366     vfio_assert_device_open(device);
1367     down_read(&device->group->group_rwsem);
1368     if (device->open_count == 1 && device->ops->close_device)
1369         device->ops->close_device(device);
1370 
1371     iommu_driver = device->group->container->iommu_driver;
1372     if (iommu_driver && iommu_driver->ops->unregister_device)
1373         iommu_driver->ops->unregister_device(
1374             device->group->container->iommu_data, device);
1375     up_read(&device->group->group_rwsem);
1376     device->open_count--;
1377     if (device->open_count == 0)
1378         device->kvm = NULL;
1379     mutex_unlock(&device->dev_set->lock);
1380 
1381     module_put(device->dev->driver->owner);
1382 
1383     vfio_device_unassign_container(device);
1384 
1385     vfio_device_put(device);
1386 
1387     return 0;
1388 }
1389 
1390 /*
1391  * vfio_mig_get_next_state - Compute the next step in the FSM
1392  * @cur_fsm - The current state the device is in
1393  * @new_fsm - The target state to reach
1394  * @next_fsm - Pointer to the next step to get to new_fsm
1395  *
1396  * Return 0 upon success, otherwise -errno
1397  * Upon success the next step in the state progression between cur_fsm and
1398  * new_fsm will be set in next_fsm.
1399  *
1400  * This breaks down requests for combination transitions into smaller steps and
1401  * returns the next step to get to new_fsm. The function may need to be called
1402  * multiple times before reaching new_fsm.
1403  *
1404  */
1405 int vfio_mig_get_next_state(struct vfio_device *device,
1406                 enum vfio_device_mig_state cur_fsm,
1407                 enum vfio_device_mig_state new_fsm,
1408                 enum vfio_device_mig_state *next_fsm)
1409 {
1410     enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
1411     /*
1412      * The coding in this table requires the driver to implement the
1413      * following FSM arcs:
1414      *         RESUMING -> STOP
1415      *         STOP -> RESUMING
1416      *         STOP -> STOP_COPY
1417      *         STOP_COPY -> STOP
1418      *
1419      * If P2P is supported then the driver must also implement these FSM
1420      * arcs:
1421      *         RUNNING -> RUNNING_P2P
1422      *         RUNNING_P2P -> RUNNING
1423      *         RUNNING_P2P -> STOP
1424      *         STOP -> RUNNING_P2P
1425      * Without P2P the driver must implement:
1426      *         RUNNING -> STOP
1427      *         STOP -> RUNNING
1428      *
1429      * The coding will step through multiple states for some combination
1430      * transitions; if all optional features are supported, this means the
1431      * following ones:
1432      *         RESUMING -> STOP -> RUNNING_P2P
1433      *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
1434      *         RESUMING -> STOP -> STOP_COPY
1435      *         RUNNING -> RUNNING_P2P -> STOP
1436      *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
1437      *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
1438      *         RUNNING_P2P -> STOP -> RESUMING
1439      *         RUNNING_P2P -> STOP -> STOP_COPY
1440      *         STOP -> RUNNING_P2P -> RUNNING
1441      *         STOP_COPY -> STOP -> RESUMING
1442      *         STOP_COPY -> STOP -> RUNNING_P2P
1443      *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
1444      */
1445     static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
1446         [VFIO_DEVICE_STATE_STOP] = {
1447             [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1448             [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
1449             [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
1450             [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
1451             [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1452             [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1453         },
1454         [VFIO_DEVICE_STATE_RUNNING] = {
1455             [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
1456             [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
1457             [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
1458             [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
1459             [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1460             [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1461         },
1462         [VFIO_DEVICE_STATE_STOP_COPY] = {
1463             [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1464             [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
1465             [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
1466             [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
1467             [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
1468             [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1469         },
1470         [VFIO_DEVICE_STATE_RESUMING] = {
1471             [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1472             [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
1473             [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
1474             [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
1475             [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
1476             [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1477         },
1478         [VFIO_DEVICE_STATE_RUNNING_P2P] = {
1479             [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1480             [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
1481             [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
1482             [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
1483             [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1484             [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1485         },
1486         [VFIO_DEVICE_STATE_ERROR] = {
1487             [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
1488             [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
1489             [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
1490             [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
1491             [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
1492             [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1493         },
1494     };
1495 
1496     static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
1497         [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
1498         [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
1499         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
1500         [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
1501         [VFIO_DEVICE_STATE_RUNNING_P2P] =
1502             VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
1503         [VFIO_DEVICE_STATE_ERROR] = ~0U,
1504     };
1505 
1506     if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
1507             (state_flags_table[cur_fsm] & device->migration_flags) !=
1508             state_flags_table[cur_fsm]))
1509         return -EINVAL;
1510 
1511     if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
1512        (state_flags_table[new_fsm] & device->migration_flags) !=
1513             state_flags_table[new_fsm])
1514         return -EINVAL;
1515 
1516     /*
1517      * Arcs touching optional and unsupported states are skipped over. The
1518      * driver will instead see an arc from the original state to the next
1519      * logical state, as per the above comment.
1520      */
1521     *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
1522     while ((state_flags_table[*next_fsm] & device->migration_flags) !=
1523             state_flags_table[*next_fsm])
1524         *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
1525 
1526     return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
1527 }
1528 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
1529 
1530 /*
1531  * Convert the drivers's struct file into a FD number and return it to userspace
1532  */
1533 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
1534                    struct vfio_device_feature_mig_state *mig)
1535 {
1536     int ret;
1537     int fd;
1538 
1539     fd = get_unused_fd_flags(O_CLOEXEC);
1540     if (fd < 0) {
1541         ret = fd;
1542         goto out_fput;
1543     }
1544 
1545     mig->data_fd = fd;
1546     if (copy_to_user(arg, mig, sizeof(*mig))) {
1547         ret = -EFAULT;
1548         goto out_put_unused;
1549     }
1550     fd_install(fd, filp);
1551     return 0;
1552 
1553 out_put_unused:
1554     put_unused_fd(fd);
1555 out_fput:
1556     fput(filp);
1557     return ret;
1558 }
1559 
1560 static int
1561 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
1562                        u32 flags, void __user *arg,
1563                        size_t argsz)
1564 {
1565     size_t minsz =
1566         offsetofend(struct vfio_device_feature_mig_state, data_fd);
1567     struct vfio_device_feature_mig_state mig;
1568     struct file *filp = NULL;
1569     int ret;
1570 
1571     if (!device->mig_ops)
1572         return -ENOTTY;
1573 
1574     ret = vfio_check_feature(flags, argsz,
1575                  VFIO_DEVICE_FEATURE_SET |
1576                  VFIO_DEVICE_FEATURE_GET,
1577                  sizeof(mig));
1578     if (ret != 1)
1579         return ret;
1580 
1581     if (copy_from_user(&mig, arg, minsz))
1582         return -EFAULT;
1583 
1584     if (flags & VFIO_DEVICE_FEATURE_GET) {
1585         enum vfio_device_mig_state curr_state;
1586 
1587         ret = device->mig_ops->migration_get_state(device,
1588                                &curr_state);
1589         if (ret)
1590             return ret;
1591         mig.device_state = curr_state;
1592         goto out_copy;
1593     }
1594 
1595     /* Handle the VFIO_DEVICE_FEATURE_SET */
1596     filp = device->mig_ops->migration_set_state(device, mig.device_state);
1597     if (IS_ERR(filp) || !filp)
1598         goto out_copy;
1599 
1600     return vfio_ioct_mig_return_fd(filp, arg, &mig);
1601 out_copy:
1602     mig.data_fd = -1;
1603     if (copy_to_user(arg, &mig, sizeof(mig)))
1604         return -EFAULT;
1605     if (IS_ERR(filp))
1606         return PTR_ERR(filp);
1607     return 0;
1608 }
1609 
1610 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
1611                            u32 flags, void __user *arg,
1612                            size_t argsz)
1613 {
1614     struct vfio_device_feature_migration mig = {
1615         .flags = device->migration_flags,
1616     };
1617     int ret;
1618 
1619     if (!device->mig_ops)
1620         return -ENOTTY;
1621 
1622     ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
1623                  sizeof(mig));
1624     if (ret != 1)
1625         return ret;
1626     if (copy_to_user(arg, &mig, sizeof(mig)))
1627         return -EFAULT;
1628     return 0;
1629 }
1630 
1631 static int vfio_ioctl_device_feature(struct vfio_device *device,
1632                      struct vfio_device_feature __user *arg)
1633 {
1634     size_t minsz = offsetofend(struct vfio_device_feature, flags);
1635     struct vfio_device_feature feature;
1636 
1637     if (copy_from_user(&feature, arg, minsz))
1638         return -EFAULT;
1639 
1640     if (feature.argsz < minsz)
1641         return -EINVAL;
1642 
1643     /* Check unknown flags */
1644     if (feature.flags &
1645         ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1646           VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1647         return -EINVAL;
1648 
1649     /* GET & SET are mutually exclusive except with PROBE */
1650     if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1651         (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1652         (feature.flags & VFIO_DEVICE_FEATURE_GET))
1653         return -EINVAL;
1654 
1655     switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1656     case VFIO_DEVICE_FEATURE_MIGRATION:
1657         return vfio_ioctl_device_feature_migration(
1658             device, feature.flags, arg->data,
1659             feature.argsz - minsz);
1660     case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1661         return vfio_ioctl_device_feature_mig_device_state(
1662             device, feature.flags, arg->data,
1663             feature.argsz - minsz);
1664     default:
1665         if (unlikely(!device->ops->device_feature))
1666             return -EINVAL;
1667         return device->ops->device_feature(device, feature.flags,
1668                            arg->data,
1669                            feature.argsz - minsz);
1670     }
1671 }
1672 
1673 static long vfio_device_fops_unl_ioctl(struct file *filep,
1674                        unsigned int cmd, unsigned long arg)
1675 {
1676     struct vfio_device *device = filep->private_data;
1677 
1678     switch (cmd) {
1679     case VFIO_DEVICE_FEATURE:
1680         return vfio_ioctl_device_feature(device, (void __user *)arg);
1681     default:
1682         if (unlikely(!device->ops->ioctl))
1683             return -EINVAL;
1684         return device->ops->ioctl(device, cmd, arg);
1685     }
1686 }
1687 
1688 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1689                      size_t count, loff_t *ppos)
1690 {
1691     struct vfio_device *device = filep->private_data;
1692 
1693     if (unlikely(!device->ops->read))
1694         return -EINVAL;
1695 
1696     return device->ops->read(device, buf, count, ppos);
1697 }
1698 
1699 static ssize_t vfio_device_fops_write(struct file *filep,
1700                       const char __user *buf,
1701                       size_t count, loff_t *ppos)
1702 {
1703     struct vfio_device *device = filep->private_data;
1704 
1705     if (unlikely(!device->ops->write))
1706         return -EINVAL;
1707 
1708     return device->ops->write(device, buf, count, ppos);
1709 }
1710 
1711 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1712 {
1713     struct vfio_device *device = filep->private_data;
1714 
1715     if (unlikely(!device->ops->mmap))
1716         return -EINVAL;
1717 
1718     return device->ops->mmap(device, vma);
1719 }
1720 
1721 static const struct file_operations vfio_device_fops = {
1722     .owner      = THIS_MODULE,
1723     .release    = vfio_device_fops_release,
1724     .read       = vfio_device_fops_read,
1725     .write      = vfio_device_fops_write,
1726     .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1727     .compat_ioctl   = compat_ptr_ioctl,
1728     .mmap       = vfio_device_fops_mmap,
1729 };
1730 
1731 /**
1732  * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
1733  * @file: VFIO group file
1734  *
1735  * The returned iommu_group is valid as long as a ref is held on the file.
1736  */
1737 struct iommu_group *vfio_file_iommu_group(struct file *file)
1738 {
1739     struct vfio_group *group = file->private_data;
1740 
1741     if (file->f_op != &vfio_group_fops)
1742         return NULL;
1743     return group->iommu_group;
1744 }
1745 EXPORT_SYMBOL_GPL(vfio_file_iommu_group);
1746 
1747 /**
1748  * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1749  *        is always CPU cache coherent
1750  * @file: VFIO group file
1751  *
1752  * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1753  * bit in DMA transactions. A return of false indicates that the user has
1754  * rights to access additional instructions such as wbinvd on x86.
1755  */
1756 bool vfio_file_enforced_coherent(struct file *file)
1757 {
1758     struct vfio_group *group = file->private_data;
1759     bool ret;
1760 
1761     if (file->f_op != &vfio_group_fops)
1762         return true;
1763 
1764     down_read(&group->group_rwsem);
1765     if (group->container) {
1766         ret = vfio_ioctl_check_extension(group->container,
1767                          VFIO_DMA_CC_IOMMU);
1768     } else {
1769         /*
1770          * Since the coherency state is determined only once a container
1771          * is attached the user must do so before they can prove they
1772          * have permission.
1773          */
1774         ret = true;
1775     }
1776     up_read(&group->group_rwsem);
1777     return ret;
1778 }
1779 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1780 
1781 /**
1782  * vfio_file_set_kvm - Link a kvm with VFIO drivers
1783  * @file: VFIO group file
1784  * @kvm: KVM to link
1785  *
1786  * When a VFIO device is first opened the KVM will be available in
1787  * device->kvm if one was associated with the group.
1788  */
1789 void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1790 {
1791     struct vfio_group *group = file->private_data;
1792 
1793     if (file->f_op != &vfio_group_fops)
1794         return;
1795 
1796     down_write(&group->group_rwsem);
1797     group->kvm = kvm;
1798     up_write(&group->group_rwsem);
1799 }
1800 EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1801 
1802 /**
1803  * vfio_file_has_dev - True if the VFIO file is a handle for device
1804  * @file: VFIO file to check
1805  * @device: Device that must be part of the file
1806  *
1807  * Returns true if given file has permission to manipulate the given device.
1808  */
1809 bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
1810 {
1811     struct vfio_group *group = file->private_data;
1812 
1813     if (file->f_op != &vfio_group_fops)
1814         return false;
1815 
1816     return group == device->group;
1817 }
1818 EXPORT_SYMBOL_GPL(vfio_file_has_dev);
1819 
1820 /*
1821  * Sub-module support
1822  */
1823 /*
1824  * Helper for managing a buffer of info chain capabilities, allocate or
1825  * reallocate a buffer with additional @size, filling in @id and @version
1826  * of the capability.  A pointer to the new capability is returned.
1827  *
1828  * NB. The chain is based at the head of the buffer, so new entries are
1829  * added to the tail, vfio_info_cap_shift() should be called to fixup the
1830  * next offsets prior to copying to the user buffer.
1831  */
1832 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1833                            size_t size, u16 id, u16 version)
1834 {
1835     void *buf;
1836     struct vfio_info_cap_header *header, *tmp;
1837 
1838     buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1839     if (!buf) {
1840         kfree(caps->buf);
1841         caps->buf = NULL;
1842         caps->size = 0;
1843         return ERR_PTR(-ENOMEM);
1844     }
1845 
1846     caps->buf = buf;
1847     header = buf + caps->size;
1848 
1849     /* Eventually copied to user buffer, zero */
1850     memset(header, 0, size);
1851 
1852     header->id = id;
1853     header->version = version;
1854 
1855     /* Add to the end of the capability chain */
1856     for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1857         ; /* nothing */
1858 
1859     tmp->next = caps->size;
1860     caps->size += size;
1861 
1862     return header;
1863 }
1864 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1865 
1866 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1867 {
1868     struct vfio_info_cap_header *tmp;
1869     void *buf = (void *)caps->buf;
1870 
1871     for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1872         tmp->next += offset;
1873 }
1874 EXPORT_SYMBOL(vfio_info_cap_shift);
1875 
1876 int vfio_info_add_capability(struct vfio_info_cap *caps,
1877                  struct vfio_info_cap_header *cap, size_t size)
1878 {
1879     struct vfio_info_cap_header *header;
1880 
1881     header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1882     if (IS_ERR(header))
1883         return PTR_ERR(header);
1884 
1885     memcpy(header + 1, cap + 1, size - sizeof(*header));
1886 
1887     return 0;
1888 }
1889 EXPORT_SYMBOL(vfio_info_add_capability);
1890 
1891 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1892                        int max_irq_type, size_t *data_size)
1893 {
1894     unsigned long minsz;
1895     size_t size;
1896 
1897     minsz = offsetofend(struct vfio_irq_set, count);
1898 
1899     if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1900         (hdr->count >= (U32_MAX - hdr->start)) ||
1901         (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1902                 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1903         return -EINVAL;
1904 
1905     if (data_size)
1906         *data_size = 0;
1907 
1908     if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1909         return -EINVAL;
1910 
1911     switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1912     case VFIO_IRQ_SET_DATA_NONE:
1913         size = 0;
1914         break;
1915     case VFIO_IRQ_SET_DATA_BOOL:
1916         size = sizeof(uint8_t);
1917         break;
1918     case VFIO_IRQ_SET_DATA_EVENTFD:
1919         size = sizeof(int32_t);
1920         break;
1921     default:
1922         return -EINVAL;
1923     }
1924 
1925     if (size) {
1926         if (hdr->argsz - minsz < hdr->count * size)
1927             return -EINVAL;
1928 
1929         if (!data_size)
1930             return -EINVAL;
1931 
1932         *data_size = hdr->count * size;
1933     }
1934 
1935     return 0;
1936 }
1937 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1938 
1939 /*
1940  * Pin contiguous user pages and return their associated host pages for local
1941  * domain only.
1942  * @device [in]  : device
1943  * @iova [in]    : starting IOVA of user pages to be pinned.
1944  * @npage [in]   : count of pages to be pinned.  This count should not
1945  *         be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1946  * @prot [in]    : protection flags
1947  * @pages[out]   : array of host pages
1948  * Return error or number of pages pinned.
1949  */
1950 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1951            int npage, int prot, struct page **pages)
1952 {
1953     struct vfio_container *container;
1954     struct vfio_group *group = device->group;
1955     struct vfio_iommu_driver *driver;
1956     int ret;
1957 
1958     if (!pages || !npage || !vfio_assert_device_open(device))
1959         return -EINVAL;
1960 
1961     if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1962         return -E2BIG;
1963 
1964     if (group->dev_counter > 1)
1965         return -EINVAL;
1966 
1967     /* group->container cannot change while a vfio device is open */
1968     container = group->container;
1969     driver = container->iommu_driver;
1970     if (likely(driver && driver->ops->pin_pages))
1971         ret = driver->ops->pin_pages(container->iommu_data,
1972                          group->iommu_group, iova,
1973                          npage, prot, pages);
1974     else
1975         ret = -ENOTTY;
1976 
1977     return ret;
1978 }
1979 EXPORT_SYMBOL(vfio_pin_pages);
1980 
1981 /*
1982  * Unpin contiguous host pages for local domain only.
1983  * @device [in]  : device
1984  * @iova [in]    : starting address of user pages to be unpinned.
1985  * @npage [in]   : count of pages to be unpinned.  This count should not
1986  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1987  */
1988 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1989 {
1990     struct vfio_container *container;
1991     struct vfio_iommu_driver *driver;
1992 
1993     if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES))
1994         return;
1995 
1996     if (WARN_ON(!vfio_assert_device_open(device)))
1997         return;
1998 
1999     /* group->container cannot change while a vfio device is open */
2000     container = device->group->container;
2001     driver = container->iommu_driver;
2002 
2003     driver->ops->unpin_pages(container->iommu_data, iova, npage);
2004 }
2005 EXPORT_SYMBOL(vfio_unpin_pages);
2006 
2007 /*
2008  * This interface allows the CPUs to perform some sort of virtual DMA on
2009  * behalf of the device.
2010  *
2011  * CPUs read/write from/into a range of IOVAs pointing to user space memory
2012  * into/from a kernel buffer.
2013  *
2014  * As the read/write of user space memory is conducted via the CPUs and is
2015  * not a real device DMA, it is not necessary to pin the user space memory.
2016  *
2017  * @device [in]     : VFIO device
2018  * @iova [in]       : base IOVA of a user space buffer
2019  * @data [in]       : pointer to kernel buffer
2020  * @len [in]        : kernel buffer length
2021  * @write       : indicate read or write
2022  * Return error code on failure or 0 on success.
2023  */
2024 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
2025         size_t len, bool write)
2026 {
2027     struct vfio_container *container;
2028     struct vfio_iommu_driver *driver;
2029     int ret = 0;
2030 
2031     if (!data || len <= 0 || !vfio_assert_device_open(device))
2032         return -EINVAL;
2033 
2034     /* group->container cannot change while a vfio device is open */
2035     container = device->group->container;
2036     driver = container->iommu_driver;
2037 
2038     if (likely(driver && driver->ops->dma_rw))
2039         ret = driver->ops->dma_rw(container->iommu_data,
2040                       iova, data, len, write);
2041     else
2042         ret = -ENOTTY;
2043     return ret;
2044 }
2045 EXPORT_SYMBOL(vfio_dma_rw);
2046 
2047 /*
2048  * Module/class support
2049  */
2050 static char *vfio_devnode(struct device *dev, umode_t *mode)
2051 {
2052     return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2053 }
2054 
2055 static struct miscdevice vfio_dev = {
2056     .minor = VFIO_MINOR,
2057     .name = "vfio",
2058     .fops = &vfio_fops,
2059     .nodename = "vfio/vfio",
2060     .mode = S_IRUGO | S_IWUGO,
2061 };
2062 
2063 static int __init vfio_init(void)
2064 {
2065     int ret;
2066 
2067     ida_init(&vfio.group_ida);
2068     mutex_init(&vfio.group_lock);
2069     mutex_init(&vfio.iommu_drivers_lock);
2070     INIT_LIST_HEAD(&vfio.group_list);
2071     INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2072 
2073     ret = misc_register(&vfio_dev);
2074     if (ret) {
2075         pr_err("vfio: misc device register failed\n");
2076         return ret;
2077     }
2078 
2079     /* /dev/vfio/$GROUP */
2080     vfio.class = class_create(THIS_MODULE, "vfio");
2081     if (IS_ERR(vfio.class)) {
2082         ret = PTR_ERR(vfio.class);
2083         goto err_class;
2084     }
2085 
2086     vfio.class->devnode = vfio_devnode;
2087 
2088     ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2089     if (ret)
2090         goto err_alloc_chrdev;
2091 
2092 #ifdef CONFIG_VFIO_NOIOMMU
2093     ret = vfio_register_iommu_driver(&vfio_noiommu_ops);
2094 #endif
2095     if (ret)
2096         goto err_driver_register;
2097 
2098     pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2099     return 0;
2100 
2101 err_driver_register:
2102     unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2103 err_alloc_chrdev:
2104     class_destroy(vfio.class);
2105     vfio.class = NULL;
2106 err_class:
2107     misc_deregister(&vfio_dev);
2108     return ret;
2109 }
2110 
2111 static void __exit vfio_cleanup(void)
2112 {
2113     WARN_ON(!list_empty(&vfio.group_list));
2114 
2115 #ifdef CONFIG_VFIO_NOIOMMU
2116     vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2117 #endif
2118     ida_destroy(&vfio.group_ida);
2119     unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2120     class_destroy(vfio.class);
2121     vfio.class = NULL;
2122     misc_deregister(&vfio_dev);
2123     xa_destroy(&vfio_device_set_xa);
2124 }
2125 
2126 module_init(vfio_init);
2127 module_exit(vfio_cleanup);
2128 
2129 MODULE_VERSION(DRIVER_VERSION);
2130 MODULE_LICENSE("GPL v2");
2131 MODULE_AUTHOR(DRIVER_AUTHOR);
2132 MODULE_DESCRIPTION(DRIVER_DESC);
2133 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2134 MODULE_ALIAS("devname:vfio/vfio");
2135 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");