0001 /*
0002  * KVMGT - the implementation of Intel mediated pass-through framework for KVM
0003  *
0004  * Copyright(c) 2011-2016 Intel Corporation. All rights reserved.
0005  *
0006  * Permission is hereby granted, free of charge, to any person obtaining a
0007  * copy of this software and associated documentation files (the "Software"),
0008  * to deal in the Software without restriction, including without limitation
0009  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
0010  * and/or sell copies of the Software, and to permit persons to whom the
0011  * Software is furnished to do so, subject to the following conditions:
0012  *
0013  * The above copyright notice and this permission notice (including the next
0014  * paragraph) shall be included in all copies or substantial portions of the
0015  * Software.
0016  *
0017  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
0018  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
0019  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
0020  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
0021  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
0022  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
0023  * SOFTWARE.
0024  *
0025  * Authors:
0026  *    Kevin Tian <kevin.tian@intel.com>
0027  *    Jike Song <jike.song@intel.com>
0028  *    Xiaoguang Chen <xiaoguang.chen@intel.com>
0029  *    Eddie Dong <eddie.dong@intel.com>
0030  *
0031  * Contributors:
0032  *    Niu Bing <bing.niu@intel.com>
0033  *    Zhi Wang <zhi.a.wang@intel.com>
0034  */
0035 
0036 #include <linux/init.h>
0037 #include <linux/device.h>
0038 #include <linux/mm.h>
0039 #include <linux/kthread.h>
0040 #include <linux/sched/mm.h>
0041 #include <linux/types.h>
0042 #include <linux/list.h>
0043 #include <linux/rbtree.h>
0044 #include <linux/spinlock.h>
0045 #include <linux/eventfd.h>
0046 #include <linux/uuid.h>
0047 #include <linux/mdev.h>
0048 #include <linux/debugfs.h>
0049 
0050 #include <linux/nospec.h>
0051 
0052 #include <drm/drm_edid.h>
0053 
0054 #include "i915_drv.h"
0055 #include "intel_gvt.h"
0056 #include "gvt.h"
0057 
0058 MODULE_IMPORT_NS(DMA_BUF);
0059 MODULE_IMPORT_NS(I915_GVT);
0060 
0061 /* helper macros copied from vfio-pci */
0062 #define VFIO_PCI_OFFSET_SHIFT   40
0063 #define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
0064 #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
0065 #define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
0066 
0067 #define EDID_BLOB_OFFSET (PAGE_SIZE/2)
0068 
0069 #define OPREGION_SIGNATURE "IntelGraphicsMem"
0070 
0071 struct vfio_region;
0072 struct intel_vgpu_regops {
0073     size_t (*rw)(struct intel_vgpu *vgpu, char *buf,
0074             size_t count, loff_t *ppos, bool iswrite);
0075     void (*release)(struct intel_vgpu *vgpu,
0076             struct vfio_region *region);
0077 };
0078 
0079 struct vfio_region {
0080     u32             type;
0081     u32             subtype;
0082     size_t              size;
0083     u32             flags;
0084     const struct intel_vgpu_regops  *ops;
0085     void                *data;
0086 };
0087 
0088 struct vfio_edid_region {
0089     struct vfio_region_gfx_edid vfio_edid_regs;
0090     void *edid_blob;
0091 };
0092 
0093 struct kvmgt_pgfn {
0094     gfn_t gfn;
0095     struct hlist_node hnode;
0096 };
0097 
0098 struct gvt_dma {
0099     struct intel_vgpu *vgpu;
0100     struct rb_node gfn_node;
0101     struct rb_node dma_addr_node;
0102     gfn_t gfn;
0103     dma_addr_t dma_addr;
0104     unsigned long size;
0105     struct kref ref;
0106 };
0107 
0108 #define vfio_dev_to_vgpu(vfio_dev) \
0109     container_of((vfio_dev), struct intel_vgpu, vfio_device)
0110 
0111 static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
0112         const u8 *val, int len,
0113         struct kvm_page_track_notifier_node *node);
0114 static void kvmgt_page_track_flush_slot(struct kvm *kvm,
0115         struct kvm_memory_slot *slot,
0116         struct kvm_page_track_notifier_node *node);
0117 
0118 static ssize_t available_instances_show(struct mdev_type *mtype,
0119                     struct mdev_type_attribute *attr,
0120                     char *buf)
0121 {
0122     struct intel_vgpu_type *type;
0123     unsigned int num = 0;
0124     struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;
0125 
0126     type = &gvt->types[mtype_get_type_group_id(mtype)];
0127     if (!type)
0128         num = 0;
0129     else
0130         num = type->avail_instance;
0131 
0132     return sprintf(buf, "%u\n", num);
0133 }
0134 
0135 static ssize_t device_api_show(struct mdev_type *mtype,
0136                    struct mdev_type_attribute *attr, char *buf)
0137 {
0138     return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
0139 }
0140 
0141 static ssize_t description_show(struct mdev_type *mtype,
0142                 struct mdev_type_attribute *attr, char *buf)
0143 {
0144     struct intel_vgpu_type *type;
0145     struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;
0146 
0147     type = &gvt->types[mtype_get_type_group_id(mtype)];
0148     if (!type)
0149         return 0;
0150 
0151     return sprintf(buf, "low_gm_size: %dMB\nhigh_gm_size: %dMB\n"
0152                "fence: %d\nresolution: %s\n"
0153                "weight: %d\n",
0154                BYTES_TO_MB(type->low_gm_size),
0155                BYTES_TO_MB(type->high_gm_size),
0156                type->fence, vgpu_edid_str(type->resolution),
0157                type->weight);
0158 }
0159 
0160 static ssize_t name_show(struct mdev_type *mtype,
0161              struct mdev_type_attribute *attr, char *buf)
0162 {
0163     struct intel_vgpu_type *type;
0164     struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;
0165 
0166     type = &gvt->types[mtype_get_type_group_id(mtype)];
0167     if (!type)
0168         return 0;
0169 
0170     return sprintf(buf, "%s\n", type->name);
0171 }
0172 
0173 static MDEV_TYPE_ATTR_RO(available_instances);
0174 static MDEV_TYPE_ATTR_RO(device_api);
0175 static MDEV_TYPE_ATTR_RO(description);
0176 static MDEV_TYPE_ATTR_RO(name);
0177 
0178 static struct attribute *gvt_type_attrs[] = {
0179     &mdev_type_attr_available_instances.attr,
0180     &mdev_type_attr_device_api.attr,
0181     &mdev_type_attr_description.attr,
0182     &mdev_type_attr_name.attr,
0183     NULL,
0184 };
0185 
0186 static struct attribute_group *gvt_vgpu_type_groups[] = {
0187     [0 ... NR_MAX_INTEL_VGPU_TYPES - 1] = NULL,
0188 };
0189 
0190 static int intel_gvt_init_vgpu_type_groups(struct intel_gvt *gvt)
0191 {
0192     int i, j;
0193     struct intel_vgpu_type *type;
0194     struct attribute_group *group;
0195 
0196     for (i = 0; i < gvt->num_types; i++) {
0197         type = &gvt->types[i];
0198 
0199         group = kzalloc(sizeof(struct attribute_group), GFP_KERNEL);
0200         if (!group)
0201             goto unwind;
0202 
0203         group->name = type->name;
0204         group->attrs = gvt_type_attrs;
0205         gvt_vgpu_type_groups[i] = group;
0206     }
0207 
0208     return 0;
0209 
0210 unwind:
0211     for (j = 0; j < i; j++) {
0212         group = gvt_vgpu_type_groups[j];
0213         kfree(group);
0214     }
0215 
0216     return -ENOMEM;
0217 }
0218 
0219 static void intel_gvt_cleanup_vgpu_type_groups(struct intel_gvt *gvt)
0220 {
0221     int i;
0222     struct attribute_group *group;
0223 
0224     for (i = 0; i < gvt->num_types; i++) {
0225         group = gvt_vgpu_type_groups[i];
0226         gvt_vgpu_type_groups[i] = NULL;
0227         kfree(group);
0228     }
0229 }
0230 
0231 static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
0232         unsigned long size)
0233 {
0234     vfio_unpin_pages(&vgpu->vfio_device, gfn << PAGE_SHIFT,
0235              DIV_ROUND_UP(size, PAGE_SIZE));
0236 }
0237 
0238 /* Pin a normal or compound guest page for dma. */
0239 static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
0240         unsigned long size, struct page **page)
0241 {
0242     int total_pages = DIV_ROUND_UP(size, PAGE_SIZE);
0243     struct page *base_page = NULL;
0244     int npage;
0245     int ret;
0246 
0247     /*
0248      * We pin the pages one-by-one to avoid allocating a big array
0249      * on the stack to hold pfns.
0250      */
0251     for (npage = 0; npage < total_pages; npage++) {
0252         dma_addr_t cur_iova = (gfn + npage) << PAGE_SHIFT;
0253         struct page *cur_page;
0254 
0255         ret = vfio_pin_pages(&vgpu->vfio_device, cur_iova, 1,
0256                      IOMMU_READ | IOMMU_WRITE, &cur_page);
0257         if (ret != 1) {
0258             gvt_vgpu_err("vfio_pin_pages failed for iova %pad, ret %d\n",
0259                      &cur_iova, ret);
0260             goto err;
0261         }
0262 
0263         if (npage == 0)
0264             base_page = cur_page;
0265         else if (base_page + npage != cur_page) {
0266             gvt_vgpu_err("The pages are not contiguous\n");
0267             ret = -EINVAL;
0268             npage++;
0269             goto err;
0270         }
0271     }
0272 
0273     *page = base_page;
0274     return 0;
0275 err:
0276     gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
0277     return ret;
0278 }
0279 
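/* Pin the guest page(s) backing @gfn and set up a DMA mapping for them. */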
0280 static int gvt_dma_map_page(struct intel_vgpu *vgpu, unsigned long gfn,
0281         dma_addr_t *dma_addr, unsigned long size)
0282 {
0283     struct device *dev = vgpu->gvt->gt->i915->drm.dev;
0284     struct page *page = NULL;
0285     int ret;
0286 
0287     ret = gvt_pin_guest_page(vgpu, gfn, size, &page);
0288     if (ret)
0289         return ret;
0290 
0291     /* Setup DMA mapping. */
0292     *dma_addr = dma_map_page(dev, page, 0, size, DMA_BIDIRECTIONAL);
0293     if (dma_mapping_error(dev, *dma_addr)) {
0294         gvt_vgpu_err("DMA mapping failed for pfn 0x%lx\n",
0295                  page_to_pfn(page));
0296         gvt_unpin_guest_page(vgpu, gfn, size);
0297         return -ENOMEM;
0298     }
0299 
0300     return 0;
0301 }
0302 
0303 static void gvt_dma_unmap_page(struct intel_vgpu *vgpu, unsigned long gfn,
0304         dma_addr_t dma_addr, unsigned long size)
0305 {
0306     struct device *dev = vgpu->gvt->gt->i915->drm.dev;
0307 
0308     dma_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL);
0309     gvt_unpin_guest_page(vgpu, gfn, size);
0310 }
0311 
0312 static struct gvt_dma *__gvt_cache_find_dma_addr(struct intel_vgpu *vgpu,
0313         dma_addr_t dma_addr)
0314 {
0315     struct rb_node *node = vgpu->dma_addr_cache.rb_node;
0316     struct gvt_dma *itr;
0317 
0318     while (node) {
0319         itr = rb_entry(node, struct gvt_dma, dma_addr_node);
0320 
0321         if (dma_addr < itr->dma_addr)
0322             node = node->rb_left;
0323         else if (dma_addr > itr->dma_addr)
0324             node = node->rb_right;
0325         else
0326             return itr;
0327     }
0328     return NULL;
0329 }
0330 
0331 static struct gvt_dma *__gvt_cache_find_gfn(struct intel_vgpu *vgpu, gfn_t gfn)
0332 {
0333     struct rb_node *node = vgpu->gfn_cache.rb_node;
0334     struct gvt_dma *itr;
0335 
0336     while (node) {
0337         itr = rb_entry(node, struct gvt_dma, gfn_node);
0338 
0339         if (gfn < itr->gfn)
0340             node = node->rb_left;
0341         else if (gfn > itr->gfn)
0342             node = node->rb_right;
0343         else
0344             return itr;
0345     }
0346     return NULL;
0347 }
0348 
0349 static int __gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn,
0350         dma_addr_t dma_addr, unsigned long size)
0351 {
0352     struct gvt_dma *new, *itr;
0353     struct rb_node **link, *parent = NULL;
0354 
0355     new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
0356     if (!new)
0357         return -ENOMEM;
0358 
0359     new->vgpu = vgpu;
0360     new->gfn = gfn;
0361     new->dma_addr = dma_addr;
0362     new->size = size;
0363     kref_init(&new->ref);
0364 
0365     /* gfn_cache maps gfn to struct gvt_dma. */
0366     link = &vgpu->gfn_cache.rb_node;
0367     while (*link) {
0368         parent = *link;
0369         itr = rb_entry(parent, struct gvt_dma, gfn_node);
0370 
0371         if (gfn < itr->gfn)
0372             link = &parent->rb_left;
0373         else
0374             link = &parent->rb_right;
0375     }
0376     rb_link_node(&new->gfn_node, parent, link);
0377     rb_insert_color(&new->gfn_node, &vgpu->gfn_cache);
0378 
0379     /* dma_addr_cache maps dma addr to struct gvt_dma. */
0380     parent = NULL;
0381     link = &vgpu->dma_addr_cache.rb_node;
0382     while (*link) {
0383         parent = *link;
0384         itr = rb_entry(parent, struct gvt_dma, dma_addr_node);
0385 
0386         if (dma_addr < itr->dma_addr)
0387             link = &parent->rb_left;
0388         else
0389             link = &parent->rb_right;
0390     }
0391     rb_link_node(&new->dma_addr_node, parent, link);
0392     rb_insert_color(&new->dma_addr_node, &vgpu->dma_addr_cache);
0393 
0394     vgpu->nr_cache_entries++;
0395     return 0;
0396 }
0397 
0398 static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
0399                 struct gvt_dma *entry)
0400 {
0401     rb_erase(&entry->gfn_node, &vgpu->gfn_cache);
0402     rb_erase(&entry->dma_addr_node, &vgpu->dma_addr_cache);
0403     kfree(entry);
0404     vgpu->nr_cache_entries--;
0405 }
0406 
0407 static void gvt_cache_destroy(struct intel_vgpu *vgpu)
0408 {
0409     struct gvt_dma *dma;
0410     struct rb_node *node = NULL;
0411 
0412     for (;;) {
0413         mutex_lock(&vgpu->cache_lock);
0414         node = rb_first(&vgpu->gfn_cache);
0415         if (!node) {
0416             mutex_unlock(&vgpu->cache_lock);
0417             break;
0418         }
0419         dma = rb_entry(node, struct gvt_dma, gfn_node);
0420         gvt_dma_unmap_page(vgpu, dma->gfn, dma->dma_addr, dma->size);
0421         __gvt_cache_remove_entry(vgpu, dma);
0422         mutex_unlock(&vgpu->cache_lock);
0423     }
0424 }
0425 
0426 static void gvt_cache_init(struct intel_vgpu *vgpu)
0427 {
0428     vgpu->gfn_cache = RB_ROOT;
0429     vgpu->dma_addr_cache = RB_ROOT;
0430     vgpu->nr_cache_entries = 0;
0431     mutex_init(&vgpu->cache_lock);
0432 }
0433 
0434 static void kvmgt_protect_table_init(struct intel_vgpu *info)
0435 {
0436     hash_init(info->ptable);
0437 }
0438 
0439 static void kvmgt_protect_table_destroy(struct intel_vgpu *info)
0440 {
0441     struct kvmgt_pgfn *p;
0442     struct hlist_node *tmp;
0443     int i;
0444 
0445     hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
0446         hash_del(&p->hnode);
0447         kfree(p);
0448     }
0449 }
0450 
0451 static struct kvmgt_pgfn *
0452 __kvmgt_protect_table_find(struct intel_vgpu *info, gfn_t gfn)
0453 {
0454     struct kvmgt_pgfn *p, *res = NULL;
0455 
0456     hash_for_each_possible(info->ptable, p, hnode, gfn) {
0457         if (gfn == p->gfn) {
0458             res = p;
0459             break;
0460         }
0461     }
0462 
0463     return res;
0464 }
0465 
0466 static bool kvmgt_gfn_is_write_protected(struct intel_vgpu *info, gfn_t gfn)
0467 {
0468     struct kvmgt_pgfn *p;
0469 
0470     p = __kvmgt_protect_table_find(info, gfn);
0471     return !!p;
0472 }
0473 
0474 static void kvmgt_protect_table_add(struct intel_vgpu *info, gfn_t gfn)
0475 {
0476     struct kvmgt_pgfn *p;
0477 
0478     if (kvmgt_gfn_is_write_protected(info, gfn))
0479         return;
0480 
0481     p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
0482     if (WARN(!p, "gfn: 0x%llx\n", gfn))
0483         return;
0484 
0485     p->gfn = gfn;
0486     hash_add(info->ptable, &p->hnode, gfn);
0487 }
0488 
0489 static void kvmgt_protect_table_del(struct intel_vgpu *info, gfn_t gfn)
0490 {
0491     struct kvmgt_pgfn *p;
0492 
0493     p = __kvmgt_protect_table_find(info, gfn);
0494     if (p) {
0495         hash_del(&p->hnode);
0496         kfree(p);
0497     }
0498 }
0499 
0500 static size_t intel_vgpu_reg_rw_opregion(struct intel_vgpu *vgpu, char *buf,
0501         size_t count, loff_t *ppos, bool iswrite)
0502 {
0503     unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
0504             VFIO_PCI_NUM_REGIONS;
0505     void *base = vgpu->region[i].data;
0506     loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
0507 
0508 
0509     if (pos >= vgpu->region[i].size || iswrite) {
0510         gvt_vgpu_err("invalid op or offset for Intel vgpu OpRegion\n");
0511         return -EINVAL;
0512     }
0513     count = min(count, (size_t)(vgpu->region[i].size - pos));
0514     memcpy(buf, base + pos, count);
0515 
0516     return count;
0517 }
0518 
0519 static void intel_vgpu_reg_release_opregion(struct intel_vgpu *vgpu,
0520         struct vfio_region *region)
0521 {
0522 }
0523 
0524 static const struct intel_vgpu_regops intel_vgpu_regops_opregion = {
0525     .rw = intel_vgpu_reg_rw_opregion,
0526     .release = intel_vgpu_reg_release_opregion,
0527 };
0528 
0529 static int handle_edid_regs(struct intel_vgpu *vgpu,
0530             struct vfio_edid_region *region, char *buf,
0531             size_t count, u16 offset, bool is_write)
0532 {
0533     struct vfio_region_gfx_edid *regs = &region->vfio_edid_regs;
0534     unsigned int data;
0535 
0536     if (offset + count > sizeof(*regs))
0537         return -EINVAL;
0538 
0539     if (count != 4)
0540         return -EINVAL;
0541 
0542     if (is_write) {
0543         data = *((unsigned int *)buf);
0544         switch (offset) {
0545         case offsetof(struct vfio_region_gfx_edid, link_state):
0546             if (data == VFIO_DEVICE_GFX_LINK_STATE_UP) {
0547                 if (!drm_edid_block_valid(
0548                     (u8 *)region->edid_blob,
0549                     0,
0550                     true,
0551                     NULL)) {
0552                     gvt_vgpu_err("invalid EDID blob\n");
0553                     return -EINVAL;
0554                 }
0555                 intel_vgpu_emulate_hotplug(vgpu, true);
0556             } else if (data == VFIO_DEVICE_GFX_LINK_STATE_DOWN)
0557                 intel_vgpu_emulate_hotplug(vgpu, false);
0558             else {
0559                 gvt_vgpu_err("invalid EDID link state %d\n",
0560                     data);
0561                 return -EINVAL;
0562             }
0563             regs->link_state = data;
0564             break;
0565         case offsetof(struct vfio_region_gfx_edid, edid_size):
0566             if (data > regs->edid_max_size) {
0567                 gvt_vgpu_err("EDID size is bigger than %d!\n",
0568                     regs->edid_max_size);
0569                 return -EINVAL;
0570             }
0571             regs->edid_size = data;
0572             break;
0573         default:
0574             /* read-only regs */
0575             gvt_vgpu_err("write read-only EDID region at offset %d\n",
0576                 offset);
0577             return -EPERM;
0578         }
0579     } else {
0580         memcpy(buf, (char *)regs + offset, count);
0581     }
0582 
0583     return count;
0584 }
0585 
0586 static int handle_edid_blob(struct vfio_edid_region *region, char *buf,
0587             size_t count, u16 offset, bool is_write)
0588 {
0589     if (offset + count > region->vfio_edid_regs.edid_size)
0590         return -EINVAL;
0591 
0592     if (is_write)
0593         memcpy(region->edid_blob + offset, buf, count);
0594     else
0595         memcpy(buf, region->edid_blob + offset, count);
0596 
0597     return count;
0598 }
0599 
0600 static size_t intel_vgpu_reg_rw_edid(struct intel_vgpu *vgpu, char *buf,
0601         size_t count, loff_t *ppos, bool iswrite)
0602 {
0603     int ret;
0604     unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
0605             VFIO_PCI_NUM_REGIONS;
0606     struct vfio_edid_region *region = vgpu->region[i].data;
0607     loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
0608 
0609     if (pos < region->vfio_edid_regs.edid_offset) {
0610         ret = handle_edid_regs(vgpu, region, buf, count, pos, iswrite);
0611     } else {
0612         pos -= EDID_BLOB_OFFSET;
0613         ret = handle_edid_blob(region, buf, count, pos, iswrite);
0614     }
0615 
0616     if (ret < 0)
0617         gvt_vgpu_err("failed to access EDID region\n");
0618 
0619     return ret;
0620 }
0621 
0622 static void intel_vgpu_reg_release_edid(struct intel_vgpu *vgpu,
0623                     struct vfio_region *region)
0624 {
0625     kfree(region->data);
0626 }
0627 
0628 static const struct intel_vgpu_regops intel_vgpu_regops_edid = {
0629     .rw = intel_vgpu_reg_rw_edid,
0630     .release = intel_vgpu_reg_release_edid,
0631 };
0632 
0633 static int intel_vgpu_register_reg(struct intel_vgpu *vgpu,
0634         unsigned int type, unsigned int subtype,
0635         const struct intel_vgpu_regops *ops,
0636         size_t size, u32 flags, void *data)
0637 {
0638     struct vfio_region *region;
0639 
0640     region = krealloc(vgpu->region,
0641             (vgpu->num_regions + 1) * sizeof(*region),
0642             GFP_KERNEL);
0643     if (!region)
0644         return -ENOMEM;
0645 
0646     vgpu->region = region;
0647     vgpu->region[vgpu->num_regions].type = type;
0648     vgpu->region[vgpu->num_regions].subtype = subtype;
0649     vgpu->region[vgpu->num_regions].ops = ops;
0650     vgpu->region[vgpu->num_regions].size = size;
0651     vgpu->region[vgpu->num_regions].flags = flags;
0652     vgpu->region[vgpu->num_regions].data = data;
0653     vgpu->num_regions++;
0654     return 0;
0655 }
0656 
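/*
 * Expose the vGPU OpRegion to userspace as a read-only, Intel
 * vendor-specific VFIO region, after validating its signature.
 */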
0657 int intel_gvt_set_opregion(struct intel_vgpu *vgpu)
0658 {
0659     void *base;
0660     int ret;
0661 
0662     /* Each vgpu has its own opregion, although VFIO would create another
0663      * one later. This one is used to expose the opregion to VFIO, while
0664      * the one VFIO creates later is the one the guest actually uses.
0665      */
0666     base = vgpu_opregion(vgpu)->va;
0667     if (!base)
0668         return -ENOMEM;
0669 
0670     if (memcmp(base, OPREGION_SIGNATURE, 16)) {
0671         memunmap(base);
0672         return -EINVAL;
0673     }
0674 
0675     ret = intel_vgpu_register_reg(vgpu,
0676             PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
0677             VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
0678             &intel_vgpu_regops_opregion, OPREGION_SIZE,
0679             VFIO_REGION_INFO_FLAG_READ, base);
0680 
0681     return ret;
0682 }
0683 
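/*
 * Register a VFIO EDID region for @port_num: the control registers live at
 * the start of the region and the EDID blob starts at EDID_BLOB_OFFSET.
 */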
0684 int intel_gvt_set_edid(struct intel_vgpu *vgpu, int port_num)
0685 {
0686     struct intel_vgpu_port *port = intel_vgpu_port(vgpu, port_num);
0687     struct vfio_edid_region *base;
0688     int ret;
0689 
0690     base = kzalloc(sizeof(*base), GFP_KERNEL);
0691     if (!base)
0692         return -ENOMEM;
0693 
0694     /* TODO: Add multi-port and EDID extension block support */
0695     base->vfio_edid_regs.edid_offset = EDID_BLOB_OFFSET;
0696     base->vfio_edid_regs.edid_max_size = EDID_SIZE;
0697     base->vfio_edid_regs.edid_size = EDID_SIZE;
0698     base->vfio_edid_regs.max_xres = vgpu_edid_xres(port->id);
0699     base->vfio_edid_regs.max_yres = vgpu_edid_yres(port->id);
0700     base->edid_blob = port->edid->edid_block;
0701 
0702     ret = intel_vgpu_register_reg(vgpu,
0703             VFIO_REGION_TYPE_GFX,
0704             VFIO_REGION_SUBTYPE_GFX_EDID,
0705             &intel_vgpu_regops_edid, EDID_SIZE,
0706             VFIO_REGION_INFO_FLAG_READ |
0707             VFIO_REGION_INFO_FLAG_WRITE |
0708             VFIO_REGION_INFO_FLAG_CAPS, base);
0709 
0710     return ret;
0711 }
0712 
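/*
 * VFIO dma_unmap callback: drop any cached DMA mappings whose guest pages
 * fall inside the unmapped IOVA range.
 */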
0713 static void intel_vgpu_dma_unmap(struct vfio_device *vfio_dev, u64 iova,
0714                  u64 length)
0715 {
0716     struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
0717     struct gvt_dma *entry;
0718     u64 iov_pfn = iova >> PAGE_SHIFT;
0719     u64 end_iov_pfn = iov_pfn + length / PAGE_SIZE;
0720 
0721     mutex_lock(&vgpu->cache_lock);
0722     for (; iov_pfn < end_iov_pfn; iov_pfn++) {
0723         entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
0724         if (!entry)
0725             continue;
0726 
0727         gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
0728                    entry->size);
0729         __gvt_cache_remove_entry(vgpu, entry);
0730     }
0731     mutex_unlock(&vgpu->cache_lock);
0732 }
0733 
0734 static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu)
0735 {
0736     struct intel_vgpu *itr;
0737     int id;
0738     bool ret = false;
0739 
0740     mutex_lock(&vgpu->gvt->lock);
0741     for_each_active_vgpu(vgpu->gvt, itr, id) {
0742         if (!itr->attached)
0743             continue;
0744 
0745         if (vgpu->vfio_device.kvm == itr->vfio_device.kvm) {
0746             ret = true;
0747             goto out;
0748         }
0749     }
0750 out:
0751     mutex_unlock(&vgpu->gvt->lock);
0752     return ret;
0753 }
0754 
0755 static int intel_vgpu_open_device(struct vfio_device *vfio_dev)
0756 {
0757     struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
0758 
0759     if (vgpu->attached)
0760         return -EEXIST;
0761 
0762     if (!vgpu->vfio_device.kvm ||
0763         vgpu->vfio_device.kvm->mm != current->mm) {
0764         gvt_vgpu_err("KVM is required to use Intel vGPU\n");
0765         return -ESRCH;
0766     }
0767 
0768     kvm_get_kvm(vgpu->vfio_device.kvm);
0769 
0770     if (__kvmgt_vgpu_exist(vgpu))
0771         return -EEXIST;
0772 
0773     vgpu->attached = true;
0774 
0775     kvmgt_protect_table_init(vgpu);
0776     gvt_cache_init(vgpu);
0777 
0778     vgpu->track_node.track_write = kvmgt_page_track_write;
0779     vgpu->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
0780     kvm_page_track_register_notifier(vgpu->vfio_device.kvm,
0781                      &vgpu->track_node);
0782 
0783     debugfs_create_ulong(KVMGT_DEBUGFS_FILENAME, 0444, vgpu->debugfs,
0784                  &vgpu->nr_cache_entries);
0785 
0786     intel_gvt_activate_vgpu(vgpu);
0787 
0788     atomic_set(&vgpu->released, 0);
0789     return 0;
0790 }
0791 
0792 static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
0793 {
0794     struct eventfd_ctx *trigger;
0795 
0796     trigger = vgpu->msi_trigger;
0797     if (trigger) {
0798         eventfd_ctx_put(trigger);
0799         vgpu->msi_trigger = NULL;
0800     }
0801 }
0802 
0803 static void intel_vgpu_close_device(struct vfio_device *vfio_dev)
0804 {
0805     struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
0806 
0807     if (!vgpu->attached)
0808         return;
0809 
0810     if (atomic_cmpxchg(&vgpu->released, 0, 1))
0811         return;
0812 
0813     intel_gvt_release_vgpu(vgpu);
0814 
0815     debugfs_remove(debugfs_lookup(KVMGT_DEBUGFS_FILENAME, vgpu->debugfs));
0816 
0817     kvm_page_track_unregister_notifier(vgpu->vfio_device.kvm,
0818                        &vgpu->track_node);
0819     kvmgt_protect_table_destroy(vgpu);
0820     gvt_cache_destroy(vgpu);
0821 
0822     intel_vgpu_release_msi_eventfd_ctx(vgpu);
0823 
0824     vgpu->attached = false;
0825 
0826     if (vgpu->vfio_device.kvm)
0827         kvm_put_kvm(vgpu->vfio_device.kvm);
0828 }
0829 
0830 static u64 intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar)
0831 {
0832     u32 start_lo, start_hi;
0833     u32 mem_type;
0834 
0835     start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
0836             PCI_BASE_ADDRESS_MEM_MASK;
0837     mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
0838             PCI_BASE_ADDRESS_MEM_TYPE_MASK;
0839 
0840     switch (mem_type) {
0841     case PCI_BASE_ADDRESS_MEM_TYPE_64:
0842         start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
0843                         + bar + 4));
0844         break;
0845     case PCI_BASE_ADDRESS_MEM_TYPE_32:
0846     case PCI_BASE_ADDRESS_MEM_TYPE_1M:
0847         /* 1M mem BAR treated as 32-bit BAR */
0848     default:
0849         /* unknown mem type treated as 32-bit BAR */
0850         start_hi = 0;
0851         break;
0852     }
0853 
0854     return ((u64)start_hi << 32) | start_lo;
0855 }
0856 
0857 static int intel_vgpu_bar_rw(struct intel_vgpu *vgpu, int bar, u64 off,
0858                  void *buf, unsigned int count, bool is_write)
0859 {
0860     u64 bar_start = intel_vgpu_get_bar_addr(vgpu, bar);
0861     int ret;
0862 
0863     if (is_write)
0864         ret = intel_vgpu_emulate_mmio_write(vgpu,
0865                     bar_start + off, buf, count);
0866     else
0867         ret = intel_vgpu_emulate_mmio_read(vgpu,
0868                     bar_start + off, buf, count);
0869     return ret;
0870 }
0871 
0872 static inline bool intel_vgpu_in_aperture(struct intel_vgpu *vgpu, u64 off)
0873 {
0874     return off >= vgpu_aperture_offset(vgpu) &&
0875            off < vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu);
0876 }
0877 
0878 static int intel_vgpu_aperture_rw(struct intel_vgpu *vgpu, u64 off,
0879         void *buf, unsigned long count, bool is_write)
0880 {
0881     void __iomem *aperture_va;
0882 
0883     if (!intel_vgpu_in_aperture(vgpu, off) ||
0884         !intel_vgpu_in_aperture(vgpu, off + count)) {
0885         gvt_vgpu_err("Invalid aperture offset %llu\n", off);
0886         return -EINVAL;
0887     }
0888 
0889     aperture_va = io_mapping_map_wc(&vgpu->gvt->gt->ggtt->iomap,
0890                     ALIGN_DOWN(off, PAGE_SIZE),
0891                     count + offset_in_page(off));
0892     if (!aperture_va)
0893         return -EIO;
0894 
0895     if (is_write)
0896         memcpy_toio(aperture_va + offset_in_page(off), buf, count);
0897     else
0898         memcpy_fromio(buf, aperture_va + offset_in_page(off), count);
0899 
0900     io_mapping_unmap(aperture_va);
0901 
0902     return 0;
0903 }
0904 
0905 static ssize_t intel_vgpu_rw(struct intel_vgpu *vgpu, char *buf,
0906             size_t count, loff_t *ppos, bool is_write)
0907 {
0908     unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
0909     u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
0910     int ret = -EINVAL;
0911 
0912 
0913     if (index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions) {
0914         gvt_vgpu_err("invalid index: %u\n", index);
0915         return -EINVAL;
0916     }
0917 
0918     switch (index) {
0919     case VFIO_PCI_CONFIG_REGION_INDEX:
0920         if (is_write)
0921             ret = intel_vgpu_emulate_cfg_write(vgpu, pos,
0922                         buf, count);
0923         else
0924             ret = intel_vgpu_emulate_cfg_read(vgpu, pos,
0925                         buf, count);
0926         break;
0927     case VFIO_PCI_BAR0_REGION_INDEX:
0928         ret = intel_vgpu_bar_rw(vgpu, PCI_BASE_ADDRESS_0, pos,
0929                     buf, count, is_write);
0930         break;
0931     case VFIO_PCI_BAR2_REGION_INDEX:
0932         ret = intel_vgpu_aperture_rw(vgpu, pos, buf, count, is_write);
0933         break;
0934     case VFIO_PCI_BAR1_REGION_INDEX:
0935     case VFIO_PCI_BAR3_REGION_INDEX:
0936     case VFIO_PCI_BAR4_REGION_INDEX:
0937     case VFIO_PCI_BAR5_REGION_INDEX:
0938     case VFIO_PCI_VGA_REGION_INDEX:
0939     case VFIO_PCI_ROM_REGION_INDEX:
0940         break;
0941     default:
0942         if (index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions)
0943             return -EINVAL;
0944 
0945         index -= VFIO_PCI_NUM_REGIONS;
0946         return vgpu->region[index].ops->rw(vgpu, buf, count,
0947                 ppos, is_write);
0948     }
0949 
0950     return ret == 0 ? count : ret;
0951 }
0952 
0953 static bool gtt_entry(struct intel_vgpu *vgpu, loff_t *ppos)
0954 {
0955     unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
0956     struct intel_gvt *gvt = vgpu->gvt;
0957     int offset;
0958 
0959     /* Only allow MMIO GGTT entry access */
0960     if (index != PCI_BASE_ADDRESS_0)
0961         return false;
0962 
0963     offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) -
0964         intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0);
0965 
0966     return (offset >= gvt->device_info.gtt_start_offset &&
0967         offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt)) ?
0968             true : false;
0969 }
0970 
0971 static ssize_t intel_vgpu_read(struct vfio_device *vfio_dev, char __user *buf,
0972             size_t count, loff_t *ppos)
0973 {
0974     struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
0975     unsigned int done = 0;
0976     int ret;
0977 
0978     while (count) {
0979         size_t filled;
0980 
0981         /* Only 8-byte GGTT entry reads are supported */
0982         if (count >= 8 && !(*ppos % 8) &&
0983             gtt_entry(vgpu, ppos)) {
0984             u64 val;
0985 
0986             ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
0987                     ppos, false);
0988             if (ret <= 0)
0989                 goto read_err;
0990 
0991             if (copy_to_user(buf, &val, sizeof(val)))
0992                 goto read_err;
0993 
0994             filled = 8;
0995         } else if (count >= 4 && !(*ppos % 4)) {
0996             u32 val;
0997 
0998             ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
0999                     ppos, false);
1000             if (ret <= 0)
1001                 goto read_err;
1002 
1003             if (copy_to_user(buf, &val, sizeof(val)))
1004                 goto read_err;
1005 
1006             filled = 4;
1007         } else if (count >= 2 && !(*ppos % 2)) {
1008             u16 val;
1009 
1010             ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
1011                     ppos, false);
1012             if (ret <= 0)
1013                 goto read_err;
1014 
1015             if (copy_to_user(buf, &val, sizeof(val)))
1016                 goto read_err;
1017 
1018             filled = 2;
1019         } else {
1020             u8 val;
1021 
1022             ret = intel_vgpu_rw(vgpu, &val, sizeof(val), ppos,
1023                     false);
1024             if (ret <= 0)
1025                 goto read_err;
1026 
1027             if (copy_to_user(buf, &val, sizeof(val)))
1028                 goto read_err;
1029 
1030             filled = 1;
1031         }
1032 
1033         count -= filled;
1034         done += filled;
1035         *ppos += filled;
1036         buf += filled;
1037     }
1038 
1039     return done;
1040 
1041 read_err:
1042     return -EFAULT;
1043 }
1044 
1045 static ssize_t intel_vgpu_write(struct vfio_device *vfio_dev,
1046                 const char __user *buf,
1047                 size_t count, loff_t *ppos)
1048 {
1049     struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
1050     unsigned int done = 0;
1051     int ret;
1052 
1053     while (count) {
1054         size_t filled;
1055 
1056         /* Only 8-byte GGTT entry writes are supported */
1057         if (count >= 8 && !(*ppos % 8) &&
1058             gtt_entry(vgpu, ppos)) {
1059             u64 val;
1060 
1061             if (copy_from_user(&val, buf, sizeof(val)))
1062                 goto write_err;
1063 
1064             ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
1065                     ppos, true);
1066             if (ret <= 0)
1067                 goto write_err;
1068 
1069             filled = 8;
1070         } else if (count >= 4 && !(*ppos % 4)) {
1071             u32 val;
1072 
1073             if (copy_from_user(&val, buf, sizeof(val)))
1074                 goto write_err;
1075 
1076             ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
1077                     ppos, true);
1078             if (ret <= 0)
1079                 goto write_err;
1080 
1081             filled = 4;
1082         } else if (count >= 2 && !(*ppos % 2)) {
1083             u16 val;
1084 
1085             if (copy_from_user(&val, buf, sizeof(val)))
1086                 goto write_err;
1087 
1088             ret = intel_vgpu_rw(vgpu, (char *)&val,
1089                     sizeof(val), ppos, true);
1090             if (ret <= 0)
1091                 goto write_err;
1092 
1093             filled = 2;
1094         } else {
1095             u8 val;
1096 
1097             if (copy_from_user(&val, buf, sizeof(val)))
1098                 goto write_err;
1099 
1100             ret = intel_vgpu_rw(vgpu, &val, sizeof(val),
1101                     ppos, true);
1102             if (ret <= 0)
1103                 goto write_err;
1104 
1105             filled = 1;
1106         }
1107 
1108         count -= filled;
1109         done += filled;
1110         *ppos += filled;
1111         buf += filled;
1112     }
1113 
1114     return done;
1115 write_err:
1116     return -EFAULT;
1117 }
1118 
1119 static int intel_vgpu_mmap(struct vfio_device *vfio_dev,
1120         struct vm_area_struct *vma)
1121 {
1122     struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
1123     unsigned int index;
1124     u64 virtaddr;
1125     unsigned long req_size, pgoff, req_start;
1126     pgprot_t pg_prot;
1127 
1128     index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
1129     if (index >= VFIO_PCI_ROM_REGION_INDEX)
1130         return -EINVAL;
1131 
1132     if (vma->vm_end < vma->vm_start)
1133         return -EINVAL;
1134     if ((vma->vm_flags & VM_SHARED) == 0)
1135         return -EINVAL;
1136     if (index != VFIO_PCI_BAR2_REGION_INDEX)
1137         return -EINVAL;
1138 
1139     pg_prot = vma->vm_page_prot;
1140     virtaddr = vma->vm_start;
1141     req_size = vma->vm_end - vma->vm_start;
1142     pgoff = vma->vm_pgoff &
1143         ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
1144     req_start = pgoff << PAGE_SHIFT;
1145 
1146     if (!intel_vgpu_in_aperture(vgpu, req_start))
1147         return -EINVAL;
1148     if (req_start + req_size >
1149         vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu))
1150         return -EINVAL;
1151 
1152     pgoff = (gvt_aperture_pa_base(vgpu->gvt) >> PAGE_SHIFT) + pgoff;
1153 
1154     return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
1155 }
1156 
1157 static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
1158 {
1159     if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
1160         return 1;
1161 
1162     return 0;
1163 }
1164 
1165 static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
1166             unsigned int index, unsigned int start,
1167             unsigned int count, u32 flags,
1168             void *data)
1169 {
1170     return 0;
1171 }
1172 
1173 static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
1174             unsigned int index, unsigned int start,
1175             unsigned int count, u32 flags, void *data)
1176 {
1177     return 0;
1178 }
1179 
1180 static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
1181         unsigned int index, unsigned int start, unsigned int count,
1182         u32 flags, void *data)
1183 {
1184     return 0;
1185 }
1186 
1187 static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
1188         unsigned int index, unsigned int start, unsigned int count,
1189         u32 flags, void *data)
1190 {
1191     struct eventfd_ctx *trigger;
1192 
1193     if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
1194         int fd = *(int *)data;
1195 
1196         trigger = eventfd_ctx_fdget(fd);
1197         if (IS_ERR(trigger)) {
1198             gvt_vgpu_err("eventfd_ctx_fdget failed\n");
1199             return PTR_ERR(trigger);
1200         }
1201         vgpu->msi_trigger = trigger;
1202     } else if ((flags & VFIO_IRQ_SET_DATA_NONE) && !count)
1203         intel_vgpu_release_msi_eventfd_ctx(vgpu);
1204 
1205     return 0;
1206 }
1207 
1208 static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags,
1209         unsigned int index, unsigned int start, unsigned int count,
1210         void *data)
1211 {
1212     int (*func)(struct intel_vgpu *vgpu, unsigned int index,
1213             unsigned int start, unsigned int count, u32 flags,
1214             void *data) = NULL;
1215 
1216     switch (index) {
1217     case VFIO_PCI_INTX_IRQ_INDEX:
1218         switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1219         case VFIO_IRQ_SET_ACTION_MASK:
1220             func = intel_vgpu_set_intx_mask;
1221             break;
1222         case VFIO_IRQ_SET_ACTION_UNMASK:
1223             func = intel_vgpu_set_intx_unmask;
1224             break;
1225         case VFIO_IRQ_SET_ACTION_TRIGGER:
1226             func = intel_vgpu_set_intx_trigger;
1227             break;
1228         }
1229         break;
1230     case VFIO_PCI_MSI_IRQ_INDEX:
1231         switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1232         case VFIO_IRQ_SET_ACTION_MASK:
1233         case VFIO_IRQ_SET_ACTION_UNMASK:
1234             /* XXX Need masking support exported */
1235             break;
1236         case VFIO_IRQ_SET_ACTION_TRIGGER:
1237             func = intel_vgpu_set_msi_trigger;
1238             break;
1239         }
1240         break;
1241     }
1242 
1243     if (!func)
1244         return -ENOTTY;
1245 
1246     return func(vgpu, index, start, count, flags, data);
1247 }
1248 
1249 static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
1250                  unsigned long arg)
1251 {
1252     struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
1253     unsigned long minsz;
1254 
1255     gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);
1256 
1257     if (cmd == VFIO_DEVICE_GET_INFO) {
1258         struct vfio_device_info info;
1259 
1260         minsz = offsetofend(struct vfio_device_info, num_irqs);
1261 
1262         if (copy_from_user(&info, (void __user *)arg, minsz))
1263             return -EFAULT;
1264 
1265         if (info.argsz < minsz)
1266             return -EINVAL;
1267 
1268         info.flags = VFIO_DEVICE_FLAGS_PCI;
1269         info.flags |= VFIO_DEVICE_FLAGS_RESET;
1270         info.num_regions = VFIO_PCI_NUM_REGIONS +
1271                 vgpu->num_regions;
1272         info.num_irqs = VFIO_PCI_NUM_IRQS;
1273 
1274         return copy_to_user((void __user *)arg, &info, minsz) ?
1275             -EFAULT : 0;
1276 
1277     } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
1278         struct vfio_region_info info;
1279         struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
1280         unsigned int i;
1281         int ret;
1282         struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
1283         int nr_areas = 1;
1284         int cap_type_id;
1285 
1286         minsz = offsetofend(struct vfio_region_info, offset);
1287 
1288         if (copy_from_user(&info, (void __user *)arg, minsz))
1289             return -EFAULT;
1290 
1291         if (info.argsz < minsz)
1292             return -EINVAL;
1293 
1294         switch (info.index) {
1295         case VFIO_PCI_CONFIG_REGION_INDEX:
1296             info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1297             info.size = vgpu->gvt->device_info.cfg_space_size;
1298             info.flags = VFIO_REGION_INFO_FLAG_READ |
1299                      VFIO_REGION_INFO_FLAG_WRITE;
1300             break;
1301         case VFIO_PCI_BAR0_REGION_INDEX:
1302             info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1303             info.size = vgpu->cfg_space.bar[info.index].size;
1304             if (!info.size) {
1305                 info.flags = 0;
1306                 break;
1307             }
1308 
1309             info.flags = VFIO_REGION_INFO_FLAG_READ |
1310                      VFIO_REGION_INFO_FLAG_WRITE;
1311             break;
1312         case VFIO_PCI_BAR1_REGION_INDEX:
1313             info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1314             info.size = 0;
1315             info.flags = 0;
1316             break;
1317         case VFIO_PCI_BAR2_REGION_INDEX:
1318             info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1319             info.flags = VFIO_REGION_INFO_FLAG_CAPS |
1320                     VFIO_REGION_INFO_FLAG_MMAP |
1321                     VFIO_REGION_INFO_FLAG_READ |
1322                     VFIO_REGION_INFO_FLAG_WRITE;
1323             info.size = gvt_aperture_sz(vgpu->gvt);
1324 
1325             sparse = kzalloc(struct_size(sparse, areas, nr_areas),
1326                      GFP_KERNEL);
1327             if (!sparse)
1328                 return -ENOMEM;
1329 
1330             sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1331             sparse->header.version = 1;
1332             sparse->nr_areas = nr_areas;
1333             cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1334             sparse->areas[0].offset =
1335                     PAGE_ALIGN(vgpu_aperture_offset(vgpu));
1336             sparse->areas[0].size = vgpu_aperture_sz(vgpu);
1337             break;
1338 
1339         case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
1340             info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1341             info.size = 0;
1342             info.flags = 0;
1343 
1344             gvt_dbg_core("get region info bar:%d\n", info.index);
1345             break;
1346 
1347         case VFIO_PCI_ROM_REGION_INDEX:
1348         case VFIO_PCI_VGA_REGION_INDEX:
1349             info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1350             info.size = 0;
1351             info.flags = 0;
1352 
1353             gvt_dbg_core("get region info index:%d\n", info.index);
1354             break;
1355         default:
1356             {
1357                 struct vfio_region_info_cap_type cap_type = {
1358                     .header.id = VFIO_REGION_INFO_CAP_TYPE,
1359                     .header.version = 1 };
1360 
1361                 if (info.index >= VFIO_PCI_NUM_REGIONS +
1362                         vgpu->num_regions)
1363                     return -EINVAL;
1364                 info.index =
1365                     array_index_nospec(info.index,
1366                             VFIO_PCI_NUM_REGIONS +
1367                             vgpu->num_regions);
1368 
1369                 i = info.index - VFIO_PCI_NUM_REGIONS;
1370 
1371                 info.offset =
1372                     VFIO_PCI_INDEX_TO_OFFSET(info.index);
1373                 info.size = vgpu->region[i].size;
1374                 info.flags = vgpu->region[i].flags;
1375 
1376                 cap_type.type = vgpu->region[i].type;
1377                 cap_type.subtype = vgpu->region[i].subtype;
1378 
1379                 ret = vfio_info_add_capability(&caps,
1380                             &cap_type.header,
1381                             sizeof(cap_type));
1382                 if (ret)
1383                     return ret;
1384             }
1385         }
1386 
1387         if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
1388             switch (cap_type_id) {
1389             case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1390                 ret = vfio_info_add_capability(&caps,
1391                     &sparse->header,
1392                     struct_size(sparse, areas,
1393                             sparse->nr_areas));
1394                 if (ret) {
1395                     kfree(sparse);
1396                     return ret;
1397                 }
1398                 break;
1399             default:
1400                 kfree(sparse);
1401                 return -EINVAL;
1402             }
1403         }
1404 
1405         if (caps.size) {
1406             info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
1407             if (info.argsz < sizeof(info) + caps.size) {
1408                 info.argsz = sizeof(info) + caps.size;
1409                 info.cap_offset = 0;
1410             } else {
1411                 vfio_info_cap_shift(&caps, sizeof(info));
1412                 if (copy_to_user((void __user *)arg +
1413                           sizeof(info), caps.buf,
1414                           caps.size)) {
1415                     kfree(caps.buf);
1416                     kfree(sparse);
1417                     return -EFAULT;
1418                 }
1419                 info.cap_offset = sizeof(info);
1420             }
1421 
1422             kfree(caps.buf);
1423         }
1424 
1425         kfree(sparse);
1426         return copy_to_user((void __user *)arg, &info, minsz) ?
1427             -EFAULT : 0;
1428     } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
1429         struct vfio_irq_info info;
1430 
1431         minsz = offsetofend(struct vfio_irq_info, count);
1432 
1433         if (copy_from_user(&info, (void __user *)arg, minsz))
1434             return -EFAULT;
1435 
1436         if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
1437             return -EINVAL;
1438 
1439         switch (info.index) {
1440         case VFIO_PCI_INTX_IRQ_INDEX:
1441         case VFIO_PCI_MSI_IRQ_INDEX:
1442             break;
1443         default:
1444             return -EINVAL;
1445         }
1446 
1447         info.flags = VFIO_IRQ_INFO_EVENTFD;
1448 
1449         info.count = intel_vgpu_get_irq_count(vgpu, info.index);
1450 
1451         if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
1452             info.flags |= (VFIO_IRQ_INFO_MASKABLE |
1453                        VFIO_IRQ_INFO_AUTOMASKED);
1454         else
1455             info.flags |= VFIO_IRQ_INFO_NORESIZE;
1456 
1457         return copy_to_user((void __user *)arg, &info, minsz) ?
1458             -EFAULT : 0;
1459     } else if (cmd == VFIO_DEVICE_SET_IRQS) {
1460         struct vfio_irq_set hdr;
1461         u8 *data = NULL;
1462         int ret = 0;
1463         size_t data_size = 0;
1464 
1465         minsz = offsetofend(struct vfio_irq_set, count);
1466 
1467         if (copy_from_user(&hdr, (void __user *)arg, minsz))
1468             return -EFAULT;
1469 
1470         if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
1471             int max = intel_vgpu_get_irq_count(vgpu, hdr.index);
1472 
1473             ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
1474                         VFIO_PCI_NUM_IRQS, &data_size);
1475             if (ret) {
1476                 gvt_vgpu_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
1477                 return -EINVAL;
1478             }
1479             if (data_size) {
1480                 data = memdup_user((void __user *)(arg + minsz),
1481                            data_size);
1482                 if (IS_ERR(data))
1483                     return PTR_ERR(data);
1484             }
1485         }
1486 
1487         ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
1488                     hdr.start, hdr.count, data);
1489         kfree(data);
1490 
1491         return ret;
1492     } else if (cmd == VFIO_DEVICE_RESET) {
1493         intel_gvt_reset_vgpu(vgpu);
1494         return 0;
1495     } else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
1496         struct vfio_device_gfx_plane_info dmabuf;
1497         int ret = 0;
1498 
1499         minsz = offsetofend(struct vfio_device_gfx_plane_info,
1500                     dmabuf_id);
1501         if (copy_from_user(&dmabuf, (void __user *)arg, minsz))
1502             return -EFAULT;
1503         if (dmabuf.argsz < minsz)
1504             return -EINVAL;
1505 
1506         ret = intel_vgpu_query_plane(vgpu, &dmabuf);
1507         if (ret != 0)
1508             return ret;
1509 
1510         return copy_to_user((void __user *)arg, &dmabuf, minsz) ?
1511                                 -EFAULT : 0;
1512     } else if (cmd == VFIO_DEVICE_GET_GFX_DMABUF) {
1513         __u32 dmabuf_id;
1514 
1515         if (get_user(dmabuf_id, (__u32 __user *)arg))
1516             return -EFAULT;
1517         return intel_vgpu_get_dmabuf(vgpu, dmabuf_id);
1518     }
1519 
1520     return -ENOTTY;
1521 }
1522 
1523 static ssize_t
1524 vgpu_id_show(struct device *dev, struct device_attribute *attr,
1525          char *buf)
1526 {
1527     struct intel_vgpu *vgpu = dev_get_drvdata(dev);
1528 
1529     return sprintf(buf, "%d\n", vgpu->id);
1530 }
1531 
1532 static DEVICE_ATTR_RO(vgpu_id);
1533 
1534 static struct attribute *intel_vgpu_attrs[] = {
1535     &dev_attr_vgpu_id.attr,
1536     NULL
1537 };
1538 
1539 static const struct attribute_group intel_vgpu_group = {
1540     .name = "intel_vgpu",
1541     .attrs = intel_vgpu_attrs,
1542 };
1543 
1544 static const struct attribute_group *intel_vgpu_groups[] = {
1545     &intel_vgpu_group,
1546     NULL,
1547 };
1548 
1549 static const struct vfio_device_ops intel_vgpu_dev_ops = {
1550     .open_device    = intel_vgpu_open_device,
1551     .close_device   = intel_vgpu_close_device,
1552     .read       = intel_vgpu_read,
1553     .write      = intel_vgpu_write,
1554     .mmap       = intel_vgpu_mmap,
1555     .ioctl      = intel_vgpu_ioctl,
1556     .dma_unmap  = intel_vgpu_dma_unmap,
1557 };
1558 
1559 static int intel_vgpu_probe(struct mdev_device *mdev)
1560 {
1561     struct device *pdev = mdev_parent_dev(mdev);
1562     struct intel_gvt *gvt = kdev_to_i915(pdev)->gvt;
1563     struct intel_vgpu_type *type;
1564     struct intel_vgpu *vgpu;
1565     int ret;
1566 
1567     type = &gvt->types[mdev_get_type_group_id(mdev)];
1568     if (!type)
1569         return -EINVAL;
1570 
1571     vgpu = intel_gvt_create_vgpu(gvt, type);
1572     if (IS_ERR(vgpu)) {
1573         gvt_err("failed to create intel vgpu: %ld\n", PTR_ERR(vgpu));
1574         return PTR_ERR(vgpu);
1575     }
1576 
1577     vfio_init_group_dev(&vgpu->vfio_device, &mdev->dev,
1578                 &intel_vgpu_dev_ops);
1579 
1580     dev_set_drvdata(&mdev->dev, vgpu);
1581     ret = vfio_register_emulated_iommu_dev(&vgpu->vfio_device);
1582     if (ret) {
1583         intel_gvt_destroy_vgpu(vgpu);
1584         return ret;
1585     }
1586 
1587     gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
1588              dev_name(mdev_dev(mdev)));
1589     return 0;
1590 }
1591 
1592 static void intel_vgpu_remove(struct mdev_device *mdev)
1593 {
1594     struct intel_vgpu *vgpu = dev_get_drvdata(&mdev->dev);
1595 
1596     if (WARN_ON_ONCE(vgpu->attached))
1597         return;
1598     intel_gvt_destroy_vgpu(vgpu);
1599 }
1600 
1601 static struct mdev_driver intel_vgpu_mdev_driver = {
1602     .driver = {
1603         .name       = "intel_vgpu_mdev",
1604         .owner      = THIS_MODULE,
1605         .dev_groups = intel_vgpu_groups,
1606     },
1607     .probe      = intel_vgpu_probe,
1608     .remove     = intel_vgpu_remove,
1609     .supported_type_groups  = gvt_vgpu_type_groups,
1610 };
1611 
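/*
 * Write-protect the guest page at @gfn via the KVM page-track framework so
 * that guest writes to it are forwarded to kvmgt_page_track_write().
 */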
1612 int intel_gvt_page_track_add(struct intel_vgpu *info, u64 gfn)
1613 {
1614     struct kvm *kvm = info->vfio_device.kvm;
1615     struct kvm_memory_slot *slot;
1616     int idx;
1617 
1618     if (!info->attached)
1619         return -ESRCH;
1620 
1621     idx = srcu_read_lock(&kvm->srcu);
1622     slot = gfn_to_memslot(kvm, gfn);
1623     if (!slot) {
1624         srcu_read_unlock(&kvm->srcu, idx);
1625         return -EINVAL;
1626     }
1627 
1628     write_lock(&kvm->mmu_lock);
1629 
1630     if (kvmgt_gfn_is_write_protected(info, gfn))
1631         goto out;
1632 
1633     kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1634     kvmgt_protect_table_add(info, gfn);
1635 
1636 out:
1637     write_unlock(&kvm->mmu_lock);
1638     srcu_read_unlock(&kvm->srcu, idx);
1639     return 0;
1640 }
1641 
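/* Remove the write protection installed by intel_gvt_page_track_add(). */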
1642 int intel_gvt_page_track_remove(struct intel_vgpu *info, u64 gfn)
1643 {
1644     struct kvm *kvm = info->vfio_device.kvm;
1645     struct kvm_memory_slot *slot;
1646     int idx;
1647 
1648     if (!info->attached)
1649         return 0;
1650 
1651     idx = srcu_read_lock(&kvm->srcu);
1652     slot = gfn_to_memslot(kvm, gfn);
1653     if (!slot) {
1654         srcu_read_unlock(&kvm->srcu, idx);
1655         return -EINVAL;
1656     }
1657 
1658     write_lock(&kvm->mmu_lock);
1659 
1660     if (!kvmgt_gfn_is_write_protected(info, gfn))
1661         goto out;
1662 
1663     kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1664     kvmgt_protect_table_del(info, gfn);
1665 
1666 out:
1667     write_unlock(&kvm->mmu_lock);
1668     srcu_read_unlock(&kvm->srcu, idx);
1669     return 0;
1670 }
1671 
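/*
 * KVM page-track notifier: forward writes that hit a write-protected guest
 * page to the vGPU page table emulation.
 */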
1672 static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1673         const u8 *val, int len,
1674         struct kvm_page_track_notifier_node *node)
1675 {
1676     struct intel_vgpu *info =
1677         container_of(node, struct intel_vgpu, track_node);
1678 
1679     if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
1680         intel_vgpu_page_track_handler(info, gpa,
1681                              (void *)val, len);
1682 }
1683 
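/*
 * KVM page-track notifier: a memslot is going away, so drop the write
 * protection of every tracked page it contains.
 */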
1684 static void kvmgt_page_track_flush_slot(struct kvm *kvm,
1685         struct kvm_memory_slot *slot,
1686         struct kvm_page_track_notifier_node *node)
1687 {
1688     int i;
1689     gfn_t gfn;
1690     struct intel_vgpu *info =
1691         container_of(node, struct intel_vgpu, track_node);
1692 
1693     write_lock(&kvm->mmu_lock);
1694     for (i = 0; i < slot->npages; i++) {
1695         gfn = slot->base_gfn + i;
1696         if (kvmgt_gfn_is_write_protected(info, gfn)) {
1697             kvm_slot_page_track_remove_page(kvm, slot, gfn,
1698                         KVM_PAGE_TRACK_WRITE);
1699             kvmgt_protect_table_del(info, gfn);
1700         }
1701     }
1702     write_unlock(&kvm->mmu_lock);
1703 }
1704 
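/* Release every device-specific VFIO region and free the region array. */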
1705 void intel_vgpu_detach_regions(struct intel_vgpu *vgpu)
1706 {
1707     int i;
1708 
1709     if (!vgpu->region)
1710         return;
1711 
1712     for (i = 0; i < vgpu->num_regions; i++)
1713         if (vgpu->region[i].ops->release)
1714             vgpu->region[i].ops->release(vgpu,
1715                     &vgpu->region[i]);
1716     vgpu->num_regions = 0;
1717     kfree(vgpu->region);
1718     vgpu->region = NULL;
1719 }
1720 
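/**
 * intel_gvt_dma_map_guest_page - map a guest page for DMA and cache it
 * @vgpu: the vGPU on whose behalf the page is mapped
 * @gfn: guest page frame number to map
 * @size: size of the mapping in bytes
 * @dma_addr: output DMA address
 *
 * Reuses a cached mapping for @gfn when one of the same size exists,
 * taking an extra reference on it; otherwise maps the page and adds it to
 * the vGPU's DMA cache. A cached mapping of a different size is unmapped
 * and re-mapped at the requested size.
 *
 * Returns:
 * Zero on success, negative error code on failure.
 */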
1721 int intel_gvt_dma_map_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
1722         unsigned long size, dma_addr_t *dma_addr)
1723 {
1724     struct gvt_dma *entry;
1725     int ret;
1726 
1727     if (!vgpu->attached)
1728         return -EINVAL;
1729 
1730     mutex_lock(&vgpu->cache_lock);
1731 
1732     entry = __gvt_cache_find_gfn(vgpu, gfn);
1733     if (!entry) {
1734         ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
1735         if (ret)
1736             goto err_unlock;
1737 
1738         ret = __gvt_cache_add(vgpu, gfn, *dma_addr, size);
1739         if (ret)
1740             goto err_unmap;
1741     } else if (entry->size != size) {
1742         /* the same gfn with different size: unmap and re-map */
1743         gvt_dma_unmap_page(vgpu, gfn, entry->dma_addr, entry->size);
1744         __gvt_cache_remove_entry(vgpu, entry);
1745 
1746         ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
1747         if (ret)
1748             goto err_unlock;
1749 
1750         ret = __gvt_cache_add(vgpu, gfn, *dma_addr, size);
1751         if (ret)
1752             goto err_unmap;
1753     } else {
1754         kref_get(&entry->ref);
1755         *dma_addr = entry->dma_addr;
1756     }
1757 
1758     mutex_unlock(&vgpu->cache_lock);
1759     return 0;
1760 
1761 err_unmap:
1762     gvt_dma_unmap_page(vgpu, gfn, *dma_addr, size);
1763 err_unlock:
1764     mutex_unlock(&vgpu->cache_lock);
1765     return ret;
1766 }
1767 
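/*
 * Take an extra reference on an already cached DMA mapping so it stays
 * pinned while in use; fails if no cache entry exists for @dma_addr.
 */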
1768 int intel_gvt_dma_pin_guest_page(struct intel_vgpu *vgpu, dma_addr_t dma_addr)
1769 {
1770     struct gvt_dma *entry;
1771     int ret = 0;
1772 
1773     if (!vgpu->attached)
1774         return -ENODEV;
1775 
1776     mutex_lock(&vgpu->cache_lock);
1777     entry = __gvt_cache_find_dma_addr(vgpu, dma_addr);
1778     if (entry)
1779         kref_get(&entry->ref);
1780     else
1781         ret = -ENOMEM;
1782     mutex_unlock(&vgpu->cache_lock);
1783 
1784     return ret;
1785 }
1786 
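/* kref release callback: unmap the guest page and drop its cache entry. */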
1787 static void __gvt_dma_release(struct kref *ref)
1788 {
1789     struct gvt_dma *entry = container_of(ref, typeof(*entry), ref);
1790 
1791     gvt_dma_unmap_page(entry->vgpu, entry->gfn, entry->dma_addr,
1792                entry->size);
1793     __gvt_cache_remove_entry(entry->vgpu, entry);
1794 }
1795 
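/*
 * Drop one reference on the cached mapping for @dma_addr; the page is
 * unmapped and the cache entry removed once the last reference is gone.
 */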
1796 void intel_gvt_dma_unmap_guest_page(struct intel_vgpu *vgpu,
1797         dma_addr_t dma_addr)
1798 {
1799     struct gvt_dma *entry;
1800 
1801     if (!vgpu->attached)
1802         return;
1803 
1804     mutex_lock(&vgpu->cache_lock);
1805     entry = __gvt_cache_find_dma_addr(vgpu, dma_addr);
1806     if (entry)
1807         kref_put(&entry->ref, __gvt_dma_release);
1808     mutex_unlock(&vgpu->cache_lock);
1809 }
1810 
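/*
 * Fill in the fixed device parameters (vGPU limit, config-space and MMIO
 * sizes, GTT layout, MSI capability offset) used by the emulation paths.
 */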
1811 static void init_device_info(struct intel_gvt *gvt)
1812 {
1813     struct intel_gvt_device_info *info = &gvt->device_info;
1814     struct pci_dev *pdev = to_pci_dev(gvt->gt->i915->drm.dev);
1815 
1816     info->max_support_vgpus = 8;
1817     info->cfg_space_size = PCI_CFG_SPACE_EXP_SIZE;
1818     info->mmio_size = 2 * 1024 * 1024;
1819     info->mmio_bar = 0;
1820     info->gtt_start_offset = 8 * 1024 * 1024;
1821     info->gtt_entry_size = 8;
1822     info->gtt_entry_size_shift = 3;
1823     info->gmadr_bytes_in_cmd = 8;
1824     info->max_surface_size = 36 * 1024 * 1024;
1825     info->msi_cap_offset = pdev->msi_cap;
1826 }
1827 
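/*
 * For each vGPU whose vblank request bit is set in service_request,
 * clear the bit and emulate a vblank if the vGPU is active.
 */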
1828 static void intel_gvt_test_and_emulate_vblank(struct intel_gvt *gvt)
1829 {
1830     struct intel_vgpu *vgpu;
1831     int id;
1832 
1833     mutex_lock(&gvt->lock);
1834     idr_for_each_entry(&gvt->vgpu_idr, vgpu, id) {
1835         if (test_and_clear_bit(INTEL_GVT_REQUEST_EMULATE_VBLANK + id,
1836                        (void *)&gvt->service_request)) {
1837             if (vgpu->active)
1838                 intel_vgpu_emulate_vblank(vgpu);
1839         }
1840     }
1841     mutex_unlock(&gvt->lock);
1842 }
1843 
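/*
 * Service thread main loop: sleep until a service request is raised,
 * then handle vblank emulation and scheduling requests until stopped.
 */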
1844 static int gvt_service_thread(void *data)
1845 {
1846     struct intel_gvt *gvt = (struct intel_gvt *)data;
1847     int ret;
1848 
1849     gvt_dbg_core("service thread start\n");
1850 
1851     while (!kthread_should_stop()) {
1852         ret = wait_event_interruptible(gvt->service_thread_wq,
1853                 kthread_should_stop() || gvt->service_request);
1854 
1855         if (kthread_should_stop())
1856             break;
1857 
1858         if (WARN_ONCE(ret, "service thread was woken up by a signal.\n"))
1859             continue;
1860 
1861         intel_gvt_test_and_emulate_vblank(gvt);
1862 
1863         if (test_bit(INTEL_GVT_REQUEST_SCHED,
1864                 (void *)&gvt->service_request) ||
1865             test_bit(INTEL_GVT_REQUEST_EVENT_SCHED,
1866                     (void *)&gvt->service_request)) {
1867             intel_gvt_schedule(gvt);
1868         }
1869     }
1870 
1871     return 0;
1872 }
1873 
1874 static void clean_service_thread(struct intel_gvt *gvt)
1875 {
1876     kthread_stop(gvt->service_thread);
1877 }
1878 
1879 static int init_service_thread(struct intel_gvt *gvt)
1880 {
1881     init_waitqueue_head(&gvt->service_thread_wq);
1882 
1883     gvt->service_thread = kthread_run(gvt_service_thread,
1884             gvt, "gvt_service_thread");
1885     if (IS_ERR(gvt->service_thread)) {
1886         gvt_err("failed to start service thread.\n");
1887         return PTR_ERR(gvt->service_thread);
1888     }
1889     return 0;
1890 }
1891 
1892 /**
1893  * intel_gvt_clean_device - clean a GVT device
1894  * @i915: drm i915 private data
1895  *
1896  * This function is called at the driver unloading stage to free the
1897  * resources owned by a GVT device.
1898  *
1899  */
1900 static void intel_gvt_clean_device(struct drm_i915_private *i915)
1901 {
1902     struct intel_gvt *gvt = fetch_and_zero(&i915->gvt);
1903 
1904     if (drm_WARN_ON(&i915->drm, !gvt))
1905         return;
1906 
1907     mdev_unregister_device(i915->drm.dev);
1908     intel_gvt_cleanup_vgpu_type_groups(gvt);
1909     intel_gvt_destroy_idle_vgpu(gvt->idle_vgpu);
1910     intel_gvt_clean_vgpu_types(gvt);
1911 
1912     intel_gvt_debugfs_clean(gvt);
1913     clean_service_thread(gvt);
1914     intel_gvt_clean_cmd_parser(gvt);
1915     intel_gvt_clean_sched_policy(gvt);
1916     intel_gvt_clean_workload_scheduler(gvt);
1917     intel_gvt_clean_gtt(gvt);
1918     intel_gvt_free_firmware(gvt);
1919     intel_gvt_clean_mmio_info(gvt);
1920     idr_destroy(&gvt->vgpu_idr);
1921 
1922     kfree(i915->gvt);
1923 }
1924 
1925 /**
1926  * intel_gvt_init_device - initialize a GVT device
1927  * @i915: drm i915 private data
1928  *
1929  * This function is called at the initialization stage to set up the
1930  * necessary GVT components.
1931  *
1932  * Returns:
1933  * Zero on success, negative error code on failure.
1934  *
1935  */
1936 static int intel_gvt_init_device(struct drm_i915_private *i915)
1937 {
1938     struct intel_gvt *gvt;
1939     struct intel_vgpu *vgpu;
1940     int ret;
1941 
1942     if (drm_WARN_ON(&i915->drm, i915->gvt))
1943         return -EEXIST;
1944 
1945     gvt = kzalloc(sizeof(struct intel_gvt), GFP_KERNEL);
1946     if (!gvt)
1947         return -ENOMEM;
1948 
1949     gvt_dbg_core("init gvt device\n");
1950 
1951     idr_init_base(&gvt->vgpu_idr, 1);
1952     spin_lock_init(&gvt->scheduler.mmio_context_lock);
1953     mutex_init(&gvt->lock);
1954     mutex_init(&gvt->sched_lock);
1955     gvt->gt = to_gt(i915);
1956     i915->gvt = gvt;
1957 
1958     init_device_info(gvt);
1959 
1960     ret = intel_gvt_setup_mmio_info(gvt);
1961     if (ret)
1962         goto out_clean_idr;
1963 
1964     intel_gvt_init_engine_mmio_context(gvt);
1965 
1966     ret = intel_gvt_load_firmware(gvt);
1967     if (ret)
1968         goto out_clean_mmio_info;
1969 
1970     ret = intel_gvt_init_irq(gvt);
1971     if (ret)
1972         goto out_free_firmware;
1973 
1974     ret = intel_gvt_init_gtt(gvt);
1975     if (ret)
1976         goto out_free_firmware;
1977 
1978     ret = intel_gvt_init_workload_scheduler(gvt);
1979     if (ret)
1980         goto out_clean_gtt;
1981 
1982     ret = intel_gvt_init_sched_policy(gvt);
1983     if (ret)
1984         goto out_clean_workload_scheduler;
1985 
1986     ret = intel_gvt_init_cmd_parser(gvt);
1987     if (ret)
1988         goto out_clean_sched_policy;
1989 
1990     ret = init_service_thread(gvt);
1991     if (ret)
1992         goto out_clean_cmd_parser;
1993 
1994     ret = intel_gvt_init_vgpu_types(gvt);
1995     if (ret)
1996         goto out_clean_thread;
1997 
1998     vgpu = intel_gvt_create_idle_vgpu(gvt);
1999     if (IS_ERR(vgpu)) {
2000         ret = PTR_ERR(vgpu);
2001         gvt_err("failed to create idle vgpu\n");
2002         goto out_clean_types;
2003     }
2004     gvt->idle_vgpu = vgpu;
2005 
2006     intel_gvt_debugfs_init(gvt);
2007 
2008     ret = intel_gvt_init_vgpu_type_groups(gvt);
2009     if (ret)
2010         goto out_destroy_idle_vgpu;
2011 
2012     ret = mdev_register_device(i915->drm.dev, &intel_vgpu_mdev_driver);
2013     if (ret)
2014         goto out_cleanup_vgpu_type_groups;
2015 
2016     gvt_dbg_core("gvt device initialization is done\n");
2017     return 0;
2018 
2019 out_cleanup_vgpu_type_groups:
2020     intel_gvt_cleanup_vgpu_type_groups(gvt);
2021 out_destroy_idle_vgpu:
2022     intel_gvt_destroy_idle_vgpu(gvt->idle_vgpu);
2023     intel_gvt_debugfs_clean(gvt);
2024 out_clean_types:
2025     intel_gvt_clean_vgpu_types(gvt);
2026 out_clean_thread:
2027     clean_service_thread(gvt);
2028 out_clean_cmd_parser:
2029     intel_gvt_clean_cmd_parser(gvt);
2030 out_clean_sched_policy:
2031     intel_gvt_clean_sched_policy(gvt);
2032 out_clean_workload_scheduler:
2033     intel_gvt_clean_workload_scheduler(gvt);
2034 out_clean_gtt:
2035     intel_gvt_clean_gtt(gvt);
2036 out_free_firmware:
2037     intel_gvt_free_firmware(gvt);
2038 out_clean_mmio_info:
2039     intel_gvt_clean_mmio_info(gvt);
2040 out_clean_idr:
2041     idr_destroy(&gvt->vgpu_idr);
2042     kfree(gvt);
2043     i915->gvt = NULL;
2044     return ret;
2045 }
2046 
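/* Restore fence registers, saved MMIO state and the GGTT on resume. */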
2047 static void intel_gvt_pm_resume(struct drm_i915_private *i915)
2048 {
2049     struct intel_gvt *gvt = i915->gvt;
2050 
2051     intel_gvt_restore_fence(gvt);
2052     intel_gvt_restore_mmio(gvt);
2053     intel_gvt_restore_ggtt(gvt);
2054 }
2055 
2056 static const struct intel_vgpu_ops intel_gvt_vgpu_ops = {
2057     .init_device    = intel_gvt_init_device,
2058     .clean_device   = intel_gvt_clean_device,
2059     .pm_resume  = intel_gvt_pm_resume,
2060 };
2061 
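/*
 * Module init: plug the GVT vGPU ops into i915, then register the mdev
 * driver; unwind the ops if mdev registration fails.
 */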
2062 static int __init kvmgt_init(void)
2063 {
2064     int ret;
2065 
2066     ret = intel_gvt_set_ops(&intel_gvt_vgpu_ops);
2067     if (ret)
2068         return ret;
2069 
2070     ret = mdev_register_driver(&intel_vgpu_mdev_driver);
2071     if (ret)
2072         intel_gvt_clear_ops(&intel_gvt_vgpu_ops);
2073     return ret;
2074 }
2075 
2076 static void __exit kvmgt_exit(void)
2077 {
2078     mdev_unregister_driver(&intel_vgpu_mdev_driver);
2079     intel_gvt_clear_ops(&intel_gvt_vgpu_ops);
2080 }
2081 
2082 module_init(kvmgt_init);
2083 module_exit(kvmgt_exit);
2084 
2085 MODULE_LICENSE("GPL and additional rights");
2086 MODULE_AUTHOR("Intel Corporation");