Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Device driver to expose SGX enclave memory to KVM guests.
0004  *
0005  * Copyright(c) 2021 Intel Corporation.
0006  */
0007 
0008 #include <linux/miscdevice.h>
0009 #include <linux/mm.h>
0010 #include <linux/mman.h>
0011 #include <linux/sched/mm.h>
0012 #include <linux/sched/signal.h>
0013 #include <linux/slab.h>
0014 #include <linux/xarray.h>
0015 #include <asm/sgx.h>
0016 #include <uapi/asm/sgx.h>
0017 
0018 #include "encls.h"
0019 #include "sgx.h"
0020 
/*
 * State of one virtual EPC instance.  One is allocated per open file in
 * sgx_vepc_open(), stored in file->private_data, and freed again in
 * sgx_vepc_release().
 */
struct sgx_vepc {
    /* EPC pages handed out to this instance, keyed by page offset */
    struct xarray page_array;
    /* Held by sgx_vepc_fault() around page_array lookup and insertion */
    struct mutex lock;
};
0025 
/*
 * SECS pages that could not be EREMOVE'd when their virtual EPC instance
 * was released, because they still have child pages in other virtual EPC
 * instances, and the lock that protects the list.  Both are initialized in
 * sgx_vepc_init(); sgx_vepc_release() retries every page on the list.
 */
static struct mutex zombie_secs_pages_lock;
static struct list_head zombie_secs_pages;
0032 
0033 static int __sgx_vepc_fault(struct sgx_vepc *vepc,
0034                 struct vm_area_struct *vma, unsigned long addr)
0035 {
0036     struct sgx_epc_page *epc_page;
0037     unsigned long index, pfn;
0038     int ret;
0039 
0040     WARN_ON(!mutex_is_locked(&vepc->lock));
0041 
0042     /* Calculate index of EPC page in virtual EPC's page_array */
0043     index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
0044 
0045     epc_page = xa_load(&vepc->page_array, index);
0046     if (epc_page)
0047         return 0;
0048 
0049     epc_page = sgx_alloc_epc_page(vepc, false);
0050     if (IS_ERR(epc_page))
0051         return PTR_ERR(epc_page);
0052 
0053     ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
0054     if (ret)
0055         goto err_free;
0056 
0057     pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
0058 
0059     ret = vmf_insert_pfn(vma, addr, pfn);
0060     if (ret != VM_FAULT_NOPAGE) {
0061         ret = -EFAULT;
0062         goto err_delete;
0063     }
0064 
0065     return 0;
0066 
0067 err_delete:
0068     xa_erase(&vepc->page_array, index);
0069 err_free:
0070     sgx_free_epc_page(epc_page);
0071     return ret;
0072 }
0073 
0074 static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
0075 {
0076     struct vm_area_struct *vma = vmf->vma;
0077     struct sgx_vepc *vepc = vma->vm_private_data;
0078     int ret;
0079 
0080     mutex_lock(&vepc->lock);
0081     ret = __sgx_vepc_fault(vepc, vma, vmf->address);
0082     mutex_unlock(&vepc->lock);
0083 
0084     if (!ret)
0085         return VM_FAULT_NOPAGE;
0086 
0087     if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
0088         mmap_read_unlock(vma->vm_mm);
0089         return VM_FAULT_RETRY;
0090     }
0091 
0092     return VM_FAULT_SIGBUS;
0093 }
0094 
/* VMA operations installed by sgx_vepc_mmap(); pages are populated on fault. */
static const struct vm_operations_struct sgx_vepc_vm_ops = {
    .fault = sgx_vepc_fault,
};
0098 
0099 static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
0100 {
0101     struct sgx_vepc *vepc = file->private_data;
0102 
0103     if (!(vma->vm_flags & VM_SHARED))
0104         return -EINVAL;
0105 
0106     vma->vm_ops = &sgx_vepc_vm_ops;
0107     /* Don't copy VMA in fork() */
0108     vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
0109     vma->vm_private_data = vepc;
0110 
0111     return 0;
0112 }
0113 
/*
 * Run EREMOVE on a previously guest-owned EPC page so that it can be
 * returned to the general EPC page pool.
 *
 * A guest cannot be trusted to have left the page in a good state, so
 * EREMOVE is executed unconditionally.  If the guest already EREMOVE'd the
 * page properly, the extra EREMOVE is harmless.
 *
 * Return: the result of __eremove() on the page's virtual address.
 */
static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
{
    return __eremove(sgx_get_epc_virt_addr(epc_page));
}
0127 
0128 static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
0129 {
0130     int ret = sgx_vepc_remove_page(epc_page);
0131     if (ret) {
0132         /*
0133          * Only SGX_CHILD_PRESENT is expected, which is because of
0134          * EREMOVE'ing an SECS still with child, in which case it can
0135          * be handled by EREMOVE'ing the SECS again after all pages in
0136          * virtual EPC have been EREMOVE'd. See comments in below in
0137          * sgx_vepc_release().
0138          *
0139          * The user of virtual EPC (KVM) needs to guarantee there's no
0140          * logical processor is still running in the enclave in guest,
0141          * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
0142          * handled here.
0143          */
0144         WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
0145               ret, ret);
0146         return ret;
0147     }
0148 
0149     sgx_free_epc_page(epc_page);
0150     return 0;
0151 }
0152 
0153 static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
0154 {
0155     struct sgx_epc_page *entry;
0156     unsigned long index;
0157     long failures = 0;
0158 
0159     xa_for_each(&vepc->page_array, index, entry) {
0160         int ret = sgx_vepc_remove_page(entry);
0161         if (ret) {
0162             if (ret == SGX_CHILD_PRESENT) {
0163                 /* The page is a SECS, userspace will retry.  */
0164                 failures++;
0165             } else {
0166                 /*
0167                  * Report errors due to #GP or SGX_ENCLAVE_ACT; do not
0168                  * WARN, as userspace can induce said failures by
0169                  * calling the ioctl concurrently on multiple vEPCs or
0170                  * while one or more CPUs is running the enclave.  Only
0171                  * a #PF on EREMOVE indicates a kernel/hardware issue.
0172                  */
0173                 WARN_ON_ONCE(encls_faulted(ret) &&
0174                          ENCLS_TRAPNR(ret) != X86_TRAP_GP);
0175                 return -EBUSY;
0176             }
0177         }
0178         cond_resched();
0179     }
0180 
0181     /*
0182      * Return the number of SECS pages that failed to be removed, so
0183      * userspace knows that it has to retry.
0184      */
0185     return failures;
0186 }
0187 
0188 static int sgx_vepc_release(struct inode *inode, struct file *file)
0189 {
0190     struct sgx_vepc *vepc = file->private_data;
0191     struct sgx_epc_page *epc_page, *tmp, *entry;
0192     unsigned long index;
0193 
0194     LIST_HEAD(secs_pages);
0195 
0196     xa_for_each(&vepc->page_array, index, entry) {
0197         /*
0198          * Remove all normal, child pages.  sgx_vepc_free_page()
0199          * will fail if EREMOVE fails, but this is OK and expected on
0200          * SECS pages.  Those can only be EREMOVE'd *after* all their
0201          * child pages. Retries below will clean them up.
0202          */
0203         if (sgx_vepc_free_page(entry))
0204             continue;
0205 
0206         xa_erase(&vepc->page_array, index);
0207     }
0208 
0209     /*
0210      * Retry EREMOVE'ing pages.  This will clean up any SECS pages that
0211      * only had children in this 'epc' area.
0212      */
0213     xa_for_each(&vepc->page_array, index, entry) {
0214         epc_page = entry;
0215         /*
0216          * An EREMOVE failure here means that the SECS page still
0217          * has children.  But, since all children in this 'sgx_vepc'
0218          * have been removed, the SECS page must have a child on
0219          * another instance.
0220          */
0221         if (sgx_vepc_free_page(epc_page))
0222             list_add_tail(&epc_page->list, &secs_pages);
0223 
0224         xa_erase(&vepc->page_array, index);
0225     }
0226 
0227     /*
0228      * SECS pages are "pinned" by child pages, and "unpinned" once all
0229      * children have been EREMOVE'd.  A child page in this instance
0230      * may have pinned an SECS page encountered in an earlier release(),
0231      * creating a zombie.  Since some children were EREMOVE'd above,
0232      * try to EREMOVE all zombies in the hopes that one was unpinned.
0233      */
0234     mutex_lock(&zombie_secs_pages_lock);
0235     list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
0236         /*
0237          * Speculatively remove the page from the list of zombies,
0238          * if the page is successfully EREMOVE'd it will be added to
0239          * the list of free pages.  If EREMOVE fails, throw the page
0240          * on the local list, which will be spliced on at the end.
0241          */
0242         list_del(&epc_page->list);
0243 
0244         if (sgx_vepc_free_page(epc_page))
0245             list_add_tail(&epc_page->list, &secs_pages);
0246     }
0247 
0248     if (!list_empty(&secs_pages))
0249         list_splice_tail(&secs_pages, &zombie_secs_pages);
0250     mutex_unlock(&zombie_secs_pages_lock);
0251 
0252     xa_destroy(&vepc->page_array);
0253     kfree(vepc);
0254 
0255     return 0;
0256 }
0257 
0258 static int sgx_vepc_open(struct inode *inode, struct file *file)
0259 {
0260     struct sgx_vepc *vepc;
0261 
0262     vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
0263     if (!vepc)
0264         return -ENOMEM;
0265     mutex_init(&vepc->lock);
0266     xa_init(&vepc->page_array);
0267 
0268     file->private_data = vepc;
0269 
0270     return 0;
0271 }
0272 
0273 static long sgx_vepc_ioctl(struct file *file,
0274                unsigned int cmd, unsigned long arg)
0275 {
0276     struct sgx_vepc *vepc = file->private_data;
0277 
0278     switch (cmd) {
0279     case SGX_IOC_VEPC_REMOVE_ALL:
0280         if (arg)
0281             return -EINVAL;
0282         return sgx_vepc_remove_all(vepc);
0283 
0284     default:
0285         return -ENOTTY;
0286     }
0287 }
0288 
/*
 * File operations of the virtual EPC device.  The same handler serves both
 * native and compat ioctl calls.
 */
static const struct file_operations sgx_vepc_fops = {
    .owner      = THIS_MODULE,
    .open       = sgx_vepc_open,
    .unlocked_ioctl = sgx_vepc_ioctl,
    .compat_ioctl   = sgx_vepc_ioctl,
    .release    = sgx_vepc_release,
    .mmap       = sgx_vepc_mmap,
};
0297 
/* Misc device "sgx_vepc" with a dynamically assigned minor number. */
static struct miscdevice sgx_vepc_dev = {
    .minor      = MISC_DYNAMIC_MINOR,
    .name       = "sgx_vepc",
    .nodename   = "sgx_vepc",
    .fops       = &sgx_vepc_fops,
};
0304 
0305 int __init sgx_vepc_init(void)
0306 {
0307     /* SGX virtualization requires KVM to work */
0308     if (!cpu_feature_enabled(X86_FEATURE_VMX))
0309         return -ENODEV;
0310 
0311     INIT_LIST_HEAD(&zombie_secs_pages);
0312     mutex_init(&zombie_secs_pages_lock);
0313 
0314     return misc_register(&sgx_vepc_dev);
0315 }
0316 
0317 /**
0318  * sgx_virt_ecreate() - Run ECREATE on behalf of guest
0319  * @pageinfo:   Pointer to PAGEINFO structure
0320  * @secs:   Userspace pointer to SECS page
0321  * @trapnr: trap number injected to guest in case of ECREATE error
0322  *
0323  * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
0324  * of enforcing policies of guest's enclaves, and return the trap number
0325  * which should be injected to guest in case of any ECREATE error.
0326  *
0327  * Return:
0328  * -  0:    ECREATE was successful.
0329  * - <0:    on error.
0330  */
0331 int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
0332              int *trapnr)
0333 {
0334     int ret;
0335 
0336     /*
0337      * @secs is an untrusted, userspace-provided address.  It comes from
0338      * KVM and is assumed to be a valid pointer which points somewhere in
0339      * userspace.  This can fault and call SGX or other fault handlers when
0340      * userspace mapping @secs doesn't exist.
0341      *
0342      * Add a WARN() to make sure @secs is already valid userspace pointer
0343      * from caller (KVM), who should already have handled invalid pointer
0344      * case (for instance, made by malicious guest).  All other checks,
0345      * such as alignment of @secs, are deferred to ENCLS itself.
0346      */
0347     if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
0348         return -EINVAL;
0349 
0350     __uaccess_begin();
0351     ret = __ecreate(pageinfo, (void *)secs);
0352     __uaccess_end();
0353 
0354     if (encls_faulted(ret)) {
0355         *trapnr = ENCLS_TRAPNR(ret);
0356         return -EFAULT;
0357     }
0358 
0359     /* ECREATE doesn't return an error code, it faults or succeeds. */
0360     WARN_ON_ONCE(ret);
0361     return 0;
0362 }
0363 EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
0364 
0365 static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
0366                 void __user *secs)
0367 {
0368     int ret;
0369 
0370     /*
0371      * Make sure all userspace pointers from caller (KVM) are valid.
0372      * All other checks deferred to ENCLS itself.  Also see comment
0373      * for @secs in sgx_virt_ecreate().
0374      */
0375 #define SGX_EINITTOKEN_SIZE 304
0376     if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
0377              !access_ok(token, SGX_EINITTOKEN_SIZE) ||
0378              !access_ok(secs, PAGE_SIZE)))
0379         return -EINVAL;
0380 
0381     __uaccess_begin();
0382     ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
0383     __uaccess_end();
0384 
0385     return ret;
0386 }
0387 
0388 /**
0389  * sgx_virt_einit() - Run EINIT on behalf of guest
0390  * @sigstruct:      Userspace pointer to SIGSTRUCT structure
0391  * @token:      Userspace pointer to EINITTOKEN structure
0392  * @secs:       Userspace pointer to SECS page
0393  * @lepubkeyhash:   Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
0394  * @trapnr:     trap number injected to guest in case of EINIT error
0395  *
0396  * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
0397  * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
0398  * needs to update hardware values to guest's virtual MSR values in order to
0399  * ensure EINIT is executed with expected hardware values.
0400  *
0401  * Return:
0402  * -  0:    EINIT was successful.
0403  * - <0:    on error.
0404  */
0405 int sgx_virt_einit(void __user *sigstruct, void __user *token,
0406            void __user *secs, u64 *lepubkeyhash, int *trapnr)
0407 {
0408     int ret;
0409 
0410     if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
0411         ret = __sgx_virt_einit(sigstruct, token, secs);
0412     } else {
0413         preempt_disable();
0414 
0415         sgx_update_lepubkeyhash(lepubkeyhash);
0416 
0417         ret = __sgx_virt_einit(sigstruct, token, secs);
0418         preempt_enable();
0419     }
0420 
0421     /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
0422     if (ret == -EINVAL)
0423         return ret;
0424 
0425     if (encls_faulted(ret)) {
0426         *trapnr = ENCLS_TRAPNR(ret);
0427         return -EFAULT;
0428     }
0429 
0430     return ret;
0431 }
0432 EXPORT_SYMBOL_GPL(sgx_virt_einit);