virt/nitro_enclaves/ne_misc_dev.c

0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
0004  */
0005
0006 /**
0007  * DOC: Enclave lifetime management driver for Nitro Enclaves (NE).
0008  * Nitro is a hypervisor that has been developed by Amazon.
0009  */
0010
0011 #include <linux/anon_inodes.h>
0012 #include <linux/capability.h>
0013 #include <linux/cpu.h>
0014 #include <linux/device.h>
0015 #include <linux/file.h>
0016 #include <linux/hugetlb.h>
0017 #include <linux/limits.h>
0018 #include <linux/list.h>
0019 #include <linux/miscdevice.h>
0020 #include <linux/mm.h>
0021 #include <linux/mman.h>
0022 #include <linux/module.h>
0023 #include <linux/mutex.h>
0024 #include <linux/nitro_enclaves.h>
0025 #include <linux/pci.h>
0026 #include <linux/poll.h>
0027 #include <linux/range.h>
0028 #include <linux/slab.h>
0029 #include <linux/types.h>
0030 #include <uapi/linux/vm_sockets.h>
0031
0032 #include "ne_misc_dev.h"
0033 #include "ne_pci_dev.h"
0034
/**
 * NE_CPUS_SIZE - Size of the buffer holding the NE CPU pool cpu-list string
 *		  (comma separated). Sized for max 128 CPUs, for now. The NE
 *		  CPU pool includes CPUs from a single NUMA node.
 */
#define NE_CPUS_SIZE		(512)

/**
 * NE_EIF_LOAD_OFFSET - The offset (8 MiB) in enclave memory where the Enclave
 *			Image Format (EIF) image is copied.
 */
#define NE_EIF_LOAD_OFFSET	(8 * 1024UL * 1024UL)

/**
 * NE_MIN_ENCLAVE_MEM_SIZE - The minimum memory size (64 MiB) an enclave can
 *			     be launched with.
 */
#define NE_MIN_ENCLAVE_MEM_SIZE	(64 * 1024UL * 1024UL)

/**
 * NE_MIN_MEM_REGION_SIZE - The minimum size (2 MiB) of an enclave memory
 *			    region. Region sizes and addresses are checked
 *			    against this granularity.
 */
#define NE_MIN_MEM_REGION_SIZE	(2 * 1024UL * 1024UL)

/**
 * NE_PARENT_VM_CID - The CID for the vsock device of the primary / parent VM.
 */
#define NE_PARENT_VM_CID	(3)
0063
0064 static long ne_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
0065
0066 static const struct file_operations ne_fops = {
0067     .owner      = THIS_MODULE,
0068     .llseek     = noop_llseek,
0069     .unlocked_ioctl = ne_ioctl,
0070 };
0071
0072 static struct miscdevice ne_misc_dev = {
0073     .minor  = MISC_DYNAMIC_MINOR,
0074     .name   = "nitro_enclaves",
0075     .fops   = &ne_fops,
0076     .mode   = 0660,
0077 };
0078
0079 struct ne_devs ne_devs = {
0080     .ne_misc_dev    = &ne_misc_dev,
0081 };
0082
0083 /*
0084  * TODO: Update logic to create new sysfs entries instead of using
0085  * a kernel parameter e.g. if multiple sysfs files needed.
0086  */
0087 static int ne_set_kernel_param(const char *val, const struct kernel_param *kp);
0088
0089 static const struct kernel_param_ops ne_cpu_pool_ops = {
0090     .get    = param_get_string,
0091     .set    = ne_set_kernel_param,
0092 };
0093
0094 static char ne_cpus[NE_CPUS_SIZE];
0095 static struct kparam_string ne_cpus_arg = {
0096     .maxlen = sizeof(ne_cpus),
0097     .string = ne_cpus,
0098 };
0099
0100 module_param_cb(ne_cpus, &ne_cpu_pool_ops, &ne_cpus_arg, 0644);
0101 /* https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists */
0102 MODULE_PARM_DESC(ne_cpus, "<cpu-list> - CPU pool used for Nitro Enclaves");
0103
0104 /**
0105  * struct ne_cpu_pool - CPU pool used for Nitro Enclaves.
0106  * @avail_threads_per_core: Available full CPU cores to be dedicated to
0107  *              enclave(s). The cpumasks from the array, indexed
0108  *              by core id, contain all the threads from the
0109  *              available cores, that are not set for created
0110  *              enclave(s). The full CPU cores are part of the
0111  *              NE CPU pool.
0112  * @mutex:          Mutex for the access to the NE CPU pool.
0113  * @nr_parent_vm_cores :    The size of the available threads per core array.
0114  *              The total number of CPU cores available on the
0115  *              primary / parent VM.
0116  * @nr_threads_per_core:    The number of threads that a full CPU core has.
0117  * @numa_node:          NUMA node of the CPUs in the pool.
0118  */
0119 struct ne_cpu_pool {
0120     cpumask_var_t   *avail_threads_per_core;
0121     struct mutex    mutex;
0122     unsigned int    nr_parent_vm_cores;
0123     unsigned int    nr_threads_per_core;
0124     int     numa_node;
0125 };
0126
0127 static struct ne_cpu_pool ne_cpu_pool;
0128
/**
 * struct ne_phys_contig_mem_regions - Contiguous physical memory regions.
 * @num:	The number of entries currently used in @regions.
 * @regions:	The array of physical memory regions; each entry is a range
 *		with inclusive start and end addresses.
 */
struct ne_phys_contig_mem_regions {
	unsigned long	num;
	struct range	*regions;
};
0138
0139 /**
0140  * ne_check_enclaves_created() - Verify if at least one enclave has been created.
0141  * @void:   No parameters provided.
0142  *
0143  * Context: Process context.
0144  * Return:
0145  * * True if at least one enclave is created.
0146  * * False otherwise.
0147  */
0148 static bool ne_check_enclaves_created(void)
0149 {
0150     struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev;
0151     bool ret = false;
0152
0153     if (!ne_pci_dev)
0154         return ret;
0155
0156     mutex_lock(&ne_pci_dev->enclaves_list_mutex);
0157
0158     if (!list_empty(&ne_pci_dev->enclaves_list))
0159         ret = true;
0160
0161     mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
0162
0163     return ret;
0164 }
0165
0166 /**
0167  * ne_setup_cpu_pool() - Set the NE CPU pool after handling sanity checks such
0168  *           as not sharing CPU cores with the primary / parent VM
0169  *           or not using CPU 0, which should remain available for
0170  *           the primary / parent VM. Offline the CPUs from the
0171  *           pool after the checks passed.
0172  * @ne_cpu_list:    The CPU list used for setting NE CPU pool.
0173  *
0174  * Context: Process context.
0175  * Return:
0176  * * 0 on success.
0177  * * Negative return value on failure.
0178  */
0179 static int ne_setup_cpu_pool(const char *ne_cpu_list)
0180 {
0181     int core_id = -1;
0182     unsigned int cpu = 0;
0183     cpumask_var_t cpu_pool;
0184     unsigned int cpu_sibling = 0;
0185     unsigned int i = 0;
0186     int numa_node = -1;
0187     int rc = -EINVAL;
0188
0189     if (!zalloc_cpumask_var(&cpu_pool, GFP_KERNEL))
0190         return -ENOMEM;
0191
0192     mutex_lock(&ne_cpu_pool.mutex);
0193
0194     rc = cpulist_parse(ne_cpu_list, cpu_pool);
0195     if (rc < 0) {
0196         pr_err("%s: Error in cpulist parse [rc=%d]\n", ne_misc_dev.name, rc);
0197
0198         goto free_pool_cpumask;
0199     }
0200
0201     cpu = cpumask_any(cpu_pool);
0202     if (cpu >= nr_cpu_ids) {
0203         pr_err("%s: No CPUs available in CPU pool\n", ne_misc_dev.name);
0204
0205         rc = -EINVAL;
0206
0207         goto free_pool_cpumask;
0208     }
0209
0210     /*
0211      * Check if the CPUs are online, to further get info about them
0212      * e.g. numa node, core id, siblings.
0213      */
0214     for_each_cpu(cpu, cpu_pool)
0215         if (cpu_is_offline(cpu)) {
0216             pr_err("%s: CPU %d is offline, has to be online to get its metadata\n",
0217                    ne_misc_dev.name, cpu);
0218
0219             rc = -EINVAL;
0220
0221             goto free_pool_cpumask;
0222         }
0223
0224     /*
0225      * Check if the CPUs from the NE CPU pool are from the same NUMA node.
0226      */
0227     for_each_cpu(cpu, cpu_pool)
0228         if (numa_node < 0) {
0229             numa_node = cpu_to_node(cpu);
0230             if (numa_node < 0) {
0231                 pr_err("%s: Invalid NUMA node %d\n",
0232                        ne_misc_dev.name, numa_node);
0233
0234                 rc = -EINVAL;
0235
0236                 goto free_pool_cpumask;
0237             }
0238         } else {
0239             if (numa_node != cpu_to_node(cpu)) {
0240                 pr_err("%s: CPUs with different NUMA nodes\n",
0241                        ne_misc_dev.name);
0242
0243                 rc = -EINVAL;
0244
0245                 goto free_pool_cpumask;
0246             }
0247         }
0248
0249     /*
0250      * Check if CPU 0 and its siblings are included in the provided CPU pool
0251      * They should remain available for the primary / parent VM.
0252      */
0253     if (cpumask_test_cpu(0, cpu_pool)) {
0254         pr_err("%s: CPU 0 has to remain available\n", ne_misc_dev.name);
0255
0256         rc = -EINVAL;
0257
0258         goto free_pool_cpumask;
0259     }
0260
0261     for_each_cpu(cpu_sibling, topology_sibling_cpumask(0)) {
0262         if (cpumask_test_cpu(cpu_sibling, cpu_pool)) {
0263             pr_err("%s: CPU sibling %d for CPU 0 is in CPU pool\n",
0264                    ne_misc_dev.name, cpu_sibling);
0265
0266             rc = -EINVAL;
0267
0268             goto free_pool_cpumask;
0269         }
0270     }
0271
0272     /*
0273      * Check if CPU siblings are included in the provided CPU pool. The
0274      * expectation is that full CPU cores are made available in the CPU pool
0275      * for enclaves.
0276      */
0277     for_each_cpu(cpu, cpu_pool) {
0278         for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu)) {
0279             if (!cpumask_test_cpu(cpu_sibling, cpu_pool)) {
0280                 pr_err("%s: CPU %d is not in CPU pool\n",
0281                        ne_misc_dev.name, cpu_sibling);
0282
0283                 rc = -EINVAL;
0284
0285                 goto free_pool_cpumask;
0286             }
0287         }
0288     }
0289
0290     /* Calculate the number of threads from a full CPU core. */
0291     cpu = cpumask_any(cpu_pool);
0292     for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu))
0293         ne_cpu_pool.nr_threads_per_core++;
0294
0295     ne_cpu_pool.nr_parent_vm_cores = nr_cpu_ids / ne_cpu_pool.nr_threads_per_core;
0296
0297     ne_cpu_pool.avail_threads_per_core = kcalloc(ne_cpu_pool.nr_parent_vm_cores,
0298                              sizeof(*ne_cpu_pool.avail_threads_per_core),
0299                              GFP_KERNEL);
0300     if (!ne_cpu_pool.avail_threads_per_core) {
0301         rc = -ENOMEM;
0302
0303         goto free_pool_cpumask;
0304     }
0305
0306     for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
0307         if (!zalloc_cpumask_var(&ne_cpu_pool.avail_threads_per_core[i], GFP_KERNEL)) {
0308             rc = -ENOMEM;
0309
0310             goto free_cores_cpumask;
0311         }
0312
0313     /*
0314      * Split the NE CPU pool in threads per core to keep the CPU topology
0315      * after offlining the CPUs.
0316      */
0317     for_each_cpu(cpu, cpu_pool) {
0318         core_id = topology_core_id(cpu);
0319         if (core_id < 0 || core_id >= ne_cpu_pool.nr_parent_vm_cores) {
0320             pr_err("%s: Invalid core id  %d for CPU %d\n",
0321                    ne_misc_dev.name, core_id, cpu);
0322
0323             rc = -EINVAL;
0324
0325             goto clear_cpumask;
0326         }
0327
0328         cpumask_set_cpu(cpu, ne_cpu_pool.avail_threads_per_core[core_id]);
0329     }
0330
0331     /*
0332      * CPUs that are given to enclave(s) should not be considered online
0333      * by Linux anymore, as the hypervisor will degrade them to floating.
0334      * The physical CPUs (full cores) are carved out of the primary / parent
0335      * VM and given to the enclave VM. The same number of vCPUs would run
0336      * on less pCPUs for the primary / parent VM.
0337      *
0338      * We offline them here, to not degrade performance and expose correct
0339      * topology to Linux and user space.
0340      */
0341     for_each_cpu(cpu, cpu_pool) {
0342         rc = remove_cpu(cpu);
0343         if (rc != 0) {
0344             pr_err("%s: CPU %d is not offlined [rc=%d]\n",
0345                    ne_misc_dev.name, cpu, rc);
0346
0347             goto online_cpus;
0348         }
0349     }
0350
0351     free_cpumask_var(cpu_pool);
0352
0353     ne_cpu_pool.numa_node = numa_node;
0354
0355     mutex_unlock(&ne_cpu_pool.mutex);
0356
0357     return 0;
0358
0359 online_cpus:
0360     for_each_cpu(cpu, cpu_pool)
0361         add_cpu(cpu);
0362 clear_cpumask:
0363     for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
0364         cpumask_clear(ne_cpu_pool.avail_threads_per_core[i]);
0365 free_cores_cpumask:
0366     for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
0367         free_cpumask_var(ne_cpu_pool.avail_threads_per_core[i]);
0368     kfree(ne_cpu_pool.avail_threads_per_core);
0369 free_pool_cpumask:
0370     free_cpumask_var(cpu_pool);
0371     ne_cpu_pool.nr_parent_vm_cores = 0;
0372     ne_cpu_pool.nr_threads_per_core = 0;
0373     ne_cpu_pool.numa_node = -1;
0374     mutex_unlock(&ne_cpu_pool.mutex);
0375
0376     return rc;
0377 }
0378
0379 /**
0380  * ne_teardown_cpu_pool() - Online the CPUs from the NE CPU pool and cleanup the
0381  *              CPU pool.
0382  * @void:   No parameters provided.
0383  *
0384  * Context: Process context.
0385  */
0386 static void ne_teardown_cpu_pool(void)
0387 {
0388     unsigned int cpu = 0;
0389     unsigned int i = 0;
0390     int rc = -EINVAL;
0391
0392     mutex_lock(&ne_cpu_pool.mutex);
0393
0394     if (!ne_cpu_pool.nr_parent_vm_cores) {
0395         mutex_unlock(&ne_cpu_pool.mutex);
0396
0397         return;
0398     }
0399
0400     for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) {
0401         for_each_cpu(cpu, ne_cpu_pool.avail_threads_per_core[i]) {
0402             rc = add_cpu(cpu);
0403             if (rc != 0)
0404                 pr_err("%s: CPU %d is not onlined [rc=%d]\n",
0405                        ne_misc_dev.name, cpu, rc);
0406         }
0407
0408         cpumask_clear(ne_cpu_pool.avail_threads_per_core[i]);
0409
0410         free_cpumask_var(ne_cpu_pool.avail_threads_per_core[i]);
0411     }
0412
0413     kfree(ne_cpu_pool.avail_threads_per_core);
0414     ne_cpu_pool.nr_parent_vm_cores = 0;
0415     ne_cpu_pool.nr_threads_per_core = 0;
0416     ne_cpu_pool.numa_node = -1;
0417
0418     mutex_unlock(&ne_cpu_pool.mutex);
0419 }
0420
0421 /**
0422  * ne_set_kernel_param() - Set the NE CPU pool value via the NE kernel parameter.
0423  * @val:    NE CPU pool string value.
0424  * @kp :    NE kernel parameter associated with the NE CPU pool.
0425  *
0426  * Context: Process context.
0427  * Return:
0428  * * 0 on success.
0429  * * Negative return value on failure.
0430  */
0431 static int ne_set_kernel_param(const char *val, const struct kernel_param *kp)
0432 {
0433     char error_val[] = "";
0434     int rc = -EINVAL;
0435
0436     if (!capable(CAP_SYS_ADMIN))
0437         return -EPERM;
0438
0439     if (ne_check_enclaves_created()) {
0440         pr_err("%s: The CPU pool is used by enclave(s)\n", ne_misc_dev.name);
0441
0442         return -EPERM;
0443     }
0444
0445     ne_teardown_cpu_pool();
0446
0447     rc = ne_setup_cpu_pool(val);
0448     if (rc < 0) {
0449         pr_err("%s: Error in setup CPU pool [rc=%d]\n", ne_misc_dev.name, rc);
0450
0451         param_set_copystring(error_val, kp);
0452
0453         return rc;
0454     }
0455
0456     rc = param_set_copystring(val, kp);
0457     if (rc < 0) {
0458         pr_err("%s: Error in param set copystring [rc=%d]\n", ne_misc_dev.name, rc);
0459
0460         ne_teardown_cpu_pool();
0461
0462         param_set_copystring(error_val, kp);
0463
0464         return rc;
0465     }
0466
0467     return 0;
0468 }
0469
0470 /**
0471  * ne_donated_cpu() - Check if the provided CPU is already used by the enclave.
0472  * @ne_enclave :    Private data associated with the current enclave.
0473  * @cpu:        CPU to check if already used.
0474  *
0475  * Context: Process context. This function is called with the ne_enclave mutex held.
0476  * Return:
0477  * * True if the provided CPU is already used by the enclave.
0478  * * False otherwise.
0479  */
0480 static bool ne_donated_cpu(struct ne_enclave *ne_enclave, unsigned int cpu)
0481 {
0482     if (cpumask_test_cpu(cpu, ne_enclave->vcpu_ids))
0483         return true;
0484
0485     return false;
0486 }
0487
0488 /**
0489  * ne_get_unused_core_from_cpu_pool() - Get the id of a full core from the
0490  *                  NE CPU pool.
0491  * @void:   No parameters provided.
0492  *
0493  * Context: Process context. This function is called with the ne_enclave and
0494  *      ne_cpu_pool mutexes held.
0495  * Return:
0496  * * Core id.
0497  * * -1 if no CPU core available in the pool.
0498  */
0499 static int ne_get_unused_core_from_cpu_pool(void)
0500 {
0501     int core_id = -1;
0502     unsigned int i = 0;
0503
0504     for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
0505         if (!cpumask_empty(ne_cpu_pool.avail_threads_per_core[i])) {
0506             core_id = i;
0507
0508             break;
0509         }
0510
0511     return core_id;
0512 }
0513
0514 /**
0515  * ne_set_enclave_threads_per_core() - Set the threads of the provided core in
0516  *                     the enclave data structure.
0517  * @ne_enclave :    Private data associated with the current enclave.
0518  * @core_id:        Core id to get its threads from the NE CPU pool.
0519  * @vcpu_id:        vCPU id part of the provided core.
0520  *
0521  * Context: Process context. This function is called with the ne_enclave and
0522  *      ne_cpu_pool mutexes held.
0523  * Return:
0524  * * 0 on success.
0525  * * Negative return value on failure.
0526  */
0527 static int ne_set_enclave_threads_per_core(struct ne_enclave *ne_enclave,
0528                        int core_id, u32 vcpu_id)
0529 {
0530     unsigned int cpu = 0;
0531
0532     if (core_id < 0 && vcpu_id == 0) {
0533         dev_err_ratelimited(ne_misc_dev.this_device,
0534                     "No CPUs available in NE CPU pool\n");
0535
0536         return -NE_ERR_NO_CPUS_AVAIL_IN_POOL;
0537     }
0538
0539     if (core_id < 0) {
0540         dev_err_ratelimited(ne_misc_dev.this_device,
0541                     "CPU %d is not in NE CPU pool\n", vcpu_id);
0542
0543         return -NE_ERR_VCPU_NOT_IN_CPU_POOL;
0544     }
0545
0546     if (core_id >= ne_enclave->nr_parent_vm_cores) {
0547         dev_err_ratelimited(ne_misc_dev.this_device,
0548                     "Invalid core id %d - ne_enclave\n", core_id);
0549
0550         return -NE_ERR_VCPU_INVALID_CPU_CORE;
0551     }
0552
0553     for_each_cpu(cpu, ne_cpu_pool.avail_threads_per_core[core_id])
0554         cpumask_set_cpu(cpu, ne_enclave->threads_per_core[core_id]);
0555
0556     cpumask_clear(ne_cpu_pool.avail_threads_per_core[core_id]);
0557
0558     return 0;
0559 }
0560
0561 /**
0562  * ne_get_cpu_from_cpu_pool() - Get a CPU from the NE CPU pool, either from the
0563  *              remaining sibling(s) of a CPU core or the first
0564  *              sibling of a new CPU core.
0565  * @ne_enclave :    Private data associated with the current enclave.
0566  * @vcpu_id:        vCPU to get from the NE CPU pool.
0567  *
0568  * Context: Process context. This function is called with the ne_enclave mutex held.
0569  * Return:
0570  * * 0 on success.
0571  * * Negative return value on failure.
0572  */
0573 static int ne_get_cpu_from_cpu_pool(struct ne_enclave *ne_enclave, u32 *vcpu_id)
0574 {
0575     int core_id = -1;
0576     unsigned int cpu = 0;
0577     unsigned int i = 0;
0578     int rc = -EINVAL;
0579
0580     /*
0581      * If previously allocated a thread of a core to this enclave, first
0582      * check remaining sibling(s) for new CPU allocations, so that full
0583      * CPU cores are used for the enclave.
0584      */
0585     for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
0586         for_each_cpu(cpu, ne_enclave->threads_per_core[i])
0587             if (!ne_donated_cpu(ne_enclave, cpu)) {
0588                 *vcpu_id = cpu;
0589
0590                 return 0;
0591             }
0592
0593     mutex_lock(&ne_cpu_pool.mutex);
0594
0595     /*
0596      * If no remaining siblings, get a core from the NE CPU pool and keep
0597      * track of all the threads in the enclave threads per core data structure.
0598      */
0599     core_id = ne_get_unused_core_from_cpu_pool();
0600
0601     rc = ne_set_enclave_threads_per_core(ne_enclave, core_id, *vcpu_id);
0602     if (rc < 0)
0603         goto unlock_mutex;
0604
0605     *vcpu_id = cpumask_any(ne_enclave->threads_per_core[core_id]);
0606
0607     rc = 0;
0608
0609 unlock_mutex:
0610     mutex_unlock(&ne_cpu_pool.mutex);
0611
0612     return rc;
0613 }
0614
0615 /**
0616  * ne_get_vcpu_core_from_cpu_pool() - Get from the NE CPU pool the id of the
0617  *                    core associated with the provided vCPU.
0618  * @vcpu_id:    Provided vCPU id to get its associated core id.
0619  *
0620  * Context: Process context. This function is called with the ne_enclave and
0621  *      ne_cpu_pool mutexes held.
0622  * Return:
0623  * * Core id.
0624  * * -1 if the provided vCPU is not in the pool.
0625  */
0626 static int ne_get_vcpu_core_from_cpu_pool(u32 vcpu_id)
0627 {
0628     int core_id = -1;
0629     unsigned int i = 0;
0630
0631     for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
0632         if (cpumask_test_cpu(vcpu_id, ne_cpu_pool.avail_threads_per_core[i])) {
0633             core_id = i;
0634
0635             break;
0636     }
0637
0638     return core_id;
0639 }
0640
0641 /**
0642  * ne_check_cpu_in_cpu_pool() - Check if the given vCPU is in the available CPUs
0643  *              from the pool.
0644  * @ne_enclave :    Private data associated with the current enclave.
0645  * @vcpu_id:        ID of the vCPU to check if available in the NE CPU pool.
0646  *
0647  * Context: Process context. This function is called with the ne_enclave mutex held.
0648  * Return:
0649  * * 0 on success.
0650  * * Negative return value on failure.
0651  */
0652 static int ne_check_cpu_in_cpu_pool(struct ne_enclave *ne_enclave, u32 vcpu_id)
0653 {
0654     int core_id = -1;
0655     unsigned int i = 0;
0656     int rc = -EINVAL;
0657
0658     if (ne_donated_cpu(ne_enclave, vcpu_id)) {
0659         dev_err_ratelimited(ne_misc_dev.this_device,
0660                     "CPU %d already used\n", vcpu_id);
0661
0662         return -NE_ERR_VCPU_ALREADY_USED;
0663     }
0664
0665     /*
0666      * If previously allocated a thread of a core to this enclave, but not
0667      * the full core, first check remaining sibling(s).
0668      */
0669     for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
0670         if (cpumask_test_cpu(vcpu_id, ne_enclave->threads_per_core[i]))
0671             return 0;
0672
0673     mutex_lock(&ne_cpu_pool.mutex);
0674
0675     /*
0676      * If no remaining siblings, get from the NE CPU pool the core
0677      * associated with the vCPU and keep track of all the threads in the
0678      * enclave threads per core data structure.
0679      */
0680     core_id = ne_get_vcpu_core_from_cpu_pool(vcpu_id);
0681
0682     rc = ne_set_enclave_threads_per_core(ne_enclave, core_id, vcpu_id);
0683     if (rc < 0)
0684         goto unlock_mutex;
0685
0686     rc = 0;
0687
0688 unlock_mutex:
0689     mutex_unlock(&ne_cpu_pool.mutex);
0690
0691     return rc;
0692 }
0693
0694 /**
0695  * ne_add_vcpu_ioctl() - Add a vCPU to the slot associated with the current
0696  *           enclave.
0697  * @ne_enclave :    Private data associated with the current enclave.
0698  * @vcpu_id:        ID of the CPU to be associated with the given slot,
0699  *          apic id on x86.
0700  *
0701  * Context: Process context. This function is called with the ne_enclave mutex held.
0702  * Return:
0703  * * 0 on success.
0704  * * Negative return value on failure.
0705  */
0706 static int ne_add_vcpu_ioctl(struct ne_enclave *ne_enclave, u32 vcpu_id)
0707 {
0708     struct ne_pci_dev_cmd_reply cmd_reply = {};
0709     struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev;
0710     int rc = -EINVAL;
0711     struct slot_add_vcpu_req slot_add_vcpu_req = {};
0712
0713     if (ne_enclave->mm != current->mm)
0714         return -EIO;
0715
0716     slot_add_vcpu_req.slot_uid = ne_enclave->slot_uid;
0717     slot_add_vcpu_req.vcpu_id = vcpu_id;
0718
0719     rc = ne_do_request(pdev, SLOT_ADD_VCPU,
0720                &slot_add_vcpu_req, sizeof(slot_add_vcpu_req),
0721                &cmd_reply, sizeof(cmd_reply));
0722     if (rc < 0) {
0723         dev_err_ratelimited(ne_misc_dev.this_device,
0724                     "Error in slot add vCPU [rc=%d]\n", rc);
0725
0726         return rc;
0727     }
0728
0729     cpumask_set_cpu(vcpu_id, ne_enclave->vcpu_ids);
0730
0731     ne_enclave->nr_vcpus++;
0732
0733     return 0;
0734 }
0735
0736 /**
0737  * ne_sanity_check_user_mem_region() - Sanity check the user space memory
0738  *                     region received during the set user
0739  *                     memory region ioctl call.
0740  * @ne_enclave :    Private data associated with the current enclave.
0741  * @mem_region :    User space memory region to be sanity checked.
0742  *
0743  * Context: Process context. This function is called with the ne_enclave mutex held.
0744  * Return:
0745  * * 0 on success.
0746  * * Negative return value on failure.
0747  */
0748 static int ne_sanity_check_user_mem_region(struct ne_enclave *ne_enclave,
0749                        struct ne_user_memory_region mem_region)
0750 {
0751     struct ne_mem_region *ne_mem_region = NULL;
0752
0753     if (ne_enclave->mm != current->mm)
0754         return -EIO;
0755
0756     if (mem_region.memory_size & (NE_MIN_MEM_REGION_SIZE - 1)) {
0757         dev_err_ratelimited(ne_misc_dev.this_device,
0758                     "User space memory size is not multiple of 2 MiB\n");
0759
0760         return -NE_ERR_INVALID_MEM_REGION_SIZE;
0761     }
0762
0763     if (!IS_ALIGNED(mem_region.userspace_addr, NE_MIN_MEM_REGION_SIZE)) {
0764         dev_err_ratelimited(ne_misc_dev.this_device,
0765                     "User space address is not 2 MiB aligned\n");
0766
0767         return -NE_ERR_UNALIGNED_MEM_REGION_ADDR;
0768     }
0769
0770     if ((mem_region.userspace_addr & (NE_MIN_MEM_REGION_SIZE - 1)) ||
0771         !access_ok((void __user *)(unsigned long)mem_region.userspace_addr,
0772                mem_region.memory_size)) {
0773         dev_err_ratelimited(ne_misc_dev.this_device,
0774                     "Invalid user space address range\n");
0775
0776         return -NE_ERR_INVALID_MEM_REGION_ADDR;
0777     }
0778
0779     list_for_each_entry(ne_mem_region, &ne_enclave->mem_regions_list,
0780                 mem_region_list_entry) {
0781         u64 memory_size = ne_mem_region->memory_size;
0782         u64 userspace_addr = ne_mem_region->userspace_addr;
0783
0784         if ((userspace_addr <= mem_region.userspace_addr &&
0785              mem_region.userspace_addr < (userspace_addr + memory_size)) ||
0786             (mem_region.userspace_addr <= userspace_addr &&
0787             (mem_region.userspace_addr + mem_region.memory_size) > userspace_addr)) {
0788             dev_err_ratelimited(ne_misc_dev.this_device,
0789                         "User space memory region already used\n");
0790
0791             return -NE_ERR_MEM_REGION_ALREADY_USED;
0792         }
0793     }
0794
0795     return 0;
0796 }
0797
0798 /**
0799  * ne_sanity_check_user_mem_region_page() - Sanity check a page from the user space
0800  *                      memory region received during the set
0801  *                      user memory region ioctl call.
0802  * @ne_enclave :    Private data associated with the current enclave.
0803  * @mem_region_page:    Page from the user space memory region to be sanity checked.
0804  *
0805  * Context: Process context. This function is called with the ne_enclave mutex held.
0806  * Return:
0807  * * 0 on success.
0808  * * Negative return value on failure.
0809  */
0810 static int ne_sanity_check_user_mem_region_page(struct ne_enclave *ne_enclave,
0811                         struct page *mem_region_page)
0812 {
0813     if (!PageHuge(mem_region_page)) {
0814         dev_err_ratelimited(ne_misc_dev.this_device,
0815                     "Not a hugetlbfs page\n");
0816
0817         return -NE_ERR_MEM_NOT_HUGE_PAGE;
0818     }
0819
0820     if (page_size(mem_region_page) & (NE_MIN_MEM_REGION_SIZE - 1)) {
0821         dev_err_ratelimited(ne_misc_dev.this_device,
0822                     "Page size not multiple of 2 MiB\n");
0823
0824         return -NE_ERR_INVALID_PAGE_SIZE;
0825     }
0826
0827     if (ne_enclave->numa_node != page_to_nid(mem_region_page)) {
0828         dev_err_ratelimited(ne_misc_dev.this_device,
0829                     "Page is not from NUMA node %d\n",
0830                     ne_enclave->numa_node);
0831
0832         return -NE_ERR_MEM_DIFFERENT_NUMA_NODE;
0833     }
0834
0835     return 0;
0836 }
0837
0838 /**
0839  * ne_sanity_check_phys_mem_region() - Sanity check the start address and the size
0840  *                                     of a physical memory region.
0841  * @phys_mem_region_paddr : Physical start address of the region to be sanity checked.
0842  * @phys_mem_region_size  : Length of the region to be sanity checked.
0843  *
0844  * Context: Process context. This function is called with the ne_enclave mutex held.
0845  * Return:
0846  * * 0 on success.
0847  * * Negative return value on failure.
0848  */
0849 static int ne_sanity_check_phys_mem_region(u64 phys_mem_region_paddr,
0850                        u64 phys_mem_region_size)
0851 {
0852     if (phys_mem_region_size & (NE_MIN_MEM_REGION_SIZE - 1)) {
0853         dev_err_ratelimited(ne_misc_dev.this_device,
0854                     "Physical mem region size is not multiple of 2 MiB\n");
0855
0856         return -EINVAL;
0857     }
0858
0859     if (!IS_ALIGNED(phys_mem_region_paddr, NE_MIN_MEM_REGION_SIZE)) {
0860         dev_err_ratelimited(ne_misc_dev.this_device,
0861                     "Physical mem region address is not 2 MiB aligned\n");
0862
0863         return -EINVAL;
0864     }
0865
0866     return 0;
0867 }
0868
0869 /**
0870  * ne_merge_phys_contig_memory_regions() - Add a memory region and merge the adjacent
0871  *                                         regions if they are physically contiguous.
0872  * @phys_contig_regions : Private data associated with the contiguous physical memory regions.
0873  * @page_paddr :          Physical start address of the region to be added.
0874  * @page_size :           Length of the region to be added.
0875  *
0876  * Context: Process context. This function is called with the ne_enclave mutex held.
0877  * Return:
0878  * * 0 on success.
0879  * * Negative return value on failure.
0880  */
0881 static int
0882 ne_merge_phys_contig_memory_regions(struct ne_phys_contig_mem_regions *phys_contig_regions,
0883                     u64 page_paddr, u64 page_size)
0884 {
0885     unsigned long num = phys_contig_regions->num;
0886     int rc = 0;
0887
0888     rc = ne_sanity_check_phys_mem_region(page_paddr, page_size);
0889     if (rc < 0)
0890         return rc;
0891
0892     /* Physically contiguous, just merge */
0893     if (num && (phys_contig_regions->regions[num - 1].end + 1) == page_paddr) {
0894         phys_contig_regions->regions[num - 1].end += page_size;
0895     } else {
0896         phys_contig_regions->regions[num].start = page_paddr;
0897         phys_contig_regions->regions[num].end = page_paddr + page_size - 1;
0898         phys_contig_regions->num++;
0899     }
0900
0901     return 0;
0902 }
0903
0904 /**
0905  * ne_set_user_memory_region_ioctl() - Add user space memory region to the slot
0906  *                     associated with the current enclave.
0907  * @ne_enclave :    Private data associated with the current enclave.
0908  * @mem_region :    User space memory region to be associated with the given slot.
0909  *
0910  * Context: Process context. This function is called with the ne_enclave mutex held.
0911  * Return:
0912  * * 0 on success.
0913  * * Negative return value on failure.
0914  */
0915 static int ne_set_user_memory_region_ioctl(struct ne_enclave *ne_enclave,
0916                        struct ne_user_memory_region mem_region)
0917 {
0918     long gup_rc = 0;
0919     unsigned long i = 0;
0920     unsigned long max_nr_pages = 0;
0921     unsigned long memory_size = 0;
0922     struct ne_mem_region *ne_mem_region = NULL;
0923     struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev;
0924     struct ne_phys_contig_mem_regions phys_contig_mem_regions = {};
0925     int rc = -EINVAL;
0926
0927     rc = ne_sanity_check_user_mem_region(ne_enclave, mem_region);
0928     if (rc < 0)
0929         return rc;
0930
0931     ne_mem_region = kzalloc(sizeof(*ne_mem_region), GFP_KERNEL);
0932     if (!ne_mem_region)
0933         return -ENOMEM;
0934
0935     max_nr_pages = mem_region.memory_size / NE_MIN_MEM_REGION_SIZE;
0936
0937     ne_mem_region->pages = kcalloc(max_nr_pages, sizeof(*ne_mem_region->pages),
0938                        GFP_KERNEL);
0939     if (!ne_mem_region->pages) {
0940         rc = -ENOMEM;
0941
0942         goto free_mem_region;
0943     }
0944
0945     phys_contig_mem_regions.regions = kcalloc(max_nr_pages,
0946                           sizeof(*phys_contig_mem_regions.regions),
0947                           GFP_KERNEL);
0948     if (!phys_contig_mem_regions.regions) {
0949         rc = -ENOMEM;
0950
0951         goto free_mem_region;
0952     }
0953
0954     do {
0955         i = ne_mem_region->nr_pages;
0956
0957         if (i == max_nr_pages) {
0958             dev_err_ratelimited(ne_misc_dev.this_device,
0959                         "Reached max nr of pages in the pages data struct\n");
0960
0961             rc = -ENOMEM;
0962
0963             goto put_pages;
0964         }
0965
0966         gup_rc = get_user_pages_unlocked(mem_region.userspace_addr + memory_size, 1,
0967                          ne_mem_region->pages + i, FOLL_GET);
0968
0969         if (gup_rc < 0) {
0970             rc = gup_rc;
0971
0972             dev_err_ratelimited(ne_misc_dev.this_device,
0973                         "Error in get user pages [rc=%d]\n", rc);
0974
0975             goto put_pages;
0976         }
0977
0978         rc = ne_sanity_check_user_mem_region_page(ne_enclave, ne_mem_region->pages[i]);
0979         if (rc < 0)
0980             goto put_pages;
0981
0982         rc = ne_merge_phys_contig_memory_regions(&phys_contig_mem_regions,
0983                              page_to_phys(ne_mem_region->pages[i]),
0984                              page_size(ne_mem_region->pages[i]));
0985         if (rc < 0)
0986             goto put_pages;
0987
0988         memory_size += page_size(ne_mem_region->pages[i]);
0989
0990         ne_mem_region->nr_pages++;
0991     } while (memory_size < mem_region.memory_size);
0992
0993     if ((ne_enclave->nr_mem_regions + phys_contig_mem_regions.num) >
0994         ne_enclave->max_mem_regions) {
0995         dev_err_ratelimited(ne_misc_dev.this_device,
0996                     "Reached max memory regions %lld\n",
0997                     ne_enclave->max_mem_regions);
0998
0999         rc = -NE_ERR_MEM_MAX_REGIONS;
1000
1001         goto put_pages;
1002     }
1003
1004     for (i = 0; i < phys_contig_mem_regions.num; i++) {
1005         u64 phys_region_addr = phys_contig_mem_regions.regions[i].start;
1006         u64 phys_region_size = range_len(&phys_contig_mem_regions.regions[i]);
1007
1008         rc = ne_sanity_check_phys_mem_region(phys_region_addr, phys_region_size);
1009         if (rc < 0)
1010             goto put_pages;
1011     }
1012
1013     ne_mem_region->memory_size = mem_region.memory_size;
1014     ne_mem_region->userspace_addr = mem_region.userspace_addr;
1015
1016     list_add(&ne_mem_region->mem_region_list_entry, &ne_enclave->mem_regions_list);
1017
1018     for (i = 0; i < phys_contig_mem_regions.num; i++) {
1019         struct ne_pci_dev_cmd_reply cmd_reply = {};
1020         struct slot_add_mem_req slot_add_mem_req = {};
1021
1022         slot_add_mem_req.slot_uid = ne_enclave->slot_uid;
1023         slot_add_mem_req.paddr = phys_contig_mem_regions.regions[i].start;
1024         slot_add_mem_req.size = range_len(&phys_contig_mem_regions.regions[i]);
1025
1026         rc = ne_do_request(pdev, SLOT_ADD_MEM,
1027                    &slot_add_mem_req, sizeof(slot_add_mem_req),
1028                    &cmd_reply, sizeof(cmd_reply));
1029         if (rc < 0) {
1030             dev_err_ratelimited(ne_misc_dev.this_device,
1031                         "Error in slot add mem [rc=%d]\n", rc);
1032
1033             kfree(phys_contig_mem_regions.regions);
1034
1035             /*
1036              * Exit here without put pages as memory regions may
1037              * already been added.
1038              */
1039             return rc;
1040         }
1041
1042         ne_enclave->mem_size += slot_add_mem_req.size;
1043         ne_enclave->nr_mem_regions++;
1044     }
1045
1046     kfree(phys_contig_mem_regions.regions);
1047
1048     return 0;
1049
1050 put_pages:
1051     for (i = 0; i < ne_mem_region->nr_pages; i++)
1052         put_page(ne_mem_region->pages[i]);
1053 free_mem_region:
1054     kfree(phys_contig_mem_regions.regions);
1055     kfree(ne_mem_region->pages);
1056     kfree(ne_mem_region);
1057
1058     return rc;
1059 }
1060
1061 /**
1062  * ne_start_enclave_ioctl() - Trigger enclave start after the enclave resources,
1063  *                such as memory and CPU, have been set.
1064  * @ne_enclave :        Private data associated with the current enclave.
1065  * @enclave_start_info :    Enclave info that includes enclave cid and flags.
1066  *
1067  * Context: Process context. This function is called with the ne_enclave mutex held.
1068  * Return:
1069  * * 0 on success.
1070  * * Negative return value on failure.
1071  */
1072 static int ne_start_enclave_ioctl(struct ne_enclave *ne_enclave,
1073                   struct ne_enclave_start_info *enclave_start_info)
1074 {
1075     struct ne_pci_dev_cmd_reply cmd_reply = {};
1076     unsigned int cpu = 0;
1077     struct enclave_start_req enclave_start_req = {};
1078     unsigned int i = 0;
1079     struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev;
1080     int rc = -EINVAL;
1081
1082     if (!ne_enclave->nr_mem_regions) {
1083         dev_err_ratelimited(ne_misc_dev.this_device,
1084                     "Enclave has no mem regions\n");
1085
1086         return -NE_ERR_NO_MEM_REGIONS_ADDED;
1087     }
1088
1089     if (ne_enclave->mem_size < NE_MIN_ENCLAVE_MEM_SIZE) {
1090         dev_err_ratelimited(ne_misc_dev.this_device,
1091                     "Enclave memory is less than %ld\n",
1092                     NE_MIN_ENCLAVE_MEM_SIZE);
1093
1094         return -NE_ERR_ENCLAVE_MEM_MIN_SIZE;
1095     }
1096
1097     if (!ne_enclave->nr_vcpus) {
1098         dev_err_ratelimited(ne_misc_dev.this_device,
1099                     "Enclave has no vCPUs\n");
1100
1101         return -NE_ERR_NO_VCPUS_ADDED;
1102     }
1103
1104     for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
1105         for_each_cpu(cpu, ne_enclave->threads_per_core[i])
1106             if (!cpumask_test_cpu(cpu, ne_enclave->vcpu_ids)) {
1107                 dev_err_ratelimited(ne_misc_dev.this_device,
1108                             "Full CPU cores not used\n");
1109
1110                 return -NE_ERR_FULL_CORES_NOT_USED;
1111             }
1112
1113     enclave_start_req.enclave_cid = enclave_start_info->enclave_cid;
1114     enclave_start_req.flags = enclave_start_info->flags;
1115     enclave_start_req.slot_uid = ne_enclave->slot_uid;
1116
1117     rc = ne_do_request(pdev, ENCLAVE_START,
1118                &enclave_start_req, sizeof(enclave_start_req),
1119                &cmd_reply, sizeof(cmd_reply));
1120     if (rc < 0) {
1121         dev_err_ratelimited(ne_misc_dev.this_device,
1122                     "Error in enclave start [rc=%d]\n", rc);
1123
1124         return rc;
1125     }
1126
1127     ne_enclave->state = NE_STATE_RUNNING;
1128
1129     enclave_start_info->enclave_cid = cmd_reply.enclave_cid;
1130
1131     return 0;
1132 }
1133
1134 /**
1135  * ne_enclave_ioctl() - Ioctl function provided by the enclave file.
1136  * @file:   File associated with this ioctl function.
1137  * @cmd:    The command that is set for the ioctl call.
1138  * @arg:    The argument that is provided for the ioctl call.
1139  *
1140  * Context: Process context.
1141  * Return:
1142  * * 0 on success.
1143  * * Negative return value on failure.
1144  */
1145 static long ne_enclave_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1146 {
1147     struct ne_enclave *ne_enclave = file->private_data;
1148
1149     switch (cmd) {
1150     case NE_ADD_VCPU: {
1151         int rc = -EINVAL;
1152         u32 vcpu_id = 0;
1153
1154         if (copy_from_user(&vcpu_id, (void __user *)arg, sizeof(vcpu_id)))
1155             return -EFAULT;
1156
1157         mutex_lock(&ne_enclave->enclave_info_mutex);
1158
1159         if (ne_enclave->state != NE_STATE_INIT) {
1160             dev_err_ratelimited(ne_misc_dev.this_device,
1161                         "Enclave is not in init state\n");
1162
1163             mutex_unlock(&ne_enclave->enclave_info_mutex);
1164
1165             return -NE_ERR_NOT_IN_INIT_STATE;
1166         }
1167
1168         if (vcpu_id >= (ne_enclave->nr_parent_vm_cores *
1169             ne_enclave->nr_threads_per_core)) {
1170             dev_err_ratelimited(ne_misc_dev.this_device,
1171                         "vCPU id higher than max CPU id\n");
1172
1173             mutex_unlock(&ne_enclave->enclave_info_mutex);
1174
1175             return -NE_ERR_INVALID_VCPU;
1176         }
1177
1178         if (!vcpu_id) {
1179             /* Use the CPU pool for choosing a CPU for the enclave. */
1180             rc = ne_get_cpu_from_cpu_pool(ne_enclave, &vcpu_id);
1181             if (rc < 0) {
1182                 dev_err_ratelimited(ne_misc_dev.this_device,
1183                             "Error in get CPU from pool [rc=%d]\n",
1184                             rc);
1185
1186                 mutex_unlock(&ne_enclave->enclave_info_mutex);
1187
1188                 return rc;
1189             }
1190         } else {
1191             /* Check if the provided vCPU is available in the NE CPU pool. */
1192             rc = ne_check_cpu_in_cpu_pool(ne_enclave, vcpu_id);
1193             if (rc < 0) {
1194                 dev_err_ratelimited(ne_misc_dev.this_device,
1195                             "Error in check CPU %d in pool [rc=%d]\n",
1196                             vcpu_id, rc);
1197
1198                 mutex_unlock(&ne_enclave->enclave_info_mutex);
1199
1200                 return rc;
1201             }
1202         }
1203
1204         rc = ne_add_vcpu_ioctl(ne_enclave, vcpu_id);
1205         if (rc < 0) {
1206             mutex_unlock(&ne_enclave->enclave_info_mutex);
1207
1208             return rc;
1209         }
1210
1211         mutex_unlock(&ne_enclave->enclave_info_mutex);
1212
1213         if (copy_to_user((void __user *)arg, &vcpu_id, sizeof(vcpu_id)))
1214             return -EFAULT;
1215
1216         return 0;
1217     }
1218
1219     case NE_GET_IMAGE_LOAD_INFO: {
1220         struct ne_image_load_info image_load_info = {};
1221
1222         if (copy_from_user(&image_load_info, (void __user *)arg, sizeof(image_load_info)))
1223             return -EFAULT;
1224
1225         mutex_lock(&ne_enclave->enclave_info_mutex);
1226
1227         if (ne_enclave->state != NE_STATE_INIT) {
1228             dev_err_ratelimited(ne_misc_dev.this_device,
1229                         "Enclave is not in init state\n");
1230
1231             mutex_unlock(&ne_enclave->enclave_info_mutex);
1232
1233             return -NE_ERR_NOT_IN_INIT_STATE;
1234         }
1235
1236         mutex_unlock(&ne_enclave->enclave_info_mutex);
1237
1238         if (!image_load_info.flags ||
1239             image_load_info.flags >= NE_IMAGE_LOAD_MAX_FLAG_VAL) {
1240             dev_err_ratelimited(ne_misc_dev.this_device,
1241                         "Incorrect flag in enclave image load info\n");
1242
1243             return -NE_ERR_INVALID_FLAG_VALUE;
1244         }
1245
1246         if (image_load_info.flags == NE_EIF_IMAGE)
1247             image_load_info.memory_offset = NE_EIF_LOAD_OFFSET;
1248
1249         if (copy_to_user((void __user *)arg, &image_load_info, sizeof(image_load_info)))
1250             return -EFAULT;
1251
1252         return 0;
1253     }
1254
1255     case NE_SET_USER_MEMORY_REGION: {
1256         struct ne_user_memory_region mem_region = {};
1257         int rc = -EINVAL;
1258
1259         if (copy_from_user(&mem_region, (void __user *)arg, sizeof(mem_region)))
1260             return -EFAULT;
1261
1262         if (mem_region.flags >= NE_MEMORY_REGION_MAX_FLAG_VAL) {
1263             dev_err_ratelimited(ne_misc_dev.this_device,
1264                         "Incorrect flag for user memory region\n");
1265
1266             return -NE_ERR_INVALID_FLAG_VALUE;
1267         }
1268
1269         mutex_lock(&ne_enclave->enclave_info_mutex);
1270
1271         if (ne_enclave->state != NE_STATE_INIT) {
1272             dev_err_ratelimited(ne_misc_dev.this_device,
1273                         "Enclave is not in init state\n");
1274
1275             mutex_unlock(&ne_enclave->enclave_info_mutex);
1276
1277             return -NE_ERR_NOT_IN_INIT_STATE;
1278         }
1279
1280         rc = ne_set_user_memory_region_ioctl(ne_enclave, mem_region);
1281         if (rc < 0) {
1282             mutex_unlock(&ne_enclave->enclave_info_mutex);
1283
1284             return rc;
1285         }
1286
1287         mutex_unlock(&ne_enclave->enclave_info_mutex);
1288
1289         return 0;
1290     }
1291
1292     case NE_START_ENCLAVE: {
1293         struct ne_enclave_start_info enclave_start_info = {};
1294         int rc = -EINVAL;
1295
1296         if (copy_from_user(&enclave_start_info, (void __user *)arg,
1297                    sizeof(enclave_start_info)))
1298             return -EFAULT;
1299
1300         if (enclave_start_info.flags >= NE_ENCLAVE_START_MAX_FLAG_VAL) {
1301             dev_err_ratelimited(ne_misc_dev.this_device,
1302                         "Incorrect flag in enclave start info\n");
1303
1304             return -NE_ERR_INVALID_FLAG_VALUE;
1305         }
1306
1307         /*
1308          * Do not use well-known CIDs - 0, 1, 2 - for enclaves.
1309          * VMADDR_CID_ANY = -1U
1310          * VMADDR_CID_HYPERVISOR = 0
1311          * VMADDR_CID_LOCAL = 1
1312          * VMADDR_CID_HOST = 2
1313          * Note: 0 is used as a placeholder to auto-generate an enclave CID.
1314          * http://man7.org/linux/man-pages/man7/vsock.7.html
1315          */
1316         if (enclave_start_info.enclave_cid > 0 &&
1317             enclave_start_info.enclave_cid <= VMADDR_CID_HOST) {
1318             dev_err_ratelimited(ne_misc_dev.this_device,
1319                         "Well-known CID value, not to be used for enclaves\n");
1320
1321             return -NE_ERR_INVALID_ENCLAVE_CID;
1322         }
1323
1324         if (enclave_start_info.enclave_cid == U32_MAX) {
1325             dev_err_ratelimited(ne_misc_dev.this_device,
1326                         "Well-known CID value, not to be used for enclaves\n");
1327
1328             return -NE_ERR_INVALID_ENCLAVE_CID;
1329         }
1330
1331         /*
1332          * Do not use the CID of the primary / parent VM for enclaves.
1333          */
1334         if (enclave_start_info.enclave_cid == NE_PARENT_VM_CID) {
1335             dev_err_ratelimited(ne_misc_dev.this_device,
1336                         "CID of the parent VM, not to be used for enclaves\n");
1337
1338             return -NE_ERR_INVALID_ENCLAVE_CID;
1339         }
1340
1341         /* 64-bit CIDs are not yet supported for the vsock device. */
1342         if (enclave_start_info.enclave_cid > U32_MAX) {
1343             dev_err_ratelimited(ne_misc_dev.this_device,
1344                         "64-bit CIDs not yet supported for the vsock device\n");
1345
1346             return -NE_ERR_INVALID_ENCLAVE_CID;
1347         }
1348
1349         mutex_lock(&ne_enclave->enclave_info_mutex);
1350
1351         if (ne_enclave->state != NE_STATE_INIT) {
1352             dev_err_ratelimited(ne_misc_dev.this_device,
1353                         "Enclave is not in init state\n");
1354
1355             mutex_unlock(&ne_enclave->enclave_info_mutex);
1356
1357             return -NE_ERR_NOT_IN_INIT_STATE;
1358         }
1359
1360         rc = ne_start_enclave_ioctl(ne_enclave, &enclave_start_info);
1361         if (rc < 0) {
1362             mutex_unlock(&ne_enclave->enclave_info_mutex);
1363
1364             return rc;
1365         }
1366
1367         mutex_unlock(&ne_enclave->enclave_info_mutex);
1368
1369         if (copy_to_user((void __user *)arg, &enclave_start_info,
1370                  sizeof(enclave_start_info)))
1371             return -EFAULT;
1372
1373         return 0;
1374     }
1375
1376     default:
1377         return -ENOTTY;
1378     }
1379
1380     return 0;
1381 }
1382
1383 /**
1384  * ne_enclave_remove_all_mem_region_entries() - Remove all memory region entries
1385  *                      from the enclave data structure.
1386  * @ne_enclave :    Private data associated with the current enclave.
1387  *
1388  * Context: Process context. This function is called with the ne_enclave mutex held.
1389  */
1390 static void ne_enclave_remove_all_mem_region_entries(struct ne_enclave *ne_enclave)
1391 {
1392     unsigned long i = 0;
1393     struct ne_mem_region *ne_mem_region = NULL;
1394     struct ne_mem_region *ne_mem_region_tmp = NULL;
1395
1396     list_for_each_entry_safe(ne_mem_region, ne_mem_region_tmp,
1397                  &ne_enclave->mem_regions_list,
1398                  mem_region_list_entry) {
1399         list_del(&ne_mem_region->mem_region_list_entry);
1400
1401         for (i = 0; i < ne_mem_region->nr_pages; i++)
1402             put_page(ne_mem_region->pages[i]);
1403
1404         kfree(ne_mem_region->pages);
1405
1406         kfree(ne_mem_region);
1407     }
1408 }
1409
1410 /**
1411  * ne_enclave_remove_all_vcpu_id_entries() - Remove all vCPU id entries from
1412  *                       the enclave data structure.
1413  * @ne_enclave :    Private data associated with the current enclave.
1414  *
1415  * Context: Process context. This function is called with the ne_enclave mutex held.
1416  */
1417 static void ne_enclave_remove_all_vcpu_id_entries(struct ne_enclave *ne_enclave)
1418 {
1419     unsigned int cpu = 0;
1420     unsigned int i = 0;
1421
1422     mutex_lock(&ne_cpu_pool.mutex);
1423
1424     for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++) {
1425         for_each_cpu(cpu, ne_enclave->threads_per_core[i])
1426             /* Update the available NE CPU pool. */
1427             cpumask_set_cpu(cpu, ne_cpu_pool.avail_threads_per_core[i]);
1428
1429         free_cpumask_var(ne_enclave->threads_per_core[i]);
1430     }
1431
1432     mutex_unlock(&ne_cpu_pool.mutex);
1433
1434     kfree(ne_enclave->threads_per_core);
1435
1436     free_cpumask_var(ne_enclave->vcpu_ids);
1437 }
1438
1439 /**
1440  * ne_pci_dev_remove_enclave_entry() - Remove the enclave entry from the data
1441  *                     structure that is part of the NE PCI
1442  *                     device private data.
1443  * @ne_enclave :    Private data associated with the current enclave.
1444  * @ne_pci_dev :    Private data associated with the PCI device.
1445  *
1446  * Context: Process context. This function is called with the ne_pci_dev enclave
1447  *      mutex held.
1448  */
1449 static void ne_pci_dev_remove_enclave_entry(struct ne_enclave *ne_enclave,
1450                         struct ne_pci_dev *ne_pci_dev)
1451 {
1452     struct ne_enclave *ne_enclave_entry = NULL;
1453     struct ne_enclave *ne_enclave_entry_tmp = NULL;
1454
1455     list_for_each_entry_safe(ne_enclave_entry, ne_enclave_entry_tmp,
1456                  &ne_pci_dev->enclaves_list, enclave_list_entry) {
1457         if (ne_enclave_entry->slot_uid == ne_enclave->slot_uid) {
1458             list_del(&ne_enclave_entry->enclave_list_entry);
1459
1460             break;
1461         }
1462     }
1463 }
1464
1465 /**
1466  * ne_enclave_release() - Release function provided by the enclave file.
1467  * @inode:  Inode associated with this file release function.
1468  * @file:   File associated with this release function.
1469  *
1470  * Context: Process context.
1471  * Return:
1472  * * 0 on success.
1473  * * Negative return value on failure.
1474  */
1475 static int ne_enclave_release(struct inode *inode, struct file *file)
1476 {
1477     struct ne_pci_dev_cmd_reply cmd_reply = {};
1478     struct enclave_stop_req enclave_stop_request = {};
1479     struct ne_enclave *ne_enclave = file->private_data;
1480     struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev;
1481     struct pci_dev *pdev = ne_pci_dev->pdev;
1482     int rc = -EINVAL;
1483     struct slot_free_req slot_free_req = {};
1484
1485     if (!ne_enclave)
1486         return 0;
1487
1488     /*
1489      * Early exit in case there is an error in the enclave creation logic
1490      * and fput() is called on the cleanup path.
1491      */
1492     if (!ne_enclave->slot_uid)
1493         return 0;
1494
1495     /*
1496      * Acquire the enclave list mutex before the enclave mutex
1497      * in order to avoid deadlocks with @ref ne_event_work_handler.
1498      */
1499     mutex_lock(&ne_pci_dev->enclaves_list_mutex);
1500     mutex_lock(&ne_enclave->enclave_info_mutex);
1501
1502     if (ne_enclave->state != NE_STATE_INIT && ne_enclave->state != NE_STATE_STOPPED) {
1503         enclave_stop_request.slot_uid = ne_enclave->slot_uid;
1504
1505         rc = ne_do_request(pdev, ENCLAVE_STOP,
1506                    &enclave_stop_request, sizeof(enclave_stop_request),
1507                    &cmd_reply, sizeof(cmd_reply));
1508         if (rc < 0) {
1509             dev_err_ratelimited(ne_misc_dev.this_device,
1510                         "Error in enclave stop [rc=%d]\n", rc);
1511
1512             goto unlock_mutex;
1513         }
1514
1515         memset(&cmd_reply, 0, sizeof(cmd_reply));
1516     }
1517
1518     slot_free_req.slot_uid = ne_enclave->slot_uid;
1519
1520     rc = ne_do_request(pdev, SLOT_FREE,
1521                &slot_free_req, sizeof(slot_free_req),
1522                &cmd_reply, sizeof(cmd_reply));
1523     if (rc < 0) {
1524         dev_err_ratelimited(ne_misc_dev.this_device,
1525                     "Error in slot free [rc=%d]\n", rc);
1526
1527         goto unlock_mutex;
1528     }
1529
1530     ne_pci_dev_remove_enclave_entry(ne_enclave, ne_pci_dev);
1531     ne_enclave_remove_all_mem_region_entries(ne_enclave);
1532     ne_enclave_remove_all_vcpu_id_entries(ne_enclave);
1533
1534     mutex_unlock(&ne_enclave->enclave_info_mutex);
1535     mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
1536
1537     kfree(ne_enclave);
1538
1539     return 0;
1540
1541 unlock_mutex:
1542     mutex_unlock(&ne_enclave->enclave_info_mutex);
1543     mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
1544
1545     return rc;
1546 }
1547
1548 /**
1549  * ne_enclave_poll() - Poll functionality used for enclave out-of-band events.
1550  * @file:   File associated with this poll function.
1551  * @wait:   Poll table data structure.
1552  *
1553  * Context: Process context.
1554  * Return:
1555  * * Poll mask.
1556  */
1557 static __poll_t ne_enclave_poll(struct file *file, poll_table *wait)
1558 {
1559     __poll_t mask = 0;
1560     struct ne_enclave *ne_enclave = file->private_data;
1561
1562     poll_wait(file, &ne_enclave->eventq, wait);
1563
1564     if (ne_enclave->has_event)
1565         mask |= EPOLLHUP;
1566
1567     return mask;
1568 }
1569
1570 static const struct file_operations ne_enclave_fops = {
1571     .owner      = THIS_MODULE,
1572     .llseek     = noop_llseek,
1573     .poll       = ne_enclave_poll,
1574     .unlocked_ioctl = ne_enclave_ioctl,
1575     .release    = ne_enclave_release,
1576 };
1577
1578 /**
1579  * ne_create_vm_ioctl() - Alloc slot to be associated with an enclave. Create
1580  *            enclave file descriptor to be further used for enclave
1581  *            resources handling e.g. memory regions and CPUs.
1582  * @ne_pci_dev :    Private data associated with the PCI device.
1583  * @slot_uid:       User pointer to store the generated unique slot id
1584  *          associated with an enclave to.
1585  *
1586  * Context: Process context. This function is called with the ne_pci_dev enclave
1587  *      mutex held.
1588  * Return:
1589  * * Enclave fd on success.
1590  * * Negative return value on failure.
1591  */
1592 static int ne_create_vm_ioctl(struct ne_pci_dev *ne_pci_dev, u64 __user *slot_uid)
1593 {
1594     struct ne_pci_dev_cmd_reply cmd_reply = {};
1595     int enclave_fd = -1;
1596     struct file *enclave_file = NULL;
1597     unsigned int i = 0;
1598     struct ne_enclave *ne_enclave = NULL;
1599     struct pci_dev *pdev = ne_pci_dev->pdev;
1600     int rc = -EINVAL;
1601     struct slot_alloc_req slot_alloc_req = {};
1602
1603     mutex_lock(&ne_cpu_pool.mutex);
1604
1605     for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
1606         if (!cpumask_empty(ne_cpu_pool.avail_threads_per_core[i]))
1607             break;
1608
1609     if (i == ne_cpu_pool.nr_parent_vm_cores) {
1610         dev_err_ratelimited(ne_misc_dev.this_device,
1611                     "No CPUs available in CPU pool\n");
1612
1613         mutex_unlock(&ne_cpu_pool.mutex);
1614
1615         return -NE_ERR_NO_CPUS_AVAIL_IN_POOL;
1616     }
1617
1618     mutex_unlock(&ne_cpu_pool.mutex);
1619
1620     ne_enclave = kzalloc(sizeof(*ne_enclave), GFP_KERNEL);
1621     if (!ne_enclave)
1622         return -ENOMEM;
1623
1624     mutex_lock(&ne_cpu_pool.mutex);
1625
1626     ne_enclave->nr_parent_vm_cores = ne_cpu_pool.nr_parent_vm_cores;
1627     ne_enclave->nr_threads_per_core = ne_cpu_pool.nr_threads_per_core;
1628     ne_enclave->numa_node = ne_cpu_pool.numa_node;
1629
1630     mutex_unlock(&ne_cpu_pool.mutex);
1631
1632     ne_enclave->threads_per_core = kcalloc(ne_enclave->nr_parent_vm_cores,
1633                            sizeof(*ne_enclave->threads_per_core),
1634                            GFP_KERNEL);
1635     if (!ne_enclave->threads_per_core) {
1636         rc = -ENOMEM;
1637
1638         goto free_ne_enclave;
1639     }
1640
1641     for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
1642         if (!zalloc_cpumask_var(&ne_enclave->threads_per_core[i], GFP_KERNEL)) {
1643             rc = -ENOMEM;
1644
1645             goto free_cpumask;
1646         }
1647
1648     if (!zalloc_cpumask_var(&ne_enclave->vcpu_ids, GFP_KERNEL)) {
1649         rc = -ENOMEM;
1650
1651         goto free_cpumask;
1652     }
1653
1654     enclave_fd = get_unused_fd_flags(O_CLOEXEC);
1655     if (enclave_fd < 0) {
1656         rc = enclave_fd;
1657
1658         dev_err_ratelimited(ne_misc_dev.this_device,
1659                     "Error in getting unused fd [rc=%d]\n", rc);
1660
1661         goto free_cpumask;
1662     }
1663
1664     enclave_file = anon_inode_getfile("ne-vm", &ne_enclave_fops, ne_enclave, O_RDWR);
1665     if (IS_ERR(enclave_file)) {
1666         rc = PTR_ERR(enclave_file);
1667
1668         dev_err_ratelimited(ne_misc_dev.this_device,
1669                     "Error in anon inode get file [rc=%d]\n", rc);
1670
1671         goto put_fd;
1672     }
1673
1674     rc = ne_do_request(pdev, SLOT_ALLOC,
1675                &slot_alloc_req, sizeof(slot_alloc_req),
1676                &cmd_reply, sizeof(cmd_reply));
1677     if (rc < 0) {
1678         dev_err_ratelimited(ne_misc_dev.this_device,
1679                     "Error in slot alloc [rc=%d]\n", rc);
1680
1681         goto put_file;
1682     }
1683
1684     init_waitqueue_head(&ne_enclave->eventq);
1685     ne_enclave->has_event = false;
1686     mutex_init(&ne_enclave->enclave_info_mutex);
1687     ne_enclave->max_mem_regions = cmd_reply.mem_regions;
1688     INIT_LIST_HEAD(&ne_enclave->mem_regions_list);
1689     ne_enclave->mm = current->mm;
1690     ne_enclave->slot_uid = cmd_reply.slot_uid;
1691     ne_enclave->state = NE_STATE_INIT;
1692
1693     list_add(&ne_enclave->enclave_list_entry, &ne_pci_dev->enclaves_list);
1694
1695     if (copy_to_user(slot_uid, &ne_enclave->slot_uid, sizeof(ne_enclave->slot_uid))) {
1696         /*
1697          * As we're holding the only reference to 'enclave_file', fput()
1698          * will call ne_enclave_release() which will do a proper cleanup
1699          * of all so far allocated resources, leaving only the unused fd
1700          * for us to free.
1701          */
1702         fput(enclave_file);
1703         put_unused_fd(enclave_fd);
1704
1705         return -EFAULT;
1706     }
1707
1708     fd_install(enclave_fd, enclave_file);
1709
1710     return enclave_fd;
1711
1712 put_file:
1713     fput(enclave_file);
1714 put_fd:
1715     put_unused_fd(enclave_fd);
1716 free_cpumask:
1717     free_cpumask_var(ne_enclave->vcpu_ids);
1718     for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
1719         free_cpumask_var(ne_enclave->threads_per_core[i]);
1720     kfree(ne_enclave->threads_per_core);
1721 free_ne_enclave:
1722     kfree(ne_enclave);
1723
1724     return rc;
1725 }
1726
1727 /**
1728  * ne_ioctl() - Ioctl function provided by the NE misc device.
1729  * @file:   File associated with this ioctl function.
1730  * @cmd:    The command that is set for the ioctl call.
1731  * @arg:    The argument that is provided for the ioctl call.
1732  *
1733  * Context: Process context.
1734  * Return:
1735  * * Ioctl result (e.g. enclave file descriptor) on success.
1736  * * Negative return value on failure.
1737  */
1738 static long ne_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1739 {
1740     switch (cmd) {
1741     case NE_CREATE_VM: {
1742         int enclave_fd = -1;
1743         struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev;
1744         u64 __user *slot_uid = (void __user *)arg;
1745
1746         mutex_lock(&ne_pci_dev->enclaves_list_mutex);
1747         enclave_fd = ne_create_vm_ioctl(ne_pci_dev, slot_uid);
1748         mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
1749
1750         return enclave_fd;
1751     }
1752
1753     default:
1754         return -ENOTTY;
1755     }
1756
1757     return 0;
1758 }
1759
1760 #if defined(CONFIG_NITRO_ENCLAVES_MISC_DEV_TEST)
1761 #include "ne_misc_dev_test.c"
1762 #endif
1763
1764 static int __init ne_init(void)
1765 {
1766     mutex_init(&ne_cpu_pool.mutex);
1767
1768     return pci_register_driver(&ne_pci_driver);
1769 }
1770
1771 static void __exit ne_exit(void)
1772 {
1773     pci_unregister_driver(&ne_pci_driver);
1774
1775     ne_teardown_cpu_pool();
1776 }
1777
1778 module_init(ne_init);
1779 module_exit(ne_exit);
1780
1781 MODULE_AUTHOR("Amazon.com, Inc. or its affiliates");
1782 MODULE_DESCRIPTION("Nitro Enclaves Driver");
1783 MODULE_LICENSE("GPL v2");