0001 // SPDX-License-Identifier: GPL-2.0
0002 /*  Copyright(c) 2016-20 Intel Corporation. */
0003 
0004 #include <linux/file.h>
0005 #include <linux/freezer.h>
0006 #include <linux/highmem.h>
0007 #include <linux/kthread.h>
0008 #include <linux/miscdevice.h>
0009 #include <linux/node.h>
0010 #include <linux/pagemap.h>
0011 #include <linux/ratelimit.h>
0012 #include <linux/sched/mm.h>
0013 #include <linux/sched/signal.h>
0014 #include <linux/slab.h>
0015 #include <linux/sysfs.h>
0016 #include <asm/sgx.h>
0017 #include "driver.h"
0018 #include "encl.h"
0019 #include "encls.h"
0020 
0021 struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
0022 static int sgx_nr_epc_sections;
0023 static struct task_struct *ksgxd_tsk;
0024 static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
0025 static DEFINE_XARRAY(sgx_epc_address_space);
0026 
0027 /*
0028  * These variables are part of the state of the reclaimer, and must be accessed
0029  * with sgx_reclaimer_lock acquired.
0030  */
0031 static LIST_HEAD(sgx_active_page_list);
0032 static DEFINE_SPINLOCK(sgx_reclaimer_lock);
0033 
0034 static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
0035 
0036 /* Nodes with one or more EPC sections. */
0037 static nodemask_t sgx_numa_mask;
0038 
0039 /*
0040  * Array with one sgx_numa_node for each possible NUMA node.  Each
0041  * node tracks the free and poisoned EPC pages which reside on
0042  * that node.
0043  */
0044 static struct sgx_numa_node *sgx_numa_nodes;
0045 
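/*
 * EPC pages that may still carry enclave state from a previous kernel (e.g.
 * after kexec). Populated by sgx_setup_epc_section() and sanitized by ksgxd.
 */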
0046 static LIST_HEAD(sgx_dirty_page_list);
0047 
0048 /*
0049  * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
0050  * from the input list and made available to the page allocator. SECS pages
0051  * that precede their child pages in the input list are left intact.
0052  *
0053  * Return 0 when sanitization was successful or the kthread was stopped, and
0054  * the number of unsanitized pages otherwise.
0055  */
0056 static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list)
0057 {
0058     unsigned long left_dirty = 0;
0059     struct sgx_epc_page *page;
0060     LIST_HEAD(dirty);
0061     int ret;
0062 
0063     /* dirty_page_list is thread-local, no need for a lock: */
0064     while (!list_empty(dirty_page_list)) {
0065         if (kthread_should_stop())
0066             return 0;
0067 
0068         page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
0069 
0070         /*
0071          * Checking page->poison without holding the node->lock
0072          * is racy, but losing the race (i.e. poison is set just
0073          * after the check) just means __eremove() will be uselessly
0074          * called for a page that sgx_free_epc_page() will put onto
0075          * the node->sgx_poison_page_list later.
0076          */
0077         if (page->poison) {
0078             struct sgx_epc_section *section = &sgx_epc_sections[page->section];
0079             struct sgx_numa_node *node = section->node;
0080 
0081             spin_lock(&node->lock);
0082             list_move(&page->list, &node->sgx_poison_page_list);
0083             spin_unlock(&node->lock);
0084 
0085             continue;
0086         }
0087 
0088         ret = __eremove(sgx_get_epc_virt_addr(page));
0089         if (!ret) {
0090             /*
0091              * page is now sanitized.  Make it available via the SGX
0092              * page allocator:
0093              */
0094             list_del(&page->list);
0095             sgx_free_epc_page(page);
0096         } else {
0097             /* The page is not yet clean - move to the dirty list. */
0098             list_move_tail(&page->list, &dirty);
0099             left_dirty++;
0100         }
0101 
0102         cond_resched();
0103     }
0104 
0105     list_splice(&dirty, dirty_page_list);
0106     return left_dirty;
0107 }
0108 
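/*
 * Return true if the page has not been accessed in any mm attached to the
 * enclave since the last scan, i.e. it has aged enough to be reclaimed. The
 * accessed bits are cleared as a side effect so the next scan starts fresh.
 */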
0109 static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
0110 {
0111     struct sgx_encl_page *page = epc_page->owner;
0112     struct sgx_encl *encl = page->encl;
0113     struct sgx_encl_mm *encl_mm;
0114     bool ret = true;
0115     int idx;
0116 
0117     idx = srcu_read_lock(&encl->srcu);
0118 
0119     list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
0120         if (!mmget_not_zero(encl_mm->mm))
0121             continue;
0122 
0123         mmap_read_lock(encl_mm->mm);
0124         ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);
0125         mmap_read_unlock(encl_mm->mm);
0126 
0127         mmput_async(encl_mm->mm);
0128 
0129         if (!ret)
0130             break;
0131     }
0132 
0133     srcu_read_unlock(&encl->srcu, idx);
0134 
0135     if (!ret)
0136         return false;
0137 
0138     return true;
0139 }
0140 
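/*
 * Zap all PTEs that map the page into the enclave's address spaces and
 * transition the page to the blocked state with EBLOCK so that no new TLB
 * entries can be created for it.
 */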
0141 static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
0142 {
0143     struct sgx_encl_page *page = epc_page->owner;
0144     unsigned long addr = page->desc & PAGE_MASK;
0145     struct sgx_encl *encl = page->encl;
0146     int ret;
0147 
0148     sgx_zap_enclave_ptes(encl, addr);
0149 
0150     mutex_lock(&encl->lock);
0151 
0152     ret = __eblock(sgx_get_epc_virt_addr(epc_page));
0153     if (encls_failed(ret))
0154         ENCLS_WARN(ret, "EBLOCK");
0155 
0156     mutex_unlock(&encl->lock);
0157 }
0158 
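/*
 * Write the encrypted page contents to @backing->contents and its PCMD
 * metadata to @backing->pcmd with EWB. @va_slot receives the version counter
 * that protects the page against rollback when it is loaded back with ELDU.
 */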
0159 static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
0160               struct sgx_backing *backing)
0161 {
0162     struct sgx_pageinfo pginfo;
0163     int ret;
0164 
0165     pginfo.addr = 0;
0166     pginfo.secs = 0;
0167 
0168     pginfo.contents = (unsigned long)kmap_atomic(backing->contents);
0169     pginfo.metadata = (unsigned long)kmap_atomic(backing->pcmd) +
0170               backing->pcmd_offset;
0171 
0172     ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
0173     set_page_dirty(backing->pcmd);
0174     set_page_dirty(backing->contents);
0175 
0176     kunmap_atomic((void *)(unsigned long)(pginfo.metadata -
0177                           backing->pcmd_offset));
0178     kunmap_atomic((void *)(unsigned long)pginfo.contents);
0179 
0180     return ret;
0181 }
0182 
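/*
 * Deliberately empty IPI callback: the IPI itself forces any logical CPU
 * executing inside the enclave to exit, which is all sgx_encl_ewb() needs
 * before retrying EWB.
 */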
0183 void sgx_ipi_cb(void *info)
0184 {
0185 }
0186 
0187 /*
0188  * Swap a page out to regular memory once it has been transformed to the
0189  * blocked state with EBLOCK, so that it can no longer be referenced (no new
0190  * TLB entries).
0191  *
0192  * The first attempt just tries to write the page, assuming another thread has
0193  * already reset the in-enclave thread count with ETRACK and that count has
0194  * reached zero. The second attempt calls ETRACK before EWB. If that fails,
0195  * kick all the HW threads out and then do EWB, which is guaranteed to succeed.
0196  */
0197 static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
0198              struct sgx_backing *backing)
0199 {
0200     struct sgx_encl_page *encl_page = epc_page->owner;
0201     struct sgx_encl *encl = encl_page->encl;
0202     struct sgx_va_page *va_page;
0203     unsigned int va_offset;
0204     void *va_slot;
0205     int ret;
0206 
0207     encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED;
0208 
0209     va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
0210                    list);
0211     va_offset = sgx_alloc_va_slot(va_page);
0212     va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset;
0213     if (sgx_va_page_full(va_page))
0214         list_move_tail(&va_page->list, &encl->va_pages);
0215 
0216     ret = __sgx_encl_ewb(epc_page, va_slot, backing);
0217     if (ret == SGX_NOT_TRACKED) {
0218         ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
0219         if (ret) {
0220             if (encls_failed(ret))
0221                 ENCLS_WARN(ret, "ETRACK");
0222         }
0223 
0224         ret = __sgx_encl_ewb(epc_page, va_slot, backing);
0225         if (ret == SGX_NOT_TRACKED) {
0226             /*
0227              * Slow path, send IPIs to kick cpus out of the
0228              * enclave.  Note, it's imperative that the cpu
0229              * mask is generated *after* ETRACK, else we'll
0230              * miss cpus that entered the enclave between
0231              * generating the mask and incrementing epoch.
0232              */
0233             on_each_cpu_mask(sgx_encl_cpumask(encl),
0234                      sgx_ipi_cb, NULL, 1);
0235             ret = __sgx_encl_ewb(epc_page, va_slot, backing);
0236         }
0237     }
0238 
0239     if (ret) {
0240         if (encls_failed(ret))
0241             ENCLS_WARN(ret, "EWB");
0242 
0243         sgx_free_va_slot(va_page, va_offset);
0244     } else {
0245         encl_page->desc |= va_offset;
0246         encl_page->va_page = va_page;
0247     }
0248 }
0249 
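/*
 * Write the page back with sgx_encl_ewb() and detach it from its owner. If
 * this was the last child page of an initialized enclave, also write back
 * and free the SECS page.
 */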
0250 static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
0251                 struct sgx_backing *backing)
0252 {
0253     struct sgx_encl_page *encl_page = epc_page->owner;
0254     struct sgx_encl *encl = encl_page->encl;
0255     struct sgx_backing secs_backing;
0256     int ret;
0257 
0258     mutex_lock(&encl->lock);
0259 
0260     sgx_encl_ewb(epc_page, backing);
0261     encl_page->epc_page = NULL;
0262     encl->secs_child_cnt--;
0263     sgx_encl_put_backing(backing);
0264 
0265     if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
0266         ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size),
0267                        &secs_backing);
0268         if (ret)
0269             goto out;
0270 
0271         sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
0272 
0273         sgx_encl_free_epc_page(encl->secs.epc_page);
0274         encl->secs.epc_page = NULL;
0275 
0276         sgx_encl_put_backing(&secs_backing);
0277     }
0278 
0279 out:
0280     mutex_unlock(&encl->lock);
0281 }
0282 
0283 /*
0284  * Take a fixed number of pages from the head of the active page pool and
0285  * reclaim them to the enclave's private shmem files. Skip the pages that have
0286  * been accessed since the last scan. Move those pages to the tail of the
0287  * active page pool so that the pages get scanned in an LRU-like fashion.
0288  *
0289  * Batch-process a chunk of pages (at the moment 16) in order to reduce the
0290  * number of IPIs and ETRACKs potentially required. sgx_encl_ewb() spreads the
0291  * cost a bit among the HW threads with its three-stage EWB pipeline (EWB,
0292  * ETRACK + EWB and IPI + EWB), but not sufficiently. Reclaiming one page at a
0293  * time would also be problematic as it would increase the lock contention too
0294  * much, which would halt forward progress.
0295  */
0296 static void sgx_reclaim_pages(void)
0297 {
0298     struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
0299     struct sgx_backing backing[SGX_NR_TO_SCAN];
0300     struct sgx_encl_page *encl_page;
0301     struct sgx_epc_page *epc_page;
0302     pgoff_t page_index;
0303     int cnt = 0;
0304     int ret;
0305     int i;
0306 
0307     spin_lock(&sgx_reclaimer_lock);
0308     for (i = 0; i < SGX_NR_TO_SCAN; i++) {
0309         if (list_empty(&sgx_active_page_list))
0310             break;
0311 
0312         epc_page = list_first_entry(&sgx_active_page_list,
0313                         struct sgx_epc_page, list);
0314         list_del_init(&epc_page->list);
0315         encl_page = epc_page->owner;
0316 
0317         if (kref_get_unless_zero(&encl_page->encl->refcount) != 0)
0318             chunk[cnt++] = epc_page;
0319         else
0320             /* The owner is freeing the page. No need to add the
0321              * page back to the list of reclaimable pages.
0322              */
0323             epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
0324     }
0325     spin_unlock(&sgx_reclaimer_lock);
0326 
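    /*
     * Age the pages: put recently accessed pages back on the active list and
     * set up backing storage for the pages that will be written back.
     */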
0327     for (i = 0; i < cnt; i++) {
0328         epc_page = chunk[i];
0329         encl_page = epc_page->owner;
0330 
0331         if (!sgx_reclaimer_age(epc_page))
0332             goto skip;
0333 
0334         page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
0335 
0336         mutex_lock(&encl_page->encl->lock);
0337         ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]);
0338         if (ret) {
0339             mutex_unlock(&encl_page->encl->lock);
0340             goto skip;
0341         }
0342 
0343         encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
0344         mutex_unlock(&encl_page->encl->lock);
0345         continue;
0346 
0347 skip:
0348         spin_lock(&sgx_reclaimer_lock);
0349         list_add_tail(&epc_page->list, &sgx_active_page_list);
0350         spin_unlock(&sgx_reclaimer_lock);
0351 
0352         kref_put(&encl_page->encl->refcount, sgx_encl_release);
0353 
0354         chunk[i] = NULL;
0355     }
0356 
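    /*
     * Block the remaining pages with EBLOCK so that no new TLB entries can be
     * created for them while they are written back.
     */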
0357     for (i = 0; i < cnt; i++) {
0358         epc_page = chunk[i];
0359         if (epc_page)
0360             sgx_reclaimer_block(epc_page);
0361     }
0362 
0363     for (i = 0; i < cnt; i++) {
0364         epc_page = chunk[i];
0365         if (!epc_page)
0366             continue;
0367 
0368         encl_page = epc_page->owner;
0369         sgx_reclaimer_write(epc_page, &backing[i]);
0370 
0371         kref_put(&encl_page->encl->refcount, sgx_encl_release);
0372         epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
0373 
0374         sgx_free_epc_page(epc_page);
0375     }
0376 }
0377 
0378 static bool sgx_should_reclaim(unsigned long watermark)
0379 {
0380     return atomic_long_read(&sgx_nr_free_pages) < watermark &&
0381            !list_empty(&sgx_active_page_list);
0382 }
0383 
0384 /*
0385  * sgx_reclaim_direct() should be called (without enclave's mutex held)
0386  * in locations where SGX memory resources might be low and might be
0387  * needed in order to make forward progress.
0388  */
0389 void sgx_reclaim_direct(void)
0390 {
0391     if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
0392         sgx_reclaim_pages();
0393 }
0394 
0395 static int ksgxd(void *p)
0396 {
0397     set_freezable();
0398 
0399     /*
0400      * Sanitize pages in order to recover from kexec(). The 2nd pass is
0401      * required for SECS pages, whose child pages blocked EREMOVE.
0402      */
0403     __sgx_sanitize_pages(&sgx_dirty_page_list);
0404     WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list));
0405 
0406     while (!kthread_should_stop()) {
0407         if (try_to_freeze())
0408             continue;
0409 
0410         wait_event_freezable(ksgxd_waitq,
0411                      kthread_should_stop() ||
0412                      sgx_should_reclaim(SGX_NR_HIGH_PAGES));
0413 
0414         if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
0415             sgx_reclaim_pages();
0416 
0417         cond_resched();
0418     }
0419 
0420     return 0;
0421 }
0422 
0423 static bool __init sgx_page_reclaimer_init(void)
0424 {
0425     struct task_struct *tsk;
0426 
0427     tsk = kthread_run(ksgxd, NULL, "ksgxd");
0428     if (IS_ERR(tsk))
0429         return false;
0430 
0431     ksgxd_tsk = tsk;
0432 
0433     return true;
0434 }
0435 
0436 bool current_is_ksgxd(void)
0437 {
0438     return current == ksgxd_tsk;
0439 }
0440 
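/*
 * Pop the first page from the free list of NUMA node @nid, if any, and update
 * sgx_nr_free_pages accordingly.
 */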
0441 static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
0442 {
0443     struct sgx_numa_node *node = &sgx_numa_nodes[nid];
0444     struct sgx_epc_page *page = NULL;
0445 
0446     spin_lock(&node->lock);
0447 
0448     if (list_empty(&node->free_page_list)) {
0449         spin_unlock(&node->lock);
0450         return NULL;
0451     }
0452 
0453     page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
0454     list_del_init(&page->list);
0455     page->flags = 0;
0456 
0457     spin_unlock(&node->lock);
0458     atomic_long_dec(&sgx_nr_free_pages);
0459 
0460     return page;
0461 }
0462 
0463 /**
0464  * __sgx_alloc_epc_page() - Allocate an EPC page
0465  *
0466  * Iterate through NUMA nodes and reserve a free EPC page for the caller. Start
0467  * from the NUMA node where the caller is executing.
0468  *
0469  * Return:
0470  * - an EPC page:       A borrowed EPC page was available.
0471  * - ERR_PTR(-ENOMEM):  Out of EPC pages.
0472  */
0473 struct sgx_epc_page *__sgx_alloc_epc_page(void)
0474 {
0475     struct sgx_epc_page *page;
0476     int nid_of_current = numa_node_id();
0477     int nid = nid_of_current;
0478 
0479     if (node_isset(nid_of_current, sgx_numa_mask)) {
0480         page = __sgx_alloc_epc_page_from_node(nid_of_current);
0481         if (page)
0482             return page;
0483     }
0484 
0485     /* Fall back to the non-local NUMA nodes: */
0486     while (true) {
0487         nid = next_node_in(nid, sgx_numa_mask);
0488         if (nid == nid_of_current)
0489             break;
0490 
0491         page = __sgx_alloc_epc_page_from_node(nid);
0492         if (page)
0493             return page;
0494     }
0495 
0496     return ERR_PTR(-ENOMEM);
0497 }
0498 
0499 /**
0500  * sgx_mark_page_reclaimable() - Mark a page as reclaimable
0501  * @page:   EPC page
0502  *
0503  * Mark a page as reclaimable and add it to the active page list. Pages
0504  * are automatically removed from the active list when freed.
0505  */
0506 void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
0507 {
0508     spin_lock(&sgx_reclaimer_lock);
0509     page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
0510     list_add_tail(&page->list, &sgx_active_page_list);
0511     spin_unlock(&sgx_reclaimer_lock);
0512 }
0513 
0514 /**
0515  * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
0516  * @page:   EPC page
0517  *
0518  * Clear the reclaimable flag and remove the page from the active page list.
0519  *
0520  * Return:
0521  *   0 on success,
0522  *   -EBUSY if the page is in the process of being reclaimed
0523  */
0524 int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
0525 {
0526     spin_lock(&sgx_reclaimer_lock);
0527     if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
0528         /* The page is being reclaimed. */
0529         if (list_empty(&page->list)) {
0530             spin_unlock(&sgx_reclaimer_lock);
0531             return -EBUSY;
0532         }
0533 
0534         list_del(&page->list);
0535         page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
0536     }
0537     spin_unlock(&sgx_reclaimer_lock);
0538 
0539     return 0;
0540 }
0541 
0542 /**
0543  * sgx_alloc_epc_page() - Allocate an EPC page
0544  * @owner:  the owner of the EPC page
0545  * @reclaim:    reclaim pages if necessary
0546  *
0547  * Iterate through EPC sections and borrow a free EPC page for the caller. When
0548  * a page is no longer needed it must be released with sgx_free_epc_page(). If
0549  * @reclaim is set to true, directly reclaim pages when we are out of pages. No
0550  * mm's can be locked when @reclaim is set to true.
0551  *
0552  * Finally, wake up ksgxd when the number of free pages goes below the low
0553  * watermark before returning to the caller.
0554  *
0555  * Return:
0556  *   an EPC page,
0557  *   -errno on error
0558  */
0559 struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
0560 {
0561     struct sgx_epc_page *page;
0562 
0563     for ( ; ; ) {
0564         page = __sgx_alloc_epc_page();
0565         if (!IS_ERR(page)) {
0566             page->owner = owner;
0567             break;
0568         }
0569 
0570         if (list_empty(&sgx_active_page_list))
0571             return ERR_PTR(-ENOMEM);
0572 
0573         if (!reclaim) {
0574             page = ERR_PTR(-EBUSY);
0575             break;
0576         }
0577 
0578         if (signal_pending(current)) {
0579             page = ERR_PTR(-ERESTARTSYS);
0580             break;
0581         }
0582 
0583         sgx_reclaim_pages();
0584         cond_resched();
0585     }
0586 
0587     if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
0588         wake_up(&ksgxd_waitq);
0589 
0590     return page;
0591 }
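/*
 * Illustrative caller pattern (a sketch only; "encl_page" stands in for
 * whatever owner structure the caller tracks and is not defined here):
 *
 *	epc_page = sgx_alloc_epc_page(encl_page, false);
 *	if (IS_ERR(epc_page))
 *		return PTR_ERR(epc_page);
 */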
0592 
0593 /**
0594  * sgx_free_epc_page() - Free an EPC page
0595  * @page:   an EPC page
0596  *
0597  * Put the EPC page back on the list of free pages. It's the caller's
0598  * responsibility to make sure that the page is in the uninitialized state. In
0599  * other words, do EREMOVE, EWB or whatever operation is necessary before
0600  * calling this function.
0601  */
0602 void sgx_free_epc_page(struct sgx_epc_page *page)
0603 {
0604     struct sgx_epc_section *section = &sgx_epc_sections[page->section];
0605     struct sgx_numa_node *node = section->node;
0606 
0607     spin_lock(&node->lock);
0608 
0609     page->owner = NULL;
0610     if (page->poison)
0611         list_add(&page->list, &node->sgx_poison_page_list);
0612     else
0613         list_add_tail(&page->list, &node->free_page_list);
0614     page->flags = SGX_EPC_PAGE_IS_FREE;
0615 
0616     spin_unlock(&node->lock);
0617     atomic_long_inc(&sgx_nr_free_pages);
0618 }
0619 
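/*
 * Map one EPC section with memremap(), allocate its struct sgx_epc_page
 * array, record the physical range in sgx_epc_address_space, and queue every
 * page on sgx_dirty_page_list for ksgxd to sanitize.
 */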
0620 static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
0621                      unsigned long index,
0622                      struct sgx_epc_section *section)
0623 {
0624     unsigned long nr_pages = size >> PAGE_SHIFT;
0625     unsigned long i;
0626 
0627     section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB);
0628     if (!section->virt_addr)
0629         return false;
0630 
0631     section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));
0632     if (!section->pages) {
0633         memunmap(section->virt_addr);
0634         return false;
0635     }
0636 
0637     section->phys_addr = phys_addr;
0638     xa_store_range(&sgx_epc_address_space, section->phys_addr,
0639                phys_addr + size - 1, section, GFP_KERNEL);
0640 
0641     for (i = 0; i < nr_pages; i++) {
0642         section->pages[i].section = index;
0643         section->pages[i].flags = 0;
0644         section->pages[i].owner = NULL;
0645         section->pages[i].poison = 0;
0646         list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
0647     }
0648 
0649     return true;
0650 }
0651 
0652 bool arch_is_platform_page(u64 paddr)
0653 {
0654     return !!xa_load(&sgx_epc_address_space, paddr);
0655 }
0656 EXPORT_SYMBOL_GPL(arch_is_platform_page);
0657 
0658 static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
0659 {
0660     struct sgx_epc_section *section;
0661 
0662     section = xa_load(&sgx_epc_address_space, paddr);
0663     if (!section)
0664         return NULL;
0665 
0666     return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
0667 }
0668 
0669 /*
0670  * Called in process context to handle a hardware reported
0671  * error in an SGX EPC page.
0672  * If the MF_ACTION_REQUIRED bit is set in flags, then the
0673  * context is the task that consumed the poison data. Otherwise
0674  * this is called from a kernel thread unrelated to the page.
0675  */
0676 int arch_memory_failure(unsigned long pfn, int flags)
0677 {
0678     struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
0679     struct sgx_epc_section *section;
0680     struct sgx_numa_node *node;
0681 
0682     /*
0683      * mm/memory-failure.c calls this routine for all errors
0684      * where there isn't a "struct page" for the address. But that
0685      * includes other address ranges besides SGX.
0686      */
0687     if (!page)
0688         return -ENXIO;
0689 
0690     /*
0691      * If poison was consumed synchronously, send a SIGBUS to
0692      * the task. Hardware has already exited the SGX enclave and
0693      * will not allow re-entry to an enclave that has a memory
0694      * error. The signal may help the task understand why the
0695      * enclave is broken.
0696      */
0697     if (flags & MF_ACTION_REQUIRED)
0698         force_sig(SIGBUS);
0699 
0700     section = &sgx_epc_sections[page->section];
0701     node = section->node;
0702 
0703     spin_lock(&node->lock);
0704 
0705     /* Already poisoned? Nothing more to do */
0706     if (page->poison)
0707         goto out;
0708 
0709     page->poison = 1;
0710 
0711     /*
0712      * If the page is on a free list, move it to the per-node
0713      * poison page list.
0714      */
0715     if (page->flags & SGX_EPC_PAGE_IS_FREE) {
0716         list_move(&page->list, &node->sgx_poison_page_list);
0717         goto out;
0718     }
0719 
0720     /*
0721      * TBD: Add additional plumbing to enable pre-emptive
0722      * action for asynchronous poison notification. Until
0723      * then just hope that the poison:
0724      * a) is not accessed - sgx_free_epc_page() will deal with it
0725      *    when the user gives it back
0726      * b) results in a recoverable machine check rather than
0727      *    a fatal one
0728      */
0729 out:
0730     spin_unlock(&node->lock);
0731     return 0;
0732 }
0733 
0734 /**
0735  * A section metric is assembled from two CPUID words: bits 12-31 of @low
0736  * provide bits 12-31 of the metric, and bits 0-19 of @high provide bits
0737  * 32-51 of the metric.
0738  */
0739 static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
0740 {
0741     return (low & GENMASK_ULL(31, 12)) +
0742            ((high & GENMASK_ULL(19, 0)) << 32);
0743 }
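/*
 * Worked example with made-up register values: low = 0x80000000 and high = 0x1
 * yield (0x80000000 & GENMASK_ULL(31, 12)) + (0x1 << 32) = 0x180000000.
 */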
0744 
0745 #ifdef CONFIG_NUMA
0746 static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
0747 {
0748     return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
0749 }
0750 static DEVICE_ATTR_RO(sgx_total_bytes);
0751 
0752 static umode_t arch_node_attr_is_visible(struct kobject *kobj,
0753         struct attribute *attr, int idx)
0754 {
0755     /* Make all x86/ attributes invisible when SGX is not initialized: */
0756     if (nodes_empty(sgx_numa_mask))
0757         return 0;
0758 
0759     return attr->mode;
0760 }
0761 
0762 static struct attribute *arch_node_dev_attrs[] = {
0763     &dev_attr_sgx_total_bytes.attr,
0764     NULL,
0765 };
0766 
0767 const struct attribute_group arch_node_dev_group = {
0768     .name = "x86",
0769     .attrs = arch_node_dev_attrs,
0770     .is_visible = arch_node_attr_is_visible,
0771 };
0772 
0773 static void __init arch_update_sysfs_visibility(int nid)
0774 {
0775     struct node *node = node_devices[nid];
0776     int ret;
0777 
0778     ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);
0779 
0780     if (ret)
0781         pr_err("sysfs update failed (%d), files may be invisible\n", ret);
0782 }
0783 #else /* !CONFIG_NUMA */
0784 static void __init arch_update_sysfs_visibility(int nid) {}
0785 #endif
0786 
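/*
 * Enumerate the EPC sections reported by CPUID leaf SGX_CPUID, set each one
 * up with sgx_setup_epc_section(), and bind it to the NUMA node that backs
 * its physical address range.
 */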
0787 static bool __init sgx_page_cache_init(void)
0788 {
0789     u32 eax, ebx, ecx, edx, type;
0790     u64 pa, size;
0791     int nid;
0792     int i;
0793 
0794     sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
0795     if (!sgx_numa_nodes)
0796         return false;
0797 
0798     for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
0799         cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
0800 
0801         type = eax & SGX_CPUID_EPC_MASK;
0802         if (type == SGX_CPUID_EPC_INVALID)
0803             break;
0804 
0805         if (type != SGX_CPUID_EPC_SECTION) {
0806             pr_err_once("Unknown EPC section type: %u\n", type);
0807             break;
0808         }
0809 
0810         pa   = sgx_calc_section_metric(eax, ebx);
0811         size = sgx_calc_section_metric(ecx, edx);
0812 
0813         pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);
0814 
0815         if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) {
0816             pr_err("No free memory for an EPC section\n");
0817             break;
0818         }
0819 
0820         nid = numa_map_to_online_node(phys_to_target_node(pa));
0821         if (nid == NUMA_NO_NODE) {
0822             /* The physical address is already printed above. */
0823             pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
0824             nid = 0;
0825         }
0826 
0827         if (!node_isset(nid, sgx_numa_mask)) {
0828             spin_lock_init(&sgx_numa_nodes[nid].lock);
0829             INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
0830             INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
0831             node_set(nid, sgx_numa_mask);
0832             sgx_numa_nodes[nid].size = 0;
0833 
0834             /* Make SGX-specific node sysfs files visible: */
0835             arch_update_sysfs_visibility(nid);
0836         }
0837 
0838         sgx_epc_sections[i].node =  &sgx_numa_nodes[nid];
0839         sgx_numa_nodes[nid].size += size;
0840 
0841         sgx_nr_epc_sections++;
0842     }
0843 
0844     if (!sgx_nr_epc_sections) {
0845         pr_err("There are zero EPC sections.\n");
0846         return false;
0847     }
0848 
0849     return true;
0850 }
0851 
0852 /*
0853  * Update the SGX_LEPUBKEYHASH MSRs to the values specified by the caller.
0854  * The bare-metal driver must update them to the hash of the enclave's signer
0855  * before EINIT. KVM needs to update them to the guest's virtual MSR values
0856  * before doing EINIT on behalf of the guest.
0857  */
0858 void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
0859 {
0860     int i;
0861 
0862     WARN_ON_ONCE(preemptible());
0863 
0864     for (i = 0; i < 4; i++)
0865         wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
0866 }
0867 
0868 const struct file_operations sgx_provision_fops = {
0869     .owner          = THIS_MODULE,
0870 };
0871 
0872 static struct miscdevice sgx_dev_provision = {
0873     .minor = MISC_DYNAMIC_MINOR,
0874     .name = "sgx_provision",
0875     .nodename = "sgx_provision",
0876     .fops = &sgx_provision_fops,
0877 };
0878 
0879 /**
0880  * sgx_set_attribute() - Update allowed attributes given file descriptor
0881  * @allowed_attributes:     Pointer to allowed enclave attributes
0882  * @attribute_fd:       File descriptor for specific attribute
0883  *
0884  * Append enclave attribute indicated by file descriptor to allowed
0885  * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
0886  * /dev/sgx_provision is supported.
0887  *
0888  * Return:
0889  * 0:       SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
0890  * -EINVAL: Invalid or unsupported file descriptor
0891  */
0892 int sgx_set_attribute(unsigned long *allowed_attributes,
0893               unsigned int attribute_fd)
0894 {
0895     struct file *file;
0896 
0897     file = fget(attribute_fd);
0898     if (!file)
0899         return -EINVAL;
0900 
0901     if (file->f_op != &sgx_provision_fops) {
0902         fput(file);
0903         return -EINVAL;
0904     }
0905 
0906     *allowed_attributes |= SGX_ATTR_PROVISIONKEY;
0907 
0908     fput(file);
0909     return 0;
0910 }
0911 EXPORT_SYMBOL_GPL(sgx_set_attribute);
0912 
0913 static int __init sgx_init(void)
0914 {
0915     int ret;
0916     int i;
0917 
0918     if (!cpu_feature_enabled(X86_FEATURE_SGX))
0919         return -ENODEV;
0920 
0921     if (!sgx_page_cache_init())
0922         return -ENOMEM;
0923 
0924     if (!sgx_page_reclaimer_init()) {
0925         ret = -ENOMEM;
0926         goto err_page_cache;
0927     }
0928 
0929     ret = misc_register(&sgx_dev_provision);
0930     if (ret)
0931         goto err_kthread;
0932 
0933     /*
0934      * Always try to initialize the native *and* KVM drivers.
0935      * The KVM driver is less picky than the native one and
0936      * can function if the native one is not supported on the
0937      * current system or fails to initialize.
0938      *
0939      * Error out only if both fail to initialize.
0940      */
0941     ret = sgx_drv_init();
0942 
0943     if (sgx_vepc_init() && ret)
0944         goto err_provision;
0945 
0946     return 0;
0947 
0948 err_provision:
0949     misc_deregister(&sgx_dev_provision);
0950 
0951 err_kthread:
0952     kthread_stop(ksgxd_tsk);
0953 
0954 err_page_cache:
0955     for (i = 0; i < sgx_nr_epc_sections; i++) {
0956         vfree(sgx_epc_sections[i].pages);
0957         memunmap(sgx_epc_sections[i].virt_addr);
0958     }
0959 
0960     return ret;
0961 }
0962 
0963 device_initcall(sgx_init);