// SPDX-License-Identifier: GPL-2.0-only
/* ----------------------------------------------------------------------- *
 *
 *   Copyright 2014 Intel Corporation; author: H. Peter Anvin
 *
 * ----------------------------------------------------------------------- */

/*
 * The IRET instruction, when returning to a 16-bit segment, only
 * restores the bottom 16 bits of the user space stack pointer.  This
 * causes some 16-bit software to break, but it also leaks kernel state
 * to user space.
 *
 * This works around this by creating percpu "ministacks", each of which
 * is mapped 2^16 times 64K apart.  When we detect that the return SS is
 * on the LDT, we copy the IRET frame to the ministack and use the
 * relevant alias to return to userspace.  The ministacks are mapped
 * readonly, so if the IRET faults we promote #GP to #DF, which is an IST
 * vector and thus has its own stack; we then do the fixup in the #DF
 * handler.
 *
 * This file sets up the ministacks and the related page tables.  The
 * actual ministack invocation is in entry_64.S.
 */
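
/*
 * In outline: each CPU gets a 64-byte slot in a shared, read-only
 * ministack page that is aliased 2^16 times at 64K intervals inside the
 * ESPFIX region.  The entry code copies the IRET frame into this CPU's
 * slot and picks the alias whose bits 16..31 match the user's own ESP,
 * so that when IRET restores only SP[15:0] the rest of the visible ESP
 * is the user's data rather than kernel stack bits.  Because the alias
 * is read-only, a faulting IRET cannot push a #GP frame onto it and
 * escalates to #DF, whose IST stack lets the fixup run safely.  (This
 * is an illustrative summary; the authoritative sequence is the
 * native_irq_return_ldt path in entry_64.S.)
 */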

#include <linux/init.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/random.h>
#include <linux/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/espfix.h>

/*
 * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
 * it up to a cache line to avoid unnecessary sharing.
 */
#define ESPFIX_STACK_SIZE   (8*8UL)
#define ESPFIX_STACKS_PER_PAGE  (PAGE_SIZE/ESPFIX_STACK_SIZE)

/* There is address space for how many espfix pages? */
#define ESPFIX_PAGE_SPACE   (1UL << (P4D_SHIFT-PAGE_SHIFT-16))

#define ESPFIX_MAX_CPUS     (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
# error "Need more virtual address space for the ESPFIX hack"
#endif
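
/*
 * For the common configuration (4K pages, P4D_SHIFT == 39) these work out
 * to ESPFIX_STACK_SIZE = 64 bytes, ESPFIX_STACKS_PER_PAGE = 64,
 * ESPFIX_PAGE_SPACE = 2048 and ESPFIX_MAX_CPUS = 131072: one 4K page of
 * ministacks covers 64 CPUs, each group of 16 such pages (64K worth of
 * ministacks) needs a 2^16 * 64K = 4 GiB alias window, and 128 of those
 * windows fit in the 512 GiB covered by a single p4d entry.
 */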

#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)

/* This contains the *bottom* address of the espfix stack */
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
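
/*
 * espfix_stack is the user-visible, read-only aliased address of the
 * CPU's ministack; espfix_waddr is the writable direct-map address of
 * the same bytes.  entry_64.S stores the copied IRET frame through
 * espfix_waddr and then loads %rsp from the matching espfix_stack alias.
 */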

/* Initialization mutex - should this be a spinlock? */
static DEFINE_MUTEX(espfix_init_mutex);

/* Page allocation table - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
#define ESPFIX_MAX_PAGES  DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
static void *espfix_pages[ESPFIX_MAX_PAGES];

static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
    __aligned(PAGE_SIZE);

static unsigned int page_random, slot_random;

/*
 * This returns the bottom address of the espfix stack for a specific CPU.
 * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
 * we have to account for some amount of padding at the end of each page.
 */
static inline unsigned long espfix_base_addr(unsigned int cpu)
{
    unsigned long page, slot;
    unsigned long addr;

    page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
    slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
    addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);
    addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
    addr += ESPFIX_BASE_ADDR;
    return addr;
}
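
/*
 * Worked example, assuming 4K pages and slot_random == page_random == 0
 * (purely illustrative): CPU 1 maps to page 0, slot 1, i.e. a linear
 * offset of 0x40; the low 16 bits pass through unchanged, giving
 * ESPFIX_BASE_ADDR + 0x40.  CPU 1024 maps to page 16, a linear offset of
 * 0x10000; its bits above 15 are shifted up by 16, giving
 * ESPFIX_BASE_ADDR + (1UL << 32).  The 16-bit hole opened between bits
 * 16 and 31 is what entry_64.S later fills with the user's ESP[31:16]
 * to select which alias to IRET from.
 */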

#define PTE_STRIDE        (65536/PAGE_SIZE)
#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
#define ESPFIX_PMD_CLONES PTRS_PER_PMD
#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
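
/*
 * With 4K pages: PTE_STRIDE = 16, ESPFIX_PTE_CLONES = 32,
 * ESPFIX_PMD_CLONES = 512 and ESPFIX_PUD_CLONES = 4.  A ministack page is
 * installed 32 times (once per 64K) in its pte page, that pte page fills
 * all 512 slots of its pmd page, and that pmd page backs 4 consecutive
 * pud entries: 32 * 512 * 4 = 65536 aliases, one for every possible
 * value of address bits 16..31.
 */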

#define PGTABLE_PROT      ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)

static void init_espfix_random(void)
{
    unsigned long rand;

    /*
     * This is run before the entropy pools are initialized,
     * but this is hopefully better than nothing.
     */
    if (!arch_get_random_longs(&rand, 1)) {
        /* The constant is an arbitrary large prime */
        rand = rdtsc();
        rand *= 0xc345c6b72fd16123UL;
    }

    slot_random = rand % ESPFIX_STACKS_PER_PAGE;
    page_random = (rand / ESPFIX_STACKS_PER_PAGE)
        & (ESPFIX_PAGE_SPACE - 1);
}
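
/*
 * slot_random ends up in [0, ESPFIX_STACKS_PER_PAGE) and page_random in
 * [0, ESPFIX_PAGE_SPACE); both are bijective tweaks (a rotation of the
 * slot within a page and an XOR of the page index), so CPUs that would
 * share a ministack page still do - randomization moves the stacks
 * around without costing extra pages.
 */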

void __init init_espfix_bsp(void)
{
    pgd_t *pgd;
    p4d_t *p4d;

    /* Install the espfix pud into the kernel page directory */
    pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
    p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
    p4d_populate(&init_mm, p4d, espfix_pud_page);

    /* Randomize the locations */
    init_espfix_random();

    /* The rest is the same as for any other processor */
    init_espfix_ap(0);
}
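
/*
 * Roughly speaking: with 4-level paging the p4d level is folded, so
 * p4d_alloc() just hands back the pgd slot and p4d_populate() writes the
 * physical address of espfix_pud_page straight into init_top_pgt; with
 * 5-level paging a real p4d page is allocated and hooked up first.
 */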

void init_espfix_ap(int cpu)
{
    unsigned int page;
    unsigned long addr;
    pud_t pud, *pud_p;
    pmd_t pmd, *pmd_p;
    pte_t pte, *pte_p;
    int n, node;
    void *stack_page;
    pteval_t ptemask;

    /* We only have to do this once... */
    if (likely(per_cpu(espfix_stack, cpu)))
        return;     /* Already initialized */

    addr = espfix_base_addr(cpu);
    page = cpu/ESPFIX_STACKS_PER_PAGE;

    /* Did another CPU already set this up? */
    stack_page = READ_ONCE(espfix_pages[page]);
    if (likely(stack_page))
        goto done;

    mutex_lock(&espfix_init_mutex);

    /* Did we race on the lock? */
    stack_page = READ_ONCE(espfix_pages[page]);
    if (stack_page)
        goto unlock_done;

    node = cpu_to_node(cpu);
    ptemask = __supported_pte_mask;
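
    /*
     * The page-table population below clones each entry across its whole
     * alias span (ESPFIX_PUD_CLONES pud slots, ESPFIX_PMD_CLONES pmd
     * slots and ESPFIX_PTE_CLONES pte slots spaced PTE_STRIDE apart), so
     * that every 64K alias of this CPU's ministack resolves to the same
     * physical page.
     */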
    pud_p = &espfix_pud_page[pud_index(addr)];
    pud = *pud_p;
    if (!pud_present(pud)) {
        struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);

        pmd_p = (pmd_t *)page_address(page);
        pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
        paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
        for (n = 0; n < ESPFIX_PUD_CLONES; n++)
            set_pud(&pud_p[n], pud);
    }

    pmd_p = pmd_offset(&pud, addr);
    pmd = *pmd_p;
    if (!pmd_present(pmd)) {
        struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);

        pte_p = (pte_t *)page_address(page);
        pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
        paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
        for (n = 0; n < ESPFIX_PMD_CLONES; n++)
            set_pmd(&pmd_p[n], pmd);
    }

    pte_p = pte_offset_kernel(&pmd, addr);
    stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
    /*
     * __PAGE_KERNEL_* includes _PAGE_GLOBAL, which we want since
     * this is mapped to userspace.
     */
    pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask));
    for (n = 0; n < ESPFIX_PTE_CLONES; n++)
        set_pte(&pte_p[n*PTE_STRIDE], pte);

    /* Job is done for this CPU and any CPU which shares this page */
    WRITE_ONCE(espfix_pages[page], stack_page);

unlock_done:
    mutex_unlock(&espfix_init_mutex);
done:
    per_cpu(espfix_stack, cpu) = addr;
    per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page
                      + (addr & ~PAGE_MASK);
}