0001 // SPDX-License-Identifier: GPL-2.0
0002 #include <linux/mm.h>
0003 #include <linux/gfp.h>
0004 #include <linux/hugetlb.h>
0005 #include <asm/pgalloc.h>
0006 #include <asm/tlb.h>
0007 #include <asm/fixmap.h>
0008 #include <asm/mtrr.h>
0009 
0010 #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
0011 phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
0012 EXPORT_SYMBOL(physical_mask);
0013 #endif
0014 
0015 #ifdef CONFIG_HIGHPTE
0016 #define PGTABLE_HIGHMEM __GFP_HIGHMEM
0017 #else
0018 #define PGTABLE_HIGHMEM 0
0019 #endif
0020 
0021 #ifndef CONFIG_PARAVIRT
0022 static inline
0023 void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
0024 {
0025     tlb_remove_page(tlb, table);
0026 }
0027 #endif
0028 
0029 gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
0030 
0031 pgtable_t pte_alloc_one(struct mm_struct *mm)
0032 {
0033     return __pte_alloc_one(mm, __userpte_alloc_gfp);
0034 }
0035 
0036 static int __init setup_userpte(char *arg)
0037 {
0038     if (!arg)
0039         return -EINVAL;
0040 
0041     /*
0042      * "userpte=nohigh" disables allocation of user pagetables in
0043      * high memory.
0044      */
0045     if (strcmp(arg, "nohigh") == 0)
0046         __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
0047     else
0048         return -EINVAL;
0049     return 0;
0050 }
0051 early_param("userpte", setup_userpte);
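
/*
 * Usage note (illustrative only): booting with "userpte=nohigh" on the
 * kernel command line clears __GFP_HIGHMEM from __userpte_alloc_gfp, so
 * user PTE pages are always allocated from lowmem even when
 * CONFIG_HIGHPTE is enabled:
 *
 *	linux ... userpte=nohigh
 */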
0052 
0053 void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
0054 {
0055     pgtable_pte_page_dtor(pte);
0056     paravirt_release_pte(page_to_pfn(pte));
0057     paravirt_tlb_remove_table(tlb, pte);
0058 }
0059 
0060 #if CONFIG_PGTABLE_LEVELS > 2
0061 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
0062 {
0063     struct page *page = virt_to_page(pmd);
0064     paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
0065     /*
0066      * NOTE! For PAE, any changes to the top page-directory-pointer-table
0067      * entries need a full cr3 reload to flush.
0068      */
0069 #ifdef CONFIG_X86_PAE
0070     tlb->need_flush_all = 1;
0071 #endif
0072     pgtable_pmd_page_dtor(page);
0073     paravirt_tlb_remove_table(tlb, page);
0074 }
0075 
0076 #if CONFIG_PGTABLE_LEVELS > 3
0077 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
0078 {
0079     paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
0080     paravirt_tlb_remove_table(tlb, virt_to_page(pud));
0081 }
0082 
0083 #if CONFIG_PGTABLE_LEVELS > 4
0084 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
0085 {
0086     paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
0087     paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
0088 }
0089 #endif  /* CONFIG_PGTABLE_LEVELS > 4 */
0090 #endif  /* CONFIG_PGTABLE_LEVELS > 3 */
0091 #endif  /* CONFIG_PGTABLE_LEVELS > 2 */
0092 
0093 static inline void pgd_list_add(pgd_t *pgd)
0094 {
0095     struct page *page = virt_to_page(pgd);
0096 
0097     list_add(&page->lru, &pgd_list);
0098 }
0099 
0100 static inline void pgd_list_del(pgd_t *pgd)
0101 {
0102     struct page *page = virt_to_page(pgd);
0103 
0104     list_del(&page->lru);
0105 }
0106 
0107 #define UNSHARED_PTRS_PER_PGD               \
0108     (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
0109 #define MAX_UNSHARED_PTRS_PER_PGD           \
0110     max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)
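
/*
 * Illustrative values, assuming 32-bit PAE with the default 3G/1G split:
 * PTRS_PER_PGD is 4 and KERNEL_PGD_BOUNDARY (pgd_index(PAGE_OFFSET)) is 3,
 * so with a shared kernel pmd only the three user pgd entries need
 * per-process pmd pages; without sharing, all four do.
 */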
0111 
0112 
0113 static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
0114 {
0115     virt_to_page(pgd)->pt_mm = mm;
0116 }
0117 
0118 struct mm_struct *pgd_page_get_mm(struct page *page)
0119 {
0120     return page->pt_mm;
0121 }
0122 
0123 static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
0124 {
0125     /* If the pgd points to a shared pagetable level (either the
0126        ptes in non-PAE, or shared PMD in PAE), then just copy the
0127        references from swapper_pg_dir. */
0128     if (CONFIG_PGTABLE_LEVELS == 2 ||
0129         (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
0130         CONFIG_PGTABLE_LEVELS >= 4) {
0131         clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
0132                 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
0133                 KERNEL_PGD_PTRS);
0134     }
0135 
0136     /* list required to sync kernel mapping updates */
0137     if (!SHARED_KERNEL_PMD) {
0138         pgd_set_mm(pgd, mm);
0139         pgd_list_add(pgd);
0140     }
0141 }
0142 
0143 static void pgd_dtor(pgd_t *pgd)
0144 {
0145     if (SHARED_KERNEL_PMD)
0146         return;
0147 
0148     spin_lock(&pgd_lock);
0149     pgd_list_del(pgd);
0150     spin_unlock(&pgd_lock);
0151 }
0152 
0153 /*
0154  * List of all pgd's needed for non-PAE so it can invalidate entries
0155  * in both cached and uncached pgd's; not needed for PAE since the
0156  * kernel pmd is shared. If PAE were not to share the pmd a similar
0157  * tactic would be needed. This is essentially codepath-based locking
0158  * against pageattr.c; it is the unique case in which a valid change
0159  * of kernel pagetables can't be lazily synchronized by vmalloc faults.
0160  * vmalloc faults work because attached pagetables are never freed.
0161  * -- nyc
0162  */
0163 
0164 #ifdef CONFIG_X86_PAE
0165 /*
0166  * In PAE mode, we need to do a cr3 reload (=tlb flush) when
0167  * updating the top-level pagetable entries to guarantee the
0168  * processor notices the update.  Since this is expensive, and
0169  * all 4 top-level entries are used almost immediately in a
0170  * new process's life, we just pre-populate them here.
0171  *
0172  * Also, if we're in a paravirt environment where the kernel pmd is
0173  * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
0174  * and initialize the kernel pmds here.
0175  */
0176 #define PREALLOCATED_PMDS   UNSHARED_PTRS_PER_PGD
0177 #define MAX_PREALLOCATED_PMDS   MAX_UNSHARED_PTRS_PER_PGD
0178 
0179 /*
0180  * We allocate separate PMDs for the kernel part of the user page-table
0181  * when PTI is enabled. We need them to map the per-process LDT into the
0182  * user-space page-table.
0183  */
0184 #define PREALLOCATED_USER_PMDS   (boot_cpu_has(X86_FEATURE_PTI) ? \
0185                     KERNEL_PGD_PTRS : 0)
0186 #define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS
0187 
0188 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
0189 {
0190     paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
0191 
0192     /* Note: almost everything apart from _PAGE_PRESENT is
0193        reserved at the pmd (PDPT) level. */
0194     set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
0195 
0196     /*
0197      * According to Intel App note "TLBs, Paging-Structure Caches,
0198      * and Their Invalidation", April 2007, document 317080-001,
0199      * section 8.1: in PAE mode we explicitly have to flush the
0200      * TLB via cr3 if the top-level pgd is changed...
0201      */
0202     flush_tlb_mm(mm);
0203 }
0204 #else  /* !CONFIG_X86_PAE */
0205 
0206 /* No need to prepopulate any pagetable entries in non-PAE modes. */
0207 #define PREALLOCATED_PMDS   0
0208 #define MAX_PREALLOCATED_PMDS   0
0209 #define PREALLOCATED_USER_PMDS   0
0210 #define MAX_PREALLOCATED_USER_PMDS 0
0211 #endif  /* CONFIG_X86_PAE */
0212 
0213 static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
0214 {
0215     int i;
0216 
0217     for (i = 0; i < count; i++)
0218         if (pmds[i]) {
0219             pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
0220             free_page((unsigned long)pmds[i]);
0221             mm_dec_nr_pmds(mm);
0222         }
0223 }
0224 
0225 static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
0226 {
0227     int i;
0228     bool failed = false;
0229     gfp_t gfp = GFP_PGTABLE_USER;
0230 
0231     if (mm == &init_mm)
0232         gfp &= ~__GFP_ACCOUNT;
0233 
0234     for (i = 0; i < count; i++) {
0235         pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
0236         if (!pmd)
0237             failed = true;
0238         if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
0239             free_page((unsigned long)pmd);
0240             pmd = NULL;
0241             failed = true;
0242         }
0243         if (pmd)
0244             mm_inc_nr_pmds(mm);
0245         pmds[i] = pmd;
0246     }
0247 
0248     if (failed) {
0249         free_pmds(mm, pmds, count);
0250         return -ENOMEM;
0251     }
0252 
0253     return 0;
0254 }
0255 
0256 /*
0257  * Mop up any pmd pages which may still be attached to the pgd.
0258  * Normally they will be freed by munmap/exit_mmap, but any pmd we
0259  * preallocate which never got a corresponding vma will need to be
0260  * freed manually.
0261  */
0262 static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
0263 {
0264     pgd_t pgd = *pgdp;
0265 
0266     if (pgd_val(pgd) != 0) {
0267         pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
0268 
0269         pgd_clear(pgdp);
0270 
0271         paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
0272         pmd_free(mm, pmd);
0273         mm_dec_nr_pmds(mm);
0274     }
0275 }
0276 
0277 static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
0278 {
0279     int i;
0280 
0281     for (i = 0; i < PREALLOCATED_PMDS; i++)
0282         mop_up_one_pmd(mm, &pgdp[i]);
0283 
0284 #ifdef CONFIG_PAGE_TABLE_ISOLATION
0285 
0286     if (!boot_cpu_has(X86_FEATURE_PTI))
0287         return;
0288 
0289     pgdp = kernel_to_user_pgdp(pgdp);
0290 
0291     for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
0292         mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
0293 #endif
0294 }
0295 
0296 static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
0297 {
0298     p4d_t *p4d;
0299     pud_t *pud;
0300     int i;
0301 
0302     if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
0303         return;
0304 
0305     p4d = p4d_offset(pgd, 0);
0306     pud = pud_offset(p4d, 0);
0307 
0308     for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
0309         pmd_t *pmd = pmds[i];
0310 
0311         if (i >= KERNEL_PGD_BOUNDARY)
0312             memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
0313                    sizeof(pmd_t) * PTRS_PER_PMD);
0314 
0315         pud_populate(mm, pud, pmd);
0316     }
0317 }
0318 
0319 #ifdef CONFIG_PAGE_TABLE_ISOLATION
0320 static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
0321                      pgd_t *k_pgd, pmd_t *pmds[])
0322 {
0323     pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
0324     pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
0325     p4d_t *u_p4d;
0326     pud_t *u_pud;
0327     int i;
0328 
0329     u_p4d = p4d_offset(u_pgd, 0);
0330     u_pud = pud_offset(u_p4d, 0);
0331 
0332     s_pgd += KERNEL_PGD_BOUNDARY;
0333     u_pud += KERNEL_PGD_BOUNDARY;
0334 
0335     for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
0336         pmd_t *pmd = pmds[i];
0337 
0338         memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
0339                sizeof(pmd_t) * PTRS_PER_PMD);
0340 
0341         pud_populate(mm, u_pud, pmd);
0342     }
0343 
0344 }
0345 #else
0346 static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
0347                      pgd_t *k_pgd, pmd_t *pmds[])
0348 {
0349 }
0350 #endif
0351 /*
0352  * Xen paravirt assumes that the pgd table occupies exactly one page, and
0353  * the 64-bit kernel makes the same assumption.
0354  *
0355  * A kernel using PAE paging that is not running as a Xen domain, however,
0356  * only needs 32 bytes for the pgd instead of one page.
0357  */
0358 #ifdef CONFIG_X86_PAE
0359 
0360 #include <linux/slab.h>
0361 
0362 #define PGD_SIZE    (PTRS_PER_PGD * sizeof(pgd_t))
0363 #define PGD_ALIGN   32
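
/*
 * Worked example: with PAE, PTRS_PER_PGD is 4 and sizeof(pgd_t) is 8, so
 * PGD_SIZE is 4 * 8 = 32 bytes. PGD_ALIGN matches the 32-byte alignment
 * the CPU requires for the PAE page-directory-pointer table loaded into
 * CR3 (the low 5 bits of CR3 are ignored in PAE mode).
 */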
0364 
0365 static struct kmem_cache *pgd_cache;
0366 
0367 void __init pgtable_cache_init(void)
0368 {
0369     /*
0370      * When a PAE kernel is running as a Xen domain, it does not use a
0371      * shared kernel pmd, which in turn requires a whole page for the pgd.
0372      */
0373     if (!SHARED_KERNEL_PMD)
0374         return;
0375 
0376     /*
0377      * When a PAE kernel is not running as a Xen domain, it uses a shared
0378      * kernel pmd, which does not require a whole page for the pgd: 32
0379      * bytes are enough. Create a 32-byte slab cache at boot time for
0380      * pgd table allocation.
0381      */
0382     pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
0383                       SLAB_PANIC, NULL);
0384 }
0385 
0386 static inline pgd_t *_pgd_alloc(void)
0387 {
0388     /*
0389      * Without SHARED_KERNEL_PMD the PAE kernel is running as a Xen
0390      * domain and needs a whole page for the pgd.
0391      */
0392     if (!SHARED_KERNEL_PMD)
0393         return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
0394                          PGD_ALLOCATION_ORDER);
0395 
0396     /*
0397      * Otherwise the PAE kernel is not running as a Xen domain, so the
0398      * pgd can come from the 32-byte slab cache to save memory.
0399      */
0400     return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER);
0401 }
0402 
0403 static inline void _pgd_free(pgd_t *pgd)
0404 {
0405     if (!SHARED_KERNEL_PMD)
0406         free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
0407     else
0408         kmem_cache_free(pgd_cache, pgd);
0409 }
0410 #else
0411 
0412 static inline pgd_t *_pgd_alloc(void)
0413 {
0414     return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
0415                      PGD_ALLOCATION_ORDER);
0416 }
0417 
0418 static inline void _pgd_free(pgd_t *pgd)
0419 {
0420     free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
0421 }
0422 #endif /* CONFIG_X86_PAE */
0423 
0424 pgd_t *pgd_alloc(struct mm_struct *mm)
0425 {
0426     pgd_t *pgd;
0427     pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
0428     pmd_t *pmds[MAX_PREALLOCATED_PMDS];
0429 
0430     pgd = _pgd_alloc();
0431 
0432     if (pgd == NULL)
0433         goto out;
0434 
0435     mm->pgd = pgd;
0436 
0437     if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
0438         goto out_free_pgd;
0439 
0440     if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
0441         goto out_free_pmds;
0442 
0443     if (paravirt_pgd_alloc(mm) != 0)
0444         goto out_free_user_pmds;
0445 
0446     /*
0447      * Make sure that pre-populating the pmds is atomic with
0448      * respect to anything walking the pgd_list, so that they
0449      * never see a partially populated pgd.
0450      */
0451     spin_lock(&pgd_lock);
0452 
0453     pgd_ctor(mm, pgd);
0454     pgd_prepopulate_pmd(mm, pgd, pmds);
0455     pgd_prepopulate_user_pmd(mm, pgd, u_pmds);
0456 
0457     spin_unlock(&pgd_lock);
0458 
0459     return pgd;
0460 
0461 out_free_user_pmds:
0462     free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
0463 out_free_pmds:
0464     free_pmds(mm, pmds, PREALLOCATED_PMDS);
0465 out_free_pgd:
0466     _pgd_free(pgd);
0467 out:
0468     return NULL;
0469 }
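
/*
 * Minimal usage sketch (illustrative only; kernel/fork.c does roughly this
 * in mm_alloc_pgd()/mm_free_pgd() when setting up and tearing down an mm):
 *
 *	mm->pgd = pgd_alloc(mm);
 *	if (unlikely(!mm->pgd))
 *		return -ENOMEM;
 *	...
 *	pgd_free(mm, mm->pgd);
 */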
0470 
0471 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
0472 {
0473     pgd_mop_up_pmds(mm, pgd);
0474     pgd_dtor(pgd);
0475     paravirt_pgd_free(mm, pgd);
0476     _pgd_free(pgd);
0477 }
0478 
0479 /*
0480  * Used to set accessed or dirty bits in the page table entries
0481  * on other architectures. On x86, the accessed and dirty bits
0482  * are tracked by hardware. However, do_wp_page calls this function
0483  * to also make the pte writable at the same time the dirty bit is
0484  * set. In that case we do actually need to write the PTE.
0485  */
0486 int ptep_set_access_flags(struct vm_area_struct *vma,
0487               unsigned long address, pte_t *ptep,
0488               pte_t entry, int dirty)
0489 {
0490     int changed = !pte_same(*ptep, entry);
0491 
0492     if (changed && dirty)
0493         set_pte(ptep, entry);
0494 
0495     return changed;
0496 }
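
/*
 * Illustrative caller pattern (a sketch of what the write-protection fault
 * path in mm/memory.c does when reusing a page, not an exact copy):
 *
 *	entry = pte_mkyoung(orig_pte);
 *	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 *	if (ptep_set_access_flags(vma, address, ptep, entry, 1))
 *		update_mmu_cache(vma, address, ptep);
 */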
0497 
0498 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
0499 int pmdp_set_access_flags(struct vm_area_struct *vma,
0500               unsigned long address, pmd_t *pmdp,
0501               pmd_t entry, int dirty)
0502 {
0503     int changed = !pmd_same(*pmdp, entry);
0504 
0505     VM_BUG_ON(address & ~HPAGE_PMD_MASK);
0506 
0507     if (changed && dirty) {
0508         set_pmd(pmdp, entry);
0509         /*
0510          * We had a write-protection fault here and changed the pmd
0511          * to be more permissive. No need to flush the TLB for that,
0512          * #PF is architecturally guaranteed to do that and in the
0513          * worst-case we'll generate a spurious fault.
0514          */
0515     }
0516 
0517     return changed;
0518 }
0519 
0520 int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
0521               pud_t *pudp, pud_t entry, int dirty)
0522 {
0523     int changed = !pud_same(*pudp, entry);
0524 
0525     VM_BUG_ON(address & ~HPAGE_PUD_MASK);
0526 
0527     if (changed && dirty) {
0528         set_pud(pudp, entry);
0529         /*
0530          * We had a write-protection fault here and changed the pud
0531          * to be more permissive. No need to flush the TLB for that,
0532          * #PF is architecturally guaranteed to do that and in the
0533          * worst-case we'll generate a spurious fault.
0534          */
0535     }
0536 
0537     return changed;
0538 }
0539 #endif
0540 
0541 int ptep_test_and_clear_young(struct vm_area_struct *vma,
0542                   unsigned long addr, pte_t *ptep)
0543 {
0544     int ret = 0;
0545 
0546     if (pte_young(*ptep))
0547         ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
0548                      (unsigned long *) &ptep->pte);
0549 
0550     return ret;
0551 }
0552 
0553 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
0554 int pmdp_test_and_clear_young(struct vm_area_struct *vma,
0555                   unsigned long addr, pmd_t *pmdp)
0556 {
0557     int ret = 0;
0558 
0559     if (pmd_young(*pmdp))
0560         ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
0561                      (unsigned long *)pmdp);
0562 
0563     return ret;
0564 }
0565 int pudp_test_and_clear_young(struct vm_area_struct *vma,
0566                   unsigned long addr, pud_t *pudp)
0567 {
0568     int ret = 0;
0569 
0570     if (pud_young(*pudp))
0571         ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
0572                      (unsigned long *)pudp);
0573 
0574     return ret;
0575 }
0576 #endif
0577 
0578 int ptep_clear_flush_young(struct vm_area_struct *vma,
0579                unsigned long address, pte_t *ptep)
0580 {
0581     /*
0582      * On x86 CPUs, clearing the accessed bit without a TLB flush
0583      * doesn't cause data corruption. [ It could cause incorrect
0584      * page aging and the (mistaken) reclaim of hot pages, but the
0585      * chance of that should be relatively low. ]
0586      *
0587      * So as a performance optimization don't flush the TLB when
0588      * clearing the accessed bit, it will eventually be flushed by
0589      * a context switch or a VM operation anyway. [ In the rare
0590      * event of it not getting flushed for a long time the delay
0591      * shouldn't really matter because there's no real memory
0592      * pressure for swapout to react to. ]
0593      */
0594     return ptep_test_and_clear_young(vma, address, ptep);
0595 }
0596 
0597 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
0598 int pmdp_clear_flush_young(struct vm_area_struct *vma,
0599                unsigned long address, pmd_t *pmdp)
0600 {
0601     int young;
0602 
0603     VM_BUG_ON(address & ~HPAGE_PMD_MASK);
0604 
0605     young = pmdp_test_and_clear_young(vma, address, pmdp);
0606     if (young)
0607         flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
0608 
0609     return young;
0610 }
0611 
0612 pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
0613              pmd_t *pmdp)
0614 {
0615     /*
0616      * No flush is necessary. Once an invalid PTE is established, the PTE's
0617      * access and dirty bits cannot be updated.
0618      */
0619     return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
0620 }
0621 #endif
0622 
0623 /**
0624  * reserve_top_address - reserves a hole in the top of kernel address space
0625  * @reserve: size of hole to reserve
0626  *
0627  * Can be used to relocate the fixmap area and poke a hole in the top
0628  * of kernel address space to make room for a hypervisor.
0629  */
0630 void __init reserve_top_address(unsigned long reserve)
0631 {
0632 #ifdef CONFIG_X86_32
0633     BUG_ON(fixmaps_set > 0);
0634     __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
0635     printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
0636            -reserve, __FIXADDR_TOP + PAGE_SIZE);
0637 #endif
0638 }
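
/*
 * Example (illustrative only): a 32-bit hypervisor that needs the top 16 MB
 * of guest virtual address space to itself could call, early in boot:
 *
 *	reserve_top_address(16 * 1024 * 1024);
 *
 * which moves __FIXADDR_TOP (and with it the fixmap) down below the hole.
 */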
0639 
0640 int fixmaps_set;
0641 
0642 void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
0643 {
0644     unsigned long address = __fix_to_virt(idx);
0645 
0646 #ifdef CONFIG_X86_64
0647     /*
0648      * Ensure that the static initial page tables are covering the
0649      * fixmap completely.
0650      */
0651     BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
0652              (FIXMAP_PMD_NUM * PTRS_PER_PTE));
0653 #endif
0654 
0655     if (idx >= __end_of_fixed_addresses) {
0656         BUG();
0657         return;
0658     }
0659     set_pte_vaddr(address, pte);
0660     fixmaps_set++;
0661 }
0662 
0663 void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
0664                phys_addr_t phys, pgprot_t flags)
0665 {
0666     /* Sanitize 'prot' against any unsupported bits: */
0667     pgprot_val(flags) &= __default_kernel_pte_mask;
0668 
0669     __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
0670 }
0671 
0672 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
0673 #ifdef CONFIG_X86_5LEVEL
0674 /**
0675  * p4d_set_huge - setup kernel P4D mapping
0676  *
0677  * No 512GB pages yet -- always return 0
0678  */
0679 int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
0680 {
0681     return 0;
0682 }
0683 
0684 /**
0685  * p4d_clear_huge - clear kernel P4D mapping when it is set
0686  *
0687  * No 512GB pages yet -- nothing to clear.
0688  */
0689 void p4d_clear_huge(p4d_t *p4d)
0690 {
0691 }
0692 #endif
0693 
0694 /**
0695  * pud_set_huge - setup kernel PUD mapping
0696  *
0697  * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
0698  * function sets up a huge page only if any of the following conditions are met:
0699  *
0700  * - MTRRs are disabled, or
0701  *
0702  * - MTRRs are enabled and the range is completely covered by a single MTRR, or
0703  *
0704  * - MTRRs are enabled and the corresponding MTRR memory type is WB, which
0705  *   has no effect on the requested PAT memory type.
0706  *
0707  * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
0708  * page mapping attempt fails.
0709  *
0710  * Returns 1 on success and 0 on failure.
0711  */
0712 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
0713 {
0714     u8 mtrr, uniform;
0715 
0716     mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
0717     if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
0718         (mtrr != MTRR_TYPE_WRBACK))
0719         return 0;
0720 
0721     /* Bail out if we are on a populated non-leaf entry: */
0722     if (pud_present(*pud) && !pud_huge(*pud))
0723         return 0;
0724 
0725     set_pte((pte_t *)pud, pfn_pte(
0726         (u64)addr >> PAGE_SHIFT,
0727         __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
0728 
0729     return 1;
0730 }
0731 
0732 /**
0733  * pmd_set_huge - setup kernel PMD mapping
0734  *
0735  * See text over pud_set_huge() above.
0736  *
0737  * Returns 1 on success and 0 on failure.
0738  */
0739 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
0740 {
0741     u8 mtrr, uniform;
0742 
0743     mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
0744     if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
0745         (mtrr != MTRR_TYPE_WRBACK)) {
0746         pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
0747                  __func__, addr, addr + PMD_SIZE);
0748         return 0;
0749     }
0750 
0751     /* Bail out if we are on a populated non-leaf entry: */
0752     if (pmd_present(*pmd) && !pmd_huge(*pmd))
0753         return 0;
0754 
0755     set_pte((pte_t *)pmd, pfn_pte(
0756         (u64)addr >> PAGE_SHIFT,
0757         __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
0758 
0759     return 1;
0760 }
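
/*
 * Fallback sketch (illustrative only): callers mapping a large region are
 * expected to try the biggest page size first and step down whenever these
 * helpers return 0, e.g.:
 *
 *	if (pud_set_huge(pud, phys, prot))
 *		return 1;	(mapped with a 1GiB leaf)
 *	if (pmd_set_huge(pmd, phys, prot))
 *		return 1;	(mapped with a 2MiB leaf)
 *	(otherwise fall back to 4K ptes)
 */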
0761 
0762 /**
0763  * pud_clear_huge - clear kernel PUD mapping when it is set
0764  *
0765  * Returns 1 on success and 0 on failure (no PUD map is found).
0766  */
0767 int pud_clear_huge(pud_t *pud)
0768 {
0769     if (pud_large(*pud)) {
0770         pud_clear(pud);
0771         return 1;
0772     }
0773 
0774     return 0;
0775 }
0776 
0777 /**
0778  * pmd_clear_huge - clear kernel PMD mapping when it is set
0779  *
0780  * Returns 1 on success and 0 on failure (no PMD map is found).
0781  */
0782 int pmd_clear_huge(pmd_t *pmd)
0783 {
0784     if (pmd_large(*pmd)) {
0785         pmd_clear(pmd);
0786         return 1;
0787     }
0788 
0789     return 0;
0790 }
0791 
0792 #ifdef CONFIG_X86_64
0793 /**
0794  * pud_free_pmd_page - Clear pud entry and free pmd page.
0795  * @pud: Pointer to a PUD.
0796  * @addr: Virtual address associated with pud.
0797  *
0798  * Context: The pud range has been unmapped and TLB purged.
0799  * Return: 1 if clearing the entry succeeded. 0 otherwise.
0800  *
0801  * NOTE: Callers must allow a single page allocation.
0802  */
0803 int pud_free_pmd_page(pud_t *pud, unsigned long addr)
0804 {
0805     pmd_t *pmd, *pmd_sv;
0806     pte_t *pte;
0807     int i;
0808 
0809     pmd = pud_pgtable(*pud);
0810     pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
0811     if (!pmd_sv)
0812         return 0;
0813 
0814     for (i = 0; i < PTRS_PER_PMD; i++) {
0815         pmd_sv[i] = pmd[i];
0816         if (!pmd_none(pmd[i]))
0817             pmd_clear(&pmd[i]);
0818     }
0819 
0820     pud_clear(pud);
0821 
0822     /* INVLPG to clear all paging-structure caches */
0823     flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
0824 
0825     for (i = 0; i < PTRS_PER_PMD; i++) {
0826         if (!pmd_none(pmd_sv[i])) {
0827             pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
0828             free_page((unsigned long)pte);
0829         }
0830     }
0831 
0832     free_page((unsigned long)pmd_sv);
0833 
0834     pgtable_pmd_page_dtor(virt_to_page(pmd));
0835     free_page((unsigned long)pmd);
0836 
0837     return 1;
0838 }
0839 
0840 /**
0841  * pmd_free_pte_page - Clear pmd entry and free pte page.
0842  * @pmd: Pointer to a PMD.
0843  * @addr: Virtual address associated with pmd.
0844  *
0845  * Context: The pmd range has been unmapped and TLB purged.
0846  * Return: 1 if clearing the entry succeeded. 0 otherwise.
0847  */
0848 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
0849 {
0850     pte_t *pte;
0851 
0852     pte = (pte_t *)pmd_page_vaddr(*pmd);
0853     pmd_clear(pmd);
0854 
0855     /* INVLPG to clear all paging-structure caches */
0856     flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
0857 
0858     free_page((unsigned long)pte);
0859 
0860     return 1;
0861 }
0862 
0863 #else /* !CONFIG_X86_64 */
0864 
0865 /*
0866  * Disable free page handling on x86-PAE. This ensures that ioremap()
0867  * does not update sync'd pmd entries. See vmalloc_sync_one().
0868  */
0869 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
0870 {
0871     return pmd_none(*pmd);
0872 }
0873 
0874 #endif /* CONFIG_X86_64 */
0875 #endif  /* CONFIG_HAVE_ARCH_HUGE_VMAP */