
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * This file contains common routines for dealing with freeing of page
 * tables, along with common page table handling code.
 *
 *  Derived from arch/powerpc/mm/tlb_64.c:
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
 *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
 *    Copyright (C) 1996 Paul Mackerras
 *
 *  Derived from "arch/i386/mm/init.c"
 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Dave Engebretsen <engebret@us.ibm.com>
 *      Rework for PPC64 port.
 */

#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>

#ifdef CONFIG_PPC64
#define PGD_ALIGN (sizeof(pgd_t) * MAX_PTRS_PER_PGD)
#else
#define PGD_ALIGN PAGE_SIZE
#endif

pgd_t swapper_pg_dir[MAX_PTRS_PER_PGD] __section(".bss..page_aligned") __aligned(PGD_ALIGN);

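/*
 * Trap vector 0x400 is the instruction storage/access exception, so the
 * helper below reports whether the fault currently being handled was taken
 * on an instruction fetch. The PTE filters further down use this to decide
 * when to do I$/D$ coherency work.
 */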
static inline int is_exec_fault(void)
{
    return current->thread.regs && TRAP(current->thread.regs) == 0x400;
}

/* We only try to do i/d cache coherency on stuff that looks like
 * reasonably "normal" PTEs. We currently require a PTE to be present,
 * and we avoid _PAGE_SPECIAL and cache-inhibited PTEs. We also only do
 * that on userspace PTEs.
 */
static inline int pte_looks_normal(pte_t pte)
{
    if (pte_present(pte) && !pte_special(pte)) {
        if (pte_ci(pte))
            return 0;
        if (pte_user(pte))
            return 1;
    }
    return 0;
}

static struct page *maybe_pte_to_page(pte_t pte)
{
    unsigned long pfn = pte_pfn(pte);
    struct page *page;

    if (unlikely(!pfn_valid(pfn)))
        return NULL;
    page = pfn_to_page(pfn);
    if (PageReserved(page))
        return NULL;
    return page;
}

#ifdef CONFIG_PPC_BOOK3S

/* Server-style MMU handles coherency when hashing if HW exec permission
 * is supported per page (currently 64-bit only). If not, we always flush
 * the cache for valid PTEs in set_pte. Embedded CPUs without HW exec
 * support fall into the same category.
 */

static pte_t set_pte_filter_hash(pte_t pte)
{
    pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
    if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
                       cpu_has_feature(CPU_FTR_NOEXECUTE))) {
        struct page *pg = maybe_pte_to_page(pte);

        if (!pg)
            return pte;
        if (!test_bit(PG_dcache_clean, &pg->flags)) {
            flush_dcache_icache_page(pg);
            set_bit(PG_dcache_clean, &pg->flags);
        }
    }
    return pte;
}

#else /* CONFIG_PPC_BOOK3S */

static pte_t set_pte_filter_hash(pte_t pte) { return pte; }

#endif /* CONFIG_PPC_BOOK3S */

/* Embedded-type MMU with HW exec support. This is a bit more complicated
 * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC, so
 * instead we "filter out" the exec permission for non-clean pages.
 */
static inline pte_t set_pte_filter(pte_t pte)
{
    struct page *pg;

    if (radix_enabled())
        return pte;

    if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
        return set_pte_filter_hash(pte);

    /* No exec permission in the first place, move on */
    if (!pte_exec(pte) || !pte_looks_normal(pte))
        return pte;

    /* If you set _PAGE_EXEC on weird pages you're on your own */
    pg = maybe_pte_to_page(pte);
    if (unlikely(!pg))
        return pte;

    /* If the page is clean, we move on */
    if (test_bit(PG_dcache_clean, &pg->flags))
        return pte;

    /* If it's an exec fault, we flush the cache and make it clean */
    if (is_exec_fault()) {
        flush_dcache_icache_page(pg);
        set_bit(PG_dcache_clean, &pg->flags);
        return pte;
    }

    /* Else, we filter out _PAGE_EXEC */
    return pte_exprotect(pte);
}
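
/*
 * Net effect of the filter above and the one below: on embedded MMUs with
 * HW exec support, a freshly mapped executable page that is not yet marked
 * PG_dcache_clean loses _PAGE_EXEC in set_pte_filter(); the first exec
 * fault on it then reaches set_access_flags_filter(), which flushes the
 * caches, marks the page clean and hands _PAGE_EXEC back via pte_mkexec().
 */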

static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
                     int dirty)
{
    struct page *pg;

    if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
        return pte;

    if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
        return pte;

    /* So here, we only care about exec faults, as we use them
     * to recover lost _PAGE_EXEC and perform I$/D$ coherency
     * if necessary. Also if _PAGE_EXEC is already set, same deal,
     * we just bail out
     */
    if (dirty || pte_exec(pte) || !is_exec_fault())
        return pte;

#ifdef CONFIG_DEBUG_VM
    /* So this is an exec fault, _PAGE_EXEC is not set. If it was
     * an error we would have bailed out earlier in do_page_fault()
     * but let's make sure of it
     */
    if (WARN_ON(!(vma->vm_flags & VM_EXEC)))
        return pte;
#endif /* CONFIG_DEBUG_VM */

    /* If you set _PAGE_EXEC on weird pages you're on your own */
    pg = maybe_pte_to_page(pte);
    if (unlikely(!pg))
        goto bail;

    /* If the page is already clean, we move on */
    if (test_bit(PG_dcache_clean, &pg->flags))
        goto bail;

    /* Clean the page and set PG_dcache_clean */
    flush_dcache_icache_page(pg);
    set_bit(PG_dcache_clean, &pg->flags);

 bail:
    return pte_mkexec(pte);
}

/*
 * set_pte stores a linux PTE into the linux page table.
 */
void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
        pte_t pte)
{
    /*
     * Make sure hardware valid bit is not set. We don't do
     * tlb flush for this update.
     */
    VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));

    /* Note: mm->context.id might not yet have been assigned as
     * this context might not have been activated yet when this
     * is called.
     */
    pte = set_pte_filter(pte);

    /* Perform the setting of the PTE */
    __set_pte_at(mm, addr, ptep, pte, 0);
}
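
/*
 * Minimal usage sketch (hypothetical caller, for illustration only):
 * establish a mapping at 'addr' while holding the PTE lock, e.g.
 *
 *    ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
 *    set_pte_at(mm, addr, ptep, mk_pte(page, vma->vm_page_prot));
 *    pte_unmap_unlock(ptep, ptl);
 */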

void unmap_kernel_page(unsigned long va)
{
    pmd_t *pmdp = pmd_off_k(va);
    pte_t *ptep = pte_offset_kernel(pmdp, va);

    pte_clear(&init_mm, va, ptep);
    flush_tlb_kernel_range(va, va + PAGE_SIZE);
}

/*
 * This is called when relaxing access to a PTE. It's also called in the page
 * fault path when we don't hit any of the major fault cases, i.e., a minor
 * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc. The generic code will have
 * handled those two for us; we additionally deal with missing execute
 * permission here on some processors.
 */
int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
              pte_t *ptep, pte_t entry, int dirty)
{
    int changed;

    entry = set_access_flags_filter(entry, vma, dirty);
    changed = !pte_same(*(ptep), entry);
    if (changed) {
        assert_pte_locked(vma->vm_mm, address);
        __ptep_set_access_flags(vma, ptep, entry,
                    address, mmu_virtual_psize);
    }
    return changed;
}
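
/*
 * Sketch of the kind of minor-fault update this serves (modelled on the
 * generic fault path; the names here are illustrative, not taken from this
 * file):
 *
 *    entry = pte_mkyoung(*ptep);
 *    if (write_fault)
 *        entry = pte_mkdirty(entry);
 *    if (ptep_set_access_flags(vma, address, ptep, entry, write_fault))
 *        update_mmu_cache(vma, address, ptep);
 */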

#ifdef CONFIG_HUGETLB_PAGE
int huge_ptep_set_access_flags(struct vm_area_struct *vma,
                   unsigned long addr, pte_t *ptep,
                   pte_t pte, int dirty)
{
#ifdef HUGETLB_NEED_PRELOAD
    /*
     * The "return 1" forces a call of update_mmu_cache, which will write a
     * TLB entry.  Without this, platforms that don't do a write of the TLB
     * entry in the TLB miss handler asm will fault ad infinitum.
     */
    ptep_set_access_flags(vma, addr, ptep, pte, dirty);
    return 1;
#else
    int changed, psize;

    pte = set_access_flags_filter(pte, vma, dirty);
    changed = !pte_same(*(ptep), pte);
    if (changed) {

#ifdef CONFIG_PPC_BOOK3S_64
        struct hstate *h = hstate_vma(vma);

        psize = hstate_get_psize(h);
#ifdef CONFIG_DEBUG_VM
        assert_spin_locked(huge_pte_lockptr(h, vma->vm_mm, ptep));
#endif

#else
        /*
         * Not used on non book3s64 platforms.
         * 8xx compares it with mmu_virtual_psize to
         * know if it is a huge page or not.
         */
        psize = MMU_PAGE_COUNT;
#endif
        __ptep_set_access_flags(vma, ptep, pte, addr, psize);
    }
    return changed;
#endif
}

#if defined(CONFIG_PPC_8xx)
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
{
    pmd_t *pmd = pmd_off(mm, addr);
    pte_basic_t val;
    pte_basic_t *entry = (pte_basic_t *)ptep;
    int num, i;

    /*
     * Make sure hardware valid bit is not set. We don't do
     * tlb flush for this update.
     */
    VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));

    pte = set_pte_filter(pte);

    val = pte_val(pte);

    num = number_of_cells_per_pte(pmd, val, 1);

    for (i = 0; i < num; i++, entry++, val += SZ_4K)
        *entry = val;
}
#endif
#endif /* CONFIG_HUGETLB_PAGE */

#ifdef CONFIG_DEBUG_VM
void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
{
    pgd_t *pgd;
    p4d_t *p4d;
    pud_t *pud;
    pmd_t *pmd;

    if (mm == &init_mm)
        return;
    pgd = mm->pgd + pgd_index(addr);
    BUG_ON(pgd_none(*pgd));
    p4d = p4d_offset(pgd, addr);
    BUG_ON(p4d_none(*p4d));
    pud = pud_offset(p4d, addr);
    BUG_ON(pud_none(*pud));
    pmd = pmd_offset(pud, addr);
    /*
     * When khugepaged collapses normal pages into a hugepage, it first sets
     * the pmd to none to force page fault/gup to take mmap_lock. After the
     * pmd has been set to none it does a pte_clear, which ends up in this
     * assertion, so if we find the pmd none, just return.
     */
    if (pmd_none(*pmd))
        return;
    BUG_ON(!pmd_present(*pmd));
    assert_spin_locked(pte_lockptr(mm, pmd));
}
#endif /* CONFIG_DEBUG_VM */

unsigned long vmalloc_to_phys(void *va)
{
    unsigned long pfn = vmalloc_to_pfn(va);

    BUG_ON(!pfn);
    return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va);
}
EXPORT_SYMBOL_GPL(vmalloc_to_phys);
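
/*
 * Illustrative use (hypothetical caller): translating a vmalloc()'d buffer
 * address into a physical address, e.g. for a hardware descriptor:
 *
 *    void *buf = vmalloc(PAGE_SIZE);
 *    unsigned long pa = vmalloc_to_phys(buf);
 */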

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page, _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against
 * teardown, and we can follow the address down to the page and take a ref
 * on it. This function needs to be called with interrupts disabled. We use
 * this variant when we have MSR[EE] = 0 but the paca->irq_soft_mask =
 * IRQS_ENABLED.
 */
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
            bool *is_thp, unsigned *hpage_shift)
{
    pgd_t *pgdp;
    p4d_t p4d, *p4dp;
    pud_t pud, *pudp;
    pmd_t pmd, *pmdp;
    pte_t *ret_pte;
    hugepd_t *hpdp = NULL;
    unsigned pdshift;

    if (hpage_shift)
        *hpage_shift = 0;

    if (is_thp)
        *is_thp = false;

    /*
     * Always operate on the local stack value. This makes sure the
     * value doesn't get updated by a parallel THP split/collapse,
     * page fault or a page unmap. The returned pte_t * is still not
     * stable, so the caller should check it for the above conditions.
     * The top level is an exception because it is folded into p4d.
     */
    pgdp = pgdir + pgd_index(ea);
    p4dp = p4d_offset(pgdp, ea);
    p4d  = READ_ONCE(*p4dp);
    pdshift = P4D_SHIFT;

    if (p4d_none(p4d))
        return NULL;

    if (p4d_is_leaf(p4d)) {
        ret_pte = (pte_t *)p4dp;
        goto out;
    }

    if (is_hugepd(__hugepd(p4d_val(p4d)))) {
        hpdp = (hugepd_t *)&p4d;
        goto out_huge;
    }

    /*
     * Even if we end up with an unmap, the pgtable will not
     * be freed, because we do an RCU free and here we have
     * interrupts disabled.
     */
    pdshift = PUD_SHIFT;
    pudp = pud_offset(&p4d, ea);
    pud  = READ_ONCE(*pudp);

    if (pud_none(pud))
        return NULL;

    if (pud_is_leaf(pud)) {
        ret_pte = (pte_t *)pudp;
        goto out;
    }

    if (is_hugepd(__hugepd(pud_val(pud)))) {
        hpdp = (hugepd_t *)&pud;
        goto out_huge;
    }

    pdshift = PMD_SHIFT;
    pmdp = pmd_offset(&pud, ea);
    pmd  = READ_ONCE(*pmdp);

    /*
     * A hugepage collapse is captured by this condition, see
     * pmdp_collapse_flush.
     */
    if (pmd_none(pmd))
        return NULL;

#ifdef CONFIG_PPC_BOOK3S_64
    /*
     * A hugepage split is captured by this condition, see
     * pmdp_invalidate.
     *
     * Huge page modification can be caught here too.
     */
    if (pmd_is_serializing(pmd))
        return NULL;
#endif

    if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
        if (is_thp)
            *is_thp = true;
        ret_pte = (pte_t *)pmdp;
        goto out;
    }

    if (pmd_is_leaf(pmd)) {
        ret_pte = (pte_t *)pmdp;
        goto out;
    }

    if (is_hugepd(__hugepd(pmd_val(pmd)))) {
        hpdp = (hugepd_t *)&pmd;
        goto out_huge;
    }

    return pte_offset_kernel(&pmd, ea);

out_huge:
    if (!hpdp)
        return NULL;

    ret_pte = hugepte_offset(*hpdp, ea, pdshift);
    pdshift = hugepd_shift(*hpdp);
out:
    if (hpage_shift)
        *hpage_shift = pdshift;
    return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);
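
/*
 * Illustrative caller (hypothetical), honouring the "interrupts disabled"
 * requirement documented above:
 *
 *    local_irq_save(flags);
 *    ptep = __find_linux_pte(mm->pgd, ea, &is_thp, &shift);
 *    if (ptep)
 *        pte = READ_ONCE(*ptep);
 *    local_irq_restore(flags);
 */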

/* Note due to the way vm flags are laid out, the bits are XWR */
const pgprot_t protection_map[16] = {
    [VM_NONE]                                   = PAGE_NONE,
    [VM_READ]                                   = PAGE_READONLY,
    [VM_WRITE]                                  = PAGE_COPY,
    [VM_WRITE | VM_READ]                        = PAGE_COPY,
    [VM_EXEC]                                   = PAGE_READONLY_X,
    [VM_EXEC | VM_READ]                         = PAGE_READONLY_X,
    [VM_EXEC | VM_WRITE]                        = PAGE_COPY_X,
    [VM_EXEC | VM_WRITE | VM_READ]              = PAGE_COPY_X,
    [VM_SHARED]                                 = PAGE_NONE,
    [VM_SHARED | VM_READ]                       = PAGE_READONLY,
    [VM_SHARED | VM_WRITE]                      = PAGE_SHARED,
    [VM_SHARED | VM_WRITE | VM_READ]            = PAGE_SHARED,
    [VM_SHARED | VM_EXEC]                       = PAGE_READONLY_X,
    [VM_SHARED | VM_EXEC | VM_READ]             = PAGE_READONLY_X,
    [VM_SHARED | VM_EXEC | VM_WRITE]            = PAGE_SHARED_X,
    [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ]  = PAGE_SHARED_X
};

#ifndef CONFIG_PPC_BOOK3S_64
DECLARE_VM_GET_PAGE_PROT
#endif
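
/*
 * DECLARE_VM_GET_PAGE_PROT provides the generic vm_get_page_prot(), which
 * (roughly) reduces to indexing the table above with the XWR/shared bits:
 *
 *    pgprot_t vm_get_page_prot(unsigned long vm_flags)
 *    {
 *        return protection_map[vm_flags &
 *            (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)];
 *    }
 *
 * Book3S-64 supplies its own vm_get_page_prot() instead, hence the #ifndef.
 */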