// SPDX-License-Identifier: GPL-2.0
/*
 *  IBM System z Huge TLB Page Support for Kernel.
 *
 *    Copyright IBM Corp. 2007,2020
 *    Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com>
 */

#define KMSG_COMPONENT "hugetlb"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <asm/pgalloc.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/sched/mm.h>
#include <linux/security.h>

/*
 * If the bit selected by single-bit bitmask "a" is set within "x", move
 * it to the position indicated by single-bit bitmask "b".
 */
#define move_set_bit(x, a, b)   (((x) & (a)) >> ilog2(a) << ilog2(b))
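/*
 * Example with illustrative masks (not the real s390 bit layout):
 * with a = 0x0010 and b = 0x0400, move_set_bit(x, a, b) expands to
 * ((x & 0x0010) >> 4) << 10, which yields 0x0400 if bit 4 of x is set
 * and 0 otherwise.
 */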

static inline unsigned long __pte_to_rste(pte_t pte)
{
    unsigned long rste;

    /*
     * Convert encoding           pte bits       pmd / pud bits
     *                            lIR.uswrdy.p   dy..R...I...wr
     * empty                      010.000000.0 -> 00..0...1...00
     * prot-none, clean, old      111.000000.1 -> 00..1...1...00
     * prot-none, clean, young    111.000001.1 -> 01..1...1...00
     * prot-none, dirty, old      111.000010.1 -> 10..1...1...00
     * prot-none, dirty, young    111.000011.1 -> 11..1...1...00
     * read-only, clean, old      111.000100.1 -> 00..1...1...01
     * read-only, clean, young    101.000101.1 -> 01..1...0...01
     * read-only, dirty, old      111.000110.1 -> 10..1...1...01
     * read-only, dirty, young    101.000111.1 -> 11..1...0...01
     * read-write, clean, old     111.001100.1 -> 00..1...1...11
     * read-write, clean, young   101.001101.1 -> 01..1...0...11
     * read-write, dirty, old     110.001110.1 -> 10..0...1...11
     * read-write, dirty, young   100.001111.1 -> 11..0...0...11
     * HW-bits: R read-only, I invalid
     * SW-bits: p present, y young, d dirty, r read, w write, s special,
     *          u unused, l large
     */
    if (pte_present(pte)) {
        rste = pte_val(pte) & PAGE_MASK;
        rste |= move_set_bit(pte_val(pte), _PAGE_READ,
                             _SEGMENT_ENTRY_READ);
        rste |= move_set_bit(pte_val(pte), _PAGE_WRITE,
                             _SEGMENT_ENTRY_WRITE);
        rste |= move_set_bit(pte_val(pte), _PAGE_INVALID,
                             _SEGMENT_ENTRY_INVALID);
        rste |= move_set_bit(pte_val(pte), _PAGE_PROTECT,
                             _SEGMENT_ENTRY_PROTECT);
        rste |= move_set_bit(pte_val(pte), _PAGE_DIRTY,
                             _SEGMENT_ENTRY_DIRTY);
        rste |= move_set_bit(pte_val(pte), _PAGE_YOUNG,
                             _SEGMENT_ENTRY_YOUNG);
#ifdef CONFIG_MEM_SOFT_DIRTY
        rste |= move_set_bit(pte_val(pte), _PAGE_SOFT_DIRTY,
                             _SEGMENT_ENTRY_SOFT_DIRTY);
#endif
        rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC,
                             _SEGMENT_ENTRY_NOEXEC);
    } else
        rste = _SEGMENT_ENTRY_EMPTY;
    return rste;
}

static inline pte_t __rste_to_pte(unsigned long rste)
{
    unsigned long pteval;
    int present;

    if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
        present = pud_present(__pud(rste));
    else
        present = pmd_present(__pmd(rste));

    /*
     * Convert encoding           pmd / pud bits     pte bits
     *                            dy..R...I...wr     lIR.uswrdy.p
     * empty                      00..0...1...00 -> 010.000000.0
     * prot-none, clean, old      00..1...1...00 -> 111.000000.1
     * prot-none, clean, young    01..1...1...00 -> 111.000001.1
     * prot-none, dirty, old      10..1...1...00 -> 111.000010.1
     * prot-none, dirty, young    11..1...1...00 -> 111.000011.1
     * read-only, clean, old      00..1...1...01 -> 111.000100.1
     * read-only, clean, young    01..1...0...01 -> 101.000101.1
     * read-only, dirty, old      10..1...1...01 -> 111.000110.1
     * read-only, dirty, young    11..1...0...01 -> 101.000111.1
     * read-write, clean, old     00..1...1...11 -> 111.001100.1
     * read-write, clean, young   01..1...0...11 -> 101.001101.1
     * read-write, dirty, old     10..0...1...11 -> 110.001110.1
     * read-write, dirty, young   11..0...0...11 -> 100.001111.1
     * HW-bits: R read-only, I invalid
     * SW-bits: p present, y young, d dirty, r read, w write, s special,
     *          u unused, l large
     */
    if (present) {
        pteval = rste & _SEGMENT_ENTRY_ORIGIN_LARGE;
        pteval |= _PAGE_LARGE | _PAGE_PRESENT;
        pteval |= move_set_bit(rste, _SEGMENT_ENTRY_READ, _PAGE_READ);
        pteval |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, _PAGE_WRITE);
        pteval |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, _PAGE_INVALID);
        pteval |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, _PAGE_PROTECT);
        pteval |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, _PAGE_DIRTY);
        pteval |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, _PAGE_YOUNG);
#ifdef CONFIG_MEM_SOFT_DIRTY
        pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY);
#endif
        pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC);
    } else
        pteval = _PAGE_INVALID;
    return __pte(pteval);
}

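/*
 * If the mm uses storage keys and the entry is valid, initialize the
 * storage keys of the whole huge page frame. PG_arch_1 on the page is
 * used to make sure this is done only once per page.
 */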
static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste)
{
    struct page *page;
    unsigned long size, paddr;

    if (!mm_uses_skeys(mm) ||
        rste & _SEGMENT_ENTRY_INVALID)
        return;

    if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
        page = pud_page(__pud(rste));
        size = PUD_SIZE;
        paddr = rste & PUD_MASK;
    } else {
        page = pmd_page(__pmd(rste));
        size = PMD_SIZE;
        paddr = rste & PMD_MASK;
    }

    if (!test_and_set_bit(PG_arch_1, &page->flags))
        __storage_key_init_range(paddr, paddr + size - 1);
}

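/*
 * Install a hugetlb mapping: convert the pte to segment / region-third
 * table entry format, drop the noexec bit if the machine does not support
 * it, and mark the entry as a large one before writing it.
 */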
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
             pte_t *ptep, pte_t pte)
{
    unsigned long rste;

    rste = __pte_to_rste(pte);
    if (!MACHINE_HAS_NX)
        rste &= ~_SEGMENT_ENTRY_NOEXEC;

    /* Set correct table type for 2G hugepages */
    if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
        if (likely(pte_present(pte)))
            rste |= _REGION3_ENTRY_LARGE;
        rste |= _REGION_ENTRY_TYPE_R3;
    } else if (likely(pte_present(pte)))
        rste |= _SEGMENT_ENTRY_LARGE;

    clear_huge_pte_skeys(mm, rste);
    set_pte(ptep, __pte(rste));
}

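/*
 * Read a hugetlb mapping: convert the segment / region-third table entry
 * back into the generic pte format expected by common code.
 */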
pte_t huge_ptep_get(pte_t *ptep)
{
    return __rste_to_pte(pte_val(*ptep));
}

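/*
 * Return the current mapping as a pte and replace the entry with an empty
 * segment or region-third table entry, depending on the mapping level.
 */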
pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
                  unsigned long addr, pte_t *ptep)
{
    pte_t pte = huge_ptep_get(ptep);
    pmd_t *pmdp = (pmd_t *) ptep;
    pud_t *pudp = (pud_t *) ptep;

    if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
        pudp_xchg_direct(mm, addr, pudp, __pud(_REGION3_ENTRY_EMPTY));
    else
        pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
    return pte;
}

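/*
 * Allocate the page table levels needed for a hugetlb mapping of size sz:
 * for PUD_SIZE (2 GB) pages the pud entry itself is used, for PMD_SIZE
 * (1 MB) pages a pmd is allocated. Returns NULL if an allocation fails.
 */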
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
            unsigned long addr, unsigned long sz)
{
    pgd_t *pgdp;
    p4d_t *p4dp;
    pud_t *pudp;
    pmd_t *pmdp = NULL;

    pgdp = pgd_offset(mm, addr);
    p4dp = p4d_alloc(mm, pgdp, addr);
    if (p4dp) {
        pudp = pud_alloc(mm, p4dp, addr);
        if (pudp) {
            if (sz == PUD_SIZE)
                return (pte_t *) pudp;
            else if (sz == PMD_SIZE)
                pmdp = pmd_alloc(mm, pudp, addr);
        }
    }
    return (pte_t *) pmdp;
}

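/*
 * Walk the page tables without allocating anything and return a pointer
 * to the entry mapping addr: the pud entry for a 2 GB mapping, otherwise
 * the pmd entry, or NULL if an intermediate level is not present.
 */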
pte_t *huge_pte_offset(struct mm_struct *mm,
               unsigned long addr, unsigned long sz)
{
    pgd_t *pgdp;
    p4d_t *p4dp;
    pud_t *pudp;
    pmd_t *pmdp = NULL;

    pgdp = pgd_offset(mm, addr);
    if (pgd_present(*pgdp)) {
        p4dp = p4d_offset(pgdp, addr);
        if (p4d_present(*p4dp)) {
            pudp = pud_offset(p4dp, addr);
            if (pud_present(*pudp)) {
                if (pud_large(*pudp))
                    return (pte_t *) pudp;
                pmdp = pmd_offset(pudp, addr);
            }
        }
    }
    return (pte_t *) pmdp;
}

int pmd_huge(pmd_t pmd)
{
    return pmd_large(pmd);
}

int pud_huge(pud_t pud)
{
    return pud_large(pud);
}

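/*
 * Resolve an address within a 2 GB hugetlb mapping to its struct page.
 * Taking a reference (FOLL_GET) is not supported here, so such requests
 * return NULL.
 */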
struct page *
follow_huge_pud(struct mm_struct *mm, unsigned long address,
        pud_t *pud, int flags)
{
    if (flags & FOLL_GET)
        return NULL;

    return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
}

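/*
 * 1 MB (PMD_SIZE) huge pages need the EDAT1 facility, 2 GB (PUD_SIZE)
 * huge pages need the EDAT2 facility; all other sizes are rejected.
 */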
bool __init arch_hugetlb_valid_size(unsigned long size)
{
    if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
        return true;
    else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE)
        return true;
    else
        return false;
}

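/*
 * Find a free, hugepage-aligned address range by searching upwards from
 * the mmap base towards TASK_SIZE.
 */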
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
        unsigned long addr, unsigned long len,
        unsigned long pgoff, unsigned long flags)
{
    struct hstate *h = hstate_file(file);
    struct vm_unmapped_area_info info;

    info.flags = 0;
    info.length = len;
    info.low_limit = current->mm->mmap_base;
    info.high_limit = TASK_SIZE;
    info.align_mask = PAGE_MASK & ~huge_page_mask(h);
    info.align_offset = 0;
    return vm_unmapped_area(&info);
}

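/*
 * Find a free, hugepage-aligned address range by searching downwards from
 * the mmap base, falling back to a bottom-up search if that fails.
 */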
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
        unsigned long addr0, unsigned long len,
        unsigned long pgoff, unsigned long flags)
{
    struct hstate *h = hstate_file(file);
    struct vm_unmapped_area_info info;
    unsigned long addr;

    info.flags = VM_UNMAPPED_AREA_TOPDOWN;
    info.length = len;
    info.low_limit = max(PAGE_SIZE, mmap_min_addr);
    info.high_limit = current->mm->mmap_base;
    info.align_mask = PAGE_MASK & ~huge_page_mask(h);
    info.align_offset = 0;
    addr = vm_unmapped_area(&info);

    /*
     * A failed mmap() very likely causes application failure,
     * so fall back to the bottom-up function here. This scenario
     * can happen with large stack limits and large mmap()
     * allocations.
     */
    if (addr & ~PAGE_MASK) {
        VM_BUG_ON(addr != -ENOMEM);
        info.flags = 0;
        info.low_limit = TASK_UNMAPPED_BASE;
        info.high_limit = TASK_SIZE;
        addr = vm_unmapped_area(&info);
    }

    return addr;
}

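/*
 * Arch implementation of the hugetlb get_unmapped_area hook: check length
 * and limits, honor MAP_FIXED and a usable address hint, otherwise search
 * bottom-up or top-down depending on the mm layout, and finally run the
 * result through check_asce_limit().
 */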
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
        unsigned long len, unsigned long pgoff, unsigned long flags)
{
    struct hstate *h = hstate_file(file);
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma;

    if (len & ~huge_page_mask(h))
        return -EINVAL;
    if (len > TASK_SIZE - mmap_min_addr)
        return -ENOMEM;

    if (flags & MAP_FIXED) {
        if (prepare_hugepage_range(file, addr, len))
            return -EINVAL;
        goto check_asce_limit;
    }

    if (addr) {
        addr = ALIGN(addr, huge_page_size(h));
        vma = find_vma(mm, addr);
        if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
            (!vma || addr + len <= vm_start_gap(vma)))
            goto check_asce_limit;
    }

    if (mm->get_unmapped_area == arch_get_unmapped_area)
        addr = hugetlb_get_unmapped_area_bottomup(file, addr, len,
                pgoff, flags);
    else
        addr = hugetlb_get_unmapped_area_topdown(file, addr, len,
                pgoff, flags);
    if (offset_in_page(addr))
        return addr;

check_asce_limit:
    return check_asce_limit(mm, addr, len);
}