0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  *  Page table allocation functions
0004  *
0005  *    Copyright IBM Corp. 2016
0006  *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
0007  */
0008 
0009 #include <linux/sysctl.h>
0010 #include <linux/slab.h>
0011 #include <linux/mm.h>
0012 #include <asm/mmu_context.h>
0013 #include <asm/pgalloc.h>
0014 #include <asm/gmap.h>
0015 #include <asm/tlb.h>
0016 #include <asm/tlbflush.h>
0017 
0018 #ifdef CONFIG_PGSTE
0019 
0020 int page_table_allocate_pgste = 0;
0021 EXPORT_SYMBOL(page_table_allocate_pgste);
0022 
0023 static struct ctl_table page_table_sysctl[] = {
0024     {
0025         .procname   = "allocate_pgste",
0026         .data       = &page_table_allocate_pgste,
0027         .maxlen     = sizeof(int),
0028         .mode       = S_IRUGO | S_IWUSR,
0029         .proc_handler   = proc_dointvec_minmax,
0030         .extra1     = SYSCTL_ZERO,
0031         .extra2     = SYSCTL_ONE,
0032     },
0033     { }
0034 };
0035 
0036 static struct ctl_table page_table_sysctl_dir[] = {
0037     {
0038         .procname   = "vm",
0039         .maxlen     = 0,
0040         .mode       = 0555,
0041         .child      = page_table_sysctl,
0042     },
0043     { }
0044 };
0045 
0046 static int __init page_table_register_sysctl(void)
0047 {
0048     return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
0049 }
0050 __initcall(page_table_register_sysctl);
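/*
 * The "allocate_pgste" entry above sits below the "vm" directory, so after
 * registration it is reachable as /proc/sys/vm/allocate_pgste.
 * proc_dointvec_minmax() together with SYSCTL_ZERO/SYSCTL_ONE restricts the
 * accepted values to 0 and 1, e.g. "echo 1 > /proc/sys/vm/allocate_pgste"
 * (as root) enables allocation of page tables with PGSTEs.
 */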
0051 
0052 #endif /* CONFIG_PGSTE */
0053 
0054 unsigned long *crst_table_alloc(struct mm_struct *mm)
0055 {
0056     struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
0057 
0058     if (!page)
0059         return NULL;
0060     arch_set_page_dat(page, CRST_ALLOC_ORDER);
0061     return (unsigned long *) page_to_virt(page);
0062 }
0063 
0064 void crst_table_free(struct mm_struct *mm, unsigned long *table)
0065 {
0066     free_pages((unsigned long)table, CRST_ALLOC_ORDER);
0067 }
0068 
0069 static void __crst_table_upgrade(void *arg)
0070 {
0071     struct mm_struct *mm = arg;
0072 
0073     /* change all active ASCEs to avoid the creation of new TLBs */
0074     if (current->active_mm == mm) {
0075         S390_lowcore.user_asce = mm->context.asce;
0076         __ctl_load(S390_lowcore.user_asce, 7, 7);
0077     }
0078     __tlb_flush_local();
0079 }
0080 
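/*
 * Sketch of the flow below: crst_table_upgrade() adds region table levels
 * until the address space covers "end". If asce_limit is _REGION2_SIZE and
 * end exceeds _REGION1_SIZE, both a new p4d and a new pgd level are
 * installed in a single call, going straight from a 3-level to a 5-level
 * layout. Afterwards __crst_table_upgrade() runs on every CPU so that CPUs
 * currently running with this mm reload the new ASCE.
 */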
0081 int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
0082 {
0083     unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
0084     unsigned long asce_limit = mm->context.asce_limit;
0085 
0086     /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
0087     VM_BUG_ON(asce_limit < _REGION2_SIZE);
0088 
0089     if (end <= asce_limit)
0090         return 0;
0091 
0092     if (asce_limit == _REGION2_SIZE) {
0093         p4d = crst_table_alloc(mm);
0094         if (unlikely(!p4d))
0095             goto err_p4d;
0096         crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
0097     }
0098     if (end > _REGION1_SIZE) {
0099         pgd = crst_table_alloc(mm);
0100         if (unlikely(!pgd))
0101             goto err_pgd;
0102         crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
0103     }
0104 
0105     spin_lock_bh(&mm->page_table_lock);
0106 
0107     /*
0108      * This routine gets called with mmap_lock held and there is
0109      * no reason to optimize for the case where it is not. However,
0110      * if that would ever change, the check below will let us know.
0111      */
0112     VM_BUG_ON(asce_limit != mm->context.asce_limit);
0113 
0114     if (p4d) {
0115         __pgd = (unsigned long *) mm->pgd;
0116         p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
0117         mm->pgd = (pgd_t *) p4d;
0118         mm->context.asce_limit = _REGION1_SIZE;
0119         mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
0120             _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
0121         mm_inc_nr_puds(mm);
0122     }
0123     if (pgd) {
0124         __pgd = (unsigned long *) mm->pgd;
0125         pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
0126         mm->pgd = (pgd_t *) pgd;
0127         mm->context.asce_limit = TASK_SIZE_MAX;
0128         mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
0129             _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
0130     }
0131 
0132     spin_unlock_bh(&mm->page_table_lock);
0133 
0134     on_each_cpu(__crst_table_upgrade, mm, 0);
0135 
0136     return 0;
0137 
0138 err_pgd:
0139     crst_table_free(mm, p4d);
0140 err_p4d:
0141     return -ENOMEM;
0142 }
0143 
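/*
 * Atomically XOR @bits into @v and return the new value. The page table
 * code below uses this to flip the allocation (AA) and pending-removal (PP)
 * bits in the upper byte of page->_refcount without taking a lock; see the
 * comment ahead of page_table_alloc().
 */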
0144 static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
0145 {
0146     unsigned int old, new;
0147 
0148     do {
0149         old = atomic_read(v);
0150         new = old ^ bits;
0151     } while (atomic_cmpxchg(v, old, new) != old);
0152     return new;
0153 }
0154 
0155 #ifdef CONFIG_PGSTE
0156 
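/*
 * A page table with PGSTEs occupies a full 4K page: the first half holds
 * the PTRS_PER_PTE pte entries (set to _PAGE_INVALID below), the second
 * half holds the corresponding page status table entries (zeroed).
 */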
0157 struct page *page_table_alloc_pgste(struct mm_struct *mm)
0158 {
0159     struct page *page;
0160     u64 *table;
0161 
0162     page = alloc_page(GFP_KERNEL);
0163     if (page) {
0164         table = (u64 *)page_to_virt(page);
0165         memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
0166         memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
0167     }
0168     return page;
0169 }
0170 
0171 void page_table_free_pgste(struct page *page)
0172 {
0173     __free_page(page);
0174 }
0175 
0176 #endif /* CONFIG_PGSTE */
0177 
0178 /*
0179  * A 2KB-pgtable is either the upper or the lower half of a normal page.
0180  * The second half of the page may be unused or used as another
0181  * 2KB-pgtable.
0182  *
0183  * Whenever possible the parent page for a new 2KB-pgtable is picked
0184  * from the list of partially allocated pages, mm_context_t::pgtable_list.
0185  * In case the list is empty a new parent page is allocated and added to
0186  * the list.
0187  *
0188  * When a parent page gets fully allocated it contains 2KB-pgtables in both
0189  * upper and lower halves and is removed from mm_context_t::pgtable_list.
0190  *
0191  * When a 2KB-pgtable is freed from a fully allocated parent page, that
0192  * page turns partially allocated and is added to mm_context_t::pgtable_list.
0193  *
0194  * If a 2KB-pgtable is freed from a partially allocated parent page, that
0195  * page turns unused and is removed from mm_context_t::pgtable_list.
0196  * Furthermore, the unused parent page is released.
0197  *
0198  * It follows from the above that no unallocated or fully allocated parent
0199  * pages are contained in mm_context_t::pgtable_list.
0200  *
0201  * The upper byte (bits 24-31) of the parent page _refcount is used
0202  * for tracking contained 2KB-pgtables and has the following format:
0203  *
0204  *   PP  AA
0205  * 01234567    upper byte (bits 24-31) of struct page::_refcount
0206  *   ||  ||
0207  *   ||  |+--- upper 2KB-pgtable is allocated
0208  *   ||  +---- lower 2KB-pgtable is allocated
0209  *   |+------- upper 2KB-pgtable is pending for removal
0210  *   +-------- lower 2KB-pgtable is pending for removal
0211  *
0212  * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
0213  * using _refcount is possible).
0214  *
0215  * When a 2KB-pgtable is allocated, the corresponding AA bit is set to 1.
0216  * The parent page is either:
0217  *   - added to mm_context_t::pgtable_list in case the second half of the
0218  *     parent page is still unallocated;
0219  *   - removed from mm_context_t::pgtable_list in case both halves of the
0220  *     parent page are allocated;
0221  * These operations are protected with mm_context_t::lock.
0222  *
0223  * When a 2KB-pgtable is deallocated, the corresponding AA bit is set to 0
0224  * and the corresponding PP bit is set to 1 in a single atomic operation.
0225  * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
0226  * exclusive and may never be both set to 1!
0227  * The parent page is either:
0228  *   - added to mm_context_t::pgtable_list in case the second half of the
0229  *     parent page is still allocated;
0230  *   - removed from mm_context_t::pgtable_list in case the second half of
0231  *     the parent page is unallocated;
0232  * These operations are protected with mm_context_t::lock.
0233  *
0234  * It is important to understand that mm_context_t::lock only protects
0235  * mm_context_t::pgtable_list and AA bits, but not the parent page itself
0236  * and PP bits.
0237  *
0238  * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
0239  * while both AA bits and the second PP bit are already unset. Then the
0240  * parent page does not contain any 2KB-pgtable fragment anymore, and it has
0241  * also been removed from mm_context_t::pgtable_list. It is therefore safe
0242  * to release the page.
0243  *
0244  * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
0245  * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
0246  * while the PP bits are never used, nor is such a page added to or removed
0247  * from mm_context_t::pgtable_list.
0248  */
0249 unsigned long *page_table_alloc(struct mm_struct *mm)
0250 {
0251     unsigned long *table;
0252     struct page *page;
0253     unsigned int mask, bit;
0254 
0255     /* Try to get a fragment of a 4K page as a 2K page table */
0256     if (!mm_alloc_pgste(mm)) {
0257         table = NULL;
0258         spin_lock_bh(&mm->context.lock);
0259         if (!list_empty(&mm->context.pgtable_list)) {
0260             page = list_first_entry(&mm->context.pgtable_list,
0261                         struct page, lru);
0262             mask = atomic_read(&page->_refcount) >> 24;
0263             /*
0264              * The pending removal bits must also be checked.
0265              * Failure to do so might lead to an impossible
0266              * value (e.g. 0x13 or 0x23) being written to _refcount.
0267              * Such values violate the assumption that pending and
0268              * allocation bits are mutually exclusive, and the rest
0269              * of the code would go off the rails as a result. That
0270              * could lead to a whole bunch of races and corruptions.
0271              */
0272             mask = (mask | (mask >> 4)) & 0x03U;
0273             if (mask != 0x03U) {
0274                 table = (unsigned long *) page_to_virt(page);
0275                 bit = mask & 1;     /* =1 -> second 2K */
0276                 if (bit)
0277                     table += PTRS_PER_PTE;
0278                 atomic_xor_bits(&page->_refcount,
0279                             0x01U << (bit + 24));
0280                 list_del(&page->lru);
0281             }
0282         }
0283         spin_unlock_bh(&mm->context.lock);
0284         if (table)
0285             return table;
0286     }
0287     /* Allocate a fresh page */
0288     page = alloc_page(GFP_KERNEL);
0289     if (!page)
0290         return NULL;
0291     if (!pgtable_pte_page_ctor(page)) {
0292         __free_page(page);
0293         return NULL;
0294     }
0295     arch_set_page_dat(page, 0);
0296     /* Initialize page table */
0297     table = (unsigned long *) page_to_virt(page);
0298     if (mm_alloc_pgste(mm)) {
0299         /* Return 4K page table with PGSTEs */
0300         atomic_xor_bits(&page->_refcount, 0x03U << 24);
0301         memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
0302         memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
0303     } else {
0304         /* Return the first 2K fragment of the page */
0305         atomic_xor_bits(&page->_refcount, 0x01U << 24);
0306         memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
0307         spin_lock_bh(&mm->context.lock);
0308         list_add(&page->lru, &mm->context.pgtable_list);
0309         spin_unlock_bh(&mm->context.lock);
0310     }
0311     return table;
0312 }
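/*
 * Illustrative walk-through (derived from page_table_alloc() above and
 * page_table_free() below) of the upper _refcount byte for a non-PGSTE mm:
 *
 *   lower 2K handed out from a fresh page:  0x00 -> 0x01, page is put on
 *                                           mm_context_t::pgtable_list
 *   upper 2K handed out as well:            0x01 -> 0x03, page is removed
 *                                           from the list
 *   lower 2K freed again:                   0x03 -> 0x12 (AA cleared, PP
 *                                           set), then 0x12 -> 0x02 once
 *                                           the pending bit is cleared;
 *                                           page goes back on the list
 *   upper 2K freed as well:                 0x02 -> 0x20 -> 0x00, the page
 *                                           is released
 */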
0313 
0314 static void page_table_release_check(struct page *page, void *table,
0315                      unsigned int half, unsigned int mask)
0316 {
0317     char msg[128];
0318 
0319     if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask)
0320         return;
0321     snprintf(msg, sizeof(msg),
0322          "Invalid pgtable %p release half 0x%02x mask 0x%02x",
0323          table, half, mask);
0324     dump_page(page, msg);
0325 }
0326 
0327 void page_table_free(struct mm_struct *mm, unsigned long *table)
0328 {
0329     unsigned int mask, bit, half;
0330     struct page *page;
0331 
0332     page = virt_to_page(table);
0333     if (!mm_alloc_pgste(mm)) {
0334         /* Free 2K page table fragment of a 4K page */
0335         bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
0336         spin_lock_bh(&mm->context.lock);
0337         /*
0338          * Mark the page for delayed release. The actual release
0339          * will happen outside of the critical section from this
0340          * function or from __tlb_remove_table()
0341          */
0342         mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
0343         mask >>= 24;
0344         if (mask & 0x03U)
0345             list_add(&page->lru, &mm->context.pgtable_list);
0346         else
0347             list_del(&page->lru);
0348         spin_unlock_bh(&mm->context.lock);
0349         mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
0350         mask >>= 24;
0351         if (mask != 0x00U)
0352             return;
0353         half = 0x01U << bit;
0354     } else {
0355         half = 0x03U;
0356         mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
0357         mask >>= 24;
0358     }
0359 
0360     page_table_release_check(page, table, half, mask);
0361     pgtable_pte_page_dtor(page);
0362     __free_page(page);
0363 }
0364 
0365 void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
0366              unsigned long vmaddr)
0367 {
0368     struct mm_struct *mm;
0369     struct page *page;
0370     unsigned int bit, mask;
0371 
0372     mm = tlb->mm;
0373     page = virt_to_page(table);
0374     if (mm_alloc_pgste(mm)) {
0375         gmap_unlink(mm, table, vmaddr);
0376         table = (unsigned long *) ((unsigned long)table | 0x03U);
0377         tlb_remove_table(tlb, table);
0378         return;
0379     }
0380     bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
0381     spin_lock_bh(&mm->context.lock);
0382     /*
0383      * Mark the page for delayed release. The actual release will happen
0384      * outside of the critical section from __tlb_remove_table() or from
0385      * page_table_free()
0386      */
0387     mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
0388     mask >>= 24;
0389     if (mask & 0x03U)
0390         list_add_tail(&page->lru, &mm->context.pgtable_list);
0391     else
0392         list_del(&page->lru);
0393     spin_unlock_bh(&mm->context.lock);
0394     table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
0395     tlb_remove_table(tlb, table);
0396 }
0397 
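/*
 * The two low bits of the pointer handed to tlb_remove_table() encode what
 * is being freed (see page_table_free_rcu() above): 0x00 for a CRST
 * (pmd/pud/p4d) table, 0x01/0x02 for the lower/upper 2K half of a 4K page
 * table, and 0x03 for a 4K page table with PGSTEs. The bits are masked off
 * again here before the memory is actually released.
 */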
0398 void __tlb_remove_table(void *_table)
0399 {
0400     unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
0401     void *table = (void *)((unsigned long) _table ^ mask);
0402     struct page *page = virt_to_page(table);
0403 
0404     switch (half) {
0405     case 0x00U: /* pmd, pud, or p4d */
0406         free_pages((unsigned long)table, CRST_ALLOC_ORDER);
0407         return;
0408     case 0x01U: /* lower 2K of a 4K page table */
0409     case 0x02U: /* higher 2K of a 4K page table */
0410         mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
0411         mask >>= 24;
0412         if (mask != 0x00U)
0413             return;
0414         break;
0415     case 0x03U: /* 4K page table with pgstes */
0416         mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
0417         mask >>= 24;
0418         break;
0419     }
0420 
0421     page_table_release_check(page, table, half, mask);
0422     pgtable_pte_page_dtor(page);
0423     __free_page(page);
0424 }
0425 
0426 /*
0427  * Base infrastructure required to generate basic asces, region, segment,
0428  * and page tables that do not make use of enhanced features like EDAT1.
0429  */
0430 
0431 static struct kmem_cache *base_pgt_cache;
0432 
0433 static unsigned long *base_pgt_alloc(void)
0434 {
0435     unsigned long *table;
0436 
0437     table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
0438     if (table)
0439         memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
0440     return table;
0441 }
0442 
0443 static void base_pgt_free(unsigned long *table)
0444 {
0445     kmem_cache_free(base_pgt_cache, table);
0446 }
0447 
0448 static unsigned long *base_crst_alloc(unsigned long val)
0449 {
0450     unsigned long *table;
0451 
0452     table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
0453     if (table)
0454         crst_table_init(table, val);
0455     return table;
0456 }
0457 
0458 static void base_crst_free(unsigned long *table)
0459 {
0460     free_pages((unsigned long)table, CRST_ALLOC_ORDER);
0461 }
0462 
0463 #define BASE_ADDR_END_FUNC(NAME, SIZE)                  \
0464 static inline unsigned long base_##NAME##_addr_end(unsigned long addr,  \
0465                            unsigned long end)   \
0466 {                                   \
0467     unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1);       \
0468                                     \
0469     return (next - 1) < (end - 1) ? next : end;         \
0470 }
0471 
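/*
 * For reference, BASE_ADDR_END_FUNC(page, _PAGE_SIZE) below expands to
 * (sketch):
 *
 *   static inline unsigned long base_page_addr_end(unsigned long addr,
 *                                                  unsigned long end)
 *   {
 *           unsigned long next = (addr + (_PAGE_SIZE)) & ~((_PAGE_SIZE) - 1);
 *
 *           return (next - 1) < (end - 1) ? next : end;
 *   }
 *
 * i.e. addr is rounded up to the next SIZE boundary, but never past end;
 * the "- 1" comparison handles a next value that wraps past the top of the
 * address space (it then returns end).
 */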
0472 BASE_ADDR_END_FUNC(page,    _PAGE_SIZE)
0473 BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
0474 BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
0475 BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
0476 BASE_ADDR_END_FUNC(region1, _REGION1_SIZE)
0477 
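/*
 * base_lra() executes the LRA (load real address) instruction: it translates
 * @address through the currently attached DAT tables and returns the real
 * address. base_page_walk() stores that value in the new pte entries, so the
 * generated tables map each virtual address to the same real page as the
 * current mapping does.
 */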
0478 static inline unsigned long base_lra(unsigned long address)
0479 {
0480     unsigned long real;
0481 
0482     asm volatile(
0483         "   lra %0,0(%1)\n"
0484         : "=d" (real) : "a" (address) : "cc");
0485     return real;
0486 }
0487 
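/*
 * Each of the base_*_walk() helpers below handles one translation level.
 * With alloc != 0, missing lower-level tables are allocated and the pte
 * entries are filled in; with alloc == 0 the same range is walked again on
 * behalf of base_asce_free() and the lower-level tables are released
 * instead.
 */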
0488 static int base_page_walk(unsigned long *origin, unsigned long addr,
0489               unsigned long end, int alloc)
0490 {
0491     unsigned long *pte, next;
0492 
0493     if (!alloc)
0494         return 0;
0495     pte = origin;
0496     pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
0497     do {
0498         next = base_page_addr_end(addr, end);
0499         *pte = base_lra(addr);
0500     } while (pte++, addr = next, addr < end);
0501     return 0;
0502 }
0503 
0504 static int base_segment_walk(unsigned long *origin, unsigned long addr,
0505                  unsigned long end, int alloc)
0506 {
0507     unsigned long *ste, next, *table;
0508     int rc;
0509 
0510     ste = origin;
0511     ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
0512     do {
0513         next = base_segment_addr_end(addr, end);
0514         if (*ste & _SEGMENT_ENTRY_INVALID) {
0515             if (!alloc)
0516                 continue;
0517             table = base_pgt_alloc();
0518             if (!table)
0519                 return -ENOMEM;
0520             *ste = __pa(table) | _SEGMENT_ENTRY;
0521         }
0522         table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
0523         rc = base_page_walk(table, addr, next, alloc);
0524         if (rc)
0525             return rc;
0526         if (!alloc)
0527             base_pgt_free(table);
0528         cond_resched();
0529     } while (ste++, addr = next, addr < end);
0530     return 0;
0531 }
0532 
0533 static int base_region3_walk(unsigned long *origin, unsigned long addr,
0534                  unsigned long end, int alloc)
0535 {
0536     unsigned long *rtte, next, *table;
0537     int rc;
0538 
0539     rtte = origin;
0540     rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
0541     do {
0542         next = base_region3_addr_end(addr, end);
0543         if (*rtte & _REGION_ENTRY_INVALID) {
0544             if (!alloc)
0545                 continue;
0546             table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
0547             if (!table)
0548                 return -ENOMEM;
0549             *rtte = __pa(table) | _REGION3_ENTRY;
0550         }
0551         table = __va(*rtte & _REGION_ENTRY_ORIGIN);
0552         rc = base_segment_walk(table, addr, next, alloc);
0553         if (rc)
0554             return rc;
0555         if (!alloc)
0556             base_crst_free(table);
0557     } while (rtte++, addr = next, addr < end);
0558     return 0;
0559 }
0560 
0561 static int base_region2_walk(unsigned long *origin, unsigned long addr,
0562                  unsigned long end, int alloc)
0563 {
0564     unsigned long *rste, next, *table;
0565     int rc;
0566 
0567     rste = origin;
0568     rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
0569     do {
0570         next = base_region2_addr_end(addr, end);
0571         if (*rste & _REGION_ENTRY_INVALID) {
0572             if (!alloc)
0573                 continue;
0574             table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
0575             if (!table)
0576                 return -ENOMEM;
0577             *rste = __pa(table) | _REGION2_ENTRY;
0578         }
0579         table = __va(*rste & _REGION_ENTRY_ORIGIN);
0580         rc = base_region3_walk(table, addr, next, alloc);
0581         if (rc)
0582             return rc;
0583         if (!alloc)
0584             base_crst_free(table);
0585     } while (rste++, addr = next, addr < end);
0586     return 0;
0587 }
0588 
0589 static int base_region1_walk(unsigned long *origin, unsigned long addr,
0590                  unsigned long end, int alloc)
0591 {
0592     unsigned long *rfte, next, *table;
0593     int rc;
0594 
0595     rfte = origin;
0596     rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
0597     do {
0598         next = base_region1_addr_end(addr, end);
0599         if (*rfte & _REGION_ENTRY_INVALID) {
0600             if (!alloc)
0601                 continue;
0602             table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
0603             if (!table)
0604                 return -ENOMEM;
0605             *rfte = __pa(table) | _REGION1_ENTRY;
0606         }
0607         table = __va(*rfte & _REGION_ENTRY_ORIGIN);
0608         rc = base_region2_walk(table, addr, next, alloc);
0609         if (rc)
0610             return rc;
0611         if (!alloc)
0612             base_crst_free(table);
0613     } while (rfte++, addr = next, addr < end);
0614     return 0;
0615 }
0616 
0617 /**
0618  * base_asce_free - free asce and tables returned from base_asce_alloc()
0619  * @asce: asce to be freed
0620  *
0621  * Frees all region, segment, and page tables that were allocated with a
0622  * corresponding base_asce_alloc() call.
0623  */
0624 void base_asce_free(unsigned long asce)
0625 {
0626     unsigned long *table = __va(asce & _ASCE_ORIGIN);
0627 
0628     if (!asce)
0629         return;
0630     switch (asce & _ASCE_TYPE_MASK) {
0631     case _ASCE_TYPE_SEGMENT:
0632         base_segment_walk(table, 0, _REGION3_SIZE, 0);
0633         break;
0634     case _ASCE_TYPE_REGION3:
0635         base_region3_walk(table, 0, _REGION2_SIZE, 0);
0636         break;
0637     case _ASCE_TYPE_REGION2:
0638         base_region2_walk(table, 0, _REGION1_SIZE, 0);
0639         break;
0640     case _ASCE_TYPE_REGION1:
0641         base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
0642         break;
0643     }
0644     base_crst_free(table);
0645 }
0646 
0647 static int base_pgt_cache_init(void)
0648 {
0649     static DEFINE_MUTEX(base_pgt_cache_mutex);
0650     unsigned long sz = _PAGE_TABLE_SIZE;
0651 
0652     if (base_pgt_cache)
0653         return 0;
0654     mutex_lock(&base_pgt_cache_mutex);
0655     if (!base_pgt_cache)
0656         base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
0657     mutex_unlock(&base_pgt_cache_mutex);
0658     return base_pgt_cache ? 0 : -ENOMEM;
0659 }
0660 
0661 /**
0662  * base_asce_alloc - create kernel mapping without enhanced DAT features
0663  * @addr: virtual start address of kernel mapping
0664  * @num_pages: number of consecutive pages
0665  *
0666  * Generate an asce, including all required region, segment and page tables,
0667  * that can be used to access the virtual kernel mapping. The difference is
0668  * that the returned asce does not make use of any enhanced DAT features like
0669  * e.g. large pages. This is required for some I/O functions that pass an
0670  * asce, like e.g. some service call requests.
0671  *
0672  * Note: the returned asce may NEVER be attached to any cpu. It may only be
0673  *   used for I/O requests. tlb entries that might result because the
0674  *   asce was attached to a cpu won't be cleared.
0675  */
0676 unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
0677 {
0678     unsigned long asce, *table, end;
0679     int rc;
0680 
0681     if (base_pgt_cache_init())
0682         return 0;
0683     end = addr + num_pages * PAGE_SIZE;
0684     if (end <= _REGION3_SIZE) {
0685         table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
0686         if (!table)
0687             return 0;
0688         rc = base_segment_walk(table, addr, end, 1);
0689         asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
0690     } else if (end <= _REGION2_SIZE) {
0691         table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
0692         if (!table)
0693             return 0;
0694         rc = base_region3_walk(table, addr, end, 1);
0695         asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
0696     } else if (end <= _REGION1_SIZE) {
0697         table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
0698         if (!table)
0699             return 0;
0700         rc = base_region2_walk(table, addr, end, 1);
0701         asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
0702     } else {
0703         table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
0704         if (!table)
0705             return 0;
0706         rc = base_region1_walk(table, addr, end, 1);
0707         asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
0708     }
0709     if (rc) {
0710         base_asce_free(asce);
0711         asce = 0;
0712     }
0713     return asce;
0714 }
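/*
 * Minimal usage sketch (hypothetical "buf" and "nr_pages", not taken from
 * this file):
 *
 *   unsigned long asce;
 *
 *   asce = base_asce_alloc((unsigned long)buf, nr_pages);
 *   if (!asce)
 *           return -ENOMEM;
 *   ... hand the asce to the I/O or service call request ...
 *   base_asce_free(asce);
 */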