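/*
 * Page table allocation and freeing routines for s390: CRST (region and
 * segment) tables, 2K pte-table fragments, and base ASCE tables that do
 * not use enhanced DAT features.
 */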
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#ifdef CONFIG_PGSTE

int page_table_allocate_pgste = 0;
EXPORT_SYMBOL(page_table_allocate_pgste);

static struct ctl_table page_table_sysctl[] = {
	{
		.procname = "allocate_pgste",
		.data = &page_table_allocate_pgste,
		.maxlen = sizeof(int),
		.mode = S_IRUGO | S_IWUSR,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,
	},
	{ }
};

static struct ctl_table page_table_sysctl_dir[] = {
	{
		.procname = "vm",
		.maxlen = 0,
		.mode = 0555,
		.child = page_table_sysctl,
	},
	{ }
};

static int __init page_table_register_sysctl(void)
{
	return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
}
__initcall(page_table_register_sysctl);

#endif

unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);

	if (!page)
		return NULL;
	arch_set_page_dat(page, CRST_ALLOC_ORDER);
	return (unsigned long *) page_to_virt(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long)table, CRST_ALLOC_ORDER);
}

static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	/* Reload the user ASCE in control register 7 if this CPU runs @mm */
	if (current->active_mm == mm) {
		S390_lowcore.user_asce = mm->context.asce;
		__ctl_load(S390_lowcore.user_asce, 7, 7);
	}
	__tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
{
	unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
	unsigned long asce_limit = mm->context.asce_limit;

	/* An upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
	VM_BUG_ON(asce_limit < _REGION2_SIZE);

	if (end <= asce_limit)
		return 0;

	if (asce_limit == _REGION2_SIZE) {
		p4d = crst_table_alloc(mm);
		if (unlikely(!p4d))
			goto err_p4d;
		crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
	}
	if (end > _REGION1_SIZE) {
		pgd = crst_table_alloc(mm);
		if (unlikely(!pgd))
			goto err_pgd;
		crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
	}

	spin_lock_bh(&mm->page_table_lock);

	/*
	 * This routine gets called with mmap_lock held, so asce_limit is not
	 * expected to change underneath us. The check below lets us know if
	 * that assumption is ever violated.
	 */
	VM_BUG_ON(asce_limit != mm->context.asce_limit);

	if (p4d) {
		__pgd = (unsigned long *) mm->pgd;
		p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
		mm->pgd = (pgd_t *) p4d;
		mm->context.asce_limit = _REGION1_SIZE;
		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
				   _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
		mm_inc_nr_puds(mm);
	}
	if (pgd) {
		__pgd = (unsigned long *) mm->pgd;
		pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
		mm->pgd = (pgd_t *) pgd;
		mm->context.asce_limit = TASK_SIZE_MAX;
		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
				   _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
	}

	spin_unlock_bh(&mm->page_table_lock);

	on_each_cpu(__crst_table_upgrade, mm, 0);

	return 0;

err_pgd:
	crst_table_free(mm, p4d);
err_p4d:
	return -ENOMEM;
}

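/* Atomically toggle @bits in *@v and return the resulting value */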
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

#ifdef CONFIG_PGSTE

struct page *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct page *page;
	u64 *table;

	page = alloc_page(GFP_KERNEL);
	if (page) {
		table = (u64 *)page_to_virt(page);
		memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
		memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
	}
	return page;
}

void page_table_free_pgste(struct page *page)
{
	__free_page(page);
}

#endif

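/*
 * Page table (pte) fragment handling:
 *
 * A 2K pte table is either the upper or the lower half of a normal 4K
 * page. Whenever possible the fragment for a new pte table is taken from
 * a page on mm_context_t::pgtable_list that still has a free half;
 * otherwise a new page is allocated and its unused half is left on that
 * list. Page tables with PGSTEs (needed e.g. for KVM guests) always use
 * a full 4K page: the lower half holds the ptes, the upper half the
 * PGSTEs.
 *
 * The upper byte (bits 24-31) of page->_refcount tracks the state of the
 * two fragments:
 *   bits 24-25: fragment is allocated and in use
 *   bits 28-29: fragment is marked for delayed (RCU) release
 * A fragment may only be handed out again once both of its bits are
 * clear, which is why page_table_alloc() checks the combined mask.
 */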
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	unsigned long *table;
	struct page *page;
	unsigned int mask, bit;

	/* Try to get a fragment of a 4K page as a 2K page table */
	if (!mm_alloc_pgste(mm)) {
		table = NULL;
		spin_lock_bh(&mm->context.lock);
		if (!list_empty(&mm->context.pgtable_list)) {
			page = list_first_entry(&mm->context.pgtable_list,
						struct page, lru);
			mask = atomic_read(&page->_refcount) >> 24;
			/*
			 * The pending removal bits must also be checked.
			 * Otherwise a fragment could be handed out while
			 * its delayed release is still outstanding, which
			 * breaks the assumption that the allocation and
			 * pending bits are mutually exclusive.
			 */
			mask = (mask | (mask >> 4)) & 0x03U;
			if (mask != 0x03U) {
				table = (unsigned long *) page_to_virt(page);
				bit = mask & 1;
				if (bit)
					table += PTRS_PER_PTE;
				atomic_xor_bits(&page->_refcount,
						0x01U << (bit + 24));
				list_del(&page->lru);
			}
		}
		spin_unlock_bh(&mm->context.lock);
		if (table)
			return table;
	}
	/* Allocate a fresh page */
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return NULL;
	if (!pgtable_pte_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
	arch_set_page_dat(page, 0);
	/* Initialize the page table */
	table = (unsigned long *) page_to_virt(page);
	if (mm_alloc_pgste(mm)) {
		/* Return a 4K page table with PGSTEs */
		atomic_xor_bits(&page->_refcount, 0x03U << 24);
		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
		memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
	} else {
		/* Return the first 2K fragment of the page */
		atomic_xor_bits(&page->_refcount, 0x01U << 24);
		memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
		spin_lock_bh(&mm->context.lock);
		list_add(&page->lru, &mm->context.pgtable_list);
		spin_unlock_bh(&mm->context.lock);
	}
	return table;
}

static void page_table_release_check(struct page *page, void *table,
				     unsigned int half, unsigned int mask)
{
	char msg[128];

	if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask)
		return;
	snprintf(msg, sizeof(msg),
		 "Invalid pgtable %p release half 0x%02x mask 0x%02x",
		 table, half, mask);
	dump_page(page, msg);
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	unsigned int mask, bit, half;
	struct page *page;

	page = virt_to_page(table);
	if (!mm_alloc_pgste(mm)) {
		/* Free a 2K page table fragment of a 4K page */
		bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
		spin_lock_bh(&mm->context.lock);
		/*
		 * Mark the page for delayed release. The actual release
		 * will happen outside of the critical section from this
		 * function or from __tlb_remove_table().
		 */
		mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
		mask >>= 24;
		if (mask & 0x03U)
			list_add(&page->lru, &mm->context.pgtable_list);
		else
			list_del(&page->lru);
		spin_unlock_bh(&mm->context.lock);
		mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
		mask >>= 24;
		if (mask != 0x00U)
			return;
		half = 0x01U << bit;
	} else {
		half = 0x03U;
		mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
		mask >>= 24;
	}

	page_table_release_check(page, table, half, mask);
	pgtable_pte_page_dtor(page);
	__free_page(page);
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
			 unsigned long vmaddr)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = virt_to_page(table);
	if (mm_alloc_pgste(mm)) {
		gmap_unlink(mm, table, vmaddr);
		table = (unsigned long *) ((unsigned long)table | 0x03U);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
	spin_lock_bh(&mm->context.lock);
	/*
	 * Mark the page for delayed release. The actual release will
	 * happen outside of the critical section from __tlb_remove_table()
	 * or from page_table_free().
	 */
	mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
	mask >>= 24;
	if (mask & 0x03U)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	else
		list_del(&page->lru);
	spin_unlock_bh(&mm->context.lock);
	table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
	tlb_remove_table(tlb, table);
}

void __tlb_remove_table(void *_table)
{
	unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
	void *table = (void *)((unsigned long) _table ^ mask);
	struct page *page = virt_to_page(table);

	switch (half) {
	case 0x00U:	/* pmd, pud, or p4d */
		free_pages((unsigned long)table, CRST_ALLOC_ORDER);
		return;
	case 0x01U:	/* lower 2K of a 4K page table */
	case 0x02U:	/* higher 2K of a 4K page table */
		mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
		mask >>= 24;
		if (mask != 0x00U)
			return;
		break;
	case 0x03U:	/* 4K page table with pgstes */
		mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
		mask >>= 24;
		break;
	}

	page_table_release_check(page, table, half, mask);
	pgtable_pte_page_dtor(page);
	__free_page(page);
}

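/*
 * Base infrastructure required to generate basic asces, region, segment,
 * and page tables that do not make use of enhanced features like EDAT1.
 */
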
static struct kmem_cache *base_pgt_cache;

static unsigned long *base_pgt_alloc(void)
{
	unsigned long *table;

	table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
	if (table)
		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
	return table;
}

static void base_pgt_free(unsigned long *table)
{
	kmem_cache_free(base_pgt_cache, table);
}

static unsigned long *base_crst_alloc(unsigned long val)
{
	unsigned long *table;

	table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
	if (table)
		crst_table_init(table, val);
	return table;
}

static void base_crst_free(unsigned long *table)
{
	free_pages((unsigned long)table, CRST_ALLOC_ORDER);
}

/* Compute the end of the current page/segment/region for a base table walk */
#define BASE_ADDR_END_FUNC(NAME, SIZE)					\
static inline unsigned long base_##NAME##_addr_end(unsigned long addr,	\
						   unsigned long end)	\
{									\
	unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1);		\
									\
	return (next - 1) < (end - 1) ? next : end;			\
}

BASE_ADDR_END_FUNC(page, _PAGE_SIZE)
BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
BASE_ADDR_END_FUNC(region1, _REGION1_SIZE)

/* Translate a virtual address to its real address with the LRA instruction */
static inline unsigned long base_lra(unsigned long address)
{
	unsigned long real;

	asm volatile(
		" lra %0,0(%1)\n"
		: "=d" (real) : "a" (address) : "cc");
	return real;
}

static int base_page_walk(unsigned long *origin, unsigned long addr,
			  unsigned long end, int alloc)
{
	unsigned long *pte, next;

	/* A pte table has no lower-level tables, so there is nothing to free */
	if (!alloc)
		return 0;
	pte = origin;
	pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
	do {
		next = base_page_addr_end(addr, end);
		*pte = base_lra(addr);
	} while (pte++, addr = next, addr < end);
	return 0;
}

static int base_segment_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *ste, next, *table;
	int rc;

	ste = origin;
	ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
	do {
		next = base_segment_addr_end(addr, end);
		if (*ste & _SEGMENT_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_pgt_alloc();
			if (!table)
				return -ENOMEM;
			*ste = __pa(table) | _SEGMENT_ENTRY;
		}
		table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
		rc = base_page_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_pgt_free(table);
		cond_resched();
	} while (ste++, addr = next, addr < end);
	return 0;
}

static int base_region3_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rtte, next, *table;
	int rc;

	rtte = origin;
	rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
	do {
		next = base_region3_addr_end(addr, end);
		if (*rtte & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rtte = __pa(table) | _REGION3_ENTRY;
		}
		table = __va(*rtte & _REGION_ENTRY_ORIGIN);
		rc = base_segment_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rtte++, addr = next, addr < end);
	return 0;
}

static int base_region2_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rste, next, *table;
	int rc;

	rste = origin;
	rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
	do {
		next = base_region2_addr_end(addr, end);
		if (*rste & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rste = __pa(table) | _REGION2_ENTRY;
		}
		table = __va(*rste & _REGION_ENTRY_ORIGIN);
		rc = base_region3_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rste++, addr = next, addr < end);
	return 0;
}

static int base_region1_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rfte, next, *table;
	int rc;

	rfte = origin;
	rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
	do {
		next = base_region1_addr_end(addr, end);
		if (*rfte & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rfte = __pa(table) | _REGION1_ENTRY;
		}
		table = __va(*rfte & _REGION_ENTRY_ORIGIN);
		rc = base_region2_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rfte++, addr = next, addr < end);
	return 0;
}

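/**
 * base_asce_free - free asce and tables returned from base_asce_alloc()
 * @asce: asce to be freed
 *
 * Frees all region, segment, and page tables that were allocated with a
 * corresponding base_asce_alloc() call.
 */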
void base_asce_free(unsigned long asce)
{
	unsigned long *table = __va(asce & _ASCE_ORIGIN);

	if (!asce)
		return;
	switch (asce & _ASCE_TYPE_MASK) {
	case _ASCE_TYPE_SEGMENT:
		base_segment_walk(table, 0, _REGION3_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION3:
		base_region3_walk(table, 0, _REGION2_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION2:
		base_region2_walk(table, 0, _REGION1_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION1:
		base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
		break;
	}
	base_crst_free(table);
}

static int base_pgt_cache_init(void)
{
	static DEFINE_MUTEX(base_pgt_cache_mutex);
	unsigned long sz = _PAGE_TABLE_SIZE;

	if (base_pgt_cache)
		return 0;
	mutex_lock(&base_pgt_cache_mutex);
	if (!base_pgt_cache)
		base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
	mutex_unlock(&base_pgt_cache_mutex);
	return base_pgt_cache ? 0 : -ENOMEM;
}

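/**
 * base_asce_alloc - create kernel mapping without enhanced DAT features
 * @addr: virtual start address of the kernel mapping
 * @num_pages: number of consecutive pages
 *
 * Generate an asce, including all required region, segment and page
 * tables, that can be used to access the virtual kernel mapping. The
 * returned asce does not make use of enhanced DAT features like EDAT1.
 *
 * Returns the asce, or 0 if the allocation failed.
 */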
unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
{
	unsigned long asce, *table, end;
	int rc;

	if (base_pgt_cache_init())
		return 0;
	end = addr + num_pages * PAGE_SIZE;
	if (end <= _REGION3_SIZE) {
		table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_segment_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
	} else if (end <= _REGION2_SIZE) {
		table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region3_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
	} else if (end <= _REGION1_SIZE) {
		table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region2_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
	} else {
		table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region1_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
	}
	if (rc) {
		base_asce_free(asce);
		asce = 0;
	}
	return asce;
}