0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  *
0004  * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
0005  */
0006 
0007 #include <linux/types.h>
0008 #include <linux/string.h>
0009 #include <linux/kvm.h>
0010 #include <linux/kvm_host.h>
0011 #include <linux/anon_inodes.h>
0012 #include <linux/file.h>
0013 #include <linux/debugfs.h>
0014 #include <linux/pgtable.h>
0015 
0016 #include <asm/kvm_ppc.h>
0017 #include <asm/kvm_book3s.h>
0018 #include <asm/page.h>
0019 #include <asm/mmu.h>
0020 #include <asm/pgalloc.h>
0021 #include <asm/pte-walk.h>
0022 #include <asm/ultravisor.h>
0023 #include <asm/kvm_book3s_uvmem.h>
0024 #include <asm/plpar_wrappers.h>
0025 #include <asm/firmware.h>
0026 
0027 /*
0028  * Supported radix tree geometry.
0029  * Like p9, we support either 5 or 9 bits at the first (lowest) level,
0030  * for a page size of 64k or 4k.
0031  */
0032 static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
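     /*
      * Worked example: the array is indexed by radix level, with index 3
      * being the root.  A 52-bit effective address is then consumed as
      * 13 + 9 + 9 + 5 index bits plus a 16-bit page offset (64k pages),
      * or 13 + 9 + 9 + 9 index bits plus a 12-bit offset (4k pages).
      */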
0033 
0034 unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
0035                           gva_t eaddr, void *to, void *from,
0036                           unsigned long n)
0037 {
0038     int old_pid, old_lpid;
0039     unsigned long quadrant, ret = n;
0040     bool is_load = !!to;
0041 
0042     /* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
0043     if (kvmhv_on_pseries())
0044         return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
0045                       (to != NULL) ? __pa(to) : 0,
0046                       (from != NULL) ? __pa(from) : 0, n);
0047 
0048     if (eaddr & (0xFFFUL << 52))
0049         return ret;
0050 
0051     quadrant = 1;
0052     if (!pid)
0053         quadrant = 2;
0054     if (is_load)
0055         from = (void *) (eaddr | (quadrant << 62));
0056     else
0057         to = (void *) (eaddr | (quadrant << 62));
0058 
0059     preempt_disable();
0060 
0061     asm volatile("hwsync" ::: "memory");
0062     isync();
0063     /* switch the lpid first to avoid running host with unallocated pid */
0064     old_lpid = mfspr(SPRN_LPID);
0065     if (old_lpid != lpid)
0066         mtspr(SPRN_LPID, lpid);
0067     if (quadrant == 1) {
0068         old_pid = mfspr(SPRN_PID);
0069         if (old_pid != pid)
0070             mtspr(SPRN_PID, pid);
0071     }
0072     isync();
0073 
0074     pagefault_disable();
0075     if (is_load)
0076         ret = __copy_from_user_inatomic(to, (const void __user *)from, n);
0077     else
0078         ret = __copy_to_user_inatomic((void __user *)to, from, n);
0079     pagefault_enable();
0080 
0081     asm volatile("hwsync" ::: "memory");
0082     isync();
0083     /* switch the pid first to avoid running host with unallocated pid */
0084     if (quadrant == 1 && pid != old_pid)
0085         mtspr(SPRN_PID, old_pid);
0086     if (lpid != old_lpid)
0087         mtspr(SPRN_LPID, old_lpid);
0088     isync();
0089 
0090     preempt_enable();
0091 
0092     return ret;
0093 }
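     /*
      * Background on the quadrant trick above: in hypervisor mode the two
      * high-order EA bits select the translation context.  Quadrant 1
      * (0b01) translates with effLPID = LPIDR and effPID = PIDR, i.e. a
      * guest process-scoped address; quadrant 2 (0b10) uses effLPID = LPIDR
      * with effPID = 0, i.e. the guest kernel's quadrant-3 view.  Loading
      * the guest's LPID (and, for quadrant 1, its PID) therefore lets the
      * host copy to/from guest effective addresses with ordinary loads and
      * stores, which is what this function does.
      */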
0094 
0095 static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
0096                       void *to, void *from, unsigned long n)
0097 {
0098     int lpid = vcpu->kvm->arch.lpid;
0099     int pid = vcpu->arch.pid;
0100 
0101     /* This would cause a data segment interrupt, so don't allow the access */
0102     if (eaddr & (0x3FFUL << 52))
0103         return -EINVAL;
0104 
0105     /* Should we be using the nested lpid? */
0106     if (vcpu->arch.nested)
0107         lpid = vcpu->arch.nested->shadow_lpid;
0108 
0109     /* If accessing quadrant 3 then pid is expected to be 0 */
0110     if (((eaddr >> 62) & 0x3) == 0x3)
0111         pid = 0;
0112 
0113     eaddr &= ~(0xFFFUL << 52);
0114 
0115     return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
0116 }
0117 
0118 long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
0119                  unsigned long n)
0120 {
0121     long ret;
0122 
0123     ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
0124     if (ret > 0)
0125         memset(to + (n - ret), 0, ret);
0126 
0127     return ret;
0128 }
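     /*
      * The return value is the number of bytes *not* copied, matching
      * __copy_from_user_inatomic(); the uncopied tail is zero-filled so
      * callers never see stale data in the destination buffer.
      */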
0129 
0130 long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
0131                    unsigned long n)
0132 {
0133     return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
0134 }
0135 
0136 int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
0137                    struct kvmppc_pte *gpte, u64 root,
0138                    u64 *pte_ret_p)
0139 {
0140     struct kvm *kvm = vcpu->kvm;
0141     int ret, level, ps;
0142     unsigned long rts, bits, offset, index;
0143     u64 pte, base, gpa;
0144     __be64 rpte;
0145 
0146     rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
0147         ((root & RTS2_MASK) >> RTS2_SHIFT);
0148     bits = root & RPDS_MASK;
0149     base = root & RPDB_MASK;
0150 
0151     offset = rts + 31;
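         /*
          * RTS encodes the radix tree size as (total EA bits - 31), so an
          * rts value of 21 gives the 52-bit space required below.
          */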
0152 
0153     /* Current implementations only support 52-bit space */
0154     if (offset != 52)
0155         return -EINVAL;
0156 
0157     /* Walk each level of the radix tree */
0158     for (level = 3; level >= 0; --level) {
0159         u64 addr;
0160         /* Check a valid size */
0161         if (level && bits != p9_supported_radix_bits[level])
0162             return -EINVAL;
0163         if (level == 0 && !(bits == 5 || bits == 9))
0164             return -EINVAL;
0165         offset -= bits;
0166         index = (eaddr >> offset) & ((1UL << bits) - 1);
0167         /* Check that low bits of page table base are zero */
0168         if (base & ((1UL << (bits + 3)) - 1))
0169             return -EINVAL;
0170         /* Read the entry from guest memory */
0171         addr = base + (index * sizeof(rpte));
0172 
0173         kvm_vcpu_srcu_read_lock(vcpu);
0174         ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
0175         kvm_vcpu_srcu_read_unlock(vcpu);
0176         if (ret) {
0177             if (pte_ret_p)
0178                 *pte_ret_p = addr;
0179             return ret;
0180         }
0181         pte = __be64_to_cpu(rpte);
0182         if (!(pte & _PAGE_PRESENT))
0183             return -ENOENT;
0184         /* Check if a leaf entry */
0185         if (pte & _PAGE_PTE)
0186             break;
0187         /* Get ready to walk the next level */
0188         base = pte & RPDB_MASK;
0189         bits = pte & RPDS_MASK;
0190     }
0191 
0192     /* Need a leaf at lowest level; 512GB pages not supported */
0193     if (level < 0 || level == 3)
0194         return -EINVAL;
0195 
0196     /* We found a valid leaf PTE */
0197     /* Offset is now log base 2 of the page size */
0198     gpa = pte & 0x01fffffffffff000ul;
0199     if (gpa & ((1ul << offset) - 1))
0200         return -EINVAL;
0201     gpa |= eaddr & ((1ul << offset) - 1);
0202     for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
0203         if (offset == mmu_psize_defs[ps].shift)
0204             break;
0205     gpte->page_size = ps;
0206     gpte->page_shift = offset;
0207 
0208     gpte->eaddr = eaddr;
0209     gpte->raddr = gpa;
0210 
0211     /* Work out permissions */
0212     gpte->may_read = !!(pte & _PAGE_READ);
0213     gpte->may_write = !!(pte & _PAGE_WRITE);
0214     gpte->may_execute = !!(pte & _PAGE_EXEC);
0215 
0216     gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
0217 
0218     if (pte_ret_p)
0219         *pte_ret_p = pte;
0220 
0221     return 0;
0222 }
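     /*
      * Example of the walk above for a 2MB mapping: offset starts at 52
      * and drops by 13, 9 and 9 as levels 3..1 are traversed, so a leaf
      * found at level 1 leaves offset == 21; the PTE therefore maps a 2MB
      * page and the mmu_psize_defs[] scan yields MMU_PAGE_2M.
      */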
0223 
0224 /*
0225  * Used to walk a partition or process table radix tree in guest memory.
0226  * Note: We exploit the fact that a partition table and a process
0227  * table have the same layout, a partition-scoped page table and a
0228  * process-scoped page table have the same layout, and the 2nd
0229  * doubleword of a partition table entry has the same layout as
0230  * the PTCR register.
0231  */
0232 int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
0233                      struct kvmppc_pte *gpte, u64 table,
0234                      int table_index, u64 *pte_ret_p)
0235 {
0236     struct kvm *kvm = vcpu->kvm;
0237     int ret;
0238     unsigned long size, ptbl, root;
0239     struct prtb_entry entry;
0240 
0241     if ((table & PRTS_MASK) > 24)
0242         return -EINVAL;
0243     size = 1ul << ((table & PRTS_MASK) + 12);
0244 
0245     /* Is the table big enough to contain this entry? */
0246     if ((table_index * sizeof(entry)) >= size)
0247         return -EINVAL;
0248 
0249     /* Read the table to find the root of the radix tree */
0250     ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
0251     kvm_vcpu_srcu_read_lock(vcpu);
0252     ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
0253     kvm_vcpu_srcu_read_unlock(vcpu);
0254     if (ret)
0255         return ret;
0256 
0257     /* Root is stored in the first double word */
0258     root = be64_to_cpu(entry.prtb0);
0259 
0260     return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
0261 }
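     /*
      * PRTS encodes the table size as 4kB << PRTS (so at most 64GB given
      * the check above), and each entry is two doublewords (16 bytes),
      * which is why table_index is scaled by sizeof(entry).
      */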
0262 
0263 int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
0264                struct kvmppc_pte *gpte, bool data, bool iswrite)
0265 {
0266     u32 pid;
0267     u64 pte;
0268     int ret;
0269 
0270     /* Work out effective PID */
0271     switch (eaddr >> 62) {
0272     case 0:
0273         pid = vcpu->arch.pid;
0274         break;
0275     case 3:
0276         pid = 0;
0277         break;
0278     default:
0279         return -EINVAL;
0280     }
0281 
0282     ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
0283                 vcpu->kvm->arch.process_table, pid, &pte);
0284     if (ret)
0285         return ret;
0286 
0287     /* Check privilege (applies only to process scoped translations) */
0288     if (kvmppc_get_msr(vcpu) & MSR_PR) {
0289         if (pte & _PAGE_PRIVILEGED) {
0290             gpte->may_read = 0;
0291             gpte->may_write = 0;
0292             gpte->may_execute = 0;
0293         }
0294     } else {
0295         if (!(pte & _PAGE_PRIVILEGED)) {
0296             /* Check AMR/IAMR to see if strict mode is in force */
0297             if (vcpu->arch.amr & (1ul << 62))
0298                 gpte->may_read = 0;
0299             if (vcpu->arch.amr & (1ul << 63))
0300                 gpte->may_write = 0;
0301             if (vcpu->arch.iamr & (1ul << 62))
0302                 gpte->may_execute = 0;
0303         }
0304     }
0305 
0306     return 0;
0307 }
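     /*
      * The switch above mirrors the hardware's quadrant rules for guest
      * effective addresses: quadrant 0 translates with the guest's PID,
      * quadrant 3 with PID 0, and quadrants 1 and 2 are not valid guest
      * accesses, hence -EINVAL.  The AMR/IAMR tests after the table walk
      * look only at the two high-order (key 0) bits, which is what the
      * code refers to as "strict mode".
      */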
0308 
0309 void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
0310                  unsigned int pshift, unsigned int lpid)
0311 {
0312     unsigned long psize = PAGE_SIZE;
0313     int psi;
0314     long rc;
0315     unsigned long rb;
0316 
0317     if (pshift)
0318         psize = 1UL << pshift;
0319     else
0320         pshift = PAGE_SHIFT;
0321 
0322     addr &= ~(psize - 1);
0323 
0324     if (!kvmhv_on_pseries()) {
0325         radix__flush_tlb_lpid_page(lpid, addr, psize);
0326         return;
0327     }
0328 
0329     psi = shift_to_mmu_psize(pshift);
0330 
0331     if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) {
0332         rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
0333         rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
0334                     lpid, rb);
0335     } else {
0336         rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
0337                         H_RPTI_TYPE_NESTED |
0338                         H_RPTI_TYPE_TLB,
0339                         psize_to_rpti_pgsize(psi),
0340                         addr, addr + psize);
0341     }
0342 
0343     if (rc)
0344         pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
0345 }
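     /*
      * Three invalidation paths are used above: a direct flush via
      * radix__flush_tlb_lpid_page() when running bare metal, the
      * H_TLB_INVALIDATE hcall when running as a nested hypervisor under
      * pseries, or H_RPT_INVALIDATE (via pseries_rpt_invalidate()) when
      * the firmware advertises FW_FEATURE_RPT_INVALIDATE.
      */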
0346 
0347 static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
0348 {
0349     long rc;
0350 
0351     if (!kvmhv_on_pseries()) {
0352         radix__flush_pwc_lpid(lpid);
0353         return;
0354     }
0355 
0356     if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
0357         rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
0358                     lpid, TLBIEL_INVAL_SET_LPID);
0359     else
0360         rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
0361                         H_RPTI_TYPE_NESTED |
0362                         H_RPTI_TYPE_PWC, H_RPTI_PAGE_ALL,
0363                         0, -1UL);
0364     if (rc)
0365         pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
0366 }
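     /*
      * The PWC is the page-walk cache: it must be flushed whenever a page
      * directory page is removed from the tree so that the hardware cannot
      * keep walking through a cached pointer to the freed page.
      */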
0367 
0368 static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
0369                       unsigned long clr, unsigned long set,
0370                       unsigned long addr, unsigned int shift)
0371 {
0372     return __radix_pte_update(ptep, clr, set);
0373 }
0374 
0375 static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
0376                  pte_t *ptep, pte_t pte)
0377 {
0378     radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
0379 }
0380 
0381 static struct kmem_cache *kvm_pte_cache;
0382 static struct kmem_cache *kvm_pmd_cache;
0383 
0384 static pte_t *kvmppc_pte_alloc(void)
0385 {
0386     pte_t *pte;
0387 
0388     pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
0389     /* pmd_populate() will only reference _pa(pte). */
0390     kmemleak_ignore(pte);
0391 
0392     return pte;
0393 }
0394 
0395 static void kvmppc_pte_free(pte_t *ptep)
0396 {
0397     kmem_cache_free(kvm_pte_cache, ptep);
0398 }
0399 
0400 static pmd_t *kvmppc_pmd_alloc(void)
0401 {
0402     pmd_t *pmd;
0403 
0404     pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
0405     /* pud_populate() will only reference _pa(pmd). */
0406     kmemleak_ignore(pmd);
0407 
0408     return pmd;
0409 }
0410 
0411 static void kvmppc_pmd_free(pmd_t *pmdp)
0412 {
0413     kmem_cache_free(kvm_pmd_cache, pmdp);
0414 }
0415 
0416 /* Called with kvm->mmu_lock held */
0417 void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
0418               unsigned int shift,
0419               const struct kvm_memory_slot *memslot,
0420               unsigned int lpid)
0421 
0422 {
0423     unsigned long old;
0424     unsigned long gfn = gpa >> PAGE_SHIFT;
0425     unsigned long page_size = PAGE_SIZE;
0426     unsigned long hpa;
0427 
0428     old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
0429     kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
0430 
0431     /* The following only applies to L1 entries */
0432     if (lpid != kvm->arch.lpid)
0433         return;
0434 
0435     if (!memslot) {
0436         memslot = gfn_to_memslot(kvm, gfn);
0437         if (!memslot)
0438             return;
0439     }
0440     if (shift) { /* 1GB or 2MB page */
0441         page_size = 1ul << shift;
0442         if (shift == PMD_SHIFT)
0443             kvm->stat.num_2M_pages--;
0444         else if (shift == PUD_SHIFT)
0445             kvm->stat.num_1G_pages--;
0446     }
0447 
0448     gpa &= ~(page_size - 1);
0449     hpa = old & PTE_RPN_MASK;
0450     kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
0451 
0452     if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
0453         kvmppc_update_dirty_map(memslot, gfn, page_size);
0454 }
0455 
0456 /*
0457  * kvmppc_free_p?d are used to free existing page tables, and recursively
0458  * descend and clear and free children.
0459  * Callers are responsible for flushing the PWC.
0460  *
0461  * When page tables are being unmapped/freed as part of page fault path
0462  * (full == false), valid ptes are generally not expected; however, there
0463  * is one situation where they arise, which is when dirty page logging is
0464  * turned off for a memslot while the VM is running.  The new memslot
0465  * becomes visible to page faults before the memslot commit function
0466  * gets to flush the memslot, which can lead to a 2MB page mapping being
0467  * installed for a guest physical address where there are already 64kB
0468  * (or 4kB) mappings (of sub-pages of the same 2MB page).
0469  */
0470 static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
0471                   unsigned int lpid)
0472 {
0473     if (full) {
0474         memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
0475     } else {
0476         pte_t *p = pte;
0477         unsigned long it;
0478 
0479         for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
0480             if (pte_val(*p) == 0)
0481                 continue;
0482             kvmppc_unmap_pte(kvm, p,
0483                      pte_pfn(*p) << PAGE_SHIFT,
0484                      PAGE_SHIFT, NULL, lpid);
0485         }
0486     }
0487 
0488     kvmppc_pte_free(pte);
0489 }
0490 
0491 static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
0492                   unsigned int lpid)
0493 {
0494     unsigned long im;
0495     pmd_t *p = pmd;
0496 
0497     for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
0498         if (!pmd_present(*p))
0499             continue;
0500         if (pmd_is_leaf(*p)) {
0501             if (full) {
0502                 pmd_clear(p);
0503             } else {
0504                 WARN_ON_ONCE(1);
0505                 kvmppc_unmap_pte(kvm, (pte_t *)p,
0506                      pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
0507                      PMD_SHIFT, NULL, lpid);
0508             }
0509         } else {
0510             pte_t *pte;
0511 
0512             pte = pte_offset_map(p, 0);
0513             kvmppc_unmap_free_pte(kvm, pte, full, lpid);
0514             pmd_clear(p);
0515         }
0516     }
0517     kvmppc_pmd_free(pmd);
0518 }
0519 
0520 static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
0521                   unsigned int lpid)
0522 {
0523     unsigned long iu;
0524     pud_t *p = pud;
0525 
0526     for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
0527         if (!pud_present(*p))
0528             continue;
0529         if (pud_is_leaf(*p)) {
0530             pud_clear(p);
0531         } else {
0532             pmd_t *pmd;
0533 
0534             pmd = pmd_offset(p, 0);
0535             kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
0536             pud_clear(p);
0537         }
0538     }
0539     pud_free(kvm->mm, pud);
0540 }
0541 
0542 void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
0543 {
0544     unsigned long ig;
0545 
0546     for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
0547         p4d_t *p4d = p4d_offset(pgd, 0);
0548         pud_t *pud;
0549 
0550         if (!p4d_present(*p4d))
0551             continue;
0552         pud = pud_offset(p4d, 0);
0553         kvmppc_unmap_free_pud(kvm, pud, lpid);
0554         p4d_clear(p4d);
0555     }
0556 }
0557 
0558 void kvmppc_free_radix(struct kvm *kvm)
0559 {
0560     if (kvm->arch.pgtable) {
0561         kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
0562                       kvm->arch.lpid);
0563         pgd_free(kvm->mm, kvm->arch.pgtable);
0564         kvm->arch.pgtable = NULL;
0565     }
0566 }
0567 
0568 static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
0569                     unsigned long gpa, unsigned int lpid)
0570 {
0571     pte_t *pte = pte_offset_kernel(pmd, 0);
0572 
0573     /*
0574      * Clearing the pmd entry then flushing the PWC ensures that the pte
0575      * page will no longer be cached by the MMU, so it can be freed
0576      * without flushing the PWC again.
0577      */
0578     pmd_clear(pmd);
0579     kvmppc_radix_flush_pwc(kvm, lpid);
0580 
0581     kvmppc_unmap_free_pte(kvm, pte, false, lpid);
0582 }
0583 
0584 static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
0585                     unsigned long gpa, unsigned int lpid)
0586 {
0587     pmd_t *pmd = pmd_offset(pud, 0);
0588 
0589     /*
0590      * Clearing the pud entry then flushing the PWC ensures that the pmd
0591      * page and any child pte pages will no longer be cached by the MMU,
0592      * so they can be freed without flushing the PWC again.
0593      */
0594     pud_clear(pud);
0595     kvmppc_radix_flush_pwc(kvm, lpid);
0596 
0597     kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
0598 }
0599 
0600 /*
0601  * A number of bits may differ between different faults to the same
0602  * partition-scoped entry: the R and C bits change in the course of
0603  * cleaning and aging, and the write bit can change, either because the
0604  * access has been upgraded or because a read fault raced with a write
0605  * fault that set those bits first.
0606  */
0607 #define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
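     /*
      * kvmppc_create_pte() uses this mask in its WARN_ON_ONCE() checks to
      * assert that, when it merges into an already-valid entry, nothing
      * but the write/dirty/accessed bits differs from what the racing
      * fault installed.
      */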
0608 
0609 int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
0610               unsigned long gpa, unsigned int level,
0611               unsigned long mmu_seq, unsigned int lpid,
0612               unsigned long *rmapp, struct rmap_nested **n_rmap)
0613 {
0614     pgd_t *pgd;
0615     p4d_t *p4d;
0616     pud_t *pud, *new_pud = NULL;
0617     pmd_t *pmd, *new_pmd = NULL;
0618     pte_t *ptep, *new_ptep = NULL;
0619     int ret;
0620 
0621     /* Traverse the guest's 2nd-level tree, allocate new levels needed */
0622     pgd = pgtable + pgd_index(gpa);
0623     p4d = p4d_offset(pgd, gpa);
0624 
0625     pud = NULL;
0626     if (p4d_present(*p4d))
0627         pud = pud_offset(p4d, gpa);
0628     else
0629         new_pud = pud_alloc_one(kvm->mm, gpa);
0630 
0631     pmd = NULL;
0632     if (pud && pud_present(*pud) && !pud_is_leaf(*pud))
0633         pmd = pmd_offset(pud, gpa);
0634     else if (level <= 1)
0635         new_pmd = kvmppc_pmd_alloc();
0636 
0637     if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
0638         new_ptep = kvmppc_pte_alloc();
0639 
0640     /* Check if we might have been invalidated; let the guest retry if so */
0641     spin_lock(&kvm->mmu_lock);
0642     ret = -EAGAIN;
0643     if (mmu_invalidate_retry(kvm, mmu_seq))
0644         goto out_unlock;
0645 
0646     /* Now traverse again under the lock and change the tree */
0647     ret = -ENOMEM;
0648     if (p4d_none(*p4d)) {
0649         if (!new_pud)
0650             goto out_unlock;
0651         p4d_populate(kvm->mm, p4d, new_pud);
0652         new_pud = NULL;
0653     }
0654     pud = pud_offset(p4d, gpa);
0655     if (pud_is_leaf(*pud)) {
0656         unsigned long hgpa = gpa & PUD_MASK;
0657 
0658         /* Check if we raced and someone else has set the same thing */
0659         if (level == 2) {
0660             if (pud_raw(*pud) == pte_raw(pte)) {
0661                 ret = 0;
0662                 goto out_unlock;
0663             }
0664             /* Valid 1GB page here already, add our extra bits */
0665             WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
0666                             PTE_BITS_MUST_MATCH);
0667             kvmppc_radix_update_pte(kvm, (pte_t *)pud,
0668                           0, pte_val(pte), hgpa, PUD_SHIFT);
0669             ret = 0;
0670             goto out_unlock;
0671         }
0672         /*
0673          * If we raced with another CPU which has just put
0674          * a 1GB pte in after we saw a pmd page, try again.
0675          */
0676         if (!new_pmd) {
0677             ret = -EAGAIN;
0678             goto out_unlock;
0679         }
0680         /* Valid 1GB page here already, remove it */
0681         kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
0682                  lpid);
0683     }
0684     if (level == 2) {
0685         if (!pud_none(*pud)) {
0686             /*
0687              * There's a page table page here, but we wanted to
0688              * install a large page, so remove and free the page
0689              * table page.
0690              */
0691             kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
0692         }
0693         kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
0694         if (rmapp && n_rmap)
0695             kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
0696         ret = 0;
0697         goto out_unlock;
0698     }
0699     if (pud_none(*pud)) {
0700         if (!new_pmd)
0701             goto out_unlock;
0702         pud_populate(kvm->mm, pud, new_pmd);
0703         new_pmd = NULL;
0704     }
0705     pmd = pmd_offset(pud, gpa);
0706     if (pmd_is_leaf(*pmd)) {
0707         unsigned long lgpa = gpa & PMD_MASK;
0708 
0709         /* Check if we raced and someone else has set the same thing */
0710         if (level == 1) {
0711             if (pmd_raw(*pmd) == pte_raw(pte)) {
0712                 ret = 0;
0713                 goto out_unlock;
0714             }
0715             /* Valid 2MB page here already, add our extra bits */
0716             WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
0717                             PTE_BITS_MUST_MATCH);
0718             kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
0719                     0, pte_val(pte), lgpa, PMD_SHIFT);
0720             ret = 0;
0721             goto out_unlock;
0722         }
0723 
0724         /*
0725          * If we raced with another CPU which has just put
0726          * a 2MB pte in after we saw a pte page, try again.
0727          */
0728         if (!new_ptep) {
0729             ret = -EAGAIN;
0730             goto out_unlock;
0731         }
0732         /* Valid 2MB page here already, remove it */
0733         kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
0734                  lpid);
0735     }
0736     if (level == 1) {
0737         if (!pmd_none(*pmd)) {
0738             /*
0739              * There's a page table page here, but we wanted to
0740              * install a large page, so remove and free the page
0741              * table page.
0742              */
0743             kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
0744         }
0745         kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
0746         if (rmapp && n_rmap)
0747             kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
0748         ret = 0;
0749         goto out_unlock;
0750     }
0751     if (pmd_none(*pmd)) {
0752         if (!new_ptep)
0753             goto out_unlock;
0754         pmd_populate(kvm->mm, pmd, new_ptep);
0755         new_ptep = NULL;
0756     }
0757     ptep = pte_offset_kernel(pmd, gpa);
0758     if (pte_present(*ptep)) {
0759         /* Check if someone else set the same thing */
0760         if (pte_raw(*ptep) == pte_raw(pte)) {
0761             ret = 0;
0762             goto out_unlock;
0763         }
0764         /* Valid page here already, add our extra bits */
0765         WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
0766                             PTE_BITS_MUST_MATCH);
0767         kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
0768         ret = 0;
0769         goto out_unlock;
0770     }
0771     kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
0772     if (rmapp && n_rmap)
0773         kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
0774     ret = 0;
0775 
0776  out_unlock:
0777     spin_unlock(&kvm->mmu_lock);
0778     if (new_pud)
0779         pud_free(kvm->mm, new_pud);
0780     if (new_pmd)
0781         kvmppc_pmd_free(new_pmd);
0782     if (new_ptep)
0783         kvmppc_pte_free(new_ptep);
0784     return ret;
0785 }
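     /*
      * Summary of the function above: level 0 installs a base-page PTE,
      * level 1 a 2MB leaf at the pmd, and level 2 a 1GB leaf at the pud.
      * New page-table pages are allocated before taking kvm->mmu_lock,
      * mmu_seq is rechecked under the lock (-EAGAIN makes the guest retry
      * if an invalidation raced with us), and any preallocated but unused
      * pages are freed on the way out.
      */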
0786 
0787 bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
0788                  unsigned long gpa, unsigned int lpid)
0789 {
0790     unsigned long pgflags;
0791     unsigned int shift;
0792     pte_t *ptep;
0793 
0794     /*
0795      * Need to set an R or C bit in the 2nd-level tables;
0796      * since we are just helping out the hardware here,
0797      * it is sufficient to do what the hardware does.
0798      */
0799     pgflags = _PAGE_ACCESSED;
0800     if (writing)
0801         pgflags |= _PAGE_DIRTY;
0802 
0803     if (nested)
0804         ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
0805     else
0806         ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
0807 
0808     if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
0809         kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
0810         return true;
0811     }
0812     return false;
0813 }
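     /*
      * Returns true if the R (and, for stores, C) bit could be set in the
      * partition-scoped PTE, false if the caller must fall back to the
      * full page-fault path (e.g. the PTE is absent or not writable).
      */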
0814 
0815 int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
0816                    unsigned long gpa,
0817                    struct kvm_memory_slot *memslot,
0818                    bool writing, bool kvm_ro,
0819                    pte_t *inserted_pte, unsigned int *levelp)
0820 {
0821     struct kvm *kvm = vcpu->kvm;
0822     struct page *page = NULL;
0823     unsigned long mmu_seq;
0824     unsigned long hva, gfn = gpa >> PAGE_SHIFT;
0825     bool upgrade_write = false;
0826     bool *upgrade_p = &upgrade_write;
0827     pte_t pte, *ptep;
0828     unsigned int shift, level;
0829     int ret;
0830     bool large_enable;
0831 
0832     /* used to check for invalidations in progress */
0833     mmu_seq = kvm->mmu_invalidate_seq;
0834     smp_rmb();
0835 
0836     /*
0837      * Do a fast check first, since __gfn_to_pfn_memslot doesn't
0838      * do it with !atomic && !async, which is how we call it.
0839      * We always ask for write permission since the common case
0840      * is that the page is writable.
0841      */
0842     hva = gfn_to_hva_memslot(memslot, gfn);
0843     if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
0844         upgrade_write = true;
0845     } else {
0846         unsigned long pfn;
0847 
0848         /* Call KVM generic code to do the slow-path check */
0849         pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
0850                        writing, upgrade_p, NULL);
0851         if (is_error_noslot_pfn(pfn))
0852             return -EFAULT;
0853         page = NULL;
0854         if (pfn_valid(pfn)) {
0855             page = pfn_to_page(pfn);
0856             if (PageReserved(page))
0857                 page = NULL;
0858         }
0859     }
0860 
0861     /*
0862      * Read the PTE from the process' radix tree and use that
0863      * so we get the shift and attribute bits.
0864      */
0865     spin_lock(&kvm->mmu_lock);
0866     ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
0867     pte = __pte(0);
0868     if (ptep)
0869         pte = READ_ONCE(*ptep);
0870     spin_unlock(&kvm->mmu_lock);
0871     /*
0872      * If the PTE disappeared temporarily due to a THP
0873      * collapse, just return and let the guest try again.
0874      */
0875     if (!pte_present(pte)) {
0876         if (page)
0877             put_page(page);
0878         return RESUME_GUEST;
0879     }
0880 
0881     /* If we're logging dirty pages, always map single pages */
0882     large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
0883 
0884     /* Get pte level from shift/size */
0885     if (large_enable && shift == PUD_SHIFT &&
0886         (gpa & (PUD_SIZE - PAGE_SIZE)) ==
0887         (hva & (PUD_SIZE - PAGE_SIZE))) {
0888         level = 2;
0889     } else if (large_enable && shift == PMD_SHIFT &&
0890            (gpa & (PMD_SIZE - PAGE_SIZE)) ==
0891            (hva & (PMD_SIZE - PAGE_SIZE))) {
0892         level = 1;
0893     } else {
0894         level = 0;
0895         if (shift > PAGE_SHIFT) {
0896             /*
0897              * If the pte maps more than one page, bring over
0898              * bits from the virtual address to get the real
0899              * address of the specific single page we want.
0900              */
0901             unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
0902             pte = __pte(pte_val(pte) | (hva & rpnmask));
0903         }
0904     }
0905 
0906     pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
0907     if (writing || upgrade_write) {
0908         if (pte_val(pte) & _PAGE_WRITE)
0909             pte = __pte(pte_val(pte) | _PAGE_DIRTY);
0910     } else {
0911         pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
0912     }
0913 
0914     /* Allocate space in the tree and write the PTE */
0915     ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
0916                 mmu_seq, kvm->arch.lpid, NULL, NULL);
0917     if (inserted_pte)
0918         *inserted_pte = pte;
0919     if (levelp)
0920         *levelp = level;
0921 
0922     if (page) {
0923         if (!ret && (pte_val(pte) & _PAGE_WRITE))
0924             set_page_dirty_lock(page);
0925         put_page(page);
0926     }
0927 
0928     /* Increment number of large pages if we (successfully) inserted one */
0929     if (!ret) {
0930         if (level == 1)
0931             kvm->stat.num_2M_pages++;
0932         else if (level == 2)
0933             kvm->stat.num_1G_pages++;
0934     }
0935 
0936     return ret;
0937 }
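     /*
      * Flow of the fault path above: try a fast get_user_page_fast_only()
      * with write permission (unless the memslot is read-only), fall back
      * to __gfn_to_pfn_memslot() otherwise, then read the host PTE under
      * mmu_lock to pick up its size and attribute bits.  The mapping level
      * (1GB/2MB/base page) is chosen from the host page size and the
      * gpa/hva alignment, the PTE is inserted with kvmppc_create_pte(),
      * and the 2MB/1GB counters are bumped on success.
      */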
0938 
0939 int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
0940                    unsigned long ea, unsigned long dsisr)
0941 {
0942     struct kvm *kvm = vcpu->kvm;
0943     unsigned long gpa, gfn;
0944     struct kvm_memory_slot *memslot;
0945     long ret;
0946     bool writing = !!(dsisr & DSISR_ISSTORE);
0947     bool kvm_ro = false;
0948 
0949     /* Check for unusual errors */
0950     if (dsisr & DSISR_UNSUPP_MMU) {
0951         pr_err("KVM: Got unsupported MMU fault\n");
0952         return -EFAULT;
0953     }
0954     if (dsisr & DSISR_BADACCESS) {
0955         /* Reflect to the guest as DSI */
0956         pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
0957         kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
0958         return RESUME_GUEST;
0959     }
0960 
0961     /* Translate the logical address */
0962     gpa = vcpu->arch.fault_gpa & ~0xfffUL;
0963     gpa &= ~0xF000000000000000ul;
0964     gfn = gpa >> PAGE_SHIFT;
0965     if (!(dsisr & DSISR_PRTABLE_FAULT))
0966         gpa |= ea & 0xfff;
0967 
0968     if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
0969         return kvmppc_send_page_to_uv(kvm, gfn);
0970 
0971     /* Get the corresponding memslot */
0972     memslot = gfn_to_memslot(kvm, gfn);
0973 
0974     /* No memslot means it's an emulated MMIO region */
0975     if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
0976         if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
0977                  DSISR_SET_RC)) {
0978             /*
0979              * Bad address in guest page table tree, or other
0980              * unusual error - reflect it to the guest as DSI.
0981              */
0982             kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
0983             return RESUME_GUEST;
0984         }
0985         return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
0986     }
0987 
0988     if (memslot->flags & KVM_MEM_READONLY) {
0989         if (writing) {
0990             /* give the guest a DSI */
0991             kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
0992                                DSISR_PROTFAULT);
0993             return RESUME_GUEST;
0994         }
0995         kvm_ro = true;
0996     }
0997 
0998     /* Failed to set the reference/change bits */
0999     if (dsisr & DSISR_SET_RC) {
1000         spin_lock(&kvm->mmu_lock);
1001         if (kvmppc_hv_handle_set_rc(kvm, false, writing,
1002                         gpa, kvm->arch.lpid))
1003             dsisr &= ~DSISR_SET_RC;
1004         spin_unlock(&kvm->mmu_lock);
1005 
1006         if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
1007                    DSISR_PROTFAULT | DSISR_SET_RC)))
1008             return RESUME_GUEST;
1009     }
1010 
1011     /* Try to insert a pte */
1012     ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
1013                          kvm_ro, NULL, NULL);
1014 
1015     if (ret == 0 || ret == -EAGAIN)
1016         ret = RESUME_GUEST;
1017     return ret;
1018 }
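     /*
      * Decision tree of the handler above: unsupported MMU faults are
      * fatal; bad accesses are reflected to the guest as a DSI; a missing
      * or invalid memslot means emulated MMIO; a store to a read-only
      * memslot gets a protection-fault DSI; a pure reference/change-bit
      * fault is resolved in place via kvmppc_hv_handle_set_rc(); anything
      * left falls through to kvmppc_book3s_instantiate_page().
      */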
1019 
1020 /* Called with kvm->mmu_lock held */
1021 void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1022              unsigned long gfn)
1023 {
1024     pte_t *ptep;
1025     unsigned long gpa = gfn << PAGE_SHIFT;
1026     unsigned int shift;
1027 
1028     if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
1029         uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
1030         return;
1031     }
1032 
1033     ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1034     if (ptep && pte_present(*ptep))
1035         kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
1036                  kvm->arch.lpid);
1037 }
1038 
1039 /* Called with kvm->mmu_lock held */
1040 bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1041            unsigned long gfn)
1042 {
1043     pte_t *ptep;
1044     unsigned long gpa = gfn << PAGE_SHIFT;
1045     unsigned int shift;
1046     bool ref = false;
1047     unsigned long old, *rmapp;
1048 
1049     if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1050         return ref;
1051 
1052     ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1053     if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
1054         old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
1055                           gpa, shift);
1056         /* XXX need to flush tlb here? */
1057         /* Also clear bit in ptes in shadow pgtable for nested guests */
1058         rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1059         kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
1060                            old & PTE_RPN_MASK,
1061                            1UL << shift);
1062         ref = true;
1063     }
1064     return ref;
1065 }
1066 
1067 /* Called with kvm->mmu_lock held */
1068 bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1069             unsigned long gfn)
1070 
1071 {
1072     pte_t *ptep;
1073     unsigned long gpa = gfn << PAGE_SHIFT;
1074     unsigned int shift;
1075     bool ref = false;
1076 
1077     if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1078         return ref;
1079 
1080     ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1081     if (ptep && pte_present(*ptep) && pte_young(*ptep))
1082         ref = true;
1083     return ref;
1084 }
1085 
1086 /* Returns the number of PAGE_SIZE pages that are dirty */
1087 static int kvm_radix_test_clear_dirty(struct kvm *kvm,
1088                 struct kvm_memory_slot *memslot, int pagenum)
1089 {
1090     unsigned long gfn = memslot->base_gfn + pagenum;
1091     unsigned long gpa = gfn << PAGE_SHIFT;
1092     pte_t *ptep, pte;
1093     unsigned int shift;
1094     int ret = 0;
1095     unsigned long old, *rmapp;
1096 
1097     if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1098         return ret;
1099 
1100     /*
1101      * For performance reasons we don't hold kvm->mmu_lock while walking the
1102      * partition scoped table.
1103      */
1104     ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
1105     if (!ptep)
1106         return 0;
1107 
1108     pte = READ_ONCE(*ptep);
1109     if (pte_present(pte) && pte_dirty(pte)) {
1110         spin_lock(&kvm->mmu_lock);
1111         /*
1112          * Recheck the pte now that we hold the lock
1113          */
1114         if (pte_val(pte) != pte_val(*ptep)) {
1115             /*
1116              * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
1117              * only find PAGE_SIZE pte entries here. We can continue
1118              * to use the pte addr returned by above page table
1119              * walk.
1120              */
1121             if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
1122                 spin_unlock(&kvm->mmu_lock);
1123                 return 0;
1124             }
1125         }
1126 
1127         ret = 1;
1128         VM_BUG_ON(shift);
1129         old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
1130                           gpa, shift);
1131         kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
1132         /* Also clear bit in ptes in shadow pgtable for nested guests */
1133         rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1134         kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
1135                            old & PTE_RPN_MASK,
1136                            1UL << shift);
1137         spin_unlock(&kvm->mmu_lock);
1138     }
1139     return ret;
1140 }
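     /*
      * With dirty logging enabled the memslot is mapped with base pages
      * only (hence the VM_BUG_ON(shift) above), so this returns either 0
      * or 1; the rmap walk also clears the C bit in any nested shadow
      * page tables covering the same host page.
      */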
1141 
1142 long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
1143             struct kvm_memory_slot *memslot, unsigned long *map)
1144 {
1145     unsigned long i, j;
1146     int npages;
1147 
1148     for (i = 0; i < memslot->npages; i = j) {
1149         npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
1150 
1151         /*
1152          * Note that if npages > 0 then i must be a multiple of npages,
1153          * since huge pages are only used to back the guest at guest
1154          * real addresses that are a multiple of their size.
1155          * Since we have at most one PTE covering any given guest
1156          * real address, if npages > 1 we can skip to i + npages.
1157          */
1158         j = i + 1;
1159         if (npages) {
1160             set_dirty_bits(map, i, npages);
1161             j = i + npages;
1162         }
1163     }
1164     return 0;
1165 }
1166 
1167 void kvmppc_radix_flush_memslot(struct kvm *kvm,
1168                 const struct kvm_memory_slot *memslot)
1169 {
1170     unsigned long n;
1171     pte_t *ptep;
1172     unsigned long gpa;
1173     unsigned int shift;
1174 
1175     if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
1176         kvmppc_uvmem_drop_pages(memslot, kvm, true);
1177 
1178     if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1179         return;
1180 
1181     gpa = memslot->base_gfn << PAGE_SHIFT;
1182     spin_lock(&kvm->mmu_lock);
1183     for (n = memslot->npages; n; --n) {
1184         ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1185         if (ptep && pte_present(*ptep))
1186             kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
1187                      kvm->arch.lpid);
1188         gpa += PAGE_SIZE;
1189     }
1190     /*
1191      * Increase the mmu notifier sequence number to prevent any page
1192      * fault that read the memslot earlier from writing a PTE.
1193      */
1194     kvm->mmu_invalidate_seq++;
1195     spin_unlock(&kvm->mmu_lock);
1196 }
1197 
1198 static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
1199                  int psize, int *indexp)
1200 {
1201     if (!mmu_psize_defs[psize].shift)
1202         return;
1203     info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
1204         (mmu_psize_defs[psize].ap << 29);
1205     ++(*indexp);
1206 }
1207 
1208 int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
1209 {
1210     int i;
1211 
1212     if (!radix_enabled())
1213         return -EINVAL;
1214     memset(info, 0, sizeof(*info));
1215 
1216     /* 4k page size */
1217     info->geometries[0].page_shift = 12;
1218     info->geometries[0].level_bits[0] = 9;
1219     for (i = 1; i < 4; ++i)
1220         info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
1221     /* 64k page size */
1222     info->geometries[1].page_shift = 16;
1223     for (i = 0; i < 4; ++i)
1224         info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];
1225 
1226     i = 0;
1227     add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
1228     add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
1229     add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
1230     add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
1231 
1232     return 0;
1233 }
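     /*
      * Each ap_encodings entry above packs log2(page size) into the low
      * bits and the AP field into bits 31:29, the format userspace
      * expects from the KVM_PPC_GET_RMMU_INFO ioctl.
      */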
1234 
1235 int kvmppc_init_vm_radix(struct kvm *kvm)
1236 {
1237     kvm->arch.pgtable = pgd_alloc(kvm->mm);
1238     if (!kvm->arch.pgtable)
1239         return -ENOMEM;
1240     return 0;
1241 }
1242 
1243 static void pte_ctor(void *addr)
1244 {
1245     memset(addr, 0, RADIX_PTE_TABLE_SIZE);
1246 }
1247 
1248 static void pmd_ctor(void *addr)
1249 {
1250     memset(addr, 0, RADIX_PMD_TABLE_SIZE);
1251 }
1252 
1253 struct debugfs_radix_state {
1254     struct kvm  *kvm;
1255     struct mutex    mutex;
1256     unsigned long   gpa;
1257     int     lpid;
1258     int     chars_left;
1259     int     buf_index;
1260     char        buf[128];
1261     u8      hdr;
1262 };
1263 
1264 static int debugfs_radix_open(struct inode *inode, struct file *file)
1265 {
1266     struct kvm *kvm = inode->i_private;
1267     struct debugfs_radix_state *p;
1268 
1269     p = kzalloc(sizeof(*p), GFP_KERNEL);
1270     if (!p)
1271         return -ENOMEM;
1272 
1273     kvm_get_kvm(kvm);
1274     p->kvm = kvm;
1275     mutex_init(&p->mutex);
1276     file->private_data = p;
1277 
1278     return nonseekable_open(inode, file);
1279 }
1280 
1281 static int debugfs_radix_release(struct inode *inode, struct file *file)
1282 {
1283     struct debugfs_radix_state *p = file->private_data;
1284 
1285     kvm_put_kvm(p->kvm);
1286     kfree(p);
1287     return 0;
1288 }
1289 
1290 static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
1291                  size_t len, loff_t *ppos)
1292 {
1293     struct debugfs_radix_state *p = file->private_data;
1294     ssize_t ret, r;
1295     unsigned long n;
1296     struct kvm *kvm;
1297     unsigned long gpa;
1298     pgd_t *pgt;
1299     struct kvm_nested_guest *nested;
1300     pgd_t *pgdp;
1301     p4d_t p4d, *p4dp;
1302     pud_t pud, *pudp;
1303     pmd_t pmd, *pmdp;
1304     pte_t *ptep;
1305     int shift;
1306     unsigned long pte;
1307 
1308     kvm = p->kvm;
1309     if (!kvm_is_radix(kvm))
1310         return 0;
1311 
1312     ret = mutex_lock_interruptible(&p->mutex);
1313     if (ret)
1314         return ret;
1315 
1316     if (p->chars_left) {
1317         n = p->chars_left;
1318         if (n > len)
1319             n = len;
1320         r = copy_to_user(buf, p->buf + p->buf_index, n);
1321         n -= r;
1322         p->chars_left -= n;
1323         p->buf_index += n;
1324         buf += n;
1325         len -= n;
1326         ret = n;
1327         if (r) {
1328             if (!n)
1329                 ret = -EFAULT;
1330             goto out;
1331         }
1332     }
1333 
1334     gpa = p->gpa;
1335     nested = NULL;
1336     pgt = NULL;
1337     while (len != 0 && p->lpid >= 0) {
1338         if (gpa >= RADIX_PGTABLE_RANGE) {
1339             gpa = 0;
1340             pgt = NULL;
1341             if (nested) {
1342                 kvmhv_put_nested(nested);
1343                 nested = NULL;
1344             }
1345             p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
1346             p->hdr = 0;
1347             if (p->lpid < 0)
1348                 break;
1349         }
1350         if (!pgt) {
1351             if (p->lpid == 0) {
1352                 pgt = kvm->arch.pgtable;
1353             } else {
1354                 nested = kvmhv_get_nested(kvm, p->lpid, false);
1355                 if (!nested) {
1356                     gpa = RADIX_PGTABLE_RANGE;
1357                     continue;
1358                 }
1359                 pgt = nested->shadow_pgtable;
1360             }
1361         }
1362         n = 0;
1363         if (!p->hdr) {
1364             if (p->lpid > 0)
1365                 n = scnprintf(p->buf, sizeof(p->buf),
1366                           "\nNested LPID %d: ", p->lpid);
1367             n += scnprintf(p->buf + n, sizeof(p->buf) - n,
1368                       "pgdir: %lx\n", (unsigned long)pgt);
1369             p->hdr = 1;
1370             goto copy;
1371         }
1372 
1373         pgdp = pgt + pgd_index(gpa);
1374         p4dp = p4d_offset(pgdp, gpa);
1375         p4d = READ_ONCE(*p4dp);
1376         if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
1377             gpa = (gpa & P4D_MASK) + P4D_SIZE;
1378             continue;
1379         }
1380 
1381         pudp = pud_offset(&p4d, gpa);
1382         pud = READ_ONCE(*pudp);
1383         if (!(pud_val(pud) & _PAGE_PRESENT)) {
1384             gpa = (gpa & PUD_MASK) + PUD_SIZE;
1385             continue;
1386         }
1387         if (pud_val(pud) & _PAGE_PTE) {
1388             pte = pud_val(pud);
1389             shift = PUD_SHIFT;
1390             goto leaf;
1391         }
1392 
1393         pmdp = pmd_offset(&pud, gpa);
1394         pmd = READ_ONCE(*pmdp);
1395         if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
1396             gpa = (gpa & PMD_MASK) + PMD_SIZE;
1397             continue;
1398         }
1399         if (pmd_val(pmd) & _PAGE_PTE) {
1400             pte = pmd_val(pmd);
1401             shift = PMD_SHIFT;
1402             goto leaf;
1403         }
1404 
1405         ptep = pte_offset_kernel(&pmd, gpa);
1406         pte = pte_val(READ_ONCE(*ptep));
1407         if (!(pte & _PAGE_PRESENT)) {
1408             gpa += PAGE_SIZE;
1409             continue;
1410         }
1411         shift = PAGE_SHIFT;
1412     leaf:
1413         n = scnprintf(p->buf, sizeof(p->buf),
1414                   " %lx: %lx %d\n", gpa, pte, shift);
1415         gpa += 1ul << shift;
1416     copy:
1417         p->chars_left = n;
1418         if (n > len)
1419             n = len;
1420         r = copy_to_user(buf, p->buf, n);
1421         n -= r;
1422         p->chars_left -= n;
1423         p->buf_index = n;
1424         buf += n;
1425         len -= n;
1426         ret += n;
1427         if (r) {
1428             if (!ret)
1429                 ret = -EFAULT;
1430             break;
1431         }
1432     }
1433     p->gpa = gpa;
1434     if (nested)
1435         kvmhv_put_nested(nested);
1436 
1437  out:
1438     mutex_unlock(&p->mutex);
1439     return ret;
1440 }
1441 
1442 static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
1443                size_t len, loff_t *ppos)
1444 {
1445     return -EACCES;
1446 }
1447 
1448 static const struct file_operations debugfs_radix_fops = {
1449     .owner   = THIS_MODULE,
1450     .open    = debugfs_radix_open,
1451     .release = debugfs_radix_release,
1452     .read    = debugfs_radix_read,
1453     .write   = debugfs_radix_write,
1454     .llseek  = generic_file_llseek,
1455 };
1456 
1457 void kvmhv_radix_debugfs_init(struct kvm *kvm)
1458 {
1459     debugfs_create_file("radix", 0400, kvm->debugfs_dentry, kvm,
1460                 &debugfs_radix_fops);
1461 }
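     /*
      * The debugfs file created above dumps one line per valid leaf in
      * the form " <gpa>: <pte> <shift>", first for the L1 partition-scoped
      * table and then for each nested LPID's shadow table; it is read-only
      * (writes return -EACCES).
      */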
1462 
1463 int kvmppc_radix_init(void)
1464 {
1465     unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
1466 
1467     kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
1468     if (!kvm_pte_cache)
1469         return -ENOMEM;
1470 
1471     size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;
1472 
1473     kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
1474     if (!kvm_pmd_cache) {
1475         kmem_cache_destroy(kvm_pte_cache);
1476         return -ENOMEM;
1477     }
1478 
1479     return 0;
1480 }
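     /*
      * The caches above are created with alignment equal to the object
      * size because the radix hardware requires each page-directory page
      * to be naturally aligned (the tree walk rejects a base with low bits
      * set), which slab's default alignment would not guarantee.
      */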
1481 
1482 void kvmppc_radix_exit(void)
1483 {
1484     kmem_cache_destroy(kvm_pte_cache);
1485     kmem_cache_destroy(kvm_pmd_cache);
1486 }