0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
0004  * No bombay mix was harmed in the writing of this file.
0005  *
0006  * Copyright (C) 2020 Google LLC
0007  * Author: Will Deacon <will@kernel.org>
0008  */
0009 
0010 #include <linux/bitfield.h>
0011 #include <asm/kvm_pgtable.h>
0012 #include <asm/stage2_pgtable.h>
0013 
0014 
0015 #define KVM_PTE_TYPE            BIT(1)
0016 #define KVM_PTE_TYPE_BLOCK      0
0017 #define KVM_PTE_TYPE_PAGE       1
0018 #define KVM_PTE_TYPE_TABLE      1
0019 
0020 #define KVM_PTE_LEAF_ATTR_LO        GENMASK(11, 2)
0021 
0022 #define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
0023 #define KVM_PTE_LEAF_ATTR_LO_S1_AP  GENMASK(7, 6)
0024 #define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO   3
0025 #define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW   1
0026 #define KVM_PTE_LEAF_ATTR_LO_S1_SH  GENMASK(9, 8)
0027 #define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS   3
0028 #define KVM_PTE_LEAF_ATTR_LO_S1_AF  BIT(10)
0029 
0030 #define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2)
0031 #define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R  BIT(6)
0032 #define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W  BIT(7)
0033 #define KVM_PTE_LEAF_ATTR_LO_S2_SH  GENMASK(9, 8)
0034 #define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS   3
0035 #define KVM_PTE_LEAF_ATTR_LO_S2_AF  BIT(10)
0036 
0037 #define KVM_PTE_LEAF_ATTR_HI        GENMASK(63, 51)
0038 
0039 #define KVM_PTE_LEAF_ATTR_HI_SW     GENMASK(58, 55)
0040 
0041 #define KVM_PTE_LEAF_ATTR_HI_S1_XN  BIT(54)
0042 
0043 #define KVM_PTE_LEAF_ATTR_HI_S2_XN  BIT(54)
0044 
0045 #define KVM_PTE_LEAF_ATTR_S2_PERMS  (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
0046                      KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
0047                      KVM_PTE_LEAF_ATTR_HI_S2_XN)
0048 
0049 #define KVM_INVALID_PTE_OWNER_MASK  GENMASK(9, 2)
0050 #define KVM_MAX_OWNER_ID        1
0051 
0052 struct kvm_pgtable_walk_data {
0053     struct kvm_pgtable      *pgt;
0054     struct kvm_pgtable_walker   *walker;
0055 
0056     u64             addr;
0057     u64             end;
0058 };
0059 
0060 #define KVM_PHYS_INVALID (-1ULL)
0061 
0062 static bool kvm_phys_is_valid(u64 phys)
0063 {
0064     return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_PARANGE_MAX));
0065 }
0066 
0067 static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
0068 {
0069     u64 granule = kvm_granule_size(level);
0070 
0071     if (!kvm_level_supports_block_mapping(level))
0072         return false;
0073 
0074     if (granule > (end - addr))
0075         return false;
0076 
0077     if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule))
0078         return false;
0079 
0080     return IS_ALIGNED(addr, granule);
0081 }
0082 
0083 static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
0084 {
0085     u64 shift = kvm_granule_shift(level);
0086     u64 mask = BIT(PAGE_SHIFT - 3) - 1;
0087 
0088     return (data->addr >> shift) & mask;
0089 }
0090 
0091 static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
0092 {
0093     u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
0094     u64 mask = BIT(pgt->ia_bits) - 1;
0095 
0096     return (addr & mask) >> shift;
0097 }
0098 
0099 static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data)
0100 {
0101     return __kvm_pgd_page_idx(data->pgt, data->addr);
0102 }
0103 
0104 static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
0105 {
0106     struct kvm_pgtable pgt = {
0107         .ia_bits    = ia_bits,
0108         .start_level    = start_level,
0109     };
0110 
0111     return __kvm_pgd_page_idx(&pgt, -1ULL) + 1;
0112 }
0113 
0114 static bool kvm_pte_table(kvm_pte_t pte, u32 level)
0115 {
0116     if (level == KVM_PGTABLE_MAX_LEVELS - 1)
0117         return false;
0118 
0119     if (!kvm_pte_valid(pte))
0120         return false;
0121 
0122     return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
0123 }
0124 
0125 static kvm_pte_t kvm_phys_to_pte(u64 pa)
0126 {
0127     kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
0128 
0129     if (PAGE_SHIFT == 16)
0130         pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
0131 
0132     return pte;
0133 }
0134 
0135 static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
0136 {
0137     return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));
0138 }
0139 
0140 static void kvm_clear_pte(kvm_pte_t *ptep)
0141 {
0142     WRITE_ONCE(*ptep, 0);
0143 }
0144 
0145 static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp,
0146                   struct kvm_pgtable_mm_ops *mm_ops)
0147 {
0148     kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp));
0149 
0150     pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
0151     pte |= KVM_PTE_VALID;
0152 
0153     WARN_ON(kvm_pte_valid(old));
0154     smp_store_release(ptep, pte);
0155 }
0156 
0157 static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
0158 {
0159     kvm_pte_t pte = kvm_phys_to_pte(pa);
0160     u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
0161                                KVM_PTE_TYPE_BLOCK;
0162 
0163     pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
0164     pte |= FIELD_PREP(KVM_PTE_TYPE, type);
0165     pte |= KVM_PTE_VALID;
0166 
0167     return pte;
0168 }
0169 
0170 static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
0171 {
0172     return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
0173 }
0174 
0175 static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
0176                   u32 level, kvm_pte_t *ptep,
0177                   enum kvm_pgtable_walk_flags flag)
0178 {
0179     struct kvm_pgtable_walker *walker = data->walker;
0180     return walker->cb(addr, data->end, level, ptep, flag, walker->arg);
0181 }
0182 
0183 static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
0184                   kvm_pte_t *pgtable, u32 level);
0185 
0186 static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
0187                       kvm_pte_t *ptep, u32 level)
0188 {
0189     int ret = 0;
0190     u64 addr = data->addr;
0191     kvm_pte_t *childp, pte = *ptep;
0192     bool table = kvm_pte_table(pte, level);
0193     enum kvm_pgtable_walk_flags flags = data->walker->flags;
0194 
0195     if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
0196         ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
0197                          KVM_PGTABLE_WALK_TABLE_PRE);
0198     }
0199 
0200     if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) {
0201         ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
0202                          KVM_PGTABLE_WALK_LEAF);
0203         pte = *ptep;
0204         table = kvm_pte_table(pte, level);
0205     }
0206 
0207     if (ret)
0208         goto out;
0209 
0210     if (!table) {
0211         data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level));
0212         data->addr += kvm_granule_size(level);
0213         goto out;
0214     }
0215 
0216     childp = kvm_pte_follow(pte, data->pgt->mm_ops);
0217     ret = __kvm_pgtable_walk(data, childp, level + 1);
0218     if (ret)
0219         goto out;
0220 
0221     if (flags & KVM_PGTABLE_WALK_TABLE_POST) {
0222         ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
0223                          KVM_PGTABLE_WALK_TABLE_POST);
0224     }
0225 
0226 out:
0227     return ret;
0228 }
0229 
0230 static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
0231                   kvm_pte_t *pgtable, u32 level)
0232 {
0233     u32 idx;
0234     int ret = 0;
0235 
0236     if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
0237         return -EINVAL;
0238 
0239     for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
0240         kvm_pte_t *ptep = &pgtable[idx];
0241 
0242         if (data->addr >= data->end)
0243             break;
0244 
0245         ret = __kvm_pgtable_visit(data, ptep, level);
0246         if (ret)
0247             break;
0248     }
0249 
0250     return ret;
0251 }
0252 
0253 static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data)
0254 {
0255     u32 idx;
0256     int ret = 0;
0257     struct kvm_pgtable *pgt = data->pgt;
0258     u64 limit = BIT(pgt->ia_bits);
0259 
0260     if (data->addr > limit || data->end > limit)
0261         return -ERANGE;
0262 
0263     if (!pgt->pgd)
0264         return -EINVAL;
0265 
0266     for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) {
0267         kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE];
0268 
0269         ret = __kvm_pgtable_walk(data, ptep, pgt->start_level);
0270         if (ret)
0271             break;
0272     }
0273 
0274     return ret;
0275 }
0276 
0277 int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
0278              struct kvm_pgtable_walker *walker)
0279 {
0280     struct kvm_pgtable_walk_data walk_data = {
0281         .pgt    = pgt,
0282         .addr   = ALIGN_DOWN(addr, PAGE_SIZE),
0283         .end    = PAGE_ALIGN(walk_data.addr + size),
0284         .walker = walker,
0285     };
0286 
0287     return _kvm_pgtable_walk(&walk_data);
0288 }
0289 
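/*
 * Example (illustrative sketch only): a minimal user of kvm_pgtable_walk().
 * The walker below counts valid leaf entries over a range. The names
 * 'count_valid_leaves' and 'count_range' and the u64 accumulator passed via
 * 'arg' are hypothetical; everything else is the walker API defined above
 * and in <asm/kvm_pgtable.h>.
 *
 *	static int count_valid_leaves(u64 addr, u64 end, u32 level,
 *				      kvm_pte_t *ptep,
 *				      enum kvm_pgtable_walk_flags flag,
 *				      void * const arg)
 *	{
 *		u64 *nr_valid = arg;
 *
 *		if (kvm_pte_valid(*ptep))
 *			(*nr_valid)++;
 *
 *		return 0;
 *	}
 *
 *	static u64 count_range(struct kvm_pgtable *pgt, u64 addr, u64 size)
 *	{
 *		u64 nr_valid = 0;
 *		struct kvm_pgtable_walker walker = {
 *			.cb	= count_valid_leaves,
 *			.flags	= KVM_PGTABLE_WALK_LEAF,
 *			.arg	= &nr_valid,
 *		};
 *
 *		kvm_pgtable_walk(pgt, addr, size, &walker);
 *		return nr_valid;
 *	}
 */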
0290 struct leaf_walk_data {
0291     kvm_pte_t   pte;
0292     u32     level;
0293 };
0294 
0295 static int leaf_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
0296                enum kvm_pgtable_walk_flags flag, void * const arg)
0297 {
0298     struct leaf_walk_data *data = arg;
0299 
0300     data->pte   = *ptep;
0301     data->level = level;
0302 
0303     return 0;
0304 }
0305 
0306 int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
0307              kvm_pte_t *ptep, u32 *level)
0308 {
0309     struct leaf_walk_data data;
0310     struct kvm_pgtable_walker walker = {
0311         .cb = leaf_walker,
0312         .flags  = KVM_PGTABLE_WALK_LEAF,
0313         .arg    = &data,
0314     };
0315     int ret;
0316 
0317     ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE),
0318                    PAGE_SIZE, &walker);
0319     if (!ret) {
0320         if (ptep)
0321             *ptep  = data.pte;
0322         if (level)
0323             *level = data.level;
0324     }
0325 
0326     return ret;
0327 }
0328 
0329 struct hyp_map_data {
0330     u64             phys;
0331     kvm_pte_t           attr;
0332     struct kvm_pgtable_mm_ops   *mm_ops;
0333 };
0334 
0335 static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
0336 {
0337     bool device = prot & KVM_PGTABLE_PROT_DEVICE;
0338     u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
0339     kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
0340     u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
0341     u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
0342                            KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;
0343 
0344     if (!(prot & KVM_PGTABLE_PROT_R))
0345         return -EINVAL;
0346 
0347     if (prot & KVM_PGTABLE_PROT_X) {
0348         if (prot & KVM_PGTABLE_PROT_W)
0349             return -EINVAL;
0350 
0351         if (device)
0352             return -EINVAL;
0353     } else {
0354         attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
0355     }
0356 
0357     attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
0358     attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
0359     attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
0360     attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
0361     *ptep = attr;
0362 
0363     return 0;
0364 }
0365 
0366 enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
0367 {
0368     enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
0369     u32 ap;
0370 
0371     if (!kvm_pte_valid(pte))
0372         return prot;
0373 
0374     if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN))
0375         prot |= KVM_PGTABLE_PROT_X;
0376 
0377     ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte);
0378     if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO)
0379         prot |= KVM_PGTABLE_PROT_R;
0380     else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW)
0381         prot |= KVM_PGTABLE_PROT_RW;
0382 
0383     return prot;
0384 }
0385 
0386 static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
0387                     kvm_pte_t *ptep, struct hyp_map_data *data)
0388 {
0389     kvm_pte_t new, old = *ptep;
0390     u64 granule = kvm_granule_size(level), phys = data->phys;
0391 
0392     if (!kvm_block_mapping_supported(addr, end, phys, level))
0393         return false;
0394 
0395     data->phys += granule;
0396     new = kvm_init_valid_leaf_pte(phys, data->attr, level);
0397     if (old == new)
0398         return true;
0399     if (!kvm_pte_valid(old))
0400         data->mm_ops->get_page(ptep);
0401     else if (WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW))
0402         return false;
0403 
0404     smp_store_release(ptep, new);
0405     return true;
0406 }
0407 
0408 static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
0409               enum kvm_pgtable_walk_flags flag, void * const arg)
0410 {
0411     kvm_pte_t *childp;
0412     struct hyp_map_data *data = arg;
0413     struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
0414 
0415     if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg))
0416         return 0;
0417 
0418     if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
0419         return -EINVAL;
0420 
0421     childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
0422     if (!childp)
0423         return -ENOMEM;
0424 
0425     kvm_set_table_pte(ptep, childp, mm_ops);
0426     mm_ops->get_page(ptep);
0427     return 0;
0428 }
0429 
0430 int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
0431             enum kvm_pgtable_prot prot)
0432 {
0433     int ret;
0434     struct hyp_map_data map_data = {
0435         .phys   = ALIGN_DOWN(phys, PAGE_SIZE),
0436         .mm_ops = pgt->mm_ops,
0437     };
0438     struct kvm_pgtable_walker walker = {
0439         .cb = hyp_map_walker,
0440         .flags  = KVM_PGTABLE_WALK_LEAF,
0441         .arg    = &map_data,
0442     };
0443 
0444     ret = hyp_set_prot_attr(prot, &map_data.attr);
0445     if (ret)
0446         return ret;
0447 
0448     ret = kvm_pgtable_walk(pgt, addr, size, &walker);
0449     dsb(ishst);
0450     isb();
0451     return ret;
0452 }
0453 
0454 struct hyp_unmap_data {
0455     u64             unmapped;
0456     struct kvm_pgtable_mm_ops   *mm_ops;
0457 };
0458 
0459 static int hyp_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
0460                 enum kvm_pgtable_walk_flags flag, void * const arg)
0461 {
0462     kvm_pte_t pte = *ptep, *childp = NULL;
0463     u64 granule = kvm_granule_size(level);
0464     struct hyp_unmap_data *data = arg;
0465     struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
0466 
0467     if (!kvm_pte_valid(pte))
0468         return -EINVAL;
0469 
0470     if (kvm_pte_table(pte, level)) {
0471         childp = kvm_pte_follow(pte, mm_ops);
0472 
0473         if (mm_ops->page_count(childp) != 1)
0474             return 0;
0475 
0476         kvm_clear_pte(ptep);
0477         dsb(ishst);
0478         __tlbi_level(vae2is, __TLBI_VADDR(addr, 0), level);
0479     } else {
0480         if (end - addr < granule)
0481             return -EINVAL;
0482 
0483         kvm_clear_pte(ptep);
0484         dsb(ishst);
0485         __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level);
0486         data->unmapped += granule;
0487     }
0488 
0489     dsb(ish);
0490     isb();
0491     mm_ops->put_page(ptep);
0492 
0493     if (childp)
0494         mm_ops->put_page(childp);
0495 
0496     return 0;
0497 }
0498 
0499 u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
0500 {
0501     struct hyp_unmap_data unmap_data = {
0502         .mm_ops = pgt->mm_ops,
0503     };
0504     struct kvm_pgtable_walker walker = {
0505         .cb = hyp_unmap_walker,
0506         .arg    = &unmap_data,
0507         .flags  = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
0508     };
0509 
0510     if (!pgt->mm_ops->page_count)
0511         return 0;
0512 
0513     kvm_pgtable_walk(pgt, addr, size, &walker);
0514     return unmap_data.unmapped;
0515 }
0516 
0517 int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
0518              struct kvm_pgtable_mm_ops *mm_ops)
0519 {
0520     u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
0521 
0522     pgt->pgd = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
0523     if (!pgt->pgd)
0524         return -ENOMEM;
0525 
0526     pgt->ia_bits        = va_bits;
0527     pgt->start_level    = KVM_PGTABLE_MAX_LEVELS - levels;
0528     pgt->mm_ops     = mm_ops;
0529     pgt->mmu        = NULL;
0530     pgt->force_pte_cb   = NULL;
0531 
0532     return 0;
0533 }
0534 
0535 static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
0536                enum kvm_pgtable_walk_flags flag, void * const arg)
0537 {
0538     struct kvm_pgtable_mm_ops *mm_ops = arg;
0539     kvm_pte_t pte = *ptep;
0540 
0541     if (!kvm_pte_valid(pte))
0542         return 0;
0543 
0544     mm_ops->put_page(ptep);
0545 
0546     if (kvm_pte_table(pte, level))
0547         mm_ops->put_page(kvm_pte_follow(pte, mm_ops));
0548 
0549     return 0;
0550 }
0551 
0552 void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
0553 {
0554     struct kvm_pgtable_walker walker = {
0555         .cb = hyp_free_walker,
0556         .flags  = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
0557         .arg    = pgt->mm_ops,
0558     };
0559 
0560     WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
0561     pgt->mm_ops->put_page(pgt->pgd);
0562     pgt->pgd = NULL;
0563 }
0564 
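/*
 * Example (illustrative sketch only): the expected hyp stage-1 lifecycle,
 * assuming 'mm_ops' points at a populated struct kvm_pgtable_mm_ops and
 * that 'hyp_va_bits', 'va', 'pa' and 'size' are placeholders chosen by the
 * caller (all hypothetical names, not identifiers from this file).
 *
 *	struct kvm_pgtable hyp_pgt;
 *	int err;
 *
 *	err = kvm_pgtable_hyp_init(&hyp_pgt, hyp_va_bits, mm_ops);
 *	if (err)
 *		return err;
 *
 *	err = kvm_pgtable_hyp_map(&hyp_pgt, va, size, pa,
 *				  KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W);
 *
 *	... use the mapping ...
 *
 *	kvm_pgtable_hyp_unmap(&hyp_pgt, va, size);
 *	kvm_pgtable_hyp_destroy(&hyp_pgt);
 */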
0565 struct stage2_map_data {
0566     u64             phys;
0567     kvm_pte_t           attr;
0568     u8              owner_id;
0569 
0570     kvm_pte_t           *anchor;
0571     kvm_pte_t           *childp;
0572 
0573     struct kvm_s2_mmu       *mmu;
0574     void                *memcache;
0575 
0576     struct kvm_pgtable_mm_ops   *mm_ops;
0577 
0578     /* Force mappings to page granularity */
0579     bool                force_pte;
0580 };
0581 
0582 u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
0583 {
0584     u64 vtcr = VTCR_EL2_FLAGS;
0585     u8 lvls;
0586 
0587     vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
0588     vtcr |= VTCR_EL2_T0SZ(phys_shift);
0589     /*
0590      * Use at least a two-level page table to prevent splitting
0591      * host PMD huge pages at stage2.
0592      */
0593     lvls = stage2_pgtable_levels(phys_shift);
0594     if (lvls < 2)
0595         lvls = 2;
0596     vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
0597 
0598     /*
0599      * Enable the Hardware Access Flag management, unconditionally
0600      * on all CPUs. The feature is RES0 on CPUs that do not support it
0601      * and must be ignored by those CPUs.
0602      */
0603     vtcr |= VTCR_EL2_HA;
0604 
0605     /* Set the vmid bits */
0606     vtcr |= (get_vmid_bits(mmfr1) == 16) ?
0607         VTCR_EL2_VS_16BIT :
0608         VTCR_EL2_VS_8BIT;
0609 
0610     return vtcr;
0611 }
0612 
0613 static bool stage2_has_fwb(struct kvm_pgtable *pgt)
0614 {
0615     if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
0616         return false;
0617 
0618     return !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
0619 }
0620 
0621 #define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))
0622 
0623 static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
0624                 kvm_pte_t *ptep)
0625 {
0626     bool device = prot & KVM_PGTABLE_PROT_DEVICE;
0627     kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) :
0628                 KVM_S2_MEMATTR(pgt, NORMAL);
0629     u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
0630 
0631     if (!(prot & KVM_PGTABLE_PROT_X))
0632         attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
0633     else if (device)
0634         return -EINVAL;
0635 
0636     if (prot & KVM_PGTABLE_PROT_R)
0637         attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
0638 
0639     if (prot & KVM_PGTABLE_PROT_W)
0640         attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
0641 
0642     attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
0643     attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
0644     attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
0645     *ptep = attr;
0646 
0647     return 0;
0648 }
0649 
0650 enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
0651 {
0652     enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
0653 
0654     if (!kvm_pte_valid(pte))
0655         return prot;
0656 
0657     if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R)
0658         prot |= KVM_PGTABLE_PROT_R;
0659     if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
0660         prot |= KVM_PGTABLE_PROT_W;
0661     if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN))
0662         prot |= KVM_PGTABLE_PROT_X;
0663 
0664     return prot;
0665 }
0666 
0667 static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
0668 {
0669     if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
0670         return true;
0671 
0672     return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS));
0673 }
0674 
0675 static bool stage2_pte_is_counted(kvm_pte_t pte)
0676 {
0677     /*
0678      * The refcount tracks valid entries as well as invalid entries if they
0679      * assign ownership of a page to an entity other than the page-table
0680      * owner, whose id is 0.
0681      */
0682     return !!pte;
0683 }
0684 
0685 static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr,
0686                u32 level, struct kvm_pgtable_mm_ops *mm_ops)
0687 {
0688     /*
0689      * Clear the existing PTE, and perform break-before-make with
0690      * TLB maintenance if it was valid.
0691      */
0692     if (kvm_pte_valid(*ptep)) {
0693         kvm_clear_pte(ptep);
0694         kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
0695     }
0696 
0697     mm_ops->put_page(ptep);
0698 }
0699 
0700 static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
0701 {
0702     u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
0703     return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
0704 }
0705 
0706 static bool stage2_pte_executable(kvm_pte_t pte)
0707 {
0708     return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
0709 }
0710 
0711 static bool stage2_leaf_mapping_allowed(u64 addr, u64 end, u32 level,
0712                     struct stage2_map_data *data)
0713 {
0714     if (data->force_pte && (level < (KVM_PGTABLE_MAX_LEVELS - 1)))
0715         return false;
0716 
0717     return kvm_block_mapping_supported(addr, end, data->phys, level);
0718 }
0719 
0720 static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
0721                       kvm_pte_t *ptep,
0722                       struct stage2_map_data *data)
0723 {
0724     kvm_pte_t new, old = *ptep;
0725     u64 granule = kvm_granule_size(level), phys = data->phys;
0726     struct kvm_pgtable *pgt = data->mmu->pgt;
0727     struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
0728 
0729     if (!stage2_leaf_mapping_allowed(addr, end, level, data))
0730         return -E2BIG;
0731 
0732     if (kvm_phys_is_valid(phys))
0733         new = kvm_init_valid_leaf_pte(phys, data->attr, level);
0734     else
0735         new = kvm_init_invalid_leaf_owner(data->owner_id);
0736 
0737     if (stage2_pte_is_counted(old)) {
0738         /*
0739          * Skip updating the PTE if we are trying to recreate the exact
0740          * same mapping or only change the access permissions. Instead,
0741          * the vCPU will exit one more time from the guest if still needed
0742          * and then go through the path of relaxing permissions.
0743          */
0744         if (!stage2_pte_needs_update(old, new))
0745             return -EAGAIN;
0746 
0747         stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
0748     }
0749 
0750     /* Perform CMOs before installation of the guest stage-2 PTE */
0751     if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new))
0752         mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
0753                         granule);
0754 
0755     if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
0756         mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
0757 
0758     smp_store_release(ptep, new);
0759     if (stage2_pte_is_counted(new))
0760         mm_ops->get_page(ptep);
0761     if (kvm_phys_is_valid(phys))
0762         data->phys += granule;
0763     return 0;
0764 }
0765 
0766 static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
0767                      kvm_pte_t *ptep,
0768                      struct stage2_map_data *data)
0769 {
0770     if (data->anchor)
0771         return 0;
0772 
0773     if (!stage2_leaf_mapping_allowed(addr, end, level, data))
0774         return 0;
0775 
0776     data->childp = kvm_pte_follow(*ptep, data->mm_ops);
0777     kvm_clear_pte(ptep);
0778 
0779     /*
0780      * Invalidate the whole stage-2, as we may have numerous leaf
0781      * entries below us which would otherwise need invalidating
0782      * individually.
0783      */
0784     kvm_call_hyp(__kvm_tlb_flush_vmid, data->mmu);
0785     data->anchor = ptep;
0786     return 0;
0787 }
0788 
0789 static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
0790                 struct stage2_map_data *data)
0791 {
0792     struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
0793     kvm_pte_t *childp, pte = *ptep;
0794     int ret;
0795 
0796     if (data->anchor) {
0797         if (stage2_pte_is_counted(pte))
0798             mm_ops->put_page(ptep);
0799 
0800         return 0;
0801     }
0802 
0803     ret = stage2_map_walker_try_leaf(addr, end, level, ptep, data);
0804     if (ret != -E2BIG)
0805         return ret;
0806 
0807     if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
0808         return -EINVAL;
0809 
0810     if (!data->memcache)
0811         return -ENOMEM;
0812 
0813     childp = mm_ops->zalloc_page(data->memcache);
0814     if (!childp)
0815         return -ENOMEM;
0816 
0817     /*
0818      * If we've run into an existing block mapping then replace it with
0819      * a table. Accesses beyond 'end' that fall within the new table
0820      * will be mapped lazily.
0821      */
0822     if (stage2_pte_is_counted(pte))
0823         stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
0824 
0825     kvm_set_table_pte(ptep, childp, mm_ops);
0826     mm_ops->get_page(ptep);
0827 
0828     return 0;
0829 }
0830 
0831 static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
0832                       kvm_pte_t *ptep,
0833                       struct stage2_map_data *data)
0834 {
0835     struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
0836     kvm_pte_t *childp;
0837     int ret = 0;
0838 
0839     if (!data->anchor)
0840         return 0;
0841 
0842     if (data->anchor == ptep) {
0843         childp = data->childp;
0844         data->anchor = NULL;
0845         data->childp = NULL;
0846         ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
0847     } else {
0848         childp = kvm_pte_follow(*ptep, mm_ops);
0849     }
0850 
0851     mm_ops->put_page(childp);
0852     mm_ops->put_page(ptep);
0853 
0854     return ret;
0855 }
0856 
0857 /*
0858  * This is a little fiddly, as we use all three of the walk flags. The idea
0859  * is that the TABLE_PRE callback runs for table entries on the way down,
0860  * looking for table entries which we could conceivably replace with a
0861  * block entry for this mapping. If it finds one, then it sets the 'anchor'
0862  * field in 'struct stage2_map_data' to point at the table entry, before
0863  * clearing the entry to zero and descending into the now detached table.
0864  *
0865  * The behaviour of the LEAF callback then depends on whether or not the
0866  * anchor has been set. If not, then we're not using a block mapping higher
0867  * up the table and we perform the mapping at the existing leaves instead.
0868  * If, on the other hand, the anchor _is_ set, then we drop references to
0869  * all valid leaves so that the pages beneath the anchor can be freed.
0870  *
0871  * Finally, the TABLE_POST callback does nothing if the anchor has not
0872  * been set, but otherwise frees the page-table pages while walking back up
0873  * the page-table, installing the block entry when it revisits the anchor
0874  * pointer and clearing the anchor to NULL.
0875  */
0876 static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
0877                  enum kvm_pgtable_walk_flags flag, void * const arg)
0878 {
0879     struct stage2_map_data *data = arg;
0880 
0881     switch (flag) {
0882     case KVM_PGTABLE_WALK_TABLE_PRE:
0883         return stage2_map_walk_table_pre(addr, end, level, ptep, data);
0884     case KVM_PGTABLE_WALK_LEAF:
0885         return stage2_map_walk_leaf(addr, end, level, ptep, data);
0886     case KVM_PGTABLE_WALK_TABLE_POST:
0887         return stage2_map_walk_table_post(addr, end, level, ptep, data);
0888     }
0889 
0890     return -EINVAL;
0891 }
0892 
0893 int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
0894                u64 phys, enum kvm_pgtable_prot prot,
0895                void *mc)
0896 {
0897     int ret;
0898     struct stage2_map_data map_data = {
0899         .phys       = ALIGN_DOWN(phys, PAGE_SIZE),
0900         .mmu        = pgt->mmu,
0901         .memcache   = mc,
0902         .mm_ops     = pgt->mm_ops,
0903         .force_pte  = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot),
0904     };
0905     struct kvm_pgtable_walker walker = {
0906         .cb     = stage2_map_walker,
0907         .flags      = KVM_PGTABLE_WALK_TABLE_PRE |
0908                   KVM_PGTABLE_WALK_LEAF |
0909                   KVM_PGTABLE_WALK_TABLE_POST,
0910         .arg        = &map_data,
0911     };
0912 
0913     if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys)))
0914         return -EINVAL;
0915 
0916     ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
0917     if (ret)
0918         return ret;
0919 
0920     ret = kvm_pgtable_walk(pgt, addr, size, &walker);
0921     dsb(ishst);
0922     return ret;
0923 }
0924 
0925 int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
0926                  void *mc, u8 owner_id)
0927 {
0928     int ret;
0929     struct stage2_map_data map_data = {
0930         .phys       = KVM_PHYS_INVALID,
0931         .mmu        = pgt->mmu,
0932         .memcache   = mc,
0933         .mm_ops     = pgt->mm_ops,
0934         .owner_id   = owner_id,
0935         .force_pte  = true,
0936     };
0937     struct kvm_pgtable_walker walker = {
0938         .cb     = stage2_map_walker,
0939         .flags      = KVM_PGTABLE_WALK_TABLE_PRE |
0940                   KVM_PGTABLE_WALK_LEAF |
0941                   KVM_PGTABLE_WALK_TABLE_POST,
0942         .arg        = &map_data,
0943     };
0944 
0945     if (owner_id > KVM_MAX_OWNER_ID)
0946         return -EINVAL;
0947 
0948     ret = kvm_pgtable_walk(pgt, addr, size, &walker);
0949     return ret;
0950 }
0951 
0952 static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
0953                    enum kvm_pgtable_walk_flags flag,
0954                    void * const arg)
0955 {
0956     struct kvm_pgtable *pgt = arg;
0957     struct kvm_s2_mmu *mmu = pgt->mmu;
0958     struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
0959     kvm_pte_t pte = *ptep, *childp = NULL;
0960     bool need_flush = false;
0961 
0962     if (!kvm_pte_valid(pte)) {
0963         if (stage2_pte_is_counted(pte)) {
0964             kvm_clear_pte(ptep);
0965             mm_ops->put_page(ptep);
0966         }
0967         return 0;
0968     }
0969 
0970     if (kvm_pte_table(pte, level)) {
0971         childp = kvm_pte_follow(pte, mm_ops);
0972 
0973         if (mm_ops->page_count(childp) != 1)
0974             return 0;
0975     } else if (stage2_pte_cacheable(pgt, pte)) {
0976         need_flush = !stage2_has_fwb(pgt);
0977     }
0978 
0979     /*
0980      * This is similar to the map() path in that we unmap the entire
0981      * block entry and rely on the remaining portions being faulted
0982      * back lazily.
0983      */
0984     stage2_put_pte(ptep, mmu, addr, level, mm_ops);
0985 
0986     if (need_flush && mm_ops->dcache_clean_inval_poc)
0987         mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops),
0988                            kvm_granule_size(level));
0989 
0990     if (childp)
0991         mm_ops->put_page(childp);
0992 
0993     return 0;
0994 }
0995 
0996 int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
0997 {
0998     struct kvm_pgtable_walker walker = {
0999         .cb = stage2_unmap_walker,
1000         .arg    = pgt,
1001         .flags  = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
1002     };
1003 
1004     return kvm_pgtable_walk(pgt, addr, size, &walker);
1005 }
1006 
1007 struct stage2_attr_data {
1008     kvm_pte_t           attr_set;
1009     kvm_pte_t           attr_clr;
1010     kvm_pte_t           pte;
1011     u32             level;
1012     struct kvm_pgtable_mm_ops   *mm_ops;
1013 };
1014 
1015 static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
1016                   enum kvm_pgtable_walk_flags flag,
1017                   void * const arg)
1018 {
1019     kvm_pte_t pte = *ptep;
1020     struct stage2_attr_data *data = arg;
1021     struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
1022 
1023     if (!kvm_pte_valid(pte))
1024         return 0;
1025 
1026     data->level = level;
1027     data->pte = pte;
1028     pte &= ~data->attr_clr;
1029     pte |= data->attr_set;
1030 
1031     /*
1032      * We may race with the CPU trying to set the access flag here,
1033      * but worst-case the access flag update gets lost and will be
1034      * set on the next access instead.
1035      */
1036     if (data->pte != pte) {
1037         /*
1038          * Invalidate instruction cache before updating the guest
1039          * stage-2 PTE if we are going to add executable permission.
1040          */
1041         if (mm_ops->icache_inval_pou &&
1042             stage2_pte_executable(pte) && !stage2_pte_executable(*ptep))
1043             mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops),
1044                           kvm_granule_size(level));
1045         WRITE_ONCE(*ptep, pte);
1046     }
1047 
1048     return 0;
1049 }
1050 
1051 static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
1052                     u64 size, kvm_pte_t attr_set,
1053                     kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
1054                     u32 *level)
1055 {
1056     int ret;
1057     kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
1058     struct stage2_attr_data data = {
1059         .attr_set   = attr_set & attr_mask,
1060         .attr_clr   = attr_clr & attr_mask,
1061         .mm_ops     = pgt->mm_ops,
1062     };
1063     struct kvm_pgtable_walker walker = {
1064         .cb     = stage2_attr_walker,
1065         .arg        = &data,
1066         .flags      = KVM_PGTABLE_WALK_LEAF,
1067     };
1068 
1069     ret = kvm_pgtable_walk(pgt, addr, size, &walker);
1070     if (ret)
1071         return ret;
1072 
1073     if (orig_pte)
1074         *orig_pte = data.pte;
1075 
1076     if (level)
1077         *level = data.level;
1078     return 0;
1079 }
1080 
1081 int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
1082 {
1083     return stage2_update_leaf_attrs(pgt, addr, size, 0,
1084                     KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
1085                     NULL, NULL);
1086 }
1087 
1088 kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
1089 {
1090     kvm_pte_t pte = 0;
1091     stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
1092                  &pte, NULL);
1093     dsb(ishst);
1094     return pte;
1095 }
1096 
1097 kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
1098 {
1099     kvm_pte_t pte = 0;
1100     stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
1101                  &pte, NULL);
1102     /*
1103      * "But where's the TLBI?!", you scream.
1104      * "Over in the core code", I sigh.
1105      *
1106      * See the '->clear_flush_young()' callback on the KVM mmu notifier.
1107      */
1108     return pte;
1109 }
1110 
1111 bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
1112 {
1113     kvm_pte_t pte = 0;
1114     stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL);
1115     return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
1116 }
1117 
1118 int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
1119                    enum kvm_pgtable_prot prot)
1120 {
1121     int ret;
1122     u32 level;
1123     kvm_pte_t set = 0, clr = 0;
1124 
1125     if (prot & KVM_PTE_LEAF_ATTR_HI_SW)
1126         return -EINVAL;
1127 
1128     if (prot & KVM_PGTABLE_PROT_R)
1129         set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
1130 
1131     if (prot & KVM_PGTABLE_PROT_W)
1132         set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
1133 
1134     if (prot & KVM_PGTABLE_PROT_X)
1135         clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
1136 
1137     ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level);
1138     if (!ret)
1139         kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
1140     return ret;
1141 }
1142 
1143 static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
1144                    enum kvm_pgtable_walk_flags flag,
1145                    void * const arg)
1146 {
1147     struct kvm_pgtable *pgt = arg;
1148     struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
1149     kvm_pte_t pte = *ptep;
1150 
1151     if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte))
1152         return 0;
1153 
1154     if (mm_ops->dcache_clean_inval_poc)
1155         mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops),
1156                            kvm_granule_size(level));
1157     return 0;
1158 }
1159 
1160 int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
1161 {
1162     struct kvm_pgtable_walker walker = {
1163         .cb = stage2_flush_walker,
1164         .flags  = KVM_PGTABLE_WALK_LEAF,
1165         .arg    = pgt,
1166     };
1167 
1168     if (stage2_has_fwb(pgt))
1169         return 0;
1170 
1171     return kvm_pgtable_walk(pgt, addr, size, &walker);
1172 }
1173 
1174 
1175 int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
1176                   struct kvm_pgtable_mm_ops *mm_ops,
1177                   enum kvm_pgtable_stage2_flags flags,
1178                   kvm_pgtable_force_pte_cb_t force_pte_cb)
1179 {
1180     size_t pgd_sz;
1181     u64 vtcr = mmu->arch->vtcr;
1182     u32 ia_bits = VTCR_EL2_IPA(vtcr);
1183     u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
1184     u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
1185 
1186     pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
1187     pgt->pgd = mm_ops->zalloc_pages_exact(pgd_sz);
1188     if (!pgt->pgd)
1189         return -ENOMEM;
1190 
1191     pgt->ia_bits        = ia_bits;
1192     pgt->start_level    = start_level;
1193     pgt->mm_ops     = mm_ops;
1194     pgt->mmu        = mmu;
1195     pgt->flags      = flags;
1196     pgt->force_pte_cb   = force_pte_cb;
1197 
1198     /* Ensure zeroed PGD pages are visible to the hardware walker */
1199     dsb(ishst);
1200     return 0;
1201 }
1202 
1203 static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
1204                   enum kvm_pgtable_walk_flags flag,
1205                   void * const arg)
1206 {
1207     struct kvm_pgtable_mm_ops *mm_ops = arg;
1208     kvm_pte_t pte = *ptep;
1209 
1210     if (!stage2_pte_is_counted(pte))
1211         return 0;
1212 
1213     mm_ops->put_page(ptep);
1214 
1215     if (kvm_pte_table(pte, level))
1216         mm_ops->put_page(kvm_pte_follow(pte, mm_ops));
1217 
1218     return 0;
1219 }
1220 
1221 void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
1222 {
1223     size_t pgd_sz;
1224     struct kvm_pgtable_walker walker = {
1225         .cb = stage2_free_walker,
1226         .flags  = KVM_PGTABLE_WALK_LEAF |
1227               KVM_PGTABLE_WALK_TABLE_POST,
1228         .arg    = pgt->mm_ops,
1229     };
1230 
1231     WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
1232     pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
1233     pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz);
1234     pgt->pgd = NULL;
1235 }
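
As an illustrative sketch of how the stage-2 entry points above fit together, assuming the caller provides a populated struct kvm_pgtable_mm_ops and a struct kvm_s2_mmu whose arch->vtcr has been configured via kvm_get_vtcr() (the names s2_pgt, mmu, mm_ops, memcache, ipa, pa and size are hypothetical placeholders, not identifiers from this file):

	struct kvm_pgtable s2_pgt;
	int err;

	/* Allocate the (possibly concatenated) PGD pages and record the geometry. */
	err = __kvm_pgtable_stage2_init(&s2_pgt, mmu, mm_ops, 0, NULL);
	if (err)
		return err;

	/* Map [pa, pa + size) at IPA 'ipa' as read/write/execute normal memory. */
	err = kvm_pgtable_stage2_map(&s2_pgt, ipa, size, pa,
				     KVM_PGTABLE_PROT_R |
				     KVM_PGTABLE_PROT_W |
				     KVM_PGTABLE_PROT_X,
				     memcache);

	/* ... fault handling, mkyoung/mkold, wrprotect, flush as needed ... */

	kvm_pgtable_stage2_unmap(&s2_pgt, ipa, size);
	kvm_pgtable_stage2_destroy(&s2_pgt);

The unmap and destroy paths only need the range and the table geometry; TLB invalidation, refcounting and any required cache maintenance are performed by the walkers defined above.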