0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Copyright (C) 2012 - Virtual Open Systems and Columbia University
0004  * Author: Christoffer Dall <c.dall@virtualopensystems.com>
0005  */
0006 
0007 #include <linux/mman.h>
0008 #include <linux/kvm_host.h>
0009 #include <linux/io.h>
0010 #include <linux/hugetlb.h>
0011 #include <linux/sched/signal.h>
0012 #include <trace/events/kvm.h>
0013 #include <asm/pgalloc.h>
0014 #include <asm/cacheflush.h>
0015 #include <asm/kvm_arm.h>
0016 #include <asm/kvm_mmu.h>
0017 #include <asm/kvm_pgtable.h>
0018 #include <asm/kvm_ras.h>
0019 #include <asm/kvm_asm.h>
0020 #include <asm/kvm_emulate.h>
0021 #include <asm/virt.h>
0022 
0023 #include "trace.h"
0024 
0025 static struct kvm_pgtable *hyp_pgtable;
0026 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
0027 
0028 static unsigned long hyp_idmap_start;
0029 static unsigned long hyp_idmap_end;
0030 static phys_addr_t hyp_idmap_vector;
0031 
0032 static unsigned long io_map_base;
0033 
0034 
0035 /*
0036  * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
0037  * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
0038  * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
0039  * long will also starve other vCPUs. We also have to make sure that the page
0040  * tables are not freed while the lock is released.
0041  */
0042 static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
0043                   phys_addr_t end,
0044                   int (*fn)(struct kvm_pgtable *, u64, u64),
0045                   bool resched)
0046 {
0047     int ret;
0048     u64 next;
0049 
0050     do {
0051         struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
0052         if (!pgt)
0053             return -EINVAL;
0054 
0055         next = stage2_pgd_addr_end(kvm, addr, end);
0056         ret = fn(pgt, addr, next - addr);
0057         if (ret)
0058             break;
0059 
0060         if (resched && next != end)
0061             cond_resched_rwlock_write(&kvm->mmu_lock);
0062     } while (addr = next, addr != end);
0063 
0064     return ret;
0065 }
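
/*
 * Note on the walk above: each iteration covers at most one top-level
 * stage-2 table entry (as returned by stage2_pgd_addr_end()), and
 * mmu->pgt is re-read on every iteration because the lock may have been
 * dropped in cond_resched_rwlock_write(), during which a concurrent
 * teardown could have freed the page tables.
 */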
0066 
0067 #define stage2_apply_range_resched(kvm, addr, end, fn)          \
0068     stage2_apply_range(kvm, addr, end, fn, true)
0069 
0070 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
0071 {
0072     return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
0073 }
0074 
0075 /**
0076  * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
0077  * @kvm:    pointer to kvm structure.
0078  *
0079  * Interface to HYP function to flush all VM TLB entries
0080  */
0081 void kvm_flush_remote_tlbs(struct kvm *kvm)
0082 {
0083     ++kvm->stat.generic.remote_tlb_flush_requests;
0084     kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
0085 }
0086 
0087 static bool kvm_is_device_pfn(unsigned long pfn)
0088 {
0089     return !pfn_is_map_memory(pfn);
0090 }
0091 
0092 static void *stage2_memcache_zalloc_page(void *arg)
0093 {
0094     struct kvm_mmu_memory_cache *mc = arg;
0095 
0096     /* Allocated with __GFP_ZERO, so no need to zero */
0097     return kvm_mmu_memory_cache_alloc(mc);
0098 }
0099 
0100 static void *kvm_host_zalloc_pages_exact(size_t size)
0101 {
0102     return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
0103 }
0104 
0105 static void kvm_host_get_page(void *addr)
0106 {
0107     get_page(virt_to_page(addr));
0108 }
0109 
0110 static void kvm_host_put_page(void *addr)
0111 {
0112     put_page(virt_to_page(addr));
0113 }
0114 
0115 static int kvm_host_page_count(void *addr)
0116 {
0117     return page_count(virt_to_page(addr));
0118 }
0119 
0120 static phys_addr_t kvm_host_pa(void *addr)
0121 {
0122     return __pa(addr);
0123 }
0124 
0125 static void *kvm_host_va(phys_addr_t phys)
0126 {
0127     return __va(phys);
0128 }
0129 
0130 static void clean_dcache_guest_page(void *va, size_t size)
0131 {
0132     __clean_dcache_guest_page(va, size);
0133 }
0134 
0135 static void invalidate_icache_guest_page(void *va, size_t size)
0136 {
0137     __invalidate_icache_guest_page(va, size);
0138 }
0139 
0140 /*
0141  * Unmapping vs dcache management:
0142  *
0143  * If a guest maps certain memory pages as uncached, all writes will
0144  * bypass the data cache and go directly to RAM.  However, the CPUs
0145  * can still speculate reads (not writes) and fill cache lines with
0146  * data.
0147  *
0148  * Those cache lines will be *clean* cache lines though, so a
0149  * clean+invalidate operation is equivalent to an invalidate
0150  * operation, because no cache lines are marked dirty.
0151  *
0152  * Those clean cache lines could be filled prior to an uncached write
0153  * by the guest, and the cache coherent IO subsystem would therefore
0154  * end up writing old data to disk.
0155  *
0156  * This is why right after unmapping a page/section and invalidating
0157  * the corresponding TLBs, we flush to make sure the IO subsystem will
0158  * never hit in the cache.
0159  *
0160  * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
0161  * we then fully enforce cacheability of RAM, no matter what the guest
0162  * does.
0163  */
0164 /**
0165  * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
0166  * @mmu:   The KVM stage-2 MMU pointer
0167  * @start: The intermediate physical base address of the range to unmap
0168  * @size:  The size of the area to unmap
0169  * @may_block: Whether or not we are permitted to block
0170  *
0171  * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
0172  * be called while holding mmu_lock (unless for freeing the stage2 pgd before
0173  * destroying the VM), otherwise another faulting VCPU may come in and mess
0174  * with things behind our backs.
0175  */
0176 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
0177                  bool may_block)
0178 {
0179     struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
0180     phys_addr_t end = start + size;
0181 
0182     lockdep_assert_held_write(&kvm->mmu_lock);
0183     WARN_ON(size & ~PAGE_MASK);
0184     WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
0185                    may_block));
0186 }
0187 
0188 static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
0189 {
0190     __unmap_stage2_range(mmu, start, size, true);
0191 }
0192 
0193 static void stage2_flush_memslot(struct kvm *kvm,
0194                  struct kvm_memory_slot *memslot)
0195 {
0196     phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
0197     phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
0198 
0199     stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
0200 }
0201 
0202 /**
0203  * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
0204  * @kvm: The struct kvm pointer
0205  *
0206  * Go through the stage 2 page tables and invalidate any cache lines
0207  * backing memory already mapped to the VM.
0208  */
0209 static void stage2_flush_vm(struct kvm *kvm)
0210 {
0211     struct kvm_memslots *slots;
0212     struct kvm_memory_slot *memslot;
0213     int idx, bkt;
0214 
0215     idx = srcu_read_lock(&kvm->srcu);
0216     write_lock(&kvm->mmu_lock);
0217 
0218     slots = kvm_memslots(kvm);
0219     kvm_for_each_memslot(memslot, bkt, slots)
0220         stage2_flush_memslot(kvm, memslot);
0221 
0222     write_unlock(&kvm->mmu_lock);
0223     srcu_read_unlock(&kvm->srcu, idx);
0224 }
0225 
0226 /**
0227  * free_hyp_pgds - free Hyp-mode page tables
0228  */
0229 void free_hyp_pgds(void)
0230 {
0231     mutex_lock(&kvm_hyp_pgd_mutex);
0232     if (hyp_pgtable) {
0233         kvm_pgtable_hyp_destroy(hyp_pgtable);
0234         kfree(hyp_pgtable);
0235         hyp_pgtable = NULL;
0236     }
0237     mutex_unlock(&kvm_hyp_pgd_mutex);
0238 }
0239 
0240 static bool kvm_host_owns_hyp_mappings(void)
0241 {
0242     if (is_kernel_in_hyp_mode())
0243         return false;
0244 
0245     if (static_branch_likely(&kvm_protected_mode_initialized))
0246         return false;
0247 
0248     /*
0249      * This can happen at boot time when __create_hyp_mappings() is called
0250      * after the hyp protection has been enabled, but the static key has
0251      * not been flipped yet.
0252      */
0253     if (!hyp_pgtable && is_protected_kvm_enabled())
0254         return false;
0255 
0256     WARN_ON(!hyp_pgtable);
0257 
0258     return true;
0259 }
0260 
0261 int __create_hyp_mappings(unsigned long start, unsigned long size,
0262               unsigned long phys, enum kvm_pgtable_prot prot)
0263 {
0264     int err;
0265 
0266     if (WARN_ON(!kvm_host_owns_hyp_mappings()))
0267         return -EINVAL;
0268 
0269     mutex_lock(&kvm_hyp_pgd_mutex);
0270     err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
0271     mutex_unlock(&kvm_hyp_pgd_mutex);
0272 
0273     return err;
0274 }
0275 
0276 static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
0277 {
0278     if (!is_vmalloc_addr(kaddr)) {
0279         BUG_ON(!virt_addr_valid(kaddr));
0280         return __pa(kaddr);
0281     } else {
0282         return page_to_phys(vmalloc_to_page(kaddr)) +
0283                offset_in_page(kaddr);
0284     }
0285 }
0286 
0287 struct hyp_shared_pfn {
0288     u64 pfn;
0289     int count;
0290     struct rb_node node;
0291 };
0292 
0293 static DEFINE_MUTEX(hyp_shared_pfns_lock);
0294 static struct rb_root hyp_shared_pfns = RB_ROOT;
0295 
0296 static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node,
0297                           struct rb_node **parent)
0298 {
0299     struct hyp_shared_pfn *this;
0300 
0301     *node = &hyp_shared_pfns.rb_node;
0302     *parent = NULL;
0303     while (**node) {
0304         this = container_of(**node, struct hyp_shared_pfn, node);
0305         *parent = **node;
0306         if (this->pfn < pfn)
0307             *node = &((**node)->rb_left);
0308         else if (this->pfn > pfn)
0309             *node = &((**node)->rb_right);
0310         else
0311             return this;
0312     }
0313 
0314     return NULL;
0315 }
0316 
0317 static int share_pfn_hyp(u64 pfn)
0318 {
0319     struct rb_node **node, *parent;
0320     struct hyp_shared_pfn *this;
0321     int ret = 0;
0322 
0323     mutex_lock(&hyp_shared_pfns_lock);
0324     this = find_shared_pfn(pfn, &node, &parent);
0325     if (this) {
0326         this->count++;
0327         goto unlock;
0328     }
0329 
0330     this = kzalloc(sizeof(*this), GFP_KERNEL);
0331     if (!this) {
0332         ret = -ENOMEM;
0333         goto unlock;
0334     }
0335 
0336     this->pfn = pfn;
0337     this->count = 1;
0338     rb_link_node(&this->node, parent, node);
0339     rb_insert_color(&this->node, &hyp_shared_pfns);
0340     ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1);
0341 unlock:
0342     mutex_unlock(&hyp_shared_pfns_lock);
0343 
0344     return ret;
0345 }
0346 
0347 static int unshare_pfn_hyp(u64 pfn)
0348 {
0349     struct rb_node **node, *parent;
0350     struct hyp_shared_pfn *this;
0351     int ret = 0;
0352 
0353     mutex_lock(&hyp_shared_pfns_lock);
0354     this = find_shared_pfn(pfn, &node, &parent);
0355     if (WARN_ON(!this)) {
0356         ret = -ENOENT;
0357         goto unlock;
0358     }
0359 
0360     this->count--;
0361     if (this->count)
0362         goto unlock;
0363 
0364     rb_erase(&this->node, &hyp_shared_pfns);
0365     kfree(this);
0366     ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1);
0367 unlock:
0368     mutex_unlock(&hyp_shared_pfns_lock);
0369 
0370     return ret;
0371 }
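
/*
 * The rb-tree above keeps one refcounted node per shared pfn: only the
 * first share_pfn_hyp() call for a given pfn issues the
 * __pkvm_host_share_hyp hypercall, and only the unshare_pfn_hyp() call
 * that drops the count to zero issues __pkvm_host_unshare_hyp.
 * Intermediate calls simply adjust the count under hyp_shared_pfns_lock.
 */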
0372 
0373 int kvm_share_hyp(void *from, void *to)
0374 {
0375     phys_addr_t start, end, cur;
0376     u64 pfn;
0377     int ret;
0378 
0379     if (is_kernel_in_hyp_mode())
0380         return 0;
0381 
0382     /*
0383      * The share hcall maps things in the 'fixed-offset' region of the hyp
0384      * VA space, so we can only share physically contiguous data-structures
0385      * for now.
0386      */
0387     if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to))
0388         return -EINVAL;
0389 
0390     if (kvm_host_owns_hyp_mappings())
0391         return create_hyp_mappings(from, to, PAGE_HYP);
0392 
0393     start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
0394     end = PAGE_ALIGN(__pa(to));
0395     for (cur = start; cur < end; cur += PAGE_SIZE) {
0396         pfn = __phys_to_pfn(cur);
0397         ret = share_pfn_hyp(pfn);
0398         if (ret)
0399             return ret;
0400     }
0401 
0402     return 0;
0403 }
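
/*
 * Illustrative example of the rounding above (assuming 4KiB pages):
 * sharing a structure spanning physical 0x10000100..0x10001100 rounds
 * start down to 0x10000000 and end up to 0x10002000, so both pfn 0x10000
 * and pfn 0x10001 are shared with the hypervisor, page by page.
 */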
0404 
0405 void kvm_unshare_hyp(void *from, void *to)
0406 {
0407     phys_addr_t start, end, cur;
0408     u64 pfn;
0409 
0410     if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from)
0411         return;
0412 
0413     start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
0414     end = PAGE_ALIGN(__pa(to));
0415     for (cur = start; cur < end; cur += PAGE_SIZE) {
0416         pfn = __phys_to_pfn(cur);
0417         WARN_ON(unshare_pfn_hyp(pfn));
0418     }
0419 }
0420 
0421 /**
0422  * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
0423  * @from:   The virtual kernel start address of the range
0424  * @to:     The virtual kernel end address of the range (exclusive)
0425  * @prot:   The protection to be applied to this range
0426  *
0427  * The same virtual address as the kernel virtual address is also used
0428  * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
0429  * physical pages.
0430  */
0431 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
0432 {
0433     phys_addr_t phys_addr;
0434     unsigned long virt_addr;
0435     unsigned long start = kern_hyp_va((unsigned long)from);
0436     unsigned long end = kern_hyp_va((unsigned long)to);
0437 
0438     if (is_kernel_in_hyp_mode())
0439         return 0;
0440 
0441     if (!kvm_host_owns_hyp_mappings())
0442         return -EPERM;
0443 
0444     start = start & PAGE_MASK;
0445     end = PAGE_ALIGN(end);
0446 
0447     for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
0448         int err;
0449 
0450         phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
0451         err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
0452                         prot);
0453         if (err)
0454             return err;
0455     }
0456 
0457     return 0;
0458 }
0459 
0460 
0461 /**
0462  * hyp_alloc_private_va_range - Allocates a private VA range.
0463  * @size:   The size of the VA range to reserve.
0464  * @haddr:  The hypervisor virtual start address of the allocation.
0465  *
0466  * The private virtual address (VA) range is allocated below io_map_base
0467  * and aligned based on the order of @size.
0468  *
0469  * Return: 0 on success or negative error code on failure.
0470  */
0471 int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
0472 {
0473     unsigned long base;
0474     int ret = 0;
0475 
0476     mutex_lock(&kvm_hyp_pgd_mutex);
0477 
0478     /*
0479      * This assumes that we have enough space below the idmap
0480      * page to allocate our VAs. If not, the check below will
0481      * kick in. A potential alternative would be to detect that
0482      * overflow and switch to an allocation above the idmap.
0483      *
0484      * The allocated size is always a multiple of PAGE_SIZE.
0485      */
0486     base = io_map_base - PAGE_ALIGN(size);
0487 
0488     /* Align the allocation based on the order of its size */
0489     base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size));
0490 
0491     /*
0492      * Verify that BIT(VA_BITS - 1) hasn't been flipped by
0493      * allocating the new area, as it would indicate we've
0494      * overflowed the idmap/IO address range.
0495      */
0496     if ((base ^ io_map_base) & BIT(VA_BITS - 1))
0497         ret = -ENOMEM;
0498     else
0499         *haddr = io_map_base = base;
0500 
0501     mutex_unlock(&kvm_hyp_pgd_mutex);
0502 
0503     return ret;
0504 }
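
/*
 * Worked example for the allocator above (illustrative, 4KiB pages): a
 * request for 24KiB is first rounded to PAGE_ALIGN(24KiB) = 24KiB and
 * subtracted from io_map_base; the result is then aligned down to
 * PAGE_SIZE << get_order(24KiB) = 32KiB, so the returned range is
 * 32KiB-aligned even though only 24KiB was requested. The
 * BIT(VA_BITS - 1) check rejects the allocation if this downward growth
 * ever overflows the idmap/IO half of the VA space.
 */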
0505 
0506 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
0507                     unsigned long *haddr,
0508                     enum kvm_pgtable_prot prot)
0509 {
0510     unsigned long addr;
0511     int ret = 0;
0512 
0513     if (!kvm_host_owns_hyp_mappings()) {
0514         addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
0515                      phys_addr, size, prot);
0516         if (IS_ERR_VALUE(addr))
0517             return addr;
0518         *haddr = addr;
0519 
0520         return 0;
0521     }
0522 
0523     size = PAGE_ALIGN(size + offset_in_page(phys_addr));
0524     ret = hyp_alloc_private_va_range(size, &addr);
0525     if (ret)
0526         return ret;
0527 
0528     ret = __create_hyp_mappings(addr, size, phys_addr, prot);
0529     if (ret)
0530         return ret;
0531 
0532     *haddr = addr + offset_in_page(phys_addr);
0533     return ret;
0534 }
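
/*
 * Offset handling above, by example (illustrative, 4KiB pages): for
 * phys_addr = 0x09000880 and size = 0x100, the mapped size is rounded to
 * PAGE_ALIGN(0x100 + 0x880) = 0x1000, and the returned hyp VA is the
 * allocated page-aligned VA plus the original 0x880 page offset.
 */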
0535 
0536 /**
0537  * create_hyp_io_mappings - Map IO into both kernel and HYP
0538  * @phys_addr:  The physical start address which gets mapped
0539  * @size:   Size of the region being mapped
0540  * @kaddr:  Kernel VA for this mapping
0541  * @haddr:  HYP VA for this mapping
0542  */
0543 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
0544                void __iomem **kaddr,
0545                void __iomem **haddr)
0546 {
0547     unsigned long addr;
0548     int ret;
0549 
0550     if (is_protected_kvm_enabled())
0551         return -EPERM;
0552 
0553     *kaddr = ioremap(phys_addr, size);
0554     if (!*kaddr)
0555         return -ENOMEM;
0556 
0557     if (is_kernel_in_hyp_mode()) {
0558         *haddr = *kaddr;
0559         return 0;
0560     }
0561 
0562     ret = __create_hyp_private_mapping(phys_addr, size,
0563                        &addr, PAGE_HYP_DEVICE);
0564     if (ret) {
0565         iounmap(*kaddr);
0566         *kaddr = NULL;
0567         *haddr = NULL;
0568         return ret;
0569     }
0570 
0571     *haddr = (void __iomem *)addr;
0572     return 0;
0573 }
0574 
0575 /**
0576  * create_hyp_exec_mappings - Map an executable range into HYP
0577  * @phys_addr:  The physical start address which gets mapped
0578  * @size:   Size of the region being mapped
0579  * @haddr:  HYP VA for this mapping
0580  */
0581 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
0582                  void **haddr)
0583 {
0584     unsigned long addr;
0585     int ret;
0586 
0587     BUG_ON(is_kernel_in_hyp_mode());
0588 
0589     ret = __create_hyp_private_mapping(phys_addr, size,
0590                        &addr, PAGE_HYP_EXEC);
0591     if (ret) {
0592         *haddr = NULL;
0593         return ret;
0594     }
0595 
0596     *haddr = (void *)addr;
0597     return 0;
0598 }
0599 
0600 static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
0601     /* We shouldn't need any other callback to walk the PT */
0602     .phys_to_virt       = kvm_host_va,
0603 };
0604 
0605 static int get_user_mapping_size(struct kvm *kvm, u64 addr)
0606 {
0607     struct kvm_pgtable pgt = {
0608         .pgd        = (kvm_pte_t *)kvm->mm->pgd,
0609         .ia_bits    = VA_BITS,
0610         .start_level    = (KVM_PGTABLE_MAX_LEVELS -
0611                    CONFIG_PGTABLE_LEVELS),
0612         .mm_ops     = &kvm_user_mm_ops,
0613     };
0614     kvm_pte_t pte = 0;  /* Keep GCC quiet... */
0615     u32 level = ~0;
0616     int ret;
0617 
0618     ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
0619     VM_BUG_ON(ret);
0620     VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS);
0621     VM_BUG_ON(!(pte & PTE_VALID));
0622 
0623     return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
0624 }
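
/*
 * With the 4KiB granule, ARM64_HW_PGTABLE_LEVEL_SHIFT() above yields
 * 4KiB for a level-3 leaf, 2MiB for a level-2 block and 1GiB for a
 * level-1 block, so the function reports the size of the user mapping
 * backing @addr.
 */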
0625 
0626 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
0627     .zalloc_page        = stage2_memcache_zalloc_page,
0628     .zalloc_pages_exact = kvm_host_zalloc_pages_exact,
0629     .free_pages_exact   = free_pages_exact,
0630     .get_page       = kvm_host_get_page,
0631     .put_page       = kvm_host_put_page,
0632     .page_count     = kvm_host_page_count,
0633     .phys_to_virt       = kvm_host_va,
0634     .virt_to_phys       = kvm_host_pa,
0635     .dcache_clean_inval_poc = clean_dcache_guest_page,
0636     .icache_inval_pou   = invalidate_icache_guest_page,
0637 };
0638 
0639 /**
0640  * kvm_init_stage2_mmu - Initialise a S2 MMU structure
0641  * @kvm:    The pointer to the KVM structure
0642  * @mmu:    The pointer to the s2 MMU structure
0643  *
0644  * Allocates only the stage-2 HW PGD level table(s).
0645  * Note we don't need locking here as this is only called when the VM is
0646  * created, which can only be done once.
0647  */
0648 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
0649 {
0650     int cpu, err;
0651     struct kvm_pgtable *pgt;
0652 
0653     if (mmu->pgt != NULL) {
0654         kvm_err("kvm_arch already initialized?\n");
0655         return -EINVAL;
0656     }
0657 
0658     pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
0659     if (!pgt)
0660         return -ENOMEM;
0661 
0662     mmu->arch = &kvm->arch;
0663     err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops);
0664     if (err)
0665         goto out_free_pgtable;
0666 
0667     mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
0668     if (!mmu->last_vcpu_ran) {
0669         err = -ENOMEM;
0670         goto out_destroy_pgtable;
0671     }
0672 
0673     for_each_possible_cpu(cpu)
0674         *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
0675 
0676     mmu->pgt = pgt;
0677     mmu->pgd_phys = __pa(pgt->pgd);
0678     return 0;
0679 
0680 out_destroy_pgtable:
0681     kvm_pgtable_stage2_destroy(pgt);
0682 out_free_pgtable:
0683     kfree(pgt);
0684     return err;
0685 }
0686 
0687 static void stage2_unmap_memslot(struct kvm *kvm,
0688                  struct kvm_memory_slot *memslot)
0689 {
0690     hva_t hva = memslot->userspace_addr;
0691     phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
0692     phys_addr_t size = PAGE_SIZE * memslot->npages;
0693     hva_t reg_end = hva + size;
0694 
0695     /*
0696      * A memory region could potentially cover multiple VMAs, and any holes
0697      * between them, so iterate over all of them to find out if we should
0698      * unmap any of them.
0699      *
0700      *     +--------------------------------------------+
0701      * +---------------+----------------+   +----------------+
0702      * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
0703      * +---------------+----------------+   +----------------+
0704      *     |               memory region                |
0705      *     +--------------------------------------------+
0706      */
0707     do {
0708         struct vm_area_struct *vma;
0709         hva_t vm_start, vm_end;
0710 
0711         vma = find_vma_intersection(current->mm, hva, reg_end);
0712         if (!vma)
0713             break;
0714 
0715         /*
0716          * Take the intersection of this VMA with the memory region
0717          */
0718         vm_start = max(hva, vma->vm_start);
0719         vm_end = min(reg_end, vma->vm_end);
0720 
0721         if (!(vma->vm_flags & VM_PFNMAP)) {
0722             gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
0723             unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
0724         }
0725         hva = vm_end;
0726     } while (hva < reg_end);
0727 }
0728 
0729 /**
0730  * stage2_unmap_vm - Unmap Stage-2 RAM mappings
0731  * @kvm: The struct kvm pointer
0732  *
0733  * Go through the memregions and unmap any regular RAM
0734  * backing memory already mapped to the VM.
0735  */
0736 void stage2_unmap_vm(struct kvm *kvm)
0737 {
0738     struct kvm_memslots *slots;
0739     struct kvm_memory_slot *memslot;
0740     int idx, bkt;
0741 
0742     idx = srcu_read_lock(&kvm->srcu);
0743     mmap_read_lock(current->mm);
0744     write_lock(&kvm->mmu_lock);
0745 
0746     slots = kvm_memslots(kvm);
0747     kvm_for_each_memslot(memslot, bkt, slots)
0748         stage2_unmap_memslot(kvm, memslot);
0749 
0750     write_unlock(&kvm->mmu_lock);
0751     mmap_read_unlock(current->mm);
0752     srcu_read_unlock(&kvm->srcu, idx);
0753 }
0754 
0755 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
0756 {
0757     struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
0758     struct kvm_pgtable *pgt = NULL;
0759 
0760     write_lock(&kvm->mmu_lock);
0761     pgt = mmu->pgt;
0762     if (pgt) {
0763         mmu->pgd_phys = 0;
0764         mmu->pgt = NULL;
0765         free_percpu(mmu->last_vcpu_ran);
0766     }
0767     write_unlock(&kvm->mmu_lock);
0768 
0769     if (pgt) {
0770         kvm_pgtable_stage2_destroy(pgt);
0771         kfree(pgt);
0772     }
0773 }
0774 
0775 /**
0776  * kvm_phys_addr_ioremap - map a device range to guest IPA
0777  *
0778  * @kvm:    The KVM pointer
0779  * @guest_ipa:  The IPA at which to insert the mapping
0780  * @pa:     The physical address of the device
0781  * @size:   The size of the mapping
0782  * @writable:   Whether or not to create a writable mapping
0783  */
0784 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
0785               phys_addr_t pa, unsigned long size, bool writable)
0786 {
0787     phys_addr_t addr;
0788     int ret = 0;
0789     struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
0790     struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
0791     enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
0792                      KVM_PGTABLE_PROT_R |
0793                      (writable ? KVM_PGTABLE_PROT_W : 0);
0794 
0795     if (is_protected_kvm_enabled())
0796         return -EPERM;
0797 
0798     size += offset_in_page(guest_ipa);
0799     guest_ipa &= PAGE_MASK;
0800 
0801     for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
0802         ret = kvm_mmu_topup_memory_cache(&cache,
0803                          kvm_mmu_cache_min_pages(kvm));
0804         if (ret)
0805             break;
0806 
0807         write_lock(&kvm->mmu_lock);
0808         ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
0809                          &cache);
0810         write_unlock(&kvm->mmu_lock);
0811         if (ret)
0812             break;
0813 
0814         pa += PAGE_SIZE;
0815     }
0816 
0817     kvm_mmu_free_memory_cache(&cache);
0818     return ret;
0819 }
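
/*
 * Example of the IPA rounding above (illustrative, 4KiB pages): for
 * guest_ipa = 0x09000010 and size = 0x20, the size grows to 0x30 and the
 * IPA is truncated to 0x09000000, so a single page-sized device mapping
 * covers the requested window.
 */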
0820 
0821 /**
0822  * stage2_wp_range() - write protect stage2 memory region range
0823  * @mmu:        The KVM stage-2 MMU pointer
0824  * @addr:   Start address of range
0825  * @end:    End address of range
0826  */
0827 static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
0828 {
0829     struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
0830     stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
0831 }
0832 
0833 /**
0834  * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
0835  * @kvm:    The KVM pointer
0836  * @slot:   The memory slot to write protect
0837  *
0838  * Called to start logging dirty pages after memory region
0839  * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
0840  * all present PUD, PMD and PTEs are write protected in the memory region.
0841  * Afterwards, the dirty page log can be read.
0842  *
0843  * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
0844  * serializing operations for VM memory regions.
0845  */
0846 static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
0847 {
0848     struct kvm_memslots *slots = kvm_memslots(kvm);
0849     struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
0850     phys_addr_t start, end;
0851 
0852     if (WARN_ON_ONCE(!memslot))
0853         return;
0854 
0855     start = memslot->base_gfn << PAGE_SHIFT;
0856     end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
0857 
0858     write_lock(&kvm->mmu_lock);
0859     stage2_wp_range(&kvm->arch.mmu, start, end);
0860     write_unlock(&kvm->mmu_lock);
0861     kvm_flush_remote_tlbs(kvm);
0862 }
0863 
0864 /**
0865  * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
0866  * @kvm:    The KVM pointer
0867  * @slot:   The memory slot associated with mask
0868  * @gfn_offset: The gfn offset in memory slot
0869  * @mask:   The mask of dirty pages at offset 'gfn_offset' in this memory
0870  *      slot to be write protected
0871  *
0872  * Walks the bits set in @mask and write protects the associated PTEs. The
0873  * caller must hold kvm_mmu_lock.
0874  */
0875 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
0876         struct kvm_memory_slot *slot,
0877         gfn_t gfn_offset, unsigned long mask)
0878 {
0879     phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
0880     phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
0881     phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
0882 
0883     stage2_wp_range(&kvm->arch.mmu, start, end);
0884 }
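
/*
 * Example of the mask arithmetic above: with gfn_offset = 0 and
 * mask = 0x3c (bits 2..5 set), __ffs(mask) = 2 and __fls(mask) = 5, so
 * the range [base_gfn + 2, base_gfn + 6) is write protected, i.e. the
 * four pages corresponding to the set bits.
 */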
0885 
0886 /*
0887  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
0888  * dirty pages.
0889  *
0890  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
0891  * enable dirty logging for them.
0892  */
0893 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
0894         struct kvm_memory_slot *slot,
0895         gfn_t gfn_offset, unsigned long mask)
0896 {
0897     kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
0898 }
0899 
0900 static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
0901 {
0902     send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
0903 }
0904 
0905 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
0906                            unsigned long hva,
0907                            unsigned long map_size)
0908 {
0909     gpa_t gpa_start;
0910     hva_t uaddr_start, uaddr_end;
0911     size_t size;
0912 
0913     /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
0914     if (map_size == PAGE_SIZE)
0915         return true;
0916 
0917     size = memslot->npages * PAGE_SIZE;
0918 
0919     gpa_start = memslot->base_gfn << PAGE_SHIFT;
0920 
0921     uaddr_start = memslot->userspace_addr;
0922     uaddr_end = uaddr_start + size;
0923 
0924     /*
0925      * Pages belonging to memslots that don't have the same alignment
0926      * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
0927      * PMD/PUD entries, because we'll end up mapping the wrong pages.
0928      *
0929      * Consider a layout like the following:
0930      *
0931      *    memslot->userspace_addr:
0932      *    +-----+--------------------+--------------------+---+
0933      *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
0934      *    +-----+--------------------+--------------------+---+
0935      *
0936      *    memslot->base_gfn << PAGE_SHIFT:
0937      *      +---+--------------------+--------------------+-----+
0938      *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
0939      *      +---+--------------------+--------------------+-----+
0940      *
0941      * If we create those stage-2 blocks, we'll end up with this incorrect
0942      * mapping:
0943      *   d -> f
0944      *   e -> g
0945      *   f -> h
0946      */
0947     if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
0948         return false;
0949 
0950     /*
0951      * Next, let's make sure we're not trying to map anything not covered
0952      * by the memslot. This means we have to prohibit block size mappings
0953      * for the beginning and end of a non-block aligned and non-block sized
0954      * memory slot (illustrated by the head and tail parts of the
0955      * userspace view above containing pages 'abcde' and 'xyz',
0956      * respectively).
0957      *
0958      * Note that it doesn't matter if we do the check using the
0959      * userspace_addr or the base_gfn, as both are equally aligned (per
0960      * the check above) and equally sized.
0961      */
0962     return (hva & ~(map_size - 1)) >= uaddr_start &&
0963            (hva & ~(map_size - 1)) + map_size <= uaddr_end;
0964 }
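
/*
 * Example for the alignment check above (illustrative, 4KiB pages, so
 * PMD_SIZE = 2MiB): if memslot->userspace_addr = 0x40001000 while
 * memslot->base_gfn << PAGE_SHIFT = 0x80000000, the two addresses differ
 * in their low 21 bits (0x1000 vs 0x0), so a 2MiB stage-2 block would
 * map the wrong pages and the function returns false.
 */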
0965 
0966 /*
0967  * Check if the given hva is backed by a transparent huge page (THP) and
0968  * whether it can be mapped using block mapping in stage2. If so, adjust
0969  * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
0970  * supported. This will need to be updated to support other THP sizes.
0971  *
0972  * Returns the size of the mapping.
0973  */
0974 static unsigned long
0975 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
0976                 unsigned long hva, kvm_pfn_t *pfnp,
0977                 phys_addr_t *ipap)
0978 {
0979     kvm_pfn_t pfn = *pfnp;
0980 
0981     /*
0982      * Make sure the adjustment is done only for THP pages. Also make
0983      * sure that the HVA and IPA are sufficiently aligned and that the
0984      * block map is contained within the memslot.
0985      */
0986     if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
0987         get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
0988         /*
0989          * The address we faulted on is backed by a transparent huge
0990          * page.  However, because we map the compound huge page and
0991          * not the individual tail page, we need to transfer the
0992          * refcount to the head page.  We have to be careful that the
0993          * THP doesn't start to split while we are adjusting the
0994          * refcounts.
0995          *
0996          * We are sure this doesn't happen, because mmu_invalidate_retry
0997          * was successful and we are holding the mmu_lock, so if this
0998          * THP is trying to split, it will be blocked in the mmu
0999          * notifier before touching any of the pages, specifically
1000          * before being able to call __split_huge_page_refcount().
1001          *
1002          * We can therefore safely transfer the refcount from PG_tail
1003          * to PG_head and switch the pfn from a tail page to the head
1004          * page accordingly.
1005          */
1006         *ipap &= PMD_MASK;
1007         kvm_release_pfn_clean(pfn);
1008         pfn &= ~(PTRS_PER_PMD - 1);
1009         get_page(pfn_to_page(pfn));
1010         *pfnp = pfn;
1011 
1012         return PMD_SIZE;
1013     }
1014 
1015     /* Use page mapping if we cannot use block mapping. */
1016     return PAGE_SIZE;
1017 }
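
/*
 * Example of the adjustment above (illustrative, 4KiB pages, so
 * PTRS_PER_PMD = 512): a fault on pfn 0x12345 backed by a 2MiB THP is
 * rounded down to the head of the huge page, pfn 0x12200, and *ipap is
 * masked to the matching 2MiB-aligned IPA, allowing a single PMD block
 * mapping at stage-2.
 */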
1018 
1019 static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
1020 {
1021     unsigned long pa;
1022 
1023     if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
1024         return huge_page_shift(hstate_vma(vma));
1025 
1026     if (!(vma->vm_flags & VM_PFNMAP))
1027         return PAGE_SHIFT;
1028 
1029     VM_BUG_ON(is_vm_hugetlb_page(vma));
1030 
1031     pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);
1032 
1033 #ifndef __PAGETABLE_PMD_FOLDED
1034     if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
1035         ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
1036         ALIGN(hva, PUD_SIZE) <= vma->vm_end)
1037         return PUD_SHIFT;
1038 #endif
1039 
1040     if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
1041         ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
1042         ALIGN(hva, PMD_SIZE) <= vma->vm_end)
1043         return PMD_SHIFT;
1044 
1045     return PAGE_SHIFT;
1046 }
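
/*
 * For VM_PFNMAP VMAs, the checks above only return PUD_SHIFT or
 * PMD_SHIFT when the hva and the underlying physical address are
 * congruent modulo the block size and the whole block lies inside the
 * VMA; e.g. with 4KiB pages, a 2MiB mapping requires hva and pa to share
 * their low 21 bits and [ALIGN_DOWN(hva, 2MiB), ALIGN(hva, 2MiB)) to be
 * covered by the VMA.
 */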
1047 
1048 /*
1049  * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
1050  * able to see the page's tags and therefore they must be initialised first. If
1051  * PG_mte_tagged is set, tags have already been initialised.
1052  *
1053  * The race in the test/set of the PG_mte_tagged flag is handled by:
1054  * - preventing VM_SHARED mappings in a memslot with MTE, which prevents two
1055  *   VMs from racing to sanitise the same page
1056  * - mmap_lock protects between a VM faulting a page in and the VMM performing
1057  *   an mprotect() to add VM_MTE
1058  */
1059 static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
1060                  unsigned long size)
1061 {
1062     unsigned long i, nr_pages = size >> PAGE_SHIFT;
1063     struct page *page;
1064 
1065     if (!kvm_has_mte(kvm))
1066         return 0;
1067 
1068     /*
1069      * pfn_to_online_page() is used to reject ZONE_DEVICE pages
1070      * that may not support tags.
1071      */
1072     page = pfn_to_online_page(pfn);
1073 
1074     if (!page)
1075         return -EFAULT;
1076 
1077     for (i = 0; i < nr_pages; i++, page++) {
1078         if (!test_bit(PG_mte_tagged, &page->flags)) {
1079             mte_clear_page_tags(page_address(page));
1080             set_bit(PG_mte_tagged, &page->flags);
1081         }
1082     }
1083 
1084     return 0;
1085 }
1086 
1087 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1088               struct kvm_memory_slot *memslot, unsigned long hva,
1089               unsigned long fault_status)
1090 {
1091     int ret = 0;
1092     bool write_fault, writable, force_pte = false;
1093     bool exec_fault;
1094     bool device = false;
1095     bool shared;
1096     unsigned long mmu_seq;
1097     struct kvm *kvm = vcpu->kvm;
1098     struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1099     struct vm_area_struct *vma;
1100     short vma_shift;
1101     gfn_t gfn;
1102     kvm_pfn_t pfn;
1103     bool logging_active = memslot_is_logging(memslot);
1104     bool use_read_lock = false;
1105     unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
1106     unsigned long vma_pagesize, fault_granule;
1107     enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
1108     struct kvm_pgtable *pgt;
1109 
1110     fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
1111     write_fault = kvm_is_write_fault(vcpu);
1112     exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
1113     VM_BUG_ON(write_fault && exec_fault);
1114 
1115     if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
1116         kvm_err("Unexpected L2 read permission error\n");
1117         return -EFAULT;
1118     }
1119 
1120     /*
1121      * Let's check if we will get back a huge page backed by hugetlbfs, or
1122      * get a block mapping for a device MMIO region.
1123      */
1124     mmap_read_lock(current->mm);
1125     vma = vma_lookup(current->mm, hva);
1126     if (unlikely(!vma)) {
1127         kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1128         mmap_read_unlock(current->mm);
1129         return -EFAULT;
1130     }
1131 
1132     /*
1133      * logging_active is guaranteed to never be true for VM_PFNMAP
1134      * memslots.
1135      */
1136     if (logging_active) {
1137         force_pte = true;
1138         vma_shift = PAGE_SHIFT;
1139         use_read_lock = (fault_status == FSC_PERM && write_fault &&
1140                  fault_granule == PAGE_SIZE);
1141     } else {
1142         vma_shift = get_vma_page_shift(vma, hva);
1143     }
1144 
1145     shared = (vma->vm_flags & VM_SHARED);
1146 
1147     switch (vma_shift) {
1148 #ifndef __PAGETABLE_PMD_FOLDED
1149     case PUD_SHIFT:
1150         if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
1151             break;
1152         fallthrough;
1153 #endif
1154     case CONT_PMD_SHIFT:
1155         vma_shift = PMD_SHIFT;
1156         fallthrough;
1157     case PMD_SHIFT:
1158         if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
1159             break;
1160         fallthrough;
1161     case CONT_PTE_SHIFT:
1162         vma_shift = PAGE_SHIFT;
1163         force_pte = true;
1164         fallthrough;
1165     case PAGE_SHIFT:
1166         break;
1167     default:
1168         WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
1169     }
1170 
1171     vma_pagesize = 1UL << vma_shift;
1172     if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
1173         fault_ipa &= ~(vma_pagesize - 1);
1174 
1175     gfn = fault_ipa >> PAGE_SHIFT;
1176     mmap_read_unlock(current->mm);
1177 
1178     /*
1179      * Permission faults just need to update the existing leaf entry,
1180      * and so normally don't require allocations from the memcache. The
1181      * only exception to this is when dirty logging is enabled at runtime
1182      * and a write fault needs to collapse a block entry into a table.
1183      */
1184     if (fault_status != FSC_PERM || (logging_active && write_fault)) {
1185         ret = kvm_mmu_topup_memory_cache(memcache,
1186                          kvm_mmu_cache_min_pages(kvm));
1187         if (ret)
1188             return ret;
1189     }
1190 
1191     mmu_seq = vcpu->kvm->mmu_invalidate_seq;
1192     /*
1193      * Ensure the read of mmu_invalidate_seq happens before we call
1194      * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1195      * the page we just got a reference to gets unmapped before we have a
1196      * chance to grab the mmu_lock, which ensures that if the page gets
1197      * unmapped afterwards, the call to kvm_unmap_gfn will take it away
1198      * from us again properly. This smp_rmb() interacts with the smp_wmb()
1199      * in kvm_mmu_notifier_invalidate_<page|range_end>.
1200      *
1201      * Besides, __gfn_to_pfn_memslot() is used instead of gfn_to_pfn_prot() to
1202      * avoid the unnecessary overhead of locating the memory slot, because the
1203      * slot is always fixed even if @gfn is adjusted for huge pages.
1204      */
1205     smp_rmb();
1206 
1207     pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
1208                    write_fault, &writable, NULL);
1209     if (pfn == KVM_PFN_ERR_HWPOISON) {
1210         kvm_send_hwpoison_signal(hva, vma_shift);
1211         return 0;
1212     }
1213     if (is_error_noslot_pfn(pfn))
1214         return -EFAULT;
1215 
1216     if (kvm_is_device_pfn(pfn)) {
1217         /*
1218          * If the page was identified as device early by looking at
1219          * the VMA flags, vma_pagesize is already representing the
1220          * largest quantity we can map.  If instead it was mapped
1221          * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
1222          * and must not be upgraded.
1223          *
1224          * In both cases, we don't let transparent_hugepage_adjust()
1225          * change things at the last minute.
1226          */
1227         device = true;
1228     } else if (logging_active && !write_fault) {
1229         /*
1230          * Only actually map the page as writable if this was a write
1231          * fault.
1232          */
1233         writable = false;
1234     }
1235 
1236     if (exec_fault && device)
1237         return -ENOEXEC;
1238 
1239     /*
1240      * To reduce MMU contention and enhance concurrency during dirty
1241      * logging, only acquire the read lock for permission
1242      * relaxation.
1243      */
1244     if (use_read_lock)
1245         read_lock(&kvm->mmu_lock);
1246     else
1247         write_lock(&kvm->mmu_lock);
1248     pgt = vcpu->arch.hw_mmu->pgt;
1249     if (mmu_invalidate_retry(kvm, mmu_seq))
1250         goto out_unlock;
1251 
1252     /*
1253      * If we are not forced to use page mapping, check if we are
1254      * backed by a THP and thus use block mapping if possible.
1255      */
1256     if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
1257         if (fault_status == FSC_PERM && fault_granule > PAGE_SIZE)
1258             vma_pagesize = fault_granule;
1259         else
1260             vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
1261                                    hva, &pfn,
1262                                    &fault_ipa);
1263     }
1264 
1265     if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
1266         /* Check the VMM hasn't introduced a new VM_SHARED VMA */
1267         if (!shared)
1268             ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
1269         else
1270             ret = -EFAULT;
1271         if (ret)
1272             goto out_unlock;
1273     }
1274 
1275     if (writable)
1276         prot |= KVM_PGTABLE_PROT_W;
1277 
1278     if (exec_fault)
1279         prot |= KVM_PGTABLE_PROT_X;
1280 
1281     if (device)
1282         prot |= KVM_PGTABLE_PROT_DEVICE;
1283     else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
1284         prot |= KVM_PGTABLE_PROT_X;
1285 
1286     /*
1287      * If we got a FSC_PERM fault, we only need to relax permissions when
1288      * vma_pagesize equals fault_granule. Otherwise,
1289      * kvm_pgtable_stage2_map() should be called to change the block size.
1290      */
1291     if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
1292         ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
1293     } else {
1294         WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n");
1295 
1296         ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
1297                          __pfn_to_phys(pfn), prot,
1298                          memcache);
1299     }
1300 
1301     /* Mark the page dirty only if the fault is handled successfully */
1302     if (writable && !ret) {
1303         kvm_set_pfn_dirty(pfn);
1304         mark_page_dirty_in_slot(kvm, memslot, gfn);
1305     }
1306 
1307 out_unlock:
1308     if (use_read_lock)
1309         read_unlock(&kvm->mmu_lock);
1310     else
1311         write_unlock(&kvm->mmu_lock);
1312     kvm_set_pfn_accessed(pfn);
1313     kvm_release_pfn_clean(pfn);
1314     return ret != -EAGAIN ? ret : 0;
1315 }
1316 
1317 /* Resolve the access fault by making the page young again. */
1318 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1319 {
1320     pte_t pte;
1321     kvm_pte_t kpte;
1322     struct kvm_s2_mmu *mmu;
1323 
1324     trace_kvm_access_fault(fault_ipa);
1325 
1326     write_lock(&vcpu->kvm->mmu_lock);
1327     mmu = vcpu->arch.hw_mmu;
1328     kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
1329     write_unlock(&vcpu->kvm->mmu_lock);
1330 
1331     pte = __pte(kpte);
1332     if (pte_valid(pte))
1333         kvm_set_pfn_accessed(pte_pfn(pte));
1334 }
1335 
1336 /**
1337  * kvm_handle_guest_abort - handles all 2nd stage aborts
1338  * @vcpu:   the VCPU pointer
1339  *
1340  * Any abort that gets to the host is almost guaranteed to be caused by a
1341  * missing second stage translation table entry: either the guest simply
1342  * needs more memory and we must allocate an appropriate page, or the guest
1343  * tried to access I/O memory, which is emulated by user space. The
1344  * distinction is based on the IPA causing the fault and whether this
1345  * memory region has been registered as standard RAM by user space.
1346  */
1347 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
1348 {
1349     unsigned long fault_status;
1350     phys_addr_t fault_ipa;
1351     struct kvm_memory_slot *memslot;
1352     unsigned long hva;
1353     bool is_iabt, write_fault, writable;
1354     gfn_t gfn;
1355     int ret, idx;
1356 
1357     fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1358 
1359     fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
1360     is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
1361 
1362     if (fault_status == FSC_FAULT) {
1363         /* Beyond sanitised PARange (which is the IPA limit) */
1364         if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
1365             kvm_inject_size_fault(vcpu);
1366             return 1;
1367         }
1368 
1369         /* Falls between the IPA range and the PARange? */
1370         if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) {
1371             fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
1372 
1373             if (is_iabt)
1374                 kvm_inject_pabt(vcpu, fault_ipa);
1375             else
1376                 kvm_inject_dabt(vcpu, fault_ipa);
1377             return 1;
1378         }
1379     }
1380 
1381     /* Synchronous External Abort? */
1382     if (kvm_vcpu_abt_issea(vcpu)) {
1383         /*
1384          * For RAS the host kernel may handle this abort.
1385          * There is no need to pass the error into the guest.
1386          */
1387         if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
1388             kvm_inject_vabt(vcpu);
1389 
1390         return 1;
1391     }
1392 
1393     trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
1394                   kvm_vcpu_get_hfar(vcpu), fault_ipa);
1395 
1396     /* Check that the stage-2 fault is a translation, permission or access fault */
1397     if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
1398         fault_status != FSC_ACCESS) {
1399         kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1400             kvm_vcpu_trap_get_class(vcpu),
1401             (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
1402             (unsigned long)kvm_vcpu_get_esr(vcpu));
1403         return -EFAULT;
1404     }
1405 
1406     idx = srcu_read_lock(&vcpu->kvm->srcu);
1407 
1408     gfn = fault_ipa >> PAGE_SHIFT;
1409     memslot = gfn_to_memslot(vcpu->kvm, gfn);
1410     hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1411     write_fault = kvm_is_write_fault(vcpu);
1412     if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
1413         /*
1414          * The guest has put either its instructions or its page-tables
1415          * somewhere it shouldn't have. Userspace won't be able to do
1416          * anything about this (there's no syndrome for a start), so
1417          * re-inject the abort back into the guest.
1418          */
1419         if (is_iabt) {
1420             ret = -ENOEXEC;
1421             goto out;
1422         }
1423 
1424         if (kvm_vcpu_abt_iss1tw(vcpu)) {
1425             kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1426             ret = 1;
1427             goto out_unlock;
1428         }
1429 
1430         /*
1431          * Check for a cache maintenance operation. Since we
1432          * ended-up here, we know it is outside of any memory
1433          * slot. But we can't find out if that is for a device,
1434          * or if the guest is just being stupid. The only thing
1435          * we know for sure is that this range cannot be cached.
1436          *
1437          * So let's assume that the guest is just being
1438          * cautious, and skip the instruction.
1439          */
1440         if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
1441             kvm_incr_pc(vcpu);
1442             ret = 1;
1443             goto out_unlock;
1444         }
1445 
1446         /*
1447          * The IPA is reported as [MAX:12], so we need to
1448          * complement it with the bottom 12 bits from the
1449          * faulting VA. This is always 12 bits, irrespective
1450          * of the page size.
1451          */
1452         fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
1453         ret = io_mem_abort(vcpu, fault_ipa);
1454         goto out_unlock;
1455     }
1456 
1457     /* Userspace should not be able to register out-of-bounds IPAs */
1458     VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
1459 
1460     if (fault_status == FSC_ACCESS) {
1461         handle_access_fault(vcpu, fault_ipa);
1462         ret = 1;
1463         goto out_unlock;
1464     }
1465 
1466     ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
1467     if (ret == 0)
1468         ret = 1;
1469 out:
1470     if (ret == -ENOEXEC) {
1471         kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1472         ret = 1;
1473     }
1474 out_unlock:
1475     srcu_read_unlock(&vcpu->kvm->srcu, idx);
1476     return ret;
1477 }
1478 
1479 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1480 {
1481     if (!kvm->arch.mmu.pgt)
1482         return false;
1483 
1484     __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
1485                  (range->end - range->start) << PAGE_SHIFT,
1486                  range->may_block);
1487 
1488     return false;
1489 }
1490 
1491 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1492 {
1493     kvm_pfn_t pfn = pte_pfn(range->pte);
1494     int ret;
1495 
1496     if (!kvm->arch.mmu.pgt)
1497         return false;
1498 
1499     WARN_ON(range->end - range->start != 1);
1500 
1501     ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
1502     if (ret)
1503         return false;
1504 
1505     /*
1506      * We've moved a page around, probably through CoW, so let's treat
1507      * it just like a translation fault and the map handler will clean
1508      * the cache to the PoC.
1509      *
1510      * The MMU notifiers will have unmapped a huge PMD before calling
1511      * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
1512      * therefore we never need to clear out a huge PMD through this
1513      * calling path and a memcache is not required.
1514      */
1515     kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
1516                    PAGE_SIZE, __pfn_to_phys(pfn),
1517                    KVM_PGTABLE_PROT_R, NULL);
1518 
1519     return false;
1520 }
1521 
1522 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1523 {
1524     u64 size = (range->end - range->start) << PAGE_SHIFT;
1525     kvm_pte_t kpte;
1526     pte_t pte;
1527 
1528     if (!kvm->arch.mmu.pgt)
1529         return false;
1530 
1531     WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
1532 
1533     kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
1534                     range->start << PAGE_SHIFT);
1535     pte = __pte(kpte);
1536     return pte_valid(pte) && pte_young(pte);
1537 }
1538 
1539 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1540 {
1541     if (!kvm->arch.mmu.pgt)
1542         return false;
1543 
1544     return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
1545                        range->start << PAGE_SHIFT);
1546 }
1547 
1548 phys_addr_t kvm_mmu_get_httbr(void)
1549 {
1550     return __pa(hyp_pgtable->pgd);
1551 }
1552 
1553 phys_addr_t kvm_get_idmap_vector(void)
1554 {
1555     return hyp_idmap_vector;
1556 }
1557 
1558 static int kvm_map_idmap_text(void)
1559 {
1560     unsigned long size = hyp_idmap_end - hyp_idmap_start;
1561     int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
1562                     PAGE_HYP_EXEC);
1563     if (err)
1564         kvm_err("Failed to idmap %lx-%lx\n",
1565             hyp_idmap_start, hyp_idmap_end);
1566 
1567     return err;
1568 }
1569 
1570 static void *kvm_hyp_zalloc_page(void *arg)
1571 {
1572     return (void *)get_zeroed_page(GFP_KERNEL);
1573 }
1574 
1575 static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
1576     .zalloc_page        = kvm_hyp_zalloc_page,
1577     .get_page       = kvm_host_get_page,
1578     .put_page       = kvm_host_put_page,
1579     .phys_to_virt       = kvm_host_va,
1580     .virt_to_phys       = kvm_host_pa,
1581 };
1582 
1583 int kvm_mmu_init(u32 *hyp_va_bits)
1584 {
1585     int err;
1586 
1587     hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
1588     hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
1589     hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
1590     hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
1591     hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
1592 
1593     /*
1594      * We rely on the linker script to ensure at build time that the HYP
1595      * init code does not cross a page boundary.
1596      */
1597     BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
1598 
1599     *hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
1600     kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
1601     kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
1602     kvm_debug("HYP VA range: %lx:%lx\n",
1603           kern_hyp_va(PAGE_OFFSET),
1604           kern_hyp_va((unsigned long)high_memory - 1));
1605 
1606     if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
1607         hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
1608         hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
1609         /*
1610          * The idmap page intersects with the HYP VA space;
1611          * it is not safe to continue further.
1612          */
1613         kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
1614         err = -EINVAL;
1615         goto out;
1616     }
1617 
1618     hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
1619     if (!hyp_pgtable) {
1620         kvm_err("Hyp mode page-table not allocated\n");
1621         err = -ENOMEM;
1622         goto out;
1623     }
1624 
1625     err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
1626     if (err)
1627         goto out_free_pgtable;
1628 
1629     err = kvm_map_idmap_text();
1630     if (err)
1631         goto out_destroy_pgtable;
1632 
1633     io_map_base = hyp_idmap_start;
1634     return 0;
1635 
1636 out_destroy_pgtable:
1637     kvm_pgtable_hyp_destroy(hyp_pgtable);
1638 out_free_pgtable:
1639     kfree(hyp_pgtable);
1640     hyp_pgtable = NULL;
1641 out:
1642     return err;
1643 }
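
/*
 * Note on the setup above: the hyp VA width is derived from idmap_t0sz
 * (e.g. T0SZ = 16 gives 48-bit EL2 VAs), the idmap text is mapped at its
 * own physical address so EL2 can enable its MMU while running from it,
 * and io_map_base starts at hyp_idmap_start so that private hyp VA
 * allocations grow downwards from just below the idmap page.
 */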
1644 
1645 void kvm_arch_commit_memory_region(struct kvm *kvm,
1646                    struct kvm_memory_slot *old,
1647                    const struct kvm_memory_slot *new,
1648                    enum kvm_mr_change change)
1649 {
1650     /*
1651      * At this point memslot has been committed and there is an
1652      * allocated dirty_bitmap[], dirty pages will be tracked while the
1653      * memory slot is write protected.
1654      */
1655     if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1656         /*
1657          * If we're with initial-all-set, we don't need to write
1658          * protect any pages because they're all reported as dirty.
1659          * Huge pages and normal pages will be write protected gradually.
1660          */
1661         if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
1662             kvm_mmu_wp_memory_region(kvm, new->id);
1663         }
1664     }
1665 }
1666 
1667 int kvm_arch_prepare_memory_region(struct kvm *kvm,
1668                    const struct kvm_memory_slot *old,
1669                    struct kvm_memory_slot *new,
1670                    enum kvm_mr_change change)
1671 {
1672     hva_t hva, reg_end;
1673     int ret = 0;
1674 
1675     if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
1676             change != KVM_MR_FLAGS_ONLY)
1677         return 0;
1678 
1679     /*
1680      * Prevent userspace from creating a memory region outside of the
1681      * IPA space addressable by the KVM guest.
1682      */
1683     if ((new->base_gfn + new->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
1684         return -EFAULT;
1685 
1686     hva = new->userspace_addr;
1687     reg_end = hva + (new->npages << PAGE_SHIFT);
1688 
1689     mmap_read_lock(current->mm);
1690     /*
1691      * A memory region could potentially cover multiple VMAs, and any holes
1692      * between them, so iterate over all of them.
1693      *
1694      *     +--------------------------------------------+
1695      * +---------------+----------------+   +----------------+
1696      * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
1697      * +---------------+----------------+   +----------------+
1698      *     |               memory region                |
1699      *     +--------------------------------------------+
1700      */
1701     do {
1702         struct vm_area_struct *vma;
1703 
1704         vma = find_vma_intersection(current->mm, hva, reg_end);
1705         if (!vma)
1706             break;
1707 
1708         /*
1709          * VM_SHARED mappings are not allowed with MTE to avoid races
1710          * when updating the PG_mte_tagged page flag, see
1711          * sanitise_mte_tags for more details.
1712          */
1713         if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) {
1714             ret = -EINVAL;
1715             break;
1716         }
1717 
1718         if (vma->vm_flags & VM_PFNMAP) {
1719             /* IO region dirty page logging not allowed */
1720             if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1721                 ret = -EINVAL;
1722                 break;
1723             }
1724         }
1725         hva = min(reg_end, vma->vm_end);
1726     } while (hva < reg_end);
1727 
1728     mmap_read_unlock(current->mm);
1729     return ret;
1730 }
1731 
1732 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
1733 {
1734 }
1735 
1736 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
1737 {
1738 }
1739 
1740 void kvm_arch_flush_shadow_all(struct kvm *kvm)
1741 {
1742     kvm_free_stage2_pgd(&kvm->arch.mmu);
1743 }
1744 
1745 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
1746                    struct kvm_memory_slot *slot)
1747 {
1748     gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
1749     phys_addr_t size = slot->npages << PAGE_SHIFT;
1750 
1751     write_lock(&kvm->mmu_lock);
1752     unmap_stage2_range(&kvm->arch.mmu, gpa, size);
1753     write_unlock(&kvm->mmu_lock);
1754 }
1755 
1756 /*
1757  * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
1758  *
1759  * Main problems:
1760  * - S/W ops are local to a CPU (not broadcast)
1761  * - We have line migration behind our back (speculation)
1762  * - System caches don't support S/W at all (damn!)
1763  *
1764  * In the face of the above, the best we can do is to try and convert
1765  * S/W ops to VA ops. Because the guest is not allowed to infer the
1766  * S/W to PA mapping, it can only use S/W to nuke the whole cache,
1767  * which is a rather good thing for us.
1768  *
1769  * Also, it is only used when turning caches on/off ("The expected
1770  * usage of the cache maintenance instructions that operate by set/way
1771  * is associated with the cache maintenance instructions associated
1772  * with the powerdown and powerup of caches, if this is required by
1773  * the implementation.").
1774  *
1775  * We use the following policy:
1776  *
1777  * - If we trap a S/W operation, we enable VM trapping to detect
1778  *   caches being turned on/off, and do a full clean.
1779  *
1780  * - We flush the caches on both caches being turned on and off.
1781  *
1782  * - Once the caches are enabled, we stop trapping VM ops.
1783  */
1784 void kvm_set_way_flush(struct kvm_vcpu *vcpu)
1785 {
1786     unsigned long hcr = *vcpu_hcr(vcpu);
1787 
1788     /*
1789      * If this is the first time we do a S/W operation
1790      * (i.e. HCR_TVM not set) flush the whole memory, and set the
1791      * VM trapping.
1792      *
1793      * Otherwise, rely on the VM trapping to wait for the MMU +
1794      * Caches to be turned off. At that point, we'll be able to
1795      * clean the caches again.
1796      */
1797     if (!(hcr & HCR_TVM)) {
1798         trace_kvm_set_way_flush(*vcpu_pc(vcpu),
1799                     vcpu_has_cache_enabled(vcpu));
1800         stage2_flush_vm(vcpu->kvm);
1801         *vcpu_hcr(vcpu) = hcr | HCR_TVM;
1802     }
1803 }
1804 
1805 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
1806 {
1807     bool now_enabled = vcpu_has_cache_enabled(vcpu);
1808 
1809     /*
1810      * If switching the MMU+caches on, need to invalidate the caches.
1811      * If switching it off, need to clean the caches.
1812      * Clean + invalidate does the trick always.
1813      */
1814     if (now_enabled != was_enabled)
1815         stage2_flush_vm(vcpu->kvm);
1816 
1817     /* Caches are now on, stop trapping VM ops (until a S/W op) */
1818     if (now_enabled)
1819         *vcpu_hcr(vcpu) &= ~HCR_TVM;
1820 
1821     trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
1822 }