0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Kernel-based Virtual Machine driver for Linux
0004  *
0005  * This module enables machines with Intel VT-x extensions to run virtual
0006  * machines without emulation or binary translation.
0007  *
0008  * MMU support
0009  *
0010  * Copyright (C) 2006 Qumranet, Inc.
0011  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
0012  *
0013  * Authors:
0014  *   Yaniv Kamay  <yaniv@qumranet.com>
0015  *   Avi Kivity   <avi@qumranet.com>
0016  */
0017 
0018 #include "irq.h"
0019 #include "ioapic.h"
0020 #include "mmu.h"
0021 #include "mmu_internal.h"
0022 #include "tdp_mmu.h"
0023 #include "x86.h"
0024 #include "kvm_cache_regs.h"
0025 #include "kvm_emulate.h"
0026 #include "cpuid.h"
0027 #include "spte.h"
0028 
0029 #include <linux/kvm_host.h>
0030 #include <linux/types.h>
0031 #include <linux/string.h>
0032 #include <linux/mm.h>
0033 #include <linux/highmem.h>
0034 #include <linux/moduleparam.h>
0035 #include <linux/export.h>
0036 #include <linux/swap.h>
0037 #include <linux/hugetlb.h>
0038 #include <linux/compiler.h>
0039 #include <linux/srcu.h>
0040 #include <linux/slab.h>
0041 #include <linux/sched/signal.h>
0042 #include <linux/uaccess.h>
0043 #include <linux/hash.h>
0044 #include <linux/kern_levels.h>
0045 #include <linux/kthread.h>
0046 
0047 #include <asm/page.h>
0048 #include <asm/memtype.h>
0049 #include <asm/cmpxchg.h>
0050 #include <asm/io.h>
0051 #include <asm/set_memory.h>
0052 #include <asm/vmx.h>
0053 #include <asm/kvm_page_track.h>
0054 #include "trace.h"
0055 
0056 extern bool itlb_multihit_kvm_mitigation;
0057 
0058 int __read_mostly nx_huge_pages = -1;
0059 static uint __read_mostly nx_huge_pages_recovery_period_ms;
0060 #ifdef CONFIG_PREEMPT_RT
0061 /* Recovery can cause latency spikes, disable it for PREEMPT_RT.  */
0062 static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
0063 #else
0064 static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
0065 #endif
0066 
0067 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
0068 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
0069 
0070 static const struct kernel_param_ops nx_huge_pages_ops = {
0071     .set = set_nx_huge_pages,
0072     .get = param_get_bool,
0073 };
0074 
0075 static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
0076     .set = set_nx_huge_pages_recovery_param,
0077     .get = param_get_uint,
0078 };
0079 
0080 module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
0081 __MODULE_PARM_TYPE(nx_huge_pages, "bool");
0082 module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
0083         &nx_huge_pages_recovery_ratio, 0644);
0084 __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
0085 module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
0086         &nx_huge_pages_recovery_period_ms, 0644);
0087 __MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
0088 
0089 static bool __read_mostly force_flush_and_sync_on_reuse;
0090 module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
0091 
0092 /*
0093  * When this variable is set to true, it enables Two-Dimensional Paging,
0094  * where the hardware walks two page tables:
0095  * 1. the guest-virtual to guest-physical translation
0096  * 2. while doing 1., it also walks guest-physical to host-physical
0097  * If the hardware supports this, shadow paging is not needed.
0098  */
0099 bool tdp_enabled = false;
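/*
 * Illustrative sketch of the two-dimensional walk described above: a minimal
 * userspace model (all names and the single-level "tables" are invented for
 * illustration) in which resolving a guest-virtual address first consults a
 * guest page table, and the resulting guest-physical address is then run
 * through a second, host-controlled table, as EPT/NPT hardware does.
 */
#if 0 /* standalone userspace sketch, not part of this file's build */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SHIFT 12
#define SKETCH_NPAGES     16

static uint64_t gpa_to_hpa[SKETCH_NPAGES];	/* "EPT/NPT": gpa frame -> hpa frame */
static uint64_t guest_pt[SKETCH_NPAGES];	/* guest PT:  gva frame -> gpa frame */

static uint64_t translate_gpa(uint64_t gpa)
{
	uint64_t off = gpa & ((1 << SKETCH_PAGE_SHIFT) - 1);

	return (gpa_to_hpa[gpa >> SKETCH_PAGE_SHIFT] << SKETCH_PAGE_SHIFT) | off;
}

static uint64_t translate_gva(uint64_t gva)
{
	uint64_t off = gva & ((1 << SKETCH_PAGE_SHIFT) - 1);
	/* 1. walk the guest page table (gva -> gpa) ... */
	uint64_t gpa = (guest_pt[gva >> SKETCH_PAGE_SHIFT] << SKETCH_PAGE_SHIFT) | off;

	/* 2. ... and translate the result again (gpa -> hpa). */
	return translate_gpa(gpa);
}

int main(void)
{
	guest_pt[3] = 7;	/* gva page 3 -> gpa page 7 */
	gpa_to_hpa[7] = 11;	/* gpa page 7 -> hpa page 11 */
	printf("gva 0x3123 -> hpa 0x%llx\n",
	       (unsigned long long)translate_gva(0x3123));
	return 0;
}
#endif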
0100 
0101 static int max_huge_page_level __read_mostly;
0102 static int tdp_root_level __read_mostly;
0103 static int max_tdp_level __read_mostly;
0104 
0105 #ifdef MMU_DEBUG
0106 bool dbg = 0;
0107 module_param(dbg, bool, 0644);
0108 #endif
0109 
0110 #define PTE_PREFETCH_NUM        8
0111 
0112 #include <trace/events/kvm.h>
0113 
0114 /* make pte_list_desc fit well in cache lines */
0115 #define PTE_LIST_EXT 14
0116 
0117 /*
0118  * Slight optimization of cacheline layout: putting `more' and `spte_count'
0119  * at the start means that an access touches only a single cacheline in both
0120  * the full (entries == PTE_LIST_EXT) case and the entries <= 6 case.
0121  */
0122 struct pte_list_desc {
0123     struct pte_list_desc *more;
0124     /*
0125      * Number of entries stored in this pte_list_desc.  It does not need to be
0126      * u64; using one just keeps the alignment simple.  PTE_LIST_EXT means full.
0127      */
0128     u64 spte_count;
0129     u64 *sptes[PTE_LIST_EXT];
0130 };
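/*
 * A standalone check of the layout arithmetic behind the comment above,
 * assuming 8-byte pointers and 64-byte cache lines (an assumption, not taken
 * from this file): the 16-byte header plus sptes[0..5] ends exactly at byte
 * 64, and a full descriptor spans two cache lines.
 */
#if 0 /* standalone userspace sketch, not part of this file's build */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PTE_LIST_EXT 14

struct sketch_pte_list_desc {
	struct sketch_pte_list_desc *more;
	uint64_t spte_count;
	uint64_t *sptes[SKETCH_PTE_LIST_EXT];
};

int main(void)
{
	printf("sizeof = %zu bytes (two 64-byte lines when full)\n",
	       sizeof(struct sketch_pte_list_desc));
	printf("header + 6 entries end at byte %zu (one cache line)\n",
	       offsetof(struct sketch_pte_list_desc, sptes[6]));
	return 0;
}
#endif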
0131 
0132 struct kvm_shadow_walk_iterator {
0133     u64 addr;
0134     hpa_t shadow_addr;
0135     u64 *sptep;
0136     int level;
0137     unsigned index;
0138 };
0139 
0140 #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
0141     for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
0142                      (_root), (_addr));                \
0143          shadow_walk_okay(&(_walker));                     \
0144          shadow_walk_next(&(_walker)))
0145 
0146 #define for_each_shadow_entry(_vcpu, _addr, _walker)            \
0147     for (shadow_walk_init(&(_walker), _vcpu, _addr);    \
0148          shadow_walk_okay(&(_walker));          \
0149          shadow_walk_next(&(_walker)))
0150 
0151 #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
0152     for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
0153          shadow_walk_okay(&(_walker)) &&                \
0154         ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
0155          __shadow_walk_next(&(_walker), spte))
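/*
 * A hedged sketch of how the walker macros above are typically used: the loop
 * body visits one sptep per paging level, from the root downwards.  The
 * function name and body here are illustrative only.
 */
#if 0 /* illustrative usage sketch, not part of this file */
static void example_shadow_walk(struct kvm_vcpu *vcpu, u64 addr)
{
	struct kvm_shadow_walk_iterator iterator;

	for_each_shadow_entry(vcpu, addr, iterator) {
		u64 spte = *iterator.sptep;

		/* Stop at the first non-present entry on the walk. */
		if (!is_shadow_present_pte(spte))
			break;
	}
}
#endif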
0156 
0157 static struct kmem_cache *pte_list_desc_cache;
0158 struct kmem_cache *mmu_page_header_cache;
0159 static struct percpu_counter kvm_total_used_mmu_pages;
0160 
0161 static void mmu_spte_set(u64 *sptep, u64 spte);
0162 
0163 struct kvm_mmu_role_regs {
0164     const unsigned long cr0;
0165     const unsigned long cr4;
0166     const u64 efer;
0167 };
0168 
0169 #define CREATE_TRACE_POINTS
0170 #include "mmutrace.h"
0171 
0172 /*
0173  * Yes, lots of underscores.  They're a hint that you probably shouldn't be
0174  * reading from the role_regs.  Once the root_role is constructed, it becomes
0175  * the single source of truth for the MMU's state.
0176  */
0177 #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)           \
0178 static inline bool __maybe_unused                   \
0179 ____is_##reg##_##name(const struct kvm_mmu_role_regs *regs)     \
0180 {                                   \
0181     return !!(regs->reg & flag);                    \
0182 }
0183 BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
0184 BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
0185 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
0186 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
0187 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
0188 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
0189 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
0190 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
0191 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
0192 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
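/*
 * For reference, one invocation of the macro above spelled out by hand;
 * BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG) expands to:
 */
#if 0 /* expansion shown for illustration; the macro already generates it */
static inline bool __maybe_unused
____is_cr0_pg(const struct kvm_mmu_role_regs *regs)
{
	return !!(regs->cr0 & X86_CR0_PG);
}
#endif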
0193 
0194 /*
0195  * The MMU itself (with a valid role) is the single source of truth for the
0196  * MMU.  Do not use the regs used to build the MMU/role, nor the vCPU.  The
0197  * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
0198  * and the vCPU may be incorrect/irrelevant.
0199  */
0200 #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)     \
0201 static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)    \
0202 {                               \
0203     return !!(mmu->cpu_role. base_or_ext . reg##_##name);   \
0204 }
0205 BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
0206 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pse);
0207 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smep);
0208 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smap);
0209 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pke);
0210 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, la57);
0211 BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
0212 BUILD_MMU_ROLE_ACCESSOR(ext,  efer, lma);
0213 
0214 static inline bool is_cr0_pg(struct kvm_mmu *mmu)
0215 {
0216         return mmu->cpu_role.base.level > 0;
0217 }
0218 
0219 static inline bool is_cr4_pae(struct kvm_mmu *mmu)
0220 {
0221         return !mmu->cpu_role.base.has_4_byte_gpte;
0222 }
0223 
0224 static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
0225 {
0226     struct kvm_mmu_role_regs regs = {
0227         .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
0228         .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
0229         .efer = vcpu->arch.efer,
0230     };
0231 
0232     return regs;
0233 }
0234 
0235 static inline bool kvm_available_flush_tlb_with_range(void)
0236 {
0237     return kvm_x86_ops.tlb_remote_flush_with_range;
0238 }
0239 
0240 static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
0241         struct kvm_tlb_range *range)
0242 {
0243     int ret = -ENOTSUPP;
0244 
0245     if (range && kvm_x86_ops.tlb_remote_flush_with_range)
0246         ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);
0247 
0248     if (ret)
0249         kvm_flush_remote_tlbs(kvm);
0250 }
0251 
0252 void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
0253         u64 start_gfn, u64 pages)
0254 {
0255     struct kvm_tlb_range range;
0256 
0257     range.start_gfn = start_gfn;
0258     range.pages = pages;
0259 
0260     kvm_flush_remote_tlbs_with_range(kvm, &range);
0261 }
0262 
0263 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
0264                unsigned int access)
0265 {
0266     u64 spte = make_mmio_spte(vcpu, gfn, access);
0267 
0268     trace_mark_mmio_spte(sptep, gfn, spte);
0269     mmu_spte_set(sptep, spte);
0270 }
0271 
0272 static gfn_t get_mmio_spte_gfn(u64 spte)
0273 {
0274     u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
0275 
0276     gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
0277            & shadow_nonpresent_or_rsvd_mask;
0278 
0279     return gpa >> PAGE_SHIFT;
0280 }
0281 
0282 static unsigned get_mmio_spte_access(u64 spte)
0283 {
0284     return spte & shadow_mmio_access_mask;
0285 }
0286 
0287 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
0288 {
0289     u64 kvm_gen, spte_gen, gen;
0290 
0291     gen = kvm_vcpu_memslots(vcpu)->generation;
0292     if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
0293         return false;
0294 
0295     kvm_gen = gen & MMIO_SPTE_GEN_MASK;
0296     spte_gen = get_mmio_spte_generation(spte);
0297 
0298     trace_check_mmio_spte(spte, kvm_gen, spte_gen);
0299     return likely(kvm_gen == spte_gen);
0300 }
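/*
 * The generation handshake in check_mmio_spte() modeled standalone: a cached
 * entry remembers the memslot generation at creation time and is rejected if
 * an update is in flight or the generation has since moved on.  The flag bit
 * and names below are placeholders, not the real MMIO-SPTE layout.
 */
#if 0 /* standalone userspace sketch, not part of this file's build */
#include <stdbool.h>
#include <stdint.h>

#define SKETCH_GEN_UPDATE_IN_PROGRESS (1ull << 63)	/* placeholder flag */

struct sketch_cached_entry {
	uint64_t gen;	/* generation captured when the entry was created */
};

static bool sketch_entry_is_valid(const struct sketch_cached_entry *e,
				  uint64_t current_gen)
{
	/* Never trust the cache while the memslots are being updated. */
	if (current_gen & SKETCH_GEN_UPDATE_IN_PROGRESS)
		return false;

	/* Stale if the memslot layout changed since the entry was made. */
	return e->gen == current_gen;
}

int main(void)
{
	struct sketch_cached_entry e = { .gen = 41 };

	return !(sketch_entry_is_valid(&e, 41) && !sketch_entry_is_valid(&e, 42));
}
#endif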
0301 
0302 static int is_cpuid_PSE36(void)
0303 {
0304     return 1;
0305 }
0306 
0307 #ifdef CONFIG_X86_64
0308 static void __set_spte(u64 *sptep, u64 spte)
0309 {
0310     WRITE_ONCE(*sptep, spte);
0311 }
0312 
0313 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
0314 {
0315     WRITE_ONCE(*sptep, spte);
0316 }
0317 
0318 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
0319 {
0320     return xchg(sptep, spte);
0321 }
0322 
0323 static u64 __get_spte_lockless(u64 *sptep)
0324 {
0325     return READ_ONCE(*sptep);
0326 }
0327 #else
0328 union split_spte {
0329     struct {
0330         u32 spte_low;
0331         u32 spte_high;
0332     };
0333     u64 spte;
0334 };
0335 
0336 static void count_spte_clear(u64 *sptep, u64 spte)
0337 {
0338     struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
0339 
0340     if (is_shadow_present_pte(spte))
0341         return;
0342 
0343     /* Ensure the spte is completely set before we increase the count */
0344     smp_wmb();
0345     sp->clear_spte_count++;
0346 }
0347 
0348 static void __set_spte(u64 *sptep, u64 spte)
0349 {
0350     union split_spte *ssptep, sspte;
0351 
0352     ssptep = (union split_spte *)sptep;
0353     sspte = (union split_spte)spte;
0354 
0355     ssptep->spte_high = sspte.spte_high;
0356 
0357     /*
0358      * When mapping the spte from nonpresent to present, store the high
0359      * bits first and only then set the present bit, so the CPU cannot
0360      * fetch a half-written spte while we are updating it.
0361      */
0362     smp_wmb();
0363 
0364     WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
0365 }
0366 
0367 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
0368 {
0369     union split_spte *ssptep, sspte;
0370 
0371     ssptep = (union split_spte *)sptep;
0372     sspte = (union split_spte)spte;
0373 
0374     WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
0375 
0376     /*
0377      * When changing the spte from present to nonpresent, clear the
0378      * present bit first so that a vCPU cannot fetch stale high bits.
0379      */
0380     smp_wmb();
0381 
0382     ssptep->spte_high = sspte.spte_high;
0383     count_spte_clear(sptep, spte);
0384 }
0385 
0386 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
0387 {
0388     union split_spte *ssptep, sspte, orig;
0389 
0390     ssptep = (union split_spte *)sptep;
0391     sspte = (union split_spte)spte;
0392 
0393     /* xchg acts as a barrier before the setting of the high bits */
0394     orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
0395     orig.spte_high = ssptep->spte_high;
0396     ssptep->spte_high = sspte.spte_high;
0397     count_spte_clear(sptep, spte);
0398 
0399     return orig.spte;
0400 }
0401 
0402 /*
0403  * The idea behind this lightweight way of getting the spte on x86_32
0404  * comes from gup_get_pte (mm/gup.c).
0405  *
0406  * An spte tlb flush may be pending, because kvm_set_pte_rmap
0407  * coalesces them and we are running outside the MMU lock.  Therefore
0408  * we need to protect against in-progress updates of the spte.
0409  *
0410  * Reading the spte while an update is in progress may return the old value
0411  * for the high part of the spte.  The race is fine for a present->non-present
0412  * change (because the high part of the spte is ignored for non-present sptes),
0413  * but for a present->present change we must reread the spte.
0414  *
0415  * All such changes are done in two steps (present->non-present and then
0416  * non-present->present), hence it is enough to count the number of
0417  * present->non-present updates: if it changed while reading the spte,
0418  * we might have hit the race.  This is done using clear_spte_count.
0419  */
0420 static u64 __get_spte_lockless(u64 *sptep)
0421 {
0422     struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
0423     union split_spte spte, *orig = (union split_spte *)sptep;
0424     int count;
0425 
0426 retry:
0427     count = sp->clear_spte_count;
0428     smp_rmb();
0429 
0430     spte.spte_low = orig->spte_low;
0431     smp_rmb();
0432 
0433     spte.spte_high = orig->spte_high;
0434     smp_rmb();
0435 
0436     if (unlikely(spte.spte_low != orig->spte_low ||
0437           count != sp->clear_spte_count))
0438         goto retry;
0439 
0440     return spte.spte;
0441 }
0442 #endif
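/*
 * The clear_spte_count retry protocol above, modeled as a single-threaded
 * userspace sketch: a writer tears a 64-bit value down in two 32-bit halves
 * and bumps a counter afterwards; a lockless reader retries if the low half
 * or the counter changed under it.  Real memory barriers (smp_wmb/smp_rmb)
 * and actual concurrency are deliberately omitted here.
 */
#if 0 /* standalone userspace sketch, not part of this file's build */
#include <stdint.h>
#include <stdio.h>

union sketch_split64 {
	struct { uint32_t lo, hi; };	/* little-endian layout assumed */
	uint64_t val;
};

static union sketch_split64 slot;
static unsigned int clear_count;	/* counts present -> non-present updates */

static void sketch_writer_clear(void)
{
	slot.lo = 0;		/* clear the "present" half first ... */
	slot.hi = 0;		/* ... then the high half ... */
	clear_count++;		/* ... and only then publish the teardown */
}

static uint64_t sketch_reader_lockless(void)
{
	union sketch_split64 copy;
	unsigned int count;

	do {
		count = clear_count;
		copy.lo = slot.lo;
		copy.hi = slot.hi;
		/* Retry if a teardown raced with us or the low half moved. */
	} while (copy.lo != slot.lo || count != clear_count);

	return copy.val;
}

int main(void)
{
	slot.val = 0x1234567890abcdefull;
	printf("0x%llx\n", (unsigned long long)sketch_reader_lockless());
	sketch_writer_clear();
	printf("0x%llx\n", (unsigned long long)sketch_reader_lockless());
	return 0;
}
#endif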
0443 
0444 /* Rules for using mmu_spte_set:
0445  * Set the sptep from nonpresent to present.
0446  * Note: the sptep being assigned *must* be either not present
0447  * or in a state where the hardware will not attempt to update
0448  * the spte.
0449  */
0450 static void mmu_spte_set(u64 *sptep, u64 new_spte)
0451 {
0452     WARN_ON(is_shadow_present_pte(*sptep));
0453     __set_spte(sptep, new_spte);
0454 }
0455 
0456 /*
0457  * Update the SPTE (excluding the PFN), but do not track changes in its
0458  * accessed/dirty status.
0459  */
0460 static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
0461 {
0462     u64 old_spte = *sptep;
0463 
0464     WARN_ON(!is_shadow_present_pte(new_spte));
0465     check_spte_writable_invariants(new_spte);
0466 
0467     if (!is_shadow_present_pte(old_spte)) {
0468         mmu_spte_set(sptep, new_spte);
0469         return old_spte;
0470     }
0471 
0472     if (!spte_has_volatile_bits(old_spte))
0473         __update_clear_spte_fast(sptep, new_spte);
0474     else
0475         old_spte = __update_clear_spte_slow(sptep, new_spte);
0476 
0477     WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
0478 
0479     return old_spte;
0480 }
0481 
0482 /* Rules for using mmu_spte_update:
0483  * Update the state bits; the mapped pfn is not changed.
0484  *
0485  * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
0486  * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
0487  * spte, even though the writable spte might be cached on a CPU's TLB.
0488  *
0489  * Returns true if the TLB needs to be flushed
0490  */
0491 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
0492 {
0493     bool flush = false;
0494     u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
0495 
0496     if (!is_shadow_present_pte(old_spte))
0497         return false;
0498 
0499     /*
0500      * Updating the spte outside of mmu_lock is safe because it is
0501      * always updated atomically; see the comments in
0502      * spte_has_volatile_bits().
0503      */
0504     if (is_mmu_writable_spte(old_spte) &&
0505           !is_writable_pte(new_spte))
0506         flush = true;
0507 
0508     /*
0509      * Flush TLB when accessed/dirty states are changed in the page tables,
0510      * to guarantee consistency between TLB and page tables.
0511      */
0512 
0513     if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
0514         flush = true;
0515         kvm_set_pfn_accessed(spte_to_pfn(old_spte));
0516     }
0517 
0518     if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
0519         flush = true;
0520         kvm_set_pfn_dirty(spte_to_pfn(old_spte));
0521     }
0522 
0523     return flush;
0524 }
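/*
 * The three flush conditions checked by mmu_spte_update() above, restated
 * over simplified placeholder flag bits (not the real SPTE encoding).
 */
#if 0 /* standalone userspace sketch, not part of this file's build */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SK_WRITABLE	(1u << 0)	/* hardware-writable */
#define SK_MMU_WRITABLE	(1u << 1)	/* may be (re)made writable */
#define SK_ACCESSED	(1u << 2)
#define SK_DIRTY	(1u << 3)

static bool sketch_needs_remote_flush(uint32_t old_spte, uint32_t new_spte)
{
	/* Another CPU may still hold a writable translation for old_spte. */
	if ((old_spte & SK_MMU_WRITABLE) && !(new_spte & SK_WRITABLE))
		return true;

	/* Losing Accessed/Dirty state must be made visible everywhere. */
	if ((old_spte & SK_ACCESSED) && !(new_spte & SK_ACCESSED))
		return true;
	if ((old_spte & SK_DIRTY) && !(new_spte & SK_DIRTY))
		return true;

	return false;
}

int main(void)
{
	printf("%d\n", sketch_needs_remote_flush(SK_MMU_WRITABLE | SK_WRITABLE,
						 SK_MMU_WRITABLE));
	return 0;
}
#endif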
0525 
0526 /*
0527  * Rules for using mmu_spte_clear_track_bits:
0528  * It sets the sptep from present to nonpresent while tracking the
0529  * state bits; it is used to clear last-level sptes.
0530  * Returns the old PTE.
0531  */
0532 static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
0533 {
0534     kvm_pfn_t pfn;
0535     u64 old_spte = *sptep;
0536     int level = sptep_to_sp(sptep)->role.level;
0537     struct page *page;
0538 
0539     if (!is_shadow_present_pte(old_spte) ||
0540         !spte_has_volatile_bits(old_spte))
0541         __update_clear_spte_fast(sptep, 0ull);
0542     else
0543         old_spte = __update_clear_spte_slow(sptep, 0ull);
0544 
0545     if (!is_shadow_present_pte(old_spte))
0546         return old_spte;
0547 
0548     kvm_update_page_stats(kvm, level, -1);
0549 
0550     pfn = spte_to_pfn(old_spte);
0551 
0552     /*
0553      * KVM doesn't hold a reference to any pages mapped into the guest, and
0554      * instead uses the mmu_notifier to ensure that KVM unmaps any pages
0555      * before they are reclaimed.  Sanity check that, if the pfn is backed
0556      * by a refcounted page, the refcount is elevated.
0557      */
0558     page = kvm_pfn_to_refcounted_page(pfn);
0559     WARN_ON(page && !page_count(page));
0560 
0561     if (is_accessed_spte(old_spte))
0562         kvm_set_pfn_accessed(pfn);
0563 
0564     if (is_dirty_spte(old_spte))
0565         kvm_set_pfn_dirty(pfn);
0566 
0567     return old_spte;
0568 }
0569 
0570 /*
0571  * Rules for using mmu_spte_clear_no_track:
0572  * Directly clear the spte without tracking its state bits;
0573  * it is used when clearing an upper-level spte.
0574  */
0575 static void mmu_spte_clear_no_track(u64 *sptep)
0576 {
0577     __update_clear_spte_fast(sptep, 0ull);
0578 }
0579 
0580 static u64 mmu_spte_get_lockless(u64 *sptep)
0581 {
0582     return __get_spte_lockless(sptep);
0583 }
0584 
0585 /* Returns the Accessed status of the PTE and resets it at the same time. */
0586 static bool mmu_spte_age(u64 *sptep)
0587 {
0588     u64 spte = mmu_spte_get_lockless(sptep);
0589 
0590     if (!is_accessed_spte(spte))
0591         return false;
0592 
0593     if (spte_ad_enabled(spte)) {
0594         clear_bit((ffs(shadow_accessed_mask) - 1),
0595               (unsigned long *)sptep);
0596     } else {
0597         /*
0598          * Capture the dirty status of the page, so that it doesn't get
0599          * lost when the SPTE is marked for access tracking.
0600          */
0601         if (is_writable_pte(spte))
0602             kvm_set_pfn_dirty(spte_to_pfn(spte));
0603 
0604         spte = mark_spte_for_access_track(spte);
0605         mmu_spte_update_no_track(sptep, spte);
0606     }
0607 
0608     return true;
0609 }
0610 
0611 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
0612 {
0613     if (is_tdp_mmu(vcpu->arch.mmu)) {
0614         kvm_tdp_mmu_walk_lockless_begin();
0615     } else {
0616         /*
0617          * Prevent page table teardown by making any free-er wait during
0618          * kvm_flush_remote_tlbs() IPI to all active vcpus.
0619          */
0620         local_irq_disable();
0621 
0622         /*
0623          * Make sure a following spte read is not reordered ahead of the write
0624          * to vcpu->mode.
0625          */
0626         smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
0627     }
0628 }
0629 
0630 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
0631 {
0632     if (is_tdp_mmu(vcpu->arch.mmu)) {
0633         kvm_tdp_mmu_walk_lockless_end();
0634     } else {
0635         /*
0636          * Make sure the write to vcpu->mode is not reordered in front of
0637          * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
0638          * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
0639          */
0640         smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
0641         local_irq_enable();
0642     }
0643 }
0644 
0645 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
0646 {
0647     int r;
0648 
0649     /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
0650     r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
0651                        1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
0652     if (r)
0653         return r;
0654     r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
0655                        PT64_ROOT_MAX_LEVEL);
0656     if (r)
0657         return r;
0658     if (maybe_indirect) {
0659         r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
0660                            PT64_ROOT_MAX_LEVEL);
0661         if (r)
0662             return r;
0663     }
0664     return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
0665                       PT64_ROOT_MAX_LEVEL);
0666 }
0667 
0668 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
0669 {
0670     kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
0671     kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
0672     kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
0673     kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
0674 }
0675 
0676 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
0677 {
0678     kmem_cache_free(pte_list_desc_cache, pte_list_desc);
0679 }
0680 
0681 static bool sp_has_gptes(struct kvm_mmu_page *sp);
0682 
0683 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
0684 {
0685     if (sp->role.passthrough)
0686         return sp->gfn;
0687 
0688     if (!sp->role.direct)
0689         return sp->shadowed_translation[index] >> PAGE_SHIFT;
0690 
0691     return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
0692 }
0693 
0694 /*
0695  * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note
0696  * that the SPTE itself may have more constrained access permissions than
0697  * what the guest enforces. For example, a guest may create an executable
0698  * huge PTE but KVM may disallow execution to mitigate iTLB multihit.
0699  */
0700 static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
0701 {
0702     if (sp_has_gptes(sp))
0703         return sp->shadowed_translation[index] & ACC_ALL;
0704 
0705     /*
0706      * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs,
0707      * KVM is not shadowing any guest page tables, so the "guest access
0708      * permissions" are just ACC_ALL.
0709      *
0710      * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM
0711      * is shadowing a guest huge page with small pages, the guest access
0712      * permissions being shadowed are the access permissions of the huge
0713      * page.
0714      *
0715      * In both cases, sp->role.access contains the correct access bits.
0716      */
0717     return sp->role.access;
0718 }
0719 
0720 static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
0721                      gfn_t gfn, unsigned int access)
0722 {
0723     if (sp_has_gptes(sp)) {
0724         sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
0725         return;
0726     }
0727 
0728     WARN_ONCE(access != kvm_mmu_page_get_access(sp, index),
0729               "access mismatch under %s page %llx (expected %u, got %u)\n",
0730               sp->role.passthrough ? "passthrough" : "direct",
0731               sp->gfn, kvm_mmu_page_get_access(sp, index), access);
0732 
0733     WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index),
0734               "gfn mismatch under %s page %llx (expected %llx, got %llx)\n",
0735               sp->role.passthrough ? "passthrough" : "direct",
0736               sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn);
0737 }
0738 
0739 static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index,
0740                     unsigned int access)
0741 {
0742     gfn_t gfn = kvm_mmu_page_get_gfn(sp, index);
0743 
0744     kvm_mmu_page_set_translation(sp, index, gfn, access);
0745 }
0746 
0747 /*
0748  * Return the pointer to the large page information for a given gfn,
0749  * handling slots that are not large page aligned.
0750  */
0751 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
0752         const struct kvm_memory_slot *slot, int level)
0753 {
0754     unsigned long idx;
0755 
0756     idx = gfn_to_index(gfn, slot->base_gfn, level);
0757     return &slot->arch.lpage_info[level - 2][idx];
0758 }
0759 
0760 static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
0761                         gfn_t gfn, int count)
0762 {
0763     struct kvm_lpage_info *linfo;
0764     int i;
0765 
0766     for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
0767         linfo = lpage_info_slot(gfn, slot, i);
0768         linfo->disallow_lpage += count;
0769         WARN_ON(linfo->disallow_lpage < 0);
0770     }
0771 }
0772 
0773 void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
0774 {
0775     update_gfn_disallow_lpage_count(slot, gfn, 1);
0776 }
0777 
0778 void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
0779 {
0780     update_gfn_disallow_lpage_count(slot, gfn, -1);
0781 }
0782 
0783 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
0784 {
0785     struct kvm_memslots *slots;
0786     struct kvm_memory_slot *slot;
0787     gfn_t gfn;
0788 
0789     kvm->arch.indirect_shadow_pages++;
0790     gfn = sp->gfn;
0791     slots = kvm_memslots_for_spte_role(kvm, sp->role);
0792     slot = __gfn_to_memslot(slots, gfn);
0793 
0794     /* Non-leaf shadow pages are kept read-only. */
0795     if (sp->role.level > PG_LEVEL_4K)
0796         return kvm_slot_page_track_add_page(kvm, slot, gfn,
0797                             KVM_PAGE_TRACK_WRITE);
0798 
0799     kvm_mmu_gfn_disallow_lpage(slot, gfn);
0800 
0801     if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
0802         kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
0803 }
0804 
0805 void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
0806 {
0807     if (sp->lpage_disallowed)
0808         return;
0809 
0810     ++kvm->stat.nx_lpage_splits;
0811     list_add_tail(&sp->lpage_disallowed_link,
0812               &kvm->arch.lpage_disallowed_mmu_pages);
0813     sp->lpage_disallowed = true;
0814 }
0815 
0816 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
0817 {
0818     struct kvm_memslots *slots;
0819     struct kvm_memory_slot *slot;
0820     gfn_t gfn;
0821 
0822     kvm->arch.indirect_shadow_pages--;
0823     gfn = sp->gfn;
0824     slots = kvm_memslots_for_spte_role(kvm, sp->role);
0825     slot = __gfn_to_memslot(slots, gfn);
0826     if (sp->role.level > PG_LEVEL_4K)
0827         return kvm_slot_page_track_remove_page(kvm, slot, gfn,
0828                                KVM_PAGE_TRACK_WRITE);
0829 
0830     kvm_mmu_gfn_allow_lpage(slot, gfn);
0831 }
0832 
0833 void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
0834 {
0835     --kvm->stat.nx_lpage_splits;
0836     sp->lpage_disallowed = false;
0837     list_del(&sp->lpage_disallowed_link);
0838 }
0839 
0840 static struct kvm_memory_slot *
0841 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
0842                 bool no_dirty_log)
0843 {
0844     struct kvm_memory_slot *slot;
0845 
0846     slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
0847     if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
0848         return NULL;
0849     if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
0850         return NULL;
0851 
0852     return slot;
0853 }
0854 
0855 /*
0856  * About rmap_head encoding:
0857  *
0858  * If bit zero of rmap_head->val is clear, then it points to the only spte
0859  * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
0860  * pte_list_desc containing more mappings.
0861  */
0862 
0863 /*
0864  * Returns the number of pointers in the rmap chain, not counting the new one.
0865  */
0866 static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
0867             struct kvm_rmap_head *rmap_head)
0868 {
0869     struct pte_list_desc *desc;
0870     int count = 0;
0871 
0872     if (!rmap_head->val) {
0873         rmap_printk("%p %llx 0->1\n", spte, *spte);
0874         rmap_head->val = (unsigned long)spte;
0875     } else if (!(rmap_head->val & 1)) {
0876         rmap_printk("%p %llx 1->many\n", spte, *spte);
0877         desc = kvm_mmu_memory_cache_alloc(cache);
0878         desc->sptes[0] = (u64 *)rmap_head->val;
0879         desc->sptes[1] = spte;
0880         desc->spte_count = 2;
0881         rmap_head->val = (unsigned long)desc | 1;
0882         ++count;
0883     } else {
0884         rmap_printk("%p %llx many->many\n", spte, *spte);
0885         desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
0886         while (desc->spte_count == PTE_LIST_EXT) {
0887             count += PTE_LIST_EXT;
0888             if (!desc->more) {
0889                 desc->more = kvm_mmu_memory_cache_alloc(cache);
0890                 desc = desc->more;
0891                 desc->spte_count = 0;
0892                 break;
0893             }
0894             desc = desc->more;
0895         }
0896         count += desc->spte_count;
0897         desc->sptes[desc->spte_count++] = spte;
0898     }
0899     return count;
0900 }
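/*
 * The rmap_head encoding used by pte_list_add() above, modeled standalone:
 * bit zero of the word distinguishes "points directly at the single spte"
 * (possible because sptes and descriptors are at least 8-byte aligned) from
 * "points at a descriptor chain".  All names here are illustrative.
 */
#if 0 /* standalone userspace sketch, not part of this file's build */
#include <stdint.h>
#include <stdio.h>

struct sketch_desc {			/* stand-in for struct pte_list_desc */
	struct sketch_desc *more;
	uint64_t spte_count;
	uint64_t *sptes[14];
};

struct sketch_rmap_head { unsigned long val; };

static int sketch_is_many(const struct sketch_rmap_head *head)
{
	return head->val & 1;		/* tag bit set => descriptor chain */
}

static uint64_t *sketch_single_spte(const struct sketch_rmap_head *head)
{
	return (uint64_t *)head->val;	/* valid only when the tag bit is clear */
}

static struct sketch_desc *sketch_desc_of(const struct sketch_rmap_head *head)
{
	return (struct sketch_desc *)(head->val & ~1ul);	/* strip the tag */
}

int main(void)
{
	static uint64_t spte;
	static struct sketch_desc desc;
	struct sketch_rmap_head head = { .val = (unsigned long)&spte };

	printf("many=%d spte=%p\n", sketch_is_many(&head),
	       (void *)sketch_single_spte(&head));

	head.val = (unsigned long)&desc | 1;	/* second mapping: switch to a chain */
	printf("many=%d desc=%p\n", sketch_is_many(&head),
	       (void *)sketch_desc_of(&head));
	return 0;
}
#endif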
0901 
0902 static void
0903 pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
0904                struct pte_list_desc *desc, int i,
0905                struct pte_list_desc *prev_desc)
0906 {
0907     int j = desc->spte_count - 1;
0908 
0909     desc->sptes[i] = desc->sptes[j];
0910     desc->sptes[j] = NULL;
0911     desc->spte_count--;
0912     if (desc->spte_count)
0913         return;
0914     if (!prev_desc && !desc->more)
0915         rmap_head->val = 0;
0916     else
0917         if (prev_desc)
0918             prev_desc->more = desc->more;
0919         else
0920             rmap_head->val = (unsigned long)desc->more | 1;
0921     mmu_free_pte_list_desc(desc);
0922 }
0923 
0924 static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
0925 {
0926     struct pte_list_desc *desc;
0927     struct pte_list_desc *prev_desc;
0928     int i;
0929 
0930     if (!rmap_head->val) {
0931         pr_err("%s: %p 0->BUG\n", __func__, spte);
0932         BUG();
0933     } else if (!(rmap_head->val & 1)) {
0934         rmap_printk("%p 1->0\n", spte);
0935         if ((u64 *)rmap_head->val != spte) {
0936             pr_err("%s:  %p 1->BUG\n", __func__, spte);
0937             BUG();
0938         }
0939         rmap_head->val = 0;
0940     } else {
0941         rmap_printk("%p many->many\n", spte);
0942         desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
0943         prev_desc = NULL;
0944         while (desc) {
0945             for (i = 0; i < desc->spte_count; ++i) {
0946                 if (desc->sptes[i] == spte) {
0947                     pte_list_desc_remove_entry(rmap_head,
0948                             desc, i, prev_desc);
0949                     return;
0950                 }
0951             }
0952             prev_desc = desc;
0953             desc = desc->more;
0954         }
0955         pr_err("%s: %p many->many\n", __func__, spte);
0956         BUG();
0957     }
0958 }
0959 
0960 static void kvm_zap_one_rmap_spte(struct kvm *kvm,
0961                   struct kvm_rmap_head *rmap_head, u64 *sptep)
0962 {
0963     mmu_spte_clear_track_bits(kvm, sptep);
0964     pte_list_remove(sptep, rmap_head);
0965 }
0966 
0967 /* Return true if at least one SPTE was zapped, false otherwise */
0968 static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
0969                    struct kvm_rmap_head *rmap_head)
0970 {
0971     struct pte_list_desc *desc, *next;
0972     int i;
0973 
0974     if (!rmap_head->val)
0975         return false;
0976 
0977     if (!(rmap_head->val & 1)) {
0978         mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
0979         goto out;
0980     }
0981 
0982     desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
0983 
0984     for (; desc; desc = next) {
0985         for (i = 0; i < desc->spte_count; i++)
0986             mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
0987         next = desc->more;
0988         mmu_free_pte_list_desc(desc);
0989     }
0990 out:
0991     /* rmap_head is meaningless now, remember to reset it */
0992     rmap_head->val = 0;
0993     return true;
0994 }
0995 
0996 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
0997 {
0998     struct pte_list_desc *desc;
0999     unsigned int count = 0;
1000 
1001     if (!rmap_head->val)
1002         return 0;
1003     else if (!(rmap_head->val & 1))
1004         return 1;
1005 
1006     desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1007 
1008     while (desc) {
1009         count += desc->spte_count;
1010         desc = desc->more;
1011     }
1012 
1013     return count;
1014 }
1015 
1016 static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
1017                      const struct kvm_memory_slot *slot)
1018 {
1019     unsigned long idx;
1020 
1021     idx = gfn_to_index(gfn, slot->base_gfn, level);
1022     return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
1023 }
1024 
1025 static bool rmap_can_add(struct kvm_vcpu *vcpu)
1026 {
1027     struct kvm_mmu_memory_cache *mc;
1028 
1029     mc = &vcpu->arch.mmu_pte_list_desc_cache;
1030     return kvm_mmu_memory_cache_nr_free_objects(mc);
1031 }
1032 
1033 static void rmap_remove(struct kvm *kvm, u64 *spte)
1034 {
1035     struct kvm_memslots *slots;
1036     struct kvm_memory_slot *slot;
1037     struct kvm_mmu_page *sp;
1038     gfn_t gfn;
1039     struct kvm_rmap_head *rmap_head;
1040 
1041     sp = sptep_to_sp(spte);
1042     gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte));
1043 
1044     /*
1045      * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
1046      * so we have to determine which memslots to use based on context
1047      * information in sp->role.
1048      */
1049     slots = kvm_memslots_for_spte_role(kvm, sp->role);
1050 
1051     slot = __gfn_to_memslot(slots, gfn);
1052     rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1053 
1054     pte_list_remove(spte, rmap_head);
1055 }
1056 
1057 /*
1058  * Used by the following functions to iterate through the sptes linked by a
1059  * rmap.  All fields are private and not assumed to be used outside.
1060  */
1061 struct rmap_iterator {
1062     /* private fields */
1063     struct pte_list_desc *desc; /* holds the sptep if not NULL */
1064     int pos;            /* index of the sptep */
1065 };
1066 
1067 /*
1068  * Iteration must be started by this function.  This should also be used after
1069  * removing/dropping sptes from the rmap link because in such cases the
1070  * information in the iterator may not be valid.
1071  *
1072  * Returns sptep if found, NULL otherwise.
1073  */
1074 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1075                struct rmap_iterator *iter)
1076 {
1077     u64 *sptep;
1078 
1079     if (!rmap_head->val)
1080         return NULL;
1081 
1082     if (!(rmap_head->val & 1)) {
1083         iter->desc = NULL;
1084         sptep = (u64 *)rmap_head->val;
1085         goto out;
1086     }
1087 
1088     iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1089     iter->pos = 0;
1090     sptep = iter->desc->sptes[iter->pos];
1091 out:
1092     BUG_ON(!is_shadow_present_pte(*sptep));
1093     return sptep;
1094 }
1095 
1096 /*
1097  * Must be used with a valid iterator: e.g. after rmap_get_first().
1098  *
1099  * Returns sptep if found, NULL otherwise.
1100  */
1101 static u64 *rmap_get_next(struct rmap_iterator *iter)
1102 {
1103     u64 *sptep;
1104 
1105     if (iter->desc) {
1106         if (iter->pos < PTE_LIST_EXT - 1) {
1107             ++iter->pos;
1108             sptep = iter->desc->sptes[iter->pos];
1109             if (sptep)
1110                 goto out;
1111         }
1112 
1113         iter->desc = iter->desc->more;
1114 
1115         if (iter->desc) {
1116             iter->pos = 0;
1117             /* desc->sptes[0] cannot be NULL */
1118             sptep = iter->desc->sptes[iter->pos];
1119             goto out;
1120         }
1121     }
1122 
1123     return NULL;
1124 out:
1125     BUG_ON(!is_shadow_present_pte(*sptep));
1126     return sptep;
1127 }
1128 
1129 #define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)         \
1130     for (_spte_ = rmap_get_first(_rmap_head_, _iter_);      \
1131          _spte_; _spte_ = rmap_get_next(_iter_))
1132 
1133 static void drop_spte(struct kvm *kvm, u64 *sptep)
1134 {
1135     u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
1136 
1137     if (is_shadow_present_pte(old_spte))
1138         rmap_remove(kvm, sptep);
1139 }
1140 
1141 static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
1142 {
1143     struct kvm_mmu_page *sp;
1144 
1145     sp = sptep_to_sp(sptep);
1146     WARN_ON(sp->role.level == PG_LEVEL_4K);
1147 
1148     drop_spte(kvm, sptep);
1149 
1150     if (flush)
1151         kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
1152             KVM_PAGES_PER_HPAGE(sp->role.level));
1153 }
1154 
1155 /*
1156  * Write-protect the specified @sptep.  @pt_protect indicates whether the
1157  * spte is being write-protected in order to protect a shadow page table.
1158  *
1159  * Note: write protection differs between dirty logging and spte
1160  * protection:
1161  * - for dirty logging, the spte can be made writable again at any time as
1162  *   long as its dirty bitmap is properly set.
1163  * - for spte protection, the spte can become writable only after the
1164  *   shadow page is unsynced.
1165  *
1166  * Return true if the TLB needs to be flushed.
1167  */
1168 static bool spte_write_protect(u64 *sptep, bool pt_protect)
1169 {
1170     u64 spte = *sptep;
1171 
1172     if (!is_writable_pte(spte) &&
1173         !(pt_protect && is_mmu_writable_spte(spte)))
1174         return false;
1175 
1176     rmap_printk("spte %p %llx\n", sptep, *sptep);
1177 
1178     if (pt_protect)
1179         spte &= ~shadow_mmu_writable_mask;
1180     spte = spte & ~PT_WRITABLE_MASK;
1181 
1182     return mmu_spte_update(sptep, spte);
1183 }
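/*
 * The two protection flavours distinguished by @pt_protect above, restated
 * over placeholder bits (not the real SPTE layout): dirty logging clears only
 * the hardware-writable bit, while shadow-page protection also clears the
 * software MMU-writable bit so the spte cannot be lazily made writable again.
 */
#if 0 /* standalone userspace sketch, not part of this file's build */
#include <stdbool.h>
#include <stdint.h>

#define SK_W		(1ull << 1)	/* hardware W bit (placeholder) */
#define SK_MMU_W	(1ull << 2)	/* "may become writable" (placeholder) */

static uint64_t sketch_write_protect(uint64_t spte, bool pt_protect)
{
	if (pt_protect)
		spte &= ~SK_MMU_W;	/* shadow-page protection: no lazy re-enable */

	return spte & ~SK_W;		/* both flavours strip the hardware W bit */
}

int main(void)
{
	return sketch_write_protect(SK_W | SK_MMU_W, true) != 0;
}
#endif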
1184 
1185 static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
1186                    bool pt_protect)
1187 {
1188     u64 *sptep;
1189     struct rmap_iterator iter;
1190     bool flush = false;
1191 
1192     for_each_rmap_spte(rmap_head, &iter, sptep)
1193         flush |= spte_write_protect(sptep, pt_protect);
1194 
1195     return flush;
1196 }
1197 
1198 static bool spte_clear_dirty(u64 *sptep)
1199 {
1200     u64 spte = *sptep;
1201 
1202     rmap_printk("spte %p %llx\n", sptep, *sptep);
1203 
1204     MMU_WARN_ON(!spte_ad_enabled(spte));
1205     spte &= ~shadow_dirty_mask;
1206     return mmu_spte_update(sptep, spte);
1207 }
1208 
1209 static bool spte_wrprot_for_clear_dirty(u64 *sptep)
1210 {
1211     bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1212                            (unsigned long *)sptep);
1213     if (was_writable && !spte_ad_enabled(*sptep))
1214         kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1215 
1216     return was_writable;
1217 }
1218 
1219 /*
1220  * Gets the GFN ready for another round of dirty logging by clearing the
1221  *  - D bit on ad-enabled SPTEs, and
1222  *  - W bit on ad-disabled SPTEs.
1223  * Returns true iff any D or W bits were cleared.
1224  */
1225 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1226                    const struct kvm_memory_slot *slot)
1227 {
1228     u64 *sptep;
1229     struct rmap_iterator iter;
1230     bool flush = false;
1231 
1232     for_each_rmap_spte(rmap_head, &iter, sptep)
1233         if (spte_ad_need_write_protect(*sptep))
1234             flush |= spte_wrprot_for_clear_dirty(sptep);
1235         else
1236             flush |= spte_clear_dirty(sptep);
1237 
1238     return flush;
1239 }
1240 
1241 /**
1242  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1243  * @kvm: kvm instance
1244  * @slot: slot to protect
1245  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1246  * @mask: indicates which pages we should protect
1247  *
1248  * Used when we do not need to care about huge page mappings.
1249  */
1250 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1251                      struct kvm_memory_slot *slot,
1252                      gfn_t gfn_offset, unsigned long mask)
1253 {
1254     struct kvm_rmap_head *rmap_head;
1255 
1256     if (is_tdp_mmu_enabled(kvm))
1257         kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1258                 slot->base_gfn + gfn_offset, mask, true);
1259 
1260     if (!kvm_memslots_have_rmaps(kvm))
1261         return;
1262 
1263     while (mask) {
1264         rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1265                     PG_LEVEL_4K, slot);
1266         rmap_write_protect(rmap_head, false);
1267 
1268         /* clear the first set bit */
1269         mask &= mask - 1;
1270     }
1271 }
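/*
 * The mask walk used above: each set bit in @mask names one 4K page relative
 * to @gfn_offset; the lowest set bit is found (__ffs in the kernel,
 * __builtin_ctzll in this sketch) and then cleared with mask &= mask - 1.
 * The numbers below are made up for illustration.
 */
#if 0 /* standalone userspace sketch, not part of this file's build */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t base_gfn = 0x1000, gfn_offset = 64;
	unsigned long long mask = 0x29;	/* bits 0, 3 and 5 are "dirty" */

	while (mask) {
		uint64_t gfn = base_gfn + gfn_offset + __builtin_ctzll(mask);

		printf("handle gfn 0x%llx\n", (unsigned long long)gfn);
		mask &= mask - 1;	/* clear the bit just handled */
	}
	return 0;
}
#endif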
1272 
1273 /**
1274  * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1275  * protect the page if the D-bit isn't supported.
1276  * @kvm: kvm instance
1277  * @slot: slot to clear D-bit
1278  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1279  * @mask: indicates which pages we should clear D-bit
1280  *
1281  * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
1282  */
1283 static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1284                      struct kvm_memory_slot *slot,
1285                      gfn_t gfn_offset, unsigned long mask)
1286 {
1287     struct kvm_rmap_head *rmap_head;
1288 
1289     if (is_tdp_mmu_enabled(kvm))
1290         kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1291                 slot->base_gfn + gfn_offset, mask, false);
1292 
1293     if (!kvm_memslots_have_rmaps(kvm))
1294         return;
1295 
1296     while (mask) {
1297         rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1298                     PG_LEVEL_4K, slot);
1299         __rmap_clear_dirty(kvm, rmap_head, slot);
1300 
1301         /* clear the first set bit */
1302         mask &= mask - 1;
1303     }
1304 }
1305 
1306 /**
1307  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1308  * PT level pages.
1309  *
1310  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1311  * enable dirty logging for them.
1312  *
1313  * We need to care about huge page mappings: e.g. during dirty logging we may
1314  * have such mappings.
1315  */
1316 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1317                 struct kvm_memory_slot *slot,
1318                 gfn_t gfn_offset, unsigned long mask)
1319 {
1320     /*
1321      * Huge pages are NOT write protected when we start dirty logging in
1322      * initially-all-set mode; must write protect them here so that they
1323      * are split to 4K on the first write.
1324      *
1325      * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
1326      * of memslot has no such restriction, so the range can cross two large
1327      * pages.
1328      */
1329     if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
1330         gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
1331         gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
1332 
1333         if (READ_ONCE(eager_page_split))
1334             kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
1335 
1336         kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
1337 
1338         /* Cross two large pages? */
1339         if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
1340             ALIGN(end << PAGE_SHIFT, PMD_SIZE))
1341             kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
1342                                PG_LEVEL_2M);
1343     }
1344 
1345     /* Now handle 4K PTEs.  */
1346     if (kvm_x86_ops.cpu_dirty_log_size)
1347         kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
1348     else
1349         kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1350 }
1351 
1352 int kvm_cpu_dirty_log_size(void)
1353 {
1354     return kvm_x86_ops.cpu_dirty_log_size;
1355 }
1356 
1357 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1358                     struct kvm_memory_slot *slot, u64 gfn,
1359                     int min_level)
1360 {
1361     struct kvm_rmap_head *rmap_head;
1362     int i;
1363     bool write_protected = false;
1364 
1365     if (kvm_memslots_have_rmaps(kvm)) {
1366         for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
1367             rmap_head = gfn_to_rmap(gfn, i, slot);
1368             write_protected |= rmap_write_protect(rmap_head, true);
1369         }
1370     }
1371 
1372     if (is_tdp_mmu_enabled(kvm))
1373         write_protected |=
1374             kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
1375 
1376     return write_protected;
1377 }
1378 
1379 static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
1380 {
1381     struct kvm_memory_slot *slot;
1382 
1383     slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1384     return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
1385 }
1386 
1387 static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1388                const struct kvm_memory_slot *slot)
1389 {
1390     return kvm_zap_all_rmap_sptes(kvm, rmap_head);
1391 }
1392 
1393 static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1394              struct kvm_memory_slot *slot, gfn_t gfn, int level,
1395              pte_t unused)
1396 {
1397     return __kvm_zap_rmap(kvm, rmap_head, slot);
1398 }
1399 
1400 static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1401                  struct kvm_memory_slot *slot, gfn_t gfn, int level,
1402                  pte_t pte)
1403 {
1404     u64 *sptep;
1405     struct rmap_iterator iter;
1406     bool need_flush = false;
1407     u64 new_spte;
1408     kvm_pfn_t new_pfn;
1409 
1410     WARN_ON(pte_huge(pte));
1411     new_pfn = pte_pfn(pte);
1412 
1413 restart:
1414     for_each_rmap_spte(rmap_head, &iter, sptep) {
1415         rmap_printk("spte %p %llx gfn %llx (%d)\n",
1416                 sptep, *sptep, gfn, level);
1417 
1418         need_flush = true;
1419 
1420         if (pte_write(pte)) {
1421             kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
1422             goto restart;
1423         } else {
1424             new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1425                     *sptep, new_pfn);
1426 
1427             mmu_spte_clear_track_bits(kvm, sptep);
1428             mmu_spte_set(sptep, new_spte);
1429         }
1430     }
1431 
1432     if (need_flush && kvm_available_flush_tlb_with_range()) {
1433         kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1434         return false;
1435     }
1436 
1437     return need_flush;
1438 }
1439 
1440 struct slot_rmap_walk_iterator {
1441     /* input fields. */
1442     const struct kvm_memory_slot *slot;
1443     gfn_t start_gfn;
1444     gfn_t end_gfn;
1445     int start_level;
1446     int end_level;
1447 
1448     /* output fields. */
1449     gfn_t gfn;
1450     struct kvm_rmap_head *rmap;
1451     int level;
1452 
1453     /* private field. */
1454     struct kvm_rmap_head *end_rmap;
1455 };
1456 
1457 static void
1458 rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1459 {
1460     iterator->level = level;
1461     iterator->gfn = iterator->start_gfn;
1462     iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
1463     iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
1464 }
1465 
1466 static void
1467 slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1468             const struct kvm_memory_slot *slot, int start_level,
1469             int end_level, gfn_t start_gfn, gfn_t end_gfn)
1470 {
1471     iterator->slot = slot;
1472     iterator->start_level = start_level;
1473     iterator->end_level = end_level;
1474     iterator->start_gfn = start_gfn;
1475     iterator->end_gfn = end_gfn;
1476 
1477     rmap_walk_init_level(iterator, iterator->start_level);
1478 }
1479 
1480 static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1481 {
1482     return !!iterator->rmap;
1483 }
1484 
1485 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1486 {
1487     while (++iterator->rmap <= iterator->end_rmap) {
1488         iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1489 
1490         if (iterator->rmap->val)
1491             return;
1492     }
1493 
1494     if (++iterator->level > iterator->end_level) {
1495         iterator->rmap = NULL;
1496         return;
1497     }
1498 
1499     rmap_walk_init_level(iterator, iterator->level);
1500 }
1501 
1502 #define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,    \
1503        _start_gfn, _end_gfn, _iter_)                \
1504     for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,     \
1505                  _end_level_, _start_gfn, _end_gfn);    \
1506          slot_rmap_walk_okay(_iter_);               \
1507          slot_rmap_walk_next(_iter_))
1508 
1509 typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1510                    struct kvm_memory_slot *slot, gfn_t gfn,
1511                    int level, pte_t pte);
1512 
1513 static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
1514                          struct kvm_gfn_range *range,
1515                          rmap_handler_t handler)
1516 {
1517     struct slot_rmap_walk_iterator iterator;
1518     bool ret = false;
1519 
1520     for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
1521                  range->start, range->end - 1, &iterator)
1522         ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
1523                    iterator.level, range->pte);
1524 
1525     return ret;
1526 }
1527 
1528 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1529 {
1530     bool flush = false;
1531 
1532     if (kvm_memslots_have_rmaps(kvm))
1533         flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
1534 
1535     if (is_tdp_mmu_enabled(kvm))
1536         flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
1537 
1538     return flush;
1539 }
1540 
1541 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1542 {
1543     bool flush = false;
1544 
1545     if (kvm_memslots_have_rmaps(kvm))
1546         flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);
1547 
1548     if (is_tdp_mmu_enabled(kvm))
1549         flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
1550 
1551     return flush;
1552 }
1553 
1554 static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1555              struct kvm_memory_slot *slot, gfn_t gfn, int level,
1556              pte_t unused)
1557 {
1558     u64 *sptep;
1559     struct rmap_iterator iter;
1560     int young = 0;
1561 
1562     for_each_rmap_spte(rmap_head, &iter, sptep)
1563         young |= mmu_spte_age(sptep);
1564 
1565     return young;
1566 }
1567 
1568 static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1569                   struct kvm_memory_slot *slot, gfn_t gfn,
1570                   int level, pte_t unused)
1571 {
1572     u64 *sptep;
1573     struct rmap_iterator iter;
1574 
1575     for_each_rmap_spte(rmap_head, &iter, sptep)
1576         if (is_accessed_spte(*sptep))
1577             return true;
1578     return false;
1579 }
1580 
1581 #define RMAP_RECYCLE_THRESHOLD 1000
1582 
1583 static void __rmap_add(struct kvm *kvm,
1584                struct kvm_mmu_memory_cache *cache,
1585                const struct kvm_memory_slot *slot,
1586                u64 *spte, gfn_t gfn, unsigned int access)
1587 {
1588     struct kvm_mmu_page *sp;
1589     struct kvm_rmap_head *rmap_head;
1590     int rmap_count;
1591 
1592     sp = sptep_to_sp(spte);
1593     kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access);
1594     kvm_update_page_stats(kvm, sp->role.level, 1);
1595 
1596     rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1597     rmap_count = pte_list_add(cache, spte, rmap_head);
1598 
1599     if (rmap_count > kvm->stat.max_mmu_rmap_size)
1600         kvm->stat.max_mmu_rmap_size = rmap_count;
1601     if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
1602         kvm_zap_all_rmap_sptes(kvm, rmap_head);
1603         kvm_flush_remote_tlbs_with_address(
1604                 kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
1605     }
1606 }
1607 
1608 static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
1609              u64 *spte, gfn_t gfn, unsigned int access)
1610 {
1611     struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache;
1612 
1613     __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
1614 }
1615 
1616 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1617 {
1618     bool young = false;
1619 
1620     if (kvm_memslots_have_rmaps(kvm))
1621         young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);
1622 
1623     if (is_tdp_mmu_enabled(kvm))
1624         young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
1625 
1626     return young;
1627 }
1628 
1629 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1630 {
1631     bool young = false;
1632 
1633     if (kvm_memslots_have_rmaps(kvm))
1634         young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap);
1635 
1636     if (is_tdp_mmu_enabled(kvm))
1637         young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
1638 
1639     return young;
1640 }
1641 
1642 #ifdef MMU_DEBUG
1643 static int is_empty_shadow_page(u64 *spt)
1644 {
1645     u64 *pos;
1646     u64 *end;
1647 
1648     for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
1649         if (is_shadow_present_pte(*pos)) {
1650             printk(KERN_ERR "%s: %p %llx\n", __func__,
1651                    pos, *pos);
1652             return 0;
1653         }
1654     return 1;
1655 }
1656 #endif
1657 
1658 /*
1659  * This value is the sum of all of the kvm instances'
1660  * kvm->arch.n_used_mmu_pages values.  We need a global,
1661  * aggregate version in order to make the slab shrinker
1662  * faster.
1663  */
1664 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
1665 {
1666     kvm->arch.n_used_mmu_pages += nr;
1667     percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1668 }
1669 
1670 static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
1671 {
1672     MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
1673     hlist_del(&sp->hash_link);
1674     list_del(&sp->link);
1675     free_page((unsigned long)sp->spt);
1676     if (!sp->role.direct)
1677         free_page((unsigned long)sp->shadowed_translation);
1678     kmem_cache_free(mmu_page_header_cache, sp);
1679 }
1680 
1681 static unsigned kvm_page_table_hashfn(gfn_t gfn)
1682 {
1683     return hash_64(gfn, KVM_MMU_HASH_SHIFT);
1684 }
1685 
1686 static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
1687                     struct kvm_mmu_page *sp, u64 *parent_pte)
1688 {
1689     if (!parent_pte)
1690         return;
1691 
1692     pte_list_add(cache, parent_pte, &sp->parent_ptes);
1693 }
1694 
1695 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1696                        u64 *parent_pte)
1697 {
1698     pte_list_remove(parent_pte, &sp->parent_ptes);
1699 }
1700 
1701 static void drop_parent_pte(struct kvm_mmu_page *sp,
1702                 u64 *parent_pte)
1703 {
1704     mmu_page_remove_parent_pte(sp, parent_pte);
1705     mmu_spte_clear_no_track(parent_pte);
1706 }
1707 
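/*
 * mark_unsync() and kvm_mmu_mark_parents_unsync() are mutually recursive:
 * marking an spte records the child's index in the containing sp's
 * unsync_child_bitmap, and the first time an sp gains an unsync child the
 * marking is propagated further up via that sp's own parent_ptes.
 */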
1708 static void mark_unsync(u64 *spte);
1709 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1710 {
1711     u64 *sptep;
1712     struct rmap_iterator iter;
1713 
1714     for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
1715         mark_unsync(sptep);
1716     }
1717 }
1718 
1719 static void mark_unsync(u64 *spte)
1720 {
1721     struct kvm_mmu_page *sp;
1722 
1723     sp = sptep_to_sp(spte);
1724     if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap))
1725         return;
1726     if (sp->unsync_children++)
1727         return;
1728     kvm_mmu_mark_parents_unsync(sp);
1729 }
1730 
1731 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1732                    struct kvm_mmu_page *sp)
1733 {
1734     return -1;
1735 }
1736 
1737 #define KVM_PAGE_ARRAY_NR 16
1738 
1739 struct kvm_mmu_pages {
1740     struct mmu_page_and_offset {
1741         struct kvm_mmu_page *sp;
1742         unsigned int idx;
1743     } page[KVM_PAGE_ARRAY_NR];
1744     unsigned int nr;
1745 };
1746 
1747 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1748              int idx)
1749 {
1750     int i;
1751 
1752     if (sp->unsync)
1753         for (i = 0; i < pvec->nr; i++)
1754             if (pvec->page[i].sp == sp)
1755                 return 0;
1756 
1757     pvec->page[pvec->nr].sp = sp;
1758     pvec->page[pvec->nr].idx = idx;
1759     pvec->nr++;
1760     return (pvec->nr == KVM_PAGE_ARRAY_NR);
1761 }
1762 
1763 static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
1764 {
1765     --sp->unsync_children;
1766     WARN_ON((int)sp->unsync_children < 0);
1767     __clear_bit(idx, sp->unsync_child_bitmap);
1768 }
1769 
1770 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1771                struct kvm_mmu_pages *pvec)
1772 {
1773     int i, ret, nr_unsync_leaf = 0;
1774 
1775     for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
1776         struct kvm_mmu_page *child;
1777         u64 ent = sp->spt[i];
1778 
1779         if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
1780             clear_unsync_child_bit(sp, i);
1781             continue;
1782         }
1783 
1784         child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK);
1785 
1786         if (child->unsync_children) {
1787             if (mmu_pages_add(pvec, child, i))
1788                 return -ENOSPC;
1789 
1790             ret = __mmu_unsync_walk(child, pvec);
1791             if (!ret) {
1792                 clear_unsync_child_bit(sp, i);
1793                 continue;
1794             } else if (ret > 0) {
1795                 nr_unsync_leaf += ret;
1796             } else
1797                 return ret;
1798         } else if (child->unsync) {
1799             nr_unsync_leaf++;
1800             if (mmu_pages_add(pvec, child, i))
1801                 return -ENOSPC;
1802         } else
1803             clear_unsync_child_bit(sp, i);
1804     }
1805 
1806     return nr_unsync_leaf;
1807 }
1808 
1809 #define INVALID_INDEX (-1)
1810 
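/*
 * Collect the unsync descendants of @sp into @pvec, with @sp itself added
 * first as a sentinel (idx == INVALID_INDEX).  Returns 0 if @sp has no
 * unsync children, the number of unsync leaves found otherwise, or -ENOSPC
 * if @pvec (KVM_PAGE_ARRAY_NR entries) overflows.
 */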
1811 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1812                struct kvm_mmu_pages *pvec)
1813 {
1814     pvec->nr = 0;
1815     if (!sp->unsync_children)
1816         return 0;
1817 
1818     mmu_pages_add(pvec, sp, INVALID_INDEX);
1819     return __mmu_unsync_walk(sp, pvec);
1820 }
1821 
1822 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1823 {
1824     WARN_ON(!sp->unsync);
1825     trace_kvm_mmu_sync_page(sp);
1826     sp->unsync = 0;
1827     --kvm->stat.mmu_unsync;
1828 }
1829 
1830 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1831                      struct list_head *invalid_list);
1832 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1833                     struct list_head *invalid_list);
1834 
1835 static bool sp_has_gptes(struct kvm_mmu_page *sp)
1836 {
1837     if (sp->role.direct)
1838         return false;
1839 
1840     if (sp->role.passthrough)
1841         return false;
1842 
1843     return true;
1844 }
1845 
1846 #define for_each_valid_sp(_kvm, _sp, _list)             \
1847     hlist_for_each_entry(_sp, _list, hash_link)         \
1848         if (is_obsolete_sp((_kvm), (_sp))) {            \
1849         } else
1850 
1851 #define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn)       \
1852     for_each_valid_sp(_kvm, _sp,                    \
1853       &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
1854         if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
1855 
1856 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1857              struct list_head *invalid_list)
1858 {
1859     int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
1860 
1861     if (ret < 0)
1862         kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1863     return ret;
1864 }
1865 
1866 static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
1867                     struct list_head *invalid_list,
1868                     bool remote_flush)
1869 {
1870     if (!remote_flush && list_empty(invalid_list))
1871         return false;
1872 
1873     if (!list_empty(invalid_list))
1874         kvm_mmu_commit_zap_page(kvm, invalid_list);
1875     else
1876         kvm_flush_remote_tlbs(kvm);
1877     return true;
1878 }
1879 
1880 static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
1881 {
1882     if (sp->role.invalid)
1883         return true;
1884 
1885     /* TDP MMU pages do not use the MMU generation. */
1886     return !sp->tdp_mmu_page &&
1887            unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
1888 }
1889 
1890 struct mmu_page_path {
1891     struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
1892     unsigned int idx[PT64_ROOT_MAX_LEVEL];
1893 };
1894 
1895 #define for_each_sp(pvec, sp, parents, i)           \
1896         for (i = mmu_pages_first(&pvec, &parents);  \
1897             i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
1898             i = mmu_pages_next(&pvec, &parents, i))
1899 
1900 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1901               struct mmu_page_path *parents,
1902               int i)
1903 {
1904     int n;
1905 
1906     for (n = i+1; n < pvec->nr; n++) {
1907         struct kvm_mmu_page *sp = pvec->page[n].sp;
1908         unsigned idx = pvec->page[n].idx;
1909         int level = sp->role.level;
1910 
1911         parents->idx[level-1] = idx;
1912         if (level == PG_LEVEL_4K)
1913             break;
1914 
1915         parents->parent[level-2] = sp;
1916     }
1917 
1918     return n;
1919 }
1920 
1921 static int mmu_pages_first(struct kvm_mmu_pages *pvec,
1922                struct mmu_page_path *parents)
1923 {
1924     struct kvm_mmu_page *sp;
1925     int level;
1926 
1927     if (pvec->nr == 0)
1928         return 0;
1929 
1930     WARN_ON(pvec->page[0].idx != INVALID_INDEX);
1931 
1932     sp = pvec->page[0].sp;
1933     level = sp->role.level;
1934     WARN_ON(level == PG_LEVEL_4K);
1935 
1936     parents->parent[level-2] = sp;
1937 
1938     /* Also set up a sentinel.  Further entries in pvec are all
1939      * children of sp, so this element is never overwritten.
1940      */
1941     parents->parent[level-1] = NULL;
1942     return mmu_pages_next(pvec, parents, 0);
1943 }
1944 
1945 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1946 {
1947     struct kvm_mmu_page *sp;
1948     unsigned int level = 0;
1949 
1950     do {
1951         unsigned int idx = parents->idx[level];
1952         sp = parents->parent[level];
1953         if (!sp)
1954             return;
1955 
1956         WARN_ON(idx == INVALID_INDEX);
1957         clear_unsync_child_bit(sp, idx);
1958         level++;
1959     } while (!sp->unsync_children);
1960 }
1961 
1962 static int mmu_sync_children(struct kvm_vcpu *vcpu,
1963                  struct kvm_mmu_page *parent, bool can_yield)
1964 {
1965     int i;
1966     struct kvm_mmu_page *sp;
1967     struct mmu_page_path parents;
1968     struct kvm_mmu_pages pages;
1969     LIST_HEAD(invalid_list);
1970     bool flush = false;
1971 
1972     while (mmu_unsync_walk(parent, &pages)) {
1973         bool protected = false;
1974 
1975         for_each_sp(pages, sp, parents, i)
1976             protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
1977 
1978         if (protected) {
1979             kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
1980             flush = false;
1981         }
1982 
1983         for_each_sp(pages, sp, parents, i) {
1984             kvm_unlink_unsync_page(vcpu->kvm, sp);
1985             flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
1986             mmu_pages_clear_parents(&parents);
1987         }
1988         if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
1989             kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
1990             if (!can_yield) {
1991                 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1992                 return -EINTR;
1993             }
1994 
1995             cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
1996             flush = false;
1997         }
1998     }
1999 
2000     kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
2001     return 0;
2002 }
2003 
2004 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2005 {
2006     atomic_set(&sp->write_flooding_count, 0);
2007 }
2008 
2009 static void clear_sp_write_flooding_count(u64 *spte)
2010 {
2011     __clear_sp_write_flooding_count(sptep_to_sp(spte));
2012 }
2013 
2014 /*
2015  * The vCPU is required when finding indirect shadow pages; the shadow
2016  * page may already exist and syncing it needs the vCPU pointer in
2017  * order to read guest page tables.  Direct shadow pages are never
2018  * unsync, thus @vcpu can be NULL if @role.direct is true.
2019  */
2020 static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
2021                              struct kvm_vcpu *vcpu,
2022                              gfn_t gfn,
2023                              struct hlist_head *sp_list,
2024                              union kvm_mmu_page_role role)
2025 {
2026     struct kvm_mmu_page *sp;
2027     int ret;
2028     int collisions = 0;
2029     LIST_HEAD(invalid_list);
2030 
2031     for_each_valid_sp(kvm, sp, sp_list) {
2032         if (sp->gfn != gfn) {
2033             collisions++;
2034             continue;
2035         }
2036 
2037         if (sp->role.word != role.word) {
2038             /*
2039              * If the guest is creating an upper-level page, zap
2040              * unsync pages for the same gfn.  While it's possible
2041              * the guest is using recursive page tables, in all
2042              * likelihood the guest has stopped using the unsync
2043              * page and is installing a completely unrelated page.
2044              * Unsync pages must not be left as is, because the new
2045              * upper-level page will be write-protected.
2046              */
2047             if (role.level > PG_LEVEL_4K && sp->unsync)
2048                 kvm_mmu_prepare_zap_page(kvm, sp,
2049                              &invalid_list);
2050             continue;
2051         }
2052 
2053         /* unsync and write-flooding only apply to indirect SPs. */
2054         if (sp->role.direct)
2055             goto out;
2056 
2057         if (sp->unsync) {
2058             if (KVM_BUG_ON(!vcpu, kvm))
2059                 break;
2060 
2061             /*
2062              * The page is good, but is stale.  kvm_sync_page does
2063              * get the latest guest state, but (unlike mmu_unsync_children)
2064              * it doesn't write-protect the page or mark it synchronized!
2065              * This way the validity of the mapping is ensured, but the
2066              * overhead of write protection is not incurred until the
2067              * guest invalidates the TLB mapping.  This allows multiple
2068              * SPs for a single gfn to be unsync.
2069              *
2070              * If the sync fails, the page is zapped.  If so, break
2071              * in order to rebuild it.
2072              */
2073             ret = kvm_sync_page(vcpu, sp, &invalid_list);
2074             if (ret < 0)
2075                 break;
2076 
2077             WARN_ON(!list_empty(&invalid_list));
2078             if (ret > 0)
2079                 kvm_flush_remote_tlbs(kvm);
2080         }
2081 
2082         __clear_sp_write_flooding_count(sp);
2083 
2084         goto out;
2085     }
2086 
2087     sp = NULL;
2088     ++kvm->stat.mmu_cache_miss;
2089 
2090 out:
2091     kvm_mmu_commit_zap_page(kvm, &invalid_list);
2092 
2093     if (collisions > kvm->stat.max_mmu_page_hash_collisions)
2094         kvm->stat.max_mmu_page_hash_collisions = collisions;
2095     return sp;
2096 }
2097 
2098 /* Caches used when allocating a new shadow page. */
2099 struct shadow_page_caches {
2100     struct kvm_mmu_memory_cache *page_header_cache;
2101     struct kvm_mmu_memory_cache *shadow_page_cache;
2102     struct kvm_mmu_memory_cache *shadowed_info_cache;
2103 };
2104 
2105 static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
2106                               struct shadow_page_caches *caches,
2107                               gfn_t gfn,
2108                               struct hlist_head *sp_list,
2109                               union kvm_mmu_page_role role)
2110 {
2111     struct kvm_mmu_page *sp;
2112 
2113     sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
2114     sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
2115     if (!role.direct)
2116         sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
2117 
2118     set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2119 
2120     /*
2121      * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
2122      * depends on valid pages being added to the head of the list.  See
2123      * comments in kvm_zap_obsolete_pages().
2124      */
2125     sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
2126     list_add(&sp->link, &kvm->arch.active_mmu_pages);
2127     kvm_mod_used_mmu_pages(kvm, +1);
2128 
2129     sp->gfn = gfn;
2130     sp->role = role;
2131     hlist_add_head(&sp->hash_link, sp_list);
2132     if (sp_has_gptes(sp))
2133         account_shadowed(kvm, sp);
2134 
2135     return sp;
2136 }
2137 
2138 /* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */
2139 static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
2140                               struct kvm_vcpu *vcpu,
2141                               struct shadow_page_caches *caches,
2142                               gfn_t gfn,
2143                               union kvm_mmu_page_role role)
2144 {
2145     struct hlist_head *sp_list;
2146     struct kvm_mmu_page *sp;
2147     bool created = false;
2148 
2149     sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
2150 
2151     sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
2152     if (!sp) {
2153         created = true;
2154         sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role);
2155     }
2156 
2157     trace_kvm_mmu_get_page(sp, created);
2158     return sp;
2159 }
2160 
2161 static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
2162                             gfn_t gfn,
2163                             union kvm_mmu_page_role role)
2164 {
2165     struct shadow_page_caches caches = {
2166         .page_header_cache = &vcpu->arch.mmu_page_header_cache,
2167         .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
2168         .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
2169     };
2170 
2171     return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
2172 }
2173 
2174 static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct,
2175                           unsigned int access)
2176 {
2177     struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
2178     union kvm_mmu_page_role role;
2179 
2180     role = parent_sp->role;
2181     role.level--;
2182     role.access = access;
2183     role.direct = direct;
2184     role.passthrough = 0;
2185 
2186     /*
2187      * If the guest has 4-byte PTEs then that means it's using 32-bit,
2188      * 2-level, non-PAE paging. KVM shadows such guests with PAE paging
2189      * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must
2190      * shadow each guest page table with multiple shadow page tables, which
2191      * requires extra bookkeeping in the role.
2192      *
2193      * Specifically, to shadow the guest's page directory (which covers a
2194      * 4GiB address space), KVM uses 4 PAE page directories, each mapping
2195      * 1GiB of the address space. @role.quadrant encodes which quarter of
2196      * the address space each maps.
2197      *
2198      * To shadow the guest's page tables (which each map a 4MiB region), KVM
2199      * uses 2 PAE page tables, each mapping a 2MiB region. For these,
2200      * @role.quadrant encodes which half of the region they map.
2201      *
2202      * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE
2203      * consumes bits 29:21.  To consume bits 31:30, KVM uses 4 shadow
2204      * PDPTEs; those 4 PAE page directories are pre-allocated and their
2205      * quadrant is assigned in mmu_alloc_root().   A 4-byte PTE consumes
2206      * bits 21:12, while an 8-byte PTE consumes bits 20:12.  To consume
2207      * bit 21 in the PTE (the child here), KVM propagates that bit to the
2208      * quadrant, i.e. sets quadrant to '0' or '1'.  The parent 8-byte PDE
2209      * covers bit 21 (see above), thus the quadrant is calculated from the
2210      * _least_ significant bit of the PDE index.
2211      */
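    /*
     * For example, an 8-byte PDE at index 5 of its PAE page directory
     * covers guest address bits 29:21 with bit 21 set (the index is odd),
     * so the child page table shadows the upper 2MiB half of the guest's
     * 4MiB region and gets quadrant = 1.
     */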
2212     if (role.has_4_byte_gpte) {
2213         WARN_ON_ONCE(role.level != PG_LEVEL_4K);
2214         role.quadrant = spte_index(sptep) & 1;
2215     }
2216 
2217     return role;
2218 }
2219 
2220 static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu,
2221                          u64 *sptep, gfn_t gfn,
2222                          bool direct, unsigned int access)
2223 {
2224     union kvm_mmu_page_role role;
2225 
2226     if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
2227         return ERR_PTR(-EEXIST);
2228 
2229     role = kvm_mmu_child_role(sptep, direct, access);
2230     return kvm_mmu_get_shadow_page(vcpu, gfn, role);
2231 }
2232 
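/*
 * Iterator over the shadow page tables for a given address, used by
 * for_each_shadow_entry() and friends: shadow_walk_init() seeds the walk at
 * the current root, shadow_walk_okay() computes the sptep for the current
 * level, and shadow_walk_next() descends until a non-present or last-level
 * spte terminates the walk.
 */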
2233 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2234                     struct kvm_vcpu *vcpu, hpa_t root,
2235                     u64 addr)
2236 {
2237     iterator->addr = addr;
2238     iterator->shadow_addr = root;
2239     iterator->level = vcpu->arch.mmu->root_role.level;
2240 
2241     if (iterator->level >= PT64_ROOT_4LEVEL &&
2242         vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
2243         !vcpu->arch.mmu->root_role.direct)
2244         iterator->level = PT32E_ROOT_LEVEL;
2245 
2246     if (iterator->level == PT32E_ROOT_LEVEL) {
2247         /*
2248          * prev_root is currently only used for 64-bit hosts. So only
2249          * the active root_hpa is valid here.
2250          */
2251         BUG_ON(root != vcpu->arch.mmu->root.hpa);
2252 
2253         iterator->shadow_addr
2254             = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2255         iterator->shadow_addr &= SPTE_BASE_ADDR_MASK;
2256         --iterator->level;
2257         if (!iterator->shadow_addr)
2258             iterator->level = 0;
2259     }
2260 }
2261 
2262 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2263                  struct kvm_vcpu *vcpu, u64 addr)
2264 {
2265     shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
2266                     addr);
2267 }
2268 
2269 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2270 {
2271     if (iterator->level < PG_LEVEL_4K)
2272         return false;
2273 
2274     iterator->index = SPTE_INDEX(iterator->addr, iterator->level);
2275     iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2276     return true;
2277 }
2278 
2279 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2280                    u64 spte)
2281 {
2282     if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
2283         iterator->level = 0;
2284         return;
2285     }
2286 
2287     iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK;
2288     --iterator->level;
2289 }
2290 
2291 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2292 {
2293     __shadow_walk_next(iterator, *iterator->sptep);
2294 }
2295 
2296 static void __link_shadow_page(struct kvm *kvm,
2297                    struct kvm_mmu_memory_cache *cache, u64 *sptep,
2298                    struct kvm_mmu_page *sp, bool flush)
2299 {
2300     u64 spte;
2301 
2302     BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2303 
2304     /*
2305      * If an SPTE is present already, it must be a leaf and therefore
2306      * a large one.  Drop it, and flush the TLB if needed, before
2307      * installing sp.
2308      */
2309     if (is_shadow_present_pte(*sptep))
2310         drop_large_spte(kvm, sptep, flush);
2311 
2312     spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
2313 
2314     mmu_spte_set(sptep, spte);
2315 
2316     mmu_page_add_parent_pte(cache, sp, sptep);
2317 
2318     if (sp->unsync_children || sp->unsync)
2319         mark_unsync(sptep);
2320 }
2321 
2322 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2323                  struct kvm_mmu_page *sp)
2324 {
2325     __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true);
2326 }
2327 
2328 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2329                    unsigned direct_access)
2330 {
2331     if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2332         struct kvm_mmu_page *child;
2333 
2334         /*
2335          * For the direct sp, if the guest pte's dirty bit
2336          * changed from clean to dirty, it would corrupt the
2337          * sp's access: writable would be allowed in the
2338          * read-only sp, so update the spte at this point to get
2339          * a new sp with the correct access.
2340          */
2341         child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK);
2342         if (child->role.access == direct_access)
2343             return;
2344 
2345         drop_parent_pte(child, sptep);
2346         kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2347     }
2348 }
2349 
2350 /* Returns the number of zapped non-leaf child shadow pages. */
2351 static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2352                 u64 *spte, struct list_head *invalid_list)
2353 {
2354     u64 pte;
2355     struct kvm_mmu_page *child;
2356 
2357     pte = *spte;
2358     if (is_shadow_present_pte(pte)) {
2359         if (is_last_spte(pte, sp->role.level)) {
2360             drop_spte(kvm, spte);
2361         } else {
2362             child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
2363             drop_parent_pte(child, spte);
2364 
2365             /*
2366              * Recursively zap nested TDP SPs; parentless SPs are
2367              * unlikely to be used again in the near future.  This
2368              * avoids retaining a large number of stale nested SPs.
2369              */
2370             if (tdp_enabled && invalid_list &&
2371                 child->role.guest_mode && !child->parent_ptes.val)
2372                 return kvm_mmu_prepare_zap_page(kvm, child,
2373                                 invalid_list);
2374         }
2375     } else if (is_mmio_spte(pte)) {
2376         mmu_spte_clear_no_track(spte);
2377     }
2378     return 0;
2379 }
2380 
2381 static int kvm_mmu_page_unlink_children(struct kvm *kvm,
2382                     struct kvm_mmu_page *sp,
2383                     struct list_head *invalid_list)
2384 {
2385     int zapped = 0;
2386     unsigned i;
2387 
2388     for (i = 0; i < SPTE_ENT_PER_PAGE; ++i)
2389         zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
2390 
2391     return zapped;
2392 }
2393 
2394 static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
2395 {
2396     u64 *sptep;
2397     struct rmap_iterator iter;
2398 
2399     while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2400         drop_parent_pte(sp, sptep);
2401 }
2402 
2403 static int mmu_zap_unsync_children(struct kvm *kvm,
2404                    struct kvm_mmu_page *parent,
2405                    struct list_head *invalid_list)
2406 {
2407     int i, zapped = 0;
2408     struct mmu_page_path parents;
2409     struct kvm_mmu_pages pages;
2410 
2411     if (parent->role.level == PG_LEVEL_4K)
2412         return 0;
2413 
2414     while (mmu_unsync_walk(parent, &pages)) {
2415         struct kvm_mmu_page *sp;
2416 
2417         for_each_sp(pages, sp, parents, i) {
2418             kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2419             mmu_pages_clear_parents(&parents);
2420             zapped++;
2421         }
2422     }
2423 
2424     return zapped;
2425 }
2426 
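/*
 * Zapping a shadow page is split into two phases: kvm_mmu_prepare_zap_page()
 * unlinks the page and, unless it is still in use as a root, moves it to
 * @invalid_list; kvm_mmu_commit_zap_page() then issues a single remote TLB
 * flush before freeing everything on the list, batching the flushes for
 * multiple zapped pages.
 */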
2427 static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2428                        struct kvm_mmu_page *sp,
2429                        struct list_head *invalid_list,
2430                        int *nr_zapped)
2431 {
2432     bool list_unstable, zapped_root = false;
2433 
2434     trace_kvm_mmu_prepare_zap_page(sp);
2435     ++kvm->stat.mmu_shadow_zapped;
2436     *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2437     *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
2438     kvm_mmu_unlink_parents(sp);
2439 
2440     /* Zapping children means active_mmu_pages has become unstable. */
2441     list_unstable = *nr_zapped;
2442 
2443     if (!sp->role.invalid && sp_has_gptes(sp))
2444         unaccount_shadowed(kvm, sp);
2445 
2446     if (sp->unsync)
2447         kvm_unlink_unsync_page(kvm, sp);
2448     if (!sp->root_count) {
2449         /* Count self */
2450         (*nr_zapped)++;
2451 
2452         /*
2453          * Already invalid pages (previously active roots) are not on
2454          * the active page list.  See list_del() in the "else" case of
2455          * !sp->root_count.
2456          */
2457         if (sp->role.invalid)
2458             list_add(&sp->link, invalid_list);
2459         else
2460             list_move(&sp->link, invalid_list);
2461         kvm_mod_used_mmu_pages(kvm, -1);
2462     } else {
2463         /*
2464          * Remove the active root from the active page list, the root
2465          * will be explicitly freed when the root_count hits zero.
2466          */
2467         list_del(&sp->link);
2468 
2469         /*
2470          * Obsolete pages cannot be used on any vCPUs, see the comment
2471          * in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also
2472          * treats invalid shadow pages as being obsolete.
2473          */
2474         zapped_root = !is_obsolete_sp(kvm, sp);
2475     }
2476 
2477     if (sp->lpage_disallowed)
2478         unaccount_huge_nx_page(kvm, sp);
2479 
2480     sp->role.invalid = 1;
2481 
2482     /*
2483      * Make the request to free obsolete roots after marking the root
2484      * invalid, otherwise other vCPUs may not see it as invalid.
2485      */
2486     if (zapped_root)
2487         kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
2488     return list_unstable;
2489 }
2490 
2491 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2492                      struct list_head *invalid_list)
2493 {
2494     int nr_zapped;
2495 
2496     __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2497     return nr_zapped;
2498 }
2499 
2500 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2501                     struct list_head *invalid_list)
2502 {
2503     struct kvm_mmu_page *sp, *nsp;
2504 
2505     if (list_empty(invalid_list))
2506         return;
2507 
2508     /*
2509      * We need to make sure everyone sees our modifications to
2510      * the page tables and sees changes to vcpu->mode here. The barrier
2511      * in the kvm_flush_remote_tlbs() achieves this. This pairs
2512      * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2513      *
2514      * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2515      * guest mode and/or lockless shadow page table walks.
2516      */
2517     kvm_flush_remote_tlbs(kvm);
2518 
2519     list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2520         WARN_ON(!sp->role.invalid || sp->root_count);
2521         kvm_mmu_free_shadow_page(sp);
2522     }
2523 }
2524 
2525 static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
2526                           unsigned long nr_to_zap)
2527 {
2528     unsigned long total_zapped = 0;
2529     struct kvm_mmu_page *sp, *tmp;
2530     LIST_HEAD(invalid_list);
2531     bool unstable;
2532     int nr_zapped;
2533 
2534     if (list_empty(&kvm->arch.active_mmu_pages))
2535         return 0;
2536 
2537 restart:
2538     list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
2539         /*
2540          * Don't zap active root pages, the page itself can't be freed
2541          * and zapping it will just force vCPUs to realloc and reload.
2542          */
2543         if (sp->root_count)
2544             continue;
2545 
2546         unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
2547                               &nr_zapped);
2548         total_zapped += nr_zapped;
2549         if (total_zapped >= nr_to_zap)
2550             break;
2551 
2552         if (unstable)
2553             goto restart;
2554     }
2555 
2556     kvm_mmu_commit_zap_page(kvm, &invalid_list);
2557 
2558     kvm->stat.mmu_recycled += total_zapped;
2559     return total_zapped;
2560 }
2561 
2562 static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
2563 {
2564     if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
2565         return kvm->arch.n_max_mmu_pages -
2566             kvm->arch.n_used_mmu_pages;
2567 
2568     return 0;
2569 }
2570 
2571 static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
2572 {
2573     unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
2574 
2575     if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
2576         return 0;
2577 
2578     kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
2579 
2580     /*
2581      * Note, this check is intentionally soft, it only guarantees that one
2582      * page is available, while the caller may end up allocating as many as
2583      * four pages, e.g. for PAE roots or for 5-level paging.  Temporarily
2584      * exceeding the (arbitrary by default) limit will not harm the host,
2585      * being too aggressive may unnecessarily kill the guest, and getting an
2586      * exact count is far more trouble than it's worth, especially in the
2587      * page fault paths.
2588      */
2589     if (!kvm_mmu_available_pages(vcpu->kvm))
2590         return -ENOSPC;
2591     return 0;
2592 }
2593 
2594 /*
2595  * Changing the number of mmu pages allocated to the vm.
2596  * Note: if goal_nr_mmu_pages is too small, you will get a deadlock.
2597  */
2598 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2599 {
2600     write_lock(&kvm->mmu_lock);
2601 
2602     if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2603         kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
2604                           goal_nr_mmu_pages);
2605 
2606         goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2607     }
2608 
2609     kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2610 
2611     write_unlock(&kvm->mmu_lock);
2612 }
2613 
2614 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2615 {
2616     struct kvm_mmu_page *sp;
2617     LIST_HEAD(invalid_list);
2618     int r;
2619 
2620     pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2621     r = 0;
2622     write_lock(&kvm->mmu_lock);
2623     for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
2624         pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2625              sp->role.word);
2626         r = 1;
2627         kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2628     }
2629     kvm_mmu_commit_zap_page(kvm, &invalid_list);
2630     write_unlock(&kvm->mmu_lock);
2631 
2632     return r;
2633 }
2634 
2635 static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2636 {
2637     gpa_t gpa;
2638     int r;
2639 
2640     if (vcpu->arch.mmu->root_role.direct)
2641         return 0;
2642 
2643     gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2644 
2645     r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2646 
2647     return r;
2648 }
2649 
2650 static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2651 {
2652     trace_kvm_mmu_unsync_page(sp);
2653     ++kvm->stat.mmu_unsync;
2654     sp->unsync = 1;
2655 
2656     kvm_mmu_mark_parents_unsync(sp);
2657 }
2658 
2659 /*
2660  * Attempt to unsync any shadow pages that can be reached by the specified gfn,
2661  * as KVM is creating a writable mapping for said gfn.  Returns 0 if all pages
2662  * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
2663  * be write-protected.
2664  */
2665 int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
2666                 gfn_t gfn, bool can_unsync, bool prefetch)
2667 {
2668     struct kvm_mmu_page *sp;
2669     bool locked = false;
2670 
2671     /*
2672      * Force write-protection if the page is being tracked.  Note, the page
2673      * track machinery is used to write-protect upper-level shadow pages,
2674      * i.e. this guards the role.level == 4K assertion below!
2675      */
2676     if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE))
2677         return -EPERM;
2678 
2679     /*
2680      * The page is not write-tracked, mark existing shadow pages unsync
2681      * unless KVM is synchronizing an unsync SP (can_unsync = false).  In
2682      * that case, KVM must complete emulation of the guest TLB flush before
2683      * allowing shadow pages to become unsync (writable by the guest).
2684      */
2685     for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
2686         if (!can_unsync)
2687             return -EPERM;
2688 
2689         if (sp->unsync)
2690             continue;
2691 
2692         if (prefetch)
2693             return -EEXIST;
2694 
2695         /*
2696          * TDP MMU page faults require an additional spinlock as they
2697          * run with mmu_lock held for read, not write, and the unsync
2698          * logic is not thread safe.  Take the spinlock regardless of
2699          * the MMU type to avoid extra conditionals/parameters, there's
2700          * no meaningful penalty if mmu_lock is held for write.
2701          */
2702         if (!locked) {
2703             locked = true;
2704             spin_lock(&kvm->arch.mmu_unsync_pages_lock);
2705 
2706             /*
2707              * Recheck after taking the spinlock, a different vCPU
2708              * may have since marked the page unsync.  A false
2709              * positive on the unprotected check above is not
2710              * possible as clearing sp->unsync _must_ hold mmu_lock
2711              * for write, i.e. unsync cannot transition from 0->1
2712              * while this CPU holds mmu_lock for read (or write).
2713              */
2714             if (READ_ONCE(sp->unsync))
2715                 continue;
2716         }
2717 
2718         WARN_ON(sp->role.level != PG_LEVEL_4K);
2719         kvm_unsync_page(kvm, sp);
2720     }
2721     if (locked)
2722         spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
2723 
2724     /*
2725      * We need to ensure that the marking of unsync pages is visible
2726      * before the SPTE is updated to allow writes because
2727      * kvm_mmu_sync_roots() checks the unsync flags without holding
2728      * the MMU lock and so can race with this. If the SPTE was updated
2729      * before the page had been marked as unsync-ed, something like the
2730      * following could happen:
2731      *
2732      * CPU 1                    CPU 2
2733      * ---------------------------------------------------------------------
2734      * 1.2 Host updates SPTE
2735      *     to be writable
2736      *                      2.1 Guest writes a GPTE for GVA X.
2737      *                          (GPTE being in the guest page table shadowed
2738      *                           by the SP from CPU 1.)
2739      *                          This reads SPTE during the page table walk.
2740      *                          Since SPTE.W is read as 1, there is no
2741      *                          fault.
2742      *
2743      *                      2.2 Guest issues TLB flush.
2744      *                          That causes a VM Exit.
2745      *
2746      *                      2.3 Walking of unsync pages sees sp->unsync is
2747      *                          false and skips the page.
2748      *
2749      *                      2.4 Guest accesses GVA X.
2750      *                          Since the mapping in the SP was not updated,
2751      *                          the old mapping for GVA X incorrectly
2752      *                          gets used.
2753      * 1.1 Host marks SP
2754      *     as unsync
2755      *     (sp->unsync = true)
2756      *
2757      * The write barrier below ensures that 1.1 happens before 1.2 and thus
2758      * the situation in 2.4 does not arise.  It pairs with the read barrier
2759      * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
2760      */
2761     smp_wmb();
2762 
2763     return 0;
2764 }
2765 
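/*
 * Install or update the spte for @gfn.  Returns RET_PF_FIXED on success,
 * RET_PF_SPURIOUS if the desired spte was already present, and
 * RET_PF_EMULATE for MMIO pfns or for write faults that must stay
 * write-protected.
 */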
2766 static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
2767             u64 *sptep, unsigned int pte_access, gfn_t gfn,
2768             kvm_pfn_t pfn, struct kvm_page_fault *fault)
2769 {
2770     struct kvm_mmu_page *sp = sptep_to_sp(sptep);
2771     int level = sp->role.level;
2772     int was_rmapped = 0;
2773     int ret = RET_PF_FIXED;
2774     bool flush = false;
2775     bool wrprot;
2776     u64 spte;
2777 
2778     /* Prefetching always gets a writable pfn.  */
2779     bool host_writable = !fault || fault->map_writable;
2780     bool prefetch = !fault || fault->prefetch;
2781     bool write_fault = fault && fault->write;
2782 
2783     pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
2784          *sptep, write_fault, gfn);
2785 
2786     if (unlikely(is_noslot_pfn(pfn))) {
2787         vcpu->stat.pf_mmio_spte_created++;
2788         mark_mmio_spte(vcpu, sptep, gfn, pte_access);
2789         return RET_PF_EMULATE;
2790     }
2791 
2792     if (is_shadow_present_pte(*sptep)) {
2793         /*
2794          * If we overwrite a PTE page pointer with a 2MB PMD, unlink
2795          * the parent of the now unreachable PTE.
2796          */
2797         if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
2798             struct kvm_mmu_page *child;
2799             u64 pte = *sptep;
2800 
2801             child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
2802             drop_parent_pte(child, sptep);
2803             flush = true;
2804         } else if (pfn != spte_to_pfn(*sptep)) {
2805             pgprintk("hfn old %llx new %llx\n",
2806                  spte_to_pfn(*sptep), pfn);
2807             drop_spte(vcpu->kvm, sptep);
2808             flush = true;
2809         } else
2810             was_rmapped = 1;
2811     }
2812 
2813     wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
2814                true, host_writable, &spte);
2815 
2816     if (*sptep == spte) {
2817         ret = RET_PF_SPURIOUS;
2818     } else {
2819         flush |= mmu_spte_update(sptep, spte);
2820         trace_kvm_mmu_set_spte(level, gfn, sptep);
2821     }
2822 
2823     if (wrprot) {
2824         if (write_fault)
2825             ret = RET_PF_EMULATE;
2826     }
2827 
2828     if (flush)
2829         kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
2830                 KVM_PAGES_PER_HPAGE(level));
2831 
2832     pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2833 
2834     if (!was_rmapped) {
2835         WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
2836         rmap_add(vcpu, slot, sptep, gfn, pte_access);
2837     } else {
2838         /* Already rmapped but the pte_access bits may have changed. */
2839         kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access);
2840     }
2841 
2842     return ret;
2843 }
2844 
2845 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2846                     struct kvm_mmu_page *sp,
2847                     u64 *start, u64 *end)
2848 {
2849     struct page *pages[PTE_PREFETCH_NUM];
2850     struct kvm_memory_slot *slot;
2851     unsigned int access = sp->role.access;
2852     int i, ret;
2853     gfn_t gfn;
2854 
2855     gfn = kvm_mmu_page_get_gfn(sp, spte_index(start));
2856     slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
2857     if (!slot)
2858         return -1;
2859 
2860     ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
2861     if (ret <= 0)
2862         return -1;
2863 
2864     for (i = 0; i < ret; i++, gfn++, start++) {
2865         mmu_set_spte(vcpu, slot, start, access, gfn,
2866                  page_to_pfn(pages[i]), NULL);
2867         put_page(pages[i]);
2868     }
2869 
2870     return 0;
2871 }
2872 
2873 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2874                   struct kvm_mmu_page *sp, u64 *sptep)
2875 {
2876     u64 *spte, *start = NULL;
2877     int i;
2878 
2879     WARN_ON(!sp->role.direct);
2880 
2881     i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
2882     spte = sp->spt + i;
2883 
2884     for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2885         if (is_shadow_present_pte(*spte) || spte == sptep) {
2886             if (!start)
2887                 continue;
2888             if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
2889                 return;
2890             start = NULL;
2891         } else if (!start)
2892             start = spte;
2893     }
2894     if (start)
2895         direct_pte_prefetch_many(vcpu, sp, start, spte);
2896 }
2897 
2898 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2899 {
2900     struct kvm_mmu_page *sp;
2901 
2902     sp = sptep_to_sp(sptep);
2903 
2904     /*
2905      * Without accessed bits, there's no way to distinguish between
2906      * actually accessed translations and prefetched, so disable pte
2907      * prefetch if accessed bits aren't available.
2908      */
2909     if (sp_ad_disabled(sp))
2910         return;
2911 
2912     if (sp->role.level > PG_LEVEL_4K)
2913         return;
2914 
2915     /*
2916      * If addresses are being invalidated, skip prefetching to avoid
2917      * accidentally prefetching those addresses.
2918      */
2919     if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
2920         return;
2921 
2922     __direct_pte_prefetch(vcpu, sp, sptep);
2923 }
2924 
2925 /*
2926  * Lookup the mapping level for @gfn in the current mm.
2927  *
2928  * WARNING!  Use of host_pfn_mapping_level() requires the caller and the end
2929  * consumer to be tied into KVM's handlers for MMU notifier events!
2930  *
2931  * There are several ways to safely use this helper:
2932  *
2933  * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
2934  *   consuming it.  In this case, mmu_lock doesn't need to be held during the
2935  *   lookup, but it does need to be held while checking the MMU notifier.
2936  *
2937  * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
2938  *   event for the hva.  This can be done by explicitly checking the MMU notifier
2939  *   or by ensuring that KVM already has a valid mapping that covers the hva.
2940  *
2941  * - Do not use the result to install new mappings, e.g. use the host mapping
2942  *   level only to decide whether or not to zap an entry.  In this case, it's
2943  *   not required to hold mmu_lock (though it's highly likely the caller will
2944  *   want to hold mmu_lock anyway, e.g. to modify SPTEs).
2945  *
2946  * Note!  The lookup can still race with modifications to host page tables, but
2947  * the above "rules" ensure KVM will not _consume_ the result of the walk if a
2948  * race with the primary MMU occurs.
2949  */
2950 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
2951                   const struct kvm_memory_slot *slot)
2952 {
2953     int level = PG_LEVEL_4K;
2954     unsigned long hva;
2955     unsigned long flags;
2956     pgd_t pgd;
2957     p4d_t p4d;
2958     pud_t pud;
2959     pmd_t pmd;
2960 
2961     /*
2962      * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
2963      * is not solely for performance, it's also necessary to avoid the
2964      * "writable" check in __gfn_to_hva_many(), which will always fail on
2965      * read-only memslots due to gfn_to_hva() assuming writes.  Earlier
2966      * page fault steps have already verified the guest isn't writing a
2967      * read-only memslot.
2968      */
2969     hva = __gfn_to_hva_memslot(slot, gfn);
2970 
2971     /*
2972      * Disable IRQs to prevent concurrent tear down of host page tables,
2973      * e.g. if the primary MMU promotes a P*D to a huge page and then frees
2974      * the original page table.
2975      */
2976     local_irq_save(flags);
2977 
2978     /*
2979      * Read each entry once.  As above, a non-leaf entry can be promoted to
2980      * a huge page _during_ this walk.  Re-reading the entry could send the
2981      * walk into the weeds, e.g. p*d_large() returns false (sees the old
2982      * value) and then p*d_offset() walks into the target huge page instead
2983      * of the old page table (sees the new value).
2984      */
2985     pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
2986     if (pgd_none(pgd))
2987         goto out;
2988 
2989     p4d = READ_ONCE(*p4d_offset(&pgd, hva));
2990     if (p4d_none(p4d) || !p4d_present(p4d))
2991         goto out;
2992 
2993     pud = READ_ONCE(*pud_offset(&p4d, hva));
2994     if (pud_none(pud) || !pud_present(pud))
2995         goto out;
2996 
2997     if (pud_large(pud)) {
2998         level = PG_LEVEL_1G;
2999         goto out;
3000     }
3001 
3002     pmd = READ_ONCE(*pmd_offset(&pud, hva));
3003     if (pmd_none(pmd) || !pmd_present(pmd))
3004         goto out;
3005 
3006     if (pmd_large(pmd))
3007         level = PG_LEVEL_2M;
3008 
3009 out:
3010     local_irq_restore(flags);
3011     return level;
3012 }
3013 
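/*
 * Compute the largest mapping level usable for @gfn: walk down from
 * @max_level until a level whose lpage_info does not disallow large pages
 * is found, then cap the result by the host's own mapping level for the
 * backing page.
 */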
3014 int kvm_mmu_max_mapping_level(struct kvm *kvm,
3015                   const struct kvm_memory_slot *slot, gfn_t gfn,
3016                   int max_level)
3017 {
3018     struct kvm_lpage_info *linfo;
3019     int host_level;
3020 
3021     max_level = min(max_level, max_huge_page_level);
3022     for ( ; max_level > PG_LEVEL_4K; max_level--) {
3023         linfo = lpage_info_slot(gfn, slot, max_level);
3024         if (!linfo->disallow_lpage)
3025             break;
3026     }
3027 
3028     if (max_level == PG_LEVEL_4K)
3029         return PG_LEVEL_4K;
3030 
3031     host_level = host_pfn_mapping_level(kvm, gfn, slot);
3032     return min(host_level, max_level);
3033 }
3034 
3035 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3036 {
3037     struct kvm_memory_slot *slot = fault->slot;
3038     kvm_pfn_t mask;
3039 
3040     fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
3041 
3042     if (unlikely(fault->max_level == PG_LEVEL_4K))
3043         return;
3044 
3045     if (is_error_noslot_pfn(fault->pfn))
3046         return;
3047 
3048     if (kvm_slot_dirty_track_enabled(slot))
3049         return;
3050 
3051     /*
3052      * Enforce the iTLB multihit workaround after capturing the requested
3053      * level, which will be used to do precise, accurate accounting.
3054      */
3055     fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
3056                              fault->gfn, fault->max_level);
3057     if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
3058         return;
3059 
3060     /*
3061      * mmu_invalidate_retry() was successful and mmu_lock is held, so
3062      * the pmd can't be split from under us.
3063      */
3064     fault->goal_level = fault->req_level;
3065     mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
3066     VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
3067     fault->pfn &= ~mask;
3068 }
3069 
3070 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
3071 {
3072     if (cur_level > PG_LEVEL_4K &&
3073         cur_level == fault->goal_level &&
3074         is_shadow_present_pte(spte) &&
3075         !is_large_pte(spte)) {
3076         /*
3077          * A small SPTE exists for this pfn, but FNAME(fetch)
3078          * and __direct_map would like to create a large PTE
3079          * instead: just force them to go down another level,
3080          * patching the next 9 bits of the address back into
3081          * pfn for them.
3082          */
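        /*
         * E.g. for cur_level == PG_LEVEL_2M, page_mask is
         * KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - KVM_PAGES_PER_HPAGE(PG_LEVEL_4K),
         * i.e. 512 - 1 = 0x1ff, so gfn bits 8:0 (the 4KiB-page index
         * within the 2MiB region) are copied into pfn.
         */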
3083         u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
3084                 KVM_PAGES_PER_HPAGE(cur_level - 1);
3085         fault->pfn |= fault->gfn & page_mask;
3086         fault->goal_level--;
3087     }
3088 }
3089 
3090 static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3091 {
3092     struct kvm_shadow_walk_iterator it;
3093     struct kvm_mmu_page *sp;
3094     int ret;
3095     gfn_t base_gfn = fault->gfn;
3096 
3097     kvm_mmu_hugepage_adjust(vcpu, fault);
3098 
3099     trace_kvm_mmu_spte_requested(fault);
3100     for_each_shadow_entry(vcpu, fault->addr, it) {
3101         /*
3102          * We cannot overwrite existing page tables with an NX
3103          * large page, as the leaf could be executable.
3104          */
3105         if (fault->nx_huge_page_workaround_enabled)
3106             disallowed_hugepage_adjust(fault, *it.sptep, it.level);
3107 
3108         base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
3109         if (it.level == fault->goal_level)
3110             break;
3111 
3112         sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
3113         if (sp == ERR_PTR(-EEXIST))
3114             continue;
3115 
3116         link_shadow_page(vcpu, it.sptep, sp);
3117         if (fault->is_tdp && fault->huge_page_disallowed &&
3118             fault->req_level >= it.level)
3119             account_huge_nx_page(vcpu->kvm, sp);
3120     }
3121 
3122     if (WARN_ON_ONCE(it.level != fault->goal_level))
3123         return -EFAULT;
3124 
3125     ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
3126                base_gfn, fault->pfn, fault);
3127     if (ret == RET_PF_SPURIOUS)
3128         return ret;
3129 
3130     direct_pte_prefetch(vcpu, it.sptep);
3131     return ret;
3132 }
3133 
3134 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
3135 {
3136     send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
3137 }
3138 
3139 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
3140 {
3141     /*
3142      * Do not cache the mmio info caused by writing the readonly gfn
3143      * into the spte, otherwise a read access on the readonly gfn would
3144      * also cause an mmio page fault and be treated as mmio access.
3145      */
3146     if (pfn == KVM_PFN_ERR_RO_FAULT)
3147         return RET_PF_EMULATE;
3148 
3149     if (pfn == KVM_PFN_ERR_HWPOISON) {
3150         kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
3151         return RET_PF_RETRY;
3152     }
3153 
3154     return -EFAULT;
3155 }
3156 
3157 static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
3158                    unsigned int access)
3159 {
3160     /* The pfn is invalid, report the error! */
3161     if (unlikely(is_error_pfn(fault->pfn)))
3162         return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
3163 
3164     if (unlikely(!fault->slot)) {
3165         gva_t gva = fault->is_tdp ? 0 : fault->addr;
3166 
3167         vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
3168                      access & shadow_mmio_access_mask);
3169         /*
3170          * If MMIO caching is disabled, emulate immediately without
3171          * touching the shadow page tables as attempting to install an
3172          * MMIO SPTE will just be an expensive nop.  Do not cache MMIO
3173          * whose gfn is greater than host.MAXPHYADDR, any guest that
3174          * generates such gfns is running nested and is being tricked
3175          * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
3176          * and only if L1's MAXPHYADDR is inaccurate with respect to
3177          * the hardware's).
3178          */
3179         if (unlikely(!enable_mmio_caching) ||
3180             unlikely(fault->gfn > kvm_mmu_max_gfn()))
3181             return RET_PF_EMULATE;
3182     }
3183 
3184     return RET_PF_CONTINUE;
3185 }
3186 
3187 static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
3188 {
3189     /*
3190      * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
3191      * reach the common page fault handler if the SPTE has an invalid MMIO
3192      * generation number.  Refreshing the MMIO generation needs to go down
3193      * the slow path.  Note, EPT Misconfigs do NOT set the PRESENT flag!
3194      */
3195     if (fault->rsvd)
3196         return false;
3197 
3198     /*
3199      * #PF can be fast if:
3200      *
3201      * 1. The shadow page table entry is not present and A/D bits are
3202      *    disabled _by KVM_, which could mean that the fault is potentially
3203      *    caused by access tracking (if enabled).  If A/D bits are enabled
3204      *    by KVM, but disabled by L1 for L2, KVM is forced to disable A/D
3205      *    bits for L2 and employ access tracking, but the fast page fault
3206      *    mechanism only supports direct MMUs.
3207      * 2. The shadow page table entry is present, the access is a write,
3208      *    and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e.
3209      *    the fault was caused by a write-protection violation.  If the
3210      *    SPTE is MMU-writable (determined later), the fault can be fixed
3211      *    by setting the Writable bit, which can be done out of mmu_lock.
3212      */
3213     if (!fault->present)
3214         return !kvm_ad_enabled();
3215 
3216     /*
3217      * Note, instruction fetches and writes are mutually exclusive, ignore
3218      * the "exec" flag.
3219      */
3220     return fault->write;
3221 }
3222 
3223 /*
3224  * Returns true if the SPTE was fixed successfully. Otherwise,
3225  * someone else modified the SPTE from its original value.
3226  */
3227 static bool
3228 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
3229             u64 *sptep, u64 old_spte, u64 new_spte)
3230 {
3231     /*
3232      * Theoretically we could also set dirty bit (and flush TLB) here in
3233      * order to eliminate unnecessary PML logging. See comments in
3234      * set_spte. But fast_page_fault is very unlikely to happen with PML
3235          * enabled, so we do not do this. This might result in the same GPA
3236          * being logged in the PML buffer again when the write really happens,
3237          * and in mark_page_dirty eventually being called twice. But it's also no
3238      * harm. This also avoids the TLB flush needed after setting dirty bit
3239      * so non-PML cases won't be impacted.
3240      *
3241      * Compare with set_spte where instead shadow_dirty_mask is set.
3242      */
3243     if (!try_cmpxchg64(sptep, &old_spte, new_spte))
3244         return false;
3245 
3246     if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
3247         mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
3248 
3249     return true;
3250 }
3251 
3252 static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
3253 {
3254     if (fault->exec)
3255         return is_executable_pte(spte);
3256 
3257     if (fault->write)
3258         return is_writable_pte(spte);
3259 
3260     /* Fault was on Read access */
3261     return spte & PT_PRESENT_MASK;
3262 }
3263 
3264 /*
3265  * Returns the last level spte pointer of the shadow page walk for the given
3266  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
3267  * walk could be performed, returns NULL and *spte does not contain valid data.
3268  *
3269  * Contract:
3270  *  - Must be called between walk_shadow_page_lockless_{begin,end}.
3271  *  - The returned sptep must not be used after walk_shadow_page_lockless_end.
3272  */
3273 static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
3274 {
3275     struct kvm_shadow_walk_iterator iterator;
3276     u64 old_spte;
3277     u64 *sptep = NULL;
3278 
3279     for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
3280         sptep = iterator.sptep;
3281         *spte = old_spte;
3282     }
3283 
3284     return sptep;
3285 }
3286 
3287 /*
3288  * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
3289  */
3290 static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3291 {
3292     struct kvm_mmu_page *sp;
3293     int ret = RET_PF_INVALID;
3294     u64 spte = 0ull;
3295     u64 *sptep = NULL;
3296     uint retry_count = 0;
3297 
3298     if (!page_fault_can_be_fast(fault))
3299         return ret;
3300 
3301     walk_shadow_page_lockless_begin(vcpu);
3302 
3303     do {
3304         u64 new_spte;
3305 
3306         if (is_tdp_mmu(vcpu->arch.mmu))
3307             sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
3308         else
3309             sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
3310 
3311         if (!is_shadow_present_pte(spte))
3312             break;
3313 
3314         sp = sptep_to_sp(sptep);
3315         if (!is_last_spte(spte, sp->role.level))
3316             break;
3317 
3318         /*
3319          * Check whether the memory access that caused the fault would
3320          * still cause it if it were to be performed right now. If not,
3321          * then this is a spurious fault caused by TLB lazily flushed,
3322          * or some other CPU has already fixed the PTE after the
3323          * current CPU took the fault.
3324          *
3325          * Need not check the access of upper level table entries since
3326          * they are always ACC_ALL.
3327          */
3328         if (is_access_allowed(fault, spte)) {
3329             ret = RET_PF_SPURIOUS;
3330             break;
3331         }
3332 
3333         new_spte = spte;
3334 
3335         /*
3336          * KVM only supports fixing page faults outside of MMU lock for
3337          * direct MMUs, nested MMUs are always indirect, and KVM always
3338          * uses A/D bits for non-nested MMUs.  Thus, if A/D bits are
3339          * enabled, the SPTE can't be an access-tracked SPTE.
3340          */
3341         if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
3342             new_spte = restore_acc_track_spte(new_spte);
3343 
3344         /*
3345          * To keep things simple, only SPTEs that are MMU-writable can
3346          * be made fully writable outside of mmu_lock, e.g. only SPTEs
3347          * that were write-protected for dirty-logging or access
3348          * tracking are handled here.  Don't bother checking if the
3349          * SPTE is writable to prioritize running with A/D bits enabled.
3350          * The is_access_allowed() check above handles the common case
3351          * of the fault being spurious, and the SPTE is known to be
3352          * shadow-present, i.e. except for access tracking restoration
3353          * making the new SPTE writable, the check is wasteful.
3354          */
3355         if (fault->write && is_mmu_writable_spte(spte)) {
3356             new_spte |= PT_WRITABLE_MASK;
3357 
3358             /*
3359              * Do not fix write-permission on the large spte when
3360              * dirty logging is enabled. Since we only dirty the
3361              * first page into the dirty-bitmap in
3362              * fast_pf_fix_direct_spte(), other pages are missed
3363              * if its slot has dirty logging enabled.
3364              *
3365              * Instead, we let the slow page fault path create a
3366              * normal spte to fix the access.
3367              */
3368             if (sp->role.level > PG_LEVEL_4K &&
3369                 kvm_slot_dirty_track_enabled(fault->slot))
3370                 break;
3371         }
3372 
3373         /* Verify that the fault can be handled in the fast path */
3374         if (new_spte == spte ||
3375             !is_access_allowed(fault, new_spte))
3376             break;
3377 
3378         /*
3379          * Currently, fast page fault only works for direct mapping
3380          * since the gfn is not stable for indirect shadow page. See
3381          * Documentation/virt/kvm/locking.rst to get more detail.
3382          */
3383         if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
3384             ret = RET_PF_FIXED;
3385             break;
3386         }
3387 
3388         if (++retry_count > 4) {
3389             printk_once(KERN_WARNING
3390                 "kvm: Fast #PF retrying more than 4 times.\n");
3391             break;
3392         }
3393 
3394     } while (true);
3395 
3396     trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
3397     walk_shadow_page_lockless_end(vcpu);
3398 
3399     if (ret != RET_PF_INVALID)
3400         vcpu->stat.pf_fast++;
3401 
3402     return ret;
3403 }
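
/*
 * A minimal worked example of the fast path above, assuming a write fault
 * on a 4K SPTE that was write-protected for dirty logging, i.e. the SPTE is
 * shadow-present and MMU-writable but PT_WRITABLE_MASK is clear:
 *
 *   new_spte = spte | PT_WRITABLE_MASK;
 *   try_cmpxchg64(sptep, &spte, new_spte);        (fast_pf_fix_direct_spte)
 *   mark_page_dirty_in_slot(kvm, slot, gfn);      (=> RET_PF_FIXED)
 *
 * No mmu_lock is taken; if the cmpxchg loses a race with another CPU, the
 * loop above simply re-reads the SPTE and retries, giving up after more
 * than four retries.
 */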
3404 
3405 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3406                    struct list_head *invalid_list)
3407 {
3408     struct kvm_mmu_page *sp;
3409 
3410     if (!VALID_PAGE(*root_hpa))
3411         return;
3412 
3413     sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK);
3414     if (WARN_ON(!sp))
3415         return;
3416 
3417     if (is_tdp_mmu_page(sp))
3418         kvm_tdp_mmu_put_root(kvm, sp, false);
3419     else if (!--sp->root_count && sp->role.invalid)
3420         kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3421 
3422     *root_hpa = INVALID_PAGE;
3423 }
3424 
3425 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3426 void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
3427             ulong roots_to_free)
3428 {
3429     int i;
3430     LIST_HEAD(invalid_list);
3431     bool free_active_root;
3432 
3433     BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3434 
3435     /* Before acquiring the MMU lock, see if we need to do any real work. */
3436     free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
3437         && VALID_PAGE(mmu->root.hpa);
3438 
3439     if (!free_active_root) {
3440         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3441             if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3442                 VALID_PAGE(mmu->prev_roots[i].hpa))
3443                 break;
3444 
3445         if (i == KVM_MMU_NUM_PREV_ROOTS)
3446             return;
3447     }
3448 
3449     write_lock(&kvm->mmu_lock);
3450 
3451     for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3452         if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3453             mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
3454                        &invalid_list);
3455 
3456     if (free_active_root) {
3457         if (to_shadow_page(mmu->root.hpa)) {
3458             mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
3459         } else if (mmu->pae_root) {
3460             for (i = 0; i < 4; ++i) {
3461                 if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
3462                     continue;
3463 
3464                 mmu_free_root_page(kvm, &mmu->pae_root[i],
3465                            &invalid_list);
3466                 mmu->pae_root[i] = INVALID_PAE_ROOT;
3467             }
3468         }
3469         mmu->root.hpa = INVALID_PAGE;
3470         mmu->root.pgd = 0;
3471     }
3472 
3473     kvm_mmu_commit_zap_page(kvm, &invalid_list);
3474     write_unlock(&kvm->mmu_lock);
3475 }
3476 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3477 
3478 void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
3479 {
3480     unsigned long roots_to_free = 0;
3481     hpa_t root_hpa;
3482     int i;
3483 
3484     /*
3485      * This should not be called while L2 is active, L2 can't invalidate
3486      * _only_ its own roots, e.g. INVVPID unconditionally exits.
3487      */
3488     WARN_ON_ONCE(mmu->root_role.guest_mode);
3489 
3490     for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
3491         root_hpa = mmu->prev_roots[i].hpa;
3492         if (!VALID_PAGE(root_hpa))
3493             continue;
3494 
3495         if (!to_shadow_page(root_hpa) ||
3496             to_shadow_page(root_hpa)->role.guest_mode)
3497             roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
3498     }
3499 
3500     kvm_mmu_free_roots(kvm, mmu, roots_to_free);
3501 }
3502 EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
3503 
3504 
3505 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3506 {
3507     int ret = 0;
3508 
3509     if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
3510         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3511         ret = 1;
3512     }
3513 
3514     return ret;
3515 }
3516 
3517 static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
3518                 u8 level)
3519 {
3520     union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
3521     struct kvm_mmu_page *sp;
3522 
3523     role.level = level;
3524     role.quadrant = quadrant;
3525 
3526     WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte);
3527     WARN_ON_ONCE(role.direct && role.has_4_byte_gpte);
3528 
3529     sp = kvm_mmu_get_shadow_page(vcpu, gfn, role);
3530     ++sp->root_count;
3531 
3532     return __pa(sp->spt);
3533 }
3534 
3535 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3536 {
3537     struct kvm_mmu *mmu = vcpu->arch.mmu;
3538     u8 shadow_root_level = mmu->root_role.level;
3539     hpa_t root;
3540     unsigned i;
3541     int r;
3542 
3543     write_lock(&vcpu->kvm->mmu_lock);
3544     r = make_mmu_pages_available(vcpu);
3545     if (r < 0)
3546         goto out_unlock;
3547 
3548     if (is_tdp_mmu_enabled(vcpu->kvm)) {
3549         root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
3550         mmu->root.hpa = root;
3551     } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
3552         root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
3553         mmu->root.hpa = root;
3554     } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
3555         if (WARN_ON_ONCE(!mmu->pae_root)) {
3556             r = -EIO;
3557             goto out_unlock;
3558         }
3559 
3560         for (i = 0; i < 4; ++i) {
3561             WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3562 
3563             root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0,
3564                           PT32_ROOT_LEVEL);
3565             mmu->pae_root[i] = root | PT_PRESENT_MASK |
3566                        shadow_me_value;
3567         }
3568         mmu->root.hpa = __pa(mmu->pae_root);
3569     } else {
3570         WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
3571         r = -EIO;
3572         goto out_unlock;
3573     }
3574 
3575     /* root.pgd is ignored for direct MMUs. */
3576     mmu->root.pgd = 0;
3577 out_unlock:
3578     write_unlock(&vcpu->kvm->mmu_lock);
3579     return r;
3580 }
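
/*
 * A minimal worked example for the PT32E path above, assuming PAGE_SHIFT
 * is 12: i << (30 - PAGE_SHIFT) yields gfns 0x0, 0x40000, 0x80000 and
 * 0xc0000, i.e. each of the four PAE page directories covers one 1GiB
 * quadrant of the guest physical address space, starting at 0, 1GiB, 2GiB
 * and 3GiB respectively.
 */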
3581 
3582 static int mmu_first_shadow_root_alloc(struct kvm *kvm)
3583 {
3584     struct kvm_memslots *slots;
3585     struct kvm_memory_slot *slot;
3586     int r = 0, i, bkt;
3587 
3588     /*
3589      * Check if this is the first shadow root being allocated before
3590      * taking the lock.
3591      */
3592     if (kvm_shadow_root_allocated(kvm))
3593         return 0;
3594 
3595     mutex_lock(&kvm->slots_arch_lock);
3596 
3597     /* Recheck, under the lock, whether this is the first shadow root. */
3598     if (kvm_shadow_root_allocated(kvm))
3599         goto out_unlock;
3600 
3601     /*
3602      * Check if anything actually needs to be allocated, e.g. all metadata
3603      * will be allocated upfront if TDP is disabled.
3604      */
3605     if (kvm_memslots_have_rmaps(kvm) &&
3606         kvm_page_track_write_tracking_enabled(kvm))
3607         goto out_success;
3608 
3609     for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
3610         slots = __kvm_memslots(kvm, i);
3611         kvm_for_each_memslot(slot, bkt, slots) {
3612             /*
3613              * Both of these functions are no-ops if the target is
3614              * already allocated, so unconditionally calling both
3615              * is safe.  Intentionally do NOT free allocations on
3616              * failure to avoid having to track which allocations
3617              * were made now versus when the memslot was created.
3618              * The metadata is guaranteed to be freed when the slot
3619              * is freed, and will be kept/used if userspace retries
3620              * KVM_RUN instead of killing the VM.
3621              */
3622             r = memslot_rmap_alloc(slot, slot->npages);
3623             if (r)
3624                 goto out_unlock;
3625             r = kvm_page_track_write_tracking_alloc(slot);
3626             if (r)
3627                 goto out_unlock;
3628         }
3629     }
3630 
3631     /*
3632      * Ensure that shadow_root_allocated becomes true strictly after
3633      * all the related pointers are set.
3634      */
3635 out_success:
3636     smp_store_release(&kvm->arch.shadow_root_allocated, true);
3637 
3638 out_unlock:
3639     mutex_unlock(&kvm->slots_arch_lock);
3640     return r;
3641 }
3642 
3643 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3644 {
3645     struct kvm_mmu *mmu = vcpu->arch.mmu;
3646     u64 pdptrs[4], pm_mask;
3647     gfn_t root_gfn, root_pgd;
3648     int quadrant, i, r;
3649     hpa_t root;
3650 
3651     root_pgd = mmu->get_guest_pgd(vcpu);
3652     root_gfn = root_pgd >> PAGE_SHIFT;
3653 
3654     if (mmu_check_root(vcpu, root_gfn))
3655         return 1;
3656 
3657     /*
3658      * On SVM, reading PDPTRs might access guest memory, which might fault
3659      * and thus might sleep.  Grab the PDPTRs before acquiring mmu_lock.
3660      */
3661     if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
3662         for (i = 0; i < 4; ++i) {
3663             pdptrs[i] = mmu->get_pdptr(vcpu, i);
3664             if (!(pdptrs[i] & PT_PRESENT_MASK))
3665                 continue;
3666 
3667             if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
3668                 return 1;
3669         }
3670     }
3671 
3672     r = mmu_first_shadow_root_alloc(vcpu->kvm);
3673     if (r)
3674         return r;
3675 
3676     write_lock(&vcpu->kvm->mmu_lock);
3677     r = make_mmu_pages_available(vcpu);
3678     if (r < 0)
3679         goto out_unlock;
3680 
3681     /*
3682      * Do we shadow a long mode page table? If so we need to
3683      * write-protect the guests page table root.
3684      */
3685     if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
3686         root = mmu_alloc_root(vcpu, root_gfn, 0,
3687                       mmu->root_role.level);
3688         mmu->root.hpa = root;
3689         goto set_root_pgd;
3690     }
3691 
3692     if (WARN_ON_ONCE(!mmu->pae_root)) {
3693         r = -EIO;
3694         goto out_unlock;
3695     }
3696 
3697     /*
3698      * We shadow a 32 bit page table. This may be a legacy 2-level
3699      * or a PAE 3-level page table. In either case we need to be aware that
3700      * the shadow page table may be a PAE or a long mode page table.
3701      */
3702     pm_mask = PT_PRESENT_MASK | shadow_me_value;
3703     if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
3704         pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3705 
3706         if (WARN_ON_ONCE(!mmu->pml4_root)) {
3707             r = -EIO;
3708             goto out_unlock;
3709         }
3710         mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
3711 
3712         if (mmu->root_role.level == PT64_ROOT_5LEVEL) {
3713             if (WARN_ON_ONCE(!mmu->pml5_root)) {
3714                 r = -EIO;
3715                 goto out_unlock;
3716             }
3717             mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
3718         }
3719     }
3720 
3721     for (i = 0; i < 4; ++i) {
3722         WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3723 
3724         if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
3725             if (!(pdptrs[i] & PT_PRESENT_MASK)) {
3726                 mmu->pae_root[i] = INVALID_PAE_ROOT;
3727                 continue;
3728             }
3729             root_gfn = pdptrs[i] >> PAGE_SHIFT;
3730         }
3731 
3732         /*
3733          * If shadowing 32-bit non-PAE page tables, each PAE page
3734          * directory maps one quarter of the guest's non-PAE page
3735          * directory. Otherwise each PAE page directory shadows one guest
3736          * PAE page directory, so the quadrant should be 0.
3737          */
3738         quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0;
3739 
3740         root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL);
3741         mmu->pae_root[i] = root | pm_mask;
3742     }
3743 
3744     if (mmu->root_role.level == PT64_ROOT_5LEVEL)
3745         mmu->root.hpa = __pa(mmu->pml5_root);
3746     else if (mmu->root_role.level == PT64_ROOT_4LEVEL)
3747         mmu->root.hpa = __pa(mmu->pml4_root);
3748     else
3749         mmu->root.hpa = __pa(mmu->pae_root);
3750 
3751 set_root_pgd:
3752     mmu->root.pgd = root_pgd;
3753 out_unlock:
3754     write_unlock(&vcpu->kvm->mmu_lock);
3755 
3756     return r;
3757 }
3758 
3759 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
3760 {
3761     struct kvm_mmu *mmu = vcpu->arch.mmu;
3762     bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
3763     u64 *pml5_root = NULL;
3764     u64 *pml4_root = NULL;
3765     u64 *pae_root;
3766 
3767     /*
3768      * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
3769      * tables are allocated and initialized at root creation as there is no
3770      * equivalent level in the guest's NPT to shadow.  Allocate the tables
3771      * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
3772      */
3773     if (mmu->root_role.direct ||
3774         mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
3775         mmu->root_role.level < PT64_ROOT_4LEVEL)
3776         return 0;
3777 
3778     /*
3779      * NPT, the only paging mode that uses this horror, uses a fixed number
3780      * of levels for the shadow page tables, e.g. all MMUs are 4-level or
3781      * all MMUs are 5-level.  Thus, this can safely require that pml5_root
3782      * is allocated if the other roots are valid and pml5 is needed, as any
3783      * prior MMU would also have required pml5.
3784      */
3785     if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
3786         return 0;
3787 
3788     /*
3789      * The special roots should always be allocated in concert.  Yell and
3790      * bail if KVM ends up in a state where only one of the roots is valid.
3791      */
3792     if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
3793              (need_pml5 && mmu->pml5_root)))
3794         return -EIO;
3795 
3796     /*
3797      * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
3798      * doesn't need to be decrypted.
3799      */
3800     pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3801     if (!pae_root)
3802         return -ENOMEM;
3803 
3804 #ifdef CONFIG_X86_64
3805     pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3806     if (!pml4_root)
3807         goto err_pml4;
3808 
3809     if (need_pml5) {
3810         pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3811         if (!pml5_root)
3812             goto err_pml5;
3813     }
3814 #endif
3815 
3816     mmu->pae_root = pae_root;
3817     mmu->pml4_root = pml4_root;
3818     mmu->pml5_root = pml5_root;
3819 
3820     return 0;
3821 
3822 #ifdef CONFIG_X86_64
3823 err_pml5:
3824     free_page((unsigned long)pml4_root);
3825 err_pml4:
3826     free_page((unsigned long)pae_root);
3827     return -ENOMEM;
3828 #endif
3829 }
3830 
3831 static bool is_unsync_root(hpa_t root)
3832 {
3833     struct kvm_mmu_page *sp;
3834 
3835     if (!VALID_PAGE(root))
3836         return false;
3837 
3838     /*
3839      * The read barrier orders the CPU's read of SPTE.W during the page table
3840      * walk before the reads of sp->unsync/sp->unsync_children here.
3841      *
3842      * Even if another CPU was marking the SP as unsync-ed simultaneously,
3843      * any guest page table changes are not guaranteed to be visible anyway
3844      * until this VCPU issues a TLB flush strictly after those changes are
3845      * made.  We only need to ensure that the other CPU sets these flags
3846      * before any actual changes to the page tables are made.  The comments
3847      * in mmu_try_to_unsync_pages() describe what could go wrong if this
3848      * requirement isn't satisfied.
3849      */
3850     smp_rmb();
3851     sp = to_shadow_page(root);
3852 
3853     /*
3854      * PAE roots (somewhat arbitrarily) aren't backed by shadow pages; the
3855      * PDPTEs for a given PAE root need to be synchronized individually.
3856      */
3857     if (WARN_ON_ONCE(!sp))
3858         return false;
3859 
3860     if (sp->unsync || sp->unsync_children)
3861         return true;
3862 
3863     return false;
3864 }
3865 
3866 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3867 {
3868     int i;
3869     struct kvm_mmu_page *sp;
3870 
3871     if (vcpu->arch.mmu->root_role.direct)
3872         return;
3873 
3874     if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
3875         return;
3876 
3877     vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3878 
3879     if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
3880         hpa_t root = vcpu->arch.mmu->root.hpa;
3881         sp = to_shadow_page(root);
3882 
3883         if (!is_unsync_root(root))
3884             return;
3885 
3886         write_lock(&vcpu->kvm->mmu_lock);
3887         mmu_sync_children(vcpu, sp, true);
3888         write_unlock(&vcpu->kvm->mmu_lock);
3889         return;
3890     }
3891 
3892     write_lock(&vcpu->kvm->mmu_lock);
3893 
3894     for (i = 0; i < 4; ++i) {
3895         hpa_t root = vcpu->arch.mmu->pae_root[i];
3896 
3897         if (IS_VALID_PAE_ROOT(root)) {
3898             root &= SPTE_BASE_ADDR_MASK;
3899             sp = to_shadow_page(root);
3900             mmu_sync_children(vcpu, sp, true);
3901         }
3902     }
3903 
3904     write_unlock(&vcpu->kvm->mmu_lock);
3905 }
3906 
3907 void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
3908 {
3909     unsigned long roots_to_free = 0;
3910     int i;
3911 
3912     for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3913         if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
3914             roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
3915 
3916     /* sync prev_roots by simply freeing them */
3917     kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
3918 }
3919 
3920 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3921                   gpa_t vaddr, u64 access,
3922                   struct x86_exception *exception)
3923 {
3924     if (exception)
3925         exception->error_code = 0;
3926     return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
3927 }
3928 
3929 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3930 {
3931     /*
3932      * A nested guest cannot use the MMIO cache if it is using nested
3933      * page tables, because cr2 is a nGPA while the cache stores GPAs.
3934      */
3935     if (mmu_is_nested(vcpu))
3936         return false;
3937 
3938     if (direct)
3939         return vcpu_match_mmio_gpa(vcpu, addr);
3940 
3941     return vcpu_match_mmio_gva(vcpu, addr);
3942 }
3943 
3944 /*
3945  * Return the level of the lowest level SPTE added to sptes.
3946  * That SPTE may be non-present.
3947  *
3948  * Must be called between walk_shadow_page_lockless_{begin,end}.
3949  */
3950 static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
3951 {
3952     struct kvm_shadow_walk_iterator iterator;
3953     int leaf = -1;
3954     u64 spte;
3955 
3956     for (shadow_walk_init(&iterator, vcpu, addr),
3957          *root_level = iterator.level;
3958          shadow_walk_okay(&iterator);
3959          __shadow_walk_next(&iterator, spte)) {
3960         leaf = iterator.level;
3961         spte = mmu_spte_get_lockless(iterator.sptep);
3962 
3963         sptes[leaf] = spte;
3964     }
3965 
3966     return leaf;
3967 }
3968 
3969 /* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
3970 static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
3971 {
3972     u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
3973     struct rsvd_bits_validate *rsvd_check;
3974     int root, leaf, level;
3975     bool reserved = false;
3976 
3977     walk_shadow_page_lockless_begin(vcpu);
3978 
3979     if (is_tdp_mmu(vcpu->arch.mmu))
3980         leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
3981     else
3982         leaf = get_walk(vcpu, addr, sptes, &root);
3983 
3984     walk_shadow_page_lockless_end(vcpu);
3985 
3986     if (unlikely(leaf < 0)) {
3987         *sptep = 0ull;
3988         return reserved;
3989     }
3990 
3991     *sptep = sptes[leaf];
3992 
3993     /*
3994      * Skip reserved bits checks on the terminal leaf if it's not a valid
3995      * SPTE.  Note, this also (intentionally) skips MMIO SPTEs, which, by
3996      * design, always have reserved bits set.  The purpose of the checks is
3997      * to detect reserved bits on non-MMIO SPTEs, i.e. buggy SPTEs.
3998      */
3999     if (!is_shadow_present_pte(sptes[leaf]))
4000         leaf++;
4001 
4002     rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
4003 
4004     for (level = root; level >= leaf; level--)
4005         reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
4006 
4007     if (reserved) {
4008         pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
4009                __func__, addr);
4010         for (level = root; level >= leaf; level--)
4011             pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
4012                    sptes[level], level,
4013                    get_rsvd_bits(rsvd_check, sptes[level], level));
4014     }
4015 
4016     return reserved;
4017 }
4018 
4019 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4020 {
4021     u64 spte;
4022     bool reserved;
4023 
4024     if (mmio_info_in_cache(vcpu, addr, direct))
4025         return RET_PF_EMULATE;
4026 
4027     reserved = get_mmio_spte(vcpu, addr, &spte);
4028     if (WARN_ON(reserved))
4029         return -EINVAL;
4030 
4031     if (is_mmio_spte(spte)) {
4032         gfn_t gfn = get_mmio_spte_gfn(spte);
4033         unsigned int access = get_mmio_spte_access(spte);
4034 
4035         if (!check_mmio_spte(vcpu, spte))
4036             return RET_PF_INVALID;
4037 
4038         if (direct)
4039             addr = 0;
4040 
4041         trace_handle_mmio_page_fault(addr, gfn, access);
4042         vcpu_cache_mmio_info(vcpu, addr, gfn, access);
4043         return RET_PF_EMULATE;
4044     }
4045 
4046     /*
4047      * If the page table was zapped by another CPU, let the CPU fault
4048      * again on the address.
4049      */
4050     return RET_PF_RETRY;
4051 }
4052 
4053 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
4054                      struct kvm_page_fault *fault)
4055 {
4056     if (unlikely(fault->rsvd))
4057         return false;
4058 
4059     if (!fault->present || !fault->write)
4060         return false;
4061 
4062     /*
4063      * The guest is writing a page that is write-tracked, which cannot
4064      * be fixed by the page fault handler.
4065      */
4066     if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE))
4067         return true;
4068 
4069     return false;
4070 }
4071 
4072 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
4073 {
4074     struct kvm_shadow_walk_iterator iterator;
4075     u64 spte;
4076 
4077     walk_shadow_page_lockless_begin(vcpu);
4078     for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
4079         clear_sp_write_flooding_count(iterator.sptep);
4080     walk_shadow_page_lockless_end(vcpu);
4081 }
4082 
4083 static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
4084 {
4085     /* make sure the token value is not 0 */
4086     u32 id = vcpu->arch.apf.id;
4087 
4088     if (id << 12 == 0)
4089         vcpu->arch.apf.id = 1;
4090 
4091     return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4092 }
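
/*
 * A minimal worked example of the token layout produced above: the async
 * page fault id occupies bits 12 and up, the vcpu_id the low 12 bits.
 * E.g. with vcpu->vcpu_id == 3 and vcpu->arch.apf.id == 7 the token is
 * (7 << 12) | 3 == 0x7003, after which apf.id is incremented to 8.  The
 * "id << 12 == 0" reset guarantees the high part is never all zeroes, so
 * the token itself can never be 0.
 */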
4093 
4094 static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
4095                     gfn_t gfn)
4096 {
4097     struct kvm_arch_async_pf arch;
4098 
4099     arch.token = alloc_apf_token(vcpu);
4100     arch.gfn = gfn;
4101     arch.direct_map = vcpu->arch.mmu->root_role.direct;
4102     arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
4103 
4104     return kvm_setup_async_pf(vcpu, cr2_or_gpa,
4105                   kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
4106 }
4107 
4108 void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
4109 {
4110     int r;
4111 
4112     if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
4113           work->wakeup_all)
4114         return;
4115 
4116     r = kvm_mmu_reload(vcpu);
4117     if (unlikely(r))
4118         return;
4119 
4120     if (!vcpu->arch.mmu->root_role.direct &&
4121           work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
4122         return;
4123 
4124     kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
4125 }
4126 
4127 static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4128 {
4129     struct kvm_memory_slot *slot = fault->slot;
4130     bool async;
4131 
4132     /*
4133      * Retry the page fault if the gfn hit a memslot that is being deleted
4134      * or moved.  This ensures any existing SPTEs for the old memslot will
4135      * be zapped before KVM inserts a new MMIO SPTE for the gfn.
4136      */
4137     if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
4138         return RET_PF_RETRY;
4139 
4140     if (!kvm_is_visible_memslot(slot)) {
4141         /* Don't expose private memslots to L2. */
4142         if (is_guest_mode(vcpu)) {
4143             fault->slot = NULL;
4144             fault->pfn = KVM_PFN_NOSLOT;
4145             fault->map_writable = false;
4146             return RET_PF_CONTINUE;
4147         }
4148         /*
4149          * If the APIC access page exists but is disabled, go directly
4150          * to emulation without caching the MMIO access or creating a
4151          * MMIO SPTE.  That way the cache doesn't need to be purged
4152          * when the AVIC is re-enabled.
4153          */
4154         if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
4155             !kvm_apicv_activated(vcpu->kvm))
4156             return RET_PF_EMULATE;
4157     }
4158 
4159     async = false;
4160     fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
4161                       fault->write, &fault->map_writable,
4162                       &fault->hva);
4163     if (!async)
4164         return RET_PF_CONTINUE; /* *pfn has correct page already */
4165 
4166     if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
4167         trace_kvm_try_async_get_page(fault->addr, fault->gfn);
4168         if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
4169             trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
4170             kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4171             return RET_PF_RETRY;
4172         } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
4173             return RET_PF_RETRY;
4174         }
4175     }
4176 
4177     fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
4178                       fault->write, &fault->map_writable,
4179                       &fault->hva);
4180     return RET_PF_CONTINUE;
4181 }
4182 
4183 /*
4184  * Returns true if the page fault is stale and needs to be retried, i.e. if the
4185  * root was invalidated by a memslot update or a relevant mmu_notifier fired.
4186  */
4187 static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
4188                 struct kvm_page_fault *fault, int mmu_seq)
4189 {
4190     struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
4191 
4192     /* Special roots, e.g. pae_root, are not backed by shadow pages. */
4193     if (sp && is_obsolete_sp(vcpu->kvm, sp))
4194         return true;
4195 
4196     /*
4197      * Roots without an associated shadow page are considered invalid if
4198      * there is a pending request to free obsolete roots.  The request is
4199      * only a hint that the current root _may_ be obsolete and needs to be
4200      * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
4201      * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
4202      * to reload even if no vCPU is actively using the root.
4203      */
4204     if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
4205         return true;
4206 
4207     return fault->slot &&
4208            mmu_invalidate_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
4209 }
4210 
4211 static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4212 {
4213     bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
4214 
4215     unsigned long mmu_seq;
4216     int r;
4217 
4218     fault->gfn = fault->addr >> PAGE_SHIFT;
4219     fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
4220 
4221     if (page_fault_handle_page_track(vcpu, fault))
4222         return RET_PF_EMULATE;
4223 
4224     r = fast_page_fault(vcpu, fault);
4225     if (r != RET_PF_INVALID)
4226         return r;
4227 
4228     r = mmu_topup_memory_caches(vcpu, false);
4229     if (r)
4230         return r;
4231 
4232     mmu_seq = vcpu->kvm->mmu_invalidate_seq;
4233     smp_rmb();
4234 
4235     r = kvm_faultin_pfn(vcpu, fault);
4236     if (r != RET_PF_CONTINUE)
4237         return r;
4238 
4239     r = handle_abnormal_pfn(vcpu, fault, ACC_ALL);
4240     if (r != RET_PF_CONTINUE)
4241         return r;
4242 
4243     r = RET_PF_RETRY;
4244 
4245     if (is_tdp_mmu_fault)
4246         read_lock(&vcpu->kvm->mmu_lock);
4247     else
4248         write_lock(&vcpu->kvm->mmu_lock);
4249 
4250     if (is_page_fault_stale(vcpu, fault, mmu_seq))
4251         goto out_unlock;
4252 
4253     r = make_mmu_pages_available(vcpu);
4254     if (r)
4255         goto out_unlock;
4256 
4257     if (is_tdp_mmu_fault)
4258         r = kvm_tdp_mmu_map(vcpu, fault);
4259     else
4260         r = __direct_map(vcpu, fault);
4261 
4262 out_unlock:
4263     if (is_tdp_mmu_fault)
4264         read_unlock(&vcpu->kvm->mmu_lock);
4265     else
4266         write_unlock(&vcpu->kvm->mmu_lock);
4267     kvm_release_pfn_clean(fault->pfn);
4268     return r;
4269 }
4270 
4271 static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
4272                 struct kvm_page_fault *fault)
4273 {
4274     pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code);
4275 
4276     /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
4277     fault->max_level = PG_LEVEL_2M;
4278     return direct_page_fault(vcpu, fault);
4279 }
4280 
4281 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4282                 u64 fault_address, char *insn, int insn_len)
4283 {
4284     int r = 1;
4285     u32 flags = vcpu->arch.apf.host_apf_flags;
4286 
4287 #ifndef CONFIG_X86_64
4288     /* A 64-bit CR2 should be impossible on 32-bit KVM. */
4289     if (WARN_ON_ONCE(fault_address >> 32))
4290         return -EFAULT;
4291 #endif
4292 
4293     vcpu->arch.l1tf_flush_l1d = true;
4294     if (!flags) {
4295         trace_kvm_page_fault(fault_address, error_code);
4296 
4297         if (kvm_event_needs_reinjection(vcpu))
4298             kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4299         r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4300                 insn_len);
4301     } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
4302         vcpu->arch.apf.host_apf_flags = 0;
4303         local_irq_disable();
4304         kvm_async_pf_task_wait_schedule(fault_address);
4305         local_irq_enable();
4306     } else {
4307         WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
4308     }
4309 
4310     return r;
4311 }
4312 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4313 
4314 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4315 {
4316     /*
4317      * If the guest's MTRRs may be used to compute the "real" memtype,
4318      * restrict the mapping level to ensure KVM uses a consistent memtype
4319      * across the entire mapping.  If the host MTRRs are ignored by TDP
4320      * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA
4321      * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype
4322      * from the guest's MTRRs so that guest accesses to memory that is
4323      * DMA'd aren't cached against the guest's wishes.
4324      *
4325      * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
4326      * e.g. KVM will force UC memtype for host MMIO.
4327      */
4328     if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
4329         for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
4330             int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
4331             gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1);
4332 
4333             if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
4334                 break;
4335         }
4336     }
4337 
4338     return direct_page_fault(vcpu, fault);
4339 }
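
/*
 * A minimal worked example for the MTRR consistency loop above, assuming
 * fault->max_level starts at PG_LEVEL_2M: page_num is 512, so base is the
 * faulting gfn rounded down to a 512-page (2MiB) boundary, i.e.
 * gfn & ~0x1ff.  If the guest's MTRRs assign different memtypes within
 * that 2MiB range, max_level drops to PG_LEVEL_4K so that a single huge
 * SPTE never spans conflicting memtypes.
 */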
4340 
4341 static void nonpaging_init_context(struct kvm_mmu *context)
4342 {
4343     context->page_fault = nonpaging_page_fault;
4344     context->gva_to_gpa = nonpaging_gva_to_gpa;
4345     context->sync_page = nonpaging_sync_page;
4346     context->invlpg = NULL;
4347 }
4348 
4349 static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
4350                   union kvm_mmu_page_role role)
4351 {
4352     return (role.direct || pgd == root->pgd) &&
4353            VALID_PAGE(root->hpa) &&
4354            role.word == to_shadow_page(root->hpa)->role.word;
4355 }
4356 
4357 /*
4358  * Find out if a previously cached root matching the new pgd/role is available,
4359  * and insert the current root as the MRU in the cache.
4360  * If a matching root is found, it is assigned to kvm_mmu->root and
4361  * true is returned.
4362  * If no match is found, kvm_mmu->root is left invalid, the LRU root is
4363  * evicted to make room for the current root, and false is returned.
4364  */
4365 static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
4366                           gpa_t new_pgd,
4367                           union kvm_mmu_page_role new_role)
4368 {
4369     uint i;
4370 
4371     if (is_root_usable(&mmu->root, new_pgd, new_role))
4372         return true;
4373 
4374     for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4375         /*
4376          * The swaps end up rotating the cache like this:
4377          *   C   0 1 2 3   (on entry to the function)
4378          *   0   C 1 2 3
4379          *   1   C 0 2 3
4380          *   2   C 0 1 3
4381          *   3   C 0 1 2   (on exit from the loop)
4382          */
4383         swap(mmu->root, mmu->prev_roots[i]);
4384         if (is_root_usable(&mmu->root, new_pgd, new_role))
4385             return true;
4386     }
4387 
4388     kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4389     return false;
4390 }
4391 
4392 /*
4393  * Find out if a previously cached root matching the new pgd/role is available.
4394  * On entry, mmu->root is invalid.
4395  * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
4396  * of the cache becomes invalid, and true is returned.
4397  * If no match is found, kvm_mmu->root is left invalid and false is returned.
4398  */
4399 static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
4400                          gpa_t new_pgd,
4401                          union kvm_mmu_page_role new_role)
4402 {
4403     uint i;
4404 
4405     for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
4406         if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
4407             goto hit;
4408 
4409     return false;
4410 
4411 hit:
4412     swap(mmu->root, mmu->prev_roots[i]);
4413     /* Bubble up the remaining roots.  */
4414     for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
4415         mmu->prev_roots[i] = mmu->prev_roots[i + 1];
4416     mmu->prev_roots[i].hpa = INVALID_PAGE;
4417     return true;
4418 }
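
/*
 * A minimal worked example of the "bubble up" above, assuming
 * KVM_MMU_NUM_PREV_ROOTS == 3 and cached previous roots [A, B, C] with a
 * hit on B (i == 1): after the swap the active root is B, the loop shifts
 * C left, and prev_roots ends up as [A, C, INVALID_PAGE], keeping the
 * cache densely packed with the freed slot invalidated.
 */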
4419 
4420 static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
4421                 gpa_t new_pgd, union kvm_mmu_page_role new_role)
4422 {
4423     /*
4424      * For now, limit the caching to 64-bit hosts+VMs in order to avoid
4425      * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
4426      * later if necessary.
4427      */
4428     if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa))
4429         kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4430 
4431     if (VALID_PAGE(mmu->root.hpa))
4432         return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
4433     else
4434         return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
4435 }
4436 
4437 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
4438 {
4439     struct kvm_mmu *mmu = vcpu->arch.mmu;
4440     union kvm_mmu_page_role new_role = mmu->root_role;
4441 
4442     if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) {
4443         /* kvm_mmu_ensure_valid_pgd will set up a new root.  */
4444         return;
4445     }
4446 
4447     /*
4448      * It's possible that the cached previous root page is obsolete because
4449      * of a change in the MMU generation number. However, changing the
4450      * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
4451      * which will free the root set here and allocate a new one.
4452      */
4453     kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
4454 
4455     if (force_flush_and_sync_on_reuse) {
4456         kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4457         kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
4458     }
4459 
4460     /*
4461      * The last MMIO access's GVA and GPA are cached in the VCPU. When
4462      * switching to a new CR3, that GVA->GPA mapping may no longer be
4463      * valid. So clear any cached MMIO info even when we don't need to sync
4464      * the shadow page tables.
4465      */
4466     vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4467 
4468     /*
4469      * If this is a direct root page, it doesn't have a write flooding
4470      * count. Otherwise, clear the write flooding count.
4471      */
4472     if (!new_role.direct)
4473         __clear_sp_write_flooding_count(
4474                 to_shadow_page(vcpu->arch.mmu->root.hpa));
4475 }
4476 EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
4477 
4478 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4479 {
4480     return kvm_read_cr3(vcpu);
4481 }
4482 
4483 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4484                unsigned int access)
4485 {
4486     if (unlikely(is_mmio_spte(*sptep))) {
4487         if (gfn != get_mmio_spte_gfn(*sptep)) {
4488             mmu_spte_clear_no_track(sptep);
4489             return true;
4490         }
4491 
4492         mark_mmio_spte(vcpu, sptep, gfn, access);
4493         return true;
4494     }
4495 
4496     return false;
4497 }
4498 
4499 #define PTTYPE_EPT 18 /* arbitrary */
4500 #define PTTYPE PTTYPE_EPT
4501 #include "paging_tmpl.h"
4502 #undef PTTYPE
4503 
4504 #define PTTYPE 64
4505 #include "paging_tmpl.h"
4506 #undef PTTYPE
4507 
4508 #define PTTYPE 32
4509 #include "paging_tmpl.h"
4510 #undef PTTYPE
4511 
4512 static void
4513 __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
4514             u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
4515             bool pse, bool amd)
4516 {
4517     u64 gbpages_bit_rsvd = 0;
4518     u64 nonleaf_bit8_rsvd = 0;
4519     u64 high_bits_rsvd;
4520 
4521     rsvd_check->bad_mt_xwr = 0;
4522 
4523     if (!gbpages)
4524         gbpages_bit_rsvd = rsvd_bits(7, 7);
4525 
4526     if (level == PT32E_ROOT_LEVEL)
4527         high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
4528     else
4529         high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4530 
4531     /* Note, NX doesn't exist in PDPTEs, this is handled below. */
4532     if (!nx)
4533         high_bits_rsvd |= rsvd_bits(63, 63);
4534 
4535     /*
4536      * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4537      * leaf entries) on AMD CPUs only.
4538      */
4539     if (amd)
4540         nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4541 
4542     switch (level) {
4543     case PT32_ROOT_LEVEL:
4544         /* no rsvd bits for 2 level 4K page table entries */
4545         rsvd_check->rsvd_bits_mask[0][1] = 0;
4546         rsvd_check->rsvd_bits_mask[0][0] = 0;
4547         rsvd_check->rsvd_bits_mask[1][0] =
4548             rsvd_check->rsvd_bits_mask[0][0];
4549 
4550         if (!pse) {
4551             rsvd_check->rsvd_bits_mask[1][1] = 0;
4552             break;
4553         }
4554 
4555         if (is_cpuid_PSE36())
4556             /* 36bits PSE 4MB page */
4557             rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4558         else
4559             /* 32 bits PSE 4MB page */
4560             rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4561         break;
4562     case PT32E_ROOT_LEVEL:
4563         rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
4564                            high_bits_rsvd |
4565                            rsvd_bits(5, 8) |
4566                            rsvd_bits(1, 2); /* PDPTE */
4567         rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;  /* PDE */
4568         rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;  /* PTE */
4569         rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4570                            rsvd_bits(13, 20);   /* large page */
4571         rsvd_check->rsvd_bits_mask[1][0] =
4572             rsvd_check->rsvd_bits_mask[0][0];
4573         break;
4574     case PT64_ROOT_5LEVEL:
4575         rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
4576                            nonleaf_bit8_rsvd |
4577                            rsvd_bits(7, 7);
4578         rsvd_check->rsvd_bits_mask[1][4] =
4579             rsvd_check->rsvd_bits_mask[0][4];
4580         fallthrough;
4581     case PT64_ROOT_4LEVEL:
4582         rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
4583                            nonleaf_bit8_rsvd |
4584                            rsvd_bits(7, 7);
4585         rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
4586                            gbpages_bit_rsvd;
4587         rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
4588         rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4589         rsvd_check->rsvd_bits_mask[1][3] =
4590             rsvd_check->rsvd_bits_mask[0][3];
4591         rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
4592                            gbpages_bit_rsvd |
4593                            rsvd_bits(13, 29);
4594         rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4595                            rsvd_bits(13, 20); /* large page */
4596         rsvd_check->rsvd_bits_mask[1][0] =
4597             rsvd_check->rsvd_bits_mask[0][0];
4598         break;
4599     }
4600 }
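
/*
 * A minimal worked example, assuming a 4-level guest with MAXPHYADDR == 36
 * and EFER.NX enabled, so that pa_bits_rsvd covers bits 36..63:
 *
 *   high_bits_rsvd                   = bits 36..51
 *   2MiB leaf PDE   (mask[1][1])     = bits 36..51 | bits 13..20
 *   1GiB leaf PDPTE (mask[1][2])     = bits 36..51 | bits 13..29
 *                                      (| bit 7 if the guest lacks GBPAGES)
 *
 * A guest PTE with any of these bits set is reported to the guest as a
 * reserved-bit page fault.
 */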
4601 
4602 static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
4603 {
4604     /*
4605      * If TDP is enabled, let the guest use GBPAGES if they're supported in
4606      * hardware.  The hardware page walker doesn't let KVM disable GBPAGES,
4607      * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
4608      * walk for performance and complexity reasons.  Not to mention KVM
4609      * _can't_ solve the problem because GVA->GPA walks aren't visible to
4610      * KVM once a TDP translation is installed.  Mimic hardware behavior so
4611      * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
4612      */
4613     return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
4614                  guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
4615 }
4616 
4617 static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4618                     struct kvm_mmu *context)
4619 {
4620     __reset_rsvds_bits_mask(&context->guest_rsvd_check,
4621                 vcpu->arch.reserved_gpa_bits,
4622                 context->cpu_role.base.level, is_efer_nx(context),
4623                 guest_can_use_gbpages(vcpu),
4624                 is_cr4_pse(context),
4625                 guest_cpuid_is_amd_or_hygon(vcpu));
4626 }
4627 
4628 static void
4629 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4630                 u64 pa_bits_rsvd, bool execonly, int huge_page_level)
4631 {
4632     u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4633     u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
4634     u64 bad_mt_xwr;
4635 
4636     if (huge_page_level < PG_LEVEL_1G)
4637         large_1g_rsvd = rsvd_bits(7, 7);
4638     if (huge_page_level < PG_LEVEL_2M)
4639         large_2m_rsvd = rsvd_bits(7, 7);
4640 
4641     rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
4642     rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
4643     rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
4644     rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
4645     rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4646 
4647     /* large page */
4648     rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4649     rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4650     rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
4651     rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
4652     rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4653 
4654     bad_mt_xwr = 0xFFull << (2 * 8);    /* bits 3..5 must not be 2 */
4655     bad_mt_xwr |= 0xFFull << (3 * 8);   /* bits 3..5 must not be 3 */
4656     bad_mt_xwr |= 0xFFull << (7 * 8);   /* bits 3..5 must not be 7 */
4657     bad_mt_xwr |= REPEAT_BYTE(1ull << 2);   /* bits 0..2 must not be 010 */
4658     bad_mt_xwr |= REPEAT_BYTE(1ull << 6);   /* bits 0..2 must not be 110 */
4659     if (!execonly) {
4660         /* bits 0..2 must not be 100 unless VMX capabilities allow it */
4661         bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4662     }
4663     rsvd_check->bad_mt_xwr = bad_mt_xwr;
4664 }
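
/*
 * A minimal worked example of the bad_mt_xwr encoding above: bit n of
 * bad_mt_xwr is set if an EPT leaf whose low 6 bits (memtype in bits 5:3,
 * XWR permissions in bits 2:0) equal n is an illegal combination.  E.g.
 * 0xFFull << (2 * 8) marks every XWR value with memtype 2 as bad, and
 * REPEAT_BYTE(1ull << 2) marks XWR == 010b (write-only) as bad for every
 * memtype.  The reserved-bit check then treats a PTE as malformed whenever
 * the bit indexed by its low 6 bits is set in bad_mt_xwr.
 */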
4665 
4666 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4667         struct kvm_mmu *context, bool execonly, int huge_page_level)
4668 {
4669     __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4670                     vcpu->arch.reserved_gpa_bits, execonly,
4671                     huge_page_level);
4672 }
4673 
4674 static inline u64 reserved_hpa_bits(void)
4675 {
4676     return rsvd_bits(shadow_phys_bits, 63);
4677 }
4678 
4679 /*
4680  * The page table on the host is the shadow page table for the page
4681  * table in the guest or an AMD nested guest; its MMU features completely
4682  * follow the features of the guest.
4683  */
4684 static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4685                     struct kvm_mmu *context)
4686 {
4687     /* @amd adds a check on bit 8 of SPTEs, which KVM shouldn't use anyways. */
4688     bool is_amd = true;
4689     /* KVM doesn't use 2-level page tables for the shadow MMU. */
4690     bool is_pse = false;
4691     struct rsvd_bits_validate *shadow_zero_check;
4692     int i;
4693 
4694     WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
4695 
4696     shadow_zero_check = &context->shadow_zero_check;
4697     __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4698                 context->root_role.level,
4699                 context->root_role.efer_nx,
4700                 guest_can_use_gbpages(vcpu), is_pse, is_amd);
4701 
4702     if (!shadow_me_mask)
4703         return;
4704 
4705     for (i = context->root_role.level; --i >= 0;) {
4706         /*
4707          * So far shadow_me_value is a constant during KVM's life
4708          * time.  Bits in shadow_me_value are allowed to be set.
4709          * Bits in shadow_me_mask but not in shadow_me_value are
4710          * not allowed to be set.
4711          */
4712         shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask;
4713         shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask;
4714         shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value;
4715         shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value;
4716     }
4717 
4718 }
4719 
4720 static inline bool boot_cpu_is_amd(void)
4721 {
4722     WARN_ON_ONCE(!tdp_enabled);
4723     return shadow_x_mask == 0;
4724 }
4725 
4726 /*
4727  * The direct page table on the host uses as many MMU features as
4728  * possible; however, KVM currently does not do execution-protection.
4729  */
4730 static void
4731 reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
4732 {
4733     struct rsvd_bits_validate *shadow_zero_check;
4734     int i;
4735 
4736     shadow_zero_check = &context->shadow_zero_check;
4737 
4738     if (boot_cpu_is_amd())
4739         __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4740                     context->root_role.level, true,
4741                     boot_cpu_has(X86_FEATURE_GBPAGES),
4742                     false, true);
4743     else
4744         __reset_rsvds_bits_mask_ept(shadow_zero_check,
4745                         reserved_hpa_bits(), false,
4746                         max_huge_page_level);
4747 
4748     if (!shadow_me_mask)
4749         return;
4750 
4751     for (i = context->root_role.level; --i >= 0;) {
4752         shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4753         shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4754     }
4755 }
4756 
4757 /*
4758  * As the comments in reset_shadow_zero_bits_mask(), except this is
4759  * the shadow page table for an Intel nested guest (EPT).
4760  */
4761 static void
4762 reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
4763 {
4764     __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4765                     reserved_hpa_bits(), execonly,
4766                     max_huge_page_level);
4767 }
4768 
4769 #define BYTE_MASK(access) \
4770     ((1 & (access) ? 2 : 0) | \
4771      (2 & (access) ? 4 : 0) | \
4772      (3 & (access) ? 8 : 0) | \
4773      (4 & (access) ? 16 : 0) | \
4774      (5 & (access) ? 32 : 0) | \
4775      (6 & (access) ? 64 : 0) | \
4776      (7 & (access) ? 128 : 0))
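
/*
 * A minimal worked expansion of BYTE_MASK, assuming ACC_EXEC_MASK == 1,
 * ACC_WRITE_MASK == 2 and ACC_USER_MASK == 4: bit i (i = 0..7) of the
 * result is set iff the 3-bit uwx combination i includes the access bit:
 *
 *   BYTE_MASK(ACC_EXEC_MASK)  == 0xaa   (bits 1,3,5,7: combinations with X)
 *   BYTE_MASK(ACC_WRITE_MASK) == 0xcc   (bits 2,3,6,7: combinations with W)
 *   BYTE_MASK(ACC_USER_MASK)  == 0xf0   (bits 4,5,6,7: combinations with U)
 */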
4777 
4778 
4779 static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
4780 {
4781     unsigned byte;
4782 
4783     const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4784     const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4785     const u8 u = BYTE_MASK(ACC_USER_MASK);
4786 
4787     bool cr4_smep = is_cr4_smep(mmu);
4788     bool cr4_smap = is_cr4_smap(mmu);
4789     bool cr0_wp = is_cr0_wp(mmu);
4790     bool efer_nx = is_efer_nx(mmu);
4791 
4792     for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4793         unsigned pfec = byte << 1;
4794 
4795         /*
4796          * Each "*f" variable has a 1 bit for each UWX value
4797          * that causes a fault with the given PFEC.
4798          */
4799 
4800         /* Faults from writes to non-writable pages */
4801         u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
4802         /* Faults from user mode accesses to supervisor pages */
4803         u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
4804         /* Faults from fetches of non-executable pages */
4805         u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
4806         /* Faults from kernel mode fetches of user pages */
4807         u8 smepf = 0;
4808         /* Faults from kernel mode accesses of user pages */
4809         u8 smapf = 0;
4810 
4811         if (!ept) {
4812             /* Faults from kernel mode accesses to user pages */
4813             u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4814 
4815             /* Not really needed: !nx will cause pte.nx to fault */
4816             if (!efer_nx)
4817                 ff = 0;
4818 
4819             /* Allow supervisor writes if !cr0.wp */
4820             if (!cr0_wp)
4821                 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4822 
4823             /* Disallow supervisor fetches of user code if cr4.smep */
4824             if (cr4_smep)
4825                 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4826 
4827             /*
4828              * SMAP: kernel-mode data accesses from user-mode
4829              * mappings should fault. A fault is considered
4830              * a SMAP violation if all of the following
4831              * conditions are true:
4832              *   - X86_CR4_SMAP is set in CR4
4833              *   - A user page is accessed
4834              *   - The access is not a fetch
4835              *   - The access is supervisor mode
4836              *   - The access is an implicit supervisor access, or X86_EFLAGS_AC is clear
4837              *
4838              * Here, we cover the first four conditions.
4839              * The fifth is computed dynamically in permission_fault();
4840              * PFERR_RSVD_MASK bit will be set in PFEC if the access is
4841              * *not* subject to SMAP restrictions.
4842              */
4843             if (cr4_smap)
4844                 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4845         }
4846 
4847         mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
4848     }
4849 }
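
/*
 * Illustrative sketch, not part of the original source: roughly how the
 * cached byte is consumed at fault time.  A permission_fault()-style check
 * indexes permissions[] with the error code (sans the P bit) and then tests
 * the bit for the page's UWX combination; the real check also folds the
 * SMAP/EFLAGS.AC adjustment into the error code first.  The helper name and
 * parameters below are hypothetical.
 */
#if 0	/* illustrative only, not compiled */
static bool access_would_fault(struct kvm_mmu *mmu, unsigned pte_access,
			       unsigned pfec)
{
	/* pte_access is the pte's UWX combination, i.e. a value in 0..7. */
	return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
}
#endif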
4850 
4851 /*
4852  * PKU is an additional mechanism by which paging controls access to
4853  * user-mode addresses based on the value in the PKRU register.  Protection
4854  * key violations are reported through a bit in the page fault error code.
4855  * Unlike other bits of the error code, the PK bit is not known at the
4856  * call site of e.g. gva_to_gpa; it must be computed directly in
4857  * permission_fault based on two bits of PKRU, on some machine state (CR4,
4858  * CR0, EFER, CPL), and on other bits of the error code and the page tables.
4859  *
4860  * In particular the following conditions come from the error code, the
4861  * page tables and the machine state:
4862  * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
4863  * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
4864  * - PK is always zero if U=0 in the page tables
4865  * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
4866  *
4867  * The PKRU bitmask caches the result of these four conditions.  The error
4868  * code (minus the P bit) and the page table's U bit form an index into the
4869  * PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
4870  * with the two bits of the PKRU register corresponding to the protection key.
4871  * For the first three conditions above the bits will be 00, thus masking
4872  * away both AD and WD.  For all reads, or if the last condition holds, only
4873  * WD will be masked away.
4874  */
4875 static void update_pkru_bitmask(struct kvm_mmu *mmu)
4876 {
4877     unsigned bit;
4878     bool wp;
4879 
4880     mmu->pkru_mask = 0;
4881 
4882     if (!is_cr4_pke(mmu))
4883         return;
4884 
4885     wp = is_cr0_wp(mmu);
4886 
4887     for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4888         unsigned pfec, pkey_bits;
4889         bool check_pkey, check_write, ff, uf, wf, pte_user;
4890 
4891         pfec = bit << 1;
4892         ff = pfec & PFERR_FETCH_MASK;
4893         uf = pfec & PFERR_USER_MASK;
4894         wf = pfec & PFERR_WRITE_MASK;
4895 
4896         /* PFEC.RSVD is replaced by ACC_USER_MASK. */
4897         pte_user = pfec & PFERR_RSVD_MASK;
4898 
4899         /*
4900          * Only need to check accesses that are not instruction
4901          * fetches and that target a user page.
4902          */
4903         check_pkey = (!ff && pte_user);
4904         /*
4905          * write access is controlled by PKRU if it is a
4906          * user access or CR0.WP = 1.
4907          */
4908         check_write = check_pkey && wf && (uf || wp);
4909 
4910         /* PKRU.AD stops both read and write access. */
4911         pkey_bits = !!check_pkey;
4912         /* PKRU.WD stops write access. */
4913         pkey_bits |= (!!check_write) << 1;
4914 
4915         mmu->pkru_mask |= (pkey_bits & 3) << pfec;
4916     }
4917 }
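
/*
 * Illustrative sketch, not part of the original source: roughly how the
 * cached pkru_mask is consumed at fault time.  The error code (minus the P
 * bit), with the pte's U bit folded into the PFEC.RSVD position, selects two
 * bits of pkru_mask; those are ANDed with the AD/WD bits of PKRU for the
 * page's protection key.  The helper name and parameters are hypothetical.
 */
#if 0	/* illustrative only, not compiled */
static bool pkey_would_fault(struct kvm_mmu *mmu, u32 pkru,
			     unsigned pte_access, unsigned pte_pkey,
			     unsigned pfec)
{
	/* Two PKRU bits (AD, WD) per protection key. */
	u32 pkru_bits = (pkru >> (pte_pkey * 2)) & 3;
	u32 offset = (pfec & ~1) +
		     ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));

	return (pkru_bits & (mmu->pkru_mask >> offset)) != 0;
}
#endif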
4918 
4919 static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
4920                     struct kvm_mmu *mmu)
4921 {
4922     if (!is_cr0_pg(mmu))
4923         return;
4924 
4925     reset_guest_rsvds_bits_mask(vcpu, mmu);
4926     update_permission_bitmask(mmu, false);
4927     update_pkru_bitmask(mmu);
4928 }
4929 
4930 static void paging64_init_context(struct kvm_mmu *context)
4931 {
4932     context->page_fault = paging64_page_fault;
4933     context->gva_to_gpa = paging64_gva_to_gpa;
4934     context->sync_page = paging64_sync_page;
4935     context->invlpg = paging64_invlpg;
4936 }
4937 
4938 static void paging32_init_context(struct kvm_mmu *context)
4939 {
4940     context->page_fault = paging32_page_fault;
4941     context->gva_to_gpa = paging32_gva_to_gpa;
4942     context->sync_page = paging32_sync_page;
4943     context->invlpg = paging32_invlpg;
4944 }
4945 
4946 static union kvm_cpu_role
4947 kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
4948 {
4949     union kvm_cpu_role role = {0};
4950 
4951     role.base.access = ACC_ALL;
4952     role.base.smm = is_smm(vcpu);
4953     role.base.guest_mode = is_guest_mode(vcpu);
4954     role.ext.valid = 1;
4955 
4956     if (!____is_cr0_pg(regs)) {
4957         role.base.direct = 1;
4958         return role;
4959     }
4960 
4961     role.base.efer_nx = ____is_efer_nx(regs);
4962     role.base.cr0_wp = ____is_cr0_wp(regs);
4963     role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
4964     role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
4965     role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
4966 
4967     if (____is_efer_lma(regs))
4968         role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL
4969                             : PT64_ROOT_4LEVEL;
4970     else if (____is_cr4_pae(regs))
4971         role.base.level = PT32E_ROOT_LEVEL;
4972     else
4973         role.base.level = PT32_ROOT_LEVEL;
4974 
4975     role.ext.cr4_smep = ____is_cr4_smep(regs);
4976     role.ext.cr4_smap = ____is_cr4_smap(regs);
4977     role.ext.cr4_pse = ____is_cr4_pse(regs);
4978 
4979     /* PKEY and LA57 are active iff long mode is active. */
4980     role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
4981     role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
4982     role.ext.efer_lma = ____is_efer_lma(regs);
4983     return role;
4984 }
4985 
4986 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
4987 {
4988     /* tdp_root_level is the architecture-forced level; use it if nonzero. */
4989     if (tdp_root_level)
4990         return tdp_root_level;
4991 
4992     /* Use 5-level TDP if and only if it's useful/necessary. */
4993     if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
4994         return 4;
4995 
4996     return max_tdp_level;
4997 }
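
/*
 * Illustrative note, not part of the original source: e.g. if max_tdp_level
 * is 5 but the guest's MAXPHYADDR is 48 or less, 4-level TDP already covers
 * every legal guest physical address (4-level paging translates 48 bits), so
 * the fifth level would only add a page-walk step and 4 is returned instead.
 */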
4998 
4999 static union kvm_mmu_page_role
5000 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
5001                 union kvm_cpu_role cpu_role)
5002 {
5003     union kvm_mmu_page_role role = {0};
5004 
5005     role.access = ACC_ALL;
5006     role.cr0_wp = true;
5007     role.efer_nx = true;
5008     role.smm = cpu_role.base.smm;
5009     role.guest_mode = cpu_role.base.guest_mode;
5010     role.ad_disabled = !kvm_ad_enabled();
5011     role.level = kvm_mmu_get_tdp_level(vcpu);
5012     role.direct = true;
5013     role.has_4_byte_gpte = false;
5014 
5015     return role;
5016 }
5017 
5018 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
5019                  union kvm_cpu_role cpu_role)
5020 {
5021     struct kvm_mmu *context = &vcpu->arch.root_mmu;
5022     union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
5023 
5024     if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
5025         root_role.word == context->root_role.word)
5026         return;
5027 
5028     context->cpu_role.as_u64 = cpu_role.as_u64;
5029     context->root_role.word = root_role.word;
5030     context->page_fault = kvm_tdp_page_fault;
5031     context->sync_page = nonpaging_sync_page;
5032     context->invlpg = NULL;
5033     context->get_guest_pgd = get_cr3;
5034     context->get_pdptr = kvm_pdptr_read;
5035     context->inject_page_fault = kvm_inject_page_fault;
5036 
5037     if (!is_cr0_pg(context))
5038         context->gva_to_gpa = nonpaging_gva_to_gpa;
5039     else if (is_cr4_pae(context))
5040         context->gva_to_gpa = paging64_gva_to_gpa;
5041     else
5042         context->gva_to_gpa = paging32_gva_to_gpa;
5043 
5044     reset_guest_paging_metadata(vcpu, context);
5045     reset_tdp_shadow_zero_bits_mask(context);
5046 }
5047 
5048 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
5049                     union kvm_cpu_role cpu_role,
5050                     union kvm_mmu_page_role root_role)
5051 {
5052     if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
5053         root_role.word == context->root_role.word)
5054         return;
5055 
5056     context->cpu_role.as_u64 = cpu_role.as_u64;
5057     context->root_role.word = root_role.word;
5058 
5059     if (!is_cr0_pg(context))
5060         nonpaging_init_context(context);
5061     else if (is_cr4_pae(context))
5062         paging64_init_context(context);
5063     else
5064         paging32_init_context(context);
5065 
5066     reset_guest_paging_metadata(vcpu, context);
5067     reset_shadow_zero_bits_mask(vcpu, context);
5068 }
5069 
5070 static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
5071                 union kvm_cpu_role cpu_role)
5072 {
5073     struct kvm_mmu *context = &vcpu->arch.root_mmu;
5074     union kvm_mmu_page_role root_role;
5075 
5076     root_role = cpu_role.base;
5077 
5078     /* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
5079     root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
5080 
5081     /*
5082      * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
5083      * KVM uses NX when TDP is disabled to handle a variety of scenarios,
5084      * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
5085      * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
5086      * The iTLB multi-hit workaround can be toggled at any time, so assume
5087      * NX can be used by any non-nested shadow MMU to avoid having to reset
5088      * MMU contexts.
5089      */
5090     root_role.efer_nx = true;
5091 
5092     shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
5093 }
5094 
5095 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
5096                  unsigned long cr4, u64 efer, gpa_t nested_cr3)
5097 {
5098     struct kvm_mmu *context = &vcpu->arch.guest_mmu;
5099     struct kvm_mmu_role_regs regs = {
5100         .cr0 = cr0,
5101         .cr4 = cr4 & ~X86_CR4_PKE,
5102         .efer = efer,
5103     };
5104     union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
5105     union kvm_mmu_page_role root_role;
5106 
5107     /* NPT requires CR0.PG=1. */
5108     WARN_ON_ONCE(cpu_role.base.direct);
5109 
5110     root_role = cpu_role.base;
5111     root_role.level = kvm_mmu_get_tdp_level(vcpu);
5112     if (root_role.level == PT64_ROOT_5LEVEL &&
5113         cpu_role.base.level == PT64_ROOT_4LEVEL)
5114         root_role.passthrough = 1;
5115 
5116     shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
5117     kvm_mmu_new_pgd(vcpu, nested_cr3);
5118 }
5119 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
5120 
5121 static union kvm_cpu_role
5122 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
5123                    bool execonly, u8 level)
5124 {
5125     union kvm_cpu_role role = {0};
5126 
5127     /*
5128      * KVM does not support SMM transfer monitors, and consequently does not
5129      * support the "entry to SMM" control either.  role.base.smm is always 0.
5130      */
5131     WARN_ON_ONCE(is_smm(vcpu));
5132     role.base.level = level;
5133     role.base.has_4_byte_gpte = false;
5134     role.base.direct = false;
5135     role.base.ad_disabled = !accessed_dirty;
5136     role.base.guest_mode = true;
5137     role.base.access = ACC_ALL;
5138 
5139     role.ext.word = 0;
5140     role.ext.execonly = execonly;
5141     role.ext.valid = 1;
5142 
5143     return role;
5144 }
5145 
5146 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
5147                  int huge_page_level, bool accessed_dirty,
5148                  gpa_t new_eptp)
5149 {
5150     struct kvm_mmu *context = &vcpu->arch.guest_mmu;
5151     u8 level = vmx_eptp_page_walk_level(new_eptp);
5152     union kvm_cpu_role new_mode =
5153         kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
5154                            execonly, level);
5155 
5156     if (new_mode.as_u64 != context->cpu_role.as_u64) {
5157         /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
5158         context->cpu_role.as_u64 = new_mode.as_u64;
5159         context->root_role.word = new_mode.base.word;
5160 
5161         context->page_fault = ept_page_fault;
5162         context->gva_to_gpa = ept_gva_to_gpa;
5163         context->sync_page = ept_sync_page;
5164         context->invlpg = ept_invlpg;
5165 
5166         update_permission_bitmask(context, true);
5167         context->pkru_mask = 0;
5168         reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
5169         reset_ept_shadow_zero_bits_mask(context, execonly);
5170     }
5171 
5172     kvm_mmu_new_pgd(vcpu, new_eptp);
5173 }
5174 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
5175 
5176 static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
5177                  union kvm_cpu_role cpu_role)
5178 {
5179     struct kvm_mmu *context = &vcpu->arch.root_mmu;
5180 
5181     kvm_init_shadow_mmu(vcpu, cpu_role);
5182 
5183     context->get_guest_pgd     = get_cr3;
5184     context->get_pdptr         = kvm_pdptr_read;
5185     context->inject_page_fault = kvm_inject_page_fault;
5186 }
5187 
5188 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
5189                 union kvm_cpu_role new_mode)
5190 {
5191     struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5192 
5193     if (new_mode.as_u64 == g_context->cpu_role.as_u64)
5194         return;
5195 
5196     g_context->cpu_role.as_u64   = new_mode.as_u64;
5197     g_context->get_guest_pgd     = get_cr3;
5198     g_context->get_pdptr         = kvm_pdptr_read;
5199     g_context->inject_page_fault = kvm_inject_page_fault;
5200 
5201     /*
5202      * L2 page tables are never shadowed, so there is no need to sync
5203      * SPTEs.
5204      */
5205     g_context->invlpg            = NULL;
5206 
5207     /*
5208      * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
5209      * L1's nested page tables (e.g. EPT12). The nested translation
5210      * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
5211      * L2's page tables as the first level of translation and L1's
5212      * nested page tables as the second level of translation. Basically
5213      * the gva_to_gpa functions between mmu and nested_mmu are swapped.
5214      */
5215     if (!is_paging(vcpu))
5216         g_context->gva_to_gpa = nonpaging_gva_to_gpa;
5217     else if (is_long_mode(vcpu))
5218         g_context->gva_to_gpa = paging64_gva_to_gpa;
5219     else if (is_pae(vcpu))
5220         g_context->gva_to_gpa = paging64_gva_to_gpa;
5221     else
5222         g_context->gva_to_gpa = paging32_gva_to_gpa;
5223 
5224     reset_guest_paging_metadata(vcpu, g_context);
5225 }
5226 
5227 void kvm_init_mmu(struct kvm_vcpu *vcpu)
5228 {
5229     struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
5230     union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
5231 
5232     if (mmu_is_nested(vcpu))
5233         init_kvm_nested_mmu(vcpu, cpu_role);
5234     else if (tdp_enabled)
5235         init_kvm_tdp_mmu(vcpu, cpu_role);
5236     else
5237         init_kvm_softmmu(vcpu, cpu_role);
5238 }
5239 EXPORT_SYMBOL_GPL(kvm_init_mmu);
5240 
5241 void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
5242 {
5243     /*
5244      * Invalidate all MMU roles to force them to reinitialize as CPUID
5245      * information is factored into reserved bit calculations.
5246      *
5247      * Correctly handling multiple vCPU models (with respect to paging and
5248      * physical address properties) in a single VM would require tracking
5249      * all relevant CPUID information in kvm_mmu_page_role. That is very
5250      * undesirable as it would increase the memory requirements for
5251      * gfn_track (see struct kvm_mmu_page_role comments).  For now that
5252      * problem is swept under the rug; KVM's CPUID API is horrific and
5253      * it's all but impossible to solve it without introducing a new API.
5254      */
5255     vcpu->arch.root_mmu.root_role.word = 0;
5256     vcpu->arch.guest_mmu.root_role.word = 0;
5257     vcpu->arch.nested_mmu.root_role.word = 0;
5258     vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
5259     vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
5260     vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
5261     kvm_mmu_reset_context(vcpu);
5262 
5263     /*
5264      * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
5265      * kvm_arch_vcpu_ioctl().
5266      */
5267     KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
5268 }
5269 
5270 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5271 {
5272     kvm_mmu_unload(vcpu);
5273     kvm_init_mmu(vcpu);
5274 }
5275 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5276 
5277 int kvm_mmu_load(struct kvm_vcpu *vcpu)
5278 {
5279     int r;
5280 
5281     r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
5282     if (r)
5283         goto out;
5284     r = mmu_alloc_special_roots(vcpu);
5285     if (r)
5286         goto out;
5287     if (vcpu->arch.mmu->root_role.direct)
5288         r = mmu_alloc_direct_roots(vcpu);
5289     else
5290         r = mmu_alloc_shadow_roots(vcpu);
5291     if (r)
5292         goto out;
5293 
5294     kvm_mmu_sync_roots(vcpu);
5295 
5296     kvm_mmu_load_pgd(vcpu);
5297 
5298     /*
5299      * Flush any TLB entries for the new root, the provenance of the root
5300      * is unknown.  Even if KVM ensures there are no stale TLB entries
5301      * for a freed root, in theory another hypervisor could have left
5302      * stale entries.  Flushing on alloc also allows KVM to skip the TLB
5303      * flush when freeing a root (see kvm_tdp_mmu_put_root()).
5304      */
5305     static_call(kvm_x86_flush_tlb_current)(vcpu);
5306 out:
5307     return r;
5308 }
5309 
5310 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5311 {
5312     struct kvm *kvm = vcpu->kvm;
5313 
5314     kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5315     WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
5316     kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5317     WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
5318     vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
5319 }
5320 
5321 static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
5322 {
5323     struct kvm_mmu_page *sp;
5324 
5325     if (!VALID_PAGE(root_hpa))
5326         return false;
5327 
5328     /*
5329      * When freeing obsolete roots, treat roots as obsolete if they don't
5330      * have an associated shadow page.  This does mean KVM will get false
5331      * positives and free roots that don't strictly need to be freed, but
5332      * such false positives are relatively rare:
5333      *
5334      *  (a) only PAE paging and nested NPT have roots without shadow pages
5335      *  (b) remote reloads due to a memslot update obsolete _all_ roots
5336      *  (c) KVM doesn't track previous roots for PAE paging, and the guest
5337      *      is unlikely to zap an in-use PGD.
5338      */
5339     sp = to_shadow_page(root_hpa);
5340     return !sp || is_obsolete_sp(kvm, sp);
5341 }
5342 
5343 static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
5344 {
5345     unsigned long roots_to_free = 0;
5346     int i;
5347 
5348     if (is_obsolete_root(kvm, mmu->root.hpa))
5349         roots_to_free |= KVM_MMU_ROOT_CURRENT;
5350 
5351     for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5352         if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
5353             roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5354     }
5355 
5356     if (roots_to_free)
5357         kvm_mmu_free_roots(kvm, mmu, roots_to_free);
5358 }
5359 
5360 void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
5361 {
5362     __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
5363     __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
5364 }
5365 
5366 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5367                     int *bytes)
5368 {
5369     u64 gentry = 0;
5370     int r;
5371 
5372     /*
5373      * Assume that the pte write is on a page table of the same type
5374      * as the current vcpu paging mode, since we update the sptes only
5375      * when they have the same mode.
5376      */
5377     if (is_pae(vcpu) && *bytes == 4) {
5378         /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
5379         *gpa &= ~(gpa_t)7;
5380         *bytes = 8;
5381     }
5382 
5383     if (*bytes == 4 || *bytes == 8) {
5384         r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5385         if (r)
5386             gentry = 0;
5387     }
5388 
5389     return gentry;
5390 }
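
/*
 * Illustrative example, not part of the original source: a PAE or 64-bit
 * guest that updates a gpte with two 4-byte writes hits this path twice;
 * e.g. a 4-byte write at gpa 0x1004 is widened to gpa 0x1000 / 8 bytes so
 * that the full, current 64-bit gpte is fetched atomically rather than a
 * stale half.
 */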
5391 
5392 /*
5393  * If we're seeing too many writes to a page, it may no longer be a page table,
5394  * or we may be forking, in which case it is better to unmap the page.
5395  */
5396 static bool detect_write_flooding(struct kvm_mmu_page *sp)
5397 {
5398     /*
5399      * Skip write-flooding detection for an sp whose level is 1, because
5400      * it can become unsync, in which case the guest page is not write-protected.
5401      */
5402     if (sp->role.level == PG_LEVEL_4K)
5403         return false;
5404 
5405     atomic_inc(&sp->write_flooding_count);
5406     return atomic_read(&sp->write_flooding_count) >= 3;
5407 }
5408 
5409 /*
5410  * Misaligned accesses are too much trouble to fix up; also, they usually
5411  * indicate a page is not used as a page table.
5412  */
5413 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5414                     int bytes)
5415 {
5416     unsigned offset, pte_size, misaligned;
5417 
5418     pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5419          gpa, bytes, sp->role.word);
5420 
5421     offset = offset_in_page(gpa);
5422     pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
5423 
5424     /*
5425      * Sometimes the OS only writes the last byte to update status
5426      * bits; for example, Linux uses the andb instruction in clear_bit().
5427      */
5428     if (!(offset & (pte_size - 1)) && bytes == 1)
5429         return false;
5430 
5431     misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5432     misaligned |= bytes < 4;
5433 
5434     return misaligned;
5435 }
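
/*
 * Illustrative example, not part of the original source: with 8-byte gptes,
 * an 8-byte write at page offset 4 gives (4 ^ (4 + 8 - 1)) & ~7 = 0xf & ~7
 * = 0x8, i.e. the write straddles two gptes and is treated as misaligned.
 * An aligned 8-byte write at offset 8 gives (8 ^ 15) & ~7 = 0 with bytes >= 4,
 * so it is not flagged.
 */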
5436 
5437 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5438 {
5439     unsigned page_offset, quadrant;
5440     u64 *spte;
5441     int level;
5442 
5443     page_offset = offset_in_page(gpa);
5444     level = sp->role.level;
5445     *nspte = 1;
5446     if (sp->role.has_4_byte_gpte) {
5447         page_offset <<= 1;  /* 32->64 */
5448         /*
5449          * A 32-bit pde maps 4MB while the shadow pdes map
5450          * only 2MB.  So we need to double the offset again
5451          * and zap two pdes instead of one.
5452          */
5453         if (level == PT32_ROOT_LEVEL) {
5454             page_offset &= ~7; /* kill rounding error */
5455             page_offset <<= 1;
5456             *nspte = 2;
5457         }
5458         quadrant = page_offset >> PAGE_SHIFT;
5459         page_offset &= ~PAGE_MASK;
5460         if (quadrant != sp->role.quadrant)
5461             return NULL;
5462     }
5463 
5464     spte = &sp->spt[page_offset / sizeof(*spte)];
5465     return spte;
5466 }
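
/*
 * Illustrative example, not part of the original source: for a non-root sp
 * with 4-byte gptes, a write at guest page offset 0x804 is doubled to 0x1008,
 * giving quadrant 1 and spte offset 0x008, i.e. spte index 1 of the sp that
 * shadows the upper half of the guest page.  Each 4KiB guest page of 1024
 * 4-byte gptes is thus shadowed by two sps of 512 8-byte sptes each.
 */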
5467 
5468 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
5469                   const u8 *new, int bytes,
5470                   struct kvm_page_track_notifier_node *node)
5471 {
5472     gfn_t gfn = gpa >> PAGE_SHIFT;
5473     struct kvm_mmu_page *sp;
5474     LIST_HEAD(invalid_list);
5475     u64 entry, gentry, *spte;
5476     int npte;
5477     bool flush = false;
5478 
5479     /*
5480      * If we don't have indirect shadow pages, it means no page is
5481      * write-protected, so we can simply exit.
5482      */
5483     if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5484         return;
5485 
5486     pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5487 
5488     write_lock(&vcpu->kvm->mmu_lock);
5489 
5490     gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5491 
5492     ++vcpu->kvm->stat.mmu_pte_write;
5493 
5494     for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) {
5495         if (detect_write_misaligned(sp, gpa, bytes) ||
5496               detect_write_flooding(sp)) {
5497             kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5498             ++vcpu->kvm->stat.mmu_flooded;
5499             continue;
5500         }
5501 
5502         spte = get_written_sptes(sp, gpa, &npte);
5503         if (!spte)
5504             continue;
5505 
5506         while (npte--) {
5507             entry = *spte;
5508             mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
5509             if (gentry && sp->role.level != PG_LEVEL_4K)
5510                 ++vcpu->kvm->stat.mmu_pde_zapped;
5511             if (is_shadow_present_pte(entry))
5512                 flush = true;
5513             ++spte;
5514         }
5515     }
5516     kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
5517     write_unlock(&vcpu->kvm->mmu_lock);
5518 }
5519 
5520 int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
5521                void *insn, int insn_len)
5522 {
5523     int r, emulation_type = EMULTYPE_PF;
5524     bool direct = vcpu->arch.mmu->root_role.direct;
5525 
5526     if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
5527         return RET_PF_RETRY;
5528 
5529     r = RET_PF_INVALID;
5530     if (unlikely(error_code & PFERR_RSVD_MASK)) {
5531         r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
5532         if (r == RET_PF_EMULATE)
5533             goto emulate;
5534     }
5535 
5536     if (r == RET_PF_INVALID) {
5537         r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
5538                       lower_32_bits(error_code), false);
5539         if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
5540             return -EIO;
5541     }
5542 
5543     if (r < 0)
5544         return r;
5545     if (r != RET_PF_EMULATE)
5546         return 1;
5547 
5548     /*
5549      * Before emulating the instruction, check if the error code
5550      * was due to a RO violation while translating the guest page.
5551      * This can occur when using nested virtualization with nested
5552      * paging in both guests. If true, we simply unprotect the page
5553      * and resume the guest.
5554      */
5555     if (vcpu->arch.mmu->root_role.direct &&
5556         (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5557         kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
5558         return 1;
5559     }
5560 
5561     /*
5562      * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
5563      * optimistically try to just unprotect the page and let the processor
5564      * re-execute the instruction that caused the page fault.  Do not allow
5565      * retrying MMIO emulation, as it's not only pointless but could also
5566      * cause us to enter an infinite loop because the processor will keep
5567      * faulting on the non-existent MMIO address.  Retrying an instruction
5568      * from a nested guest is also pointless and dangerous as we are only
5569      * explicitly shadowing L1's page tables, i.e. unprotecting something
5570      * for L1 isn't going to magically fix whatever issue caused L2 to fail.
5571      */
5572     if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
5573         emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
5574 emulate:
5575     return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
5576                        insn_len);
5577 }
5578 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5579 
5580 void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
5581                 gva_t gva, hpa_t root_hpa)
5582 {
5583     int i;
5584 
5585     /* It's actually a GPA for vcpu->arch.guest_mmu.  */
5586     if (mmu != &vcpu->arch.guest_mmu) {
5587         /* INVLPG on a non-canonical address is a NOP according to the SDM.  */
5588         if (is_noncanonical_address(gva, vcpu))
5589             return;
5590 
5591         static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
5592     }
5593 
5594     if (!mmu->invlpg)
5595         return;
5596 
5597     if (root_hpa == INVALID_PAGE) {
5598         mmu->invlpg(vcpu, gva, mmu->root.hpa);
5599 
5600         /*
5601          * INVLPG is required to invalidate any global mappings for the VA,
5602          * irrespective of PCID. It would take roughly the same amount of
5603          * work to determine whether any of the prev_root mappings of the VA
5604          * is marked global as it would to just sync it blindly, so we might
5605          * as well always sync it.
5606          *
5607          * Mappings not reachable via the current cr3 or the prev_roots will be
5608          * synced when switching to that cr3, so nothing needs to be done here
5609          * for them.
5610          */
5611         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5612             if (VALID_PAGE(mmu->prev_roots[i].hpa))
5613                 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5614     } else {
5615         mmu->invlpg(vcpu, gva, root_hpa);
5616     }
5617 }
5618 
5619 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5620 {
5621     kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
5622     ++vcpu->stat.invlpg;
5623 }
5624 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5625 
5626 
5627 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5628 {
5629     struct kvm_mmu *mmu = vcpu->arch.mmu;
5630     bool tlb_flush = false;
5631     uint i;
5632 
5633     if (pcid == kvm_get_active_pcid(vcpu)) {
5634         if (mmu->invlpg)
5635             mmu->invlpg(vcpu, gva, mmu->root.hpa);
5636         tlb_flush = true;
5637     }
5638 
5639     for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5640         if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5641             pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
5642             if (mmu->invlpg)
5643                 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5644             tlb_flush = true;
5645         }
5646     }
5647 
5648     if (tlb_flush)
5649         static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
5650 
5651     ++vcpu->stat.invlpg;
5652 
5653     /*
5654      * Mappings not reachable via the current cr3 or the prev_roots will be
5655      * synced when switching to that cr3, so nothing needs to be done here
5656      * for them.
5657      */
5658 }
5659 
5660 void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
5661                int tdp_max_root_level, int tdp_huge_page_level)
5662 {
5663     tdp_enabled = enable_tdp;
5664     tdp_root_level = tdp_forced_root_level;
5665     max_tdp_level = tdp_max_root_level;
5666 
5667     /*
5668      * max_huge_page_level reflects KVM's MMU capabilities irrespective
5669      * of kernel support, e.g. KVM may be capable of using 1GB pages when
5670      * the kernel is not.  But, KVM never creates a page size greater than
5671      * what is used by the kernel for any given HVA, i.e. the kernel's
5672      * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
5673      */
5674     if (tdp_enabled)
5675         max_huge_page_level = tdp_huge_page_level;
5676     else if (boot_cpu_has(X86_FEATURE_GBPAGES))
5677         max_huge_page_level = PG_LEVEL_1G;
5678     else
5679         max_huge_page_level = PG_LEVEL_2M;
5680 }
5681 EXPORT_SYMBOL_GPL(kvm_configure_mmu);
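
/*
 * Illustrative sketch, not part of the original source: a vendor module is
 * expected to call kvm_configure_mmu() once during hardware setup.  The
 * argument values below are hypothetical.
 */
#if 0	/* illustrative only, not compiled */
static void example_vendor_setup(void)
{
	/* TDP enabled, no forced root level, up to 5-level paging, 1GiB pages. */
	kvm_configure_mmu(true, 0, 5, PG_LEVEL_1G);
}
#endif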
5682 
5683 /* The return value indicates whether a TLB flush on all vcpus is needed. */
5684 typedef bool (*slot_level_handler) (struct kvm *kvm,
5685                     struct kvm_rmap_head *rmap_head,
5686                     const struct kvm_memory_slot *slot);
5687 
5688 /* The caller must hold mmu_lock before calling this function. */
5689 static __always_inline bool
5690 slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5691             slot_level_handler fn, int start_level, int end_level,
5692             gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
5693             bool flush)
5694 {
5695     struct slot_rmap_walk_iterator iterator;
5696 
5697     for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5698             end_gfn, &iterator) {
5699         if (iterator.rmap)
5700             flush |= fn(kvm, iterator.rmap, memslot);
5701 
5702         if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
5703             if (flush && flush_on_yield) {
5704                 kvm_flush_remote_tlbs_with_address(kvm,
5705                         start_gfn,
5706                         iterator.gfn - start_gfn + 1);
5707                 flush = false;
5708             }
5709             cond_resched_rwlock_write(&kvm->mmu_lock);
5710         }
5711     }
5712 
5713     return flush;
5714 }
5715 
5716 static __always_inline bool
5717 slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5718           slot_level_handler fn, int start_level, int end_level,
5719           bool flush_on_yield)
5720 {
5721     return slot_handle_level_range(kvm, memslot, fn, start_level,
5722             end_level, memslot->base_gfn,
5723             memslot->base_gfn + memslot->npages - 1,
5724             flush_on_yield, false);
5725 }
5726 
5727 static __always_inline bool
5728 slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5729              slot_level_handler fn, bool flush_on_yield)
5730 {
5731     return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
5732                  PG_LEVEL_4K, flush_on_yield);
5733 }
5734 
5735 static void free_mmu_pages(struct kvm_mmu *mmu)
5736 {
5737     if (!tdp_enabled && mmu->pae_root)
5738         set_memory_encrypted((unsigned long)mmu->pae_root, 1);
5739     free_page((unsigned long)mmu->pae_root);
5740     free_page((unsigned long)mmu->pml4_root);
5741     free_page((unsigned long)mmu->pml5_root);
5742 }
5743 
5744 static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
5745 {
5746     struct page *page;
5747     int i;
5748 
5749     mmu->root.hpa = INVALID_PAGE;
5750     mmu->root.pgd = 0;
5751     for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5752         mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5753 
5754     /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
5755     if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
5756         return 0;
5757 
5758     /*
5759      * When using PAE paging, the four PDPTEs are treated as 'root' pages,
5760      * while the PDP table is a per-vCPU construct that's allocated at MMU
5761      * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
5762      * x86_64.  Therefore we need to allocate the PDP table in the first
5763      * 4GB of memory, which happens to fit the DMA32 zone.  TDP paging
5764      * generally doesn't use PAE paging and can skip allocating the PDP
5765      * table.  The main exception, handled here, is SVM's 32-bit NPT.  The
5766      * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
5767      * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
5768      */
5769     if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
5770         return 0;
5771 
5772     page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5773     if (!page)
5774         return -ENOMEM;
5775 
5776     mmu->pae_root = page_address(page);
5777 
5778     /*
5779      * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
5780      * get the CPU to treat the PDPTEs as encrypted.  Decrypt the page so
5781      * that KVM's writes and the CPU's reads get along.  Note, this is
5782      * only necessary when using shadow paging, as 64-bit NPT can get at
5783      * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
5784      * by 32-bit kernels (when KVM itself uses 32-bit NPT).
5785      */
5786     if (!tdp_enabled)
5787         set_memory_decrypted((unsigned long)mmu->pae_root, 1);
5788     else
5789         WARN_ON_ONCE(shadow_me_value);
5790 
5791     for (i = 0; i < 4; ++i)
5792         mmu->pae_root[i] = INVALID_PAE_ROOT;
5793 
5794     return 0;
5795 }
5796 
5797 int kvm_mmu_create(struct kvm_vcpu *vcpu)
5798 {
5799     int ret;
5800 
5801     vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
5802     vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
5803 
5804     vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
5805     vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
5806 
5807     vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
5808 
5809     vcpu->arch.mmu = &vcpu->arch.root_mmu;
5810     vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
5811 
5812     ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
5813     if (ret)
5814         return ret;
5815 
5816     ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
5817     if (ret)
5818         goto fail_allocate_root;
5819 
5820     return ret;
5821  fail_allocate_root:
5822     free_mmu_pages(&vcpu->arch.guest_mmu);
5823     return ret;
5824 }
5825 
5826 #define BATCH_ZAP_PAGES 10
5827 static void kvm_zap_obsolete_pages(struct kvm *kvm)
5828 {
5829     struct kvm_mmu_page *sp, *node;
5830     int nr_zapped, batch = 0;
5831     bool unstable;
5832 
5833 restart:
5834     list_for_each_entry_safe_reverse(sp, node,
5835           &kvm->arch.active_mmu_pages, link) {
5836         /*
5837          * No obsolete valid page exists before a newly created page
5838          * since active_mmu_pages is a FIFO list.
5839          */
5840         if (!is_obsolete_sp(kvm, sp))
5841             break;
5842 
5843         /*
5844          * Invalid pages should never land back on the list of active
5845          * pages.  Skip the bogus page, otherwise we'll get stuck in an
5846          * infinite loop if the page gets put back on the list (again).
5847          */
5848         if (WARN_ON(sp->role.invalid))
5849             continue;
5850 
5851         /*
5852          * No need to flush the TLB since we're only zapping shadow
5853          * pages with an obsolete generation number and all vCPUs have
5854          * loaded a new root, i.e. the shadow pages being zapped cannot
5855          * be in active use by the guest.
5856          */
5857         if (batch >= BATCH_ZAP_PAGES &&
5858             cond_resched_rwlock_write(&kvm->mmu_lock)) {
5859             batch = 0;
5860             goto restart;
5861         }
5862 
5863         unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
5864                 &kvm->arch.zapped_obsolete_pages, &nr_zapped);
5865         batch += nr_zapped;
5866 
5867         if (unstable)
5868             goto restart;
5869     }
5870 
5871     /*
5872      * Kick all vCPUs (via remote TLB flush) before freeing the page tables
5873      * to ensure KVM is not in the middle of a lockless shadow page table
5874      * walk, which may reference the pages.  The remote TLB flush itself is
5875      * not required and is simply a convenient way to kick vCPUs as needed.
5876      * KVM performs a local TLB flush when allocating a new root (see
5877      * kvm_mmu_load()), and the reload in the caller ensures no vCPUs are
5878      * running with an obsolete MMU.
5879      */
5880     kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
5881 }
5882 
5883 /*
5884  * Fast-invalidate all shadow pages and use the lock-break technique
5885  * to zap obsolete pages.
5886  *
5887  * This is required when a memslot is being deleted or the VM is being
5888  * destroyed; in these cases, we must ensure that the KVM MMU does not
5889  * use any resource of the being-deleted slot, or of all slots, after
5890  * calling this function.
5891  */
5892 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
5893 {
5894     lockdep_assert_held(&kvm->slots_lock);
5895 
5896     write_lock(&kvm->mmu_lock);
5897     trace_kvm_mmu_zap_all_fast(kvm);
5898 
5899     /*
5900      * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
5901      * held for the entire duration of zapping obsolete pages, it's
5902      * impossible for there to be multiple invalid generations associated
5903      * with *valid* shadow pages at any given time, i.e. there is exactly
5904      * one valid generation and (at most) one invalid generation.
5905      */
5906     kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
5907 
5908     /*
5909      * In order to ensure all vCPUs drop their soon-to-be invalid roots,
5910      * invalidating TDP MMU roots must be done while holding mmu_lock for
5911      * write and in the same critical section as making the reload request,
5912      * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
5913      */
5914     if (is_tdp_mmu_enabled(kvm))
5915         kvm_tdp_mmu_invalidate_all_roots(kvm);
5916 
5917     /*
5918      * Notify all vcpus to reload their shadow page tables and flush TLBs.
5919      * All vcpus will then switch to the new shadow page tables with the new
5920      * mmu_valid_gen.
5921      *
5922      * Note: we need to do this under the protection of mmu_lock;
5923      * otherwise, a vcpu could purge a shadow page but miss the tlb flush.
5924      */
5925     kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
5926 
5927     kvm_zap_obsolete_pages(kvm);
5928 
5929     write_unlock(&kvm->mmu_lock);
5930 
5931     /*
5932      * Zap the invalidated TDP MMU roots, all SPTEs must be dropped before
5933      * returning to the caller, e.g. if the zap is in response to a memslot
5934      * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
5935      * associated with the deleted memslot once the update completes.
5936      * Deferring the zap until the final reference to the root is put would
5937      * lead to use-after-free.
5938      */
5939     if (is_tdp_mmu_enabled(kvm))
5940         kvm_tdp_mmu_zap_invalidated_roots(kvm);
5941 }
5942 
5943 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
5944 {
5945     return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
5946 }
5947 
5948 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
5949             struct kvm_memory_slot *slot,
5950             struct kvm_page_track_notifier_node *node)
5951 {
5952     kvm_mmu_zap_all_fast(kvm);
5953 }
5954 
5955 int kvm_mmu_init_vm(struct kvm *kvm)
5956 {
5957     struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5958     int r;
5959 
5960     INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5961     INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
5962     INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
5963     spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
5964 
5965     r = kvm_mmu_init_tdp_mmu(kvm);
5966     if (r < 0)
5967         return r;
5968 
5969     node->track_write = kvm_mmu_pte_write;
5970     node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5971     kvm_page_track_register_notifier(kvm, node);
5972 
5973     kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
5974     kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
5975 
5976     kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO;
5977 
5978     kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
5979     kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
5980 
5981     return 0;
5982 }
5983 
5984 static void mmu_free_vm_memory_caches(struct kvm *kvm)
5985 {
5986     kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache);
5987     kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache);
5988     kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache);
5989 }
5990 
5991 void kvm_mmu_uninit_vm(struct kvm *kvm)
5992 {
5993     struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5994 
5995     kvm_page_track_unregister_notifier(kvm, node);
5996 
5997     kvm_mmu_uninit_tdp_mmu(kvm);
5998 
5999     mmu_free_vm_memory_caches(kvm);
6000 }
6001 
6002 static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
6003 {
6004     const struct kvm_memory_slot *memslot;
6005     struct kvm_memslots *slots;
6006     struct kvm_memslot_iter iter;
6007     bool flush = false;
6008     gfn_t start, end;
6009     int i;
6010 
6011     if (!kvm_memslots_have_rmaps(kvm))
6012         return flush;
6013 
6014     for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
6015         slots = __kvm_memslots(kvm, i);
6016 
6017         kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
6018             memslot = iter.slot;
6019             start = max(gfn_start, memslot->base_gfn);
6020             end = min(gfn_end, memslot->base_gfn + memslot->npages);
6021             if (WARN_ON_ONCE(start >= end))
6022                 continue;
6023 
6024             flush = slot_handle_level_range(kvm, memslot, __kvm_zap_rmap,
6025                             PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
6026                             start, end - 1, true, flush);
6027         }
6028     }
6029 
6030     return flush;
6031 }
6032 
6033 /*
6034  * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end
6035  * (not including it)
6036  */
6037 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
6038 {
6039     bool flush;
6040     int i;
6041 
6042     if (WARN_ON_ONCE(gfn_end <= gfn_start))
6043         return;
6044 
6045     write_lock(&kvm->mmu_lock);
6046 
6047     kvm_mmu_invalidate_begin(kvm, gfn_start, gfn_end);
6048 
6049     flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
6050 
6051     if (is_tdp_mmu_enabled(kvm)) {
6052         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
6053             flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
6054                               gfn_end, true, flush);
6055     }
6056 
6057     if (flush)
6058         kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
6059                            gfn_end - gfn_start);
6060 
6061     kvm_mmu_invalidate_end(kvm, gfn_start, gfn_end);
6062 
6063     write_unlock(&kvm->mmu_lock);
6064 }
6065 
6066 static bool slot_rmap_write_protect(struct kvm *kvm,
6067                     struct kvm_rmap_head *rmap_head,
6068                     const struct kvm_memory_slot *slot)
6069 {
6070     return rmap_write_protect(rmap_head, false);
6071 }
6072 
6073 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
6074                       const struct kvm_memory_slot *memslot,
6075                       int start_level)
6076 {
6077     if (kvm_memslots_have_rmaps(kvm)) {
6078         write_lock(&kvm->mmu_lock);
6079         slot_handle_level(kvm, memslot, slot_rmap_write_protect,
6080                   start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
6081         write_unlock(&kvm->mmu_lock);
6082     }
6083 
6084     if (is_tdp_mmu_enabled(kvm)) {
6085         read_lock(&kvm->mmu_lock);
6086         kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
6087         read_unlock(&kvm->mmu_lock);
6088     }
6089 }
6090 
6091 static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min)
6092 {
6093     return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
6094 }
6095 
6096 static bool need_topup_split_caches_or_resched(struct kvm *kvm)
6097 {
6098     if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
6099         return true;
6100 
6101     /*
6102      * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed
6103      * to split a single huge page. Calculating how many are actually needed
6104      * is possible but not worth the complexity.
6105      */
6106     return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) ||
6107            need_topup(&kvm->arch.split_page_header_cache, 1) ||
6108            need_topup(&kvm->arch.split_shadow_page_cache, 1);
6109 }
6110 
6111 static int topup_split_caches(struct kvm *kvm)
6112 {
6113     /*
6114      * Allocating rmap list entries when splitting huge pages for nested
6115      * MMUs is uncommon as KVM needs to use a list if and only if there is
6116      * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be
6117      * aliased by multiple L2 gfns and/or from multiple nested roots with
6118      * different roles.  Aliasing gfns when using TDP is atypical for VMMs;
6119      * a few gfns are often aliased during boot, e.g. when remapping BIOS,
6120      * but aliasing rarely occurs post-boot or for many gfns.  If there is
6121      * only one rmap entry, rmap->val points directly at that one entry and
6122      * doesn't need to allocate a list.  Buffer the cache by the default
6123      * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM
6124      * encounters an aliased gfn or two.
6125      */
6126     const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS +
6127                  KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;
6128     int r;
6129 
6130     lockdep_assert_held(&kvm->slots_lock);
6131 
6132     r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity,
6133                      SPLIT_DESC_CACHE_MIN_NR_OBJECTS);
6134     if (r)
6135         return r;
6136 
6137     r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1);
6138     if (r)
6139         return r;
6140 
6141     return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1);
6142 }
6143 
6144 static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep)
6145 {
6146     struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
6147     struct shadow_page_caches caches = {};
6148     union kvm_mmu_page_role role;
6149     unsigned int access;
6150     gfn_t gfn;
6151 
6152     gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
6153     access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep));
6154 
6155     /*
6156      * Note, huge page splitting always uses direct shadow pages, regardless
6157      * of whether the huge page itself is mapped by a direct or indirect
6158      * shadow page, since the huge page region itself is being directly
6159      * mapped with smaller pages.
6160      */
6161     role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access);
6162 
6163     /* Direct SPs do not require a shadowed_info_cache. */
6164     caches.page_header_cache = &kvm->arch.split_page_header_cache;
6165     caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
6166 
6167     /* Safe to pass NULL for vCPU since requesting a direct SP. */
6168     return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
6169 }
6170 
6171 static void shadow_mmu_split_huge_page(struct kvm *kvm,
6172                        const struct kvm_memory_slot *slot,
6173                        u64 *huge_sptep)
6174 
6175 {
6176     struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache;
6177     u64 huge_spte = READ_ONCE(*huge_sptep);
6178     struct kvm_mmu_page *sp;
6179     bool flush = false;
6180     u64 *sptep, spte;
6181     gfn_t gfn;
6182     int index;
6183 
6184     sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep);
6185 
6186     for (index = 0; index < SPTE_ENT_PER_PAGE; index++) {
6187         sptep = &sp->spt[index];
6188         gfn = kvm_mmu_page_get_gfn(sp, index);
6189 
6190         /*
6191          * The SP may already have populated SPTEs, e.g. if this huge
6192          * page is aliased by multiple sptes with the same access
6193          * permissions. These entries are guaranteed to map the same
6194          * gfn-to-pfn translation since the SP is direct, so no need to
6195          * modify them.
6196          *
6197          * However, if a given SPTE points to a lower level page table,
6198          * that lower level page table may only be partially populated.
6199          * Installing such SPTEs would effectively unmap a portion of the
6200          * huge page. Unmapping guest memory always requires a TLB flush
6201          * since a subsequent operation on the unmapped regions would
6202          * fail to detect the need to flush.
6203          */
6204         if (is_shadow_present_pte(*sptep)) {
6205             flush |= !is_last_spte(*sptep, sp->role.level);
6206             continue;
6207         }
6208 
6209         spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
6210         mmu_spte_set(sptep, spte);
6211         __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
6212     }
6213 
6214     __link_shadow_page(kvm, cache, huge_sptep, sp, flush);
6215 }
6216 
6217 static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
6218                       const struct kvm_memory_slot *slot,
6219                       u64 *huge_sptep)
6220 {
6221     struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
6222     int level, r = 0;
6223     gfn_t gfn;
6224     u64 spte;
6225 
6226     /* Grab information for the tracepoint before dropping the MMU lock. */
6227     gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
6228     level = huge_sp->role.level;
6229     spte = *huge_sptep;
6230 
6231     if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) {
6232         r = -ENOSPC;
6233         goto out;
6234     }
6235 
6236     if (need_topup_split_caches_or_resched(kvm)) {
6237         write_unlock(&kvm->mmu_lock);
6238         cond_resched();
6239         /*
6240          * If the topup succeeds, return -EAGAIN to indicate that the
6241          * rmap iterator should be restarted because the MMU lock was
6242          * dropped.
6243          */
6244         r = topup_split_caches(kvm) ?: -EAGAIN;
6245         write_lock(&kvm->mmu_lock);
6246         goto out;
6247     }
6248 
6249     shadow_mmu_split_huge_page(kvm, slot, huge_sptep);
6250 
6251 out:
6252     trace_kvm_mmu_split_huge_page(gfn, spte, level, r);
6253     return r;
6254 }
6255 
6256 static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm,
6257                         struct kvm_rmap_head *rmap_head,
6258                         const struct kvm_memory_slot *slot)
6259 {
6260     struct rmap_iterator iter;
6261     struct kvm_mmu_page *sp;
6262     u64 *huge_sptep;
6263     int r;
6264 
6265 restart:
6266     for_each_rmap_spte(rmap_head, &iter, huge_sptep) {
6267         sp = sptep_to_sp(huge_sptep);
6268 
6269         /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */
6270         if (WARN_ON_ONCE(!sp->role.guest_mode))
6271             continue;
6272 
6273         /* The rmaps should never contain non-leaf SPTEs. */
6274         if (WARN_ON_ONCE(!is_large_pte(*huge_sptep)))
6275             continue;
6276 
6277         /* SPs with level >PG_LEVEL_4K should never be unsync. */
6278         if (WARN_ON_ONCE(sp->unsync))
6279             continue;
6280 
6281         /* Don't bother splitting huge pages on invalid SPs. */
6282         if (sp->role.invalid)
6283             continue;
6284 
6285         r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep);
6286 
6287         /*
6288          * The split succeeded or needs to be retried because the MMU
6289          * lock was dropped. Either way, restart the iterator to get it
6290          * back into a consistent state.
6291          */
6292         if (!r || r == -EAGAIN)
6293             goto restart;
6294 
6295         /* The split failed and shouldn't be retried (e.g. -ENOMEM). */
6296         break;
6297     }
6298 
6299     return false;
6300 }
6301 
6302 static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
6303                         const struct kvm_memory_slot *slot,
6304                         gfn_t start, gfn_t end,
6305                         int target_level)
6306 {
6307     int level;
6308 
6309     /*
6310      * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working
6311      * down to the target level. This ensures pages are recursively split
6312      * all the way to the target level. There's no need to split pages
6313      * already at the target level.
6314      */
6315     for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) {
6316         slot_handle_level_range(kvm, slot, shadow_mmu_try_split_huge_pages,
6317                     level, level, start, end - 1, true, false);
6318     }
6319 }
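     /*
      * Worked example (assuming KVM_MAX_HUGEPAGE_LEVEL == PG_LEVEL_1G): for
      * target_level == PG_LEVEL_4K the loop runs with level == PG_LEVEL_1G
      * and then level == PG_LEVEL_2M, so a 1GiB SPTE is first split into
      * 512 2MiB SPTEs, each of which is split into 512 4KiB SPTEs on the
      * next pass.
      */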
6320 
6321 /* Must be called with the mmu_lock held in write-mode. */
6322 void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
6323                    const struct kvm_memory_slot *memslot,
6324                    u64 start, u64 end,
6325                    int target_level)
6326 {
6327     if (!is_tdp_mmu_enabled(kvm))
6328         return;
6329 
6330     if (kvm_memslots_have_rmaps(kvm))
6331         kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
6332 
6333     kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false);
6334 
6335     /*
6336      * A TLB flush is unnecessary at this point for the same reasons as in
6337      * kvm_mmu_slot_try_split_huge_pages().
6338      */
6339 }
6340 
6341 void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
6342                     const struct kvm_memory_slot *memslot,
6343                     int target_level)
6344 {
6345     u64 start = memslot->base_gfn;
6346     u64 end = start + memslot->npages;
6347 
6348     if (!is_tdp_mmu_enabled(kvm))
6349         return;
6350 
6351     if (kvm_memslots_have_rmaps(kvm)) {
6352         write_lock(&kvm->mmu_lock);
6353         kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
6354         write_unlock(&kvm->mmu_lock);
6355     }
6356 
6357     read_lock(&kvm->mmu_lock);
6358     kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
6359     read_unlock(&kvm->mmu_lock);
6360 
6361     /*
6362      * No TLB flush is necessary here. KVM will flush TLBs after
6363      * write-protecting and/or clearing dirty on the newly split SPTEs to
6364      * ensure that guest writes are reflected in the dirty log before the
6365      * ioctl to enable dirty logging on this memslot completes. Since the
6366      * split SPTEs retain the write and dirty bits of the huge SPTE, it is
6367      * safe for KVM to decide if a TLB flush is necessary based on the split
6368      * SPTEs.
6369      */
6370 }
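     /*
      * For example (illustrative): eager splitting when dirty logging is
      * enabled calls this with target_level == PG_LEVEL_4K.  The shadow-MMU
      * pass needs mmu_lock in write mode because it modifies rmaps and
      * shadow pages, while the TDP MMU pass runs under the read lock
      * (shared == true) and updates SPTEs atomically, so vCPUs can continue
      * to fault in pages in parallel.
      */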
6371 
6372 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
6373                      struct kvm_rmap_head *rmap_head,
6374                      const struct kvm_memory_slot *slot)
6375 {
6376     u64 *sptep;
6377     struct rmap_iterator iter;
6378     int need_tlb_flush = 0;
6379     struct kvm_mmu_page *sp;
6380 
6381 restart:
6382     for_each_rmap_spte(rmap_head, &iter, sptep) {
6383         sp = sptep_to_sp(sptep);
6384 
6385         /*
6386          * Huge page mappings cannot be created for indirect shadow
6387          * pages, which are found on the last rmap (level = 1) when not
6388          * using TDP; such shadow pages are kept in sync with the guest
6389          * page table, and the guest page table uses 4K mappings when
6390          * the indirect sp has level = 1.
6391          */
6392         if (sp->role.direct &&
6393             sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
6394                                    PG_LEVEL_NUM)) {
6395             kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
6396 
6397             if (kvm_available_flush_tlb_with_range())
6398                 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
6399                     KVM_PAGES_PER_HPAGE(sp->role.level));
6400             else
6401                 need_tlb_flush = 1;
6402 
6403             goto restart;
6404         }
6405     }
6406 
6407     return need_tlb_flush;
6408 }
6409 
6410 static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
6411                        const struct kvm_memory_slot *slot)
6412 {
6413     /*
6414      * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap
6415      * pages that are already mapped at the maximum hugepage level.
6416      */
6417     if (slot_handle_level(kvm, slot, kvm_mmu_zap_collapsible_spte,
6418                   PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
6419         kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
6420 }
6421 
6422 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
6423                    const struct kvm_memory_slot *slot)
6424 {
6425     if (kvm_memslots_have_rmaps(kvm)) {
6426         write_lock(&kvm->mmu_lock);
6427         kvm_rmap_zap_collapsible_sptes(kvm, slot);
6428         write_unlock(&kvm->mmu_lock);
6429     }
6430 
6431     if (is_tdp_mmu_enabled(kvm)) {
6432         read_lock(&kvm->mmu_lock);
6433         kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
6434         read_unlock(&kvm->mmu_lock);
6435     }
6436 }
6437 
6438 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
6439                     const struct kvm_memory_slot *memslot)
6440 {
6441     /*
6442      * All current use cases for flushing the TLBs for a specific memslot
6443      * are related to dirty logging, and many do the TLB flush out of
6444      * mmu_lock. The interaction between the various operations on a
6445      * memslot must be serialized by slots_lock to ensure the TLB flush
6446      * from one operation is observed by any other operation on the same memslot.
6447      */
6448     lockdep_assert_held(&kvm->slots_lock);
6449     kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6450                        memslot->npages);
6451 }
6452 
6453 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
6454                    const struct kvm_memory_slot *memslot)
6455 {
6456     if (kvm_memslots_have_rmaps(kvm)) {
6457         write_lock(&kvm->mmu_lock);
6458         /*
6459          * Clear dirty bits only on 4k SPTEs since the legacy MMU only
6460          * supports dirty logging at a 4k granularity.
6461          */
6462         slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
6463         write_unlock(&kvm->mmu_lock);
6464     }
6465 
6466     if (is_tdp_mmu_enabled(kvm)) {
6467         read_lock(&kvm->mmu_lock);
6468         kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
6469         read_unlock(&kvm->mmu_lock);
6470     }
6471 
6472     /*
6473      * The caller will flush the TLBs after this function returns.
6474      *
6475      * It's also safe to flush TLBs out of mmu lock here as currently this
6476      * function is only used for dirty logging, in which case flushing TLB
6477      * out of mmu lock also guarantees no dirty pages will be lost in
6478      * dirty_bitmap.
6479      */
6480 }
6481 
6482 void kvm_mmu_zap_all(struct kvm *kvm)
6483 {
6484     struct kvm_mmu_page *sp, *node;
6485     LIST_HEAD(invalid_list);
6486     int ign;
6487 
6488     write_lock(&kvm->mmu_lock);
6489 restart:
6490     list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
6491         if (WARN_ON(sp->role.invalid))
6492             continue;
6493         if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
6494             goto restart;
6495         if (cond_resched_rwlock_write(&kvm->mmu_lock))
6496             goto restart;
6497     }
6498 
6499     kvm_mmu_commit_zap_page(kvm, &invalid_list);
6500 
6501     if (is_tdp_mmu_enabled(kvm))
6502         kvm_tdp_mmu_zap_all(kvm);
6503 
6504     write_unlock(&kvm->mmu_lock);
6505 }
6506 
6507 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
6508 {
6509     WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
6510 
6511     gen &= MMIO_SPTE_GEN_MASK;
6512 
6513     /*
6514      * Generation numbers are incremented in multiples of the number of
6515      * address spaces in order to provide unique generations across all
6516      * address spaces.  Strip what is effectively the address space
6517      * modifier prior to checking for a wrap of the MMIO generation so
6518      * that a wrap in any address space is detected.
6519      */
6520     gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
6521 
6522     /*
6523      * The very rare case: if the MMIO generation number has wrapped,
6524      * zap all shadow pages.
6525      */
6526     if (unlikely(gen == 0)) {
6527         kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
6528         kvm_mmu_zap_all_fast(kvm);
6529     }
6530 }
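     /*
      * Worked example (assuming KVM_ADDRESS_SPACE_NUM == 2): the low bit of
      * the generation is effectively the address-space modifier, so raw
      * generations 0 and 1 both reduce to 0 after the mask above.  Hitting
      * 0 means the MMIO generation wrapped in some address space, and every
      * previously cached MMIO SPTE could now be treated as current, hence
      * the fast zap of all shadow pages.
      */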
6531 
6532 static unsigned long
6533 mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
6534 {
6535     struct kvm *kvm;
6536     int nr_to_scan = sc->nr_to_scan;
6537     unsigned long freed = 0;
6538 
6539     mutex_lock(&kvm_lock);
6540 
6541     list_for_each_entry(kvm, &vm_list, vm_list) {
6542         int idx;
6543         LIST_HEAD(invalid_list);
6544 
6545         /*
6546          * Never scan more than sc->nr_to_scan VM instances.
6547          * In practice this limit is never hit, since at most one VM is
6548          * shrunk per invocation and it is very unlikely to encounter
6549          * !n_used_mmu_pages that many times in a row.
6550          */
6551         if (!nr_to_scan--)
6552             break;
6553         /*
6554          * n_used_mmu_pages is accessed without holding kvm->mmu_lock
6555          * here. We may skip a VM instance erroneously, but we do not
6556          * want to shrink a VM that has only just started to populate
6557          * its MMU anyway.
6558          */
6559         if (!kvm->arch.n_used_mmu_pages &&
6560             !kvm_has_zapped_obsolete_pages(kvm))
6561             continue;
6562 
6563         idx = srcu_read_lock(&kvm->srcu);
6564         write_lock(&kvm->mmu_lock);
6565 
6566         if (kvm_has_zapped_obsolete_pages(kvm)) {
6567             kvm_mmu_commit_zap_page(kvm,
6568                   &kvm->arch.zapped_obsolete_pages);
6569             goto unlock;
6570         }
6571 
6572         freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
6573 
6574 unlock:
6575         write_unlock(&kvm->mmu_lock);
6576         srcu_read_unlock(&kvm->srcu, idx);
6577 
6578         /*
6579          * unfair on small ones
6580          * per-vm shrinkers cry out
6581          * sadness comes quickly
6582          */
6583         list_move_tail(&kvm->vm_list, &vm_list);
6584         break;
6585     }
6586 
6587     mutex_unlock(&kvm_lock);
6588     return freed;
6589 }
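     /*
      * Design note (for illustration): at most one VM is shrunk per call,
      * and that VM is moved to the tail of vm_list afterwards, so repeated
      * shrinker invocations rotate through the VMs instead of always
      * penalizing the first one.  Committing already-zapped obsolete pages
      * is preferred over zapping in-use pages, since it frees memory
      * without touching a live working set.
      */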
6590 
6591 static unsigned long
6592 mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
6593 {
6594     return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
6595 }
6596 
6597 static struct shrinker mmu_shrinker = {
6598     .count_objects = mmu_shrink_count,
6599     .scan_objects = mmu_shrink_scan,
6600     .seeks = DEFAULT_SEEKS * 10,
6601 };
6602 
6603 static void mmu_destroy_caches(void)
6604 {
6605     kmem_cache_destroy(pte_list_desc_cache);
6606     kmem_cache_destroy(mmu_page_header_cache);
6607 }
6608 
6609 static bool get_nx_auto_mode(void)
6610 {
6611     /* Return true when CPU has the bug, and mitigations are ON */
6612     return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
6613 }
6614 
6615 static void __set_nx_huge_pages(bool val)
6616 {
6617     nx_huge_pages = itlb_multihit_kvm_mitigation = val;
6618 }
6619 
6620 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
6621 {
6622     bool old_val = nx_huge_pages;
6623     bool new_val;
6624 
6625     /* In "auto" mode, deploy the workaround only if the CPU has the bug. */
6626     if (sysfs_streq(val, "off"))
6627         new_val = 0;
6628     else if (sysfs_streq(val, "force"))
6629         new_val = 1;
6630     else if (sysfs_streq(val, "auto"))
6631         new_val = get_nx_auto_mode();
6632     else if (strtobool(val, &new_val) < 0)
6633         return -EINVAL;
6634 
6635     __set_nx_huge_pages(new_val);
6636 
6637     if (new_val != old_val) {
6638         struct kvm *kvm;
6639 
6640         mutex_lock(&kvm_lock);
6641 
6642         list_for_each_entry(kvm, &vm_list, vm_list) {
6643             mutex_lock(&kvm->slots_lock);
6644             kvm_mmu_zap_all_fast(kvm);
6645             mutex_unlock(&kvm->slots_lock);
6646 
6647             wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6648         }
6649         mutex_unlock(&kvm_lock);
6650     }
6651 
6652     return 0;
6653 }
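     /*
      * Usage sketch (illustrative; the path below assumes the standard
      * "kvm" module name):
      *
      *   echo force > /sys/module/kvm/parameters/nx_huge_pages
      *
      * Changing the value at runtime walks vm_list under kvm_lock, zaps
      * each VM's shadow pages via kvm_mmu_zap_all_fast() so existing
      * mappings are rebuilt under the new policy, and wakes the recovery
      * thread so it can recompute its timeout.
      */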
6654 
6655 /*
6656  * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
6657  * its default value of -1 is technically undefined behavior for a boolean.
6658  * Forward the module init call to SPTE code so that it too can handle module
6659  * params that need to be resolved/snapshot.
6660  */
6661 void __init kvm_mmu_x86_module_init(void)
6662 {
6663     if (nx_huge_pages == -1)
6664         __set_nx_huge_pages(get_nx_auto_mode());
6665 
6666     kvm_mmu_spte_module_init();
6667 }
6668 
6669 /*
6670  * The bulk of the MMU initialization is deferred until the vendor module is
6671  * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
6672  * to be reset when a potentially different vendor module is loaded.
6673  */
6674 int kvm_mmu_vendor_module_init(void)
6675 {
6676     int ret = -ENOMEM;
6677 
6678     /*
6679      * MMU roles use union aliasing, which is, generally speaking,
6680      * undefined behavior. However, the behavior of current compilers is
6681      * well understood and unlikely to change. The build-time assertions
6682      * below will let us know if that assumption ever becomes false.
6683      */
6684     BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6685     BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
6686     BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));
6687 
6688     kvm_mmu_reset_all_pte_masks();
6689 
6690     pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6691                         sizeof(struct pte_list_desc),
6692                         0, SLAB_ACCOUNT, NULL);
6693     if (!pte_list_desc_cache)
6694         goto out;
6695 
6696     mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6697                           sizeof(struct kvm_mmu_page),
6698                           0, SLAB_ACCOUNT, NULL);
6699     if (!mmu_page_header_cache)
6700         goto out;
6701 
6702     if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6703         goto out;
6704 
6705     ret = register_shrinker(&mmu_shrinker, "x86-mmu");
6706     if (ret)
6707         goto out;
6708 
6709     return 0;
6710 
6711 out:
6712     mmu_destroy_caches();
6713     return ret;
6714 }
6715 
6716 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6717 {
6718     kvm_mmu_unload(vcpu);
6719     free_mmu_pages(&vcpu->arch.root_mmu);
6720     free_mmu_pages(&vcpu->arch.guest_mmu);
6721     mmu_free_memory_caches(vcpu);
6722 }
6723 
6724 void kvm_mmu_vendor_module_exit(void)
6725 {
6726     mmu_destroy_caches();
6727     percpu_counter_destroy(&kvm_total_used_mmu_pages);
6728     unregister_shrinker(&mmu_shrinker);
6729 }
6730 
6731 /*
6732  * Calculate the effective recovery period, accounting for '0' meaning "let KVM
6733  * select a halving time of 1 hour".  Returns true if recovery is enabled.
6734  */
6735 static bool calc_nx_huge_pages_recovery_period(uint *period)
6736 {
6737     /*
6738      * Use READ_ONCE to get the params, this may be called outside of the
6739      * param setters, e.g. by the kthread to compute its next timeout.
6740      */
6741     bool enabled = READ_ONCE(nx_huge_pages);
6742     uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6743 
6744     if (!enabled || !ratio)
6745         return false;
6746 
6747     *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
6748     if (!*period) {
6749         /* Make sure the period is not less than one second.  */
6750         ratio = min(ratio, 3600u);
6751         *period = 60 * 60 * 1000 / ratio;
6752     }
6753     return true;
6754 }
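     /*
      * Worked example (illustrative): with nx_huge_pages_recovery_ratio ==
      * 60 and nx_huge_pages_recovery_period_ms == 0, the computed period is
      * 60 * 60 * 1000 / 60 = 60000 ms, i.e. 1/60th of the disallowed huge
      * pages are recovered every minute.  Clamping the ratio to 3600 keeps
      * the period from dropping below 60 * 60 * 1000 / 3600 = 1000 ms.
      */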
6755 
6756 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
6757 {
6758     bool was_recovery_enabled, is_recovery_enabled;
6759     uint old_period, new_period;
6760     int err;
6761 
6762     was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
6763 
6764     err = param_set_uint(val, kp);
6765     if (err)
6766         return err;
6767 
6768     is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
6769 
6770     if (is_recovery_enabled &&
6771         (!was_recovery_enabled || old_period > new_period)) {
6772         struct kvm *kvm;
6773 
6774         mutex_lock(&kvm_lock);
6775 
6776         list_for_each_entry(kvm, &vm_list, vm_list)
6777             wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6778 
6779         mutex_unlock(&kvm_lock);
6780     }
6781 
6782     return err;
6783 }
6784 
6785 static void kvm_recover_nx_lpages(struct kvm *kvm)
6786 {
6787     unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
6788     int rcu_idx;
6789     struct kvm_mmu_page *sp;
6790     unsigned int ratio;
6791     LIST_HEAD(invalid_list);
6792     bool flush = false;
6793     ulong to_zap;
6794 
6795     rcu_idx = srcu_read_lock(&kvm->srcu);
6796     write_lock(&kvm->mmu_lock);
6797 
6798     /*
6799      * Zapping TDP MMU shadow pages, including the remote TLB flush, must
6800      * be done under RCU protection, because the pages are freed via RCU
6801      * callback.
6802      */
6803     rcu_read_lock();
6804 
6805     ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6806     to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
6807     for ( ; to_zap; --to_zap) {
6808         if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
6809             break;
6810 
6811         /*
6812          * We use a separate list instead of just using active_mmu_pages
6813          * because the number of lpage_disallowed pages is expected to
6814          * be relatively small compared to the total.
6815          */
6816         sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
6817                       struct kvm_mmu_page,
6818                       lpage_disallowed_link);
6819         WARN_ON_ONCE(!sp->lpage_disallowed);
6820         if (is_tdp_mmu_page(sp)) {
6821             flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
6822         } else {
6823             kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
6824             WARN_ON_ONCE(sp->lpage_disallowed);
6825         }
6826 
6827         if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
6828             kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
6829             rcu_read_unlock();
6830 
6831             cond_resched_rwlock_write(&kvm->mmu_lock);
6832             flush = false;
6833 
6834             rcu_read_lock();
6835         }
6836     }
6837     kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
6838 
6839     rcu_read_unlock();
6840 
6841     write_unlock(&kvm->mmu_lock);
6842     srcu_read_unlock(&kvm->srcu, rcu_idx);
6843 }
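     /*
      * Worked example (illustrative): with kvm->stat.nx_lpage_splits == 1000
      * and nx_huge_pages_recovery_ratio == 60, to_zap = DIV_ROUND_UP(1000, 60)
      * = 17, so up to 17 NX-split shadow pages are zapped per recovery pass,
      * allowing the corresponding ranges to be mapped with huge pages again
      * on the next fault.
      */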
6844 
6845 static long get_nx_lpage_recovery_timeout(u64 start_time)
6846 {
6847     bool enabled;
6848     uint period;
6849 
6850     enabled = calc_nx_huge_pages_recovery_period(&period);
6851 
6852     return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
6853                : MAX_SCHEDULE_TIMEOUT;
6854 }
6855 
6856 static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
6857 {
6858     u64 start_time;
6859     long remaining_time;
6860 
6861     while (true) {
6862         start_time = get_jiffies_64();
6863         remaining_time = get_nx_lpage_recovery_timeout(start_time);
6864 
6865         set_current_state(TASK_INTERRUPTIBLE);
6866         while (!kthread_should_stop() && remaining_time > 0) {
6867             schedule_timeout(remaining_time);
6868             remaining_time = get_nx_lpage_recovery_timeout(start_time);
6869             set_current_state(TASK_INTERRUPTIBLE);
6870         }
6871 
6872         set_current_state(TASK_RUNNING);
6873 
6874         if (kthread_should_stop())
6875             return 0;
6876 
6877         kvm_recover_nx_lpages(kvm);
6878     }
6879 }
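     /*
      * Example timeline (illustrative, assuming a 60000 ms recovery period):
      * the worker records start_time, sleeps until start_time + 1 minute,
      * re-evaluating the timeout whenever it is woken early (e.g. by a
      * parameter change), then runs one recovery pass and begins the next
      * interval.  When recovery is disabled the timeout is
      * MAX_SCHEDULE_TIMEOUT, so the thread sleeps until it is woken or
      * stopped.
      */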
6880 
6881 int kvm_mmu_post_init_vm(struct kvm *kvm)
6882 {
6883     int err;
6884 
6885     err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
6886                       "kvm-nx-lpage-recovery",
6887                       &kvm->arch.nx_lpage_recovery_thread);
6888     if (!err)
6889         kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
6890 
6891     return err;
6892 }
6893 
6894 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
6895 {
6896     if (kvm->arch.nx_lpage_recovery_thread)
6897         kthread_stop(kvm->arch.nx_lpage_recovery_thread);
6898 }