// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine (KVM) driver for Linux: x86 MMU support.
 *
 * This file implements the shadow and TDP MMU glue used to map guest
 * physical memory, including rmap tracking, unsync shadow pages, huge
 * page accounting and dirty logging.
 */
0018 #include "irq.h"
0019 #include "ioapic.h"
0020 #include "mmu.h"
0021 #include "mmu_internal.h"
0022 #include "tdp_mmu.h"
0023 #include "x86.h"
0024 #include "kvm_cache_regs.h"
0025 #include "kvm_emulate.h"
0026 #include "cpuid.h"
0027 #include "spte.h"
0028
0029 #include <linux/kvm_host.h>
0030 #include <linux/types.h>
0031 #include <linux/string.h>
0032 #include <linux/mm.h>
0033 #include <linux/highmem.h>
0034 #include <linux/moduleparam.h>
0035 #include <linux/export.h>
0036 #include <linux/swap.h>
0037 #include <linux/hugetlb.h>
0038 #include <linux/compiler.h>
0039 #include <linux/srcu.h>
0040 #include <linux/slab.h>
0041 #include <linux/sched/signal.h>
0042 #include <linux/uaccess.h>
0043 #include <linux/hash.h>
0044 #include <linux/kern_levels.h>
0045 #include <linux/kthread.h>
0046
0047 #include <asm/page.h>
0048 #include <asm/memtype.h>
0049 #include <asm/cmpxchg.h>
0050 #include <asm/io.h>
0051 #include <asm/set_memory.h>
0052 #include <asm/vmx.h>
0053 #include <asm/kvm_page_track.h>
0054 #include "trace.h"
0055
0056 extern bool itlb_multihit_kvm_mitigation;
0057
0058 int __read_mostly nx_huge_pages = -1;
0059 static uint __read_mostly nx_huge_pages_recovery_period_ms;
0060 #ifdef CONFIG_PREEMPT_RT
/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
0062 static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
0063 #else
0064 static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
0065 #endif
0066
0067 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
0068 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
0069
0070 static const struct kernel_param_ops nx_huge_pages_ops = {
0071 .set = set_nx_huge_pages,
0072 .get = param_get_bool,
0073 };
0074
0075 static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
0076 .set = set_nx_huge_pages_recovery_param,
0077 .get = param_get_uint,
0078 };
0079
0080 module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
0081 __MODULE_PARM_TYPE(nx_huge_pages, "bool");
0082 module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
0083 &nx_huge_pages_recovery_ratio, 0644);
0084 __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
0085 module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
0086 &nx_huge_pages_recovery_period_ms, 0644);
0087 __MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
0088
0089 static bool __read_mostly force_flush_and_sync_on_reuse;
0090 module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
0091
/*
 * When set, the hardware performs two-dimensional paging (TDP): it walks
 * both the guest page tables (guest-virtual to guest-physical) and a second
 * set of tables from guest-physical to host-physical, so KVM does not need
 * to shadow the guest page tables.
 */
0099 bool tdp_enabled = false;
0100
0101 static int max_huge_page_level __read_mostly;
0102 static int tdp_root_level __read_mostly;
0103 static int max_tdp_level __read_mostly;
0104
0105 #ifdef MMU_DEBUG
0106 bool dbg = 0;
0107 module_param(dbg, bool, 0644);
0108 #endif
0109
0110 #define PTE_PREFETCH_NUM 8
0111
0112 #include <trace/events/kvm.h>
0113
/* make pte_list_desc fit well in cache lines */
0115 #define PTE_LIST_EXT 14
0116
/*
 * The layout puts 'more' and 'spte_count' at the head of the descriptor so
 * that walking a short chain, or checking whether a descriptor is full,
 * touches only the first cache line.
 */
0122 struct pte_list_desc {
0123 struct pte_list_desc *more;
/*
 * The number of valid entries in sptes[].  A u64 keeps sptes[] naturally
 * aligned without explicit padding.
 */
0128 u64 spte_count;
0129 u64 *sptes[PTE_LIST_EXT];
0130 };
0131
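/*
 * Iterator state for walking the shadow page tables for a guest address:
 * shadow_addr holds the physical address of the current table, while
 * sptep/index/level identify the entry being examined for @addr.
 */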
0132 struct kvm_shadow_walk_iterator {
0133 u64 addr;
0134 hpa_t shadow_addr;
0135 u64 *sptep;
0136 int level;
0137 unsigned index;
0138 };
0139
0140 #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \
0141 for (shadow_walk_init_using_root(&(_walker), (_vcpu), \
0142 (_root), (_addr)); \
0143 shadow_walk_okay(&(_walker)); \
0144 shadow_walk_next(&(_walker)))
0145
0146 #define for_each_shadow_entry(_vcpu, _addr, _walker) \
0147 for (shadow_walk_init(&(_walker), _vcpu, _addr); \
0148 shadow_walk_okay(&(_walker)); \
0149 shadow_walk_next(&(_walker)))
0150
0151 #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
0152 for (shadow_walk_init(&(_walker), _vcpu, _addr); \
0153 shadow_walk_okay(&(_walker)) && \
0154 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
0155 __shadow_walk_next(&(_walker), spte))
0156
0157 static struct kmem_cache *pte_list_desc_cache;
0158 struct kmem_cache *mmu_page_header_cache;
0159 static struct percpu_counter kvm_total_used_mmu_pages;
0160
0161 static void mmu_spte_set(u64 *sptep, u64 spte);
0162
0163 struct kvm_mmu_role_regs {
0164 const unsigned long cr0;
0165 const unsigned long cr4;
0166 const u64 efer;
0167 };
0168
0169 #define CREATE_TRACE_POINTS
0170 #include "mmutrace.h"
0171
/*
 * Generates ____is_<reg>_<bit>() helpers that test a single role-relevant
 * bit in a kvm_mmu_role_regs snapshot.  The leading underscores are a hint
 * that callers should normally use the role accessors below rather than
 * poking at raw register values.
 */
0177 #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \
0178 static inline bool __maybe_unused \
0179 ____is_##reg##_##name(const struct kvm_mmu_role_regs *regs) \
0180 { \
0181 return !!(regs->reg & flag); \
0182 }
0183 BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
0184 BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
0185 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
0186 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
0187 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
0188 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
0189 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
0190 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
0191 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
0192 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
0193
/*
 * The MMU itself (with a valid role) is the single source of truth for the
 * MMU.  Do not use the regs used to build the MMU/role, nor the vCPU: those
 * values may be stale relative to the role that is actually in use.
 */
0200 #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \
0201 static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \
0202 { \
0203 return !!(mmu->cpu_role. base_or_ext . reg##_##name); \
0204 }
0205 BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
0206 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
0207 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep);
0208 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap);
0209 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke);
0210 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
0211 BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
0212 BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma);
0213
0214 static inline bool is_cr0_pg(struct kvm_mmu *mmu)
0215 {
0216 return mmu->cpu_role.base.level > 0;
0217 }
0218
0219 static inline bool is_cr4_pae(struct kvm_mmu *mmu)
0220 {
0221 return !mmu->cpu_role.base.has_4_byte_gpte;
0222 }
0223
0224 static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
0225 {
0226 struct kvm_mmu_role_regs regs = {
0227 .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
0228 .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
0229 .efer = vcpu->arch.efer,
0230 };
0231
0232 return regs;
0233 }
0234
0235 static inline bool kvm_available_flush_tlb_with_range(void)
0236 {
0237 return kvm_x86_ops.tlb_remote_flush_with_range;
0238 }
0239
0240 static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
0241 struct kvm_tlb_range *range)
0242 {
0243 int ret = -ENOTSUPP;
0244
0245 if (range && kvm_x86_ops.tlb_remote_flush_with_range)
0246 ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);
0247
0248 if (ret)
0249 kvm_flush_remote_tlbs(kvm);
0250 }
0251
0252 void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
0253 u64 start_gfn, u64 pages)
0254 {
0255 struct kvm_tlb_range range;
0256
0257 range.start_gfn = start_gfn;
0258 range.pages = pages;
0259
0260 kvm_flush_remote_tlbs_with_range(kvm, &range);
0261 }
0262
0263 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
0264 unsigned int access)
0265 {
0266 u64 spte = make_mmio_spte(vcpu, gfn, access);
0267
0268 trace_mark_mmio_spte(sptep, gfn, spte);
0269 mmu_spte_set(sptep, spte);
0270 }
0271
0272 static gfn_t get_mmio_spte_gfn(u64 spte)
0273 {
0274 u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
0275
0276 gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
0277 & shadow_nonpresent_or_rsvd_mask;
0278
0279 return gpa >> PAGE_SHIFT;
0280 }
0281
0282 static unsigned get_mmio_spte_access(u64 spte)
0283 {
0284 return spte & shadow_mmio_access_mask;
0285 }
0286
0287 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
0288 {
0289 u64 kvm_gen, spte_gen, gen;
0290
0291 gen = kvm_vcpu_memslots(vcpu)->generation;
0292 if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
0293 return false;
0294
0295 kvm_gen = gen & MMIO_SPTE_GEN_MASK;
0296 spte_gen = get_mmio_spte_generation(spte);
0297
0298 trace_check_mmio_spte(spte, kvm_gen, spte_gen);
0299 return likely(kvm_gen == spte_gen);
0300 }
0301
0302 static int is_cpuid_PSE36(void)
0303 {
0304 return 1;
0305 }
0306
0307 #ifdef CONFIG_X86_64
0308 static void __set_spte(u64 *sptep, u64 spte)
0309 {
0310 WRITE_ONCE(*sptep, spte);
0311 }
0312
0313 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
0314 {
0315 WRITE_ONCE(*sptep, spte);
0316 }
0317
0318 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
0319 {
0320 return xchg(sptep, spte);
0321 }
0322
0323 static u64 __get_spte_lockless(u64 *sptep)
0324 {
0325 return READ_ONCE(*sptep);
0326 }
0327 #else
0328 union split_spte {
0329 struct {
0330 u32 spte_low;
0331 u32 spte_high;
0332 };
0333 u64 spte;
0334 };
0335
0336 static void count_spte_clear(u64 *sptep, u64 spte)
0337 {
0338 struct kvm_mmu_page *sp = sptep_to_sp(sptep);
0339
0340 if (is_shadow_present_pte(spte))
0341 return;
0342
/* Ensure the spte is completely set before we increase the count. */
0344 smp_wmb();
0345 sp->clear_spte_count++;
0346 }
0347
0348 static void __set_spte(u64 *sptep, u64 spte)
0349 {
0350 union split_spte *ssptep, sspte;
0351
0352 ssptep = (union split_spte *)sptep;
0353 sspte = (union split_spte)spte;
0354
0355 ssptep->spte_high = sspte.spte_high;
0356
/*
 * If we map the spte from nonpresent to present, store the high bits
 * first and only then set the present bit (in spte_low), so a lockless
 * walker cannot fetch a half-written spte.
 */
0362 smp_wmb();
0363
0364 WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
0365 }
0366
0367 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
0368 {
0369 union split_spte *ssptep, sspte;
0370
0371 ssptep = (union split_spte *)sptep;
0372 sspte = (union split_spte)spte;
0373
0374 WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
0375
/*
 * If we map the spte from present to nonpresent, clear the present bit
 * (in spte_low) first so a lockless walker cannot fetch the stale high bits.
 */
0380 smp_wmb();
0381
0382 ssptep->spte_high = sspte.spte_high;
0383 count_spte_clear(sptep, spte);
0384 }
0385
0386 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
0387 {
0388 union split_spte *ssptep, sspte, orig;
0389
0390 ssptep = (union split_spte *)sptep;
0391 sspte = (union split_spte)spte;
0392
/* xchg acts as a full barrier before the high bits are written. */
0394 orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
0395 orig.spte_high = ssptep->spte_high;
0396 ssptep->spte_high = sspte.spte_high;
0397 count_spte_clear(sptep, spte);
0398
0399 return orig.spte;
0400 }
0401
/*
 * Lockless 32-bit spte read, in the spirit of gup_get_pte() (mm/gup.c).
 *
 * A reader here runs without mmu_lock, so it may race with writers that
 * update the spte as two 32-bit halves.  Reading while an update is in
 * progress may return stale high bits.  That race is harmless for a
 * present -> non-present change (the high part of a non-present spte is
 * ignored), but not for a present -> present change.
 *
 * clear_spte_count acts as a sequence counter: writers bump it whenever a
 * present spte is torn down, and readers retry if either the low half or
 * the count changed while the spte was being read.
 */
0420 static u64 __get_spte_lockless(u64 *sptep)
0421 {
0422 struct kvm_mmu_page *sp = sptep_to_sp(sptep);
0423 union split_spte spte, *orig = (union split_spte *)sptep;
0424 int count;
0425
0426 retry:
0427 count = sp->clear_spte_count;
0428 smp_rmb();
0429
0430 spte.spte_low = orig->spte_low;
0431 smp_rmb();
0432
0433 spte.spte_high = orig->spte_high;
0434 smp_rmb();
0435
0436 if (unlikely(spte.spte_low != orig->spte_low ||
0437 count != sp->clear_spte_count))
0438 goto retry;
0439
0440 return spte.spte;
0441 }
0442 #endif
0443
/*
 * Rules for using mmu_spte_set:
 * Set the sptep from nonpresent to present.
 * Note: the sptep being assigned *must* be either not present
 * or in a state where the hardware will not attempt to update
 * the spte.
 */
0450 static void mmu_spte_set(u64 *sptep, u64 new_spte)
0451 {
0452 WARN_ON(is_shadow_present_pte(*sptep));
0453 __set_spte(sptep, new_spte);
0454 }
0455
/*
 * Update the SPTE (excluding the PFN), but do not track changes in its
 * accessed/dirty status.
 */
0460 static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
0461 {
0462 u64 old_spte = *sptep;
0463
0464 WARN_ON(!is_shadow_present_pte(new_spte));
0465 check_spte_writable_invariants(new_spte);
0466
0467 if (!is_shadow_present_pte(old_spte)) {
0468 mmu_spte_set(sptep, new_spte);
0469 return old_spte;
0470 }
0471
0472 if (!spte_has_volatile_bits(old_spte))
0473 __update_clear_spte_fast(sptep, new_spte);
0474 else
0475 old_spte = __update_clear_spte_slow(sptep, new_spte);
0476
0477 WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
0478
0479 return old_spte;
0480 }
0481
/*
 * Rules for using mmu_spte_update:
 * Update the state bits; the mapped pfn is not changed.
 *
 * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
 * TLBs must be flushed.  Otherwise rmap_write_protect will find a read-only
 * spte, even though the writable spte might be cached in a CPU's TLB.
 *
 * Returns true if the TLB needs to be flushed.
 */
0491 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
0492 {
0493 bool flush = false;
0494 u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
0495
0496 if (!is_shadow_present_pte(old_spte))
0497 return false;
0498
/*
 * Updating an spte outside of mmu_lock is safe because the update is
 * always done atomically; see the comments in spte_has_volatile_bits().
 */
0504 if (is_mmu_writable_spte(old_spte) &&
0505 !is_writable_pte(new_spte))
0506 flush = true;
0507
0508
/*
 * Flush the TLB when accessed/dirty state is changed in the page tables,
 * to guarantee consistency between the TLB and the page tables.
 */
0513 if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
0514 flush = true;
0515 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
0516 }
0517
0518 if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
0519 flush = true;
0520 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
0521 }
0522
0523 return flush;
0524 }
0525
/*
 * Rules for using mmu_spte_clear_track_bits:
 * Set the sptep from present to nonpresent, tracking the accessed/dirty
 * state bits; used to clear a last-level (leaf) sptep.
 * Returns the old PTE.
 */
0532 static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
0533 {
0534 kvm_pfn_t pfn;
0535 u64 old_spte = *sptep;
0536 int level = sptep_to_sp(sptep)->role.level;
0537 struct page *page;
0538
0539 if (!is_shadow_present_pte(old_spte) ||
0540 !spte_has_volatile_bits(old_spte))
0541 __update_clear_spte_fast(sptep, 0ull);
0542 else
0543 old_spte = __update_clear_spte_slow(sptep, 0ull);
0544
0545 if (!is_shadow_present_pte(old_spte))
0546 return old_spte;
0547
0548 kvm_update_page_stats(kvm, level, -1);
0549
0550 pfn = spte_to_pfn(old_spte);
0551
/*
 * KVM doesn't hold a reference to any pages mapped into the guest, and
 * instead uses the mmu_notifier to ensure that KVM unmaps any pages
 * before they are reclaimed.  Sanity check that, if the pfn is backed
 * by a refcounted page, the refcount is elevated.
 */
0558 page = kvm_pfn_to_refcounted_page(pfn);
0559 WARN_ON(page && !page_count(page));
0560
0561 if (is_accessed_spte(old_spte))
0562 kvm_set_pfn_accessed(pfn);
0563
0564 if (is_dirty_spte(old_spte))
0565 kvm_set_pfn_dirty(pfn);
0566
0567 return old_spte;
0568 }
0569
/*
 * Rules for using mmu_spte_clear_no_track:
 * Directly clear the spte without caring about its state bits;
 * used to zap upper-level (non-leaf) sptes.
 */
0575 static void mmu_spte_clear_no_track(u64 *sptep)
0576 {
0577 __update_clear_spte_fast(sptep, 0ull);
0578 }
0579
0580 static u64 mmu_spte_get_lockless(u64 *sptep)
0581 {
0582 return __get_spte_lockless(sptep);
0583 }
0584
/* Returns the Accessed status of the PTE and resets it at the same time. */
0586 static bool mmu_spte_age(u64 *sptep)
0587 {
0588 u64 spte = mmu_spte_get_lockless(sptep);
0589
0590 if (!is_accessed_spte(spte))
0591 return false;
0592
0593 if (spte_ad_enabled(spte)) {
0594 clear_bit((ffs(shadow_accessed_mask) - 1),
0595 (unsigned long *)sptep);
0596 } else {
/*
 * Capture the dirty status of the page, so that it doesn't get
 * lost when the SPTE is marked for access tracking.
 */
0601 if (is_writable_pte(spte))
0602 kvm_set_pfn_dirty(spte_to_pfn(spte));
0603
0604 spte = mark_spte_for_access_track(spte);
0605 mmu_spte_update_no_track(sptep, spte);
0606 }
0607
0608 return true;
0609 }
0610
0611 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
0612 {
0613 if (is_tdp_mmu(vcpu->arch.mmu)) {
0614 kvm_tdp_mmu_walk_lockless_begin();
0615 } else {
/*
 * Prevent page table teardown by making any free-er wait during
 * the kvm_flush_remote_tlbs() IPI to all active vcpus.
 */
0620 local_irq_disable();
0621
/*
 * Make sure a following spte read is not reordered ahead of the
 * write to vcpu->mode.
 */
0626 smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
0627 }
0628 }
0629
0630 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
0631 {
0632 if (is_tdp_mmu(vcpu->arch.mmu)) {
0633 kvm_tdp_mmu_walk_lockless_end();
0634 } else {
/*
 * Make sure the write to vcpu->mode is not reordered in front of
 * reads to sptes.  If it is, kvm_mmu_commit_zap_page() can see us
 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
 */
0640 smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
0641 local_irq_enable();
0642 }
0643 }
0644
0645 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
0646 {
0647 int r;
0648
/* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
0650 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
0651 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
0652 if (r)
0653 return r;
0654 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
0655 PT64_ROOT_MAX_LEVEL);
0656 if (r)
0657 return r;
0658 if (maybe_indirect) {
0659 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
0660 PT64_ROOT_MAX_LEVEL);
0661 if (r)
0662 return r;
0663 }
0664 return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
0665 PT64_ROOT_MAX_LEVEL);
0666 }
0667
0668 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
0669 {
0670 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
0671 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
0672 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
0673 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
0674 }
0675
0676 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
0677 {
0678 kmem_cache_free(pte_list_desc_cache, pte_list_desc);
0679 }
0680
0681 static bool sp_has_gptes(struct kvm_mmu_page *sp);
0682
0683 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
0684 {
0685 if (sp->role.passthrough)
0686 return sp->gfn;
0687
0688 if (!sp->role.direct)
0689 return sp->shadowed_translation[index] >> PAGE_SHIFT;
0690
0691 return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
0692 }
0693
/*
 * For leaf SPTEs, fetch the *guest* access permissions being shadowed.  Note
 * that the SPTE itself may have more constrained access permissions than
 * what the guest enforces, e.g. KVM may disallow execution of a guest
 * executable huge page to mitigate iTLB multihit.
 */
0700 static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
0701 {
0702 if (sp_has_gptes(sp))
0703 return sp->shadowed_translation[index] & ACC_ALL;
0704
/*
 * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs,
 * KVM is not shadowing any guest page tables, so the "guest access
 * permissions" are just ACC_ALL.
 *
 * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM
 * is shadowing a guest huge page with smaller pages, the guest access
 * permissions being shadowed are the access permissions of the huge
 * page.
 *
 * In both cases, sp->role.access contains the correct access bits.
 */
0717 return sp->role.access;
0718 }
0719
0720 static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
0721 gfn_t gfn, unsigned int access)
0722 {
0723 if (sp_has_gptes(sp)) {
0724 sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
0725 return;
0726 }
0727
0728 WARN_ONCE(access != kvm_mmu_page_get_access(sp, index),
0729 "access mismatch under %s page %llx (expected %u, got %u)\n",
0730 sp->role.passthrough ? "passthrough" : "direct",
0731 sp->gfn, kvm_mmu_page_get_access(sp, index), access);
0732
0733 WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index),
0734 "gfn mismatch under %s page %llx (expected %llx, got %llx)\n",
0735 sp->role.passthrough ? "passthrough" : "direct",
0736 sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn);
0737 }
0738
0739 static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index,
0740 unsigned int access)
0741 {
0742 gfn_t gfn = kvm_mmu_page_get_gfn(sp, index);
0743
0744 kvm_mmu_page_set_translation(sp, index, gfn, access);
0745 }
0746
/*
 * Return the pointer to the large page information for a given gfn,
 * handling slots that are not large page aligned.
 */
0751 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
0752 const struct kvm_memory_slot *slot, int level)
0753 {
0754 unsigned long idx;
0755
0756 idx = gfn_to_index(gfn, slot->base_gfn, level);
0757 return &slot->arch.lpage_info[level - 2][idx];
0758 }
0759
0760 static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
0761 gfn_t gfn, int count)
0762 {
0763 struct kvm_lpage_info *linfo;
0764 int i;
0765
0766 for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
0767 linfo = lpage_info_slot(gfn, slot, i);
0768 linfo->disallow_lpage += count;
0769 WARN_ON(linfo->disallow_lpage < 0);
0770 }
0771 }
0772
0773 void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
0774 {
0775 update_gfn_disallow_lpage_count(slot, gfn, 1);
0776 }
0777
0778 void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
0779 {
0780 update_gfn_disallow_lpage_count(slot, gfn, -1);
0781 }
0782
0783 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
0784 {
0785 struct kvm_memslots *slots;
0786 struct kvm_memory_slot *slot;
0787 gfn_t gfn;
0788
0789 kvm->arch.indirect_shadow_pages++;
0790 gfn = sp->gfn;
0791 slots = kvm_memslots_for_spte_role(kvm, sp->role);
0792 slot = __gfn_to_memslot(slots, gfn);
0793
/* Guest pages backing non-leaf shadow pages are kept read-only via write-tracking. */
0795 if (sp->role.level > PG_LEVEL_4K)
0796 return kvm_slot_page_track_add_page(kvm, slot, gfn,
0797 KVM_PAGE_TRACK_WRITE);
0798
0799 kvm_mmu_gfn_disallow_lpage(slot, gfn);
0800
0801 if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
0802 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
0803 }
0804
0805 void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
0806 {
0807 if (sp->lpage_disallowed)
0808 return;
0809
0810 ++kvm->stat.nx_lpage_splits;
0811 list_add_tail(&sp->lpage_disallowed_link,
0812 &kvm->arch.lpage_disallowed_mmu_pages);
0813 sp->lpage_disallowed = true;
0814 }
0815
0816 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
0817 {
0818 struct kvm_memslots *slots;
0819 struct kvm_memory_slot *slot;
0820 gfn_t gfn;
0821
0822 kvm->arch.indirect_shadow_pages--;
0823 gfn = sp->gfn;
0824 slots = kvm_memslots_for_spte_role(kvm, sp->role);
0825 slot = __gfn_to_memslot(slots, gfn);
0826 if (sp->role.level > PG_LEVEL_4K)
0827 return kvm_slot_page_track_remove_page(kvm, slot, gfn,
0828 KVM_PAGE_TRACK_WRITE);
0829
0830 kvm_mmu_gfn_allow_lpage(slot, gfn);
0831 }
0832
0833 void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
0834 {
0835 --kvm->stat.nx_lpage_splits;
0836 sp->lpage_disallowed = false;
0837 list_del(&sp->lpage_disallowed_link);
0838 }
0839
0840 static struct kvm_memory_slot *
0841 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
0842 bool no_dirty_log)
0843 {
0844 struct kvm_memory_slot *slot;
0845
0846 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
0847 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
0848 return NULL;
0849 if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
0850 return NULL;
0851
0852 return slot;
0853 }
0854
/*
 * About rmap_head encoding:
 *
 * If bit zero of rmap_head->val is clear, then it points to the only spte
 * in this rmap chain.  Otherwise, (rmap_head->val & ~1) points to a struct
 * pte_list_desc containing more mappings.
 */

/*
 * Returns the number of pointers in the rmap chain, not counting the new one.
 */
0866 static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
0867 struct kvm_rmap_head *rmap_head)
0868 {
0869 struct pte_list_desc *desc;
0870 int count = 0;
0871
0872 if (!rmap_head->val) {
0873 rmap_printk("%p %llx 0->1\n", spte, *spte);
0874 rmap_head->val = (unsigned long)spte;
0875 } else if (!(rmap_head->val & 1)) {
0876 rmap_printk("%p %llx 1->many\n", spte, *spte);
0877 desc = kvm_mmu_memory_cache_alloc(cache);
0878 desc->sptes[0] = (u64 *)rmap_head->val;
0879 desc->sptes[1] = spte;
0880 desc->spte_count = 2;
0881 rmap_head->val = (unsigned long)desc | 1;
0882 ++count;
0883 } else {
0884 rmap_printk("%p %llx many->many\n", spte, *spte);
0885 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
0886 while (desc->spte_count == PTE_LIST_EXT) {
0887 count += PTE_LIST_EXT;
0888 if (!desc->more) {
0889 desc->more = kvm_mmu_memory_cache_alloc(cache);
0890 desc = desc->more;
0891 desc->spte_count = 0;
0892 break;
0893 }
0894 desc = desc->more;
0895 }
0896 count += desc->spte_count;
0897 desc->sptes[desc->spte_count++] = spte;
0898 }
0899 return count;
0900 }
0901
0902 static void
0903 pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
0904 struct pte_list_desc *desc, int i,
0905 struct pte_list_desc *prev_desc)
0906 {
0907 int j = desc->spte_count - 1;
0908
0909 desc->sptes[i] = desc->sptes[j];
0910 desc->sptes[j] = NULL;
0911 desc->spte_count--;
0912 if (desc->spte_count)
0913 return;
0914 if (!prev_desc && !desc->more)
0915 rmap_head->val = 0;
0916 else
0917 if (prev_desc)
0918 prev_desc->more = desc->more;
0919 else
0920 rmap_head->val = (unsigned long)desc->more | 1;
0921 mmu_free_pte_list_desc(desc);
0922 }
0923
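/*
 * Remove @spte from the pte_list/rmap headed at @rmap_head.  BUG()s if the
 * spte is not present, since callers must only remove sptes that were
 * previously added.
 */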
0924 static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
0925 {
0926 struct pte_list_desc *desc;
0927 struct pte_list_desc *prev_desc;
0928 int i;
0929
0930 if (!rmap_head->val) {
0931 pr_err("%s: %p 0->BUG\n", __func__, spte);
0932 BUG();
0933 } else if (!(rmap_head->val & 1)) {
0934 rmap_printk("%p 1->0\n", spte);
0935 if ((u64 *)rmap_head->val != spte) {
0936 pr_err("%s: %p 1->BUG\n", __func__, spte);
0937 BUG();
0938 }
0939 rmap_head->val = 0;
0940 } else {
0941 rmap_printk("%p many->many\n", spte);
0942 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
0943 prev_desc = NULL;
0944 while (desc) {
0945 for (i = 0; i < desc->spte_count; ++i) {
0946 if (desc->sptes[i] == spte) {
0947 pte_list_desc_remove_entry(rmap_head,
0948 desc, i, prev_desc);
0949 return;
0950 }
0951 }
0952 prev_desc = desc;
0953 desc = desc->more;
0954 }
0955 pr_err("%s: %p many->many\n", __func__, spte);
0956 BUG();
0957 }
0958 }
0959
0960 static void kvm_zap_one_rmap_spte(struct kvm *kvm,
0961 struct kvm_rmap_head *rmap_head, u64 *sptep)
0962 {
0963 mmu_spte_clear_track_bits(kvm, sptep);
0964 pte_list_remove(sptep, rmap_head);
0965 }
0966
/* Return true if at least one SPTE was zapped, false otherwise. */
0968 static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
0969 struct kvm_rmap_head *rmap_head)
0970 {
0971 struct pte_list_desc *desc, *next;
0972 int i;
0973
0974 if (!rmap_head->val)
0975 return false;
0976
0977 if (!(rmap_head->val & 1)) {
0978 mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
0979 goto out;
0980 }
0981
0982 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
0983
0984 for (; desc; desc = next) {
0985 for (i = 0; i < desc->spte_count; i++)
0986 mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
0987 next = desc->more;
0988 mmu_free_pte_list_desc(desc);
0989 }
0990 out:
/* rmap_head is meaningless now, remember to reset it. */
0992 rmap_head->val = 0;
0993 return true;
0994 }
0995
0996 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
0997 {
0998 struct pte_list_desc *desc;
0999 unsigned int count = 0;
1000
1001 if (!rmap_head->val)
1002 return 0;
1003 else if (!(rmap_head->val & 1))
1004 return 1;
1005
1006 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1007
1008 while (desc) {
1009 count += desc->spte_count;
1010 desc = desc->more;
1011 }
1012
1013 return count;
1014 }
1015
1016 static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
1017 const struct kvm_memory_slot *slot)
1018 {
1019 unsigned long idx;
1020
1021 idx = gfn_to_index(gfn, slot->base_gfn, level);
1022 return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
1023 }
1024
1025 static bool rmap_can_add(struct kvm_vcpu *vcpu)
1026 {
1027 struct kvm_mmu_memory_cache *mc;
1028
1029 mc = &vcpu->arch.mmu_pte_list_desc_cache;
1030 return kvm_mmu_memory_cache_nr_free_objects(mc);
1031 }
1032
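/*
 * Drop the rmap entry for @spte: look up the gfn it maps via its shadow
 * page, locate the matching rmap_head in the memslot and unlink the spte.
 */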
1033 static void rmap_remove(struct kvm *kvm, u64 *spte)
1034 {
1035 struct kvm_memslots *slots;
1036 struct kvm_memory_slot *slot;
1037 struct kvm_mmu_page *sp;
1038 gfn_t gfn;
1039 struct kvm_rmap_head *rmap_head;
1040
1041 sp = sptep_to_sp(spte);
1042 gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte));
1043
/*
 * Unlike rmap_add, rmap_remove does not run in the context of a vCPU,
 * so we have to determine which memslots to use based on context
 * information in sp->role.
 */
1049 slots = kvm_memslots_for_spte_role(kvm, sp->role);
1050
1051 slot = __gfn_to_memslot(slots, gfn);
1052 rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1053
1054 pte_list_remove(spte, rmap_head);
1055 }
1056
/*
 * Used by the following functions to iterate through the sptes linked by a
 * rmap.  All fields are private and not assumed to be used outside.
 */
1061 struct rmap_iterator {
/* private fields */
1063 struct pte_list_desc *desc;
1064 int pos;
1065 };
1066
/*
 * Iteration must be started by this function.  This should also be used after
 * removing/dropping sptes from the rmap link because in such cases the
 * information in the iterator may not be valid.
 *
 * Returns sptep if found, NULL otherwise.
 */
1074 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1075 struct rmap_iterator *iter)
1076 {
1077 u64 *sptep;
1078
1079 if (!rmap_head->val)
1080 return NULL;
1081
1082 if (!(rmap_head->val & 1)) {
1083 iter->desc = NULL;
1084 sptep = (u64 *)rmap_head->val;
1085 goto out;
1086 }
1087
1088 iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1089 iter->pos = 0;
1090 sptep = iter->desc->sptes[iter->pos];
1091 out:
1092 BUG_ON(!is_shadow_present_pte(*sptep));
1093 return sptep;
1094 }
1095
/*
 * Must be used with a valid iterator: e.g. after rmap_get_first().
 *
 * Returns sptep if found, NULL otherwise.
 */
1101 static u64 *rmap_get_next(struct rmap_iterator *iter)
1102 {
1103 u64 *sptep;
1104
1105 if (iter->desc) {
1106 if (iter->pos < PTE_LIST_EXT - 1) {
1107 ++iter->pos;
1108 sptep = iter->desc->sptes[iter->pos];
1109 if (sptep)
1110 goto out;
1111 }
1112
1113 iter->desc = iter->desc->more;
1114
1115 if (iter->desc) {
1116 iter->pos = 0;
/* desc->sptes[0] cannot be NULL */
1118 sptep = iter->desc->sptes[iter->pos];
1119 goto out;
1120 }
1121 }
1122
1123 return NULL;
1124 out:
1125 BUG_ON(!is_shadow_present_pte(*sptep));
1126 return sptep;
1127 }
1128
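/* Iterate over all present sptes in the rmap chain headed at @_rmap_head_. */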
1129 #define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
1130 for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
1131 _spte_; _spte_ = rmap_get_next(_iter_))
1132
1133 static void drop_spte(struct kvm *kvm, u64 *sptep)
1134 {
1135 u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
1136
1137 if (is_shadow_present_pte(old_spte))
1138 rmap_remove(kvm, sptep);
1139 }
1140
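/* Drop a large (huge-page) SPTE and, if requested, flush remote TLBs. */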
1141 static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
1142 {
1143 struct kvm_mmu_page *sp;
1144
1145 sp = sptep_to_sp(sptep);
1146 WARN_ON(sp->role.level == PG_LEVEL_4K);
1147
1148 drop_spte(kvm, sptep);
1149
1150 if (flush)
1151 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
1152 KVM_PAGES_PER_HPAGE(sp->role.level));
1153 }
1154
/*
 * Write-protect the given @sptep; @pt_protect indicates whether the
 * write-protection is done to protect a shadow page table.
 *
 * Note the difference between dirty logging and spte protection:
 * - for dirty logging, the spte can be made writable again at any time,
 *   provided its dirty bitmap is properly set.
 * - for spte protection, the spte can become writable only after the
 *   corresponding shadow page is unsynced.
 *
 * Return true if the TLB needs to be flushed.
 */
1168 static bool spte_write_protect(u64 *sptep, bool pt_protect)
1169 {
1170 u64 spte = *sptep;
1171
1172 if (!is_writable_pte(spte) &&
1173 !(pt_protect && is_mmu_writable_spte(spte)))
1174 return false;
1175
1176 rmap_printk("spte %p %llx\n", sptep, *sptep);
1177
1178 if (pt_protect)
1179 spte &= ~shadow_mmu_writable_mask;
1180 spte = spte & ~PT_WRITABLE_MASK;
1181
1182 return mmu_spte_update(sptep, spte);
1183 }
1184
1185 static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
1186 bool pt_protect)
1187 {
1188 u64 *sptep;
1189 struct rmap_iterator iter;
1190 bool flush = false;
1191
1192 for_each_rmap_spte(rmap_head, &iter, sptep)
1193 flush |= spte_write_protect(sptep, pt_protect);
1194
1195 return flush;
1196 }
1197
1198 static bool spte_clear_dirty(u64 *sptep)
1199 {
1200 u64 spte = *sptep;
1201
1202 rmap_printk("spte %p %llx\n", sptep, *sptep);
1203
1204 MMU_WARN_ON(!spte_ad_enabled(spte));
1205 spte &= ~shadow_dirty_mask;
1206 return mmu_spte_update(sptep, spte);
1207 }
1208
1209 static bool spte_wrprot_for_clear_dirty(u64 *sptep)
1210 {
1211 bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1212 (unsigned long *)sptep);
1213 if (was_writable && !spte_ad_enabled(*sptep))
1214 kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1215
1216 return was_writable;
1217 }
1218
/*
 * Gets the GFN ready for another round of dirty logging by clearing the
 *	- D bit on ad-enabled SPTEs, and
 *	- W bit on ad-disabled SPTEs.
 * Returns true iff any D or W bits were cleared.
 */
1225 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1226 const struct kvm_memory_slot *slot)
1227 {
1228 u64 *sptep;
1229 struct rmap_iterator iter;
1230 bool flush = false;
1231
1232 for_each_rmap_spte(rmap_head, &iter, sptep)
1233 if (spte_ad_need_write_protect(*sptep))
1234 flush |= spte_wrprot_for_clear_dirty(sptep);
1235 else
1236 flush |= spte_clear_dirty(sptep);
1237
1238 return flush;
1239 }
1240
/**
 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
 * @kvm: kvm instance
 * @slot: slot to protect
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages we should protect
 *
 * Used when we do not need to care about huge page mappings.
 */
1250 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1251 struct kvm_memory_slot *slot,
1252 gfn_t gfn_offset, unsigned long mask)
1253 {
1254 struct kvm_rmap_head *rmap_head;
1255
1256 if (is_tdp_mmu_enabled(kvm))
1257 kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1258 slot->base_gfn + gfn_offset, mask, true);
1259
1260 if (!kvm_memslots_have_rmaps(kvm))
1261 return;
1262
1263 while (mask) {
1264 rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1265 PG_LEVEL_4K, slot);
1266 rmap_write_protect(rmap_head, false);
1267
/* clear the first set bit */
1269 mask &= mask - 1;
1270 }
1271 }
1272
/**
 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
 *				   protect the page if the D-bit isn't supported.
 * @kvm: kvm instance
 * @slot: slot to clear D-bit
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages we should clear D-bit
 *
 * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
 */
1283 static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1284 struct kvm_memory_slot *slot,
1285 gfn_t gfn_offset, unsigned long mask)
1286 {
1287 struct kvm_rmap_head *rmap_head;
1288
1289 if (is_tdp_mmu_enabled(kvm))
1290 kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1291 slot->base_gfn + gfn_offset, mask, false);
1292
1293 if (!kvm_memslots_have_rmaps(kvm))
1294 return;
1295
1296 while (mask) {
1297 rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1298 PG_LEVEL_4K, slot);
1299 __rmap_clear_dirty(kvm, rmap_head, slot);
1300
/* clear the first set bit */
1302 mask &= mask - 1;
1303 }
1304 }
1305
/**
 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
 * PT level pages.
 *
 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
 * enable dirty logging for them.
 *
 * We need to care about huge page mappings: e.g. during dirty logging we may
 * have such mappings.
 */
1316 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1317 struct kvm_memory_slot *slot,
1318 gfn_t gfn_offset, unsigned long mask)
1319 {
/*
 * Huge pages are NOT write protected when we start dirty logging in
 * initially-all-set mode; they must be write protected here so that they
 * are split to 4K on the first write.
 *
 * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
 * of the memslot has no such restriction, so the range can cross two large
 * pages.
 */
1329 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
1330 gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
1331 gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
1332
1333 if (READ_ONCE(eager_page_split))
1334 kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
1335
1336 kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
1337
/* Cross two large pages? */
1339 if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
1340 ALIGN(end << PAGE_SHIFT, PMD_SIZE))
1341 kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
1342 PG_LEVEL_2M);
1343 }
1344
/* Now handle 4K PTEs. */
1346 if (kvm_x86_ops.cpu_dirty_log_size)
1347 kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
1348 else
1349 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1350 }
1351
1352 int kvm_cpu_dirty_log_size(void)
1353 {
1354 return kvm_x86_ops.cpu_dirty_log_size;
1355 }
1356
1357 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1358 struct kvm_memory_slot *slot, u64 gfn,
1359 int min_level)
1360 {
1361 struct kvm_rmap_head *rmap_head;
1362 int i;
1363 bool write_protected = false;
1364
1365 if (kvm_memslots_have_rmaps(kvm)) {
1366 for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
1367 rmap_head = gfn_to_rmap(gfn, i, slot);
1368 write_protected |= rmap_write_protect(rmap_head, true);
1369 }
1370 }
1371
1372 if (is_tdp_mmu_enabled(kvm))
1373 write_protected |=
1374 kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
1375
1376 return write_protected;
1377 }
1378
1379 static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
1380 {
1381 struct kvm_memory_slot *slot;
1382
1383 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1384 return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
1385 }
1386
1387 static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1388 const struct kvm_memory_slot *slot)
1389 {
1390 return kvm_zap_all_rmap_sptes(kvm, rmap_head);
1391 }
1392
1393 static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1394 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1395 pte_t unused)
1396 {
1397 return __kvm_zap_rmap(kvm, rmap_head, slot);
1398 }
1399
1400 static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1401 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1402 pte_t pte)
1403 {
1404 u64 *sptep;
1405 struct rmap_iterator iter;
1406 bool need_flush = false;
1407 u64 new_spte;
1408 kvm_pfn_t new_pfn;
1409
1410 WARN_ON(pte_huge(pte));
1411 new_pfn = pte_pfn(pte);
1412
1413 restart:
1414 for_each_rmap_spte(rmap_head, &iter, sptep) {
1415 rmap_printk("spte %p %llx gfn %llx (%d)\n",
1416 sptep, *sptep, gfn, level);
1417
1418 need_flush = true;
1419
1420 if (pte_write(pte)) {
1421 kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
1422 goto restart;
1423 } else {
1424 new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1425 *sptep, new_pfn);
1426
1427 mmu_spte_clear_track_bits(kvm, sptep);
1428 mmu_spte_set(sptep, new_spte);
1429 }
1430 }
1431
1432 if (need_flush && kvm_available_flush_tlb_with_range()) {
1433 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1434 return false;
1435 }
1436
1437 return need_flush;
1438 }
1439
1440 struct slot_rmap_walk_iterator {
/* input fields. */
1442 const struct kvm_memory_slot *slot;
1443 gfn_t start_gfn;
1444 gfn_t end_gfn;
1445 int start_level;
1446 int end_level;
1447
/* output fields. */
1449 gfn_t gfn;
1450 struct kvm_rmap_head *rmap;
1451 int level;
1452
/* private field. */
1454 struct kvm_rmap_head *end_rmap;
1455 };
1456
1457 static void
1458 rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1459 {
1460 iterator->level = level;
1461 iterator->gfn = iterator->start_gfn;
1462 iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
1463 iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
1464 }
1465
1466 static void
1467 slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1468 const struct kvm_memory_slot *slot, int start_level,
1469 int end_level, gfn_t start_gfn, gfn_t end_gfn)
1470 {
1471 iterator->slot = slot;
1472 iterator->start_level = start_level;
1473 iterator->end_level = end_level;
1474 iterator->start_gfn = start_gfn;
1475 iterator->end_gfn = end_gfn;
1476
1477 rmap_walk_init_level(iterator, iterator->start_level);
1478 }
1479
1480 static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1481 {
1482 return !!iterator->rmap;
1483 }
1484
1485 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1486 {
1487 while (++iterator->rmap <= iterator->end_rmap) {
1488 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1489
1490 if (iterator->rmap->val)
1491 return;
1492 }
1493
1494 if (++iterator->level > iterator->end_level) {
1495 iterator->rmap = NULL;
1496 return;
1497 }
1498
1499 rmap_walk_init_level(iterator, iterator->level);
1500 }
1501
1502 #define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
1503 _start_gfn, _end_gfn, _iter_) \
1504 for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
1505 _end_level_, _start_gfn, _end_gfn); \
1506 slot_rmap_walk_okay(_iter_); \
1507 slot_rmap_walk_next(_iter_))
1508
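/*
 * Handler invoked for each non-empty rmap bucket visited by
 * kvm_handle_gfn_range(); returns true if a TLB flush is needed.
 */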
1509 typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1510 struct kvm_memory_slot *slot, gfn_t gfn,
1511 int level, pte_t pte);
1512
1513 static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
1514 struct kvm_gfn_range *range,
1515 rmap_handler_t handler)
1516 {
1517 struct slot_rmap_walk_iterator iterator;
1518 bool ret = false;
1519
1520 for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
1521 range->start, range->end - 1, &iterator)
1522 ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
1523 iterator.level, range->pte);
1524
1525 return ret;
1526 }
1527
1528 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1529 {
1530 bool flush = false;
1531
1532 if (kvm_memslots_have_rmaps(kvm))
1533 flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
1534
1535 if (is_tdp_mmu_enabled(kvm))
1536 flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
1537
1538 return flush;
1539 }
1540
1541 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1542 {
1543 bool flush = false;
1544
1545 if (kvm_memslots_have_rmaps(kvm))
1546 flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);
1547
1548 if (is_tdp_mmu_enabled(kvm))
1549 flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
1550
1551 return flush;
1552 }
1553
1554 static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1555 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1556 pte_t unused)
1557 {
1558 u64 *sptep;
1559 struct rmap_iterator iter;
1560 int young = 0;
1561
1562 for_each_rmap_spte(rmap_head, &iter, sptep)
1563 young |= mmu_spte_age(sptep);
1564
1565 return young;
1566 }
1567
1568 static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1569 struct kvm_memory_slot *slot, gfn_t gfn,
1570 int level, pte_t unused)
1571 {
1572 u64 *sptep;
1573 struct rmap_iterator iter;
1574
1575 for_each_rmap_spte(rmap_head, &iter, sptep)
1576 if (is_accessed_spte(*sptep))
1577 return true;
1578 return false;
1579 }
1580
1581 #define RMAP_RECYCLE_THRESHOLD 1000
1582
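/*
 * Record the gfn->spte reverse mapping for a newly installed leaf SPTE and
 * zap the entire rmap chain if it has grown past RMAP_RECYCLE_THRESHOLD.
 */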
1583 static void __rmap_add(struct kvm *kvm,
1584 struct kvm_mmu_memory_cache *cache,
1585 const struct kvm_memory_slot *slot,
1586 u64 *spte, gfn_t gfn, unsigned int access)
1587 {
1588 struct kvm_mmu_page *sp;
1589 struct kvm_rmap_head *rmap_head;
1590 int rmap_count;
1591
1592 sp = sptep_to_sp(spte);
1593 kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access);
1594 kvm_update_page_stats(kvm, sp->role.level, 1);
1595
1596 rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1597 rmap_count = pte_list_add(cache, spte, rmap_head);
1598
1599 if (rmap_count > kvm->stat.max_mmu_rmap_size)
1600 kvm->stat.max_mmu_rmap_size = rmap_count;
1601 if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
1602 kvm_zap_all_rmap_sptes(kvm, rmap_head);
1603 kvm_flush_remote_tlbs_with_address(
1604 kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
1605 }
1606 }
1607
1608 static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
1609 u64 *spte, gfn_t gfn, unsigned int access)
1610 {
1611 struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache;
1612
1613 __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
1614 }
1615
1616 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1617 {
1618 bool young = false;
1619
1620 if (kvm_memslots_have_rmaps(kvm))
1621 young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);
1622
1623 if (is_tdp_mmu_enabled(kvm))
1624 young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
1625
1626 return young;
1627 }
1628
1629 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1630 {
1631 bool young = false;
1632
1633 if (kvm_memslots_have_rmaps(kvm))
1634 young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap);
1635
1636 if (is_tdp_mmu_enabled(kvm))
1637 young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
1638
1639 return young;
1640 }
1641
1642 #ifdef MMU_DEBUG
1643 static int is_empty_shadow_page(u64 *spt)
1644 {
1645 u64 *pos;
1646 u64 *end;
1647
1648 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
1649 if (is_shadow_present_pte(*pos)) {
1650 printk(KERN_ERR "%s: %p %llx\n", __func__,
1651 pos, *pos);
1652 return 0;
1653 }
1654 return 1;
1655 }
1656 #endif
1657
/*
 * This value is the sum of all of the kvm instances's
 * kvm->arch.n_used_mmu_pages values.  We need a global,
 * aggregate version in order to make the slab shrinker
 * faster.
 */
1664 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
1665 {
1666 kvm->arch.n_used_mmu_pages += nr;
1667 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1668 }
1669
1670 static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
1671 {
1672 MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
1673 hlist_del(&sp->hash_link);
1674 list_del(&sp->link);
1675 free_page((unsigned long)sp->spt);
1676 if (!sp->role.direct)
1677 free_page((unsigned long)sp->shadowed_translation);
1678 kmem_cache_free(mmu_page_header_cache, sp);
1679 }
1680
1681 static unsigned kvm_page_table_hashfn(gfn_t gfn)
1682 {
1683 return hash_64(gfn, KVM_MMU_HASH_SHIFT);
1684 }
1685
1686 static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
1687 struct kvm_mmu_page *sp, u64 *parent_pte)
1688 {
1689 if (!parent_pte)
1690 return;
1691
1692 pte_list_add(cache, parent_pte, &sp->parent_ptes);
1693 }
1694
1695 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1696 u64 *parent_pte)
1697 {
1698 pte_list_remove(parent_pte, &sp->parent_ptes);
1699 }
1700
1701 static void drop_parent_pte(struct kvm_mmu_page *sp,
1702 u64 *parent_pte)
1703 {
1704 mmu_page_remove_parent_pte(sp, parent_pte);
1705 mmu_spte_clear_no_track(parent_pte);
1706 }
1707
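/*
 * Propagate the "unsync child" state up the shadow paging tree by setting
 * the corresponding bit in each parent's unsync_child_bitmap.
 */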
1708 static void mark_unsync(u64 *spte);
1709 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1710 {
1711 u64 *sptep;
1712 struct rmap_iterator iter;
1713
1714 for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
1715 mark_unsync(sptep);
1716 }
1717 }
1718
1719 static void mark_unsync(u64 *spte)
1720 {
1721 struct kvm_mmu_page *sp;
1722
1723 sp = sptep_to_sp(spte);
1724 if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap))
1725 return;
1726 if (sp->unsync_children++)
1727 return;
1728 kvm_mmu_mark_parents_unsync(sp);
1729 }
1730
1731 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1732 struct kvm_mmu_page *sp)
1733 {
1734 return -1;
1735 }
1736
1737 #define KVM_PAGE_ARRAY_NR 16
1738
1739 struct kvm_mmu_pages {
1740 struct mmu_page_and_offset {
1741 struct kvm_mmu_page *sp;
1742 unsigned int idx;
1743 } page[KVM_PAGE_ARRAY_NR];
1744 unsigned int nr;
1745 };
1746
1747 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1748 int idx)
1749 {
1750 int i;
1751
1752 if (sp->unsync)
1753 for (i=0; i < pvec->nr; i++)
1754 if (pvec->page[i].sp == sp)
1755 return 0;
1756
1757 pvec->page[pvec->nr].sp = sp;
1758 pvec->page[pvec->nr].idx = idx;
1759 pvec->nr++;
1760 return (pvec->nr == KVM_PAGE_ARRAY_NR);
1761 }
1762
1763 static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
1764 {
1765 --sp->unsync_children;
1766 WARN_ON((int)sp->unsync_children < 0);
1767 __clear_bit(idx, sp->unsync_child_bitmap);
1768 }
1769
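/*
 * Walk the subtree below @sp and add all unsync leaf pages (and intermediate
 * pages with unsync children) to @pvec.  Returns the number of unsync leaves
 * found, or -ENOSPC if @pvec is full.
 */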
1770 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1771 struct kvm_mmu_pages *pvec)
1772 {
1773 int i, ret, nr_unsync_leaf = 0;
1774
1775 for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
1776 struct kvm_mmu_page *child;
1777 u64 ent = sp->spt[i];
1778
1779 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
1780 clear_unsync_child_bit(sp, i);
1781 continue;
1782 }
1783
1784 child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK);
1785
1786 if (child->unsync_children) {
1787 if (mmu_pages_add(pvec, child, i))
1788 return -ENOSPC;
1789
1790 ret = __mmu_unsync_walk(child, pvec);
1791 if (!ret) {
1792 clear_unsync_child_bit(sp, i);
1793 continue;
1794 } else if (ret > 0) {
1795 nr_unsync_leaf += ret;
1796 } else
1797 return ret;
1798 } else if (child->unsync) {
1799 nr_unsync_leaf++;
1800 if (mmu_pages_add(pvec, child, i))
1801 return -ENOSPC;
1802 } else
1803 clear_unsync_child_bit(sp, i);
1804 }
1805
1806 return nr_unsync_leaf;
1807 }
1808
1809 #define INVALID_INDEX (-1)
1810
1811 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1812 struct kvm_mmu_pages *pvec)
1813 {
1814 pvec->nr = 0;
1815 if (!sp->unsync_children)
1816 return 0;
1817
1818 mmu_pages_add(pvec, sp, INVALID_INDEX);
1819 return __mmu_unsync_walk(sp, pvec);
1820 }
1821
1822 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1823 {
1824 WARN_ON(!sp->unsync);
1825 trace_kvm_mmu_sync_page(sp);
1826 sp->unsync = 0;
1827 --kvm->stat.mmu_unsync;
1828 }
1829
1830 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1831 struct list_head *invalid_list);
1832 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1833 struct list_head *invalid_list);
1834
1835 static bool sp_has_gptes(struct kvm_mmu_page *sp)
1836 {
1837 if (sp->role.direct)
1838 return false;
1839
1840 if (sp->role.passthrough)
1841 return false;
1842
1843 return true;
1844 }
1845
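/*
 * Iterate over shadow pages in a hash bucket, skipping obsolete pages; the
 * _gfn variant additionally filters on gfn and on pages that contain guest
 * PTEs (i.e. can become unsync).
 */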
1846 #define for_each_valid_sp(_kvm, _sp, _list) \
1847 hlist_for_each_entry(_sp, _list, hash_link) \
1848 if (is_obsolete_sp((_kvm), (_sp))) { \
1849 } else
1850
1851 #define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \
1852 for_each_valid_sp(_kvm, _sp, \
1853 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
1854 if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
1855
1856 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1857 struct list_head *invalid_list)
1858 {
1859 int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
1860
1861 if (ret < 0)
1862 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1863 return ret;
1864 }
1865
1866 static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
1867 struct list_head *invalid_list,
1868 bool remote_flush)
1869 {
1870 if (!remote_flush && list_empty(invalid_list))
1871 return false;
1872
1873 if (!list_empty(invalid_list))
1874 kvm_mmu_commit_zap_page(kvm, invalid_list);
1875 else
1876 kvm_flush_remote_tlbs(kvm);
1877 return true;
1878 }
1879
1880 static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
1881 {
1882 if (sp->role.invalid)
1883 return true;
1884
/* TDP MMU pages do not use the MMU generation. */
1886 return !sp->tdp_mmu_page &&
1887 unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
1888 }
1889
1890 struct mmu_page_path {
1891 struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
1892 unsigned int idx[PT64_ROOT_MAX_LEVEL];
1893 };
1894
1895 #define for_each_sp(pvec, sp, parents, i) \
1896 for (i = mmu_pages_first(&pvec, &parents); \
1897 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
1898 i = mmu_pages_next(&pvec, &parents, i))
1899
1900 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1901 struct mmu_page_path *parents,
1902 int i)
1903 {
1904 int n;
1905
1906 for (n = i+1; n < pvec->nr; n++) {
1907 struct kvm_mmu_page *sp = pvec->page[n].sp;
1908 unsigned idx = pvec->page[n].idx;
1909 int level = sp->role.level;
1910
1911 parents->idx[level-1] = idx;
1912 if (level == PG_LEVEL_4K)
1913 break;
1914
1915 parents->parent[level-2] = sp;
1916 }
1917
1918 return n;
1919 }
1920
1921 static int mmu_pages_first(struct kvm_mmu_pages *pvec,
1922 struct mmu_page_path *parents)
1923 {
1924 struct kvm_mmu_page *sp;
1925 int level;
1926
1927 if (pvec->nr == 0)
1928 return 0;
1929
1930 WARN_ON(pvec->page[0].idx != INVALID_INDEX);
1931
1932 sp = pvec->page[0].sp;
1933 level = sp->role.level;
1934 WARN_ON(level == PG_LEVEL_4K);
1935
1936 parents->parent[level-2] = sp;
1937
/*
 * Also set up a sentinel: further entries in pvec are all children
 * of sp, so this element is never overwritten.
 */
1941 parents->parent[level-1] = NULL;
1942 return mmu_pages_next(pvec, parents, 0);
1943 }
1944
1945 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1946 {
1947 struct kvm_mmu_page *sp;
1948 unsigned int level = 0;
1949
1950 do {
1951 unsigned int idx = parents->idx[level];
1952 sp = parents->parent[level];
1953 if (!sp)
1954 return;
1955
1956 WARN_ON(idx == INVALID_INDEX);
1957 clear_unsync_child_bit(sp, idx);
1958 level++;
1959 } while (!sp->unsync_children);
1960 }
1961
1962 static int mmu_sync_children(struct kvm_vcpu *vcpu,
1963 struct kvm_mmu_page *parent, bool can_yield)
1964 {
1965 int i;
1966 struct kvm_mmu_page *sp;
1967 struct mmu_page_path parents;
1968 struct kvm_mmu_pages pages;
1969 LIST_HEAD(invalid_list);
1970 bool flush = false;
1971
1972 while (mmu_unsync_walk(parent, &pages)) {
1973 bool protected = false;
1974
1975 for_each_sp(pages, sp, parents, i)
1976 protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
1977
1978 if (protected) {
1979 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
1980 flush = false;
1981 }
1982
1983 for_each_sp(pages, sp, parents, i) {
1984 kvm_unlink_unsync_page(vcpu->kvm, sp);
1985 flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
1986 mmu_pages_clear_parents(&parents);
1987 }
1988 if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
1989 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
1990 if (!can_yield) {
1991 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1992 return -EINTR;
1993 }
1994
1995 cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
1996 flush = false;
1997 }
1998 }
1999
2000 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
2001 return 0;
2002 }
2003
2004 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2005 {
2006 atomic_set(&sp->write_flooding_count, 0);
2007 }
2008
2009 static void clear_sp_write_flooding_count(u64 *spte)
2010 {
2011 __clear_sp_write_flooding_count(sptep_to_sp(spte));
2012 }
2013
/*
 * The vCPU is required when finding indirect shadow pages; the shadow
 * page may already exist and syncing it needs the vCPU pointer in
 * order to read guest page tables.  Direct shadow pages are never
 * unsync, thus @vcpu can be NULL if @role.direct is true.
 */
2020 static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
2021 struct kvm_vcpu *vcpu,
2022 gfn_t gfn,
2023 struct hlist_head *sp_list,
2024 union kvm_mmu_page_role role)
2025 {
2026 struct kvm_mmu_page *sp;
2027 int ret;
2028 int collisions = 0;
2029 LIST_HEAD(invalid_list);
2030
2031 for_each_valid_sp(kvm, sp, sp_list) {
2032 if (sp->gfn != gfn) {
2033 collisions++;
2034 continue;
2035 }
2036
2037 if (sp->role.word != role.word) {
/*
 * If the guest is creating an upper-level page, zap
 * unsync pages for the same gfn.  While it's possible
 * the guest is using recursive page tables, in all
 * likelihood the guest has stopped using the unsync
 * page and is installing a completely unrelated page.
 * Unsync pages must not be left as is, because the new
 * upper-level page will be write-protected.
 */
2047 if (role.level > PG_LEVEL_4K && sp->unsync)
2048 kvm_mmu_prepare_zap_page(kvm, sp,
2049 &invalid_list);
2050 continue;
2051 }
2052
/* unsync and write-flooding only apply to indirect SPs. */
2054 if (sp->role.direct)
2055 goto out;
2056
2057 if (sp->unsync) {
2058 if (KVM_BUG_ON(!vcpu, kvm))
2059 break;
2060
/*
 * The page is good, but is stale.  kvm_sync_page does
 * get the latest guest state, but (unlike mmu_unsync_children)
 * it doesn't write-protect the page or mark it synchronized!
 * This way the validity of the mapping is ensured, but the
 * overhead of write protection is not incurred until the
 * guest invalidates the TLB mapping.  This allows multiple
 * SPs for a single gfn to be unsync.
 *
 * If the sync fails, the page is zapped.  If so, break
 * in order to rebuild it.
 */
2073 ret = kvm_sync_page(vcpu, sp, &invalid_list);
2074 if (ret < 0)
2075 break;
2076
2077 WARN_ON(!list_empty(&invalid_list));
2078 if (ret > 0)
2079 kvm_flush_remote_tlbs(kvm);
2080 }
2081
2082 __clear_sp_write_flooding_count(sp);
2083
2084 goto out;
2085 }
2086
2087 sp = NULL;
2088 ++kvm->stat.mmu_cache_miss;
2089
2090 out:
2091 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2092
2093 if (collisions > kvm->stat.max_mmu_page_hash_collisions)
2094 kvm->stat.max_mmu_page_hash_collisions = collisions;
2095 return sp;
2096 }
2097
/* Caches used when allocating a new shadow page. */
2099 struct shadow_page_caches {
2100 struct kvm_mmu_memory_cache *page_header_cache;
2101 struct kvm_mmu_memory_cache *shadow_page_cache;
2102 struct kvm_mmu_memory_cache *shadowed_info_cache;
2103 };
2104
2105 static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
2106 struct shadow_page_caches *caches,
2107 gfn_t gfn,
2108 struct hlist_head *sp_list,
2109 union kvm_mmu_page_role role)
2110 {
2111 struct kvm_mmu_page *sp;
2112
2113 sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
2114 sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
2115 if (!role.direct)
2116 sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
2117
2118 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2119
/*
 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
 * depends on valid pages being added to the head of the list.  See
 * comments in kvm_zap_obsolete_pages().
 */
2125 sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
2126 list_add(&sp->link, &kvm->arch.active_mmu_pages);
2127 kvm_mod_used_mmu_pages(kvm, +1);
2128
2129 sp->gfn = gfn;
2130 sp->role = role;
2131 hlist_add_head(&sp->hash_link, sp_list);
2132 if (sp_has_gptes(sp))
2133 account_shadowed(kvm, sp);
2134
2135 return sp;
2136 }
2137
/* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */
2139 static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
2140 struct kvm_vcpu *vcpu,
2141 struct shadow_page_caches *caches,
2142 gfn_t gfn,
2143 union kvm_mmu_page_role role)
2144 {
2145 struct hlist_head *sp_list;
2146 struct kvm_mmu_page *sp;
2147 bool created = false;
2148
2149 sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
2150
2151 sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
2152 if (!sp) {
2153 created = true;
2154 sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role);
2155 }
2156
2157 trace_kvm_mmu_get_page(sp, created);
2158 return sp;
2159 }
2160
2161 static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
2162 gfn_t gfn,
2163 union kvm_mmu_page_role role)
2164 {
2165 struct shadow_page_caches caches = {
2166 .page_header_cache = &vcpu->arch.mmu_page_header_cache,
2167 .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
2168 .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
2169 };
2170
2171 return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
2172 }
2173
2174 static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct,
2175 unsigned int access)
2176 {
2177 struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
2178 union kvm_mmu_page_role role;
2179
2180 role = parent_sp->role;
2181 role.level--;
2182 role.access = access;
2183 role.direct = direct;
2184 role.passthrough = 0;
2185
/*
 * If the guest has 4-byte PTEs then it is using 32-bit, 2-level,
 * non-PAE paging.  KVM shadows such guests with PAE paging (8-byte
 * PTEs), so each guest page table must be shadowed by multiple
 * shadow page tables, which requires extra bookkeeping in the role.
 *
 * Specifically, to shadow the guest's page directory (which covers a
 * 4GiB address space), KVM uses 4 PAE page directories, each mapping
 * 1GiB of the address space.  @role.quadrant encodes which quarter of
 * the address space each maps.
 *
 * To shadow the guest's page tables (which each map a 4MiB region),
 * KVM uses 2 PAE page tables, each mapping a 2MiB region.  For these,
 * @role.quadrant encodes which half of the region they map.
 *
 * Note, the 4 PAE page directories are pre-allocated and their quadrant
 * is assigned in mmu_alloc_root(); the logic below applies only to the
 * page tables, where the quadrant comes from bit 21 of the address,
 * i.e. spte_index(sptep) & 1.
 */
2212 if (role.has_4_byte_gpte) {
2213 WARN_ON_ONCE(role.level != PG_LEVEL_4K);
2214 role.quadrant = spte_index(sptep) & 1;
2215 }
2216
2217 return role;
2218 }
2219
2220 static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu,
2221 u64 *sptep, gfn_t gfn,
2222 bool direct, unsigned int access)
2223 {
2224 union kvm_mmu_page_role role;
2225
2226 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
2227 return ERR_PTR(-EEXIST);
2228
2229 role = kvm_mmu_child_role(sptep, direct, access);
2230 return kvm_mmu_get_shadow_page(vcpu, gfn, role);
2231 }
2232
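/*
 * Initialize a shadow-walk iterator for @addr starting at the given @root,
 * handling the PAE case where the walk starts at a pae_root entry.
 */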
2233 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2234 struct kvm_vcpu *vcpu, hpa_t root,
2235 u64 addr)
2236 {
2237 iterator->addr = addr;
2238 iterator->shadow_addr = root;
2239 iterator->level = vcpu->arch.mmu->root_role.level;
2240
2241 if (iterator->level >= PT64_ROOT_4LEVEL &&
2242 vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
2243 !vcpu->arch.mmu->root_role.direct)
2244 iterator->level = PT32E_ROOT_LEVEL;
2245
2246 if (iterator->level == PT32E_ROOT_LEVEL) {
/*
 * The pae_root array tracks only the current root, so walking a
 * previously cached root is not supported here.
 */
2251 BUG_ON(root != vcpu->arch.mmu->root.hpa);
2252
2253 iterator->shadow_addr
2254 = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2255 iterator->shadow_addr &= SPTE_BASE_ADDR_MASK;
2256 --iterator->level;
2257 if (!iterator->shadow_addr)
2258 iterator->level = 0;
2259 }
2260 }
2261
2262 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2263 struct kvm_vcpu *vcpu, u64 addr)
2264 {
2265 shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
2266 addr);
2267 }
2268
2269 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2270 {
2271 if (iterator->level < PG_LEVEL_4K)
2272 return false;
2273
2274 iterator->index = SPTE_INDEX(iterator->addr, iterator->level);
2275 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2276 return true;
2277 }
2278
2279 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2280 u64 spte)
2281 {
2282 if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
2283 iterator->level = 0;
2284 return;
2285 }
2286
2287 iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK;
2288 --iterator->level;
2289 }
2290
2291 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2292 {
2293 __shadow_walk_next(iterator, *iterator->sptep);
2294 }
2295
2296 static void __link_shadow_page(struct kvm *kvm,
2297 struct kvm_mmu_memory_cache *cache, u64 *sptep,
2298 struct kvm_mmu_page *sp, bool flush)
2299 {
2300 u64 spte;
2301
2302 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2303
/*
 * If an SPTE is present already, it must be a leaf and therefore
 * a large one.  Drop it, and flush the TLB if needed, before
 * installing sp.
 */
2309 if (is_shadow_present_pte(*sptep))
2310 drop_large_spte(kvm, sptep, flush);
2311
2312 spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
2313
2314 mmu_spte_set(sptep, spte);
2315
2316 mmu_page_add_parent_pte(cache, sp, sptep);
2317
2318 if (sp->unsync_children || sp->unsync)
2319 mark_unsync(sptep);
2320 }
2321
2322 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2323 struct kvm_mmu_page *sp)
2324 {
2325 __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true);
2326 }
2327
2328 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2329 unsigned direct_access)
2330 {
2331 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2332 struct kvm_mmu_page *child;
2333
2334
2335
2336
2337
2338
2339
2340
2341 child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK);
2342 if (child->role.access == direct_access)
2343 return;
2344
2345 drop_parent_pte(child, sptep);
2346 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2347 }
2348 }
2349
2350
2351 static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2352 u64 *spte, struct list_head *invalid_list)
2353 {
2354 u64 pte;
2355 struct kvm_mmu_page *child;
2356
2357 pte = *spte;
2358 if (is_shadow_present_pte(pte)) {
2359 if (is_last_spte(pte, sp->role.level)) {
2360 drop_spte(kvm, spte);
2361 } else {
2362 child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
2363 drop_parent_pte(child, spte);
2364
2365
2366
2367
2368
2369
2370 if (tdp_enabled && invalid_list &&
2371 child->role.guest_mode && !child->parent_ptes.val)
2372 return kvm_mmu_prepare_zap_page(kvm, child,
2373 invalid_list);
2374 }
2375 } else if (is_mmio_spte(pte)) {
2376 mmu_spte_clear_no_track(spte);
2377 }
2378 return 0;
2379 }
2380
2381 static int kvm_mmu_page_unlink_children(struct kvm *kvm,
2382 struct kvm_mmu_page *sp,
2383 struct list_head *invalid_list)
2384 {
2385 int zapped = 0;
2386 unsigned i;
2387
2388 for (i = 0; i < SPTE_ENT_PER_PAGE; ++i)
2389 zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
2390
2391 return zapped;
2392 }
2393
2394 static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
2395 {
2396 u64 *sptep;
2397 struct rmap_iterator iter;
2398
2399 while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2400 drop_parent_pte(sp, sptep);
2401 }
2402
2403 static int mmu_zap_unsync_children(struct kvm *kvm,
2404 struct kvm_mmu_page *parent,
2405 struct list_head *invalid_list)
2406 {
2407 int i, zapped = 0;
2408 struct mmu_page_path parents;
2409 struct kvm_mmu_pages pages;
2410
2411 if (parent->role.level == PG_LEVEL_4K)
2412 return 0;
2413
2414 while (mmu_unsync_walk(parent, &pages)) {
2415 struct kvm_mmu_page *sp;
2416
2417 for_each_sp(pages, sp, parents, i) {
2418 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2419 mmu_pages_clear_parents(&parents);
2420 zapped++;
2421 }
2422 }
2423
2424 return zapped;
2425 }
2426
2427 static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2428 struct kvm_mmu_page *sp,
2429 struct list_head *invalid_list,
2430 int *nr_zapped)
2431 {
2432 bool list_unstable, zapped_root = false;
2433
2434 trace_kvm_mmu_prepare_zap_page(sp);
2435 ++kvm->stat.mmu_shadow_zapped;
2436 *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2437 *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
2438 kvm_mmu_unlink_parents(sp);
2439
2440 /* Zapping children means active_mmu_pages has become unstable. */
2441 list_unstable = *nr_zapped;
2442
2443 if (!sp->role.invalid && sp_has_gptes(sp))
2444 unaccount_shadowed(kvm, sp);
2445
2446 if (sp->unsync)
2447 kvm_unlink_unsync_page(kvm, sp);
2448 if (!sp->root_count) {
2449 /* Count the page being zapped itself. */
2450 (*nr_zapped)++;
2451
2452
2453
2454
2455
2456
2457 if (sp->role.invalid)
2458 list_add(&sp->link, invalid_list);
2459 else
2460 list_move(&sp->link, invalid_list);
2461 kvm_mod_used_mmu_pages(kvm, -1);
2462 } else {
2463 /*
2464 * Remove the active root from the active page list; the root will be
2465 * explicitly freed when its root_count hits zero.
2466 */
2467 list_del(&sp->link);
2468
2469 /*
2470 * Obsolete pages cannot be used on any vCPUs, see the comment in
2471 * kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also treats invalid
2472 * shadow pages as being obsolete.
2473 */
2474 zapped_root = !is_obsolete_sp(kvm, sp);
2475 }
2476
2477 if (sp->lpage_disallowed)
2478 unaccount_huge_nx_page(kvm, sp);
2479
2480 sp->role.invalid = 1;
2481
2482 /*
2483 * Make the request to free obsolete roots after marking the root invalid,
2484 * otherwise other vCPUs may not yet see it as invalid.
2485 */
2486 if (zapped_root)
2487 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
2488 return list_unstable;
2489 }
2490
2491 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2492 struct list_head *invalid_list)
2493 {
2494 int nr_zapped;
2495
2496 __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2497 return nr_zapped;
2498 }
2499
2500 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2501 struct list_head *invalid_list)
2502 {
2503 struct kvm_mmu_page *sp, *nsp;
2504
2505 if (list_empty(invalid_list))
2506 return;
2507
2508 /*
2509 * We need to make sure everyone sees our modifications to the page tables
2510 * and sees changes to vcpu->mode here.  The barrier in
2511 * kvm_flush_remote_tlbs() achieves this; it pairs with vcpu_enter_guest()
2512 * and walk_shadow_page_lockless_begin/end().
2513 *
2514 * In addition, kvm_flush_remote_tlbs() waits for all vCPUs to exit guest
2515 * mode and/or finish lockless shadow page table walks.
2516 */
2517 kvm_flush_remote_tlbs(kvm);
2518
2519 list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2520 WARN_ON(!sp->role.invalid || sp->root_count);
2521 kvm_mmu_free_shadow_page(sp);
2522 }
2523 }
2524
2525 static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
2526 unsigned long nr_to_zap)
2527 {
2528 unsigned long total_zapped = 0;
2529 struct kvm_mmu_page *sp, *tmp;
2530 LIST_HEAD(invalid_list);
2531 bool unstable;
2532 int nr_zapped;
2533
2534 if (list_empty(&kvm->arch.active_mmu_pages))
2535 return 0;
2536
2537 restart:
2538 list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
2539 /*
2540 * Don't zap active root pages: the page itself can't be freed and zapping
2541 * it will just force vCPUs to realloc and reload the root.
2542 */
2543 if (sp->root_count)
2544 continue;
2545
2546 unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
2547 &nr_zapped);
2548 total_zapped += nr_zapped;
2549 if (total_zapped >= nr_to_zap)
2550 break;
2551
2552 if (unstable)
2553 goto restart;
2554 }
2555
2556 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2557
2558 kvm->stat.mmu_recycled += total_zapped;
2559 return total_zapped;
2560 }
2561
2562 static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
2563 {
2564 if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
2565 return kvm->arch.n_max_mmu_pages -
2566 kvm->arch.n_used_mmu_pages;
2567
2568 return 0;
2569 }
2570
2571 static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
2572 {
2573 unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
2574
2575 if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
2576 return 0;
2577
2578 kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
2579
2580 /*
2581 * Note, this check is intentionally soft, it only guarantees that one
2582 * page is available, while the caller may end up allocating as many as
2583 * four pages, e.g. for PAE roots or for 5-level paging.  Temporarily
2584 * exceeding the (arbitrary by default) limit will not harm the host,
2585 * being too aggressive may unnecessarily kill the guest, and getting an
2586 * exact count is far more trouble than it's worth, especially in the
2587 * page fault paths.
2588 */
2589 if (!kvm_mmu_available_pages(vcpu->kvm))
2590 return -ENOSPC;
2591 return 0;
2592 }
2593
2594 /*
2595 * Change the number of MMU pages allocated to the VM, zapping the oldest
2596 * shadow pages first if the new limit is below the current usage.
2597 */
2598 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2599 {
2600 write_lock(&kvm->mmu_lock);
2601
2602 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2603 kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
2604 goal_nr_mmu_pages);
2605
2606 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2607 }
2608
2609 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2610
2611 write_unlock(&kvm->mmu_lock);
2612 }
2613
2614 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2615 {
2616 struct kvm_mmu_page *sp;
2617 LIST_HEAD(invalid_list);
2618 int r;
2619
2620 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2621 r = 0;
2622 write_lock(&kvm->mmu_lock);
2623 for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
2624 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2625 sp->role.word);
2626 r = 1;
2627 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2628 }
2629 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2630 write_unlock(&kvm->mmu_lock);
2631
2632 return r;
2633 }
2634
2635 static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2636 {
2637 gpa_t gpa;
2638 int r;
2639
2640 if (vcpu->arch.mmu->root_role.direct)
2641 return 0;
2642
2643 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2644
2645 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2646
2647 return r;
2648 }
2649
2650 static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2651 {
2652 trace_kvm_mmu_unsync_page(sp);
2653 ++kvm->stat.mmu_unsync;
2654 sp->unsync = 1;
2655
2656 kvm_mmu_mark_parents_unsync(sp);
2657 }
2658
2659 /*
2660 * Attempt to unsync any shadow pages that map @gfn so that the caller may
2661 * install a writable SPTE.  Returns 0 on success, -EPERM if the gfn must
2662 * remain write-protected (it is write-tracked or unsyncing is disallowed),
2663 * and -EEXIST if this is a prefetch and a page would first need unsyncing.
2664 */
2665 int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
2666 gfn_t gfn, bool can_unsync, bool prefetch)
2667 {
2668 struct kvm_mmu_page *sp;
2669 bool locked = false;
2670
2671 /*
2672 * Force write-protection if the page is being write-tracked.  Note, the
2673 * page track machinery is also used to write-protect upper-level shadow
2674 * pages, i.e. this guards the role.level == 4K assertion below.
2675 */
2676 if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE))
2677 return -EPERM;
2678
2679 /*
2680 * The page is not write-tracked, mark existing shadow pages unsync
2681 * unless KVM is synchronizing an unsync SP (can_unsync = false).  In
2682 * that case, KVM must complete emulation of the guest TLB flush before
2683 * allowing shadow pages to become unsync (writable by the guest).
2684 */
2685 for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
2686 if (!can_unsync)
2687 return -EPERM;
2688
2689 if (sp->unsync)
2690 continue;
2691
2692 if (prefetch)
2693 return -EEXIST;
2694
2695 /*
2696 * TDP MMU page faults require an additional spinlock as they run with
2697 * mmu_lock held for read, not write, and the unsync logic is not thread
2698 * safe.  Take the spinlock regardless of the MMU type to avoid extra
2699 * conditionals/parameters; there's no meaningful penalty if mmu_lock is
2700 * held for write.
2701 */
2702 if (!locked) {
2703 locked = true;
2704 spin_lock(&kvm->arch.mmu_unsync_pages_lock);
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714 if (READ_ONCE(sp->unsync))
2715 continue;
2716 }
2717
2718 WARN_ON(sp->role.level != PG_LEVEL_4K);
2719 kvm_unsync_page(kvm, sp);
2720 }
2721 if (locked)
2722 spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761 smp_wmb();
2762
2763 return 0;
2764 }
2765
2766 static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
2767 u64 *sptep, unsigned int pte_access, gfn_t gfn,
2768 kvm_pfn_t pfn, struct kvm_page_fault *fault)
2769 {
2770 struct kvm_mmu_page *sp = sptep_to_sp(sptep);
2771 int level = sp->role.level;
2772 int was_rmapped = 0;
2773 int ret = RET_PF_FIXED;
2774 bool flush = false;
2775 bool wrprot;
2776 u64 spte;
2777
2778
2779 bool host_writable = !fault || fault->map_writable;
2780 bool prefetch = !fault || fault->prefetch;
2781 bool write_fault = fault && fault->write;
2782
2783 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
2784 *sptep, write_fault, gfn);
2785
2786 if (unlikely(is_noslot_pfn(pfn))) {
2787 vcpu->stat.pf_mmio_spte_created++;
2788 mark_mmio_spte(vcpu, sptep, gfn, pte_access);
2789 return RET_PF_EMULATE;
2790 }
2791
2792 if (is_shadow_present_pte(*sptep)) {
2793
2794
2795
2796
2797 if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
2798 struct kvm_mmu_page *child;
2799 u64 pte = *sptep;
2800
2801 child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
2802 drop_parent_pte(child, sptep);
2803 flush = true;
2804 } else if (pfn != spte_to_pfn(*sptep)) {
2805 pgprintk("hfn old %llx new %llx\n",
2806 spte_to_pfn(*sptep), pfn);
2807 drop_spte(vcpu->kvm, sptep);
2808 flush = true;
2809 } else
2810 was_rmapped = 1;
2811 }
2812
2813 wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
2814 true, host_writable, &spte);
2815
2816 if (*sptep == spte) {
2817 ret = RET_PF_SPURIOUS;
2818 } else {
2819 flush |= mmu_spte_update(sptep, spte);
2820 trace_kvm_mmu_set_spte(level, gfn, sptep);
2821 }
2822
2823 if (wrprot) {
2824 if (write_fault)
2825 ret = RET_PF_EMULATE;
2826 }
2827
2828 if (flush)
2829 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
2830 KVM_PAGES_PER_HPAGE(level));
2831
2832 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2833
2834 if (!was_rmapped) {
2835 WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
2836 rmap_add(vcpu, slot, sptep, gfn, pte_access);
2837 } else {
2838
2839 kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access);
2840 }
2841
2842 return ret;
2843 }
2844
2845 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2846 struct kvm_mmu_page *sp,
2847 u64 *start, u64 *end)
2848 {
2849 struct page *pages[PTE_PREFETCH_NUM];
2850 struct kvm_memory_slot *slot;
2851 unsigned int access = sp->role.access;
2852 int i, ret;
2853 gfn_t gfn;
2854
2855 gfn = kvm_mmu_page_get_gfn(sp, spte_index(start));
2856 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
2857 if (!slot)
2858 return -1;
2859
2860 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
2861 if (ret <= 0)
2862 return -1;
2863
2864 for (i = 0; i < ret; i++, gfn++, start++) {
2865 mmu_set_spte(vcpu, slot, start, access, gfn,
2866 page_to_pfn(pages[i]), NULL);
2867 put_page(pages[i]);
2868 }
2869
2870 return 0;
2871 }
2872
2873 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2874 struct kvm_mmu_page *sp, u64 *sptep)
2875 {
2876 u64 *spte, *start = NULL;
2877 int i;
2878
2879 WARN_ON(!sp->role.direct);
2880
2881 i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
2882 spte = sp->spt + i;
2883
2884 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2885 if (is_shadow_present_pte(*spte) || spte == sptep) {
2886 if (!start)
2887 continue;
2888 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
2889 return;
2890 start = NULL;
2891 } else if (!start)
2892 start = spte;
2893 }
2894 if (start)
2895 direct_pte_prefetch_many(vcpu, sp, start, spte);
2896 }
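/*
 * [Editor's sketch -- illustrative only, not part of mmu.c; compile stand-alone.]
 * __direct_pte_prefetch() scans the aligned group of PTE_PREFETCH_NUM (8)
 * SPTEs containing the faulting entry; "index & ~(8 - 1)" rounds the SPTE
 * index down to the start of its group.  sketch_prefetch_window() is a
 * hypothetical stand-in that shows just that rounding.
 */
#include <assert.h>

static unsigned int sketch_prefetch_window(unsigned int spte_index)
{
	return spte_index & ~(8u - 1);	/* PTE_PREFETCH_NUM == 8 assumed */
}

int main(void)
{
	assert(sketch_prefetch_window(0) == 0);
	assert(sketch_prefetch_window(7) == 0);
	assert(sketch_prefetch_window(13) == 8);
	assert(sketch_prefetch_window(511) == 504);
	return 0;
}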
2897
2898 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2899 {
2900 struct kvm_mmu_page *sp;
2901
2902 sp = sptep_to_sp(sptep);
2903
2904
2905
2906
2907
2908
2909 if (sp_ad_disabled(sp))
2910 return;
2911
2912 if (sp->role.level > PG_LEVEL_4K)
2913 return;
2914
2915
2916
2917
2918
2919 if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
2920 return;
2921
2922 __direct_pte_prefetch(vcpu, sp, sptep);
2923 }
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
2951 const struct kvm_memory_slot *slot)
2952 {
2953 int level = PG_LEVEL_4K;
2954 unsigned long hva;
2955 unsigned long flags;
2956 pgd_t pgd;
2957 p4d_t p4d;
2958 pud_t pud;
2959 pmd_t pmd;
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969 hva = __gfn_to_hva_memslot(slot, gfn);
2970
2971
2972
2973
2974
2975
2976 local_irq_save(flags);
2977
2978
2979
2980
2981
2982
2983
2984
2985 pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
2986 if (pgd_none(pgd))
2987 goto out;
2988
2989 p4d = READ_ONCE(*p4d_offset(&pgd, hva));
2990 if (p4d_none(p4d) || !p4d_present(p4d))
2991 goto out;
2992
2993 pud = READ_ONCE(*pud_offset(&p4d, hva));
2994 if (pud_none(pud) || !pud_present(pud))
2995 goto out;
2996
2997 if (pud_large(pud)) {
2998 level = PG_LEVEL_1G;
2999 goto out;
3000 }
3001
3002 pmd = READ_ONCE(*pmd_offset(&pud, hva));
3003 if (pmd_none(pmd) || !pmd_present(pmd))
3004 goto out;
3005
3006 if (pmd_large(pmd))
3007 level = PG_LEVEL_2M;
3008
3009 out:
3010 local_irq_restore(flags);
3011 return level;
3012 }
3013
3014 int kvm_mmu_max_mapping_level(struct kvm *kvm,
3015 const struct kvm_memory_slot *slot, gfn_t gfn,
3016 int max_level)
3017 {
3018 struct kvm_lpage_info *linfo;
3019 int host_level;
3020
3021 max_level = min(max_level, max_huge_page_level);
3022 for ( ; max_level > PG_LEVEL_4K; max_level--) {
3023 linfo = lpage_info_slot(gfn, slot, max_level);
3024 if (!linfo->disallow_lpage)
3025 break;
3026 }
3027
3028 if (max_level == PG_LEVEL_4K)
3029 return PG_LEVEL_4K;
3030
3031 host_level = host_pfn_mapping_level(kvm, gfn, slot);
3032 return min(host_level, max_level);
3033 }
3034
3035 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3036 {
3037 struct kvm_memory_slot *slot = fault->slot;
3038 kvm_pfn_t mask;
3039
3040 fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
3041
3042 if (unlikely(fault->max_level == PG_LEVEL_4K))
3043 return;
3044
3045 if (is_error_noslot_pfn(fault->pfn))
3046 return;
3047
3048 if (kvm_slot_dirty_track_enabled(slot))
3049 return;
3050
3051
3052
3053
3054
3055 fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
3056 fault->gfn, fault->max_level);
3057 if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
3058 return;
3059
3060
3061
3062
3063
3064 fault->goal_level = fault->req_level;
3065 mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
3066 VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
3067 fault->pfn &= ~mask;
3068 }
3069
3070 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
3071 {
3072 if (cur_level > PG_LEVEL_4K &&
3073 cur_level == fault->goal_level &&
3074 is_shadow_present_pte(spte) &&
3075 !is_large_pte(spte)) {
3076
3077
3078
3079
3080
3081
3082
3083 u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
3084 KVM_PAGES_PER_HPAGE(cur_level - 1);
3085 fault->pfn |= fault->gfn & page_mask;
3086 fault->goal_level--;
3087 }
3088 }
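/*
 * [Editor's sketch -- illustrative only, not part of mmu.c; compile stand-alone.]
 * Concrete numbers for the two pfn adjustments above, assuming the usual
 * KVM_PAGES_PER_HPAGE(level) == 1 << (9 * (level - 1)) definition (an
 * assumption here, that macro is not defined in this file):
 *   - kvm_mmu_hugepage_adjust() aligns the pfn down to the huge page boundary;
 *   - disallowed_hugepage_adjust() ORs the next lower level's gfn index bits
 *     back into the pfn when stepping 1G down to 2M, or 2M down to 4K.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t sketch_pages_per_hpage(int level)
{
	return 1ULL << (9 * (level - 1));	/* 1, 512, 262144, ... */
}

int main(void)
{
	const uint64_t gfn = 0x12345;
	uint64_t pfn = 0x712345;	/* shares the gfn's offset within the 1G frame */

	/* kvm_mmu_hugepage_adjust(), goal_level == 3 (1G): align the pfn. */
	pfn &= ~(sketch_pages_per_hpage(3) - 1);
	assert(pfn == 0x700000);

	/* disallowed_hugepage_adjust(), 1G -> 2M: restore gfn bits 17:9. */
	pfn |= gfn & (sketch_pages_per_hpage(3) - sketch_pages_per_hpage(2));
	assert(pfn == 0x712200);

	/* disallowed_hugepage_adjust(), 2M -> 4K: restore gfn bits 8:0. */
	pfn |= gfn & (sketch_pages_per_hpage(2) - sketch_pages_per_hpage(1));
	assert(pfn == 0x712345);
	return 0;
}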
3089
3090 static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3091 {
3092 struct kvm_shadow_walk_iterator it;
3093 struct kvm_mmu_page *sp;
3094 int ret;
3095 gfn_t base_gfn = fault->gfn;
3096
3097 kvm_mmu_hugepage_adjust(vcpu, fault);
3098
3099 trace_kvm_mmu_spte_requested(fault);
3100 for_each_shadow_entry(vcpu, fault->addr, it) {
3101
3102
3103
3104
3105 if (fault->nx_huge_page_workaround_enabled)
3106 disallowed_hugepage_adjust(fault, *it.sptep, it.level);
3107
3108 base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
3109 if (it.level == fault->goal_level)
3110 break;
3111
3112 sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
3113 if (sp == ERR_PTR(-EEXIST))
3114 continue;
3115
3116 link_shadow_page(vcpu, it.sptep, sp);
3117 if (fault->is_tdp && fault->huge_page_disallowed &&
3118 fault->req_level >= it.level)
3119 account_huge_nx_page(vcpu->kvm, sp);
3120 }
3121
3122 if (WARN_ON_ONCE(it.level != fault->goal_level))
3123 return -EFAULT;
3124
3125 ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
3126 base_gfn, fault->pfn, fault);
3127 if (ret == RET_PF_SPURIOUS)
3128 return ret;
3129
3130 direct_pte_prefetch(vcpu, it.sptep);
3131 return ret;
3132 }
3133
3134 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
3135 {
3136 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
3137 }
3138
3139 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
3140 {
3141 /*
3142 * Do not cache MMIO info for a fault on a read-only gfn: the write must be
3143 * emulated, but reads of the same gfn are not MMIO and must not be treated
3144 * as such.
3145 */
3146 if (pfn == KVM_PFN_ERR_RO_FAULT)
3147 return RET_PF_EMULATE;
3148
3149 if (pfn == KVM_PFN_ERR_HWPOISON) {
3150 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
3151 return RET_PF_RETRY;
3152 }
3153
3154 return -EFAULT;
3155 }
3156
3157 static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
3158 unsigned int access)
3159 {
3160
3161 if (unlikely(is_error_pfn(fault->pfn)))
3162 return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
3163
3164 if (unlikely(!fault->slot)) {
3165 gva_t gva = fault->is_tdp ? 0 : fault->addr;
3166
3167 vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
3168 access & shadow_mmio_access_mask);
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179 if (unlikely(!enable_mmio_caching) ||
3180 unlikely(fault->gfn > kvm_mmu_max_gfn()))
3181 return RET_PF_EMULATE;
3182 }
3183
3184 return RET_PF_CONTINUE;
3185 }
3186
3187 static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
3188 {
3189
3190
3191
3192
3193
3194
3195 if (fault->rsvd)
3196 return false;
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213 if (!fault->present)
3214 return !kvm_ad_enabled();
3215
3216
3217
3218
3219
3220 return fault->write;
3221 }
3222
3223
3224
3225
3226
3227 static bool
3228 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
3229 u64 *sptep, u64 old_spte, u64 new_spte)
3230 {
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243 if (!try_cmpxchg64(sptep, &old_spte, new_spte))
3244 return false;
3245
3246 if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
3247 mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
3248
3249 return true;
3250 }
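/*
 * [Editor's sketch -- illustrative only, not part of mmu.c; compile stand-alone.]
 * The core of the fast page-fault path above is a lockless compare-and-exchange:
 * the new SPTE is installed only if the SPTE still holds the value observed
 * earlier, otherwise the caller retries.  C11 atomics express the same pattern
 * as try_cmpxchg64(); this is a generic illustration, not the kernel primitive.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static bool sketch_fix_spte(_Atomic uint64_t *sptep, uint64_t old_spte,
			    uint64_t new_spte)
{
	/* Succeeds (and returns true) only if *sptep is still old_spte. */
	return atomic_compare_exchange_strong(sptep, &old_spte, new_spte);
}

int main(void)
{
	_Atomic uint64_t spte = 0x100;
	bool ok = sketch_fix_spte(&spte, 0x100, 0x102);	/* uncontended: succeeds */

	return (ok && atomic_load(&spte) == 0x102) ? 0 : 1;
}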
3251
3252 static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
3253 {
3254 if (fault->exec)
3255 return is_executable_pte(spte);
3256
3257 if (fault->write)
3258 return is_writable_pte(spte);
3259
3260
3261 return spte & PT_PRESENT_MASK;
3262 }
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273 static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
3274 {
3275 struct kvm_shadow_walk_iterator iterator;
3276 u64 old_spte;
3277 u64 *sptep = NULL;
3278
3279 for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
3280 sptep = iterator.sptep;
3281 *spte = old_spte;
3282 }
3283
3284 return sptep;
3285 }
3286
3287
3288
3289
3290 static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3291 {
3292 struct kvm_mmu_page *sp;
3293 int ret = RET_PF_INVALID;
3294 u64 spte = 0ull;
3295 u64 *sptep = NULL;
3296 uint retry_count = 0;
3297
3298 if (!page_fault_can_be_fast(fault))
3299 return ret;
3300
3301 walk_shadow_page_lockless_begin(vcpu);
3302
3303 do {
3304 u64 new_spte;
3305
3306 if (is_tdp_mmu(vcpu->arch.mmu))
3307 sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
3308 else
3309 sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
3310
3311 if (!is_shadow_present_pte(spte))
3312 break;
3313
3314 sp = sptep_to_sp(sptep);
3315 if (!is_last_spte(spte, sp->role.level))
3316 break;
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328 if (is_access_allowed(fault, spte)) {
3329 ret = RET_PF_SPURIOUS;
3330 break;
3331 }
3332
3333 new_spte = spte;
3334
3335
3336
3337
3338
3339
3340
3341 if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
3342 new_spte = restore_acc_track_spte(new_spte);
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355 if (fault->write && is_mmu_writable_spte(spte)) {
3356 new_spte |= PT_WRITABLE_MASK;
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368 if (sp->role.level > PG_LEVEL_4K &&
3369 kvm_slot_dirty_track_enabled(fault->slot))
3370 break;
3371 }
3372
3373
3374 if (new_spte == spte ||
3375 !is_access_allowed(fault, new_spte))
3376 break;
3377
3378
3379
3380
3381
3382
3383 if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
3384 ret = RET_PF_FIXED;
3385 break;
3386 }
3387
3388 if (++retry_count > 4) {
3389 printk_once(KERN_WARNING
3390 "kvm: Fast #PF retrying more than 4 times.\n");
3391 break;
3392 }
3393
3394 } while (true);
3395
3396 trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
3397 walk_shadow_page_lockless_end(vcpu);
3398
3399 if (ret != RET_PF_INVALID)
3400 vcpu->stat.pf_fast++;
3401
3402 return ret;
3403 }
3404
3405 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3406 struct list_head *invalid_list)
3407 {
3408 struct kvm_mmu_page *sp;
3409
3410 if (!VALID_PAGE(*root_hpa))
3411 return;
3412
3413 sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK);
3414 if (WARN_ON(!sp))
3415 return;
3416
3417 if (is_tdp_mmu_page(sp))
3418 kvm_tdp_mmu_put_root(kvm, sp, false);
3419 else if (!--sp->root_count && sp->role.invalid)
3420 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3421
3422 *root_hpa = INVALID_PAGE;
3423 }
3424
3425
3426 void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
3427 ulong roots_to_free)
3428 {
3429 int i;
3430 LIST_HEAD(invalid_list);
3431 bool free_active_root;
3432
3433 BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3434
3435
3436 free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
3437 && VALID_PAGE(mmu->root.hpa);
3438
3439 if (!free_active_root) {
3440 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3441 if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3442 VALID_PAGE(mmu->prev_roots[i].hpa))
3443 break;
3444
3445 if (i == KVM_MMU_NUM_PREV_ROOTS)
3446 return;
3447 }
3448
3449 write_lock(&kvm->mmu_lock);
3450
3451 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3452 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3453 mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
3454 &invalid_list);
3455
3456 if (free_active_root) {
3457 if (to_shadow_page(mmu->root.hpa)) {
3458 mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
3459 } else if (mmu->pae_root) {
3460 for (i = 0; i < 4; ++i) {
3461 if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
3462 continue;
3463
3464 mmu_free_root_page(kvm, &mmu->pae_root[i],
3465 &invalid_list);
3466 mmu->pae_root[i] = INVALID_PAE_ROOT;
3467 }
3468 }
3469 mmu->root.hpa = INVALID_PAGE;
3470 mmu->root.pgd = 0;
3471 }
3472
3473 kvm_mmu_commit_zap_page(kvm, &invalid_list);
3474 write_unlock(&kvm->mmu_lock);
3475 }
3476 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3477
3478 void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
3479 {
3480 unsigned long roots_to_free = 0;
3481 hpa_t root_hpa;
3482 int i;
3483
3484
3485
3486
3487
3488 WARN_ON_ONCE(mmu->root_role.guest_mode);
3489
3490 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
3491 root_hpa = mmu->prev_roots[i].hpa;
3492 if (!VALID_PAGE(root_hpa))
3493 continue;
3494
3495 if (!to_shadow_page(root_hpa) ||
3496 to_shadow_page(root_hpa)->role.guest_mode)
3497 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
3498 }
3499
3500 kvm_mmu_free_roots(kvm, mmu, roots_to_free);
3501 }
3502 EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
3503
3504
3505 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3506 {
3507 int ret = 0;
3508
3509 if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
3510 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3511 ret = 1;
3512 }
3513
3514 return ret;
3515 }
3516
3517 static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
3518 u8 level)
3519 {
3520 union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
3521 struct kvm_mmu_page *sp;
3522
3523 role.level = level;
3524 role.quadrant = quadrant;
3525
3526 WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte);
3527 WARN_ON_ONCE(role.direct && role.has_4_byte_gpte);
3528
3529 sp = kvm_mmu_get_shadow_page(vcpu, gfn, role);
3530 ++sp->root_count;
3531
3532 return __pa(sp->spt);
3533 }
3534
3535 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3536 {
3537 struct kvm_mmu *mmu = vcpu->arch.mmu;
3538 u8 shadow_root_level = mmu->root_role.level;
3539 hpa_t root;
3540 unsigned i;
3541 int r;
3542
3543 write_lock(&vcpu->kvm->mmu_lock);
3544 r = make_mmu_pages_available(vcpu);
3545 if (r < 0)
3546 goto out_unlock;
3547
3548 if (is_tdp_mmu_enabled(vcpu->kvm)) {
3549 root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
3550 mmu->root.hpa = root;
3551 } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
3552 root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
3553 mmu->root.hpa = root;
3554 } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
3555 if (WARN_ON_ONCE(!mmu->pae_root)) {
3556 r = -EIO;
3557 goto out_unlock;
3558 }
3559
3560 for (i = 0; i < 4; ++i) {
3561 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3562
3563 root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0,
3564 PT32_ROOT_LEVEL);
3565 mmu->pae_root[i] = root | PT_PRESENT_MASK |
3566 shadow_me_value;
3567 }
3568 mmu->root.hpa = __pa(mmu->pae_root);
3569 } else {
3570 WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
3571 r = -EIO;
3572 goto out_unlock;
3573 }
3574
3575
3576 mmu->root.pgd = 0;
3577 out_unlock:
3578 write_unlock(&vcpu->kvm->mmu_lock);
3579 return r;
3580 }
3581
3582 static int mmu_first_shadow_root_alloc(struct kvm *kvm)
3583 {
3584 struct kvm_memslots *slots;
3585 struct kvm_memory_slot *slot;
3586 int r = 0, i, bkt;
3587
3588
3589
3590
3591
3592 if (kvm_shadow_root_allocated(kvm))
3593 return 0;
3594
3595 mutex_lock(&kvm->slots_arch_lock);
3596
3597
3598 if (kvm_shadow_root_allocated(kvm))
3599 goto out_unlock;
3600
3601
3602
3603
3604
3605 if (kvm_memslots_have_rmaps(kvm) &&
3606 kvm_page_track_write_tracking_enabled(kvm))
3607 goto out_success;
3608
3609 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
3610 slots = __kvm_memslots(kvm, i);
3611 kvm_for_each_memslot(slot, bkt, slots) {
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622 r = memslot_rmap_alloc(slot, slot->npages);
3623 if (r)
3624 goto out_unlock;
3625 r = kvm_page_track_write_tracking_alloc(slot);
3626 if (r)
3627 goto out_unlock;
3628 }
3629 }
3630
3631
3632
3633
3634
3635 out_success:
3636 smp_store_release(&kvm->arch.shadow_root_allocated, true);
3637
3638 out_unlock:
3639 mutex_unlock(&kvm->slots_arch_lock);
3640 return r;
3641 }
3642
3643 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3644 {
3645 struct kvm_mmu *mmu = vcpu->arch.mmu;
3646 u64 pdptrs[4], pm_mask;
3647 gfn_t root_gfn, root_pgd;
3648 int quadrant, i, r;
3649 hpa_t root;
3650
3651 root_pgd = mmu->get_guest_pgd(vcpu);
3652 root_gfn = root_pgd >> PAGE_SHIFT;
3653
3654 if (mmu_check_root(vcpu, root_gfn))
3655 return 1;
3656
3657
3658
3659
3660
3661 if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
3662 for (i = 0; i < 4; ++i) {
3663 pdptrs[i] = mmu->get_pdptr(vcpu, i);
3664 if (!(pdptrs[i] & PT_PRESENT_MASK))
3665 continue;
3666
3667 if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
3668 return 1;
3669 }
3670 }
3671
3672 r = mmu_first_shadow_root_alloc(vcpu->kvm);
3673 if (r)
3674 return r;
3675
3676 write_lock(&vcpu->kvm->mmu_lock);
3677 r = make_mmu_pages_available(vcpu);
3678 if (r < 0)
3679 goto out_unlock;
3680
3681
3682
3683
3684
3685 if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
3686 root = mmu_alloc_root(vcpu, root_gfn, 0,
3687 mmu->root_role.level);
3688 mmu->root.hpa = root;
3689 goto set_root_pgd;
3690 }
3691
3692 if (WARN_ON_ONCE(!mmu->pae_root)) {
3693 r = -EIO;
3694 goto out_unlock;
3695 }
3696
3697
3698
3699
3700
3701
3702 pm_mask = PT_PRESENT_MASK | shadow_me_value;
3703 if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
3704 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3705
3706 if (WARN_ON_ONCE(!mmu->pml4_root)) {
3707 r = -EIO;
3708 goto out_unlock;
3709 }
3710 mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
3711
3712 if (mmu->root_role.level == PT64_ROOT_5LEVEL) {
3713 if (WARN_ON_ONCE(!mmu->pml5_root)) {
3714 r = -EIO;
3715 goto out_unlock;
3716 }
3717 mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
3718 }
3719 }
3720
3721 for (i = 0; i < 4; ++i) {
3722 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3723
3724 if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
3725 if (!(pdptrs[i] & PT_PRESENT_MASK)) {
3726 mmu->pae_root[i] = INVALID_PAE_ROOT;
3727 continue;
3728 }
3729 root_gfn = pdptrs[i] >> PAGE_SHIFT;
3730 }
3731
3732
3733
3734
3735
3736
3737
3738 quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0;
3739
3740 root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL);
3741 mmu->pae_root[i] = root | pm_mask;
3742 }
3743
3744 if (mmu->root_role.level == PT64_ROOT_5LEVEL)
3745 mmu->root.hpa = __pa(mmu->pml5_root);
3746 else if (mmu->root_role.level == PT64_ROOT_4LEVEL)
3747 mmu->root.hpa = __pa(mmu->pml4_root);
3748 else
3749 mmu->root.hpa = __pa(mmu->pae_root);
3750
3751 set_root_pgd:
3752 mmu->root.pgd = root_pgd;
3753 out_unlock:
3754 write_unlock(&vcpu->kvm->mmu_lock);
3755
3756 return r;
3757 }
3758
3759 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
3760 {
3761 struct kvm_mmu *mmu = vcpu->arch.mmu;
3762 bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
3763 u64 *pml5_root = NULL;
3764 u64 *pml4_root = NULL;
3765 u64 *pae_root;
3766
3767
3768
3769
3770
3771
3772
3773 if (mmu->root_role.direct ||
3774 mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
3775 mmu->root_role.level < PT64_ROOT_4LEVEL)
3776 return 0;
3777
3778
3779
3780
3781
3782
3783
3784
3785 if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
3786 return 0;
3787
3788
3789
3790
3791
3792 if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
3793 (need_pml5 && mmu->pml5_root)))
3794 return -EIO;
3795
3796
3797
3798
3799
3800 pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3801 if (!pae_root)
3802 return -ENOMEM;
3803
3804 #ifdef CONFIG_X86_64
3805 pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3806 if (!pml4_root)
3807 goto err_pml4;
3808
3809 if (need_pml5) {
3810 pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3811 if (!pml5_root)
3812 goto err_pml5;
3813 }
3814 #endif
3815
3816 mmu->pae_root = pae_root;
3817 mmu->pml4_root = pml4_root;
3818 mmu->pml5_root = pml5_root;
3819
3820 return 0;
3821
3822 #ifdef CONFIG_X86_64
3823 err_pml5:
3824 free_page((unsigned long)pml4_root);
3825 err_pml4:
3826 free_page((unsigned long)pae_root);
3827 return -ENOMEM;
3828 #endif
3829 }
3830
3831 static bool is_unsync_root(hpa_t root)
3832 {
3833 struct kvm_mmu_page *sp;
3834
3835 if (!VALID_PAGE(root))
3836 return false;
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850 smp_rmb();
3851 sp = to_shadow_page(root);
3852
3853
3854
3855
3856
3857 if (WARN_ON_ONCE(!sp))
3858 return false;
3859
3860 if (sp->unsync || sp->unsync_children)
3861 return true;
3862
3863 return false;
3864 }
3865
3866 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3867 {
3868 int i;
3869 struct kvm_mmu_page *sp;
3870
3871 if (vcpu->arch.mmu->root_role.direct)
3872 return;
3873
3874 if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
3875 return;
3876
3877 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3878
3879 if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
3880 hpa_t root = vcpu->arch.mmu->root.hpa;
3881 sp = to_shadow_page(root);
3882
3883 if (!is_unsync_root(root))
3884 return;
3885
3886 write_lock(&vcpu->kvm->mmu_lock);
3887 mmu_sync_children(vcpu, sp, true);
3888 write_unlock(&vcpu->kvm->mmu_lock);
3889 return;
3890 }
3891
3892 write_lock(&vcpu->kvm->mmu_lock);
3893
3894 for (i = 0; i < 4; ++i) {
3895 hpa_t root = vcpu->arch.mmu->pae_root[i];
3896
3897 if (IS_VALID_PAE_ROOT(root)) {
3898 root &= SPTE_BASE_ADDR_MASK;
3899 sp = to_shadow_page(root);
3900 mmu_sync_children(vcpu, sp, true);
3901 }
3902 }
3903
3904 write_unlock(&vcpu->kvm->mmu_lock);
3905 }
3906
3907 void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
3908 {
3909 unsigned long roots_to_free = 0;
3910 int i;
3911
3912 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3913 if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
3914 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
3915
3916
3917 kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
3918 }
3919
3920 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3921 gpa_t vaddr, u64 access,
3922 struct x86_exception *exception)
3923 {
3924 if (exception)
3925 exception->error_code = 0;
3926 return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
3927 }
3928
3929 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3930 {
3931
3932
3933
3934
3935 if (mmu_is_nested(vcpu))
3936 return false;
3937
3938 if (direct)
3939 return vcpu_match_mmio_gpa(vcpu, addr);
3940
3941 return vcpu_match_mmio_gva(vcpu, addr);
3942 }
3943
3944
3945
3946
3947
3948
3949
3950 static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
3951 {
3952 struct kvm_shadow_walk_iterator iterator;
3953 int leaf = -1;
3954 u64 spte;
3955
3956 for (shadow_walk_init(&iterator, vcpu, addr),
3957 *root_level = iterator.level;
3958 shadow_walk_okay(&iterator);
3959 __shadow_walk_next(&iterator, spte)) {
3960 leaf = iterator.level;
3961 spte = mmu_spte_get_lockless(iterator.sptep);
3962
3963 sptes[leaf] = spte;
3964 }
3965
3966 return leaf;
3967 }
3968
3969
3970 static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
3971 {
3972 u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
3973 struct rsvd_bits_validate *rsvd_check;
3974 int root, leaf, level;
3975 bool reserved = false;
3976
3977 walk_shadow_page_lockless_begin(vcpu);
3978
3979 if (is_tdp_mmu(vcpu->arch.mmu))
3980 leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
3981 else
3982 leaf = get_walk(vcpu, addr, sptes, &root);
3983
3984 walk_shadow_page_lockless_end(vcpu);
3985
3986 if (unlikely(leaf < 0)) {
3987 *sptep = 0ull;
3988 return reserved;
3989 }
3990
3991 *sptep = sptes[leaf];
3992
3993
3994
3995
3996
3997
3998
3999 if (!is_shadow_present_pte(sptes[leaf]))
4000 leaf++;
4001
4002 rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
4003
4004 for (level = root; level >= leaf; level--)
4005 reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
4006
4007 if (reserved) {
4008 pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
4009 __func__, addr);
4010 for (level = root; level >= leaf; level--)
4011 pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
4012 sptes[level], level,
4013 get_rsvd_bits(rsvd_check, sptes[level], level));
4014 }
4015
4016 return reserved;
4017 }
4018
4019 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4020 {
4021 u64 spte;
4022 bool reserved;
4023
4024 if (mmio_info_in_cache(vcpu, addr, direct))
4025 return RET_PF_EMULATE;
4026
4027 reserved = get_mmio_spte(vcpu, addr, &spte);
4028 if (WARN_ON(reserved))
4029 return -EINVAL;
4030
4031 if (is_mmio_spte(spte)) {
4032 gfn_t gfn = get_mmio_spte_gfn(spte);
4033 unsigned int access = get_mmio_spte_access(spte);
4034
4035 if (!check_mmio_spte(vcpu, spte))
4036 return RET_PF_INVALID;
4037
4038 if (direct)
4039 addr = 0;
4040
4041 trace_handle_mmio_page_fault(addr, gfn, access);
4042 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
4043 return RET_PF_EMULATE;
4044 }
4045
4046
4047
4048
4049
4050 return RET_PF_RETRY;
4051 }
4052
4053 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
4054 struct kvm_page_fault *fault)
4055 {
4056 if (unlikely(fault->rsvd))
4057 return false;
4058
4059 if (!fault->present || !fault->write)
4060 return false;
4061
4062
4063
4064
4065
4066 if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE))
4067 return true;
4068
4069 return false;
4070 }
4071
4072 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
4073 {
4074 struct kvm_shadow_walk_iterator iterator;
4075 u64 spte;
4076
4077 walk_shadow_page_lockless_begin(vcpu);
4078 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
4079 clear_sp_write_flooding_count(iterator.sptep);
4080 walk_shadow_page_lockless_end(vcpu);
4081 }
4082
4083 static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
4084 {
4085
4086 u32 id = vcpu->arch.apf.id;
4087
4088 if (id << 12 == 0)
4089 vcpu->arch.apf.id = 1;
4090
4091 return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4092 }
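/*
 * [Editor's sketch -- illustrative only, not part of mmu.c; compile stand-alone.]
 * Token layout produced above: bits [11:0] carry the vcpu_id and bits [31:12]
 * carry a per-vCPU counter; the counter skips values that would leave bits
 * [31:12] all zero.  sketch_alloc_apf_token() mirrors that logic on a
 * hypothetical struct.
 */
#include <assert.h>
#include <stdint.h>

struct sketch_vcpu {
	uint32_t apf_id;
	uint32_t vcpu_id;
};

static uint32_t sketch_alloc_apf_token(struct sketch_vcpu *v)
{
	/* Skip counter values whose upper 20 bits would shift out to zero. */
	if ((uint32_t)(v->apf_id << 12) == 0)
		v->apf_id = 1;

	return (v->apf_id++ << 12) | v->vcpu_id;
}

int main(void)
{
	struct sketch_vcpu v = { .apf_id = 0, .vcpu_id = 3 };

	assert(sketch_alloc_apf_token(&v) == ((1u << 12) | 3));	/* counter reset to 1 */
	assert(sketch_alloc_apf_token(&v) == ((2u << 12) | 3));
	return 0;
}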
4093
4094 static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
4095 gfn_t gfn)
4096 {
4097 struct kvm_arch_async_pf arch;
4098
4099 arch.token = alloc_apf_token(vcpu);
4100 arch.gfn = gfn;
4101 arch.direct_map = vcpu->arch.mmu->root_role.direct;
4102 arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
4103
4104 return kvm_setup_async_pf(vcpu, cr2_or_gpa,
4105 kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
4106 }
4107
4108 void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
4109 {
4110 int r;
4111
4112 if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
4113 work->wakeup_all)
4114 return;
4115
4116 r = kvm_mmu_reload(vcpu);
4117 if (unlikely(r))
4118 return;
4119
4120 if (!vcpu->arch.mmu->root_role.direct &&
4121 work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
4122 return;
4123
4124 kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
4125 }
4126
4127 static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4128 {
4129 struct kvm_memory_slot *slot = fault->slot;
4130 bool async;
4131
4132
4133
4134
4135
4136
4137 if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
4138 return RET_PF_RETRY;
4139
4140 if (!kvm_is_visible_memslot(slot)) {
4141
4142 if (is_guest_mode(vcpu)) {
4143 fault->slot = NULL;
4144 fault->pfn = KVM_PFN_NOSLOT;
4145 fault->map_writable = false;
4146 return RET_PF_CONTINUE;
4147 }
4148
4149
4150
4151
4152
4153
4154 if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
4155 !kvm_apicv_activated(vcpu->kvm))
4156 return RET_PF_EMULATE;
4157 }
4158
4159 async = false;
4160 fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
4161 fault->write, &fault->map_writable,
4162 &fault->hva);
4163 if (!async)
4164 return RET_PF_CONTINUE;
4165
4166 if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
4167 trace_kvm_try_async_get_page(fault->addr, fault->gfn);
4168 if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
4169 trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
4170 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4171 return RET_PF_RETRY;
4172 } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
4173 return RET_PF_RETRY;
4174 }
4175 }
4176
4177 fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
4178 fault->write, &fault->map_writable,
4179 &fault->hva);
4180 return RET_PF_CONTINUE;
4181 }
4182
4183 /*
4184 * Returns true if the page fault is stale and needs to be retried, i.e. if
4185 * the root was invalidated by a memslot update or a relevant mmu_notifier
4186 * fired.
4187 */
4187 static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
4188 struct kvm_page_fault *fault, int mmu_seq)
4189 {
4190 struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
4191
4192
4193 if (sp && is_obsolete_sp(vcpu->kvm, sp))
4194 return true;
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204 if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
4205 return true;
4206
4207 return fault->slot &&
4208 mmu_invalidate_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
4209 }
4210
4211 static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4212 {
4213 bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
4214
4215 unsigned long mmu_seq;
4216 int r;
4217
4218 fault->gfn = fault->addr >> PAGE_SHIFT;
4219 fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
4220
4221 if (page_fault_handle_page_track(vcpu, fault))
4222 return RET_PF_EMULATE;
4223
4224 r = fast_page_fault(vcpu, fault);
4225 if (r != RET_PF_INVALID)
4226 return r;
4227
4228 r = mmu_topup_memory_caches(vcpu, false);
4229 if (r)
4230 return r;
4231
4232 mmu_seq = vcpu->kvm->mmu_invalidate_seq;
4233 smp_rmb();
4234
4235 r = kvm_faultin_pfn(vcpu, fault);
4236 if (r != RET_PF_CONTINUE)
4237 return r;
4238
4239 r = handle_abnormal_pfn(vcpu, fault, ACC_ALL);
4240 if (r != RET_PF_CONTINUE)
4241 return r;
4242
4243 r = RET_PF_RETRY;
4244
4245 if (is_tdp_mmu_fault)
4246 read_lock(&vcpu->kvm->mmu_lock);
4247 else
4248 write_lock(&vcpu->kvm->mmu_lock);
4249
4250 if (is_page_fault_stale(vcpu, fault, mmu_seq))
4251 goto out_unlock;
4252
4253 r = make_mmu_pages_available(vcpu);
4254 if (r)
4255 goto out_unlock;
4256
4257 if (is_tdp_mmu_fault)
4258 r = kvm_tdp_mmu_map(vcpu, fault);
4259 else
4260 r = __direct_map(vcpu, fault);
4261
4262 out_unlock:
4263 if (is_tdp_mmu_fault)
4264 read_unlock(&vcpu->kvm->mmu_lock);
4265 else
4266 write_unlock(&vcpu->kvm->mmu_lock);
4267 kvm_release_pfn_clean(fault->pfn);
4268 return r;
4269 }
4270
4271 static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
4272 struct kvm_page_fault *fault)
4273 {
4274 pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code);
4275
4276
4277 fault->max_level = PG_LEVEL_2M;
4278 return direct_page_fault(vcpu, fault);
4279 }
4280
4281 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4282 u64 fault_address, char *insn, int insn_len)
4283 {
4284 int r = 1;
4285 u32 flags = vcpu->arch.apf.host_apf_flags;
4286
4287 #ifndef CONFIG_X86_64
4288
4289 if (WARN_ON_ONCE(fault_address >> 32))
4290 return -EFAULT;
4291 #endif
4292
4293 vcpu->arch.l1tf_flush_l1d = true;
4294 if (!flags) {
4295 trace_kvm_page_fault(fault_address, error_code);
4296
4297 if (kvm_event_needs_reinjection(vcpu))
4298 kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4299 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4300 insn_len);
4301 } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
4302 vcpu->arch.apf.host_apf_flags = 0;
4303 local_irq_disable();
4304 kvm_async_pf_task_wait_schedule(fault_address);
4305 local_irq_enable();
4306 } else {
4307 WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
4308 }
4309
4310 return r;
4311 }
4312 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4313
4314 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4315 {
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328 if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
4329 for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
4330 int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
4331 gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1);
4332
4333 if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
4334 break;
4335 }
4336 }
4337
4338 return direct_page_fault(vcpu, fault);
4339 }
4340
4341 static void nonpaging_init_context(struct kvm_mmu *context)
4342 {
4343 context->page_fault = nonpaging_page_fault;
4344 context->gva_to_gpa = nonpaging_gva_to_gpa;
4345 context->sync_page = nonpaging_sync_page;
4346 context->invlpg = NULL;
4347 }
4348
4349 static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
4350 union kvm_mmu_page_role role)
4351 {
4352 return (role.direct || pgd == root->pgd) &&
4353 VALID_PAGE(root->hpa) &&
4354 role.word == to_shadow_page(root->hpa)->role.word;
4355 }
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365 static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
4366 gpa_t new_pgd,
4367 union kvm_mmu_page_role new_role)
4368 {
4369 uint i;
4370
4371 if (is_root_usable(&mmu->root, new_pgd, new_role))
4372 return true;
4373
4374 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4375
4376
4377
4378
4379
4380
4381
4382
4383 swap(mmu->root, mmu->prev_roots[i]);
4384 if (is_root_usable(&mmu->root, new_pgd, new_role))
4385 return true;
4386 }
4387
4388 kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4389 return false;
4390 }
4391
4392
4393
4394
4395
4396
4397
4398
4399 static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
4400 gpa_t new_pgd,
4401 union kvm_mmu_page_role new_role)
4402 {
4403 uint i;
4404
4405 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
4406 if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
4407 goto hit;
4408
4409 return false;
4410
4411 hit:
4412 swap(mmu->root, mmu->prev_roots[i]);
4413
4414 for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
4415 mmu->prev_roots[i] = mmu->prev_roots[i + 1];
4416 mmu->prev_roots[i].hpa = INVALID_PAGE;
4417 return true;
4418 }
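/*
 * [Editor's sketch -- illustrative only, not part of mmu.c; compile stand-alone.]
 * Cache maintenance performed above: on a hit, the matching prev_roots entry
 * is swapped with the (currently unusable) root and the remaining entries
 * slide up, so the invalidated slot is always the last one.  The types and
 * the 3-entry cache size below are hypothetical stand-ins.
 */
#include <assert.h>
#include <stdbool.h>

#define SKETCH_NUM_PREV_ROOTS 3

struct sketch_root {
	unsigned long pgd;	/* 0 == invalid */
};

static bool sketch_find_without_current(struct sketch_root *cur,
					struct sketch_root prev[],
					unsigned long pgd)
{
	int i;

	for (i = 0; i < SKETCH_NUM_PREV_ROOTS; i++)
		if (prev[i].pgd == pgd)
			goto hit;
	return false;

hit:
	/* Promote the hit entry, then compact the tail of the cache. */
	{
		struct sketch_root tmp = *cur;

		*cur = prev[i];
		prev[i] = tmp;
	}
	for (; i < SKETCH_NUM_PREV_ROOTS - 1; i++)
		prev[i] = prev[i + 1];
	prev[i].pgd = 0;
	return true;
}

int main(void)
{
	struct sketch_root cur = { 0 };
	struct sketch_root prev[SKETCH_NUM_PREV_ROOTS] = { {10}, {20}, {30} };

	assert(sketch_find_without_current(&cur, prev, 20));
	assert(cur.pgd == 20);
	assert(prev[0].pgd == 10 && prev[1].pgd == 30 && prev[2].pgd == 0);
	return 0;
}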
4419
4420 static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
4421 gpa_t new_pgd, union kvm_mmu_page_role new_role)
4422 {
4423
4424
4425
4426
4427
4428 if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa))
4429 kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4430
4431 if (VALID_PAGE(mmu->root.hpa))
4432 return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
4433 else
4434 return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
4435 }
4436
4437 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
4438 {
4439 struct kvm_mmu *mmu = vcpu->arch.mmu;
4440 union kvm_mmu_page_role new_role = mmu->root_role;
4441
4442 if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) {
4443
4444 return;
4445 }
4446
4447
4448
4449
4450
4451
4452
4453 kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
4454
4455 if (force_flush_and_sync_on_reuse) {
4456 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4457 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
4458 }
4459
4460
4461
4462
4463
4464
4465
4466 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4467
4468
4469
4470
4471
4472 if (!new_role.direct)
4473 __clear_sp_write_flooding_count(
4474 to_shadow_page(vcpu->arch.mmu->root.hpa));
4475 }
4476 EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
4477
4478 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4479 {
4480 return kvm_read_cr3(vcpu);
4481 }
4482
4483 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4484 unsigned int access)
4485 {
4486 if (unlikely(is_mmio_spte(*sptep))) {
4487 if (gfn != get_mmio_spte_gfn(*sptep)) {
4488 mmu_spte_clear_no_track(sptep);
4489 return true;
4490 }
4491
4492 mark_mmio_spte(vcpu, sptep, gfn, access);
4493 return true;
4494 }
4495
4496 return false;
4497 }
4498
4499 #define PTTYPE_EPT 18
4500 #define PTTYPE PTTYPE_EPT
4501 #include "paging_tmpl.h"
4502 #undef PTTYPE
4503
4504 #define PTTYPE 64
4505 #include "paging_tmpl.h"
4506 #undef PTTYPE
4507
4508 #define PTTYPE 32
4509 #include "paging_tmpl.h"
4510 #undef PTTYPE
4511
4512 static void
4513 __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
4514 u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
4515 bool pse, bool amd)
4516 {
4517 u64 gbpages_bit_rsvd = 0;
4518 u64 nonleaf_bit8_rsvd = 0;
4519 u64 high_bits_rsvd;
4520
4521 rsvd_check->bad_mt_xwr = 0;
4522
4523 if (!gbpages)
4524 gbpages_bit_rsvd = rsvd_bits(7, 7);
4525
4526 if (level == PT32E_ROOT_LEVEL)
4527 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
4528 else
4529 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4530
4531
4532 if (!nx)
4533 high_bits_rsvd |= rsvd_bits(63, 63);
4534
4535
4536
4537
4538
4539 if (amd)
4540 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4541
4542 switch (level) {
4543 case PT32_ROOT_LEVEL:
4544
4545 rsvd_check->rsvd_bits_mask[0][1] = 0;
4546 rsvd_check->rsvd_bits_mask[0][0] = 0;
4547 rsvd_check->rsvd_bits_mask[1][0] =
4548 rsvd_check->rsvd_bits_mask[0][0];
4549
4550 if (!pse) {
4551 rsvd_check->rsvd_bits_mask[1][1] = 0;
4552 break;
4553 }
4554
4555 if (is_cpuid_PSE36())
4556
4557 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4558 else
4559
4560 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4561 break;
4562 case PT32E_ROOT_LEVEL:
4563 rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
4564 high_bits_rsvd |
4565 rsvd_bits(5, 8) |
4566 rsvd_bits(1, 2);
4567 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
4568 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4569 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4570 rsvd_bits(13, 20);
4571 rsvd_check->rsvd_bits_mask[1][0] =
4572 rsvd_check->rsvd_bits_mask[0][0];
4573 break;
4574 case PT64_ROOT_5LEVEL:
4575 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
4576 nonleaf_bit8_rsvd |
4577 rsvd_bits(7, 7);
4578 rsvd_check->rsvd_bits_mask[1][4] =
4579 rsvd_check->rsvd_bits_mask[0][4];
4580 fallthrough;
4581 case PT64_ROOT_4LEVEL:
4582 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
4583 nonleaf_bit8_rsvd |
4584 rsvd_bits(7, 7);
4585 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
4586 gbpages_bit_rsvd;
4587 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
4588 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4589 rsvd_check->rsvd_bits_mask[1][3] =
4590 rsvd_check->rsvd_bits_mask[0][3];
4591 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
4592 gbpages_bit_rsvd |
4593 rsvd_bits(13, 29);
4594 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4595 rsvd_bits(13, 20);
4596 rsvd_check->rsvd_bits_mask[1][0] =
4597 rsvd_check->rsvd_bits_mask[0][0];
4598 break;
4599 }
4600 }
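/*
 * [Editor's sketch -- illustrative only, not part of mmu.c; compile stand-alone.]
 * rsvd_bits(s, e), used throughout the function above, is defined elsewhere in
 * KVM's MMU headers; assuming s <= e, the arithmetic it performs is simply a
 * contiguous mask covering bits s..e inclusive, reproduced here for reference.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t sketch_rsvd_bits(int s, int e)
{
	return ((2ULL << (e - s)) - 1) << s;
}

int main(void)
{
	assert(sketch_rsvd_bits(7, 7) == 0x80);		/* single bit 7               */
	assert(sketch_rsvd_bits(13, 20) == 0x1fe000);	/* as used for 2M large pages */
	assert(sketch_rsvd_bits(63, 63) == 0x8000000000000000ULL);
	return 0;
}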
4601
4602 static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
4603 {
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613 return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
4614 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
4615 }
4616
4617 static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4618 struct kvm_mmu *context)
4619 {
4620 __reset_rsvds_bits_mask(&context->guest_rsvd_check,
4621 vcpu->arch.reserved_gpa_bits,
4622 context->cpu_role.base.level, is_efer_nx(context),
4623 guest_can_use_gbpages(vcpu),
4624 is_cr4_pse(context),
4625 guest_cpuid_is_amd_or_hygon(vcpu));
4626 }
4627
4628 static void
4629 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4630 u64 pa_bits_rsvd, bool execonly, int huge_page_level)
4631 {
4632 u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4633 u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
4634 u64 bad_mt_xwr;
4635
4636 if (huge_page_level < PG_LEVEL_1G)
4637 large_1g_rsvd = rsvd_bits(7, 7);
4638 if (huge_page_level < PG_LEVEL_2M)
4639 large_2m_rsvd = rsvd_bits(7, 7);
4640
4641 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
4642 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
4643 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
4644 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
4645 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4646
4647
4648 rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4649 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4650 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
4651 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
4652 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4653
4654 bad_mt_xwr = 0xFFull << (2 * 8);
4655 bad_mt_xwr |= 0xFFull << (3 * 8);
4656 bad_mt_xwr |= 0xFFull << (7 * 8);
4657 bad_mt_xwr |= REPEAT_BYTE(1ull << 2);
4658 bad_mt_xwr |= REPEAT_BYTE(1ull << 6);
4659 if (!execonly) {
4660
4661 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4662 }
4663 rsvd_check->bad_mt_xwr = bad_mt_xwr;
4664 }
4665
4666 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4667 struct kvm_mmu *context, bool execonly, int huge_page_level)
4668 {
4669 __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4670 vcpu->arch.reserved_gpa_bits, execonly,
4671 huge_page_level);
4672 }
4673
4674 static inline u64 reserved_hpa_bits(void)
4675 {
4676 return rsvd_bits(shadow_phys_bits, 63);
4677 }
4678
4679
4680
4681
4682
4683
4684 static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4685 struct kvm_mmu *context)
4686 {
4687
4688 bool is_amd = true;
4689
4690 bool is_pse = false;
4691 struct rsvd_bits_validate *shadow_zero_check;
4692 int i;
4693
4694 WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
4695
4696 shadow_zero_check = &context->shadow_zero_check;
4697 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4698 context->root_role.level,
4699 context->root_role.efer_nx,
4700 guest_can_use_gbpages(vcpu), is_pse, is_amd);
4701
4702 if (!shadow_me_mask)
4703 return;
4704
4705 for (i = context->root_role.level; --i >= 0;) {
4706
4707
4708
4709
4710
4711
4712 shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask;
4713 shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask;
4714 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value;
4715 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value;
4716 }
4717
4718 }
4719
4720 static inline bool boot_cpu_is_amd(void)
4721 {
4722 WARN_ON_ONCE(!tdp_enabled);
4723 return shadow_x_mask == 0;
4724 }
4725
4726
4727
4728
4729
4730 static void
4731 reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
4732 {
4733 struct rsvd_bits_validate *shadow_zero_check;
4734 int i;
4735
4736 shadow_zero_check = &context->shadow_zero_check;
4737
4738 if (boot_cpu_is_amd())
4739 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4740 context->root_role.level, true,
4741 boot_cpu_has(X86_FEATURE_GBPAGES),
4742 false, true);
4743 else
4744 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4745 reserved_hpa_bits(), false,
4746 max_huge_page_level);
4747
4748 if (!shadow_me_mask)
4749 return;
4750
4751 for (i = context->root_role.level; --i >= 0;) {
4752 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4753 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4754 }
4755 }
4756
4757
4758
4759
4760
4761 static void
4762 reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
4763 {
4764 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4765 reserved_hpa_bits(), execonly,
4766 max_huge_page_level);
4767 }
4768
4769 #define BYTE_MASK(access) \
4770 ((1 & (access) ? 2 : 0) | \
4771 (2 & (access) ? 4 : 0) | \
4772 (3 & (access) ? 8 : 0) | \
4773 (4 & (access) ? 16 : 0) | \
4774 (5 & (access) ? 32 : 0) | \
4775 (6 & (access) ? 64 : 0) | \
4776 (7 & (access) ? 128 : 0))
4777
4778
4779 static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
4780 {
4781 unsigned byte;
4782
4783 const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4784 const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4785 const u8 u = BYTE_MASK(ACC_USER_MASK);
4786
4787 bool cr4_smep = is_cr4_smep(mmu);
4788 bool cr4_smap = is_cr4_smap(mmu);
4789 bool cr0_wp = is_cr0_wp(mmu);
4790 bool efer_nx = is_efer_nx(mmu);
4791
4792 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4793 unsigned pfec = byte << 1;
4794
4795
4796
4797
4798
4799
4800
4801 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
4802
4803 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
4804
4805 u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
4806
4807 u8 smepf = 0;
4808
4809 u8 smapf = 0;
4810
4811 if (!ept) {
4812
4813 u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4814
4815
4816 if (!efer_nx)
4817 ff = 0;
4818
4819
4820 if (!cr0_wp)
4821 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4822
4823
4824 if (cr4_smep)
4825 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843 if (cr4_smap)
4844 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4845 }
4846
4847 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
4848 }
4849 }
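/*
 * [Editor's sketch -- illustrative only, not part of mmu.c; compile stand-alone.]
 * BYTE_MASK(access) builds a byte whose bit i (for each 3-bit combination i of
 * ACC_EXEC/ACC_WRITE/ACC_USER, i.e. 1/2/4) is set iff combination i contains
 * 'access'.  update_permission_bitmask() then stores, per error-code index, a
 * byte whose bit i is set iff an access with that error code faults when the
 * walked PTEs grant exactly the rights i.  The consumer (permission_fault(),
 * defined in KVM's MMU header and not shown here) roughly evaluates
 * "(permissions[pfec >> 1] >> pte_access) & 1".  The asserts below only check
 * the BYTE_MASK values themselves.
 */
#include <assert.h>

#define SKETCH_BYTE_MASK(access) \
	((1 & (access) ? 2 : 0) | \
	 (2 & (access) ? 4 : 0) | \
	 (3 & (access) ? 8 : 0) | \
	 (4 & (access) ? 16 : 0) | \
	 (5 & (access) ? 32 : 0) | \
	 (6 & (access) ? 64 : 0) | \
	 (7 & (access) ? 128 : 0))

int main(void)
{
	assert(SKETCH_BYTE_MASK(1) == 0xaa);	/* combinations containing ACC_EXEC  */
	assert(SKETCH_BYTE_MASK(2) == 0xcc);	/* combinations containing ACC_WRITE */
	assert(SKETCH_BYTE_MASK(4) == 0xf0);	/* combinations containing ACC_USER  */
	return 0;
}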
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875 static void update_pkru_bitmask(struct kvm_mmu *mmu)
4876 {
4877 unsigned bit;
4878 bool wp;
4879
4880 mmu->pkru_mask = 0;
4881
4882 if (!is_cr4_pke(mmu))
4883 return;
4884
4885 wp = is_cr0_wp(mmu);
4886
4887 for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4888 unsigned pfec, pkey_bits;
4889 bool check_pkey, check_write, ff, uf, wf, pte_user;
4890
4891 pfec = bit << 1;
4892 ff = pfec & PFERR_FETCH_MASK;
4893 uf = pfec & PFERR_USER_MASK;
4894 wf = pfec & PFERR_WRITE_MASK;
4895
4896
4897 pte_user = pfec & PFERR_RSVD_MASK;
4898
4899
4900
4901
4902
4903 check_pkey = (!ff && pte_user);
4904
4905
4906
4907
4908 check_write = check_pkey && wf && (uf || wp);
4909
4910
4911 pkey_bits = !!check_pkey;
4912
4913 pkey_bits |= (!!check_write) << 1;
4914
4915 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
4916 }
4917 }
4918
4919 static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
4920 struct kvm_mmu *mmu)
4921 {
4922 if (!is_cr0_pg(mmu))
4923 return;
4924
4925 reset_guest_rsvds_bits_mask(vcpu, mmu);
4926 update_permission_bitmask(mmu, false);
4927 update_pkru_bitmask(mmu);
4928 }
4929
4930 static void paging64_init_context(struct kvm_mmu *context)
4931 {
4932 context->page_fault = paging64_page_fault;
4933 context->gva_to_gpa = paging64_gva_to_gpa;
4934 context->sync_page = paging64_sync_page;
4935 context->invlpg = paging64_invlpg;
4936 }
4937
4938 static void paging32_init_context(struct kvm_mmu *context)
4939 {
4940 context->page_fault = paging32_page_fault;
4941 context->gva_to_gpa = paging32_gva_to_gpa;
4942 context->sync_page = paging32_sync_page;
4943 context->invlpg = paging32_invlpg;
4944 }
4945
4946 static union kvm_cpu_role
4947 kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
4948 {
4949 union kvm_cpu_role role = {0};
4950
4951 role.base.access = ACC_ALL;
4952 role.base.smm = is_smm(vcpu);
4953 role.base.guest_mode = is_guest_mode(vcpu);
4954 role.ext.valid = 1;
4955
4956 if (!____is_cr0_pg(regs)) {
4957 role.base.direct = 1;
4958 return role;
4959 }
4960
4961 role.base.efer_nx = ____is_efer_nx(regs);
4962 role.base.cr0_wp = ____is_cr0_wp(regs);
4963 role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
4964 role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
4965 role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
4966
4967 if (____is_efer_lma(regs))
4968 role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL
4969 : PT64_ROOT_4LEVEL;
4970 else if (____is_cr4_pae(regs))
4971 role.base.level = PT32E_ROOT_LEVEL;
4972 else
4973 role.base.level = PT32_ROOT_LEVEL;
4974
4975 role.ext.cr4_smep = ____is_cr4_smep(regs);
4976 role.ext.cr4_smap = ____is_cr4_smap(regs);
4977 role.ext.cr4_pse = ____is_cr4_pse(regs);
4978
4979
4980 role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
4981 role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
4982 role.ext.efer_lma = ____is_efer_lma(regs);
4983 return role;
4984 }
4985
4986 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
4987 {
4988
4989 if (tdp_root_level)
4990 return tdp_root_level;
4991
4992
4993 if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
4994 return 4;
4995
4996 return max_tdp_level;
4997 }
4998
4999 static union kvm_mmu_page_role
5000 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
5001 union kvm_cpu_role cpu_role)
5002 {
5003 union kvm_mmu_page_role role = {0};
5004
5005 role.access = ACC_ALL;
5006 role.cr0_wp = true;
5007 role.efer_nx = true;
5008 role.smm = cpu_role.base.smm;
5009 role.guest_mode = cpu_role.base.guest_mode;
5010 role.ad_disabled = !kvm_ad_enabled();
5011 role.level = kvm_mmu_get_tdp_level(vcpu);
5012 role.direct = true;
5013 role.has_4_byte_gpte = false;
5014
5015 return role;
5016 }
5017
5018 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
5019 union kvm_cpu_role cpu_role)
5020 {
5021 struct kvm_mmu *context = &vcpu->arch.root_mmu;
5022 union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
5023
5024 if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
5025 root_role.word == context->root_role.word)
5026 return;
5027
5028 context->cpu_role.as_u64 = cpu_role.as_u64;
5029 context->root_role.word = root_role.word;
5030 context->page_fault = kvm_tdp_page_fault;
5031 context->sync_page = nonpaging_sync_page;
5032 context->invlpg = NULL;
5033 context->get_guest_pgd = get_cr3;
5034 context->get_pdptr = kvm_pdptr_read;
5035 context->inject_page_fault = kvm_inject_page_fault;
5036
5037 if (!is_cr0_pg(context))
5038 context->gva_to_gpa = nonpaging_gva_to_gpa;
5039 else if (is_cr4_pae(context))
5040 context->gva_to_gpa = paging64_gva_to_gpa;
5041 else
5042 context->gva_to_gpa = paging32_gva_to_gpa;
5043
5044 reset_guest_paging_metadata(vcpu, context);
5045 reset_tdp_shadow_zero_bits_mask(context);
5046 }
5047
5048 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
5049 union kvm_cpu_role cpu_role,
5050 union kvm_mmu_page_role root_role)
5051 {
5052 if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
5053 root_role.word == context->root_role.word)
5054 return;
5055
5056 context->cpu_role.as_u64 = cpu_role.as_u64;
5057 context->root_role.word = root_role.word;
5058
5059 if (!is_cr0_pg(context))
5060 nonpaging_init_context(context);
5061 else if (is_cr4_pae(context))
5062 paging64_init_context(context);
5063 else
5064 paging32_init_context(context);
5065
5066 reset_guest_paging_metadata(vcpu, context);
5067 reset_shadow_zero_bits_mask(vcpu, context);
5068 }
5069
5070 static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
5071 union kvm_cpu_role cpu_role)
5072 {
5073 struct kvm_mmu *context = &vcpu->arch.root_mmu;
5074 union kvm_mmu_page_role root_role;
5075
5076 root_role = cpu_role.base;
5077
5078
5079 root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
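/*
 * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
 * NX is needed by the shadow MMU, e.g. for the iTLB multi-hit mitigation
 * and to generate correct permissions for CR0.WP=0/CR4.SMEP=1 faults, and
 * the mitigation can be toggled at any time, so assume NX unconditionally
 * to avoid having to reset MMU contexts.
 */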
5090 root_role.efer_nx = true;
5091
5092 shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
5093 }
5094
5095 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
5096 unsigned long cr4, u64 efer, gpa_t nested_cr3)
5097 {
5098 struct kvm_mmu *context = &vcpu->arch.guest_mmu;
5099 struct kvm_mmu_role_regs regs = {
5100 .cr0 = cr0,
5101 .cr4 = cr4 & ~X86_CR4_PKE,
5102 .efer = efer,
5103 };
5104 union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
5105 union kvm_mmu_page_role root_role;
5106
5107
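/* NPT requires CR0.PG=1. */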
5108 WARN_ON_ONCE(cpu_role.base.direct);
5109
5110 root_role = cpu_role.base;
5111 root_role.level = kvm_mmu_get_tdp_level(vcpu);
5112 if (root_role.level == PT64_ROOT_5LEVEL &&
5113 cpu_role.base.level == PT64_ROOT_4LEVEL)
5114 root_role.passthrough = 1;
5115
5116 shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
5117 kvm_mmu_new_pgd(vcpu, nested_cr3);
5118 }
5119 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
5120
5121 static union kvm_cpu_role
5122 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
5123 bool execonly, u8 level)
5124 {
5125 union kvm_cpu_role role = {0};
5126
5127
5128
5129
5130
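/*
 * KVM does not support SMM transfer monitors, and consequently does not
 * support the "entry to SMM" control either.  role.base.smm is always 0.
 */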
5131 WARN_ON_ONCE(is_smm(vcpu));
5132 role.base.level = level;
5133 role.base.has_4_byte_gpte = false;
5134 role.base.direct = false;
5135 role.base.ad_disabled = !accessed_dirty;
5136 role.base.guest_mode = true;
5137 role.base.access = ACC_ALL;
5138
5139 role.ext.word = 0;
5140 role.ext.execonly = execonly;
5141 role.ext.valid = 1;
5142
5143 return role;
5144 }
5145
5146 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
5147 int huge_page_level, bool accessed_dirty,
5148 gpa_t new_eptp)
5149 {
5150 struct kvm_mmu *context = &vcpu->arch.guest_mmu;
5151 u8 level = vmx_eptp_page_walk_level(new_eptp);
5152 union kvm_cpu_role new_mode =
5153 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
5154 execonly, level);
5155
5156 if (new_mode.as_u64 != context->cpu_role.as_u64) {
5157
5158 context->cpu_role.as_u64 = new_mode.as_u64;
5159 context->root_role.word = new_mode.base.word;
5160
5161 context->page_fault = ept_page_fault;
5162 context->gva_to_gpa = ept_gva_to_gpa;
5163 context->sync_page = ept_sync_page;
5164 context->invlpg = ept_invlpg;
5165
5166 update_permission_bitmask(context, true);
5167 context->pkru_mask = 0;
5168 reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
5169 reset_ept_shadow_zero_bits_mask(context, execonly);
5170 }
5171
5172 kvm_mmu_new_pgd(vcpu, new_eptp);
5173 }
5174 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
5175
5176 static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
5177 union kvm_cpu_role cpu_role)
5178 {
5179 struct kvm_mmu *context = &vcpu->arch.root_mmu;
5180
5181 kvm_init_shadow_mmu(vcpu, cpu_role);
5182
5183 context->get_guest_pgd = get_cr3;
5184 context->get_pdptr = kvm_pdptr_read;
5185 context->inject_page_fault = kvm_inject_page_fault;
5186 }
5187
5188 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
5189 union kvm_cpu_role new_mode)
5190 {
5191 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5192
5193 if (new_mode.as_u64 == g_context->cpu_role.as_u64)
5194 return;
5195
5196 g_context->cpu_role.as_u64 = new_mode.as_u64;
5197 g_context->get_guest_pgd = get_cr3;
5198 g_context->get_pdptr = kvm_pdptr_read;
5199 g_context->inject_page_fault = kvm_inject_page_fault;
5200
5201
5202
5203
5204
5205 g_context->invlpg = NULL;
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215 if (!is_paging(vcpu))
5216 g_context->gva_to_gpa = nonpaging_gva_to_gpa;
5217 else if (is_long_mode(vcpu))
5218 g_context->gva_to_gpa = paging64_gva_to_gpa;
5219 else if (is_pae(vcpu))
5220 g_context->gva_to_gpa = paging64_gva_to_gpa;
5221 else
5222 g_context->gva_to_gpa = paging32_gva_to_gpa;
5223
5224 reset_guest_paging_metadata(vcpu, g_context);
5225 }
5226
5227 void kvm_init_mmu(struct kvm_vcpu *vcpu)
5228 {
5229 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
5230 union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
5231
5232 if (mmu_is_nested(vcpu))
5233 init_kvm_nested_mmu(vcpu, cpu_role);
5234 else if (tdp_enabled)
5235 init_kvm_tdp_mmu(vcpu, cpu_role);
5236 else
5237 init_kvm_softmmu(vcpu, cpu_role);
5238 }
5239 EXPORT_SYMBOL_GPL(kvm_init_mmu);
5240
5241 void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
5242 {
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
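/*
 * Invalidate all MMU roles and roots to force them to reinitialize, as
 * CPUID (e.g. MAXPHYADDR) is factored into the reserved-bit and role
 * calculations.  Zeroing root_role.word and clearing ext.valid guarantees
 * the new role will not compare equal to the stale one.
 */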
5255 vcpu->arch.root_mmu.root_role.word = 0;
5256 vcpu->arch.guest_mmu.root_role.word = 0;
5257 vcpu->arch.nested_mmu.root_role.word = 0;
5258 vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
5259 vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
5260 vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
5261 kvm_mmu_reset_context(vcpu);
5262
5263
5264
5265
5266
5267 KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
5268 }
5269
5270 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5271 {
5272 kvm_mmu_unload(vcpu);
5273 kvm_init_mmu(vcpu);
5274 }
5275 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5276
5277 int kvm_mmu_load(struct kvm_vcpu *vcpu)
5278 {
5279 int r;
5280
5281 r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
5282 if (r)
5283 goto out;
5284 r = mmu_alloc_special_roots(vcpu);
5285 if (r)
5286 goto out;
5287 if (vcpu->arch.mmu->root_role.direct)
5288 r = mmu_alloc_direct_roots(vcpu);
5289 else
5290 r = mmu_alloc_shadow_roots(vcpu);
5291 if (r)
5292 goto out;
5293
5294 kvm_mmu_sync_roots(vcpu);
5295
5296 kvm_mmu_load_pgd(vcpu);
5297
5298
5299
5300
5301
5302
5303
5304
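/*
 * Flush any TLB entries for the new root; the provenance of the root is
 * unknown, and another hypervisor may have left stale entries.  Flushing
 * on load also lets KVM skip the flush when freeing a root.
 */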
5305 static_call(kvm_x86_flush_tlb_current)(vcpu);
5306 out:
5307 return r;
5308 }
5309
5310 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5311 {
5312 struct kvm *kvm = vcpu->kvm;
5313
5314 kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5315 WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
5316 kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5317 WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
5318 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
5319 }
5320
5321 static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
5322 {
5323 struct kvm_mmu_page *sp;
5324
5325 if (!VALID_PAGE(root_hpa))
5326 return false;
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339 sp = to_shadow_page(root_hpa);
5340 return !sp || is_obsolete_sp(kvm, sp);
5341 }
5342
5343 static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
5344 {
5345 unsigned long roots_to_free = 0;
5346 int i;
5347
5348 if (is_obsolete_root(kvm, mmu->root.hpa))
5349 roots_to_free |= KVM_MMU_ROOT_CURRENT;
5350
5351 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5352 if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
5353 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5354 }
5355
5356 if (roots_to_free)
5357 kvm_mmu_free_roots(kvm, mmu, roots_to_free);
5358 }
5359
5360 void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
5361 {
5362 __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
5363 __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
5364 }
5365
5366 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5367 int *bytes)
5368 {
5369 u64 gentry = 0;
5370 int r;
5371
5372
5373
5374
5375
5376
5377 if (is_pae(vcpu) && *bytes == 4) {
5378
5379 *gpa &= ~(gpa_t)7;
5380 *bytes = 8;
5381 }
5382
5383 if (*bytes == 4 || *bytes == 8) {
5384 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5385 if (r)
5386 gentry = 0;
5387 }
5388
5389 return gentry;
5390 }
5391
5392
5393
5394
5395
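/*
 * If the guest writes to a page repeatedly, it is probably no longer
 * being used as a page table (or the guest is forking), in which case it
 * is cheaper to unmap the shadow page than to keep emulating the writes.
 */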
5396 static bool detect_write_flooding(struct kvm_mmu_page *sp)
5397 {
5398
5399
5400
5401
5402 if (sp->role.level == PG_LEVEL_4K)
5403 return false;
5404
5405 atomic_inc(&sp->write_flooding_count);
5406 return atomic_read(&sp->write_flooding_count) >= 3;
5407 }
5408
5409
5410
5411
5412
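/*
 * Misaligned accesses that straddle a guest PTE are too much trouble to
 * emulate and usually indicate the page is not being used as a page table.
 */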
5413 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5414 int bytes)
5415 {
5416 unsigned offset, pte_size, misaligned;
5417
5418 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5419 gpa, bytes, sp->role.word);
5420
5421 offset = offset_in_page(gpa);
5422 pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
5423
5424
5425
5426
5427
5428 if (!(offset & (pte_size - 1)) && bytes == 1)
5429 return false;
5430
5431 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5432 misaligned |= bytes < 4;
5433
5434 return misaligned;
5435 }
5436
5437 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5438 {
5439 unsigned page_offset, quadrant;
5440 u64 *spte;
5441 int level;
5442
5443 page_offset = offset_in_page(gpa);
5444 level = sp->role.level;
5445 *nspte = 1;
5446 if (sp->role.has_4_byte_gpte) {
5447 page_offset <<= 1;
5448
5449
5450
5451
5452
5453 if (level == PT32_ROOT_LEVEL) {
5454 page_offset &= ~7;
5455 page_offset <<= 1;
5456 *nspte = 2;
5457 }
5458 quadrant = page_offset >> PAGE_SHIFT;
5459 page_offset &= ~PAGE_MASK;
5460 if (quadrant != sp->role.quadrant)
5461 return NULL;
5462 }
5463
5464 spte = &sp->spt[page_offset / sizeof(*spte)];
5465 return spte;
5466 }
5467
5468 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
5469 const u8 *new, int bytes,
5470 struct kvm_page_track_notifier_node *node)
5471 {
5472 gfn_t gfn = gpa >> PAGE_SHIFT;
5473 struct kvm_mmu_page *sp;
5474 LIST_HEAD(invalid_list);
5475 u64 entry, gentry, *spte;
5476 int npte;
5477 bool flush = false;
5478
5479
5480
5481
5482
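/*
 * If there are no indirect shadow pages, no guest page tables are
 * write-protected, so there is nothing to update.
 */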
5483 if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5484 return;
5485
5486 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5487
5488 write_lock(&vcpu->kvm->mmu_lock);
5489
5490 gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5491
5492 ++vcpu->kvm->stat.mmu_pte_write;
5493
5494 for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) {
5495 if (detect_write_misaligned(sp, gpa, bytes) ||
5496 detect_write_flooding(sp)) {
5497 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5498 ++vcpu->kvm->stat.mmu_flooded;
5499 continue;
5500 }
5501
5502 spte = get_written_sptes(sp, gpa, &npte);
5503 if (!spte)
5504 continue;
5505
5506 while (npte--) {
5507 entry = *spte;
5508 mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
5509 if (gentry && sp->role.level != PG_LEVEL_4K)
5510 ++vcpu->kvm->stat.mmu_pde_zapped;
5511 if (is_shadow_present_pte(entry))
5512 flush = true;
5513 ++spte;
5514 }
5515 }
5516 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
5517 write_unlock(&vcpu->kvm->mmu_lock);
5518 }
5519
5520 int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
5521 void *insn, int insn_len)
5522 {
5523 int r, emulation_type = EMULTYPE_PF;
5524 bool direct = vcpu->arch.mmu->root_role.direct;
5525
5526 if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
5527 return RET_PF_RETRY;
5528
5529 r = RET_PF_INVALID;
5530 if (unlikely(error_code & PFERR_RSVD_MASK)) {
5531 r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
5532 if (r == RET_PF_EMULATE)
5533 goto emulate;
5534 }
5535
5536 if (r == RET_PF_INVALID) {
5537 r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
5538 lower_32_bits(error_code), false);
5539 if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
5540 return -EIO;
5541 }
5542
5543 if (r < 0)
5544 return r;
5545 if (r != RET_PF_EMULATE)
5546 return 1;
5547
5548
5549
5550
5551
5552
5553
5554
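/*
 * Before emulating, check whether the fault was a write-protection
 * violation while translating the guest's page tables under nested
 * paging.  If so, simply unprotect the page and resume the guest instead
 * of emulating the instruction.
 */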
5555 if (vcpu->arch.mmu->root_role.direct &&
5556 (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5557 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
5558 return 1;
5559 }
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572 if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
5573 emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
5574 emulate:
5575 return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
5576 insn_len);
5577 }
5578 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5579
5580 void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
5581 gva_t gva, hpa_t root_hpa)
5582 {
5583 int i;
5584
5585
5586 if (mmu != &vcpu->arch.guest_mmu) {
5587
5588 if (is_noncanonical_address(gva, vcpu))
5589 return;
5590
5591 static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
5592 }
5593
5594 if (!mmu->invlpg)
5595 return;
5596
5597 if (root_hpa == INVALID_PAGE) {
5598 mmu->invlpg(vcpu, gva, mmu->root.hpa);
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
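/*
 * INVLPG must invalidate global mappings for the VA regardless of PCID,
 * and determining whether a prev_root maps the VA globally costs roughly
 * as much as just syncing it, so sync every valid previous root as well.
 * Mappings not reachable via the current or previous roots are synced
 * when switching to them.
 */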
5611 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5612 if (VALID_PAGE(mmu->prev_roots[i].hpa))
5613 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5614 } else {
5615 mmu->invlpg(vcpu, gva, root_hpa);
5616 }
5617 }
5618
5619 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5620 {
5621 kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
5622 ++vcpu->stat.invlpg;
5623 }
5624 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5625
5626
5627 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5628 {
5629 struct kvm_mmu *mmu = vcpu->arch.mmu;
5630 bool tlb_flush = false;
5631 uint i;
5632
5633 if (pcid == kvm_get_active_pcid(vcpu)) {
5634 if (mmu->invlpg)
5635 mmu->invlpg(vcpu, gva, mmu->root.hpa);
5636 tlb_flush = true;
5637 }
5638
5639 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5640 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5641 pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
5642 if (mmu->invlpg)
5643 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5644 tlb_flush = true;
5645 }
5646 }
5647
5648 if (tlb_flush)
5649 static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
5650
5651 ++vcpu->stat.invlpg;
5652
5653
5654
5655
5656
5657
5658 }
5659
5660 void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
5661 int tdp_max_root_level, int tdp_huge_page_level)
5662 {
5663 tdp_enabled = enable_tdp;
5664 tdp_root_level = tdp_forced_root_level;
5665 max_tdp_level = tdp_max_root_level;
5666
5667
5668
5669
5670
5671
5672
5673
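/*
 * max_huge_page_level reflects KVM's MMU capabilities irrespective of
 * kernel support; KVM never creates a mapping larger than what the kernel
 * uses for the backing HVA, that is enforced later by
 * kvm_mmu_hugepage_adjust().
 */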
5674 if (tdp_enabled)
5675 max_huge_page_level = tdp_huge_page_level;
5676 else if (boot_cpu_has(X86_FEATURE_GBPAGES))
5677 max_huge_page_level = PG_LEVEL_1G;
5678 else
5679 max_huge_page_level = PG_LEVEL_2M;
5680 }
5681 EXPORT_SYMBOL_GPL(kvm_configure_mmu);
5682
5683
5684 typedef bool (*slot_level_handler) (struct kvm *kvm,
5685 struct kvm_rmap_head *rmap_head,
5686 const struct kvm_memory_slot *slot);
5687
5688
5689 static __always_inline bool
5690 slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5691 slot_level_handler fn, int start_level, int end_level,
5692 gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
5693 bool flush)
5694 {
5695 struct slot_rmap_walk_iterator iterator;
5696
5697 for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5698 end_gfn, &iterator) {
5699 if (iterator.rmap)
5700 flush |= fn(kvm, iterator.rmap, memslot);
5701
5702 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
5703 if (flush && flush_on_yield) {
5704 kvm_flush_remote_tlbs_with_address(kvm,
5705 start_gfn,
5706 iterator.gfn - start_gfn + 1);
5707 flush = false;
5708 }
5709 cond_resched_rwlock_write(&kvm->mmu_lock);
5710 }
5711 }
5712
5713 return flush;
5714 }
5715
5716 static __always_inline bool
5717 slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5718 slot_level_handler fn, int start_level, int end_level,
5719 bool flush_on_yield)
5720 {
5721 return slot_handle_level_range(kvm, memslot, fn, start_level,
5722 end_level, memslot->base_gfn,
5723 memslot->base_gfn + memslot->npages - 1,
5724 flush_on_yield, false);
5725 }
5726
5727 static __always_inline bool
5728 slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5729 slot_level_handler fn, bool flush_on_yield)
5730 {
5731 return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
5732 PG_LEVEL_4K, flush_on_yield);
5733 }
5734
5735 static void free_mmu_pages(struct kvm_mmu *mmu)
5736 {
5737 if (!tdp_enabled && mmu->pae_root)
5738 set_memory_encrypted((unsigned long)mmu->pae_root, 1);
5739 free_page((unsigned long)mmu->pae_root);
5740 free_page((unsigned long)mmu->pml4_root);
5741 free_page((unsigned long)mmu->pml5_root);
5742 }
5743
5744 static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
5745 {
5746 struct page *page;
5747 int i;
5748
5749 mmu->root.hpa = INVALID_PAGE;
5750 mmu->root.pgd = 0;
5751 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5752 mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5753
5754
5755 if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
5756 return 0;
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769 if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
5770 return 0;
5771
5772 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5773 if (!page)
5774 return -ENOMEM;
5775
5776 mmu->pae_root = page_address(page);
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786 if (!tdp_enabled)
5787 set_memory_decrypted((unsigned long)mmu->pae_root, 1);
5788 else
5789 WARN_ON_ONCE(shadow_me_value);
5790
5791 for (i = 0; i < 4; ++i)
5792 mmu->pae_root[i] = INVALID_PAE_ROOT;
5793
5794 return 0;
5795 }
5796
5797 int kvm_mmu_create(struct kvm_vcpu *vcpu)
5798 {
5799 int ret;
5800
5801 vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
5802 vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
5803
5804 vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
5805 vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
5806
5807 vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
5808
5809 vcpu->arch.mmu = &vcpu->arch.root_mmu;
5810 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
5811
5812 ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
5813 if (ret)
5814 return ret;
5815
5816 ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
5817 if (ret)
5818 goto fail_allocate_root;
5819
5820 return ret;
5821 fail_allocate_root:
5822 free_mmu_pages(&vcpu->arch.guest_mmu);
5823 return ret;
5824 }
5825
5826 #define BATCH_ZAP_PAGES 10
5827 static void kvm_zap_obsolete_pages(struct kvm *kvm)
5828 {
5829 struct kvm_mmu_page *sp, *node;
5830 int nr_zapped, batch = 0;
5831 bool unstable;
5832
5833 restart:
5834 list_for_each_entry_safe_reverse(sp, node,
5835 &kvm->arch.active_mmu_pages, link) {
5836
5837
5838
5839
5840 if (!is_obsolete_sp(kvm, sp))
5841 break;
5842
5843
5844
5845
5846
5847
5848 if (WARN_ON(sp->role.invalid))
5849 continue;
5850
5851
5852
5853
5854
5855
5856
5857 if (batch >= BATCH_ZAP_PAGES &&
5858 cond_resched_rwlock_write(&kvm->mmu_lock)) {
5859 batch = 0;
5860 goto restart;
5861 }
5862
5863 unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
5864 &kvm->arch.zapped_obsolete_pages, &nr_zapped);
5865 batch += nr_zapped;
5866
5867 if (unstable)
5868 goto restart;
5869 }
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880 kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
5881 }
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
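/*
 * Fast-invalidate all shadow pages: bump the MMU generation so existing
 * pages become obsolete, force vCPUs to drop their obsolete roots, then
 * zap the obsolete pages in batches while new, non-obsolete roots can be
 * built in parallel.
 */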
5892 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
5893 {
5894 lockdep_assert_held(&kvm->slots_lock);
5895
5896 write_lock(&kvm->mmu_lock);
5897 trace_kvm_mmu_zap_all_fast(kvm);
5898
5899
5900
5901
5902
5903
5904
5905
5906 kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
5907
5908
5909
5910
5911
5912
5913
5914 if (is_tdp_mmu_enabled(kvm))
5915 kvm_tdp_mmu_invalidate_all_roots(kvm);
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
5926
5927 kvm_zap_obsolete_pages(kvm);
5928
5929 write_unlock(&kvm->mmu_lock);
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939 if (is_tdp_mmu_enabled(kvm))
5940 kvm_tdp_mmu_zap_invalidated_roots(kvm);
5941 }
5942
5943 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
5944 {
5945 return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
5946 }
5947
5948 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
5949 struct kvm_memory_slot *slot,
5950 struct kvm_page_track_notifier_node *node)
5951 {
5952 kvm_mmu_zap_all_fast(kvm);
5953 }
5954
5955 int kvm_mmu_init_vm(struct kvm *kvm)
5956 {
5957 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5958 int r;
5959
5960 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5961 INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
5962 INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
5963 spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
5964
5965 r = kvm_mmu_init_tdp_mmu(kvm);
5966 if (r < 0)
5967 return r;
5968
5969 node->track_write = kvm_mmu_pte_write;
5970 node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5971 kvm_page_track_register_notifier(kvm, node);
5972
5973 kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
5974 kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
5975
5976 kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO;
5977
5978 kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
5979 kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
5980
5981 return 0;
5982 }
5983
5984 static void mmu_free_vm_memory_caches(struct kvm *kvm)
5985 {
5986 kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache);
5987 kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache);
5988 kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache);
5989 }
5990
5991 void kvm_mmu_uninit_vm(struct kvm *kvm)
5992 {
5993 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5994
5995 kvm_page_track_unregister_notifier(kvm, node);
5996
5997 kvm_mmu_uninit_tdp_mmu(kvm);
5998
5999 mmu_free_vm_memory_caches(kvm);
6000 }
6001
6002 static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
6003 {
6004 const struct kvm_memory_slot *memslot;
6005 struct kvm_memslots *slots;
6006 struct kvm_memslot_iter iter;
6007 bool flush = false;
6008 gfn_t start, end;
6009 int i;
6010
6011 if (!kvm_memslots_have_rmaps(kvm))
6012 return flush;
6013
6014 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
6015 slots = __kvm_memslots(kvm, i);
6016
6017 kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
6018 memslot = iter.slot;
6019 start = max(gfn_start, memslot->base_gfn);
6020 end = min(gfn_end, memslot->base_gfn + memslot->npages);
6021 if (WARN_ON_ONCE(start >= end))
6022 continue;
6023
6024 flush = slot_handle_level_range(kvm, memslot, __kvm_zap_rmap,
6025 PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
6026 start, end - 1, true, flush);
6027 }
6028 }
6029
6030 return flush;
6031 }
6032
6033
6034
6035
6036
6037 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
6038 {
6039 bool flush;
6040 int i;
6041
6042 if (WARN_ON_ONCE(gfn_end <= gfn_start))
6043 return;
6044
6045 write_lock(&kvm->mmu_lock);
6046
6047 kvm_mmu_invalidate_begin(kvm, gfn_start, gfn_end);
6048
6049 flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
6050
6051 if (is_tdp_mmu_enabled(kvm)) {
6052 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
6053 flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
6054 gfn_end, true, flush);
6055 }
6056
6057 if (flush)
6058 kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
6059 gfn_end - gfn_start);
6060
6061 kvm_mmu_invalidate_end(kvm, gfn_start, gfn_end);
6062
6063 write_unlock(&kvm->mmu_lock);
6064 }
6065
6066 static bool slot_rmap_write_protect(struct kvm *kvm,
6067 struct kvm_rmap_head *rmap_head,
6068 const struct kvm_memory_slot *slot)
6069 {
6070 return rmap_write_protect(rmap_head, false);
6071 }
6072
6073 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
6074 const struct kvm_memory_slot *memslot,
6075 int start_level)
6076 {
6077 if (kvm_memslots_have_rmaps(kvm)) {
6078 write_lock(&kvm->mmu_lock);
6079 slot_handle_level(kvm, memslot, slot_rmap_write_protect,
6080 start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
6081 write_unlock(&kvm->mmu_lock);
6082 }
6083
6084 if (is_tdp_mmu_enabled(kvm)) {
6085 read_lock(&kvm->mmu_lock);
6086 kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
6087 read_unlock(&kvm->mmu_lock);
6088 }
6089 }
6090
6091 static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min)
6092 {
6093 return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
6094 }
6095
6096 static bool need_topup_split_caches_or_resched(struct kvm *kvm)
6097 {
6098 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
6099 return true;
6100
6101
6102
6103
6104
6105
6106 return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) ||
6107 need_topup(&kvm->arch.split_page_header_cache, 1) ||
6108 need_topup(&kvm->arch.split_shadow_page_cache, 1);
6109 }
6110
6111 static int topup_split_caches(struct kvm *kvm)
6112 {
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126 const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS +
6127 KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;
6128 int r;
6129
6130 lockdep_assert_held(&kvm->slots_lock);
6131
6132 r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity,
6133 SPLIT_DESC_CACHE_MIN_NR_OBJECTS);
6134 if (r)
6135 return r;
6136
6137 r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1);
6138 if (r)
6139 return r;
6140
6141 return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1);
6142 }
6143
6144 static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep)
6145 {
6146 struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
6147 struct shadow_page_caches caches = {};
6148 union kvm_mmu_page_role role;
6149 unsigned int access;
6150 gfn_t gfn;
6151
6152 gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
6153 access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep));
6154
6155
6156
6157
6158
6159
6160
6161 role = kvm_mmu_child_role(huge_sptep, true, access);
6162
6163
6164 caches.page_header_cache = &kvm->arch.split_page_header_cache;
6165 caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
6166
6167
6168 return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
6169 }
6170
6171 static void shadow_mmu_split_huge_page(struct kvm *kvm,
6172 const struct kvm_memory_slot *slot,
6173 u64 *huge_sptep)
6174
6175 {
6176 struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache;
6177 u64 huge_spte = READ_ONCE(*huge_sptep);
6178 struct kvm_mmu_page *sp;
6179 bool flush = false;
6180 u64 *sptep, spte;
6181 gfn_t gfn;
6182 int index;
6183
6184 sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep);
6185
6186 for (index = 0; index < SPTE_ENT_PER_PAGE; index++) {
6187 sptep = &sp->spt[index];
6188 gfn = kvm_mmu_page_get_gfn(sp, index);
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204 if (is_shadow_present_pte(*sptep)) {
6205 flush |= !is_last_spte(*sptep, sp->role.level);
6206 continue;
6207 }
6208
6209 spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
6210 mmu_spte_set(sptep, spte);
6211 __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
6212 }
6213
6214 __link_shadow_page(kvm, cache, huge_sptep, sp, flush);
6215 }
6216
6217 static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
6218 const struct kvm_memory_slot *slot,
6219 u64 *huge_sptep)
6220 {
6221 struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
6222 int level, r = 0;
6223 gfn_t gfn;
6224 u64 spte;
6225
6226
6227 gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
6228 level = huge_sp->role.level;
6229 spte = *huge_sptep;
6230
6231 if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) {
6232 r = -ENOSPC;
6233 goto out;
6234 }
6235
6236 if (need_topup_split_caches_or_resched(kvm)) {
6237 write_unlock(&kvm->mmu_lock);
6238 cond_resched();
6239
6240
6241
6242
6243
6244 r = topup_split_caches(kvm) ?: -EAGAIN;
6245 write_lock(&kvm->mmu_lock);
6246 goto out;
6247 }
6248
6249 shadow_mmu_split_huge_page(kvm, slot, huge_sptep);
6250
6251 out:
6252 trace_kvm_mmu_split_huge_page(gfn, spte, level, r);
6253 return r;
6254 }
6255
6256 static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm,
6257 struct kvm_rmap_head *rmap_head,
6258 const struct kvm_memory_slot *slot)
6259 {
6260 struct rmap_iterator iter;
6261 struct kvm_mmu_page *sp;
6262 u64 *huge_sptep;
6263 int r;
6264
6265 restart:
6266 for_each_rmap_spte(rmap_head, &iter, huge_sptep) {
6267 sp = sptep_to_sp(huge_sptep);
6268
6269
6270 if (WARN_ON_ONCE(!sp->role.guest_mode))
6271 continue;
6272
6273
6274 if (WARN_ON_ONCE(!is_large_pte(*huge_sptep)))
6275 continue;
6276
6277
6278 if (WARN_ON_ONCE(sp->unsync))
6279 continue;
6280
6281
6282 if (sp->role.invalid)
6283 continue;
6284
6285 r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep);
6286
6287
6288
6289
6290
6291
6292 if (!r || r == -EAGAIN)
6293 goto restart;
6294
6295
6296 break;
6297 }
6298
6299 return false;
6300 }
6301
6302 static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
6303 const struct kvm_memory_slot *slot,
6304 gfn_t start, gfn_t end,
6305 int target_level)
6306 {
6307 int level;
6308
6309
6310
6311
6312
6313
6314
6315 for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) {
6316 slot_handle_level_range(kvm, slot, shadow_mmu_try_split_huge_pages,
6317 level, level, start, end - 1, true, false);
6318 }
6319 }
6320
6321
6322 void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
6323 const struct kvm_memory_slot *memslot,
6324 u64 start, u64 end,
6325 int target_level)
6326 {
6327 if (!is_tdp_mmu_enabled(kvm))
6328 return;
6329
6330 if (kvm_memslots_have_rmaps(kvm))
6331 kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
6332
6333 kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false);
6334
6335
6336
6337
6338
6339 }
6340
6341 void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
6342 const struct kvm_memory_slot *memslot,
6343 int target_level)
6344 {
6345 u64 start = memslot->base_gfn;
6346 u64 end = start + memslot->npages;
6347
6348 if (!is_tdp_mmu_enabled(kvm))
6349 return;
6350
6351 if (kvm_memslots_have_rmaps(kvm)) {
6352 write_lock(&kvm->mmu_lock);
6353 kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
6354 write_unlock(&kvm->mmu_lock);
6355 }
6356
6357 read_lock(&kvm->mmu_lock);
6358 kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
6359 read_unlock(&kvm->mmu_lock);
6360
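/*
 * No TLB flush is necessary here.  KVM flushes TLBs after
 * write-protecting and/or clearing dirty state on the newly split SPTEs,
 * which ensures writes to those SPTEs are reflected in the dirty log.
 */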
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370 }
6371
6372 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
6373 struct kvm_rmap_head *rmap_head,
6374 const struct kvm_memory_slot *slot)
6375 {
6376 u64 *sptep;
6377 struct rmap_iterator iter;
6378 int need_tlb_flush = 0;
6379 struct kvm_mmu_page *sp;
6380
6381 restart:
6382 for_each_rmap_spte(rmap_head, &iter, sptep) {
6383 sp = sptep_to_sp(sptep);
6384
6385
6386
6387
6388
6389
6390
6391
6392 if (sp->role.direct &&
6393 sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
6394 PG_LEVEL_NUM)) {
6395 kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
6396
6397 if (kvm_available_flush_tlb_with_range())
6398 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
6399 KVM_PAGES_PER_HPAGE(sp->role.level));
6400 else
6401 need_tlb_flush = 1;
6402
6403 goto restart;
6404 }
6405 }
6406
6407 return need_tlb_flush;
6408 }
6409
6410 static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
6411 const struct kvm_memory_slot *slot)
6412 {
6413
6414
6415
6416
6417 if (slot_handle_level(kvm, slot, kvm_mmu_zap_collapsible_spte,
6418 PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
6419 kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
6420 }
6421
6422 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
6423 const struct kvm_memory_slot *slot)
6424 {
6425 if (kvm_memslots_have_rmaps(kvm)) {
6426 write_lock(&kvm->mmu_lock);
6427 kvm_rmap_zap_collapsible_sptes(kvm, slot);
6428 write_unlock(&kvm->mmu_lock);
6429 }
6430
6431 if (is_tdp_mmu_enabled(kvm)) {
6432 read_lock(&kvm->mmu_lock);
6433 kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
6434 read_unlock(&kvm->mmu_lock);
6435 }
6436 }
6437
6438 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
6439 const struct kvm_memory_slot *memslot)
6440 {
6441
6442
6443
6444
6445
6446
6447
6448 lockdep_assert_held(&kvm->slots_lock);
6449 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6450 memslot->npages);
6451 }
6452
6453 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
6454 const struct kvm_memory_slot *memslot)
6455 {
6456 if (kvm_memslots_have_rmaps(kvm)) {
6457 write_lock(&kvm->mmu_lock);
6458
6459
6460
6461
6462 slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
6463 write_unlock(&kvm->mmu_lock);
6464 }
6465
6466 if (is_tdp_mmu_enabled(kvm)) {
6467 read_lock(&kvm->mmu_lock);
6468 kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
6469 read_unlock(&kvm->mmu_lock);
6470 }
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480 }
6481
6482 void kvm_mmu_zap_all(struct kvm *kvm)
6483 {
6484 struct kvm_mmu_page *sp, *node;
6485 LIST_HEAD(invalid_list);
6486 int ign;
6487
6488 write_lock(&kvm->mmu_lock);
6489 restart:
6490 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
6491 if (WARN_ON(sp->role.invalid))
6492 continue;
6493 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
6494 goto restart;
6495 if (cond_resched_rwlock_write(&kvm->mmu_lock))
6496 goto restart;
6497 }
6498
6499 kvm_mmu_commit_zap_page(kvm, &invalid_list);
6500
6501 if (is_tdp_mmu_enabled(kvm))
6502 kvm_tdp_mmu_zap_all(kvm);
6503
6504 write_unlock(&kvm->mmu_lock);
6505 }
6506
6507 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
6508 {
6509 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
6510
6511 gen &= MMIO_SPTE_GEN_MASK;
6512
6513
6514
6515
6516
6517
6518
6519
6520 gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
6521
6522
6523
6524
6525
6526 if (unlikely(gen == 0)) {
6527 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
6528 kvm_mmu_zap_all_fast(kvm);
6529 }
6530 }
6531
6532 static unsigned long
6533 mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
6534 {
6535 struct kvm *kvm;
6536 int nr_to_scan = sc->nr_to_scan;
6537 unsigned long freed = 0;
6538
6539 mutex_lock(&kvm_lock);
6540
6541 list_for_each_entry(kvm, &vm_list, vm_list) {
6542 int idx;
6543 LIST_HEAD(invalid_list);
6544
6545
6546
6547
6548
6549
6550
6551 if (!nr_to_scan--)
6552 break;
6553
6554
6555
6556
6557
6558
6559 if (!kvm->arch.n_used_mmu_pages &&
6560 !kvm_has_zapped_obsolete_pages(kvm))
6561 continue;
6562
6563 idx = srcu_read_lock(&kvm->srcu);
6564 write_lock(&kvm->mmu_lock);
6565
6566 if (kvm_has_zapped_obsolete_pages(kvm)) {
6567 kvm_mmu_commit_zap_page(kvm,
6568 &kvm->arch.zapped_obsolete_pages);
6569 goto unlock;
6570 }
6571
6572 freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
6573
6574 unlock:
6575 write_unlock(&kvm->mmu_lock);
6576 srcu_read_unlock(&kvm->srcu, idx);
6577
6578
6579
6580
6581
6582
6583 list_move_tail(&kvm->vm_list, &vm_list);
6584 break;
6585 }
6586
6587 mutex_unlock(&kvm_lock);
6588 return freed;
6589 }
6590
6591 static unsigned long
6592 mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
6593 {
6594 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
6595 }
6596
6597 static struct shrinker mmu_shrinker = {
6598 .count_objects = mmu_shrink_count,
6599 .scan_objects = mmu_shrink_scan,
6600 .seeks = DEFAULT_SEEKS * 10,
6601 };
6602
6603 static void mmu_destroy_caches(void)
6604 {
6605 kmem_cache_destroy(pte_list_desc_cache);
6606 kmem_cache_destroy(mmu_page_header_cache);
6607 }
6608
6609 static bool get_nx_auto_mode(void)
6610 {
6611
6612 return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
6613 }
6614
6615 static void __set_nx_huge_pages(bool val)
6616 {
6617 nx_huge_pages = itlb_multihit_kvm_mitigation = val;
6618 }
6619
6620 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
6621 {
6622 bool old_val = nx_huge_pages;
6623 bool new_val;
6624
6625
6626 if (sysfs_streq(val, "off"))
6627 new_val = 0;
6628 else if (sysfs_streq(val, "force"))
6629 new_val = 1;
6630 else if (sysfs_streq(val, "auto"))
6631 new_val = get_nx_auto_mode();
6632 else if (strtobool(val, &new_val) < 0)
6633 return -EINVAL;
6634
6635 __set_nx_huge_pages(new_val);
6636
6637 if (new_val != old_val) {
6638 struct kvm *kvm;
6639
6640 mutex_lock(&kvm_lock);
6641
6642 list_for_each_entry(kvm, &vm_list, vm_list) {
6643 mutex_lock(&kvm->slots_lock);
6644 kvm_mmu_zap_all_fast(kvm);
6645 mutex_unlock(&kvm->slots_lock);
6646
6647 wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6648 }
6649 mutex_unlock(&kvm_lock);
6650 }
6651
6652 return 0;
6653 }
6654
6655
6656
6657
6658
6659
6660
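/*
 * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded,
 * as its default of -1 is technically undefined behavior for a boolean.
 */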
6661 void __init kvm_mmu_x86_module_init(void)
6662 {
6663 if (nx_huge_pages == -1)
6664 __set_nx_huge_pages(get_nx_auto_mode());
6665
6666 kvm_mmu_spte_module_init();
6667 }
6668
6669
6670
6671
6672
6673
6674 int kvm_mmu_vendor_module_init(void)
6675 {
6676 int ret = -ENOMEM;
6677
6678
6679
6680
6681
6682
6683
6684 BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6685 BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
6686 BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));
6687
6688 kvm_mmu_reset_all_pte_masks();
6689
6690 pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6691 sizeof(struct pte_list_desc),
6692 0, SLAB_ACCOUNT, NULL);
6693 if (!pte_list_desc_cache)
6694 goto out;
6695
6696 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6697 sizeof(struct kvm_mmu_page),
6698 0, SLAB_ACCOUNT, NULL);
6699 if (!mmu_page_header_cache)
6700 goto out;
6701
6702 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6703 goto out;
6704
6705 ret = register_shrinker(&mmu_shrinker, "x86-mmu");
6706 if (ret)
6707 goto out;
6708
6709 return 0;
6710
6711 out:
6712 mmu_destroy_caches();
6713 return ret;
6714 }
6715
6716 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6717 {
6718 kvm_mmu_unload(vcpu);
6719 free_mmu_pages(&vcpu->arch.root_mmu);
6720 free_mmu_pages(&vcpu->arch.guest_mmu);
6721 mmu_free_memory_caches(vcpu);
6722 }
6723
6724 void kvm_mmu_vendor_module_exit(void)
6725 {
6726 mmu_destroy_caches();
6727 percpu_counter_destroy(&kvm_total_used_mmu_pages);
6728 unregister_shrinker(&mmu_shrinker);
6729 }
6730
6731
6732
6733
6734
6735 static bool calc_nx_huge_pages_recovery_period(uint *period)
6736 {
6737
6738
6739
6740
6741 bool enabled = READ_ONCE(nx_huge_pages);
6742 uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6743
6744 if (!enabled || !ratio)
6745 return false;
6746
6747 *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
6748 if (!*period) {
6749
6750 ratio = min(ratio, 3600u);
6751 *period = 60 * 60 * 1000 / ratio;
6752 }
6753 return true;
6754 }
6755
6756 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
6757 {
6758 bool was_recovery_enabled, is_recovery_enabled;
6759 uint old_period, new_period;
6760 int err;
6761
6762 was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
6763
6764 err = param_set_uint(val, kp);
6765 if (err)
6766 return err;
6767
6768 is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
6769
6770 if (is_recovery_enabled &&
6771 (!was_recovery_enabled || old_period > new_period)) {
6772 struct kvm *kvm;
6773
6774 mutex_lock(&kvm_lock);
6775
6776 list_for_each_entry(kvm, &vm_list, vm_list)
6777 wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6778
6779 mutex_unlock(&kvm_lock);
6780 }
6781
6782 return err;
6783 }
6784
6785 static void kvm_recover_nx_lpages(struct kvm *kvm)
6786 {
6787 unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
6788 int rcu_idx;
6789 struct kvm_mmu_page *sp;
6790 unsigned int ratio;
6791 LIST_HEAD(invalid_list);
6792 bool flush = false;
6793 ulong to_zap;
6794
6795 rcu_idx = srcu_read_lock(&kvm->srcu);
6796 write_lock(&kvm->mmu_lock);
6797
6798
6799
6800
6801
6802
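/*
 * Zapping TDP MMU shadow pages, including the remote TLB flush, must be
 * done under RCU protection, because the pages are freed via RCU callback.
 */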
6803 rcu_read_lock();
6804
6805 ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6806 to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
6807 for ( ; to_zap; --to_zap) {
6808 if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
6809 break;
6810
6811
6812
6813
6814
6815
6816 sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
6817 struct kvm_mmu_page,
6818 lpage_disallowed_link);
6819 WARN_ON_ONCE(!sp->lpage_disallowed);
6820 if (is_tdp_mmu_page(sp)) {
6821 flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
6822 } else {
6823 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
6824 WARN_ON_ONCE(sp->lpage_disallowed);
6825 }
6826
6827 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
6828 kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
6829 rcu_read_unlock();
6830
6831 cond_resched_rwlock_write(&kvm->mmu_lock);
6832 flush = false;
6833
6834 rcu_read_lock();
6835 }
6836 }
6837 kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
6838
6839 rcu_read_unlock();
6840
6841 write_unlock(&kvm->mmu_lock);
6842 srcu_read_unlock(&kvm->srcu, rcu_idx);
6843 }
6844
6845 static long get_nx_lpage_recovery_timeout(u64 start_time)
6846 {
6847 bool enabled;
6848 uint period;
6849
6850 enabled = calc_nx_huge_pages_recovery_period(&period);
6851
6852 return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
6853 : MAX_SCHEDULE_TIMEOUT;
6854 }
6855
6856 static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
6857 {
6858 u64 start_time;
6859 long remaining_time;
6860
6861 while (true) {
6862 start_time = get_jiffies_64();
6863 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6864
6865 set_current_state(TASK_INTERRUPTIBLE);
6866 while (!kthread_should_stop() && remaining_time > 0) {
6867 schedule_timeout(remaining_time);
6868 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6869 set_current_state(TASK_INTERRUPTIBLE);
6870 }
6871
6872 set_current_state(TASK_RUNNING);
6873
6874 if (kthread_should_stop())
6875 return 0;
6876
6877 kvm_recover_nx_lpages(kvm);
6878 }
6879 }
6880
6881 int kvm_mmu_post_init_vm(struct kvm *kvm)
6882 {
6883 int err;
6884
6885 err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
6886 "kvm-nx-lpage-recovery",
6887 &kvm->arch.nx_lpage_recovery_thread);
6888 if (!err)
6889 kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
6890
6891 return err;
6892 }
6893
6894 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
6895 {
6896 if (kvm->arch.nx_lpage_recovery_thread)
6897 kthread_stop(kvm->arch.nx_lpage_recovery_thread);
6898 }