// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	struct workqueue_struct *wq;

	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return 0;

	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
	if (!wq)
		return -ENOMEM;

	kvm->arch.tdp_mmu_enabled = true;
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
	kvm->arch.tdp_mmu_zap_wq = wq;
	return 1;
}

static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	rcu_barrier();
}

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

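/*
 * This is called via call_rcu() so that page table memory is freed only after
 * any concurrent lockless (RCU-protected) walkers are done with it.
 */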
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared);

static void tdp_mmu_zap_root_work(struct work_struct *work)
{
	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
						 tdp_mmu_async_work);
	struct kvm *kvm = root->tdp_mmu_async_data;

	read_lock(&kvm->mmu_lock);

	tdp_mmu_zap_root(kvm, root, true);

	kvm_tdp_mmu_put_root(kvm, root, true);

	read_unlock(&kvm->mmu_lock);
}

static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	root->tdp_mmu_async_data = kvm;
	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
}

static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
{
	union kvm_mmu_page_role role = page->role;
	role.invalid = true;

	role.word = xchg(&page->role.word, role.word);
	return role.invalid;
}

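/*
 * Drop a reference to @root.  When the last reference is put on a root that is
 * not yet invalid, the root is marked invalid, re-referenced, and handed to
 * the zap workqueue; when the last reference is put on an already-invalid
 * root, it is unlinked and freed after an RCU grace period.
 */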
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!root->tdp_mmu_page);

	if (!kvm_tdp_root_mark_invalid(root)) {
		refcount_set(&root->tdp_mmu_root_count, 1);

		tdp_mmu_schedule_zap_root(kvm, root);
		return;
	}

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

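/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL), taking a reference on it.  The reference on @prev_root is dropped.
 * If @only_valid is true, roots that have been marked invalid are skipped.
 * Returns NULL when there are no more roots to iterate over.
 */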
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool shared, bool only_valid)
{
	struct kvm_mmu_page *next_root;

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root) {
		if ((!only_valid || !next_root->role.invalid) &&
		    kvm_tdp_mmu_get_root(next_root))
			break;

		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);
	}

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}

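/*
 * Iterate over the roots for @_as_id, taking a reference on each root so that
 * it is safe to yield and drop mmu_lock between iterations.  @_shared states
 * whether mmu_lock is held for read (true) or write (false).
 */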
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
		    kvm_mmu_page_as_id(_root) != _as_id) {			\
		} else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)

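/*
 * Iterate over all TDP MMU roots for @_as_id.  Does not take a reference on
 * the roots and does not yield, so mmu_lock must be held for write for the
 * entire walk.
 */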
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
		    kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	return sp;
}

static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->ptep = sptep;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent_sp;
	union kvm_mmu_page_role role;

	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

	role = parent_sp->role;
	role.level--;

	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(root))
			goto out;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	refcount_set(&root->tdp_mmu_root_count, 1);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

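/*
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */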
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

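/*
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 */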
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp, shared);

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		tdp_ptep_t sptep = pt + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_spte;

		if (shared) {
			for (;;) {
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_spte))
					break;
				cpu_relax();
			}
		} else {
			old_spte = kvm_tdp_mmu_read_spte(sptep);
			if (!is_shadow_present_pte(old_spte))
				continue;

			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
							  REMOVED_SPTE, level);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_spte, REMOVED_SPTE, level, shared);
	}

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

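/*
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */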
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	if (!was_present && !is_present) {
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

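/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
 * associated bookkeeping.  Does not mark the page dirty in KVM's dirty
 * bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the SPTE.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set.  In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the SPTE.
 */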
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter,
					  u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);

	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));

	lockdep_assert_held_read(&kvm->mmu_lock);

	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
		return -EBUSY;

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return 0;
}

static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter)
{
	int ret;

	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
	if (ret)
		return ret;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	__kvm_tdp_mmu_write_spte(iter->sptep, 0);

	return 0;
}

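/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:	      KVM instance
 * @as_id:	      Address space ID, i.e. regular vs. SMM
 * @sptep:	      Pointer to the SPTE
 * @old_spte:	      The current value of the SPTE
 * @new_spte:	      The new value that will be set for the SPTE
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page (leave unset only when handling an
 *		      access-tracking MMU notifier, to avoid double counting).
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made (leave unset only
 *		      for certain dirty logging operations).
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits set.
 */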
static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			      u64 old_spte, u64 new_spte, gfn_t gfn, int level,
			      bool record_acc_track, bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));

	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);

	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);

	if (record_acc_track)
		handle_changed_spte_acc_track(old_spte, new_spte, level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
					      new_spte, level);
	return old_spte;
}

static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				     u64 new_spte, bool record_acc_track,
				     bool record_dirty_log)
{
	WARN_ON_ONCE(iter->yielded);

	iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
					    iter->old_spte, new_spte,
					    iter->gfn, iter->level,
					    record_acc_track, record_dirty_log);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)	\
		if (!is_shadow_present_pte(_iter.old_spte) ||	\
		    !is_last_spte(_iter.old_spte, _iter.level))	\
			continue;				\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)

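/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structure from the root.
 *
 * Returns true if this function yielded.
 */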
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	WARN_ON(iter->yielded);

	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		if (flush)
			kvm_flush_remote_tlbs(kvm);

		rcu_read_unlock();

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		iter->yielded = true;
	}

	return iter->yielded;
}

static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
	return kvm_mmu_max_gfn() + 1;
}

static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
{
	struct tdp_iter iter;

	gfn_t end = tdp_mmu_max_gfn_exclusive();
	gfn_t start = 0;

	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (iter.level > zap_level)
			continue;

		if (!shared)
			tdp_mmu_set_spte(kvm, &iter, 0);
		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
			goto retry;
	}
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
{
	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);

	rcu_read_unlock();
}

bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 old_spte;

	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
		return false;

	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
			   sp->gfn, sp->role.level + 1, true, true);

	return true;
}

static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	end = min(end, tdp_mmu_max_gfn_exclusive());

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		flush = true;
	}

	rcu_read_unlock();

	return flush;
}

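/*
 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots in the
 * given address space.  Returns true if a TLB flush is needed before
 * releasing the MMU lock, i.e. if one or more SPTEs were zapped since the
 * MMU lock was last taken.
 */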
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
			   bool can_yield, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *root;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
			tdp_mmu_zap_root(kvm, root, false);
	}
}

void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
}

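/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update.  The actual
 * zapping is done asynchronously by the zap workqueue, so a reference is
 * taken on each root before it is queued.
 *
 * Must be called while holding mmu_lock for write.
 */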
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		if (!root->role.invalid &&
		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
			root->role.invalid = true;
			tdp_mmu_schedule_zap_root(kvm, root);
		}
	}
}

static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	WARN_ON(sp->role.level != fault->goal_level);
	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch, true,
				   fault->map_writable, &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;
	else if (is_shadow_present_pte(iter->old_spte) &&
		 !is_last_spte(iter->old_spte, iter->level))
		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
						   KVM_PAGES_PER_HPAGE(iter->level + 1));

	if (wrprot) {
		if (fault->write)
			ret = RET_PF_EMULATE;
	}

	if (unlikely(is_mmio_spte(new_spte))) {
		vcpu->stat.pf_mmio_spte_created++;
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	return ret;
}

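/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @account_nx: True if this page table is being installed to split a
 *              non-executable huge page.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed. Non-0 if the page table
 *          could not be installed (e.g. the atomic compare-exchange failed).
 */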
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool account_nx,
			   bool shared)
{
	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
	int ret = 0;

	if (shared) {
		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
		if (ret)
			return ret;
	} else {
		tdp_mmu_set_spte(kvm, iter, spte);
	}

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	return 0;
}

int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	int ret;

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		if (iter.level == fault->goal_level)
			break;

		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
				break;

			iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			bool account_nx = fault->huge_page_disallowed &&
					  fault->req_level >= iter.level;

			if (is_removed_spte(iter.old_spte))
				break;

			sp = tdp_mmu_alloc_sp(vcpu);
			tdp_mmu_init_child_sp(sp, &iter);

			if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
				tdp_mmu_free_sp(sp);
				break;
			}
		}
	}

	if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
	rcu_read_unlock();

	return ret;
}

bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
				     range->end, range->may_block, flush);
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		rcu_read_lock();

		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);

		rcu_read_unlock();
	}

	return ret;
}

static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte = 0;

	if (!is_accessed_spte(iter->old_spte))
		return false;

	new_spte = iter->old_spte;

	if (spte_ad_enabled(new_spte)) {
		new_spte &= ~shadow_accessed_mask;
	} else {
		if (is_writable_pte(new_spte))
			kvm_set_pfn_dirty(spte_to_pfn(new_spte));

		new_spte = mark_spte_for_access_track(new_spte);
	}

	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);

	return true;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	u64 new_spte;

	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);

	if (iter->level != PG_LEVEL_4K ||
	    !is_shadow_present_pte(iter->old_spte))
		return false;

	tdp_mmu_set_spte(kvm, iter, 0);

	if (!pte_write(range->pte)) {
		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
								  pte_pfn(range->pte));

		tdp_mmu_set_spte(kvm, iter, new_spte);
	}

	return true;
}

bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}

static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
					     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
{
	struct kvm_mmu_page *sp;

	gfp |= __GFP_ZERO;

	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
	if (!sp)
		return NULL;

	sp->spt = (void *)__get_free_page(gfp);
	if (!sp->spt) {
		kmem_cache_free(mmu_page_header_cache, sp);
		return NULL;
	}

	return sp;
}

static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
						       struct tdp_iter *iter,
						       bool shared)
{
	struct kvm_mmu_page *sp;

	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
	if (sp)
		return sp;

	rcu_read_unlock();

	if (shared)
		read_unlock(&kvm->mmu_lock);
	else
		write_unlock(&kvm->mmu_lock);

	iter->yielded = true;
	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);

	if (shared)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);

	rcu_read_lock();

	return sp;
}

static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
{
	const u64 huge_spte = iter->old_spte;
	const int level = iter->level;
	int ret, i;

	tdp_mmu_init_child_sp(sp, iter);

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
		sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);

	ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
	if (ret)
		goto out;

	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);

out:
	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
	return ret;
}

static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	struct kvm_mmu_page *sp = NULL;
	struct tdp_iter iter;
	int ret = 0;

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
			continue;

		if (!sp) {
			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
			if (!sp) {
				ret = -ENOMEM;
				trace_kvm_mmu_split_huge_page(iter.gfn,
							      iter.old_spte,
							      iter.level, ret);
				break;
			}

			if (iter.yielded)
				continue;
		}

		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
			goto retry;

		sp = NULL;
	}

	rcu_read_unlock();

	if (sp)
		tdp_mmu_free_sp(sp);

	return ret;
}

void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
				      const struct kvm_memory_slot *slot,
				      gfn_t start, gfn_t end,
				      int target_level, bool shared)
{
	struct kvm_mmu_page *root;
	int r = 0;

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
		if (r) {
			kvm_tdp_mmu_put_root(kvm, root, shared);
			break;
		}
	}
}

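/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot.  If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE.  Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */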
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
				  const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
						  slot->base_gfn + slot->npages);

	return spte_set;
}

static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
	}

	rcu_read_unlock();
}

void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       const struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	int max_mapping_level;

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
		    !is_shadow_present_pte(iter.old_spte))
			continue;

		if (is_last_spte(iter.old_spte, iter.level))
			continue;

		if (iter.gfn < start || iter.gfn >= end)
			continue;

		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
							      iter.gfn, PG_LEVEL_NUM);
		if (max_mapping_level < iter.level)
			continue;

		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
			goto retry;
	}

	rcu_read_unlock();
}

void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		zap_collapsible_spte_range(kvm, root, slot);
}

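/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */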
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		if (new_spte == iter.old_spte)
			break;

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn,
				   int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

	return spte_set;
}

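/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * The caller must provide RCU protection for the lockless walk.
 */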
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->root_role.level;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	return leaf;
}

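/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
 * SPTE is reached, NULL is returned.
 *
 * The caller must provide RCU protection for the walk, and the returned sptep
 * must not be used once that protection is dropped.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */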
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
					u64 *spte)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	tdp_ptep_t sptep = NULL;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		*spte = iter.old_spte;
		sptep = iter.sptep;
	}

	return rcu_dereference(sptep);
}