0001 // SPDX-License-Identifier: GPL-2.0
0002 
0003 #include "mmu.h"
0004 #include "mmu_internal.h"
0005 #include "mmutrace.h"
0006 #include "tdp_iter.h"
0007 #include "tdp_mmu.h"
0008 #include "spte.h"
0009 
0010 #include <asm/cmpxchg.h>
0011 #include <trace/events/kvm.h>
0012 
0013 static bool __read_mostly tdp_mmu_enabled = true;
0014 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
0015 
0016 /* Initializes the TDP MMU for the VM, if enabled. */
0017 int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
0018 {
0019     struct workqueue_struct *wq;
0020 
0021     if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
0022         return 0;
0023 
0024     wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
0025     if (!wq)
0026         return -ENOMEM;
0027 
0028     /* This should not be changed for the lifetime of the VM. */
0029     kvm->arch.tdp_mmu_enabled = true;
0030     INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
0031     spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
0032     INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
0033     kvm->arch.tdp_mmu_zap_wq = wq;
0034     return 1;
0035 }
0036 
0037 /* Arbitrarily returns true so that this may be used in if statements. */
0038 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
0039                                  bool shared)
0040 {
0041     if (shared)
0042         lockdep_assert_held_read(&kvm->mmu_lock);
0043     else
0044         lockdep_assert_held_write(&kvm->mmu_lock);
0045 
0046     return true;
0047 }
0048 
0049 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
0050 {
0051     if (!kvm->arch.tdp_mmu_enabled)
0052         return;
0053 
0054     /* Also waits for any queued work items.  */
0055     destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
0056 
0057     WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
0058     WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
0059 
0060     /*
0061      * Ensure that all the outstanding RCU callbacks to free shadow pages
0062      * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
0063      * can call kvm_tdp_mmu_put_root and create new callbacks.
0064      */
0065     rcu_barrier();
0066 }
0067 
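/* Frees the page table page backing @sp along with @sp itself. */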
0068 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
0069 {
0070     free_page((unsigned long)sp->spt);
0071     kmem_cache_free(mmu_page_header_cache, sp);
0072 }
0073 
0074 /*
0075  * This is called through call_rcu in order to free TDP page table memory
0076  * safely with respect to other kernel threads that may be operating on
0077  * the memory.
0078  * By only accessing TDP MMU page table memory in an RCU read critical
0079  * section, and freeing it after a grace period, lockless access to that
0080  * memory won't use it after it is freed.
0081  */
0082 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
0083 {
0084     struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
0085                            rcu_head);
0086 
0087     tdp_mmu_free_sp(sp);
0088 }
0089 
0090 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
0091                  bool shared);
0092 
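/*
 * Workqueue callback that zaps a root under mmu_lock held for read and then
 * drops the reference that was gifted to the worker.
 */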
0093 static void tdp_mmu_zap_root_work(struct work_struct *work)
0094 {
0095     struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
0096                          tdp_mmu_async_work);
0097     struct kvm *kvm = root->tdp_mmu_async_data;
0098 
0099     read_lock(&kvm->mmu_lock);
0100 
0101     /*
0102      * A TLB flush is not necessary as KVM performs a local TLB flush when
0103      * allocating a new root (see kvm_mmu_load()), and when migrating a vCPU
0104      * to a different pCPU.  Note, the local TLB flush on reuse also
0105      * invalidates any paging-structure-cache entries, i.e. TLB entries for
0106      * intermediate paging structures, that may be zapped, as such entries
0107      * are associated with the ASID on both VMX and SVM.
0108      */
0109     tdp_mmu_zap_root(kvm, root, true);
0110 
0111     /*
0112      * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
0113      * avoiding an infinite loop.  By design, the root is reachable while
0114      * it's being asynchronously zapped, thus a different task can put its
0115      * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
0116      * asynchronously zapped root is unavoidable.
0117      */
0118     kvm_tdp_mmu_put_root(kvm, root, true);
0119 
0120     read_unlock(&kvm->mmu_lock);
0121 }
0122 
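/*
 * Queues asynchronous zapping of @root on the VM's dedicated workqueue; the
 * reference held by the caller is consumed by tdp_mmu_zap_root_work().
 */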
0123 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
0124 {
0125     root->tdp_mmu_async_data = kvm;
0126     INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
0127     queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
0128 }
0129 
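/*
 * Atomically marks @page invalid and returns true if it was already invalid.
 */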
0130 static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
0131 {
0132     union kvm_mmu_page_role role = page->role;
0133     role.invalid = true;
0134 
0135     /* No need to use cmpxchg, only the invalid bit can change.  */
0136     role.word = xchg(&page->role.word, role.word);
0137     return role.invalid;
0138 }
0139 
0140 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
0141               bool shared)
0142 {
0143     kvm_lockdep_assert_mmu_lock_held(kvm, shared);
0144 
0145     if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
0146         return;
0147 
0148     WARN_ON(!root->tdp_mmu_page);
0149 
0150     /*
0151      * The root now has refcount=0.  It is valid, but readers already
0152      * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
0153      * rejects it.  This remains true for the rest of the execution
0154      * of this function, because readers visit valid roots only
0155      * (except for tdp_mmu_zap_root_work(), which however
0156      * does not acquire any reference itself).
0157      *
0158      * Even though there are flows that need to visit all roots for
0159      * correctness, they all take mmu_lock for write, so they cannot yet
0160      * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
0161      * since the root still has refcount=0.
0162      *
0163      * However, tdp_mmu_zap_root can yield, and writers do not expect to
0164      * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
0165      * So the root temporarily gets an extra reference, going to refcount=1
0166      * while staying invalid.  Readers still cannot acquire any reference;
0167      * but writers are now allowed to run if tdp_mmu_zap_root yields and
0168      * they might take an extra reference if they themselves yield.
0169      * Therefore, when the reference is given back by the worker,
0170      * there is no guarantee that the refcount is still 1.  If not, whoever
0171      * puts the last reference will free the page, but they will not have to
0172      * zap the root because a root cannot go from invalid to valid.
0173      */
0174     if (!kvm_tdp_root_mark_invalid(root)) {
0175         refcount_set(&root->tdp_mmu_root_count, 1);
0176 
0177         /*
0178          * Zapping the root in a worker is not just "nice to have";
0179          * it is required because kvm_tdp_mmu_invalidate_all_roots()
0180          * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
0181          * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
0182          * might return with some roots not zapped yet.
0183          */
0184         tdp_mmu_schedule_zap_root(kvm, root);
0185         return;
0186     }
0187 
0188     spin_lock(&kvm->arch.tdp_mmu_pages_lock);
0189     list_del_rcu(&root->link);
0190     spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
0191     call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
0192 }
0193 
0194 /*
0195  * Returns the next root after @prev_root (or the first root if @prev_root is
0196  * NULL).  A reference to the returned root is acquired, and the reference to
0197  * @prev_root is released (the caller obviously must hold a reference to
0198  * @prev_root if it's non-NULL).
0199  *
0200  * If @only_valid is true, invalid roots are skipped.
0201  *
0202  * Returns NULL if the end of tdp_mmu_roots was reached.
0203  */
0204 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
0205                           struct kvm_mmu_page *prev_root,
0206                           bool shared, bool only_valid)
0207 {
0208     struct kvm_mmu_page *next_root;
0209 
0210     rcu_read_lock();
0211 
0212     if (prev_root)
0213         next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
0214                           &prev_root->link,
0215                           typeof(*prev_root), link);
0216     else
0217         next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
0218                            typeof(*next_root), link);
0219 
0220     while (next_root) {
0221         if ((!only_valid || !next_root->role.invalid) &&
0222             kvm_tdp_mmu_get_root(next_root))
0223             break;
0224 
0225         next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
0226                 &next_root->link, typeof(*next_root), link);
0227     }
0228 
0229     rcu_read_unlock();
0230 
0231     if (prev_root)
0232         kvm_tdp_mmu_put_root(kvm, prev_root, shared);
0233 
0234     return next_root;
0235 }
0236 
0237 /*
0238  * Note: this iterator gets and puts references to the roots it iterates over.
0239  * This makes it safe to release the MMU lock and yield within the loop, but
0240  * if exiting the loop early, the caller must drop the reference to the most
0241  * recent root. (Unless keeping a live reference is desirable.)
0242  *
0243  * If shared is set, this function is operating under the MMU lock in read
0244  * mode. In the unlikely event that this thread must free a root, the lock
0245  * will be temporarily dropped and reacquired in write mode.
0246  */
0247 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
0248     for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);   \
0249          _root;                             \
0250          _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))  \
0251         if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&      \
0252             kvm_mmu_page_as_id(_root) != _as_id) {          \
0253         } else
0254 
0255 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)    \
0256     __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
0257 
0258 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)           \
0259     __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
0260 
0261 /*
0262  * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
0263  * the implication being that any flow that holds mmu_lock for read is
0264  * inherently yield-friendly and should use the yield-safe variant above.
0265  * Holding mmu_lock for write obviates the need for RCU protection as the list
0266  * is guaranteed to be stable.
0267  */
0268 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)          \
0269     list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
0270         if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&    \
0271             kvm_mmu_page_as_id(_root) != _as_id) {      \
0272         } else
0273 
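/* Allocates a shadow page header and page table page from the vCPU's caches. */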
0274 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
0275 {
0276     struct kvm_mmu_page *sp;
0277 
0278     sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
0279     sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
0280 
0281     return sp;
0282 }
0283 
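/*
 * Initializes a freshly allocated TDP MMU shadow page: links the page table
 * page back to its kvm_mmu_page and records the role, gfn and parent SPTE.
 */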
0284 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
0285                 gfn_t gfn, union kvm_mmu_page_role role)
0286 {
0287     set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
0288 
0289     sp->role = role;
0290     sp->gfn = gfn;
0291     sp->ptep = sptep;
0292     sp->tdp_mmu_page = true;
0293 
0294     trace_kvm_mmu_get_page(sp, true);
0295 }
0296 
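/*
 * Initializes @child_sp as a child of the shadow page that owns the SPTE at
 * the iterator's current position, one level below the parent.
 */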
0297 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
0298                   struct tdp_iter *iter)
0299 {
0300     struct kvm_mmu_page *parent_sp;
0301     union kvm_mmu_page_role role;
0302 
0303     parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
0304 
0305     role = parent_sp->role;
0306     role.level--;
0307 
0308     tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
0309 }
0310 
0311 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
0312 {
0313     union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
0314     struct kvm *kvm = vcpu->kvm;
0315     struct kvm_mmu_page *root;
0316 
0317     lockdep_assert_held_write(&kvm->mmu_lock);
0318 
0319     /*
0320      * Check for an existing root before allocating a new one.  Note, the
0321      * role check prevents consuming an invalid root.
0322      */
0323     for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
0324         if (root->role.word == role.word &&
0325             kvm_tdp_mmu_get_root(root))
0326             goto out;
0327     }
0328 
0329     root = tdp_mmu_alloc_sp(vcpu);
0330     tdp_mmu_init_sp(root, NULL, 0, role);
0331 
0332     refcount_set(&root->tdp_mmu_root_count, 1);
0333 
0334     spin_lock(&kvm->arch.tdp_mmu_pages_lock);
0335     list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
0336     spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
0337 
0338 out:
0339     return __pa(root->spt);
0340 }
0341 
0342 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
0343                 u64 old_spte, u64 new_spte, int level,
0344                 bool shared);
0345 
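/*
 * Propagates the Accessed state of a zapped or modified leaf SPTE to the
 * backing pfn via kvm_set_pfn_accessed().
 */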
0346 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
0347 {
0348     if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
0349         return;
0350 
0351     if (is_accessed_spte(old_spte) &&
0352         (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
0353          spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
0354         kvm_set_pfn_accessed(spte_to_pfn(old_spte));
0355 }
0356 
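/*
 * Marks the gfn dirty in its memslot when a 4K SPTE gains write access, or is
 * changed to map a different, writable pfn.
 */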
0357 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
0358                       u64 old_spte, u64 new_spte, int level)
0359 {
0360     bool pfn_changed;
0361     struct kvm_memory_slot *slot;
0362 
0363     if (level > PG_LEVEL_4K)
0364         return;
0365 
0366     pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
0367 
0368     if ((!is_writable_pte(old_spte) || pfn_changed) &&
0369         is_writable_pte(new_spte)) {
0370         slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
0371         mark_page_dirty_in_slot(kvm, slot, gfn);
0372     }
0373 }
0374 
0375 /**
0376  * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
0377  *
0378  * @kvm: kvm instance
0379  * @sp: the page to be removed
0380  * @shared: This operation may not be running under the exclusive use of
0381  *      the MMU lock and the operation must synchronize with other
0382  *      threads that might be adding or removing pages.
0383  */
0384 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
0385                   bool shared)
0386 {
0387     if (shared)
0388         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
0389     else
0390         lockdep_assert_held_write(&kvm->mmu_lock);
0391 
0392     list_del(&sp->link);
0393     if (sp->lpage_disallowed)
0394         unaccount_huge_nx_page(kvm, sp);
0395 
0396     if (shared)
0397         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
0398 }
0399 
0400 /**
0401  * handle_removed_pt() - handle a page table removed from the TDP structure
0402  *
0403  * @kvm: kvm instance
0404  * @pt: the page removed from the paging structure
0405  * @shared: This operation may not be running under the exclusive use
0406  *      of the MMU lock and the operation must synchronize with other
0407  *      threads that might be modifying SPTEs.
0408  *
0409  * Given a page table that has been removed from the TDP paging structure,
0410  * iterates through the page table to clear SPTEs and free child page tables.
0411  *
0412  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
0413  * protection. Since this thread removed it from the paging structure,
0414  * this thread will be responsible for ensuring the page is freed. Hence the
0415  * early rcu_dereferences in the function.
0416  */
0417 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
0418 {
0419     struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
0420     int level = sp->role.level;
0421     gfn_t base_gfn = sp->gfn;
0422     int i;
0423 
0424     trace_kvm_mmu_prepare_zap_page(sp);
0425 
0426     tdp_mmu_unlink_sp(kvm, sp, shared);
0427 
0428     for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
0429         tdp_ptep_t sptep = pt + i;
0430         gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
0431         u64 old_spte;
0432 
0433         if (shared) {
0434             /*
0435              * Set the SPTE to a nonpresent value that other
0436              * threads will not overwrite. If the SPTE was
0437              * already marked as removed then another thread
0438              * handling a page fault could overwrite it, so
0439              * keep writing the SPTE until it transitions from some
0440              * other value to the removed SPTE value.
0441              */
0442             for (;;) {
0443                 old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
0444                 if (!is_removed_spte(old_spte))
0445                     break;
0446                 cpu_relax();
0447             }
0448         } else {
0449             /*
0450              * If the SPTE is not MMU-present, there is no backing
0451              * page associated with the SPTE and so no side effects
0452              * that need to be recorded, and exclusive ownership of
0453              * mmu_lock ensures the SPTE can't be made present.
0454              * Note, zapping MMIO SPTEs is also unnecessary as they
0455              * are guarded by the memslots generation, not by being
0456              * unreachable.
0457              */
0458             old_spte = kvm_tdp_mmu_read_spte(sptep);
0459             if (!is_shadow_present_pte(old_spte))
0460                 continue;
0461 
0462             /*
0463              * Use the common helper instead of a raw WRITE_ONCE as
0464              * the SPTE needs to be updated atomically if it can be
0465              * modified by a different vCPU outside of mmu_lock.
0466              * Even though the parent SPTE is !PRESENT, the TLB
0467              * hasn't yet been flushed, and both Intel and AMD
0468              * document that A/D assists can use upper-level PxE
0469              * entries that are cached in the TLB, i.e. the CPU can
0470              * still access the page and mark it dirty.
0471              *
0472              * No retry is needed in the atomic update path as the
0473              * sole concern is dropping a Dirty bit, i.e. no other
0474              * task can zap/remove the SPTE as mmu_lock is held for
0475              * write.  Marking the SPTE as a removed SPTE is not
0476              * strictly necessary for the same reason, but using
0477              * the removed SPTE value keeps the shared/exclusive
0478              * paths consistent and allows the handle_changed_spte()
0479              * call below to hardcode the new value to REMOVED_SPTE.
0480              *
0481              * Note, even though dropping a Dirty bit is the only
0482              * scenario where a non-atomic update could result in a
0483              * functional bug, simply checking the Dirty bit isn't
0484              * sufficient as a fast page fault could read the upper
0485              * level SPTE before it is zapped, and then make this
0486              * target SPTE writable, resume the guest, and set the
0487              * Dirty bit between reading the SPTE above and writing
0488              * it here.
0489              */
0490             old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
0491                               REMOVED_SPTE, level);
0492         }
0493         handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
0494                     old_spte, REMOVED_SPTE, level, shared);
0495     }
0496 
0497     call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
0498 }
0499 
0500 /**
0501  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
0502  * @kvm: kvm instance
0503  * @as_id: the address space of the paging structure the SPTE was a part of
0504  * @gfn: the base GFN that was mapped by the SPTE
0505  * @old_spte: The value of the SPTE before the change
0506  * @new_spte: The value of the SPTE after the change
0507  * @level: the level of the PT the SPTE is part of in the paging structure
0508  * @shared: This operation may not be running under the exclusive use of
0509  *      the MMU lock and the operation must synchronize with other
0510  *      threads that might be modifying SPTEs.
0511  *
0512  * Handle bookkeeping that might result from the modification of a SPTE.
0513  * This function must be called for all TDP SPTE modifications.
0514  */
0515 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
0516                   u64 old_spte, u64 new_spte, int level,
0517                   bool shared)
0518 {
0519     bool was_present = is_shadow_present_pte(old_spte);
0520     bool is_present = is_shadow_present_pte(new_spte);
0521     bool was_leaf = was_present && is_last_spte(old_spte, level);
0522     bool is_leaf = is_present && is_last_spte(new_spte, level);
0523     bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
0524 
0525     WARN_ON(level > PT64_ROOT_MAX_LEVEL);
0526     WARN_ON(level < PG_LEVEL_4K);
0527     WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
0528 
0529     /*
0530      * If this warning were to trigger it would indicate that there was a
0531      * missing MMU notifier or a race with some notifier handler.
0532      * A present, leaf SPTE should never be directly replaced with another
0533      * present leaf SPTE pointing to a different PFN. A notifier handler
0534      * should be zapping the SPTE before the main MM's page table is
0535      * changed, or the SPTE should be zeroed, and the TLBs flushed by the
0536      * thread before replacement.
0537      */
0538     if (was_leaf && is_leaf && pfn_changed) {
0539         pr_err("Invalid SPTE change: cannot replace a present leaf\n"
0540                "SPTE with another present leaf SPTE mapping a\n"
0541                "different PFN!\n"
0542                "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
0543                as_id, gfn, old_spte, new_spte, level);
0544 
0545         /*
0546          * Crash the host to prevent error propagation and guest data
0547          * corruption.
0548          */
0549         BUG();
0550     }
0551 
0552     if (old_spte == new_spte)
0553         return;
0554 
0555     trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
0556 
0557     if (is_leaf)
0558         check_spte_writable_invariants(new_spte);
0559 
0560     /*
0561      * The only times a SPTE should be changed from a non-present to
0562      * non-present state is when an MMIO entry is installed/modified/
0563      * removed. In that case, there is nothing to do here.
0564      */
0565     if (!was_present && !is_present) {
0566         /*
0567          * If this change does not involve an MMIO SPTE or a removed SPTE,
0568          * it is unexpected. Log the change, though it should not
0569          * impact the guest since both the former and current SPTEs
0570          * are nonpresent.
0571          */
0572         if (WARN_ON(!is_mmio_spte(old_spte) &&
0573                 !is_mmio_spte(new_spte) &&
0574                 !is_removed_spte(new_spte)))
0575             pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
0576                    "should not be replaced with another,\n"
0577                    "different nonpresent SPTE, unless one or both\n"
0578                    "are MMIO SPTEs, or the new SPTE is\n"
0579                    "a temporary removed SPTE.\n"
0580                    "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
0581                    as_id, gfn, old_spte, new_spte, level);
0582         return;
0583     }
0584 
0585     if (is_leaf != was_leaf)
0586         kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
0587 
0588     if (was_leaf && is_dirty_spte(old_spte) &&
0589         (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
0590         kvm_set_pfn_dirty(spte_to_pfn(old_spte));
0591 
0592     /*
0593      * Recursively handle child PTs if the change removed a subtree from
0594      * the paging structure.  Note the WARN on the PFN changing without the
0595      * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
0596      * pages are kernel allocations and should never be migrated.
0597      */
0598     if (was_present && !was_leaf &&
0599         (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
0600         handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
0601 }
0602 
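/*
 * Wrapper around __handle_changed_spte() that also performs the Accessed and
 * Dirty bookkeeping for the change.
 */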
0603 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
0604                 u64 old_spte, u64 new_spte, int level,
0605                 bool shared)
0606 {
0607     __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
0608                   shared);
0609     handle_changed_spte_acc_track(old_spte, new_spte, level);
0610     handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
0611                       new_spte, level);
0612 }
0613 
0614 /*
0615  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
0616  * and handle the associated bookkeeping.  Do not mark the page dirty
0617  * in KVM's dirty bitmaps.
0618  *
0619  * If setting the SPTE fails because it has changed, iter->old_spte will be
0620  * refreshed to the current value of the spte.
0621  *
0622  * @kvm: kvm instance
0623  * @iter: a tdp_iter instance currently on the SPTE that should be set
0624  * @new_spte: The value the SPTE should be set to
0625  * Return:
0626  * * 0      - If the SPTE was set.
0627  * * -EBUSY - If the SPTE cannot be set. In this case this function will have
0628  *            no side-effects other than setting iter->old_spte to the last
0629  *            known value of the spte.
0630  */
0631 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
0632                       struct tdp_iter *iter,
0633                       u64 new_spte)
0634 {
0635     u64 *sptep = rcu_dereference(iter->sptep);
0636 
0637     /*
0638      * The caller is responsible for ensuring the old SPTE is not a REMOVED
0639      * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
0640      * and pre-checking before inserting a new SPTE is advantageous as it
0641      * avoids unnecessary work.
0642      */
0643     WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
0644 
0645     lockdep_assert_held_read(&kvm->mmu_lock);
0646 
0647     /*
0648      * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
0649      * does not hold the mmu_lock.
0650      */
0651     if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
0652         return -EBUSY;
0653 
0654     __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
0655                   new_spte, iter->level, true);
0656     handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
0657 
0658     return 0;
0659 }
0660 
0661 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
0662                       struct tdp_iter *iter)
0663 {
0664     int ret;
0665 
0666     /*
0667      * Freeze the SPTE by setting it to a special,
0668      * non-present value. This will stop other threads from
0669      * immediately installing a present entry in its place
0670      * before the TLBs are flushed.
0671      */
0672     ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
0673     if (ret)
0674         return ret;
0675 
0676     kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
0677                        KVM_PAGES_PER_HPAGE(iter->level));
0678 
0679     /*
0680      * No other thread can overwrite the removed SPTE as they must either
0681      * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
0682      * overwrite the special removed SPTE value. No bookkeeping is needed
0683      * here since the SPTE is going from non-present to non-present.  Use
0684      * the raw write helper to avoid an unnecessary check on volatile bits.
0685      */
0686     __kvm_tdp_mmu_write_spte(iter->sptep, 0);
0687 
0688     return 0;
0689 }
0690 
0691 
0692 /*
0693  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
0694  * @kvm:          KVM instance
0695  * @as_id:        Address space ID, i.e. regular vs. SMM
0696  * @sptep:        Pointer to the SPTE
0697  * @old_spte:         The current value of the SPTE
0698  * @new_spte:         The new value that will be set for the SPTE
0699  * @gfn:          The base GFN that was (or will be) mapped by the SPTE
0700  * @level:        The level _containing_ the SPTE (its parent PT's level)
0701  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
0702  *            of the page. Should be set unless handling an MMU
0703  *            notifier for access tracking. Leaving record_acc_track
0704  *            unset in that case prevents page accesses from being
0705  *            double counted.
0706  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
0707  *            appropriate for the change being made. Should be set
0708  *            unless performing certain dirty logging operations.
0709  *            Leaving record_dirty_log unset in that case prevents page
0710  *            writes from being double counted.
0711  *
0712  * Returns the old SPTE value, which _may_ be different from @old_spte if the
0713  * SPTE had volatile bits.
0714  */
0715 static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
0716                   u64 old_spte, u64 new_spte, gfn_t gfn, int level,
0717                   bool record_acc_track, bool record_dirty_log)
0718 {
0719     lockdep_assert_held_write(&kvm->mmu_lock);
0720 
0721     /*
0722      * No thread should be using this function to set SPTEs to or from the
0723      * temporary removed SPTE value.
0724      * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
0725      * should be used. If operating under the MMU lock in write mode, the
0726      * use of the removed SPTE should not be necessary.
0727      */
0728     WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
0729 
0730     old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
0731 
0732     __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
0733 
0734     if (record_acc_track)
0735         handle_changed_spte_acc_track(old_spte, new_spte, level);
0736     if (record_dirty_log)
0737         handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
0738                           new_spte, level);
0739     return old_spte;
0740 }
0741 
0742 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
0743                      u64 new_spte, bool record_acc_track,
0744                      bool record_dirty_log)
0745 {
0746     WARN_ON_ONCE(iter->yielded);
0747 
0748     iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
0749                         iter->old_spte, new_spte,
0750                         iter->gfn, iter->level,
0751                         record_acc_track, record_dirty_log);
0752 }
0753 
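/*
 * Convenience wrappers around _tdp_mmu_set_spte(): the plain variant records
 * both accessed and dirty state; the _no_acc_track and _no_dirty_log variants
 * skip the respective bookkeeping (see __tdp_mmu_set_spte() above).
 */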
0754 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
0755                     u64 new_spte)
0756 {
0757     _tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
0758 }
0759 
0760 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
0761                          struct tdp_iter *iter,
0762                          u64 new_spte)
0763 {
0764     _tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
0765 }
0766 
0767 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
0768                          struct tdp_iter *iter,
0769                          u64 new_spte)
0770 {
0771     _tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
0772 }
0773 
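/*
 * Iterators over the SPTEs mapping [_start, _end): all SPTEs of a given root,
 * only the present leaf SPTEs of a root, or the SPTEs of the MMU's current
 * root.
 */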
0774 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
0775     for_each_tdp_pte(_iter, _root, _start, _end)
0776 
0777 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)  \
0778     tdp_root_for_each_pte(_iter, _root, _start, _end)       \
0779         if (!is_shadow_present_pte(_iter.old_spte) ||       \
0780             !is_last_spte(_iter.old_spte, _iter.level))     \
0781             continue;                   \
0782         else
0783 
0784 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)     \
0785     for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
0786 
0787 /*
0788  * Yield if the MMU lock is contended or this thread needs to return control
0789  * to the scheduler.
0790  *
0791  * If this function should yield and flush is set, it will perform a remote
0792  * TLB flush before yielding.
0793  *
0794  * If this function yields, iter->yielded is set and the caller must skip to
0795  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
0796  * over the paging structures to allow the iterator to continue its traversal
0797  * from the paging structure root.
0798  *
0799  * Returns true if this function yielded.
0800  */
0801 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
0802                               struct tdp_iter *iter,
0803                               bool flush, bool shared)
0804 {
0805     WARN_ON(iter->yielded);
0806 
0807     /* Ensure forward progress has been made before yielding. */
0808     if (iter->next_last_level_gfn == iter->yielded_gfn)
0809         return false;
0810 
0811     if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
0812         if (flush)
0813             kvm_flush_remote_tlbs(kvm);
0814 
0815         rcu_read_unlock();
0816 
0817         if (shared)
0818             cond_resched_rwlock_read(&kvm->mmu_lock);
0819         else
0820             cond_resched_rwlock_write(&kvm->mmu_lock);
0821 
0822         rcu_read_lock();
0823 
0824         WARN_ON(iter->gfn > iter->next_last_level_gfn);
0825 
0826         iter->yielded = true;
0827     }
0828 
0829     return iter->yielded;
0830 }
0831 
0832 static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
0833 {
0834     /*
0835      * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
0836      * a gpa range that would exceed the max gfn, and KVM does not create
0837      * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
0838      * the slow emulation path every time.
0839      */
0840     return kvm_mmu_max_gfn() + 1;
0841 }
0842 
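/*
 * Zaps all present SPTEs at @zap_level in @root, yielding and retrying as
 * needed; called twice by tdp_mmu_zap_root() to avoid RCU stalls (see below).
 */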
0843 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
0844                    bool shared, int zap_level)
0845 {
0846     struct tdp_iter iter;
0847 
0848     gfn_t end = tdp_mmu_max_gfn_exclusive();
0849     gfn_t start = 0;
0850 
0851     for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
0852 retry:
0853         if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
0854             continue;
0855 
0856         if (!is_shadow_present_pte(iter.old_spte))
0857             continue;
0858 
0859         if (iter.level > zap_level)
0860             continue;
0861 
0862         if (!shared)
0863             tdp_mmu_set_spte(kvm, &iter, 0);
0864         else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
0865             goto retry;
0866     }
0867 }
0868 
0869 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
0870                  bool shared)
0871 {
0872 
0873     /*
0874      * The root must have an elevated refcount so that it's reachable via
0875      * mmu_notifier callbacks, which allows this path to yield and drop
0876      * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
0877      * must drop all references to relevant pages prior to completing the
0878      * callback.  Dropping mmu_lock with an unreachable root would result
0879      * in zapping SPTEs after a relevant mmu_notifier callback completes
0880      * and lead to use-after-free as zapping a SPTE triggers "writeback" of
0881      * dirty accessed bits to the SPTE's associated struct page.
0882      */
0883     WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
0884 
0885     kvm_lockdep_assert_mmu_lock_held(kvm, shared);
0886 
0887     rcu_read_lock();
0888 
0889     /*
0890      * To avoid RCU stalls due to recursively removing huge swaths of SPs,
0891      * split the zap into two passes.  On the first pass, zap at the 1gb
0892      * level, and then zap top-level SPs on the second pass.  "1gb" is not
0893      * arbitrary, as KVM must be able to zap a 1gb shadow page without
0894      * inducing a stall to allow in-place replacement with a 1gb hugepage.
0895      *
0896      * Because zapping a SP recurses on its children, stepping down to
0897      * PG_LEVEL_4K in the iterator itself is unnecessary.
0898      */
0899     __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
0900     __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
0901 
0902     rcu_read_unlock();
0903 }
0904 
0905 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
0906 {
0907     u64 old_spte;
0908 
0909     /*
0910      * This helper intentionally doesn't allow zapping a root shadow page,
0911      * which doesn't have a parent page table and thus no associated entry.
0912      */
0913     if (WARN_ON_ONCE(!sp->ptep))
0914         return false;
0915 
0916     old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
0917     if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
0918         return false;
0919 
0920     __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
0921                sp->gfn, sp->role.level + 1, true, true);
0922 
0923     return true;
0924 }
0925 
0926 /*
0927  * If can_yield is true, will release the MMU lock and reschedule if the
0928  * scheduler needs the CPU or there is contention on the MMU lock. If this
0929  * function cannot yield, it will not release the MMU lock or reschedule and
0930  * the caller must ensure it does not supply too large a GFN range, or the
0931  * operation can cause a soft lockup.
0932  */
0933 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
0934                   gfn_t start, gfn_t end, bool can_yield, bool flush)
0935 {
0936     struct tdp_iter iter;
0937 
0938     end = min(end, tdp_mmu_max_gfn_exclusive());
0939 
0940     lockdep_assert_held_write(&kvm->mmu_lock);
0941 
0942     rcu_read_lock();
0943 
0944     for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
0945         if (can_yield &&
0946             tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
0947             flush = false;
0948             continue;
0949         }
0950 
0951         if (!is_shadow_present_pte(iter.old_spte) ||
0952             !is_last_spte(iter.old_spte, iter.level))
0953             continue;
0954 
0955         tdp_mmu_set_spte(kvm, &iter, 0);
0956         flush = true;
0957     }
0958 
0959     rcu_read_unlock();
0960 
0961     /*
0962      * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
0963      * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
0964      */
0965     return flush;
0966 }
0967 
0968 /*
0969  * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
0970  * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
0971  * more SPTEs were zapped since the MMU lock was last acquired.
0972  */
0973 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
0974                bool can_yield, bool flush)
0975 {
0976     struct kvm_mmu_page *root;
0977 
0978     for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
0979         flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
0980 
0981     return flush;
0982 }
0983 
0984 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
0985 {
0986     struct kvm_mmu_page *root;
0987     int i;
0988 
0989     /*
0990      * Zap all roots, including invalid roots, as all SPTEs must be dropped
0991      * before returning to the caller.  Zap directly even if the root is
0992      * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
0993      * all that expensive and mmu_lock is already held, which means the
0994      * worker has yielded, i.e. flushing the work instead of zapping here
0995      * isn't guaranteed to be any faster.
0996      *
0997      * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
0998      * is being destroyed or the userspace VMM has exited.  In both cases,
0999      * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
1000      */
1001     for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1002         for_each_tdp_mmu_root_yield_safe(kvm, root, i)
1003             tdp_mmu_zap_root(kvm, root, false);
1004     }
1005 }
1006 
1007 /*
1008  * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
1009  * zap" completes.
1010  */
1011 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
1012 {
1013     flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
1014 }
1015 
1016 /*
1017  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1018  * is about to be zapped, e.g. in response to a memslots update.  The actual
1019  * zapping is performed asynchronously, so a reference is taken on all roots.
1020  * Using a separate workqueue makes it easy to ensure that the destruction is
1021  * performed before the "fast zap" completes, without keeping a separate list
1022  * of invalidated roots; the list is effectively the list of work items in
1023  * the workqueue.
1024  *
1025  * Get a reference even if the root is already invalid, the asynchronous worker
1026  * assumes it was gifted a reference to the root it processes.  Because mmu_lock
1027  * is held for write, it should be impossible to observe a root with zero refcount,
1028  * i.e. the list of roots cannot be stale.
1029  *
1030  * This has essentially the same effect for the TDP MMU
1031  * as updating mmu_valid_gen does for the shadow MMU.
1032  */
1033 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
1034 {
1035     struct kvm_mmu_page *root;
1036 
1037     lockdep_assert_held_write(&kvm->mmu_lock);
1038     list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1039         if (!root->role.invalid &&
1040             !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
1041             root->role.invalid = true;
1042             tdp_mmu_schedule_zap_root(kvm, root);
1043         }
1044     }
1045 }
1046 
1047 /*
1048  * Installs a last-level SPTE to handle a TDP page fault.
1049  * (NPT/EPT violation/misconfiguration)
1050  */
1051 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1052                       struct kvm_page_fault *fault,
1053                       struct tdp_iter *iter)
1054 {
1055     struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1056     u64 new_spte;
1057     int ret = RET_PF_FIXED;
1058     bool wrprot = false;
1059 
1060     WARN_ON(sp->role.level != fault->goal_level);
1061     if (unlikely(!fault->slot))
1062         new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1063     else
1064         wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1065                      fault->pfn, iter->old_spte, fault->prefetch, true,
1066                      fault->map_writable, &new_spte);
1067 
1068     if (new_spte == iter->old_spte)
1069         ret = RET_PF_SPURIOUS;
1070     else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1071         return RET_PF_RETRY;
1072     else if (is_shadow_present_pte(iter->old_spte) &&
1073          !is_last_spte(iter->old_spte, iter->level))
1074         kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1075                            KVM_PAGES_PER_HPAGE(iter->level + 1));
1076 
1077     /*
1078      * If the page fault was caused by a write but the page is write
1079      * protected, emulation is needed. If the emulation was skipped,
1080      * the vCPU would have the same fault again.
1081      */
1082     if (wrprot) {
1083         if (fault->write)
1084             ret = RET_PF_EMULATE;
1085     }
1086 
1087     /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1088     if (unlikely(is_mmio_spte(new_spte))) {
1089         vcpu->stat.pf_mmio_spte_created++;
1090         trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1091                      new_spte);
1092         ret = RET_PF_EMULATE;
1093     } else {
1094         trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1095                        rcu_dereference(iter->sptep));
1096     }
1097 
1098     return ret;
1099 }
1100 
1101 /*
1102  * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1103  * provided page table.
1104  *
1105  * @kvm: kvm instance
1106  * @iter: a tdp_iter instance currently on the SPTE that should be set
1107  * @sp: The new TDP page table to install.
1108  * @account_nx: True if this page table is being installed to split a
1109  *              non-executable huge page.
1110  * @shared: This operation is running under the MMU lock in read mode.
1111  *
1112  * Returns: 0 if the new page table was installed. Non-0 if the page table
1113  *          could not be installed (e.g. the atomic compare-exchange failed).
1114  */
1115 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1116                struct kvm_mmu_page *sp, bool account_nx,
1117                bool shared)
1118 {
1119     u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
1120     int ret = 0;
1121 
1122     if (shared) {
1123         ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1124         if (ret)
1125             return ret;
1126     } else {
1127         tdp_mmu_set_spte(kvm, iter, spte);
1128     }
1129 
1130     spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1131     list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
1132     if (account_nx)
1133         account_huge_nx_page(kvm, sp);
1134     spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1135 
1136     return 0;
1137 }
1138 
1139 /*
1140  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1141  * page tables and SPTEs to translate the faulting guest physical address.
1142  */
1143 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1144 {
1145     struct kvm_mmu *mmu = vcpu->arch.mmu;
1146     struct tdp_iter iter;
1147     struct kvm_mmu_page *sp;
1148     int ret;
1149 
1150     kvm_mmu_hugepage_adjust(vcpu, fault);
1151 
1152     trace_kvm_mmu_spte_requested(fault);
1153 
1154     rcu_read_lock();
1155 
1156     tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1157         if (fault->nx_huge_page_workaround_enabled)
1158             disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1159 
1160         if (iter.level == fault->goal_level)
1161             break;
1162 
1163         /*
1164          * If there is an SPTE mapping a large page at a higher level
1165          * than the target, that SPTE must be cleared and replaced
1166          * with a non-leaf SPTE.
1167          */
1168         if (is_shadow_present_pte(iter.old_spte) &&
1169             is_large_pte(iter.old_spte)) {
1170             if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1171                 break;
1172 
1173             /*
1174              * The iter must explicitly re-read the spte here
1175              * because the new value informs the !present
1176              * path below.
1177              */
1178             iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
1179         }
1180 
1181         if (!is_shadow_present_pte(iter.old_spte)) {
1182             bool account_nx = fault->huge_page_disallowed &&
1183                       fault->req_level >= iter.level;
1184 
1185             /*
1186              * If SPTE has been frozen by another thread, just
1187              * give up and retry, avoiding unnecessary page table
1188              * allocation and free.
1189              */
1190             if (is_removed_spte(iter.old_spte))
1191                 break;
1192 
1193             sp = tdp_mmu_alloc_sp(vcpu);
1194             tdp_mmu_init_child_sp(sp, &iter);
1195 
1196             if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
1197                 tdp_mmu_free_sp(sp);
1198                 break;
1199             }
1200         }
1201     }
1202 
1203     /*
1204      * Force the guest to retry the access if the upper level SPTEs aren't
1205      * in place, or if the target leaf SPTE is frozen by another CPU.
1206      */
1207     if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
1208         rcu_read_unlock();
1209         return RET_PF_RETRY;
1210     }
1211 
1212     ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1213     rcu_read_unlock();
1214 
1215     return ret;
1216 }
1217 
1218 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1219                  bool flush)
1220 {
1221     return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
1222                      range->end, range->may_block, flush);
1223 }
1224 
1225 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1226                   struct kvm_gfn_range *range);
1227 
1228 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1229                            struct kvm_gfn_range *range,
1230                            tdp_handler_t handler)
1231 {
1232     struct kvm_mmu_page *root;
1233     struct tdp_iter iter;
1234     bool ret = false;
1235 
1236     /*
1237      * Don't support rescheduling, none of the MMU notifiers that funnel
1238      * into this helper allow blocking; it'd be dead, wasteful code.
1239      */
1240     for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1241         rcu_read_lock();
1242 
1243         tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1244             ret |= handler(kvm, &iter, range);
1245 
1246         rcu_read_unlock();
1247     }
1248 
1249     return ret;
1250 }
1251 
1252 /*
1253  * Mark the SPTEs mapping GFNs in [start, end) unaccessed and return non-zero
1254  * if any of the GFNs in the range have been accessed.
1255  */
1256 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1257               struct kvm_gfn_range *range)
1258 {
1259     u64 new_spte = 0;
1260 
1261     /* If we have a non-accessed entry we don't need to change the pte. */
1262     if (!is_accessed_spte(iter->old_spte))
1263         return false;
1264 
1265     new_spte = iter->old_spte;
1266 
1267     if (spte_ad_enabled(new_spte)) {
1268         new_spte &= ~shadow_accessed_mask;
1269     } else {
1270         /*
1271          * Capture the dirty status of the page, so that it doesn't get
1272          * lost when the SPTE is marked for access tracking.
1273          */
1274         if (is_writable_pte(new_spte))
1275             kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1276 
1277         new_spte = mark_spte_for_access_track(new_spte);
1278     }
1279 
1280     tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1281 
1282     return true;
1283 }
1284 
1285 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1286 {
1287     return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1288 }
1289 
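/* Returns true if the SPTE at the iterator's position is marked accessed. */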
1290 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1291              struct kvm_gfn_range *range)
1292 {
1293     return is_accessed_spte(iter->old_spte);
1294 }
1295 
1296 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1297 {
1298     return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1299 }
1300 
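/*
 * Zaps the existing 4K SPTE for the gfn and, if the new host PTE is read-only,
 * installs a replacement SPTE for the new pfn (see kvm_tdp_mmu_set_spte_gfn()).
 */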
1301 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1302              struct kvm_gfn_range *range)
1303 {
1304     u64 new_spte;
1305 
1306     /* Huge pages aren't expected to be modified without first being zapped. */
1307     WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1308 
1309     if (iter->level != PG_LEVEL_4K ||
1310         !is_shadow_present_pte(iter->old_spte))
1311         return false;
1312 
1313     /*
1314      * Note, when changing a read-only SPTE, it's not strictly necessary to
1315      * zero the SPTE before setting the new PFN, but doing so preserves the
1316      * invariant that the PFN of a present leaf SPTE can never change.
1317      * See __handle_changed_spte().
1318      */
1319     tdp_mmu_set_spte(kvm, iter, 0);
1320 
1321     if (!pte_write(range->pte)) {
1322         new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1323                                   pte_pfn(range->pte));
1324 
1325         tdp_mmu_set_spte(kvm, iter, new_spte);
1326     }
1327 
1328     return true;
1329 }
1330 
1331 /*
1332  * Handle the changed_pte MMU notifier for the TDP MMU.
1333  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1334  * notifier.
1335  * Returns non-zero if a flush is needed before releasing the MMU lock.
1336  */
1337 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1338 {
1339     /*
1340      * No need to handle the remote TLB flush under RCU protection, the
1341      * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1342      * shadow page.  See the WARN on pfn_changed in __handle_changed_spte().
1343      */
1344     return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1345 }
1346 
1347 /*
1348  * Remove write access from all SPTEs at or above min_level that map GFNs
1349  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1350  * be flushed.
1351  */
1352 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1353                  gfn_t start, gfn_t end, int min_level)
1354 {
1355     struct tdp_iter iter;
1356     u64 new_spte;
1357     bool spte_set = false;
1358 
1359     rcu_read_lock();
1360 
1361     BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1362 
1363     for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1364 retry:
1365         if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1366             continue;
1367 
1368         if (!is_shadow_present_pte(iter.old_spte) ||
1369             !is_last_spte(iter.old_spte, iter.level) ||
1370             !(iter.old_spte & PT_WRITABLE_MASK))
1371             continue;
1372 
1373         new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1374 
1375         if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1376             goto retry;
1377 
1378         spte_set = true;
1379     }
1380 
1381     rcu_read_unlock();
1382     return spte_set;
1383 }
1384 
1385 /*
1386  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1387  * only affect leaf SPTEs down to min_level.
1388  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1389  */
1390 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1391                  const struct kvm_memory_slot *slot, int min_level)
1392 {
1393     struct kvm_mmu_page *root;
1394     bool spte_set = false;
1395 
1396     lockdep_assert_held_read(&kvm->mmu_lock);
1397 
1398     for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1399         spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1400                  slot->base_gfn + slot->npages, min_level);
1401 
1402     return spte_set;
1403 }
1404 
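/*
 * Allocates a shadow page for huge page splitting directly from the slab and
 * page allocators with the given GFP flags, rather than from the vCPU caches.
 */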
1405 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1406 {
1407     struct kvm_mmu_page *sp;
1408 
1409     gfp |= __GFP_ZERO;
1410 
1411     sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1412     if (!sp)
1413         return NULL;
1414 
1415     sp->spt = (void *)__get_free_page(gfp);
1416     if (!sp->spt) {
1417         kmem_cache_free(mmu_page_header_cache, sp);
1418         return NULL;
1419     }
1420 
1421     return sp;
1422 }
1423 
1424 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1425                                struct tdp_iter *iter,
1426                                bool shared)
1427 {
1428     struct kvm_mmu_page *sp;
1429 
1430     /*
1431      * Since we are allocating while under the MMU lock we have to be
1432      * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1433      * reclaim and to avoid making any filesystem callbacks (which can end
1434      * up invoking KVM MMU notifiers, resulting in a deadlock).
1435      *
1436      * If this allocation fails we drop the lock and retry with reclaim
1437      * allowed.
1438      */
1439     sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1440     if (sp)
1441         return sp;
1442 
1443     rcu_read_unlock();
1444 
1445     if (shared)
1446         read_unlock(&kvm->mmu_lock);
1447     else
1448         write_unlock(&kvm->mmu_lock);
1449 
1450     iter->yielded = true;
1451     sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1452 
1453     if (shared)
1454         read_lock(&kvm->mmu_lock);
1455     else
1456         write_lock(&kvm->mmu_lock);
1457 
1458     rcu_read_lock();
1459 
1460     return sp;
1461 }
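
/*
 * The allocation pattern above generalizes: opportunistically allocate
 * with GFP_NOWAIT while the (non-sleepable) lock is held, and only on
 * failure drop the lock, allocate with reclaim allowed, re-take the lock
 * and tell the caller its iteration state is stale.  A minimal sketch of
 * the same shape with illustrative names:
 */
struct example_ctx {
    rwlock_t lock;          /* stand-in for kvm->mmu_lock */
    bool restart_needed;    /* stand-in for iter->yielded */
};

static void *example_alloc_under_lock(struct example_ctx *ctx)
{
    void *obj = kzalloc(64, GFP_NOWAIT | __GFP_ACCOUNT);

    if (obj)
        return obj;

    /* Slow path: drop the lock so the allocation may block and reclaim. */
    read_unlock(&ctx->lock);
    obj = kzalloc(64, GFP_KERNEL_ACCOUNT);
    read_lock(&ctx->lock);

    /* Anything derived from the previous critical section is now stale. */
    ctx->restart_needed = true;
    return obj;
}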
1462 
1463 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1464                    struct kvm_mmu_page *sp, bool shared)
1465 {
1466     const u64 huge_spte = iter->old_spte;
1467     const int level = iter->level;
1468     int ret, i;
1469 
1470     tdp_mmu_init_child_sp(sp, iter);
1471 
1472     /*
1473      * No need for atomics when writing to sp->spt since the page table has
1474      * not been linked in yet and thus is not reachable from any other CPU.
1475      */
1476     for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1477         sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
1478 
1479     /*
1480      * Replace the huge spte with a pointer to the populated lower level
1481      * page table. Since we are making this change without a TLB flush, vCPUs
1482      * will see a mix of the split mappings and the original huge mapping,
1483      * depending on what's currently in their TLB. This is fine from a
1484      * correctness standpoint since the translation will be the same either
1485      * way.
1486      */
1487     ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
1488     if (ret)
1489         goto out;
1490 
1491     /*
1492      * tdp_mmu_link_sp() will handle subtracting the huge page we
1493      * are overwriting from the page stats. But we have to manually update
1494      * the page stats with the new present child pages.
1495      */
1496     kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1497 
1498 out:
1499     trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1500     return ret;
1501 }
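
/*
 * The address math behind the split: a huge SPTE at @level covers
 * KVM_PAGES_PER_HPAGE(level) 4K pages, and each of the SPTE_ENT_PER_PAGE
 * child entries installed above covers 1/512th of that range.  The real
 * SPTE bits are built by make_huge_page_split_spte() (not shown); this
 * sketch only illustrates which GFN a given child index translates, e.g.
 * splitting a 1GB SPTE gives child i a base of huge_gfn + i * 512.
 */
static gfn_t example_child_base_gfn(gfn_t huge_gfn, int level, int index)
{
    /* 4K pages covered by one child entry, i.e. by one SPTE at level - 1. */
    unsigned long pages_per_child = KVM_PAGES_PER_HPAGE(level - 1);

    return huge_gfn + (gfn_t)index * pages_per_child;
}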
1502 
1503 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1504                      struct kvm_mmu_page *root,
1505                      gfn_t start, gfn_t end,
1506                      int target_level, bool shared)
1507 {
1508     struct kvm_mmu_page *sp = NULL;
1509     struct tdp_iter iter;
1510     int ret = 0;
1511 
1512     rcu_read_lock();
1513 
1514     /*
1515      * Traverse the page table splitting all huge pages above the target
1516      * level into one lower level. For example, if we encounter a 1GB page
1517      * we split it into 512 2MB pages.
1518      *
1519      * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1520      * to visit an SPTE before ever visiting its children, which means we
1521      * will correctly recursively split huge pages that are more than one
1522      * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1523      * and then splitting each of those to 512 4KB pages).
1524      */
1525     for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1526 retry:
1527         if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1528             continue;
1529 
1530         if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1531             continue;
1532 
1533         if (!sp) {
1534             sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1535             if (!sp) {
1536                 ret = -ENOMEM;
1537                 trace_kvm_mmu_split_huge_page(iter.gfn,
1538                                   iter.old_spte,
1539                                   iter.level, ret);
1540                 break;
1541             }
1542 
1543             if (iter.yielded)
1544                 continue;
1545         }
1546 
1547         if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1548             goto retry;
1549 
1550         sp = NULL;
1551     }
1552 
1553     rcu_read_unlock();
1554 
1555     /*
1556      * It's possible to exit the loop having never used the last sp if, for
1557      * example, a vCPU doing NX huge page splitting wins the race and
1558      * installs its own sp in place of the last sp we tried to split.
1559      */
1560     if (sp)
1561         tdp_mmu_free_sp(sp);
1562 
1563     return ret;
1564 }
1565 
1566 
1567 /*
1568  * Try to split all huge pages mapped by the TDP MMU down to the target level.
1569  */
1570 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1571                       const struct kvm_memory_slot *slot,
1572                       gfn_t start, gfn_t end,
1573                       int target_level, bool shared)
1574 {
1575     struct kvm_mmu_page *root;
1576     int r = 0;
1577 
1578     kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1579 
1580     for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1581         r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1582         if (r) {
1583             kvm_tdp_mmu_put_root(kvm, root, shared);
1584             break;
1585         }
1586     }
1587 }
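
/*
 * A sketch of how eager page splitting might use the helper above when
 * dirty logging is turned on for a slot: split everything down to 4K so
 * that the first guest write to each page does not have to split a huge
 * page in the fault path.  The shared-lock usage shown here is one valid
 * choice, not necessarily the only caller convention.
 */
static void example_eager_split(struct kvm *kvm,
                const struct kvm_memory_slot *slot)
{
    if (!is_tdp_mmu_enabled(kvm))
        return;

    read_lock(&kvm->mmu_lock);
    kvm_tdp_mmu_try_split_huge_pages(kvm, slot, slot->base_gfn,
                     slot->base_gfn + slot->npages,
                     PG_LEVEL_4K, true);
    read_unlock(&kvm->mmu_lock);
}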
1588 
1589 /*
1590  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1591  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1592  * If AD bits are not enabled, this will require clearing the writable bit on
1593  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1594  * be flushed.
1595  */
1596 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1597                gfn_t start, gfn_t end)
1598 {
1599     struct tdp_iter iter;
1600     u64 new_spte;
1601     bool spte_set = false;
1602 
1603     rcu_read_lock();
1604 
1605     tdp_root_for_each_leaf_pte(iter, root, start, end) {
1606 retry:
1607         if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1608             continue;
1609 
1610         if (!is_shadow_present_pte(iter.old_spte))
1611             continue;
1612 
1613         if (spte_ad_need_write_protect(iter.old_spte)) {
1614             if (is_writable_pte(iter.old_spte))
1615                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1616             else
1617                 continue;
1618         } else {
1619             if (iter.old_spte & shadow_dirty_mask)
1620                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1621             else
1622                 continue;
1623         }
1624 
1625         if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1626             goto retry;
1627 
1628         spte_set = true;
1629     }
1630 
1631     rcu_read_unlock();
1632     return spte_set;
1633 }
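
/*
 * The per-SPTE decision above, factored out for clarity: with A/D bits the
 * dirty bit is cleared in place; without them (or when write protection is
 * explicitly requested) the writable bit is cleared so the next write
 * faults and re-dirties the page.  Purely illustrative; the in-tree code
 * open-codes this in both dirty-clearing loops.
 */
static bool example_make_clean_spte(u64 old_spte, bool force_wrprot,
                    u64 *new_spte)
{
    if (force_wrprot || spte_ad_need_write_protect(old_spte)) {
        if (!is_writable_pte(old_spte))
            return false;   /* already clean for logging purposes */
        *new_spte = old_spte & ~PT_WRITABLE_MASK;
    } else {
        if (!(old_spte & shadow_dirty_mask))
            return false;
        *new_spte = old_spte & ~shadow_dirty_mask;
    }

    return true;
}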
1634 
1635 /*
1636  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1637  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1638  * If AD bits are not enabled, this will require clearing the writable bit on
1639  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1640  * be flushed.
1641  */
1642 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1643                   const struct kvm_memory_slot *slot)
1644 {
1645     struct kvm_mmu_page *root;
1646     bool spte_set = false;
1647 
1648     lockdep_assert_held_read(&kvm->mmu_lock);
1649 
1650     for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1651         spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1652                 slot->base_gfn + slot->npages);
1653 
1654     return spte_set;
1655 }
1656 
1657 /*
1658  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1659  * set in mask, starting at gfn. The given memslot is expected to contain all
1660  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1661  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1662  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1663  */
1664 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1665                   gfn_t gfn, unsigned long mask, bool wrprot)
1666 {
1667     struct tdp_iter iter;
1668     u64 new_spte;
1669 
1670     rcu_read_lock();
1671 
1672     tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1673                     gfn + BITS_PER_LONG) {
1674         if (!mask)
1675             break;
1676 
1677         if (iter.level > PG_LEVEL_4K ||
1678             !(mask & (1UL << (iter.gfn - gfn))))
1679             continue;
1680 
1681         mask &= ~(1UL << (iter.gfn - gfn));
1682 
1683         if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1684             if (is_writable_pte(iter.old_spte))
1685                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1686             else
1687                 continue;
1688         } else {
1689             if (iter.old_spte & shadow_dirty_mask)
1690                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1691             else
1692                 continue;
1693         }
1694 
1695         tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1696     }
1697 
1698     rcu_read_unlock();
1699 }
1700 
1701 /*
1702  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1703  * set in mask, starting at gfn. The given memslot is expected to contain all
1704  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1705  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1706  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1707  */
1708 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1709                        struct kvm_memory_slot *slot,
1710                        gfn_t gfn, unsigned long mask,
1711                        bool wrprot)
1712 {
1713     struct kvm_mmu_page *root;
1714 
1715     lockdep_assert_held_write(&kvm->mmu_lock);
1716     for_each_tdp_mmu_root(kvm, root, slot->as_id)
1717         clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1718 }
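
/*
 * The (gfn, mask) encoding used above: bit N of @mask selects gfn + N, so
 * one call covers at most BITS_PER_LONG consecutive GFNs.  A sketch of
 * walking the set bits directly (the in-tree loop instead lets the TDP
 * iterator visit the range, tests each visited GFN against the mask, and
 * clears bits as they are consumed so it can stop early):
 */
static void example_walk_dirty_mask(gfn_t gfn, unsigned long mask)
{
    while (mask) {
        gfn_t target = gfn + __ffs(mask);   /* lowest set bit */

        /* ... clear the dirty state of the SPTE mapping target ... */

        mask &= mask - 1;                   /* consume that bit */
    }
}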
1719 
1720 static void zap_collapsible_spte_range(struct kvm *kvm,
1721                        struct kvm_mmu_page *root,
1722                        const struct kvm_memory_slot *slot)
1723 {
1724     gfn_t start = slot->base_gfn;
1725     gfn_t end = start + slot->npages;
1726     struct tdp_iter iter;
1727     int max_mapping_level;
1728 
1729     rcu_read_lock();
1730 
1731     for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
1732 retry:
1733         if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1734             continue;
1735 
1736         if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1737             !is_shadow_present_pte(iter.old_spte))
1738             continue;
1739 
1740         /*
1741          * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
1742          * a larger page size, then its parent would have been zapped
1743          * instead of stepping down.
1744          */
1745         if (is_last_spte(iter.old_spte, iter.level))
1746             continue;
1747 
1748         /*
1749          * If iter.gfn resides outside of the slot, i.e. the page for
1750          * the current level overlaps but is not contained by the slot,
1751          * then the SPTE can't be made huge.  More importantly, trying
1752          * to query that info from slot->arch.lpage_info will cause an
1753          * out-of-bounds access.
1754          */
1755         if (iter.gfn < start || iter.gfn >= end)
1756             continue;
1757 
1758         max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
1759                                   iter.gfn, PG_LEVEL_NUM);
1760         if (max_mapping_level < iter.level)
1761             continue;
1762 
1763         /* Note, a successful atomic zap also does a remote TLB flush. */
1764         if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1765             goto retry;
1766     }
1767 
1768     rcu_read_unlock();
1769 }
1770 
1771 /*
1772  * Zap non-leaf SPTEs (and free their associated page tables) which could
1773  * be replaced by huge pages, for GFNs within the slot.
1774  */
1775 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1776                        const struct kvm_memory_slot *slot)
1777 {
1778     struct kvm_mmu_page *root;
1779 
1780     lockdep_assert_held_read(&kvm->mmu_lock);
1781 
1782     for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1783         zap_collapsible_spte_range(kvm, root, slot);
1784 }
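
/*
 * A sketch of when the helper above would be used: after dirty logging is
 * disabled for a slot, the 4K mappings created for logging are no longer
 * needed, so the page tables covering them are zapped and the next fault
 * can reinstall a huge mapping.  The shared-lock usage matches the lockdep
 * assertion above; the exact call site in mmu.c may differ.
 */
static void example_collapse_after_logging(struct kvm *kvm,
                       const struct kvm_memory_slot *slot)
{
    if (!is_tdp_mmu_enabled(kvm))
        return;

    read_lock(&kvm->mmu_lock);
    kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
    read_unlock(&kvm->mmu_lock);
}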
1785 
1786 /*
1787  * Removes write access on the last level SPTE mapping this GFN and unsets the
1788  * MMU-writable bit to ensure future writes continue to be intercepted.
1789  * Returns true if an SPTE was set and a TLB flush is needed.
1790  */
1791 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1792                   gfn_t gfn, int min_level)
1793 {
1794     struct tdp_iter iter;
1795     u64 new_spte;
1796     bool spte_set = false;
1797 
1798     BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1799 
1800     rcu_read_lock();
1801 
1802     for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1803         if (!is_shadow_present_pte(iter.old_spte) ||
1804             !is_last_spte(iter.old_spte, iter.level))
1805             continue;
1806 
1807         new_spte = iter.old_spte &
1808             ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1809 
1810         if (new_spte == iter.old_spte)
1811             break;
1812 
1813         tdp_mmu_set_spte(kvm, &iter, new_spte);
1814         spte_set = true;
1815     }
1816 
1817     rcu_read_unlock();
1818 
1819     return spte_set;
1820 }
1821 
1822 /*
1823  * Removes write access on the last level SPTE mapping this GFN and unsets the
1824  * MMU-writable bit to ensure future writes continue to be intercepted.
1825  * Returns true if an SPTE was set and a TLB flush is needed.
1826  */
1827 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1828                    struct kvm_memory_slot *slot, gfn_t gfn,
1829                    int min_level)
1830 {
1831     struct kvm_mmu_page *root;
1832     bool spte_set = false;
1833 
1834     lockdep_assert_held_write(&kvm->mmu_lock);
1835     for_each_tdp_mmu_root(kvm, root, slot->as_id)
1836         spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1837 
1838     return spte_set;
1839 }
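
/*
 * A sketch of single-GFN write protection, e.g. to intercept writes to a
 * guest page whose contents KVM must track.  Per the lockdep assertion
 * above this runs under the exclusive mmu_lock; kvm_flush_remote_tlbs()
 * is assumed to be an acceptable flush primitive for the caller.
 */
static void example_write_protect_gfn(struct kvm *kvm,
                      struct kvm_memory_slot *slot, gfn_t gfn)
{
    bool flush;

    write_lock(&kvm->mmu_lock);
    flush = kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, PG_LEVEL_4K);
    write_unlock(&kvm->mmu_lock);

    if (flush)
        kvm_flush_remote_tlbs(kvm);
}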
1840 
1841 /*
1842  * Return the level of the lowest level SPTE added to sptes.
1843  * That SPTE may be non-present.
1844  *
1845  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1846  */
1847 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1848              int *root_level)
1849 {
1850     struct tdp_iter iter;
1851     struct kvm_mmu *mmu = vcpu->arch.mmu;
1852     gfn_t gfn = addr >> PAGE_SHIFT;
1853     int leaf = -1;
1854 
1855     *root_level = vcpu->arch.mmu->root_role.level;
1856 
1857     tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1858         leaf = iter.level;
1859         sptes[leaf] = iter.old_spte;
1860     }
1861 
1862     return leaf;
1863 }
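
/*
 * A sketch of using the walk above to capture the SPTEs translating one
 * guest physical address, e.g. while diagnosing an MMIO fault.  It is
 * assumed that the kvm_tdp_mmu_walk_lockless_{begin,end}() helpers named
 * in the contract take no arguments and provide the RCU protection, and
 * that PT64_ROOT_MAX_LEVEL bounds the deepest possible walk.
 */
static void example_dump_walk(struct kvm_vcpu *vcpu, u64 gpa)
{
    u64 sptes[PT64_ROOT_MAX_LEVEL + 1] = {};
    int root_level, leaf, level;

    kvm_tdp_mmu_walk_lockless_begin();
    leaf = kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, &root_level);
    kvm_tdp_mmu_walk_lockless_end();

    if (leaf < 0)
        return;     /* no SPTEs were visited for this address */

    for (level = root_level; level >= leaf; level--)
        pr_debug("level %d spte 0x%llx\n", level, sptes[level]);
}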
1864 
1865 /*
1866  * Returns the last level spte pointer of the shadow page walk for the given
1867  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1868  * walk could be performed, returns NULL and *spte does not contain valid data.
1869  *
1870  * Contract:
1871  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1872  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1873  *
1874  * WARNING: This function is only intended to be called during fast_page_fault.
1875  */
1876 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1877                     u64 *spte)
1878 {
1879     struct tdp_iter iter;
1880     struct kvm_mmu *mmu = vcpu->arch.mmu;
1881     gfn_t gfn = addr >> PAGE_SHIFT;
1882     tdp_ptep_t sptep = NULL;
1883 
1884     tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1885         *spte = iter.old_spte;
1886         sptep = iter.sptep;
1887     }
1888 
1889     /*
1890      * Perform the rcu_dereference to get the raw spte pointer value since
1891      * we are passing it up to fast_page_fault, which is shared with the
1892      * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1893      * annotation.
1894      *
1895      * This is safe since fast_page_fault obeys the contracts of this
1896      * function as well as all TDP MMU contracts around modifying SPTEs
1897      * outside of mmu_lock.
1898      */
1899     return rcu_dereference(sptep);
1900 }