Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0
0002 
0003 #include <linux/objtool.h>
0004 #include <linux/percpu.h>
0005 
0006 #include <asm/debugreg.h>
0007 #include <asm/mmu_context.h>
0008 
0009 #include "cpuid.h"
0010 #include "evmcs.h"
0011 #include "hyperv.h"
0012 #include "mmu.h"
0013 #include "nested.h"
0014 #include "pmu.h"
0015 #include "sgx.h"
0016 #include "trace.h"
0017 #include "vmx.h"
0018 #include "x86.h"
0019 
0020 static bool __read_mostly enable_shadow_vmcs = 1;
0021 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
0022 
0023 static bool __read_mostly nested_early_check = 0;
0024 module_param(nested_early_check, bool, S_IRUGO);
0025 
0026 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
0027 
0028 /*
0029  * Hyper-V requires all of these, so mark them as supported even though
0030  * they are just treated the same as all-context.
0031  */
0032 #define VMX_VPID_EXTENT_SUPPORTED_MASK      \
0033     (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |  \
0034     VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |    \
0035     VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |    \
0036     VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
0037 
0038 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
0039 
0040 enum {
0041     VMX_VMREAD_BITMAP,
0042     VMX_VMWRITE_BITMAP,
0043     VMX_BITMAP_NR
0044 };
0045 static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
0046 
0047 #define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
0048 #define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
0049 
0050 struct shadow_vmcs_field {
0051     u16 encoding;
0052     u16 offset;
0053 };
0054 static struct shadow_vmcs_field shadow_read_only_fields[] = {
0055 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
0056 #include "vmcs_shadow_fields.h"
0057 };
0058 static int max_shadow_read_only_fields =
0059     ARRAY_SIZE(shadow_read_only_fields);
0060 
0061 static struct shadow_vmcs_field shadow_read_write_fields[] = {
0062 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
0063 #include "vmcs_shadow_fields.h"
0064 };
0065 static int max_shadow_read_write_fields =
0066     ARRAY_SIZE(shadow_read_write_fields);
0067 
0068 static void init_vmcs_shadow_fields(void)
0069 {
0070     int i, j;
0071 
0072     memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
0073     memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
0074 
0075     for (i = j = 0; i < max_shadow_read_only_fields; i++) {
0076         struct shadow_vmcs_field entry = shadow_read_only_fields[i];
0077         u16 field = entry.encoding;
0078 
0079         if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
0080             (i + 1 == max_shadow_read_only_fields ||
0081              shadow_read_only_fields[i + 1].encoding != field + 1))
0082             pr_err("Missing field from shadow_read_only_field %x\n",
0083                    field + 1);
0084 
0085         clear_bit(field, vmx_vmread_bitmap);
0086         if (field & 1)
0087 #ifdef CONFIG_X86_64
0088             continue;
0089 #else
0090             entry.offset += sizeof(u32);
0091 #endif
0092         shadow_read_only_fields[j++] = entry;
0093     }
0094     max_shadow_read_only_fields = j;
0095 
0096     for (i = j = 0; i < max_shadow_read_write_fields; i++) {
0097         struct shadow_vmcs_field entry = shadow_read_write_fields[i];
0098         u16 field = entry.encoding;
0099 
0100         if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
0101             (i + 1 == max_shadow_read_write_fields ||
0102              shadow_read_write_fields[i + 1].encoding != field + 1))
0103             pr_err("Missing field from shadow_read_write_field %x\n",
0104                    field + 1);
0105 
0106         WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
0107               field <= GUEST_TR_AR_BYTES,
0108               "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
0109 
0110         /*
0111          * PML and the preemption timer can be emulated, but the
0112          * processor cannot vmwrite to fields that don't exist
0113          * on bare metal.
0114          */
0115         switch (field) {
0116         case GUEST_PML_INDEX:
0117             if (!cpu_has_vmx_pml())
0118                 continue;
0119             break;
0120         case VMX_PREEMPTION_TIMER_VALUE:
0121             if (!cpu_has_vmx_preemption_timer())
0122                 continue;
0123             break;
0124         case GUEST_INTR_STATUS:
0125             if (!cpu_has_vmx_apicv())
0126                 continue;
0127             break;
0128         default:
0129             break;
0130         }
0131 
0132         clear_bit(field, vmx_vmwrite_bitmap);
0133         clear_bit(field, vmx_vmread_bitmap);
0134         if (field & 1)
0135 #ifdef CONFIG_X86_64
0136             continue;
0137 #else
0138             entry.offset += sizeof(u32);
0139 #endif
0140         shadow_read_write_fields[j++] = entry;
0141     }
0142     max_shadow_read_write_fields = j;
0143 }
0144 
0145 /*
0146  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
0147  * set the success or error code of an emulated VMX instruction (as specified
0148  * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
0149  * instruction.
0150  */
0151 static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
0152 {
0153     vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
0154             & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
0155                 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
0156     return kvm_skip_emulated_instruction(vcpu);
0157 }
0158 
0159 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
0160 {
0161     vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
0162             & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
0163                 X86_EFLAGS_SF | X86_EFLAGS_OF))
0164             | X86_EFLAGS_CF);
0165     return kvm_skip_emulated_instruction(vcpu);
0166 }
0167 
0168 static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
0169                 u32 vm_instruction_error)
0170 {
0171     vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
0172             & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
0173                 X86_EFLAGS_SF | X86_EFLAGS_OF))
0174             | X86_EFLAGS_ZF);
0175     get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
0176     /*
0177      * We don't need to force sync to shadow VMCS because
0178      * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
0179      * fields and thus must be synced.
0180      */
0181     if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
0182         to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;
0183 
0184     return kvm_skip_emulated_instruction(vcpu);
0185 }
0186 
0187 static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
0188 {
0189     struct vcpu_vmx *vmx = to_vmx(vcpu);
0190 
0191     /*
0192      * failValid writes the error number to the current VMCS, which
0193      * can't be done if there isn't a current VMCS.
0194      */
0195     if (vmx->nested.current_vmptr == INVALID_GPA &&
0196         !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
0197         return nested_vmx_failInvalid(vcpu);
0198 
0199     return nested_vmx_failValid(vcpu, vm_instruction_error);
0200 }
0201 
0202 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
0203 {
0204     /* TODO: not to reset guest simply here. */
0205     kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
0206     pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
0207 }
0208 
0209 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
0210 {
0211     return fixed_bits_valid(control, low, high);
0212 }
0213 
0214 static inline u64 vmx_control_msr(u32 low, u32 high)
0215 {
0216     return low | ((u64)high << 32);
0217 }
0218 
0219 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
0220 {
0221     secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
0222     vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
0223     vmx->nested.need_vmcs12_to_shadow_sync = false;
0224 }
0225 
0226 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
0227 {
0228     struct vcpu_vmx *vmx = to_vmx(vcpu);
0229 
0230     if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
0231         kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
0232         vmx->nested.hv_evmcs = NULL;
0233     }
0234 
0235     vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
0236 }
0237 
0238 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
0239                      struct loaded_vmcs *prev)
0240 {
0241     struct vmcs_host_state *dest, *src;
0242 
0243     if (unlikely(!vmx->guest_state_loaded))
0244         return;
0245 
0246     src = &prev->host_state;
0247     dest = &vmx->loaded_vmcs->host_state;
0248 
0249     vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
0250     dest->ldt_sel = src->ldt_sel;
0251 #ifdef CONFIG_X86_64
0252     dest->ds_sel = src->ds_sel;
0253     dest->es_sel = src->es_sel;
0254 #endif
0255 }
0256 
0257 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
0258 {
0259     struct vcpu_vmx *vmx = to_vmx(vcpu);
0260     struct loaded_vmcs *prev;
0261     int cpu;
0262 
0263     if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
0264         return;
0265 
0266     cpu = get_cpu();
0267     prev = vmx->loaded_vmcs;
0268     vmx->loaded_vmcs = vmcs;
0269     vmx_vcpu_load_vmcs(vcpu, cpu, prev);
0270     vmx_sync_vmcs_host_state(vmx, prev);
0271     put_cpu();
0272 
0273     vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;
0274 
0275     /*
0276      * All lazily updated registers will be reloaded from VMCS12 on both
0277      * vmentry and vmexit.
0278      */
0279     vcpu->arch.regs_dirty = 0;
0280 }
0281 
0282 /*
0283  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
0284  * just stops using VMX.
0285  */
0286 static void free_nested(struct kvm_vcpu *vcpu)
0287 {
0288     struct vcpu_vmx *vmx = to_vmx(vcpu);
0289 
0290     if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
0291         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
0292 
0293     if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
0294         return;
0295 
0296     kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
0297 
0298     vmx->nested.vmxon = false;
0299     vmx->nested.smm.vmxon = false;
0300     vmx->nested.vmxon_ptr = INVALID_GPA;
0301     free_vpid(vmx->nested.vpid02);
0302     vmx->nested.posted_intr_nv = -1;
0303     vmx->nested.current_vmptr = INVALID_GPA;
0304     if (enable_shadow_vmcs) {
0305         vmx_disable_shadow_vmcs(vmx);
0306         vmcs_clear(vmx->vmcs01.shadow_vmcs);
0307         free_vmcs(vmx->vmcs01.shadow_vmcs);
0308         vmx->vmcs01.shadow_vmcs = NULL;
0309     }
0310     kfree(vmx->nested.cached_vmcs12);
0311     vmx->nested.cached_vmcs12 = NULL;
0312     kfree(vmx->nested.cached_shadow_vmcs12);
0313     vmx->nested.cached_shadow_vmcs12 = NULL;
0314     /*
0315      * Unpin physical memory we referred to in the vmcs02.  The APIC access
0316      * page's backing page (yeah, confusing) shouldn't actually be accessed,
0317      * and if it is written, the contents are irrelevant.
0318      */
0319     kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
0320     kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
0321     kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
0322     vmx->nested.pi_desc = NULL;
0323 
0324     kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
0325 
0326     nested_release_evmcs(vcpu);
0327 
0328     free_loaded_vmcs(&vmx->nested.vmcs02);
0329 }
0330 
0331 /*
0332  * Ensure that the current vmcs of the logical processor is the
0333  * vmcs01 of the vcpu before calling free_nested().
0334  */
0335 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
0336 {
0337     vcpu_load(vcpu);
0338     vmx_leave_nested(vcpu);
0339     vcpu_put(vcpu);
0340 }
0341 
0342 #define EPTP_PA_MASK   GENMASK_ULL(51, 12)
0343 
0344 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
0345 {
0346     return VALID_PAGE(root_hpa) &&
0347            ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
0348 }
0349 
0350 static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
0351                        gpa_t addr)
0352 {
0353     uint i;
0354     struct kvm_mmu_root_info *cached_root;
0355 
0356     WARN_ON_ONCE(!mmu_is_nested(vcpu));
0357 
0358     for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
0359         cached_root = &vcpu->arch.mmu->prev_roots[i];
0360 
0361         if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
0362                         eptp))
0363             vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa);
0364     }
0365 }
0366 
0367 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
0368         struct x86_exception *fault)
0369 {
0370     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
0371     struct vcpu_vmx *vmx = to_vmx(vcpu);
0372     u32 vm_exit_reason;
0373     unsigned long exit_qualification = vcpu->arch.exit_qualification;
0374 
0375     if (vmx->nested.pml_full) {
0376         vm_exit_reason = EXIT_REASON_PML_FULL;
0377         vmx->nested.pml_full = false;
0378         exit_qualification &= INTR_INFO_UNBLOCK_NMI;
0379     } else {
0380         if (fault->error_code & PFERR_RSVD_MASK)
0381             vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
0382         else
0383             vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
0384 
0385         /*
0386          * Although the caller (kvm_inject_emulated_page_fault) would
0387          * have already synced the faulting address in the shadow EPT
0388          * tables for the current EPTP12, we also need to sync it for
0389          * any other cached EPTP02s based on the same EP4TA, since the
0390          * TLB associates mappings to the EP4TA rather than the full EPTP.
0391          */
0392         nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
0393                        fault->address);
0394     }
0395 
0396     nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
0397     vmcs12->guest_physical_address = fault->address;
0398 }
0399 
0400 static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
0401 {
0402     struct vcpu_vmx *vmx = to_vmx(vcpu);
0403     bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
0404     int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);
0405 
0406     kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
0407                 nested_ept_ad_enabled(vcpu),
0408                 nested_ept_get_eptp(vcpu));
0409 }
0410 
0411 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
0412 {
0413     WARN_ON(mmu_is_nested(vcpu));
0414 
0415     vcpu->arch.mmu = &vcpu->arch.guest_mmu;
0416     nested_ept_new_eptp(vcpu);
0417     vcpu->arch.mmu->get_guest_pgd     = nested_ept_get_eptp;
0418     vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
0419     vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;
0420 
0421     vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
0422 }
0423 
0424 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
0425 {
0426     vcpu->arch.mmu = &vcpu->arch.root_mmu;
0427     vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
0428 }
0429 
0430 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
0431                         u16 error_code)
0432 {
0433     bool inequality, bit;
0434 
0435     bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
0436     inequality =
0437         (error_code & vmcs12->page_fault_error_code_mask) !=
0438          vmcs12->page_fault_error_code_match;
0439     return inequality ^ bit;
0440 }
0441 
0442 
0443 /*
0444  * KVM wants to inject page-faults which it got to the guest. This function
0445  * checks whether in a nested guest, we need to inject them to L1 or L2.
0446  */
0447 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
0448 {
0449     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
0450     unsigned int nr = vcpu->arch.exception.nr;
0451     bool has_payload = vcpu->arch.exception.has_payload;
0452     unsigned long payload = vcpu->arch.exception.payload;
0453 
0454     if (nr == PF_VECTOR) {
0455         if (vcpu->arch.exception.nested_apf) {
0456             *exit_qual = vcpu->arch.apf.nested_apf_token;
0457             return 1;
0458         }
0459         if (nested_vmx_is_page_fault_vmexit(vmcs12,
0460                             vcpu->arch.exception.error_code)) {
0461             *exit_qual = has_payload ? payload : vcpu->arch.cr2;
0462             return 1;
0463         }
0464     } else if (vmcs12->exception_bitmap & (1u << nr)) {
0465         if (nr == DB_VECTOR) {
0466             if (!has_payload) {
0467                 payload = vcpu->arch.dr6;
0468                 payload &= ~DR6_BT;
0469                 payload ^= DR6_ACTIVE_LOW;
0470             }
0471             *exit_qual = payload;
0472         } else
0473             *exit_qual = 0;
0474         return 1;
0475     }
0476 
0477     return 0;
0478 }
0479 
0480 static bool nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
0481                             struct x86_exception *fault)
0482 {
0483     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
0484 
0485     WARN_ON(!is_guest_mode(vcpu));
0486 
0487     if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
0488         !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) {
0489         vmcs12->vm_exit_intr_error_code = fault->error_code;
0490         nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
0491                   PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
0492                   INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
0493                   fault->address);
0494         return true;
0495     }
0496     return false;
0497 }
0498 
0499 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
0500                            struct vmcs12 *vmcs12)
0501 {
0502     if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
0503         return 0;
0504 
0505     if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
0506         CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
0507         return -EINVAL;
0508 
0509     return 0;
0510 }
0511 
0512 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
0513                         struct vmcs12 *vmcs12)
0514 {
0515     if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
0516         return 0;
0517 
0518     if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
0519         return -EINVAL;
0520 
0521     return 0;
0522 }
0523 
0524 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
0525                         struct vmcs12 *vmcs12)
0526 {
0527     if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
0528         return 0;
0529 
0530     if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
0531         return -EINVAL;
0532 
0533     return 0;
0534 }
0535 
0536 /*
0537  * For x2APIC MSRs, ignore the vmcs01 bitmap.  L1 can enable x2APIC without L1
0538  * itself utilizing x2APIC.  All MSRs were previously set to be intercepted,
0539  * only the "disable intercept" case needs to be handled.
0540  */
0541 static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
0542                             unsigned long *msr_bitmap_l0,
0543                             u32 msr, int type)
0544 {
0545     if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
0546         vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);
0547 
0548     if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
0549         vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
0550 }
0551 
0552 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
0553 {
0554     int msr;
0555 
0556     for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
0557         unsigned word = msr / BITS_PER_LONG;
0558 
0559         msr_bitmap[word] = ~0;
0560         msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
0561     }
0562 }
0563 
0564 #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw)                 \
0565 static inline                                   \
0566 void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx,          \
0567                      unsigned long *msr_bitmap_l1,      \
0568                      unsigned long *msr_bitmap_l0, u32 msr) \
0569 {                                       \
0570     if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) ||        \
0571         vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr))           \
0572         vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr);            \
0573     else                                    \
0574         vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr);          \
0575 }
0576 BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
0577 BUILD_NVMX_MSR_INTERCEPT_HELPER(write)
0578 
0579 static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
0580                             unsigned long *msr_bitmap_l1,
0581                             unsigned long *msr_bitmap_l0,
0582                             u32 msr, int types)
0583 {
0584     if (types & MSR_TYPE_R)
0585         nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
0586                           msr_bitmap_l0, msr);
0587     if (types & MSR_TYPE_W)
0588         nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
0589                            msr_bitmap_l0, msr);
0590 }
0591 
0592 /*
0593  * Merge L0's and L1's MSR bitmap, return false to indicate that
0594  * we do not use the hardware.
0595  */
0596 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
0597                          struct vmcs12 *vmcs12)
0598 {
0599     struct vcpu_vmx *vmx = to_vmx(vcpu);
0600     int msr;
0601     unsigned long *msr_bitmap_l1;
0602     unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
0603     struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
0604     struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;
0605 
0606     /* Nothing to do if the MSR bitmap is not in use.  */
0607     if (!cpu_has_vmx_msr_bitmap() ||
0608         !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
0609         return false;
0610 
0611     /*
0612      * MSR bitmap update can be skipped when:
0613      * - MSR bitmap for L1 hasn't changed.
0614      * - Nested hypervisor (L1) is attempting to launch the same L2 as
0615      *   before.
0616      * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
0617      *   and tells KVM (L0) there were no changes in MSR bitmap for L2.
0618      */
0619     if (!vmx->nested.force_msr_bitmap_recalc && evmcs &&
0620         evmcs->hv_enlightenments_control.msr_bitmap &&
0621         evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
0622         return true;
0623 
0624     if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
0625         return false;
0626 
0627     msr_bitmap_l1 = (unsigned long *)map->hva;
0628 
0629     /*
0630      * To keep the control flow simple, pay eight 8-byte writes (sixteen
0631      * 4-byte writes on 32-bit systems) up front to enable intercepts for
0632      * the x2APIC MSR range and selectively toggle those relevant to L2.
0633      */
0634     enable_x2apic_msr_intercepts(msr_bitmap_l0);
0635 
0636     if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
0637         if (nested_cpu_has_apic_reg_virt(vmcs12)) {
0638             /*
0639              * L0 need not intercept reads for MSRs between 0x800
0640              * and 0x8ff, it just lets the processor take the value
0641              * from the virtual-APIC page; take those 256 bits
0642              * directly from the L1 bitmap.
0643              */
0644             for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
0645                 unsigned word = msr / BITS_PER_LONG;
0646 
0647                 msr_bitmap_l0[word] = msr_bitmap_l1[word];
0648             }
0649         }
0650 
0651         nested_vmx_disable_intercept_for_x2apic_msr(
0652             msr_bitmap_l1, msr_bitmap_l0,
0653             X2APIC_MSR(APIC_TASKPRI),
0654             MSR_TYPE_R | MSR_TYPE_W);
0655 
0656         if (nested_cpu_has_vid(vmcs12)) {
0657             nested_vmx_disable_intercept_for_x2apic_msr(
0658                 msr_bitmap_l1, msr_bitmap_l0,
0659                 X2APIC_MSR(APIC_EOI),
0660                 MSR_TYPE_W);
0661             nested_vmx_disable_intercept_for_x2apic_msr(
0662                 msr_bitmap_l1, msr_bitmap_l0,
0663                 X2APIC_MSR(APIC_SELF_IPI),
0664                 MSR_TYPE_W);
0665         }
0666     }
0667 
0668     /*
0669      * Always check vmcs01's bitmap to honor userspace MSR filters and any
0670      * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
0671      */
0672 #ifdef CONFIG_X86_64
0673     nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
0674                      MSR_FS_BASE, MSR_TYPE_RW);
0675 
0676     nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
0677                      MSR_GS_BASE, MSR_TYPE_RW);
0678 
0679     nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
0680                      MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
0681 #endif
0682     nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
0683                      MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
0684 
0685     nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
0686                      MSR_IA32_PRED_CMD, MSR_TYPE_W);
0687 
0688     kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
0689 
0690     vmx->nested.force_msr_bitmap_recalc = false;
0691 
0692     return true;
0693 }
0694 
0695 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
0696                        struct vmcs12 *vmcs12)
0697 {
0698     struct vcpu_vmx *vmx = to_vmx(vcpu);
0699     struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
0700 
0701     if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
0702         vmcs12->vmcs_link_pointer == INVALID_GPA)
0703         return;
0704 
0705     if (ghc->gpa != vmcs12->vmcs_link_pointer &&
0706         kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
0707                       vmcs12->vmcs_link_pointer, VMCS12_SIZE))
0708         return;
0709 
0710     kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
0711                   VMCS12_SIZE);
0712 }
0713 
0714 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
0715                           struct vmcs12 *vmcs12)
0716 {
0717     struct vcpu_vmx *vmx = to_vmx(vcpu);
0718     struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
0719 
0720     if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
0721         vmcs12->vmcs_link_pointer == INVALID_GPA)
0722         return;
0723 
0724     if (ghc->gpa != vmcs12->vmcs_link_pointer &&
0725         kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
0726                       vmcs12->vmcs_link_pointer, VMCS12_SIZE))
0727         return;
0728 
0729     kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
0730                    VMCS12_SIZE);
0731 }
0732 
0733 /*
0734  * In nested virtualization, check if L1 has set
0735  * VM_EXIT_ACK_INTR_ON_EXIT
0736  */
0737 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
0738 {
0739     return get_vmcs12(vcpu)->vm_exit_controls &
0740         VM_EXIT_ACK_INTR_ON_EXIT;
0741 }
0742 
0743 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
0744                       struct vmcs12 *vmcs12)
0745 {
0746     if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
0747         CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
0748         return -EINVAL;
0749     else
0750         return 0;
0751 }
0752 
0753 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
0754                        struct vmcs12 *vmcs12)
0755 {
0756     if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
0757         !nested_cpu_has_apic_reg_virt(vmcs12) &&
0758         !nested_cpu_has_vid(vmcs12) &&
0759         !nested_cpu_has_posted_intr(vmcs12))
0760         return 0;
0761 
0762     /*
0763      * If virtualize x2apic mode is enabled,
0764      * virtualize apic access must be disabled.
0765      */
0766     if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
0767            nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
0768         return -EINVAL;
0769 
0770     /*
0771      * If virtual interrupt delivery is enabled,
0772      * we must exit on external interrupts.
0773      */
0774     if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
0775         return -EINVAL;
0776 
0777     /*
0778      * bits 15:8 should be zero in posted_intr_nv,
0779      * the descriptor address has been already checked
0780      * in nested_get_vmcs12_pages.
0781      *
0782      * bits 5:0 of posted_intr_desc_addr should be zero.
0783      */
0784     if (nested_cpu_has_posted_intr(vmcs12) &&
0785        (CC(!nested_cpu_has_vid(vmcs12)) ||
0786         CC(!nested_exit_intr_ack_set(vcpu)) ||
0787         CC((vmcs12->posted_intr_nv & 0xff00)) ||
0788         CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
0789         return -EINVAL;
0790 
0791     /* tpr shadow is needed by all apicv features. */
0792     if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
0793         return -EINVAL;
0794 
0795     return 0;
0796 }
0797 
0798 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
0799                        u32 count, u64 addr)
0800 {
0801     if (count == 0)
0802         return 0;
0803 
0804     if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
0805         !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
0806         return -EINVAL;
0807 
0808     return 0;
0809 }
0810 
0811 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
0812                              struct vmcs12 *vmcs12)
0813 {
0814     if (CC(nested_vmx_check_msr_switch(vcpu,
0815                        vmcs12->vm_exit_msr_load_count,
0816                        vmcs12->vm_exit_msr_load_addr)) ||
0817         CC(nested_vmx_check_msr_switch(vcpu,
0818                        vmcs12->vm_exit_msr_store_count,
0819                        vmcs12->vm_exit_msr_store_addr)))
0820         return -EINVAL;
0821 
0822     return 0;
0823 }
0824 
0825 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
0826                                                       struct vmcs12 *vmcs12)
0827 {
0828     if (CC(nested_vmx_check_msr_switch(vcpu,
0829                        vmcs12->vm_entry_msr_load_count,
0830                        vmcs12->vm_entry_msr_load_addr)))
0831                 return -EINVAL;
0832 
0833     return 0;
0834 }
0835 
0836 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
0837                      struct vmcs12 *vmcs12)
0838 {
0839     if (!nested_cpu_has_pml(vmcs12))
0840         return 0;
0841 
0842     if (CC(!nested_cpu_has_ept(vmcs12)) ||
0843         CC(!page_address_valid(vcpu, vmcs12->pml_address)))
0844         return -EINVAL;
0845 
0846     return 0;
0847 }
0848 
0849 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
0850                             struct vmcs12 *vmcs12)
0851 {
0852     if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
0853            !nested_cpu_has_ept(vmcs12)))
0854         return -EINVAL;
0855     return 0;
0856 }
0857 
0858 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
0859                              struct vmcs12 *vmcs12)
0860 {
0861     if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
0862            !nested_cpu_has_ept(vmcs12)))
0863         return -EINVAL;
0864     return 0;
0865 }
0866 
0867 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
0868                          struct vmcs12 *vmcs12)
0869 {
0870     if (!nested_cpu_has_shadow_vmcs(vmcs12))
0871         return 0;
0872 
0873     if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
0874         CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
0875         return -EINVAL;
0876 
0877     return 0;
0878 }
0879 
0880 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
0881                        struct vmx_msr_entry *e)
0882 {
0883     /* x2APIC MSR accesses are not allowed */
0884     if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
0885         return -EINVAL;
0886     if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
0887         CC(e->index == MSR_IA32_UCODE_REV))
0888         return -EINVAL;
0889     if (CC(e->reserved != 0))
0890         return -EINVAL;
0891     return 0;
0892 }
0893 
0894 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
0895                      struct vmx_msr_entry *e)
0896 {
0897     if (CC(e->index == MSR_FS_BASE) ||
0898         CC(e->index == MSR_GS_BASE) ||
0899         CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
0900         nested_vmx_msr_check_common(vcpu, e))
0901         return -EINVAL;
0902     return 0;
0903 }
0904 
0905 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
0906                       struct vmx_msr_entry *e)
0907 {
0908     if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
0909         nested_vmx_msr_check_common(vcpu, e))
0910         return -EINVAL;
0911     return 0;
0912 }
0913 
0914 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
0915 {
0916     struct vcpu_vmx *vmx = to_vmx(vcpu);
0917     u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
0918                        vmx->nested.msrs.misc_high);
0919 
0920     return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
0921 }
0922 
0923 /*
0924  * Load guest's/host's msr at nested entry/exit.
0925  * return 0 for success, entry index for failure.
0926  *
0927  * One of the failure modes for MSR load/store is when a list exceeds the
0928  * virtual hardware's capacity. To maintain compatibility with hardware inasmuch
0929  * as possible, process all valid entries before failing rather than precheck
0930  * for a capacity violation.
0931  */
0932 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
0933 {
0934     u32 i;
0935     struct vmx_msr_entry e;
0936     u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
0937 
0938     for (i = 0; i < count; i++) {
0939         if (unlikely(i >= max_msr_list_size))
0940             goto fail;
0941 
0942         if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
0943                     &e, sizeof(e))) {
0944             pr_debug_ratelimited(
0945                 "%s cannot read MSR entry (%u, 0x%08llx)\n",
0946                 __func__, i, gpa + i * sizeof(e));
0947             goto fail;
0948         }
0949         if (nested_vmx_load_msr_check(vcpu, &e)) {
0950             pr_debug_ratelimited(
0951                 "%s check failed (%u, 0x%x, 0x%x)\n",
0952                 __func__, i, e.index, e.reserved);
0953             goto fail;
0954         }
0955         if (kvm_set_msr(vcpu, e.index, e.value)) {
0956             pr_debug_ratelimited(
0957                 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
0958                 __func__, i, e.index, e.value);
0959             goto fail;
0960         }
0961     }
0962     return 0;
0963 fail:
0964     /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
0965     return i + 1;
0966 }
0967 
0968 static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
0969                         u32 msr_index,
0970                         u64 *data)
0971 {
0972     struct vcpu_vmx *vmx = to_vmx(vcpu);
0973 
0974     /*
0975      * If the L0 hypervisor stored a more accurate value for the TSC that
0976      * does not include the time taken for emulation of the L2->L1
0977      * VM-exit in L0, use the more accurate value.
0978      */
0979     if (msr_index == MSR_IA32_TSC) {
0980         int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
0981                             MSR_IA32_TSC);
0982 
0983         if (i >= 0) {
0984             u64 val = vmx->msr_autostore.guest.val[i].value;
0985 
0986             *data = kvm_read_l1_tsc(vcpu, val);
0987             return true;
0988         }
0989     }
0990 
0991     if (kvm_get_msr(vcpu, msr_index, data)) {
0992         pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
0993             msr_index);
0994         return false;
0995     }
0996     return true;
0997 }
0998 
0999 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
1000                      struct vmx_msr_entry *e)
1001 {
1002     if (kvm_vcpu_read_guest(vcpu,
1003                 gpa + i * sizeof(*e),
1004                 e, 2 * sizeof(u32))) {
1005         pr_debug_ratelimited(
1006             "%s cannot read MSR entry (%u, 0x%08llx)\n",
1007             __func__, i, gpa + i * sizeof(*e));
1008         return false;
1009     }
1010     if (nested_vmx_store_msr_check(vcpu, e)) {
1011         pr_debug_ratelimited(
1012             "%s check failed (%u, 0x%x, 0x%x)\n",
1013             __func__, i, e->index, e->reserved);
1014         return false;
1015     }
1016     return true;
1017 }
1018 
1019 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
1020 {
1021     u64 data;
1022     u32 i;
1023     struct vmx_msr_entry e;
1024     u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
1025 
1026     for (i = 0; i < count; i++) {
1027         if (unlikely(i >= max_msr_list_size))
1028             return -EINVAL;
1029 
1030         if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1031             return -EINVAL;
1032 
1033         if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
1034             return -EINVAL;
1035 
1036         if (kvm_vcpu_write_guest(vcpu,
1037                      gpa + i * sizeof(e) +
1038                          offsetof(struct vmx_msr_entry, value),
1039                      &data, sizeof(data))) {
1040             pr_debug_ratelimited(
1041                 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
1042                 __func__, i, e.index, data);
1043             return -EINVAL;
1044         }
1045     }
1046     return 0;
1047 }
1048 
1049 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
1050 {
1051     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1052     u32 count = vmcs12->vm_exit_msr_store_count;
1053     u64 gpa = vmcs12->vm_exit_msr_store_addr;
1054     struct vmx_msr_entry e;
1055     u32 i;
1056 
1057     for (i = 0; i < count; i++) {
1058         if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1059             return false;
1060 
1061         if (e.index == msr_index)
1062             return true;
1063     }
1064     return false;
1065 }
1066 
1067 static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
1068                        u32 msr_index)
1069 {
1070     struct vcpu_vmx *vmx = to_vmx(vcpu);
1071     struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
1072     bool in_vmcs12_store_list;
1073     int msr_autostore_slot;
1074     bool in_autostore_list;
1075     int last;
1076 
1077     msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
1078     in_autostore_list = msr_autostore_slot >= 0;
1079     in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
1080 
1081     if (in_vmcs12_store_list && !in_autostore_list) {
1082         if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
1083             /*
1084              * Emulated VMEntry does not fail here.  Instead a less
1085              * accurate value will be returned by
1086              * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
1087              * instead of reading the value from the vmcs02 VMExit
1088              * MSR-store area.
1089              */
1090             pr_warn_ratelimited(
1091                 "Not enough msr entries in msr_autostore.  Can't add msr %x\n",
1092                 msr_index);
1093             return;
1094         }
1095         last = autostore->nr++;
1096         autostore->val[last].index = msr_index;
1097     } else if (!in_vmcs12_store_list && in_autostore_list) {
1098         last = --autostore->nr;
1099         autostore->val[msr_autostore_slot] = autostore->val[last];
1100     }
1101 }
1102 
1103 /*
1104  * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
1105  * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
1106  * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
1107  * @entry_failure_code.
1108  */
1109 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
1110                    bool nested_ept, bool reload_pdptrs,
1111                    enum vm_entry_failure_code *entry_failure_code)
1112 {
1113     if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
1114         *entry_failure_code = ENTRY_FAIL_DEFAULT;
1115         return -EINVAL;
1116     }
1117 
1118     /*
1119      * If PAE paging and EPT are both on, CR3 is not used by the CPU and
1120      * must not be dereferenced.
1121      */
1122     if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
1123         CC(!load_pdptrs(vcpu, cr3))) {
1124         *entry_failure_code = ENTRY_FAIL_PDPTE;
1125         return -EINVAL;
1126     }
1127 
1128     vcpu->arch.cr3 = cr3;
1129     kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
1130 
1131     /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
1132     kvm_init_mmu(vcpu);
1133 
1134     if (!nested_ept)
1135         kvm_mmu_new_pgd(vcpu, cr3);
1136 
1137     return 0;
1138 }
1139 
1140 /*
1141  * Returns if KVM is able to config CPU to tag TLB entries
1142  * populated by L2 differently than TLB entries populated
1143  * by L1.
1144  *
1145  * If L0 uses EPT, L1 and L2 run with different EPTP because
1146  * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
1147  * are tagged with different EPTP.
1148  *
1149  * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
1150  * with different VPID (L1 entries are tagged with vmx->vpid
1151  * while L2 entries are tagged with vmx->nested.vpid02).
1152  */
1153 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
1154 {
1155     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1156 
1157     return enable_ept ||
1158            (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
1159 }
1160 
1161 static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
1162                         struct vmcs12 *vmcs12,
1163                         bool is_vmenter)
1164 {
1165     struct vcpu_vmx *vmx = to_vmx(vcpu);
1166 
1167     /*
1168      * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
1169      * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
1170      * full TLB flush from the guest's perspective.  This is required even
1171      * if VPID is disabled in the host as KVM may need to synchronize the
1172      * MMU in response to the guest TLB flush.
1173      *
1174      * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
1175      * EPT is a special snowflake, as guest-physical mappings aren't
1176      * flushed on VPID invalidations, including VM-Enter or VM-Exit with
1177      * VPID disabled.  As a result, KVM _never_ needs to sync nEPT
1178      * entries on VM-Enter because L1 can't rely on VM-Enter to flush
1179      * those mappings.
1180      */
1181     if (!nested_cpu_has_vpid(vmcs12)) {
1182         kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
1183         return;
1184     }
1185 
1186     /* L2 should never have a VPID if VPID is disabled. */
1187     WARN_ON(!enable_vpid);
1188 
1189     /*
1190      * VPID is enabled and in use by vmcs12.  If vpid12 is changing, then
1191      * emulate a guest TLB flush as KVM does not track vpid12 history nor
1192      * is the VPID incorporated into the MMU context.  I.e. KVM must assume
1193      * that the new vpid12 has never been used and thus represents a new
1194      * guest ASID that cannot have entries in the TLB.
1195      */
1196     if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
1197         vmx->nested.last_vpid = vmcs12->virtual_processor_id;
1198         kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
1199         return;
1200     }
1201 
1202     /*
1203      * If VPID is enabled, used by vmc12, and vpid12 is not changing but
1204      * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
1205      * KVM was unable to allocate a VPID for L2, flush the current context
1206      * as the effective ASID is common to both L1 and L2.
1207      */
1208     if (!nested_has_guest_tlb_tag(vcpu))
1209         kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1210 }
1211 
1212 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
1213 {
1214     superset &= mask;
1215     subset &= mask;
1216 
1217     return (superset | subset) == superset;
1218 }
1219 
1220 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
1221 {
1222     const u64 feature_and_reserved =
1223         /* feature (except bit 48; see below) */
1224         BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
1225         /* reserved */
1226         BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
1227     u64 vmx_basic = vmcs_config.nested.basic;
1228 
1229     if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
1230         return -EINVAL;
1231 
1232     /*
1233      * KVM does not emulate a version of VMX that constrains physical
1234      * addresses of VMX structures (e.g. VMCS) to 32-bits.
1235      */
1236     if (data & BIT_ULL(48))
1237         return -EINVAL;
1238 
1239     if (vmx_basic_vmcs_revision_id(vmx_basic) !=
1240         vmx_basic_vmcs_revision_id(data))
1241         return -EINVAL;
1242 
1243     if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1244         return -EINVAL;
1245 
1246     vmx->nested.msrs.basic = data;
1247     return 0;
1248 }
1249 
1250 static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
1251                 u32 **low, u32 **high)
1252 {
1253     switch (msr_index) {
1254     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1255         *low = &msrs->pinbased_ctls_low;
1256         *high = &msrs->pinbased_ctls_high;
1257         break;
1258     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1259         *low = &msrs->procbased_ctls_low;
1260         *high = &msrs->procbased_ctls_high;
1261         break;
1262     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1263         *low = &msrs->exit_ctls_low;
1264         *high = &msrs->exit_ctls_high;
1265         break;
1266     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1267         *low = &msrs->entry_ctls_low;
1268         *high = &msrs->entry_ctls_high;
1269         break;
1270     case MSR_IA32_VMX_PROCBASED_CTLS2:
1271         *low = &msrs->secondary_ctls_low;
1272         *high = &msrs->secondary_ctls_high;
1273         break;
1274     default:
1275         BUG();
1276     }
1277 }
1278 
1279 static int
1280 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1281 {
1282     u32 *lowp, *highp;
1283     u64 supported;
1284 
1285     vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);
1286 
1287     supported = vmx_control_msr(*lowp, *highp);
1288 
1289     /* Check must-be-1 bits are still 1. */
1290     if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1291         return -EINVAL;
1292 
1293     /* Check must-be-0 bits are still 0. */
1294     if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1295         return -EINVAL;
1296 
1297     vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
1298     *lowp = data;
1299     *highp = data >> 32;
1300     return 0;
1301 }
1302 
1303 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1304 {
1305     const u64 feature_and_reserved_bits =
1306         /* feature */
1307         BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
1308         BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
1309         /* reserved */
1310         GENMASK_ULL(13, 9) | BIT_ULL(31);
1311     u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
1312                        vmcs_config.nested.misc_high);
1313 
1314     if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
1315         return -EINVAL;
1316 
1317     if ((vmx->nested.msrs.pinbased_ctls_high &
1318          PIN_BASED_VMX_PREEMPTION_TIMER) &&
1319         vmx_misc_preemption_timer_rate(data) !=
1320         vmx_misc_preemption_timer_rate(vmx_misc))
1321         return -EINVAL;
1322 
1323     if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
1324         return -EINVAL;
1325 
1326     if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
1327         return -EINVAL;
1328 
1329     if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1330         return -EINVAL;
1331 
1332     vmx->nested.msrs.misc_low = data;
1333     vmx->nested.msrs.misc_high = data >> 32;
1334 
1335     return 0;
1336 }
1337 
1338 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1339 {
1340     u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
1341                            vmcs_config.nested.vpid_caps);
1342 
1343     /* Every bit is either reserved or a feature bit. */
1344     if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1345         return -EINVAL;
1346 
1347     vmx->nested.msrs.ept_caps = data;
1348     vmx->nested.msrs.vpid_caps = data >> 32;
1349     return 0;
1350 }
1351 
1352 static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
1353 {
1354     switch (msr_index) {
1355     case MSR_IA32_VMX_CR0_FIXED0:
1356         return &msrs->cr0_fixed0;
1357     case MSR_IA32_VMX_CR4_FIXED0:
1358         return &msrs->cr4_fixed0;
1359     default:
1360         BUG();
1361     }
1362 }
1363 
1364 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1365 {
1366     const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);
1367 
1368     /*
1369      * 1 bits (which indicates bits which "must-be-1" during VMX operation)
1370      * must be 1 in the restored value.
1371      */
1372     if (!is_bitwise_subset(data, *msr, -1ULL))
1373         return -EINVAL;
1374 
1375     *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
1376     return 0;
1377 }
1378 
1379 /*
1380  * Called when userspace is restoring VMX MSRs.
1381  *
1382  * Returns 0 on success, non-0 otherwise.
1383  */
1384 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1385 {
1386     struct vcpu_vmx *vmx = to_vmx(vcpu);
1387 
1388     /*
1389      * Don't allow changes to the VMX capability MSRs while the vCPU
1390      * is in VMX operation.
1391      */
1392     if (vmx->nested.vmxon)
1393         return -EBUSY;
1394 
1395     switch (msr_index) {
1396     case MSR_IA32_VMX_BASIC:
1397         return vmx_restore_vmx_basic(vmx, data);
1398     case MSR_IA32_VMX_PINBASED_CTLS:
1399     case MSR_IA32_VMX_PROCBASED_CTLS:
1400     case MSR_IA32_VMX_EXIT_CTLS:
1401     case MSR_IA32_VMX_ENTRY_CTLS:
1402         /*
1403          * The "non-true" VMX capability MSRs are generated from the
1404          * "true" MSRs, so we do not support restoring them directly.
1405          *
1406          * If userspace wants to emulate VMX_BASIC[55]=0, userspace
1407          * should restore the "true" MSRs with the must-be-1 bits
1408          * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1409          * DEFAULT SETTINGS".
1410          */
1411         return -EINVAL;
1412     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1413     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1414     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1415     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1416     case MSR_IA32_VMX_PROCBASED_CTLS2:
1417         return vmx_restore_control_msr(vmx, msr_index, data);
1418     case MSR_IA32_VMX_MISC:
1419         return vmx_restore_vmx_misc(vmx, data);
1420     case MSR_IA32_VMX_CR0_FIXED0:
1421     case MSR_IA32_VMX_CR4_FIXED0:
1422         return vmx_restore_fixed0_msr(vmx, msr_index, data);
1423     case MSR_IA32_VMX_CR0_FIXED1:
1424     case MSR_IA32_VMX_CR4_FIXED1:
1425         /*
1426          * These MSRs are generated based on the vCPU's CPUID, so we
1427          * do not support restoring them directly.
1428          */
1429         return -EINVAL;
1430     case MSR_IA32_VMX_EPT_VPID_CAP:
1431         return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1432     case MSR_IA32_VMX_VMCS_ENUM:
1433         vmx->nested.msrs.vmcs_enum = data;
1434         return 0;
1435     case MSR_IA32_VMX_VMFUNC:
1436         if (data & ~vmcs_config.nested.vmfunc_controls)
1437             return -EINVAL;
1438         vmx->nested.msrs.vmfunc_controls = data;
1439         return 0;
1440     default:
1441         /*
1442          * The rest of the VMX capability MSRs do not support restore.
1443          */
1444         return -EINVAL;
1445     }
1446 }
1447 
1448 /* Returns 0 on success, non-0 otherwise. */
1449 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1450 {
1451     switch (msr_index) {
1452     case MSR_IA32_VMX_BASIC:
1453         *pdata = msrs->basic;
1454         break;
1455     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1456     case MSR_IA32_VMX_PINBASED_CTLS:
1457         *pdata = vmx_control_msr(
1458             msrs->pinbased_ctls_low,
1459             msrs->pinbased_ctls_high);
1460         if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
1461             *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1462         break;
1463     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1464     case MSR_IA32_VMX_PROCBASED_CTLS:
1465         *pdata = vmx_control_msr(
1466             msrs->procbased_ctls_low,
1467             msrs->procbased_ctls_high);
1468         if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
1469             *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1470         break;
1471     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1472     case MSR_IA32_VMX_EXIT_CTLS:
1473         *pdata = vmx_control_msr(
1474             msrs->exit_ctls_low,
1475             msrs->exit_ctls_high);
1476         if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
1477             *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
1478         break;
1479     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1480     case MSR_IA32_VMX_ENTRY_CTLS:
1481         *pdata = vmx_control_msr(
1482             msrs->entry_ctls_low,
1483             msrs->entry_ctls_high);
1484         if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
1485             *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
1486         break;
1487     case MSR_IA32_VMX_MISC:
1488         *pdata = vmx_control_msr(
1489             msrs->misc_low,
1490             msrs->misc_high);
1491         break;
1492     case MSR_IA32_VMX_CR0_FIXED0:
1493         *pdata = msrs->cr0_fixed0;
1494         break;
1495     case MSR_IA32_VMX_CR0_FIXED1:
1496         *pdata = msrs->cr0_fixed1;
1497         break;
1498     case MSR_IA32_VMX_CR4_FIXED0:
1499         *pdata = msrs->cr4_fixed0;
1500         break;
1501     case MSR_IA32_VMX_CR4_FIXED1:
1502         *pdata = msrs->cr4_fixed1;
1503         break;
1504     case MSR_IA32_VMX_VMCS_ENUM:
1505         *pdata = msrs->vmcs_enum;
1506         break;
1507     case MSR_IA32_VMX_PROCBASED_CTLS2:
1508         *pdata = vmx_control_msr(
1509             msrs->secondary_ctls_low,
1510             msrs->secondary_ctls_high);
1511         break;
1512     case MSR_IA32_VMX_EPT_VPID_CAP:
1513         *pdata = msrs->ept_caps |
1514             ((u64)msrs->vpid_caps << 32);
1515         break;
1516     case MSR_IA32_VMX_VMFUNC:
1517         *pdata = msrs->vmfunc_controls;
1518         break;
1519     default:
1520         return 1;
1521     }
1522 
1523     return 0;
1524 }
1525 
1526 /*
1527  * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1528  * been modified by the L1 guest.  Note, "writable" in this context means
1529  * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1530  * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1531  * VM-exit information fields (which are actually writable if the vCPU is
1532  * configured to support "VMWRITE to any supported field in the VMCS").
1533  */
1534 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1535 {
1536     struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1537     struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1538     struct shadow_vmcs_field field;
1539     unsigned long val;
1540     int i;
1541 
1542     if (WARN_ON(!shadow_vmcs))
1543         return;
1544 
1545     preempt_disable();
1546 
1547     vmcs_load(shadow_vmcs);
1548 
1549     for (i = 0; i < max_shadow_read_write_fields; i++) {
1550         field = shadow_read_write_fields[i];
1551         val = __vmcs_readl(field.encoding);
1552         vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
1553     }
1554 
1555     vmcs_clear(shadow_vmcs);
1556     vmcs_load(vmx->loaded_vmcs->vmcs);
1557 
1558     preempt_enable();
1559 }
1560 
1561 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1562 {
1563     const struct shadow_vmcs_field *fields[] = {
1564         shadow_read_write_fields,
1565         shadow_read_only_fields
1566     };
1567     const int max_fields[] = {
1568         max_shadow_read_write_fields,
1569         max_shadow_read_only_fields
1570     };
1571     struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1572     struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1573     struct shadow_vmcs_field field;
1574     unsigned long val;
1575     int i, q;
1576 
1577     if (WARN_ON(!shadow_vmcs))
1578         return;
1579 
1580     vmcs_load(shadow_vmcs);
1581 
1582     for (q = 0; q < ARRAY_SIZE(fields); q++) {
1583         for (i = 0; i < max_fields[q]; i++) {
1584             field = fields[q][i];
1585             val = vmcs12_read_any(vmcs12, field.encoding,
1586                           field.offset);
1587             __vmcs_writel(field.encoding, val);
1588         }
1589     }
1590 
1591     vmcs_clear(shadow_vmcs);
1592     vmcs_load(vmx->loaded_vmcs->vmcs);
1593 }
1594 
1595 static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
1596 {
1597     struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1598     struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1599 
1600     /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1601     vmcs12->tpr_threshold = evmcs->tpr_threshold;
1602     vmcs12->guest_rip = evmcs->guest_rip;
1603 
1604     if (unlikely(!(hv_clean_fields &
1605                HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
1606         vmcs12->guest_rsp = evmcs->guest_rsp;
1607         vmcs12->guest_rflags = evmcs->guest_rflags;
1608         vmcs12->guest_interruptibility_info =
1609             evmcs->guest_interruptibility_info;
1610     }
1611 
1612     if (unlikely(!(hv_clean_fields &
1613                HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
1614         vmcs12->cpu_based_vm_exec_control =
1615             evmcs->cpu_based_vm_exec_control;
1616     }
1617 
1618     if (unlikely(!(hv_clean_fields &
1619                HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
1620         vmcs12->exception_bitmap = evmcs->exception_bitmap;
1621     }
1622 
1623     if (unlikely(!(hv_clean_fields &
1624                HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
1625         vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
1626     }
1627 
1628     if (unlikely(!(hv_clean_fields &
1629                HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
1630         vmcs12->vm_entry_intr_info_field =
1631             evmcs->vm_entry_intr_info_field;
1632         vmcs12->vm_entry_exception_error_code =
1633             evmcs->vm_entry_exception_error_code;
1634         vmcs12->vm_entry_instruction_len =
1635             evmcs->vm_entry_instruction_len;
1636     }
1637 
1638     if (unlikely(!(hv_clean_fields &
1639                HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
1640         vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
1641         vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
1642         vmcs12->host_cr0 = evmcs->host_cr0;
1643         vmcs12->host_cr3 = evmcs->host_cr3;
1644         vmcs12->host_cr4 = evmcs->host_cr4;
1645         vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
1646         vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
1647         vmcs12->host_rip = evmcs->host_rip;
1648         vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
1649         vmcs12->host_es_selector = evmcs->host_es_selector;
1650         vmcs12->host_cs_selector = evmcs->host_cs_selector;
1651         vmcs12->host_ss_selector = evmcs->host_ss_selector;
1652         vmcs12->host_ds_selector = evmcs->host_ds_selector;
1653         vmcs12->host_fs_selector = evmcs->host_fs_selector;
1654         vmcs12->host_gs_selector = evmcs->host_gs_selector;
1655         vmcs12->host_tr_selector = evmcs->host_tr_selector;
1656     }
1657 
1658     if (unlikely(!(hv_clean_fields &
1659                HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
1660         vmcs12->pin_based_vm_exec_control =
1661             evmcs->pin_based_vm_exec_control;
1662         vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
1663         vmcs12->secondary_vm_exec_control =
1664             evmcs->secondary_vm_exec_control;
1665     }
1666 
1667     if (unlikely(!(hv_clean_fields &
1668                HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
1669         vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
1670         vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
1671     }
1672 
1673     if (unlikely(!(hv_clean_fields &
1674                HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
1675         vmcs12->msr_bitmap = evmcs->msr_bitmap;
1676     }
1677 
1678     if (unlikely(!(hv_clean_fields &
1679                HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
1680         vmcs12->guest_es_base = evmcs->guest_es_base;
1681         vmcs12->guest_cs_base = evmcs->guest_cs_base;
1682         vmcs12->guest_ss_base = evmcs->guest_ss_base;
1683         vmcs12->guest_ds_base = evmcs->guest_ds_base;
1684         vmcs12->guest_fs_base = evmcs->guest_fs_base;
1685         vmcs12->guest_gs_base = evmcs->guest_gs_base;
1686         vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
1687         vmcs12->guest_tr_base = evmcs->guest_tr_base;
1688         vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
1689         vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
1690         vmcs12->guest_es_limit = evmcs->guest_es_limit;
1691         vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
1692         vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
1693         vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
1694         vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
1695         vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
1696         vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
1697         vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
1698         vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
1699         vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1700         vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1701         vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1702         vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1703         vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1704         vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1705         vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1706         vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1707         vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1708         vmcs12->guest_es_selector = evmcs->guest_es_selector;
1709         vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1710         vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1711         vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1712         vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1713         vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1714         vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1715         vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1716     }
1717 
1718     if (unlikely(!(hv_clean_fields &
1719                HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1720         vmcs12->tsc_offset = evmcs->tsc_offset;
1721         vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1722         vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1723     }
1724 
1725     if (unlikely(!(hv_clean_fields &
1726                HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1727         vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1728         vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1729         vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1730         vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1731         vmcs12->guest_cr0 = evmcs->guest_cr0;
1732         vmcs12->guest_cr3 = evmcs->guest_cr3;
1733         vmcs12->guest_cr4 = evmcs->guest_cr4;
1734         vmcs12->guest_dr7 = evmcs->guest_dr7;
1735     }
1736 
1737     if (unlikely(!(hv_clean_fields &
1738                HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1739         vmcs12->host_fs_base = evmcs->host_fs_base;
1740         vmcs12->host_gs_base = evmcs->host_gs_base;
1741         vmcs12->host_tr_base = evmcs->host_tr_base;
1742         vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1743         vmcs12->host_idtr_base = evmcs->host_idtr_base;
1744         vmcs12->host_rsp = evmcs->host_rsp;
1745     }
1746 
1747     if (unlikely(!(hv_clean_fields &
1748                HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1749         vmcs12->ept_pointer = evmcs->ept_pointer;
1750         vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1751     }
1752 
1753     if (unlikely(!(hv_clean_fields &
1754                HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1755         vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1756         vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1757         vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1758         vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1759         vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1760         vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1761         vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1762         vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1763         vmcs12->guest_pending_dbg_exceptions =
1764             evmcs->guest_pending_dbg_exceptions;
1765         vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1766         vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1767         vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1768         vmcs12->guest_activity_state = evmcs->guest_activity_state;
1769         vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1770     }
1771 
1772     /*
1773      * Not used?
1774      * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1775      * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1776      * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
1777      * vmcs12->page_fault_error_code_mask =
1778      *      evmcs->page_fault_error_code_mask;
1779      * vmcs12->page_fault_error_code_match =
1780      *      evmcs->page_fault_error_code_match;
1781      * vmcs12->cr3_target_count = evmcs->cr3_target_count;
1782      * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1783      * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1784      * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1785      */
1786 
1787     /*
1788      * Read only fields:
1789      * vmcs12->guest_physical_address = evmcs->guest_physical_address;
1790      * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1791      * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1792      * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1793      * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1794      * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1795      * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1796      * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1797      * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1798      * vmcs12->exit_qualification = evmcs->exit_qualification;
1799      * vmcs12->guest_linear_address = evmcs->guest_linear_address;
1800      *
1801      * Not present in struct vmcs12:
1802      * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1803      * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1804      * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1805      * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1806      */
1807 
1808     return;
1809 }
1810 
1811 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1812 {
1813     struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1814     struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1815 
1816     /*
1817      * Should not be changed by KVM:
1818      *
1819      * evmcs->host_es_selector = vmcs12->host_es_selector;
1820      * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1821      * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1822      * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1823      * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1824      * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1825      * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1826      * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1827      * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1828      * evmcs->host_cr0 = vmcs12->host_cr0;
1829      * evmcs->host_cr3 = vmcs12->host_cr3;
1830      * evmcs->host_cr4 = vmcs12->host_cr4;
1831      * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1832      * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1833      * evmcs->host_rip = vmcs12->host_rip;
1834      * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1835      * evmcs->host_fs_base = vmcs12->host_fs_base;
1836      * evmcs->host_gs_base = vmcs12->host_gs_base;
1837      * evmcs->host_tr_base = vmcs12->host_tr_base;
1838      * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1839      * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1840      * evmcs->host_rsp = vmcs12->host_rsp;
1841      * sync_vmcs02_to_vmcs12() doesn't read these:
1842      * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1843      * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1844      * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1845      * evmcs->ept_pointer = vmcs12->ept_pointer;
1846      * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1847      * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1848      * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1849      * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1850      * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1851      * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1852      * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1853      * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1854      * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1855      * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1856      * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1857      * evmcs->page_fault_error_code_mask =
1858      *      vmcs12->page_fault_error_code_mask;
1859      * evmcs->page_fault_error_code_match =
1860      *      vmcs12->page_fault_error_code_match;
1861      * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1862      * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1863      * evmcs->tsc_offset = vmcs12->tsc_offset;
1864      * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1865      * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1866      * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1867      * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1868      * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1869      * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1870      * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1871      * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1872      *
1873      * Not present in struct vmcs12:
1874      * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1875      * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1876      * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1877      * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1878      */
1879 
1880     evmcs->guest_es_selector = vmcs12->guest_es_selector;
1881     evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
1882     evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
1883     evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
1884     evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
1885     evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
1886     evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
1887     evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
1888 
1889     evmcs->guest_es_limit = vmcs12->guest_es_limit;
1890     evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
1891     evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
1892     evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
1893     evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
1894     evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
1895     evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
1896     evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
1897     evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
1898     evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
1899 
1900     evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
1901     evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
1902     evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
1903     evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
1904     evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
1905     evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
1906     evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
1907     evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
1908 
1909     evmcs->guest_es_base = vmcs12->guest_es_base;
1910     evmcs->guest_cs_base = vmcs12->guest_cs_base;
1911     evmcs->guest_ss_base = vmcs12->guest_ss_base;
1912     evmcs->guest_ds_base = vmcs12->guest_ds_base;
1913     evmcs->guest_fs_base = vmcs12->guest_fs_base;
1914     evmcs->guest_gs_base = vmcs12->guest_gs_base;
1915     evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
1916     evmcs->guest_tr_base = vmcs12->guest_tr_base;
1917     evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
1918     evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
1919 
1920     evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
1921     evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
1922 
1923     evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
1924     evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
1925     evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
1926     evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
1927 
1928     evmcs->guest_pending_dbg_exceptions =
1929         vmcs12->guest_pending_dbg_exceptions;
1930     evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
1931     evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
1932 
1933     evmcs->guest_activity_state = vmcs12->guest_activity_state;
1934     evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
1935 
1936     evmcs->guest_cr0 = vmcs12->guest_cr0;
1937     evmcs->guest_cr3 = vmcs12->guest_cr3;
1938     evmcs->guest_cr4 = vmcs12->guest_cr4;
1939     evmcs->guest_dr7 = vmcs12->guest_dr7;
1940 
1941     evmcs->guest_physical_address = vmcs12->guest_physical_address;
1942 
1943     evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
1944     evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
1945     evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
1946     evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
1947     evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
1948     evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
1949     evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
1950     evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
1951 
1952     evmcs->exit_qualification = vmcs12->exit_qualification;
1953 
1954     evmcs->guest_linear_address = vmcs12->guest_linear_address;
1955     evmcs->guest_rsp = vmcs12->guest_rsp;
1956     evmcs->guest_rflags = vmcs12->guest_rflags;
1957 
1958     evmcs->guest_interruptibility_info =
1959         vmcs12->guest_interruptibility_info;
1960     evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
1961     evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
1962     evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
1963     evmcs->vm_entry_exception_error_code =
1964         vmcs12->vm_entry_exception_error_code;
1965     evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
1966 
1967     evmcs->guest_rip = vmcs12->guest_rip;
1968 
1969     evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
1970 
1971     return;
1972 }
1973 
1974 /*
1975  * This is an equivalent of the nested hypervisor executing the vmptrld
1976  * instruction.
1977  */
1978 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
1979     struct kvm_vcpu *vcpu, bool from_launch)
1980 {
1981     struct vcpu_vmx *vmx = to_vmx(vcpu);
1982     bool evmcs_gpa_changed = false;
1983     u64 evmcs_gpa;
1984 
1985     if (likely(!vmx->nested.enlightened_vmcs_enabled))
1986         return EVMPTRLD_DISABLED;
1987 
1988     if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
1989         nested_release_evmcs(vcpu);
1990         return EVMPTRLD_DISABLED;
1991     }
1992 
1993     if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
1994         vmx->nested.current_vmptr = INVALID_GPA;
1995 
1996         nested_release_evmcs(vcpu);
1997 
1998         if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
1999                  &vmx->nested.hv_evmcs_map))
2000             return EVMPTRLD_ERROR;
2001 
2002         vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
2003 
2004         /*
2005          * Currently, KVM only supports eVMCS version 1
2006          * (== KVM_EVMCS_VERSION) and thus we expect guest to set this
2007          * value to first u32 field of eVMCS which should specify eVMCS
2008          * VersionNumber.
2009          *
2010          * Guest should be aware of supported eVMCS versions by host by
2011          * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is
2012          * expected to set this CPUID leaf according to the value
2013          * returned in vmcs_version from nested_enable_evmcs().
2014          *
2015          * However, it turns out that Microsoft Hyper-V fails to comply
2016          * to their own invented interface: When Hyper-V use eVMCS, it
2017          * just sets first u32 field of eVMCS to revision_id specified
2018          * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number
2019          * which is one of the supported versions specified in
2020          * CPUID.0x4000000A.EAX[0:15].
2021          *
2022          * To overcome Hyper-V bug, we accept here either a supported
2023          * eVMCS version or VMCS12 revision_id as valid values for first
2024          * u32 field of eVMCS.
2025          */
2026         if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
2027             (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
2028             nested_release_evmcs(vcpu);
2029             return EVMPTRLD_VMFAIL;
2030         }
2031 
2032         vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
2033 
2034         evmcs_gpa_changed = true;
2035         /*
2036          * Unlike normal vmcs12, enlightened vmcs12 is not fully
2037          * reloaded from guest's memory (read only fields, fields not
2038          * present in struct hv_enlightened_vmcs, ...). Make sure there
2039          * are no leftovers.
2040          */
2041         if (from_launch) {
2042             struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2043             memset(vmcs12, 0, sizeof(*vmcs12));
2044             vmcs12->hdr.revision_id = VMCS12_REVISION;
2045         }
2046 
2047     }
2048 
2049     /*
2050      * Clean fields data can't be used on VMLAUNCH and when we switch
2051      * between different L2 guests as KVM keeps a single VMCS12 per L1.
2052      */
2053     if (from_launch || evmcs_gpa_changed) {
2054         vmx->nested.hv_evmcs->hv_clean_fields &=
2055             ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2056 
2057         vmx->nested.force_msr_bitmap_recalc = true;
2058     }
2059 
2060     return EVMPTRLD_SUCCEEDED;
2061 }
2062 
2063 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
2064 {
2065     struct vcpu_vmx *vmx = to_vmx(vcpu);
2066 
2067     if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
2068         copy_vmcs12_to_enlightened(vmx);
2069     else
2070         copy_vmcs12_to_shadow(vmx);
2071 
2072     vmx->nested.need_vmcs12_to_shadow_sync = false;
2073 }
2074 
2075 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
2076 {
2077     struct vcpu_vmx *vmx =
2078         container_of(timer, struct vcpu_vmx, nested.preemption_timer);
2079 
2080     vmx->nested.preemption_timer_expired = true;
2081     kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
2082     kvm_vcpu_kick(&vmx->vcpu);
2083 
2084     return HRTIMER_NORESTART;
2085 }
2086 
2087 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
2088 {
2089     struct vcpu_vmx *vmx = to_vmx(vcpu);
2090     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2091 
2092     u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
2093                 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2094 
2095     if (!vmx->nested.has_preemption_timer_deadline) {
2096         vmx->nested.preemption_timer_deadline =
2097             vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
2098         vmx->nested.has_preemption_timer_deadline = true;
2099     }
2100     return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
2101 }
2102 
2103 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
2104                     u64 preemption_timeout)
2105 {
2106     struct vcpu_vmx *vmx = to_vmx(vcpu);
2107 
2108     /*
2109      * A timer value of zero is architecturally guaranteed to cause
2110      * a VMExit prior to executing any instructions in the guest.
2111      */
2112     if (preemption_timeout == 0) {
2113         vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
2114         return;
2115     }
2116 
2117     if (vcpu->arch.virtual_tsc_khz == 0)
2118         return;
2119 
2120     preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2121     preemption_timeout *= 1000000;
2122     do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
2123     hrtimer_start(&vmx->nested.preemption_timer,
2124               ktime_add_ns(ktime_get(), preemption_timeout),
2125               HRTIMER_MODE_ABS_PINNED);
2126 }
2127 
2128 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2129 {
2130     if (vmx->nested.nested_run_pending &&
2131         (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
2132         return vmcs12->guest_ia32_efer;
2133     else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
2134         return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
2135     else
2136         return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
2137 }
2138 
2139 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
2140 {
2141     struct kvm *kvm = vmx->vcpu.kvm;
2142 
2143     /*
2144      * If vmcs02 hasn't been initialized, set the constant vmcs02 state
2145      * according to L0's settings (vmcs12 is irrelevant here).  Host
2146      * fields that come from L0 and are not constant, e.g. HOST_CR3,
2147      * will be set as needed prior to VMLAUNCH/VMRESUME.
2148      */
2149     if (vmx->nested.vmcs02_initialized)
2150         return;
2151     vmx->nested.vmcs02_initialized = true;
2152 
2153     /*
2154      * We don't care what the EPTP value is we just need to guarantee
2155      * it's valid so we don't get a false positive when doing early
2156      * consistency checks.
2157      */
2158     if (enable_ept && nested_early_check)
2159         vmcs_write64(EPT_POINTER,
2160                  construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
2161 
2162     /* All VMFUNCs are currently emulated through L0 vmexits.  */
2163     if (cpu_has_vmx_vmfunc())
2164         vmcs_write64(VM_FUNCTION_CONTROL, 0);
2165 
2166     if (cpu_has_vmx_posted_intr())
2167         vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
2168 
2169     if (cpu_has_vmx_msr_bitmap())
2170         vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2171 
2172     /*
2173      * PML is emulated for L2, but never enabled in hardware as the MMU
2174      * handles A/D emulation.  Disabling PML for L2 also avoids having to
2175      * deal with filtering out L2 GPAs from the buffer.
2176      */
2177     if (enable_pml) {
2178         vmcs_write64(PML_ADDRESS, 0);
2179         vmcs_write16(GUEST_PML_INDEX, -1);
2180     }
2181 
2182     if (cpu_has_vmx_encls_vmexit())
2183         vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);
2184 
2185     if (kvm_notify_vmexit_enabled(kvm))
2186         vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
2187 
2188     /*
2189      * Set the MSR load/store lists to match L0's settings.  Only the
2190      * addresses are constant (for vmcs02), the counts can change based
2191      * on L2's behavior, e.g. switching to/from long mode.
2192      */
2193     vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
2194     vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2195     vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
2196 
2197     vmx_set_constant_host_state(vmx);
2198 }
2199 
2200 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
2201                       struct vmcs12 *vmcs12)
2202 {
2203     prepare_vmcs02_constant_state(vmx);
2204 
2205     vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
2206 
2207     if (enable_vpid) {
2208         if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
2209             vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
2210         else
2211             vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2212     }
2213 }
2214 
2215 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
2216                  struct vmcs12 *vmcs12)
2217 {
2218     u32 exec_control;
2219     u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
2220 
2221     if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
2222         prepare_vmcs02_early_rare(vmx, vmcs12);
2223 
2224     /*
2225      * PIN CONTROLS
2226      */
2227     exec_control = __pin_controls_get(vmcs01);
2228     exec_control |= (vmcs12->pin_based_vm_exec_control &
2229              ~PIN_BASED_VMX_PREEMPTION_TIMER);
2230 
2231     /* Posted interrupts setting is only taken from vmcs12.  */
2232     vmx->nested.pi_pending = false;
2233     if (nested_cpu_has_posted_intr(vmcs12))
2234         vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
2235     else
2236         exec_control &= ~PIN_BASED_POSTED_INTR;
2237     pin_controls_set(vmx, exec_control);
2238 
2239     /*
2240      * EXEC CONTROLS
2241      */
2242     exec_control = __exec_controls_get(vmcs01); /* L0's desires */
2243     exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
2244     exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
2245     exec_control &= ~CPU_BASED_TPR_SHADOW;
2246     exec_control |= vmcs12->cpu_based_vm_exec_control;
2247 
2248     vmx->nested.l1_tpr_threshold = -1;
2249     if (exec_control & CPU_BASED_TPR_SHADOW)
2250         vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
2251 #ifdef CONFIG_X86_64
2252     else
2253         exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2254                 CPU_BASED_CR8_STORE_EXITING;
2255 #endif
2256 
2257     /*
2258      * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2259      * for I/O port accesses.
2260      */
2261     exec_control |= CPU_BASED_UNCOND_IO_EXITING;
2262     exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2263 
2264     /*
2265      * This bit will be computed in nested_get_vmcs12_pages, because
2266      * we do not have access to L1's MSR bitmap yet.  For now, keep
2267      * the same bit as before, hoping to avoid multiple VMWRITEs that
2268      * only set/clear this bit.
2269      */
2270     exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
2271     exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
2272 
2273     exec_controls_set(vmx, exec_control);
2274 
2275     /*
2276      * SECONDARY EXEC CONTROLS
2277      */
2278     if (cpu_has_secondary_exec_ctrls()) {
2279         exec_control = __secondary_exec_controls_get(vmcs01);
2280 
2281         /* Take the following fields only from vmcs12 */
2282         exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2283                   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2284                   SECONDARY_EXEC_ENABLE_INVPCID |
2285                   SECONDARY_EXEC_ENABLE_RDTSCP |
2286                   SECONDARY_EXEC_XSAVES |
2287                   SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
2288                   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2289                   SECONDARY_EXEC_APIC_REGISTER_VIRT |
2290                   SECONDARY_EXEC_ENABLE_VMFUNC |
2291                   SECONDARY_EXEC_DESC);
2292 
2293         if (nested_cpu_has(vmcs12,
2294                    CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
2295             exec_control |= vmcs12->secondary_vm_exec_control;
2296 
2297         /* PML is emulated and never enabled in hardware for L2. */
2298         exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
2299 
2300         /* VMCS shadowing for L2 is emulated for now */
2301         exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2302 
2303         /*
2304          * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
2305          * will not have to rewrite the controls just for this bit.
2306          */
2307         if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
2308             (vmcs12->guest_cr4 & X86_CR4_UMIP))
2309             exec_control |= SECONDARY_EXEC_DESC;
2310 
2311         if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2312             vmcs_write16(GUEST_INTR_STATUS,
2313                 vmcs12->guest_intr_status);
2314 
2315         if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
2316             exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2317 
2318         if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
2319             vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
2320 
2321         secondary_exec_controls_set(vmx, exec_control);
2322     }
2323 
2324     /*
2325      * ENTRY CONTROLS
2326      *
2327      * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2328      * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2329      * on the related bits (if supported by the CPU) in the hope that
2330      * we can avoid VMWrites during vmx_set_efer().
2331      */
2332     exec_control = __vm_entry_controls_get(vmcs01);
2333     exec_control |= vmcs12->vm_entry_controls;
2334     exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
2335     if (cpu_has_load_ia32_efer()) {
2336         if (guest_efer & EFER_LMA)
2337             exec_control |= VM_ENTRY_IA32E_MODE;
2338         if (guest_efer != host_efer)
2339             exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2340     }
2341     vm_entry_controls_set(vmx, exec_control);
2342 
2343     /*
2344      * EXIT CONTROLS
2345      *
2346      * L2->L1 exit controls are emulated - the hardware exit is to L0 so
2347      * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2348      * bits may be modified by vmx_set_efer() in prepare_vmcs02().
2349      */
2350     exec_control = __vm_exit_controls_get(vmcs01);
2351     if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
2352         exec_control |= VM_EXIT_LOAD_IA32_EFER;
2353     else
2354         exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
2355     vm_exit_controls_set(vmx, exec_control);
2356 
2357     /*
2358      * Interrupt/Exception Fields
2359      */
2360     if (vmx->nested.nested_run_pending) {
2361         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2362                  vmcs12->vm_entry_intr_info_field);
2363         vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2364                  vmcs12->vm_entry_exception_error_code);
2365         vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2366                  vmcs12->vm_entry_instruction_len);
2367         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2368                  vmcs12->guest_interruptibility_info);
2369         vmx->loaded_vmcs->nmi_known_unmasked =
2370             !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2371     } else {
2372         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2373     }
2374 }
2375 
2376 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2377 {
2378     struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2379 
2380     if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2381                HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2382         vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2383         vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2384         vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2385         vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2386         vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2387         vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2388         vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2389         vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2390         vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2391         vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2392         vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2393         vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2394         vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2395         vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2396         vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2397         vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2398         vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2399         vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
2400         vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2401         vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
2402         vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2403         vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2404         vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2405         vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2406         vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2407         vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2408         vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2409         vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2410         vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2411         vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2412         vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2413         vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2414         vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2415         vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2416         vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2417         vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
2418 
2419         vmx->segment_cache.bitmask = 0;
2420     }
2421 
2422     if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2423                HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2424         vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2425         vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2426                 vmcs12->guest_pending_dbg_exceptions);
2427         vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2428         vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2429 
2430         /*
2431          * L1 may access the L2's PDPTR, so save them to construct
2432          * vmcs12
2433          */
2434         if (enable_ept) {
2435             vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2436             vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2437             vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2438             vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2439         }
2440 
2441         if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2442             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2443             vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2444     }
2445 
2446     if (nested_cpu_has_xsaves(vmcs12))
2447         vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2448 
2449     /*
2450      * Whether page-faults are trapped is determined by a combination of
2451      * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.  If L0
2452      * doesn't care about page faults then we should set all of these to
2453      * L1's desires. However, if L0 does care about (some) page faults, it
2454      * is not easy (if at all possible?) to merge L0 and L1's desires, we
2455      * simply ask to exit on each and every L2 page fault. This is done by
2456      * setting MASK=MATCH=0 and (see below) EB.PF=1.
2457      * Note that below we don't need special code to set EB.PF beyond the
2458      * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
2459      * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
2460      * !enable_ept, EB.PF is 1, so the "or" will always be 1.
2461      */
2462     if (vmx_need_pf_intercept(&vmx->vcpu)) {
2463         /*
2464          * TODO: if both L0 and L1 need the same MASK and MATCH,
2465          * go ahead and use it?
2466          */
2467         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
2468         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
2469     } else {
2470         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
2471         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
2472     }
2473 
2474     if (cpu_has_vmx_apicv()) {
2475         vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2476         vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2477         vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2478         vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2479     }
2480 
2481     /*
2482      * Make sure the msr_autostore list is up to date before we set the
2483      * count in the vmcs02.
2484      */
2485     prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
2486 
2487     vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
2488     vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2489     vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2490 
2491     set_cr4_guest_host_mask(vmx);
2492 }
2493 
2494 /*
2495  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2496  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2497  * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2498  * guest in a way that will both be appropriate to L1's requests, and our
2499  * needs. In addition to modifying the active vmcs (which is vmcs02), this
2500  * function also has additional necessary side-effects, like setting various
2501  * vcpu->arch fields.
2502  * Returns 0 on success, 1 on failure. Invalid state exit qualification code
2503  * is assigned to entry_failure_code on failure.
2504  */
2505 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2506               bool from_vmentry,
2507               enum vm_entry_failure_code *entry_failure_code)
2508 {
2509     struct vcpu_vmx *vmx = to_vmx(vcpu);
2510     bool load_guest_pdptrs_vmcs12 = false;
2511 
2512     if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
2513         prepare_vmcs02_rare(vmx, vmcs12);
2514         vmx->nested.dirty_vmcs12 = false;
2515 
2516         load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) ||
2517             !(vmx->nested.hv_evmcs->hv_clean_fields &
2518               HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
2519     }
2520 
2521     if (vmx->nested.nested_run_pending &&
2522         (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2523         kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2524         vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2525     } else {
2526         kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2527         vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
2528     }
2529     if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2530         !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2531         vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
2532     vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2533 
2534     /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2535      * bitwise-or of what L1 wants to trap for L2, and what we want to
2536      * trap. Note that CR0.TS also needs updating - we do this later.
2537      */
2538     vmx_update_exception_bitmap(vcpu);
2539     vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2540     vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2541 
2542     if (vmx->nested.nested_run_pending &&
2543         (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2544         vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2545         vcpu->arch.pat = vmcs12->guest_ia32_pat;
2546     } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2547         vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2548     }
2549 
2550     vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
2551             vcpu->arch.l1_tsc_offset,
2552             vmx_get_l2_tsc_offset(vcpu),
2553             vmx_get_l2_tsc_multiplier(vcpu));
2554 
2555     vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
2556             vcpu->arch.l1_tsc_scaling_ratio,
2557             vmx_get_l2_tsc_multiplier(vcpu));
2558 
2559     vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2560     if (kvm_caps.has_tsc_control)
2561         vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
2562 
2563     nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
2564 
2565     if (nested_cpu_has_ept(vmcs12))
2566         nested_ept_init_mmu_context(vcpu);
2567 
2568     /*
2569      * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
2570      * bits which we consider mandatory enabled.
2571      * The CR0_READ_SHADOW is what L2 should have expected to read given
2572      * the specifications by L1; It's not enough to take
2573      * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
2574      * have more bits than L1 expected.
2575      */
2576     vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2577     vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2578 
2579     vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2580     vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2581 
2582     vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2583     /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2584     vmx_set_efer(vcpu, vcpu->arch.efer);
2585 
2586     /*
2587      * Guest state is invalid and unrestricted guest is disabled,
2588      * which means L1 attempted VMEntry to L2 with invalid state.
2589      * Fail the VMEntry.
2590      *
2591      * However when force loading the guest state (SMM exit or
2592      * loading nested state after migration, it is possible to
2593      * have invalid guest state now, which will be later fixed by
2594      * restoring L2 register state
2595      */
2596     if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
2597         *entry_failure_code = ENTRY_FAIL_DEFAULT;
2598         return -EINVAL;
2599     }
2600 
2601     /* Shadow page tables on either EPT or shadow page tables. */
2602     if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2603                 from_vmentry, entry_failure_code))
2604         return -EINVAL;
2605 
2606     /*
2607      * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
2608      * on nested VM-Exit, which can occur without actually running L2 and
2609      * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
2610      * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
2611      * transition to HLT instead of running L2.
2612      */
2613     if (enable_ept)
2614         vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2615 
2616     /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2617     if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2618         is_pae_paging(vcpu)) {
2619         vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2620         vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2621         vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2622         vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2623     }
2624 
2625     if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2626         intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
2627         WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
2628                      vmcs12->guest_ia32_perf_global_ctrl))) {
2629         *entry_failure_code = ENTRY_FAIL_DEFAULT;
2630         return -EINVAL;
2631     }
2632 
2633     kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2634     kvm_rip_write(vcpu, vmcs12->guest_rip);
2635 
2636     /*
2637      * It was observed that genuine Hyper-V running in L1 doesn't reset
2638      * 'hv_clean_fields' by itself, it only sets the corresponding dirty
2639      * bits when it changes a field in eVMCS. Mark all fields as clean
2640      * here.
2641      */
2642     if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
2643         vmx->nested.hv_evmcs->hv_clean_fields |=
2644             HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2645 
2646     return 0;
2647 }
2648 
2649 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2650 {
2651     if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
2652            nested_cpu_has_virtual_nmis(vmcs12)))
2653         return -EINVAL;
2654 
2655     if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
2656            nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
2657         return -EINVAL;
2658 
2659     return 0;
2660 }
2661 
2662 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
2663 {
2664     struct vcpu_vmx *vmx = to_vmx(vcpu);
2665 
2666     /* Check for memory type validity */
2667     switch (new_eptp & VMX_EPTP_MT_MASK) {
2668     case VMX_EPTP_MT_UC:
2669         if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
2670             return false;
2671         break;
2672     case VMX_EPTP_MT_WB:
2673         if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
2674             return false;
2675         break;
2676     default:
2677         return false;
2678     }
2679 
2680     /* Page-walk levels validity. */
2681     switch (new_eptp & VMX_EPTP_PWL_MASK) {
2682     case VMX_EPTP_PWL_5:
2683         if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
2684             return false;
2685         break;
2686     case VMX_EPTP_PWL_4:
2687         if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
2688             return false;
2689         break;
2690     default:
2691         return false;
2692     }
2693 
2694     /* Reserved bits should not be set */
2695     if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
2696         return false;
2697 
2698     /* AD, if set, should be supported */
2699     if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
2700         if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
2701             return false;
2702     }
2703 
2704     return true;
2705 }
2706 
2707 /*
2708  * Checks related to VM-Execution Control Fields
2709  */
2710 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2711                                               struct vmcs12 *vmcs12)
2712 {
2713     struct vcpu_vmx *vmx = to_vmx(vcpu);
2714 
2715     if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2716                    vmx->nested.msrs.pinbased_ctls_low,
2717                    vmx->nested.msrs.pinbased_ctls_high)) ||
2718         CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2719                    vmx->nested.msrs.procbased_ctls_low,
2720                    vmx->nested.msrs.procbased_ctls_high)))
2721         return -EINVAL;
2722 
2723     if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2724         CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
2725                    vmx->nested.msrs.secondary_ctls_low,
2726                    vmx->nested.msrs.secondary_ctls_high)))
2727         return -EINVAL;
2728 
2729     if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
2730         nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2731         nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2732         nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2733         nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2734         nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2735         nested_vmx_check_nmi_controls(vmcs12) ||
2736         nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2737         nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2738         nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2739         nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
2740         CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
2741         return -EINVAL;
2742 
2743     if (!nested_cpu_has_preemption_timer(vmcs12) &&
2744         nested_cpu_has_save_preemption_timer(vmcs12))
2745         return -EINVAL;
2746 
2747     if (nested_cpu_has_ept(vmcs12) &&
2748         CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
2749         return -EINVAL;
2750 
2751     if (nested_cpu_has_vmfunc(vmcs12)) {
2752         if (CC(vmcs12->vm_function_control &
2753                ~vmx->nested.msrs.vmfunc_controls))
2754             return -EINVAL;
2755 
2756         if (nested_cpu_has_eptp_switching(vmcs12)) {
2757             if (CC(!nested_cpu_has_ept(vmcs12)) ||
2758                 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
2759                 return -EINVAL;
2760         }
2761     }
2762 
2763     return 0;
2764 }
2765 
2766 /*
2767  * Checks related to VM-Exit Control Fields
2768  */
2769 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2770                                          struct vmcs12 *vmcs12)
2771 {
2772     struct vcpu_vmx *vmx = to_vmx(vcpu);
2773 
2774     if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
2775                     vmx->nested.msrs.exit_ctls_low,
2776                     vmx->nested.msrs.exit_ctls_high)) ||
2777         CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
2778         return -EINVAL;
2779 
2780     return 0;
2781 }
2782 
2783 /*
2784  * Checks related to VM-Entry Control Fields
2785  */
2786 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2787                       struct vmcs12 *vmcs12)
2788 {
2789     struct vcpu_vmx *vmx = to_vmx(vcpu);
2790 
2791     if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
2792                     vmx->nested.msrs.entry_ctls_low,
2793                     vmx->nested.msrs.entry_ctls_high)))
2794         return -EINVAL;
2795 
2796     /*
2797      * From the Intel SDM, volume 3:
2798      * Fields relevant to VM-entry event injection must be set properly.
2799      * These fields are the VM-entry interruption-information field, the
2800      * VM-entry exception error code, and the VM-entry instruction length.
2801      */
2802     if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2803         u32 intr_info = vmcs12->vm_entry_intr_info_field;
2804         u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2805         u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2806         bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2807         bool should_have_error_code;
2808         bool urg = nested_cpu_has2(vmcs12,
2809                        SECONDARY_EXEC_UNRESTRICTED_GUEST);
2810         bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2811 
2812         /* VM-entry interruption-info field: interruption type */
2813         if (CC(intr_type == INTR_TYPE_RESERVED) ||
2814             CC(intr_type == INTR_TYPE_OTHER_EVENT &&
2815                !nested_cpu_supports_monitor_trap_flag(vcpu)))
2816             return -EINVAL;
2817 
2818         /* VM-entry interruption-info field: vector */
2819         if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2820             CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2821             CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
2822             return -EINVAL;
2823 
2824         /* VM-entry interruption-info field: deliver error code */
2825         should_have_error_code =
2826             intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2827             x86_exception_has_error_code(vector);
2828         if (CC(has_error_code != should_have_error_code))
2829             return -EINVAL;
2830 
2831         /* VM-entry exception error code */
2832         if (CC(has_error_code &&
2833                vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
2834             return -EINVAL;
2835 
2836         /* VM-entry interruption-info field: reserved bits */
2837         if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
2838             return -EINVAL;
2839 
2840         /* VM-entry instruction length */
2841         switch (intr_type) {
2842         case INTR_TYPE_SOFT_EXCEPTION:
2843         case INTR_TYPE_SOFT_INTR:
2844         case INTR_TYPE_PRIV_SW_EXCEPTION:
2845             if (CC(vmcs12->vm_entry_instruction_len > 15) ||
2846                 CC(vmcs12->vm_entry_instruction_len == 0 &&
2847                 CC(!nested_cpu_has_zero_length_injection(vcpu))))
2848                 return -EINVAL;
2849         }
2850     }
2851 
2852     if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2853         return -EINVAL;
2854 
2855     return 0;
2856 }
2857 
2858 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
2859                      struct vmcs12 *vmcs12)
2860 {
2861     if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2862         nested_check_vm_exit_controls(vcpu, vmcs12) ||
2863         nested_check_vm_entry_controls(vcpu, vmcs12))
2864         return -EINVAL;
2865 
2866     if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled)
2867         return nested_evmcs_check_controls(vmcs12);
2868 
2869     return 0;
2870 }
2871 
2872 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
2873                        struct vmcs12 *vmcs12)
2874 {
2875 #ifdef CONFIG_X86_64
2876     if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
2877         !!(vcpu->arch.efer & EFER_LMA)))
2878         return -EINVAL;
2879 #endif
2880     return 0;
2881 }
2882 
2883 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
2884                        struct vmcs12 *vmcs12)
2885 {
2886     bool ia32e;
2887 
2888     if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
2889         CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
2890         CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3)))
2891         return -EINVAL;
2892 
2893     if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
2894         CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
2895         return -EINVAL;
2896 
2897     if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
2898         CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
2899         return -EINVAL;
2900 
2901     if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2902         CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2903                        vmcs12->host_ia32_perf_global_ctrl)))
2904         return -EINVAL;
2905 
2906 #ifdef CONFIG_X86_64
2907     ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
2908 #else
2909     ia32e = false;
2910 #endif
2911 
2912     if (ia32e) {
2913         if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
2914             return -EINVAL;
2915     } else {
2916         if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
2917             CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
2918             CC((vmcs12->host_rip) >> 32))
2919             return -EINVAL;
2920     }
2921 
2922     if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2923         CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2924         CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2925         CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2926         CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2927         CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2928         CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2929         CC(vmcs12->host_cs_selector == 0) ||
2930         CC(vmcs12->host_tr_selector == 0) ||
2931         CC(vmcs12->host_ss_selector == 0 && !ia32e))
2932         return -EINVAL;
2933 
2934     if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
2935         CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
2936         CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
2937         CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
2938         CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
2939         CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
2940         return -EINVAL;
2941 
2942     /*
2943      * If the load IA32_EFER VM-exit control is 1, bits reserved in the
2944      * IA32_EFER MSR must be 0 in the field for that register. In addition,
2945      * the values of the LMA and LME bits in the field must each be that of
2946      * the host address-space size VM-exit control.
2947      */
2948     if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
2949         if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
2950             CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
2951             CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
2952             return -EINVAL;
2953     }
2954 
2955     return 0;
2956 }
2957 
2958 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
2959                       struct vmcs12 *vmcs12)
2960 {
2961     struct vcpu_vmx *vmx = to_vmx(vcpu);
2962     struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
2963     struct vmcs_hdr hdr;
2964 
2965     if (vmcs12->vmcs_link_pointer == INVALID_GPA)
2966         return 0;
2967 
2968     if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
2969         return -EINVAL;
2970 
2971     if (ghc->gpa != vmcs12->vmcs_link_pointer &&
2972         CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
2973                      vmcs12->vmcs_link_pointer, VMCS12_SIZE)))
2974                 return -EINVAL;
2975 
2976     if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
2977                         offsetof(struct vmcs12, hdr),
2978                         sizeof(hdr))))
2979         return -EINVAL;
2980 
2981     if (CC(hdr.revision_id != VMCS12_REVISION) ||
2982         CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
2983         return -EINVAL;
2984 
2985     return 0;
2986 }
2987 
2988 /*
2989  * Checks related to Guest Non-register State
2990  */
2991 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2992 {
2993     if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
2994            vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
2995            vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
2996         return -EINVAL;
2997 
2998     return 0;
2999 }
3000 
3001 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
3002                     struct vmcs12 *vmcs12,
3003                     enum vm_entry_failure_code *entry_failure_code)
3004 {
3005     bool ia32e;
3006 
3007     *entry_failure_code = ENTRY_FAIL_DEFAULT;
3008 
3009     if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
3010         CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
3011         return -EINVAL;
3012 
3013     if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
3014         CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
3015         return -EINVAL;
3016 
3017     if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
3018         CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
3019         return -EINVAL;
3020 
3021     if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
3022         *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
3023         return -EINVAL;
3024     }
3025 
3026     if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
3027         CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
3028                        vmcs12->guest_ia32_perf_global_ctrl)))
3029         return -EINVAL;
3030 
3031     /*
3032      * If the load IA32_EFER VM-entry control is 1, the following checks
3033      * are performed on the field for the IA32_EFER MSR:
3034      * - Bits reserved in the IA32_EFER MSR must be 0.
3035      * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
3036      *   the IA-32e mode guest VM-exit control. It must also be identical
3037      *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
3038      *   CR0.PG) is 1.
3039      */
3040     if (to_vmx(vcpu)->nested.nested_run_pending &&
3041         (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
3042         ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
3043         if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
3044             CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
3045             CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
3046              ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
3047             return -EINVAL;
3048     }
3049 
3050     if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
3051         (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
3052          CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
3053         return -EINVAL;
3054 
3055     if (nested_check_guest_non_reg_state(vmcs12))
3056         return -EINVAL;
3057 
3058     return 0;
3059 }
3060 
3061 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
3062 {
3063     struct vcpu_vmx *vmx = to_vmx(vcpu);
3064     unsigned long cr3, cr4;
3065     bool vm_fail;
3066 
3067     if (!nested_early_check)
3068         return 0;
3069 
3070     if (vmx->msr_autoload.host.nr)
3071         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
3072     if (vmx->msr_autoload.guest.nr)
3073         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
3074 
3075     preempt_disable();
3076 
3077     vmx_prepare_switch_to_guest(vcpu);
3078 
3079     /*
3080      * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
3081      * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
3082      * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
3083      * there is no need to preserve other bits or save/restore the field.
3084      */
3085     vmcs_writel(GUEST_RFLAGS, 0);
3086 
3087     cr3 = __get_current_cr3_fast();
3088     if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
3089         vmcs_writel(HOST_CR3, cr3);
3090         vmx->loaded_vmcs->host_state.cr3 = cr3;
3091     }
3092 
3093     cr4 = cr4_read_shadow();
3094     if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
3095         vmcs_writel(HOST_CR4, cr4);
3096         vmx->loaded_vmcs->host_state.cr4 = cr4;
3097     }
3098 
3099     vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
3100                  __vmx_vcpu_run_flags(vmx));
3101 
3102     if (vmx->msr_autoload.host.nr)
3103         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
3104     if (vmx->msr_autoload.guest.nr)
3105         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
3106 
3107     if (vm_fail) {
3108         u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
3109 
3110         preempt_enable();
3111 
3112         trace_kvm_nested_vmenter_failed(
3113             "early hardware check VM-instruction error: ", error);
3114         WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3115         return 1;
3116     }
3117 
3118     /*
3119      * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
3120      */
3121     if (hw_breakpoint_active())
3122         set_debugreg(__this_cpu_read(cpu_dr7), 7);
3123     local_irq_enable();
3124     preempt_enable();
3125 
3126     /*
3127      * A non-failing VMEntry means we somehow entered guest mode with
3128      * an illegal RIP, and that's just the tip of the iceberg.  There
3129      * is no telling what memory has been modified or what state has
3130      * been exposed to unknown code.  Hitting this all but guarantees
3131      * a (very critical) hardware issue.
3132      */
3133     WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
3134         VMX_EXIT_REASONS_FAILED_VMENTRY));
3135 
3136     return 0;
3137 }
3138 
3139 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
3140 {
3141     struct vcpu_vmx *vmx = to_vmx(vcpu);
3142 
3143     /*
3144      * hv_evmcs may end up being not mapped after migration (when
3145      * L2 was running), map it here to make sure vmcs12 changes are
3146      * properly reflected.
3147      */
3148     if (vmx->nested.enlightened_vmcs_enabled &&
3149         vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
3150         enum nested_evmptrld_status evmptrld_status =
3151             nested_vmx_handle_enlightened_vmptrld(vcpu, false);
3152 
3153         if (evmptrld_status == EVMPTRLD_VMFAIL ||
3154             evmptrld_status == EVMPTRLD_ERROR)
3155             return false;
3156 
3157         /*
3158          * Post migration VMCS12 always provides the most actual
3159          * information, copy it to eVMCS upon entry.
3160          */
3161         vmx->nested.need_vmcs12_to_shadow_sync = true;
3162     }
3163 
3164     return true;
3165 }
3166 
3167 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
3168 {
3169     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3170     struct vcpu_vmx *vmx = to_vmx(vcpu);
3171     struct kvm_host_map *map;
3172 
3173     if (!vcpu->arch.pdptrs_from_userspace &&
3174         !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
3175         /*
3176          * Reload the guest's PDPTRs since after a migration
3177          * the guest CR3 might be restored prior to setting the nested
3178          * state which can lead to a load of wrong PDPTRs.
3179          */
3180         if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
3181             return false;
3182     }
3183 
3184 
3185     if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3186         map = &vmx->nested.apic_access_page_map;
3187 
3188         if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
3189             vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
3190         } else {
3191             pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
3192                          __func__);
3193             vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3194             vcpu->run->internal.suberror =
3195                 KVM_INTERNAL_ERROR_EMULATION;
3196             vcpu->run->internal.ndata = 0;
3197             return false;
3198         }
3199     }
3200 
3201     if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3202         map = &vmx->nested.virtual_apic_map;
3203 
3204         if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
3205             vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
3206         } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
3207                    nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
3208                !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3209             /*
3210              * The processor will never use the TPR shadow, simply
3211              * clear the bit from the execution control.  Such a
3212              * configuration is useless, but it happens in tests.
3213              * For any other configuration, failing the vm entry is
3214              * _not_ what the processor does but it's basically the
3215              * only possibility we have.
3216              */
3217             exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
3218         } else {
3219             /*
3220              * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
3221              * force VM-Entry to fail.
3222              */
3223             vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
3224         }
3225     }
3226 
3227     if (nested_cpu_has_posted_intr(vmcs12)) {
3228         map = &vmx->nested.pi_desc_map;
3229 
3230         if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
3231             vmx->nested.pi_desc =
3232                 (struct pi_desc *)(((void *)map->hva) +
3233                 offset_in_page(vmcs12->posted_intr_desc_addr));
3234             vmcs_write64(POSTED_INTR_DESC_ADDR,
3235                      pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
3236         } else {
3237             /*
3238              * Defer the KVM_INTERNAL_EXIT until KVM tries to
3239              * access the contents of the VMCS12 posted interrupt
3240              * descriptor. (Note that KVM may do this when it
3241              * should not, per the architectural specification.)
3242              */
3243             vmx->nested.pi_desc = NULL;
3244             pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
3245         }
3246     }
3247     if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
3248         exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3249     else
3250         exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3251 
3252     return true;
3253 }
3254 
3255 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
3256 {
3257     if (!nested_get_evmcs_page(vcpu)) {
3258         pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
3259                      __func__);
3260         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3261         vcpu->run->internal.suberror =
3262             KVM_INTERNAL_ERROR_EMULATION;
3263         vcpu->run->internal.ndata = 0;
3264 
3265         return false;
3266     }
3267 
3268     if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
3269         return false;
3270 
3271     return true;
3272 }
3273 
3274 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
3275 {
3276     struct vmcs12 *vmcs12;
3277     struct vcpu_vmx *vmx = to_vmx(vcpu);
3278     gpa_t dst;
3279 
3280     if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
3281         return 0;
3282 
3283     if (WARN_ON_ONCE(vmx->nested.pml_full))
3284         return 1;
3285 
3286     /*
3287      * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
3288      * set is already checked as part of A/D emulation.
3289      */
3290     vmcs12 = get_vmcs12(vcpu);
3291     if (!nested_cpu_has_pml(vmcs12))
3292         return 0;
3293 
3294     if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
3295         vmx->nested.pml_full = true;
3296         return 1;
3297     }
3298 
3299     gpa &= ~0xFFFull;
3300     dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
3301 
3302     if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
3303                  offset_in_page(dst), sizeof(gpa)))
3304         return 0;
3305 
3306     vmcs12->guest_pml_index--;
3307 
3308     return 0;
3309 }
3310 
3311 /*
3312  * Intel's VMX Instruction Reference specifies a common set of prerequisites
3313  * for running VMX instructions (except VMXON, whose prerequisites are
3314  * slightly different). It also specifies what exception to inject otherwise.
3315  * Note that many of these exceptions have priority over VM exits, so they
3316  * don't have to be checked again here.
3317  */
3318 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
3319 {
3320     if (!to_vmx(vcpu)->nested.vmxon) {
3321         kvm_queue_exception(vcpu, UD_VECTOR);
3322         return 0;
3323     }
3324 
3325     if (vmx_get_cpl(vcpu)) {
3326         kvm_inject_gp(vcpu, 0);
3327         return 0;
3328     }
3329 
3330     return 1;
3331 }
3332 
3333 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
3334 {
3335     u8 rvi = vmx_get_rvi();
3336     u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
3337 
3338     return ((rvi & 0xf0) > (vppr & 0xf0));
3339 }
3340 
3341 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3342                    struct vmcs12 *vmcs12);
3343 
3344 /*
3345  * If from_vmentry is false, this is being called from state restore (either RSM
3346  * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
3347  *
3348  * Returns:
3349  *  NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
3350  *  NVMX_VMENTRY_VMFAIL:  Consistency check VMFail
3351  *  NVMX_VMENTRY_VMEXIT:  Consistency check VMExit
3352  *  NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
3353  */
3354 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
3355                             bool from_vmentry)
3356 {
3357     struct vcpu_vmx *vmx = to_vmx(vcpu);
3358     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3359     enum vm_entry_failure_code entry_failure_code;
3360     bool evaluate_pending_interrupts;
3361     union vmx_exit_reason exit_reason = {
3362         .basic = EXIT_REASON_INVALID_STATE,
3363         .failed_vmentry = 1,
3364     };
3365     u32 failed_index;
3366 
3367     kvm_service_local_tlb_flush_requests(vcpu);
3368 
3369     evaluate_pending_interrupts = exec_controls_get(vmx) &
3370         (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
3371     if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
3372         evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
3373 
3374     if (!vmx->nested.nested_run_pending ||
3375         !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
3376         vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
3377     if (kvm_mpx_supported() &&
3378         (!vmx->nested.nested_run_pending ||
3379          !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
3380         vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3381 
3382     /*
3383      * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3384      * nested early checks are disabled.  In the event of a "late" VM-Fail,
3385      * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3386      * software model to the pre-VMEntry host state.  When EPT is disabled,
3387      * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3388      * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3.  Stuffing
3389      * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3390      * the correct value.  Smashing vmcs01.GUEST_CR3 is safe because nested
3391      * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3392      * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3393      * L1.  Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3394      * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3395      * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3396      * path would need to manually save/restore vmcs01.GUEST_CR3.
3397      */
3398     if (!enable_ept && !nested_early_check)
3399         vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3400 
3401     vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3402 
3403     prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
3404 
3405     if (from_vmentry) {
3406         if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
3407             vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3408             return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
3409         }
3410 
3411         if (nested_vmx_check_vmentry_hw(vcpu)) {
3412             vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3413             return NVMX_VMENTRY_VMFAIL;
3414         }
3415 
3416         if (nested_vmx_check_guest_state(vcpu, vmcs12,
3417                          &entry_failure_code)) {
3418             exit_reason.basic = EXIT_REASON_INVALID_STATE;
3419             vmcs12->exit_qualification = entry_failure_code;
3420             goto vmentry_fail_vmexit;
3421         }
3422     }
3423 
3424     enter_guest_mode(vcpu);
3425 
3426     if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
3427         exit_reason.basic = EXIT_REASON_INVALID_STATE;
3428         vmcs12->exit_qualification = entry_failure_code;
3429         goto vmentry_fail_vmexit_guest_mode;
3430     }
3431 
3432     if (from_vmentry) {
3433         failed_index = nested_vmx_load_msr(vcpu,
3434                            vmcs12->vm_entry_msr_load_addr,
3435                            vmcs12->vm_entry_msr_load_count);
3436         if (failed_index) {
3437             exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
3438             vmcs12->exit_qualification = failed_index;
3439             goto vmentry_fail_vmexit_guest_mode;
3440         }
3441     } else {
3442         /*
3443          * The MMU is not initialized to point at the right entities yet and
3444          * "get pages" would need to read data from the guest (i.e. we will
3445          * need to perform gpa to hpa translation). Request a call
3446          * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
3447          * have already been set at vmentry time and should not be reset.
3448          */
3449         kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
3450     }
3451 
3452     /*
3453      * If L1 had a pending IRQ/NMI until it executed
3454      * VMLAUNCH/VMRESUME which wasn't delivered because it was
3455      * disallowed (e.g. interrupts disabled), L0 needs to
3456      * evaluate if this pending event should cause an exit from L2
3457      * to L1 or delivered directly to L2 (e.g. In case L1 don't
3458      * intercept EXTERNAL_INTERRUPT).
3459      *
3460      * Usually this would be handled by the processor noticing an
3461      * IRQ/NMI window request, or checking RVI during evaluation of
3462      * pending virtual interrupts.  However, this setting was done
3463      * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3464      * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3465      */
3466     if (unlikely(evaluate_pending_interrupts))
3467         kvm_make_request(KVM_REQ_EVENT, vcpu);
3468 
3469     /*
3470      * Do not start the preemption timer hrtimer until after we know
3471      * we are successful, so that only nested_vmx_vmexit needs to cancel
3472      * the timer.
3473      */
3474     vmx->nested.preemption_timer_expired = false;
3475     if (nested_cpu_has_preemption_timer(vmcs12)) {
3476         u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
3477         vmx_start_preemption_timer(vcpu, timer_value);
3478     }
3479 
3480     /*
3481      * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3482      * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3483      * returned as far as L1 is concerned. It will only return (and set
3484      * the success flag) when L2 exits (see nested_vmx_vmexit()).
3485      */
3486     return NVMX_VMENTRY_SUCCESS;
3487 
3488     /*
3489      * A failed consistency check that leads to a VMExit during L1's
3490      * VMEnter to L2 is a variation of a normal VMexit, as explained in
3491      * 26.7 "VM-entry failures during or after loading guest state".
3492      */
3493 vmentry_fail_vmexit_guest_mode:
3494     if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
3495         vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3496     leave_guest_mode(vcpu);
3497 
3498 vmentry_fail_vmexit:
3499     vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3500 
3501     if (!from_vmentry)
3502         return NVMX_VMENTRY_VMEXIT;
3503 
3504     load_vmcs12_host_state(vcpu, vmcs12);
3505     vmcs12->vm_exit_reason = exit_reason.full;
3506     if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
3507         vmx->nested.need_vmcs12_to_shadow_sync = true;
3508     return NVMX_VMENTRY_VMEXIT;
3509 }
3510 
3511 /*
3512  * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3513  * for running an L2 nested guest.
3514  */
3515 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3516 {
3517     struct vmcs12 *vmcs12;
3518     enum nvmx_vmentry_status status;
3519     struct vcpu_vmx *vmx = to_vmx(vcpu);
3520     u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
3521     enum nested_evmptrld_status evmptrld_status;
3522 
3523     if (!nested_vmx_check_permission(vcpu))
3524         return 1;
3525 
3526     evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
3527     if (evmptrld_status == EVMPTRLD_ERROR) {
3528         kvm_queue_exception(vcpu, UD_VECTOR);
3529         return 1;
3530     }
3531 
3532     kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
3533 
3534     if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
3535         return nested_vmx_failInvalid(vcpu);
3536 
3537     if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) &&
3538            vmx->nested.current_vmptr == INVALID_GPA))
3539         return nested_vmx_failInvalid(vcpu);
3540 
3541     vmcs12 = get_vmcs12(vcpu);
3542 
3543     /*
3544      * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3545      * that there *is* a valid VMCS pointer, RFLAGS.CF is set
3546      * rather than RFLAGS.ZF, and no error number is stored to the
3547      * VM-instruction error field.
3548      */
3549     if (CC(vmcs12->hdr.shadow_vmcs))
3550         return nested_vmx_failInvalid(vcpu);
3551 
3552     if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
3553         copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields);
3554         /* Enlightened VMCS doesn't have launch state */
3555         vmcs12->launch_state = !launch;
3556     } else if (enable_shadow_vmcs) {
3557         copy_shadow_to_vmcs12(vmx);
3558     }
3559 
3560     /*
3561      * The nested entry process starts with enforcing various prerequisites
3562      * on vmcs12 as required by the Intel SDM, and act appropriately when
3563      * they fail: As the SDM explains, some conditions should cause the
3564      * instruction to fail, while others will cause the instruction to seem
3565      * to succeed, but return an EXIT_REASON_INVALID_STATE.
3566      * To speed up the normal (success) code path, we should avoid checking
3567      * for misconfigurations which will anyway be caught by the processor
3568      * when using the merged vmcs02.
3569      */
3570     if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
3571         return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3572 
3573     if (CC(vmcs12->launch_state == launch))
3574         return nested_vmx_fail(vcpu,
3575             launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3576                    : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3577 
3578     if (nested_vmx_check_controls(vcpu, vmcs12))
3579         return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3580 
3581     if (nested_vmx_check_address_space_size(vcpu, vmcs12))
3582         return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3583 
3584     if (nested_vmx_check_host_state(vcpu, vmcs12))
3585         return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3586 
3587     /*
3588      * We're finally done with prerequisite checking, and can start with
3589      * the nested entry.
3590      */
3591     vmx->nested.nested_run_pending = 1;
3592     vmx->nested.has_preemption_timer_deadline = false;
3593     status = nested_vmx_enter_non_root_mode(vcpu, true);
3594     if (unlikely(status != NVMX_VMENTRY_SUCCESS))
3595         goto vmentry_failed;
3596 
3597     /* Emulate processing of posted interrupts on VM-Enter. */
3598     if (nested_cpu_has_posted_intr(vmcs12) &&
3599         kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
3600         vmx->nested.pi_pending = true;
3601         kvm_make_request(KVM_REQ_EVENT, vcpu);
3602         kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
3603     }
3604 
3605     /* Hide L1D cache contents from the nested guest.  */
3606     vmx->vcpu.arch.l1tf_flush_l1d = true;
3607 
3608     /*
3609      * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3610      * also be used as part of restoring nVMX state for
3611      * snapshot restore (migration).
3612      *
3613      * In this flow, it is assumed that vmcs12 cache was
3614      * transferred as part of captured nVMX state and should
3615      * therefore not be read from guest memory (which may not
3616      * exist on destination host yet).
3617      */
3618     nested_cache_shadow_vmcs12(vcpu, vmcs12);
3619 
3620     switch (vmcs12->guest_activity_state) {
3621     case GUEST_ACTIVITY_HLT:
3622         /*
3623          * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3624          * awakened by event injection or by an NMI-window VM-exit or
3625          * by an interrupt-window VM-exit, halt the vcpu.
3626          */
3627         if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3628             !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
3629             !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
3630               (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3631             vmx->nested.nested_run_pending = 0;
3632             return kvm_emulate_halt_noskip(vcpu);
3633         }
3634         break;
3635     case GUEST_ACTIVITY_WAIT_SIPI:
3636         vmx->nested.nested_run_pending = 0;
3637         vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
3638         break;
3639     default:
3640         break;
3641     }
3642 
3643     return 1;
3644 
3645 vmentry_failed:
3646     vmx->nested.nested_run_pending = 0;
3647     if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
3648         return 0;
3649     if (status == NVMX_VMENTRY_VMEXIT)
3650         return 1;
3651     WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
3652     return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3653 }
3654 
3655 /*
3656  * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3657  * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
3658  * This function returns the new value we should put in vmcs12.guest_cr0.
3659  * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3660  *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3661  *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3662  *     didn't trap the bit, because if L1 did, so would L0).
3663  *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3664  *     been modified by L2, and L1 knows it. So just leave the old value of
3665  *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3666  *     isn't relevant, because if L0 traps this bit it can set it to anything.
3667  *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3668  *     changed these bits, and therefore they need to be updated, but L0
3669  *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3670  *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3671  */
3672 static inline unsigned long
3673 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3674 {
3675     return
3676     /*1*/   (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3677     /*2*/   (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3678     /*3*/   (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3679             vcpu->arch.cr0_guest_owned_bits));
3680 }
3681 
3682 static inline unsigned long
3683 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3684 {
3685     return
3686     /*1*/   (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3687     /*2*/   (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3688     /*3*/   (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3689             vcpu->arch.cr4_guest_owned_bits));
3690 }
3691 
3692 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3693                       struct vmcs12 *vmcs12,
3694                       u32 vm_exit_reason, u32 exit_intr_info)
3695 {
3696     u32 idt_vectoring;
3697     unsigned int nr;
3698 
3699     /*
3700      * Per the SDM, VM-Exits due to double and triple faults are never
3701      * considered to occur during event delivery, even if the double/triple
3702      * fault is the result of an escalating vectoring issue.
3703      *
3704      * Note, the SDM qualifies the double fault behavior with "The original
3705      * event results in a double-fault exception".  It's unclear why the
3706      * qualification exists since exits due to double fault can occur only
3707      * while vectoring a different exception (injected events are never
3708      * subject to interception), i.e. there's _always_ an original event.
3709      *
3710      * The SDM also uses NMI as a confusing example for the "original event
3711      * causes the VM exit directly" clause.  NMI isn't special in any way,
3712      * the same rule applies to all events that cause an exit directly.
3713      * NMI is an odd choice for the example because NMIs can only occur on
3714      * instruction boundaries, i.e. they _can't_ occur during vectoring.
3715      */
3716     if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
3717         ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
3718          is_double_fault(exit_intr_info))) {
3719         vmcs12->idt_vectoring_info_field = 0;
3720     } else if (vcpu->arch.exception.injected) {
3721         nr = vcpu->arch.exception.nr;
3722         idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3723 
3724         if (kvm_exception_is_soft(nr)) {
3725             vmcs12->vm_exit_instruction_len =
3726                 vcpu->arch.event_exit_inst_len;
3727             idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3728         } else
3729             idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3730 
3731         if (vcpu->arch.exception.has_error_code) {
3732             idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3733             vmcs12->idt_vectoring_error_code =
3734                 vcpu->arch.exception.error_code;
3735         }
3736 
3737         vmcs12->idt_vectoring_info_field = idt_vectoring;
3738     } else if (vcpu->arch.nmi_injected) {
3739         vmcs12->idt_vectoring_info_field =
3740             INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3741     } else if (vcpu->arch.interrupt.injected) {
3742         nr = vcpu->arch.interrupt.nr;
3743         idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3744 
3745         if (vcpu->arch.interrupt.soft) {
3746             idt_vectoring |= INTR_TYPE_SOFT_INTR;
3747             vmcs12->vm_entry_instruction_len =
3748                 vcpu->arch.event_exit_inst_len;
3749         } else
3750             idt_vectoring |= INTR_TYPE_EXT_INTR;
3751 
3752         vmcs12->idt_vectoring_info_field = idt_vectoring;
3753     } else {
3754         vmcs12->idt_vectoring_info_field = 0;
3755     }
3756 }
3757 
3758 
3759 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3760 {
3761     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3762     gfn_t gfn;
3763 
3764     /*
3765      * Don't need to mark the APIC access page dirty; it is never
3766      * written to by the CPU during APIC virtualization.
3767      */
3768 
3769     if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3770         gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3771         kvm_vcpu_mark_page_dirty(vcpu, gfn);
3772     }
3773 
3774     if (nested_cpu_has_posted_intr(vmcs12)) {
3775         gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3776         kvm_vcpu_mark_page_dirty(vcpu, gfn);
3777     }
3778 }
3779 
3780 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3781 {
3782     struct vcpu_vmx *vmx = to_vmx(vcpu);
3783     int max_irr;
3784     void *vapic_page;
3785     u16 status;
3786 
3787     if (!vmx->nested.pi_pending)
3788         return 0;
3789 
3790     if (!vmx->nested.pi_desc)
3791         goto mmio_needed;
3792 
3793     vmx->nested.pi_pending = false;
3794 
3795     if (!pi_test_and_clear_on(vmx->nested.pi_desc))
3796         return 0;
3797 
3798     max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
3799     if (max_irr != 256) {
3800         vapic_page = vmx->nested.virtual_apic_map.hva;
3801         if (!vapic_page)
3802             goto mmio_needed;
3803 
3804         __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3805             vapic_page, &max_irr);
3806         status = vmcs_read16(GUEST_INTR_STATUS);
3807         if ((u8)max_irr > ((u8)status & 0xff)) {
3808             status &= ~0xff;
3809             status |= (u8)max_irr;
3810             vmcs_write16(GUEST_INTR_STATUS, status);
3811         }
3812     }
3813 
3814     nested_mark_vmcs12_pages_dirty(vcpu);
3815     return 0;
3816 
3817 mmio_needed:
3818     kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
3819     return -ENXIO;
3820 }
3821 
3822 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3823                            unsigned long exit_qual)
3824 {
3825     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3826     unsigned int nr = vcpu->arch.exception.nr;
3827     u32 intr_info = nr | INTR_INFO_VALID_MASK;
3828 
3829     if (vcpu->arch.exception.has_error_code) {
3830         vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3831         intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3832     }
3833 
3834     if (kvm_exception_is_soft(nr))
3835         intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3836     else
3837         intr_info |= INTR_TYPE_HARD_EXCEPTION;
3838 
3839     if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3840         vmx_get_nmi_mask(vcpu))
3841         intr_info |= INTR_INFO_UNBLOCK_NMI;
3842 
3843     nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3844 }
3845 
3846 /*
3847  * Returns true if a debug trap is pending delivery.
3848  *
3849  * In KVM, debug traps bear an exception payload. As such, the class of a #DB
3850  * exception may be inferred from the presence of an exception payload.
3851  */
3852 static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
3853 {
3854     return vcpu->arch.exception.pending &&
3855             vcpu->arch.exception.nr == DB_VECTOR &&
3856             vcpu->arch.exception.payload;
3857 }
3858 
3859 /*
3860  * Certain VM-exits set the 'pending debug exceptions' field to indicate a
3861  * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
3862  * represents these debug traps with a payload that is said to be compatible
3863  * with the 'pending debug exceptions' field, write the payload to the VMCS
3864  * field if a VM-exit is delivered before the debug trap.
3865  */
3866 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
3867 {
3868     if (vmx_pending_dbg_trap(vcpu))
3869         vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
3870                 vcpu->arch.exception.payload);
3871 }
3872 
3873 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
3874 {
3875     return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
3876            to_vmx(vcpu)->nested.preemption_timer_expired;
3877 }
3878 
3879 static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
3880 {
3881     struct vcpu_vmx *vmx = to_vmx(vcpu);
3882     unsigned long exit_qual;
3883     bool block_nested_events =
3884         vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
3885     bool mtf_pending = vmx->nested.mtf_pending;
3886     struct kvm_lapic *apic = vcpu->arch.apic;
3887 
3888     /*
3889      * Clear the MTF state. If a higher priority VM-exit is delivered first,
3890      * this state is discarded.
3891      */
3892     if (!block_nested_events)
3893         vmx->nested.mtf_pending = false;
3894 
3895     if (lapic_in_kernel(vcpu) &&
3896         test_bit(KVM_APIC_INIT, &apic->pending_events)) {
3897         if (block_nested_events)
3898             return -EBUSY;
3899         nested_vmx_update_pending_dbg(vcpu);
3900         clear_bit(KVM_APIC_INIT, &apic->pending_events);
3901         if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
3902             nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
3903         return 0;
3904     }
3905 
3906     if (lapic_in_kernel(vcpu) &&
3907         test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
3908         if (block_nested_events)
3909             return -EBUSY;
3910 
3911         clear_bit(KVM_APIC_SIPI, &apic->pending_events);
3912         if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
3913             nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
3914                         apic->sipi_vector & 0xFFUL);
3915         return 0;
3916     }
3917 
3918     /*
3919      * Process any exceptions that are not debug traps before MTF.
3920      *
3921      * Note that only a pending nested run can block a pending exception.
3922      * Otherwise an injected NMI/interrupt should either be
3923      * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
3924      * while delivering the pending exception.
3925      */
3926 
3927     if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
3928         if (vmx->nested.nested_run_pending)
3929             return -EBUSY;
3930         if (!nested_vmx_check_exception(vcpu, &exit_qual))
3931             goto no_vmexit;
3932         nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3933         return 0;
3934     }
3935 
3936     if (mtf_pending) {
3937         if (block_nested_events)
3938             return -EBUSY;
3939         nested_vmx_update_pending_dbg(vcpu);
3940         nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
3941         return 0;
3942     }
3943 
3944     if (vcpu->arch.exception.pending) {
3945         if (vmx->nested.nested_run_pending)
3946             return -EBUSY;
3947         if (!nested_vmx_check_exception(vcpu, &exit_qual))
3948             goto no_vmexit;
3949         nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3950         return 0;
3951     }
3952 
3953     if (nested_vmx_preemption_timer_pending(vcpu)) {
3954         if (block_nested_events)
3955             return -EBUSY;
3956         nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
3957         return 0;
3958     }
3959 
3960     if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
3961         if (block_nested_events)
3962             return -EBUSY;
3963         goto no_vmexit;
3964     }
3965 
3966     if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
3967         if (block_nested_events)
3968             return -EBUSY;
3969         if (!nested_exit_on_nmi(vcpu))
3970             goto no_vmexit;
3971 
3972         nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
3973                   NMI_VECTOR | INTR_TYPE_NMI_INTR |
3974                   INTR_INFO_VALID_MASK, 0);
3975         /*
3976          * The NMI-triggered VM exit counts as injection:
3977          * clear this one and block further NMIs.
3978          */
3979         vcpu->arch.nmi_pending = 0;
3980         vmx_set_nmi_mask(vcpu, true);
3981         return 0;
3982     }
3983 
3984     if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
3985         if (block_nested_events)
3986             return -EBUSY;
3987         if (!nested_exit_on_intr(vcpu))
3988             goto no_vmexit;
3989         nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
3990         return 0;
3991     }
3992 
3993 no_vmexit:
3994     return vmx_complete_nested_posted_interrupt(vcpu);
3995 }
3996 
3997 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
3998 {
3999     ktime_t remaining =
4000         hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
4001     u64 value;
4002 
4003     if (ktime_to_ns(remaining) <= 0)
4004         return 0;
4005 
4006     value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
4007     do_div(value, 1000000);
4008     return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
4009 }
4010 
4011 static bool is_vmcs12_ext_field(unsigned long field)
4012 {
4013     switch (field) {
4014     case GUEST_ES_SELECTOR:
4015     case GUEST_CS_SELECTOR:
4016     case GUEST_SS_SELECTOR:
4017     case GUEST_DS_SELECTOR:
4018     case GUEST_FS_SELECTOR:
4019     case GUEST_GS_SELECTOR:
4020     case GUEST_LDTR_SELECTOR:
4021     case GUEST_TR_SELECTOR:
4022     case GUEST_ES_LIMIT:
4023     case GUEST_CS_LIMIT:
4024     case GUEST_SS_LIMIT:
4025     case GUEST_DS_LIMIT:
4026     case GUEST_FS_LIMIT:
4027     case GUEST_GS_LIMIT:
4028     case GUEST_LDTR_LIMIT:
4029     case GUEST_TR_LIMIT:
4030     case GUEST_GDTR_LIMIT:
4031     case GUEST_IDTR_LIMIT:
4032     case GUEST_ES_AR_BYTES:
4033     case GUEST_DS_AR_BYTES:
4034     case GUEST_FS_AR_BYTES:
4035     case GUEST_GS_AR_BYTES:
4036     case GUEST_LDTR_AR_BYTES:
4037     case GUEST_TR_AR_BYTES:
4038     case GUEST_ES_BASE:
4039     case GUEST_CS_BASE:
4040     case GUEST_SS_BASE:
4041     case GUEST_DS_BASE:
4042     case GUEST_FS_BASE:
4043     case GUEST_GS_BASE:
4044     case GUEST_LDTR_BASE:
4045     case GUEST_TR_BASE:
4046     case GUEST_GDTR_BASE:
4047     case GUEST_IDTR_BASE:
4048     case GUEST_PENDING_DBG_EXCEPTIONS:
4049     case GUEST_BNDCFGS:
4050         return true;
4051     default:
4052         break;
4053     }
4054 
4055     return false;
4056 }
4057 
4058 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4059                        struct vmcs12 *vmcs12)
4060 {
4061     struct vcpu_vmx *vmx = to_vmx(vcpu);
4062 
4063     vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
4064     vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
4065     vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
4066     vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
4067     vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
4068     vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
4069     vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
4070     vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
4071     vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
4072     vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
4073     vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
4074     vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
4075     vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
4076     vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
4077     vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
4078     vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
4079     vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
4080     vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
4081     vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
4082     vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
4083     vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
4084     vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
4085     vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
4086     vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
4087     vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
4088     vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
4089     vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
4090     vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
4091     vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
4092     vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
4093     vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
4094     vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
4095     vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
4096     vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
4097     vmcs12->guest_pending_dbg_exceptions =
4098         vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
4099 
4100     vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
4101 }
4102 
4103 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4104                        struct vmcs12 *vmcs12)
4105 {
4106     struct vcpu_vmx *vmx = to_vmx(vcpu);
4107     int cpu;
4108 
4109     if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
4110         return;
4111 
4112 
4113     WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
4114 
4115     cpu = get_cpu();
4116     vmx->loaded_vmcs = &vmx->nested.vmcs02;
4117     vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01);
4118 
4119     sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4120 
4121     vmx->loaded_vmcs = &vmx->vmcs01;
4122     vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02);
4123     put_cpu();
4124 }
4125 
4126 /*
4127  * Update the guest state fields of vmcs12 to reflect changes that
4128  * occurred while L2 was running. (The "IA-32e mode guest" bit of the
4129  * VM-entry controls is also updated, since this is really a guest
4130  * state bit.)
4131  */
4132 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
4133 {
4134     struct vcpu_vmx *vmx = to_vmx(vcpu);
4135 
4136     if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
4137         sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4138 
4139     vmx->nested.need_sync_vmcs02_to_vmcs12_rare =
4140         !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr);
4141 
4142     vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
4143     vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
4144 
4145     vmcs12->guest_rsp = kvm_rsp_read(vcpu);
4146     vmcs12->guest_rip = kvm_rip_read(vcpu);
4147     vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
4148 
4149     vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
4150     vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
4151 
4152     vmcs12->guest_interruptibility_info =
4153         vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
4154 
4155     if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
4156         vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
4157     else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
4158         vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI;
4159     else
4160         vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
4161 
4162     if (nested_cpu_has_preemption_timer(vmcs12) &&
4163         vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
4164         !vmx->nested.nested_run_pending)
4165         vmcs12->vmx_preemption_timer_value =
4166             vmx_get_preemption_timer_value(vcpu);
4167 
4168     /*
4169      * In some cases (usually, nested EPT), L2 is allowed to change its
4170      * own CR3 without exiting. If it has changed it, we must keep it.
4171      * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
4172      * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
4173      *
4174      * Additionally, restore L2's PDPTR to vmcs12.
4175      */
4176     if (enable_ept) {
4177         vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
4178         if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
4179             vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
4180             vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
4181             vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
4182             vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
4183         }
4184     }
4185 
4186     vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
4187 
4188     if (nested_cpu_has_vid(vmcs12))
4189         vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
4190 
4191     vmcs12->vm_entry_controls =
4192         (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
4193         (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
4194 
4195     if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
4196         kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
4197 
4198     if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
4199         vmcs12->guest_ia32_efer = vcpu->arch.efer;
4200 }
4201 
4202 /*
4203  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
4204  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
4205  * and this function updates it to reflect the changes to the guest state while
4206  * L2 was running (and perhaps made some exits which were handled directly by L0
4207  * without going back to L1), and to reflect the exit reason.
4208  * Note that we do not have to copy here all VMCS fields, just those that
4209  * could have changed by the L2 guest or the exit - i.e., the guest-state and
4210  * exit-information fields only. Other fields are modified by L1 with VMWRITE,
4211  * which already writes to vmcs12 directly.
4212  */
4213 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
4214                u32 vm_exit_reason, u32 exit_intr_info,
4215                unsigned long exit_qualification)
4216 {
4217     /* update exit information fields: */
4218     vmcs12->vm_exit_reason = vm_exit_reason;
4219     if (to_vmx(vcpu)->exit_reason.enclave_mode)
4220         vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
4221     vmcs12->exit_qualification = exit_qualification;
4222 
4223     /*
4224      * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
4225      * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other
4226      * exit info fields are unmodified.
4227      */
4228     if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
4229         vmcs12->launch_state = 1;
4230 
4231         /* vm_entry_intr_info_field is cleared on exit. Emulate this
4232          * instead of reading the real value. */
4233         vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
4234 
4235         /*
4236          * Transfer the event that L0 or L1 may wanted to inject into
4237          * L2 to IDT_VECTORING_INFO_FIELD.
4238          */
4239         vmcs12_save_pending_event(vcpu, vmcs12,
4240                       vm_exit_reason, exit_intr_info);
4241 
4242         vmcs12->vm_exit_intr_info = exit_intr_info;
4243         vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4244         vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4245 
4246         /*
4247          * According to spec, there's no need to store the guest's
4248          * MSRs if the exit is due to a VM-entry failure that occurs
4249          * during or after loading the guest state. Since this exit
4250          * does not fall in that category, we need to save the MSRs.
4251          */
4252         if (nested_vmx_store_msr(vcpu,
4253                      vmcs12->vm_exit_msr_store_addr,
4254                      vmcs12->vm_exit_msr_store_count))
4255             nested_vmx_abort(vcpu,
4256                      VMX_ABORT_SAVE_GUEST_MSR_FAIL);
4257     }
4258 
4259     /*
4260      * Drop what we picked up for L2 via vmx_complete_interrupts. It is
4261      * preserved above and would only end up incorrectly in L1.
4262      */
4263     vcpu->arch.nmi_injected = false;
4264     kvm_clear_exception_queue(vcpu);
4265     kvm_clear_interrupt_queue(vcpu);
4266 }
4267 
4268 /*
4269  * A part of what we need to when the nested L2 guest exits and we want to
4270  * run its L1 parent, is to reset L1's guest state to the host state specified
4271  * in vmcs12.
4272  * This function is to be called not only on normal nested exit, but also on
4273  * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
4274  * Failures During or After Loading Guest State").
4275  * This function should be called when the active VMCS is L1's (vmcs01).
4276  */
4277 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
4278                    struct vmcs12 *vmcs12)
4279 {
4280     enum vm_entry_failure_code ignored;
4281     struct kvm_segment seg;
4282 
4283     if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
4284         vcpu->arch.efer = vmcs12->host_ia32_efer;
4285     else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4286         vcpu->arch.efer |= (EFER_LMA | EFER_LME);
4287     else
4288         vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
4289     vmx_set_efer(vcpu, vcpu->arch.efer);
4290 
4291     kvm_rsp_write(vcpu, vmcs12->host_rsp);
4292     kvm_rip_write(vcpu, vmcs12->host_rip);
4293     vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
4294     vmx_set_interrupt_shadow(vcpu, 0);
4295 
4296     /*
4297      * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
4298      * actually changed, because vmx_set_cr0 refers to efer set above.
4299      *
4300      * CR0_GUEST_HOST_MASK is already set in the original vmcs01
4301      * (KVM doesn't change it);
4302      */
4303     vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
4304     vmx_set_cr0(vcpu, vmcs12->host_cr0);
4305 
4306     /* Same as above - no reason to call set_cr4_guest_host_mask().  */
4307     vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4308     vmx_set_cr4(vcpu, vmcs12->host_cr4);
4309 
4310     nested_ept_uninit_mmu_context(vcpu);
4311 
4312     /*
4313      * Only PDPTE load can fail as the value of cr3 was checked on entry and
4314      * couldn't have changed.
4315      */
4316     if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
4317         nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
4318 
4319     nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
4320 
4321     vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
4322     vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
4323     vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
4324     vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
4325     vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
4326     vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
4327     vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
4328 
4329     /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
4330     if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
4331         vmcs_write64(GUEST_BNDCFGS, 0);
4332 
4333     if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
4334         vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
4335         vcpu->arch.pat = vmcs12->host_ia32_pat;
4336     }
4337     if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
4338         intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
4339         WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
4340                      vmcs12->host_ia32_perf_global_ctrl));
4341 
4342     /* Set L1 segment info according to Intel SDM
4343         27.5.2 Loading Host Segment and Descriptor-Table Registers */
4344     seg = (struct kvm_segment) {
4345         .base = 0,
4346         .limit = 0xFFFFFFFF,
4347         .selector = vmcs12->host_cs_selector,
4348         .type = 11,
4349         .present = 1,
4350         .s = 1,
4351         .g = 1
4352     };
4353     if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4354         seg.l = 1;
4355     else
4356         seg.db = 1;
4357     __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
4358     seg = (struct kvm_segment) {
4359         .base = 0,
4360         .limit = 0xFFFFFFFF,
4361         .type = 3,
4362         .present = 1,
4363         .s = 1,
4364         .db = 1,
4365         .g = 1
4366     };
4367     seg.selector = vmcs12->host_ds_selector;
4368     __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
4369     seg.selector = vmcs12->host_es_selector;
4370     __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
4371     seg.selector = vmcs12->host_ss_selector;
4372     __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
4373     seg.selector = vmcs12->host_fs_selector;
4374     seg.base = vmcs12->host_fs_base;
4375     __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
4376     seg.selector = vmcs12->host_gs_selector;
4377     seg.base = vmcs12->host_gs_base;
4378     __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
4379     seg = (struct kvm_segment) {
4380         .base = vmcs12->host_tr_base,
4381         .limit = 0x67,
4382         .selector = vmcs12->host_tr_selector,
4383         .type = 11,
4384         .present = 1
4385     };
4386     __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
4387 
4388     memset(&seg, 0, sizeof(seg));
4389     seg.unusable = 1;
4390     __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
4391 
4392     kvm_set_dr(vcpu, 7, 0x400);
4393     vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4394 
4395     if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
4396                 vmcs12->vm_exit_msr_load_count))
4397         nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4398 
4399     to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
4400 }
4401 
4402 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
4403 {
4404     struct vmx_uret_msr *efer_msr;
4405     unsigned int i;
4406 
4407     if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
4408         return vmcs_read64(GUEST_IA32_EFER);
4409 
4410     if (cpu_has_load_ia32_efer())
4411         return host_efer;
4412 
4413     for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
4414         if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
4415             return vmx->msr_autoload.guest.val[i].value;
4416     }
4417 
4418     efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
4419     if (efer_msr)
4420         return efer_msr->data;
4421 
4422     return host_efer;
4423 }
4424 
4425 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
4426 {
4427     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4428     struct vcpu_vmx *vmx = to_vmx(vcpu);
4429     struct vmx_msr_entry g, h;
4430     gpa_t gpa;
4431     u32 i, j;
4432 
4433     vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
4434 
4435     if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
4436         /*
4437          * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
4438          * as vmcs01.GUEST_DR7 contains a userspace defined value
4439          * and vcpu->arch.dr7 is not squirreled away before the
4440          * nested VMENTER (not worth adding a variable in nested_vmx).
4441          */
4442         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
4443             kvm_set_dr(vcpu, 7, DR7_FIXED_1);
4444         else
4445             WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
4446     }
4447 
4448     /*
4449      * Note that calling vmx_set_{efer,cr0,cr4} is important as they
4450      * handle a variety of side effects to KVM's software model.
4451      */
4452     vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
4453 
4454     vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
4455     vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
4456 
4457     vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4458     vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
4459 
4460     nested_ept_uninit_mmu_context(vcpu);
4461     vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
4462     kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
4463 
4464     /*
4465      * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
4466      * from vmcs01 (if necessary).  The PDPTRs are not loaded on
4467      * VMFail, like everything else we just need to ensure our
4468      * software model is up-to-date.
4469      */
4470     if (enable_ept && is_pae_paging(vcpu))
4471         ept_save_pdptrs(vcpu);
4472 
4473     kvm_mmu_reset_context(vcpu);
4474 
4475     /*
4476      * This nasty bit of open coding is a compromise between blindly
4477      * loading L1's MSRs using the exit load lists (incorrect emulation
4478      * of VMFail), leaving the nested VM's MSRs in the software model
4479      * (incorrect behavior) and snapshotting the modified MSRs (too
4480      * expensive since the lists are unbound by hardware).  For each
4481      * MSR that was (prematurely) loaded from the nested VMEntry load
4482      * list, reload it from the exit load list if it exists and differs
4483      * from the guest value.  The intent is to stuff host state as
4484      * silently as possible, not to fully process the exit load list.
4485      */
4486     for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
4487         gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
4488         if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
4489             pr_debug_ratelimited(
4490                 "%s read MSR index failed (%u, 0x%08llx)\n",
4491                 __func__, i, gpa);
4492             goto vmabort;
4493         }
4494 
4495         for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
4496             gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
4497             if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
4498                 pr_debug_ratelimited(
4499                     "%s read MSR failed (%u, 0x%08llx)\n",
4500                     __func__, j, gpa);
4501                 goto vmabort;
4502             }
4503             if (h.index != g.index)
4504                 continue;
4505             if (h.value == g.value)
4506                 break;
4507 
4508             if (nested_vmx_load_msr_check(vcpu, &h)) {
4509                 pr_debug_ratelimited(
4510                     "%s check failed (%u, 0x%x, 0x%x)\n",
4511                     __func__, j, h.index, h.reserved);
4512                 goto vmabort;
4513             }
4514 
4515             if (kvm_set_msr(vcpu, h.index, h.value)) {
4516                 pr_debug_ratelimited(
4517                     "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
4518                     __func__, j, h.index, h.value);
4519                 goto vmabort;
4520             }
4521         }
4522     }
4523 
4524     return;
4525 
4526 vmabort:
4527     nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4528 }
4529 
4530 /*
4531  * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
4532  * and modify vmcs12 to make it see what it would expect to see there if
4533  * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
4534  */
4535 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
4536                u32 exit_intr_info, unsigned long exit_qualification)
4537 {
4538     struct vcpu_vmx *vmx = to_vmx(vcpu);
4539     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4540 
4541     /* trying to cancel vmlaunch/vmresume is a bug */
4542     WARN_ON_ONCE(vmx->nested.nested_run_pending);
4543 
4544     if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
4545         /*
4546          * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
4547          * Enlightened VMCS after migration and we still need to
4548          * do that when something is forcing L2->L1 exit prior to
4549          * the first L2 run.
4550          */
4551         (void)nested_get_evmcs_page(vcpu);
4552     }
4553 
4554     /* Service pending TLB flush requests for L2 before switching to L1. */
4555     kvm_service_local_tlb_flush_requests(vcpu);
4556 
4557     /*
4558      * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
4559      * now and the new vmentry.  Ensure that the VMCS02 PDPTR fields are
4560      * up-to-date before switching to L1.
4561      */
4562     if (enable_ept && is_pae_paging(vcpu))
4563         vmx_ept_load_pdptrs(vcpu);
4564 
4565     leave_guest_mode(vcpu);
4566 
4567     if (nested_cpu_has_preemption_timer(vmcs12))
4568         hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
4569 
4570     if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
4571         vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
4572         if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
4573             vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
4574     }
4575 
4576     if (likely(!vmx->fail)) {
4577         sync_vmcs02_to_vmcs12(vcpu, vmcs12);
4578 
4579         if (vm_exit_reason != -1)
4580             prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
4581                        exit_intr_info, exit_qualification);
4582 
4583         /*
4584          * Must happen outside of sync_vmcs02_to_vmcs12() as it will
4585          * also be used to capture vmcs12 cache as part of
4586          * capturing nVMX state for snapshot (migration).
4587          *
4588          * Otherwise, this flush will dirty guest memory at a
4589          * point it is already assumed by user-space to be
4590          * immutable.
4591          */
4592         nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
4593     } else {
4594         /*
4595          * The only expected VM-instruction error is "VM entry with
4596          * invalid control field(s)." Anything else indicates a
4597          * problem with L0.  And we should never get here with a
4598          * VMFail of any type if early consistency checks are enabled.
4599          */
4600         WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
4601                  VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4602         WARN_ON_ONCE(nested_early_check);
4603     }
4604 
4605     vmx_switch_vmcs(vcpu, &vmx->vmcs01);
4606 
4607     /* Update any VMCS fields that might have changed while L2 ran */
4608     vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
4609     vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
4610     vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
4611     if (kvm_caps.has_tsc_control)
4612         vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
4613 
4614     if (vmx->nested.l1_tpr_threshold != -1)
4615         vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
4616 
4617     if (vmx->nested.change_vmcs01_virtual_apic_mode) {
4618         vmx->nested.change_vmcs01_virtual_apic_mode = false;
4619         vmx_set_virtual_apic_mode(vcpu);
4620     }
4621 
4622     if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
4623         vmx->nested.update_vmcs01_cpu_dirty_logging = false;
4624         vmx_update_cpu_dirty_logging(vcpu);
4625     }
4626 
4627     /* Unpin physical memory we referred to in vmcs02 */
4628     kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
4629     kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
4630     kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
4631     vmx->nested.pi_desc = NULL;
4632 
4633     if (vmx->nested.reload_vmcs01_apic_access_page) {
4634         vmx->nested.reload_vmcs01_apic_access_page = false;
4635         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4636     }
4637 
4638     if (vmx->nested.update_vmcs01_apicv_status) {
4639         vmx->nested.update_vmcs01_apicv_status = false;
4640         kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
4641     }
4642 
4643     if ((vm_exit_reason != -1) &&
4644         (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
4645         vmx->nested.need_vmcs12_to_shadow_sync = true;
4646 
4647     /* in case we halted in L2 */
4648     vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4649 
4650     if (likely(!vmx->fail)) {
4651         if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
4652             nested_exit_intr_ack_set(vcpu)) {
4653             int irq = kvm_cpu_get_interrupt(vcpu);
4654             WARN_ON(irq < 0);
4655             vmcs12->vm_exit_intr_info = irq |
4656                 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
4657         }
4658 
4659         if (vm_exit_reason != -1)
4660             trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
4661                                vmcs12->exit_qualification,
4662                                vmcs12->idt_vectoring_info_field,
4663                                vmcs12->vm_exit_intr_info,
4664                                vmcs12->vm_exit_intr_error_code,
4665                                KVM_ISA_VMX);
4666 
4667         load_vmcs12_host_state(vcpu, vmcs12);
4668 
4669         return;
4670     }
4671 
4672     /*
4673      * After an early L2 VM-entry failure, we're now back
4674      * in L1 which thinks it just finished a VMLAUNCH or
4675      * VMRESUME instruction, so we need to set the failure
4676      * flag and the VM-instruction error field of the VMCS
4677      * accordingly, and skip the emulated instruction.
4678      */
4679     (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4680 
4681     /*
4682      * Restore L1's host state to KVM's software model.  We're here
4683      * because a consistency check was caught by hardware, which
4684      * means some amount of guest state has been propagated to KVM's
4685      * model and needs to be unwound to the host's state.
4686      */
4687     nested_vmx_restore_host_state(vcpu);
4688 
4689     vmx->fail = 0;
4690 }
4691 
4692 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
4693 {
4694     nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
4695 }
4696 
4697 /*
4698  * Decode the memory-address operand of a vmx instruction, as recorded on an
4699  * exit caused by such an instruction (run by a guest hypervisor).
4700  * On success, returns 0. When the operand is invalid, returns 1 and throws
4701  * #UD, #GP, or #SS.
4702  */
4703 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
4704             u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
4705 {
4706     gva_t off;
4707     bool exn;
4708     struct kvm_segment s;
4709 
4710     /*
4711      * According to Vol. 3B, "Information for VM Exits Due to Instruction
4712      * Execution", on an exit, vmx_instruction_info holds most of the
4713      * addressing components of the operand. Only the displacement part
4714      * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4715      * For how an actual address is calculated from all these components,
4716      * refer to Vol. 1, "Operand Addressing".
4717      */
4718     int  scaling = vmx_instruction_info & 3;
4719     int  addr_size = (vmx_instruction_info >> 7) & 7;
4720     bool is_reg = vmx_instruction_info & (1u << 10);
4721     int  seg_reg = (vmx_instruction_info >> 15) & 7;
4722     int  index_reg = (vmx_instruction_info >> 18) & 0xf;
4723     bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4724     int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
4725     bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
4726 
4727     if (is_reg) {
4728         kvm_queue_exception(vcpu, UD_VECTOR);
4729         return 1;
4730     }
4731 
4732     /* Addr = segment_base + offset */
4733     /* offset = base + [index * scale] + displacement */
4734     off = exit_qualification; /* holds the displacement */
4735     if (addr_size == 1)
4736         off = (gva_t)sign_extend64(off, 31);
4737     else if (addr_size == 0)
4738         off = (gva_t)sign_extend64(off, 15);
4739     if (base_is_valid)
4740         off += kvm_register_read(vcpu, base_reg);
4741     if (index_is_valid)
4742         off += kvm_register_read(vcpu, index_reg) << scaling;
4743     vmx_get_segment(vcpu, &s, seg_reg);
4744 
4745     /*
4746      * The effective address, i.e. @off, of a memory operand is truncated
4747      * based on the address size of the instruction.  Note that this is
4748      * the *effective address*, i.e. the address prior to accounting for
4749      * the segment's base.
4750      */
4751     if (addr_size == 1) /* 32 bit */
4752         off &= 0xffffffff;
4753     else if (addr_size == 0) /* 16 bit */
4754         off &= 0xffff;
4755 
4756     /* Checks for #GP/#SS exceptions. */
4757     exn = false;
4758     if (is_long_mode(vcpu)) {
4759         /*
4760          * The virtual/linear address is never truncated in 64-bit
4761          * mode, e.g. a 32-bit address size can yield a 64-bit virtual
4762          * address when using FS/GS with a non-zero base.
4763          */
4764         if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
4765             *ret = s.base + off;
4766         else
4767             *ret = off;
4768 
4769         /* Long mode: #GP(0)/#SS(0) if the memory address is in a
4770          * non-canonical form. This is the only check on the memory
4771          * destination for long mode!
4772          */
4773         exn = is_noncanonical_address(*ret, vcpu);
4774     } else {
4775         /*
4776          * When not in long mode, the virtual/linear address is
4777          * unconditionally truncated to 32 bits regardless of the
4778          * address size.
4779          */
4780         *ret = (s.base + off) & 0xffffffff;
4781 
4782         /* Protected mode: apply checks for segment validity in the
4783          * following order:
4784          * - segment type check (#GP(0) may be thrown)
4785          * - usability check (#GP(0)/#SS(0))
4786          * - limit check (#GP(0)/#SS(0))
4787          */
4788         if (wr)
4789             /* #GP(0) if the destination operand is located in a
4790              * read-only data segment or any code segment.
4791              */
4792             exn = ((s.type & 0xa) == 0 || (s.type & 8));
4793         else
4794             /* #GP(0) if the source operand is located in an
4795              * execute-only code segment
4796              */
4797             exn = ((s.type & 0xa) == 8);
4798         if (exn) {
4799             kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4800             return 1;
4801         }
4802         /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
4803          */
4804         exn = (s.unusable != 0);
4805 
4806         /*
4807          * Protected mode: #GP(0)/#SS(0) if the memory operand is
4808          * outside the segment limit.  All CPUs that support VMX ignore
4809          * limit checks for flat segments, i.e. segments with base==0,
4810          * limit==0xffffffff and of type expand-up data or code.
4811          */
4812         if (!(s.base == 0 && s.limit == 0xffffffff &&
4813              ((s.type & 8) || !(s.type & 4))))
4814             exn = exn || ((u64)off + len - 1 > s.limit);
4815     }
4816     if (exn) {
4817         kvm_queue_exception_e(vcpu,
4818                       seg_reg == VCPU_SREG_SS ?
4819                         SS_VECTOR : GP_VECTOR,
4820                       0);
4821         return 1;
4822     }
4823 
4824     return 0;
4825 }
4826 
4827 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
4828                 int *ret)
4829 {
4830     gva_t gva;
4831     struct x86_exception e;
4832     int r;
4833 
4834     if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
4835                 vmcs_read32(VMX_INSTRUCTION_INFO), false,
4836                 sizeof(*vmpointer), &gva)) {
4837         *ret = 1;
4838         return -EINVAL;
4839     }
4840 
4841     r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
4842     if (r != X86EMUL_CONTINUE) {
4843         *ret = kvm_handle_memory_failure(vcpu, r, &e);
4844         return -EINVAL;
4845     }
4846 
4847     return 0;
4848 }
4849 
4850 /*
4851  * Allocate a shadow VMCS and associate it with the currently loaded
4852  * VMCS, unless such a shadow VMCS already exists. The newly allocated
4853  * VMCS is also VMCLEARed, so that it is ready for use.
4854  */
4855 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
4856 {
4857     struct vcpu_vmx *vmx = to_vmx(vcpu);
4858     struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
4859 
4860     /*
4861      * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
4862      * when L1 executes VMXOFF or the vCPU is forced out of nested
4863      * operation.  VMXON faults if the CPU is already post-VMXON, so it
4864      * should be impossible to already have an allocated shadow VMCS.  KVM
4865      * doesn't support virtualization of VMCS shadowing, so vmcs01 should
4866      * always be the loaded VMCS.
4867      */
4868     if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
4869         return loaded_vmcs->shadow_vmcs;
4870 
4871     loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
4872     if (loaded_vmcs->shadow_vmcs)
4873         vmcs_clear(loaded_vmcs->shadow_vmcs);
4874 
4875     return loaded_vmcs->shadow_vmcs;
4876 }
4877 
4878 static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4879 {
4880     struct vcpu_vmx *vmx = to_vmx(vcpu);
4881     int r;
4882 
4883     r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
4884     if (r < 0)
4885         goto out_vmcs02;
4886 
4887     vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4888     if (!vmx->nested.cached_vmcs12)
4889         goto out_cached_vmcs12;
4890 
4891     vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
4892     vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4893     if (!vmx->nested.cached_shadow_vmcs12)
4894         goto out_cached_shadow_vmcs12;
4895 
4896     if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
4897         goto out_shadow_vmcs;
4898 
4899     hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
4900              HRTIMER_MODE_ABS_PINNED);
4901     vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
4902 
4903     vmx->nested.vpid02 = allocate_vpid();
4904 
4905     vmx->nested.vmcs02_initialized = false;
4906     vmx->nested.vmxon = true;
4907 
4908     if (vmx_pt_mode_is_host_guest()) {
4909         vmx->pt_desc.guest.ctl = 0;
4910         pt_update_intercept_for_msr(vcpu);
4911     }
4912 
4913     return 0;
4914 
4915 out_shadow_vmcs:
4916     kfree(vmx->nested.cached_shadow_vmcs12);
4917 
4918 out_cached_shadow_vmcs12:
4919     kfree(vmx->nested.cached_vmcs12);
4920 
4921 out_cached_vmcs12:
4922     free_loaded_vmcs(&vmx->nested.vmcs02);
4923 
4924 out_vmcs02:
4925     return -ENOMEM;
4926 }
4927 
4928 /* Emulate the VMXON instruction. */
4929 static int handle_vmxon(struct kvm_vcpu *vcpu)
4930 {
4931     int ret;
4932     gpa_t vmptr;
4933     uint32_t revision;
4934     struct vcpu_vmx *vmx = to_vmx(vcpu);
4935     const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
4936         | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
4937 
4938     /*
4939      * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks
4940      * that have higher priority than VM-Exit (see Intel SDM's pseudocode
4941      * for VMXON), as KVM must load valid CR0/CR4 values into hardware while
4942      * running the guest, i.e. KVM needs to check the _guest_ values.
4943      *
4944      * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and
4945      * !COMPATIBILITY modes.  KVM may run the guest in VM86 to emulate Real
4946      * Mode, but KVM will never take the guest out of those modes.
4947      */
4948     if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
4949         !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
4950         kvm_queue_exception(vcpu, UD_VECTOR);
4951         return 1;
4952     }
4953 
4954     /*
4955      * CPL=0 and all other checks that are lower priority than VM-Exit must
4956      * be checked manually.
4957      */
4958     if (vmx_get_cpl(vcpu)) {
4959         kvm_inject_gp(vcpu, 0);
4960         return 1;
4961     }
4962 
4963     if (vmx->nested.vmxon)
4964         return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
4965 
4966     if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
4967             != VMXON_NEEDED_FEATURES) {
4968         kvm_inject_gp(vcpu, 0);
4969         return 1;
4970     }
4971 
4972     if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
4973         return ret;
4974 
4975     /*
4976      * SDM 3: 24.11.5
4977      * The first 4 bytes of VMXON region contain the supported
4978      * VMCS revision identifier
4979      *
4980      * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
4981      * which replaces physical address width with 32
4982      */
4983     if (!page_address_valid(vcpu, vmptr))
4984         return nested_vmx_failInvalid(vcpu);
4985 
4986     if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
4987         revision != VMCS12_REVISION)
4988         return nested_vmx_failInvalid(vcpu);
4989 
4990     vmx->nested.vmxon_ptr = vmptr;
4991     ret = enter_vmx_operation(vcpu);
4992     if (ret)
4993         return ret;
4994 
4995     return nested_vmx_succeed(vcpu);
4996 }
4997 
4998 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
4999 {
5000     struct vcpu_vmx *vmx = to_vmx(vcpu);
5001 
5002     if (vmx->nested.current_vmptr == INVALID_GPA)
5003         return;
5004 
5005     copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
5006 
5007     if (enable_shadow_vmcs) {
5008         /* copy to memory all shadowed fields in case
5009            they were modified */
5010         copy_shadow_to_vmcs12(vmx);
5011         vmx_disable_shadow_vmcs(vmx);
5012     }
5013     vmx->nested.posted_intr_nv = -1;
5014 
5015     /* Flush VMCS12 to guest memory */
5016     kvm_vcpu_write_guest_page(vcpu,
5017                   vmx->nested.current_vmptr >> PAGE_SHIFT,
5018                   vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
5019 
5020     kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5021 
5022     vmx->nested.current_vmptr = INVALID_GPA;
5023 }
5024 
5025 /* Emulate the VMXOFF instruction */
5026 static int handle_vmxoff(struct kvm_vcpu *vcpu)
5027 {
5028     if (!nested_vmx_check_permission(vcpu))
5029         return 1;
5030 
5031     free_nested(vcpu);
5032 
5033     /* Process a latched INIT during time CPU was in VMX operation */
5034     kvm_make_request(KVM_REQ_EVENT, vcpu);
5035 
5036     return nested_vmx_succeed(vcpu);
5037 }
5038 
5039 /* Emulate the VMCLEAR instruction */
5040 static int handle_vmclear(struct kvm_vcpu *vcpu)
5041 {
5042     struct vcpu_vmx *vmx = to_vmx(vcpu);
5043     u32 zero = 0;
5044     gpa_t vmptr;
5045     u64 evmcs_gpa;
5046     int r;
5047 
5048     if (!nested_vmx_check_permission(vcpu))
5049         return 1;
5050 
5051     if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5052         return r;
5053 
5054     if (!page_address_valid(vcpu, vmptr))
5055         return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
5056 
5057     if (vmptr == vmx->nested.vmxon_ptr)
5058         return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
5059 
5060     /*
5061      * When Enlightened VMEntry is enabled on the calling CPU we treat
5062      * memory area pointer by vmptr as Enlightened VMCS (as there's no good
5063      * way to distinguish it from VMCS12) and we must not corrupt it by
5064      * writing to the non-existent 'launch_state' field. The area doesn't
5065      * have to be the currently active EVMCS on the calling CPU and there's
5066      * nothing KVM has to do to transition it from 'active' to 'non-active'
5067      * state. It is possible that the area will stay mapped as
5068      * vmx->nested.hv_evmcs but this shouldn't be a problem.
5069      */
5070     if (likely(!vmx->nested.enlightened_vmcs_enabled ||
5071            !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
5072         if (vmptr == vmx->nested.current_vmptr)
5073             nested_release_vmcs12(vcpu);
5074 
5075         kvm_vcpu_write_guest(vcpu,
5076                      vmptr + offsetof(struct vmcs12,
5077                               launch_state),
5078                      &zero, sizeof(zero));
5079     } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) {
5080         nested_release_evmcs(vcpu);
5081     }
5082 
5083     return nested_vmx_succeed(vcpu);
5084 }
5085 
5086 /* Emulate the VMLAUNCH instruction */
5087 static int handle_vmlaunch(struct kvm_vcpu *vcpu)
5088 {
5089     return nested_vmx_run(vcpu, true);
5090 }
5091 
5092 /* Emulate the VMRESUME instruction */
5093 static int handle_vmresume(struct kvm_vcpu *vcpu)
5094 {
5095 
5096     return nested_vmx_run(vcpu, false);
5097 }
5098 
5099 static int handle_vmread(struct kvm_vcpu *vcpu)
5100 {
5101     struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5102                             : get_vmcs12(vcpu);
5103     unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5104     u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5105     struct vcpu_vmx *vmx = to_vmx(vcpu);
5106     struct x86_exception e;
5107     unsigned long field;
5108     u64 value;
5109     gva_t gva = 0;
5110     short offset;
5111     int len, r;
5112 
5113     if (!nested_vmx_check_permission(vcpu))
5114         return 1;
5115 
5116     /* Decode instruction info and find the field to read */
5117     field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
5118 
5119     if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
5120         /*
5121          * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
5122          * any VMREAD sets the ALU flags for VMfailInvalid.
5123          */
5124         if (vmx->nested.current_vmptr == INVALID_GPA ||
5125             (is_guest_mode(vcpu) &&
5126              get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
5127             return nested_vmx_failInvalid(vcpu);
5128 
5129         offset = get_vmcs12_field_offset(field);
5130         if (offset < 0)
5131             return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5132 
5133         if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
5134             copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5135 
5136         /* Read the field, zero-extended to a u64 value */
5137         value = vmcs12_read_any(vmcs12, field, offset);
5138     } else {
5139         /*
5140          * Hyper-V TLFS (as of 6.0b) explicitly states, that while an
5141          * enlightened VMCS is active VMREAD/VMWRITE instructions are
5142          * unsupported. Unfortunately, certain versions of Windows 11
5143          * don't comply with this requirement which is not enforced in
5144          * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
5145          * workaround, as misbehaving guests will panic on VM-Fail.
5146          * Note, enlightened VMCS is incompatible with shadow VMCS so
5147          * all VMREADs from L2 should go to L1.
5148          */
5149         if (WARN_ON_ONCE(is_guest_mode(vcpu)))
5150             return nested_vmx_failInvalid(vcpu);
5151 
5152         offset = evmcs_field_offset(field, NULL);
5153         if (offset < 0)
5154             return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5155 
5156         /* Read the field, zero-extended to a u64 value */
5157         value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset);
5158     }
5159 
5160     /*
5161      * Now copy part of this value to register or memory, as requested.
5162      * Note that the number of bits actually copied is 32 or 64 depending
5163      * on the guest's mode (32 or 64 bit), not on the given field's length.
5164      */
5165     if (instr_info & BIT(10)) {
5166         kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
5167     } else {
5168         len = is_64_bit_mode(vcpu) ? 8 : 4;
5169         if (get_vmx_mem_address(vcpu, exit_qualification,
5170                     instr_info, true, len, &gva))
5171             return 1;
5172         /* _system ok, nested_vmx_check_permission has verified cpl=0 */
5173         r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
5174         if (r != X86EMUL_CONTINUE)
5175             return kvm_handle_memory_failure(vcpu, r, &e);
5176     }
5177 
5178     return nested_vmx_succeed(vcpu);
5179 }
5180 
5181 static bool is_shadow_field_rw(unsigned long field)
5182 {
5183     switch (field) {
5184 #define SHADOW_FIELD_RW(x, y) case x:
5185 #include "vmcs_shadow_fields.h"
5186         return true;
5187     default:
5188         break;
5189     }
5190     return false;
5191 }
5192 
5193 static bool is_shadow_field_ro(unsigned long field)
5194 {
5195     switch (field) {
5196 #define SHADOW_FIELD_RO(x, y) case x:
5197 #include "vmcs_shadow_fields.h"
5198         return true;
5199     default:
5200         break;
5201     }
5202     return false;
5203 }
5204 
5205 static int handle_vmwrite(struct kvm_vcpu *vcpu)
5206 {
5207     struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5208                             : get_vmcs12(vcpu);
5209     unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5210     u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5211     struct vcpu_vmx *vmx = to_vmx(vcpu);
5212     struct x86_exception e;
5213     unsigned long field;
5214     short offset;
5215     gva_t gva;
5216     int len, r;
5217 
5218     /*
5219      * The value to write might be 32 or 64 bits, depending on L1's long
5220      * mode, and eventually we need to write that into a field of several
5221      * possible lengths. The code below first zero-extends the value to 64
5222      * bit (value), and then copies only the appropriate number of
5223      * bits into the vmcs12 field.
5224      */
5225     u64 value = 0;
5226 
5227     if (!nested_vmx_check_permission(vcpu))
5228         return 1;
5229 
5230     /*
5231      * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
5232      * any VMWRITE sets the ALU flags for VMfailInvalid.
5233      */
5234     if (vmx->nested.current_vmptr == INVALID_GPA ||
5235         (is_guest_mode(vcpu) &&
5236          get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
5237         return nested_vmx_failInvalid(vcpu);
5238 
5239     if (instr_info & BIT(10))
5240         value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf));
5241     else {
5242         len = is_64_bit_mode(vcpu) ? 8 : 4;
5243         if (get_vmx_mem_address(vcpu, exit_qualification,
5244                     instr_info, false, len, &gva))
5245             return 1;
5246         r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
5247         if (r != X86EMUL_CONTINUE)
5248             return kvm_handle_memory_failure(vcpu, r, &e);
5249     }
5250 
5251     field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
5252 
5253     offset = get_vmcs12_field_offset(field);
5254     if (offset < 0)
5255         return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5256 
5257     /*
5258      * If the vCPU supports "VMWRITE to any supported field in the
5259      * VMCS," then the "read-only" fields are actually read/write.
5260      */
5261     if (vmcs_field_readonly(field) &&
5262         !nested_cpu_has_vmwrite_any_field(vcpu))
5263         return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
5264 
5265     /*
5266      * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
5267      * vmcs12, else we may crush a field or consume a stale value.
5268      */
5269     if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
5270         copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5271 
5272     /*
5273      * Some Intel CPUs intentionally drop the reserved bits of the AR byte
5274      * fields on VMWRITE.  Emulate this behavior to ensure consistent KVM
5275      * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
5276      * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
5277      * from L1 will return a different value than VMREAD from L2 (L1 sees
5278      * the stripped down value, L2 sees the full value as stored by KVM).
5279      */
5280     if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
5281         value &= 0x1f0ff;
5282 
5283     vmcs12_write_any(vmcs12, field, offset, value);
5284 
5285     /*
5286      * Do not track vmcs12 dirty-state if in guest-mode as we actually
5287      * dirty shadow vmcs12 instead of vmcs12.  Fields that can be updated
5288      * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
5289      * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
5290      */
5291     if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
5292         /*
5293          * L1 can read these fields without exiting, ensure the
5294          * shadow VMCS is up-to-date.
5295          */
5296         if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
5297             preempt_disable();
5298             vmcs_load(vmx->vmcs01.shadow_vmcs);
5299 
5300             __vmcs_writel(field, value);
5301 
5302             vmcs_clear(vmx->vmcs01.shadow_vmcs);
5303             vmcs_load(vmx->loaded_vmcs->vmcs);
5304             preempt_enable();
5305         }
5306         vmx->nested.dirty_vmcs12 = true;
5307     }
5308 
5309     return nested_vmx_succeed(vcpu);
5310 }
5311 
5312 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
5313 {
5314     vmx->nested.current_vmptr = vmptr;
5315     if (enable_shadow_vmcs) {
5316         secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
5317         vmcs_write64(VMCS_LINK_POINTER,
5318                  __pa(vmx->vmcs01.shadow_vmcs));
5319         vmx->nested.need_vmcs12_to_shadow_sync = true;
5320     }
5321     vmx->nested.dirty_vmcs12 = true;
5322     vmx->nested.force_msr_bitmap_recalc = true;
5323 }
5324 
5325 /* Emulate the VMPTRLD instruction */
5326 static int handle_vmptrld(struct kvm_vcpu *vcpu)
5327 {
5328     struct vcpu_vmx *vmx = to_vmx(vcpu);
5329     gpa_t vmptr;
5330     int r;
5331 
5332     if (!nested_vmx_check_permission(vcpu))
5333         return 1;
5334 
5335     if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5336         return r;
5337 
5338     if (!page_address_valid(vcpu, vmptr))
5339         return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
5340 
5341     if (vmptr == vmx->nested.vmxon_ptr)
5342         return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
5343 
5344     /* Forbid normal VMPTRLD if Enlightened version was used */
5345     if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
5346         return 1;
5347 
5348     if (vmx->nested.current_vmptr != vmptr) {
5349         struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
5350         struct vmcs_hdr hdr;
5351 
5352         if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
5353             /*
5354              * Reads from an unbacked page return all 1s,
5355              * which means that the 32 bits located at the
5356              * given physical address won't match the required
5357              * VMCS12_REVISION identifier.
5358              */
5359             return nested_vmx_fail(vcpu,
5360                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5361         }
5362 
5363         if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
5364                          offsetof(struct vmcs12, hdr),
5365                          sizeof(hdr))) {
5366             return nested_vmx_fail(vcpu,
5367                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5368         }
5369 
5370         if (hdr.revision_id != VMCS12_REVISION ||
5371             (hdr.shadow_vmcs &&
5372              !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
5373             return nested_vmx_fail(vcpu,
5374                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5375         }
5376 
5377         nested_release_vmcs12(vcpu);
5378 
5379         /*
5380          * Load VMCS12 from guest memory since it is not already
5381          * cached.
5382          */
5383         if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12,
5384                       VMCS12_SIZE)) {
5385             return nested_vmx_fail(vcpu,
5386                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5387         }
5388 
5389         set_current_vmptr(vmx, vmptr);
5390     }
5391 
5392     return nested_vmx_succeed(vcpu);
5393 }
5394 
5395 /* Emulate the VMPTRST instruction */
5396 static int handle_vmptrst(struct kvm_vcpu *vcpu)
5397 {
5398     unsigned long exit_qual = vmx_get_exit_qual(vcpu);
5399     u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5400     gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
5401     struct x86_exception e;
5402     gva_t gva;
5403     int r;
5404 
5405     if (!nested_vmx_check_permission(vcpu))
5406         return 1;
5407 
5408     if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr)))
5409         return 1;
5410 
5411     if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
5412                 true, sizeof(gpa_t), &gva))
5413         return 1;
5414     /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
5415     r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
5416                     sizeof(gpa_t), &e);
5417     if (r != X86EMUL_CONTINUE)
5418         return kvm_handle_memory_failure(vcpu, r, &e);
5419 
5420     return nested_vmx_succeed(vcpu);
5421 }
5422 
5423 /* Emulate the INVEPT instruction */
5424 static int handle_invept(struct kvm_vcpu *vcpu)
5425 {
5426     struct vcpu_vmx *vmx = to_vmx(vcpu);
5427     u32 vmx_instruction_info, types;
5428     unsigned long type, roots_to_free;
5429     struct kvm_mmu *mmu;
5430     gva_t gva;
5431     struct x86_exception e;
5432     struct {
5433         u64 eptp, gpa;
5434     } operand;
5435     int i, r, gpr_index;
5436 
5437     if (!(vmx->nested.msrs.secondary_ctls_high &
5438           SECONDARY_EXEC_ENABLE_EPT) ||
5439         !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
5440         kvm_queue_exception(vcpu, UD_VECTOR);
5441         return 1;
5442     }
5443 
5444     if (!nested_vmx_check_permission(vcpu))
5445         return 1;
5446 
5447     vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5448     gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
5449     type = kvm_register_read(vcpu, gpr_index);
5450 
5451     types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
5452 
5453     if (type >= 32 || !(types & (1 << type)))
5454         return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5455 
5456     /* According to the Intel VMX instruction reference, the memory
5457      * operand is read even if it isn't needed (e.g., for type==global)
5458      */
5459     if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5460             vmx_instruction_info, false, sizeof(operand), &gva))
5461         return 1;
5462     r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
5463     if (r != X86EMUL_CONTINUE)
5464         return kvm_handle_memory_failure(vcpu, r, &e);
5465 
5466     /*
5467      * Nested EPT roots are always held through guest_mmu,
5468      * not root_mmu.
5469      */
5470     mmu = &vcpu->arch.guest_mmu;
5471 
5472     switch (type) {
5473     case VMX_EPT_EXTENT_CONTEXT:
5474         if (!nested_vmx_check_eptp(vcpu, operand.eptp))
5475             return nested_vmx_fail(vcpu,
5476                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5477 
5478         roots_to_free = 0;
5479         if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
5480                         operand.eptp))
5481             roots_to_free |= KVM_MMU_ROOT_CURRENT;
5482 
5483         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5484             if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
5485                             mmu->prev_roots[i].pgd,
5486                             operand.eptp))
5487                 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5488         }
5489         break;
5490     case VMX_EPT_EXTENT_GLOBAL:
5491         roots_to_free = KVM_MMU_ROOTS_ALL;
5492         break;
5493     default:
5494         BUG();
5495         break;
5496     }
5497 
5498     if (roots_to_free)
5499         kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
5500 
5501     return nested_vmx_succeed(vcpu);
5502 }
5503 
5504 static int handle_invvpid(struct kvm_vcpu *vcpu)
5505 {
5506     struct vcpu_vmx *vmx = to_vmx(vcpu);
5507     u32 vmx_instruction_info;
5508     unsigned long type, types;
5509     gva_t gva;
5510     struct x86_exception e;
5511     struct {
5512         u64 vpid;
5513         u64 gla;
5514     } operand;
5515     u16 vpid02;
5516     int r, gpr_index;
5517 
5518     if (!(vmx->nested.msrs.secondary_ctls_high &
5519           SECONDARY_EXEC_ENABLE_VPID) ||
5520             !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
5521         kvm_queue_exception(vcpu, UD_VECTOR);
5522         return 1;
5523     }
5524 
5525     if (!nested_vmx_check_permission(vcpu))
5526         return 1;
5527 
5528     vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5529     gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
5530     type = kvm_register_read(vcpu, gpr_index);
5531 
5532     types = (vmx->nested.msrs.vpid_caps &
5533             VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
5534 
5535     if (type >= 32 || !(types & (1 << type)))
5536         return nested_vmx_fail(vcpu,
5537             VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5538 
5539     /* according to the intel vmx instruction reference, the memory
5540      * operand is read even if it isn't needed (e.g., for type==global)
5541      */
5542     if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5543             vmx_instruction_info, false, sizeof(operand), &gva))
5544         return 1;
5545     r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
5546     if (r != X86EMUL_CONTINUE)
5547         return kvm_handle_memory_failure(vcpu, r, &e);
5548 
5549     if (operand.vpid >> 16)
5550         return nested_vmx_fail(vcpu,
5551             VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5552 
5553     vpid02 = nested_get_vpid02(vcpu);
5554     switch (type) {
5555     case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
5556         if (!operand.vpid ||
5557             is_noncanonical_address(operand.gla, vcpu))
5558             return nested_vmx_fail(vcpu,
5559                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5560         vpid_sync_vcpu_addr(vpid02, operand.gla);
5561         break;
5562     case VMX_VPID_EXTENT_SINGLE_CONTEXT:
5563     case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
5564         if (!operand.vpid)
5565             return nested_vmx_fail(vcpu,
5566                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5567         vpid_sync_context(vpid02);
5568         break;
5569     case VMX_VPID_EXTENT_ALL_CONTEXT:
5570         vpid_sync_context(vpid02);
5571         break;
5572     default:
5573         WARN_ON_ONCE(1);
5574         return kvm_skip_emulated_instruction(vcpu);
5575     }
5576 
5577     /*
5578      * Sync the shadow page tables if EPT is disabled, L1 is invalidating
5579      * linear mappings for L2 (tagged with L2's VPID).  Free all guest
5580      * roots as VPIDs are not tracked in the MMU role.
5581      *
5582      * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
5583      * an MMU when EPT is disabled.
5584      *
5585      * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR.
5586      */
5587     if (!enable_ept)
5588         kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);
5589 
5590     return nested_vmx_succeed(vcpu);
5591 }
5592 
5593 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
5594                      struct vmcs12 *vmcs12)
5595 {
5596     u32 index = kvm_rcx_read(vcpu);
5597     u64 new_eptp;
5598 
5599     if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
5600         return 1;
5601     if (index >= VMFUNC_EPTP_ENTRIES)
5602         return 1;
5603 
5604     if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
5605                      &new_eptp, index * 8, 8))
5606         return 1;
5607 
5608     /*
5609      * If the (L2) guest does a vmfunc to the currently
5610      * active ept pointer, we don't have to do anything else
5611      */
5612     if (vmcs12->ept_pointer != new_eptp) {
5613         if (!nested_vmx_check_eptp(vcpu, new_eptp))
5614             return 1;
5615 
5616         vmcs12->ept_pointer = new_eptp;
5617         nested_ept_new_eptp(vcpu);
5618 
5619         if (!nested_cpu_has_vpid(vmcs12))
5620             kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
5621     }
5622 
5623     return 0;
5624 }
5625 
5626 static int handle_vmfunc(struct kvm_vcpu *vcpu)
5627 {
5628     struct vcpu_vmx *vmx = to_vmx(vcpu);
5629     struct vmcs12 *vmcs12;
5630     u32 function = kvm_rax_read(vcpu);
5631 
5632     /*
5633      * VMFUNC is only supported for nested guests, but we always enable the
5634      * secondary control for simplicity; for non-nested mode, fake that we
5635      * didn't by injecting #UD.
5636      */
5637     if (!is_guest_mode(vcpu)) {
5638         kvm_queue_exception(vcpu, UD_VECTOR);
5639         return 1;
5640     }
5641 
5642     vmcs12 = get_vmcs12(vcpu);
5643 
5644     /*
5645      * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC
5646      * is enabled in vmcs02 if and only if it's enabled in vmcs12.
5647      */
5648     if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) {
5649         kvm_queue_exception(vcpu, UD_VECTOR);
5650         return 1;
5651     }
5652 
5653     if (!(vmcs12->vm_function_control & BIT_ULL(function)))
5654         goto fail;
5655 
5656     switch (function) {
5657     case 0:
5658         if (nested_vmx_eptp_switching(vcpu, vmcs12))
5659             goto fail;
5660         break;
5661     default:
5662         goto fail;
5663     }
5664     return kvm_skip_emulated_instruction(vcpu);
5665 
5666 fail:
5667     /*
5668      * This is effectively a reflected VM-Exit, as opposed to a synthesized
5669      * nested VM-Exit.  Pass the original exit reason, i.e. don't hardcode
5670      * EXIT_REASON_VMFUNC as the exit reason.
5671      */
5672     nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
5673               vmx_get_intr_info(vcpu),
5674               vmx_get_exit_qual(vcpu));
5675     return 1;
5676 }
5677 
5678 /*
5679  * Return true if an IO instruction with the specified port and size should cause
5680  * a VM-exit into L1.
5681  */
5682 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
5683                  int size)
5684 {
5685     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5686     gpa_t bitmap, last_bitmap;
5687     u8 b;
5688 
5689     last_bitmap = INVALID_GPA;
5690     b = -1;
5691 
5692     while (size > 0) {
5693         if (port < 0x8000)
5694             bitmap = vmcs12->io_bitmap_a;
5695         else if (port < 0x10000)
5696             bitmap = vmcs12->io_bitmap_b;
5697         else
5698             return true;
5699         bitmap += (port & 0x7fff) / 8;
5700 
5701         if (last_bitmap != bitmap)
5702             if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
5703                 return true;
5704         if (b & (1 << (port & 7)))
5705             return true;
5706 
5707         port++;
5708         size--;
5709         last_bitmap = bitmap;
5710     }
5711 
5712     return false;
5713 }
5714 
5715 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
5716                        struct vmcs12 *vmcs12)
5717 {
5718     unsigned long exit_qualification;
5719     unsigned short port;
5720     int size;
5721 
5722     if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
5723         return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
5724 
5725     exit_qualification = vmx_get_exit_qual(vcpu);
5726 
5727     port = exit_qualification >> 16;
5728     size = (exit_qualification & 7) + 1;
5729 
5730     return nested_vmx_check_io_bitmaps(vcpu, port, size);
5731 }
5732 
5733 /*
5734  * Return 1 if we should exit from L2 to L1 to handle an MSR access,
5735  * rather than handle it ourselves in L0. I.e., check whether L1 expressed
5736  * disinterest in the current event (read or write a specific MSR) by using an
5737  * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5738  */
5739 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5740                     struct vmcs12 *vmcs12,
5741                     union vmx_exit_reason exit_reason)
5742 {
5743     u32 msr_index = kvm_rcx_read(vcpu);
5744     gpa_t bitmap;
5745 
5746     if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5747         return true;
5748 
5749     /*
5750      * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5751      * for the four combinations of read/write and low/high MSR numbers.
5752      * First we need to figure out which of the four to use:
5753      */
5754     bitmap = vmcs12->msr_bitmap;
5755     if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
5756         bitmap += 2048;
5757     if (msr_index >= 0xc0000000) {
5758         msr_index -= 0xc0000000;
5759         bitmap += 1024;
5760     }
5761 
5762     /* Then read the msr_index'th bit from this bitmap: */
5763     if (msr_index < 1024*8) {
5764         unsigned char b;
5765         if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
5766             return true;
5767         return 1 & (b >> (msr_index & 7));
5768     } else
5769         return true; /* let L1 handle the wrong parameter */
5770 }
5771 
5772 /*
5773  * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
5774  * rather than handle it ourselves in L0. I.e., check if L1 wanted to
5775  * intercept (via guest_host_mask etc.) the current event.
5776  */
5777 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
5778     struct vmcs12 *vmcs12)
5779 {
5780     unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5781     int cr = exit_qualification & 15;
5782     int reg;
5783     unsigned long val;
5784 
5785     switch ((exit_qualification >> 4) & 3) {
5786     case 0: /* mov to cr */
5787         reg = (exit_qualification >> 8) & 15;
5788         val = kvm_register_read(vcpu, reg);
5789         switch (cr) {
5790         case 0:
5791             if (vmcs12->cr0_guest_host_mask &
5792                 (val ^ vmcs12->cr0_read_shadow))
5793                 return true;
5794             break;
5795         case 3:
5796             if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
5797                 return true;
5798             break;
5799         case 4:
5800             if (vmcs12->cr4_guest_host_mask &
5801                 (vmcs12->cr4_read_shadow ^ val))
5802                 return true;
5803             break;
5804         case 8:
5805             if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
5806                 return true;
5807             break;
5808         }
5809         break;
5810     case 2: /* clts */
5811         if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
5812             (vmcs12->cr0_read_shadow & X86_CR0_TS))
5813             return true;
5814         break;
5815     case 1: /* mov from cr */
5816         switch (cr) {
5817         case 3:
5818             if (vmcs12->cpu_based_vm_exec_control &
5819                 CPU_BASED_CR3_STORE_EXITING)
5820                 return true;
5821             break;
5822         case 8:
5823             if (vmcs12->cpu_based_vm_exec_control &
5824                 CPU_BASED_CR8_STORE_EXITING)
5825                 return true;
5826             break;
5827         }
5828         break;
5829     case 3: /* lmsw */
5830         /*
5831          * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5832          * cr0. Other attempted changes are ignored, with no exit.
5833          */
5834         val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5835         if (vmcs12->cr0_guest_host_mask & 0xe &
5836             (val ^ vmcs12->cr0_read_shadow))
5837             return true;
5838         if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5839             !(vmcs12->cr0_read_shadow & 0x1) &&
5840             (val & 0x1))
5841             return true;
5842         break;
5843     }
5844     return false;
5845 }
5846 
5847 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
5848                       struct vmcs12 *vmcs12)
5849 {
5850     u32 encls_leaf;
5851 
5852     if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
5853         !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
5854         return false;
5855 
5856     encls_leaf = kvm_rax_read(vcpu);
5857     if (encls_leaf > 62)
5858         encls_leaf = 63;
5859     return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
5860 }
5861 
5862 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
5863     struct vmcs12 *vmcs12, gpa_t bitmap)
5864 {
5865     u32 vmx_instruction_info;
5866     unsigned long field;
5867     u8 b;
5868 
5869     if (!nested_cpu_has_shadow_vmcs(vmcs12))
5870         return true;
5871 
5872     /* Decode instruction info and find the field to access */
5873     vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5874     field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5875 
5876     /* Out-of-range fields always cause a VM exit from L2 to L1 */
5877     if (field >> 15)
5878         return true;
5879 
5880     if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
5881         return true;
5882 
5883     return 1 & (b >> (field & 7));
5884 }
5885 
5886 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
5887 {
5888     u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;
5889 
5890     if (nested_cpu_has_mtf(vmcs12))
5891         return true;
5892 
5893     /*
5894      * An MTF VM-exit may be injected into the guest by setting the
5895      * interruption-type to 7 (other event) and the vector field to 0. Such
5896      * is the case regardless of the 'monitor trap flag' VM-execution
5897      * control.
5898      */
5899     return entry_intr_info == (INTR_INFO_VALID_MASK
5900                    | INTR_TYPE_OTHER_EVENT);
5901 }
5902 
5903 /*
5904  * Return true if L0 wants to handle an exit from L2 regardless of whether or not
5905  * L1 wants the exit.  Only call this when in is_guest_mode (L2).
5906  */
5907 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
5908                      union vmx_exit_reason exit_reason)
5909 {
5910     u32 intr_info;
5911 
5912     switch ((u16)exit_reason.basic) {
5913     case EXIT_REASON_EXCEPTION_NMI:
5914         intr_info = vmx_get_intr_info(vcpu);
5915         if (is_nmi(intr_info))
5916             return true;
5917         else if (is_page_fault(intr_info))
5918             return vcpu->arch.apf.host_apf_flags ||
5919                    vmx_need_pf_intercept(vcpu);
5920         else if (is_debug(intr_info) &&
5921              vcpu->guest_debug &
5922              (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
5923             return true;
5924         else if (is_breakpoint(intr_info) &&
5925              vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5926             return true;
5927         else if (is_alignment_check(intr_info) &&
5928              !vmx_guest_inject_ac(vcpu))
5929             return true;
5930         return false;
5931     case EXIT_REASON_EXTERNAL_INTERRUPT:
5932         return true;
5933     case EXIT_REASON_MCE_DURING_VMENTRY:
5934         return true;
5935     case EXIT_REASON_EPT_VIOLATION:
5936         /*
5937          * L0 always deals with the EPT violation. If nested EPT is
5938          * used, and the nested mmu code discovers that the address is
5939          * missing in the guest EPT table (EPT12), the EPT violation
5940          * will be injected with nested_ept_inject_page_fault()
5941          */
5942         return true;
5943     case EXIT_REASON_EPT_MISCONFIG:
5944         /*
5945          * L2 never uses directly L1's EPT, but rather L0's own EPT
5946          * table (shadow on EPT) or a merged EPT table that L0 built
5947          * (EPT on EPT). So any problems with the structure of the
5948          * table is L0's fault.
5949          */
5950         return true;
5951     case EXIT_REASON_PREEMPTION_TIMER:
5952         return true;
5953     case EXIT_REASON_PML_FULL:
5954         /*
5955          * PML is emulated for an L1 VMM and should never be enabled in
5956          * vmcs02, always "handle" PML_FULL by exiting to userspace.
5957          */
5958         return true;
5959     case EXIT_REASON_VMFUNC:
5960         /* VM functions are emulated through L2->L0 vmexits. */
5961         return true;
5962     case EXIT_REASON_BUS_LOCK:
5963         /*
5964          * At present, bus lock VM exit is never exposed to L1.
5965          * Handle L2's bus locks in L0 directly.
5966          */
5967         return true;
5968     default:
5969         break;
5970     }
5971     return false;
5972 }
5973 
5974 /*
5975  * Return 1 if L1 wants to intercept an exit from L2.  Only call this when in
5976  * is_guest_mode (L2).
5977  */
5978 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
5979                      union vmx_exit_reason exit_reason)
5980 {
5981     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5982     u32 intr_info;
5983 
5984     switch ((u16)exit_reason.basic) {
5985     case EXIT_REASON_EXCEPTION_NMI:
5986         intr_info = vmx_get_intr_info(vcpu);
5987         if (is_nmi(intr_info))
5988             return true;
5989         else if (is_page_fault(intr_info))
5990             return true;
5991         return vmcs12->exception_bitmap &
5992                 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
5993     case EXIT_REASON_EXTERNAL_INTERRUPT:
5994         return nested_exit_on_intr(vcpu);
5995     case EXIT_REASON_TRIPLE_FAULT:
5996         return true;
5997     case EXIT_REASON_INTERRUPT_WINDOW:
5998         return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
5999     case EXIT_REASON_NMI_WINDOW:
6000         return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
6001     case EXIT_REASON_TASK_SWITCH:
6002         return true;
6003     case EXIT_REASON_CPUID:
6004         return true;
6005     case EXIT_REASON_HLT:
6006         return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
6007     case EXIT_REASON_INVD:
6008         return true;
6009     case EXIT_REASON_INVLPG:
6010         return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
6011     case EXIT_REASON_RDPMC:
6012         return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
6013     case EXIT_REASON_RDRAND:
6014         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
6015     case EXIT_REASON_RDSEED:
6016         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
6017     case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
6018         return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
6019     case EXIT_REASON_VMREAD:
6020         return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
6021             vmcs12->vmread_bitmap);
6022     case EXIT_REASON_VMWRITE:
6023         return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
6024             vmcs12->vmwrite_bitmap);
6025     case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
6026     case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
6027     case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
6028     case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
6029     case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
6030         /*
6031          * VMX instructions trap unconditionally. This allows L1 to
6032          * emulate them for its L2 guest, i.e., allows 3-level nesting!
6033          */
6034         return true;
6035     case EXIT_REASON_CR_ACCESS:
6036         return nested_vmx_exit_handled_cr(vcpu, vmcs12);
6037     case EXIT_REASON_DR_ACCESS:
6038         return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
6039     case EXIT_REASON_IO_INSTRUCTION:
6040         return nested_vmx_exit_handled_io(vcpu, vmcs12);
6041     case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
6042         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
6043     case EXIT_REASON_MSR_READ:
6044     case EXIT_REASON_MSR_WRITE:
6045         return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
6046     case EXIT_REASON_INVALID_STATE:
6047         return true;
6048     case EXIT_REASON_MWAIT_INSTRUCTION:
6049         return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
6050     case EXIT_REASON_MONITOR_TRAP_FLAG:
6051         return nested_vmx_exit_handled_mtf(vmcs12);
6052     case EXIT_REASON_MONITOR_INSTRUCTION:
6053         return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
6054     case EXIT_REASON_PAUSE_INSTRUCTION:
6055         return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
6056             nested_cpu_has2(vmcs12,
6057                 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
6058     case EXIT_REASON_MCE_DURING_VMENTRY:
6059         return true;
6060     case EXIT_REASON_TPR_BELOW_THRESHOLD:
6061         return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
6062     case EXIT_REASON_APIC_ACCESS:
6063     case EXIT_REASON_APIC_WRITE:
6064     case EXIT_REASON_EOI_INDUCED:
6065         /*
6066          * The controls for "virtualize APIC accesses," "APIC-
6067          * register virtualization," and "virtual-interrupt
6068          * delivery" only come from vmcs12.
6069          */
6070         return true;
6071     case EXIT_REASON_INVPCID:
6072         return
6073             nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
6074             nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
6075     case EXIT_REASON_WBINVD:
6076         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
6077     case EXIT_REASON_XSETBV:
6078         return true;
6079     case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
6080         /*
6081          * This should never happen, since it is not possible to
6082          * set XSS to a non-zero value---neither in L1 nor in L2.
6083          * If if it were, XSS would have to be checked against
6084          * the XSS exit bitmap in vmcs12.
6085          */
6086         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
6087     case EXIT_REASON_UMWAIT:
6088     case EXIT_REASON_TPAUSE:
6089         return nested_cpu_has2(vmcs12,
6090             SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
6091     case EXIT_REASON_ENCLS:
6092         return nested_vmx_exit_handled_encls(vcpu, vmcs12);
6093     case EXIT_REASON_NOTIFY:
6094         /* Notify VM exit is not exposed to L1 */
6095         return false;
6096     default:
6097         return true;
6098     }
6099 }
6100 
6101 /*
6102  * Conditionally reflect a VM-Exit into L1.  Returns %true if the VM-Exit was
6103  * reflected into L1.
6104  */
6105 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
6106 {
6107     struct vcpu_vmx *vmx = to_vmx(vcpu);
6108     union vmx_exit_reason exit_reason = vmx->exit_reason;
6109     unsigned long exit_qual;
6110     u32 exit_intr_info;
6111 
6112     WARN_ON_ONCE(vmx->nested.nested_run_pending);
6113 
6114     /*
6115      * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
6116      * has already loaded L2's state.
6117      */
6118     if (unlikely(vmx->fail)) {
6119         trace_kvm_nested_vmenter_failed(
6120             "hardware VM-instruction error: ",
6121             vmcs_read32(VM_INSTRUCTION_ERROR));
6122         exit_intr_info = 0;
6123         exit_qual = 0;
6124         goto reflect_vmexit;
6125     }
6126 
6127     trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);
6128 
6129     /* If L0 (KVM) wants the exit, it trumps L1's desires. */
6130     if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
6131         return false;
6132 
6133     /* If L1 doesn't want the exit, handle it in L0. */
6134     if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
6135         return false;
6136 
6137     /*
6138      * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits.  For
6139      * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
6140      * need to be synthesized by querying the in-kernel LAPIC, but external
6141      * interrupts are never reflected to L1 so it's a non-issue.
6142      */
6143     exit_intr_info = vmx_get_intr_info(vcpu);
6144     if (is_exception_with_error_code(exit_intr_info)) {
6145         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6146 
6147         vmcs12->vm_exit_intr_error_code =
6148             vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6149     }
6150     exit_qual = vmx_get_exit_qual(vcpu);
6151 
6152 reflect_vmexit:
6153     nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
6154     return true;
6155 }
6156 
6157 static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
6158                 struct kvm_nested_state __user *user_kvm_nested_state,
6159                 u32 user_data_size)
6160 {
6161     struct vcpu_vmx *vmx;
6162     struct vmcs12 *vmcs12;
6163     struct kvm_nested_state kvm_state = {
6164         .flags = 0,
6165         .format = KVM_STATE_NESTED_FORMAT_VMX,
6166         .size = sizeof(kvm_state),
6167         .hdr.vmx.flags = 0,
6168         .hdr.vmx.vmxon_pa = INVALID_GPA,
6169         .hdr.vmx.vmcs12_pa = INVALID_GPA,
6170         .hdr.vmx.preemption_timer_deadline = 0,
6171     };
6172     struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
6173         &user_kvm_nested_state->data.vmx[0];
6174 
6175     if (!vcpu)
6176         return kvm_state.size + sizeof(*user_vmx_nested_state);
6177 
6178     vmx = to_vmx(vcpu);
6179     vmcs12 = get_vmcs12(vcpu);
6180 
6181     if (nested_vmx_allowed(vcpu) &&
6182         (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
6183         kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
6184         kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
6185 
6186         if (vmx_has_valid_vmcs12(vcpu)) {
6187             kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
6188 
6189             /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
6190             if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
6191                 kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
6192 
6193             if (is_guest_mode(vcpu) &&
6194                 nested_cpu_has_shadow_vmcs(vmcs12) &&
6195                 vmcs12->vmcs_link_pointer != INVALID_GPA)
6196                 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
6197         }
6198 
6199         if (vmx->nested.smm.vmxon)
6200             kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
6201 
6202         if (vmx->nested.smm.guest_mode)
6203             kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
6204 
6205         if (is_guest_mode(vcpu)) {
6206             kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
6207 
6208             if (vmx->nested.nested_run_pending)
6209                 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
6210 
6211             if (vmx->nested.mtf_pending)
6212                 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
6213 
6214             if (nested_cpu_has_preemption_timer(vmcs12) &&
6215                 vmx->nested.has_preemption_timer_deadline) {
6216                 kvm_state.hdr.vmx.flags |=
6217                     KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
6218                 kvm_state.hdr.vmx.preemption_timer_deadline =
6219                     vmx->nested.preemption_timer_deadline;
6220             }
6221         }
6222     }
6223 
6224     if (user_data_size < kvm_state.size)
6225         goto out;
6226 
6227     if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
6228         return -EFAULT;
6229 
6230     if (!vmx_has_valid_vmcs12(vcpu))
6231         goto out;
6232 
6233     /*
6234      * When running L2, the authoritative vmcs12 state is in the
6235      * vmcs02. When running L1, the authoritative vmcs12 state is
6236      * in the shadow or enlightened vmcs linked to vmcs01, unless
6237      * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
6238      * vmcs12 state is in the vmcs12 already.
6239      */
6240     if (is_guest_mode(vcpu)) {
6241         sync_vmcs02_to_vmcs12(vcpu, vmcs12);
6242         sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
6243     } else  {
6244         copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
6245         if (!vmx->nested.need_vmcs12_to_shadow_sync) {
6246             if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
6247                 /*
6248                  * L1 hypervisor is not obliged to keep eVMCS
6249                  * clean fields data always up-to-date while
6250                  * not in guest mode, 'hv_clean_fields' is only
6251                  * supposed to be actual upon vmentry so we need
6252                  * to ignore it here and do full copy.
6253                  */
6254                 copy_enlightened_to_vmcs12(vmx, 0);
6255             else if (enable_shadow_vmcs)
6256                 copy_shadow_to_vmcs12(vmx);
6257         }
6258     }
6259 
6260     BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
6261     BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
6262 
6263     /*
6264      * Copy over the full allocated size of vmcs12 rather than just the size
6265      * of the struct.
6266      */
6267     if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
6268         return -EFAULT;
6269 
6270     if (nested_cpu_has_shadow_vmcs(vmcs12) &&
6271         vmcs12->vmcs_link_pointer != INVALID_GPA) {
6272         if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
6273                  get_shadow_vmcs12(vcpu), VMCS12_SIZE))
6274             return -EFAULT;
6275     }
6276 out:
6277     return kvm_state.size;
6278 }
6279 
6280 /*
6281  * Forcibly leave nested mode in order to be able to reset the VCPU later on.
6282  */
6283 void vmx_leave_nested(struct kvm_vcpu *vcpu)
6284 {
6285     if (is_guest_mode(vcpu)) {
6286         to_vmx(vcpu)->nested.nested_run_pending = 0;
6287         nested_vmx_vmexit(vcpu, -1, 0, 0);
6288     }
6289     free_nested(vcpu);
6290 }
6291 
6292 static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
6293                 struct kvm_nested_state __user *user_kvm_nested_state,
6294                 struct kvm_nested_state *kvm_state)
6295 {
6296     struct vcpu_vmx *vmx = to_vmx(vcpu);
6297     struct vmcs12 *vmcs12;
6298     enum vm_entry_failure_code ignored;
6299     struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
6300         &user_kvm_nested_state->data.vmx[0];
6301     int ret;
6302 
6303     if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
6304         return -EINVAL;
6305 
6306     if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
6307         if (kvm_state->hdr.vmx.smm.flags)
6308             return -EINVAL;
6309 
6310         if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
6311             return -EINVAL;
6312 
6313         /*
6314          * KVM_STATE_NESTED_EVMCS used to signal that KVM should
6315          * enable eVMCS capability on vCPU. However, since then
6316          * code was changed such that flag signals vmcs12 should
6317          * be copied into eVMCS in guest memory.
6318          *
6319          * To preserve backwards compatability, allow user
6320          * to set this flag even when there is no VMXON region.
6321          */
6322         if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
6323             return -EINVAL;
6324     } else {
6325         if (!nested_vmx_allowed(vcpu))
6326             return -EINVAL;
6327 
6328         if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
6329             return -EINVAL;
6330     }
6331 
6332     if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
6333         (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
6334         return -EINVAL;
6335 
6336     if (kvm_state->hdr.vmx.smm.flags &
6337         ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
6338         return -EINVAL;
6339 
6340     if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
6341         return -EINVAL;
6342 
6343     /*
6344      * SMM temporarily disables VMX, so we cannot be in guest mode,
6345      * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
6346      * must be zero.
6347      */
6348     if (is_smm(vcpu) ?
6349         (kvm_state->flags &
6350          (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
6351         : kvm_state->hdr.vmx.smm.flags)
6352         return -EINVAL;
6353 
6354     if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
6355         !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
6356         return -EINVAL;
6357 
6358     if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
6359         (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
6360             return -EINVAL;
6361 
6362     vmx_leave_nested(vcpu);
6363 
6364     if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
6365         return 0;
6366 
6367     vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
6368     ret = enter_vmx_operation(vcpu);
6369     if (ret)
6370         return ret;
6371 
6372     /* Empty 'VMXON' state is permitted if no VMCS loaded */
6373     if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
6374         /* See vmx_has_valid_vmcs12.  */
6375         if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
6376             (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
6377             (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
6378             return -EINVAL;
6379         else
6380             return 0;
6381     }
6382 
6383     if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
6384         if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
6385             !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
6386             return -EINVAL;
6387 
6388         set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
6389     } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
6390         /*
6391          * nested_vmx_handle_enlightened_vmptrld() cannot be called
6392          * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
6393          * restored yet. EVMCS will be mapped from
6394          * nested_get_vmcs12_pages().
6395          */
6396         vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
6397         kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
6398     } else {
6399         return -EINVAL;
6400     }
6401 
6402     if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
6403         vmx->nested.smm.vmxon = true;
6404         vmx->nested.vmxon = false;
6405 
6406         if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
6407             vmx->nested.smm.guest_mode = true;
6408     }
6409 
6410     vmcs12 = get_vmcs12(vcpu);
6411     if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
6412         return -EFAULT;
6413 
6414     if (vmcs12->hdr.revision_id != VMCS12_REVISION)
6415         return -EINVAL;
6416 
6417     if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
6418         return 0;
6419 
6420     vmx->nested.nested_run_pending =
6421         !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
6422 
6423     vmx->nested.mtf_pending =
6424         !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
6425 
6426     ret = -EINVAL;
6427     if (nested_cpu_has_shadow_vmcs(vmcs12) &&
6428         vmcs12->vmcs_link_pointer != INVALID_GPA) {
6429         struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
6430 
6431         if (kvm_state->size <
6432             sizeof(*kvm_state) +
6433             sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
6434             goto error_guest_mode;
6435 
6436         if (copy_from_user(shadow_vmcs12,
6437                    user_vmx_nested_state->shadow_vmcs12,
6438                    sizeof(*shadow_vmcs12))) {
6439             ret = -EFAULT;
6440             goto error_guest_mode;
6441         }
6442 
6443         if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
6444             !shadow_vmcs12->hdr.shadow_vmcs)
6445             goto error_guest_mode;
6446     }
6447 
6448     vmx->nested.has_preemption_timer_deadline = false;
6449     if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
6450         vmx->nested.has_preemption_timer_deadline = true;
6451         vmx->nested.preemption_timer_deadline =
6452             kvm_state->hdr.vmx.preemption_timer_deadline;
6453     }
6454 
6455     if (nested_vmx_check_controls(vcpu, vmcs12) ||
6456         nested_vmx_check_host_state(vcpu, vmcs12) ||
6457         nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
6458         goto error_guest_mode;
6459 
6460     vmx->nested.dirty_vmcs12 = true;
6461     vmx->nested.force_msr_bitmap_recalc = true;
6462     ret = nested_vmx_enter_non_root_mode(vcpu, false);
6463     if (ret)
6464         goto error_guest_mode;
6465 
6466     return 0;
6467 
6468 error_guest_mode:
6469     vmx->nested.nested_run_pending = 0;
6470     return ret;
6471 }
6472 
6473 void nested_vmx_set_vmcs_shadowing_bitmap(void)
6474 {
6475     if (enable_shadow_vmcs) {
6476         vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
6477         vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
6478     }
6479 }
6480 
6481 /*
6482  * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6.  Undo
6483  * that madness to get the encoding for comparison.
6484  */
6485 #define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
6486 
6487 static u64 nested_vmx_calc_vmcs_enum_msr(void)
6488 {
6489     /*
6490      * Note these are the so called "index" of the VMCS field encoding, not
6491      * the index into vmcs12.
6492      */
6493     unsigned int max_idx, idx;
6494     int i;
6495 
6496     /*
6497      * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
6498      * vmcs12, regardless of whether or not the associated feature is
6499      * exposed to L1.  Simply find the field with the highest index.
6500      */
6501     max_idx = 0;
6502     for (i = 0; i < nr_vmcs12_fields; i++) {
6503         /* The vmcs12 table is very, very sparsely populated. */
6504         if (!vmcs12_field_offsets[i])
6505             continue;
6506 
6507         idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
6508         if (idx > max_idx)
6509             max_idx = idx;
6510     }
6511 
6512     return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
6513 }
6514 
6515 /*
6516  * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
6517  * returned for the various VMX controls MSRs when nested VMX is enabled.
6518  * The same values should also be used to verify that vmcs12 control fields are
6519  * valid during nested entry from L1 to L2.
6520  * Each of these control msrs has a low and high 32-bit half: A low bit is on
6521  * if the corresponding bit in the (32-bit) control field *must* be on, and a
6522  * bit in the high half is on if the corresponding bit in the control field
6523  * may be on. See also vmx_control_verify().
6524  */
6525 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
6526 {
6527     /*
6528      * Note that as a general rule, the high half of the MSRs (bits in
6529      * the control fields which may be 1) should be initialized by the
6530      * intersection of the underlying hardware's MSR (i.e., features which
6531      * can be supported) and the list of features we want to expose -
6532      * because they are known to be properly supported in our code.
6533      * Also, usually, the low half of the MSRs (bits which must be 1) can
6534      * be set to 0, meaning that L1 may turn off any of these bits. The
6535      * reason is that if one of these bits is necessary, it will appear
6536      * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
6537      * fields of vmcs01 and vmcs02, will turn these bits off - and
6538      * nested_vmx_l1_wants_exit() will not pass related exits to L1.
6539      * These rules have exceptions below.
6540      */
6541 
6542     /* pin-based controls */
6543     rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
6544         msrs->pinbased_ctls_low,
6545         msrs->pinbased_ctls_high);
6546     msrs->pinbased_ctls_low |=
6547         PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6548     msrs->pinbased_ctls_high &=
6549         PIN_BASED_EXT_INTR_MASK |
6550         PIN_BASED_NMI_EXITING |
6551         PIN_BASED_VIRTUAL_NMIS |
6552         (enable_apicv ? PIN_BASED_POSTED_INTR : 0);
6553     msrs->pinbased_ctls_high |=
6554         PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
6555         PIN_BASED_VMX_PREEMPTION_TIMER;
6556 
6557     /* exit controls */
6558     rdmsr(MSR_IA32_VMX_EXIT_CTLS,
6559         msrs->exit_ctls_low,
6560         msrs->exit_ctls_high);
6561     msrs->exit_ctls_low =
6562         VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
6563 
6564     msrs->exit_ctls_high &=
6565 #ifdef CONFIG_X86_64
6566         VM_EXIT_HOST_ADDR_SPACE_SIZE |
6567 #endif
6568         VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
6569         VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
6570     msrs->exit_ctls_high |=
6571         VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
6572         VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
6573         VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
6574 
6575     /* We support free control of debug control saving. */
6576     msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
6577 
6578     /* entry controls */
6579     rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
6580         msrs->entry_ctls_low,
6581         msrs->entry_ctls_high);
6582     msrs->entry_ctls_low =
6583         VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
6584     msrs->entry_ctls_high &=
6585 #ifdef CONFIG_X86_64
6586         VM_ENTRY_IA32E_MODE |
6587 #endif
6588         VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
6589         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
6590     msrs->entry_ctls_high |=
6591         (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
6592 
6593     /* We support free control of debug control loading. */
6594     msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
6595 
6596     /* cpu-based controls */
6597     rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
6598         msrs->procbased_ctls_low,
6599         msrs->procbased_ctls_high);
6600     msrs->procbased_ctls_low =
6601         CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6602     msrs->procbased_ctls_high &=
6603         CPU_BASED_INTR_WINDOW_EXITING |
6604         CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
6605         CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
6606         CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
6607         CPU_BASED_CR3_STORE_EXITING |
6608 #ifdef CONFIG_X86_64
6609         CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
6610 #endif
6611         CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
6612         CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
6613         CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
6614         CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
6615         CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
6616     /*
6617      * We can allow some features even when not supported by the
6618      * hardware. For example, L1 can specify an MSR bitmap - and we
6619      * can use it to avoid exits to L1 - even when L0 runs L2
6620      * without MSR bitmaps.
6621      */
6622     msrs->procbased_ctls_high |=
6623         CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
6624         CPU_BASED_USE_MSR_BITMAPS;
6625 
6626     /* We support free control of CR3 access interception. */
6627     msrs->procbased_ctls_low &=
6628         ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
6629 
6630     /*
6631      * secondary cpu-based controls.  Do not include those that
6632      * depend on CPUID bits, they are added later by
6633      * vmx_vcpu_after_set_cpuid.
6634      */
6635     if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
6636         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
6637               msrs->secondary_ctls_low,
6638               msrs->secondary_ctls_high);
6639 
6640     msrs->secondary_ctls_low = 0;
6641     msrs->secondary_ctls_high &=
6642         SECONDARY_EXEC_DESC |
6643         SECONDARY_EXEC_ENABLE_RDTSCP |
6644         SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
6645         SECONDARY_EXEC_WBINVD_EXITING |
6646         SECONDARY_EXEC_APIC_REGISTER_VIRT |
6647         SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
6648         SECONDARY_EXEC_RDRAND_EXITING |
6649         SECONDARY_EXEC_ENABLE_INVPCID |
6650         SECONDARY_EXEC_RDSEED_EXITING |
6651         SECONDARY_EXEC_XSAVES |
6652         SECONDARY_EXEC_TSC_SCALING;
6653 
6654     /*
6655      * We can emulate "VMCS shadowing," even if the hardware
6656      * doesn't support it.
6657      */
6658     msrs->secondary_ctls_high |=
6659         SECONDARY_EXEC_SHADOW_VMCS;
6660 
6661     if (enable_ept) {
6662         /* nested EPT: emulate EPT also to L1 */
6663         msrs->secondary_ctls_high |=
6664             SECONDARY_EXEC_ENABLE_EPT;
6665         msrs->ept_caps =
6666             VMX_EPT_PAGE_WALK_4_BIT |
6667             VMX_EPT_PAGE_WALK_5_BIT |
6668             VMX_EPTP_WB_BIT |
6669             VMX_EPT_INVEPT_BIT |
6670             VMX_EPT_EXECUTE_ONLY_BIT;
6671 
6672         msrs->ept_caps &= ept_caps;
6673         msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
6674             VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
6675             VMX_EPT_1GB_PAGE_BIT;
6676         if (enable_ept_ad_bits) {
6677             msrs->secondary_ctls_high |=
6678                 SECONDARY_EXEC_ENABLE_PML;
6679             msrs->ept_caps |= VMX_EPT_AD_BIT;
6680         }
6681     }
6682 
6683     if (cpu_has_vmx_vmfunc()) {
6684         msrs->secondary_ctls_high |=
6685             SECONDARY_EXEC_ENABLE_VMFUNC;
6686         /*
6687          * Advertise EPTP switching unconditionally
6688          * since we emulate it
6689          */
6690         if (enable_ept)
6691             msrs->vmfunc_controls =
6692                 VMX_VMFUNC_EPTP_SWITCHING;
6693     }
6694 
6695     /*
6696      * Old versions of KVM use the single-context version without
6697      * checking for support, so declare that it is supported even
6698      * though it is treated as global context.  The alternative is
6699      * not failing the single-context invvpid, and it is worse.
6700      */
6701     if (enable_vpid) {
6702         msrs->secondary_ctls_high |=
6703             SECONDARY_EXEC_ENABLE_VPID;
6704         msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
6705             VMX_VPID_EXTENT_SUPPORTED_MASK;
6706     }
6707 
6708     if (enable_unrestricted_guest)
6709         msrs->secondary_ctls_high |=
6710             SECONDARY_EXEC_UNRESTRICTED_GUEST;
6711 
6712     if (flexpriority_enabled)
6713         msrs->secondary_ctls_high |=
6714             SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6715 
6716     if (enable_sgx)
6717         msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
6718 
6719     /* miscellaneous data */
6720     rdmsr(MSR_IA32_VMX_MISC,
6721         msrs->misc_low,
6722         msrs->misc_high);
6723     msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
6724     msrs->misc_low |=
6725         MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
6726         VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
6727         VMX_MISC_ACTIVITY_HLT |
6728         VMX_MISC_ACTIVITY_WAIT_SIPI;
6729     msrs->misc_high = 0;
6730 
6731     /*
6732      * This MSR reports some information about VMX support. We
6733      * should return information about the VMX we emulate for the
6734      * guest, and the VMCS structure we give it - not about the
6735      * VMX support of the underlying hardware.
6736      */
6737     msrs->basic =
6738         VMCS12_REVISION |
6739         VMX_BASIC_TRUE_CTLS |
6740         ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
6741         (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
6742 
6743     if (cpu_has_vmx_basic_inout())
6744         msrs->basic |= VMX_BASIC_INOUT;
6745 
6746     /*
6747      * These MSRs specify bits which the guest must keep fixed on
6748      * while L1 is in VMXON mode (in L1's root mode, or running an L2).
6749      * We picked the standard core2 setting.
6750      */
6751 #define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
6752 #define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
6753     msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
6754     msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
6755 
6756     /* These MSRs specify bits which the guest must keep fixed off. */
6757     rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
6758     rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
6759 
6760     if (vmx_umip_emulated())
6761         msrs->cr4_fixed1 |= X86_CR4_UMIP;
6762 
6763     msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
6764 }
6765 
6766 void nested_vmx_hardware_unsetup(void)
6767 {
6768     int i;
6769 
6770     if (enable_shadow_vmcs) {
6771         for (i = 0; i < VMX_BITMAP_NR; i++)
6772             free_page((unsigned long)vmx_bitmap[i]);
6773     }
6774 }
6775 
6776 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
6777 {
6778     int i;
6779 
6780     if (!cpu_has_vmx_shadow_vmcs())
6781         enable_shadow_vmcs = 0;
6782     if (enable_shadow_vmcs) {
6783         for (i = 0; i < VMX_BITMAP_NR; i++) {
6784             /*
6785              * The vmx_bitmap is not tied to a VM and so should
6786              * not be charged to a memcg.
6787              */
6788             vmx_bitmap[i] = (unsigned long *)
6789                 __get_free_page(GFP_KERNEL);
6790             if (!vmx_bitmap[i]) {
6791                 nested_vmx_hardware_unsetup();
6792                 return -ENOMEM;
6793             }
6794         }
6795 
6796         init_vmcs_shadow_fields();
6797     }
6798 
6799     exit_handlers[EXIT_REASON_VMCLEAR]  = handle_vmclear;
6800     exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
6801     exit_handlers[EXIT_REASON_VMPTRLD]  = handle_vmptrld;
6802     exit_handlers[EXIT_REASON_VMPTRST]  = handle_vmptrst;
6803     exit_handlers[EXIT_REASON_VMREAD]   = handle_vmread;
6804     exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
6805     exit_handlers[EXIT_REASON_VMWRITE]  = handle_vmwrite;
6806     exit_handlers[EXIT_REASON_VMOFF]    = handle_vmxoff;
6807     exit_handlers[EXIT_REASON_VMON]     = handle_vmxon;
6808     exit_handlers[EXIT_REASON_INVEPT]   = handle_invept;
6809     exit_handlers[EXIT_REASON_INVVPID]  = handle_invvpid;
6810     exit_handlers[EXIT_REASON_VMFUNC]   = handle_vmfunc;
6811 
6812     return 0;
6813 }
6814 
6815 struct kvm_x86_nested_ops vmx_nested_ops = {
6816     .leave_nested = vmx_leave_nested,
6817     .check_events = vmx_check_nested_events,
6818     .handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround,
6819     .hv_timer_pending = nested_vmx_preemption_timer_pending,
6820     .triple_fault = nested_vmx_triple_fault,
6821     .get_state = vmx_get_nested_state,
6822     .set_state = vmx_set_nested_state,
6823     .get_nested_state_pages = vmx_get_nested_state_pages,
6824     .write_log_dirty = nested_vmx_write_pml_buffer,
6825     .enable_evmcs = nested_enable_evmcs,
6826     .get_evmcs_version = nested_get_evmcs_version,
6827 };