0001 #define pr_fmt(fmt) "SVM: " fmt
0002 
0003 #include <linux/kvm_host.h>
0004 
0005 #include "irq.h"
0006 #include "mmu.h"
0007 #include "kvm_cache_regs.h"
0008 #include "x86.h"
0009 #include "cpuid.h"
0010 #include "pmu.h"
0011 
0012 #include <linux/module.h>
0013 #include <linux/mod_devicetable.h>
0014 #include <linux/kernel.h>
0015 #include <linux/vmalloc.h>
0016 #include <linux/highmem.h>
0017 #include <linux/amd-iommu.h>
0018 #include <linux/sched.h>
0019 #include <linux/trace_events.h>
0020 #include <linux/slab.h>
0021 #include <linux/hashtable.h>
0022 #include <linux/objtool.h>
0023 #include <linux/psp-sev.h>
0024 #include <linux/file.h>
0025 #include <linux/pagemap.h>
0026 #include <linux/swap.h>
0027 #include <linux/rwsem.h>
0028 #include <linux/cc_platform.h>
0029 
0030 #include <asm/apic.h>
0031 #include <asm/perf_event.h>
0032 #include <asm/tlbflush.h>
0033 #include <asm/desc.h>
0034 #include <asm/debugreg.h>
0035 #include <asm/kvm_para.h>
0036 #include <asm/irq_remapping.h>
0037 #include <asm/spec-ctrl.h>
0038 #include <asm/cpu_device_id.h>
0039 #include <asm/traps.h>
0040 #include <asm/fpu/api.h>
0041 
0042 #include <asm/virtext.h>
0043 #include "trace.h"
0044 
0045 #include "svm.h"
0046 #include "svm_ops.h"
0047 
0048 #include "kvm_onhyperv.h"
0049 #include "svm_onhyperv.h"
0050 
0051 MODULE_AUTHOR("Qumranet");
0052 MODULE_LICENSE("GPL");
0053 
0054 #ifdef MODULE
0055 static const struct x86_cpu_id svm_cpu_id[] = {
0056     X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
0057     {}
0058 };
0059 MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
0060 #endif
0061 
0062 #define SEG_TYPE_LDT 2
0063 #define SEG_TYPE_BUSY_TSS16 3
0064 
0065 static bool erratum_383_found __read_mostly;
0066 
0067 u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
0068 
0069 /*
0070  * Set osvw_len to a higher value when updated Revision Guides
0071  * are published and we know what the new status bits are.
0072  */
0073 static uint64_t osvw_len = 4, osvw_status;
0074 
0075 static DEFINE_PER_CPU(u64, current_tsc_ratio);
0076 
0077 #define X2APIC_MSR(x)   (APIC_BASE_MSR + (x >> 4))
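/*
 * Worked example of the macro above (using the standard apicdef.h
 * values, where APIC_BASE_MSR is 0x800): the APIC ID register at MMIO
 * offset 0x20 maps to X2APIC_MSR(APIC_ID) = 0x800 + (0x20 >> 4) = 0x802,
 * and the LVT timer register at offset 0x320 maps to MSR 0x832, the MSR
 * called out in the LVTT note in direct_access_msrs[] below.
 */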
0078 
0079 static const struct svm_direct_access_msrs {
0080     u32 index;   /* Index of the MSR */
0081     bool always; /* True if intercept is initially cleared */
0082 } direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
0083     { .index = MSR_STAR,                .always = true  },
0084     { .index = MSR_IA32_SYSENTER_CS,        .always = true  },
0085     { .index = MSR_IA32_SYSENTER_EIP,       .always = false },
0086     { .index = MSR_IA32_SYSENTER_ESP,       .always = false },
0087 #ifdef CONFIG_X86_64
0088     { .index = MSR_GS_BASE,             .always = true  },
0089     { .index = MSR_FS_BASE,             .always = true  },
0090     { .index = MSR_KERNEL_GS_BASE,          .always = true  },
0091     { .index = MSR_LSTAR,               .always = true  },
0092     { .index = MSR_CSTAR,               .always = true  },
0093     { .index = MSR_SYSCALL_MASK,            .always = true  },
0094 #endif
0095     { .index = MSR_IA32_SPEC_CTRL,          .always = false },
0096     { .index = MSR_IA32_PRED_CMD,           .always = false },
0097     { .index = MSR_IA32_LASTBRANCHFROMIP,       .always = false },
0098     { .index = MSR_IA32_LASTBRANCHTOIP,     .always = false },
0099     { .index = MSR_IA32_LASTINTFROMIP,      .always = false },
0100     { .index = MSR_IA32_LASTINTTOIP,        .always = false },
0101     { .index = MSR_EFER,                .always = false },
0102     { .index = MSR_IA32_CR_PAT,         .always = false },
0103     { .index = MSR_AMD64_SEV_ES_GHCB,       .always = true  },
0104     { .index = MSR_TSC_AUX,             .always = false },
0105     { .index = X2APIC_MSR(APIC_ID),         .always = false },
0106     { .index = X2APIC_MSR(APIC_LVR),        .always = false },
0107     { .index = X2APIC_MSR(APIC_TASKPRI),        .always = false },
0108     { .index = X2APIC_MSR(APIC_ARBPRI),     .always = false },
0109     { .index = X2APIC_MSR(APIC_PROCPRI),        .always = false },
0110     { .index = X2APIC_MSR(APIC_EOI),        .always = false },
0111     { .index = X2APIC_MSR(APIC_RRR),        .always = false },
0112     { .index = X2APIC_MSR(APIC_LDR),        .always = false },
0113     { .index = X2APIC_MSR(APIC_DFR),        .always = false },
0114     { .index = X2APIC_MSR(APIC_SPIV),       .always = false },
0115     { .index = X2APIC_MSR(APIC_ISR),        .always = false },
0116     { .index = X2APIC_MSR(APIC_TMR),        .always = false },
0117     { .index = X2APIC_MSR(APIC_IRR),        .always = false },
0118     { .index = X2APIC_MSR(APIC_ESR),        .always = false },
0119     { .index = X2APIC_MSR(APIC_ICR),        .always = false },
0120     { .index = X2APIC_MSR(APIC_ICR2),       .always = false },
0121 
0122     /*
0123      * Note:
0124      * AMD does not virtualize APIC TSC-deadline timer mode, but it is
0125      * emulated by KVM. When setting APIC LVTT (0x832) register bit 18,
0126      * the AVIC hardware would generate a #GP fault. Therefore, always
0127      * intercept MSR 0x832 and do not set up a direct_access_msr entry.
0128      */
0129     { .index = X2APIC_MSR(APIC_LVTTHMR),        .always = false },
0130     { .index = X2APIC_MSR(APIC_LVTPC),      .always = false },
0131     { .index = X2APIC_MSR(APIC_LVT0),       .always = false },
0132     { .index = X2APIC_MSR(APIC_LVT1),       .always = false },
0133     { .index = X2APIC_MSR(APIC_LVTERR),     .always = false },
0134     { .index = X2APIC_MSR(APIC_TMICT),      .always = false },
0135     { .index = X2APIC_MSR(APIC_TMCCT),      .always = false },
0136     { .index = X2APIC_MSR(APIC_TDCR),       .always = false },
0137     { .index = MSR_INVALID,             .always = false },
0138 };
0139 
0140 /*
0141  * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
0142  * pause_filter_count: On processors that support Pause filtering (indicated
0143  *  by CPUID Fn8000_000A_EDX), the VMCB provides a 16-bit pause filter
0144  *  count value. On VMRUN this value is loaded into an internal counter.
0145  *  Each time a pause instruction is executed, this counter is decremented
0146  *  until it reaches zero, at which time a #VMEXIT is generated if pause
0147  *  intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
0148  *  Intercept Filtering for more details.
0149  *  This also indicates whether PLE logic is enabled.
0150  *
0151  * pause_filter_thresh: In addition, some processor families support advanced
0152  *  pause filtering (indicated by CPUID Fn8000_000A_EDX), which imposes an
0153  *  upper bound on the time a guest is allowed to execute in a pause loop.
0154  *  In this mode, a 16-bit pause filter threshold field is added in the
0155  *  VMCB. The threshold value is a cycle count that is used to reset the
0156  *  pause counter. As with simple pause filtering, VMRUN loads the pause
0157  *  count value from VMCB into an internal counter. Then, on each pause
0158  *  instruction the hardware checks the elapsed number of cycles since
0159  *  the most recent pause instruction against the pause filter threshold.
0160  *  If the elapsed cycle count is greater than the pause filter threshold,
0161  *  then the internal pause count is reloaded from the VMCB and execution
0162  *  continues. If the elapsed cycle count is less than the pause filter
0163  *  threshold, then the internal pause count is decremented. If the count
0164  *  value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
0165  *  triggered. If advanced pause filtering is supported and pause filter
0166  *  threshold field is set to zero, the filter will operate in the simpler,
0167  *  count only mode.
0168  */
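/*
 * Illustration of the two modes described above (numbers are only
 * examples): with a filter count of 3000 and a threshold of 128 cycles,
 * a guest spinning on PAUSE back-to-back (well under 128 cycles between
 * PAUSEs) decrements the counter on every iteration and takes a #VMEXIT
 * after roughly 3000 PAUSEs, while a guest that pauses only sporadically
 * (more than 128 cycles apart) keeps having the counter reloaded and
 * never triggers the exit.  With a threshold of 0, only the simple
 * count-down behaviour applies.
 */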
0169 
0170 static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
0171 module_param(pause_filter_thresh, ushort, 0444);
0172 
0173 static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
0174 module_param(pause_filter_count, ushort, 0444);
0175 
0176 /* Default doubles per-vcpu window every exit. */
0177 static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
0178 module_param(pause_filter_count_grow, ushort, 0444);
0179 
0180 /* Default resets per-vcpu window every exit to pause_filter_count. */
0181 static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
0182 module_param(pause_filter_count_shrink, ushort, 0444);
0183 
0184 /* Default is to compute the maximum so we can never overflow. */
0185 static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
0186 module_param(pause_filter_count_max, ushort, 0444);
0187 
0188 /*
0189  * Use nested page tables by default.  Note, NPT may get forced off by
0190  * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
0191  */
0192 bool npt_enabled = true;
0193 module_param_named(npt, npt_enabled, bool, 0444);
0194 
0195 /* allow nested virtualization in KVM/SVM */
0196 static int nested = true;
0197 module_param(nested, int, S_IRUGO);
0198 
0199 /* enable/disable Next RIP Save */
0200 static int nrips = true;
0201 module_param(nrips, int, 0444);
0202 
0203 /* enable/disable Virtual VMLOAD VMSAVE */
0204 static int vls = true;
0205 module_param(vls, int, 0444);
0206 
0207 /* enable/disable Virtual GIF */
0208 int vgif = true;
0209 module_param(vgif, int, 0444);
0210 
0211 /* enable/disable LBR virtualization */
0212 static int lbrv = true;
0213 module_param(lbrv, int, 0444);
0214 
0215 static int tsc_scaling = true;
0216 module_param(tsc_scaling, int, 0444);
0217 
0218 /*
0219  * enable / disable AVIC.  Because the defaults differ for APICv
0220  * support between VMX and SVM we cannot use module_param_named.
0221  */
0222 static bool avic;
0223 module_param(avic, bool, 0444);
0224 
0225 bool __read_mostly dump_invalid_vmcb;
0226 module_param(dump_invalid_vmcb, bool, 0644);
0227 
0228 
0229 bool intercept_smi = true;
0230 module_param(intercept_smi, bool, 0444);
0231 
0232 
0233 static bool svm_gp_erratum_intercept = true;
0234 
0235 static u8 rsm_ins_bytes[] = "\x0f\xaa";
0236 
0237 static unsigned long iopm_base;
0238 
0239 struct kvm_ldttss_desc {
0240     u16 limit0;
0241     u16 base0;
0242     unsigned base1:8, type:5, dpl:2, p:1;
0243     unsigned limit1:4, zero0:3, g:1, base2:8;
0244     u32 base3;
0245     u32 zero1;
0246 } __attribute__((packed));
0247 
0248 DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
0249 
0250 /*
0251  * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
0252  * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
0253  *
0254  * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
0255  * defer the restoration of TSC_AUX until the CPU returns to userspace.
0256  */
0257 static int tsc_aux_uret_slot __read_mostly = -1;
0258 
0259 static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
0260 
0261 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
0262 #define MSRS_RANGE_SIZE 2048
0263 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
0264 
0265 u32 svm_msrpm_offset(u32 msr)
0266 {
0267     u32 offset;
0268     int i;
0269 
0270     for (i = 0; i < NUM_MSR_MAPS; i++) {
0271         if (msr < msrpm_ranges[i] ||
0272             msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
0273             continue;
0274 
0275         offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
0276         offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
0277 
0278         /* Now we have the u8 offset - but need the u32 offset */
0279         return offset / 4;
0280     }
0281 
0282     /* MSR not in any range */
0283     return MSR_INVALID;
0284 }
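/*
 * Worked example for svm_msrpm_offset(): MSR_STAR (0xc0000081) falls in
 * the second range (base 0xc0000000), so the byte offset into the MSR
 * permission map is 0x81 / 4 + 1 * MSRS_RANGE_SIZE = 0x20 + 0x800 = 0x820,
 * and the returned u32 offset is 0x820 / 4 = 0x208.
 */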
0285 
0286 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
0287 
0288 static int get_npt_level(void)
0289 {
0290 #ifdef CONFIG_X86_64
0291     return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
0292 #else
0293     return PT32E_ROOT_LEVEL;
0294 #endif
0295 }
0296 
0297 int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
0298 {
0299     struct vcpu_svm *svm = to_svm(vcpu);
0300     u64 old_efer = vcpu->arch.efer;
0301     vcpu->arch.efer = efer;
0302 
0303     if (!npt_enabled) {
0304         /* Shadow paging assumes NX to be available.  */
0305         efer |= EFER_NX;
0306 
0307         if (!(efer & EFER_LMA))
0308             efer &= ~EFER_LME;
0309     }
0310 
0311     if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
0312         if (!(efer & EFER_SVME)) {
0313             svm_leave_nested(vcpu);
0314             svm_set_gif(svm, true);
0315             /* #GP intercept is still needed for vmware backdoor */
0316             if (!enable_vmware_backdoor)
0317                 clr_exception_intercept(svm, GP_VECTOR);
0318 
0319             /*
0320              * Free the nested guest state, unless we are in SMM.
0321              * In this case we will return to the nested guest
0322              * as soon as we leave SMM.
0323              */
0324             if (!is_smm(vcpu))
0325                 svm_free_nested(svm);
0326 
0327         } else {
0328             int ret = svm_allocate_nested(svm);
0329 
0330             if (ret) {
0331                 vcpu->arch.efer = old_efer;
0332                 return ret;
0333             }
0334 
0335             /*
0336              * Never intercept #GP for SEV guests, KVM can't
0337              * decrypt guest memory to workaround the erratum.
0338              */
0339             if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
0340                 set_exception_intercept(svm, GP_VECTOR);
0341         }
0342     }
0343 
0344     svm->vmcb->save.efer = efer | EFER_SVME;
0345     vmcb_mark_dirty(svm->vmcb, VMCB_CR);
0346     return 0;
0347 }
0348 
0349 static int is_external_interrupt(u32 info)
0350 {
0351     info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
0352     return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
0353 }
0354 
0355 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
0356 {
0357     struct vcpu_svm *svm = to_svm(vcpu);
0358     u32 ret = 0;
0359 
0360     if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
0361         ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
0362     return ret;
0363 }
0364 
0365 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
0366 {
0367     struct vcpu_svm *svm = to_svm(vcpu);
0368 
0369     if (mask == 0)
0370         svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
0371     else
0372         svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
0373 
0374 }
0375 
0376 static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
0377                        bool commit_side_effects)
0378 {
0379     struct vcpu_svm *svm = to_svm(vcpu);
0380     unsigned long old_rflags;
0381 
0382     /*
0383      * SEV-ES does not expose the next RIP. The RIP update is controlled by
0384      * the type of exit and the #VC handler in the guest.
0385      */
0386     if (sev_es_guest(vcpu->kvm))
0387         goto done;
0388 
0389     if (nrips && svm->vmcb->control.next_rip != 0) {
0390         WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
0391         svm->next_rip = svm->vmcb->control.next_rip;
0392     }
0393 
0394     if (!svm->next_rip) {
0395         if (unlikely(!commit_side_effects))
0396             old_rflags = svm->vmcb->save.rflags;
0397 
0398         if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
0399             return 0;
0400 
0401         if (unlikely(!commit_side_effects))
0402             svm->vmcb->save.rflags = old_rflags;
0403     } else {
0404         kvm_rip_write(vcpu, svm->next_rip);
0405     }
0406 
0407 done:
0408     if (likely(commit_side_effects))
0409         svm_set_interrupt_shadow(vcpu, 0);
0410 
0411     return 1;
0412 }
0413 
0414 static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
0415 {
0416     return __svm_skip_emulated_instruction(vcpu, true);
0417 }
0418 
0419 static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
0420 {
0421     unsigned long rip, old_rip = kvm_rip_read(vcpu);
0422     struct vcpu_svm *svm = to_svm(vcpu);
0423 
0424     /*
0425      * Due to architectural shortcomings, the CPU doesn't always provide
0426      * NextRIP, e.g. if KVM intercepted an exception that occurred while
0427      * the CPU was vectoring an INTO/INT3 in the guest.  Temporarily skip
0428      * the instruction even if NextRIP is supported to acquire the next
0429      * RIP so that it can be shoved into the NextRIP field, otherwise
0430      * hardware will fail to advance guest RIP during event injection.
0431      * Drop the exception/interrupt if emulation fails and effectively
0432      * retry the instruction, it's the least awful option.  If NRIPS is
0433      * in use, the skip must not commit any side effects such as clearing
0434      * the interrupt shadow or RFLAGS.RF.
0435      */
0436     if (!__svm_skip_emulated_instruction(vcpu, !nrips))
0437         return -EIO;
0438 
0439     rip = kvm_rip_read(vcpu);
0440 
0441     /*
0442      * Save the injection information, even when using next_rip, as the
0443      * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
0444      * doesn't complete due to a VM-Exit occurring while the CPU is
0445      * vectoring the event.   Decoding the instruction isn't guaranteed to
0446      * work as there may be no backing instruction, e.g. if the event is
0447      * being injected by L1 for L2, or if the guest is patching INT3 into
0448      * a different instruction.
0449      */
0450     svm->soft_int_injected = true;
0451     svm->soft_int_csbase = svm->vmcb->save.cs.base;
0452     svm->soft_int_old_rip = old_rip;
0453     svm->soft_int_next_rip = rip;
0454 
0455     if (nrips)
0456         kvm_rip_write(vcpu, old_rip);
0457 
0458     if (static_cpu_has(X86_FEATURE_NRIPS))
0459         svm->vmcb->control.next_rip = rip;
0460 
0461     return 0;
0462 }
0463 
0464 static void svm_queue_exception(struct kvm_vcpu *vcpu)
0465 {
0466     struct vcpu_svm *svm = to_svm(vcpu);
0467     unsigned nr = vcpu->arch.exception.nr;
0468     bool has_error_code = vcpu->arch.exception.has_error_code;
0469     u32 error_code = vcpu->arch.exception.error_code;
0470 
0471     kvm_deliver_exception_payload(vcpu);
0472 
0473     if (kvm_exception_is_soft(nr) &&
0474         svm_update_soft_interrupt_rip(vcpu))
0475         return;
0476 
0477     svm->vmcb->control.event_inj = nr
0478         | SVM_EVTINJ_VALID
0479         | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
0480         | SVM_EVTINJ_TYPE_EXEPT;
0481     svm->vmcb->control.event_inj_err = error_code;
0482 }
0483 
0484 static void svm_init_erratum_383(void)
0485 {
0486     u32 low, high;
0487     int err;
0488     u64 val;
0489 
0490     if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
0491         return;
0492 
0493     /* Use _safe variants to not break nested virtualization */
0494     val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
0495     if (err)
0496         return;
0497 
0498     val |= (1ULL << 47);
0499 
0500     low  = lower_32_bits(val);
0501     high = upper_32_bits(val);
0502 
0503     native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
0504 
0505     erratum_383_found = true;
0506 }
0507 
0508 static void svm_init_osvw(struct kvm_vcpu *vcpu)
0509 {
0510     /*
0511      * Guests should see errata 400 and 415 as fixed (assuming that
0512      * HLT and IO instructions are intercepted).
0513      */
0514     vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
0515     vcpu->arch.osvw.status = osvw_status & ~(6ULL);
0516 
0517     /*
0518      * By increasing VCPU's osvw.length to 3 we are telling the guest that
0519      * all osvw.status bits inside that length, including bit 0 (which is
0520      * reserved for erratum 298), are valid. However, if host processor's
0521      * osvw_len is 0 then osvw_status[0] carries no information. We need to
0522      * be conservative here and therefore we tell the guest that erratum 298
0523      * is present (because we really don't know).
0524      */
0525     if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
0526         vcpu->arch.osvw.status |= 1;
0527 }
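/*
 * Tying the two assignments above together: osvw.length >= 3 tells the
 * guest that status bits 0-2 are valid; bit 0 covers erratum 298 (see the
 * comment above) and bits 1-2 cover errata 400 and 415, so masking the
 * status with ~6ULL is what reports those two errata as fixed.
 */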
0528 
0529 static int has_svm(void)
0530 {
0531     const char *msg;
0532 
0533     if (!cpu_has_svm(&msg)) {
0534         printk(KERN_INFO "has_svm: %s\n", msg);
0535         return 0;
0536     }
0537 
0538     if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
0539         pr_info("KVM is unsupported when running as an SEV guest\n");
0540         return 0;
0541     }
0542 
0543     return 1;
0544 }
0545 
0546 void __svm_write_tsc_multiplier(u64 multiplier)
0547 {
0548     preempt_disable();
0549 
0550     if (multiplier == __this_cpu_read(current_tsc_ratio))
0551         goto out;
0552 
0553     wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
0554     __this_cpu_write(current_tsc_ratio, multiplier);
0555 out:
0556     preempt_enable();
0557 }
0558 
0559 static void svm_hardware_disable(void)
0560 {
0561     /* Make sure we clean up behind us */
0562     if (tsc_scaling)
0563         __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
0564 
0565     cpu_svm_disable();
0566 
0567     amd_pmu_disable_virt();
0568 }
0569 
0570 static int svm_hardware_enable(void)
0571 {
0572 
0573     struct svm_cpu_data *sd;
0574     uint64_t efer;
0575     struct desc_struct *gdt;
0576     int me = raw_smp_processor_id();
0577 
0578     rdmsrl(MSR_EFER, efer);
0579     if (efer & EFER_SVME)
0580         return -EBUSY;
0581 
0582     if (!has_svm()) {
0583         pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
0584         return -EINVAL;
0585     }
0586     sd = per_cpu(svm_data, me);
0587     if (!sd) {
0588         pr_err("%s: svm_data is NULL on %d\n", __func__, me);
0589         return -EINVAL;
0590     }
0591 
0592     sd->asid_generation = 1;
0593     sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
0594     sd->next_asid = sd->max_asid + 1;
0595     sd->min_asid = max_sev_asid + 1;
0596 
0597     gdt = get_current_gdt_rw();
0598     sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
0599 
0600     wrmsrl(MSR_EFER, efer | EFER_SVME);
0601 
0602     wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));
0603 
0604     if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
0605         /*
0606          * Set the default value, even if we don't use TSC scaling,
0607          * to avoid leaving a stale value in the MSR.
0608          */
0609         __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
0610     }
0611 
0612 
0613     /*
0614      * Get OSVW bits.
0615      *
0616      * Note that it is possible to have a system with mixed processor
0617      * revisions and therefore different OSVW bits. If bits are not the same
0618      * on different processors then choose the worst case (i.e. if erratum
0619      * is present on one processor and not on another then assume that the
0620      * erratum is present everywhere).
0621      */
0622     if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
0623         uint64_t len, status = 0;
0624         int err;
0625 
0626         len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
0627         if (!err)
0628             status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
0629                               &err);
0630 
0631         if (err)
0632             osvw_status = osvw_len = 0;
0633         else {
0634             if (len < osvw_len)
0635                 osvw_len = len;
0636             osvw_status |= status;
0637             osvw_status &= (1ULL << osvw_len) - 1;
0638         }
0639     } else
0640         osvw_status = osvw_len = 0;
0641 
0642     svm_init_erratum_383();
0643 
0644     amd_pmu_enable_virt();
0645 
0646     return 0;
0647 }
0648 
0649 static void svm_cpu_uninit(int cpu)
0650 {
0651     struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
0652 
0653     if (!sd)
0654         return;
0655 
0656     per_cpu(svm_data, cpu) = NULL;
0657     kfree(sd->sev_vmcbs);
0658     __free_page(sd->save_area);
0659     kfree(sd);
0660 }
0661 
0662 static int svm_cpu_init(int cpu)
0663 {
0664     struct svm_cpu_data *sd;
0665     int ret = -ENOMEM;
0666 
0667     sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
0668     if (!sd)
0669         return ret;
0670     sd->cpu = cpu;
0671     sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
0672     if (!sd->save_area)
0673         goto free_cpu_data;
0674 
0675     ret = sev_cpu_init(sd);
0676     if (ret)
0677         goto free_save_area;
0678 
0679     per_cpu(svm_data, cpu) = sd;
0680 
0681     return 0;
0682 
0683 free_save_area:
0684     __free_page(sd->save_area);
0685 free_cpu_data:
0686     kfree(sd);
0687     return ret;
0688 
0689 }
0690 
0691 static int direct_access_msr_slot(u32 msr)
0692 {
0693     u32 i;
0694 
0695     for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
0696         if (direct_access_msrs[i].index == msr)
0697             return i;
0698 
0699     return -ENOENT;
0700 }
0701 
0702 static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
0703                      int write)
0704 {
0705     struct vcpu_svm *svm = to_svm(vcpu);
0706     int slot = direct_access_msr_slot(msr);
0707 
0708     if (slot == -ENOENT)
0709         return;
0710 
0711     /* Set the shadow bitmaps to the desired intercept states */
0712     if (read)
0713         set_bit(slot, svm->shadow_msr_intercept.read);
0714     else
0715         clear_bit(slot, svm->shadow_msr_intercept.read);
0716 
0717     if (write)
0718         set_bit(slot, svm->shadow_msr_intercept.write);
0719     else
0720         clear_bit(slot, svm->shadow_msr_intercept.write);
0721 }
0722 
0723 static bool valid_msr_intercept(u32 index)
0724 {
0725     return direct_access_msr_slot(index) != -ENOENT;
0726 }
0727 
0728 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
0729 {
0730     u8 bit_write;
0731     unsigned long tmp;
0732     u32 offset;
0733     u32 *msrpm;
0734 
0735     msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
0736                       to_svm(vcpu)->msrpm;
0737 
0738     offset    = svm_msrpm_offset(msr);
0739     bit_write = 2 * (msr & 0x0f) + 1;
0740     tmp       = msrpm[offset];
0741 
0742     BUG_ON(offset == MSR_INVALID);
0743 
0744     return !!test_bit(bit_write,  &tmp);
0745 }
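/*
 * MSR permission map layout, as used by the bit arithmetic above: every
 * MSR takes two consecutive bits (even bit = read intercept, odd bit =
 * write intercept), so each u32 returned by svm_msrpm_offset() covers
 * 16 MSRs and the write bit for a given MSR sits at 2 * (msr & 0x0f) + 1
 * within that u32.
 */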
0746 
0747 static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
0748                     u32 msr, int read, int write)
0749 {
0750     struct vcpu_svm *svm = to_svm(vcpu);
0751     u8 bit_read, bit_write;
0752     unsigned long tmp;
0753     u32 offset;
0754 
0755     /*
0756      * If this warning triggers, extend the direct_access_msrs list at the
0757      * beginning of the file.
0758      */
0759     WARN_ON(!valid_msr_intercept(msr));
0760 
0761     /* MSRs disallowed by the MSR filter must still be intercepted */
0762     if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
0763         read = 0;
0764 
0765     if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
0766         write = 0;
0767 
0768     offset    = svm_msrpm_offset(msr);
0769     bit_read  = 2 * (msr & 0x0f);
0770     bit_write = 2 * (msr & 0x0f) + 1;
0771     tmp       = msrpm[offset];
0772 
0773     BUG_ON(offset == MSR_INVALID);
0774 
0775     read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
0776     write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
0777 
0778     msrpm[offset] = tmp;
0779 
0780     svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
0781     svm->nested.force_msr_bitmap_recalc = true;
0782 }
0783 
0784 void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
0785               int read, int write)
0786 {
0787     set_shadow_msr_intercept(vcpu, msr, read, write);
0788     set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
0789 }
0790 
0791 u32 *svm_vcpu_alloc_msrpm(void)
0792 {
0793     unsigned int order = get_order(MSRPM_SIZE);
0794     struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
0795     u32 *msrpm;
0796 
0797     if (!pages)
0798         return NULL;
0799 
0800     msrpm = page_address(pages);
0801     memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
0802 
0803     return msrpm;
0804 }
0805 
0806 void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
0807 {
0808     int i;
0809 
0810     for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
0811         if (!direct_access_msrs[i].always)
0812             continue;
0813         set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
0814     }
0815 }
0816 
0817 void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
0818 {
0819     int i;
0820 
0821     if (intercept == svm->x2avic_msrs_intercepted)
0822         return;
0823 
0824     if (avic_mode != AVIC_MODE_X2 ||
0825         !apic_x2apic_mode(svm->vcpu.arch.apic))
0826         return;
0827 
0828     for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
0829         int index = direct_access_msrs[i].index;
0830 
0831         if ((index < APIC_BASE_MSR) ||
0832             (index > APIC_BASE_MSR + 0xff))
0833             continue;
0834         set_msr_interception(&svm->vcpu, svm->msrpm, index,
0835                      !intercept, !intercept);
0836     }
0837 
0838     svm->x2avic_msrs_intercepted = intercept;
0839 }
0840 
0841 void svm_vcpu_free_msrpm(u32 *msrpm)
0842 {
0843     __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
0844 }
0845 
0846 static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
0847 {
0848     struct vcpu_svm *svm = to_svm(vcpu);
0849     u32 i;
0850 
0851     /*
0852      * Set intercept permissions for all direct access MSRs again. They
0853      * will automatically get filtered through the MSR filter, so we are
0854      * back in sync after this.
0855      */
0856     for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
0857         u32 msr = direct_access_msrs[i].index;
0858         u32 read = test_bit(i, svm->shadow_msr_intercept.read);
0859         u32 write = test_bit(i, svm->shadow_msr_intercept.write);
0860 
0861         set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
0862     }
0863 }
0864 
0865 static void add_msr_offset(u32 offset)
0866 {
0867     int i;
0868 
0869     for (i = 0; i < MSRPM_OFFSETS; ++i) {
0870 
0871         /* Offset already in list? */
0872         if (msrpm_offsets[i] == offset)
0873             return;
0874 
0875         /* Slot used by another offset? */
0876         if (msrpm_offsets[i] != MSR_INVALID)
0877             continue;
0878 
0879         /* Add offset to list */
0880         msrpm_offsets[i] = offset;
0881 
0882         return;
0883     }
0884 
0885     /*
0886      * If this BUG triggers the msrpm_offsets table has an overflow. Just
0887      * increase MSRPM_OFFSETS in this case.
0888      */
0889     BUG();
0890 }
0891 
0892 static void init_msrpm_offsets(void)
0893 {
0894     int i;
0895 
0896     memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
0897 
0898     for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
0899         u32 offset;
0900 
0901         offset = svm_msrpm_offset(direct_access_msrs[i].index);
0902         BUG_ON(offset == MSR_INVALID);
0903 
0904         add_msr_offset(offset);
0905     }
0906 }
0907 
0908 void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
0909 {
0910     to_vmcb->save.dbgctl        = from_vmcb->save.dbgctl;
0911     to_vmcb->save.br_from       = from_vmcb->save.br_from;
0912     to_vmcb->save.br_to     = from_vmcb->save.br_to;
0913     to_vmcb->save.last_excp_from    = from_vmcb->save.last_excp_from;
0914     to_vmcb->save.last_excp_to  = from_vmcb->save.last_excp_to;
0915 
0916     vmcb_mark_dirty(to_vmcb, VMCB_LBR);
0917 }
0918 
0919 static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
0920 {
0921     struct vcpu_svm *svm = to_svm(vcpu);
0922 
0923     svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
0924     set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
0925     set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
0926     set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
0927     set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
0928 
0929     /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
0930     if (is_guest_mode(vcpu))
0931         svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
0932 }
0933 
0934 static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
0935 {
0936     struct vcpu_svm *svm = to_svm(vcpu);
0937 
0938     svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
0939     set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
0940     set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
0941     set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
0942     set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
0943 
0944     /*
0945      * Move the LBR msrs back to the vmcb01 to avoid copying them
0946      * on nested guest entries.
0947      */
0948     if (is_guest_mode(vcpu))
0949         svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
0950 }
0951 
0952 static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
0953 {
0954     /*
0955      * If the LBR virtualization is disabled, the LBR msrs are always
0956      * kept in the vmcb01 to avoid copying them on nested guest entries.
0957      *
0958      * If nested, and the LBR virtualization is enabled/disabled, the msrs
0959      * are moved between the vmcb01 and vmcb02 as needed.
0960      */
0961     struct vmcb *vmcb =
0962         (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
0963             svm->vmcb : svm->vmcb01.ptr;
0964 
0965     switch (index) {
0966     case MSR_IA32_DEBUGCTLMSR:
0967         return vmcb->save.dbgctl;
0968     case MSR_IA32_LASTBRANCHFROMIP:
0969         return vmcb->save.br_from;
0970     case MSR_IA32_LASTBRANCHTOIP:
0971         return vmcb->save.br_to;
0972     case MSR_IA32_LASTINTFROMIP:
0973         return vmcb->save.last_excp_from;
0974     case MSR_IA32_LASTINTTOIP:
0975         return vmcb->save.last_excp_to;
0976     default:
0977         KVM_BUG(false, svm->vcpu.kvm,
0978             "%s: Unknown MSR 0x%x", __func__, index);
0979         return 0;
0980     }
0981 }
0982 
0983 void svm_update_lbrv(struct kvm_vcpu *vcpu)
0984 {
0985     struct vcpu_svm *svm = to_svm(vcpu);
0986 
0987     bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
0988                        DEBUGCTLMSR_LBR;
0989 
0990     bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
0991                       LBR_CTL_ENABLE_MASK);
0992 
0993     if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
0994         if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
0995             enable_lbrv = true;
0996 
0997     if (enable_lbrv == current_enable_lbrv)
0998         return;
0999 
1000     if (enable_lbrv)
1001         svm_enable_lbrv(vcpu);
1002     else
1003         svm_disable_lbrv(vcpu);
1004 }
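/*
 * Net effect of svm_update_lbrv(): while LBR virtualization is off, the
 * LBR MSRs stay in vmcb01 and remain intercepted; once DEBUGCTL.LBR (or,
 * for a nested guest with lbrv_enabled, the L1-provided LBR_CTL_ENABLE
 * bit) turns the feature on, the intercepts are cleared and, when running
 * a nested guest, the MSRs are copied over to vmcb02.
 */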
1005 
1006 void disable_nmi_singlestep(struct vcpu_svm *svm)
1007 {
1008     svm->nmi_singlestep = false;
1009 
1010     if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
1011         /* Clear our flags if they were not set by the guest */
1012         if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1013             svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
1014         if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1015             svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
1016     }
1017 }
1018 
1019 static void grow_ple_window(struct kvm_vcpu *vcpu)
1020 {
1021     struct vcpu_svm *svm = to_svm(vcpu);
1022     struct vmcb_control_area *control = &svm->vmcb->control;
1023     int old = control->pause_filter_count;
1024 
1025     if (kvm_pause_in_guest(vcpu->kvm))
1026         return;
1027 
1028     control->pause_filter_count = __grow_ple_window(old,
1029                             pause_filter_count,
1030                             pause_filter_count_grow,
1031                             pause_filter_count_max);
1032 
1033     if (control->pause_filter_count != old) {
1034         vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1035         trace_kvm_ple_window_update(vcpu->vcpu_id,
1036                         control->pause_filter_count, old);
1037     }
1038 }
1039 
1040 static void shrink_ple_window(struct kvm_vcpu *vcpu)
1041 {
1042     struct vcpu_svm *svm = to_svm(vcpu);
1043     struct vmcb_control_area *control = &svm->vmcb->control;
1044     int old = control->pause_filter_count;
1045 
1046     if (kvm_pause_in_guest(vcpu->kvm))
1047         return;
1048 
1049     control->pause_filter_count =
1050                 __shrink_ple_window(old,
1051                             pause_filter_count,
1052                             pause_filter_count_shrink,
1053                             pause_filter_count);
1054     if (control->pause_filter_count != old) {
1055         vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1056         trace_kvm_ple_window_update(vcpu->vcpu_id,
1057                         control->pause_filter_count, old);
1058     }
1059 }
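/*
 * Dynamics of the two helpers above, assuming the default module
 * parameters (count 3000, grow 2, shrink 0): each grow_ple_window() call
 * doubles the filter count, 3000 -> 6000 -> 12000 -> ..., capped at
 * pause_filter_count_max, while shrink_ple_window() with a shrink factor
 * of 0 resets the count straight back to pause_filter_count, matching the
 * "doubles"/"resets" notes on the module parameters near the top of the
 * file.
 */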
1060 
1061 static void svm_hardware_unsetup(void)
1062 {
1063     int cpu;
1064 
1065     sev_hardware_unsetup();
1066 
1067     for_each_possible_cpu(cpu)
1068         svm_cpu_uninit(cpu);
1069 
1070     __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
1071                  get_order(IOPM_SIZE));
1072     iopm_base = 0;
1073 }
1074 
1075 static void init_seg(struct vmcb_seg *seg)
1076 {
1077     seg->selector = 0;
1078     seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1079               SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1080     seg->limit = 0xffff;
1081     seg->base = 0;
1082 }
1083 
1084 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1085 {
1086     seg->selector = 0;
1087     seg->attrib = SVM_SELECTOR_P_MASK | type;
1088     seg->limit = 0xffff;
1089     seg->base = 0;
1090 }
1091 
1092 static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1093 {
1094     struct vcpu_svm *svm = to_svm(vcpu);
1095 
1096     return svm->nested.ctl.tsc_offset;
1097 }
1098 
1099 static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1100 {
1101     struct vcpu_svm *svm = to_svm(vcpu);
1102 
1103     return svm->tsc_ratio_msr;
1104 }
1105 
1106 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1107 {
1108     struct vcpu_svm *svm = to_svm(vcpu);
1109 
1110     svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
1111     svm->vmcb->control.tsc_offset = offset;
1112     vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1113 }
1114 
1115 static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1116 {
1117     __svm_write_tsc_multiplier(multiplier);
1118 }
1119 
1120 
1121 /* Evaluate instruction intercepts that depend on guest CPUID features. */
1122 static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
1123                           struct vcpu_svm *svm)
1124 {
1125     /*
1126      * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1127      * roots, or if INVPCID is disabled in the guest to inject #UD.
1128      */
1129     if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
1130         if (!npt_enabled ||
1131             !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
1132             svm_set_intercept(svm, INTERCEPT_INVPCID);
1133         else
1134             svm_clr_intercept(svm, INTERCEPT_INVPCID);
1135     }
1136 
1137     if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
1138         if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1139             svm_clr_intercept(svm, INTERCEPT_RDTSCP);
1140         else
1141             svm_set_intercept(svm, INTERCEPT_RDTSCP);
1142     }
1143 }
1144 
1145 static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
1146 {
1147     struct vcpu_svm *svm = to_svm(vcpu);
1148 
1149     if (guest_cpuid_is_intel(vcpu)) {
1150         /*
1151          * We must intercept SYSENTER_EIP and SYSENTER_ESP
1152          * accesses because the processor only stores 32 bits.
1153          * For the same reason we cannot use virtual VMLOAD/VMSAVE.
1154          */
1155         svm_set_intercept(svm, INTERCEPT_VMLOAD);
1156         svm_set_intercept(svm, INTERCEPT_VMSAVE);
1157         svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1158 
1159         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
1160         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
1161 
1162         svm->v_vmload_vmsave_enabled = false;
1163     } else {
1164         /*
1165          * If hardware supports Virtual VMLOAD VMSAVE then enable it
1166          * in VMCB and clear intercepts to avoid #VMEXIT.
1167          */
1168         if (vls) {
1169             svm_clr_intercept(svm, INTERCEPT_VMLOAD);
1170             svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1171             svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1172         }
1173         /* No need to intercept these MSRs */
1174         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
1175         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
1176     }
1177 }
1178 
1179 static void init_vmcb(struct kvm_vcpu *vcpu)
1180 {
1181     struct vcpu_svm *svm = to_svm(vcpu);
1182     struct vmcb *vmcb = svm->vmcb01.ptr;
1183     struct vmcb_control_area *control = &vmcb->control;
1184     struct vmcb_save_area *save = &vmcb->save;
1185 
1186     svm_set_intercept(svm, INTERCEPT_CR0_READ);
1187     svm_set_intercept(svm, INTERCEPT_CR3_READ);
1188     svm_set_intercept(svm, INTERCEPT_CR4_READ);
1189     svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1190     svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
1191     svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
1192     if (!kvm_vcpu_apicv_active(vcpu))
1193         svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
1194 
1195     set_dr_intercepts(svm);
1196 
1197     set_exception_intercept(svm, PF_VECTOR);
1198     set_exception_intercept(svm, UD_VECTOR);
1199     set_exception_intercept(svm, MC_VECTOR);
1200     set_exception_intercept(svm, AC_VECTOR);
1201     set_exception_intercept(svm, DB_VECTOR);
1202     /*
1203      * Guest access to VMware backdoor ports could legitimately
1204      * trigger #GP because of TSS I/O permission bitmap.
1205      * We intercept those #GP and allow access to them anyway
1206      * as VMware does.  Don't intercept #GP for SEV guests as KVM can't
1207      * decrypt guest memory to decode the faulting instruction.
1208      */
1209     if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
1210         set_exception_intercept(svm, GP_VECTOR);
1211 
1212     svm_set_intercept(svm, INTERCEPT_INTR);
1213     svm_set_intercept(svm, INTERCEPT_NMI);
1214 
1215     if (intercept_smi)
1216         svm_set_intercept(svm, INTERCEPT_SMI);
1217 
1218     svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1219     svm_set_intercept(svm, INTERCEPT_RDPMC);
1220     svm_set_intercept(svm, INTERCEPT_CPUID);
1221     svm_set_intercept(svm, INTERCEPT_INVD);
1222     svm_set_intercept(svm, INTERCEPT_INVLPG);
1223     svm_set_intercept(svm, INTERCEPT_INVLPGA);
1224     svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
1225     svm_set_intercept(svm, INTERCEPT_MSR_PROT);
1226     svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
1227     svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
1228     svm_set_intercept(svm, INTERCEPT_VMRUN);
1229     svm_set_intercept(svm, INTERCEPT_VMMCALL);
1230     svm_set_intercept(svm, INTERCEPT_VMLOAD);
1231     svm_set_intercept(svm, INTERCEPT_VMSAVE);
1232     svm_set_intercept(svm, INTERCEPT_STGI);
1233     svm_set_intercept(svm, INTERCEPT_CLGI);
1234     svm_set_intercept(svm, INTERCEPT_SKINIT);
1235     svm_set_intercept(svm, INTERCEPT_WBINVD);
1236     svm_set_intercept(svm, INTERCEPT_XSETBV);
1237     svm_set_intercept(svm, INTERCEPT_RDPRU);
1238     svm_set_intercept(svm, INTERCEPT_RSM);
1239 
1240     if (!kvm_mwait_in_guest(vcpu->kvm)) {
1241         svm_set_intercept(svm, INTERCEPT_MONITOR);
1242         svm_set_intercept(svm, INTERCEPT_MWAIT);
1243     }
1244 
1245     if (!kvm_hlt_in_guest(vcpu->kvm))
1246         svm_set_intercept(svm, INTERCEPT_HLT);
1247 
1248     control->iopm_base_pa = __sme_set(iopm_base);
1249     control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
1250     control->int_ctl = V_INTR_MASKING_MASK;
1251 
1252     init_seg(&save->es);
1253     init_seg(&save->ss);
1254     init_seg(&save->ds);
1255     init_seg(&save->fs);
1256     init_seg(&save->gs);
1257 
1258     save->cs.selector = 0xf000;
1259     save->cs.base = 0xffff0000;
1260     /* Executable/Readable Code Segment */
1261     save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1262         SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1263     save->cs.limit = 0xffff;
1264 
1265     save->gdtr.base = 0;
1266     save->gdtr.limit = 0xffff;
1267     save->idtr.base = 0;
1268     save->idtr.limit = 0xffff;
1269 
1270     init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1271     init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1272 
1273     if (npt_enabled) {
1274         /* Setup VMCB for Nested Paging */
1275         control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
1276         svm_clr_intercept(svm, INTERCEPT_INVLPG);
1277         clr_exception_intercept(svm, PF_VECTOR);
1278         svm_clr_intercept(svm, INTERCEPT_CR3_READ);
1279         svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
1280         save->g_pat = vcpu->arch.pat;
1281         save->cr3 = 0;
1282     }
1283     svm->current_vmcb->asid_generation = 0;
1284     svm->asid = 0;
1285 
1286     svm->nested.vmcb12_gpa = INVALID_GPA;
1287     svm->nested.last_vmcb12_gpa = INVALID_GPA;
1288 
1289     if (!kvm_pause_in_guest(vcpu->kvm)) {
1290         control->pause_filter_count = pause_filter_count;
1291         if (pause_filter_thresh)
1292             control->pause_filter_thresh = pause_filter_thresh;
1293         svm_set_intercept(svm, INTERCEPT_PAUSE);
1294     } else {
1295         svm_clr_intercept(svm, INTERCEPT_PAUSE);
1296     }
1297 
1298     svm_recalc_instruction_intercepts(vcpu, svm);
1299 
1300     /*
1301      * If the host supports V_SPEC_CTRL then disable the interception
1302      * of MSR_IA32_SPEC_CTRL.
1303      */
1304     if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
1305         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1306 
1307     if (kvm_vcpu_apicv_active(vcpu))
1308         avic_init_vmcb(svm, vmcb);
1309 
1310     if (vgif) {
1311         svm_clr_intercept(svm, INTERCEPT_STGI);
1312         svm_clr_intercept(svm, INTERCEPT_CLGI);
1313         svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1314     }
1315 
1316     if (sev_guest(vcpu->kvm))
1317         sev_init_vmcb(svm);
1318 
1319     svm_hv_init_vmcb(vmcb);
1320     init_vmcb_after_set_cpuid(vcpu);
1321 
1322     vmcb_mark_all_dirty(vmcb);
1323 
1324     enable_gif(svm);
1325 }
1326 
1327 static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
1328 {
1329     struct vcpu_svm *svm = to_svm(vcpu);
1330 
1331     svm_vcpu_init_msrpm(vcpu, svm->msrpm);
1332 
1333     svm_init_osvw(vcpu);
1334     vcpu->arch.microcode_version = 0x01000065;
1335     svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
1336 
1337     if (sev_es_guest(vcpu->kvm))
1338         sev_es_vcpu_reset(svm);
1339 }
1340 
1341 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1342 {
1343     struct vcpu_svm *svm = to_svm(vcpu);
1344 
1345     svm->spec_ctrl = 0;
1346     svm->virt_spec_ctrl = 0;
1347 
1348     init_vmcb(vcpu);
1349 
1350     if (!init_event)
1351         __svm_vcpu_reset(vcpu);
1352 }
1353 
1354 void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
1355 {
1356     svm->current_vmcb = target_vmcb;
1357     svm->vmcb = target_vmcb->ptr;
1358 }
1359 
1360 static int svm_vcpu_create(struct kvm_vcpu *vcpu)
1361 {
1362     struct vcpu_svm *svm;
1363     struct page *vmcb01_page;
1364     struct page *vmsa_page = NULL;
1365     int err;
1366 
1367     BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
1368     svm = to_svm(vcpu);
1369 
1370     err = -ENOMEM;
1371     vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1372     if (!vmcb01_page)
1373         goto out;
1374 
1375     if (sev_es_guest(vcpu->kvm)) {
1376         /*
1377          * SEV-ES guests require a separate VMSA page used to contain
1378          * the encrypted register state of the guest.
1379          */
1380         vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1381         if (!vmsa_page)
1382             goto error_free_vmcb_page;
1383 
1384         /*
1385          * SEV-ES guests maintain an encrypted version of their FPU
1386          * state which is restored and saved on VMRUN and VMEXIT.
1387          * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
1388          * do xsave/xrstor on it.
1389          */
1390         fpstate_set_confidential(&vcpu->arch.guest_fpu);
1391     }
1392 
1393     err = avic_init_vcpu(svm);
1394     if (err)
1395         goto error_free_vmsa_page;
1396 
1397     svm->msrpm = svm_vcpu_alloc_msrpm();
1398     if (!svm->msrpm) {
1399         err = -ENOMEM;
1400         goto error_free_vmsa_page;
1401     }
1402 
1403     svm->x2avic_msrs_intercepted = true;
1404 
1405     svm->vmcb01.ptr = page_address(vmcb01_page);
1406     svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
1407     svm_switch_vmcb(svm, &svm->vmcb01);
1408 
1409     if (vmsa_page)
1410         svm->sev_es.vmsa = page_address(vmsa_page);
1411 
1412     svm->guest_state_loaded = false;
1413 
1414     return 0;
1415 
1416 error_free_vmsa_page:
1417     if (vmsa_page)
1418         __free_page(vmsa_page);
1419 error_free_vmcb_page:
1420     __free_page(vmcb01_page);
1421 out:
1422     return err;
1423 }
1424 
1425 static void svm_clear_current_vmcb(struct vmcb *vmcb)
1426 {
1427     int i;
1428 
1429     for_each_online_cpu(i)
1430         cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
1431 }
1432 
1433 static void svm_vcpu_free(struct kvm_vcpu *vcpu)
1434 {
1435     struct vcpu_svm *svm = to_svm(vcpu);
1436 
1437     /*
1438      * The vmcb page can be recycled, causing a false negative in
1439      * svm_vcpu_load(). So, ensure that no logical CPU has this
1440      * vmcb page recorded as its current vmcb.
1441      */
1442     svm_clear_current_vmcb(svm->vmcb);
1443 
1444     svm_free_nested(svm);
1445 
1446     sev_free_vcpu(vcpu);
1447 
1448     __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
1449     __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
1450 }
1451 
1452 static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1453 {
1454     struct vcpu_svm *svm = to_svm(vcpu);
1455     struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
1456 
1457     if (sev_es_guest(vcpu->kvm))
1458         sev_es_unmap_ghcb(svm);
1459 
1460     if (svm->guest_state_loaded)
1461         return;
1462 
1463     /*
1464      * Save additional host state that will be restored on VMEXIT (sev-es)
1465      * or subsequent vmload of host save area.
1466      */
1467     vmsave(__sme_page_pa(sd->save_area));
1468     if (sev_es_guest(vcpu->kvm)) {
1469         struct sev_es_save_area *hostsa;
1470         hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
1471 
1472         sev_es_prepare_switch_to_guest(hostsa);
1473     }
1474 
1475     if (tsc_scaling)
1476         __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
1477 
1478     if (likely(tsc_aux_uret_slot >= 0))
1479         kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
1480 
1481     svm->guest_state_loaded = true;
1482 }
1483 
1484 static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
1485 {
1486     to_svm(vcpu)->guest_state_loaded = false;
1487 }
1488 
1489 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1490 {
1491     struct vcpu_svm *svm = to_svm(vcpu);
1492     struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
1493 
1494     if (sd->current_vmcb != svm->vmcb) {
1495         sd->current_vmcb = svm->vmcb;
1496         indirect_branch_prediction_barrier();
1497     }
1498     if (kvm_vcpu_apicv_active(vcpu))
1499         avic_vcpu_load(vcpu, cpu);
1500 }
1501 
1502 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1503 {
1504     if (kvm_vcpu_apicv_active(vcpu))
1505         avic_vcpu_put(vcpu);
1506 
1507     svm_prepare_host_switch(vcpu);
1508 
1509     ++vcpu->stat.host_state_reload;
1510 }
1511 
1512 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1513 {
1514     struct vcpu_svm *svm = to_svm(vcpu);
1515     unsigned long rflags = svm->vmcb->save.rflags;
1516 
1517     if (svm->nmi_singlestep) {
1518         /* Hide our flags if they were not set by the guest */
1519         if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1520             rflags &= ~X86_EFLAGS_TF;
1521         if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1522             rflags &= ~X86_EFLAGS_RF;
1523     }
1524     return rflags;
1525 }
1526 
1527 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1528 {
1529     if (to_svm(vcpu)->nmi_singlestep)
1530         rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1531 
1532        /*
1533         * Any change of EFLAGS.VM is accompanied by a reload of SS
1534         * (caused by either a task switch or an inter-privilege IRET),
1535         * so we do not need to update the CPL here.
1536         */
1537     to_svm(vcpu)->vmcb->save.rflags = rflags;
1538 }
1539 
1540 static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
1541 {
1542     struct vmcb *vmcb = to_svm(vcpu)->vmcb;
1543 
1544     return sev_es_guest(vcpu->kvm)
1545         ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
1546         : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
1547 }
1548 
1549 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1550 {
1551     kvm_register_mark_available(vcpu, reg);
1552 
1553     switch (reg) {
1554     case VCPU_EXREG_PDPTR:
1555         /*
1556          * When !npt_enabled, mmu->pdptrs[] is already available since
1557          * it is always updated per SDM when moving to CRs.
1558          */
1559         if (npt_enabled)
1560             load_pdptrs(vcpu, kvm_read_cr3(vcpu));
1561         break;
1562     default:
1563         KVM_BUG_ON(1, vcpu->kvm);
1564     }
1565 }
1566 
1567 static void svm_set_vintr(struct vcpu_svm *svm)
1568 {
1569     struct vmcb_control_area *control;
1570 
1571     /*
1572      * The following fields are ignored when AVIC is enabled
1573      */
1574     WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
1575 
1576     svm_set_intercept(svm, INTERCEPT_VINTR);
1577 
1578     /*
1579      * This is just a dummy VINTR to actually cause a vmexit to happen.
1580      * Actual injection of virtual interrupts happens through EVENTINJ.
1581      */
1582     control = &svm->vmcb->control;
1583     control->int_vector = 0x0;
1584     control->int_ctl &= ~V_INTR_PRIO_MASK;
1585     control->int_ctl |= V_IRQ_MASK |
1586         ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1587     vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1588 }
1589 
1590 static void svm_clear_vintr(struct vcpu_svm *svm)
1591 {
1592     svm_clr_intercept(svm, INTERCEPT_VINTR);
1593 
1594     /* Drop int_ctl fields related to VINTR injection.  */
1595     svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1596     if (is_guest_mode(&svm->vcpu)) {
1597         svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1598 
1599         WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
1600             (svm->nested.ctl.int_ctl & V_TPR_MASK));
1601 
1602         svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
1603             V_IRQ_INJECTION_BITS_MASK;
1604 
1605         svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
1606     }
1607 
1608     vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1609 }
1610 
1611 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1612 {
1613     struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1614     struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
1615 
1616     switch (seg) {
1617     case VCPU_SREG_CS: return &save->cs;
1618     case VCPU_SREG_DS: return &save->ds;
1619     case VCPU_SREG_ES: return &save->es;
1620     case VCPU_SREG_FS: return &save01->fs;
1621     case VCPU_SREG_GS: return &save01->gs;
1622     case VCPU_SREG_SS: return &save->ss;
1623     case VCPU_SREG_TR: return &save01->tr;
1624     case VCPU_SREG_LDTR: return &save01->ldtr;
1625     }
1626     BUG();
1627     return NULL;
1628 }
1629 
1630 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1631 {
1632     struct vmcb_seg *s = svm_seg(vcpu, seg);
1633 
1634     return s->base;
1635 }
1636 
1637 static void svm_get_segment(struct kvm_vcpu *vcpu,
1638                 struct kvm_segment *var, int seg)
1639 {
1640     struct vmcb_seg *s = svm_seg(vcpu, seg);
1641 
1642     var->base = s->base;
1643     var->limit = s->limit;
1644     var->selector = s->selector;
1645     var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1646     var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1647     var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1648     var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1649     var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1650     var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1651     var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1652 
1653     /*
1654      * AMD CPUs circa 2014 track the G bit for all segments except CS.
1655      * However, the SVM spec states that the G bit is not observed by the
1656      * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1657      * So let's synthesize a legal G bit for all segments; this helps
1658      * running KVM nested. It also helps cross-vendor migration, because
1659      * Intel's vmentry has a check on the 'G' bit.
1660      */
1661     var->g = s->limit > 0xfffff;
1662 
1663     /*
1664      * AMD's VMCB does not have an explicit unusable field, so emulate it
1665      * for cross-vendor migration purposes by deriving it from "not present".
1666      */
1667     var->unusable = !var->present;
1668 
1669     switch (seg) {
1670     case VCPU_SREG_TR:
1671         /*
1672          * Work around a bug where the busy flag in the tr selector
1673          * isn't exposed
1674          */
1675         var->type |= 0x2;
1676         break;
1677     case VCPU_SREG_DS:
1678     case VCPU_SREG_ES:
1679     case VCPU_SREG_FS:
1680     case VCPU_SREG_GS:
1681         /*
1682          * The accessed bit must always be set in the segment
1683          * descriptor cache; although it can be cleared in the
1684          * descriptor, the cached bit always remains at 1. Since
1685          * Intel has a check on this, set it here to support
1686          * cross-vendor migration.
1687          */
1688         if (!var->unusable)
1689             var->type |= 0x1;
1690         break;
1691     case VCPU_SREG_SS:
1692         /*
1693          * On AMD CPUs sometimes the DB bit in the segment
1694          * descriptor is left as 1, although the whole segment has
1695          * been made unusable. Clear it here to pass an Intel VMX
1696          * entry check when cross vendor migrating.
1697          */
1698         if (var->unusable)
1699             var->db = 0;
1700         /* This is symmetric with svm_set_segment() */
1701         var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1702         break;
1703     }
1704 }
1705 
1706 static int svm_get_cpl(struct kvm_vcpu *vcpu)
1707 {
1708     struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1709 
1710     return save->cpl;
1711 }
1712 
1713 static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1714 {
1715     struct kvm_segment cs;
1716 
1717     svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
1718     *db = cs.db;
1719     *l = cs.l;
1720 }
1721 
1722 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1723 {
1724     struct vcpu_svm *svm = to_svm(vcpu);
1725 
1726     dt->size = svm->vmcb->save.idtr.limit;
1727     dt->address = svm->vmcb->save.idtr.base;
1728 }
1729 
1730 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1731 {
1732     struct vcpu_svm *svm = to_svm(vcpu);
1733 
1734     svm->vmcb->save.idtr.limit = dt->size;
1735     svm->vmcb->save.idtr.base = dt->address;
1736     vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1737 }
1738 
1739 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1740 {
1741     struct vcpu_svm *svm = to_svm(vcpu);
1742 
1743     dt->size = svm->vmcb->save.gdtr.limit;
1744     dt->address = svm->vmcb->save.gdtr.base;
1745 }
1746 
1747 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1748 {
1749     struct vcpu_svm *svm = to_svm(vcpu);
1750 
1751     svm->vmcb->save.gdtr.limit = dt->size;
1752     svm->vmcb->save.gdtr.base = dt->address;
1753     vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1754 }
1755 
1756 static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1757 {
1758     struct vcpu_svm *svm = to_svm(vcpu);
1759 
1760     /*
1761      * For guests that don't set guest_state_protected, the cr3 update is
1762      * handled via kvm_mmu_load() while entering the guest. For guests
1763      * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
1764      * VMCB save area now, since the save area will become the initial
1765      * contents of the VMSA, and future VMCB save area updates won't be
1766      * seen.
1767      */
1768     if (sev_es_guest(vcpu->kvm)) {
1769         svm->vmcb->save.cr3 = cr3;
1770         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1771     }
1772 }
1773 
1774 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1775 {
1776     struct vcpu_svm *svm = to_svm(vcpu);
1777     u64 hcr0 = cr0;
1778     bool old_paging = is_paging(vcpu);
1779 
1780 #ifdef CONFIG_X86_64
1781     if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
1782         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1783             vcpu->arch.efer |= EFER_LMA;
1784             svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1785         }
1786 
1787         if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1788             vcpu->arch.efer &= ~EFER_LMA;
1789             svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1790         }
1791     }
1792 #endif
1793     vcpu->arch.cr0 = cr0;
1794 
1795     if (!npt_enabled) {
1796         hcr0 |= X86_CR0_PG | X86_CR0_WP;
1797         if (old_paging != is_paging(vcpu))
1798             svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
1799     }
1800 
1801     /*
1802      * Re-enable caching here because the QEMU BIOS
1803      * does not do it - this results in some delay at
1804      * reboot.
1805      */
1806     if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
1807         hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1808 
1809     svm->vmcb->save.cr0 = hcr0;
1810     vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1811 
1812     /*
1813      * SEV-ES guests must always keep the CR intercepts cleared. CR
1814      * tracking is done using the CR write traps.
1815      */
1816     if (sev_es_guest(vcpu->kvm))
1817         return;
1818 
1819     if (hcr0 == cr0) {
1820         /* Selective CR0 write remains on.  */
1821         svm_clr_intercept(svm, INTERCEPT_CR0_READ);
1822         svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
1823     } else {
1824         svm_set_intercept(svm, INTERCEPT_CR0_READ);
1825         svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1826     }
1827 }
1828 
1829 static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1830 {
1831     return true;
1832 }
1833 
1834 void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1835 {
1836     unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1837     unsigned long old_cr4 = vcpu->arch.cr4;
1838 
1839     if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1840         svm_flush_tlb_current(vcpu);
1841 
1842     vcpu->arch.cr4 = cr4;
1843     if (!npt_enabled) {
1844         cr4 |= X86_CR4_PAE;
1845 
1846         if (!is_paging(vcpu))
1847             cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
1848     }
1849     cr4 |= host_cr4_mce;
1850     to_svm(vcpu)->vmcb->save.cr4 = cr4;
1851     vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1852 
1853     if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1854         kvm_update_cpuid_runtime(vcpu);
1855 }
1856 
1857 static void svm_set_segment(struct kvm_vcpu *vcpu,
1858                 struct kvm_segment *var, int seg)
1859 {
1860     struct vcpu_svm *svm = to_svm(vcpu);
1861     struct vmcb_seg *s = svm_seg(vcpu, seg);
1862 
1863     s->base = var->base;
1864     s->limit = var->limit;
1865     s->selector = var->selector;
1866     s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1867     s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1868     s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1869     s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
1870     s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1871     s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1872     s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1873     s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1874 
1875     /*
1876      * This is always accurate, except if SYSRET returned to a segment
1877      * with SS.DPL != 3.  Intel does not have this quirk, and always
1878      * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1879      * would entail passing the CPL to userspace and back.
1880      */
1881     if (seg == VCPU_SREG_SS)
1882         /* This is symmetric with svm_get_segment() */
1883         svm->vmcb->save.cpl = (var->dpl & 3);
1884 
1885     vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
1886 }
1887 
1888 static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
1889 {
1890     struct vcpu_svm *svm = to_svm(vcpu);
1891 
1892     clr_exception_intercept(svm, BP_VECTOR);
1893 
1894     if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1895         if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1896             set_exception_intercept(svm, BP_VECTOR);
1897     }
1898 }
1899 
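     /*
      * Allocate a fresh ASID for this vCPU on the current CPU.  When the
      * pool runs out, bump the generation, restart at min_asid and request
      * a full ASID flush from the hardware on the next VMRUN.
      */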
1900 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1901 {
1902     if (sd->next_asid > sd->max_asid) {
1903         ++sd->asid_generation;
1904         sd->next_asid = sd->min_asid;
1905         svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1906         vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
1907     }
1908 
1909     svm->current_vmcb->asid_generation = sd->asid_generation;
1910     svm->asid = sd->next_asid++;
1911 }
1912 
1913 static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
1914 {
1915     struct vmcb *vmcb = svm->vmcb;
1916 
1917     if (svm->vcpu.arch.guest_state_protected)
1918         return;
1919 
1920     if (unlikely(value != vmcb->save.dr6)) {
1921         vmcb->save.dr6 = value;
1922         vmcb_mark_dirty(vmcb, VMCB_DR);
1923     }
1924 }
1925 
1926 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1927 {
1928     struct vcpu_svm *svm = to_svm(vcpu);
1929 
1930     if (vcpu->arch.guest_state_protected)
1931         return;
1932 
1933     get_debugreg(vcpu->arch.db[0], 0);
1934     get_debugreg(vcpu->arch.db[1], 1);
1935     get_debugreg(vcpu->arch.db[2], 2);
1936     get_debugreg(vcpu->arch.db[3], 3);
1937     /*
1938      * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
1939      * because db_interception might need it.  We can do it before vmentry.
1940      */
1941     vcpu->arch.dr6 = svm->vmcb->save.dr6;
1942     vcpu->arch.dr7 = svm->vmcb->save.dr7;
1943     vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
1944     set_dr_intercepts(svm);
1945 }
1946 
1947 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1948 {
1949     struct vcpu_svm *svm = to_svm(vcpu);
1950 
1951     if (vcpu->arch.guest_state_protected)
1952         return;
1953 
1954     svm->vmcb->save.dr7 = value;
1955     vmcb_mark_dirty(svm->vmcb, VMCB_DR);
1956 }
1957 
1958 static int pf_interception(struct kvm_vcpu *vcpu)
1959 {
1960     struct vcpu_svm *svm = to_svm(vcpu);
1961 
1962     u64 fault_address = svm->vmcb->control.exit_info_2;
1963     u64 error_code = svm->vmcb->control.exit_info_1;
1964 
1965     return kvm_handle_page_fault(vcpu, error_code, fault_address,
1966             static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1967             svm->vmcb->control.insn_bytes : NULL,
1968             svm->vmcb->control.insn_len);
1969 }
1970 
1971 static int npf_interception(struct kvm_vcpu *vcpu)
1972 {
1973     struct vcpu_svm *svm = to_svm(vcpu);
1974 
1975     u64 fault_address = svm->vmcb->control.exit_info_2;
1976     u64 error_code = svm->vmcb->control.exit_info_1;
1977 
1978     trace_kvm_page_fault(fault_address, error_code);
1979     return kvm_mmu_page_fault(vcpu, fault_address, error_code,
1980             static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1981             svm->vmcb->control.insn_bytes : NULL,
1982             svm->vmcb->control.insn_len);
1983 }
1984 
1985 static int db_interception(struct kvm_vcpu *vcpu)
1986 {
1987     struct kvm_run *kvm_run = vcpu->run;
1988     struct vcpu_svm *svm = to_svm(vcpu);
1989 
1990     if (!(vcpu->guest_debug &
1991           (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1992         !svm->nmi_singlestep) {
1993         u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
1994         kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
1995         return 1;
1996     }
1997 
1998     if (svm->nmi_singlestep) {
1999         disable_nmi_singlestep(svm);
2000         /* Make sure we check for pending NMIs upon entry */
2001         kvm_make_request(KVM_REQ_EVENT, vcpu);
2002     }
2003 
2004     if (vcpu->guest_debug &
2005         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
2006         kvm_run->exit_reason = KVM_EXIT_DEBUG;
2007         kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
2008         kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
2009         kvm_run->debug.arch.pc =
2010             svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2011         kvm_run->debug.arch.exception = DB_VECTOR;
2012         return 0;
2013     }
2014 
2015     return 1;
2016 }
2017 
2018 static int bp_interception(struct kvm_vcpu *vcpu)
2019 {
2020     struct vcpu_svm *svm = to_svm(vcpu);
2021     struct kvm_run *kvm_run = vcpu->run;
2022 
2023     kvm_run->exit_reason = KVM_EXIT_DEBUG;
2024     kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2025     kvm_run->debug.arch.exception = BP_VECTOR;
2026     return 0;
2027 }
2028 
2029 static int ud_interception(struct kvm_vcpu *vcpu)
2030 {
2031     return handle_ud(vcpu);
2032 }
2033 
2034 static int ac_interception(struct kvm_vcpu *vcpu)
2035 {
2036     kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
2037     return 1;
2038 }
2039 
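     /*
      * Check whether a just-intercepted #MC matches the machine-check
      * signature of AMD erratum 383 (MC0_STATUS == 0xb600000000010015,
      * ignoring bit 62).  If so, clear the MCi_STATUS banks and MCG_STATUS
      * and flush the TLB so the host can survive; the guest is killed by
      * the caller.
      */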
2040 static bool is_erratum_383(void)
2041 {
2042     int err, i;
2043     u64 value;
2044 
2045     if (!erratum_383_found)
2046         return false;
2047 
2048     value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2049     if (err)
2050         return false;
2051 
2052     /* Bit 62 may or may not be set for this mce */
2053     value &= ~(1ULL << 62);
2054 
2055     if (value != 0xb600000000010015ULL)
2056         return false;
2057 
2058     /* Clear MCi_STATUS registers */
2059     for (i = 0; i < 6; ++i)
2060         native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2061 
2062     value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2063     if (!err) {
2064         u32 low, high;
2065 
2066         value &= ~(1ULL << 2);
2067         low    = lower_32_bits(value);
2068         high   = upper_32_bits(value);
2069 
2070         native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2071     }
2072 
2073     /* Flush tlb to evict multi-match entries */
2074     __flush_tlb_all();
2075 
2076     return true;
2077 }
2078 
2079 static void svm_handle_mce(struct kvm_vcpu *vcpu)
2080 {
2081     if (is_erratum_383()) {
2082         /*
2083          * Erratum 383 triggered. Guest state is corrupt so kill the
2084          * guest.
2085          */
2086         pr_err("KVM: Guest triggered AMD Erratum 383\n");
2087 
2088         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2089 
2090         return;
2091     }
2092 
2093     /*
2094      * On an #MC intercept the MCE handler is not called automatically in
2095      * the host. So do it by hand here.
2096      */
2097     kvm_machine_check();
2098 }
2099 
2100 static int mc_interception(struct kvm_vcpu *vcpu)
2101 {
2102     return 1;
2103 }
2104 
2105 static int shutdown_interception(struct kvm_vcpu *vcpu)
2106 {
2107     struct kvm_run *kvm_run = vcpu->run;
2108     struct vcpu_svm *svm = to_svm(vcpu);
2109 
2110     /*
2111      * The VM save area has already been encrypted so it
2112      * cannot be reinitialized - just terminate.
2113      */
2114     if (sev_es_guest(vcpu->kvm))
2115         return -EINVAL;
2116 
2117     /*
2118      * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
2119      * the VMCB in a known good state.  Unfortunately, KVM doesn't have
2120      * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
2121      * userspace.  From a platform view, INIT is acceptable behavior as
2122      * there exist bare metal platforms that automatically INIT the CPU
2123      * in response to shutdown.
2124      */
2125     clear_page(svm->vmcb);
2126     kvm_vcpu_reset(vcpu, true);
2127 
2128     kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2129     return 0;
2130 }
2131 
2132 static int io_interception(struct kvm_vcpu *vcpu)
2133 {
2134     struct vcpu_svm *svm = to_svm(vcpu);
2135     u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2136     int size, in, string;
2137     unsigned port;
2138 
2139     ++vcpu->stat.io_exits;
2140     string = (io_info & SVM_IOIO_STR_MASK) != 0;
2141     in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2142     port = io_info >> 16;
2143     size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2144 
2145     if (string) {
2146         if (sev_es_guest(vcpu->kvm))
2147             return sev_es_string_io(svm, size, port, in);
2148         else
2149             return kvm_emulate_instruction(vcpu, 0);
2150     }
2151 
2152     svm->next_rip = svm->vmcb->control.exit_info_2;
2153 
2154     return kvm_fast_pio(vcpu, size, port, in);
2155 }
2156 
2157 static int nmi_interception(struct kvm_vcpu *vcpu)
2158 {
2159     return 1;
2160 }
2161 
2162 static int smi_interception(struct kvm_vcpu *vcpu)
2163 {
2164     return 1;
2165 }
2166 
2167 static int intr_interception(struct kvm_vcpu *vcpu)
2168 {
2169     ++vcpu->stat.irq_exits;
2170     return 1;
2171 }
2172 
2173 static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
2174 {
2175     struct vcpu_svm *svm = to_svm(vcpu);
2176     struct vmcb *vmcb12;
2177     struct kvm_host_map map;
2178     int ret;
2179 
2180     if (nested_svm_check_permissions(vcpu))
2181         return 1;
2182 
2183     ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
2184     if (ret) {
2185         if (ret == -EINVAL)
2186             kvm_inject_gp(vcpu, 0);
2187         return 1;
2188     }
2189 
2190     vmcb12 = map.hva;
2191 
2192     ret = kvm_skip_emulated_instruction(vcpu);
2193 
2194     if (vmload) {
2195         svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
2196         svm->sysenter_eip_hi = 0;
2197         svm->sysenter_esp_hi = 0;
2198     } else {
2199         svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
2200     }
2201 
2202     kvm_vcpu_unmap(vcpu, &map, true);
2203 
2204     return ret;
2205 }
2206 
2207 static int vmload_interception(struct kvm_vcpu *vcpu)
2208 {
2209     return vmload_vmsave_interception(vcpu, true);
2210 }
2211 
2212 static int vmsave_interception(struct kvm_vcpu *vcpu)
2213 {
2214     return vmload_vmsave_interception(vcpu, false);
2215 }
2216 
2217 static int vmrun_interception(struct kvm_vcpu *vcpu)
2218 {
2219     if (nested_svm_check_permissions(vcpu))
2220         return 1;
2221 
2222     return nested_svm_vmrun(vcpu);
2223 }
2224 
2225 enum {
2226     NONE_SVM_INSTR,
2227     SVM_INSTR_VMRUN,
2228     SVM_INSTR_VMLOAD,
2229     SVM_INSTR_VMSAVE,
2230 };
2231 
2232 /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
2233 static int svm_instr_opcode(struct kvm_vcpu *vcpu)
2234 {
2235     struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
2236 
2237     if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
2238         return NONE_SVM_INSTR;
2239 
2240     switch (ctxt->modrm) {
2241     case 0xd8: /* VMRUN */
2242         return SVM_INSTR_VMRUN;
2243     case 0xda: /* VMLOAD */
2244         return SVM_INSTR_VMLOAD;
2245     case 0xdb: /* VMSAVE */
2246         return SVM_INSTR_VMSAVE;
2247     default:
2248         break;
2249     }
2250 
2251     return NONE_SVM_INSTR;
2252 }
2253 
2254 static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
2255 {
2256     const int guest_mode_exit_codes[] = {
2257         [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
2258         [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
2259         [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
2260     };
2261     int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
2262         [SVM_INSTR_VMRUN] = vmrun_interception,
2263         [SVM_INSTR_VMLOAD] = vmload_interception,
2264         [SVM_INSTR_VMSAVE] = vmsave_interception,
2265     };
2266     struct vcpu_svm *svm = to_svm(vcpu);
2267     int ret;
2268 
2269     if (is_guest_mode(vcpu)) {
2270         /* Returns '1' or -errno on failure, '0' on success. */
2271         ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
2272         if (ret)
2273             return ret;
2274         return 1;
2275     }
2276     return svm_instr_handlers[opcode](vcpu);
2277 }
2278 
2279 /*
2280  * #GP handling code. Note that #GP can be triggered under the following two
2281  * cases:
2282  *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
2283  *      some AMD CPUs when EAX of these instructions is in a reserved memory
2284  *      region (e.g. SMM memory on the host).
2285  *   2) VMware backdoor
2286  */
2287 static int gp_interception(struct kvm_vcpu *vcpu)
2288 {
2289     struct vcpu_svm *svm = to_svm(vcpu);
2290     u32 error_code = svm->vmcb->control.exit_info_1;
2291     int opcode;
2292 
2293     /* Both #GP cases have zero error_code */
2294     if (error_code)
2295         goto reinject;
2296 
2297     /* Decode the instruction for usage later */
2298     if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
2299         goto reinject;
2300 
2301     opcode = svm_instr_opcode(vcpu);
2302 
2303     if (opcode == NONE_SVM_INSTR) {
2304         if (!enable_vmware_backdoor)
2305             goto reinject;
2306 
2307         /*
2308          * VMware backdoor emulation on #GP interception only handles
2309          * IN{S}, OUT{S}, and RDPMC.
2310          */
2311         if (!is_guest_mode(vcpu))
2312             return kvm_emulate_instruction(vcpu,
2313                 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
2314     } else {
2315         /* All SVM instructions expect a page-aligned RAX */
2316         if (svm->vmcb->save.rax & ~PAGE_MASK)
2317             goto reinject;
2318 
2319         return emulate_svm_instr(vcpu, opcode);
2320     }
2321 
2322 reinject:
2323     kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2324     return 1;
2325 }
2326 
2327 void svm_set_gif(struct vcpu_svm *svm, bool value)
2328 {
2329     if (value) {
2330         /*
2331          * If VGIF is enabled, the STGI intercept is only added to
2332          * detect the opening of the SMI/NMI window; remove it now.
2333          * Likewise, clear the VINTR intercept, we will set it
2334          * again while processing KVM_REQ_EVENT if needed.
2335          */
2336         if (vgif)
2337             svm_clr_intercept(svm, INTERCEPT_STGI);
2338         if (svm_is_intercept(svm, INTERCEPT_VINTR))
2339             svm_clear_vintr(svm);
2340 
2341         enable_gif(svm);
2342         if (svm->vcpu.arch.smi_pending ||
2343             svm->vcpu.arch.nmi_pending ||
2344             kvm_cpu_has_injectable_intr(&svm->vcpu))
2345             kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2346     } else {
2347         disable_gif(svm);
2348 
2349         /*
2350          * After a CLGI no interrupts should come.  But if vGIF is
2351          * in use, we still rely on the VINTR intercept (rather than
2352          * STGI) to detect an open interrupt window.
2353          */
2354         if (!vgif)
2355             svm_clear_vintr(svm);
2356     }
2357 }
2358 
2359 static int stgi_interception(struct kvm_vcpu *vcpu)
2360 {
2361     int ret;
2362 
2363     if (nested_svm_check_permissions(vcpu))
2364         return 1;
2365 
2366     ret = kvm_skip_emulated_instruction(vcpu);
2367     svm_set_gif(to_svm(vcpu), true);
2368     return ret;
2369 }
2370 
2371 static int clgi_interception(struct kvm_vcpu *vcpu)
2372 {
2373     int ret;
2374 
2375     if (nested_svm_check_permissions(vcpu))
2376         return 1;
2377 
2378     ret = kvm_skip_emulated_instruction(vcpu);
2379     svm_set_gif(to_svm(vcpu), false);
2380     return ret;
2381 }
2382 
2383 static int invlpga_interception(struct kvm_vcpu *vcpu)
2384 {
2385     gva_t gva = kvm_rax_read(vcpu);
2386     u32 asid = kvm_rcx_read(vcpu);
2387 
2388     /* FIXME: Handle an address size prefix. */
2389     if (!is_long_mode(vcpu))
2390         gva = (u32)gva;
2391 
2392     trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
2393 
2394     /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2395     kvm_mmu_invlpg(vcpu, gva);
2396 
2397     return kvm_skip_emulated_instruction(vcpu);
2398 }
2399 
2400 static int skinit_interception(struct kvm_vcpu *vcpu)
2401 {
2402     trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
2403 
2404     kvm_queue_exception(vcpu, UD_VECTOR);
2405     return 1;
2406 }
2407 
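     /*
      * Task switch intercept: recover the new TSS selector and the switch
      * reason (IRET, JMP, task gate, or plain CALL) from exit_info_1/2,
      * drop any event that was being delivered through a task gate, skip
      * the triggering instruction when the switch came from an instruction
      * rather than a hardware event, and hand the actual switch to the
      * common x86 task-switch emulation.
      */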
2408 static int task_switch_interception(struct kvm_vcpu *vcpu)
2409 {
2410     struct vcpu_svm *svm = to_svm(vcpu);
2411     u16 tss_selector;
2412     int reason;
2413     int int_type = svm->vmcb->control.exit_int_info &
2414         SVM_EXITINTINFO_TYPE_MASK;
2415     int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2416     uint32_t type =
2417         svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2418     uint32_t idt_v =
2419         svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2420     bool has_error_code = false;
2421     u32 error_code = 0;
2422 
2423     tss_selector = (u16)svm->vmcb->control.exit_info_1;
2424 
2425     if (svm->vmcb->control.exit_info_2 &
2426         (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2427         reason = TASK_SWITCH_IRET;
2428     else if (svm->vmcb->control.exit_info_2 &
2429          (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2430         reason = TASK_SWITCH_JMP;
2431     else if (idt_v)
2432         reason = TASK_SWITCH_GATE;
2433     else
2434         reason = TASK_SWITCH_CALL;
2435 
2436     if (reason == TASK_SWITCH_GATE) {
2437         switch (type) {
2438         case SVM_EXITINTINFO_TYPE_NMI:
2439             vcpu->arch.nmi_injected = false;
2440             break;
2441         case SVM_EXITINTINFO_TYPE_EXEPT:
2442             if (svm->vmcb->control.exit_info_2 &
2443                 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2444                 has_error_code = true;
2445                 error_code =
2446                     (u32)svm->vmcb->control.exit_info_2;
2447             }
2448             kvm_clear_exception_queue(vcpu);
2449             break;
2450         case SVM_EXITINTINFO_TYPE_INTR:
2451         case SVM_EXITINTINFO_TYPE_SOFT:
2452             kvm_clear_interrupt_queue(vcpu);
2453             break;
2454         default:
2455             break;
2456         }
2457     }
2458 
2459     if (reason != TASK_SWITCH_GATE ||
2460         int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2461         (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2462          (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
2463         if (!svm_skip_emulated_instruction(vcpu))
2464             return 0;
2465     }
2466 
2467     if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2468         int_vec = -1;
2469 
2470     return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
2471                    has_error_code, error_code);
2472 }
2473 
2474 static int iret_interception(struct kvm_vcpu *vcpu)
2475 {
2476     struct vcpu_svm *svm = to_svm(vcpu);
2477 
2478     ++vcpu->stat.nmi_window_exits;
2479     vcpu->arch.hflags |= HF_IRET_MASK;
2480     if (!sev_es_guest(vcpu->kvm)) {
2481         svm_clr_intercept(svm, INTERCEPT_IRET);
2482         svm->nmi_iret_rip = kvm_rip_read(vcpu);
2483     }
2484     kvm_make_request(KVM_REQ_EVENT, vcpu);
2485     return 1;
2486 }
2487 
2488 static int invlpg_interception(struct kvm_vcpu *vcpu)
2489 {
2490     if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2491         return kvm_emulate_instruction(vcpu, 0);
2492 
2493     kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
2494     return kvm_skip_emulated_instruction(vcpu);
2495 }
2496 
2497 static int emulate_on_interception(struct kvm_vcpu *vcpu)
2498 {
2499     return kvm_emulate_instruction(vcpu, 0);
2500 }
2501 
2502 static int rsm_interception(struct kvm_vcpu *vcpu)
2503 {
2504     return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
2505 }
2506 
2507 static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
2508                         unsigned long val)
2509 {
2510     struct vcpu_svm *svm = to_svm(vcpu);
2511     unsigned long cr0 = vcpu->arch.cr0;
2512     bool ret = false;
2513 
2514     if (!is_guest_mode(vcpu) ||
2515         (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
2516         return false;
2517 
2518     cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2519     val &= ~SVM_CR0_SELECTIVE_MASK;
2520 
2521     if (cr0 ^ val) {
2522         svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2523         ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2524     }
2525 
2526     return ret;
2527 }
2528 
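     /*
      * With decode assists, bit 63 of exit_info_1 is set when the CR
      * intercept was caused by a MOV to/from CRn, in which case the GPR
      * operand encoded in SVM_EXITINFO_REG_MASK is valid.  For anything
      * else (e.g. LMSW or CLTS) fall back to instruction emulation.
      */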
2529 #define CR_VALID (1ULL << 63)
2530 
2531 static int cr_interception(struct kvm_vcpu *vcpu)
2532 {
2533     struct vcpu_svm *svm = to_svm(vcpu);
2534     int reg, cr;
2535     unsigned long val;
2536     int err;
2537 
2538     if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2539         return emulate_on_interception(vcpu);
2540 
2541     if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2542         return emulate_on_interception(vcpu);
2543 
2544     reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2545     if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2546         cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2547     else
2548         cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2549 
2550     err = 0;
2551     if (cr >= 16) { /* mov to cr */
2552         cr -= 16;
2553         val = kvm_register_read(vcpu, reg);
2554         trace_kvm_cr_write(cr, val);
2555         switch (cr) {
2556         case 0:
2557             if (!check_selective_cr0_intercepted(vcpu, val))
2558                 err = kvm_set_cr0(vcpu, val);
2559             else
2560                 return 1;
2561 
2562             break;
2563         case 3:
2564             err = kvm_set_cr3(vcpu, val);
2565             break;
2566         case 4:
2567             err = kvm_set_cr4(vcpu, val);
2568             break;
2569         case 8:
2570             err = kvm_set_cr8(vcpu, val);
2571             break;
2572         default:
2573             WARN(1, "unhandled write to CR%d", cr);
2574             kvm_queue_exception(vcpu, UD_VECTOR);
2575             return 1;
2576         }
2577     } else { /* mov from cr */
2578         switch (cr) {
2579         case 0:
2580             val = kvm_read_cr0(vcpu);
2581             break;
2582         case 2:
2583             val = vcpu->arch.cr2;
2584             break;
2585         case 3:
2586             val = kvm_read_cr3(vcpu);
2587             break;
2588         case 4:
2589             val = kvm_read_cr4(vcpu);
2590             break;
2591         case 8:
2592             val = kvm_get_cr8(vcpu);
2593             break;
2594         default:
2595             WARN(1, "unhandled read from CR%d", cr);
2596             kvm_queue_exception(vcpu, UD_VECTOR);
2597             return 1;
2598         }
2599         kvm_register_write(vcpu, reg, val);
2600         trace_kvm_cr_read(cr, val);
2601     }
2602     return kvm_complete_insn_gp(vcpu, err);
2603 }
2604 
2605 static int cr_trap(struct kvm_vcpu *vcpu)
2606 {
2607     struct vcpu_svm *svm = to_svm(vcpu);
2608     unsigned long old_value, new_value;
2609     unsigned int cr;
2610     int ret = 0;
2611 
2612     new_value = (unsigned long)svm->vmcb->control.exit_info_1;
2613 
2614     cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
2615     switch (cr) {
2616     case 0:
2617         old_value = kvm_read_cr0(vcpu);
2618         svm_set_cr0(vcpu, new_value);
2619 
2620         kvm_post_set_cr0(vcpu, old_value, new_value);
2621         break;
2622     case 4:
2623         old_value = kvm_read_cr4(vcpu);
2624         svm_set_cr4(vcpu, new_value);
2625 
2626         kvm_post_set_cr4(vcpu, old_value, new_value);
2627         break;
2628     case 8:
2629         ret = kvm_set_cr8(vcpu, new_value);
2630         break;
2631     default:
2632         WARN(1, "unhandled CR%d write trap", cr);
2633         kvm_queue_exception(vcpu, UD_VECTOR);
2634         return 1;
2635     }
2636 
2637     return kvm_complete_insn_gp(vcpu, ret);
2638 }
2639 
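     /*
      * DR exit codes are laid out as SVM_EXIT_READ_DR0..15 followed by
      * SVM_EXIT_WRITE_DR0..15, so a value of 16 or more after subtracting
      * SVM_EXIT_READ_DR0 indicates a MOV to DRn.
      */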
2640 static int dr_interception(struct kvm_vcpu *vcpu)
2641 {
2642     struct vcpu_svm *svm = to_svm(vcpu);
2643     int reg, dr;
2644     unsigned long val;
2645     int err = 0;
2646 
2647     if (vcpu->guest_debug == 0) {
2648         /*
2649          * No more DR vmexits; force a reload of the debug registers
2650          * and reenter on this instruction.  The next vmexit will
2651          * retrieve the full state of the debug registers.
2652          */
2653         clr_dr_intercepts(svm);
2654         vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
2655         return 1;
2656     }
2657 
2658     if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2659         return emulate_on_interception(vcpu);
2660 
2661     reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2662     dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2663     if (dr >= 16) { /* mov to DRn  */
2664         dr -= 16;
2665         val = kvm_register_read(vcpu, reg);
2666         err = kvm_set_dr(vcpu, dr, val);
2667     } else {
2668         kvm_get_dr(vcpu, dr, &val);
2669         kvm_register_write(vcpu, reg, val);
2670     }
2671 
2672     return kvm_complete_insn_gp(vcpu, err);
2673 }
2674 
2675 static int cr8_write_interception(struct kvm_vcpu *vcpu)
2676 {
2677     int r;
2678 
2679     u8 cr8_prev = kvm_get_cr8(vcpu);
2680     /* instruction emulation calls kvm_set_cr8() */
2681     r = cr_interception(vcpu);
2682     if (lapic_in_kernel(vcpu))
2683         return r;
2684     if (cr8_prev <= kvm_get_cr8(vcpu))
2685         return r;
2686     vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2687     return 0;
2688 }
2689 
2690 static int efer_trap(struct kvm_vcpu *vcpu)
2691 {
2692     struct msr_data msr_info;
2693     int ret;
2694 
2695     /*
2696      * Clear the EFER_SVME bit from EFER. The SVM code always sets this
2697      * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
2698      * whether the guest has X86_FEATURE_SVM - this avoids a failure if
2699      * the guest doesn't have X86_FEATURE_SVM.
2700      */
2701     msr_info.host_initiated = false;
2702     msr_info.index = MSR_EFER;
2703     msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
2704     ret = kvm_set_msr_common(vcpu, &msr_info);
2705 
2706     return kvm_complete_insn_gp(vcpu, ret);
2707 }
2708 
2709 static int svm_get_msr_feature(struct kvm_msr_entry *msr)
2710 {
2711     msr->data = 0;
2712 
2713     switch (msr->index) {
2714     case MSR_F10H_DECFG:
2715         if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
2716             msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
2717         break;
2718     case MSR_IA32_PERF_CAPABILITIES:
2719         return 0;
2720     default:
2721         return KVM_MSR_RET_INVALID;
2722     }
2723 
2724     return 0;
2725 }
2726 
2727 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2728 {
2729     struct vcpu_svm *svm = to_svm(vcpu);
2730 
2731     switch (msr_info->index) {
2732     case MSR_AMD64_TSC_RATIO:
2733         if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
2734             return 1;
2735         msr_info->data = svm->tsc_ratio_msr;
2736         break;
2737     case MSR_STAR:
2738         msr_info->data = svm->vmcb01.ptr->save.star;
2739         break;
2740 #ifdef CONFIG_X86_64
2741     case MSR_LSTAR:
2742         msr_info->data = svm->vmcb01.ptr->save.lstar;
2743         break;
2744     case MSR_CSTAR:
2745         msr_info->data = svm->vmcb01.ptr->save.cstar;
2746         break;
2747     case MSR_KERNEL_GS_BASE:
2748         msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
2749         break;
2750     case MSR_SYSCALL_MASK:
2751         msr_info->data = svm->vmcb01.ptr->save.sfmask;
2752         break;
2753 #endif
2754     case MSR_IA32_SYSENTER_CS:
2755         msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
2756         break;
2757     case MSR_IA32_SYSENTER_EIP:
2758         msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
2759         if (guest_cpuid_is_intel(vcpu))
2760             msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
2761         break;
2762     case MSR_IA32_SYSENTER_ESP:
2763         msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
2764         if (guest_cpuid_is_intel(vcpu))
2765             msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
2766         break;
2767     case MSR_TSC_AUX:
2768         msr_info->data = svm->tsc_aux;
2769         break;
2770     case MSR_IA32_DEBUGCTLMSR:
2771     case MSR_IA32_LASTBRANCHFROMIP:
2772     case MSR_IA32_LASTBRANCHTOIP:
2773     case MSR_IA32_LASTINTFROMIP:
2774     case MSR_IA32_LASTINTTOIP:
2775         msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
2776         break;
2777     case MSR_VM_HSAVE_PA:
2778         msr_info->data = svm->nested.hsave_msr;
2779         break;
2780     case MSR_VM_CR:
2781         msr_info->data = svm->nested.vm_cr_msr;
2782         break;
2783     case MSR_IA32_SPEC_CTRL:
2784         if (!msr_info->host_initiated &&
2785             !guest_has_spec_ctrl_msr(vcpu))
2786             return 1;
2787 
2788         if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2789             msr_info->data = svm->vmcb->save.spec_ctrl;
2790         else
2791             msr_info->data = svm->spec_ctrl;
2792         break;
2793     case MSR_AMD64_VIRT_SPEC_CTRL:
2794         if (!msr_info->host_initiated &&
2795             !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2796             return 1;
2797 
2798         msr_info->data = svm->virt_spec_ctrl;
2799         break;
2800     case MSR_F15H_IC_CFG: {
2801 
2802         int family, model;
2803 
2804         family = guest_cpuid_family(vcpu);
2805         model  = guest_cpuid_model(vcpu);
2806 
2807         if (family < 0 || model < 0)
2808             return kvm_get_msr_common(vcpu, msr_info);
2809 
2810         msr_info->data = 0;
2811 
2812         if (family == 0x15 &&
2813             (model >= 0x2 && model < 0x20))
2814             msr_info->data = 0x1E;
2815         }
2816         break;
2817     case MSR_F10H_DECFG:
2818         msr_info->data = svm->msr_decfg;
2819         break;
2820     default:
2821         return kvm_get_msr_common(vcpu, msr_info);
2822     }
2823     return 0;
2824 }
2825 
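     /*
      * Completion callback for emulated MSR accesses.  A failed access by an
      * SEV-ES guest cannot be handled by injecting #GP directly; instead the
      * error and the #GP event are reported back via the GHCB exit-info
      * fields so the guest's #VC handler can raise the fault.
      */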
2826 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2827 {
2828     struct vcpu_svm *svm = to_svm(vcpu);
2829     if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
2830         return kvm_complete_insn_gp(vcpu, err);
2831 
2832     ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
2833     ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
2834                 X86_TRAP_GP |
2835                 SVM_EVTINJ_TYPE_EXEPT |
2836                 SVM_EVTINJ_VALID);
2837     return 1;
2838 }
2839 
2840 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2841 {
2842     struct vcpu_svm *svm = to_svm(vcpu);
2843     int svm_dis, chg_mask;
2844 
2845     if (data & ~SVM_VM_CR_VALID_MASK)
2846         return 1;
2847 
2848     chg_mask = SVM_VM_CR_VALID_MASK;
2849 
2850     if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2851         chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2852 
2853     svm->nested.vm_cr_msr &= ~chg_mask;
2854     svm->nested.vm_cr_msr |= (data & chg_mask);
2855 
2856     svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2857 
2858     /* check for svm_disable while efer.svme is set */
2859     if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2860         return 1;
2861 
2862     return 0;
2863 }
2864 
2865 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2866 {
2867     struct vcpu_svm *svm = to_svm(vcpu);
2868     int r;
2869 
2870     u32 ecx = msr->index;
2871     u64 data = msr->data;
2872     switch (ecx) {
2873     case MSR_AMD64_TSC_RATIO:
2874 
2875         if (!svm->tsc_scaling_enabled) {
2876 
2877             if (!msr->host_initiated)
2878                 return 1;
2879             /*
2880              * In case TSC scaling is not enabled, always
2881              * leave this MSR at the default value.
2882              *
2883              * Due to a bug in QEMU 6.2.0, it would try to set
2884              * this MSR to 0 if TSC scaling is not enabled.
2885              * Ignore this value as well.
2886              */
2887             if (data != 0 && data != svm->tsc_ratio_msr)
2888                 return 1;
2889             break;
2890         }
2891 
2892         if (data & SVM_TSC_RATIO_RSVD)
2893             return 1;
2894 
2895         svm->tsc_ratio_msr = data;
2896 
2897         if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
2898             nested_svm_update_tsc_ratio_msr(vcpu);
2899 
2900         break;
2901     case MSR_IA32_CR_PAT:
2902         if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
2903             return 1;
2904         vcpu->arch.pat = data;
2905         svm->vmcb01.ptr->save.g_pat = data;
2906         if (is_guest_mode(vcpu))
2907             nested_vmcb02_compute_g_pat(svm);
2908         vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
2909         break;
2910     case MSR_IA32_SPEC_CTRL:
2911         if (!msr->host_initiated &&
2912             !guest_has_spec_ctrl_msr(vcpu))
2913             return 1;
2914 
2915         if (kvm_spec_ctrl_test_value(data))
2916             return 1;
2917 
2918         if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2919             svm->vmcb->save.spec_ctrl = data;
2920         else
2921             svm->spec_ctrl = data;
2922         if (!data)
2923             break;
2924 
2925         /*
2926          * For non-nested:
2927          * When it's written (to non-zero) for the first time, pass
2928          * it through.
2929          *
2930          * For nested:
2931          * The handling of the MSR bitmap for L2 guests is done in
2932          * nested_svm_vmrun_msrpm.
2933          * We update the L1 MSR bit as well since it will end up
2934          * touching the MSR anyway now.
2935          */
2936         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
2937         break;
2938     case MSR_IA32_PRED_CMD:
2939         if (!msr->host_initiated &&
2940             !guest_has_pred_cmd_msr(vcpu))
2941             return 1;
2942 
2943         if (data & ~PRED_CMD_IBPB)
2944             return 1;
2945         if (!boot_cpu_has(X86_FEATURE_IBPB))
2946             return 1;
2947         if (!data)
2948             break;
2949 
2950         wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2951         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
2952         break;
2953     case MSR_AMD64_VIRT_SPEC_CTRL:
2954         if (!msr->host_initiated &&
2955             !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2956             return 1;
2957 
2958         if (data & ~SPEC_CTRL_SSBD)
2959             return 1;
2960 
2961         svm->virt_spec_ctrl = data;
2962         break;
2963     case MSR_STAR:
2964         svm->vmcb01.ptr->save.star = data;
2965         break;
2966 #ifdef CONFIG_X86_64
2967     case MSR_LSTAR:
2968         svm->vmcb01.ptr->save.lstar = data;
2969         break;
2970     case MSR_CSTAR:
2971         svm->vmcb01.ptr->save.cstar = data;
2972         break;
2973     case MSR_KERNEL_GS_BASE:
2974         svm->vmcb01.ptr->save.kernel_gs_base = data;
2975         break;
2976     case MSR_SYSCALL_MASK:
2977         svm->vmcb01.ptr->save.sfmask = data;
2978         break;
2979 #endif
2980     case MSR_IA32_SYSENTER_CS:
2981         svm->vmcb01.ptr->save.sysenter_cs = data;
2982         break;
2983     case MSR_IA32_SYSENTER_EIP:
2984         svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
2985         /*
2986          * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} MSRs
2987          * when we spoof an Intel vendor ID (for cross-vendor migration).
2988          * In this case we use this intercept to track the high
2989          * 32-bit part of these MSRs to support Intel's
2990          * implementation of SYSENTER/SYSEXIT.
2991          */
2992         svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
2993         break;
2994     case MSR_IA32_SYSENTER_ESP:
2995         svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
2996         svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
2997         break;
2998     case MSR_TSC_AUX:
2999         /*
3000          * TSC_AUX is usually changed only during boot and never read
3001          * directly.  Intercept TSC_AUX instead of exposing it to the
3002          * guest via direct_access_msrs, and switch it via user return.
3003          */
3004         preempt_disable();
3005         r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
3006         preempt_enable();
3007         if (r)
3008             return 1;
3009 
3010         svm->tsc_aux = data;
3011         break;
3012     case MSR_IA32_DEBUGCTLMSR:
3013         if (!lbrv) {
3014             vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3015                     __func__, data);
3016             break;
3017         }
3018         if (data & DEBUGCTL_RESERVED_BITS)
3019             return 1;
3020 
3021         if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
3022             svm->vmcb->save.dbgctl = data;
3023         else
3024             svm->vmcb01.ptr->save.dbgctl = data;
3025 
3026         svm_update_lbrv(vcpu);
3027 
3028         break;
3029     case MSR_VM_HSAVE_PA:
3030         /*
3031          * Old kernels did not validate the value written to
3032          * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
3033          * value to allow live migrating buggy or malicious guests
3034          * originating from those kernels.
3035          */
3036         if (!msr->host_initiated && !page_address_valid(vcpu, data))
3037             return 1;
3038 
3039         svm->nested.hsave_msr = data & PAGE_MASK;
3040         break;
3041     case MSR_VM_CR:
3042         return svm_set_vm_cr(vcpu, data);
3043     case MSR_VM_IGNNE:
3044         vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3045         break;
3046     case MSR_F10H_DECFG: {
3047         struct kvm_msr_entry msr_entry;
3048 
3049         msr_entry.index = msr->index;
3050         if (svm_get_msr_feature(&msr_entry))
3051             return 1;
3052 
3053         /* Check the supported bits */
3054         if (data & ~msr_entry.data)
3055             return 1;
3056 
3057         /* Don't allow the guest to change a bit, #GP */
3058         if (!msr->host_initiated && (data ^ msr_entry.data))
3059             return 1;
3060 
3061         svm->msr_decfg = data;
3062         break;
3063     }
3064     default:
3065         return kvm_set_msr_common(vcpu, msr);
3066     }
3067     return 0;
3068 }
3069 
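     /* exit_info_1 is 1 for a WRMSR intercept and 0 for RDMSR. */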
3070 static int msr_interception(struct kvm_vcpu *vcpu)
3071 {
3072     if (to_svm(vcpu)->vmcb->control.exit_info_1)
3073         return kvm_emulate_wrmsr(vcpu);
3074     else
3075         return kvm_emulate_rdmsr(vcpu);
3076 }
3077 
3078 static int interrupt_window_interception(struct kvm_vcpu *vcpu)
3079 {
3080     kvm_make_request(KVM_REQ_EVENT, vcpu);
3081     svm_clear_vintr(to_svm(vcpu));
3082 
3083     /*
3084      * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
3085      * In this case AVIC was temporarily disabled for
3086      * requesting the IRQ window and we have to re-enable it.
3087      *
3088      * If running nested, still remove the VM wide AVIC inhibit to
3089      * support the case in which the interrupt window was requested when the
3090      * vCPU was not running nested.
3091      *
3092      * All vCPUs which still run nested will keep their AVIC inhibited due
3093      * to the per-vCPU AVIC inhibition.
3094      */
3095     kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3096 
3097     ++vcpu->stat.irq_window_exits;
3098     return 1;
3099 }
3100 
3101 static int pause_interception(struct kvm_vcpu *vcpu)
3102 {
3103     bool in_kernel;
3104     /*
3105      * CPL is not made available for an SEV-ES guest, therefore
3106      * vcpu->arch.preempted_in_kernel can never be true.  Just
3107      * set in_kernel to false as well.
3108      */
3109     in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
3110 
3111     grow_ple_window(vcpu);
3112 
3113     kvm_vcpu_on_spin(vcpu, in_kernel);
3114     return kvm_skip_emulated_instruction(vcpu);
3115 }
3116 
3117 static int invpcid_interception(struct kvm_vcpu *vcpu)
3118 {
3119     struct vcpu_svm *svm = to_svm(vcpu);
3120     unsigned long type;
3121     gva_t gva;
3122 
3123     if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
3124         kvm_queue_exception(vcpu, UD_VECTOR);
3125         return 1;
3126     }
3127 
3128     /*
3129      * For an INVPCID intercept:
3130      * EXITINFO1 provides the linear address of the memory operand.
3131      * EXITINFO2 provides the contents of the register operand.
3132      */
3133     type = svm->vmcb->control.exit_info_2;
3134     gva = svm->vmcb->control.exit_info_1;
3135 
3136     return kvm_handle_invpcid(vcpu, type, gva);
3137 }
3138 
3139 static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3140     [SVM_EXIT_READ_CR0]         = cr_interception,
3141     [SVM_EXIT_READ_CR3]         = cr_interception,
3142     [SVM_EXIT_READ_CR4]         = cr_interception,
3143     [SVM_EXIT_READ_CR8]         = cr_interception,
3144     [SVM_EXIT_CR0_SEL_WRITE]        = cr_interception,
3145     [SVM_EXIT_WRITE_CR0]            = cr_interception,
3146     [SVM_EXIT_WRITE_CR3]            = cr_interception,
3147     [SVM_EXIT_WRITE_CR4]            = cr_interception,
3148     [SVM_EXIT_WRITE_CR8]            = cr8_write_interception,
3149     [SVM_EXIT_READ_DR0]         = dr_interception,
3150     [SVM_EXIT_READ_DR1]         = dr_interception,
3151     [SVM_EXIT_READ_DR2]         = dr_interception,
3152     [SVM_EXIT_READ_DR3]         = dr_interception,
3153     [SVM_EXIT_READ_DR4]         = dr_interception,
3154     [SVM_EXIT_READ_DR5]         = dr_interception,
3155     [SVM_EXIT_READ_DR6]         = dr_interception,
3156     [SVM_EXIT_READ_DR7]         = dr_interception,
3157     [SVM_EXIT_WRITE_DR0]            = dr_interception,
3158     [SVM_EXIT_WRITE_DR1]            = dr_interception,
3159     [SVM_EXIT_WRITE_DR2]            = dr_interception,
3160     [SVM_EXIT_WRITE_DR3]            = dr_interception,
3161     [SVM_EXIT_WRITE_DR4]            = dr_interception,
3162     [SVM_EXIT_WRITE_DR5]            = dr_interception,
3163     [SVM_EXIT_WRITE_DR6]            = dr_interception,
3164     [SVM_EXIT_WRITE_DR7]            = dr_interception,
3165     [SVM_EXIT_EXCP_BASE + DB_VECTOR]    = db_interception,
3166     [SVM_EXIT_EXCP_BASE + BP_VECTOR]    = bp_interception,
3167     [SVM_EXIT_EXCP_BASE + UD_VECTOR]    = ud_interception,
3168     [SVM_EXIT_EXCP_BASE + PF_VECTOR]    = pf_interception,
3169     [SVM_EXIT_EXCP_BASE + MC_VECTOR]    = mc_interception,
3170     [SVM_EXIT_EXCP_BASE + AC_VECTOR]    = ac_interception,
3171     [SVM_EXIT_EXCP_BASE + GP_VECTOR]    = gp_interception,
3172     [SVM_EXIT_INTR]             = intr_interception,
3173     [SVM_EXIT_NMI]              = nmi_interception,
3174     [SVM_EXIT_SMI]              = smi_interception,
3175     [SVM_EXIT_VINTR]            = interrupt_window_interception,
3176     [SVM_EXIT_RDPMC]            = kvm_emulate_rdpmc,
3177     [SVM_EXIT_CPUID]            = kvm_emulate_cpuid,
3178     [SVM_EXIT_IRET]                         = iret_interception,
3179     [SVM_EXIT_INVD]                         = kvm_emulate_invd,
3180     [SVM_EXIT_PAUSE]            = pause_interception,
3181     [SVM_EXIT_HLT]              = kvm_emulate_halt,
3182     [SVM_EXIT_INVLPG]           = invlpg_interception,
3183     [SVM_EXIT_INVLPGA]          = invlpga_interception,
3184     [SVM_EXIT_IOIO]             = io_interception,
3185     [SVM_EXIT_MSR]              = msr_interception,
3186     [SVM_EXIT_TASK_SWITCH]          = task_switch_interception,
3187     [SVM_EXIT_SHUTDOWN]         = shutdown_interception,
3188     [SVM_EXIT_VMRUN]            = vmrun_interception,
3189     [SVM_EXIT_VMMCALL]          = kvm_emulate_hypercall,
3190     [SVM_EXIT_VMLOAD]           = vmload_interception,
3191     [SVM_EXIT_VMSAVE]           = vmsave_interception,
3192     [SVM_EXIT_STGI]             = stgi_interception,
3193     [SVM_EXIT_CLGI]             = clgi_interception,
3194     [SVM_EXIT_SKINIT]           = skinit_interception,
3195     [SVM_EXIT_RDTSCP]           = kvm_handle_invalid_op,
3196     [SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
3197     [SVM_EXIT_MONITOR]          = kvm_emulate_monitor,
3198     [SVM_EXIT_MWAIT]            = kvm_emulate_mwait,
3199     [SVM_EXIT_XSETBV]           = kvm_emulate_xsetbv,
3200     [SVM_EXIT_RDPRU]            = kvm_handle_invalid_op,
3201     [SVM_EXIT_EFER_WRITE_TRAP]      = efer_trap,
3202     [SVM_EXIT_CR0_WRITE_TRAP]       = cr_trap,
3203     [SVM_EXIT_CR4_WRITE_TRAP]       = cr_trap,
3204     [SVM_EXIT_CR8_WRITE_TRAP]       = cr_trap,
3205     [SVM_EXIT_INVPCID]                      = invpcid_interception,
3206     [SVM_EXIT_NPF]              = npf_interception,
3207     [SVM_EXIT_RSM]                          = rsm_interception,
3208     [SVM_EXIT_AVIC_INCOMPLETE_IPI]      = avic_incomplete_ipi_interception,
3209     [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
3210     [SVM_EXIT_VMGEXIT]          = sev_handle_vmgexit,
3211 };
3212 
3213 static void dump_vmcb(struct kvm_vcpu *vcpu)
3214 {
3215     struct vcpu_svm *svm = to_svm(vcpu);
3216     struct vmcb_control_area *control = &svm->vmcb->control;
3217     struct vmcb_save_area *save = &svm->vmcb->save;
3218     struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
3219 
3220     if (!dump_invalid_vmcb) {
3221         pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3222         return;
3223     }
3224 
3225     pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
3226            svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
3227     pr_err("VMCB Control Area:\n");
3228     pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
3229     pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
3230     pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
3231     pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
3232     pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
3233     pr_err("%-20s%08x %08x\n", "intercepts:",
3234               control->intercepts[INTERCEPT_WORD3],
3235            control->intercepts[INTERCEPT_WORD4]);
3236     pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3237     pr_err("%-20s%d\n", "pause filter threshold:",
3238            control->pause_filter_thresh);
3239     pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3240     pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3241     pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3242     pr_err("%-20s%d\n", "asid:", control->asid);
3243     pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3244     pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3245     pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3246     pr_err("%-20s%08x\n", "int_state:", control->int_state);
3247     pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3248     pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3249     pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3250     pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3251     pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3252     pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3253     pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3254     pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
3255     pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
3256     pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3257     pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3258     pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
3259     pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3260     pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3261     pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3262     pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
3263     pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
3264     pr_err("VMCB State Save Area:\n");
3265     pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3266            "es:",
3267            save->es.selector, save->es.attrib,
3268            save->es.limit, save->es.base);
3269     pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3270            "cs:",
3271            save->cs.selector, save->cs.attrib,
3272            save->cs.limit, save->cs.base);
3273     pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3274            "ss:",
3275            save->ss.selector, save->ss.attrib,
3276            save->ss.limit, save->ss.base);
3277     pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3278            "ds:",
3279            save->ds.selector, save->ds.attrib,
3280            save->ds.limit, save->ds.base);
3281     pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3282            "fs:",
3283            save01->fs.selector, save01->fs.attrib,
3284            save01->fs.limit, save01->fs.base);
3285     pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3286            "gs:",
3287            save01->gs.selector, save01->gs.attrib,
3288            save01->gs.limit, save01->gs.base);
3289     pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3290            "gdtr:",
3291            save->gdtr.selector, save->gdtr.attrib,
3292            save->gdtr.limit, save->gdtr.base);
3293     pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3294            "ldtr:",
3295            save01->ldtr.selector, save01->ldtr.attrib,
3296            save01->ldtr.limit, save01->ldtr.base);
3297     pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3298            "idtr:",
3299            save->idtr.selector, save->idtr.attrib,
3300            save->idtr.limit, save->idtr.base);
3301     pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3302            "tr:",
3303            save01->tr.selector, save01->tr.attrib,
3304            save01->tr.limit, save01->tr.base);
3305     pr_err("vmpl: %d   cpl:  %d               efer:          %016llx\n",
3306            save->vmpl, save->cpl, save->efer);
3307     pr_err("%-15s %016llx %-13s %016llx\n",
3308            "cr0:", save->cr0, "cr2:", save->cr2);
3309     pr_err("%-15s %016llx %-13s %016llx\n",
3310            "cr3:", save->cr3, "cr4:", save->cr4);
3311     pr_err("%-15s %016llx %-13s %016llx\n",
3312            "dr6:", save->dr6, "dr7:", save->dr7);
3313     pr_err("%-15s %016llx %-13s %016llx\n",
3314            "rip:", save->rip, "rflags:", save->rflags);
3315     pr_err("%-15s %016llx %-13s %016llx\n",
3316            "rsp:", save->rsp, "rax:", save->rax);
3317     pr_err("%-15s %016llx %-13s %016llx\n",
3318            "star:", save01->star, "lstar:", save01->lstar);
3319     pr_err("%-15s %016llx %-13s %016llx\n",
3320            "cstar:", save01->cstar, "sfmask:", save01->sfmask);
3321     pr_err("%-15s %016llx %-13s %016llx\n",
3322            "kernel_gs_base:", save01->kernel_gs_base,
3323            "sysenter_cs:", save01->sysenter_cs);
3324     pr_err("%-15s %016llx %-13s %016llx\n",
3325            "sysenter_esp:", save01->sysenter_esp,
3326            "sysenter_eip:", save01->sysenter_eip);
3327     pr_err("%-15s %016llx %-13s %016llx\n",
3328            "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3329     pr_err("%-15s %016llx %-13s %016llx\n",
3330            "br_from:", save->br_from, "br_to:", save->br_to);
3331     pr_err("%-15s %016llx %-13s %016llx\n",
3332            "excp_from:", save->last_excp_from,
3333            "excp_to:", save->last_excp_to);
3334 }
3335 
3336 static bool svm_check_exit_valid(u64 exit_code)
3337 {
3338     return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
3339         svm_exit_handlers[exit_code]);
3340 }
3341 
3342 static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
3343 {
3344     vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
3345     dump_vmcb(vcpu);
3346     vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3347     vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
3348     vcpu->run->internal.ndata = 2;
3349     vcpu->run->internal.data[0] = exit_code;
3350     vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
3351     return 0;
3352 }
3353 
3354 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
3355 {
3356     if (!svm_check_exit_valid(exit_code))
3357         return svm_handle_invalid_exit(vcpu, exit_code);
3358 
3359 #ifdef CONFIG_RETPOLINE
3360     if (exit_code == SVM_EXIT_MSR)
3361         return msr_interception(vcpu);
3362     else if (exit_code == SVM_EXIT_VINTR)
3363         return interrupt_window_interception(vcpu);
3364     else if (exit_code == SVM_EXIT_INTR)
3365         return intr_interception(vcpu);
3366     else if (exit_code == SVM_EXIT_HLT)
3367         return kvm_emulate_halt(vcpu);
3368     else if (exit_code == SVM_EXIT_NPF)
3369         return npf_interception(vcpu);
3370 #endif
3371     return svm_exit_handlers[exit_code](vcpu);
3372 }
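/*
 * Editor's sketch (not KVM code): the exit dispatch above is a sparse table
 * of handler pointers indexed by exit code, guarded by a bounds check and a
 * NULL check before the indirect call.  A minimal, self-contained version of
 * that pattern; all names below are hypothetical.
 */
typedef int (*demo_handler_t)(void *ctx);

static int demo_handle_a(void *ctx) { (void)ctx; return 1; }
static int demo_handle_b(void *ctx) { (void)ctx; return 1; }

static demo_handler_t demo_handlers[] = {
    [0x00] = demo_handle_a,
    [0x7f] = demo_handle_b,     /* sparse: everything in between stays NULL */
};

static int demo_dispatch(unsigned long code, void *ctx)
{
    /* Reject codes beyond the table or without a registered handler. */
    if (code >= sizeof(demo_handlers) / sizeof(demo_handlers[0]) ||
        !demo_handlers[code])
        return -1;              /* caller reports an unexpected exit */

    return demo_handlers[code](ctx);
}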
3373 
3374 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
3375                   u64 *info1, u64 *info2,
3376                   u32 *intr_info, u32 *error_code)
3377 {
3378     struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3379 
3380     *reason = control->exit_code;
3381     *info1 = control->exit_info_1;
3382     *info2 = control->exit_info_2;
3383     *intr_info = control->exit_int_info;
3384     if ((*intr_info & SVM_EXITINTINFO_VALID) &&
3385         (*intr_info & SVM_EXITINTINFO_VALID_ERR))
3386         *error_code = control->exit_int_info_err;
3387     else
3388         *error_code = 0;
3389 }
3390 
3391 static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
3392 {
3393     struct vcpu_svm *svm = to_svm(vcpu);
3394     struct kvm_run *kvm_run = vcpu->run;
3395     u32 exit_code = svm->vmcb->control.exit_code;
3396 
3397     trace_kvm_exit(vcpu, KVM_ISA_SVM);
3398 
3399     /* SEV-ES guests must use the CR write traps to track CR registers. */
3400     if (!sev_es_guest(vcpu->kvm)) {
3401         if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
3402             vcpu->arch.cr0 = svm->vmcb->save.cr0;
3403         if (npt_enabled)
3404             vcpu->arch.cr3 = svm->vmcb->save.cr3;
3405     }
3406 
3407     if (is_guest_mode(vcpu)) {
3408         int vmexit;
3409 
3410         trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
3411 
3412         vmexit = nested_svm_exit_special(svm);
3413 
3414         if (vmexit == NESTED_EXIT_CONTINUE)
3415             vmexit = nested_svm_exit_handled(svm);
3416 
3417         if (vmexit == NESTED_EXIT_DONE)
3418             return 1;
3419     }
3420 
3421     if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3422         kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3423         kvm_run->fail_entry.hardware_entry_failure_reason
3424             = svm->vmcb->control.exit_code;
3425         kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
3426         dump_vmcb(vcpu);
3427         return 0;
3428     }
3429 
3430     if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
3431         exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
3432         exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3433         exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
3434         printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
3435                "exit_code 0x%x\n",
3436                __func__, svm->vmcb->control.exit_int_info,
3437                exit_code);
3438 
3439     if (exit_fastpath != EXIT_FASTPATH_NONE)
3440         return 1;
3441 
3442     return svm_invoke_exit_handler(vcpu, exit_code);
3443 }
3444 
3445 static void reload_tss(struct kvm_vcpu *vcpu)
3446 {
3447     struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
3448 
3449     sd->tss_desc->type = 9; /* available 32/64-bit TSS */
3450     load_TR_desc();
3451 }
3452 
3453 static void pre_svm_run(struct kvm_vcpu *vcpu)
3454 {
3455     struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
3456     struct vcpu_svm *svm = to_svm(vcpu);
3457 
3458     /*
3459      * If the previous vmrun of the vmcb occurred on a different physical
3460      * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
3461      * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
3462      */
3463     if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
3464         svm->current_vmcb->asid_generation = 0;
3465         vmcb_mark_all_dirty(svm->vmcb);
3466         svm->current_vmcb->cpu = vcpu->cpu;
3467     }
3468 
3469     if (sev_guest(vcpu->kvm))
3470         return pre_sev_run(svm, vcpu->cpu);
3471 
3472     /* FIXME: handle wraparound of asid_generation */
3473     if (svm->current_vmcb->asid_generation != sd->asid_generation)
3474         new_asid(svm, sd);
3475 }
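/*
 * Editor's sketch (not KVM code): pre_svm_run() relies on a generation
 * counter to lazily invalidate per-vCPU ASIDs.  Bumping the per-CPU
 * generation (e.g. when the ASID space is exhausted, or forcing a mismatch
 * by zeroing the vCPU's cached generation) makes every cached ASID stale
 * without touching each vCPU.  Hypothetical names and limits.
 */
struct demo_cpu_asid {
    unsigned int generation;    /* bumped whenever the ASID space is recycled */
    unsigned int next_asid;
    unsigned int max_asid;      /* assumed hardware limit */
};

struct demo_vcpu_asid {
    unsigned int generation;    /* generation the cached asid was handed out in */
    unsigned int asid;
};

static void demo_assign_asid(struct demo_vcpu_asid *v, struct demo_cpu_asid *cpu)
{
    /* The cached ASID is only valid if it belongs to the current generation. */
    if (v->generation == cpu->generation)
        return;

    if (cpu->next_asid > cpu->max_asid) {
        /* Out of ASIDs: start a new generation (a TLB flush would go here). */
        cpu->generation++;
        cpu->next_asid = 1;
    }

    v->asid = cpu->next_asid++;
    v->generation = cpu->generation;
}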
3476 
3477 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3478 {
3479     struct vcpu_svm *svm = to_svm(vcpu);
3480 
3481     svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3482 
3483     if (svm->nmi_l1_to_l2)
3484         return;
3485 
3486     vcpu->arch.hflags |= HF_NMI_MASK;
3487     if (!sev_es_guest(vcpu->kvm))
3488         svm_set_intercept(svm, INTERCEPT_IRET);
3489     ++vcpu->stat.nmi_injections;
3490 }
3491 
3492 static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
3493 {
3494     struct vcpu_svm *svm = to_svm(vcpu);
3495     u32 type;
3496 
3497     if (vcpu->arch.interrupt.soft) {
3498         if (svm_update_soft_interrupt_rip(vcpu))
3499             return;
3500 
3501         type = SVM_EVTINJ_TYPE_SOFT;
3502     } else {
3503         type = SVM_EVTINJ_TYPE_INTR;
3504     }
3505 
3506     trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
3507                vcpu->arch.interrupt.soft, reinjected);
3508     ++vcpu->stat.irq_injections;
3509 
3510     svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3511                        SVM_EVTINJ_VALID | type;
3512 }
3513 
3514 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
3515                      int trig_mode, int vector)
3516 {
3517     /*
3518      * apic->apicv_active must be read after vcpu->mode.
3519      * Pairs with smp_store_release in vcpu_enter_guest.
3520      */
3521     bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
3522 
3523     /* Note, this is called iff the local APIC is in-kernel. */
3524     if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
3525         /* Process the interrupt via inject_pending_event */
3526         kvm_make_request(KVM_REQ_EVENT, vcpu);
3527         kvm_vcpu_kick(vcpu);
3528         return;
3529     }
3530 
3531     trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
3532     if (in_guest_mode) {
3533         /*
3534          * Signal the doorbell to tell hardware to inject the IRQ.  If
3535          * the vCPU exits the guest before the doorbell chimes, hardware
3536          * will automatically process AVIC interrupts at the next VMRUN.
3537          */
3538         avic_ring_doorbell(vcpu);
3539     } else {
3540         /*
3541          * Wake the vCPU if it was blocking.  KVM will then detect the
3542          * pending IRQ when checking if the vCPU has a wake event.
3543          */
3544         kvm_vcpu_wake_up(vcpu);
3545     }
3546 }
3547 
3548 static void svm_deliver_interrupt(struct kvm_lapic *apic,  int delivery_mode,
3549                   int trig_mode, int vector)
3550 {
3551     kvm_lapic_set_irr(vector, apic);
3552 
3553     /*
3554      * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
3555      * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
3556      * the read of guest_mode.  This guarantees that either VMRUN will see
3557      * and process the new vIRR entry, or that svm_complete_interrupt_delivery
3558      * will signal the doorbell if the CPU has already entered the guest.
3559      */
3560     smp_mb__after_atomic();
3561     svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
3562 }
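/*
 * Editor's sketch (not KVM code): the delivery path above is the classic
 * "store-buffering" handshake.  The sender publishes the pending-IRQ bit and
 * then, after a full barrier, checks whether the target is in the guest; the
 * target marks itself in-guest and then, after the pairing barrier, consumes
 * pending bits at VMRUN.  With both barriers present, at least one side is
 * guaranteed to observe the other's store, so the IRQ is either picked up by
 * VMRUN or chased with a doorbell/kick.  Hypothetical names; GCC/Clang
 * __atomic builtins are used so the snippet is self-contained.
 */
static int demo_irq_pending;
static int demo_in_guest;

static int demo_sender(void)
{
    __atomic_store_n(&demo_irq_pending, 1, __ATOMIC_RELAXED);
    __atomic_thread_fence(__ATOMIC_SEQ_CST);    /* pairs with the fence below */

    /* 1 = ring the doorbell, 0 = kick/wake the vCPU so it re-checks. */
    return __atomic_load_n(&demo_in_guest, __ATOMIC_RELAXED);
}

static int demo_target_entry(void)
{
    __atomic_store_n(&demo_in_guest, 1, __ATOMIC_RELAXED);
    __atomic_thread_fence(__ATOMIC_SEQ_CST);    /* pairs with the fence above */

    /* Non-zero means a pending IRQ must be processed before/at VMRUN. */
    return __atomic_load_n(&demo_irq_pending, __ATOMIC_RELAXED);
}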
3563 
3564 static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3565 {
3566     struct vcpu_svm *svm = to_svm(vcpu);
3567 
3568     /*
3569      * SEV-ES guests must always keep the CR intercepts cleared. CR
3570      * tracking is done using the CR write traps.
3571      */
3572     if (sev_es_guest(vcpu->kvm))
3573         return;
3574 
3575     if (nested_svm_virtualize_tpr(vcpu))
3576         return;
3577 
3578     svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
3579 
3580     if (irr == -1)
3581         return;
3582 
3583     if (tpr >= irr)
3584         svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
3585 }
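/*
 * Editor's sketch (not KVM code): the CR8/TPR logic above only needs a VM
 * exit on TPR writes when the highest pending interrupt priority does not
 * already exceed the task priority, i.e. when lowering the TPR could unmask
 * a pending interrupt.  Hypothetical names.
 */
static int demo_need_cr8_write_intercept(int tpr, int highest_pending_irr)
{
    if (highest_pending_irr < 0)        /* nothing pending: no need to trap */
        return 0;

    /*
     * The pending interrupt already outranks the TPR, so it will be
     * delivered regardless of further TPR writes: no trap needed.
     */
    if (highest_pending_irr > tpr)
        return 0;

    return 1;   /* a TPR drop could unmask it: trap CR8 writes */
}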
3586 
3587 bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
3588 {
3589     struct vcpu_svm *svm = to_svm(vcpu);
3590     struct vmcb *vmcb = svm->vmcb;
3591     bool ret;
3592 
3593     if (!gif_set(svm))
3594         return true;
3595 
3596     if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3597         return false;
3598 
3599     ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
3600           (vcpu->arch.hflags & HF_NMI_MASK);
3601 
3602     return ret;
3603 }
3604 
3605 static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3606 {
3607     struct vcpu_svm *svm = to_svm(vcpu);
3608     if (svm->nested.nested_run_pending)
3609         return -EBUSY;
3610 
3611     if (svm_nmi_blocked(vcpu))
3612         return 0;
3613 
3614     /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
3615     if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3616         return -EBUSY;
3617     return 1;
3618 }
3619 
3620 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3621 {
3622     return !!(vcpu->arch.hflags & HF_NMI_MASK);
3623 }
3624 
3625 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3626 {
3627     struct vcpu_svm *svm = to_svm(vcpu);
3628 
3629     if (masked) {
3630         vcpu->arch.hflags |= HF_NMI_MASK;
3631         if (!sev_es_guest(vcpu->kvm))
3632             svm_set_intercept(svm, INTERCEPT_IRET);
3633     } else {
3634         vcpu->arch.hflags &= ~HF_NMI_MASK;
3635         if (!sev_es_guest(vcpu->kvm))
3636             svm_clr_intercept(svm, INTERCEPT_IRET);
3637     }
3638 }
3639 
3640 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
3641 {
3642     struct vcpu_svm *svm = to_svm(vcpu);
3643     struct vmcb *vmcb = svm->vmcb;
3644 
3645     if (!gif_set(svm))
3646         return true;
3647 
3648     if (is_guest_mode(vcpu)) {
3649         /* As long as interrupts are being delivered...  */
3650         if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
3651             ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
3652             : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3653             return true;
3654 
3655         /* ... vmexits aren't blocked by the interrupt shadow  */
3656         if (nested_exit_on_intr(svm))
3657             return false;
3658     } else {
3659         if (!svm_get_if_flag(vcpu))
3660             return true;
3661     }
3662 
3663     return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
3664 }
3665 
3666 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3667 {
3668     struct vcpu_svm *svm = to_svm(vcpu);
3669 
3670     if (svm->nested.nested_run_pending)
3671         return -EBUSY;
3672 
3673     if (svm_interrupt_blocked(vcpu))
3674         return 0;
3675 
3676     /*
3677      * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
3678      * e.g. if the IRQ arrived asynchronously after checking nested events.
3679      */
3680     if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
3681         return -EBUSY;
3682 
3683     return 1;
3684 }
3685 
3686 static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
3687 {
3688     struct vcpu_svm *svm = to_svm(vcpu);
3689 
3690     /*
3691      * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3692      * 1, because that's a separate STGI/VMRUN intercept.  The next time we
3693      * get that intercept, this function will be called again though and
3694      * we'll get the vintr intercept. However, if the vGIF feature is
3695      * enabled, the STGI interception will not occur. Enable the irq
3696      * window under the assumption that the hardware will set the GIF.
3697      */
3698     if (vgif || gif_set(svm)) {
3699         /*
3700          * IRQ window is not needed when AVIC is enabled,
3701          * unless we have pending ExtINT since it cannot be injected
3702          * via AVIC. In that case, KVM needs to temporarily disable AVIC
3703          * and fall back to injecting the IRQ via V_IRQ.
3704          *
3705          * If running nested, AVIC is already locally inhibited
3706          * on this vCPU, therefore there is no need to request
3707          * the VM wide AVIC inhibition.
3708          */
3709         if (!is_guest_mode(vcpu))
3710             kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3711 
3712         svm_set_vintr(svm);
3713     }
3714 }
3715 
3716 static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
3717 {
3718     struct vcpu_svm *svm = to_svm(vcpu);
3719 
3720     if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
3721         return; /* IRET will cause a vm exit */
3722 
3723     if (!gif_set(svm)) {
3724         if (vgif)
3725             svm_set_intercept(svm, INTERCEPT_STGI);
3726         return; /* STGI will cause a vm exit */
3727     }
3728 
3729     /*
3730      * Something prevents the NMI from being injected. Single step over the
3731      * possible problem (IRET or exception injection or interrupt shadow).
3732      */
3733     svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
3734     svm->nmi_singlestep = true;
3735     svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3736 }
3737 
3738 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
3739 {
3740     struct vcpu_svm *svm = to_svm(vcpu);
3741 
3742     /*
3743      * Flush only the current ASID even if the TLB flush was invoked via
3744      * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
3745      * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
3746      * unconditionally does a TLB flush on both nested VM-Enter and nested
3747      * VM-Exit (via kvm_mmu_reset_context()).
3748      */
3749     if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3750         svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3751     else
3752         svm->current_vmcb->asid_generation--;
3753 }
3754 
3755 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
3756 {
3757     struct vcpu_svm *svm = to_svm(vcpu);
3758 
3759     invlpga(gva, svm->vmcb->control.asid);
3760 }
3761 
3762 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3763 {
3764     struct vcpu_svm *svm = to_svm(vcpu);
3765 
3766     if (nested_svm_virtualize_tpr(vcpu))
3767         return;
3768 
3769     if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
3770         int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3771         kvm_set_cr8(vcpu, cr8);
3772     }
3773 }
3774 
3775 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3776 {
3777     struct vcpu_svm *svm = to_svm(vcpu);
3778     u64 cr8;
3779 
3780     if (nested_svm_virtualize_tpr(vcpu) ||
3781         kvm_vcpu_apicv_active(vcpu))
3782         return;
3783 
3784     cr8 = kvm_get_cr8(vcpu);
3785     svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3786     svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3787 }
3788 
3789 static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
3790                     int type)
3791 {
3792     bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
3793     bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
3794     struct vcpu_svm *svm = to_svm(vcpu);
3795 
3796     /*
3797      * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
3798      * associated with the original soft exception/interrupt.  next_rip is
3799      * cleared on all exits that can occur while vectoring an event, so KVM
3800      * needs to manually set next_rip for re-injection.  Unlike the !nrips
3801      * case below, this needs to be done if and only if KVM is re-injecting
3802      * the same event, i.e. if the event is a soft exception/interrupt,
3803      * otherwise next_rip is unused on VMRUN.
3804      */
3805     if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
3806         kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
3807         svm->vmcb->control.next_rip = svm->soft_int_next_rip;
3808     /*
3809      * If NRIPS isn't enabled, KVM must manually advance RIP prior to
3810      * injecting the soft exception/interrupt.  That advancement needs to
3811      * be unwound if vectoring didn't complete.  Note, the new event may
3812      * not be the injected event, e.g. if KVM injected an INTn, the INTn
3813      * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
3814      * be the reported vectored event, but RIP still needs to be unwound.
3815      */
3816     else if (!nrips && (is_soft || is_exception) &&
3817          kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
3818         kvm_rip_write(vcpu, svm->soft_int_old_rip);
3819 }
3820 
3821 static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
3822 {
3823     struct vcpu_svm *svm = to_svm(vcpu);
3824     u8 vector;
3825     int type;
3826     u32 exitintinfo = svm->vmcb->control.exit_int_info;
3827     bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
3828     bool soft_int_injected = svm->soft_int_injected;
3829 
3830     svm->nmi_l1_to_l2 = false;
3831     svm->soft_int_injected = false;
3832 
3833     /*
3834      * If we've made progress since setting HF_IRET_MASK, we've
3835      * executed an IRET and can allow NMI injection.
3836      */
3837     if ((vcpu->arch.hflags & HF_IRET_MASK) &&
3838         (sev_es_guest(vcpu->kvm) ||
3839          kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
3840         vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3841         kvm_make_request(KVM_REQ_EVENT, vcpu);
3842     }
3843 
3844     vcpu->arch.nmi_injected = false;
3845     kvm_clear_exception_queue(vcpu);
3846     kvm_clear_interrupt_queue(vcpu);
3847 
3848     if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3849         return;
3850 
3851     kvm_make_request(KVM_REQ_EVENT, vcpu);
3852 
3853     vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3854     type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3855 
3856     if (soft_int_injected)
3857         svm_complete_soft_interrupt(vcpu, vector, type);
3858 
3859     switch (type) {
3860     case SVM_EXITINTINFO_TYPE_NMI:
3861         vcpu->arch.nmi_injected = true;
3862         svm->nmi_l1_to_l2 = nmi_l1_to_l2;
3863         break;
3864     case SVM_EXITINTINFO_TYPE_EXEPT:
3865         /*
3866          * Never re-inject a #VC exception.
3867          */
3868         if (vector == X86_TRAP_VC)
3869             break;
3870 
3871         if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
3872             u32 err = svm->vmcb->control.exit_int_info_err;
3873             kvm_requeue_exception_e(vcpu, vector, err);
3874 
3875         } else
3876             kvm_requeue_exception(vcpu, vector);
3877         break;
3878     case SVM_EXITINTINFO_TYPE_INTR:
3879         kvm_queue_interrupt(vcpu, vector, false);
3880         break;
3881     case SVM_EXITINTINFO_TYPE_SOFT:
3882         kvm_queue_interrupt(vcpu, vector, true);
3883         break;
3884     default:
3885         break;
3886     }
3887 
3888 }
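/*
 * Editor's sketch (not KVM code): svm_complete_interrupts() is driven by the
 * packed EXITINTINFO word - vector in bits 7:0, event type in bits 10:8,
 * "error code valid" in bit 11 and "valid" in bit 31, with the error code
 * delivered in a separate field.  A self-contained decode of that layout;
 * the constants are redefined locally purely for the sketch.
 */
#define DEMO_EVT_VEC_MASK   0x000000ffu
#define DEMO_EVT_TYPE_SHIFT 8
#define DEMO_EVT_TYPE_MASK  0x00000700u
#define DEMO_EVT_VALID_ERR  0x00000800u
#define DEMO_EVT_VALID      0x80000000u

struct demo_event {
    unsigned int vector;
    unsigned int type;
    int has_error_code;
};

static int demo_decode_event(unsigned int info, struct demo_event *ev)
{
    if (!(info & DEMO_EVT_VALID))
        return 0;       /* no event was being vectored */

    ev->vector = info & DEMO_EVT_VEC_MASK;
    ev->type = (info & DEMO_EVT_TYPE_MASK) >> DEMO_EVT_TYPE_SHIFT;
    ev->has_error_code = !!(info & DEMO_EVT_VALID_ERR);
    return 1;
}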
3889 
3890 static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3891 {
3892     struct vcpu_svm *svm = to_svm(vcpu);
3893     struct vmcb_control_area *control = &svm->vmcb->control;
3894 
3895     control->exit_int_info = control->event_inj;
3896     control->exit_int_info_err = control->event_inj_err;
3897     control->event_inj = 0;
3898     svm_complete_interrupts(vcpu);
3899 }
3900 
3901 static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
3902 {
3903     return 1;
3904 }
3905 
3906 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
3907 {
3908     if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
3909         to_svm(vcpu)->vmcb->control.exit_info_1)
3910         return handle_fastpath_set_msr_irqoff(vcpu);
3911 
3912     return EXIT_FASTPATH_NONE;
3913 }
3914 
3915 static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
3916 {
3917     struct vcpu_svm *svm = to_svm(vcpu);
3918     unsigned long vmcb_pa = svm->current_vmcb->pa;
3919 
3920     guest_state_enter_irqoff();
3921 
3922     if (sev_es_guest(vcpu->kvm)) {
3923         __svm_sev_es_vcpu_run(vmcb_pa);
3924     } else {
3925         struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
3926 
3927         /*
3928          * Use a single vmcb (vmcb01 because it's always valid) for
3929          * context switching guest state via VMLOAD/VMSAVE, that way
3930          * the state doesn't need to be copied between vmcb01 and
3931          * vmcb02 when switching vmcbs for nested virtualization.
3932          */
3933         vmload(svm->vmcb01.pa);
3934         __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs);
3935         vmsave(svm->vmcb01.pa);
3936 
3937         vmload(__sme_page_pa(sd->save_area));
3938     }
3939 
3940     guest_state_exit_irqoff();
3941 }
3942 
3943 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
3944 {
3945     struct vcpu_svm *svm = to_svm(vcpu);
3946 
3947     trace_kvm_entry(vcpu);
3948 
3949     svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3950     svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3951     svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
3952 
3953     /*
3954      * Disable singlestep if we're injecting an interrupt/exception.
3955      * We don't want our modified rflags to be pushed on the stack where
3956      * we might not be able to easily reset them if we disabled NMI
3957      * singlestep later.
3958      */
3959     if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
3960         /*
3961          * Event injection happens before external interrupts cause a
3962          * vmexit and interrupts are disabled here, so smp_send_reschedule
3963          * is enough to force an immediate vmexit.
3964          */
3965         disable_nmi_singlestep(svm);
3966         smp_send_reschedule(vcpu->cpu);
3967     }
3968 
3969     pre_svm_run(vcpu);
3970 
3971     sync_lapic_to_cr8(vcpu);
3972 
3973     if (unlikely(svm->asid != svm->vmcb->control.asid)) {
3974         svm->vmcb->control.asid = svm->asid;
3975         vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
3976     }
3977     svm->vmcb->save.cr2 = vcpu->arch.cr2;
3978 
3979     svm_hv_update_vp_id(svm->vmcb, vcpu);
3980 
3981     /*
3982      * Run with all-zero DR6 unless needed, so that we can get the exact cause
3983      * of a #DB.
3984      */
3985     if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
3986         svm_set_dr6(svm, vcpu->arch.dr6);
3987     else
3988         svm_set_dr6(svm, DR6_ACTIVE_LOW);
3989 
3990     clgi();
3991     kvm_load_guest_xsave_state(vcpu);
3992 
3993     kvm_wait_lapic_expire(vcpu);
3994 
3995     /*
3996      * If this vCPU has touched SPEC_CTRL, restore the guest's value if
3997      * it's non-zero. Since vmentry is serialising on affected CPUs, there
3998      * is no need to worry about the conditional branch over the wrmsr
3999      * being speculatively taken.
4000      */
4001     if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
4002         x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
4003 
4004     svm_vcpu_enter_exit(vcpu);
4005 
4006     /*
4007      * We do not use IBRS in the kernel. If this vCPU has used the
4008      * SPEC_CTRL MSR it may have left it on; save the value and
4009      * turn it off. This is much more efficient than blindly adding
4010      * it to the atomic save/restore list. Especially as the former
4011      * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
4012      *
4013      * For non-nested case:
4014      * If the L01 MSR bitmap does not intercept the MSR, then we need to
4015      * save it.
4016      *
4017      * For nested case:
4018      * If the L02 MSR bitmap does not intercept the MSR, then we need to
4019      * save it.
4020      */
4021     if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
4022         unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
4023         svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
4024 
4025     if (!sev_es_guest(vcpu->kvm))
4026         reload_tss(vcpu);
4027 
4028     if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
4029         x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
4030 
4031     if (!sev_es_guest(vcpu->kvm)) {
4032         vcpu->arch.cr2 = svm->vmcb->save.cr2;
4033         vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
4034         vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
4035         vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
4036     }
4037     vcpu->arch.regs_dirty = 0;
4038 
4039     if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4040         kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
4041 
4042     kvm_load_host_xsave_state(vcpu);
4043     stgi();
4044 
4045     /* Any pending NMI will happen here */
4046 
4047     if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4048         kvm_after_interrupt(vcpu);
4049 
4050     sync_cr8_to_lapic(vcpu);
4051 
4052     svm->next_rip = 0;
4053     if (is_guest_mode(vcpu)) {
4054         nested_sync_control_from_vmcb02(svm);
4055 
4056         /* Track VMRUNs that have made it past consistency checking */
4057         if (svm->nested.nested_run_pending &&
4058             svm->vmcb->control.exit_code != SVM_EXIT_ERR)
4059             ++vcpu->stat.nested_run;
4060 
4061         svm->nested.nested_run_pending = 0;
4062     }
4063 
4064     svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
4065     vmcb_mark_all_clean(svm->vmcb);
4066 
4067     /* If the exit was due to a #PF, check for async PF. */
4068     if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
4069         vcpu->arch.apf.host_apf_flags =
4070             kvm_read_and_reset_apf_flags();
4071 
4072     vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
4073 
4074     /*
4075      * We need to handle MC intercepts here before the vcpu has a chance to
4076      * change the physical cpu
4077      */
4078     if (unlikely(svm->vmcb->control.exit_code ==
4079              SVM_EXIT_EXCP_BASE + MC_VECTOR))
4080         svm_handle_mce(vcpu);
4081 
4082     svm_complete_interrupts(vcpu);
4083 
4084     if (is_guest_mode(vcpu))
4085         return EXIT_FASTPATH_NONE;
4086 
4087     return svm_exit_handlers_fastpath(vcpu);
4088 }
4089 
4090 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
4091                  int root_level)
4092 {
4093     struct vcpu_svm *svm = to_svm(vcpu);
4094     unsigned long cr3;
4095 
4096     if (npt_enabled) {
4097         svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
4098         vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
4099 
4100         hv_track_root_tdp(vcpu, root_hpa);
4101 
4102         cr3 = vcpu->arch.cr3;
4103     } else if (root_level >= PT64_ROOT_4LEVEL) {
4104         cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
4105     } else {
4106         /* PCID in the guest should be impossible with a 32-bit MMU. */
4107         WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
4108         cr3 = root_hpa;
4109     }
4110 
4111     svm->vmcb->save.cr3 = cr3;
4112     vmcb_mark_dirty(svm->vmcb, VMCB_CR);
4113 }
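/*
 * Editor's sketch (not KVM code): when NPT is disabled and the guest uses
 * 4-level paging, the shadow CR3 above is just the page-aligned root
 * combined with the active PCID in the low 12 bits.  Hypothetical helper;
 * e.g. demo_make_cr3(0x1234000, 5) == 0x1234005.
 */
static unsigned long demo_make_cr3(unsigned long root_pa, unsigned int pcid)
{
    /* The root must be 4KiB aligned; the PCID occupies bits 11:0. */
    return (root_pa & ~0xfffUL) | (pcid & 0xfffUL);
}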
4114 
4115 static int is_disabled(void)
4116 {
4117     u64 vm_cr;
4118 
4119     rdmsrl(MSR_VM_CR, vm_cr);
4120     if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
4121         return 1;
4122 
4123     return 0;
4124 }
4125 
4126 static void
4127 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4128 {
4129     /*
4130      * Patch in the VMMCALL instruction:
4131      */
4132     hypercall[0] = 0x0f;
4133     hypercall[1] = 0x01;
4134     hypercall[2] = 0xd9;
4135 }
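/*
 * Editor's sketch (not KVM code): the three bytes written above are the
 * encoding of VMMCALL (0F 01 D9 - two-byte opcode escape, group 7, /3 with
 * mod=11 rm=001).  A trivial, hypothetical guest-side check of that patch:
 */
static int demo_is_vmmcall(const unsigned char *insn)
{
    return insn[0] == 0x0f && insn[1] == 0x01 && insn[2] == 0xd9;
}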
4136 
4137 static int __init svm_check_processor_compat(void)
4138 {
4139     return 0;
4140 }
4141 
4142 /*
4143  * The kvm parameter can be NULL (module initialization, or invocation before
4144  * VM creation). Be sure to check the kvm parameter before using it.
4145  */
4146 static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
4147 {
4148     switch (index) {
4149     case MSR_IA32_MCG_EXT_CTL:
4150     case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
4151         return false;
4152     case MSR_IA32_SMBASE:
4153         /* SEV-ES guests do not support SMM, so report false */
4154         if (kvm && sev_es_guest(kvm))
4155             return false;
4156         break;
4157     default:
4158         break;
4159     }
4160 
4161     return true;
4162 }
4163 
4164 static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
4165 {
4166     struct vcpu_svm *svm = to_svm(vcpu);
4167     struct kvm_cpuid_entry2 *best;
4168 
4169     vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4170                     boot_cpu_has(X86_FEATURE_XSAVE) &&
4171                     boot_cpu_has(X86_FEATURE_XSAVES);
4172 
4173     /* Update nrips enabled cache */
4174     svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
4175                  guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
4176 
4177     svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
4178     svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
4179 
4180     svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
4181 
4182     svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
4183             guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
4184 
4185     svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
4186             guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
4187 
4188     svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
4189 
4190     svm_recalc_instruction_intercepts(vcpu, svm);
4191 
4192     /* For sev guests, the memory encryption bit is not reserved in CR3.  */
4193     if (sev_guest(vcpu->kvm)) {
4194         best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
4195         if (best)
4196             vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
4197     }
4198 
4199     init_vmcb_after_set_cpuid(vcpu);
4200 }
4201 
4202 static bool svm_has_wbinvd_exit(void)
4203 {
4204     return true;
4205 }
4206 
4207 #define PRE_EX(exit)  { .exit_code = (exit), \
4208             .stage = X86_ICPT_PRE_EXCEPT, }
4209 #define POST_EX(exit) { .exit_code = (exit), \
4210             .stage = X86_ICPT_POST_EXCEPT, }
4211 #define POST_MEM(exit) { .exit_code = (exit), \
4212             .stage = X86_ICPT_POST_MEMACCESS, }
4213 
4214 static const struct __x86_intercept {
4215     u32 exit_code;
4216     enum x86_intercept_stage stage;
4217 } x86_intercept_map[] = {
4218     [x86_intercept_cr_read]     = POST_EX(SVM_EXIT_READ_CR0),
4219     [x86_intercept_cr_write]    = POST_EX(SVM_EXIT_WRITE_CR0),
4220     [x86_intercept_clts]        = POST_EX(SVM_EXIT_WRITE_CR0),
4221     [x86_intercept_lmsw]        = POST_EX(SVM_EXIT_WRITE_CR0),
4222     [x86_intercept_smsw]        = POST_EX(SVM_EXIT_READ_CR0),
4223     [x86_intercept_dr_read]     = POST_EX(SVM_EXIT_READ_DR0),
4224     [x86_intercept_dr_write]    = POST_EX(SVM_EXIT_WRITE_DR0),
4225     [x86_intercept_sldt]        = POST_EX(SVM_EXIT_LDTR_READ),
4226     [x86_intercept_str]     = POST_EX(SVM_EXIT_TR_READ),
4227     [x86_intercept_lldt]        = POST_EX(SVM_EXIT_LDTR_WRITE),
4228     [x86_intercept_ltr]     = POST_EX(SVM_EXIT_TR_WRITE),
4229     [x86_intercept_sgdt]        = POST_EX(SVM_EXIT_GDTR_READ),
4230     [x86_intercept_sidt]        = POST_EX(SVM_EXIT_IDTR_READ),
4231     [x86_intercept_lgdt]        = POST_EX(SVM_EXIT_GDTR_WRITE),
4232     [x86_intercept_lidt]        = POST_EX(SVM_EXIT_IDTR_WRITE),
4233     [x86_intercept_vmrun]       = POST_EX(SVM_EXIT_VMRUN),
4234     [x86_intercept_vmmcall]     = POST_EX(SVM_EXIT_VMMCALL),
4235     [x86_intercept_vmload]      = POST_EX(SVM_EXIT_VMLOAD),
4236     [x86_intercept_vmsave]      = POST_EX(SVM_EXIT_VMSAVE),
4237     [x86_intercept_stgi]        = POST_EX(SVM_EXIT_STGI),
4238     [x86_intercept_clgi]        = POST_EX(SVM_EXIT_CLGI),
4239     [x86_intercept_skinit]      = POST_EX(SVM_EXIT_SKINIT),
4240     [x86_intercept_invlpga]     = POST_EX(SVM_EXIT_INVLPGA),
4241     [x86_intercept_rdtscp]      = POST_EX(SVM_EXIT_RDTSCP),
4242     [x86_intercept_monitor]     = POST_MEM(SVM_EXIT_MONITOR),
4243     [x86_intercept_mwait]       = POST_EX(SVM_EXIT_MWAIT),
4244     [x86_intercept_invlpg]      = POST_EX(SVM_EXIT_INVLPG),
4245     [x86_intercept_invd]        = POST_EX(SVM_EXIT_INVD),
4246     [x86_intercept_wbinvd]      = POST_EX(SVM_EXIT_WBINVD),
4247     [x86_intercept_wrmsr]       = POST_EX(SVM_EXIT_MSR),
4248     [x86_intercept_rdtsc]       = POST_EX(SVM_EXIT_RDTSC),
4249     [x86_intercept_rdmsr]       = POST_EX(SVM_EXIT_MSR),
4250     [x86_intercept_rdpmc]       = POST_EX(SVM_EXIT_RDPMC),
4251     [x86_intercept_cpuid]       = PRE_EX(SVM_EXIT_CPUID),
4252     [x86_intercept_rsm]     = PRE_EX(SVM_EXIT_RSM),
4253     [x86_intercept_pause]       = PRE_EX(SVM_EXIT_PAUSE),
4254     [x86_intercept_pushf]       = PRE_EX(SVM_EXIT_PUSHF),
4255     [x86_intercept_popf]        = PRE_EX(SVM_EXIT_POPF),
4256     [x86_intercept_intn]        = PRE_EX(SVM_EXIT_SWINT),
4257     [x86_intercept_iret]        = PRE_EX(SVM_EXIT_IRET),
4258     [x86_intercept_icebp]       = PRE_EX(SVM_EXIT_ICEBP),
4259     [x86_intercept_hlt]     = POST_EX(SVM_EXIT_HLT),
4260     [x86_intercept_in]      = POST_EX(SVM_EXIT_IOIO),
4261     [x86_intercept_ins]     = POST_EX(SVM_EXIT_IOIO),
4262     [x86_intercept_out]     = POST_EX(SVM_EXIT_IOIO),
4263     [x86_intercept_outs]        = POST_EX(SVM_EXIT_IOIO),
4264     [x86_intercept_xsetbv]      = PRE_EX(SVM_EXIT_XSETBV),
4265 };
4266 
4267 #undef PRE_EX
4268 #undef POST_EX
4269 #undef POST_MEM
4270 
4271 static int svm_check_intercept(struct kvm_vcpu *vcpu,
4272                    struct x86_instruction_info *info,
4273                    enum x86_intercept_stage stage,
4274                    struct x86_exception *exception)
4275 {
4276     struct vcpu_svm *svm = to_svm(vcpu);
4277     int vmexit, ret = X86EMUL_CONTINUE;
4278     struct __x86_intercept icpt_info;
4279     struct vmcb *vmcb = svm->vmcb;
4280 
4281     if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4282         goto out;
4283 
4284     icpt_info = x86_intercept_map[info->intercept];
4285 
4286     if (stage != icpt_info.stage)
4287         goto out;
4288 
4289     switch (icpt_info.exit_code) {
4290     case SVM_EXIT_READ_CR0:
4291         if (info->intercept == x86_intercept_cr_read)
4292             icpt_info.exit_code += info->modrm_reg;
4293         break;
4294     case SVM_EXIT_WRITE_CR0: {
4295         unsigned long cr0, val;
4296 
4297         if (info->intercept == x86_intercept_cr_write)
4298             icpt_info.exit_code += info->modrm_reg;
4299 
4300         if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4301             info->intercept == x86_intercept_clts)
4302             break;
4303 
4304         if (!(vmcb12_is_intercept(&svm->nested.ctl,
4305                     INTERCEPT_SELECTIVE_CR0)))
4306             break;
4307 
4308         cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4309         val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
4310 
4311         if (info->intercept == x86_intercept_lmsw) {
4312             cr0 &= 0xfUL;
4313             val &= 0xfUL;
4314             /* lmsw can't clear PE - catch this here */
4315             if (cr0 & X86_CR0_PE)
4316                 val |= X86_CR0_PE;
4317         }
4318 
4319         if (cr0 ^ val)
4320             icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4321 
4322         break;
4323     }
4324     case SVM_EXIT_READ_DR0:
4325     case SVM_EXIT_WRITE_DR0:
4326         icpt_info.exit_code += info->modrm_reg;
4327         break;
4328     case SVM_EXIT_MSR:
4329         if (info->intercept == x86_intercept_wrmsr)
4330             vmcb->control.exit_info_1 = 1;
4331         else
4332             vmcb->control.exit_info_1 = 0;
4333         break;
4334     case SVM_EXIT_PAUSE:
4335         /*
4336          * We only get this intercept for NOP, but PAUSE is
4337          * REP NOP, so check for the REP prefix here.
4338          */
4339         if (info->rep_prefix != REPE_PREFIX)
4340             goto out;
4341         break;
4342     case SVM_EXIT_IOIO: {
4343         u64 exit_info;
4344         u32 bytes;
4345 
4346         if (info->intercept == x86_intercept_in ||
4347             info->intercept == x86_intercept_ins) {
4348             exit_info = ((info->src_val & 0xffff) << 16) |
4349                 SVM_IOIO_TYPE_MASK;
4350             bytes = info->dst_bytes;
4351         } else {
4352             exit_info = (info->dst_val & 0xffff) << 16;
4353             bytes = info->src_bytes;
4354         }
4355 
4356         if (info->intercept == x86_intercept_outs ||
4357             info->intercept == x86_intercept_ins)
4358             exit_info |= SVM_IOIO_STR_MASK;
4359 
4360         if (info->rep_prefix)
4361             exit_info |= SVM_IOIO_REP_MASK;
4362 
4363         bytes = min(bytes, 4u);
4364 
4365         exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4366 
4367         exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4368 
4369         vmcb->control.exit_info_1 = exit_info;
4370         vmcb->control.exit_info_2 = info->next_rip;
4371 
4372         break;
4373     }
4374     default:
4375         break;
4376     }
4377 
4378     /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4379     if (static_cpu_has(X86_FEATURE_NRIPS))
4380         vmcb->control.next_rip  = info->next_rip;
4381     vmcb->control.exit_code = icpt_info.exit_code;
4382     vmexit = nested_svm_exit_handled(svm);
4383 
4384     ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4385                        : X86EMUL_CONTINUE;
4386 
4387 out:
4388     return ret;
4389 }
4390 
4391 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
4392 {
4393     if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
4394         vcpu->arch.at_instruction_boundary = true;
4395 }
4396 
4397 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4398 {
4399     if (!kvm_pause_in_guest(vcpu->kvm))
4400         shrink_ple_window(vcpu);
4401 }
4402 
4403 static void svm_setup_mce(struct kvm_vcpu *vcpu)
4404 {
4405     /* [63:9] are reserved. */
4406     vcpu->arch.mcg_cap &= 0x1ff;
4407 }
4408 
4409 bool svm_smi_blocked(struct kvm_vcpu *vcpu)
4410 {
4411     struct vcpu_svm *svm = to_svm(vcpu);
4412 
4413     /* Per APM Vol.2 15.22.2 "Response to SMI" */
4414     if (!gif_set(svm))
4415         return true;
4416 
4417     return is_smm(vcpu);
4418 }
4419 
4420 static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4421 {
4422     struct vcpu_svm *svm = to_svm(vcpu);
4423     if (svm->nested.nested_run_pending)
4424         return -EBUSY;
4425 
4426     if (svm_smi_blocked(vcpu))
4427         return 0;
4428 
4429     /* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
4430     if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
4431         return -EBUSY;
4432 
4433     return 1;
4434 }
4435 
4436 static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
4437 {
4438     struct vcpu_svm *svm = to_svm(vcpu);
4439     struct kvm_host_map map_save;
4440     int ret;
4441 
4442     if (!is_guest_mode(vcpu))
4443         return 0;
4444 
4445     /* FED8h - SVM Guest */
4446     put_smstate(u64, smstate, 0x7ed8, 1);
4447     /* FEE0h - SVM Guest VMCB Physical Address */
4448     put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
4449 
4450     svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4451     svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4452     svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4453 
4454     ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
4455     if (ret)
4456         return ret;
4457 
4458     /*
4459      * KVM uses VMCB01 to store L1 host state while L2 runs but
4460      * VMCB01 is going to be used during SMM and thus the state will
4461      * be lost. Temporarily save the non-VMLOAD/VMSAVE state to the host
4462      * save area pointed to by MSR_VM_HSAVE_PA. The APM guarantees that the
4463      * format of the area is identical to the guest save area, offset
4464      * by 0x400 (matching the offset of 'struct vmcb_save_area'
4465      * within 'struct vmcb'). Note: HSAVE area may also be used by
4466      * L1 hypervisor to save additional host context (e.g. KVM does
4467      * that, see svm_prepare_switch_to_guest()) which must be
4468      * preserved.
4469      */
4470     if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
4471              &map_save) == -EINVAL)
4472         return 1;
4473 
4474     BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
4475 
4476     svm_copy_vmrun_state(map_save.hva + 0x400,
4477                  &svm->vmcb01.ptr->save);
4478 
4479     kvm_vcpu_unmap(vcpu, &map_save, true);
4480     return 0;
4481 }
4482 
4483 static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
4484 {
4485     struct vcpu_svm *svm = to_svm(vcpu);
4486     struct kvm_host_map map, map_save;
4487     u64 saved_efer, vmcb12_gpa;
4488     struct vmcb *vmcb12;
4489     int ret;
4490 
4491     if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4492         return 0;
4493 
4494     /* Non-zero if SMI arrived while vCPU was in guest mode. */
4495     if (!GET_SMSTATE(u64, smstate, 0x7ed8))
4496         return 0;
4497 
4498     if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4499         return 1;
4500 
4501     saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
4502     if (!(saved_efer & EFER_SVME))
4503         return 1;
4504 
4505     vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
4506     if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
4507         return 1;
4508 
4509     ret = 1;
4510     if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL)
4511         goto unmap_map;
4512 
4513     if (svm_allocate_nested(svm))
4514         goto unmap_save;
4515 
4516     /*
4517      * Restore L1 host state from L1 HSAVE area as VMCB01 was
4518      * used during SMM (see svm_enter_smm())
4519      */
4520 
4521     svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
4522 
4523     /*
4524      * Enter the nested guest now
4525      */
4526 
4527     vmcb_mark_all_dirty(svm->vmcb01.ptr);
4528 
4529     vmcb12 = map.hva;
4530     nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
4531     nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
4532     ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
4533 
4534     if (ret)
4535         goto unmap_save;
4536 
4537     svm->nested.nested_run_pending = 1;
4538 
4539 unmap_save:
4540     kvm_vcpu_unmap(vcpu, &map_save, true);
4541 unmap_map:
4542     kvm_vcpu_unmap(vcpu, &map, true);
4543     return ret;
4544 }
4545 
4546 static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
4547 {
4548     struct vcpu_svm *svm = to_svm(vcpu);
4549 
4550     if (!gif_set(svm)) {
4551         if (vgif)
4552             svm_set_intercept(svm, INTERCEPT_STGI);
4553         /* STGI will cause a vm exit */
4554     } else {
4555         /* We must be in SMM; RSM will cause a vmexit anyway.  */
4556     }
4557 }
4558 
4559 static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
4560                     void *insn, int insn_len)
4561 {
4562     bool smep, smap, is_user;
4563     unsigned long cr4;
4564     u64 error_code;
4565 
4566     /* Emulation is always possible when KVM has access to all guest state. */
4567     if (!sev_guest(vcpu->kvm))
4568         return true;
4569 
4570     /* #UD and #GP should never be intercepted for SEV guests. */
4571     WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
4572                   EMULTYPE_TRAP_UD_FORCED |
4573                   EMULTYPE_VMWARE_GP));
4574 
4575     /*
4576      * Emulation is impossible for SEV-ES guests as KVM doesn't have access
4577      * to guest register state.
4578      */
4579     if (sev_es_guest(vcpu->kvm))
4580         return false;
4581 
4582     /*
4583      * Emulation is possible if the instruction is already decoded, e.g.
4584      * when completing I/O after returning from userspace.
4585      */
4586     if (emul_type & EMULTYPE_NO_DECODE)
4587         return true;
4588 
4589     /*
4590      * Emulation is possible for SEV guests if and only if a prefilled
4591      * buffer containing the bytes of the intercepted instruction is
4592      * available. SEV guest memory is encrypted with a guest-specific key
4593      * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
4594      * decode garbage.
4595      *
4596      * Inject #UD if KVM reached this point without an instruction buffer.
4597      * In practice, this path should never be hit by a well-behaved guest,
4598      * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
4599      * is still theoretically reachable, e.g. via unaccelerated fault-like
4600      * AVIC access, and needs to be handled by KVM to avoid putting the
4601      * guest into an infinite loop.  Injecting #UD is somewhat arbitrary,
4602      * but it's the least awful option given the lack of insight into the guest.
4603      */
4604     if (unlikely(!insn)) {
4605         kvm_queue_exception(vcpu, UD_VECTOR);
4606         return false;
4607     }
4608 
4609     /*
4610      * Emulate for SEV guests if the insn buffer is not empty.  The buffer
4611      * will be empty if the DecodeAssist microcode cannot fetch bytes for
4612      * the faulting instruction because the code fetch itself faulted, e.g.
4613      * the guest attempted to fetch from emulated MMIO or a guest page
4614      * table used to translate CS:RIP resides in emulated MMIO.
4615      */
4616     if (likely(insn_len))
4617         return true;
4618 
4619     /*
4620      * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
4621      *
4622      * Errata:
4623      * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
4624      * possible that CPU microcode implementing DecodeAssist will fail to
4625      * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
4626      * be '0'.  This happens because microcode reads CS:RIP using a _data_
4627      * load uop with CPL=0 privileges.  If the load hits a SMAP #PF, ucode
4628      * gives up and does not fill the instruction bytes buffer.
4629      *
4630      * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
4631      * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
4632      * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
4633      * GuestIntrBytes field of the VMCB.
4634      *
4635      * This does _not_ mean that the erratum has been encountered, as the
4636      * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
4637      * #PF, e.g. if the guest attempted to execute from emulated MMIO and
4638      * encountered a reserved/not-present #PF.
4639      *
4640      * To hit the erratum, the following conditions must be true:
4641      *    1. CR4.SMAP=1 (obviously).
4642      *    2. CR4.SMEP=0 || CPL=3.  If SMEP=1 and CPL<3, the erratum cannot
4643      *       have been hit as the guest would have encountered a SMEP
4644      *       violation #PF, not a #NPF.
4645      *    3. The #NPF is not due to a code fetch, in which case failure to
4646      *       retrieve the instruction bytes is legitimate (see above).
4647      *
4648      * In addition, don't apply the erratum workaround if the #NPF occurred
4649      * while translating guest page tables (see below).
4650      */
4651     error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
4652     if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
4653         goto resume_guest;
4654 
4655     cr4 = kvm_read_cr4(vcpu);
4656     smep = cr4 & X86_CR4_SMEP;
4657     smap = cr4 & X86_CR4_SMAP;
4658     is_user = svm_get_cpl(vcpu) == 3;
4659     if (smap && (!smep || is_user)) {
4660         pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
4661 
4662         /*
4663          * If the fault occurred in userspace, arbitrarily inject #GP
4664          * to avoid killing the guest and to hopefully avoid confusing
4665          * the guest kernel too much, e.g. injecting #PF would not be
4666          * coherent with respect to the guest's page tables.  Request
4667          * triple fault if the fault occurred in the kernel as there's
4668          * no fault that KVM can inject without confusing the guest.
4669          * In practice, the triple fault is moot as no sane SEV kernel
4670          * will execute from user memory while also running with SMAP=1.
4671          */
4672         if (is_user)
4673             kvm_inject_gp(vcpu, 0);
4674         else
4675             kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4676     }
4677 
4678 resume_guest:
4679     /*
4680      * If the erratum was not hit, simply resume the guest and let it fault
4681      * again.  While awful, e.g. the vCPU may get stuck in an infinite loop
4682      * if the fault is at CPL=0, it's the lesser of all evils.  Exiting to
4683      * userspace will kill the guest, and letting the emulator read garbage
4684      * will yield random behavior and potentially corrupt the guest.
4685      *
4686      * Simply resuming the guest is technically not a violation of the SEV
4687      * architecture.  AMD's APM states that all code fetches and page table
4688      * accesses for SEV guest are encrypted, regardless of the C-Bit.  The
4689      * APM also states that encrypted accesses to MMIO are "ignored", but
4690      * doesn't explicitly define "ignored", i.e. doing nothing and letting
4691      * the guest spin is technically "ignoring" the access.
4692      */
4693     return false;
4694 }
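/*
 * Editor's sketch (not KVM code): the erratum-1096 handling above boils down
 * to a predicate over a handful of bits.  A self-contained version of that
 * decision, with hypothetical names for the inputs.
 */
static int demo_maybe_erratum_1096(unsigned int insn_len, int npf_on_fetch,
                                   int npf_on_guest_page_table,
                                   int smap, int smep, int cpl)
{
    /* DecodeAssist produced instruction bytes: nothing to work around. */
    if (insn_len)
        return 0;

    /* Code fetches and guest page-table walks legitimately give no bytes. */
    if (npf_on_fetch || npf_on_guest_page_table)
        return 0;

    /* The erratum requires SMAP=1 and either SMEP=0 or a CPL3 access. */
    return smap && (!smep || cpl == 3);
}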
4695 
4696 static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
4697 {
4698     struct vcpu_svm *svm = to_svm(vcpu);
4699 
4700     /*
4701      * TODO: The last condition latches INIT signals on the vCPU when the
4702      * vCPU is in guest mode and vmcb12 defines an intercept on INIT.
4703      * To properly emulate the INIT intercept,
4704      * svm_check_nested_events() should call nested_svm_vmexit()
4705      * if an INIT signal is pending.
4706      */
4707     return !gif_set(svm) ||
4708            (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT));
4709 }
4710 
4711 static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
4712 {
4713     if (!sev_es_guest(vcpu->kvm))
4714         return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
4715 
4716     sev_vcpu_deliver_sipi_vector(vcpu, vector);
4717 }
4718 
4719 static void svm_vm_destroy(struct kvm *kvm)
4720 {
4721     avic_vm_destroy(kvm);
4722     sev_vm_destroy(kvm);
4723 }
4724 
4725 static int svm_vm_init(struct kvm *kvm)
4726 {
4727     if (!pause_filter_count || !pause_filter_thresh)
4728         kvm->arch.pause_in_guest = true;
4729 
4730     if (enable_apicv) {
4731         int ret = avic_vm_init(kvm);
4732         if (ret)
4733             return ret;
4734     }
4735 
4736     return 0;
4737 }
4738 
4739 static struct kvm_x86_ops svm_x86_ops __initdata = {
4740     .name = "kvm_amd",
4741 
4742     .hardware_unsetup = svm_hardware_unsetup,
4743     .hardware_enable = svm_hardware_enable,
4744     .hardware_disable = svm_hardware_disable,
4745     .has_emulated_msr = svm_has_emulated_msr,
4746 
4747     .vcpu_create = svm_vcpu_create,
4748     .vcpu_free = svm_vcpu_free,
4749     .vcpu_reset = svm_vcpu_reset,
4750 
4751     .vm_size = sizeof(struct kvm_svm),
4752     .vm_init = svm_vm_init,
4753     .vm_destroy = svm_vm_destroy,
4754 
4755     .prepare_switch_to_guest = svm_prepare_switch_to_guest,
4756     .vcpu_load = svm_vcpu_load,
4757     .vcpu_put = svm_vcpu_put,
4758     .vcpu_blocking = avic_vcpu_blocking,
4759     .vcpu_unblocking = avic_vcpu_unblocking,
4760 
4761     .update_exception_bitmap = svm_update_exception_bitmap,
4762     .get_msr_feature = svm_get_msr_feature,
4763     .get_msr = svm_get_msr,
4764     .set_msr = svm_set_msr,
4765     .get_segment_base = svm_get_segment_base,
4766     .get_segment = svm_get_segment,
4767     .set_segment = svm_set_segment,
4768     .get_cpl = svm_get_cpl,
4769     .get_cs_db_l_bits = svm_get_cs_db_l_bits,
4770     .set_cr0 = svm_set_cr0,
4771     .post_set_cr3 = sev_post_set_cr3,
4772     .is_valid_cr4 = svm_is_valid_cr4,
4773     .set_cr4 = svm_set_cr4,
4774     .set_efer = svm_set_efer,
4775     .get_idt = svm_get_idt,
4776     .set_idt = svm_set_idt,
4777     .get_gdt = svm_get_gdt,
4778     .set_gdt = svm_set_gdt,
4779     .set_dr7 = svm_set_dr7,
4780     .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
4781     .cache_reg = svm_cache_reg,
4782     .get_rflags = svm_get_rflags,
4783     .set_rflags = svm_set_rflags,
4784     .get_if_flag = svm_get_if_flag,
4785 
4786     .flush_tlb_all = svm_flush_tlb_current,
4787     .flush_tlb_current = svm_flush_tlb_current,
4788     .flush_tlb_gva = svm_flush_tlb_gva,
4789     .flush_tlb_guest = svm_flush_tlb_current,
4790 
4791     .vcpu_pre_run = svm_vcpu_pre_run,
4792     .vcpu_run = svm_vcpu_run,
4793     .handle_exit = svm_handle_exit,
4794     .skip_emulated_instruction = svm_skip_emulated_instruction,
4795     .update_emulated_instruction = NULL,
4796     .set_interrupt_shadow = svm_set_interrupt_shadow,
4797     .get_interrupt_shadow = svm_get_interrupt_shadow,
4798     .patch_hypercall = svm_patch_hypercall,
4799     .inject_irq = svm_inject_irq,
4800     .inject_nmi = svm_inject_nmi,
4801     .queue_exception = svm_queue_exception,
4802     .cancel_injection = svm_cancel_injection,
4803     .interrupt_allowed = svm_interrupt_allowed,
4804     .nmi_allowed = svm_nmi_allowed,
4805     .get_nmi_mask = svm_get_nmi_mask,
4806     .set_nmi_mask = svm_set_nmi_mask,
4807     .enable_nmi_window = svm_enable_nmi_window,
4808     .enable_irq_window = svm_enable_irq_window,
4809     .update_cr8_intercept = svm_update_cr8_intercept,
4810     .set_virtual_apic_mode = avic_set_virtual_apic_mode,
4811     .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
4812     .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
4813     .apicv_post_state_restore = avic_apicv_post_state_restore,
4814 
4815     .get_exit_info = svm_get_exit_info,
4816 
4817     .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
4818 
4819     .has_wbinvd_exit = svm_has_wbinvd_exit,
4820 
4821     .get_l2_tsc_offset = svm_get_l2_tsc_offset,
4822     .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
4823     .write_tsc_offset = svm_write_tsc_offset,
4824     .write_tsc_multiplier = svm_write_tsc_multiplier,
4825 
4826     .load_mmu_pgd = svm_load_mmu_pgd,
4827 
4828     .check_intercept = svm_check_intercept,
4829     .handle_exit_irqoff = svm_handle_exit_irqoff,
4830 
4831     .request_immediate_exit = __kvm_request_immediate_exit,
4832 
4833     .sched_in = svm_sched_in,
4834 
4835     .nested_ops = &svm_nested_ops,
4836 
4837     .deliver_interrupt = svm_deliver_interrupt,
4838     .pi_update_irte = avic_pi_update_irte,
4839     .setup_mce = svm_setup_mce,
4840 
4841     .smi_allowed = svm_smi_allowed,
4842     .enter_smm = svm_enter_smm,
4843     .leave_smm = svm_leave_smm,
4844     .enable_smi_window = svm_enable_smi_window,
4845 
4846     .mem_enc_ioctl = sev_mem_enc_ioctl,
4847     .mem_enc_register_region = sev_mem_enc_register_region,
4848     .mem_enc_unregister_region = sev_mem_enc_unregister_region,
4849     .guest_memory_reclaimed = sev_guest_memory_reclaimed,
4850 
4851     .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
4852     .vm_move_enc_context_from = sev_vm_move_enc_context_from,
4853 
4854     .can_emulate_instruction = svm_can_emulate_instruction,
4855 
4856     .apic_init_signal_blocked = svm_apic_init_signal_blocked,
4857 
4858     .msr_filter_changed = svm_msr_filter_changed,
4859     .complete_emulated_msr = svm_complete_emulated_msr,
4860 
4861     .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
4862     .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
4863 };
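
/*
 * Nothing consumes svm_x86_ops directly here; it is plugged in as the
 * .runtime_ops member of svm_init_ops below and handed to kvm_init()
 * from svm_init(), which is how the common x86 code ends up calling
 * these hooks.
 */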
4864 
4865 /*
4866  * The default MMIO mask is a single bit (excluding the present bit),
4867  * which could conflict with the memory encryption bit. Check for
4868  * memory encryption support and override the default MMIO mask if
4869  * memory encryption is enabled.
4870  */
4871 static __init void svm_adjust_mmio_mask(void)
4872 {
4873     unsigned int enc_bit, mask_bit;
4874     u64 msr, mask;
4875 
4876     /* If there is no memory encryption support, use existing mask */
4877     if (cpuid_eax(0x80000000) < 0x8000001f)
4878         return;
4879 
4880     /* If memory encryption is not enabled, use existing mask */
4881     rdmsrl(MSR_AMD64_SYSCFG, msr);
4882     if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
4883         return;
4884 
4885     enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
4886     mask_bit = boot_cpu_data.x86_phys_bits;
4887 
4888     /* Increment the mask bit if it is the same as the encryption bit */
4889     if (enc_bit == mask_bit)
4890         mask_bit++;
4891 
4892     /*
4893      * If the mask bit location is below 52, then some bits above the
4894      * physical addressing limit will always be reserved, so use the
4895      * rsvd_bits() function to generate the mask. This mask, along with
4896      * the present bit, will be used to generate a page fault with
4897      * PFER.RSV = 1.
4898      *
4899      * If the mask bit location is 52 (or above), then clear the mask.
4900      */
4901     mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
4902 
4903     kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
4904 }
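
/*
 * Worked example with made-up but representative numbers: if the CPU
 * reports the C-bit at position 47 (CPUID 0x8000001F.EBX[5:0] = 47)
 * and boot_cpu_data.x86_phys_bits is 48, then enc_bit != mask_bit and
 * mask_bit stays 48, giving
 *
 *     mask = rsvd_bits(48, 51) | PT_PRESENT_MASK
 *
 * i.e. bits 51:48 plus the present bit, which is guaranteed to be
 * reserved and does not collide with the encryption bit.  If mask_bit
 * reached 52, the mask would simply be cleared, as no reserved physical
 * address bits would remain for flagging MMIO SPTEs.
 */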
4905 
4906 static __init void svm_set_cpu_caps(void)
4907 {
4908     kvm_set_cpu_caps();
4909 
4910     kvm_caps.supported_xss = 0;
4911 
4912     /* CPUID 0x80000001 and 0x8000000A (SVM features) */
4913     if (nested) {
4914         kvm_cpu_cap_set(X86_FEATURE_SVM);
4915         kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
4916 
4917         if (nrips)
4918             kvm_cpu_cap_set(X86_FEATURE_NRIPS);
4919 
4920         if (npt_enabled)
4921             kvm_cpu_cap_set(X86_FEATURE_NPT);
4922 
4923         if (tsc_scaling)
4924             kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
4925 
4926         if (vls)
4927             kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
4928         if (lbrv)
4929             kvm_cpu_cap_set(X86_FEATURE_LBRV);
4930 
4931         if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
4932             kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
4933 
4934         if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
4935             kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
4936 
4937         if (vgif)
4938             kvm_cpu_cap_set(X86_FEATURE_VGIF);
4939 
4940         /* Nested VM can receive #VMEXIT instead of triggering #GP */
4941         kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
4942     }
4943 
4944     /* CPUID 0x80000008 */
4945     if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
4946         boot_cpu_has(X86_FEATURE_AMD_SSBD))
4947         kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
4948 
4949     /* AMD PMU PERFCTR_CORE CPUID */
4950     if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
4951         kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
4952 
4953     /* CPUID 0x8000001F (SME/SEV features) */
4954     sev_set_cpu_caps();
4955 }
4956 
4957 static __init int svm_hardware_setup(void)
4958 {
4959     int cpu;
4960     struct page *iopm_pages;
4961     void *iopm_va;
4962     int r;
4963     unsigned int order = get_order(IOPM_SIZE);
4964 
4965     /*
4966      * NX is required for shadow paging and for NPT if the NX huge pages
4967      * mitigation is enabled.
4968      */
4969     if (!boot_cpu_has(X86_FEATURE_NX)) {
4970         pr_err_ratelimited("NX (Execute Disable) not supported\n");
4971         return -EOPNOTSUPP;
4972     }
4973     kvm_enable_efer_bits(EFER_NX);
4974 
4975     iopm_pages = alloc_pages(GFP_KERNEL, order);
4976 
4977     if (!iopm_pages)
4978         return -ENOMEM;
4979 
4980     iopm_va = page_address(iopm_pages);
4981     memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
4982     iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
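    /*
     * Filling the IOPM with all ones means every I/O port access is
     * intercepted by default.  Note that alloc_pages() hands back
     * 2^order pages, which can be more than IOPM_SIZE requires, so the
     * memset deliberately covers PAGE_SIZE * (1 << order) rather than
     * IOPM_SIZE.
     */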
4983 
4984     init_msrpm_offsets();
4985 
4986     kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
4987                      XFEATURE_MASK_BNDCSR);
4988 
4989     if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
4990         kvm_enable_efer_bits(EFER_FFXSR);
4991 
4992     if (tsc_scaling) {
4993         if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
4994             tsc_scaling = false;
4995         } else {
4996             pr_info("TSC scaling supported\n");
4997             kvm_caps.has_tsc_control = true;
4998         }
4999     }
5000     kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
5001     kvm_caps.tsc_scaling_ratio_frac_bits = 32;
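    /*
     * The hardware TSC ratio MSR is a fixed-point value with 32
     * fractional bits (an 8.32 format on current parts), which is why
     * tsc_scaling_ratio_frac_bits is 32.  For example, running a guest
     * at 1.5x the host TSC frequency corresponds to a ratio of
     * 1.5 * 2^32 = 0x180000000.
     */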
5002 
5003     tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
5004 
5005     /* Check for pause filtering support */
5006     if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
5007         pause_filter_count = 0;
5008         pause_filter_thresh = 0;
5009     } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
5010         pause_filter_thresh = 0;
5011     }
5012 
5013     if (nested) {
5014         printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
5015         kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
5016     }
5017 
5018     /*
5019      * KVM's MMU doesn't support using 2-level paging for itself, and thus
5020      * NPT isn't supported if the host is using 2-level paging since host
5021      * CR4 is unchanged on VMRUN.
5022      */
5023     if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
5024         npt_enabled = false;
5025 
5026     if (!boot_cpu_has(X86_FEATURE_NPT))
5027         npt_enabled = false;
5028 
5029     /* Force VM NPT level equal to the host's paging level */
5030     kvm_configure_mmu(npt_enabled, get_npt_level(),
5031               get_npt_level(), PG_LEVEL_1G);
5032     pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
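    /*
     * For example, on a host running with 4-level paging,
     * get_npt_level() returns the 4-level root value, so the guest's
     * nested page tables are also 4-level and the largest page size
     * KVM will use for NPT mappings is 1GiB (PG_LEVEL_1G above); a
     * 5-level host likewise forces 5-level NPT.
     */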
5033 
5034     /* Setup shadow_me_value and shadow_me_mask */
5035     kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
5036 
5037     svm_adjust_mmio_mask();
5038 
5039     /*
5040      * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
5041      * may be modified by svm_adjust_mmio_mask()).
5042      */
5043     sev_hardware_setup();
5044 
5045     svm_hv_hardware_setup();
5046 
5047     for_each_possible_cpu(cpu) {
5048         r = svm_cpu_init(cpu);
5049         if (r)
5050             goto err;
5051     }
5052 
5053     if (nrips) {
5054         if (!boot_cpu_has(X86_FEATURE_NRIPS))
5055             nrips = false;
5056     }
5057 
5058     enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops);
5059 
5060     if (!enable_apicv) {
5061         svm_x86_ops.vcpu_blocking = NULL;
5062         svm_x86_ops.vcpu_unblocking = NULL;
5063         svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
5064     }
5065 
5066     if (vls) {
5067         if (!npt_enabled ||
5068             !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
5069             !IS_ENABLED(CONFIG_X86_64)) {
5070             vls = false;
5071         } else {
5072             pr_info("Virtual VMLOAD VMSAVE supported\n");
5073         }
5074     }
5075 
5076     if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
5077         svm_gp_erratum_intercept = false;
5078 
5079     if (vgif) {
5080         if (!boot_cpu_has(X86_FEATURE_VGIF))
5081             vgif = false;
5082         else
5083             pr_info("Virtual GIF supported\n");
5084     }
5085 
5086     if (lbrv) {
5087         if (!boot_cpu_has(X86_FEATURE_LBRV))
5088             lbrv = false;
5089         else
5090             pr_info("LBR virtualization supported\n");
5091     }
5092 
5093     if (!enable_pmu)
5094         pr_info("PMU virtualization is disabled\n");
5095 
5096     svm_set_cpu_caps();
5097 
5098     /*
5099      * It seems that on AMD processors the PTE's accessed bit is
5100      * set by the CPU hardware before the NPF vmexit.  This is not
5101      * the expected behaviour, and our tests fail because of it.
5102      *
5103      * The workaround here is to disable support for
5104      * GUEST_MAXPHYADDR < HOST_MAXPHYADDR when NPT is enabled.  In
5105      * that case userspace can query the KVM_CAP_SMALLER_MAXPHYADDR
5106      * extension to learn whether the feature is supported and
5107      * decide how to handle its absence.
5108      * If future AMD CPU models change the behaviour described
5109      * above, this variable can be changed accordingly.
5110      */
5111     allow_smaller_maxphyaddr = !npt_enabled;
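    /*
     * A userspace check could look roughly like the following sketch
     * (illustrative only, not KVM code):
     *
     *     if (ioctl(kvm_fd, KVM_CHECK_EXTENSION,
     *               KVM_CAP_SMALLER_MAXPHYADDR) > 0)
     *             ; // OK to give the guest a smaller MAXPHYADDR
     *
     * With NPT enabled the capability should report 0 here, matching
     * the assignment above.
     */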
5112 
5113     return 0;
5114 
5115 err:
5116     svm_hardware_unsetup();
5117     return r;
5118 }
5119 
5120 
5121 static struct kvm_x86_init_ops svm_init_ops __initdata = {
5122     .cpu_has_kvm_support = has_svm,
5123     .disabled_by_bios = is_disabled,
5124     .hardware_setup = svm_hardware_setup,
5125     .check_processor_compatibility = svm_check_processor_compat,
5126 
5127     .runtime_ops = &svm_x86_ops,
5128     .pmu_ops = &amd_pmu_ops,
5129 };
5130 
5131 static int __init svm_init(void)
5132 {
5133     __unused_size_checks();
5134 
5135     return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
5136             __alignof__(struct vcpu_svm), THIS_MODULE);
5137 }
5138 
5139 static void __exit svm_exit(void)
5140 {
5141     kvm_exit();
5142 }
5143 
5144 module_init(svm_init)
5145 module_exit(svm_exit)