0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Kernel-based Virtual Machine driver for Linux
0004  * cpuid support routines
0005  *
0006  * derived from arch/x86/kvm/x86.c
0007  *
0008  * Copyright 2011 Red Hat, Inc. and/or its affiliates.
0009  * Copyright IBM Corporation, 2008
0010  */
0011 
0012 #include <linux/kvm_host.h>
0013 #include <linux/export.h>
0014 #include <linux/vmalloc.h>
0015 #include <linux/uaccess.h>
0016 #include <linux/sched/stat.h>
0017 
0018 #include <asm/processor.h>
0019 #include <asm/user.h>
0020 #include <asm/fpu/xstate.h>
0021 #include <asm/sgx.h>
0022 #include <asm/cpuid.h>
0023 #include "cpuid.h"
0024 #include "lapic.h"
0025 #include "mmu.h"
0026 #include "trace.h"
0027 #include "pmu.h"
0028 
0029 /*
0030  * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
0031  * aligned to sizeof(unsigned long) because it's not accessed via bitops.
0032  */
0033 u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
0034 EXPORT_SYMBOL_GPL(kvm_cpu_caps);
0035 
0036 u32 xstate_required_size(u64 xstate_bv, bool compacted)
0037 {
0038     int feature_bit = 0;
0039     u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
0040 
0041     xstate_bv &= XFEATURE_MASK_EXTEND;
0042     while (xstate_bv) {
0043         if (xstate_bv & 0x1) {
0044             u32 eax, ebx, ecx, edx, offset;
0045             cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
0046             /* ECX[1]: 64B alignment in compacted form */
0047             if (compacted)
0048                 offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret;
0049             else
0050                 offset = ebx;
0051             ret = max(ret, offset + eax);
0052         }
0053 
0054         xstate_bv >>= 1;
0055         feature_bit++;
0056     }
0057 
0058     return ret;
0059 }
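
/*
 * Worked example, for illustration only: assume xstate_bv enables AVX
 * (bit 2) and MPX BNDREGS (bit 3), and that CPUID.0xD reports AVX at
 * offset 576 with size 256 and BNDREGS at offset 960 with size 64 in the
 * standard (non-compacted) format.
 *
 *   standard : ret = max(576 + 256, 960 + 64)              = 1024
 *   compacted: ret = 576, 576 + 256 = 832 (AVX),
 *              832 + 64 = 896 (BNDREGS)                    = 896
 *
 * The compacted format packs enabled areas back to back (64-byte aligned
 * when ECX[1] is set), while the standard format uses the fixed offsets
 * from EBX.
 */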
0060 
0061 /*
0062  * This one is tied to SSB in the user API, and not
0063  * visible in /proc/cpuinfo.
0064  */
0065 #define KVM_X86_FEATURE_PSFD        (13*32+28) /* Predictive Store Forwarding Disable */
0066 
0067 #define F feature_bit
0068 #define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
0069 
0070 /*
0071  * Magic value used by KVM when querying userspace-provided CPUID entries and
0072  * doesn't care about the CPUID index because the index of the function in
0073  * question is not significant.  Note, this magic value must have at least one
0074  * bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find()
0075  * to avoid false positives when processing guest CPUID input.
0076  */
0077 #define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull
0078 
0079 static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
0080     struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index)
0081 {
0082     struct kvm_cpuid_entry2 *e;
0083     int i;
0084 
0085     for (i = 0; i < nent; i++) {
0086         e = &entries[i];
0087 
0088         if (e->function != function)
0089             continue;
0090 
0091         /*
0092          * If the index isn't significant, use the first entry with a
0093          * matching function.  It's userspace's responsibility to not
0094          * provide "duplicate" entries in all cases.
0095          */
0096         if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index)
0097             return e;
0098 
0099 
0100         /*
0101          * Similarly, use the first matching entry if KVM is doing a
0102          * lookup (as opposed to emulating CPUID) for a function that's
0103          * architecturally defined as not having a significant index.
0104          */
0105         if (index == KVM_CPUID_INDEX_NOT_SIGNIFICANT) {
0106             /*
0107              * Direct lookups from KVM should not diverge from what
0108              * KVM defines internally (the architectural behavior).
0109              */
0110             WARN_ON_ONCE(cpuid_function_is_indexed(function));
0111             return e;
0112         }
0113     }
0114 
0115     return NULL;
0116 }
0117 
0118 static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
0119                struct kvm_cpuid_entry2 *entries,
0120                int nent)
0121 {
0122     struct kvm_cpuid_entry2 *best;
0123     u64 xfeatures;
0124 
0125     /*
0126      * The existing code assumes virtual addresses are 48-bit or 57-bit in
0127      * the canonical address checks; reject CPUID data that claims otherwise.
0128      */
0129     best = cpuid_entry2_find(entries, nent, 0x80000008,
0130                  KVM_CPUID_INDEX_NOT_SIGNIFICANT);
0131     if (best) {
0132         int vaddr_bits = (best->eax & 0xff00) >> 8;
0133 
0134         if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
0135             return -EINVAL;
0136     }
0137 
0138     /*
0139      * Exposing dynamic xfeatures to the guest requires additional
0140      * enabling in the FPU, e.g. to expand the guest XSAVE state size.
0141      */
0142     best = cpuid_entry2_find(entries, nent, 0xd, 0);
0143     if (!best)
0144         return 0;
0145 
0146     xfeatures = best->eax | ((u64)best->edx << 32);
0147     xfeatures &= XFEATURE_MASK_USER_DYNAMIC;
0148     if (!xfeatures)
0149         return 0;
0150 
0151     return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures);
0152 }
0153 
0154 /* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */
0155 static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
0156                  int nent)
0157 {
0158     struct kvm_cpuid_entry2 *orig;
0159     int i;
0160 
0161     if (nent != vcpu->arch.cpuid_nent)
0162         return -EINVAL;
0163 
0164     for (i = 0; i < nent; i++) {
0165         orig = &vcpu->arch.cpuid_entries[i];
0166         if (e2[i].function != orig->function ||
0167             e2[i].index != orig->index ||
0168             e2[i].flags != orig->flags ||
0169             e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
0170             e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
0171             return -EINVAL;
0172     }
0173 
0174     return 0;
0175 }
0176 
0177 static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
0178 {
0179     u32 function;
0180     struct kvm_cpuid_entry2 *entry;
0181 
0182     vcpu->arch.kvm_cpuid_base = 0;
0183 
0184     for_each_possible_hypervisor_cpuid_base(function) {
0185         entry = kvm_find_cpuid_entry(vcpu, function);
0186 
0187         if (entry) {
0188             u32 signature[3];
0189 
0190             signature[0] = entry->ebx;
0191             signature[1] = entry->ecx;
0192             signature[2] = entry->edx;
0193 
0194             BUILD_BUG_ON(sizeof(signature) > sizeof(KVM_SIGNATURE));
0195             if (!memcmp(signature, KVM_SIGNATURE, sizeof(signature))) {
0196                 vcpu->arch.kvm_cpuid_base = function;
0197                 break;
0198             }
0199         }
0200     }
0201 }
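
/*
 * For illustration only (not compiled): the guest-side counterpart of the
 * loop above.  Hypervisors expose leaves in the 0x40000000-0x4fffffff
 * range at 0x100-aligned bases, and KVM identifies itself by splitting
 * "KVMKVMKVM\0\0\0" across EBX/ECX/EDX.  The scan bound below is an
 * assumption for the sketch; cpuid() is the kernel helper of that name.
 */
#if 0
static u32 guest_find_kvm_base(void)
{
    u32 base, eax, signature[3];

    for (base = 0x40000000; base < 0x40010000; base += 0x100) {
        cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);
        if (!memcmp(signature, KVM_SIGNATURE, sizeof(signature)))
            return base;    /* what kvm_update_kvm_cpuid_base() records */
    }
    return 0;
}
#endif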
0202 
0203 static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu,
0204                           struct kvm_cpuid_entry2 *entries, int nent)
0205 {
0206     u32 base = vcpu->arch.kvm_cpuid_base;
0207 
0208     if (!base)
0209         return NULL;
0210 
0211     return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES,
0212                  KVM_CPUID_INDEX_NOT_SIGNIFICANT);
0213 }
0214 
0215 static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
0216 {
0217     return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
0218                          vcpu->arch.cpuid_nent);
0219 }
0220 
0221 void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
0222 {
0223     struct kvm_cpuid_entry2 *best = kvm_find_kvm_cpuid_features(vcpu);
0224 
0225     /*
0226      * save the feature bitmap to avoid cpuid lookup for every PV
0227      * operation
0228      */
0229     if (best)
0230         vcpu->arch.pv_cpuid.features = best->eax;
0231 }
0232 
0233 /*
0234  * Calculate guest's supported XCR0 taking into account guest CPUID data and
0235  * KVM's supported XCR0 (the intersection of host XCR0 and KVM_SUPPORTED_XCR0).
0236  */
0237 static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent)
0238 {
0239     struct kvm_cpuid_entry2 *best;
0240 
0241     best = cpuid_entry2_find(entries, nent, 0xd, 0);
0242     if (!best)
0243         return 0;
0244 
0245     return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0;
0246 }
0247 
0248 static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
0249                        int nent)
0250 {
0251     struct kvm_cpuid_entry2 *best;
0252     u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent);
0253 
0254     best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
0255     if (best) {
0256         /* Update OSXSAVE bit */
0257         if (boot_cpu_has(X86_FEATURE_XSAVE))
0258             cpuid_entry_change(best, X86_FEATURE_OSXSAVE,
0259                    kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE));
0260 
0261         cpuid_entry_change(best, X86_FEATURE_APIC,
0262                vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
0263     }
0264 
0265     best = cpuid_entry2_find(entries, nent, 7, 0);
0266     if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
0267         cpuid_entry_change(best, X86_FEATURE_OSPKE,
0268                    kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
0269 
0270     best = cpuid_entry2_find(entries, nent, 0xD, 0);
0271     if (best)
0272         best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
0273 
0274     best = cpuid_entry2_find(entries, nent, 0xD, 1);
0275     if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
0276              cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
0277         best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
0278 
0279     best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent);
0280     if (kvm_hlt_in_guest(vcpu->kvm) && best &&
0281         (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
0282         best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
0283 
0284     if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
0285         best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
0286         if (best)
0287             cpuid_entry_change(best, X86_FEATURE_MWAIT,
0288                        vcpu->arch.ia32_misc_enable_msr &
0289                        MSR_IA32_MISC_ENABLE_MWAIT);
0290     }
0291 
0292     /*
0293      * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
0294      * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
0295      * requested XCR0 value.  The enclave's XFRM must be a subset of XCR0
0296      * at the time of EENTER, thus adjust the allowed XFRM by the guest's
0297      * supported XCR0.  Similar to XCR0 handling, FP and SSE are forced to
0298      * '1' even on CPUs that don't support XSAVE.
0299      */
0300     best = cpuid_entry2_find(entries, nent, 0x12, 0x1);
0301     if (best) {
0302         best->ecx &= guest_supported_xcr0 & 0xffffffff;
0303         best->edx &= guest_supported_xcr0 >> 32;
0304         best->ecx |= XFEATURE_MASK_FPSSE;
0305     }
0306 }
0307 
0308 void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
0309 {
0310     __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
0311 }
0312 EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
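
/*
 * Note: the runtime bits handled above mirror live guest state, so x86.c
 * re-invokes this helper when that state changes, e.g. when CR4.OSXSAVE or
 * CR4.PKE is toggled, or when MSR_IA32_MISC_ENABLE's MWAIT bit is flipped,
 * keeping CPUID.1:ECX.OSXSAVE, CPUID.7:ECX.OSPKE, etc. consistent with the
 * architectural values.
 */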
0313 
0314 static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
0315 {
0316     struct kvm_lapic *apic = vcpu->arch.apic;
0317     struct kvm_cpuid_entry2 *best;
0318 
0319     best = kvm_find_cpuid_entry(vcpu, 1);
0320     if (best && apic) {
0321         if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
0322             apic->lapic_timer.timer_mode_mask = 3 << 17;
0323         else
0324             apic->lapic_timer.timer_mode_mask = 1 << 17;
0325 
0326         kvm_apic_set_version(vcpu);
0327     }
0328 
0329     vcpu->arch.guest_supported_xcr0 =
0330         cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
0331 
0332     /*
0333      * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if
0334      * XSAVE/XCR0 are not exposed to the guest, and even if XSAVE isn't
0335      * supported by the host.
0336      */
0337     vcpu->arch.guest_fpu.fpstate->user_xfeatures = vcpu->arch.guest_supported_xcr0 |
0338                                XFEATURE_MASK_FPSSE;
0339 
0340     kvm_update_pv_runtime(vcpu);
0341 
0342     vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
0343     vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
0344 
0345     kvm_pmu_refresh(vcpu);
0346     vcpu->arch.cr4_guest_rsvd_bits =
0347         __cr4_reserved_bits(guest_cpuid_has, vcpu);
0348 
0349     kvm_hv_set_cpuid(vcpu);
0350 
0351     /* Invoke the vendor callback only after the above state is updated. */
0352     static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);
0353 
0354     /*
0355      * Except for the MMU, which needs to do its thing after any vendor
0356      * specific adjustments to the reserved GPA bits have been made.
0357      */
0358     kvm_mmu_after_set_cpuid(vcpu);
0359 }
0360 
0361 int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
0362 {
0363     struct kvm_cpuid_entry2 *best;
0364 
0365     best = kvm_find_cpuid_entry(vcpu, 0x80000000);
0366     if (!best || best->eax < 0x80000008)
0367         goto not_found;
0368     best = kvm_find_cpuid_entry(vcpu, 0x80000008);
0369     if (best)
0370         return best->eax & 0xff;
0371 not_found:
0372     return 36;
0373 }
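
/*
 * For illustration only (not compiled): the CPUID.0x80000008.EAX layout
 * consumed above and in kvm_check_cpuid().  The struct and helper names
 * are made up for the sketch.
 */
#if 0
struct addr_widths {
    unsigned int phys_bits;    /* EAX[7:0]  - MAXPHYADDR, e.g. 36..52      */
    unsigned int virt_bits;    /* EAX[15:8] - 48 (4-level) or 57 (5-level) */
};

static struct addr_widths decode_leaf_0x80000008(u32 eax)
{
    struct addr_widths w = {
        .phys_bits = eax & 0xff,
        .virt_bits = (eax >> 8) & 0xff,
    };

    return w;
}
#endif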
0374 
0375 /*
0376  * This "raw" version returns the reserved GPA bits without any adjustments for
0377  * encryption technologies that usurp bits.  The raw mask should be used if and
0378  * only if hardware does _not_ strip the usurped bits, e.g. in virtual MTRRs.
0379  */
0380 u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu)
0381 {
0382     return rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
0383 }
0384 
0385 static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
0386                         int nent)
0387 {
0388     int r;
0389 
0390     __kvm_update_cpuid_runtime(vcpu, e2, nent);
0391 
0392     /*
0393      * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
0394      * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc. aren't
0395      * tracked in kvm_mmu_page_role.  As a result, KVM may miss guest page
0396      * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
0397      * the core vCPU model on the fly. It would've been better to forbid any
0398      * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
0399      * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
0400      * KVM_SET_CPUID{,2} again. To support this legacy behavior, check
0401      * whether the supplied CPUID data is equal to what's already set.
0402      */
0403     if (vcpu->arch.last_vmentry_cpu != -1) {
0404         r = kvm_cpuid_check_equal(vcpu, e2, nent);
0405         if (r)
0406             return r;
0407 
0408         kvfree(e2);
0409         return 0;
0410     }
0411 
0412     r = kvm_check_cpuid(vcpu, e2, nent);
0413     if (r)
0414         return r;
0415 
0416     kvfree(vcpu->arch.cpuid_entries);
0417     vcpu->arch.cpuid_entries = e2;
0418     vcpu->arch.cpuid_nent = nent;
0419 
0420     kvm_update_kvm_cpuid_base(vcpu);
0421     kvm_vcpu_after_set_cpuid(vcpu);
0422 
0423     return 0;
0424 }
0425 
0426 /* Legacy KVM_SET_CPUID: an old userspace passes the original kvm_cpuid_entry layout to a newer kernel. */
0427 int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
0428                  struct kvm_cpuid *cpuid,
0429                  struct kvm_cpuid_entry __user *entries)
0430 {
0431     int r, i;
0432     struct kvm_cpuid_entry *e = NULL;
0433     struct kvm_cpuid_entry2 *e2 = NULL;
0434 
0435     if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
0436         return -E2BIG;
0437 
0438     if (cpuid->nent) {
0439         e = vmemdup_user(entries, array_size(sizeof(*e), cpuid->nent));
0440         if (IS_ERR(e))
0441             return PTR_ERR(e);
0442 
0443         e2 = kvmalloc_array(cpuid->nent, sizeof(*e2), GFP_KERNEL_ACCOUNT);
0444         if (!e2) {
0445             r = -ENOMEM;
0446             goto out_free_cpuid;
0447         }
0448     }
0449     for (i = 0; i < cpuid->nent; i++) {
0450         e2[i].function = e[i].function;
0451         e2[i].eax = e[i].eax;
0452         e2[i].ebx = e[i].ebx;
0453         e2[i].ecx = e[i].ecx;
0454         e2[i].edx = e[i].edx;
0455         e2[i].index = 0;
0456         e2[i].flags = 0;
0457         e2[i].padding[0] = 0;
0458         e2[i].padding[1] = 0;
0459         e2[i].padding[2] = 0;
0460     }
0461 
0462     r = kvm_set_cpuid(vcpu, e2, cpuid->nent);
0463     if (r)
0464         kvfree(e2);
0465 
0466 out_free_cpuid:
0467     kvfree(e);
0468 
0469     return r;
0470 }
0471 
0472 int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
0473                   struct kvm_cpuid2 *cpuid,
0474                   struct kvm_cpuid_entry2 __user *entries)
0475 {
0476     struct kvm_cpuid_entry2 *e2 = NULL;
0477     int r;
0478 
0479     if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
0480         return -E2BIG;
0481 
0482     if (cpuid->nent) {
0483         e2 = vmemdup_user(entries, array_size(sizeof(*e2), cpuid->nent));
0484         if (IS_ERR(e2))
0485             return PTR_ERR(e2);
0486     }
0487 
0488     r = kvm_set_cpuid(vcpu, e2, cpuid->nent);
0489     if (r)
0490         kvfree(e2);
0491 
0492     return r;
0493 }
0494 
0495 int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
0496                   struct kvm_cpuid2 *cpuid,
0497                   struct kvm_cpuid_entry2 __user *entries)
0498 {
0499     int r;
0500 
0501     r = -E2BIG;
0502     if (cpuid->nent < vcpu->arch.cpuid_nent)
0503         goto out;
0504     r = -EFAULT;
0505     if (copy_to_user(entries, vcpu->arch.cpuid_entries,
0506              vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
0507         goto out;
0508     return 0;
0509 
0510 out:
0511     cpuid->nent = vcpu->arch.cpuid_nent;
0512     return r;
0513 }
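
/*
 * For illustration only (not compiled): a minimal sketch of the userspace
 * flow that lands in kvm_vcpu_ioctl_set_cpuid2() above.  Error handling is
 * omitted; kvm_fd and vcpu_fd are assumed to be already-open descriptors
 * for /dev/kvm and a vCPU.
 */
#if 0
#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

#define NR_ENTRIES 256

static void vmm_install_cpuid(int kvm_fd, int vcpu_fd)
{
    struct kvm_cpuid2 *cpuid;

    cpuid = calloc(1, sizeof(*cpuid) +
                      NR_ENTRIES * sizeof(struct kvm_cpuid_entry2));
    cpuid->nent = NR_ENTRIES;

    /* Ask KVM which leaves it can virtualize (kvm_dev_ioctl_get_cpuid). */
    ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid);

    /* ... optionally trim or augment the entries here ... */

    /* Install the result; this is the KVM_SET_CPUID2 path handled above. */
    ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid);

    free(cpuid);
}
#endif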
0514 
0515 /* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */
0516 static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf)
0517 {
0518     const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
0519     struct kvm_cpuid_entry2 entry;
0520 
0521     reverse_cpuid_check(leaf);
0522 
0523     cpuid_count(cpuid.function, cpuid.index,
0524             &entry.eax, &entry.ebx, &entry.ecx, &entry.edx);
0525 
0526     kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
0527 }
0528 
0529 static __always_inline
0530 void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask)
0531 {
0532     /* Use kvm_cpu_cap_mask for non-scattered leafs. */
0533     BUILD_BUG_ON(leaf < NCAPINTS);
0534 
0535     kvm_cpu_caps[leaf] = mask;
0536 
0537     __kvm_cpu_cap_mask(leaf);
0538 }
0539 
0540 static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
0541 {
0542     /* Use kvm_cpu_cap_init_scattered for scattered leafs. */
0543     BUILD_BUG_ON(leaf >= NCAPINTS);
0544 
0545     kvm_cpu_caps[leaf] &= mask;
0546 
0547     __kvm_cpu_cap_mask(leaf);
0548 }
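
/*
 * Summary of the masking flow for a non-scattered leaf, as driven by
 * kvm_set_cpu_caps() below:
 *
 *   kvm_cpu_caps[leaf]    = boot_cpu_data.x86_capability[leaf]  (memcpy)
 *   kvm_cpu_cap_mask()   &= the F(...) mask KVM is willing to expose
 *   __kvm_cpu_cap_mask() &= the raw CPUID output of this CPU
 *
 * i.e. a feature is advertised only if the kernel, KVM and the hardware
 * all agree that it is present.
 */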
0549 
0550 void kvm_set_cpu_caps(void)
0551 {
0552 #ifdef CONFIG_X86_64
0553     unsigned int f_gbpages = F(GBPAGES);
0554     unsigned int f_lm = F(LM);
0555     unsigned int f_xfd = F(XFD);
0556 #else
0557     unsigned int f_gbpages = 0;
0558     unsigned int f_lm = 0;
0559     unsigned int f_xfd = 0;
0560 #endif
0561     memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
0562 
0563     BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
0564              sizeof(boot_cpu_data.x86_capability));
0565 
0566     memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability,
0567            sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)));
0568 
0569     kvm_cpu_cap_mask(CPUID_1_ECX,
0570         /*
0571          * NOTE: MONITOR (and MWAIT) are emulated as NOP, but *not*
0572          * advertised to guests via CPUID!
0573          */
0574         F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
0575         0 /* DS-CPL, VMX, SMX, EST */ |
0576         0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
0577         F(FMA) | F(CX16) | 0 /* xTPR Update */ | F(PDCM) |
0578         F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
0579         F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
0580         0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
0581         F(F16C) | F(RDRAND)
0582     );
0583     /* KVM emulates x2apic in software irrespective of host support. */
0584     kvm_cpu_cap_set(X86_FEATURE_X2APIC);
0585 
0586     kvm_cpu_cap_mask(CPUID_1_EDX,
0587         F(FPU) | F(VME) | F(DE) | F(PSE) |
0588         F(TSC) | F(MSR) | F(PAE) | F(MCE) |
0589         F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
0590         F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
0591         F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
0592         0 /* Reserved, DS, ACPI */ | F(MMX) |
0593         F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
0594         0 /* HTT, TM, Reserved, PBE */
0595     );
0596 
0597     kvm_cpu_cap_mask(CPUID_7_0_EBX,
0598         F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) |
0599         F(FDP_EXCPTN_ONLY) | F(SMEP) | F(BMI2) | F(ERMS) | F(INVPCID) |
0600         F(RTM) | F(ZERO_FCS_FDS) | 0 /*MPX*/ | F(AVX512F) |
0601         F(AVX512DQ) | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) |
0602         F(CLFLUSHOPT) | F(CLWB) | 0 /*INTEL_PT*/ | F(AVX512PF) |
0603         F(AVX512ER) | F(AVX512CD) | F(SHA_NI) | F(AVX512BW) |
0604         F(AVX512VL));
0605 
0606     kvm_cpu_cap_mask(CPUID_7_ECX,
0607         F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
0608         F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
0609         F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
0610         F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
0611         F(SGX_LC) | F(BUS_LOCK_DETECT)
0612     );
0613     /* Set LA57 based on hardware capability. */
0614     if (cpuid_ecx(7) & F(LA57))
0615         kvm_cpu_cap_set(X86_FEATURE_LA57);
0616 
0617     /*
0618      * PKU not yet implemented for shadow paging and requires OSPKE
0619      * to be set on the host. Clear it if that is not the case
0620      */
0621     if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
0622         kvm_cpu_cap_clear(X86_FEATURE_PKU);
0623 
0624     kvm_cpu_cap_mask(CPUID_7_EDX,
0625         F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
0626         F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
0627         F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
0628         F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
0629         F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16)
0630     );
0631 
0632     /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
0633     kvm_cpu_cap_set(X86_FEATURE_TSC_ADJUST);
0634     kvm_cpu_cap_set(X86_FEATURE_ARCH_CAPABILITIES);
0635 
0636     if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS))
0637         kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL);
0638     if (boot_cpu_has(X86_FEATURE_STIBP))
0639         kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
0640     if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
0641         kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
0642 
0643     kvm_cpu_cap_mask(CPUID_7_1_EAX,
0644         F(AVX_VNNI) | F(AVX512_BF16)
0645     );
0646 
0647     kvm_cpu_cap_mask(CPUID_D_1_EAX,
0648         F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd
0649     );
0650 
0651     kvm_cpu_cap_init_scattered(CPUID_12_EAX,
0652         SF(SGX1) | SF(SGX2)
0653     );
0654 
0655     kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
0656         F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
0657         F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
0658         F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
0659         0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
0660         F(TOPOEXT) | 0 /* PERFCTR_CORE */
0661     );
0662 
0663     kvm_cpu_cap_mask(CPUID_8000_0001_EDX,
0664         F(FPU) | F(VME) | F(DE) | F(PSE) |
0665         F(TSC) | F(MSR) | F(PAE) | F(MCE) |
0666         F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
0667         F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
0668         F(PAT) | F(PSE36) | 0 /* Reserved */ |
0669         F(NX) | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
0670         F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) |
0671         0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW)
0672     );
0673 
0674     if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64))
0675         kvm_cpu_cap_set(X86_FEATURE_GBPAGES);
0676 
0677     kvm_cpu_cap_mask(CPUID_8000_0008_EBX,
0678         F(CLZERO) | F(XSAVEERPTR) |
0679         F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
0680         F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) |
0681         __feature_bit(KVM_X86_FEATURE_PSFD)
0682     );
0683 
0684     /*
0685      * AMD has separate bits for each SPEC_CTRL bit.
0686      * arch/x86/kernel/cpu/bugs.c is kind enough to
0687      * record that in cpufeatures so use them.
0688      */
0689     if (boot_cpu_has(X86_FEATURE_IBPB))
0690         kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB);
0691     if (boot_cpu_has(X86_FEATURE_IBRS))
0692         kvm_cpu_cap_set(X86_FEATURE_AMD_IBRS);
0693     if (boot_cpu_has(X86_FEATURE_STIBP))
0694         kvm_cpu_cap_set(X86_FEATURE_AMD_STIBP);
0695     if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
0696         kvm_cpu_cap_set(X86_FEATURE_AMD_SSBD);
0697     if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
0698         kvm_cpu_cap_set(X86_FEATURE_AMD_SSB_NO);
0699     /*
0700      * The preference is to use SPEC CTRL MSR instead of the
0701      * VIRT_SPEC MSR.
0702      */
0703     if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
0704         !boot_cpu_has(X86_FEATURE_AMD_SSBD))
0705         kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
0706 
0707     /*
0708      * Hide all SVM features by default, SVM will set the cap bits for
0709      * features it emulates and/or exposes for L1.
0710      */
0711     kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0);
0712 
0713     kvm_cpu_cap_mask(CPUID_8000_001F_EAX,
0714         0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) |
0715         F(SME_COHERENT));
0716 
0717     kvm_cpu_cap_mask(CPUID_C000_0001_EDX,
0718         F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
0719         F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
0720         F(PMM) | F(PMM_EN)
0721     );
0722 
0723     /*
0724      * Hide RDTSCP and RDPID if either feature is reported as supported but
0725      * probing MSR_TSC_AUX failed.  This is purely a sanity check and
0726      * should never happen, but the guest will likely crash if RDTSCP or
0727      * RDPID is misreported, and KVM has botched MSR_TSC_AUX emulation in
0728      * the past.  For example, the sanity check may fire if this instance of
0729      * KVM is running as L1 on top of an older, broken KVM.
0730      */
0731     if (WARN_ON((kvm_cpu_cap_has(X86_FEATURE_RDTSCP) ||
0732              kvm_cpu_cap_has(X86_FEATURE_RDPID)) &&
0733              !kvm_is_supported_user_return_msr(MSR_TSC_AUX))) {
0734         kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
0735         kvm_cpu_cap_clear(X86_FEATURE_RDPID);
0736     }
0737 }
0738 EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
0739 
0740 struct kvm_cpuid_array {
0741     struct kvm_cpuid_entry2 *entries;
0742     int maxnent;
0743     int nent;
0744 };
0745 
0746 static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
0747                           u32 function, u32 index)
0748 {
0749     struct kvm_cpuid_entry2 *entry;
0750 
0751     if (array->nent >= array->maxnent)
0752         return NULL;
0753 
0754     entry = &array->entries[array->nent++];
0755 
0756     memset(entry, 0, sizeof(*entry));
0757     entry->function = function;
0758     entry->index = index;
0759     switch (function & 0xC0000000) {
0760     case 0x40000000:
0761         /* Hypervisor leaves are always synthesized by __do_cpuid_func.  */
0762         return entry;
0763 
0764     case 0x80000000:
0765         /*
0766          * 0x80000021 is sometimes synthesized by __do_cpuid_func, which
0767          * would result in out-of-bounds calls to do_host_cpuid.
0768          */
0769         {
0770             static int max_cpuid_80000000;
0771             if (!READ_ONCE(max_cpuid_80000000))
0772                 WRITE_ONCE(max_cpuid_80000000, cpuid_eax(0x80000000));
0773             if (function > READ_ONCE(max_cpuid_80000000))
0774                 return entry;
0775         }
0776         break;
0777 
0778     default:
0779         break;
0780     }
0781 
0782     cpuid_count(entry->function, entry->index,
0783             &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
0784 
0785     if (cpuid_function_is_indexed(function))
0786         entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
0787 
0788     return entry;
0789 }
0790 
0791 static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
0792 {
0793     struct kvm_cpuid_entry2 *entry;
0794 
0795     if (array->nent >= array->maxnent)
0796         return -E2BIG;
0797 
0798     entry = &array->entries[array->nent];
0799     entry->function = func;
0800     entry->index = 0;
0801     entry->flags = 0;
0802 
0803     switch (func) {
0804     case 0:
0805         entry->eax = 7;
0806         ++array->nent;
0807         break;
0808     case 1:
0809         entry->ecx = F(MOVBE);
0810         ++array->nent;
0811         break;
0812     case 7:
0813         entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
0814         entry->eax = 0;
0815         if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
0816             entry->ecx = F(RDPID);
0817         ++array->nent;
0818         break;
0819     default:
0820         break;
0821     }
0822 
0823     return 0;
0824 }
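
/*
 * Note: unlike the "supported" CPUID built by __do_cpuid_func(), the
 * "emulated" set above only reports features KVM can provide purely in
 * software, e.g. MOVBE via the instruction emulator and RDPID via
 * MSR_TSC_AUX when RDTSCP is supported; this is what
 * KVM_GET_EMULATED_CPUID returns to userspace.
 */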
0825 
0826 static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
0827 {
0828     struct kvm_cpuid_entry2 *entry;
0829     int r, i, max_idx;
0830 
0831     /* all calls to cpuid_count() should be made on the same cpu */
0832     get_cpu();
0833 
0834     r = -E2BIG;
0835 
0836     entry = do_host_cpuid(array, function, 0);
0837     if (!entry)
0838         goto out;
0839 
0840     switch (function) {
0841     case 0:
0842         /* Limited to the highest leaf implemented in KVM. */
0843         entry->eax = min(entry->eax, 0x1fU);
0844         break;
0845     case 1:
0846         cpuid_entry_override(entry, CPUID_1_EDX);
0847         cpuid_entry_override(entry, CPUID_1_ECX);
0848         break;
0849     case 2:
0850         /*
0851          * On ancient CPUs, function 2 entries are STATEFUL.  That is,
0852          * CPUID(function=2, index=0) may return different results each
0853          * time, with the least-significant byte in EAX enumerating the
0854          * number of times software should do CPUID(2, 0).
0855          *
0856          * Modern CPUs, i.e. every CPU KVM has *ever* run on are less
0857          * idiotic.  Intel's SDM states that EAX & 0xff "will always
0858          * return 01H. Software should ignore this value and not
0859          * interpret it as an informational descriptor", while AMD's
0860          * APM states that CPUID(2) is reserved.
0861          *
0862          * WARN if a frankenstein CPU that supports virtualization and
0863          * a stateful CPUID.0x2 is encountered.
0864          */
0865         WARN_ON_ONCE((entry->eax & 0xff) > 1);
0866         break;
0867     /* functions 4 and 0x8000001d have additional index. */
0868     case 4:
0869     case 0x8000001d:
0870         /*
0871          * Read entries until the cache type in the previous entry is
0872          * zero, i.e. indicates an invalid entry.
0873          */
0874         for (i = 1; entry->eax & 0x1f; ++i) {
0875             entry = do_host_cpuid(array, function, i);
0876             if (!entry)
0877                 goto out;
0878         }
0879         break;
0880     case 6: /* Thermal management */
0881         entry->eax = 0x4; /* allow ARAT */
0882         entry->ebx = 0;
0883         entry->ecx = 0;
0884         entry->edx = 0;
0885         break;
0886     /* function 7 has additional index. */
0887     case 7:
0888         entry->eax = min(entry->eax, 1u);
0889         cpuid_entry_override(entry, CPUID_7_0_EBX);
0890         cpuid_entry_override(entry, CPUID_7_ECX);
0891         cpuid_entry_override(entry, CPUID_7_EDX);
0892 
0893         /* KVM only supports 0x7.0 and 0x7.1, capped above via min(). */
0894         if (entry->eax == 1) {
0895             entry = do_host_cpuid(array, function, 1);
0896             if (!entry)
0897                 goto out;
0898 
0899             cpuid_entry_override(entry, CPUID_7_1_EAX);
0900             entry->ebx = 0;
0901             entry->ecx = 0;
0902             entry->edx = 0;
0903         }
0904         break;
0905     case 0xa: { /* Architectural Performance Monitoring */
0906         union cpuid10_eax eax;
0907         union cpuid10_edx edx;
0908 
0909         if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
0910             entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
0911             break;
0912         }
0913 
0914         eax.split.version_id = kvm_pmu_cap.version;
0915         eax.split.num_counters = kvm_pmu_cap.num_counters_gp;
0916         eax.split.bit_width = kvm_pmu_cap.bit_width_gp;
0917         eax.split.mask_length = kvm_pmu_cap.events_mask_len;
0918         edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed;
0919         edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed;
0920 
0921         if (kvm_pmu_cap.version)
0922             edx.split.anythread_deprecated = 1;
0923         edx.split.reserved1 = 0;
0924         edx.split.reserved2 = 0;
0925 
0926         entry->eax = eax.full;
0927         entry->ebx = kvm_pmu_cap.events_mask;
0928         entry->ecx = 0;
0929         entry->edx = edx.full;
0930         break;
0931     }
0932     /*
0933      * Per Intel's SDM, leaf 0x1f is a superset of leaf 0xb,
0934      * thus they can be handled by common code.
0935      */
0936     case 0x1f:
0937     case 0xb:
0938         /*
0939          * Populate entries until the level type (ECX[15:8]) of the
0940          * previous entry is zero.  Note, CPUID EAX.{0x1f,0xb}.0 is
0941          * the starting entry, filled by the primary do_host_cpuid().
0942          */
0943         for (i = 1; entry->ecx & 0xff00; ++i) {
0944             entry = do_host_cpuid(array, function, i);
0945             if (!entry)
0946                 goto out;
0947         }
0948         break;
0949     case 0xd: {
0950         u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm();
0951         u64 permitted_xss = kvm_caps.supported_xss;
0952 
0953         entry->eax &= permitted_xcr0;
0954         entry->ebx = xstate_required_size(permitted_xcr0, false);
0955         entry->ecx = entry->ebx;
0956         entry->edx &= permitted_xcr0 >> 32;
0957         if (!permitted_xcr0)
0958             break;
0959 
0960         entry = do_host_cpuid(array, function, 1);
0961         if (!entry)
0962             goto out;
0963 
0964         cpuid_entry_override(entry, CPUID_D_1_EAX);
0965         if (entry->eax & (F(XSAVES)|F(XSAVEC)))
0966             entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss,
0967                               true);
0968         else {
0969             WARN_ON_ONCE(permitted_xss != 0);
0970             entry->ebx = 0;
0971         }
0972         entry->ecx &= permitted_xss;
0973         entry->edx &= permitted_xss >> 32;
0974 
0975         for (i = 2; i < 64; ++i) {
0976             bool s_state;
0977             if (permitted_xcr0 & BIT_ULL(i))
0978                 s_state = false;
0979             else if (permitted_xss & BIT_ULL(i))
0980                 s_state = true;
0981             else
0982                 continue;
0983 
0984             entry = do_host_cpuid(array, function, i);
0985             if (!entry)
0986                 goto out;
0987 
0988             /*
0989              * The supported check above should have filtered out
0990              * invalid sub-leafs.  Only valid sub-leafs should
0991              * reach this point, and they should have a non-zero
0992              * save state size.  Furthermore, check whether the
0993              * processor agrees with permitted_xcr0/permitted_xss
0994              * on whether this is an XCR0- or IA32_XSS-managed area.
0995              */
0996             if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) {
0997                 --array->nent;
0998                 continue;
0999             }
1000 
1001             if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
1002                 entry->ecx &= ~BIT_ULL(2);
1003             entry->edx = 0;
1004         }
1005         break;
1006     }
1007     case 0x12:
1008         /* Intel SGX */
1009         if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
1010             entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
1011             break;
1012         }
1013 
1014         /*
1015          * Index 0: Sub-features, MISCSELECT (a.k.a extended features)
1016          * and max enclave sizes.   The SGX sub-features and MISCSELECT
1017          * are restricted by kernel and KVM capabilities (like most
1018          * feature flags), while enclave size is unrestricted.
1019          */
1020         cpuid_entry_override(entry, CPUID_12_EAX);
1021         entry->ebx &= SGX_MISC_EXINFO;
1022 
1023         entry = do_host_cpuid(array, function, 1);
1024         if (!entry)
1025             goto out;
1026 
1027         /*
1028          * Index 1: SECS.ATTRIBUTES.  ATTRIBUTES are restricted a la
1029          * feature flags.  Advertise all supported flags, including
1030          * privileged attributes that require explicit opt-in from
1031          * userspace.  ATTRIBUTES.XFRM is not adjusted as userspace is
1032          * expected to derive it from supported XCR0.
1033          */
1034         entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
1035                   SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY |
1036                   SGX_ATTR_KSS;
1037         entry->ebx &= 0;
1038         break;
1039     /* Intel PT */
1040     case 0x14:
1041         if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
1042             entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
1043             break;
1044         }
1045 
1046         for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
1047             if (!do_host_cpuid(array, function, i))
1048                 goto out;
1049         }
1050         break;
1051     /* Intel AMX TILE */
1052     case 0x1d:
1053         if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
1054             entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
1055             break;
1056         }
1057 
1058         for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
1059             if (!do_host_cpuid(array, function, i))
1060                 goto out;
1061         }
1062         break;
1063     case 0x1e: /* TMUL information */
1064         if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
1065             entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
1066             break;
1067         }
1068         break;
1069     case KVM_CPUID_SIGNATURE: {
1070         const u32 *sigptr = (const u32 *)KVM_SIGNATURE;
1071         entry->eax = KVM_CPUID_FEATURES;
1072         entry->ebx = sigptr[0];
1073         entry->ecx = sigptr[1];
1074         entry->edx = sigptr[2];
1075         break;
1076     }
1077     case KVM_CPUID_FEATURES:
1078         entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
1079                  (1 << KVM_FEATURE_NOP_IO_DELAY) |
1080                  (1 << KVM_FEATURE_CLOCKSOURCE2) |
1081                  (1 << KVM_FEATURE_ASYNC_PF) |
1082                  (1 << KVM_FEATURE_PV_EOI) |
1083                  (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
1084                  (1 << KVM_FEATURE_PV_UNHALT) |
1085                  (1 << KVM_FEATURE_PV_TLB_FLUSH) |
1086                  (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
1087                  (1 << KVM_FEATURE_PV_SEND_IPI) |
1088                  (1 << KVM_FEATURE_POLL_CONTROL) |
1089                  (1 << KVM_FEATURE_PV_SCHED_YIELD) |
1090                  (1 << KVM_FEATURE_ASYNC_PF_INT);
1091 
1092         if (sched_info_on())
1093             entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
1094 
1095         entry->ebx = 0;
1096         entry->ecx = 0;
1097         entry->edx = 0;
1098         break;
1099     case 0x80000000:
1100         entry->eax = min(entry->eax, 0x80000021);
1101         /*
1102          * Serializing LFENCE is reported in a multitude of ways, and
1103          * NullSegClearsBase is not reported in CPUID on Zen2; help
1104          * userspace by providing the CPUID leaf ourselves.
1105          *
1106          * However, only do it if the host has CPUID leaf 0x8000001d.
1107          * QEMU thinks that it can query the host blindly for that
1108          * CPUID leaf if KVM reports that it supports 0x8000001d or
1109          * above.  The processor merrily returns values from the
1110          * highest Intel leaf which QEMU tries to use as the guest's
1111          * 0x8000001d.  Even worse, this can result in an infinite
1112          * loop if said highest leaf has no subleaves indexed by ECX.
1113          */
1114         if (entry->eax >= 0x8000001d &&
1115             (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
1116              || !static_cpu_has_bug(X86_BUG_NULL_SEG)))
1117             entry->eax = max(entry->eax, 0x80000021);
1118         break;
1119     case 0x80000001:
1120         cpuid_entry_override(entry, CPUID_8000_0001_EDX);
1121         cpuid_entry_override(entry, CPUID_8000_0001_ECX);
1122         break;
1123     case 0x80000006:
1124         /* L2 cache and TLB: pass through host info. */
1125         break;
1126     case 0x80000007: /* Advanced power management */
1127         /* invariant TSC is CPUID.80000007H:EDX[8] */
1128         entry->edx &= (1 << 8);
1129         /* mask against host */
1130         entry->edx &= boot_cpu_data.x86_power;
1131         entry->eax = entry->ebx = entry->ecx = 0;
1132         break;
1133     case 0x80000008: {
1134         unsigned g_phys_as = (entry->eax >> 16) & 0xff;
1135         unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
1136         unsigned phys_as = entry->eax & 0xff;
1137 
1138         /*
1139          * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as
1140          * the guest operates in the same PA space as the host, i.e.
1141          * reductions in MAXPHYADDR for memory encryption affect shadow
1142          * paging, too.
1143          *
1144          * If TDP is enabled but an explicit guest MAXPHYADDR is not
1145          * provided, use the raw bare metal MAXPHYADDR as reductions to
1146          * the HPAs do not affect GPAs.
1147          */
1148         if (!tdp_enabled)
1149             g_phys_as = boot_cpu_data.x86_phys_bits;
1150         else if (!g_phys_as)
1151             g_phys_as = phys_as;
1152 
1153         entry->eax = g_phys_as | (virt_as << 8);
1154         entry->edx = 0;
1155         cpuid_entry_override(entry, CPUID_8000_0008_EBX);
1156         break;
1157     }
1158     case 0x8000000A:
1159         if (!kvm_cpu_cap_has(X86_FEATURE_SVM)) {
1160             entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
1161             break;
1162         }
1163         entry->eax = 1; /* SVM revision 1 */
1164         entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper
1165                    ASID emulation to nested SVM */
1166         entry->ecx = 0; /* Reserved */
1167         cpuid_entry_override(entry, CPUID_8000_000A_EDX);
1168         break;
1169     case 0x80000019:
1170         entry->ecx = entry->edx = 0;
1171         break;
1172     case 0x8000001a:
1173     case 0x8000001e:
1174         break;
1175     case 0x8000001F:
1176         if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) {
1177             entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
1178         } else {
1179             cpuid_entry_override(entry, CPUID_8000_001F_EAX);
1180 
1181             /*
1182              * Enumerate '0' for "PA bits reduction", the adjusted
1183              * MAXPHYADDR is enumerated directly (see 0x80000008).
1184              */
1185             entry->ebx &= ~GENMASK(11, 6);
1186         }
1187         break;
1188     case 0x80000020:
1189         entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
1190         break;
1191     case 0x80000021:
1192         entry->ebx = entry->ecx = entry->edx = 0;
1193         /*
1194          * Pass down these bits:
1195          *    EAX      0      NNDBP, Processor ignores nested data breakpoints
1196          *    EAX      2      LAS, LFENCE always serializing
1197          *    EAX      6      NSCB, Null selector clear base
1198          *
1199          * Other defined bits are for MSRs that KVM does not expose:
1200          *   EAX      3      SPCL, SMM page configuration lock
1201          *   EAX      13     PCMSR, Prefetch control MSR
1202          */
1203         entry->eax &= BIT(0) | BIT(2) | BIT(6);
1204         if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC))
1205             entry->eax |= BIT(2);
1206         if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
1207             entry->eax |= BIT(6);
1208         break;
1209     /* Add support for Centaur's CPUID instruction */
1210     case 0xC0000000:
1211         /* Just support up to 0xC0000004 now */
1212         entry->eax = min(entry->eax, 0xC0000004);
1213         break;
1214     case 0xC0000001:
1215         cpuid_entry_override(entry, CPUID_C000_0001_EDX);
1216         break;
1217     case 3: /* Processor serial number */
1218     case 5: /* MONITOR/MWAIT */
1219     case 0xC0000002:
1220     case 0xC0000003:
1221     case 0xC0000004:
1222     default:
1223         entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
1224         break;
1225     }
1226 
1227     r = 0;
1228 
1229 out:
1230     put_cpu();
1231 
1232     return r;
1233 }
1234 
1235 static int do_cpuid_func(struct kvm_cpuid_array *array, u32 func,
1236              unsigned int type)
1237 {
1238     if (type == KVM_GET_EMULATED_CPUID)
1239         return __do_cpuid_func_emulated(array, func);
1240 
1241     return __do_cpuid_func(array, func);
1242 }
1243 
1244 #define CENTAUR_CPUID_SIGNATURE 0xC0000000
1245 
1246 static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func,
1247               unsigned int type)
1248 {
1249     u32 limit;
1250     int r;
1251 
1252     if (func == CENTAUR_CPUID_SIGNATURE &&
1253         boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR)
1254         return 0;
1255 
1256     r = do_cpuid_func(array, func, type);
1257     if (r)
1258         return r;
1259 
1260     limit = array->entries[array->nent - 1].eax;
1261     for (func = func + 1; func <= limit; ++func) {
1262         r = do_cpuid_func(array, func, type);
1263         if (r)
1264             break;
1265     }
1266 
1267     return r;
1268 }
1269 
1270 static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries,
1271                  __u32 num_entries, unsigned int ioctl_type)
1272 {
1273     int i;
1274     __u32 pad[3];
1275 
1276     if (ioctl_type != KVM_GET_EMULATED_CPUID)
1277         return false;
1278 
1279     /*
1280      * We want to make sure that ->padding is being passed clean from
1281      * userspace in case we want to use it for something in the future.
1282      *
1283      * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we
1284      * have to give ourselves satisfied only with the emulated side. /me
1285      * sheds a tear.
1286      */
1287     for (i = 0; i < num_entries; i++) {
1288         if (copy_from_user(pad, entries[i].padding, sizeof(pad)))
1289             return true;
1290 
1291         if (pad[0] || pad[1] || pad[2])
1292             return true;
1293     }
1294     return false;
1295 }
1296 
1297 int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
1298                 struct kvm_cpuid_entry2 __user *entries,
1299                 unsigned int type)
1300 {
1301     static const u32 funcs[] = {
1302         0, 0x80000000, CENTAUR_CPUID_SIGNATURE, KVM_CPUID_SIGNATURE,
1303     };
1304 
1305     struct kvm_cpuid_array array = {
1306         .nent = 0,
1307     };
1308     int r, i;
1309 
1310     if (cpuid->nent < 1)
1311         return -E2BIG;
1312     if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1313         cpuid->nent = KVM_MAX_CPUID_ENTRIES;
1314 
1315     if (sanity_check_entries(entries, cpuid->nent, type))
1316         return -EINVAL;
1317 
1318     array.entries = kvcalloc(cpuid->nent, sizeof(struct kvm_cpuid_entry2), GFP_KERNEL);
1319     if (!array.entries)
1320         return -ENOMEM;
1321 
1322     array.maxnent = cpuid->nent;
1323 
1324     for (i = 0; i < ARRAY_SIZE(funcs); i++) {
1325         r = get_cpuid_func(&array, funcs[i], type);
1326         if (r)
1327             goto out_free;
1328     }
1329     cpuid->nent = array.nent;
1330 
1331     if (copy_to_user(entries, array.entries,
1332              array.nent * sizeof(struct kvm_cpuid_entry2)))
1333         r = -EFAULT;
1334 
1335 out_free:
1336     kvfree(array.entries);
1337     return r;
1338 }
1339 
1340 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
1341                             u32 function, u32 index)
1342 {
1343     return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
1344                  function, index);
1345 }
1346 EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index);
1347 
1348 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
1349                           u32 function)
1350 {
1351     return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
1352                  function, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
1353 }
1354 EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
1355 
1356 /*
1357  * Intel CPUID semantics treats any query for an out-of-range leaf as if the
1358  * highest basic leaf (i.e. CPUID.0H:EAX) were requested.  AMD CPUID semantics
1359  * returns all zeroes for any undefined leaf, whether or not the leaf is in
1360  * range.  Centaur/VIA follows Intel semantics.
1361  *
1362  * A leaf is considered out-of-range if its function is higher than the maximum
1363  * supported leaf of its associated class or if its associated class does not
1364  * exist.
1365  *
1366  * There are three primary classes to be considered, with their respective
1367  * ranges described as "<base> - <top>[,<base2> - <top2>] inclusive.  A primary
1368  * class exists if a guest CPUID entry for its <base> leaf exists.  For a given
1369  * class, CPUID.<base>.EAX contains the max supported leaf for the class.
1370  *
1371  *  - Basic:      0x00000000 - 0x3fffffff, 0x50000000 - 0x7fffffff
1372  *  - Hypervisor: 0x40000000 - 0x4fffffff
1373  *  - Extended:   0x80000000 - 0xbfffffff
1374  *  - Centaur:    0xc0000000 - 0xcfffffff
1375  *
1376  * The Hypervisor class is further subdivided into sub-classes that each act as
1377  * their own independent class associated with a 0x100 byte range.  E.g. if Qemu
1378  * is advertising support for both HyperV and KVM, the resulting Hypervisor
1379  * CPUID sub-classes are:
1380  *
1381  *  - HyperV:     0x40000000 - 0x400000ff
1382  *  - KVM:        0x40000100 - 0x400001ff
1383  */
1384 static struct kvm_cpuid_entry2 *
1385 get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index)
1386 {
1387     struct kvm_cpuid_entry2 *basic, *class;
1388     u32 function = *fn_ptr;
1389 
1390     basic = kvm_find_cpuid_entry(vcpu, 0);
1391     if (!basic)
1392         return NULL;
1393 
1394     if (is_guest_vendor_amd(basic->ebx, basic->ecx, basic->edx) ||
1395         is_guest_vendor_hygon(basic->ebx, basic->ecx, basic->edx))
1396         return NULL;
1397 
1398     if (function >= 0x40000000 && function <= 0x4fffffff)
1399         class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00);
1400     else if (function >= 0xc0000000)
1401         class = kvm_find_cpuid_entry(vcpu, 0xc0000000);
1402     else
1403         class = kvm_find_cpuid_entry(vcpu, function & 0x80000000);
1404 
1405     if (class && function <= class->eax)
1406         return NULL;
1407 
1408     /*
1409      * Leaf specific adjustments are also applied when redirecting to the
1410      * max basic entry, e.g. if the max basic leaf is 0xb but there is no
1411      * entry for CPUID.0xb.index (see below), then the output value for EDX
1412      * needs to be pulled from CPUID.0xb.1.
1413      */
1414     *fn_ptr = basic->eax;
1415 
1416     /*
1417      * The class does not exist or the requested function is out of range;
1418      * the effective CPUID entry is the max basic leaf.  Note, the index of
1419      * the original requested leaf is observed!
1420      */
1421     return kvm_find_cpuid_entry_index(vcpu, basic->eax, index);
1422 }
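
/*
 * Worked example, for illustration only: with a guest whose CPUID.0.EAX is
 * 0x14 and no Centaur class present, a lookup of function 0xc0000321 finds
 * no 0xc0000000 entry, so *fn_ptr becomes 0x14 and the entry returned is
 * CPUID.0x14.<index>, matching Intel's behavior of reporting the highest
 * basic leaf for out-of-range requests.
 */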
1423 
1424 bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
1425            u32 *ecx, u32 *edx, bool exact_only)
1426 {
1427     u32 orig_function = *eax, function = *eax, index = *ecx;
1428     struct kvm_cpuid_entry2 *entry;
1429     bool exact, used_max_basic = false;
1430 
1431     entry = kvm_find_cpuid_entry_index(vcpu, function, index);
1432     exact = !!entry;
1433 
1434     if (!entry && !exact_only) {
1435         entry = get_out_of_range_cpuid_entry(vcpu, &function, index);
1436         used_max_basic = !!entry;
1437     }
1438 
1439     if (entry) {
1440         *eax = entry->eax;
1441         *ebx = entry->ebx;
1442         *ecx = entry->ecx;
1443         *edx = entry->edx;
1444         if (function == 7 && index == 0) {
1445             u64 data;
1446             if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) &&
1447                 (data & TSX_CTRL_CPUID_CLEAR))
1448                 *ebx &= ~(F(RTM) | F(HLE));
1449         }
1450     } else {
1451         *eax = *ebx = *ecx = *edx = 0;
1452         /*
1453          * When leaf 0BH or 1FH is defined, CL is pass-through
1454          * and EDX is always the x2APIC ID, even for undefined
1455          * subleaves. Index 1 will exist iff the leaf is
1456          * implemented, so we pass through CL iff leaf 1
1457          * exists. EDX can be copied from any existing index.
1458          */
1459         if (function == 0xb || function == 0x1f) {
1460             entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
1461             if (entry) {
1462                 *ecx = index & 0xff;
1463                 *edx = entry->edx;
1464             }
1465         }
1466     }
1467     trace_kvm_cpuid(orig_function, index, *eax, *ebx, *ecx, *edx, exact,
1468             used_max_basic);
1469     return exact;
1470 }
1471 EXPORT_SYMBOL_GPL(kvm_cpuid);
1472 
1473 int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1474 {
1475     u32 eax, ebx, ecx, edx;
1476 
1477     if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0))
1478         return 1;
1479 
1480     eax = kvm_rax_read(vcpu);
1481     ecx = kvm_rcx_read(vcpu);
1482     kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
1483     kvm_rax_write(vcpu, eax);
1484     kvm_rbx_write(vcpu, ebx);
1485     kvm_rcx_write(vcpu, ecx);
1486     kvm_rdx_write(vcpu, edx);
1487     return kvm_skip_emulated_instruction(vcpu);
1488 }
1489 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);