0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /*
0003  * KVM paravirt_ops implementation
0004  *
0005  * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
0006  * Copyright IBM Corporation, 2007
0007  *   Authors: Anthony Liguori <aliguori@us.ibm.com>
0008  */
0009 
0010 #define pr_fmt(fmt) "kvm-guest: " fmt
0011 
0012 #include <linux/context_tracking.h>
0013 #include <linux/init.h>
0014 #include <linux/irq.h>
0015 #include <linux/kernel.h>
0016 #include <linux/kvm_para.h>
0017 #include <linux/cpu.h>
0018 #include <linux/mm.h>
0019 #include <linux/highmem.h>
0020 #include <linux/hardirq.h>
0021 #include <linux/notifier.h>
0022 #include <linux/reboot.h>
0023 #include <linux/hash.h>
0024 #include <linux/sched.h>
0025 #include <linux/slab.h>
0026 #include <linux/kprobes.h>
0027 #include <linux/nmi.h>
0028 #include <linux/swait.h>
0029 #include <linux/syscore_ops.h>
0030 #include <linux/cc_platform.h>
0031 #include <linux/efi.h>
0032 #include <asm/timer.h>
0033 #include <asm/cpu.h>
0034 #include <asm/traps.h>
0035 #include <asm/desc.h>
0036 #include <asm/tlbflush.h>
0037 #include <asm/apic.h>
0038 #include <asm/apicdef.h>
0039 #include <asm/hypervisor.h>
0040 #include <asm/tlb.h>
0041 #include <asm/cpuidle_haltpoll.h>
0042 #include <asm/ptrace.h>
0043 #include <asm/reboot.h>
0044 #include <asm/svm.h>
0045 #include <asm/e820/api.h>
0046 
0047 DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
0048 
0049 static int kvmapf = 1;
0050 
0051 static int __init parse_no_kvmapf(char *arg)
0052 {
0053         kvmapf = 0;
0054         return 0;
0055 }
0056 
0057 early_param("no-kvmapf", parse_no_kvmapf);
0058 
0059 static int steal_acc = 1;
0060 static int __init parse_no_stealacc(char *arg)
0061 {
0062         steal_acc = 0;
0063         return 0;
0064 }
0065 
0066 early_param("no-steal-acc", parse_no_stealacc);
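/*
 * Both knobs above are early parameters, so they can be given on the kernel
 * command line: for example "no-kvmapf" turns off async page fault handling
 * and "no-steal-acc" turns off steal time accounting. By default both
 * features are enabled (kvmapf = 1, steal_acc = 1).
 */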
0067 
0068 static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
0069 DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
0070 static int has_steal_clock = 0;
0071 
0072 static int has_guest_poll = 0;
0073 /*
0074  * No need for any "IO delay" on KVM
0075  */
0076 static void kvm_io_delay(void)
0077 {
0078 }
0079 
0080 #define KVM_TASK_SLEEP_HASHBITS 8
0081 #define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
0082 
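/*
 * Tasks waiting for an async #PF to be resolved are tracked in a small hash
 * table: the token reported by the host is hashed with
 * hash_32(token, KVM_TASK_SLEEP_HASHBITS) into one of 256 buckets, each
 * protected by a raw spinlock. A sleeping task parks on the per-node swait
 * queue until the matching "page ready" token arrives.
 */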
0083 struct kvm_task_sleep_node {
0084     struct hlist_node link;
0085     struct swait_queue_head wq;
0086     u32 token;
0087     int cpu;
0088 };
0089 
0090 static struct kvm_task_sleep_head {
0091     raw_spinlock_t lock;
0092     struct hlist_head list;
0093 } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
0094 
0095 static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
0096                           u32 token)
0097 {
0098     struct hlist_node *p;
0099 
0100     hlist_for_each(p, &b->list) {
0101         struct kvm_task_sleep_node *n =
0102             hlist_entry(p, typeof(*n), link);
0103         if (n->token == token)
0104             return n;
0105     }
0106 
0107     return NULL;
0108 }
0109 
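/*
 * Queue the current task for the given token. Returns false when the
 * "page ready" wakeup has already arrived (a dummy node with the same token
 * is found in the bucket), in which case the caller must not sleep.
 */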
0110 static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
0111 {
0112     u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
0113     struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
0114     struct kvm_task_sleep_node *e;
0115 
0116     raw_spin_lock(&b->lock);
0117     e = _find_apf_task(b, token);
0118     if (e) {
0119         /* dummy entry exists -> wake up was delivered ahead of PF */
0120         hlist_del(&e->link);
0121         raw_spin_unlock(&b->lock);
0122         kfree(e);
0123         return false;
0124     }
0125 
0126     n->token = token;
0127     n->cpu = smp_processor_id();
0128     init_swait_queue_head(&n->wq);
0129     hlist_add_head(&n->link, &b->list);
0130     raw_spin_unlock(&b->lock);
0131     return true;
0132 }
0133 
0134 /*
0135  * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
0136  * @token:  Token to identify the sleep node entry
0137  *
0138  * Invoked from the async pagefault handling code or from the VM exit page
0139  * fault handler. In both cases RCU is watching.
0140  */
0141 void kvm_async_pf_task_wait_schedule(u32 token)
0142 {
0143     struct kvm_task_sleep_node n;
0144     DECLARE_SWAITQUEUE(wait);
0145 
0146     lockdep_assert_irqs_disabled();
0147 
0148     if (!kvm_async_pf_queue_task(token, &n))
0149         return;
0150 
0151     for (;;) {
0152         prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
0153         if (hlist_unhashed(&n.link))
0154             break;
0155 
0156         local_irq_enable();
0157         schedule();
0158         local_irq_disable();
0159     }
0160     finish_swait(&n.wq, &wait);
0161 }
0162 EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
0163 
0164 static void apf_task_wake_one(struct kvm_task_sleep_node *n)
0165 {
0166     hlist_del_init(&n->link);
0167     if (swq_has_sleeper(&n->wq))
0168         swake_up_one(&n->wq);
0169 }
0170 
0171 static void apf_task_wake_all(void)
0172 {
0173     int i;
0174 
0175     for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
0176         struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
0177         struct kvm_task_sleep_node *n;
0178         struct hlist_node *p, *next;
0179 
0180         raw_spin_lock(&b->lock);
0181         hlist_for_each_safe(p, next, &b->list) {
0182             n = hlist_entry(p, typeof(*n), link);
0183             if (n->cpu == smp_processor_id())
0184                 apf_task_wake_one(n);
0185         }
0186         raw_spin_unlock(&b->lock);
0187     }
0188 }
0189 
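/*
 * Wake the task sleeping on @token. A token of ~0 is a broadcast from the
 * host and wakes every sleeper belonging to this CPU. If no sleeper is found
 * the #PF has not been handled on the guest side yet, so a dummy node is
 * queued; kvm_async_pf_queue_task() will find it and skip the sleep.
 */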
0190 void kvm_async_pf_task_wake(u32 token)
0191 {
0192     u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
0193     struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
0194     struct kvm_task_sleep_node *n, *dummy = NULL;
0195 
0196     if (token == ~0) {
0197         apf_task_wake_all();
0198         return;
0199     }
0200 
0201 again:
0202     raw_spin_lock(&b->lock);
0203     n = _find_apf_task(b, token);
0204     if (!n) {
0205         /*
0206          * Async #PF not yet handled, add a dummy entry for the token.
0207          * Allocating the token must be done outside of the raw lock
0208          * as the allocator is preemptible on PREEMPT_RT kernels.
0209          */
0210         if (!dummy) {
0211             raw_spin_unlock(&b->lock);
0212             dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC);
0213 
0214             /*
0215              * Continue looping on allocation failure, eventually
0216              * the async #PF will be handled and allocating a new
0217              * node will be unnecessary.
0218              */
0219             if (!dummy)
0220                 cpu_relax();
0221 
0222             /*
0223              * Recheck for async #PF completion before enqueueing
0224              * the dummy token to avoid duplicate list entries.
0225              */
0226             goto again;
0227         }
0228         dummy->token = token;
0229         dummy->cpu = smp_processor_id();
0230         init_swait_queue_head(&dummy->wq);
0231         hlist_add_head(&dummy->link, &b->list);
0232         dummy = NULL;
0233     } else {
0234         apf_task_wake_one(n);
0235     }
0236     raw_spin_unlock(&b->lock);
0237 
0238     /* A dummy token might be allocated and ultimately not used.  */
0239     kfree(dummy);
0240 }
0241 EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
0242 
0243 noinstr u32 kvm_read_and_reset_apf_flags(void)
0244 {
0245     u32 flags = 0;
0246 
0247     if (__this_cpu_read(apf_reason.enabled)) {
0248         flags = __this_cpu_read(apf_reason.flags);
0249         __this_cpu_write(apf_reason.flags, 0);
0250     }
0251 
0252     return flags;
0253 }
0254 EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
0255 
0256 noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
0257 {
0258     u32 flags = kvm_read_and_reset_apf_flags();
0259     irqentry_state_t state;
0260 
0261     if (!flags)
0262         return false;
0263 
0264     state = irqentry_enter(regs);
0265     instrumentation_begin();
0266 
0267     /*
0268      * If the host managed to inject an async #PF into an interrupt
0269      * disabled region, then die hard as this is not going to end well
0270      * and the host side is seriously broken.
0271      */
0272     if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
0273         panic("Host injected async #PF in interrupt disabled region\n");
0274 
0275     if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
0276         if (unlikely(!(user_mode(regs))))
0277             panic("Host injected async #PF in kernel mode\n");
0278         /* Page is swapped out by the host. */
0279         kvm_async_pf_task_wait_schedule(token);
0280     } else {
0281         WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags);
0282     }
0283 
0284     instrumentation_end();
0285     irqentry_exit(regs, state);
0286     return true;
0287 }
0288 
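/*
 * "Page ready" notifications are delivered as an interrupt on
 * HYPERVISOR_CALLBACK_VECTOR (programmed into MSR_KVM_ASYNC_PF_INT in
 * kvm_guest_cpu_init() below). The handler reads the token from the shared
 * apf_reason area, wakes the matching sleeper and acks the event by writing
 * MSR_KVM_ASYNC_PF_ACK so the host may deliver the next one.
 */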
0289 DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
0290 {
0291     struct pt_regs *old_regs = set_irq_regs(regs);
0292     u32 token;
0293 
0294     ack_APIC_irq();
0295 
0296     inc_irq_stat(irq_hv_callback_count);
0297 
0298     if (__this_cpu_read(apf_reason.enabled)) {
0299         token = __this_cpu_read(apf_reason.token);
0300         kvm_async_pf_task_wake(token);
0301         __this_cpu_write(apf_reason.token, 0);
0302         wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
0303     }
0304 
0305     set_irq_regs(old_regs);
0306 }
0307 
0308 static void __init paravirt_ops_setup(void)
0309 {
0310     pv_info.name = "KVM";
0311 
0312     if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
0313         pv_ops.cpu.io_delay = kvm_io_delay;
0314 
0315 #ifdef CONFIG_X86_IO_APIC
0316     no_timer_check = 1;
0317 #endif
0318 }
0319 
0320 static void kvm_register_steal_time(void)
0321 {
0322     int cpu = smp_processor_id();
0323     struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
0324 
0325     if (!has_steal_clock)
0326         return;
0327 
0328     wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
0329     pr_debug("stealtime: cpu %d, msr %llx\n", cpu,
0330         (unsigned long long) slow_virt_to_phys(st));
0331 }
0332 
0333 static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
0334 
0335 static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
0336 {
0337     /*
0338      * This relies on __test_and_clear_bit to modify the memory
0339      * in a way that is atomic with respect to the local CPU.
0340      * The hypervisor only accesses this memory from the local CPU so
0341      * there's no need for lock or memory barriers.
0342      * An optimization barrier is implied in apic write.
0343      */
0344     if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
0345         return;
0346     apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
0347 }
0348 
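/*
 * Per-CPU guest side setup: enable async #PF delivery (the interrupt vector
 * first, then the enable MSR with the physical address of apf_reason),
 * register the PV EOI word and the steal time area with the hypervisor.
 * Called on each CPU as it comes online.
 */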
0349 static void kvm_guest_cpu_init(void)
0350 {
0351     if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
0352         u64 pa;
0353 
0354         WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
0355 
0356         pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
0357         pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
0358 
0359         if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
0360             pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
0361 
0362         wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
0363 
0364         wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
0365         __this_cpu_write(apf_reason.enabled, 1);
0366         pr_debug("setup async PF for cpu %d\n", smp_processor_id());
0367     }
0368 
0369     if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
0370         unsigned long pa;
0371 
0372         /* Size alignment is implied but just to make it explicit. */
0373         BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
0374         __this_cpu_write(kvm_apic_eoi, 0);
0375         pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
0376             | KVM_MSR_ENABLED;
0377         wrmsrl(MSR_KVM_PV_EOI_EN, pa);
0378     }
0379 
0380     if (has_steal_clock)
0381         kvm_register_steal_time();
0382 }
0383 
0384 static void kvm_pv_disable_apf(void)
0385 {
0386     if (!__this_cpu_read(apf_reason.enabled))
0387         return;
0388 
0389     wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
0390     __this_cpu_write(apf_reason.enabled, 0);
0391 
0392     pr_debug("disable async PF for cpu %d\n", smp_processor_id());
0393 }
0394 
0395 static void kvm_disable_steal_time(void)
0396 {
0397     if (!has_steal_clock)
0398         return;
0399 
0400     wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
0401 }
0402 
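/*
 * Read the steal time published by the host in the shared kvm_steal_time
 * structure. The host bumps ->version before and after an update, so an odd
 * value means an update is in progress; re-read until a stable, even version
 * is observed (a seqcount-style retry loop).
 */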
0403 static u64 kvm_steal_clock(int cpu)
0404 {
0405     u64 steal;
0406     struct kvm_steal_time *src;
0407     int version;
0408 
0409     src = &per_cpu(steal_time, cpu);
0410     do {
0411         version = src->version;
0412         virt_rmb();
0413         steal = src->steal;
0414         virt_rmb();
0415     } while ((version & 1) || (version != src->version));
0416 
0417     return steal;
0418 }
0419 
0420 static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
0421 {
0422     early_set_memory_decrypted((unsigned long) ptr, size);
0423 }
0424 
0425 /*
0426  * Iterate through all possible CPUs and map the memory regions pointed
0427  * to by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
0428  *
0429  * Note: we iterate through all possible CPUs to ensure that hotplugged
0430  * CPUs will have their per-cpu variables already mapped as
0431  * decrypted.
0432  */
0433 static void __init sev_map_percpu_data(void)
0434 {
0435     int cpu;
0436 
0437     if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
0438         return;
0439 
0440     for_each_possible_cpu(cpu) {
0441         __set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
0442         __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
0443         __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
0444     }
0445 }
0446 
0447 static void kvm_guest_cpu_offline(bool shutdown)
0448 {
0449     kvm_disable_steal_time();
0450     if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
0451         wrmsrl(MSR_KVM_PV_EOI_EN, 0);
0452     if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
0453         wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0);
0454     kvm_pv_disable_apf();
0455     if (!shutdown)
0456         apf_task_wake_all();
0457     kvmclock_disable();
0458 }
0459 
0460 static int kvm_cpu_online(unsigned int cpu)
0461 {
0462     unsigned long flags;
0463 
0464     local_irq_save(flags);
0465     kvm_guest_cpu_init();
0466     local_irq_restore(flags);
0467     return 0;
0468 }
0469 
0470 #ifdef CONFIG_SMP
0471 
0472 static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
0473 
0474 static bool pv_tlb_flush_supported(void)
0475 {
0476     return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
0477         !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
0478         kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
0479         !boot_cpu_has(X86_FEATURE_MWAIT) &&
0480         (num_possible_cpus() != 1));
0481 }
0482 
0483 static bool pv_ipi_supported(void)
0484 {
0485     return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) &&
0486            (num_possible_cpus() != 1));
0487 }
0488 
0489 static bool pv_sched_yield_supported(void)
0490 {
0491     return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
0492         !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
0493         kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
0494         !boot_cpu_has(X86_FEATURE_MWAIT) &&
0495         (num_possible_cpus() != 1));
0496 }
0497 
0498 #define KVM_IPI_CLUSTER_SIZE    (2 * BITS_PER_LONG)
0499 
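/*
 * Send an IPI to every CPU in @mask with as few hypercalls as possible.
 * KVM_HC_SEND_IPI takes a bitmap two longs wide (128 bits on x86-64) of
 * APIC IDs, relative to the lowest APIC ID seen so far in the batch (min).
 * Destinations that do not fit into the current cluster are flushed with a
 * hypercall and a new batch is started. For example, on x86-64 APIC IDs
 * 3, 10 and 70 fit into one bitmap based at min = 3, while APIC ID 200
 * would trigger a flush and start a new batch.
 */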
0500 static void __send_ipi_mask(const struct cpumask *mask, int vector)
0501 {
0502     unsigned long flags;
0503     int cpu, apic_id, icr;
0504     int min = 0, max = 0;
0505 #ifdef CONFIG_X86_64
0506     __uint128_t ipi_bitmap = 0;
0507 #else
0508     u64 ipi_bitmap = 0;
0509 #endif
0510     long ret;
0511 
0512     if (cpumask_empty(mask))
0513         return;
0514 
0515     local_irq_save(flags);
0516 
0517     switch (vector) {
0518     default:
0519         icr = APIC_DM_FIXED | vector;
0520         break;
0521     case NMI_VECTOR:
0522         icr = APIC_DM_NMI;
0523         break;
0524     }
0525 
0526     for_each_cpu(cpu, mask) {
0527         apic_id = per_cpu(x86_cpu_to_apicid, cpu);
0528         if (!ipi_bitmap) {
0529             min = max = apic_id;
0530         } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
0531             ipi_bitmap <<= min - apic_id;
0532             min = apic_id;
0533         } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
0534             max = apic_id < max ? max : apic_id;
0535         } else {
0536             ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
0537                 (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
0538             WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
0539                   ret);
0540             min = max = apic_id;
0541             ipi_bitmap = 0;
0542         }
0543         __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
0544     }
0545 
0546     if (ipi_bitmap) {
0547         ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
0548             (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
0549         WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
0550               ret);
0551     }
0552 
0553     local_irq_restore(flags);
0554 }
0555 
0556 static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
0557 {
0558     __send_ipi_mask(mask, vector);
0559 }
0560 
0561 static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
0562 {
0563     unsigned int this_cpu = smp_processor_id();
0564     struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
0565     const struct cpumask *local_mask;
0566 
0567     cpumask_copy(new_mask, mask);
0568     cpumask_clear_cpu(this_cpu, new_mask);
0569     local_mask = new_mask;
0570     __send_ipi_mask(local_mask, vector);
0571 }
0572 
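/*
 * For SEV guests booted via UEFI, live migration is only announced to the
 * host after the firmware has opted in: read the "SevLiveMigrationEnabled"
 * EFI variable and, if it is set, write KVM_MIGRATION_READY to
 * MSR_KVM_MIGRATION_CONTROL.
 */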
0573 static int __init setup_efi_kvm_sev_migration(void)
0574 {
0575     efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled";
0576     efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID;
0577     efi_status_t status;
0578     unsigned long size;
0579     bool enabled;
0580 
0581     if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) ||
0582         !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
0583         return 0;
0584 
0585     if (!efi_enabled(EFI_BOOT))
0586         return 0;
0587 
0588     if (!efi_enabled(EFI_RUNTIME_SERVICES)) {
0589         pr_info("%s : EFI runtime services are not enabled\n", __func__);
0590         return 0;
0591     }
0592 
0593     size = sizeof(enabled);
0594 
0595     /* Get variable contents into buffer */
0596     status = efi.get_variable(efi_sev_live_migration_enabled,
0597                   &efi_variable_guid, NULL, &size, &enabled);
0598 
0599     if (status == EFI_NOT_FOUND) {
0600         pr_info("%s : EFI live migration variable not found\n", __func__);
0601         return 0;
0602     }
0603 
0604     if (status != EFI_SUCCESS) {
0605         pr_info("%s : EFI variable retrieval failed\n", __func__);
0606         return 0;
0607     }
0608 
0609     if (enabled == 0) {
0610         pr_info("%s: live migration disabled in EFI\n", __func__);
0611         return 0;
0612     }
0613 
0614     pr_info("%s : live migration enabled in EFI\n", __func__);
0615     wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY);
0616 
0617     return 1;
0618 }
0619 
0620 late_initcall(setup_efi_kvm_sev_migration);
0621 
0622 /*
0623  * Set the IPI entry points
0624  */
0625 static void kvm_setup_pv_ipi(void)
0626 {
0627     apic->send_IPI_mask = kvm_send_ipi_mask;
0628     apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself;
0629     pr_info("setup PV IPIs\n");
0630 }
0631 
0632 static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
0633 {
0634     int cpu;
0635 
0636     native_send_call_func_ipi(mask);
0637 
0638     /* Make sure other vCPUs get a chance to run if they need to. */
0639     for_each_cpu(cpu, mask) {
0640         if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) {
0641             kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
0642             break;
0643         }
0644     }
0645 }
0646 
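/*
 * Paravirtual remote TLB flush: instead of IPIing vCPUs that the host has
 * preempted, set KVM_VCPU_FLUSH_TLB in their shared steal_time.preempted
 * byte (with try_cmpxchg, so it only sticks while the vCPU is still
 * preempted) and drop them from the flush mask; the host then flushes their
 * TLB before running them again. Only the remaining, running vCPUs get a
 * real flush IPI via native_flush_tlb_multi().
 */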
0647 static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
0648             const struct flush_tlb_info *info)
0649 {
0650     u8 state;
0651     int cpu;
0652     struct kvm_steal_time *src;
0653     struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
0654 
0655     cpumask_copy(flushmask, cpumask);
0656     /*
0657      * We have to call flush only on online vCPUs, and queue
0658      * flush_on_enter for preempted vCPUs.
0659      */
0660     for_each_cpu(cpu, flushmask) {
0661         /*
0662          * The local vCPU is never preempted, so we do not explicitly
0663          * skip check for local vCPU - it will never be cleared from
0664          * flushmask.
0665          */
0666         src = &per_cpu(steal_time, cpu);
0667         state = READ_ONCE(src->preempted);
0668         if ((state & KVM_VCPU_PREEMPTED)) {
0669             if (try_cmpxchg(&src->preempted, &state,
0670                     state | KVM_VCPU_FLUSH_TLB))
0671                 __cpumask_clear_cpu(cpu, flushmask);
0672         }
0673     }
0674 
0675     native_flush_tlb_multi(flushmask, info);
0676 }
0677 
0678 static __init int kvm_alloc_cpumask(void)
0679 {
0680     int cpu;
0681 
0682     if (!kvm_para_available() || nopv)
0683         return 0;
0684 
0685     if (pv_tlb_flush_supported() || pv_ipi_supported())
0686         for_each_possible_cpu(cpu) {
0687             zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
0688                 GFP_KERNEL, cpu_to_node(cpu));
0689         }
0690 
0691     return 0;
0692 }
0693 arch_initcall(kvm_alloc_cpumask);
0694 
0695 static void __init kvm_smp_prepare_boot_cpu(void)
0696 {
0697     /*
0698      * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
0699      * shares the guest physical address with the hypervisor.
0700      */
0701     sev_map_percpu_data();
0702 
0703     kvm_guest_cpu_init();
0704     native_smp_prepare_boot_cpu();
0705     kvm_spinlock_init();
0706 }
0707 
0708 static int kvm_cpu_down_prepare(unsigned int cpu)
0709 {
0710     unsigned long flags;
0711 
0712     local_irq_save(flags);
0713     kvm_guest_cpu_offline(false);
0714     local_irq_restore(flags);
0715     return 0;
0716 }
0717 
0718 #endif
0719 
0720 static int kvm_suspend(void)
0721 {
0722     u64 val = 0;
0723 
0724     kvm_guest_cpu_offline(false);
0725 
0726 #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
0727     if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
0728         rdmsrl(MSR_KVM_POLL_CONTROL, val);
0729     has_guest_poll = !(val & 1);
0730 #endif
0731     return 0;
0732 }
0733 
0734 static void kvm_resume(void)
0735 {
0736     kvm_cpu_online(raw_smp_processor_id());
0737 
0738 #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
0739     if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
0740         wrmsrl(MSR_KVM_POLL_CONTROL, 0);
0741 #endif
0742 }
0743 
0744 static struct syscore_ops kvm_syscore_ops = {
0745     .suspend    = kvm_suspend,
0746     .resume     = kvm_resume,
0747 };
0748 
0749 static void kvm_pv_guest_cpu_reboot(void *unused)
0750 {
0751     kvm_guest_cpu_offline(true);
0752 }
0753 
0754 static int kvm_pv_reboot_notify(struct notifier_block *nb,
0755                 unsigned long code, void *unused)
0756 {
0757     if (code == SYS_RESTART)
0758         on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
0759     return NOTIFY_DONE;
0760 }
0761 
0762 static struct notifier_block kvm_pv_reboot_nb = {
0763     .notifier_call = kvm_pv_reboot_notify,
0764 };
0765 
0766 /*
0767  * After a PV feature is registered, the host will keep writing to the
0768  * registered memory location. If the guest happens to shut down, this memory
0769  * won't be valid. In cases like kexec, in which you install a new kernel, this
0770  * means a random memory location will keep being written to.
0771  */
0772 #ifdef CONFIG_KEXEC_CORE
0773 static void kvm_crash_shutdown(struct pt_regs *regs)
0774 {
0775     kvm_guest_cpu_offline(true);
0776     native_machine_crash_shutdown(regs);
0777 }
0778 #endif
0779 
0780 #if defined(CONFIG_X86_32) || !defined(CONFIG_SMP)
0781 bool __kvm_vcpu_is_preempted(long cpu);
0782 
0783 __visible bool __kvm_vcpu_is_preempted(long cpu)
0784 {
0785     struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
0786 
0787     return !!(src->preempted & KVM_VCPU_PREEMPTED);
0788 }
0789 PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
0790 
0791 #else
0792 
0793 #include <asm/asm-offsets.h>
0794 
0795 extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
0796 
0797 /*
0798  * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
0799  * restoring to/from the stack.
0800  */
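/*
 * The thunk below is the assembly equivalent of __kvm_vcpu_is_preempted():
 * it finds the target CPU's per-CPU area via __per_cpu_offset(,%rdi,8),
 * tests the KVM_STEAL_TIME_preempted byte of its steal_time entry and
 * returns the result in %al without touching any other register, which is
 * what the PV_CALLEE_SAVE calling convention relies on.
 */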
0801 asm(
0802 ".pushsection .text;"
0803 ".global __raw_callee_save___kvm_vcpu_is_preempted;"
0804 ".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
0805 "__raw_callee_save___kvm_vcpu_is_preempted:"
0806 ASM_ENDBR
0807 "movq   __per_cpu_offset(,%rdi,8), %rax;"
0808 "cmpb   $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
0809 "setne  %al;"
0810 ASM_RET
0811 ".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
0812 ".popsection");
0813 
0814 #endif
0815 
0816 static void __init kvm_guest_init(void)
0817 {
0818     int i;
0819 
0820     paravirt_ops_setup();
0821     register_reboot_notifier(&kvm_pv_reboot_nb);
0822     for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
0823         raw_spin_lock_init(&async_pf_sleepers[i].lock);
0824 
0825     if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
0826         has_steal_clock = 1;
0827         static_call_update(pv_steal_clock, kvm_steal_clock);
0828 
0829         pv_ops.lock.vcpu_is_preempted =
0830             PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
0831     }
0832 
0833     if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
0834         apic_set_eoi_write(kvm_guest_apic_eoi_write);
0835 
0836     if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
0837         static_branch_enable(&kvm_async_pf_enabled);
0838         alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt);
0839     }
0840 
0841 #ifdef CONFIG_SMP
0842     if (pv_tlb_flush_supported()) {
0843         pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
0844         pv_ops.mmu.tlb_remove_table = tlb_remove_table;
0845         pr_info("KVM setup pv remote TLB flush\n");
0846     }
0847 
0848     smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
0849     if (pv_sched_yield_supported()) {
0850         smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
0851         pr_info("setup PV sched yield\n");
0852     }
0853     if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
0854                       kvm_cpu_online, kvm_cpu_down_prepare) < 0)
0855         pr_err("failed to install cpu hotplug callbacks\n");
0856 #else
0857     sev_map_percpu_data();
0858     kvm_guest_cpu_init();
0859 #endif
0860 
0861 #ifdef CONFIG_KEXEC_CORE
0862     machine_ops.crash_shutdown = kvm_crash_shutdown;
0863 #endif
0864 
0865     register_syscore_ops(&kvm_syscore_ops);
0866 
0867     /*
0868      * Hard lockup detection is enabled by default. Disable it, as guests
0869      * can get false positives too easily, for example if the host is
0870      * overcommitted.
0871      */
0872     hardlockup_detector_disable();
0873 }
0874 
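/*
 * KVM advertises itself through the hypervisor CPUID range starting at
 * 0x40000000: the leaf that returns KVM_SIGNATURE ("KVMKVMKVM\0\0\0") is the
 * base, and the feature/hint bits are read from base + KVM_CPUID_FEATURES
 * (see kvm_arch_para_features() and kvm_arch_para_hints() below). The base
 * is probed once and cached.
 */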
0875 static noinline uint32_t __kvm_cpuid_base(void)
0876 {
0877     if (boot_cpu_data.cpuid_level < 0)
0878         return 0;   /* So we don't blow up on old processors */
0879 
0880     if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
0881         return hypervisor_cpuid_base(KVM_SIGNATURE, 0);
0882 
0883     return 0;
0884 }
0885 
0886 static inline uint32_t kvm_cpuid_base(void)
0887 {
0888     static int kvm_cpuid_base = -1;
0889 
0890     if (kvm_cpuid_base == -1)
0891         kvm_cpuid_base = __kvm_cpuid_base();
0892 
0893     return kvm_cpuid_base;
0894 }
0895 
0896 bool kvm_para_available(void)
0897 {
0898     return kvm_cpuid_base() != 0;
0899 }
0900 EXPORT_SYMBOL_GPL(kvm_para_available);
0901 
0902 unsigned int kvm_arch_para_features(void)
0903 {
0904     return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
0905 }
0906 
0907 unsigned int kvm_arch_para_hints(void)
0908 {
0909     return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
0910 }
0911 EXPORT_SYMBOL_GPL(kvm_arch_para_hints);
0912 
0913 static uint32_t __init kvm_detect(void)
0914 {
0915     return kvm_cpuid_base();
0916 }
0917 
0918 static void __init kvm_apic_init(void)
0919 {
0920 #ifdef CONFIG_SMP
0921     if (pv_ipi_supported())
0922         kvm_setup_pv_ipi();
0923 #endif
0924 }
0925 
0926 static bool __init kvm_msi_ext_dest_id(void)
0927 {
0928     return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID);
0929 }
0930 
0931 static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc)
0932 {
0933     kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages,
0934                KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
0935 }
0936 
0937 static void __init kvm_init_platform(void)
0938 {
0939     if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
0940         kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
0941         unsigned long nr_pages;
0942         int i;
0943 
0944         pv_ops.mmu.notify_page_enc_status_changed =
0945             kvm_sev_hc_page_enc_status;
0946 
0947         /*
0948          * Reset the host's shared pages list related to kernel
0949          * specific page encryption status settings before we load a
0950          * new kernel by kexec. Reset the page encryption status
0951          * during early boot instead of just before kexec to avoid SMP
0952          * races during kvm_pv_guest_cpu_reboot().
0953          * NOTE: We cannot reset the complete shared pages list
0954          * here as we need to retain the UEFI/OVMF firmware
0955          * specific settings.
0956          */
0957 
0958         for (i = 0; i < e820_table->nr_entries; i++) {
0959             struct e820_entry *entry = &e820_table->entries[i];
0960 
0961             if (entry->type != E820_TYPE_RAM)
0962                 continue;
0963 
0964             nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);
0965 
0966             kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr,
0967                        nr_pages,
0968                        KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
0969         }
0970 
0971         /*
0972          * Ensure that _bss_decrypted section is marked as decrypted in the
0973          * shared pages list.
0974          */
0975         nr_pages = DIV_ROUND_UP(__end_bss_decrypted - __start_bss_decrypted,
0976                     PAGE_SIZE);
0977         early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted,
0978                         nr_pages, 0);
0979 
0980         /*
0981          * If not booted using EFI, enable Live migration support.
0982          */
0983         if (!efi_enabled(EFI_BOOT))
0984             wrmsrl(MSR_KVM_MIGRATION_CONTROL,
0985                    KVM_MIGRATION_READY);
0986     }
0987     kvmclock_init();
0988     x86_platform.apic_post_init = kvm_apic_init;
0989 }
0990 
0991 #if defined(CONFIG_AMD_MEM_ENCRYPT)
0992 static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
0993 {
0994     /* RAX and CPL are already in the GHCB */
0995     ghcb_set_rbx(ghcb, regs->bx);
0996     ghcb_set_rcx(ghcb, regs->cx);
0997     ghcb_set_rdx(ghcb, regs->dx);
0998     ghcb_set_rsi(ghcb, regs->si);
0999 }
1000 
1001 static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
1002 {
1003     /* No checking of the return state needed */
1004     return true;
1005 }
1006 #endif
1007 
1008 const __initconst struct hypervisor_x86 x86_hyper_kvm = {
1009     .name               = "KVM",
1010     .detect             = kvm_detect,
1011     .type               = X86_HYPER_KVM,
1012     .init.guest_late_init       = kvm_guest_init,
1013     .init.x2apic_available      = kvm_para_available,
1014     .init.msi_ext_dest_id       = kvm_msi_ext_dest_id,
1015     .init.init_platform     = kvm_init_platform,
1016 #if defined(CONFIG_AMD_MEM_ENCRYPT)
1017     .runtime.sev_es_hcall_prepare   = kvm_sev_es_hcall_prepare,
1018     .runtime.sev_es_hcall_finish    = kvm_sev_es_hcall_finish,
1019 #endif
1020 };
1021 
1022 static __init int activate_jump_labels(void)
1023 {
1024     if (has_steal_clock) {
1025         static_key_slow_inc(&paravirt_steal_enabled);
1026         if (steal_acc)
1027             static_key_slow_inc(&paravirt_steal_rq_enabled);
1028     }
1029 
1030     return 0;
1031 }
1032 arch_initcall(activate_jump_labels);
1033 
1034 #ifdef CONFIG_PARAVIRT_SPINLOCKS
1035 
1036 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
1037 static void kvm_kick_cpu(int cpu)
1038 {
1039     int apicid;
1040     unsigned long flags = 0;
1041 
1042     apicid = per_cpu(x86_cpu_to_apicid, cpu);
1043     kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
1044 }
1045 
1046 #include <asm/qspinlock.h>
1047 
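/*
 * pv_ops.lock.wait implementation: a vCPU that loses the race for a
 * contended queued spinlock halts here until the lock holder kicks it via
 * kvm_kick_cpu() -> KVM_HC_KICK_CPU, instead of burning cycles spinning.
 * *ptr is re-checked right before halting so a kick that already happened
 * is not lost.
 */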
1048 static void kvm_wait(u8 *ptr, u8 val)
1049 {
1050     if (in_nmi())
1051         return;
1052 
1053     /*
1054      * Halt until it's our turn and we are kicked. We do a safe halt in the
1055      * irq-enabled case to avoid a hang when the lock info is overwritten in
1056      * the irq spinlock slowpath and no spurious interrupt occurs to save us.
1057      */
1058     if (irqs_disabled()) {
1059         if (READ_ONCE(*ptr) == val)
1060             halt();
1061     } else {
1062         local_irq_disable();
1063 
1064         /* safe_halt() will enable IRQ */
1065         if (READ_ONCE(*ptr) == val)
1066             safe_halt();
1067         else
1068             local_irq_enable();
1069     }
1070 }
1071 
1072 /*
1073  * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
1074  */
1075 void __init kvm_spinlock_init(void)
1076 {
1077     /*
1078      * In case the host doesn't support KVM_FEATURE_PV_UNHALT there is still an
1079      * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
1080      * preferred over native qspinlock when vCPU is preempted.
1081      */
1082     if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
1083         pr_info("PV spinlocks disabled, no host support\n");
1084         return;
1085     }
1086 
1087     /*
1088      * Disable PV spinlocks and use native qspinlock when dedicated pCPUs
1089      * are available.
1090      */
1091     if (kvm_para_has_hint(KVM_HINTS_REALTIME)) {
1092         pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
1093         goto out;
1094     }
1095 
1096     if (num_possible_cpus() == 1) {
1097         pr_info("PV spinlocks disabled, single CPU\n");
1098         goto out;
1099     }
1100 
1101     if (nopvspin) {
1102         pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
1103         goto out;
1104     }
1105 
1106     pr_info("PV spinlocks enabled\n");
1107 
1108     __pv_init_lock_hash();
1109     pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
1110     pv_ops.lock.queued_spin_unlock =
1111         PV_CALLEE_SAVE(__pv_queued_spin_unlock);
1112     pv_ops.lock.wait = kvm_wait;
1113     pv_ops.lock.kick = kvm_kick_cpu;
1114 
1115     /*
1116      * When PV spinlocks are enabled they are preferred over
1117      * virt_spin_lock(), so virt_spin_lock_key's value is meaningless.
1118      * Just disable it anyway.
1119      */
1120 out:
1121     static_branch_disable(&virt_spin_lock_key);
1122 }
1123 
1124 #endif  /* CONFIG_PARAVIRT_SPINLOCKS */
1125 
1126 #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
1127 
1128 static void kvm_disable_host_haltpoll(void *i)
1129 {
1130     wrmsrl(MSR_KVM_POLL_CONTROL, 0);
1131 }
1132 
1133 static void kvm_enable_host_haltpoll(void *i)
1134 {
1135     wrmsrl(MSR_KVM_POLL_CONTROL, 1);
1136 }
1137 
1138 void arch_haltpoll_enable(unsigned int cpu)
1139 {
1140     if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
1141         pr_err_once("host does not support poll control\n");
1142         pr_err_once("host upgrade recommended\n");
1143         return;
1144     }
1145 
1146     /* Enabling guest halt polling disables host halt polling */
1147     smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1);
1148 }
1149 EXPORT_SYMBOL_GPL(arch_haltpoll_enable);
1150 
1151 void arch_haltpoll_disable(unsigned int cpu)
1152 {
1153     if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
1154         return;
1155 
1156     /* Disabling guest halt polling re-enables host halt polling */
1157     smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
1158 }
1159 EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
1160 #endif