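/*
 * KVM x86 PMU emulation: common vPMU logic shared by the Intel and AMD
 * implementations.  Vendor-specific behavior is reached through the
 * kvm_pmu_ops static calls defined below.
 */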
#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <asm/perf_event.h>
#include <asm/cpu_device_id.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"
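/*
 * Upper bound on the number of events in a single KVM_SET_PMU_EVENT_FILTER
 * filter; large enough to cover the vast majority of currently defined
 * architectural events.
 */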
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300

struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_GPL(kvm_pmu_cap);
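/*
 * Ice Lake server parts are singled out in pmc_reprogram_counter(): their
 * PEBS events on fixed counter 0 are created with maximum precision
 * (precise_ip == 3).
 */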
static const struct x86_cpu_id vmx_icl_pebs_cpu[] = {
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
	{}
};
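/*
 * Each virtual counter is a "struct kvm_pmc".  Counters come in two flavors:
 * general purpose (gp_counters[]) and fixed (fixed_counters[]), both part of
 * "struct kvm_pmu"; AMD implements only gp counters.  A counter can be
 * referenced by its MSR address, by the index passed to RDPMC, or by a
 * global "pmc->idx" in which gp counters come first and fixed counters start
 * at INTEL_PMC_IDX_FIXED.  The kvm_x86_pmu_*_to_pmc() ops translate between
 * these namespaces.
 */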
static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;

#define KVM_X86_PMU_OP(func)					     \
	DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func,		     \
				*(((struct kvm_pmu_ops *)0)->func));
#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>
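/*
 * Install the vendor's (Intel or AMD) kvm_pmu_ops and patch every op into
 * its static call site; mandatory ops are sanity-checked for presence.
 */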
void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
{
	memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));

#define __KVM_X86_PMU_OP(func) \
	static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
#define KVM_X86_PMU_OP(func) \
	WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>
#undef __KVM_X86_PMU_OP
}

static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
{
	return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc);
}

static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);

	kvm_pmu_deliver_pmi(vcpu);
}

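/*
 * Record a counter overflow for the guest: set the relevant bit in the
 * guest's global status, queue KVM_REQ_PMU so the overflow is processed
 * before the next VM-Enter, and, if the counter has interrupts enabled,
 * arrange for a PMI to be injected.
 */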
static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	bool skip_pmi = false;

	/* Ignore counters that have already been reprogrammed. */
	if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
		return;

	if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
		/* Indicate PEBS overflow to the guest via GLOBAL_STATUS. */
		skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
					      (unsigned long *)&pmu->global_status);
	} else {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
	}
	kvm_make_request(KVM_REQ_PMU, pmc->vcpu);

	if (!pmc->intr || skip_pmi)
		return;

	/*
	 * Inject the PMI.  If the overflow NMI did not interrupt the guest,
	 * the vCPU may be blocked and can't safely be kicked from NMI
	 * context, so punt delivery to irq_work.  Otherwise simply request
	 * a PMI, which will be injected before the next VM-Enter.
	 */
	if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
		irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
	else
		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}

static void kvm_perf_overflow(struct perf_event *perf_event,
			      struct perf_sample_data *data,
			      struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;

	__kvm_perf_overflow(pmc, true);
}

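/*
 * Create (or re-create) the host perf_event that backs a virtual counter,
 * translating the guest's event selector and exclude bits into a
 * perf_event_attr.  On success the counter is live and unpaused.
 */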
static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
				  u64 config, bool exclude_user,
				  bool exclude_kernel, bool intr)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	struct perf_event *event;
	struct perf_event_attr attr = {
		.type = type,
		.size = sizeof(attr),
		.pinned = true,
		.exclude_idle = true,
		.exclude_host = 1,
		.exclude_user = exclude_user,
		.exclude_kernel = exclude_kernel,
		.config = config,
	};
	bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);

	attr.sample_period = get_sample_period(pmc, pmc->counter);

	if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
	    guest_cpuid_is_intel(pmc->vcpu)) {
		/*
		 * HSW_IN_TX_CHECKPOINTED is not supported with a non-zero
		 * sample period; clear the period so that creating the
		 * counter at least succeeds.
		 */
		attr.sample_period = 0;
	}
	if (pebs) {
		/*
		 * A non-zero precision level turns the event into a guest
		 * PEBS event so that the host PEBS PMI handler can tell
		 * whether an overflow belongs to the host or the guest.
		 * On Ice Lake, fixed counter 0 additionally uses maximum
		 * precision to match the hardware's precise-distribution
		 * (PDIR) behavior.
		 */
		attr.precise_ip = 1;
		if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32)
			attr.precise_ip = 3;
	}

	event = perf_event_create_kernel_counter(&attr, -1, current,
						 kvm_perf_overflow, pmc);
	if (IS_ERR(event)) {
		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
				     PTR_ERR(event), pmc->idx);
		return;
	}

	pmc->perf_event = event;
	pmc_to_pmu(pmc)->event_count++;
	clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
	pmc->is_paused = false;
	pmc->intr = intr || pebs;
}

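/*
 * Pause the backing perf_event and fold whatever it accumulated since the
 * last pause/reprogram into the virtual counter value.
 */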
static void pmc_pause_counter(struct kvm_pmc *pmc)
{
	u64 counter = pmc->counter;

	if (!pmc->perf_event || pmc->is_paused)
		return;

	/* Accumulate the count since the last pause and reset it in perf. */
	counter += perf_event_pause(pmc->perf_event, true);
	pmc->counter = counter & pmc_bitmask(pmc);
	pmc->is_paused = true;
}

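/*
 * Try to resume an existing perf_event instead of recreating it.  Returns
 * false if the event can't be reused, e.g. the perf core rejects the new
 * sample period or the counter's PEBS configuration has changed.
 */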
static bool pmc_resume_counter(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event)
		return false;

	/* Recalibrate the sample period and check that perf accepts it. */
	if (perf_event_period(pmc->perf_event,
			      get_sample_period(pmc, pmc->counter)))
		return false;

	/* Don't reuse a PEBS event if PEBS has since been disabled for this counter. */
	if (!test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) &&
	    pmc->perf_event->attr.precise_ip)
		return false;

	/* Reuse the existing perf_event. */
	perf_event_enable(pmc->perf_event);
	pmc->is_paused = false;

	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
	return true;
}

static int cmp_u64(const void *pa, const void *pb)
{
	u64 a = *(u64 *)pa;
	u64 b = *(u64 *)pb;

	return (a > b) - (a < b);
}

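/*
 * Returns true if programming the counter is permitted, i.e. the event is
 * supported by hardware and is not excluded by the VM's PMU event filter
 * (an allow-list or deny-list of raw event selectors and fixed counters
 * installed via KVM_SET_PMU_EVENT_FILTER).
 */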
static bool check_pmu_event_filter(struct kvm_pmc *pmc)
{
	struct kvm_pmu_event_filter *filter;
	struct kvm *kvm = pmc->vcpu->kvm;
	bool allow_event = true;
	__u64 key;
	int idx;

	if (!static_call(kvm_x86_pmu_hw_event_available)(pmc))
		return false;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (!filter)
		goto out;

	if (pmc_is_gp(pmc)) {
		key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB;
		if (bsearch(&key, filter->events, filter->nevents,
			    sizeof(__u64), cmp_u64))
			allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
		else
			allow_event = filter->action == KVM_PMU_EVENT_DENY;
	} else {
		idx = pmc->idx - INTEL_PMC_IDX_FIXED;
		if (filter->action == KVM_PMU_EVENT_DENY &&
		    test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
			allow_event = false;
		if (filter->action == KVM_PMU_EVENT_ALLOW &&
		    !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
			allow_event = false;
	}

out:
	return allow_event;
}

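/*
 * (Re)program a virtual counter after its configuration may have changed:
 * pause the backing event, bail if the counter is disabled or filtered out,
 * and either resume the existing perf_event or create a new one.
 */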
void reprogram_counter(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	u64 eventsel = pmc->eventsel;
	u64 new_config = eventsel;
	u8 fixed_ctr_ctrl;

	pmc_pause_counter(pmc);

	if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc))
		return;

	if (!check_pmu_event_filter(pmc))
		return;

	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
		printk_once("kvm pmu: pin control bit is ignored\n");

	if (pmc_is_fixed(pmc)) {
		/*
		 * Translate the fixed counter's control field into eventsel
		 * bits: bit 0 counts in ring 0, bit 1 counts in rings > 0,
		 * bit 3 enables the PMI (bit 2, AnyThread, is ignored).
		 */
		fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
						  pmc->idx - INTEL_PMC_IDX_FIXED);
		if (fixed_ctr_ctrl & 0x1)
			eventsel |= ARCH_PERFMON_EVENTSEL_OS;
		if (fixed_ctr_ctrl & 0x2)
			eventsel |= ARCH_PERFMON_EVENTSEL_USR;
		if (fixed_ctr_ctrl & 0x8)
			eventsel |= ARCH_PERFMON_EVENTSEL_INT;
		new_config = (u64)fixed_ctr_ctrl;
	}

	if (pmc->current_config == new_config && pmc_resume_counter(pmc))
		return;

	pmc_release_perf_event(pmc);

	pmc->current_config = new_config;
	pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
			      (eventsel & pmu->raw_event_mask),
			      !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
			      !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
			      eventsel & ARCH_PERFMON_EVENTSEL_INT);
}
EXPORT_SYMBOL_GPL(reprogram_counter);

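/*
 * Handle pending counter reprogramming (KVM_REQ_PMU), e.g. after a counter
 * overflow, and release unused perf_events if a cleanup was scheduled.
 */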
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	int bit;

	for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
		struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);

		if (unlikely(!pmc || !pmc->perf_event)) {
			clear_bit(bit, pmu->reprogram_pmi);
			continue;
		}
		reprogram_counter(pmc);
	}

	/*
	 * Unused perf_events are only released if the corresponding MSRs
	 * weren't accessed during the last vCPU time slice; kvm_arch_sched_in
	 * flags the need for cleanup.
	 */
	if (unlikely(pmu->need_cleanup))
		kvm_pmu_cleanup(vcpu);
}

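/* Check whether the RDPMC index (ECX) is valid for this vCPU's PMU. */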
bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
	return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx);
}

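/*
 * The VMware backdoor exposes pseudo-PMCs via RDPMC that report time values
 * rather than hardware events; they bypass the normal vPMU entirely.
 */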
bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
	switch (pmc_idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		return true;
	}
	return false;
}

static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	u64 ctr_val;

	switch (idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
		ctr_val = rdtsc();
		break;
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
		ctr_val = ktime_get_boottime_ns();
		break;
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		ctr_val = ktime_get_boottime_ns() +
			vcpu->kvm->arch.kvmclock_offset;
		break;
	default:
		return 1;
	}

	*data = ctr_val;
	return 0;
}

int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	bool fast_mode = idx & (1u << 31);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 mask = fast_mode ? ~0u : ~0ull;

	if (!pmu->version)
		return 1;

	if (is_vmware_backdoor_pmc(idx))
		return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

	pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
	if (!pmc)
		return 1;

	/* RDPMC at CPL > 0 in protected mode requires CR4.PCE = 1; fail (#GP) otherwise. */
	if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) &&
	    (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
	    (kvm_read_cr0(vcpu) & X86_CR0_PE))
		return 1;

	*data = pmc_read_counter(pmc) & mask;
	return 0;
}

void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu)) {
		static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu);
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
	}
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
	return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) ||
		static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr);
}

static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr);

	if (pmc)
		__set_bit(pmc->idx, pmu->pmc_in_use);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
	return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
}

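/*
 * Refresh the PMU configuration for the vCPU.  Called when the vCPU's
 * capabilities change, e.g. after the guest's CPUID is updated by userspace,
 * which should be rare.
 */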
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	static_call(kvm_x86_pmu_refresh)(vcpu);
}

void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	irq_work_sync(&pmu->irq_work);
	static_call(kvm_x86_pmu_reset)(vcpu);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	memset(pmu, 0, sizeof(*pmu));
	static_call(kvm_x86_pmu_init)(vcpu);
	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
	pmu->event_count = 0;
	pmu->need_cleanup = false;
	kvm_pmu_refresh(vcpu);
}

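/* Release perf_events for vPMCs that have been unused for a full time slice. */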
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = NULL;
	DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
	int i;

	pmu->need_cleanup = false;

	/* Counters that are valid but weren't touched during the last slice. */
	bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
		      pmu->pmc_in_use, X86_PMC_IDX_MAX);

	for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
		pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);

		if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
			pmc_stop_counter(pmc);
	}

	static_call_cond(kvm_x86_pmu_cleanup)(vcpu);

	bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}

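/*
 * Emulate one increment of a counter in software (used when KVM itself
 * emulates an instruction that the guest is counting) and raise an overflow
 * if the counter wraps.
 */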
static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
	u64 prev_count;

	prev_count = pmc->counter;
	pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);

	reprogram_counter(pmc);
	/* A decrease relative to the old value means the counter wrapped. */
	if (pmc->counter < prev_count)
		__kvm_perf_overflow(pmc, false);
}

static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
					     unsigned int perf_hw_id)
{
	return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) &
		AMD64_RAW_EVENT_MASK_NB);
}

static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
	bool select_os, select_user;
	u64 config = pmc->current_config;

	if (pmc_is_gp(pmc)) {
		select_os = config & ARCH_PERFMON_EVENTSEL_OS;
		select_user = config & ARCH_PERFMON_EVENTSEL_USR;
	} else {
		/* Fixed counter control: bit 0 counts ring 0, bit 1 counts rings > 0. */
		select_os = config & 0x1;
		select_user = config & 0x2;
	}

	return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
}

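/*
 * Increment every enabled counter that is programmed to count the given
 * generic perf hardware event at the current privilege level.  Used to
 * account events (e.g. instructions retired) that KVM emulates on the
 * guest's behalf.
 */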
void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
		pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);

		if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc))
			continue;

		/* Ignore checks for edge detect, pin control, invert and CMASK bits. */
		if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
			kvm_pmu_incr_counter(pmc);
	}
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);

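/*
 * KVM_SET_PMU_EVENT_FILTER: install (or replace) the per-VM PMU event
 * filter.  The event list is copied from userspace, sorted so that
 * check_pmu_event_filter() can binary-search it, and published under SRCU.
 */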
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
	struct kvm_pmu_event_filter tmp, *filter;
	size_t size;
	int r;

	if (copy_from_user(&tmp, argp, sizeof(tmp)))
		return -EFAULT;

	if (tmp.action != KVM_PMU_EVENT_ALLOW &&
	    tmp.action != KVM_PMU_EVENT_DENY)
		return -EINVAL;

	if (tmp.flags != 0)
		return -EINVAL;

	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
		return -E2BIG;

	size = struct_size(filter, events, tmp.nevents);
	filter = kmalloc(size, GFP_KERNEL_ACCOUNT);
	if (!filter)
		return -ENOMEM;

	r = -EFAULT;
	if (copy_from_user(filter, argp, size))
		goto cleanup;

	/* Ensure nevents can't be changed between the two copies from userspace. */
	*filter = tmp;

	/* Sort the in-kernel list so that check_pmu_event_filter() can use bsearch(). */
	sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);

	mutex_lock(&kvm->lock);
	filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
				     mutex_is_locked(&kvm->lock));
	mutex_unlock(&kvm->lock);

	synchronize_srcu_expedited(&kvm->srcu);
	r = 0;
cleanup:
	kfree(filter);
	return r;
}