0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
0004  * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
0005  *
0006  * KVM Xen emulation
0007  */
0008 
0009 #include "x86.h"
0010 #include "xen.h"
0011 #include "hyperv.h"
0012 #include "lapic.h"
0013 
0014 #include <linux/eventfd.h>
0015 #include <linux/kvm_host.h>
0016 #include <linux/sched/stat.h>
0017 
0018 #include <trace/events/kvm.h>
0019 #include <xen/interface/xen.h>
0020 #include <xen/interface/vcpu.h>
0021 #include <xen/interface/version.h>
0022 #include <xen/interface/event_channel.h>
0023 #include <xen/interface/sched.h>
0024 
0025 #include "trace.h"
0026 
0027 static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm);
0028 static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
0029 static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);
0030 
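     /*
      * A minimal clarifying note, assuming the usual static-key semantics:
      * this key is enabled from kvm_xen_hvm_config() while at least one VM
      * has a Xen hypercall MSR configured, and the deferred (HZ) variant
      * rate-limits the disable path so the static branch isn't repeatedly
      * repatched if userspace toggles the MSR configuration.
      */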
0031 DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
0032 
0033 static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
0034 {
0035     struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
0036     struct pvclock_wall_clock *wc;
0037     gpa_t gpa = gfn_to_gpa(gfn);
0038     u32 *wc_sec_hi;
0039     u32 wc_version;
0040     u64 wall_nsec;
0041     int ret = 0;
0042     int idx = srcu_read_lock(&kvm->srcu);
0043 
0044     if (gfn == GPA_INVALID) {
0045         kvm_gfn_to_pfn_cache_destroy(kvm, gpc);
0046         goto out;
0047     }
0048 
0049     do {
0050         ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, KVM_HOST_USES_PFN,
0051                         gpa, PAGE_SIZE);
0052         if (ret)
0053             goto out;
0054 
0055         /*
0056          * This code mirrors kvm_write_wall_clock() except that it writes
0057          * directly through the pfn cache and doesn't mark the page dirty.
0058          */
0059         wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
0060 
0061         /* It could be invalid again already, so we need to check */
0062         read_lock_irq(&gpc->lock);
0063 
0064         if (gpc->valid)
0065             break;
0066 
0067         read_unlock_irq(&gpc->lock);
0068     } while (1);
0069 
0070     /* Paranoia checks on the 32-bit struct layout */
0071     BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
0072     BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
0073     BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
0074 
0075 #ifdef CONFIG_X86_64
0076     /* Paranoia checks on the 64-bit struct layout */
0077     BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
0078     BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);
0079 
0080     if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
0081         struct shared_info *shinfo = gpc->khva;
0082 
0083         wc_sec_hi = &shinfo->wc_sec_hi;
0084         wc = &shinfo->wc;
0085     } else
0086 #endif
0087     {
0088         struct compat_shared_info *shinfo = gpc->khva;
0089 
0090         wc_sec_hi = &shinfo->arch.wc_sec_hi;
0091         wc = &shinfo->wc;
0092     }
0093 
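         /*
          * Publish the wallclock using the pvclock version protocol: make
          * 'version' odd while the update is in flight (so a guest reader
          * retries), write the fields, then make it even again.
          */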
0094     /* Increment and ensure an odd value */
0095     wc_version = wc->version = (wc->version + 1) | 1;
0096     smp_wmb();
0097 
0098     wc->nsec = do_div(wall_nsec, 1000000000);
0099     wc->sec = (u32)wall_nsec;
0100     *wc_sec_hi = wall_nsec >> 32;
0101     smp_wmb();
0102 
0103     wc->version = wc_version + 1;
0104     read_unlock_irq(&gpc->lock);
0105 
0106     kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
0107 
0108 out:
0109     srcu_read_unlock(&kvm->srcu, idx);
0110     return ret;
0111 }
0112 
0113 void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
0114 {
0115     if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) {
0116         struct kvm_xen_evtchn e;
0117 
0118         e.vcpu_id = vcpu->vcpu_id;
0119         e.vcpu_idx = vcpu->vcpu_idx;
0120         e.port = vcpu->arch.xen.timer_virq;
0121         e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
0122 
0123         kvm_xen_set_evtchn(&e, vcpu->kvm);
0124 
0125         vcpu->arch.xen.timer_expires = 0;
0126         atomic_set(&vcpu->arch.xen.timer_pending, 0);
0127     }
0128 }
0129 
0130 static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
0131 {
0132     struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
0133                          arch.xen.timer);
0134     if (atomic_read(&vcpu->arch.xen.timer_pending))
0135         return HRTIMER_NORESTART;
0136 
0137     atomic_inc(&vcpu->arch.xen.timer_pending);
0138     kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
0139     kvm_vcpu_kick(vcpu);
0140 
0141     return HRTIMER_NORESTART;
0142 }
0143 
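     /*
      * Arm the Xen singleshot timer. A zero or negative delta means the
      * deadline has already passed, so deliver the timer event immediately
      * by invoking the callback directly rather than arming the hrtimer.
      */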
0144 static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
0145 {
0146     atomic_set(&vcpu->arch.xen.timer_pending, 0);
0147     vcpu->arch.xen.timer_expires = guest_abs;
0148 
0149     if (delta_ns <= 0) {
0150         xen_timer_callback(&vcpu->arch.xen.timer);
0151     } else {
0152         ktime_t ktime_now = ktime_get();
0153         hrtimer_start(&vcpu->arch.xen.timer,
0154                   ktime_add_ns(ktime_now, delta_ns),
0155                   HRTIMER_MODE_ABS_HARD);
0156     }
0157 }
0158 
0159 static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
0160 {
0161     hrtimer_cancel(&vcpu->arch.xen.timer);
0162     vcpu->arch.xen.timer_expires = 0;
0163     atomic_set(&vcpu->arch.xen.timer_pending, 0);
0164 }
0165 
0166 static void kvm_xen_init_timer(struct kvm_vcpu *vcpu)
0167 {
0168     hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC,
0169              HRTIMER_MODE_ABS_HARD);
0170     vcpu->arch.xen.timer.function = xen_timer_callback;
0171 }
0172 
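     /*
      * Close out the time accounted to the vCPU's current runstate and
      * switch to the new one. Time spent waiting for the host scheduler
      * while nominally "running" is charged to RUNSTATE_runnable as steal
      * time instead.
      */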
0173 static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
0174 {
0175     struct kvm_vcpu_xen *vx = &v->arch.xen;
0176     u64 now = get_kvmclock_ns(v->kvm);
0177     u64 delta_ns = now - vx->runstate_entry_time;
0178     u64 run_delay = current->sched_info.run_delay;
0179 
0180     if (unlikely(!vx->runstate_entry_time))
0181         vx->current_runstate = RUNSTATE_offline;
0182 
0183     /*
0184      * Time waiting for the scheduler isn't "stolen" if the
0185      * vCPU wasn't running anyway.
0186      */
0187     if (vx->current_runstate == RUNSTATE_running) {
0188         u64 steal_ns = run_delay - vx->last_steal;
0189 
0190         delta_ns -= steal_ns;
0191 
0192         vx->runstate_times[RUNSTATE_runnable] += steal_ns;
0193     }
0194     vx->last_steal = run_delay;
0195 
0196     vx->runstate_times[vx->current_runstate] += delta_ns;
0197     vx->current_runstate = state;
0198     vx->runstate_entry_time = now;
0199 }
0200 
0201 void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
0202 {
0203     struct kvm_vcpu_xen *vx = &v->arch.xen;
0204     struct gfn_to_pfn_cache *gpc = &vx->runstate_cache;
0205     uint64_t *user_times;
0206     unsigned long flags;
0207     size_t user_len;
0208     int *user_state;
0209 
0210     kvm_xen_update_runstate(v, state);
0211 
0212     if (!vx->runstate_cache.active)
0213         return;
0214 
0215     if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
0216         user_len = sizeof(struct vcpu_runstate_info);
0217     else
0218         user_len = sizeof(struct compat_vcpu_runstate_info);
0219 
0220     read_lock_irqsave(&gpc->lock, flags);
0221     while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
0222                        user_len)) {
0223         read_unlock_irqrestore(&gpc->lock, flags);
0224 
0225         /* When invoked from kvm_sched_out() we cannot sleep */
0226         if (state == RUNSTATE_runnable)
0227             return;
0228 
0229         if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len))
0230             return;
0231 
0232         read_lock_irqsave(&gpc->lock, flags);
0233     }
0234 
0235     /*
0236      * The only difference between 32-bit and 64-bit versions of the
0237      * runstate struct us the alignment of uint64_t in 32-bit, which
0238      * means that the 64-bit version has an additional 4 bytes of
0239      * padding after the first field 'state'.
0240      *
0241      * So we use 'int __user *user_state' to point to the state field,
0242      * and 'uint64_t __user *user_times' for runstate_entry_time. So
0243      * the actual array of time[] in each state starts at user_times[1].
0244      */
0245     BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
0246     BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
0247     BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
0248 #ifdef CONFIG_X86_64
0249     BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
0250              offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
0251     BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
0252              offsetof(struct compat_vcpu_runstate_info, time) + 4);
0253 #endif
0254 
0255     user_state = gpc->khva;
0256 
0257     if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
0258         user_times = gpc->khva + offsetof(struct vcpu_runstate_info,
0259                           state_entry_time);
0260     else
0261         user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info,
0262                           state_entry_time);
0263 
0264     /*
0265      * First write the updated state_entry_time at the appropriate
0266      * location determined by 'offset'.
0267      */
0268     BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
0269              sizeof(user_times[0]));
0270     BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
0271              sizeof(user_times[0]));
0272 
0273     user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE;
0274     smp_wmb();
0275 
0276     /*
0277      * Next, write the new runstate. This is in the *same* place
0278      * for 32-bit and 64-bit guests, asserted here for paranoia.
0279      */
0280     BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
0281              offsetof(struct compat_vcpu_runstate_info, state));
0282     BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
0283              sizeof(vx->current_runstate));
0284     BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
0285              sizeof(vx->current_runstate));
0286 
0287     *user_state = vx->current_runstate;
0288 
0289     /*
0290      * Write the actual runstate times immediately after the
0291      * runstate_entry_time.
0292      */
0293     BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
0294              offsetof(struct vcpu_runstate_info, time) - sizeof(u64));
0295     BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
0296              offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64));
0297     BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
0298              sizeof_field(struct compat_vcpu_runstate_info, time));
0299     BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
0300              sizeof(vx->runstate_times));
0301 
0302     memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
0303     smp_wmb();
0304 
0305     /*
0306      * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
0307      * runstate_entry_time field.
0308      */
0309     user_times[0] &= ~XEN_RUNSTATE_UPDATE;
0310     smp_wmb();
0311 
0312     read_unlock_irqrestore(&gpc->lock, flags);
0313 
0314     mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
0315 }
0316 
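     /*
      * Deliver the per-vCPU upcall vector to the target vCPU's local APIC
      * as a fixed-mode, physical-destination interrupt.
      */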
0317 static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
0318 {
0319     struct kvm_lapic_irq irq = { };
0320     int r;
0321 
0322     irq.dest_id = v->vcpu_id;
0323     irq.vector = v->arch.xen.upcall_vector;
0324     irq.dest_mode = APIC_DEST_PHYSICAL;
0325     irq.shorthand = APIC_DEST_NOSHORT;
0326     irq.delivery_mode = APIC_DM_FIXED;
0327     irq.level = 1;
0328 
0329     /* The fast version will always work for physical unicast */
0330     WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL));
0331 }
0332 
0333 /*
0334  * On event channel delivery, the vcpu_info may not have been accessible.
0335  * In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which
0336  * need to be marked into the vcpu_info (and evtchn_upcall_pending set).
0337  * Do so now that we can sleep in the context of the vCPU to bring the
0338  * page in, and refresh the pfn cache for it.
0339  */
0340 void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
0341 {
0342     unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
0343     struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
0344     unsigned long flags;
0345 
0346     if (!evtchn_pending_sel)
0347         return;
0348 
0349     /*
0350      * Yes, this is an open-coded loop. But that's just what put_user()
0351      * does anyway. Page it in and retry the instruction. We're just a
0352      * little more honest about it.
0353      */
0354     read_lock_irqsave(&gpc->lock, flags);
0355     while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
0356                        sizeof(struct vcpu_info))) {
0357         read_unlock_irqrestore(&gpc->lock, flags);
0358 
0359         if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
0360                          sizeof(struct vcpu_info)))
0361             return;
0362 
0363         read_lock_irqsave(&gpc->lock, flags);
0364     }
0365 
0366     /* Now gpc->khva is a valid kernel address for the vcpu_info */
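         /*
          * Atomically OR the shadow bits into the guest's evtchn_pending_sel,
          * then clear exactly those bits from the in-kernel shadow so nothing
          * is delivered twice, and finally raise evtchn_upcall_pending.
          */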
0367     if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
0368         struct vcpu_info *vi = gpc->khva;
0369 
0370         asm volatile(LOCK_PREFIX "orq %0, %1\n"
0371                  "notq %0\n"
0372                  LOCK_PREFIX "andq %0, %2\n"
0373                  : "=r" (evtchn_pending_sel),
0374                    "+m" (vi->evtchn_pending_sel),
0375                    "+m" (v->arch.xen.evtchn_pending_sel)
0376                  : "0" (evtchn_pending_sel));
0377         WRITE_ONCE(vi->evtchn_upcall_pending, 1);
0378     } else {
0379         u32 evtchn_pending_sel32 = evtchn_pending_sel;
0380         struct compat_vcpu_info *vi = gpc->khva;
0381 
0382         asm volatile(LOCK_PREFIX "orl %0, %1\n"
0383                  "notl %0\n"
0384                  LOCK_PREFIX "andl %0, %2\n"
0385                  : "=r" (evtchn_pending_sel32),
0386                    "+m" (vi->evtchn_pending_sel),
0387                    "+m" (v->arch.xen.evtchn_pending_sel)
0388                  : "0" (evtchn_pending_sel32));
0389         WRITE_ONCE(vi->evtchn_upcall_pending, 1);
0390     }
0391     read_unlock_irqrestore(&gpc->lock, flags);
0392 
0393     /* For the per-vCPU lapic vector, deliver it as MSI. */
0394     if (v->arch.xen.upcall_vector)
0395         kvm_xen_inject_vcpu_vector(v);
0396 
0397     mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
0398 }
0399 
0400 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
0401 {
0402     struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
0403     unsigned long flags;
0404     u8 rc = 0;
0405 
0406     /*
0407      * If the global upcall vector (HVMIRQ_callback_vector) is set and
0408      * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
0409      */
0410 
0411     /* No need for compat handling here */
0412     BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
0413              offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
0414     BUILD_BUG_ON(sizeof(rc) !=
0415              sizeof_field(struct vcpu_info, evtchn_upcall_pending));
0416     BUILD_BUG_ON(sizeof(rc) !=
0417              sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
0418 
0419     read_lock_irqsave(&gpc->lock, flags);
0420     while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
0421                        sizeof(struct vcpu_info))) {
0422         read_unlock_irqrestore(&gpc->lock, flags);
0423 
0424         /*
0425          * This function gets called from kvm_vcpu_block() after setting the
0426          * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
0427          * from a HLT. So we really mustn't sleep. If the page ended up absent
0428          * at that point, just return 1 in order to trigger an immediate wake,
0429          * and we'll end up getting called again from a context where we *can*
0430          * fault in the page and wait for it.
0431          */
0432         if (in_atomic() || !task_is_running(current))
0433             return 1;
0434 
0435         if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
0436                          sizeof(struct vcpu_info))) {
0437             /*
0438              * If this failed, userspace has screwed up the
0439              * vcpu_info mapping. No interrupts for you.
0440              */
0441             return 0;
0442         }
0443         read_lock_irqsave(&gpc->lock, flags);
0444     }
0445 
0446     rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
0447     read_unlock_irqrestore(&gpc->lock, flags);
0448     return rc;
0449 }
0450 
0451 int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
0452 {
0453     int r = -ENOENT;
0454 
0456     switch (data->type) {
0457     case KVM_XEN_ATTR_TYPE_LONG_MODE:
0458         if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
0459             r = -EINVAL;
0460         } else {
0461             mutex_lock(&kvm->lock);
0462             kvm->arch.xen.long_mode = !!data->u.long_mode;
0463             mutex_unlock(&kvm->lock);
0464             r = 0;
0465         }
0466         break;
0467 
0468     case KVM_XEN_ATTR_TYPE_SHARED_INFO:
0469         mutex_lock(&kvm->lock);
0470         r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
0471         mutex_unlock(&kvm->lock);
0472         break;
0473 
0474     case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
0475         if (data->u.vector && data->u.vector < 0x10)
0476             r = -EINVAL;
0477         else {
0478             mutex_lock(&kvm->lock);
0479             kvm->arch.xen.upcall_vector = data->u.vector;
0480             mutex_unlock(&kvm->lock);
0481             r = 0;
0482         }
0483         break;
0484 
0485     case KVM_XEN_ATTR_TYPE_EVTCHN:
0486         r = kvm_xen_setattr_evtchn(kvm, data);
0487         break;
0488 
0489     case KVM_XEN_ATTR_TYPE_XEN_VERSION:
0490         mutex_lock(&kvm->lock);
0491         kvm->arch.xen.xen_version = data->u.xen_version;
0492         mutex_unlock(&kvm->lock);
0493         r = 0;
0494         break;
0495 
0496     default:
0497         break;
0498     }
0499 
0500     return r;
0501 }
0502 
0503 int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
0504 {
0505     int r = -ENOENT;
0506 
0507     mutex_lock(&kvm->lock);
0508 
0509     switch (data->type) {
0510     case KVM_XEN_ATTR_TYPE_LONG_MODE:
0511         data->u.long_mode = kvm->arch.xen.long_mode;
0512         r = 0;
0513         break;
0514 
0515     case KVM_XEN_ATTR_TYPE_SHARED_INFO:
0516         if (kvm->arch.xen.shinfo_cache.active)
0517             data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
0518         else
0519             data->u.shared_info.gfn = GPA_INVALID;
0520         r = 0;
0521         break;
0522 
0523     case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
0524         data->u.vector = kvm->arch.xen.upcall_vector;
0525         r = 0;
0526         break;
0527 
0528     case KVM_XEN_ATTR_TYPE_XEN_VERSION:
0529         data->u.xen_version = kvm->arch.xen.xen_version;
0530         r = 0;
0531         break;
0532 
0533     default:
0534         break;
0535     }
0536 
0537     mutex_unlock(&kvm->lock);
0538     return r;
0539 }
0540 
0541 int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
0542 {
0543     int idx, r = -ENOENT;
0544 
0545     mutex_lock(&vcpu->kvm->lock);
0546     idx = srcu_read_lock(&vcpu->kvm->srcu);
0547 
0548     switch (data->type) {
0549     case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
0550         /* No compat necessary here. */
0551         BUILD_BUG_ON(sizeof(struct vcpu_info) !=
0552                  sizeof(struct compat_vcpu_info));
0553         BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
0554                  offsetof(struct compat_vcpu_info, time));
0555 
0556         if (data->u.gpa == GPA_INVALID) {
0557             kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
0558             r = 0;
0559             break;
0560         }
0561 
0562         r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
0563                           &vcpu->arch.xen.vcpu_info_cache,
0564                           NULL, KVM_HOST_USES_PFN, data->u.gpa,
0565                           sizeof(struct vcpu_info));
0566         if (!r)
0567             kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
0568 
0569         break;
0570 
0571     case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
0572         if (data->u.gpa == GPA_INVALID) {
0573             kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
0574                              &vcpu->arch.xen.vcpu_time_info_cache);
0575             r = 0;
0576             break;
0577         }
0578 
0579         r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
0580                           &vcpu->arch.xen.vcpu_time_info_cache,
0581                           NULL, KVM_HOST_USES_PFN, data->u.gpa,
0582                           sizeof(struct pvclock_vcpu_time_info));
0583         if (!r)
0584             kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
0585         break;
0586 
0587     case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
0588         if (!sched_info_on()) {
0589             r = -EOPNOTSUPP;
0590             break;
0591         }
0592         if (data->u.gpa == GPA_INVALID) {
0593             kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
0594                              &vcpu->arch.xen.runstate_cache);
0595             r = 0;
0596             break;
0597         }
0598 
0599         r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
0600                           &vcpu->arch.xen.runstate_cache,
0601                           NULL, KVM_HOST_USES_PFN, data->u.gpa,
0602                           sizeof(struct vcpu_runstate_info));
0603         break;
0604 
0605     case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
0606         if (!sched_info_on()) {
0607             r = -EOPNOTSUPP;
0608             break;
0609         }
0610         if (data->u.runstate.state > RUNSTATE_offline) {
0611             r = -EINVAL;
0612             break;
0613         }
0614 
0615         kvm_xen_update_runstate(vcpu, data->u.runstate.state);
0616         r = 0;
0617         break;
0618 
0619     case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
0620         if (!sched_info_on()) {
0621             r = -EOPNOTSUPP;
0622             break;
0623         }
0624         if (data->u.runstate.state > RUNSTATE_offline) {
0625             r = -EINVAL;
0626             break;
0627         }
0628         if (data->u.runstate.state_entry_time !=
0629             (data->u.runstate.time_running +
0630              data->u.runstate.time_runnable +
0631              data->u.runstate.time_blocked +
0632              data->u.runstate.time_offline)) {
0633             r = -EINVAL;
0634             break;
0635         }
0636         if (get_kvmclock_ns(vcpu->kvm) <
0637             data->u.runstate.state_entry_time) {
0638             r = -EINVAL;
0639             break;
0640         }
0641 
0642         vcpu->arch.xen.current_runstate = data->u.runstate.state;
0643         vcpu->arch.xen.runstate_entry_time =
0644             data->u.runstate.state_entry_time;
0645         vcpu->arch.xen.runstate_times[RUNSTATE_running] =
0646             data->u.runstate.time_running;
0647         vcpu->arch.xen.runstate_times[RUNSTATE_runnable] =
0648             data->u.runstate.time_runnable;
0649         vcpu->arch.xen.runstate_times[RUNSTATE_blocked] =
0650             data->u.runstate.time_blocked;
0651         vcpu->arch.xen.runstate_times[RUNSTATE_offline] =
0652             data->u.runstate.time_offline;
0653         vcpu->arch.xen.last_steal = current->sched_info.run_delay;
0654         r = 0;
0655         break;
0656 
0657     case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
0658         if (!sched_info_on()) {
0659             r = -EOPNOTSUPP;
0660             break;
0661         }
0662         if (data->u.runstate.state > RUNSTATE_offline &&
0663             data->u.runstate.state != (u64)-1) {
0664             r = -EINVAL;
0665             break;
0666         }
0667         /* The adjustment must add up */
0668         if (data->u.runstate.state_entry_time !=
0669             (data->u.runstate.time_running +
0670              data->u.runstate.time_runnable +
0671              data->u.runstate.time_blocked +
0672              data->u.runstate.time_offline)) {
0673             r = -EINVAL;
0674             break;
0675         }
0676 
0677         if (get_kvmclock_ns(vcpu->kvm) <
0678             (vcpu->arch.xen.runstate_entry_time +
0679              data->u.runstate.state_entry_time)) {
0680             r = -EINVAL;
0681             break;
0682         }
0683 
0684         vcpu->arch.xen.runstate_entry_time +=
0685             data->u.runstate.state_entry_time;
0686         vcpu->arch.xen.runstate_times[RUNSTATE_running] +=
0687             data->u.runstate.time_running;
0688         vcpu->arch.xen.runstate_times[RUNSTATE_runnable] +=
0689             data->u.runstate.time_runnable;
0690         vcpu->arch.xen.runstate_times[RUNSTATE_blocked] +=
0691             data->u.runstate.time_blocked;
0692         vcpu->arch.xen.runstate_times[RUNSTATE_offline] +=
0693             data->u.runstate.time_offline;
0694 
0695         if (data->u.runstate.state <= RUNSTATE_offline)
0696             kvm_xen_update_runstate(vcpu, data->u.runstate.state);
0697         r = 0;
0698         break;
0699 
0700     case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
0701         if (data->u.vcpu_id >= KVM_MAX_VCPUS)
0702             r = -EINVAL;
0703         else {
0704             vcpu->arch.xen.vcpu_id = data->u.vcpu_id;
0705             r = 0;
0706         }
0707         break;
0708 
0709     case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
0710         if (data->u.timer.port &&
0711             data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) {
0712             r = -EINVAL;
0713             break;
0714         }
0715 
0716         if (!vcpu->arch.xen.timer.function)
0717             kvm_xen_init_timer(vcpu);
0718 
0719         /* Stop the timer (if it's running) before changing the vector */
0720         kvm_xen_stop_timer(vcpu);
0721         vcpu->arch.xen.timer_virq = data->u.timer.port;
0722 
0723         /* Start the timer if the new value has a valid vector+expiry. */
0724         if (data->u.timer.port && data->u.timer.expires_ns)
0725             kvm_xen_start_timer(vcpu, data->u.timer.expires_ns,
0726                         data->u.timer.expires_ns -
0727                         get_kvmclock_ns(vcpu->kvm));
0728 
0729         r = 0;
0730         break;
0731 
0732     case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
0733         if (data->u.vector && data->u.vector < 0x10)
0734             r = -EINVAL;
0735         else {
0736             vcpu->arch.xen.upcall_vector = data->u.vector;
0737             r = 0;
0738         }
0739         break;
0740 
0741     default:
0742         break;
0743     }
0744 
0745     srcu_read_unlock(&vcpu->kvm->srcu, idx);
0746     mutex_unlock(&vcpu->kvm->lock);
0747     return r;
0748 }
0749 
0750 int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
0751 {
0752     int r = -ENOENT;
0753 
0754     mutex_lock(&vcpu->kvm->lock);
0755 
0756     switch (data->type) {
0757     case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
0758         if (vcpu->arch.xen.vcpu_info_cache.active)
0759             data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
0760         else
0761             data->u.gpa = GPA_INVALID;
0762         r = 0;
0763         break;
0764 
0765     case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
0766         if (vcpu->arch.xen.vcpu_time_info_cache.active)
0767             data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
0768         else
0769             data->u.gpa = GPA_INVALID;
0770         r = 0;
0771         break;
0772 
0773     case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
0774         if (!sched_info_on()) {
0775             r = -EOPNOTSUPP;
0776             break;
0777         }
0778         if (vcpu->arch.xen.runstate_cache.active) {
0779             data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
0780             r = 0;
0781         }
0782         break;
0783 
0784     case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
0785         if (!sched_info_on()) {
0786             r = -EOPNOTSUPP;
0787             break;
0788         }
0789         data->u.runstate.state = vcpu->arch.xen.current_runstate;
0790         r = 0;
0791         break;
0792 
0793     case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
0794         if (!sched_info_on()) {
0795             r = -EOPNOTSUPP;
0796             break;
0797         }
0798         data->u.runstate.state = vcpu->arch.xen.current_runstate;
0799         data->u.runstate.state_entry_time =
0800             vcpu->arch.xen.runstate_entry_time;
0801         data->u.runstate.time_running =
0802             vcpu->arch.xen.runstate_times[RUNSTATE_running];
0803         data->u.runstate.time_runnable =
0804             vcpu->arch.xen.runstate_times[RUNSTATE_runnable];
0805         data->u.runstate.time_blocked =
0806             vcpu->arch.xen.runstate_times[RUNSTATE_blocked];
0807         data->u.runstate.time_offline =
0808             vcpu->arch.xen.runstate_times[RUNSTATE_offline];
0809         r = 0;
0810         break;
0811 
0812     case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
0813         r = -EINVAL;
0814         break;
0815 
0816     case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
0817         data->u.vcpu_id = vcpu->arch.xen.vcpu_id;
0818         r = 0;
0819         break;
0820 
0821     case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
0822         data->u.timer.port = vcpu->arch.xen.timer_virq;
0823         data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
0824         data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;
0825         r = 0;
0826         break;
0827 
0828     case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
0829         data->u.vector = vcpu->arch.xen.upcall_vector;
0830         r = 0;
0831         break;
0832 
0833     default:
0834         break;
0835     }
0836 
0837     mutex_unlock(&vcpu->kvm->lock);
0838     return r;
0839 }
0840 
0841 int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
0842 {
0843     struct kvm *kvm = vcpu->kvm;
0844     u32 page_num = data & ~PAGE_MASK;
0845     u64 page_addr = data & PAGE_MASK;
0846     bool lm = is_long_mode(vcpu);
0847 
0848     /* Latch long_mode for shared_info pages etc. */
0849     vcpu->kvm->arch.xen.long_mode = lm;
0850 
0851     /*
0852      * If Xen hypercall intercept is enabled, fill the hypercall
0853      * page with VMCALL/VMMCALL instructions since that's what
0854      * we catch. Else the VMM has provided the hypercall pages
0855      * with instructions of its own choosing, so use those.
0856      */
0857     if (kvm_xen_hypercall_enabled(kvm)) {
0858         u8 instructions[32];
0859         int i;
0860 
0861         if (page_num)
0862             return 1;
0863 
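             /*
              * Each 32-byte slot in the hypercall page gets a stub of the form
              *   mov $<hypercall nr>, %eax ; vmcall/vmmcall ; ret
              * padded out with int3.
              */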
0864         /* mov imm32, %eax */
0865         instructions[0] = 0xb8;
0866 
0867         /* vmcall / vmmcall */
0868         static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5);
0869 
0870         /* ret */
0871         instructions[8] = 0xc3;
0872 
0873         /* int3 to pad */
0874         memset(instructions + 9, 0xcc, sizeof(instructions) - 9);
0875 
0876         for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
0877             *(u32 *)&instructions[1] = i;
0878             if (kvm_vcpu_write_guest(vcpu,
0879                          page_addr + (i * sizeof(instructions)),
0880                          instructions, sizeof(instructions)))
0881                 return 1;
0882         }
0883     } else {
0884         /*
0885          * Note, truncation is a non-issue as 'lm' is guaranteed to be
0886          * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
0887          */
0888         hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64
0889                      : kvm->arch.xen_hvm_config.blob_addr_32;
0890         u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
0891                   : kvm->arch.xen_hvm_config.blob_size_32;
0892         u8 *page;
0893 
0894         if (page_num >= blob_size)
0895             return 1;
0896 
0897         blob_addr += page_num * PAGE_SIZE;
0898 
0899         page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE);
0900         if (IS_ERR(page))
0901             return PTR_ERR(page);
0902 
0903         if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
0904             kfree(page);
0905             return 1;
0906         }
             kfree(page); /* also free the memdup_user() copy on success */
0907     }
0908     return 0;
0909 }
0910 
0911 int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
0912 {
0913     /* Only some feature flags need to be *enabled* by userspace */
0914     u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
0915         KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
0916 
0917     if (xhc->flags & ~permitted_flags)
0918         return -EINVAL;
0919 
0920     /*
0921      * With hypercall interception the kernel generates its own
0922      * hypercall page so it must not be provided.
0923      */
0924     if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
0925         (xhc->blob_addr_32 || xhc->blob_addr_64 ||
0926          xhc->blob_size_32 || xhc->blob_size_64))
0927         return -EINVAL;
0928 
0929     mutex_lock(&kvm->lock);
0930 
0931     if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
0932         static_branch_inc(&kvm_xen_enabled.key);
0933     else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
0934         static_branch_slow_dec_deferred(&kvm_xen_enabled);
0935 
0936     memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
0937 
0938     mutex_unlock(&kvm->lock);
0939     return 0;
0940 }
0941 
0942 static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
0943 {
0944     kvm_rax_write(vcpu, result);
0945     return kvm_skip_emulated_instruction(vcpu);
0946 }
0947 
0948 static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
0949 {
0950     struct kvm_run *run = vcpu->run;
0951 
0952     if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip)))
0953         return 1;
0954 
0955     return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
0956 }
0957 
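     /*
      * Used by SCHEDOP_poll: return true if any of the given event channel
      * ports is already pending in shared_info (or if the shared_info page
      * can't currently be checked, in which case err on the side of waking).
      */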
0958 static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
0959                    evtchn_port_t *ports)
0960 {
0961     struct kvm *kvm = vcpu->kvm;
0962     struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
0963     unsigned long *pending_bits;
0964     unsigned long flags;
0965     bool ret = true;
0966     int idx, i;
0967 
0968     read_lock_irqsave(&gpc->lock, flags);
0969     idx = srcu_read_lock(&kvm->srcu);
0970     if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
0971         goto out_rcu;
0972 
0973     ret = false;
0974     if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
0975         struct shared_info *shinfo = gpc->khva;
0976         pending_bits = (unsigned long *)&shinfo->evtchn_pending;
0977     } else {
0978         struct compat_shared_info *shinfo = gpc->khva;
0979         pending_bits = (unsigned long *)&shinfo->evtchn_pending;
0980     }
0981 
0982     for (i = 0; i < nr_ports; i++) {
0983         if (test_bit(ports[i], pending_bits)) {
0984             ret = true;
0985             break;
0986         }
0987     }
0988 
0989  out_rcu:
0990     srcu_read_unlock(&kvm->srcu, idx);
0991     read_unlock_irqrestore(&gpc->lock, flags);
0992 
0993     return ret;
0994 }
0995 
0996 static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
0997                  u64 param, u64 *r)
0998 {
0999     int idx, i;
1000     struct sched_poll sched_poll;
1001     evtchn_port_t port, *ports;
1002     gpa_t gpa;
1003 
1004     if (!longmode || !lapic_in_kernel(vcpu) ||
1005         !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
1006         return false;
1007 
1008     idx = srcu_read_lock(&vcpu->kvm->srcu);
1009     gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
1010     srcu_read_unlock(&vcpu->kvm->srcu, idx);
1011 
1012     if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &sched_poll,
1013                     sizeof(sched_poll))) {
1014         *r = -EFAULT;
1015         return true;
1016     }
1017 
1018     if (unlikely(sched_poll.nr_ports > 1)) {
1019         /* Xen (unofficially) limits number of pollers to 128 */
1020         if (sched_poll.nr_ports > 128) {
1021             *r = -EINVAL;
1022             return true;
1023         }
1024 
1025         ports = kmalloc_array(sched_poll.nr_ports,
1026                       sizeof(*ports), GFP_KERNEL);
1027         if (!ports) {
1028             *r = -ENOMEM;
1029             return true;
1030         }
1031     } else
1032         ports = &port;
1033 
1034     for (i = 0; i < sched_poll.nr_ports; i++) {
1035         idx = srcu_read_lock(&vcpu->kvm->srcu);
1036         gpa = kvm_mmu_gva_to_gpa_system(vcpu,
1037                         (gva_t)(sched_poll.ports + i),
1038                         NULL);
1039         srcu_read_unlock(&vcpu->kvm->srcu, idx);
1040 
1041         if (!gpa || kvm_vcpu_read_guest(vcpu, gpa,
1042                         &ports[i], sizeof(port))) {
1043             *r = -EFAULT;
1044             goto out;
1045         }
1046     }
1047 
1048     if (sched_poll.nr_ports == 1)
1049         vcpu->arch.xen.poll_evtchn = port;
1050     else
1051         vcpu->arch.xen.poll_evtchn = -1;
1052 
1053     set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
1054 
1055     if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
1056         vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
1057 
1058         if (sched_poll.timeout)
1059             mod_timer(&vcpu->arch.xen.poll_timer,
1060                   jiffies + nsecs_to_jiffies(sched_poll.timeout));
1061 
1062         kvm_vcpu_halt(vcpu);
1063 
1064         if (sched_poll.timeout)
1065             del_timer(&vcpu->arch.xen.poll_timer);
1066 
1067         vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
1068         kvm_clear_request(KVM_REQ_UNHALT, vcpu);
1069     }
1070 
1071     vcpu->arch.xen.poll_evtchn = 0;
1072     *r = 0;
1073 out:
1074     /* Really, this is only needed in case of timeout */
1075     clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
1076 
1077     if (unlikely(sched_poll.nr_ports > 1))
1078         kfree(ports);
1079     return true;
1080 }
1081 
1082 static void cancel_evtchn_poll(struct timer_list *t)
1083 {
1084     struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer);
1085 
1086     kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
1087     kvm_vcpu_kick(vcpu);
1088 }
1089 
1090 static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode,
1091                    int cmd, u64 param, u64 *r)
1092 {
1093     switch (cmd) {
1094     case SCHEDOP_poll:
1095         if (kvm_xen_schedop_poll(vcpu, longmode, param, r))
1096             return true;
1097         fallthrough;
1098     case SCHEDOP_yield:
1099         kvm_vcpu_on_spin(vcpu, true);
1100         *r = 0;
1101         return true;
1102     default:
1103         break;
1104     }
1105 
1106     return false;
1107 }
1108 
1109 struct compat_vcpu_set_singleshot_timer {
1110     uint64_t timeout_abs_ns;
1111     uint32_t flags;
1112 } __attribute__((packed));
1113 
1114 static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
1115                   int vcpu_id, u64 param, u64 *r)
1116 {
1117     struct vcpu_set_singleshot_timer oneshot;
1118     s64 delta;
1119     gpa_t gpa;
1120     int idx;
1121 
1122     if (!kvm_xen_timer_enabled(vcpu))
1123         return false;
1124 
1125     switch (cmd) {
1126     case VCPUOP_set_singleshot_timer:
1127         if (vcpu->arch.xen.vcpu_id != vcpu_id) {
1128             *r = -EINVAL;
1129             return true;
1130         }
1131         idx = srcu_read_lock(&vcpu->kvm->srcu);
1132         gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
1133         srcu_read_unlock(&vcpu->kvm->srcu, idx);
1134 
1135         /*
1136          * The only difference for 32-bit compat is the 4 bytes of
1137          * padding after the interesting part of the structure. So
1138          * for a faithful emulation of Xen we have to *try* to copy
1139          * the padding and return -EFAULT if we can't. Otherwise we
1140          * might as well just have copied the 12-byte 32-bit struct.
1141          */
1142         BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
1143                  offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns));
1144         BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
1145                  sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns));
1146         BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) !=
1147                  offsetof(struct vcpu_set_singleshot_timer, flags));
1148         BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) !=
1149                  sizeof_field(struct vcpu_set_singleshot_timer, flags));
1150 
1151         if (!gpa ||
1152             kvm_vcpu_read_guest(vcpu, gpa, &oneshot, longmode ? sizeof(oneshot) :
1153                     sizeof(struct compat_vcpu_set_singleshot_timer))) {
1154             *r = -EFAULT;
1155             return true;
1156         }
1157 
1158         delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm);
1159         if ((oneshot.flags & VCPU_SSHOTTMR_future) && delta < 0) {
1160             *r = -ETIME;
1161             return true;
1162         }
1163 
1164         kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta);
1165         *r = 0;
1166         return true;
1167 
1168     case VCPUOP_stop_singleshot_timer:
1169         if (vcpu->arch.xen.vcpu_id != vcpu_id) {
1170             *r = -EINVAL;
1171             return true;
1172         }
1173         kvm_xen_stop_timer(vcpu);
1174         *r = 0;
1175         return true;
1176     }
1177 
1178     return false;
1179 }
1180 
1181 static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
1182                        u64 *r)
1183 {
1184     if (!kvm_xen_timer_enabled(vcpu))
1185         return false;
1186 
1187     if (timeout) {
1188         uint64_t guest_now = get_kvmclock_ns(vcpu->kvm);
1189         int64_t delta = timeout - guest_now;
1190 
1191         /* Xen has a 'Linux workaround' in do_set_timer_op() which
1192          * checks for negative absolute timeout values (caused by
1193          * integer overflow), and for values about 13 days in the
1194          * future (2^50ns) which would be caused by jiffies
1195          * overflow. For those cases, it sets the timeout 100ms in
1196          * the future (not *too* soon, since if a guest really did
1197          * set a long timeout on purpose we don't want to keep
1198          * churning CPU time by waking it up).
1199          */
1200         if (unlikely((int64_t)timeout < 0 ||
1201                  (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
1202             delta = 100 * NSEC_PER_MSEC;
1203             timeout = guest_now + delta;
1204         }
1205 
1206         kvm_xen_start_timer(vcpu, timeout, delta);
1207     } else {
1208         kvm_xen_stop_timer(vcpu);
1209     }
1210 
1211     *r = 0;
1212     return true;
1213 }
1214 
1215 int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
1216 {
1217     bool longmode;
1218     u64 input, params[6], r = -ENOSYS;
1219     bool handled = false;
1220 
1221     input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
1222 
1223     /* Hyper-V hypercalls get bit 31 set in EAX */
1224     if ((input & 0x80000000) &&
1225         kvm_hv_hypercall_enabled(vcpu))
1226         return kvm_hv_hypercall(vcpu);
1227 
1228     longmode = is_64_bit_hypercall(vcpu);
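         /*
          * Per the Xen hypercall ABI, arguments are in %ebx/%ecx/%edx/%esi/
          * %edi/%ebp for 32-bit guests and %rdi/%rsi/%rdx/%r10/%r8/%r9 for
          * 64-bit guests.
          */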
1229     if (!longmode) {
1230         params[0] = (u32)kvm_rbx_read(vcpu);
1231         params[1] = (u32)kvm_rcx_read(vcpu);
1232         params[2] = (u32)kvm_rdx_read(vcpu);
1233         params[3] = (u32)kvm_rsi_read(vcpu);
1234         params[4] = (u32)kvm_rdi_read(vcpu);
1235         params[5] = (u32)kvm_rbp_read(vcpu);
1236     }
1237 #ifdef CONFIG_X86_64
1238     else {
1239         params[0] = (u64)kvm_rdi_read(vcpu);
1240         params[1] = (u64)kvm_rsi_read(vcpu);
1241         params[2] = (u64)kvm_rdx_read(vcpu);
1242         params[3] = (u64)kvm_r10_read(vcpu);
1243         params[4] = (u64)kvm_r8_read(vcpu);
1244         params[5] = (u64)kvm_r9_read(vcpu);
1245     }
1246 #endif
1247     trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
1248                 params[3], params[4], params[5]);
1249 
1250     switch (input) {
1251     case __HYPERVISOR_xen_version:
1252         if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
1253             r = vcpu->kvm->arch.xen.xen_version;
1254             handled = true;
1255         }
1256         break;
1257     case __HYPERVISOR_event_channel_op:
1258         if (params[0] == EVTCHNOP_send)
1259             handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r);
1260         break;
1261     case __HYPERVISOR_sched_op:
1262         handled = kvm_xen_hcall_sched_op(vcpu, longmode, params[0],
1263                          params[1], &r);
1264         break;
1265     case __HYPERVISOR_vcpu_op:
1266         handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, params[0], params[1],
1267                         params[2], &r);
1268         break;
1269     case __HYPERVISOR_set_timer_op: {
1270         u64 timeout = params[0];
1271         /* In 32-bit mode, the 64-bit timeout is in two 32-bit params. */
1272         if (!longmode)
1273             timeout |= params[1] << 32;
1274         handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r);
1275         break;
1276     }
1277     default:
1278         break;
1279     }
1280 
1281     if (handled)
1282         return kvm_xen_hypercall_set_result(vcpu, r);
1283 
1284     vcpu->run->exit_reason = KVM_EXIT_XEN;
1285     vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
1286     vcpu->run->xen.u.hcall.longmode = longmode;
1287     vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu);
1288     vcpu->run->xen.u.hcall.input = input;
1289     vcpu->run->xen.u.hcall.params[0] = params[0];
1290     vcpu->run->xen.u.hcall.params[1] = params[1];
1291     vcpu->run->xen.u.hcall.params[2] = params[2];
1292     vcpu->run->xen.u.hcall.params[3] = params[3];
1293     vcpu->run->xen.u.hcall.params[4] = params[4];
1294     vcpu->run->xen.u.hcall.params[5] = params[5];
1295     vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu);
1296     vcpu->arch.complete_userspace_io =
1297         kvm_xen_hypercall_complete_userspace;
1298 
1299     return 0;
1300 }
1301 
1302 static inline int max_evtchn_port(struct kvm *kvm)
1303 {
1304     if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
1305         return EVTCHN_2L_NR_CHANNELS;
1306     else
1307         return COMPAT_EVTCHN_2L_NR_CHANNELS;
1308 }
1309 
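     /*
      * If the target vCPU is blocked in SCHEDOP_poll on this port (or is
      * polling multiple ports, indicated by poll_evtchn == -1), clear its
      * poll_mask bit and kick it awake.
      */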
1310 static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
1311 {
1312     int poll_evtchn = vcpu->arch.xen.poll_evtchn;
1313 
1314     if ((poll_evtchn == port || poll_evtchn == -1) &&
1315         test_and_clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask)) {
1316         kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
1317         kvm_vcpu_kick(vcpu);
1318     }
1319 }
1320 
1321 /*
1322  * The return value from this function is propagated to kvm_set_irq() API,
1323  * so it returns:
1324  *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
1325  *  = 0   Interrupt was coalesced (previous irq is still pending)
1326  *  > 0   Number of CPUs interrupt was delivered to
1327  *
1328  * It is also called directly from kvm_arch_set_irq_inatomic(), where the
1329  * only check on its return value is a comparison with -EWOULDBLOCK.
1330  */
1331 int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
1332 {
1333     struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
1334     struct kvm_vcpu *vcpu;
1335     unsigned long *pending_bits, *mask_bits;
1336     unsigned long flags;
1337     int port_word_bit;
1338     bool kick_vcpu = false;
1339     int vcpu_idx, idx, rc;
1340 
1341     vcpu_idx = READ_ONCE(xe->vcpu_idx);
1342     if (vcpu_idx >= 0)
1343         vcpu = kvm_get_vcpu(kvm, vcpu_idx);
1344     else {
1345         vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id);
1346         if (!vcpu)
1347             return -EINVAL;
1348         WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx);
1349     }
1350 
1351     if (!vcpu->arch.xen.vcpu_info_cache.active)
1352         return -EINVAL;
1353 
1354     if (xe->port >= max_evtchn_port(kvm))
1355         return -EINVAL;
1356 
1357     rc = -EWOULDBLOCK;
1358 
1359     idx = srcu_read_lock(&kvm->srcu);
1360 
1361     read_lock_irqsave(&gpc->lock, flags);
1362     if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
1363         goto out_rcu;
1364 
1365     if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
1366         struct shared_info *shinfo = gpc->khva;
1367         pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1368         mask_bits = (unsigned long *)&shinfo->evtchn_mask;
1369         port_word_bit = xe->port / 64;
1370     } else {
1371         struct compat_shared_info *shinfo = gpc->khva;
1372         pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1373         mask_bits = (unsigned long *)&shinfo->evtchn_mask;
1374         port_word_bit = xe->port / 32;
1375     }
1376 
1377     /*
1378      * If this port wasn't already set, and if it isn't masked, then
1379      * we try to set the corresponding bit in the in-kernel shadow of
1380      * evtchn_pending_sel for the target vCPU. And if *that* wasn't
1381      * already set, then we kick the vCPU in question to write to the
1382      * *real* evtchn_pending_sel in its own guest vcpu_info struct.
1383      */
1384     if (test_and_set_bit(xe->port, pending_bits)) {
1385         rc = 0; /* It was already raised */
1386     } else if (test_bit(xe->port, mask_bits)) {
1387         rc = -ENOTCONN; /* Masked */
1388         kvm_xen_check_poller(vcpu, xe->port);
1389     } else {
1390         rc = 1; /* Delivered to the bitmap in shared_info. */
1391         /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */
1392         read_unlock_irqrestore(&gpc->lock, flags);
1393         gpc = &vcpu->arch.xen.vcpu_info_cache;
1394 
1395         read_lock_irqsave(&gpc->lock, flags);
1396         if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) {
1397             /*
1398              * Could not access the vcpu_info. Set the bit in-kernel
1399              * and prod the vCPU to deliver it for itself.
1400              */
1401             if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
1402                 kick_vcpu = true;
1403             goto out_rcu;
1404         }
1405 
1406         if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
1407             struct vcpu_info *vcpu_info = gpc->khva;
1408             if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) {
1409                 WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
1410                 kick_vcpu = true;
1411             }
1412         } else {
1413             struct compat_vcpu_info *vcpu_info = gpc->khva;
1414             if (!test_and_set_bit(port_word_bit,
1415                           (unsigned long *)&vcpu_info->evtchn_pending_sel)) {
1416                 WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
1417                 kick_vcpu = true;
1418             }
1419         }
1420 
1421         /* For the per-vCPU lapic vector, deliver it as MSI. */
1422         if (kick_vcpu && vcpu->arch.xen.upcall_vector) {
1423             kvm_xen_inject_vcpu_vector(vcpu);
1424             kick_vcpu = false;
1425         }
1426     }
1427 
1428  out_rcu:
1429     read_unlock_irqrestore(&gpc->lock, flags);
1430     srcu_read_unlock(&kvm->srcu, idx);
1431 
1432     if (kick_vcpu) {
1433         kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
1434         kvm_vcpu_kick(vcpu);
1435     }
1436 
1437     return rc;
1438 }
1439 
1440 static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
1441 {
1442     bool mm_borrowed = false;
1443     int rc;
1444 
1445     rc = kvm_xen_set_evtchn_fast(xe, kvm);
1446     if (rc != -EWOULDBLOCK)
1447         return rc;
1448 
1449     if (current->mm != kvm->mm) {
1450         /*
1451          * If not on a thread which already belongs to this KVM,
1452          * we'd better be in the irqfd workqueue.
1453          */
1454         if (WARN_ON_ONCE(current->mm))
1455             return -EINVAL;
1456 
1457         kthread_use_mm(kvm->mm);
1458         mm_borrowed = true;
1459     }
1460 
1461     /*
1462      * For the irqfd workqueue, using the main kvm->lock mutex is
1463      * fine since this function is invoked from kvm_set_irq() with
1464      * no other lock held, no srcu. In future, if it is called
1465      * directly from a vCPU thread (e.g. on hypercall for an IPI)
1466      * then it may need to switch to using a leaf-node mutex for
1467      * serializing the shared_info mapping.
1468      */
1469     mutex_lock(&kvm->lock);
1470 
1471     /*
1472      * It is theoretically possible for the page to be unmapped
1473      * and the MMU notifier to invalidate the shared_info before
1474      * we even get to use it. In that case, this looks like an
1475      * infinite loop. It was tempting to do it via the userspace
1476      * HVA instead... but that just *hides* the fact that it's
1477      * an infinite loop, because if a fault occurs and it waits
1478      * for the page to come back, it can *still* immediately
1479      * fault and have to wait again, repeatedly.
1480      *
1481      * Conversely, the page could also have been reinstated by
1482      * another thread before we even obtain the mutex above, so
1483      * check again *first* before remapping it.
1484      */
1485     do {
1486         struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
1487         int idx;
1488 
1489         rc = kvm_xen_set_evtchn_fast(xe, kvm);
1490         if (rc != -EWOULDBLOCK)
1491             break;
1492 
1493         idx = srcu_read_lock(&kvm->srcu);
1494         rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE);
1495         srcu_read_unlock(&kvm->srcu, idx);
1496     } while(!rc);
1497 
1498     mutex_unlock(&kvm->lock);
1499 
1500     if (mm_borrowed)
1501         kthread_unuse_mm(kvm->mm);
1502 
1503     return rc;
1504 }
1505 
1506 /* This is the version called from kvm_set_irq() as the .set function */
1507 static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
1508              int irq_source_id, int level, bool line_status)
1509 {
1510     if (!level)
1511         return -EINVAL;
1512 
1513     return kvm_xen_set_evtchn(&e->xen_evtchn, kvm);
1514 }
1515 
1516 /*
1517  * Set up an event channel interrupt from the KVM IRQ routing table.
1518  * Used for e.g. PIRQ from passed through physical devices.
1519  */
1520 int kvm_xen_setup_evtchn(struct kvm *kvm,
1521              struct kvm_kernel_irq_routing_entry *e,
1522              const struct kvm_irq_routing_entry *ue)
1524 {
1525     struct kvm_vcpu *vcpu;
1526 
1527     if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
1528         return -EINVAL;
1529 
1530     /* We only support 2 level event channels for now */
1531     if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
1532         return -EINVAL;
1533 
1534     /*
1535      * Xen gives us interesting mappings from vCPU index to APIC ID,
1536      * which means kvm_get_vcpu_by_id() has to iterate over all vCPUs
1537      * to find it. Do that once at setup time, instead of every time.
1538      * But beware that on live update / live migration, the routing
1539      * table might be reinstated before the vCPU threads have finished
1540      * recreating their vCPUs.
1541      */
1542     vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
1543     if (vcpu)
1544         e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx;
1545     else
1546         e->xen_evtchn.vcpu_idx = -1;
1547 
1548     e->xen_evtchn.port = ue->u.xen_evtchn.port;
1549     e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu;
1550     e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
1551     e->set = evtchn_set_fn;
1552 
1553     return 0;
1554 }
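
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * how a VMM might install such a routing entry from userspace with the
 * KVM_SET_GSI_ROUTING ioctl. The helper name, GSI choice and error handling
 * are made up for illustration; 'vcpu' is the KVM vCPU id as passed to
 * KVM_CREATE_VCPU. Guarded out of the build on purpose.
 */
#if 0
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int add_xen_evtchn_route(int vm_fd, __u32 gsi, __u32 port, __u32 vcpu)
{
    struct kvm_irq_routing *r;
    struct kvm_irq_routing_entry *e;
    int ret;

    /* One routing table header plus a single entry. */
    r = calloc(1, sizeof(*r) + sizeof(*e));
    if (!r)
        return -1;

    r->nr = 1;
    e = &r->entries[0];
    e->gsi = gsi;
    e->type = KVM_IRQ_ROUTING_XEN_EVTCHN;
    e->u.xen_evtchn.port = port;
    e->u.xen_evtchn.vcpu = vcpu;
    e->u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;

    /* KVM_SET_GSI_ROUTING replaces the whole table; real VMMs merge. */
    ret = ioctl(vm_fd, KVM_SET_GSI_ROUTING, r);
    free(r);
    return ret;
}
#endif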
1555 
1556 /*
1557  * Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl.
1558  */
1559 int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe)
1560 {
1561     struct kvm_xen_evtchn e;
1562     int ret;
1563 
1564     if (!uxe->port || uxe->port >= max_evtchn_port(kvm))
1565         return -EINVAL;
1566 
1567     /* We only support 2 level event channels for now */
1568     if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
1569         return -EINVAL;
1570 
1571     e.port = uxe->port;
1572     e.vcpu_id = uxe->vcpu;
1573     e.vcpu_idx = -1;
1574     e.priority = uxe->priority;
1575 
1576     ret = kvm_xen_set_evtchn(&e, kvm);
1577 
1578     /*
1579      * None of that 'return 1 if it actually got delivered' nonsense.
1580      * We don't care if it was masked (-ENOTCONN) either.
1581      */
1582     if (ret > 0 || ret == -ENOTCONN)
1583         ret = 0;
1584 
1585     return ret;
1586 }
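
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * injecting an event from userspace via the KVM_XEN_HVM_EVTCHN_SEND ioctl
 * handled above. The helper name is made up; error handling is omitted.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int send_guest_evtchn(int vm_fd, __u32 port, __u32 vcpu)
{
    struct kvm_irq_routing_xen_evtchn uxe = {
        .port = port,
        .vcpu = vcpu,
        .priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
    };

    /* Returns 0 even if the port is masked (-ENOTCONN is swallowed). */
    return ioctl(vm_fd, KVM_XEN_HVM_EVTCHN_SEND, &uxe);
}
#endif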
1587 
1588 /*
1589  * Support for *outbound* event channel events via the EVTCHNOP_send hypercall.
1590  */
1591 struct evtchnfd {
1592     u32 send_port;
1593     u32 type;
1594     union {
1595         struct kvm_xen_evtchn port;
1596         struct {
1597             u32 port; /* zero */
1598             struct eventfd_ctx *ctx;
1599         } eventfd;
1600     } deliver;
1601 };
1602 
1603 /*
1604  * Update target vCPU or priority for a registered sending channel.
1605  */
1606 static int kvm_xen_eventfd_update(struct kvm *kvm,
1607                   struct kvm_xen_hvm_attr *data)
1608 {
1609     u32 port = data->u.evtchn.send_port;
1610     struct evtchnfd *evtchnfd;
1611 
1612     if (!port || port >= max_evtchn_port(kvm))
1613         return -EINVAL;
1614 
1615     mutex_lock(&kvm->lock);
1616     evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port);
1617     mutex_unlock(&kvm->lock);
1618 
1619     if (!evtchnfd)
1620         return -ENOENT;
1621 
1622     /* For an UPDATE, nothing may change except the priority/vcpu */
1623     if (evtchnfd->type != data->u.evtchn.type)
1624         return -EINVAL;
1625 
1626     /*
1627      * Port cannot change, and if it's zero that was an eventfd
1628      * which can't be changed either.
1629      */
1630     if (!evtchnfd->deliver.port.port ||
1631         evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port)
1632         return -EINVAL;
1633 
1634     /* We only support 2 level event channels for now */
1635     if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
1636         return -EINVAL;
1637 
1638     mutex_lock(&kvm->lock);
1639     evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
1640     if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) {
1641         evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
1642         evtchnfd->deliver.port.vcpu_idx = -1;
1643     }
1644     mutex_unlock(&kvm->lock);
1645     return 0;
1646 }
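
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * retargeting an already-assigned channel to another vCPU with
 * KVM_XEN_EVTCHN_UPDATE. The helper name is made up; EVTCHNSTAT_interdomain
 * is the Xen event_channel.h constant, which userspace has to supply itself
 * and which must match the type given at assign time.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int update_evtchn_vcpu(int vm_fd, __u32 send_port, __u32 deliver_port,
                              __u32 new_vcpu)
{
    struct kvm_xen_hvm_attr ha = {
        .type = KVM_XEN_ATTR_TYPE_EVTCHN,
        .u.evtchn = {
            .send_port = send_port,
            .type = EVTCHNSTAT_interdomain,
            .flags = KVM_XEN_EVTCHN_UPDATE,
            .deliver.port = {
                .port = deliver_port, /* must match the assigned port */
                .vcpu = new_vcpu,
                .priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
            },
        },
    };

    return ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &ha);
}
#endif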
1647 
1648 /*
1649  * Configure the target (eventfd or local port delivery) for sending on
1650  * a given event channel.
1651  */
1652 static int kvm_xen_eventfd_assign(struct kvm *kvm,
1653                   struct kvm_xen_hvm_attr *data)
1654 {
1655     u32 port = data->u.evtchn.send_port;
1656     struct eventfd_ctx *eventfd = NULL;
1657     struct evtchnfd *evtchnfd = NULL;
1658     int ret = -EINVAL;
1659 
1660     if (!port || port >= max_evtchn_port(kvm))
1661         return -EINVAL;
1662 
1663     evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL);
1664     if (!evtchnfd)
1665         return -ENOMEM;
1666 
1667     switch (data->u.evtchn.type) {
1668     case EVTCHNSTAT_ipi:
1669         /* IPI must map back to the same port# */
1670         if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port)
1671             goto out; /* -EINVAL */
1672         break;
1673 
1674     case EVTCHNSTAT_interdomain:
1675         if (data->u.evtchn.deliver.port.port) {
1676             if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm))
1677                 goto out; /* -EINVAL */
1678         } else {
1679             eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd);
1680             if (IS_ERR(eventfd)) {
1681                 ret = PTR_ERR(eventfd);
1682                 goto out;
1683             }
1684         }
1685         break;
1686 
1687     case EVTCHNSTAT_virq:
1688     case EVTCHNSTAT_closed:
1689     case EVTCHNSTAT_unbound:
1690     case EVTCHNSTAT_pirq:
1691     default: /* Unknown event channel type */
1692         goto out; /* -EINVAL */
1693     }
1694 
1695     evtchnfd->send_port = data->u.evtchn.send_port;
1696     evtchnfd->type = data->u.evtchn.type;
1697     if (eventfd) {
1698         evtchnfd->deliver.eventfd.ctx = eventfd;
1699     } else {
1700         /* We only support 2 level event channels for now */
1701         if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
1702             goto out; /* -EINVAL */
1703 
1704         evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port;
1705         evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
1706         evtchnfd->deliver.port.vcpu_idx = -1;
1707         evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
1708     }
1709 
1710     mutex_lock(&kvm->lock);
1711     ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1,
1712             GFP_KERNEL);
1713     mutex_unlock(&kvm->lock);
1714     if (ret >= 0)
1715         return 0;
1716 
1717     if (ret == -ENOSPC)
1718         ret = -EEXIST;
1719 out:
1720     if (eventfd)
1721         eventfd_ctx_put(eventfd);
1722     kfree(evtchnfd);
1723     return ret;
1724 }
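
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * registering an outbound interdomain channel whose EVTCHNOP_send is
 * signalled to a userspace eventfd (e.g. a PV backend or xenstore emulation
 * living in the VMM). Helper and variable names are made up; error handling
 * is omitted.
 */
#if 0
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int assign_evtchn_to_eventfd(int vm_fd, __u32 send_port, int *out_efd)
{
    int efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
    struct kvm_xen_hvm_attr ha = {
        .type = KVM_XEN_ATTR_TYPE_EVTCHN,
        .u.evtchn = {
            .send_port = send_port,
            .type = EVTCHNSTAT_interdomain,
            .flags = 0,                     /* 0 == assign */
            .deliver.eventfd = {
                .port = 0,                  /* zero selects eventfd delivery */
                .fd = efd,
            },
        },
    };

    *out_efd = efd;
    return ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &ha);
}
#endif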
1725 
1726 static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
1727 {
1728     struct evtchnfd *evtchnfd;
1729 
1730     mutex_lock(&kvm->lock);
1731     evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port);
1732     mutex_unlock(&kvm->lock);
1733 
1734     if (!evtchnfd)
1735         return -ENOENT;
1736 
1737     if (kvm)
1738         synchronize_srcu(&kvm->srcu);
1739     if (!evtchnfd->deliver.port.port)
1740         eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
1741     kfree(evtchnfd);
1742     return 0;
1743 }
1744 
1745 static int kvm_xen_eventfd_reset(struct kvm *kvm)
1746 {
1747     struct evtchnfd *evtchnfd;
1748     int i;
1749 
1750     mutex_lock(&kvm->lock);
1751     idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
1752         idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port);
1753         synchronize_srcu(&kvm->srcu);
1754         if (!evtchnfd->deliver.port.port)
1755             eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
1756         kfree(evtchnfd);
1757     }
1758     mutex_unlock(&kvm->lock);
1759 
1760     return 0;
1761 }
1762 
1763 static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
1764 {
1765     u32 port = data->u.evtchn.send_port;
1766 
1767     if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET)
1768         return kvm_xen_eventfd_reset(kvm);
1769 
1770     if (!port || port >= max_evtchn_port(kvm))
1771         return -EINVAL;
1772 
1773     if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN)
1774         return kvm_xen_eventfd_deassign(kvm, port);
1775     if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE)
1776         return kvm_xen_eventfd_update(kvm, data);
1777     if (data->u.evtchn.flags)
1778         return -EINVAL;
1779 
1780     return kvm_xen_eventfd_assign(kvm, data);
1781 }
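
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * tearing channels down with the remaining flag values dispatched above.
 * Helper names are made up; error handling is omitted.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int deassign_evtchn(int vm_fd, __u32 send_port)
{
    struct kvm_xen_hvm_attr ha = {
        .type = KVM_XEN_ATTR_TYPE_EVTCHN,
        .u.evtchn = {
            .send_port = send_port,
            .flags = KVM_XEN_EVTCHN_DEASSIGN,
        },
    };

    return ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &ha);
}

/* Drop every registered outbound port, e.g. on guest soft reset. */
static int reset_all_evtchns(int vm_fd)
{
    struct kvm_xen_hvm_attr ha = {
        .type = KVM_XEN_ATTR_TYPE_EVTCHN,
        .u.evtchn.flags = KVM_XEN_EVTCHN_RESET,
    };

    return ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &ha);
}
#endif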
1782 
1783 static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
1784 {
1785     struct evtchnfd *evtchnfd;
1786     struct evtchn_send send;
1787     gpa_t gpa;
1788     int idx;
1789 
1790     idx = srcu_read_lock(&vcpu->kvm->srcu);
1791     gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
1792     srcu_read_unlock(&vcpu->kvm->srcu, idx);
1793 
1794     if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &send, sizeof(send))) {
1795         *r = -EFAULT;
1796         return true;
1797     }
1798 
1799     /* The evtchn_ports idr is protected by vcpu->kvm->srcu */
1800     evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port);
1801     if (!evtchnfd)
1802         return false;
1803 
1804     if (evtchnfd->deliver.port.port) {
1805         int ret = kvm_xen_set_evtchn(&evtchnfd->deliver.port, vcpu->kvm);
1806         if (ret < 0 && ret != -ENOTCONN)
1807             return false;
1808     } else {
1809         eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1);
1810     }
1811 
1812     *r = 0;
1813     return true;
1814 }
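
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * what the guest side looks like. A Xen guest kicks a port with the
 * EVTCHNOP_send hypercall, which lands in the function above whenever the
 * port is registered in the evtchn_ports idr. The helper name is made up;
 * this is ordinary Xen guest code, not host code.
 */
#if 0
#include <xen/interface/event_channel.h>
#include <asm/xen/hypercall.h>

static void guest_notify_port(evtchn_port_t port)
{
    struct evtchn_send send = { .port = port };

    HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
}
#endif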
1815 
1816 void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
1817 {
1818     vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx;
1819     vcpu->arch.xen.poll_evtchn = 0;
1820     timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
1821 }
1822 
1823 void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
1824 {
1825     if (kvm_xen_timer_enabled(vcpu))
1826         kvm_xen_stop_timer(vcpu);
1827 
1828     kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
1829                      &vcpu->arch.xen.runstate_cache);
1830     kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
1831                      &vcpu->arch.xen.vcpu_info_cache);
1832     kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
1833                      &vcpu->arch.xen.vcpu_time_info_cache);
1834     del_timer_sync(&vcpu->arch.xen.poll_timer);
1835 }
1836 
1837 void kvm_xen_init_vm(struct kvm *kvm)
1838 {
1839     idr_init(&kvm->arch.xen.evtchn_ports);
1840 }
1841 
1842 void kvm_xen_destroy_vm(struct kvm *kvm)
1843 {
1844     struct evtchnfd *evtchnfd;
1845     int i;
1846 
1847     kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);
1848 
1849     idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
1850         if (!evtchnfd->deliver.port.port)
1851             eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
1852         kfree(evtchnfd);
1853     }
1854     idr_destroy(&kvm->arch.xen.evtchn_ports);
1855 
1856     if (kvm->arch.xen_hvm_config.msr)
1857         static_branch_slow_dec_deferred(&kvm_xen_enabled);
1858 }