Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Xen time implementation.
0004  *
0005  * This is implemented in terms of a clocksource driver which uses
0006  * the hypervisor clock as a nanosecond timebase, and a clockevent
0007  * driver which uses the hypervisor's timer mechanism.
0008  *
0009  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
0010  */
0011 #include <linux/kernel.h>
0012 #include <linux/interrupt.h>
0013 #include <linux/clocksource.h>
0014 #include <linux/clockchips.h>
0015 #include <linux/gfp.h>
0016 #include <linux/slab.h>
0017 #include <linux/pvclock_gtod.h>
0018 #include <linux/timekeeper_internal.h>
0019 
0020 #include <asm/pvclock.h>
0021 #include <asm/xen/hypervisor.h>
0022 #include <asm/xen/hypercall.h>
0023 
0024 #include <xen/events.h>
0025 #include <xen/features.h>
0026 #include <xen/interface/xen.h>
0027 #include <xen/interface/vcpu.h>
0028 
0029 #include "xen-ops.h"
0030 
0031 /* Minimum amount of time until next clock event fires */
0032 #define TIMER_SLOP  100000
0033 
0034 static u64 xen_sched_clock_offset __read_mostly;
0035 
0036 /* Get the TSC speed from Xen */
0037 static unsigned long xen_tsc_khz(void)
0038 {
0039     struct pvclock_vcpu_time_info *info =
0040         &HYPERVISOR_shared_info->vcpu_info[0].time;
0041 
0042     setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
0043     return pvclock_tsc_khz(info);
0044 }
0045 
0046 static u64 xen_clocksource_read(void)
0047 {
0048         struct pvclock_vcpu_time_info *src;
0049     u64 ret;
0050 
0051     preempt_disable_notrace();
0052     src = &__this_cpu_read(xen_vcpu)->time;
0053     ret = pvclock_clocksource_read(src);
0054     preempt_enable_notrace();
0055     return ret;
0056 }
0057 
0058 static u64 xen_clocksource_get_cycles(struct clocksource *cs)
0059 {
0060     return xen_clocksource_read();
0061 }
0062 
0063 static u64 xen_sched_clock(void)
0064 {
0065     return xen_clocksource_read() - xen_sched_clock_offset;
0066 }
0067 
0068 static void xen_read_wallclock(struct timespec64 *ts)
0069 {
0070     struct shared_info *s = HYPERVISOR_shared_info;
0071     struct pvclock_wall_clock *wall_clock = &(s->wc);
0072         struct pvclock_vcpu_time_info *vcpu_time;
0073 
0074     vcpu_time = &get_cpu_var(xen_vcpu)->time;
0075     pvclock_read_wallclock(wall_clock, vcpu_time, ts);
0076     put_cpu_var(xen_vcpu);
0077 }
0078 
/* x86_platform.get_wallclock hook: forward to the Xen wallclock reader. */
static void xen_get_wallclock(struct timespec64 *now)
{
    xen_read_wallclock(now);
}
0083 
/*
 * x86_platform.set_wallclock hook used where the guest cannot program
 * the machine RTC (see xen_init_time_ops()/xen_hvm_init_time_ops()):
 * always fails with -ENODEV.
 */
static int xen_set_wallclock(const struct timespec64 *now)
{
    return -ENODEV;
}
0088 
/*
 * pvclock_gtod notifier: push the kernel's current time into Xen's
 * wallclock via XENPF_settime64/XENPF_settime32.  Registered only by
 * the initial domain (see xen_time_init()).
 *
 * Returns NOTIFY_OK when the sync is skipped or succeeds, NOTIFY_BAD
 * when the platform hypercall fails.
 */
static int xen_pvclock_gtod_notify(struct notifier_block *nb,
                   unsigned long was_set, void *priv)
{
    /* Protected by the calling core code serialization */
    static struct timespec64 next_sync;

    struct xen_platform_op op;
    struct timespec64 now;
    struct timekeeper *tk = priv;
    /* Assume the 64-bit op works until the hypervisor says otherwise. */
    static bool settime64_supported = true;
    int ret;

    now.tv_sec = tk->xtime_sec;
    now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);

    /*
     * We only take the expensive HV call when the clock was set
     * or when the 11 minutes RTC synchronization time elapsed.
     */
    if (!was_set && timespec64_compare(&now, &next_sync) < 0)
        return NOTIFY_OK;

again:
    if (settime64_supported) {
        op.cmd = XENPF_settime64;
        op.u.settime64.mbz = 0;
        op.u.settime64.secs = now.tv_sec;
        op.u.settime64.nsecs = now.tv_nsec;
        op.u.settime64.system_time = xen_clocksource_read();
    } else {
        op.cmd = XENPF_settime32;
        op.u.settime32.secs = now.tv_sec;
        op.u.settime32.nsecs = now.tv_nsec;
        op.u.settime32.system_time = xen_clocksource_read();
    }

    ret = HYPERVISOR_platform_op(&op);

    /* Hypervisor lacks XENPF_settime64: fall back to the 32-bit op once. */
    if (ret == -ENOSYS && settime64_supported) {
        settime64_supported = false;
        goto again;
    }
    if (ret < 0)
        return NOTIFY_BAD;

    /*
     * Move the next drift compensation time 11 minutes
     * ahead. That's emulating the sync_cmos_clock() update for
     * the hardware RTC.
     */
    next_sync = now;
    next_sync.tv_sec += 11 * 60;

    return NOTIFY_OK;
}
0144 
/* Registered with the timekeeping core by Dom0 only (xen_time_init()). */
static struct notifier_block xen_pvclock_gtod_notifier = {
    .notifier_call = xen_pvclock_gtod_notify,
};
0148 
/*
 * clocksource ->enable callback: record that the PVCLOCK vclock mode is
 * in use.  Always succeeds.
 */
static int xen_cs_enable(struct clocksource *cs)
{
    vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK);
    return 0;
}
0154 
/*
 * Clocksource backed by the Xen pvclock.  The rating is lowered for
 * Dom0 in xen_time_init(); vdso_clock_mode is set later if the
 * secondary time info page could be registered.
 */
static struct clocksource xen_clocksource __read_mostly = {
    .name   = "xen",
    .rating = 400,
    .read   = xen_clocksource_get_cycles,
    .mask   = CLOCKSOURCE_MASK(64),
    .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
    .enable = xen_cs_enable,
};
0163 
0164 /*
0165    Xen clockevent implementation
0166 
0167    Xen has two clockevent implementations:
0168 
0169    The old timer_op one works with all released versions of Xen prior
0170    to version 3.0.4.  This version of the hypervisor provides a
0171    single-shot timer with nanosecond resolution.  However, sharing the
0172    same event channel is a 100Hz tick which is delivered while the
0173    vcpu is running.  We don't care about or use this tick, but it will
0174    cause the core time code to think the timer fired too soon, and
0175    will end up resetting it each time.  It could be filtered, but
0176    doing so has complications when the ktime clocksource is not yet
0177    the xen clocksource (ie, at boot time).
0178 
0179    The new vcpu_op-based timer interface allows the tick timer period
0180    to be changed or turned off.  The tick timer is not useful as a
0181    periodic timer because events are only delivered to running vcpus.
0182    The one-shot timer can report when a timeout is in the past, so
0183    set_next_event is capable of returning -ETIME when appropriate.
0184    This interface is used when available.
0185 */
0186 
0187 
0188 /*
0189   Get a hypervisor absolute time.  In theory we could maintain an
0190   offset between the kernel's time and the hypervisor's time, and
0191   apply that to a kernel's absolute timeout.  Unfortunately the
0192   hypervisor and kernel times can drift even if the kernel is using
0193   the Xen clocksource, because ntp can warp the kernel's clocksource.
0194 */
0195 static s64 get_abs_timeout(unsigned long delta)
0196 {
0197     return xen_clocksource_read() + delta;
0198 }
0199 
/*
 * clockevent shutdown for the legacy timer_op interface: a timeout of
 * zero cancels any pending singleshot timer.
 */
static int xen_timerop_shutdown(struct clock_event_device *evt)
{
    /* cancel timeout */
    HYPERVISOR_set_timer_op(0);

    return 0;
}
0207 
0208 static int xen_timerop_set_next_event(unsigned long delta,
0209                       struct clock_event_device *evt)
0210 {
0211     WARN_ON(!clockevent_state_oneshot(evt));
0212 
0213     if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
0214         BUG();
0215 
0216     /* We may have missed the deadline, but there's no real way of
0217        knowing for sure.  If the event was in the past, then we'll
0218        get an immediate interrupt. */
0219 
0220     return 0;
0221 }
0222 
/*
 * Legacy clockevent device built on HYPERVISOR_set_timer_op.  Deltas
 * are already in nanoseconds, hence mult = 1 and shift = 0.
 */
static struct clock_event_device xen_timerop_clockevent __ro_after_init = {
    .name           = "xen",
    .features       = CLOCK_EVT_FEAT_ONESHOT,

    .max_delta_ns       = 0xffffffff,
    .max_delta_ticks    = 0xffffffff,
    .min_delta_ns       = TIMER_SLOP,
    .min_delta_ticks    = TIMER_SLOP,

    .mult           = 1,
    .shift          = 0,
    .rating         = 500,

    .set_state_shutdown = xen_timerop_shutdown,
    .set_next_event     = xen_timerop_set_next_event,
};
0239 
0240 static int xen_vcpuop_shutdown(struct clock_event_device *evt)
0241 {
0242     int cpu = smp_processor_id();
0243 
0244     if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, xen_vcpu_nr(cpu),
0245                    NULL) ||
0246         HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
0247                    NULL))
0248         BUG();
0249 
0250     return 0;
0251 }
0252 
0253 static int xen_vcpuop_set_oneshot(struct clock_event_device *evt)
0254 {
0255     int cpu = smp_processor_id();
0256 
0257     if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
0258                    NULL))
0259         BUG();
0260 
0261     return 0;
0262 }
0263 
0264 static int xen_vcpuop_set_next_event(unsigned long delta,
0265                      struct clock_event_device *evt)
0266 {
0267     int cpu = smp_processor_id();
0268     struct vcpu_set_singleshot_timer single;
0269     int ret;
0270 
0271     WARN_ON(!clockevent_state_oneshot(evt));
0272 
0273     single.timeout_abs_ns = get_abs_timeout(delta);
0274     /* Get an event anyway, even if the timeout is already expired */
0275     single.flags = 0;
0276 
0277     ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, xen_vcpu_nr(cpu),
0278                  &single);
0279     BUG_ON(ret != 0);
0280 
0281     return ret;
0282 }
0283 
/*
 * Preferred clockevent device, using the vcpu_op singleshot timer
 * (selected in xen_time_init() when the hypervisor supports it).
 * Deltas are already in nanoseconds, hence mult = 1 and shift = 0.
 */
static struct clock_event_device xen_vcpuop_clockevent __ro_after_init = {
    .name = "xen",
    .features = CLOCK_EVT_FEAT_ONESHOT,

    .max_delta_ns = 0xffffffff,
    .max_delta_ticks = 0xffffffff,
    .min_delta_ns = TIMER_SLOP,
    .min_delta_ticks = TIMER_SLOP,

    .mult = 1,
    .shift = 0,
    .rating = 500,

    .set_state_shutdown = xen_vcpuop_shutdown,
    .set_state_oneshot = xen_vcpuop_set_oneshot,
    .set_next_event = xen_vcpuop_set_next_event,
};
0301 
/*
 * The clockevent implementation in use: starts as the legacy timer_op
 * variant and is switched to the vcpu_op variant by xen_time_init()
 * when available.
 */
static const struct clock_event_device *xen_clockevent =
    &xen_timerop_clockevent;

/* Per-cpu clockevent device plus the irq name it was bound with. */
struct xen_clock_event_device {
    struct clock_event_device evt;
    char name[16];
};
/* evt.irq == -1 means "no timer irq bound yet" (see xen_setup_timer()). */
static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 };
0310 
0311 static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
0312 {
0313     struct clock_event_device *evt = this_cpu_ptr(&xen_clock_events.evt);
0314     irqreturn_t ret;
0315 
0316     ret = IRQ_NONE;
0317     if (evt->event_handler) {
0318         evt->event_handler(evt);
0319         ret = IRQ_HANDLED;
0320     }
0321 
0322     return ret;
0323 }
0324 
0325 void xen_teardown_timer(int cpu)
0326 {
0327     struct clock_event_device *evt;
0328     evt = &per_cpu(xen_clock_events, cpu).evt;
0329 
0330     if (evt->irq >= 0) {
0331         unbind_from_irqhandler(evt->irq, NULL);
0332         evt->irq = -1;
0333     }
0334 }
0335 
/*
 * Bind VIRQ_TIMER for @cpu to xen_timer_interrupt() and initialise the
 * cpu's clock_event_device from the selected template.  Must run
 * before xen_setup_cpu_clockevents() registers the device.
 */
void xen_setup_timer(int cpu)
{
    struct xen_clock_event_device *xevt = &per_cpu(xen_clock_events, cpu);
    struct clock_event_device *evt = &xevt->evt;
    int irq;

    /* Double setup is a caller bug; warn, then recover by unbinding. */
    WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu);
    if (evt->irq >= 0)
        xen_teardown_timer(cpu);

    printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

    /* The name must outlive the irq binding, hence stored per-cpu. */
    snprintf(xevt->name, sizeof(xevt->name), "timer%d", cpu);

    irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
                      IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
                      IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
                      xevt->name, NULL);
    /* Best effort: give timer events the highest event priority. */
    (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);

    memcpy(evt, xen_clockevent, sizeof(*evt));

    evt->cpumask = cpumask_of(cpu);
    evt->irq = irq;
}
0361 
0362 
/*
 * Register this cpu's clockevent device with the clockevents core.
 * Assumes xen_setup_timer() already ran for this cpu.
 */
void xen_setup_cpu_clockevents(void)
{
    clockevents_register_device(this_cpu_ptr(&xen_clock_events.evt));
}
0367 
/*
 * Resume hook: re-issue VCPUOP_stop_periodic_timer on every online
 * cpu, mirroring what xen_time_init() did at boot.  Only relevant for
 * the vcpu_op timer interface; failure is fatal.
 */
void xen_timer_resume(void)
{
    int cpu;

    if (xen_clockevent != &xen_vcpuop_clockevent)
        return;

    for_each_online_cpu(cpu) {
        if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
                       xen_vcpu_nr(cpu), NULL))
            BUG();
    }
}
0381 
/* Secondary (vsyscall) time info page, if registered; NULL otherwise. */
static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
/* sched-clock value snapshotted at suspend, consumed on resume. */
static u64 xen_clock_value_saved;

/*
 * Suspend hook: snapshot the current sched-clock value and unregister
 * the secondary time info area from the hypervisor.
 */
void xen_save_time_memory_area(void)
{
    struct vcpu_register_time_memory_area t;
    int ret;

    xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;

    if (!xen_clock)
        return;

    /* A NULL address unregisters the area. */
    t.addr.v = NULL;

    ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
    if (ret != 0)
        pr_notice("Cannot save secondary vcpu_time_info (err %d)",
              ret);
    else
        clear_page(xen_clock);
}
0404 
/*
 * Resume hook: re-register the secondary time info area (if any) and
 * rebuild the sched-clock offset so sched_clock() continues from the
 * value saved in xen_save_time_memory_area().
 */
void xen_restore_time_memory_area(void)
{
    struct vcpu_register_time_memory_area t;
    int ret;

    if (!xen_clock)
        goto out;

    t.addr.v = &xen_clock->pvti;

    ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);

    /*
     * We don't disable VDSO_CLOCKMODE_PVCLOCK entirely if it fails to
     * register the secondary time info with Xen or if we migrated to a
     * host without the necessary flags. On both of these cases what
     * happens is either process seeing a zeroed out pvti or seeing no
     * PVCLOCK_TSC_STABLE_BIT bit set. Userspace checks the latter and
     * if 0, it discards the data in pvti and fallbacks to a system
     * call for a reliable timestamp.
     */
    if (ret != 0)
        pr_notice("Cannot restore secondary vcpu_time_info (err %d)",
              ret);

out:
    /* Need pvclock_resume() before using xen_clocksource_read(). */
    pvclock_resume();
    xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved;
}
0435 
/*
 * Register a secondary pvclock time info page with the hypervisor and,
 * on success, enable VDSO_CLOCKMODE_PVCLOCK on the Xen clocksource.
 * On failure the page is freed — or deliberately left allocated when
 * it cannot be unregistered — and the clock mode stays unset.
 */
static void xen_setup_vsyscall_time_info(void)
{
    struct vcpu_register_time_memory_area t;
    struct pvclock_vsyscall_time_info *ti;
    int ret;

    ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL);
    if (!ti)
        return;

    t.addr.v = &ti->pvti;

    ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
    if (ret) {
        pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (err %d)\n", ret);
        free_page((unsigned long)ti);
        return;
    }

    /*
     * If primary time info had this bit set, secondary should too since
     * it's the same data on both just different memory regions. But we
     * still check it in case hypervisor is buggy.
     */
    if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) {
        /* Unregister before freeing; keep the page if that fails. */
        t.addr.v = NULL;
        ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area,
                     0, &t);
        if (!ret)
            free_page((unsigned long)ti);

        pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (tsc unstable)\n");
        return;
    }

    xen_clock = ti;
    pvclock_set_pvti_cpu0_va(xen_clock);

    xen_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
}
0476 
/*
 * Boot-time timer initialisation: register the Xen clocksource, pick
 * the clockevent implementation, set the initial system time from the
 * Xen wallclock, and bring up this cpu's timer.
 */
static void __init xen_time_init(void)
{
    struct pvclock_vcpu_time_info *pvti;
    int cpu = smp_processor_id();
    struct timespec64 tp;

    /* As Dom0 is never moved, no penalty on using TSC there */
    if (xen_initial_domain())
        xen_clocksource.rating = 275;

    /* The pvclock reader returns nanoseconds, i.e. NSEC_PER_SEC "Hz". */
    clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);

    if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
                   NULL) == 0) {
        /* Successfully turned off 100Hz tick, so we have the
           vcpuop-based timer interface */
        printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
        xen_clockevent = &xen_vcpuop_clockevent;
    }

    /* Set initial system time with full resolution */
    xen_read_wallclock(&tp);
    do_settimeofday64(&tp);

    setup_force_cpu_cap(X86_FEATURE_TSC);

    /*
     * We check ahead on the primary time info if this
     * bit is supported hence speeding up Xen clocksource.
     */
    pvti = &__this_cpu_read(xen_vcpu)->time;
    if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) {
        pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
        xen_setup_vsyscall_time_info();
    }

    xen_setup_runstate_info(cpu);
    xen_setup_timer(cpu);
    xen_setup_cpu_clockevents();

    xen_time_setup_guest();

    /* Only the initial domain syncs kernel time back into Xen. */
    if (xen_initial_domain())
        pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
}
0522 
/*
 * Setup shared by PV and PVHVM: record the sched-clock offset (so
 * xen_sched_clock() starts near zero) and install the Xen-backed
 * steal-clock, sched-clock, TSC-calibration and wallclock hooks.
 */
static void __init xen_init_time_common(void)
{
    xen_sched_clock_offset = xen_clocksource_read();
    static_call_update(pv_steal_clock, xen_steal_clock);
    paravirt_set_sched_clock(xen_sched_clock);

    x86_platform.calibrate_tsc = xen_tsc_khz;
    x86_platform.get_wallclock = xen_get_wallclock;
}
0532 
/* Install the PV timer hooks into the x86 init/platform tables. */
void __init xen_init_time_ops(void)
{
    xen_init_time_common();

    x86_init.timers.timer_init = xen_time_init;
    /* Per-cpu clockevents are set up via xen_setup_cpu_clockevents(). */
    x86_init.timers.setup_percpu_clockev = x86_init_noop;
    x86_cpuinit.setup_percpu_clockev = x86_init_noop;

    /* Dom0 uses the native method to set the hardware RTC. */
    if (!xen_initial_domain())
        x86_platform.set_wallclock = xen_set_wallclock;
}
0545 
0546 #ifdef CONFIG_XEN_PVHVM
/*
 * Per-cpu clockevent bring-up for PVHVM guests, installed as
 * x86_cpuinit.setup_percpu_clockev in xen_hvm_init_time_ops().
 */
static void xen_hvm_setup_cpu_clockevents(void)
{
    int cpu = smp_processor_id();
    xen_setup_runstate_info(cpu);
    /*
     * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence
     * doing it xen_hvm_cpu_notify (which gets called by smp_init during
     * early bootup and also during CPU hotplug events).
     */
    xen_setup_cpu_clockevents();
}
0558 
/*
 * PVHVM timer setup.  May be called more than once: it bails out until
 * its prerequisites (vector callback, safe pvclock, a valid per-cpu
 * xen_vcpu pointer) are satisfied, then does the real work exactly
 * once, guarded by hvm_time_initialized.
 */
void __init xen_hvm_init_time_ops(void)
{
    static bool hvm_time_initialized;

    if (hvm_time_initialized)
        return;

    /*
     * vector callback is needed otherwise we cannot receive interrupts
     * on cpu > 0 and at this point we don't know how many cpus are
     * available.
     */
    if (!xen_have_vector_callback)
        return;

    if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
        pr_info_once("Xen doesn't support pvclock on HVM, disable pv timer");
        return;
    }

    /*
     * Only MAX_VIRT_CPUS 'vcpu_info' are embedded inside 'shared_info'.
     * The __this_cpu_read(xen_vcpu) is still NULL when Xen HVM guest
     * boots on vcpu >= MAX_VIRT_CPUS (e.g., kexec), To access
     * __this_cpu_read(xen_vcpu) via xen_clocksource_read() will panic.
     *
     * The xen_hvm_init_time_ops() should be called again later after
     * __this_cpu_read(xen_vcpu) is available.
     */
    if (!__this_cpu_read(xen_vcpu)) {
        pr_info("Delay xen_init_time_common() as kernel is running on vcpu=%d\n",
            xen_vcpu_nr(0));
        return;
    }

    xen_init_time_common();

    x86_init.timers.setup_percpu_clockev = xen_time_init;
    x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;

    x86_platform.set_wallclock = xen_set_wallclock;

    hvm_time_initialized = true;
}
0603 #endif
0604 
/*
 * Kernel parameter "xen_timer_slop=<ns>": override the minimum delta
 * of both clockevent implementations.  Ticks equal nanoseconds here
 * because both devices use mult = 1 / shift = 0.
 */
static int __init parse_xen_timer_slop(char *ptr)
{
    unsigned long slop = memparse(ptr, NULL);

    xen_timerop_clockevent.min_delta_ns = slop;
    xen_timerop_clockevent.min_delta_ticks = slop;
    xen_vcpuop_clockevent.min_delta_ns = slop;
    xen_vcpuop_clockevent.min_delta_ticks = slop;

    return 0;
}
early_param("xen_timer_slop", parse_xen_timer_slop);