Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0
0002 
0003 /*
0004  * Clocksource driver for the synthetic counter and timers
0005  * provided by the Hyper-V hypervisor to guest VMs, as described
0006  * in the Hyper-V Top Level Functional Spec (TLFS). This driver
0007  * is instruction set architecture independent.
0008  *
0009  * Copyright (C) 2019, Microsoft, Inc.
0010  *
0011  * Author:  Michael Kelley <mikelley@microsoft.com>
0012  */
0013 
0014 #include <linux/percpu.h>
0015 #include <linux/cpumask.h>
0016 #include <linux/clockchips.h>
0017 #include <linux/clocksource.h>
0018 #include <linux/sched_clock.h>
0019 #include <linux/mm.h>
0020 #include <linux/cpuhotplug.h>
0021 #include <linux/interrupt.h>
0022 #include <linux/irq.h>
0023 #include <linux/acpi.h>
0024 #include <clocksource/hyperv_timer.h>
0025 #include <asm/hyperv-tlfs.h>
0026 #include <asm/mshyperv.h>
0027 
/* Per-CPU clockevent devices; allocated in hv_stimer_alloc() */
static struct clock_event_device __percpu *hv_clock_event;

/*
 * Reference-counter reading captured when the clocksource is initialized;
 * subtracted in the sched_clock callbacks so sched_clock starts near zero.
 */
static u64 hv_sched_clock_offset __ro_after_init;

/*
 * If false, we're using the old mechanism for stimer0 interrupts
 * where it sends a VMbus message when it expires. The old
 * mechanism is used when running on older versions of Hyper-V
 * that don't support Direct Mode. While Hyper-V provides
 * four stimer's per CPU, Linux uses only stimer0.
 *
 * Because Direct Mode does not require processing a VMbus
 * message, stimer interrupts can be enabled earlier in the
 * process of booting a CPU, and consistent with when timer
 * interrupts are enabled for other clocksource drivers.
 * However, for legacy versions of Hyper-V when Direct Mode
 * is not enabled, setting up stimer interrupts must be
 * delayed until VMbus is initialized and can process the
 * interrupt message.
 */
static bool direct_mode_enabled;

/* IRQ for stimer0 on per-cpu-IRQ architectures; -1 when not set up */
static int stimer0_irq = -1;

/* SINT used for stimer0 VMbus messages in the legacy (non-Direct) mode */
static int stimer0_message_sint;

/* Per-CPU cookie passed to request_percpu_irq()/free_percpu_irq() */
static DEFINE_PER_CPU(long, stimer0_evt);
0053 /*
0054  * Common code for stimer0 interrupts coming via Direct Mode or
0055  * as a VMbus message.
0056  */
0057 void hv_stimer0_isr(void)
0058 {
0059     struct clock_event_device *ce;
0060 
0061     ce = this_cpu_ptr(hv_clock_event);
0062     ce->event_handler(ce);
0063 }
0064 EXPORT_SYMBOL_GPL(hv_stimer0_isr);
0065 
0066 /*
0067  * stimer0 interrupt handler for architectures that support
0068  * per-cpu interrupts, which also implies Direct Mode.
0069  */
/*
 * stimer0 interrupt handler for architectures that support
 * per-cpu interrupts, which also implies Direct Mode.
 * Thin adapter to the common ISR; irq and dev_id are unused.
 */
static irqreturn_t hv_stimer0_percpu_isr(int irq, void *dev_id)
{
	hv_stimer0_isr();
	return IRQ_HANDLED;
}
0075 
0076 static int hv_ce_set_next_event(unsigned long delta,
0077                 struct clock_event_device *evt)
0078 {
0079     u64 current_tick;
0080 
0081     current_tick = hv_read_reference_counter();
0082     current_tick += delta;
0083     hv_set_register(HV_REGISTER_STIMER0_COUNT, current_tick);
0084     return 0;
0085 }
0086 
/*
 * clockevents ->set_state_shutdown callback: stop stimer0 by zeroing
 * its count and config registers, and mask the Direct Mode per-cpu
 * IRQ if one was set up.
 */
static int hv_ce_shutdown(struct clock_event_device *evt)
{
	hv_set_register(HV_REGISTER_STIMER0_COUNT, 0);
	hv_set_register(HV_REGISTER_STIMER0_CONFIG, 0);
	if (direct_mode_enabled && stimer0_irq >= 0)
		disable_percpu_irq(stimer0_irq);

	return 0;
}
0096 
/*
 * clockevents ->set_state_oneshot callback: configure stimer0 for
 * one-shot operation. auto_enable arms the timer as soon as a count
 * is written by hv_ce_set_next_event().
 */
static int hv_ce_set_oneshot(struct clock_event_device *evt)
{
	union hv_stimer_config timer_cfg;

	/* Build the config from a clean slate; enable + auto_enable */
	timer_cfg.as_uint64 = 0;
	timer_cfg.enable = 1;
	timer_cfg.auto_enable = 1;
	if (direct_mode_enabled) {
		/*
		 * When it expires, the timer will directly interrupt
		 * on the specified hardware vector/IRQ.
		 */
		timer_cfg.direct_mode = 1;
		timer_cfg.apic_vector = HYPERV_STIMER0_VECTOR;
		if (stimer0_irq >= 0)
			enable_percpu_irq(stimer0_irq, IRQ_TYPE_NONE);
	} else {
		/*
		 * When it expires, the timer will generate a VMbus message,
		 * to be handled by the normal VMbus interrupt handler.
		 */
		timer_cfg.direct_mode = 0;
		timer_cfg.sintx = stimer0_message_sint;
	}
	/* Write the assembled config last so the timer starts fully set up */
	hv_set_register(HV_REGISTER_STIMER0_CONFIG, timer_cfg.as_uint64);
	return 0;
}
0124 
0125 /*
0126  * hv_stimer_init - Per-cpu initialization of the clockevent
0127  */
0128 static int hv_stimer_init(unsigned int cpu)
0129 {
0130     struct clock_event_device *ce;
0131 
0132     if (!hv_clock_event)
0133         return 0;
0134 
0135     ce = per_cpu_ptr(hv_clock_event, cpu);
0136     ce->name = "Hyper-V clockevent";
0137     ce->features = CLOCK_EVT_FEAT_ONESHOT;
0138     ce->cpumask = cpumask_of(cpu);
0139     ce->rating = 1000;
0140     ce->set_state_shutdown = hv_ce_shutdown;
0141     ce->set_state_oneshot = hv_ce_set_oneshot;
0142     ce->set_next_event = hv_ce_set_next_event;
0143 
0144     clockevents_config_and_register(ce,
0145                     HV_CLOCK_HZ,
0146                     HV_MIN_DELTA_TICKS,
0147                     HV_MAX_MAX_DELTA_TICKS);
0148     return 0;
0149 }
0150 
0151 /*
0152  * hv_stimer_cleanup - Per-cpu cleanup of the clockevent
0153  */
0154 int hv_stimer_cleanup(unsigned int cpu)
0155 {
0156     struct clock_event_device *ce;
0157 
0158     if (!hv_clock_event)
0159         return 0;
0160 
0161     /*
0162      * In the legacy case where Direct Mode is not enabled
0163      * (which can only be on x86/64), stimer cleanup happens
0164      * relatively early in the CPU offlining process. We
0165      * must unbind the stimer-based clockevent device so
0166      * that the LAPIC timer can take over until clockevents
0167      * are no longer needed in the offlining process. Note
0168      * that clockevents_unbind_device() eventually calls
0169      * hv_ce_shutdown().
0170      *
0171      * The unbind should not be done when Direct Mode is
0172      * enabled because we may be on an architecture where
0173      * there are no other clockevent devices to fallback to.
0174      */
0175     ce = per_cpu_ptr(hv_clock_event, cpu);
0176     if (direct_mode_enabled)
0177         hv_ce_shutdown(ce);
0178     else
0179         clockevents_unbind_device(ce, cpu);
0180 
0181     return 0;
0182 }
0183 EXPORT_SYMBOL_GPL(hv_stimer_cleanup);
0184 
0185 /*
0186  * These placeholders are overridden by arch specific code on
0187  * architectures that need special setup of the stimer0 IRQ because
0188  * they don't support per-cpu IRQs (such as x86/x64).
0189  */
0190 void __weak hv_setup_stimer0_handler(void (*handler)(void))
0191 {
0192 };
0193 
0194 void __weak hv_remove_stimer0_handler(void)
0195 {
0196 };
0197 
0198 /* Called only on architectures with per-cpu IRQs (i.e., not x86/x64) */
0199 static int hv_setup_stimer0_irq(void)
0200 {
0201     int ret;
0202 
0203     ret = acpi_register_gsi(NULL, HYPERV_STIMER0_VECTOR,
0204             ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_HIGH);
0205     if (ret < 0) {
0206         pr_err("Can't register Hyper-V stimer0 GSI. Error %d", ret);
0207         return ret;
0208     }
0209     stimer0_irq = ret;
0210 
0211     ret = request_percpu_irq(stimer0_irq, hv_stimer0_percpu_isr,
0212         "Hyper-V stimer0", &stimer0_evt);
0213     if (ret) {
0214         pr_err("Can't request Hyper-V stimer0 IRQ %d. Error %d",
0215             stimer0_irq, ret);
0216         acpi_unregister_gsi(stimer0_irq);
0217         stimer0_irq = -1;
0218     }
0219     return ret;
0220 }
0221 
0222 static void hv_remove_stimer0_irq(void)
0223 {
0224     if (stimer0_irq == -1) {
0225         hv_remove_stimer0_handler();
0226     } else {
0227         free_percpu_irq(stimer0_irq, &stimer0_evt);
0228         acpi_unregister_gsi(stimer0_irq);
0229         stimer0_irq = -1;
0230     }
0231 }
0232 
0233 /* hv_stimer_alloc - Global initialization of the clockevent and stimer0 */
/*
 * hv_stimer_alloc - Global initialization of the clockevent and stimer0.
 *
 * @have_percpu_irqs: true on architectures where stimer0 can be wired
 *	to a per-cpu Linux IRQ; false where an arch-specific handler
 *	(hv_setup_stimer0_handler) must be used instead.
 *
 * Returns 0 or a positive CPUHP state on success, negative errno on
 * failure. On any failure the per-cpu clockevent allocation is freed
 * and hv_clock_event reset to NULL.
 */
int hv_stimer_alloc(bool have_percpu_irqs)
{
	int ret;

	/*
	 * Synthetic timers are always available except on old versions of
	 * Hyper-V on x86.  In that case, return as error as Linux will use a
	 * clockevent based on emulated LAPIC timer hardware.
	 */
	if (!(ms_hyperv.features & HV_MSR_SYNTIMER_AVAILABLE))
		return -EINVAL;

	hv_clock_event = alloc_percpu(struct clock_event_device);
	if (!hv_clock_event)
		return -ENOMEM;

	direct_mode_enabled = ms_hyperv.misc_features &
			HV_STIMER_DIRECT_MODE_AVAILABLE;

	/*
	 * If Direct Mode isn't enabled, the remainder of the initialization
	 * is done later by hv_stimer_legacy_init()
	 */
	if (!direct_mode_enabled)
		return 0;

	if (have_percpu_irqs) {
		ret = hv_setup_stimer0_irq();
		if (ret)
			goto free_clock_event;
	} else {
		hv_setup_stimer0_handler(hv_stimer0_isr);
	}

	/*
	 * Since we are in Direct Mode, stimer initialization
	 * can be done now with a CPUHP value in the same range
	 * as other clockevent devices.
	 */
	ret = cpuhp_setup_state(CPUHP_AP_HYPERV_TIMER_STARTING,
			"clockevents/hyperv/stimer:starting",
			hv_stimer_init, hv_stimer_cleanup);
	if (ret < 0) {
		/* Also tears down the handler when no per-cpu IRQ is in use */
		hv_remove_stimer0_irq();
		goto free_clock_event;
	}
	return ret;

free_clock_event:
	free_percpu(hv_clock_event);
	hv_clock_event = NULL;
	return ret;
}
EXPORT_SYMBOL_GPL(hv_stimer_alloc);
0288 
0289 /*
0290  * hv_stimer_legacy_init -- Called from the VMbus driver to handle
0291  * the case when Direct Mode is not enabled, and the stimer
0292  * must be initialized late in the CPU onlining process.
0293  *
0294  */
0295 void hv_stimer_legacy_init(unsigned int cpu, int sint)
0296 {
0297     if (direct_mode_enabled)
0298         return;
0299 
0300     /*
0301      * This function gets called by each vCPU, so setting the
0302      * global stimer_message_sint value each time is conceptually
0303      * not ideal, but the value passed in is always the same and
0304      * it avoids introducing yet another interface into this
0305      * clocksource driver just to set the sint in the legacy case.
0306      */
0307     stimer0_message_sint = sint;
0308     (void)hv_stimer_init(cpu);
0309 }
0310 EXPORT_SYMBOL_GPL(hv_stimer_legacy_init);
0311 
0312 /*
0313  * hv_stimer_legacy_cleanup -- Called from the VMbus driver to
0314  * handle the case when Direct Mode is not enabled, and the
0315  * stimer must be cleaned up early in the CPU offlining
0316  * process.
0317  */
0318 void hv_stimer_legacy_cleanup(unsigned int cpu)
0319 {
0320     if (direct_mode_enabled)
0321         return;
0322     (void)hv_stimer_cleanup(cpu);
0323 }
0324 EXPORT_SYMBOL_GPL(hv_stimer_legacy_cleanup);
0325 
0326 /*
0327  * Do a global cleanup of clockevents for the cases of kexec and
0328  * vmbus exit
0329  */
0330 void hv_stimer_global_cleanup(void)
0331 {
0332     int cpu;
0333 
0334     /*
0335      * hv_stime_legacy_cleanup() will stop the stimer if Direct
0336      * Mode is not enabled, and fallback to the LAPIC timer.
0337      */
0338     for_each_present_cpu(cpu) {
0339         hv_stimer_legacy_cleanup(cpu);
0340     }
0341 
0342     if (!hv_clock_event)
0343         return;
0344 
0345     if (direct_mode_enabled) {
0346         cpuhp_remove_state(CPUHP_AP_HYPERV_TIMER_STARTING);
0347         hv_remove_stimer0_irq();
0348         stimer0_irq = -1;
0349     }
0350     free_percpu(hv_clock_event);
0351     hv_clock_event = NULL;
0352 
0353 }
0354 EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup);
0355 
0356 /*
0357  * Code and definitions for the Hyper-V clocksources.  Two
0358  * clocksources are defined: one that reads the Hyper-V defined MSR, and
0359  * the other that uses the TSC reference page feature as defined in the
0360  * TLFS.  The MSR version is for compatibility with old versions of
0361  * Hyper-V and 32-bit x86.  The TSC reference page version is preferred.
0362  */
0363 
/*
 * The TSC reference page shared with the hypervisor. Padded to a full
 * page and page-aligned because its physical address is programmed into
 * the HV_REGISTER_REFERENCE_TSC register, which takes a page frame.
 */
static union {
	struct ms_hyperv_tsc_page page;
	u8 reserved[PAGE_SIZE];
} tsc_pg __aligned(PAGE_SIZE);
0368 
/* Return a pointer to the hypervisor-shared TSC reference page */
struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
{
	return &tsc_pg.page;
}
EXPORT_SYMBOL_GPL(hv_get_tsc_page);
0374 
0375 static u64 notrace read_hv_clock_tsc(void)
0376 {
0377     u64 current_tick = hv_read_tsc_page(hv_get_tsc_page());
0378 
0379     if (current_tick == U64_MAX)
0380         current_tick = hv_get_register(HV_REGISTER_TIME_REF_COUNT);
0381 
0382     return current_tick;
0383 }
0384 
/* clocksource ->read adapter for the TSC page; the argument is unused */
static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg)
{
	return read_hv_clock_tsc();
}
0389 
0390 static u64 notrace read_hv_sched_clock_tsc(void)
0391 {
0392     return (read_hv_clock_tsc() - hv_sched_clock_offset) *
0393         (NSEC_PER_SEC / HV_CLOCK_HZ);
0394 }
0395 
0396 static void suspend_hv_clock_tsc(struct clocksource *arg)
0397 {
0398     u64 tsc_msr;
0399 
0400     /* Disable the TSC page */
0401     tsc_msr = hv_get_register(HV_REGISTER_REFERENCE_TSC);
0402     tsc_msr &= ~BIT_ULL(0);
0403     hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr);
0404 }
0405 
0406 
0407 static void resume_hv_clock_tsc(struct clocksource *arg)
0408 {
0409     phys_addr_t phys_addr = virt_to_phys(&tsc_pg);
0410     u64 tsc_msr;
0411 
0412     /* Re-enable the TSC page */
0413     tsc_msr = hv_get_register(HV_REGISTER_REFERENCE_TSC);
0414     tsc_msr &= GENMASK_ULL(11, 0);
0415     tsc_msr |= BIT_ULL(0) | (u64)phys_addr;
0416     hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr);
0417 }
0418 
#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK
/*
 * clocksource ->enable: mark the HVCLOCK vDSO clock mode as in use so
 * userspace gettimeofday() can read the TSC page directly.
 */
static int hv_cs_enable(struct clocksource *cs)
{
	vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK);
	return 0;
}
#endif
0426 
/*
 * TSC-page-based clocksource: the preferred Hyper-V clocksource.
 * Rating 500 so it wins over the arch counters unless lowered at init
 * (see hv_init_tsc_clocksource).
 */
static struct clocksource hyperv_cs_tsc = {
	.name	= "hyperv_clocksource_tsc_page",
	.rating	= 500,
	.read	= read_hv_clock_tsc_cs,
	.mask	= CLOCKSOURCE_MASK(64),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	.suspend= suspend_hv_clock_tsc,
	.resume	= resume_hv_clock_tsc,
#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK
	.enable = hv_cs_enable,
	.vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK,
#else
	.vdso_clock_mode = VDSO_CLOCKMODE_NONE,
#endif
};
0442 
static u64 notrace read_hv_clock_msr(void)
{
	/*
	 * Read the partition counter to get the current tick count. This count
	 * is set to 0 when the partition is created and is incremented in
	 * 100 nanosecond units.
	 */
	return hv_get_register(HV_REGISTER_TIME_REF_COUNT);
}
0452 
/* clocksource ->read adapter for the MSR counter; the argument is unused */
static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg)
{
	return read_hv_clock_msr();
}
0457 
0458 static u64 notrace read_hv_sched_clock_msr(void)
0459 {
0460     return (read_hv_clock_msr() - hv_sched_clock_offset) *
0461         (NSEC_PER_SEC / HV_CLOCK_HZ);
0462 }
0463 
/*
 * MSR-based clocksource: compatibility fallback for old Hyper-V
 * versions and 32-bit x86; registered only when the TSC page
 * clocksource is unavailable.
 */
static struct clocksource hyperv_cs_msr = {
	.name	= "hyperv_clocksource_msr",
	.rating	= 500,
	.read	= read_hv_clock_msr_cs,
	.mask	= CLOCKSOURCE_MASK(64),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};
0471 
0472 /*
0473  * Reference to pv_ops must be inline so objtool
0474  * detection of noinstr violations can work correctly.
0475  */
#ifdef CONFIG_GENERIC_SCHED_CLOCK
/* Register 'sched_clock' as the kernel's sched clock read function. */
static __always_inline void hv_setup_sched_clock(void *sched_clock)
{
	/*
	 * We're on an architecture with generic sched clock (not x86/x64).
	 * The Hyper-V sched clock read function returns nanoseconds, not
	 * the normal 100ns units of the Hyper-V synthetic clock.
	 */
	sched_clock_register(sched_clock, 64, NSEC_PER_SEC);
}
#elif defined CONFIG_PARAVIRT
/* Register 'sched_clock' as the kernel's sched clock read function. */
static __always_inline void hv_setup_sched_clock(void *sched_clock)
{
	/* We're on x86/x64 *and* using PV ops */
	paravirt_set_sched_clock(sched_clock);
}
#else /* !CONFIG_GENERIC_SCHED_CLOCK && !CONFIG_PARAVIRT */
/* No sched clock registration mechanism available; do nothing. */
static __always_inline void hv_setup_sched_clock(void *sched_clock) {}
#endif /* CONFIG_GENERIC_SCHED_CLOCK */
0495 
0496 static bool __init hv_init_tsc_clocksource(void)
0497 {
0498     u64     tsc_msr;
0499     phys_addr_t phys_addr;
0500 
0501     if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
0502         return false;
0503 
0504     if (hv_root_partition)
0505         return false;
0506 
0507     /*
0508      * If Hyper-V offers TSC_INVARIANT, then the virtualized TSC correctly
0509      * handles frequency and offset changes due to live migration,
0510      * pause/resume, and other VM management operations.  So lower the
0511      * Hyper-V Reference TSC rating, causing the generic TSC to be used.
0512      * TSC_INVARIANT is not offered on ARM64, so the Hyper-V Reference
0513      * TSC will be preferred over the virtualized ARM64 arch counter.
0514      * While the Hyper-V MSR clocksource won't be used since the
0515      * Reference TSC clocksource is present, change its rating as
0516      * well for consistency.
0517      */
0518     if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) {
0519         hyperv_cs_tsc.rating = 250;
0520         hyperv_cs_msr.rating = 250;
0521     }
0522 
0523     hv_read_reference_counter = read_hv_clock_tsc;
0524     phys_addr = virt_to_phys(hv_get_tsc_page());
0525 
0526     /*
0527      * The Hyper-V TLFS specifies to preserve the value of reserved
0528      * bits in registers. So read the existing value, preserve the
0529      * low order 12 bits, and add in the guest physical address
0530      * (which already has at least the low 12 bits set to zero since
0531      * it is page aligned). Also set the "enable" bit, which is bit 0.
0532      */
0533     tsc_msr = hv_get_register(HV_REGISTER_REFERENCE_TSC);
0534     tsc_msr &= GENMASK_ULL(11, 0);
0535     tsc_msr = tsc_msr | 0x1 | (u64)phys_addr;
0536     hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr);
0537 
0538     clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100);
0539 
0540     hv_sched_clock_offset = hv_read_reference_counter();
0541     hv_setup_sched_clock(read_hv_sched_clock_tsc);
0542 
0543     return true;
0544 }
0545 
0546 void __init hv_init_clocksource(void)
0547 {
0548     /*
0549      * Try to set up the TSC page clocksource. If it succeeds, we're
0550      * done. Otherwise, set up the MSR clocksource.  At least one of
0551      * these will always be available except on very old versions of
0552      * Hyper-V on x86.  In that case we won't have a Hyper-V
0553      * clocksource, but Linux will still run with a clocksource based
0554      * on the emulated PIT or LAPIC timer.
0555      */
0556     if (hv_init_tsc_clocksource())
0557         return;
0558 
0559     if (!(ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE))
0560         return;
0561 
0562     hv_read_reference_counter = read_hv_clock_msr;
0563     clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100);
0564 
0565     hv_sched_clock_offset = hv_read_reference_counter();
0566     hv_setup_sched_clock(read_hv_sched_clock_msr);
0567 }