0001 // SPDX-License-Identifier: GPL-2.0-only
0002 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
0003 
0004 #include <linux/kernel.h>
0005 #include <linux/sched.h>
0006 #include <linux/sched/clock.h>
0007 #include <linux/init.h>
0008 #include <linux/export.h>
0009 #include <linux/timer.h>
0010 #include <linux/acpi_pmtmr.h>
0011 #include <linux/cpufreq.h>
0012 #include <linux/delay.h>
0013 #include <linux/clocksource.h>
0014 #include <linux/percpu.h>
0015 #include <linux/timex.h>
0016 #include <linux/static_key.h>
0017 #include <linux/static_call.h>
0018 
0019 #include <asm/hpet.h>
0020 #include <asm/timer.h>
0021 #include <asm/vgtod.h>
0022 #include <asm/time.h>
0023 #include <asm/delay.h>
0024 #include <asm/hypervisor.h>
0025 #include <asm/nmi.h>
0026 #include <asm/x86_init.h>
0027 #include <asm/geode.h>
0028 #include <asm/apic.h>
0029 #include <asm/intel-family.h>
0030 #include <asm/i8259.h>
0031 #include <asm/uv/uv.h>
0032 
0033 unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
0034 EXPORT_SYMBOL(cpu_khz);
0035 
0036 unsigned int __read_mostly tsc_khz;
0037 EXPORT_SYMBOL(tsc_khz);
0038 
0039 #define KHZ 1000
0040 
0041 /*
0042  * TSC can be unstable due to cpufreq or due to unsynced TSCs
0043  */
0044 static int __read_mostly tsc_unstable;
0045 static unsigned int __initdata tsc_early_khz;
0046 
0047 static DEFINE_STATIC_KEY_FALSE(__use_tsc);
0048 
0049 int tsc_clocksource_reliable;
0050 
0051 static u32 art_to_tsc_numerator;
0052 static u32 art_to_tsc_denominator;
0053 static u64 art_to_tsc_offset;
0054 struct clocksource *art_related_clocksource;
0055 
0056 struct cyc2ns {
0057     struct cyc2ns_data data[2]; /*  0 + 2*16 = 32 */
0058     seqcount_latch_t   seq;     /* 32 + 4    = 36 */
0059 
0060 }; /* fits one cacheline */
0061 
0062 static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
0063 
0064 static int __init tsc_early_khz_setup(char *buf)
0065 {
0066     return kstrtouint(buf, 0, &tsc_early_khz);
0067 }
0068 early_param("tsc_early_khz", tsc_early_khz_setup);
0069 
0070 __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
0071 {
0072     int seq, idx;
0073 
0074     preempt_disable_notrace();
0075 
0076     do {
0077         seq = this_cpu_read(cyc2ns.seq.seqcount.sequence);
0078         idx = seq & 1;
0079 
0080         data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
0081         data->cyc2ns_mul    = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
0082         data->cyc2ns_shift  = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
0083 
0084     } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence)));
0085 }
0086 
0087 __always_inline void cyc2ns_read_end(void)
0088 {
0089     preempt_enable_notrace();
0090 }
0091 
0092 /*
0093  * Accelerators for sched_clock()
0094  * convert from cycles(64bits) => nanoseconds (64bits)
0095  *  basic equation:
0096  *              ns = cycles / (freq / ns_per_sec)
0097  *              ns = cycles * (ns_per_sec / freq)
0098  *              ns = cycles * (10^9 / (cpu_khz * 10^3))
0099  *              ns = cycles * (10^6 / cpu_khz)
0100  *
0101  *      Then we use scaling math (suggested by george@mvista.com) to get:
0102  *              ns = cycles * (10^6 * SC / cpu_khz) / SC
0103  *              ns = cycles * cyc2ns_scale / SC
0104  *
0105  *      And since SC is a constant power of two, we can convert the div
0106  *  into a shift. The larger SC is, the more accurate the conversion, but
0107  *  cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
0108  *  (64-bit result) can be used.
0109  *
0110  *  We can use khz divisor instead of mhz to keep a better precision.
0111  *  (mathieu.desnoyers@polymtl.ca)
0112  *
0113  *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
0114  */
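     /*
      * Illustrative worked example (hypothetical numbers, for exposition
      * only): with a 2,500,000 kHz (2.5 GHz) TSC and SC = 2^10,
      * cyc2ns_scale = 10^6 * 1024 / 2,500,000 = 409.6, rounded to 410.
      * 2,500,000,000 cycles (one second) then map to 2.5e9 * 410 >> 10 =
      * 1,000,976,562 ns, about 0.1% high purely from that rounding.
      * clocks_calc_mult_shift() below chooses a much larger shift
      * (clamped to 31), which pushes the rounding error far below a ppm.
      */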
0115 
0116 static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
0117 {
0118     struct cyc2ns_data data;
0119     unsigned long long ns;
0120 
0121     cyc2ns_read_begin(&data);
0122 
0123     ns = data.cyc2ns_offset;
0124     ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
0125 
0126     cyc2ns_read_end();
0127 
0128     return ns;
0129 }
0130 
0131 static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
0132 {
0133     unsigned long long ns_now;
0134     struct cyc2ns_data data;
0135     struct cyc2ns *c2n;
0136 
0137     ns_now = cycles_2_ns(tsc_now);
0138 
0139     /*
0140      * Compute a new multiplier as per the above comment and ensure our
0141      * time function is continuous; see the comment near struct
0142      * cyc2ns_data.
0143      */
0144     clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
0145                    NSEC_PER_MSEC, 0);
0146 
0147     /*
0148      * cyc2ns_shift is exported via arch_perf_update_userpage() where it is
0149      * not expected to be greater than 31 due to the original published
0150      * conversion algorithm shifting a 32-bit value (now specifies a 64-bit
0151      * value) - refer perf_event_mmap_page documentation in perf_event.h.
0152      */
0153     if (data.cyc2ns_shift == 32) {
0154         data.cyc2ns_shift = 31;
0155         data.cyc2ns_mul >>= 1;
0156     }
0157 
0158     data.cyc2ns_offset = ns_now -
0159         mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);
0160 
0161     c2n = per_cpu_ptr(&cyc2ns, cpu);
0162 
0163     raw_write_seqcount_latch(&c2n->seq);
0164     c2n->data[0] = data;
0165     raw_write_seqcount_latch(&c2n->seq);
0166     c2n->data[1] = data;
0167 }
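     /*
      * Illustrative example of the continuity fixup above (hypothetical
      * numbers): if the old parameters yielded ns_now = 1,000,000,000 at
      * tsc_now = 2,600,000,000 and the new mul/shift alone would map
      * tsc_now to 1,040,000,000 ns, then cyc2ns_offset becomes
      * -40,000,000 (stored as a wrapped u64), so the first reading with
      * the new parameters still starts at 1,000,000,000 ns and
      * sched_clock() does not jump at the switch-over.
      */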
0168 
0169 static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
0170 {
0171     unsigned long flags;
0172 
0173     local_irq_save(flags);
0174     sched_clock_idle_sleep_event();
0175 
0176     if (khz)
0177         __set_cyc2ns_scale(khz, cpu, tsc_now);
0178 
0179     sched_clock_idle_wakeup_event();
0180     local_irq_restore(flags);
0181 }
0182 
0183 /*
0184  * Initialize cyc2ns for boot cpu
0185  */
0186 static void __init cyc2ns_init_boot_cpu(void)
0187 {
0188     struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns);
0189 
0190     seqcount_latch_init(&c2n->seq);
0191     __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc());
0192 }
0193 
0194 /*
0195  * Secondary CPUs do not run through tsc_init(), so set up
0196  * all the scale factors for all CPUs, assuming the same
0197  * speed as the bootup CPU.
0198  */
0199 static void __init cyc2ns_init_secondary_cpus(void)
0200 {
0201     unsigned int cpu, this_cpu = smp_processor_id();
0202     struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns);
0203     struct cyc2ns_data *data = c2n->data;
0204 
0205     for_each_possible_cpu(cpu) {
0206         if (cpu != this_cpu) {
0207             seqcount_latch_init(&c2n->seq);
0208             c2n = per_cpu_ptr(&cyc2ns, cpu);
0209             c2n->data[0] = data[0];
0210             c2n->data[1] = data[1];
0211         }
0212     }
0213 }
0214 
0215 /*
0216  * Scheduler clock - returns current time in nanosec units.
0217  */
0218 u64 native_sched_clock(void)
0219 {
0220     if (static_branch_likely(&__use_tsc)) {
0221         u64 tsc_now = rdtsc();
0222 
0223         /* return the value in ns */
0224         return cycles_2_ns(tsc_now);
0225     }
0226 
0227     /*
0228      * Fall back to jiffies if there's no TSC available:
0229      * ( But note that we still use it if the TSC is marked
0230      *   unstable. We do this because unlike Time Of Day,
0231      *   the scheduler clock tolerates small errors and it's
0232      *   very important for it to be as fast as the platform
0233      *   can achieve it. )
0234      */
0235 
0236     /* No locking but a rare wrong value is not a big deal: */
0237     return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
0238 }
0239 
0240 /*
0241  * Generate a sched_clock if you already have a TSC value.
0242  */
0243 u64 native_sched_clock_from_tsc(u64 tsc)
0244 {
0245     return cycles_2_ns(tsc);
0246 }
0247 
0248 /* We need to define a real function for sched_clock, to override the
0249    weak default version */
0250 #ifdef CONFIG_PARAVIRT
0251 unsigned long long sched_clock(void)
0252 {
0253     return paravirt_sched_clock();
0254 }
0255 
0256 bool using_native_sched_clock(void)
0257 {
0258     return static_call_query(pv_sched_clock) == native_sched_clock;
0259 }
0260 #else
0261 unsigned long long
0262 sched_clock(void) __attribute__((alias("native_sched_clock")));
0263 
0264 bool using_native_sched_clock(void) { return true; }
0265 #endif
0266 
0267 int check_tsc_unstable(void)
0268 {
0269     return tsc_unstable;
0270 }
0271 EXPORT_SYMBOL_GPL(check_tsc_unstable);
0272 
0273 #ifdef CONFIG_X86_TSC
0274 int __init notsc_setup(char *str)
0275 {
0276     mark_tsc_unstable("boot parameter notsc");
0277     return 1;
0278 }
0279 #else
0280 /*
0281  * disable flag for tsc. Takes effect by clearing the TSC cpu flag
0282  * in cpu/common.c
0283  */
0284 int __init notsc_setup(char *str)
0285 {
0286     setup_clear_cpu_cap(X86_FEATURE_TSC);
0287     return 1;
0288 }
0289 #endif
0290 
0291 __setup("notsc", notsc_setup);
0292 
0293 static int no_sched_irq_time;
0294 static int no_tsc_watchdog;
0295 
0296 static int __init tsc_setup(char *str)
0297 {
0298     if (!strcmp(str, "reliable"))
0299         tsc_clocksource_reliable = 1;
0300     if (!strncmp(str, "noirqtime", 9))
0301         no_sched_irq_time = 1;
0302     if (!strcmp(str, "unstable"))
0303         mark_tsc_unstable("boot parameter");
0304     if (!strcmp(str, "nowatchdog"))
0305         no_tsc_watchdog = 1;
0306     return 1;
0307 }
0308 
0309 __setup("tsc=", tsc_setup);
0310 
0311 #define MAX_RETRIES     5
0312 #define TSC_DEFAULT_THRESHOLD   0x20000
0313 
0314 /*
0315  * Read TSC and the reference counters. Take care of any disturbances
0316  */
0317 static u64 tsc_read_refs(u64 *p, int hpet)
0318 {
0319     u64 t1, t2;
0320     u64 thresh = tsc_khz ? tsc_khz >> 5 : TSC_DEFAULT_THRESHOLD;
0321     int i;
0322 
0323     for (i = 0; i < MAX_RETRIES; i++) {
0324         t1 = get_cycles();
0325         if (hpet)
0326             *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
0327         else
0328             *p = acpi_pm_read_early();
0329         t2 = get_cycles();
0330         if ((t2 - t1) < thresh)
0331             return t2;
0332     }
0333     return ULLONG_MAX;
0334 }
0335 
0336 /*
0337  * Calculate the TSC frequency from HPET reference
0338  */
0339 static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
0340 {
0341     u64 tmp;
0342 
0343     if (hpet2 < hpet1)
0344         hpet2 += 0x100000000ULL;
0345     hpet2 -= hpet1;
0346     tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
0347     do_div(tmp, 1000000);
0348     deltatsc = div64_u64(deltatsc, tmp);
0349 
0350     return (unsigned long) deltatsc;
0351 }
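     /*
      * Illustrative example (hypothetical numbers): HPET_PERIOD is the
      * tick period in femtoseconds, so for a ~14.318 MHz HPET
      * (period ~69,841,279 fs) a delta of 715,909 ticks gives
      * tmp ~= 715,909 * 69,841,279 / 10^6 ~= 50,000,000 ns (50 ms).
      * Since the caller passes deltatsc already multiplied by 10^6,
      * 125,000,000 TSC cycles in that window yield
      * 1.25e14 / 5e7 = 2,500,000 kHz, i.e. a 2.5 GHz TSC.
      */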
0352 
0353 /*
0354  * Calculate the TSC frequency from PMTimer reference
0355  */
0356 static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
0357 {
0358     u64 tmp;
0359 
0360     if (!pm1 && !pm2)
0361         return ULONG_MAX;
0362 
0363     if (pm2 < pm1)
0364         pm2 += (u64)ACPI_PM_OVRRUN;
0365     pm2 -= pm1;
0366     tmp = pm2 * 1000000000LL;
0367     do_div(tmp, PMTMR_TICKS_PER_SEC);
0368     do_div(deltatsc, tmp);
0369 
0370     return (unsigned long) deltatsc;
0371 }
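     /*
      * Same idea as the HPET example above, only the ACPI PM timer ticks
      * at PMTMR_TICKS_PER_SEC = 3,579,545 Hz: tmp = pm_ticks * 10^9 /
      * 3,579,545 is the elapsed time in ns, so (hypothetically) 178,977
      * PM ticks again correspond to roughly 50 ms and the same
      * 125,000,000 TSC cycles again calibrate to roughly 2,500,000 kHz.
      */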
0372 
0373 #define CAL_MS      10
0374 #define CAL_LATCH   (PIT_TICK_RATE / (1000 / CAL_MS))
0375 #define CAL_PIT_LOOPS   1000
0376 
0377 #define CAL2_MS     50
0378 #define CAL2_LATCH  (PIT_TICK_RATE / (1000 / CAL2_MS))
0379 #define CAL2_PIT_LOOPS  5000
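     /*
      * Worked out (PIT_TICK_RATE = 1,193,182 Hz): CAL_LATCH =
      * 1,193,182 / (1000 / 10) = 11,931 PIT ticks ~= 10 ms and
      * CAL2_LATCH = 1,193,182 / (1000 / 50) = 59,659 ticks ~= 50 ms.
      * The *_PIT_LOOPS values demand at least that many successful PIT
      * reads inside the window, i.e. roughly one read every 10 us.
      */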
0380 
0381 
0382 /*
0383  * Try to calibrate the TSC against the Programmable
0384  * Interrupt Timer and return the frequency of the TSC
0385  * in kHz.
0386  *
0387  * Return ULONG_MAX on failure to calibrate.
0388  */
0389 static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
0390 {
0391     u64 tsc, t1, t2, delta;
0392     unsigned long tscmin, tscmax;
0393     int pitcnt;
0394 
0395     if (!has_legacy_pic()) {
0396         /*
0397          * Relies on tsc_early_delay_calibrate() to have given us semi
0398          * usable udelay(), wait for the same 50ms we would have with
0399          * the PIT loop below.
0400          */
0401         udelay(10 * USEC_PER_MSEC);
0402         udelay(10 * USEC_PER_MSEC);
0403         udelay(10 * USEC_PER_MSEC);
0404         udelay(10 * USEC_PER_MSEC);
0405         udelay(10 * USEC_PER_MSEC);
0406         return ULONG_MAX;
0407     }
0408 
0409     /* Set the Gate high, disable speaker */
0410     outb((inb(0x61) & ~0x02) | 0x01, 0x61);
0411 
0412     /*
0413      * Setup CTC channel 2* for mode 0, (interrupt on terminal
0414      * count mode), binary count. Set the latch register to 50ms
0415      * (LSB then MSB) to begin countdown.
0416      */
0417     outb(0xb0, 0x43);
0418     outb(latch & 0xff, 0x42);
0419     outb(latch >> 8, 0x42);
0420 
0421     tsc = t1 = t2 = get_cycles();
0422 
0423     pitcnt = 0;
0424     tscmax = 0;
0425     tscmin = ULONG_MAX;
0426     while ((inb(0x61) & 0x20) == 0) {
0427         t2 = get_cycles();
0428         delta = t2 - tsc;
0429         tsc = t2;
0430         if ((unsigned long) delta < tscmin)
0431             tscmin = (unsigned int) delta;
0432         if ((unsigned long) delta > tscmax)
0433             tscmax = (unsigned int) delta;
0434         pitcnt++;
0435     }
0436 
0437     /*
0438      * Sanity checks:
0439      *
0440      * If we were not able to read the PIT more than loopmin
0441      * times, then we have been hit by a massive SMI
0442      *
0443      * If the maximum is 10 times larger than the minimum,
0444      * then we got hit by an SMI as well.
0445      */
0446     if (pitcnt < loopmin || tscmax > 10 * tscmin)
0447         return ULONG_MAX;
0448 
0449     /* Calculate the PIT value */
0450     delta = t2 - t1;
0451     do_div(delta, ms);
0452     return delta;
0453 }
0454 
0455 /*
0456  * This reads the current MSB of the PIT counter, and
0457  * checks if we are running on sufficiently fast and
0458  * non-virtualized hardware.
0459  *
0460  * Our expectations are:
0461  *
0462  *  - the PIT is running at roughly 1.19MHz
0463  *
0464  *  - each IO is going to take about 1us on real hardware,
0465  *    but we allow it to be much faster (by a factor of 10) or
0466  *    _slightly_ slower (ie we allow up to a 2us read+counter
0467  *    update - anything else implies an unacceptably slow CPU
0468  *    or PIT for the fast calibration to work).
0469  *
0470  *  - with 256 PIT ticks to read the value, we have 214us to
0471  *    see the same MSB (and overhead like doing a single TSC
0472  *    read per MSB value etc).
0473  *
0474  *  - We're doing 2 reads per loop (LSB, MSB), and we expect
0475  *    them each to take about a microsecond on real hardware.
0476  *    So we expect a count value of around 100. But we'll be
0477  *    generous, and accept anything over 50.
0478  *
0479  *  - if the PIT is stuck, and we see *many* more reads, we
0480  *    return early (and the next caller of pit_expect_msb()
0481  *    will then consider it a failure when they don't see the
0482  *    next expected value).
0483  *
0484  * These expectations mean that we know that we have seen the
0485  * transition from one expected value to another with a fairly
0486  * high accuracy, and we didn't miss any events. We can thus
0487  * use the TSC value at the transitions to calculate a pretty
0488  * good value for the TSC frequency.
0489  */
0490 static inline int pit_verify_msb(unsigned char val)
0491 {
0492     /* Ignore LSB */
0493     inb(0x42);
0494     return inb(0x42) == val;
0495 }
0496 
0497 static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
0498 {
0499     int count;
0500     u64 tsc = 0, prev_tsc = 0;
0501 
0502     for (count = 0; count < 50000; count++) {
0503         if (!pit_verify_msb(val))
0504             break;
0505         prev_tsc = tsc;
0506         tsc = get_cycles();
0507     }
0508     *deltap = get_cycles() - prev_tsc;
0509     *tscp = tsc;
0510 
0511     /*
0512      * We require _some_ success, but the quality control
0513      * will be based on the error terms on the TSC values.
0514      */
0515     return count > 5;
0516 }
0517 
0518 /*
0519  * How many MSB values do we want to see? We aim for
0520  * a maximum error rate of 500ppm (in practice the
0521  * real error is much smaller), but refuse to spend
0522  * more than 50ms on it.
0523  */
0524 #define MAX_QUICK_PIT_MS 50
0525 #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
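     /*
      * Worked out: MAX_QUICK_PIT_ITERATIONS = 50 * 1,193,182 / 1000 / 256
      * = 233. Each MSB step is 256 PIT ticks ~= 214.6 us, so 233 steps
      * cover just under the 50 ms budget. The "delta >> 11" tests below
      * bound the measurement uncertainty to delta / 2048, i.e. to less
      * than ~500 ppm (1/2048 ~= 488 ppm).
      */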
0526 
0527 static unsigned long quick_pit_calibrate(void)
0528 {
0529     int i;
0530     u64 tsc, delta;
0531     unsigned long d1, d2;
0532 
0533     if (!has_legacy_pic())
0534         return 0;
0535 
0536     /* Set the Gate high, disable speaker */
0537     outb((inb(0x61) & ~0x02) | 0x01, 0x61);
0538 
0539     /*
0540      * Counter 2, mode 0 (one-shot), binary count
0541      *
0542      * NOTE! Mode 2 decrements by two (and then the
0543      * output is flipped each time, giving the same
0544      * final output frequency as a decrement-by-one),
0545      * so mode 0 is much better when looking at the
0546      * individual counts.
0547      */
0548     outb(0xb0, 0x43);
0549 
0550     /* Start at 0xffff */
0551     outb(0xff, 0x42);
0552     outb(0xff, 0x42);
0553 
0554     /*
0555      * The PIT starts counting at the next edge, so we
0556      * need to delay for a microsecond. The easiest way
0557      * to do that is to just read back the 16-bit counter
0558      * once from the PIT.
0559      */
0560     pit_verify_msb(0);
0561 
0562     if (pit_expect_msb(0xff, &tsc, &d1)) {
0563         for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
0564             if (!pit_expect_msb(0xff-i, &delta, &d2))
0565                 break;
0566 
0567             delta -= tsc;
0568 
0569             /*
0570              * Extrapolate the error and fail fast if the error will
0571              * never be below 500 ppm.
0572              */
0573             if (i == 1 &&
0574                 d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
0575                 return 0;
0576 
0577             /*
0578              * Iterate until the error is less than 500 ppm
0579              */
0580             if (d1+d2 >= delta >> 11)
0581                 continue;
0582 
0583             /*
0584              * Check the PIT one more time to verify that
0585              * all TSC reads were stable wrt the PIT.
0586              *
0587              * This also guarantees serialization of the
0588              * last cycle read ('d2') in pit_expect_msb.
0589              */
0590             if (!pit_verify_msb(0xfe - i))
0591                 break;
0592             goto success;
0593         }
0594     }
0595     pr_info("Fast TSC calibration failed\n");
0596     return 0;
0597 
0598 success:
0599     /*
0600      * Ok, if we get here, then we've seen the
0601      * MSB of the PIT decrement 'i' times, and the
0602      * error has shrunk to less than 500 ppm.
0603      *
0604      * As a result, we can depend on there not being
0605      * any odd delays anywhere, and the TSC reads are
0606      * reliable (within the error).
0607      *
0608      * kHz = ticks / time-in-seconds / 1000;
0609      * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
0610      * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
0611      */
0612     delta *= PIT_TICK_RATE;
0613     do_div(delta, i*256*1000);
0614     pr_info("Fast TSC calibration using PIT\n");
0615     return delta;
0616 }
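     /*
      * Illustrative example of the formula above (hypothetical numbers):
      * i = 233 MSB steps span 233 * 256 / 1,193,182 s ~= 49.99 ms, so a
      * TSC that advanced delta ~= 150,000,000 cycles in that window
      * calibrates to roughly 150,000,000 * 1,193,182 / (233 * 256 * 1000)
      * ~= 3,000,000 kHz, i.e. a ~3 GHz TSC.
      */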
0617 
0618 /**
0619  * native_calibrate_tsc
0620  * Determine TSC frequency via CPUID, else return 0.
0621  */
0622 unsigned long native_calibrate_tsc(void)
0623 {
0624     unsigned int eax_denominator, ebx_numerator, ecx_hz, edx;
0625     unsigned int crystal_khz;
0626 
0627     if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
0628         return 0;
0629 
0630     if (boot_cpu_data.cpuid_level < 0x15)
0631         return 0;
0632 
0633     eax_denominator = ebx_numerator = ecx_hz = edx = 0;
0634 
0635     /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
0636     cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);
0637 
0638     if (ebx_numerator == 0 || eax_denominator == 0)
0639         return 0;
0640 
0641     crystal_khz = ecx_hz / 1000;
0642 
0643     /*
0644      * Denverton SoCs don't report crystal clock, and also don't support
0645      * CPUID.0x16 for the calculation below, so hardcode the 25MHz crystal
0646      * clock.
0647      */
0648     if (crystal_khz == 0 &&
0649             boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_D)
0650         crystal_khz = 25000;
0651 
0652     /*
0653      * TSC frequency reported directly by CPUID is a "hardware reported"
0654  * frequency and is the most accurate one we have so far. This
0655      * is considered a known frequency.
0656      */
0657     if (crystal_khz != 0)
0658         setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
0659 
0660     /*
0661      * Some Intel SoCs like Skylake and Kabylake don't report the crystal
0662      * clock, but we can easily calculate it to a high degree of accuracy
0663      * by considering the crystal ratio and the CPU speed.
0664      */
0665     if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= 0x16) {
0666         unsigned int eax_base_mhz, ebx, ecx, edx;
0667 
0668         cpuid(0x16, &eax_base_mhz, &ebx, &ecx, &edx);
0669         crystal_khz = eax_base_mhz * 1000 *
0670             eax_denominator / ebx_numerator;
0671     }
0672 
0673     if (crystal_khz == 0)
0674         return 0;
0675 
0676     /*
0677      * For Atom SoCs TSC is the only reliable clocksource.
0678      * Mark TSC reliable so no watchdog on it.
0679      */
0680     if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT)
0681         setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
0682 
0683 #ifdef CONFIG_X86_LOCAL_APIC
0684     /*
0685      * The local APIC appears to be fed by the core crystal clock
0686      * (which sounds entirely sensible). We can set the global
0687      * lapic_timer_period here to avoid having to calibrate the APIC
0688      * timer later.
0689      */
0690     lapic_timer_period = crystal_khz * 1000 / HZ;
0691 #endif
0692 
0693     return crystal_khz * ebx_numerator / eax_denominator;
0694 }
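     /*
      * Illustrative example (hypothetical CPUID values): if leaf 0x15
      * returned EAX (denominator) = 2, EBX (numerator) = 200 and
      * ECX = 24,000,000 Hz, then crystal_khz = 24,000 and the function
      * reports 24,000 * 200 / 2 = 2,400,000 kHz (2.4 GHz). Had ECX been
      * zero, a 2400 MHz base frequency from leaf 0x16 would recover the
      * same crystal: 2,400,000 * 2 / 200 = 24,000 kHz.
      */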
0695 
0696 static unsigned long cpu_khz_from_cpuid(void)
0697 {
0698     unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx;
0699 
0700     if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
0701         return 0;
0702 
0703     if (boot_cpu_data.cpuid_level < 0x16)
0704         return 0;
0705 
0706     eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0;
0707 
0708     cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx);
0709 
0710     return eax_base_mhz * 1000;
0711 }
0712 
0713 /*
0714  * calibrate cpu using pit, hpet, and ptimer methods. They are available
0715  * later in boot after acpi is initialized.
0716  */
0717 static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
0718 {
0719     u64 tsc1, tsc2, delta, ref1, ref2;
0720     unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
0721     unsigned long flags, latch, ms;
0722     int hpet = is_hpet_enabled(), i, loopmin;
0723 
0724     /*
0725      * Run 5 calibration loops to get the lowest frequency value
0726      * (the best estimate). We use two different calibration modes
0727      * here:
0728      *
0729      * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and
0730      * load a timeout of 50ms. We read the time right after we
0731      * started the timer and wait until the PIT count down reaches
0732      * zero. In each wait loop iteration we read the TSC and check
0733      * the delta to the previous read. We keep track of the min
0734      * and max values of that delta. The delta is mostly defined
0735      * by the IO time of the PIT access, so we can detect when
0736      * any disturbance happened between the two reads. If the
0737      * maximum time is significantly larger than the minimum time,
0738      * then we discard the result and have another try.
0739      *
0740      * 2) Reference counter. If available we use the HPET or the
0741      * PMTIMER as a reference to check the sanity of that value.
0742      * We use separate TSC readouts and check inside of the
0743      * reference read for any possible disturbance. We discard
0744      * disturbed values here as well. We do that around the PIT
0745      * calibration delay loop as we have to wait for a certain
0746      * amount of time anyway.
0747      */
0748 
0749     /* Preset PIT loop values */
0750     latch = CAL_LATCH;
0751     ms = CAL_MS;
0752     loopmin = CAL_PIT_LOOPS;
0753 
0754     for (i = 0; i < 3; i++) {
0755         unsigned long tsc_pit_khz;
0756 
0757         /*
0758          * Read the start value and the reference count of
0759          * hpet/pmtimer when available. Then do the PIT
0760          * calibration, which will take at least 50ms, and
0761          * read the end value.
0762          */
0763         local_irq_save(flags);
0764         tsc1 = tsc_read_refs(&ref1, hpet);
0765         tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
0766         tsc2 = tsc_read_refs(&ref2, hpet);
0767         local_irq_restore(flags);
0768 
0769         /* Pick the lowest PIT TSC calibration so far */
0770         tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
0771 
0772         /* hpet or pmtimer available ? */
0773         if (ref1 == ref2)
0774             continue;
0775 
0776         /* Check, whether the sampling was disturbed */
0777         if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
0778             continue;
0779 
0780         tsc2 = (tsc2 - tsc1) * 1000000LL;
0781         if (hpet)
0782             tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
0783         else
0784             tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
0785 
0786         tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
0787 
0788         /* Check the reference deviation */
0789         delta = ((u64) tsc_pit_min) * 100;
0790         do_div(delta, tsc_ref_min);
0791 
0792         /*
0793          * If both calibration results are inside a 10% window
0794      * then we can be sure that the calibration
0795          * succeeded. We break out of the loop right away. We
0796          * use the reference value, as it is more precise.
0797          */
0798         if (delta >= 90 && delta <= 110) {
0799             pr_info("PIT calibration matches %s. %d loops\n",
0800                 hpet ? "HPET" : "PMTIMER", i + 1);
0801             return tsc_ref_min;
0802         }
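         /*
          * Illustrative example (hypothetical numbers): tsc_pit_min =
          * 2,497,000 kHz against tsc_ref_min = 2,500,000 kHz gives
          * delta = 2,497,000 * 100 / 2,500,000 = 99, which is inside
          * [90, 110], so the reference value 2,500,000 kHz is returned.
          */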
0803 
0804         /*
0805          * Check whether PIT failed more than once. This
0806          * happens in virtualized environments. We need to
0807          * give the virtual PC a slightly longer timeframe for
0808          * the HPET/PMTIMER to make the result precise.
0809          */
0810         if (i == 1 && tsc_pit_min == ULONG_MAX) {
0811             latch = CAL2_LATCH;
0812             ms = CAL2_MS;
0813             loopmin = CAL2_PIT_LOOPS;
0814         }
0815     }
0816 
0817     /*
0818      * Now check the results.
0819      */
0820     if (tsc_pit_min == ULONG_MAX) {
0821         /* PIT gave no useful value */
0822         pr_warn("Unable to calibrate against PIT\n");
0823 
0824         /* We don't have an alternative source, disable TSC */
0825         if (!hpet && !ref1 && !ref2) {
0826             pr_notice("No reference (HPET/PMTIMER) available\n");
0827             return 0;
0828         }
0829 
0830         /* The alternative source failed as well, disable TSC */
0831         if (tsc_ref_min == ULONG_MAX) {
0832             pr_warn("HPET/PMTIMER calibration failed\n");
0833             return 0;
0834         }
0835 
0836         /* Use the alternative source */
0837         pr_info("using %s reference calibration\n",
0838             hpet ? "HPET" : "PMTIMER");
0839 
0840         return tsc_ref_min;
0841     }
0842 
0843     /* We don't have an alternative source, use the PIT calibration value */
0844     if (!hpet && !ref1 && !ref2) {
0845         pr_info("Using PIT calibration value\n");
0846         return tsc_pit_min;
0847     }
0848 
0849     /* The alternative source failed, use the PIT calibration value */
0850     if (tsc_ref_min == ULONG_MAX) {
0851         pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n");
0852         return tsc_pit_min;
0853     }
0854 
0855     /*
0856      * The calibration values differ too much. When in doubt, we use
0857      * the PIT value as we know that there are PMTIMERs around
0858      * running at double speed. At least we let the user know:
0859      */
0860     pr_warn("PIT calibration deviates from %s: %lu %lu\n",
0861         hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
0862     pr_info("Using PIT calibration value\n");
0863     return tsc_pit_min;
0864 }
0865 
0866 /**
0867  * native_calibrate_cpu_early - can calibrate the cpu early in boot
0868  */
0869 unsigned long native_calibrate_cpu_early(void)
0870 {
0871     unsigned long flags, fast_calibrate = cpu_khz_from_cpuid();
0872 
0873     if (!fast_calibrate)
0874         fast_calibrate = cpu_khz_from_msr();
0875     if (!fast_calibrate) {
0876         local_irq_save(flags);
0877         fast_calibrate = quick_pit_calibrate();
0878         local_irq_restore(flags);
0879     }
0880     return fast_calibrate;
0881 }
0882 
0883 
0884 /**
0885  * native_calibrate_cpu - calibrate the cpu
0886  */
0887 static unsigned long native_calibrate_cpu(void)
0888 {
0889     unsigned long tsc_freq = native_calibrate_cpu_early();
0890 
0891     if (!tsc_freq)
0892         tsc_freq = pit_hpet_ptimer_calibrate_cpu();
0893 
0894     return tsc_freq;
0895 }
0896 
0897 void recalibrate_cpu_khz(void)
0898 {
0899 #ifndef CONFIG_SMP
0900     unsigned long cpu_khz_old = cpu_khz;
0901 
0902     if (!boot_cpu_has(X86_FEATURE_TSC))
0903         return;
0904 
0905     cpu_khz = x86_platform.calibrate_cpu();
0906     tsc_khz = x86_platform.calibrate_tsc();
0907     if (tsc_khz == 0)
0908         tsc_khz = cpu_khz;
0909     else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
0910         cpu_khz = tsc_khz;
0911     cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy,
0912                             cpu_khz_old, cpu_khz);
0913 #endif
0914 }
0915 
0916 EXPORT_SYMBOL(recalibrate_cpu_khz);
0917 
0918 
0919 static unsigned long long cyc2ns_suspend;
0920 
0921 void tsc_save_sched_clock_state(void)
0922 {
0923     if (!sched_clock_stable())
0924         return;
0925 
0926     cyc2ns_suspend = sched_clock();
0927 }
0928 
0929 /*
0930  * Even on processors with invariant TSC, the TSC gets reset in some of the
0931  * ACPI system sleep states. And on some systems the BIOS seems to reinit the
0932  * TSC to an arbitrary value (still sync'd across CPUs) during resume from
0933  * such sleep states. To cope with this, recompute the cyc2ns_offset for each
0934  * CPU so that sched_clock() continues from the point where it was left off
0935  * during suspend.
0936  */
0937 void tsc_restore_sched_clock_state(void)
0938 {
0939     unsigned long long offset;
0940     unsigned long flags;
0941     int cpu;
0942 
0943     if (!sched_clock_stable())
0944         return;
0945 
0946     local_irq_save(flags);
0947 
0948     /*
0949      * We're coming out of suspend, there's no concurrency yet; don't
0950      * bother being nice about the RCU stuff, just write to both
0951      * data fields.
0952      */
0953 
0954     this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
0955     this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);
0956 
0957     offset = cyc2ns_suspend - sched_clock();
0958 
0959     for_each_possible_cpu(cpu) {
0960         per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
0961         per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
0962     }
0963 
0964     local_irq_restore(flags);
0965 }
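     /*
      * Illustrative example (hypothetical numbers): if sched_clock() read
      * 500,000,000,000 ns (500 s) just before suspend and, with the
      * offsets zeroed, the post-resume TSC converts to 1,000,000,000 ns,
      * then offset = 499,000,000,000 ns is written to both data slots of
      * every CPU and the first post-resume sched_clock() again reads
      * roughly 500 s.
      */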
0966 
0967 #ifdef CONFIG_CPU_FREQ
0968 /*
0969  * Frequency scaling support. Adjust the TSC based timer when the CPU frequency
0970  * changes.
0971  *
0972  * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC
0973  * as unstable and give up in those cases.
0974  *
0975  * Should fix up last_tsc too. Currently gettimeofday in the
0976  * first tick after the change will be slightly wrong.
0977  */
0978 
0979 static unsigned int  ref_freq;
0980 static unsigned long loops_per_jiffy_ref;
0981 static unsigned long tsc_khz_ref;
0982 
0983 static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
0984                 void *data)
0985 {
0986     struct cpufreq_freqs *freq = data;
0987 
0988     if (num_online_cpus() > 1) {
0989         mark_tsc_unstable("cpufreq changes on SMP");
0990         return 0;
0991     }
0992 
0993     if (!ref_freq) {
0994         ref_freq = freq->old;
0995         loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
0996         tsc_khz_ref = tsc_khz;
0997     }
0998 
0999     if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
1000         (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
1001         boot_cpu_data.loops_per_jiffy =
1002             cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
1003 
1004         tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
1005         if (!(freq->flags & CPUFREQ_CONST_LOOPS))
1006             mark_tsc_unstable("cpufreq changes");
1007 
1008         set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc());
1009     }
1010 
1011     return 0;
1012 }
1013 
1014 static struct notifier_block time_cpufreq_notifier_block = {
1015     .notifier_call  = time_cpufreq_notifier
1016 };
1017 
1018 static int __init cpufreq_register_tsc_scaling(void)
1019 {
1020     if (!boot_cpu_has(X86_FEATURE_TSC))
1021         return 0;
1022     if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
1023         return 0;
1024     cpufreq_register_notifier(&time_cpufreq_notifier_block,
1025                 CPUFREQ_TRANSITION_NOTIFIER);
1026     return 0;
1027 }
1028 
1029 core_initcall(cpufreq_register_tsc_scaling);
1030 
1031 #endif /* CONFIG_CPU_FREQ */
1032 
1033 #define ART_CPUID_LEAF (0x15)
1034 #define ART_MIN_DENOMINATOR (1)
1035 
1036 
1037 /*
1038  * If ART is present detect the numerator:denominator to convert to TSC
1039  */
1040 static void __init detect_art(void)
1041 {
1042     unsigned int unused[2];
1043 
1044     if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
1045         return;
1046 
1047     /*
1048      * Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required,
1049      * and the TSC counter resets must not occur asynchronously.
1050      */
1051     if (boot_cpu_has(X86_FEATURE_HYPERVISOR) ||
1052         !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
1053         !boot_cpu_has(X86_FEATURE_TSC_ADJUST) ||
1054         tsc_async_resets)
1055         return;
1056 
1057     cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
1058           &art_to_tsc_numerator, unused, unused+1);
1059 
1060     if (art_to_tsc_denominator < ART_MIN_DENOMINATOR)
1061         return;
1062 
1063     rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset);
1064 
1065     /* Make this sticky over multiple CPU init calls */
1066     setup_force_cpu_cap(X86_FEATURE_ART);
1067 }
1068 
1069 
1070 /* clocksource code */
1071 
1072 static void tsc_resume(struct clocksource *cs)
1073 {
1074     tsc_verify_tsc_adjust(true);
1075 }
1076 
1077 /*
1078  * We used to compare the TSC to the cycle_last value in the clocksource
1079  * structure to avoid a nasty time-warp. This can be observed in a
1080  * very small window right after one CPU updated cycle_last under
1081  * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
1082  * is smaller than the cycle_last reference value due to a TSC which
1083  * is slightly behind. This delta is nowhere else observable, but in
1084  * that case it results in a forward time jump in the range of hours
1085  * due to the unsigned delta calculation of the time keeping core
1086  * code, which is necessary to support wrapping clocksources like pm
1087  * timer.
1088  *
1089  * This sanity check is now done in the core timekeeping code, by
1090  * checking the result of read_tsc() - cycle_last for being negative.
1091  * That works because CLOCKSOURCE_MASK(64) does not mask out any bit.
1092  */
1093 static u64 read_tsc(struct clocksource *cs)
1094 {
1095     return (u64)rdtsc_ordered();
1096 }
1097 
1098 static void tsc_cs_mark_unstable(struct clocksource *cs)
1099 {
1100     if (tsc_unstable)
1101         return;
1102 
1103     tsc_unstable = 1;
1104     if (using_native_sched_clock())
1105         clear_sched_clock_stable();
1106     disable_sched_clock_irqtime();
1107     pr_info("Marking TSC unstable due to clocksource watchdog\n");
1108 }
1109 
1110 static void tsc_cs_tick_stable(struct clocksource *cs)
1111 {
1112     if (tsc_unstable)
1113         return;
1114 
1115     if (using_native_sched_clock())
1116         sched_clock_tick_stable();
1117 }
1118 
1119 static int tsc_cs_enable(struct clocksource *cs)
1120 {
1121     vclocks_set_used(VDSO_CLOCKMODE_TSC);
1122     return 0;
1123 }
1124 
1125 /*
1126  * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
1127  */
1128 static struct clocksource clocksource_tsc_early = {
1129     .name           = "tsc-early",
1130     .rating         = 299,
1131     .uncertainty_margin = 32 * NSEC_PER_MSEC,
1132     .read           = read_tsc,
1133     .mask           = CLOCKSOURCE_MASK(64),
1134     .flags          = CLOCK_SOURCE_IS_CONTINUOUS |
1135                   CLOCK_SOURCE_MUST_VERIFY,
1136     .vdso_clock_mode    = VDSO_CLOCKMODE_TSC,
1137     .enable         = tsc_cs_enable,
1138     .resume         = tsc_resume,
1139     .mark_unstable      = tsc_cs_mark_unstable,
1140     .tick_stable        = tsc_cs_tick_stable,
1141     .list           = LIST_HEAD_INIT(clocksource_tsc_early.list),
1142 };
1143 
1144 /*
1145  * Must mark VALID_FOR_HRES early such that when we unregister tsc_early
1146  * this one will immediately take over. We will only register if TSC has
1147  * been found good.
1148  */
1149 static struct clocksource clocksource_tsc = {
1150     .name           = "tsc",
1151     .rating         = 300,
1152     .read           = read_tsc,
1153     .mask           = CLOCKSOURCE_MASK(64),
1154     .flags          = CLOCK_SOURCE_IS_CONTINUOUS |
1155                   CLOCK_SOURCE_VALID_FOR_HRES |
1156                   CLOCK_SOURCE_MUST_VERIFY |
1157                   CLOCK_SOURCE_VERIFY_PERCPU,
1158     .vdso_clock_mode    = VDSO_CLOCKMODE_TSC,
1159     .enable         = tsc_cs_enable,
1160     .resume         = tsc_resume,
1161     .mark_unstable      = tsc_cs_mark_unstable,
1162     .tick_stable        = tsc_cs_tick_stable,
1163     .list           = LIST_HEAD_INIT(clocksource_tsc.list),
1164 };
1165 
1166 void mark_tsc_unstable(char *reason)
1167 {
1168     if (tsc_unstable)
1169         return;
1170 
1171     tsc_unstable = 1;
1172     if (using_native_sched_clock())
1173         clear_sched_clock_stable();
1174     disable_sched_clock_irqtime();
1175     pr_info("Marking TSC unstable due to %s\n", reason);
1176 
1177     clocksource_mark_unstable(&clocksource_tsc_early);
1178     clocksource_mark_unstable(&clocksource_tsc);
1179 }
1180 
1181 EXPORT_SYMBOL_GPL(mark_tsc_unstable);
1182 
1183 static void __init tsc_disable_clocksource_watchdog(void)
1184 {
1185     clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
1186     clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
1187 }
1188 
1189 static void __init check_system_tsc_reliable(void)
1190 {
1191 #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
1192     if (is_geode_lx()) {
1193         /* RTSC counts during suspend */
1194 #define RTSC_SUSP 0x100
1195         unsigned long res_low, res_high;
1196 
1197         rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
1198         /* Geode_LX - the OLPC CPU has a very reliable TSC */
1199         if (res_low & RTSC_SUSP)
1200             tsc_clocksource_reliable = 1;
1201     }
1202 #endif
1203     if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
1204         tsc_clocksource_reliable = 1;
1205 
1206     /*
1207      * Disable the clocksource watchdog when the system has:
1208      *  - TSC running at constant frequency
1209      *  - TSC which does not stop in C-States
1210      *  - the TSC_ADJUST register which allows to detect even minimal
1211      *    modifications
1212      *  - not more than two sockets. As the number of sockets cannot be
1213      *    evaluated at the early boot stage where this has to be
1214      *    invoked, check the number of online memory nodes as a
1215      *    fallback solution which is a reasonable estimate.
1216      */
1217     if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
1218         boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
1219         boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
1220         nr_online_nodes <= 2)
1221         tsc_disable_clocksource_watchdog();
1222 }
1223 
1224 /*
1225  * Make an educated guess if the TSC is trustworthy and synchronized
1226  * over all CPUs.
1227  */
1228 int unsynchronized_tsc(void)
1229 {
1230     if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable)
1231         return 1;
1232 
1233 #ifdef CONFIG_SMP
1234     if (apic_is_clustered_box())
1235         return 1;
1236 #endif
1237 
1238     if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
1239         return 0;
1240 
1241     if (tsc_clocksource_reliable)
1242         return 0;
1243     /*
1244      * Intel systems are normally all synchronized.
1245      * Exceptions must mark TSC as unstable:
1246      */
1247     if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
1248         /* assume multi socket systems are not synchronized: */
1249         if (num_possible_cpus() > 1)
1250             return 1;
1251     }
1252 
1253     return 0;
1254 }
1255 
1256 /*
1257  * Convert ART to TSC given numerator/denominator found in detect_art()
1258  */
1259 struct system_counterval_t convert_art_to_tsc(u64 art)
1260 {
1261     u64 tmp, res, rem;
1262 
1263     rem = do_div(art, art_to_tsc_denominator);
1264 
1265     res = art * art_to_tsc_numerator;
1266     tmp = rem * art_to_tsc_numerator;
1267 
1268     do_div(tmp, art_to_tsc_denominator);
1269     res += tmp + art_to_tsc_offset;
1270 
1271     return (struct system_counterval_t) {.cs = art_related_clocksource,
1272             .cycles = res};
1273 }
1274 EXPORT_SYMBOL(convert_art_to_tsc);
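     /*
      * Illustrative example (hypothetical ratio): with
      * art_to_tsc_numerator = 300, art_to_tsc_denominator = 2 and
      * art = 1,000,000,001, the split above computes
      * 500,000,000 * 300 + (1 * 300) / 2 = 150,000,000,150 (plus the
      * offset) - the same result as (art * 300) / 2, but without risking
      * 64-bit overflow in the intermediate product.
      */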
1275 
1276 /**
1277  * convert_art_ns_to_tsc() - Convert ART in nanoseconds to TSC.
1278  * @art_ns: ART (Always Running Timer) in unit of nanoseconds
1279  *
1280  * PTM requires all timestamps to be in units of nanoseconds. When user
1281  * software requests a cross-timestamp, this function converts system timestamp
1282  * to TSC.
1283  *
1284  * This is valid when CPU feature flag X86_FEATURE_TSC_KNOWN_FREQ is set
1285  * indicating the tsc_khz is derived from CPUID[15H]. Drivers should check
1286  * that this flag is set before conversion to TSC is attempted.
1287  *
1288  * Return:
1289  * struct system_counterval_t - system counter value with the pointer to the
1290  *  corresponding clocksource
1291  *  @cycles:    System counter value
1292  *  @cs:        Clocksource corresponding to system counter value. Used
1293  *          by timekeeping code to verify comparability of two cycle
1294  *          values.
1295  */
1296 
1297 struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns)
1298 {
1299     u64 tmp, res, rem;
1300 
1301     rem = do_div(art_ns, USEC_PER_SEC);
1302 
1303     res = art_ns * tsc_khz;
1304     tmp = rem * tsc_khz;
1305 
1306     do_div(tmp, USEC_PER_SEC);
1307     res += tmp;
1308 
1309     return (struct system_counterval_t) { .cs = art_related_clocksource,
1310                           .cycles = res};
1311 }
1312 EXPORT_SYMBOL(convert_art_ns_to_tsc);
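     /*
      * Illustrative example (hypothetical frequency): tsc_khz is cycles
      * per millisecond, i.e. cycles per 10^6 ns, so with tsc_khz =
      * 2,500,000 an art_ns of 1,000,000,000 (one second) splits into
      * 1000 * 2,500,000 + 0 = 2,500,000,000 TSC cycles.
      */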
1313 
1314 
1315 static void tsc_refine_calibration_work(struct work_struct *work);
1316 static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
1317 /**
1318  * tsc_refine_calibration_work - Further refine tsc freq calibration
1319  * @work: ignored.
1320  *
1321  * This function uses delayed work over a period of a
1322  * second to further refine the TSC freq value. Since this is
1323  * timer based, instead of loop based, we don't block the boot
1324  * process while this longer calibration is done.
1325  *
1326  * If there are any calibration anomalies (too many SMIs, etc),
1327  * or the refined calibration is off by more than 1% of the fast early
1328  * calibration, we throw out the new calibration and use the
1329  * early calibration.
1330  */
1331 static void tsc_refine_calibration_work(struct work_struct *work)
1332 {
1333     static u64 tsc_start = ULLONG_MAX, ref_start;
1334     static int hpet;
1335     u64 tsc_stop, ref_stop, delta;
1336     unsigned long freq;
1337     int cpu;
1338 
1339     /* Don't bother refining TSC on unstable systems */
1340     if (tsc_unstable)
1341         goto unreg;
1342 
1343     /*
1344      * Since the work is started early in boot, we may be
1345      * delayed the first time we expire. So set the workqueue
1346      * again once we know timers are working.
1347      */
1348     if (tsc_start == ULLONG_MAX) {
1349 restart:
1350         /*
1351          * Only set hpet once, to avoid mixing hardware
1352          * if the hpet becomes enabled later.
1353          */
1354         hpet = is_hpet_enabled();
1355         tsc_start = tsc_read_refs(&ref_start, hpet);
1356         schedule_delayed_work(&tsc_irqwork, HZ);
1357         return;
1358     }
1359 
1360     tsc_stop = tsc_read_refs(&ref_stop, hpet);
1361 
1362     /* hpet or pmtimer available ? */
1363     if (ref_start == ref_stop)
1364         goto out;
1365 
1366     /* Check, whether the sampling was disturbed */
1367     if (tsc_stop == ULLONG_MAX)
1368         goto restart;
1369 
1370     delta = tsc_stop - tsc_start;
1371     delta *= 1000000LL;
1372     if (hpet)
1373         freq = calc_hpet_ref(delta, ref_start, ref_stop);
1374     else
1375         freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
1376 
1377     /* Make sure we're within 1% */
1378     if (abs(tsc_khz - freq) > tsc_khz/100)
1379         goto out;
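     /*
      * E.g. (hypothetical): with an early tsc_khz of 2,500,000 the
      * refined freq must land within +/-25,000 kHz (+/-1%, ~25 MHz) of
      * it, otherwise the early calibration is kept as described above.
      */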
1380 
1381     tsc_khz = freq;
1382     pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
1383         (unsigned long)tsc_khz / 1000,
1384         (unsigned long)tsc_khz % 1000);
1385 
1386     /* Inform the TSC deadline clockevent devices about the recalibration */
1387     lapic_update_tsc_freq();
1388 
1389     /* Update the sched_clock() rate to match the clocksource one */
1390     for_each_possible_cpu(cpu)
1391         set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);
1392 
1393 out:
1394     if (tsc_unstable)
1395         goto unreg;
1396 
1397     if (boot_cpu_has(X86_FEATURE_ART))
1398         art_related_clocksource = &clocksource_tsc;
1399     clocksource_register_khz(&clocksource_tsc, tsc_khz);
1400 unreg:
1401     clocksource_unregister(&clocksource_tsc_early);
1402 }
1403 
1404 
1405 static int __init init_tsc_clocksource(void)
1406 {
1407     if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz)
1408         return 0;
1409 
1410     if (tsc_unstable)
1411         goto unreg;
1412 
1413     if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
1414         clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
1415 
1416     /*
1417      * When TSC frequency is known (retrieved via MSR or CPUID), we skip
1418      * the refined calibration and directly register it as a clocksource.
1419      */
1420     if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
1421         if (boot_cpu_has(X86_FEATURE_ART))
1422             art_related_clocksource = &clocksource_tsc;
1423         clocksource_register_khz(&clocksource_tsc, tsc_khz);
1424 unreg:
1425         clocksource_unregister(&clocksource_tsc_early);
1426         return 0;
1427     }
1428 
1429     schedule_delayed_work(&tsc_irqwork, 0);
1430     return 0;
1431 }
1432 /*
1433  * We use device_initcall here, to ensure we run after the hpet
1434  * is fully initialized, which may occur at fs_initcall time.
1435  */
1436 device_initcall(init_tsc_clocksource);
1437 
1438 static bool __init determine_cpu_tsc_frequencies(bool early)
1439 {
1440     /* Make sure that cpu and tsc are not already calibrated */
1441     WARN_ON(cpu_khz || tsc_khz);
1442 
1443     if (early) {
1444         cpu_khz = x86_platform.calibrate_cpu();
1445         if (tsc_early_khz)
1446             tsc_khz = tsc_early_khz;
1447         else
1448             tsc_khz = x86_platform.calibrate_tsc();
1449     } else {
1450         /* We should not be here with non-native cpu calibration */
1451         WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu);
1452         cpu_khz = pit_hpet_ptimer_calibrate_cpu();
1453     }
1454 
1455     /*
1456      * Trust non-zero tsc_khz as authoritative,
1457      * and use it to sanity check cpu_khz,
1458      * which will be off if system timer is off.
1459      */
1460     if (tsc_khz == 0)
1461         tsc_khz = cpu_khz;
1462     else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
1463         cpu_khz = tsc_khz;
1464 
1465     if (tsc_khz == 0)
1466         return false;
1467 
1468     pr_info("Detected %lu.%03lu MHz processor\n",
1469         (unsigned long)cpu_khz / KHZ,
1470         (unsigned long)cpu_khz % KHZ);
1471 
1472     if (cpu_khz != tsc_khz) {
1473         pr_info("Detected %lu.%03lu MHz TSC\n",
1474             (unsigned long)tsc_khz / KHZ,
1475             (unsigned long)tsc_khz % KHZ);
1476     }
1477     return true;
1478 }
1479 
1480 static unsigned long __init get_loops_per_jiffy(void)
1481 {
1482     u64 lpj = (u64)tsc_khz * KHZ;
1483 
1484     do_div(lpj, HZ);
1485     return lpj;
1486 }
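     /*
      * Worked out (hypothetical HZ/tsc_khz): with tsc_khz = 2,500,000 and
      * HZ = 250, lpj = 2,500,000 * 1000 / 250 = 10,000,000 TSC cycles per
      * 4 ms jiffy.
      */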
1487 
1488 static void __init tsc_enable_sched_clock(void)
1489 {
1490     loops_per_jiffy = get_loops_per_jiffy();
1491     use_tsc_delay();
1492 
1493     /* Sanitize TSC ADJUST before cyc2ns gets initialized */
1494     tsc_store_and_check_tsc_adjust(true);
1495     cyc2ns_init_boot_cpu();
1496     static_branch_enable(&__use_tsc);
1497 }
1498 
1499 void __init tsc_early_init(void)
1500 {
1501     if (!boot_cpu_has(X86_FEATURE_TSC))
1502         return;
1503     /* Don't change UV TSC multi-chassis synchronization */
1504     if (is_early_uv_system())
1505         return;
1506     if (!determine_cpu_tsc_frequencies(true))
1507         return;
1508     tsc_enable_sched_clock();
1509 }
1510 
1511 void __init tsc_init(void)
1512 {
1513     /*
1514      * native_calibrate_cpu_early can only calibrate using methods that are
1515      * available early in boot.
1516      */
1517     if (x86_platform.calibrate_cpu == native_calibrate_cpu_early)
1518         x86_platform.calibrate_cpu = native_calibrate_cpu;
1519 
1520     if (!boot_cpu_has(X86_FEATURE_TSC)) {
1521         setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
1522         return;
1523     }
1524 
1525     if (!tsc_khz) {
1526         /* We failed to determine frequencies earlier, try again */
1527         if (!determine_cpu_tsc_frequencies(false)) {
1528             mark_tsc_unstable("could not calculate TSC khz");
1529             setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
1530             return;
1531         }
1532         tsc_enable_sched_clock();
1533     }
1534 
1535     cyc2ns_init_secondary_cpus();
1536 
1537     if (!no_sched_irq_time)
1538         enable_sched_clock_irqtime();
1539 
1540     lpj_fine = get_loops_per_jiffy();
1541 
1542     check_system_tsc_reliable();
1543 
1544     if (unsynchronized_tsc()) {
1545         mark_tsc_unstable("TSCs unsynchronized");
1546         return;
1547     }
1548 
1549     if (tsc_clocksource_reliable || no_tsc_watchdog)
1550         tsc_disable_clocksource_watchdog();
1551 
1552     clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
1553     detect_art();
1554 }
1555 
1556 #ifdef CONFIG_SMP
1557 /*
1558  * If we have a constant TSC and are using the TSC for the delay loop,
1559  * we can skip clock calibration if another cpu in the same socket has already
1560  * been calibrated. This assumes that CONSTANT_TSC applies to all
1561  * cpus in the socket - this should be a safe assumption.
1562  */
1563 unsigned long calibrate_delay_is_known(void)
1564 {
1565     int sibling, cpu = smp_processor_id();
1566     int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC);
1567     const struct cpumask *mask = topology_core_cpumask(cpu);
1568 
1569     if (!constant_tsc || !mask)
1570         return 0;
1571 
1572     sibling = cpumask_any_but(mask, cpu);
1573     if (sibling < nr_cpu_ids)
1574         return cpu_data(sibling).loops_per_jiffy;
1575     return 0;
1576 }
1577 #endif