powerpc/kernel/time.c

0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /*
0003  * Common time routines among all ppc machines.
0004  *
0005  * Written by Cort Dougan (cort@cs.nmt.edu) to merge
0006  * Paul Mackerras' version and mine for PReP and Pmac.
0007  * MPC8xx/MBX changes by Dan Malek (dmalek@jlc.net).
0008  * Converted for 64-bit by Mike Corrigan (mikejc@us.ibm.com)
0009  *
0010  * First round of bugfixes by Gabriel Paubert (paubert@iram.es)
0011  * to make clock more stable (2.4.0-test5). The only thing
0012  * that this code assumes is that the timebases have been synchronized
0013  * by firmware on SMP and are never stopped (never do sleep
0014  * on SMP then, nap and doze are OK).
0015  *
0016  * Speeded up do_gettimeofday by getting rid of references to
0017  * xtime (which required locks for consistency). (mikejc@us.ibm.com)
0018  *
0019  * TODO (not necessarily in this file):
0020  * - improve precision and reproducibility of timebase frequency
0021  * measurement at boot time.
0022  * - for astronomical applications: add a new function to get
0023  * non ambiguous timestamps even around leap seconds. This needs
0024  * a new timestamp format and a good name.
0025  *
0026  * 1997-09-10  Updated NTP code according to technical memorandum Jan '96
0027  *             "A Kernel Model for Precision Timekeeping" by Dave Mills
0028  */
0029
0030 #include <linux/errno.h>
0031 #include <linux/export.h>
0032 #include <linux/sched.h>
0033 #include <linux/sched/clock.h>
0034 #include <linux/sched/cputime.h>
0035 #include <linux/kernel.h>
0036 #include <linux/param.h>
0037 #include <linux/string.h>
0038 #include <linux/mm.h>
0039 #include <linux/interrupt.h>
0040 #include <linux/timex.h>
0041 #include <linux/kernel_stat.h>
0042 #include <linux/time.h>
0043 #include <linux/init.h>
0044 #include <linux/profile.h>
0045 #include <linux/cpu.h>
0046 #include <linux/security.h>
0047 #include <linux/percpu.h>
0048 #include <linux/rtc.h>
0049 #include <linux/jiffies.h>
0050 #include <linux/posix-timers.h>
0051 #include <linux/irq.h>
0052 #include <linux/delay.h>
0053 #include <linux/irq_work.h>
0054 #include <linux/of_clk.h>
0055 #include <linux/suspend.h>
0056 #include <linux/processor.h>
0057 #include <linux/mc146818rtc.h>
0058 #include <linux/platform_device.h>
0059
0060 #include <asm/trace.h>
0061 #include <asm/interrupt.h>
0062 #include <asm/io.h>
0063 #include <asm/nvram.h>
0064 #include <asm/cache.h>
0065 #include <asm/machdep.h>
0066 #include <linux/uaccess.h>
0067 #include <asm/time.h>
0068 #include <asm/irq.h>
0069 #include <asm/div64.h>
0070 #include <asm/smp.h>
0071 #include <asm/vdso_datapage.h>
0072 #include <asm/firmware.h>
0073 #include <asm/mce.h>
0074
0075 /* powerpc clocksource/clockevent code */
0076
0077 #include <linux/clockchips.h>
0078 #include <linux/timekeeper_internal.h>
0079
0080 static u64 timebase_read(struct clocksource *);
0081 static struct clocksource clocksource_timebase = {
0082     .name         = "timebase",
0083     .rating       = 400,
0084     .flags        = CLOCK_SOURCE_IS_CONTINUOUS,
0085     .mask         = CLOCKSOURCE_MASK(64),
0086     .read         = timebase_read,
0087     .vdso_clock_mode    = VDSO_CLOCKMODE_ARCHTIMER,
0088 };
0089
0090 #define DECREMENTER_DEFAULT_MAX 0x7FFFFFFF
0091 u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
0092 EXPORT_SYMBOL_GPL(decrementer_max); /* for KVM HDEC */
0093
0094 static int decrementer_set_next_event(unsigned long evt,
0095                       struct clock_event_device *dev);
0096 static int decrementer_shutdown(struct clock_event_device *evt);
0097
0098 struct clock_event_device decrementer_clockevent = {
0099     .name           = "decrementer",
0100     .rating         = 200,
0101     .irq            = 0,
0102     .set_next_event     = decrementer_set_next_event,
0103     .set_state_oneshot_stopped = decrementer_shutdown,
0104     .set_state_shutdown = decrementer_shutdown,
0105     .tick_resume        = decrementer_shutdown,
0106     .features       = CLOCK_EVT_FEAT_ONESHOT |
0107                   CLOCK_EVT_FEAT_C3STOP,
0108 };
0109 EXPORT_SYMBOL(decrementer_clockevent);
0110
0111 /*
0112  * This always puts next_tb beyond now, so the clock event will never fire
0113  * with the usual comparison, no need for a separate test for stopped.
0114  */
0115 #define DEC_CLOCKEVENT_STOPPED ~0ULL
0116 DEFINE_PER_CPU(u64, decrementers_next_tb) = DEC_CLOCKEVENT_STOPPED;
0117 EXPORT_SYMBOL_GPL(decrementers_next_tb);
0118 static DEFINE_PER_CPU(struct clock_event_device, decrementers);
0119
/* Number of "xsec" units in one second (2^20). */
#define XSEC_PER_SEC (1024*1024)

/*
 * SCALE_XSEC(xsec, max): scale max by xsec / XSEC_PER_SEC.
 * Both arguments are parenthesized so that compound expressions passed
 * as either argument expand with the intended precedence.
 */
#ifdef CONFIG_PPC64
#define SCALE_XSEC(xsec, max)   (((xsec) * (max)) / XSEC_PER_SEC)
#else
/* compute ((xsec << 12) * max) >> 32 */
#define SCALE_XSEC(xsec, max)   mulhwu((xsec) << 12, (max))
#endif
0128
0129 unsigned long tb_ticks_per_jiffy;
0130 unsigned long tb_ticks_per_usec = 100; /* sane default */
0131 EXPORT_SYMBOL(tb_ticks_per_usec);
0132 unsigned long tb_ticks_per_sec;
0133 EXPORT_SYMBOL(tb_ticks_per_sec);    /* for cputime_t conversions */
0134
0135 DEFINE_SPINLOCK(rtc_lock);
0136 EXPORT_SYMBOL_GPL(rtc_lock);
0137
0138 static u64 tb_to_ns_scale __read_mostly;
0139 static unsigned tb_to_ns_shift __read_mostly;
0140 static u64 boot_tb __read_mostly;
0141
0142 extern struct timezone sys_tz;
0143 static long timezone_offset;
0144
0145 unsigned long ppc_proc_freq;
0146 EXPORT_SYMBOL_GPL(ppc_proc_freq);
0147 unsigned long ppc_tb_freq;
0148 EXPORT_SYMBOL_GPL(ppc_tb_freq);
0149
0150 bool tb_invalid;
0151
0152 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
0153 /*
0154  * Factor for converting from cputime_t (timebase ticks) to
0155  * microseconds. This is stored as 0.64 fixed-point binary fraction.
0156  */
0157 u64 __cputime_usec_factor;
0158 EXPORT_SYMBOL(__cputime_usec_factor);
0159
0160 static void calc_cputime_factors(void)
0161 {
0162     struct div_result res;
0163
0164     div128_by_32(1000000, 0, tb_ticks_per_sec, &res);
0165     __cputime_usec_factor = res.result_low;
0166 }
0167
0168 /*
0169  * Read the SPURR on systems that have it, otherwise the PURR,
0170  * or if that doesn't exist return the timebase value passed in.
0171  */
0172 static inline unsigned long read_spurr(unsigned long tb)
0173 {
0174     if (cpu_has_feature(CPU_FTR_SPURR))
0175         return mfspr(SPRN_SPURR);
0176     if (cpu_has_feature(CPU_FTR_PURR))
0177         return mfspr(SPRN_PURR);
0178     return tb;
0179 }
0180
0181 #ifdef CONFIG_PPC_SPLPAR
0182
0183 #include <asm/dtl.h>
0184
0185 void (*dtl_consumer)(struct dtl_entry *, u64);
0186
0187 /*
0188  * Scan the dispatch trace log and count up the stolen time.
0189  * Should be called with interrupts disabled.
0190  */
0191 static u64 scan_dispatch_log(u64 stop_tb)
0192 {
0193     u64 i = local_paca->dtl_ridx;
0194     struct dtl_entry *dtl = local_paca->dtl_curr;
0195     struct dtl_entry *dtl_end = local_paca->dispatch_log_end;
0196     struct lppaca *vpa = local_paca->lppaca_ptr;
0197     u64 tb_delta;
0198     u64 stolen = 0;
0199     u64 dtb;
0200
0201     if (!dtl)
0202         return 0;
0203
0204     if (i == be64_to_cpu(vpa->dtl_idx))
0205         return 0;
0206     while (i < be64_to_cpu(vpa->dtl_idx)) {
0207         dtb = be64_to_cpu(dtl->timebase);
0208         tb_delta = be32_to_cpu(dtl->enqueue_to_dispatch_time) +
0209             be32_to_cpu(dtl->ready_to_enqueue_time);
0210         barrier();
0211         if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) {
0212             /* buffer has overflowed */
0213             i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG;
0214             dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
0215             continue;
0216         }
0217         if (dtb > stop_tb)
0218             break;
0219         if (dtl_consumer)
0220             dtl_consumer(dtl, i);
0221         stolen += tb_delta;
0222         ++i;
0223         ++dtl;
0224         if (dtl == dtl_end)
0225             dtl = local_paca->dispatch_log;
0226     }
0227     local_paca->dtl_ridx = i;
0228     local_paca->dtl_curr = dtl;
0229     return stolen;
0230 }
0231
0232 /*
0233  * Accumulate stolen time by scanning the dispatch trace log.
0234  * Called on entry from user mode.
0235  */
0236 void notrace accumulate_stolen_time(void)
0237 {
0238     u64 sst, ust;
0239     struct cpu_accounting_data *acct = &local_paca->accounting;
0240
0241     sst = scan_dispatch_log(acct->starttime_user);
0242     ust = scan_dispatch_log(acct->starttime);
0243     acct->stime -= sst;
0244     acct->utime -= ust;
0245     acct->steal_time += ust + sst;
0246 }
0247
0248 static inline u64 calculate_stolen_time(u64 stop_tb)
0249 {
0250     if (!firmware_has_feature(FW_FEATURE_SPLPAR))
0251         return 0;
0252
0253     if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx))
0254         return scan_dispatch_log(stop_tb);
0255
0256     return 0;
0257 }
0258
0259 #else /* CONFIG_PPC_SPLPAR */
0260 static inline u64 calculate_stolen_time(u64 stop_tb)
0261 {
0262     return 0;
0263 }
0264
0265 #endif /* CONFIG_PPC_SPLPAR */
0266
/*
 * Compute the scaled system time for the interval that just ended.
 *
 * @acct:  per-CPU accounting state (SPURR snapshot and user time)
 * @now:   current timebase value, handed to read_spurr() as its fallback
 * @stime: unscaled system time for the interval
 *
 * Returns the scaled system time, or 0 when
 * CONFIG_ARCH_HAS_SCALED_CPUTIME is not set. The scaled user share is
 * added to acct->utime_scaled.
 */
static unsigned long vtime_delta_scaled(struct cpu_accounting_data *acct,
                    unsigned long now, unsigned long stime)
{
    unsigned long stime_scaled = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
    unsigned long spurr_now = read_spurr(now);
    unsigned long spurr_delta = spurr_now - acct->startspurr;
    /* User time accumulated since the previous SPURR snapshot. */
    unsigned long utime = acct->utime - acct->utime_sspurr;
    unsigned long utime_scaled = utime;

    acct->startspurr = spurr_now;
    acct->utime_sspurr = acct->utime;

    /*
     * The SPURR is not sampled on every kernel entry/exit, so
     * spurr_delta covers both user and system ticks. Split it
     * between the two in the same ratio as the timebase-derived
     * system (stime) and user (utime) times for the interval.
     * The system share is returned; the user share is saved up in
     * acct->utime_scaled.
     */
    stime_scaled = stime;
    if (spurr_delta != stime + utime) {
        /* With no user time the whole delta is system time. */
        stime_scaled = spurr_delta;
        if (utime) {
            stime_scaled = spurr_delta * stime / (stime + utime);
            utime_scaled = spurr_delta - stime_scaled;
        }
    }
    acct->utime_scaled += utime_scaled;
#endif

    return stime_scaled;
}
0310
0311 static unsigned long vtime_delta(struct cpu_accounting_data *acct,
0312                  unsigned long *stime_scaled,
0313                  unsigned long *steal_time)
0314 {
0315     unsigned long now, stime;
0316
0317     WARN_ON_ONCE(!irqs_disabled());
0318
0319     now = mftb();
0320     stime = now - acct->starttime;
0321     acct->starttime = now;
0322
0323     *stime_scaled = vtime_delta_scaled(acct, now, stime);
0324
0325     *steal_time = calculate_stolen_time(now);
0326
0327     return stime;
0328 }
0329
0330 static void vtime_delta_kernel(struct cpu_accounting_data *acct,
0331                    unsigned long *stime, unsigned long *stime_scaled)
0332 {
0333     unsigned long steal_time;
0334
0335     *stime = vtime_delta(acct, stime_scaled, &steal_time);
0336     *stime -= min(*stime, steal_time);
0337     acct->steal_time += steal_time;
0338 }
0339
0340 void vtime_account_kernel(struct task_struct *tsk)
0341 {
0342     struct cpu_accounting_data *acct = get_accounting(tsk);
0343     unsigned long stime, stime_scaled;
0344
0345     vtime_delta_kernel(acct, &stime, &stime_scaled);
0346
0347     if (tsk->flags & PF_VCPU) {
0348         acct->gtime += stime;
0349 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
0350         acct->utime_scaled += stime_scaled;
0351 #endif
0352     } else {
0353         acct->stime += stime;
0354 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
0355         acct->stime_scaled += stime_scaled;
0356 #endif
0357     }
0358 }
0359 EXPORT_SYMBOL_GPL(vtime_account_kernel);
0360
0361 void vtime_account_idle(struct task_struct *tsk)
0362 {
0363     unsigned long stime, stime_scaled, steal_time;
0364     struct cpu_accounting_data *acct = get_accounting(tsk);
0365
0366     stime = vtime_delta(acct, &stime_scaled, &steal_time);
0367     acct->idle_time += stime + steal_time;
0368 }
0369
/*
 * Add the kernel time elapsed since the last accounting point to *@field
 * and, when scaled cputime is configured, its scaled counterpart to
 * acct->stime_scaled.
 */
static void vtime_account_irq_field(struct cpu_accounting_data *acct,
                    unsigned long *field)
{
    unsigned long delta, delta_scaled;

    vtime_delta_kernel(acct, &delta, &delta_scaled);

    *field += delta;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
    acct->stime_scaled += delta_scaled;
#endif
}
0381
0382 void vtime_account_softirq(struct task_struct *tsk)
0383 {
0384     struct cpu_accounting_data *acct = get_accounting(tsk);
0385     vtime_account_irq_field(acct, &acct->softirq_time);
0386 }
0387
0388 void vtime_account_hardirq(struct task_struct *tsk)
0389 {
0390     struct cpu_accounting_data *acct = get_accounting(tsk);
0391     vtime_account_irq_field(acct, &acct->hardirq_time);
0392 }
0393
/*
 * Move the accumulated scaled user and system time into @tsk (converted
 * to nanoseconds) and reset the scaled accumulators. Does nothing when
 * CONFIG_ARCH_HAS_SCALED_CPUTIME is not set.
 */
static void vtime_flush_scaled(struct task_struct *tsk,
                   struct cpu_accounting_data *acct)
{
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
    if (acct->utime_scaled) {
        tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled);
        acct->utime_scaled = 0;
    }

    if (acct->stime_scaled) {
        tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled);
        acct->stime_scaled = 0;
    }

    acct->utime_sspurr = 0;
#endif
}
0408
0409 /*
0410  * Account the whole cputime accumulated in the paca
0411  * Must be called with interrupts disabled.
0412  * Assumes that vtime_account_kernel/idle() has been called
0413  * recently (i.e. since the last entry from usermode) so that
0414  * get_paca()->user_time_scaled is up to date.
0415  */
0416 void vtime_flush(struct task_struct *tsk)
0417 {
0418     struct cpu_accounting_data *acct = get_accounting(tsk);
0419
0420     if (acct->utime)
0421         account_user_time(tsk, cputime_to_nsecs(acct->utime));
0422
0423     if (acct->gtime)
0424         account_guest_time(tsk, cputime_to_nsecs(acct->gtime));
0425
0426     if (IS_ENABLED(CONFIG_PPC_SPLPAR) && acct->steal_time) {
0427         account_steal_time(cputime_to_nsecs(acct->steal_time));
0428         acct->steal_time = 0;
0429     }
0430
0431     if (acct->idle_time)
0432         account_idle_time(cputime_to_nsecs(acct->idle_time));
0433
0434     if (acct->stime)
0435         account_system_index_time(tsk, cputime_to_nsecs(acct->stime),
0436                       CPUTIME_SYSTEM);
0437
0438     if (acct->hardirq_time)
0439         account_system_index_time(tsk, cputime_to_nsecs(acct->hardirq_time),
0440                       CPUTIME_IRQ);
0441     if (acct->softirq_time)
0442         account_system_index_time(tsk, cputime_to_nsecs(acct->softirq_time),
0443                       CPUTIME_SOFTIRQ);
0444
0445     vtime_flush_scaled(tsk, acct);
0446
0447     acct->utime = 0;
0448     acct->gtime = 0;
0449     acct->idle_time = 0;
0450     acct->stime = 0;
0451     acct->hardirq_time = 0;
0452     acct->softirq_time = 0;
0453 }
0454
0455 #else /* ! CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
/*
 * No-op stub when CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is not set. Expands to
 * a real (empty) statement so a call still requires its semicolon and
 * cannot leave an empty body behind an if/else.
 */
#define calc_cputime_factors() do { } while (0)
0457 #endif
0458
0459 void __delay(unsigned long loops)
0460 {
0461     unsigned long start;
0462
0463     spin_begin();
0464     if (tb_invalid) {
0465         /*
0466          * TB is in error state and isn't ticking anymore.
0467          * HMI handler was unable to recover from TB error.
0468          * Return immediately, so that kernel won't get stuck here.
0469          */
0470         spin_cpu_relax();
0471     } else {
0472         start = mftb();
0473         while (mftb() - start < loops)
0474             spin_cpu_relax();
0475     }
0476     spin_end();
0477 }
0478 EXPORT_SYMBOL(__delay);
0479
0480 void udelay(unsigned long usecs)
0481 {
0482     __delay(tb_ticks_per_usec * usecs);
0483 }
0484 EXPORT_SYMBOL(udelay);
0485
0486 #ifdef CONFIG_SMP
0487 unsigned long profile_pc(struct pt_regs *regs)
0488 {
0489     unsigned long pc = instruction_pointer(regs);
0490
0491     if (in_lock_functions(pc))
0492         return regs->link;
0493
0494     return pc;
0495 }
0496 EXPORT_SYMBOL(profile_pc);
0497 #endif
0498
0499 #ifdef CONFIG_IRQ_WORK
0500
0501 /*
0502  * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
0503  */
0504 #ifdef CONFIG_PPC64
0505 static inline unsigned long test_irq_work_pending(void)
0506 {
0507     unsigned long x;
0508
0509     asm volatile("lbz %0,%1(13)"
0510         : "=r" (x)
0511         : "i" (offsetof(struct paca_struct, irq_work_pending)));
0512     return x;
0513 }
0514
0515 static inline void set_irq_work_pending_flag(void)
0516 {
0517     asm volatile("stb %0,%1(13)" : :
0518         "r" (1),
0519         "i" (offsetof(struct paca_struct, irq_work_pending)));
0520 }
0521
0522 static inline void clear_irq_work_pending(void)
0523 {
0524     asm volatile("stb %0,%1(13)" : :
0525         "r" (0),
0526         "i" (offsetof(struct paca_struct, irq_work_pending)));
0527 }
0528
0529 #else /* 32-bit */
0530
/* 32-bit: the pending flag is an ordinary per-CPU byte. */
DEFINE_PER_CPU(u8, irq_work_pending);

#define test_irq_work_pending()     __this_cpu_read(irq_work_pending)
#define set_irq_work_pending_flag() __this_cpu_write(irq_work_pending, 1)
#define clear_irq_work_pending()    __this_cpu_write(irq_work_pending, 0)
0536
0537 #endif /* 32 vs 64 bit */
0538
/*
 * Request that pending irq work be run: set the pending flag and program
 * the decrementer with 1, with preemption disabled around both steps.
 */
void arch_irq_work_raise(void)
{
    /*
     * On 64-bit, which uses the irq soft-mask, this can raise an
     * immediate interrupt that is then soft masked when we are called
     * under local_irq_disable(). That could perhaps be avoided by
     * noticing that interrupts are disabled and marking the decrementer
     * pending for replay once irqs are enabled again. The catch is that
     * tracing can call irq_work_raise, including from code doing low
     * level manipulation of the irq soft-mask state (e.g.,
     * trace_hardirqs_on), and touching the same state here could get
     * tangled up with it.
     */
    preempt_disable();

    set_irq_work_pending_flag();
    set_dec(1);

    preempt_enable();
}
0557
0558 static void set_dec_or_work(u64 val)
0559 {
0560     set_dec(val);
0561     /* We may have raced with new irq work */
0562     if (unlikely(test_irq_work_pending()))
0563         set_dec(1);
0564 }
0565
0566 #else  /* CONFIG_IRQ_WORK */
0567
0568 #define test_irq_work_pending() 0
0569 #define clear_irq_work_pending()
0570
0571 static void set_dec_or_work(u64 val)
0572 {
0573     set_dec(val);
0574 }
0575 #endif /* CONFIG_IRQ_WORK */
0576
0577 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
0578 void timer_rearm_host_dec(u64 now)
0579 {
0580     u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
0581
0582     WARN_ON_ONCE(!arch_irqs_disabled());
0583     WARN_ON_ONCE(mfmsr() & MSR_EE);
0584
0585     if (now >= *next_tb) {
0586         local_paca->irq_happened |= PACA_IRQ_DEC;
0587     } else {
0588         now = *next_tb - now;
0589         if (now > decrementer_max)
0590             now = decrementer_max;
0591         set_dec_or_work(now);
0592     }
0593 }
0594 EXPORT_SYMBOL_GPL(timer_rearm_host_dec);
0595 #endif
0596
0597 /*
0598  * timer_interrupt - gets called when the decrementer overflows,
0599  * with interrupts disabled.
0600  */
0601 DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
0602 {
0603     struct clock_event_device *evt = this_cpu_ptr(&decrementers);
0604     u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
0605     struct pt_regs *old_regs;
0606     u64 now;
0607
0608     /*
0609      * Some implementations of hotplug will get timer interrupts while
0610      * offline, just ignore these.
0611      */
0612     if (unlikely(!cpu_online(smp_processor_id()))) {
0613         set_dec(decrementer_max);
0614         return;
0615     }
0616
0617     /*
0618      * Ensure a positive value is written to the decrementer, or
0619      * else some CPUs will continue to take decrementer exceptions.
0620      * When the PPC_WATCHDOG (decrementer based) is configured,
0621      * keep this at most 31 bits, which is about 4 seconds on most
0622      * systems, which gives the watchdog a chance of catching timer
0623      * interrupt hard lockups.
0624      */
0625     if (IS_ENABLED(CONFIG_PPC_WATCHDOG))
0626         set_dec(0x7fffffff);
0627     else
0628         set_dec(decrementer_max);
0629
0630     /* Conditionally hard-enable interrupts. */
0631     if (should_hard_irq_enable())
0632         do_hard_irq_enable();
0633
0634 #if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC)
0635     if (atomic_read(&ppc_n_lost_interrupts) != 0)
0636         __do_IRQ(regs);
0637 #endif
0638
0639     old_regs = set_irq_regs(regs);
0640
0641     trace_timer_interrupt_entry(regs);
0642
0643     if (test_irq_work_pending()) {
0644         clear_irq_work_pending();
0645         mce_run_irq_context_handlers();
0646         irq_work_run();
0647     }
0648
0649     now = get_tb();
0650     if (now >= *next_tb) {
0651         evt->event_handler(evt);
0652         __this_cpu_inc(irq_stat.timer_irqs_event);
0653     } else {
0654         now = *next_tb - now;
0655         if (now > decrementer_max)
0656             now = decrementer_max;
0657         set_dec_or_work(now);
0658         __this_cpu_inc(irq_stat.timer_irqs_others);
0659     }
0660
0661     trace_timer_interrupt_exit(regs);
0662
0663     set_irq_regs(old_regs);
0664 }
0665 EXPORT_SYMBOL(timer_interrupt);
0666
0667 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
0668 void timer_broadcast_interrupt(void)
0669 {
0670     tick_receive_broadcast();
0671     __this_cpu_inc(irq_stat.broadcast_irqs_event);
0672 }
0673 #endif
0674
0675 #ifdef CONFIG_SUSPEND
0676 /* Overrides the weak version in kernel/power/main.c */
0677 void arch_suspend_disable_irqs(void)
0678 {
0679     if (ppc_md.suspend_disable_irqs)
0680         ppc_md.suspend_disable_irqs();
0681
0682     /* Disable the decrementer, so that it doesn't interfere
0683      * with suspending.
0684      */
0685
0686     set_dec(decrementer_max);
0687     local_irq_disable();
0688     set_dec(decrementer_max);
0689 }
0690
0691 /* Overrides the weak version in kernel/power/main.c */
0692 void arch_suspend_enable_irqs(void)
0693 {
0694     local_irq_enable();
0695
0696     if (ppc_md.suspend_enable_irqs)
0697         ppc_md.suspend_enable_irqs();
0698 }
0699 #endif
0700
0701 unsigned long long tb_to_ns(unsigned long long ticks)
0702 {
0703     return mulhdu(ticks, tb_to_ns_scale) << tb_to_ns_shift;
0704 }
0705 EXPORT_SYMBOL_GPL(tb_to_ns);
0706
0707 /*
0708  * Scheduler clock - returns current time in nanosec units.
0709  *
0710  * Note: mulhdu(a, b) (multiply high double unsigned) returns
0711  * the high 64 bits of a * b, i.e. (a * b) >> 64, where a and b
0712  * are 64-bit unsigned numbers.
0713  */
0714 notrace unsigned long long sched_clock(void)
0715 {
0716     return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
0717 }
0718
0719
0720 #ifdef CONFIG_PPC_PSERIES
0721
0722 /*
0723  * Running clock - attempts to give a view of time passing for a virtualised
0724  * kernels.
0725  * Uses the VTB register if available otherwise a next best guess.
0726  */
0727 unsigned long long running_clock(void)
0728 {
0729     /*
0730      * Don't read the VTB as a host since KVM does not switch in host
0731      * timebase into the VTB when it takes a guest off the CPU, reading the
0732      * VTB would result in reading 'last switched out' guest VTB.
0733      *
0734      * Host kernels are often compiled with CONFIG_PPC_PSERIES checked, it
0735      * would be unsafe to rely only on the #ifdef above.
0736      */
0737     if (firmware_has_feature(FW_FEATURE_LPAR) &&
0738         cpu_has_feature(CPU_FTR_ARCH_207S))
0739         return mulhdu(get_vtb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
0740
0741     /*
0742      * This is a next best approximation without a VTB.
0743      * On a host which is running bare metal there should never be any stolen
0744      * time and on a host which doesn't do any virtualisation TB *should* equal
0745      * VTB so it makes no difference anyway.
0746      */
0747     return local_clock() - kcpustat_this_cpu->cpustat[CPUTIME_STEAL];
0748 }
0749 #endif
0750
0751 static int __init get_freq(char *name, int cells, unsigned long *val)
0752 {
0753     struct device_node *cpu;
0754     const __be32 *fp;
0755     int found = 0;
0756
0757     /* The cpu node should have timebase and clock frequency properties */
0758     cpu = of_find_node_by_type(NULL, "cpu");
0759
0760     if (cpu) {
0761         fp = of_get_property(cpu, name, NULL);
0762         if (fp) {
0763             found = 1;
0764             *val = of_read_ulong(fp, cells);
0765         }
0766
0767         of_node_put(cpu);
0768     }
0769
0770     return found;
0771 }
0772
/*
 * Enable the decrementer interrupt on BookE/40x CPUs; does nothing on
 * other configurations.
 */
static void start_cpu_decrementer(void)
{
#ifdef CONFIG_BOOKE_OR_40x
    unsigned int tcr;

    /* Clear any pending timer interrupts */
    mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS);

    /*
     * The watchdog may have already been enabled by u-boot, so leave
     * TCR[WP] (Watchdog Period) alone: every other bit is cleared and
     * only the decrementer interrupt enable is set.
     */
    tcr = mfspr(SPRN_TCR);
    tcr &= TCR_WP_MASK;
    tcr |= TCR_DIE;
    mtspr(SPRN_TCR, tcr);
#endif
}
0791
0792 void __init generic_calibrate_decr(void)
0793 {
0794     ppc_tb_freq = DEFAULT_TB_FREQ;      /* hardcoded default */
0795
0796     if (!get_freq("ibm,extended-timebase-frequency", 2, &ppc_tb_freq) &&
0797         !get_freq("timebase-frequency", 1, &ppc_tb_freq)) {
0798
0799         printk(KERN_ERR "WARNING: Estimating decrementer frequency "
0800                 "(not found)\n");
0801     }
0802
0803     ppc_proc_freq = DEFAULT_PROC_FREQ;  /* hardcoded default */
0804
0805     if (!get_freq("ibm,extended-clock-frequency", 2, &ppc_proc_freq) &&
0806         !get_freq("clock-frequency", 1, &ppc_proc_freq)) {
0807
0808         printk(KERN_ERR "WARNING: Estimating processor frequency "
0809                 "(not found)\n");
0810     }
0811 }
0812
0813 int update_persistent_clock64(struct timespec64 now)
0814 {
0815     struct rtc_time tm;
0816
0817     if (!ppc_md.set_rtc_time)
0818         return -ENODEV;
0819
0820     rtc_time64_to_tm(now.tv_sec + 1 + timezone_offset, &tm);
0821
0822     return ppc_md.set_rtc_time(&tm);
0823 }
0824
0825 static void __read_persistent_clock(struct timespec64 *ts)
0826 {
0827     struct rtc_time tm;
0828     static int first = 1;
0829
0830     ts->tv_nsec = 0;
0831     /* XXX this is a little fragile but will work okay in the short term */
0832     if (first) {
0833         first = 0;
0834         if (ppc_md.time_init)
0835             timezone_offset = ppc_md.time_init();
0836
0837         /* get_boot_time() isn't guaranteed to be safe to call late */
0838         if (ppc_md.get_boot_time) {
0839             ts->tv_sec = ppc_md.get_boot_time() - timezone_offset;
0840             return;
0841         }
0842     }
0843     if (!ppc_md.get_rtc_time) {
0844         ts->tv_sec = 0;
0845         return;
0846     }
0847     ppc_md.get_rtc_time(&tm);
0848
0849     ts->tv_sec = rtc_tm_to_time64(&tm);
0850 }
0851
0852 void read_persistent_clock64(struct timespec64 *ts)
0853 {
0854     __read_persistent_clock(ts);
0855
0856     /* Sanitize it in case real time clock is set below EPOCH */
0857     if (ts->tv_sec < 0) {
0858         ts->tv_sec = 0;
0859         ts->tv_nsec = 0;
0860     }
0861
0862 }
0863
0864 /* clocksource code */
0865 static notrace u64 timebase_read(struct clocksource *cs)
0866 {
0867     return (u64)get_tb();
0868 }
0869
0870 static void __init clocksource_init(void)
0871 {
0872     struct clocksource *clock = &clocksource_timebase;
0873
0874     if (clocksource_register_hz(clock, tb_ticks_per_sec)) {
0875         printk(KERN_ERR "clocksource: %s is already registered\n",
0876                clock->name);
0877         return;
0878     }
0879
0880     printk(KERN_INFO "clocksource: %s mult[%x] shift[%d] registered\n",
0881            clock->name, clock->mult, clock->shift);
0882 }
0883
0884 static int decrementer_set_next_event(unsigned long evt,
0885                       struct clock_event_device *dev)
0886 {
0887     __this_cpu_write(decrementers_next_tb, get_tb() + evt);
0888     set_dec_or_work(evt);
0889
0890     return 0;
0891 }
0892
0893 static int decrementer_shutdown(struct clock_event_device *dev)
0894 {
0895     __this_cpu_write(decrementers_next_tb, DEC_CLOCKEVENT_STOPPED);
0896     set_dec_or_work(decrementer_max);
0897
0898     return 0;
0899 }
0900
0901 static void register_decrementer_clockevent(int cpu)
0902 {
0903     struct clock_event_device *dec = &per_cpu(decrementers, cpu);
0904
0905     *dec = decrementer_clockevent;
0906     dec->cpumask = cpumask_of(cpu);
0907
0908     clockevents_config_and_register(dec, ppc_tb_freq, 2, decrementer_max);
0909
0910     printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n",
0911             dec->name, dec->mult, dec->shift, cpu);
0912
0913     /* Set values for KVM, see kvm_emulate_dec() */
0914     decrementer_clockevent.mult = dec->mult;
0915     decrementer_clockevent.shift = dec->shift;
0916 }
0917
0918 static void enable_large_decrementer(void)
0919 {
0920     if (!cpu_has_feature(CPU_FTR_ARCH_300))
0921         return;
0922
0923     if (decrementer_max <= DECREMENTER_DEFAULT_MAX)
0924         return;
0925
0926     /*
0927      * If we're running as the hypervisor we need to enable the LD manually
0928      * otherwise firmware should have done it for us.
0929      */
0930     if (cpu_has_feature(CPU_FTR_HVMODE))
0931         mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_LD);
0932 }
0933
0934 static void __init set_decrementer_max(void)
0935 {
0936     struct device_node *cpu;
0937     u32 bits = 32;
0938
0939     /* Prior to ISAv3 the decrementer is always 32 bit */
0940     if (!cpu_has_feature(CPU_FTR_ARCH_300))
0941         return;
0942
0943     cpu = of_find_node_by_type(NULL, "cpu");
0944
0945     if (of_property_read_u32(cpu, "ibm,dec-bits", &bits) == 0) {
0946         if (bits > 64 || bits < 32) {
0947             pr_warn("time_init: firmware supplied invalid ibm,dec-bits");
0948             bits = 32;
0949         }
0950
0951         /* calculate the signed maximum given this many bits */
0952         decrementer_max = (1ul << (bits - 1)) - 1;
0953     }
0954
0955     of_node_put(cpu);
0956
0957     pr_info("time_init: %u bit decrementer (max: %llx)\n",
0958         bits, decrementer_max);
0959 }
0960
0961 static void __init init_decrementer_clockevent(void)
0962 {
0963     register_decrementer_clockevent(smp_processor_id());
0964 }
0965
/*
 * Per-CPU timer setup: enable the large decrementer if applicable, start
 * the decrementer and register this CPU's clock event device.
 */
void secondary_cpu_time_init(void)
{
    int cpu;

    /* Enable and test the large decrementer for this cpu */
    enable_large_decrementer();

    /*
     * Start the decrementer on CPUs that have manual control
     * such as BookE
     */
    start_cpu_decrementer();

    /*
     * FIXME: Should make unrelated change to move snapshot_timebase
     * call here !
     */
    cpu = smp_processor_id();
    register_decrementer_clockevent(cpu);
}
0980
0981 /* This function is only called on the boot processor */
0982 void __init time_init(void)
0983 {
0984     struct div_result res;
0985     u64 scale;
0986     unsigned shift;
0987
0988     /* Normal PowerPC with timebase register */
0989     ppc_md.calibrate_decr();
0990     printk(KERN_DEBUG "time_init: decrementer frequency = %lu.%.6lu MHz\n",
0991            ppc_tb_freq / 1000000, ppc_tb_freq % 1000000);
0992     printk(KERN_DEBUG "time_init: processor frequency   = %lu.%.6lu MHz\n",
0993            ppc_proc_freq / 1000000, ppc_proc_freq % 1000000);
0994
0995     tb_ticks_per_jiffy = ppc_tb_freq / HZ;
0996     tb_ticks_per_sec = ppc_tb_freq;
0997     tb_ticks_per_usec = ppc_tb_freq / 1000000;
0998     calc_cputime_factors();
0999
1000     /*
1001      * Compute scale factor for sched_clock.
1002      * The calibrate_decr() function has set tb_ticks_per_sec,
1003      * which is the timebase frequency.
1004      * We compute 1e9 * 2^64 / tb_ticks_per_sec and interpret
1005      * the 128-bit result as a 64.64 fixed-point number.
1006      * We then shift that number right until it is less than 1.0,
1007      * giving us the scale factor and shift count to use in
1008      * sched_clock().
1009      */
1010     div128_by_32(1000000000, 0, tb_ticks_per_sec, &res);
1011     scale = res.result_low;
1012     for (shift = 0; res.result_high != 0; ++shift) {
1013         scale = (scale >> 1) | (res.result_high << 63);
1014         res.result_high >>= 1;
1015     }
1016     tb_to_ns_scale = scale;
1017     tb_to_ns_shift = shift;
1018     /* Save the current timebase to pretty up CONFIG_PRINTK_TIME */
1019     boot_tb = get_tb();
1020
1021     /* If platform provided a timezone (pmac), we correct the time */
1022     if (timezone_offset) {
1023         sys_tz.tz_minuteswest = -timezone_offset / 60;
1024         sys_tz.tz_dsttime = 0;
1025     }
1026
1027     vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
1028
1029     /* initialise and enable the large decrementer (if we have one) */
1030     set_decrementer_max();
1031     enable_large_decrementer();
1032
1033     /* Start the decrementer on CPUs that have manual control
1034      * such as BookE
1035      */
1036     start_cpu_decrementer();
1037
1038     /* Register the clocksource */
1039     clocksource_init();
1040
1041     init_decrementer_clockevent();
1042     tick_setup_hrtimer_broadcast();
1043
1044     of_clk_init(NULL);
1045     enable_sched_clock_irqtime();
1046 }
1047
1048 /*
1049  * Divide a 128-bit dividend by a 32-bit divisor, leaving a 128 bit
1050  * result.
1051  */
1052 void div128_by_32(u64 dividend_high, u64 dividend_low,
1053           unsigned divisor, struct div_result *dr)
1054 {
1055     unsigned long a, b, c, d;
1056     unsigned long w, x, y, z;
1057     u64 ra, rb, rc;
1058
1059     a = dividend_high >> 32;
1060     b = dividend_high & 0xffffffff;
1061     c = dividend_low >> 32;
1062     d = dividend_low & 0xffffffff;
1063
1064     w = a / divisor;
1065     ra = ((u64)(a - (w * divisor)) << 32) + b;
1066
1067     rb = ((u64) do_div(ra, divisor) << 32) + c;
1068     x = ra;
1069
1070     rc = ((u64) do_div(rb, divisor) << 32) + d;
1071     y = rb;
1072
1073     do_div(rc, divisor);
1074     z = rc;
1075
1076     dr->result_high = ((u64)w << 32) + x;
1077     dr->result_low  = ((u64)y << 32) + z;
1078
1079 }
1080
1081 /* We don't need to calibrate delay, we use the CPU timebase for that */
1082 void calibrate_delay(void)
1083 {
1084     /* Some generic code (such as spinlock debug) use loops_per_jiffy
1085      * as the number of __delay(1) in a jiffy, so make it so
1086      */
1087     loops_per_jiffy = tb_ticks_per_jiffy;
1088 }
1089
1090 #if IS_ENABLED(CONFIG_RTC_DRV_GENERIC)
1091 static int rtc_generic_get_time(struct device *dev, struct rtc_time *tm)
1092 {
1093     ppc_md.get_rtc_time(tm);
1094     return 0;
1095 }
1096
1097 static int rtc_generic_set_time(struct device *dev, struct rtc_time *tm)
1098 {
1099     if (!ppc_md.set_rtc_time)
1100         return -EOPNOTSUPP;
1101
1102     if (ppc_md.set_rtc_time(tm) < 0)
1103         return -EOPNOTSUPP;
1104
1105     return 0;
1106 }
1107
1108 static const struct rtc_class_ops rtc_generic_ops = {
1109     .read_time = rtc_generic_get_time,
1110     .set_time = rtc_generic_set_time,
1111 };
1112
1113 static int __init rtc_init(void)
1114 {
1115     struct platform_device *pdev;
1116
1117     if (!ppc_md.get_rtc_time)
1118         return -ENODEV;
1119
1120     pdev = platform_device_register_data(NULL, "rtc-generic", -1,
1121                          &rtc_generic_ops,
1122                          sizeof(rtc_generic_ops));
1123
1124     return PTR_ERR_OR_ZERO(pdev);
1125 }
1126
1127 device_initcall(rtc_init);
1128 #endif