0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Simple CPU accounting cgroup controller
0004  */
0005 
0006 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
0007 
0008 /*
0009  * There are no locks covering percpu hardirq/softirq time.
0010  * They are only modified in vtime_account, on the corresponding CPU
0011  * with interrupts disabled. So, writes are safe.
0012  * They are read and saved off onto struct rq in update_rq_clock().
0013  * This may result in another CPU reading this CPU's irq time and
0014  * racing with irq/vtime_account on this CPU. We would either get the old
0015  * or the new value, with a side effect of accounting a slice of irq time to
0016  * the wrong task when an irq is in progress while we read rq->clock. That is a
0017  * worthy compromise in place of having locks on each irq in account_system_time.
0018  */
0019 DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
0020 
0021 static int sched_clock_irqtime;
0022 
0023 void enable_sched_clock_irqtime(void)
0024 {
0025     sched_clock_irqtime = 1;
0026 }
0027 
0028 void disable_sched_clock_irqtime(void)
0029 {
0030     sched_clock_irqtime = 0;
0031 }
0032 
0033 static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
0034                   enum cpu_usage_stat idx)
0035 {
0036     u64 *cpustat = kcpustat_this_cpu->cpustat;
0037 
0038     u64_stats_update_begin(&irqtime->sync);
0039     cpustat[idx] += delta;
0040     irqtime->total += delta;
0041     irqtime->tick_delta += delta;
0042     u64_stats_update_end(&irqtime->sync);
0043 }
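
/*
 * A reader on a remote CPU is expected to pair the update above with a
 * u64_stats retry loop on irqtime->sync. A minimal sketch of such a reader,
 * using a hypothetical helper name (the in-tree reader lives in the scheduler
 * headers, not in this file):
 *
 *	static u64 irqtime_read_total(int cpu)
 *	{
 *		struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
 *		unsigned int seq;
 *		u64 total;
 *
 *		do {
 *			seq = u64_stats_fetch_begin(&irqtime->sync);
 *			total = irqtime->total;
 *		} while (u64_stats_fetch_retry(&irqtime->sync, seq));
 *
 *		return total;
 *	}
 *
 * On 64-bit kernels the fetch_begin/fetch_retry pair compiles away; on 32-bit
 * it protects the reader from observing a torn 64-bit update.
 */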
0044 
0045 /*
0046  * Called after incrementing preempt_count on {soft,}irq_enter
0047  * and before decrementing preempt_count on {soft,}irq_exit.
0048  */
0049 void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
0050 {
0051     struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
0052     unsigned int pc;
0053     s64 delta;
0054     int cpu;
0055 
0056     if (!sched_clock_irqtime)
0057         return;
0058 
0059     cpu = smp_processor_id();
0060     delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
0061     irqtime->irq_start_time += delta;
0062     pc = irq_count() - offset;
0063 
0064     /*
0065      * We do not account for softirq time from ksoftirqd here.
0066      * We want to continue accounting softirq time to the ksoftirqd thread
0067      * in that case, so as not to confuse the scheduler with a special task
0068      * that does not consume any time, but still wants to run.
0069      */
0070     if (pc & HARDIRQ_MASK)
0071         irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
0072     else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
0073         irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
0074 }
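
/*
 * Worked example of the classification above (illustrative): suppose a hard
 * interrupt arrives while a softirq handler is running. irq_enter() has
 * already added HARDIRQ_OFFSET, so irq_count() carries both HARDIRQ_OFFSET
 * and SOFTIRQ_OFFSET and the caller passes offset == HARDIRQ_OFFSET. Then
 * pc = irq_count() - offset still has SOFTIRQ_OFFSET set, so the delta that
 * elapsed up to this point is charged to CPUTIME_SOFTIRQ, i.e. to the context
 * being interrupted rather than the one being entered (unless that softirq
 * was running in ksoftirqd, per the check above, in which case the time just
 * stays with the ksoftirqd task). The hardirq's own duration is accounted by
 * the matching call on irq_exit(), which runs before HARDIRQ_OFFSET is
 * removed and with an offset that leaves the hardirq bits set in pc, so that
 * delta lands in CPUTIME_IRQ.
 */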
0075 
0076 static u64 irqtime_tick_accounted(u64 maxtime)
0077 {
0078     struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
0079     u64 delta;
0080 
0081     delta = min(irqtime->tick_delta, maxtime);
0082     irqtime->tick_delta -= delta;
0083 
0084     return delta;
0085 }
0086 
0087 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
0088 
0089 #define sched_clock_irqtime (0)
0090 
0091 static u64 irqtime_tick_accounted(u64 dummy)
0092 {
0093     return 0;
0094 }
0095 
0096 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
0097 
0098 static inline void task_group_account_field(struct task_struct *p, int index,
0099                         u64 tmp)
0100 {
0101     /*
0102      * Since all updates are sure to touch the root cgroup, we
0103      * go ahead and touch it first. If the root cgroup
0104      * is the only cgroup, then nothing else should be necessary.
0105      *
0106      */
0107     __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
0108 
0109     cgroup_account_cputime_field(p, index, tmp);
0110 }
0111 
0112 /*
0113  * Account user CPU time to a process.
0114  * @p: the process that the CPU time gets accounted to
0115  * @cputime: the CPU time spent in user space since the last update
0116  */
0117 void account_user_time(struct task_struct *p, u64 cputime)
0118 {
0119     int index;
0120 
0121     /* Add user time to process. */
0122     p->utime += cputime;
0123     account_group_user_time(p, cputime);
0124 
0125     index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
0126 
0127     /* Add user time to cpustat. */
0128     task_group_account_field(p, index, cputime);
0129 
0130     /* Account for user time used */
0131     acct_account_cputime(p);
0132 }
0133 
0134 /*
0135  * Account guest CPU time to a process.
0136  * @p: the process that the CPU time gets accounted to
0137  * @cputime: the CPU time spent in virtual machine since the last update
0138  */
0139 void account_guest_time(struct task_struct *p, u64 cputime)
0140 {
0141     u64 *cpustat = kcpustat_this_cpu->cpustat;
0142 
0143     /* Add guest time to process. */
0144     p->utime += cputime;
0145     account_group_user_time(p, cputime);
0146     p->gtime += cputime;
0147 
0148     /* Add guest time to cpustat. */
0149     if (task_nice(p) > 0) {
0150         task_group_account_field(p, CPUTIME_NICE, cputime);
0151         cpustat[CPUTIME_GUEST_NICE] += cputime;
0152     } else {
0153         task_group_account_field(p, CPUTIME_USER, cputime);
0154         cpustat[CPUTIME_GUEST] += cputime;
0155     }
0156 }
0157 
0158 /*
0159  * Account system CPU time to a process and desired cpustat field
0160  * @p: the process that the CPU time gets accounted to
0161  * @cputime: the CPU time spent in kernel space since the last update
0162  * @index: pointer to cpustat field that has to be updated
0163  */
0164 void account_system_index_time(struct task_struct *p,
0165                    u64 cputime, enum cpu_usage_stat index)
0166 {
0167     /* Add system time to process. */
0168     p->stime += cputime;
0169     account_group_system_time(p, cputime);
0170 
0171     /* Add system time to cpustat. */
0172     task_group_account_field(p, index, cputime);
0173 
0174     /* Account for system time used */
0175     acct_account_cputime(p);
0176 }
0177 
0178 /*
0179  * Account system CPU time to a process.
0180  * @p: the process that the CPU time gets accounted to
0181  * @hardirq_offset: the offset to subtract from hardirq_count()
0182  * @cputime: the CPU time spent in kernel space since the last update
0183  */
0184 void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
0185 {
0186     int index;
0187 
0188     if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
0189         account_guest_time(p, cputime);
0190         return;
0191     }
0192 
0193     if (hardirq_count() - hardirq_offset)
0194         index = CPUTIME_IRQ;
0195     else if (in_serving_softirq())
0196         index = CPUTIME_SOFTIRQ;
0197     else
0198         index = CPUTIME_SYSTEM;
0199 
0200     account_system_index_time(p, cputime, index);
0201 }
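
/*
 * Worked example (illustrative): account_process_tick() below calls this with
 * hardirq_offset == HARDIRQ_OFFSET, since the tick itself runs in hardirq
 * context. If that tick interrupted a vCPU thread executing guest code
 * (PF_VCPU set) with no other irq/softirq context active, the whole tick is
 * routed to account_guest_time(). If it interrupted a nested hardirq it is
 * charged to CPUTIME_IRQ, if it interrupted a softirq handler to
 * CPUTIME_SOFTIRQ, and otherwise to CPUTIME_SYSTEM.
 */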
0202 
0203 /*
0204  * Account for involuntary wait time.
0205  * @cputime: the CPU time spent in involuntary wait
0206  */
0207 void account_steal_time(u64 cputime)
0208 {
0209     u64 *cpustat = kcpustat_this_cpu->cpustat;
0210 
0211     cpustat[CPUTIME_STEAL] += cputime;
0212 }
0213 
0214 /*
0215  * Account for idle time.
0216  * @cputime: the CPU time spent in idle wait
0217  */
0218 void account_idle_time(u64 cputime)
0219 {
0220     u64 *cpustat = kcpustat_this_cpu->cpustat;
0221     struct rq *rq = this_rq();
0222 
0223     if (atomic_read(&rq->nr_iowait) > 0)
0224         cpustat[CPUTIME_IOWAIT] += cputime;
0225     else
0226         cpustat[CPUTIME_IDLE] += cputime;
0227 }
0228 
0229 
0230 #ifdef CONFIG_SCHED_CORE
0231 /*
0232  * Account for forceidle time due to core scheduling.
0233  *
0234  * REQUIRES: schedstat is enabled.
0235  */
0236 void __account_forceidle_time(struct task_struct *p, u64 delta)
0237 {
0238     __schedstat_add(p->stats.core_forceidle_sum, delta);
0239 
0240     task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
0241 }
0242 #endif
0243 
0244 /*
0245  * When a guest is interrupted for a longer amount of time, missed clock
0246  * ticks are not redelivered later. Due to that, this function may on
0247  * occasion account more time than the calling functions think elapsed.
0248  */
0249 static __always_inline u64 steal_account_process_time(u64 maxtime)
0250 {
0251 #ifdef CONFIG_PARAVIRT
0252     if (static_key_false(&paravirt_steal_enabled)) {
0253         u64 steal;
0254 
0255         steal = paravirt_steal_clock(smp_processor_id());
0256         steal -= this_rq()->prev_steal_time;
0257         steal = min(steal, maxtime);
0258         account_steal_time(steal);
0259         this_rq()->prev_steal_time += steal;
0260 
0261         return steal;
0262     }
0263 #endif
0264     return 0;
0265 }
0266 
0267 /*
0268  * Account how much elapsed time was spent in steal, irq, or softirq time.
0269  */
0270 static inline u64 account_other_time(u64 max)
0271 {
0272     u64 accounted;
0273 
0274     lockdep_assert_irqs_disabled();
0275 
0276     accounted = steal_account_process_time(max);
0277 
0278     if (accounted < max)
0279         accounted += irqtime_tick_accounted(max - accounted);
0280 
0281     return accounted;
0282 }
0283 
0284 #ifdef CONFIG_64BIT
0285 static inline u64 read_sum_exec_runtime(struct task_struct *t)
0286 {
0287     return t->se.sum_exec_runtime;
0288 }
0289 #else
0290 static u64 read_sum_exec_runtime(struct task_struct *t)
0291 {
0292     u64 ns;
0293     struct rq_flags rf;
0294     struct rq *rq;
0295 
0296     rq = task_rq_lock(t, &rf);
0297     ns = t->se.sum_exec_runtime;
0298     task_rq_unlock(rq, t, &rf);
0299 
0300     return ns;
0301 }
0302 #endif
0303 
0304 /*
0305  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
0306  * tasks (sum on group iteration) belonging to @tsk's group.
0307  */
0308 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
0309 {
0310     struct signal_struct *sig = tsk->signal;
0311     u64 utime, stime;
0312     struct task_struct *t;
0313     unsigned int seq, nextseq;
0314     unsigned long flags;
0315 
0316     /*
0317      * Update current task runtime to account pending time since last
0318      * scheduler action or thread_group_cputime() call. This thread group
0319      * might have other running tasks on different CPUs, but updating
0320     * their runtime can affect syscall performance, so we skip accounting
0321      * those pending times and rely only on values updated on tick or
0322      * other scheduler action.
0323      */
0324     if (same_thread_group(current, tsk))
0325         (void) task_sched_runtime(current);
0326 
0327     rcu_read_lock();
0328     /* Attempt a lockless read on the first round. */
0329     nextseq = 0;
0330     do {
0331         seq = nextseq;
0332         flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
0333         times->utime = sig->utime;
0334         times->stime = sig->stime;
0335         times->sum_exec_runtime = sig->sum_sched_runtime;
0336 
0337         for_each_thread(tsk, t) {
0338             task_cputime(t, &utime, &stime);
0339             times->utime += utime;
0340             times->stime += stime;
0341             times->sum_exec_runtime += read_sum_exec_runtime(t);
0342         }
0343         /* If lockless access failed, take the lock. */
0344         nextseq = 1;
0345     } while (need_seqretry(&sig->stats_lock, seq));
0346     done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
0347     rcu_read_unlock();
0348 }
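
/*
 * The retry pattern above needs at most two passes (illustrative trace): the
 * first pass runs with nextseq == 0, so read_seqbegin_or_lock_irqsave() only
 * samples the seqcount and the sums are read locklessly. If a writer updated
 * the statistics under sig->stats_lock in the meantime, need_seqretry()
 * reports a retry and the loop runs again with nextseq == 1, this time taking
 * the lock, so the second pass is guaranteed a consistent snapshot.
 */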
0349 
0350 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
0351 /*
0352  * Account a tick to a process and cpustat
0353  * @p: the process that the CPU time gets accounted to
0354  * @user_tick: whether the tick was taken while running in user space
0355  * @ticks: number of ticks being accounted
0356  *
0357  * Tick demultiplexing follows the order
0358  * - pending hardirq update
0359  * - pending softirq update
0360  * - user_time
0361  * - idle_time
0362  * - system time
0363  *   - check for guest_time
0364  *   - else account as system_time
0365  *
0366  * The check for hardirq is done for both system and user time as there is
0367  * no timer going off while we are in a hardirq, and hence we may never get an
0368  * opportunity to update it solely in system time.
0369  * p->stime and friends are only updated on system time and not on irq or
0370  * softirq time, as those no longer count in task exec_runtime.
0371  */
0372 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
0373                      int ticks)
0374 {
0375     u64 other, cputime = TICK_NSEC * ticks;
0376 
0377     /*
0378      * When returning from idle, many ticks can get accounted at
0379      * once, including some ticks of steal, irq, and softirq time.
0380      * Subtract those ticks from the amount of time accounted to
0381      * idle, or potentially user or system time. Due to rounding,
0382      * other time can exceed ticks occasionally.
0383      */
0384     other = account_other_time(ULONG_MAX);
0385     if (other >= cputime)
0386         return;
0387 
0388     cputime -= other;
0389 
0390     if (this_cpu_ksoftirqd() == p) {
0391         /*
0392          * ksoftirqd time does not get accounted in cpu_softirq_time,
0393          * so we have to handle it separately here.
0394          * Also, p->stime needs to be updated for ksoftirqd.
0395          */
0396         account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
0397     } else if (user_tick) {
0398         account_user_time(p, cputime);
0399     } else if (p == this_rq()->idle) {
0400         account_idle_time(cputime);
0401     } else if (p->flags & PF_VCPU) { /* System time or guest time */
0402         account_guest_time(p, cputime);
0403     } else {
0404         account_system_index_time(p, cputime, CPUTIME_SYSTEM);
0405     }
0406 }
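
/*
 * Numeric example (illustrative, assuming HZ == 1000 so TICK_NSEC is
 * 1,000,000 ns): returning from a long idle stretch may hand this function
 * ticks == 5, i.e. cputime == 5,000,000 ns. If account_other_time() reports
 * that 1,200,000 ns of that window were steal/irq/softirq time, only the
 * remaining 3,800,000 ns are charged to the interrupted context (idle here);
 * had "other" covered the full 5 ms, the early return above would have
 * accounted nothing further.
 */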
0407 
0408 static void irqtime_account_idle_ticks(int ticks)
0409 {
0410     irqtime_account_process_tick(current, 0, ticks);
0411 }
0412 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
0413 static inline void irqtime_account_idle_ticks(int ticks) { }
0414 static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
0415                         int nr_ticks) { }
0416 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
0417 
0418 /*
0419  * Use precise platform statistics if available:
0420  */
0421 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
0422 
0423 # ifndef __ARCH_HAS_VTIME_TASK_SWITCH
0424 void vtime_task_switch(struct task_struct *prev)
0425 {
0426     if (is_idle_task(prev))
0427         vtime_account_idle(prev);
0428     else
0429         vtime_account_kernel(prev);
0430 
0431     vtime_flush(prev);
0432     arch_vtime_task_switch(prev);
0433 }
0434 # endif
0435 
0436 void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
0437 {
0438     unsigned int pc = irq_count() - offset;
0439 
0440     if (pc & HARDIRQ_OFFSET) {
0441         vtime_account_hardirq(tsk);
0442     } else if (pc & SOFTIRQ_OFFSET) {
0443         vtime_account_softirq(tsk);
0444     } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
0445            is_idle_task(tsk)) {
0446         vtime_account_idle(tsk);
0447     } else {
0448         vtime_account_kernel(tsk);
0449     }
0450 }
0451 
0452 void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
0453             u64 *ut, u64 *st)
0454 {
0455     *ut = curr->utime;
0456     *st = curr->stime;
0457 }
0458 
0459 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
0460 {
0461     *ut = p->utime;
0462     *st = p->stime;
0463 }
0464 EXPORT_SYMBOL_GPL(task_cputime_adjusted);
0465 
0466 void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
0467 {
0468     struct task_cputime cputime;
0469 
0470     thread_group_cputime(p, &cputime);
0471 
0472     *ut = cputime.utime;
0473     *st = cputime.stime;
0474 }
0475 
0476 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
0477 
0478 /*
0479  * Account a single tick of CPU time.
0480  * @p: the process that the CPU time gets accounted to
0481  * @user_tick: indicates if the tick is a user or a system tick
0482  */
0483 void account_process_tick(struct task_struct *p, int user_tick)
0484 {
0485     u64 cputime, steal;
0486 
0487     if (vtime_accounting_enabled_this_cpu())
0488         return;
0489 
0490     if (sched_clock_irqtime) {
0491         irqtime_account_process_tick(p, user_tick, 1);
0492         return;
0493     }
0494 
0495     cputime = TICK_NSEC;
0496     steal = steal_account_process_time(ULONG_MAX);
0497 
0498     if (steal >= cputime)
0499         return;
0500 
0501     cputime -= steal;
0502 
0503     if (user_tick)
0504         account_user_time(p, cputime);
0505     else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
0506         account_system_time(p, HARDIRQ_OFFSET, cputime);
0507     else
0508         account_idle_time(cputime);
0509 }
0510 
0511 /*
0512  * Account multiple ticks of idle time.
0513  * @ticks: number of whole ticks spent idle
0514  */
0515 void account_idle_ticks(unsigned long ticks)
0516 {
0517     u64 cputime, steal;
0518 
0519     if (sched_clock_irqtime) {
0520         irqtime_account_idle_ticks(ticks);
0521         return;
0522     }
0523 
0524     cputime = ticks * TICK_NSEC;
0525     steal = steal_account_process_time(ULONG_MAX);
0526 
0527     if (steal >= cputime)
0528         return;
0529 
0530     cputime -= steal;
0531     account_idle_time(cputime);
0532 }
0533 
0534 /*
0535  * Adjust tick based cputime random precision against scheduler runtime
0536  * accounting.
0537  *
0538  * Tick based cputime accounting depends on whether the random scheduling
0539  * timeslices of a task happen to be interrupted by the timer or not.
0540  * Depending on these circumstances, the number of these interrupts may over-
0541  * or underestimate the real user and system cputime, matching them only with
0542  * variable precision.
0543  *
0544  * Fix this by scaling these tick based values against the total runtime
0545  * accounted by the CFS scheduler.
0546  *
0547  * This code provides the following guarantees:
0548  *
0549  *   stime + utime == rtime
0550  *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
0551  *
0552  * Assuming that rtime_i+1 >= rtime_i.
0553  */
0554 void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
0555             u64 *ut, u64 *st)
0556 {
0557     u64 rtime, stime, utime;
0558     unsigned long flags;
0559 
0560     /* Serialize concurrent callers such that we can honour our guarantees */
0561     raw_spin_lock_irqsave(&prev->lock, flags);
0562     rtime = curr->sum_exec_runtime;
0563 
0564     /*
0565      * This is possible under two circumstances:
0566      *  - rtime isn't monotonic after all (a bug);
0567      *  - we got reordered by the lock.
0568      *
0569      * In both cases this acts as a filter such that the rest of the code
0570      * can assume it is monotonic regardless of anything else.
0571      */
0572     if (prev->stime + prev->utime >= rtime)
0573         goto out;
0574 
0575     stime = curr->stime;
0576     utime = curr->utime;
0577 
0578     /*
0579      * If either stime or utime are 0, assume all runtime is userspace.
0580      * Once a task gets some ticks, the monotonicity code at 'update:'
0581      * will ensure things converge to the observed ratio.
0582      */
0583     if (stime == 0) {
0584         utime = rtime;
0585         goto update;
0586     }
0587 
0588     if (utime == 0) {
0589         stime = rtime;
0590         goto update;
0591     }
0592 
0593     stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
0594 
0595 update:
0596     /*
0597      * Make sure stime doesn't go backwards; this preserves monotonicity
0598      * for utime because rtime is monotonic.
0599      *
0600      *  utime_i+1 = rtime_i+1 - stime_i
0601      *            = rtime_i+1 - (rtime_i - utime_i)
0602      *            = (rtime_i+1 - rtime_i) + utime_i
0603      *            >= utime_i
0604      */
0605     if (stime < prev->stime)
0606         stime = prev->stime;
0607     utime = rtime - stime;
0608 
0609     /*
0610      * Make sure utime doesn't go backwards; this still preserves
0611      * monotonicity for stime, analogous argument to above.
0612      */
0613     if (utime < prev->utime) {
0614         utime = prev->utime;
0615         stime = rtime - utime;
0616     }
0617 
0618     prev->stime = stime;
0619     prev->utime = utime;
0620 out:
0621     *ut = prev->utime;
0622     *st = prev->stime;
0623     raw_spin_unlock_irqrestore(&prev->lock, flags);
0624 }
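
/*
 * Numeric example of the scaling above (illustrative): suppose
 * rtime == 10,000,000 ns of scheduler-accounted runtime, while the tick
 * samples report stime == 2,000,000 ns and utime == 3,000,000 ns. Then stime
 * is rescaled to 10,000,000 * 2,000,000 / 5,000,000 == 4,000,000 ns and
 * utime becomes rtime - stime == 6,000,000 ns, preserving the observed 2:3
 * ratio while making the pair sum exactly to rtime. The comparisons against
 * prev->stime and prev->utime then only ever move the split forward, which
 * is what keeps both reported values monotonic.
 */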
0625 
0626 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
0627 {
0628     struct task_cputime cputime = {
0629         .sum_exec_runtime = p->se.sum_exec_runtime,
0630     };
0631 
0632     if (task_cputime(p, &cputime.utime, &cputime.stime))
0633         cputime.sum_exec_runtime = task_sched_runtime(p);
0634     cputime_adjust(&cputime, &p->prev_cputime, ut, st);
0635 }
0636 EXPORT_SYMBOL_GPL(task_cputime_adjusted);
0637 
0638 void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
0639 {
0640     struct task_cputime cputime;
0641 
0642     thread_group_cputime(p, &cputime);
0643     cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
0644 }
0645 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
0646 
0647 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
0648 static u64 vtime_delta(struct vtime *vtime)
0649 {
0650     unsigned long long clock;
0651 
0652     clock = sched_clock();
0653     if (clock < vtime->starttime)
0654         return 0;
0655 
0656     return clock - vtime->starttime;
0657 }
0658 
0659 static u64 get_vtime_delta(struct vtime *vtime)
0660 {
0661     u64 delta = vtime_delta(vtime);
0662     u64 other;
0663 
0664     /*
0665      * Unlike tick based timing, vtime based timing never loses ticks,
0666      * so there is no need for steal time accounting to make up for
0667      * lost ticks. Vtime accounts a rounded version of actual
0668      * elapsed time. Limit account_other_time to prevent rounding
0669      * errors from causing elapsed vtime to go negative.
0670      */
0671     other = account_other_time(delta);
0672     WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
0673     vtime->starttime += delta;
0674 
0675     return delta - other;
0676 }
0677 
0678 static void vtime_account_system(struct task_struct *tsk,
0679                  struct vtime *vtime)
0680 {
0681     vtime->stime += get_vtime_delta(vtime);
0682     if (vtime->stime >= TICK_NSEC) {
0683         account_system_time(tsk, irq_count(), vtime->stime);
0684         vtime->stime = 0;
0685     }
0686 }
0687 
0688 static void vtime_account_guest(struct task_struct *tsk,
0689                 struct vtime *vtime)
0690 {
0691     vtime->gtime += get_vtime_delta(vtime);
0692     if (vtime->gtime >= TICK_NSEC) {
0693         account_guest_time(tsk, vtime->gtime);
0694         vtime->gtime = 0;
0695     }
0696 }
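
/*
 * Both helpers above batch vtime into at least tick-sized chunks before
 * flushing it into the global accounting. Illustrative example (assuming
 * HZ == 250, i.e. TICK_NSEC == 4,000,000 ns): three successive kernel stints
 * of 1.5 ms, 1 ms and 2 ms accumulate in vtime->stime; only after the third
 * one does vtime->stime (4.5 ms) reach TICK_NSEC, at which point the whole
 * 4.5 ms is handed to account_system_time() and the accumulator resets to 0.
 */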
0697 
0698 static void __vtime_account_kernel(struct task_struct *tsk,
0699                    struct vtime *vtime)
0700 {
0701     /* We might have scheduled out from guest path */
0702     if (vtime->state == VTIME_GUEST)
0703         vtime_account_guest(tsk, vtime);
0704     else
0705         vtime_account_system(tsk, vtime);
0706 }
0707 
0708 void vtime_account_kernel(struct task_struct *tsk)
0709 {
0710     struct vtime *vtime = &tsk->vtime;
0711 
0712     if (!vtime_delta(vtime))
0713         return;
0714 
0715     write_seqcount_begin(&vtime->seqcount);
0716     __vtime_account_kernel(tsk, vtime);
0717     write_seqcount_end(&vtime->seqcount);
0718 }
0719 
0720 void vtime_user_enter(struct task_struct *tsk)
0721 {
0722     struct vtime *vtime = &tsk->vtime;
0723 
0724     write_seqcount_begin(&vtime->seqcount);
0725     vtime_account_system(tsk, vtime);
0726     vtime->state = VTIME_USER;
0727     write_seqcount_end(&vtime->seqcount);
0728 }
0729 
0730 void vtime_user_exit(struct task_struct *tsk)
0731 {
0732     struct vtime *vtime = &tsk->vtime;
0733 
0734     write_seqcount_begin(&vtime->seqcount);
0735     vtime->utime += get_vtime_delta(vtime);
0736     if (vtime->utime >= TICK_NSEC) {
0737         account_user_time(tsk, vtime->utime);
0738         vtime->utime = 0;
0739     }
0740     vtime->state = VTIME_SYS;
0741     write_seqcount_end(&vtime->seqcount);
0742 }
0743 
0744 void vtime_guest_enter(struct task_struct *tsk)
0745 {
0746     struct vtime *vtime = &tsk->vtime;
0747     /*
0748      * The flags must be updated under the lock with
0749      * the vtime_starttime flush and update.
0750      * That enforces the right ordering and update sequence
0751      * synchronization against the reader (task_gtime())
0752      * that can thus safely catch up with a tickless delta.
0753      */
0754     write_seqcount_begin(&vtime->seqcount);
0755     vtime_account_system(tsk, vtime);
0756     tsk->flags |= PF_VCPU;
0757     vtime->state = VTIME_GUEST;
0758     write_seqcount_end(&vtime->seqcount);
0759 }
0760 EXPORT_SYMBOL_GPL(vtime_guest_enter);
0761 
0762 void vtime_guest_exit(struct task_struct *tsk)
0763 {
0764     struct vtime *vtime = &tsk->vtime;
0765 
0766     write_seqcount_begin(&vtime->seqcount);
0767     vtime_account_guest(tsk, vtime);
0768     tsk->flags &= ~PF_VCPU;
0769     vtime->state = VTIME_SYS;
0770     write_seqcount_end(&vtime->seqcount);
0771 }
0772 EXPORT_SYMBOL_GPL(vtime_guest_exit);
0773 
0774 void vtime_account_idle(struct task_struct *tsk)
0775 {
0776     account_idle_time(get_vtime_delta(&tsk->vtime));
0777 }
0778 
0779 void vtime_task_switch_generic(struct task_struct *prev)
0780 {
0781     struct vtime *vtime = &prev->vtime;
0782 
0783     write_seqcount_begin(&vtime->seqcount);
0784     if (vtime->state == VTIME_IDLE)
0785         vtime_account_idle(prev);
0786     else
0787         __vtime_account_kernel(prev, vtime);
0788     vtime->state = VTIME_INACTIVE;
0789     vtime->cpu = -1;
0790     write_seqcount_end(&vtime->seqcount);
0791 
0792     vtime = &current->vtime;
0793 
0794     write_seqcount_begin(&vtime->seqcount);
0795     if (is_idle_task(current))
0796         vtime->state = VTIME_IDLE;
0797     else if (current->flags & PF_VCPU)
0798         vtime->state = VTIME_GUEST;
0799     else
0800         vtime->state = VTIME_SYS;
0801     vtime->starttime = sched_clock();
0802     vtime->cpu = smp_processor_id();
0803     write_seqcount_end(&vtime->seqcount);
0804 }
0805 
0806 void vtime_init_idle(struct task_struct *t, int cpu)
0807 {
0808     struct vtime *vtime = &t->vtime;
0809     unsigned long flags;
0810 
0811     local_irq_save(flags);
0812     write_seqcount_begin(&vtime->seqcount);
0813     vtime->state = VTIME_IDLE;
0814     vtime->starttime = sched_clock();
0815     vtime->cpu = cpu;
0816     write_seqcount_end(&vtime->seqcount);
0817     local_irq_restore(flags);
0818 }
0819 
0820 u64 task_gtime(struct task_struct *t)
0821 {
0822     struct vtime *vtime = &t->vtime;
0823     unsigned int seq;
0824     u64 gtime;
0825 
0826     if (!vtime_accounting_enabled())
0827         return t->gtime;
0828 
0829     do {
0830         seq = read_seqcount_begin(&vtime->seqcount);
0831 
0832         gtime = t->gtime;
0833         if (vtime->state == VTIME_GUEST)
0834             gtime += vtime->gtime + vtime_delta(vtime);
0835 
0836     } while (read_seqcount_retry(&vtime->seqcount, seq));
0837 
0838     return gtime;
0839 }
0840 
0841 /*
0842  * Fetch cputime raw values from fields of task_struct and
0843  * add up the pending nohz execution time since the last
0844  * cputime snapshot.
0845  */
0846 bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
0847 {
0848     struct vtime *vtime = &t->vtime;
0849     unsigned int seq;
0850     u64 delta;
0851     int ret;
0852 
0853     if (!vtime_accounting_enabled()) {
0854         *utime = t->utime;
0855         *stime = t->stime;
0856         return false;
0857     }
0858 
0859     do {
0860         ret = false;
0861         seq = read_seqcount_begin(&vtime->seqcount);
0862 
0863         *utime = t->utime;
0864         *stime = t->stime;
0865 
0866         /* Task is sleeping or idle, nothing to add */
0867         if (vtime->state < VTIME_SYS)
0868             continue;
0869 
0870         ret = true;
0871         delta = vtime_delta(vtime);
0872 
0873         /*
0874          * Task runs either in user (including guest) or kernel space,
0875          * add pending nohz time to the right place.
0876          */
0877         if (vtime->state == VTIME_SYS)
0878             *stime += vtime->stime + delta;
0879         else
0880             *utime += vtime->utime + delta;
0881     } while (read_seqcount_retry(&vtime->seqcount, seq));
0882 
0883     return ret;
0884 }
0885 
0886 static int vtime_state_fetch(struct vtime *vtime, int cpu)
0887 {
0888     int state = READ_ONCE(vtime->state);
0889 
0890     /*
0891      * We raced against a context switch, fetch the
0892      * kcpustat task again.
0893      */
0894     if (vtime->cpu != cpu && vtime->cpu != -1)
0895         return -EAGAIN;
0896 
0897     /*
0898      * Two possible things here:
0899      * 1) We are seeing the scheduling out task (prev) or any past one.
0900      * 2) We are seeing the scheduling in task (next) but it hasn't
0901      *    passed though vtime_task_switch() yet so the pending
0902      *    cputime of the prev task may not be flushed yet.
0903      *
0904      * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
0905      */
0906     if (state == VTIME_INACTIVE)
0907         return -EAGAIN;
0908 
0909     return state;
0910 }
0911 
0912 static u64 kcpustat_user_vtime(struct vtime *vtime)
0913 {
0914     if (vtime->state == VTIME_USER)
0915         return vtime->utime + vtime_delta(vtime);
0916     else if (vtime->state == VTIME_GUEST)
0917         return vtime->gtime + vtime_delta(vtime);
0918     return 0;
0919 }
0920 
0921 static int kcpustat_field_vtime(u64 *cpustat,
0922                 struct task_struct *tsk,
0923                 enum cpu_usage_stat usage,
0924                 int cpu, u64 *val)
0925 {
0926     struct vtime *vtime = &tsk->vtime;
0927     unsigned int seq;
0928 
0929     do {
0930         int state;
0931 
0932         seq = read_seqcount_begin(&vtime->seqcount);
0933 
0934         state = vtime_state_fetch(vtime, cpu);
0935         if (state < 0)
0936             return state;
0937 
0938         *val = cpustat[usage];
0939 
0940         /*
0941          * Nice vs. non-nice cputime accounting may be inaccurate if
0942          * the nice value has changed since the last vtime update.
0943          * But a proper fix would involve interrupting the target on nice
0944          * updates, which is a no-go on nohz_full (although the scheduler
0945          * may still interrupt the target if rescheduling is needed...).
0946          */
0947         switch (usage) {
0948         case CPUTIME_SYSTEM:
0949             if (state == VTIME_SYS)
0950                 *val += vtime->stime + vtime_delta(vtime);
0951             break;
0952         case CPUTIME_USER:
0953             if (task_nice(tsk) <= 0)
0954                 *val += kcpustat_user_vtime(vtime);
0955             break;
0956         case CPUTIME_NICE:
0957             if (task_nice(tsk) > 0)
0958                 *val += kcpustat_user_vtime(vtime);
0959             break;
0960         case CPUTIME_GUEST:
0961             if (state == VTIME_GUEST && task_nice(tsk) <= 0)
0962                 *val += vtime->gtime + vtime_delta(vtime);
0963             break;
0964         case CPUTIME_GUEST_NICE:
0965             if (state == VTIME_GUEST && task_nice(tsk) > 0)
0966                 *val += vtime->gtime + vtime_delta(vtime);
0967             break;
0968         default:
0969             break;
0970         }
0971     } while (read_seqcount_retry(&vtime->seqcount, seq));
0972 
0973     return 0;
0974 }
0975 
0976 u64 kcpustat_field(struct kernel_cpustat *kcpustat,
0977            enum cpu_usage_stat usage, int cpu)
0978 {
0979     u64 *cpustat = kcpustat->cpustat;
0980     u64 val = cpustat[usage];
0981     struct rq *rq;
0982     int err;
0983 
0984     if (!vtime_accounting_enabled_cpu(cpu))
0985         return val;
0986 
0987     rq = cpu_rq(cpu);
0988 
0989     for (;;) {
0990         struct task_struct *curr;
0991 
0992         rcu_read_lock();
0993         curr = rcu_dereference(rq->curr);
0994         if (WARN_ON_ONCE(!curr)) {
0995             rcu_read_unlock();
0996             return cpustat[usage];
0997         }
0998 
0999         err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
1000         rcu_read_unlock();
1001 
1002         if (!err)
1003             return val;
1004 
1005         cpu_relax();
1006     }
1007 }
1008 EXPORT_SYMBOL_GPL(kcpustat_field);
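
/*
 * Usage sketch (illustrative): a consumer wanting the system time of a
 * nohz_full CPU, including the not-yet-flushed vtime of whatever task is
 * currently running there, could do something like:
 *
 *	u64 sys_ns = kcpustat_field(&kcpustat_cpu(cpu), CPUTIME_SYSTEM, cpu);
 *
 * On CPUs without vtime accounting this degrades to a plain read of the
 * cpustat array; on vtime CPUs the retry loop above transparently handles
 * races with context switches on the remote CPU.
 */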
1009 
1010 static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
1011                     const struct kernel_cpustat *src,
1012                     struct task_struct *tsk, int cpu)
1013 {
1014     struct vtime *vtime = &tsk->vtime;
1015     unsigned int seq;
1016 
1017     do {
1018         u64 *cpustat;
1019         u64 delta;
1020         int state;
1021 
1022         seq = read_seqcount_begin(&vtime->seqcount);
1023 
1024         state = vtime_state_fetch(vtime, cpu);
1025         if (state < 0)
1026             return state;
1027 
1028         *dst = *src;
1029         cpustat = dst->cpustat;
1030 
1031         /* Task is sleeping, dead or idle, nothing to add */
1032         if (state < VTIME_SYS)
1033             continue;
1034 
1035         delta = vtime_delta(vtime);
1036 
1037         /*
1038          * Task runs either in user (including guest) or kernel space,
1039          * add pending nohz time to the right place.
1040          */
1041         if (state == VTIME_SYS) {
1042             cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
1043         } else if (state == VTIME_USER) {
1044             if (task_nice(tsk) > 0)
1045                 cpustat[CPUTIME_NICE] += vtime->utime + delta;
1046             else
1047                 cpustat[CPUTIME_USER] += vtime->utime + delta;
1048         } else {
1049             WARN_ON_ONCE(state != VTIME_GUEST);
1050             if (task_nice(tsk) > 0) {
1051                 cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
1052                 cpustat[CPUTIME_NICE] += vtime->gtime + delta;
1053             } else {
1054                 cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
1055                 cpustat[CPUTIME_USER] += vtime->gtime + delta;
1056             }
1057         }
1058     } while (read_seqcount_retry(&vtime->seqcount, seq));
1059 
1060     return 0;
1061 }
1062 
1063 void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
1064 {
1065     const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
1066     struct rq *rq;
1067     int err;
1068 
1069     if (!vtime_accounting_enabled_cpu(cpu)) {
1070         *dst = *src;
1071         return;
1072     }
1073 
1074     rq = cpu_rq(cpu);
1075 
1076     for (;;) {
1077         struct task_struct *curr;
1078 
1079         rcu_read_lock();
1080         curr = rcu_dereference(rq->curr);
1081         if (WARN_ON_ONCE(!curr)) {
1082             rcu_read_unlock();
1083             *dst = *src;
1084             return;
1085         }
1086 
1087         err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
1088         rcu_read_unlock();
1089 
1090         if (!err)
1091             return;
1092 
1093         cpu_relax();
1094     }
1095 }
1096 EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
1097 
1098 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */