// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
    return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated.  Put it on the parent's matching
 * rstat_cpu->updated_children list.  See the comment on top of
 * cgroup_rstat_cpu definition for details.
 */
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
    raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
    unsigned long flags;

    /*
     * Speculative already-on-list test. This may race leading to
     * temporary inaccuracies, which is fine.
     *
     * Because @parent's updated_children is terminated with @parent
     * instead of NULL, we can tell whether @cgrp is on the list by
     * testing the next pointer for NULL.
     */
    if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
        return;

    raw_spin_lock_irqsave(cpu_lock, flags);

    /* put @cgrp and all ancestors on the corresponding updated lists */
    while (true) {
        struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup_rstat_cpu *prstatc;

        /*
         * Both additions and removals are bottom-up.  If a cgroup
         * is already in the tree, all ancestors are.
         */
        if (rstatc->updated_next)
            break;

        /* Root has no parent to link it to, but mark it busy */
        if (!parent) {
            rstatc->updated_next = cgrp;
            break;
        }

        prstatc = cgroup_rstat_cpu(parent, cpu);
        rstatc->updated_next = prstatc->updated_children;
        prstatc->updated_children = cgrp;

        cgrp = parent;
    }

    raw_spin_unlock_irqrestore(cpu_lock, flags);
}
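
/*
 * Example (illustrative sketch, not part of this file): a controller that
 * keeps its own per-cpu counter would bump the counter and then mark the
 * cgroup dirty so the next flush picks up the delta.  The struct and field
 * names below are hypothetical; only cgroup_rstat_updated() is real.
 *
 *     struct my_cgroup_pcpu_stat {                    // hypothetical
 *             u64 nr_events;
 *     };
 *
 *     // called with preemption disabled so smp_processor_id() is stable
 *     static void my_account_event(struct cgroup *cgrp,
 *                                  struct my_cgroup_pcpu_stat __percpu *pcpu)
 *     {
 *             this_cpu_inc(pcpu->nr_events);
 *             cgroup_rstat_updated(cgrp, smp_processor_id());
 *     }
 *
 * Thanks to the speculative updated_next check above, repeated calls on the
 * same cpu stay cheap until the cgroup is flushed off the updated tree.
 */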

/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
 * the traversal and %NULL return indicates the end.  During traversal,
 * each returned cgroup is unlinked from the tree.  Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
                                                   struct cgroup *root, int cpu)
{
    struct cgroup_rstat_cpu *rstatc;
    struct cgroup *parent;

    if (pos == root)
        return NULL;

    /*
     * We're going to walk down to the first leaf and visit/remove it.
     * We can pick any unvisited node as the starting point.
     */
    if (!pos) {
        pos = root;
        /* return NULL if this subtree is not on-list */
        if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
            return NULL;
    } else {
        pos = cgroup_parent(pos);
    }

    /* walk down to the first leaf */
    while (true) {
        rstatc = cgroup_rstat_cpu(pos, cpu);
        if (rstatc->updated_children == pos)
            break;
        pos = rstatc->updated_children;
    }

    /*
     * Unlink @pos from the tree.  As the updated_children list is
     * singly linked, we have to walk it to find the removal point.
     * However, due to the way we traverse, @pos will be the first
     * child in most cases. The only exception is @root.
     */
    parent = cgroup_parent(pos);
    if (parent) {
        struct cgroup_rstat_cpu *prstatc;
        struct cgroup **nextp;

        prstatc = cgroup_rstat_cpu(parent, cpu);
        nextp = &prstatc->updated_children;
        while (*nextp != pos) {
            struct cgroup_rstat_cpu *nrstatc;

            nrstatc = cgroup_rstat_cpu(*nextp, cpu);
            WARN_ON_ONCE(*nextp == parent);
            nextp = &nrstatc->updated_next;
        }
        *nextp = rstatc->updated_next;
    }

    rstatc->updated_next = NULL;
    return pos;
}
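
/*
 * Worked example of the pop order: with the updated tree
 *
 *     root
 *     `- A
 *        |- B
 *        `- C
 *
 * all on-list, successive calls starting from @pos == NULL return leaves
 * before their ancestors, e.g. B, C, A, root (sibling order depends on
 * insertion order), unlinking each cgroup as it is popped.  This is what
 * lets cgroup_rstat_flush_locked() below push a child's flushed delta into
 * its parent and still have the parent visited afterwards.
 */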

/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
    __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
    int cpu;

    lockdep_assert_held(&cgroup_rstat_lock);

    for_each_possible_cpu(cpu) {
        raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
                                               cpu);
        struct cgroup *pos = NULL;
        unsigned long flags;

        /*
         * The _irqsave() is needed because cgroup_rstat_lock is
         * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
         * this lock with the _irq() suffix only disables interrupts on
         * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
         * interrupts on both configurations. The _irqsave() ensures
         * that interrupts are always disabled and later restored.
         */
        raw_spin_lock_irqsave(cpu_lock, flags);
        while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
            struct cgroup_subsys_state *css;

            cgroup_base_stat_flush(pos, cpu);

            rcu_read_lock();
            list_for_each_entry_rcu(css, &pos->rstat_css_list,
                                    rstat_css_node)
                css->ss->css_rstat_flush(css, cpu);
            rcu_read_unlock();
        }
        raw_spin_unlock_irqrestore(cpu_lock, flags);

        /* if @may_sleep, play nice and yield if necessary */
        if (may_sleep && (need_resched() ||
                          spin_needbreak(&cgroup_rstat_lock))) {
            spin_unlock_irq(&cgroup_rstat_lock);
            if (!cond_resched())
                cpu_relax();
            spin_lock_irq(&cgroup_rstat_lock);
        }
    }
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards.  After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
void cgroup_rstat_flush(struct cgroup *cgrp)
{
    might_sleep();

    spin_lock_irq(&cgroup_rstat_lock);
    cgroup_rstat_flush_locked(cgrp, true);
    spin_unlock_irq(&cgroup_rstat_lock);
}
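
/*
 * Example (illustrative sketch): the subsystem side of a flush.  For every
 * css on @cgrp's ->rstat_css_list, cgroup_rstat_flush_locked() invokes
 * ->css_rstat_flush(css, cpu) once per cpu.  Everything below except that
 * callback signature is hypothetical.
 *
 *     static void my_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 *     {
 *             struct my_css *mc = container_of(css, struct my_css, css);  // hypothetical
 *             struct my_pcpu *pc = per_cpu_ptr(mc->pcpu, cpu);            // hypothetical
 *             u64 delta = pc->nr_events - pc->last_nr_events;
 *
 *             pc->last_nr_events = pc->nr_events;
 *             mc->nr_events_total += delta;   // fold the cpu delta into the global counter
 *     }
 *
 * The delta-against-last pattern mirrors what cgroup_base_stat_flush() does
 * for the base cputime stats further down in this file.
 */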

/**
 * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
 * @cgrp: target cgroup
 *
 * This function can be called from any context.
 */
void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
{
    unsigned long flags;

    spin_lock_irqsave(&cgroup_rstat_lock, flags);
    cgroup_rstat_flush_locked(cgrp, false);
    spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
 * paired with cgroup_rstat_flush_release().
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
    __acquires(&cgroup_rstat_lock)
{
    might_sleep();
    spin_lock_irq(&cgroup_rstat_lock);
    cgroup_rstat_flush_locked(cgrp, true);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 */
void cgroup_rstat_flush_release(void)
    __releases(&cgroup_rstat_lock)
{
    spin_unlock_irq(&cgroup_rstat_lock);
}
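
/*
 * Typical hold/release usage (cgroup_base_stat_cputime_show() below is the
 * in-file user): flush once, read the now-stable ->bstat fields while
 * cgroup_rstat_lock is held, then release.
 *
 *     cgroup_rstat_flush_hold(cgrp);
 *     usage = cgrp->bstat.cputime.sum_exec_runtime;
 *     cgroup_rstat_flush_release();
 */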

int cgroup_rstat_init(struct cgroup *cgrp)
{
    int cpu;

    /* the root cgrp has rstat_cpu preallocated */
    if (!cgrp->rstat_cpu) {
        cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
        if (!cgrp->rstat_cpu)
            return -ENOMEM;
    }

    /* ->updated_children list is self terminated */
    for_each_possible_cpu(cpu) {
        struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

        rstatc->updated_children = cgrp;
        u64_stats_init(&rstatc->bsync);
    }

    return 0;
}
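
/*
 * After init, every per-cpu node is in the "empty" state the rest of this
 * file relies on:
 *
 *     rstatc->updated_children == cgrp    (self-terminated child list)
 *     rstatc->updated_next     == NULL    (not linked into the parent)
 *
 * cgroup_rstat_updated() uses the NULL updated_next as its "not on list"
 * test, and cgroup_rstat_exit() below warns if either invariant is broken
 * at teardown time.
 */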

void cgroup_rstat_exit(struct cgroup *cgrp)
{
    int cpu;

    cgroup_rstat_flush(cgrp);

    /* sanity check */
    for_each_possible_cpu(cpu) {
        struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

        if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
            WARN_ON_ONCE(rstatc->updated_next))
            return;
    }

    free_percpu(cgrp->rstat_cpu);
    cgrp->rstat_cpu = NULL;
}

void __init cgroup_rstat_boot(void)
{
    int cpu;

    for_each_possible_cpu(cpu)
        raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
    dst_bstat->cputime.utime += src_bstat->cputime.utime;
    dst_bstat->cputime.stime += src_bstat->cputime.stime;
    dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
    dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
    dst_bstat->cputime.utime -= src_bstat->cputime.utime;
    dst_bstat->cputime.stime -= src_bstat->cputime.stime;
    dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
    dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
    struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
    struct cgroup *parent = cgroup_parent(cgrp);
    struct cgroup_base_stat delta;
    unsigned seq;

    /* Root-level stats are sourced from system-wide CPU stats */
    if (!parent)
        return;

    /* fetch the current per-cpu values */
    do {
        seq = __u64_stats_fetch_begin(&rstatc->bsync);
        delta = rstatc->bstat;
    } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

    /* propagate percpu delta to global */
    cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
    cgroup_base_stat_add(&cgrp->bstat, &delta);
    cgroup_base_stat_add(&rstatc->last_bstat, &delta);

    /* propagate global delta to parent (unless that's root) */
    if (cgroup_parent(parent)) {
        delta = cgrp->bstat;
        cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
        cgroup_base_stat_add(&parent->bstat, &delta);
        cgroup_base_stat_add(&cgrp->last_bstat, &delta);
    }
}
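
/*
 * Worked example of the delta propagation above: suppose on @cpu the
 * accounting hooks have pushed rstatc->bstat.cputime.stime to 700 while
 * rstatc->last_bstat.cputime.stime still holds 500 from the previous flush.
 *
 *     delta.stime                  = 700 - 500 = 200
 *     cgrp->bstat.cputime.stime   += 200     (global counter catches up)
 *     rstatc->last_bstat          += delta   (per-cpu baseline moves to 700)
 *
 * The same subtract-against-last trick is then repeated one level up via
 * cgrp->last_bstat, so the parent only ever receives the increment since
 * the child's previous flush.
 */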

static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
    struct cgroup_rstat_cpu *rstatc;

    rstatc = get_cpu_ptr(cgrp->rstat_cpu);
    *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
    return rstatc;
}

static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
                                                 struct cgroup_rstat_cpu *rstatc,
                                                 unsigned long flags)
{
    u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
    cgroup_rstat_updated(cgrp, smp_processor_id());
    put_cpu_ptr(rstatc);
}

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
    struct cgroup_rstat_cpu *rstatc;
    unsigned long flags;

    rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
    rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
    cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

void __cgroup_account_cputime_field(struct cgroup *cgrp,
                                    enum cpu_usage_stat index, u64 delta_exec)
{
    struct cgroup_rstat_cpu *rstatc;
    unsigned long flags;

    rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);

    switch (index) {
    case CPUTIME_USER:
    case CPUTIME_NICE:
        rstatc->bstat.cputime.utime += delta_exec;
        break;
    case CPUTIME_SYSTEM:
    case CPUTIME_IRQ:
    case CPUTIME_SOFTIRQ:
        rstatc->bstat.cputime.stime += delta_exec;
        break;
#ifdef CONFIG_SCHED_CORE
    case CPUTIME_FORCEIDLE:
        rstatc->bstat.forceidle_sum += delta_exec;
        break;
#endif
    default:
        break;
    }

    cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
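
/*
 * Example (illustrative sketch): attributing 1 ms of softirq time to a
 * cgroup.  CPUTIME_SOFTIRQ lands in the stime bucket above, so after a
 * flush it is reported as system time.
 *
 *     __cgroup_account_cputime_field(cgrp, CPUTIME_SOFTIRQ, NSEC_PER_MSEC);
 *
 * The begin/end helpers pick the local cpu's bstat via get_cpu_ptr() and
 * mark the cgroup updated, so callers only supply the delta in nanoseconds.
 */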

/*
 * compute the cputime for the root cgroup by getting the per cpu data
 * at a global level, then categorizing the fields in a manner consistent
 * with how it is done by __cgroup_account_cputime_field for each bit of
 * cpu time attributed to a cgroup.
 */
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
    struct task_cputime *cputime = &bstat->cputime;
    int i;

    /*
     * Zero the whole bstat (not just the cputime fields) so that
     * forceidle_sum below doesn't accumulate onto an uninitialized
     * caller-provided value.
     */
    memset(bstat, 0, sizeof(*bstat));
    for_each_possible_cpu(i) {
        struct kernel_cpustat kcpustat;
        u64 *cpustat = kcpustat.cpustat;
        u64 user = 0;
        u64 sys = 0;

        kcpustat_cpu_fetch(&kcpustat, i);

        user += cpustat[CPUTIME_USER];
        user += cpustat[CPUTIME_NICE];
        cputime->utime += user;

        sys += cpustat[CPUTIME_SYSTEM];
        sys += cpustat[CPUTIME_IRQ];
        sys += cpustat[CPUTIME_SOFTIRQ];
        cputime->stime += sys;

        cputime->sum_exec_runtime += user;
        cputime->sum_exec_runtime += sys;
        cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];

#ifdef CONFIG_SCHED_CORE
        bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
    }
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
    struct cgroup *cgrp = seq_css(seq)->cgroup;
    u64 usage, utime, stime;
    struct cgroup_base_stat bstat;
#ifdef CONFIG_SCHED_CORE
    u64 forceidle_time;
#endif

    if (cgroup_parent(cgrp)) {
        cgroup_rstat_flush_hold(cgrp);
        usage = cgrp->bstat.cputime.sum_exec_runtime;
        cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
                       &utime, &stime);
#ifdef CONFIG_SCHED_CORE
        forceidle_time = cgrp->bstat.forceidle_sum;
#endif
        cgroup_rstat_flush_release();
    } else {
        root_cgroup_cputime(&bstat);
        usage = bstat.cputime.sum_exec_runtime;
        utime = bstat.cputime.utime;
        stime = bstat.cputime.stime;
#ifdef CONFIG_SCHED_CORE
        forceidle_time = bstat.forceidle_sum;
#endif
    }

    do_div(usage, NSEC_PER_USEC);
    do_div(utime, NSEC_PER_USEC);
    do_div(stime, NSEC_PER_USEC);
#ifdef CONFIG_SCHED_CORE
    do_div(forceidle_time, NSEC_PER_USEC);
#endif

    seq_printf(seq, "usage_usec %llu\n"
               "user_usec %llu\n"
               "system_usec %llu\n",
               usage, utime, stime);

#ifdef CONFIG_SCHED_CORE
    seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}
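
/*
 * The resulting cpu.stat block has one key per line, values in
 * microseconds:
 *
 *     usage_usec    <sum_exec_runtime / 1000>
 *     user_usec     <utime / 1000>
 *     system_usec   <stime / 1000>
 *     core_sched.force_idle_usec <forceidle_sum / 1000>   (CONFIG_SCHED_CORE only)
 */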