// SPDX-License-Identifier: GPL-2.0-only
/*
 * latencytop.c: Latency display infrastructure
 *
 * (C) Copyright 2008 Intel Corporation
 * Author: Arjan van de Ven <arjan@linux.intel.com>
 */

/*
 * CONFIG_LATENCYTOP enables a kernel latency tracking infrastructure that is
 * used by the "latencytop" userspace tool. The latency that is tracked is not
 * the 'traditional' interrupt latency (which is primarily caused by something
 * else consuming CPU), but instead it is the latency an application encounters
 * because the kernel sleeps on its behalf for various reasons.
 *
 * This code tracks 2 levels of statistics:
 * 1) System level latency
 * 2) Per process latency
 *
 * The latency is stored in fixed-size data structures in an accumulated form;
 * if the "same" latency cause is hit twice, this will be tracked as one entry
 * in the data structure. The count, the total accumulated latency and the
 * maximum latency are all tracked in this data structure. When the fixed-size
 * structure is full, no new causes are tracked until the buffer is flushed by
 * writing to the /proc file; the userspace tool does this on a regular basis.
 *
 * A latency cause is identified by a stringified backtrace at the point that
 * the scheduler gets invoked. The userland tool will use this string to
 * identify the cause of the latency in human readable form.
 *
 * The information is exported via /proc/latency_stats and /proc/<pid>/latency.
 * These files look like this:
 *
 * Latency Top version : v0.1
 * 70 59433 4897 i915_irq_wait drm_ioctl vfs_ioctl do_vfs_ioctl sys_ioctl
 * |    |    |    |
 * |    |    |    +----> the stringified backtrace
 * |    |    +---------> The maximum latency for this entry in microseconds
 * |    +--------------> The accumulated latency for this entry (microseconds)
 * +-------------------> The number of times this entry is hit
 *
 * (note: the average latency is the accumulated latency divided by the number
 * of times)
 */
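
/*
 * Illustrative usage (not part of this file): the userspace tool
 * typically samples the system-wide stats and then flushes the table,
 * e.g.:
 *
 *	cat /proc/latency_stats
 *	echo > /proc/latency_stats	# any write clears the table
 */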

#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/spinlock.h>
#include <linux/proc_fs.h>
#include <linux/latencytop.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/sched/stat.h>
#include <linux/list.h>
#include <linux/stacktrace.h>
#include <linux/sysctl.h>

/* Protects both the global table and the per-task records: */
static DEFINE_RAW_SPINLOCK(latency_lock);

/* System-wide latency table; fixed size, flushed by userspace: */
#define MAXLR 128
static struct latency_record latency_record[MAXLR];

/* Toggled via the kernel.latencytop sysctl: */
int latencytop_enabled;

#ifdef CONFIG_SYSCTL
/*
 * Sysctl handler: latency accounting depends on schedstats, so force
 * schedstats on whenever latencytop gets enabled.
 */
static int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
        size_t *lenp, loff_t *ppos)
{
    int err;

    err = proc_dointvec(table, write, buffer, lenp, ppos);
    if (latencytop_enabled)
        force_schedstat_enabled();

    return err;
}

static struct ctl_table latencytop_sysctl[] = {
    {
        .procname   = "latencytop",
        .data       = &latencytop_enabled,
        .maxlen     = sizeof(int),
        .mode       = 0644,
        .proc_handler   = sysctl_latencytop,
    },
    {}
};
#endif
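
/*
 * Illustrative: tracking is off by default and is switched on at
 * runtime, e.g.:
 *
 *	sysctl kernel.latencytop=1
 *
 * which, via the handler above, also force-enables schedstats.
 */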
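
/*
 * Reset a task's latency records so its statistics start from a clean
 * slate (e.g. for a freshly forked task):
 */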
void clear_tsk_latency_tracing(struct task_struct *p)
{
    unsigned long flags;

    raw_spin_lock_irqsave(&latency_lock, flags);
    memset(&p->latency_record, 0, sizeof(p->latency_record));
    p->latency_record_count = 0;
    raw_spin_unlock_irqrestore(&latency_lock, flags);
}
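
/*
 * Wipe the system-wide table; triggered by a write to
 * /proc/latency_stats (see lstats_write() below):
 */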
static void clear_global_latency_tracing(void)
{
    unsigned long flags;

    raw_spin_lock_irqsave(&latency_lock, flags);
    memset(&latency_record, 0, sizeof(latency_record));
    raw_spin_unlock_irqrestore(&latency_lock, flags);
}
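
/*
 * Fold one latency record into the global table: look for an existing
 * entry with an identical backtrace and accumulate into it, otherwise
 * copy the record into the first free slot. If the table is full, the
 * record is dropped until userspace flushes the table. Caller must
 * hold latency_lock.
 */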
static void __sched
account_global_scheduler_latency(struct task_struct *tsk,
                 struct latency_record *lat)
{
    int firstnonnull = MAXLR + 1;
    int i;

    /* skip kernel threads for now */
    if (!tsk->mm)
        return;

    for (i = 0; i < MAXLR; i++) {
        int q, same = 1;

        /* Nothing stored: */
        if (!latency_record[i].backtrace[0]) {
            if (firstnonnull > i)
                firstnonnull = i;
            continue;
        }
        for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
            unsigned long record = lat->backtrace[q];

            if (latency_record[i].backtrace[q] != record) {
                same = 0;
                break;
            }

            /* 0 entry marks end of backtrace: */
            if (!record)
                break;
        }
        if (same) {
            latency_record[i].count++;
            latency_record[i].time += lat->time;
            if (lat->time > latency_record[i].max)
                latency_record[i].max = lat->time;
            return;
        }
    }

    i = firstnonnull;
    if (i >= MAXLR - 1)
        return;

    /* Allocated a new one: */
    memcpy(&latency_record[i], lat, sizeof(struct latency_record));
}

/**
 * __account_scheduler_latency - record an observed latency
 * @tsk: the task struct of the task hitting the latency
 * @usecs: the duration of the latency in microseconds
 * @inter: 1 if the sleep was interruptible, 0 if uninterruptible
 *
 * This function is the main entry point for recording latency entries
 * as called by the scheduler.
 *
 * This function has a few special cases to deal with normal 'non-latency'
 * sleeps: specifically, interruptible sleep longer than 5 msec is skipped
 * since this usually is caused by waiting for events via select() and co.
 *
 * Negative latencies (caused by time going backwards) are also explicitly
 * skipped.
 */
void __sched
__account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
{
    unsigned long flags;
    int i, q;
    struct latency_record lat;

    /* Long interruptible waits are generally user requested... */
    if (inter && usecs > 5000)
        return;

    /* Negative sleeps are time going backwards */
    /* Zero-time sleeps are non-interesting */
    if (usecs <= 0)
        return;

    memset(&lat, 0, sizeof(lat));
    lat.count = 1;
    lat.time = usecs;
    lat.max = usecs;

    stack_trace_save_tsk(tsk, lat.backtrace, LT_BACKTRACEDEPTH, 0);

    raw_spin_lock_irqsave(&latency_lock, flags);

    account_global_scheduler_latency(tsk, &lat);

    for (i = 0; i < tsk->latency_record_count; i++) {
        struct latency_record *mylat;
        int same = 1;

        mylat = &tsk->latency_record[i];
        for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
            unsigned long record = lat.backtrace[q];

            if (mylat->backtrace[q] != record) {
                same = 0;
                break;
            }

            /* 0 entry is end of backtrace */
            if (!record)
                break;
        }
        if (same) {
            mylat->count++;
            mylat->time += lat.time;
            if (lat.time > mylat->max)
                mylat->max = lat.time;
            goto out_unlock;
        }
    }

    /*
     * Short term hack: once we have LT_SAVECOUNT (32) entries we stop;
     * in the future we could recycle old entries instead.
     */
    if (tsk->latency_record_count >= LT_SAVECOUNT)
        goto out_unlock;

    /* Allocated a new one: */
    i = tsk->latency_record_count++;
    memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));

out_unlock:
    raw_spin_unlock_irqrestore(&latency_lock, flags);
}
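
/*
 * seq_file show routine for /proc/latency_stats: one line per populated
 * entry, in the format documented at the top of this file:
 */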
static int lstats_show(struct seq_file *m, void *v)
{
    int i;

    seq_puts(m, "Latency Top version : v0.1\n");

    for (i = 0; i < MAXLR; i++) {
        struct latency_record *lr = &latency_record[i];

        if (lr->backtrace[0]) {
            int q;

            seq_printf(m, "%i %lu %lu",
                   lr->count, lr->time, lr->max);
            for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
                unsigned long bt = lr->backtrace[q];

                if (!bt)
                    break;

                seq_printf(m, " %ps", (void *)bt);
            }
            seq_puts(m, "\n");
        }
    }
    return 0;
}
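
/*
 * Any write to /proc/latency_stats flushes the global table; the
 * written bytes themselves are ignored:
 */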
static ssize_t
lstats_write(struct file *file, const char __user *buf, size_t count,
         loff_t *offs)
{
    clear_global_latency_tracing();

    return count;
}

static int lstats_open(struct inode *inode, struct file *filp)
{
    return single_open(filp, lstats_show, NULL);
}

static const struct proc_ops lstats_proc_ops = {
    .proc_open  = lstats_open,
    .proc_read  = seq_read,
    .proc_write = lstats_write,
    .proc_lseek = seq_lseek,
    .proc_release   = single_release,
};
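
/*
 * Create /proc/latency_stats and register the kernel.latencytop sysctl
 * at initcall time:
 */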
static int __init init_lstats_procfs(void)
{
    proc_create("latency_stats", 0644, NULL, &lstats_proc_ops);
#ifdef CONFIG_SYSCTL
    register_sysctl_init("kernel", latencytop_sysctl);
#endif
    return 0;
}
device_initcall(init_lstats_procfs);