the-tree/kernel/profile.c

0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  *  linux/kernel/profile.c
0004  *  Simple profiling. Manages a direct-mapped profile hit count buffer,
0005  *  with configurable resolution, support for restricting the cpus on
0006  *  which profiling is done, and switching between cpu time and
0007  *  schedule() calls via kernel command line parameters passed at boot.
0008  *
0009  *  Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
0010  *  Red Hat, July 2004
0011  *  Consolidation of architecture support code for profiling,
0012  *  Nadia Yvette Chambers, Oracle, July 2004
0013  *  Amortized hit count accounting via per-cpu open-addressed hashtables
0014  *  to resolve timer interrupt livelocks, Nadia Yvette Chambers,
0015  *  Oracle, 2004
0016  */
0017
0018 #include <linux/export.h>
0019 #include <linux/profile.h>
0020 #include <linux/memblock.h>
0021 #include <linux/notifier.h>
0022 #include <linux/mm.h>
0023 #include <linux/cpumask.h>
0024 #include <linux/cpu.h>
0025 #include <linux/highmem.h>
0026 #include <linux/mutex.h>
0027 #include <linux/slab.h>
0028 #include <linux/vmalloc.h>
0029 #include <linux/sched/stat.h>
0030
0031 #include <asm/sections.h>
0032 #include <asm/irq_regs.h>
0033 #include <asm/ptrace.h>
0034
0035 struct profile_hit {
0036     u32 pc, hits;
0037 };
0038 #define PROFILE_GRPSHIFT    3
0039 #define PROFILE_GRPSZ       (1 << PROFILE_GRPSHIFT)
0040 #define NR_PROFILE_HIT      (PAGE_SIZE/sizeof(struct profile_hit))
0041 #define NR_PROFILE_GRP      (NR_PROFILE_HIT/PROFILE_GRPSZ)
0042
0043 static atomic_t *prof_buffer;
0044 static unsigned long prof_len;
0045 static unsigned short int prof_shift;
0046
0047 int prof_on __read_mostly;
0048 EXPORT_SYMBOL_GPL(prof_on);
0049
0050 static cpumask_var_t prof_cpu_mask;
0051 #if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
0052 static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
0053 static DEFINE_PER_CPU(int, cpu_profile_flip);
0054 static DEFINE_MUTEX(profile_flip_mutex);
0055 #endif /* CONFIG_SMP */
0056
0057 int profile_setup(char *str)
0058 {
0059     static const char schedstr[] = "schedule";
0060     static const char sleepstr[] = "sleep";
0061     static const char kvmstr[] = "kvm";
0062     int par;
0063
0064     if (!strncmp(str, sleepstr, strlen(sleepstr))) {
0065 #ifdef CONFIG_SCHEDSTATS
0066         force_schedstat_enabled();
0067         prof_on = SLEEP_PROFILING;
0068         if (str[strlen(sleepstr)] == ',')
0069             str += strlen(sleepstr) + 1;
0070         if (get_option(&str, &par))
0071             prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
0072         pr_info("kernel sleep profiling enabled (shift: %u)\n",
0073             prof_shift);
0074 #else
0075         pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
0076 #endif /* CONFIG_SCHEDSTATS */
0077     } else if (!strncmp(str, schedstr, strlen(schedstr))) {
0078         prof_on = SCHED_PROFILING;
0079         if (str[strlen(schedstr)] == ',')
0080             str += strlen(schedstr) + 1;
0081         if (get_option(&str, &par))
0082             prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
0083         pr_info("kernel schedule profiling enabled (shift: %u)\n",
0084             prof_shift);
0085     } else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
0086         prof_on = KVM_PROFILING;
0087         if (str[strlen(kvmstr)] == ',')
0088             str += strlen(kvmstr) + 1;
0089         if (get_option(&str, &par))
0090             prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
0091         pr_info("kernel KVM profiling enabled (shift: %u)\n",
0092             prof_shift);
0093     } else if (get_option(&str, &par)) {
0094         prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
0095         prof_on = CPU_PROFILING;
0096         pr_info("kernel profiling enabled (shift: %u)\n",
0097             prof_shift);
0098     }
0099     return 1;
0100 }
0101 __setup("profile=", profile_setup);
0102
0103
0104 int __ref profile_init(void)
0105 {
0106     int buffer_bytes;
0107     if (!prof_on)
0108         return 0;
0109
0110     /* only text is profiled */
0111     prof_len = (_etext - _stext) >> prof_shift;
0112
0113     if (!prof_len) {
0114         pr_warn("profiling shift: %u too large\n", prof_shift);
0115         prof_on = 0;
0116         return -EINVAL;
0117     }
0118
0119     buffer_bytes = prof_len*sizeof(atomic_t);
0120
0121     if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
0122         return -ENOMEM;
0123
0124     cpumask_copy(prof_cpu_mask, cpu_possible_mask);
0125
0126     prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
0127     if (prof_buffer)
0128         return 0;
0129
0130     prof_buffer = alloc_pages_exact(buffer_bytes,
0131                     GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
0132     if (prof_buffer)
0133         return 0;
0134
0135     prof_buffer = vzalloc(buffer_bytes);
0136     if (prof_buffer)
0137         return 0;
0138
0139     free_cpumask_var(prof_cpu_mask);
0140     return -ENOMEM;
0141 }
0142
0143 #if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
0144 /*
0145  * Each cpu has a pair of open-addressed hashtables for pending
0146  * profile hits. read_profile() IPI's all cpus to request them
0147  * to flip buffers and flushes their contents to prof_buffer itself.
0148  * Flip requests are serialized by the profile_flip_mutex. The sole
0149  * use of having a second hashtable is for avoiding cacheline
0150  * contention that would otherwise happen during flushes of pending
0151  * profile hits required for the accuracy of reported profile hits
0152  * and so resurrect the interrupt livelock issue.
0153  *
0154  * The open-addressed hashtables are indexed by profile buffer slot
0155  * and hold the number of pending hits to that profile buffer slot on
0156  * a cpu in an entry. When the hashtable overflows, all pending hits
0157  * are accounted to their corresponding profile buffer slots with
0158  * atomic_add() and the hashtable emptied. As numerous pending hits
0159  * may be accounted to a profile buffer slot in a hashtable entry,
0160  * this amortizes a number of atomic profile buffer increments likely
0161  * to be far larger than the number of entries in the hashtable,
0162  * particularly given that the number of distinct profile buffer
0163  * positions to which hits are accounted during short intervals (e.g.
0164  * several seconds) is usually very small. Exclusion from buffer
0165  * flipping is provided by interrupt disablement (note that for
0166  * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
0167  * process context).
0168  * The hash function is meant to be lightweight as opposed to strong,
0169  * and was vaguely inspired by ppc64 firmware-supported inverted
0170  * pagetable hash functions, but uses a full hashtable full of finite
0171  * collision chains, not just pairs of them.
0172  *
0173  * -- nyc
0174  */
0175 static void __profile_flip_buffers(void *unused)
0176 {
0177     int cpu = smp_processor_id();
0178
0179     per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
0180 }
0181
0182 static void profile_flip_buffers(void)
0183 {
0184     int i, j, cpu;
0185
0186     mutex_lock(&profile_flip_mutex);
0187     j = per_cpu(cpu_profile_flip, get_cpu());
0188     put_cpu();
0189     on_each_cpu(__profile_flip_buffers, NULL, 1);
0190     for_each_online_cpu(cpu) {
0191         struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
0192         for (i = 0; i < NR_PROFILE_HIT; ++i) {
0193             if (!hits[i].hits) {
0194                 if (hits[i].pc)
0195                     hits[i].pc = 0;
0196                 continue;
0197             }
0198             atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
0199             hits[i].hits = hits[i].pc = 0;
0200         }
0201     }
0202     mutex_unlock(&profile_flip_mutex);
0203 }
0204
0205 static void profile_discard_flip_buffers(void)
0206 {
0207     int i, cpu;
0208
0209     mutex_lock(&profile_flip_mutex);
0210     i = per_cpu(cpu_profile_flip, get_cpu());
0211     put_cpu();
0212     on_each_cpu(__profile_flip_buffers, NULL, 1);
0213     for_each_online_cpu(cpu) {
0214         struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
0215         memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
0216     }
0217     mutex_unlock(&profile_flip_mutex);
0218 }
0219
0220 static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
0221 {
0222     unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
0223     int i, j, cpu;
0224     struct profile_hit *hits;
0225
0226     pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
0227     i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
0228     secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
0229     cpu = get_cpu();
0230     hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
0231     if (!hits) {
0232         put_cpu();
0233         return;
0234     }
0235     /*
0236      * We buffer the global profiler buffer into a per-CPU
0237      * queue and thus reduce the number of global (and possibly
0238      * NUMA-alien) accesses. The write-queue is self-coalescing:
0239      */
0240     local_irq_save(flags);
0241     do {
0242         for (j = 0; j < PROFILE_GRPSZ; ++j) {
0243             if (hits[i + j].pc == pc) {
0244                 hits[i + j].hits += nr_hits;
0245                 goto out;
0246             } else if (!hits[i + j].hits) {
0247                 hits[i + j].pc = pc;
0248                 hits[i + j].hits = nr_hits;
0249                 goto out;
0250             }
0251         }
0252         i = (i + secondary) & (NR_PROFILE_HIT - 1);
0253     } while (i != primary);
0254
0255     /*
0256      * Add the current hit(s) and flush the write-queue out
0257      * to the global buffer:
0258      */
0259     atomic_add(nr_hits, &prof_buffer[pc]);
0260     for (i = 0; i < NR_PROFILE_HIT; ++i) {
0261         atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
0262         hits[i].pc = hits[i].hits = 0;
0263     }
0264 out:
0265     local_irq_restore(flags);
0266     put_cpu();
0267 }
0268
0269 static int profile_dead_cpu(unsigned int cpu)
0270 {
0271     struct page *page;
0272     int i;
0273
0274     if (cpumask_available(prof_cpu_mask))
0275         cpumask_clear_cpu(cpu, prof_cpu_mask);
0276
0277     for (i = 0; i < 2; i++) {
0278         if (per_cpu(cpu_profile_hits, cpu)[i]) {
0279             page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]);
0280             per_cpu(cpu_profile_hits, cpu)[i] = NULL;
0281             __free_page(page);
0282         }
0283     }
0284     return 0;
0285 }
0286
0287 static int profile_prepare_cpu(unsigned int cpu)
0288 {
0289     int i, node = cpu_to_mem(cpu);
0290     struct page *page;
0291
0292     per_cpu(cpu_profile_flip, cpu) = 0;
0293
0294     for (i = 0; i < 2; i++) {
0295         if (per_cpu(cpu_profile_hits, cpu)[i])
0296             continue;
0297
0298         page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
0299         if (!page) {
0300             profile_dead_cpu(cpu);
0301             return -ENOMEM;
0302         }
0303         per_cpu(cpu_profile_hits, cpu)[i] = page_address(page);
0304
0305     }
0306     return 0;
0307 }
0308
0309 static int profile_online_cpu(unsigned int cpu)
0310 {
0311     if (cpumask_available(prof_cpu_mask))
0312         cpumask_set_cpu(cpu, prof_cpu_mask);
0313
0314     return 0;
0315 }
0316
0317 #else /* !CONFIG_SMP */
0318 #define profile_flip_buffers()      do { } while (0)
0319 #define profile_discard_flip_buffers()  do { } while (0)
0320
0321 static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
0322 {
0323     unsigned long pc;
0324     pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
0325     atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
0326 }
0327 #endif /* !CONFIG_SMP */
0328
0329 void profile_hits(int type, void *__pc, unsigned int nr_hits)
0330 {
0331     if (prof_on != type || !prof_buffer)
0332         return;
0333     do_profile_hits(type, __pc, nr_hits);
0334 }
0335 EXPORT_SYMBOL_GPL(profile_hits);
0336
0337 void profile_tick(int type)
0338 {
0339     struct pt_regs *regs = get_irq_regs();
0340
0341     if (!user_mode(regs) && cpumask_available(prof_cpu_mask) &&
0342         cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
0343         profile_hit(type, (void *)profile_pc(regs));
0344 }
0345
0346 #ifdef CONFIG_PROC_FS
0347 #include <linux/proc_fs.h>
0348 #include <linux/seq_file.h>
0349 #include <linux/uaccess.h>
0350
0351 static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
0352 {
0353     seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask));
0354     return 0;
0355 }
0356
0357 static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
0358 {
0359     return single_open(file, prof_cpu_mask_proc_show, NULL);
0360 }
0361
0362 static ssize_t prof_cpu_mask_proc_write(struct file *file,
0363     const char __user *buffer, size_t count, loff_t *pos)
0364 {
0365     cpumask_var_t new_value;
0366     int err;
0367
0368     if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
0369         return -ENOMEM;
0370
0371     err = cpumask_parse_user(buffer, count, new_value);
0372     if (!err) {
0373         cpumask_copy(prof_cpu_mask, new_value);
0374         err = count;
0375     }
0376     free_cpumask_var(new_value);
0377     return err;
0378 }
0379
0380 static const struct proc_ops prof_cpu_mask_proc_ops = {
0381     .proc_open  = prof_cpu_mask_proc_open,
0382     .proc_read  = seq_read,
0383     .proc_lseek = seq_lseek,
0384     .proc_release   = single_release,
0385     .proc_write = prof_cpu_mask_proc_write,
0386 };
0387
0388 void create_prof_cpu_mask(void)
0389 {
0390     /* create /proc/irq/prof_cpu_mask */
0391     proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_ops);
0392 }
0393
0394 /*
0395  * This function accesses profiling information. The returned data is
0396  * binary: the sampling step and the actual contents of the profile
0397  * buffer. Use of the program readprofile is recommended in order to
0398  * get meaningful info out of these data.
0399  */
0400 static ssize_t
0401 read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
0402 {
0403     unsigned long p = *ppos;
0404     ssize_t read;
0405     char *pnt;
0406     unsigned long sample_step = 1UL << prof_shift;
0407
0408     profile_flip_buffers();
0409     if (p >= (prof_len+1)*sizeof(unsigned int))
0410         return 0;
0411     if (count > (prof_len+1)*sizeof(unsigned int) - p)
0412         count = (prof_len+1)*sizeof(unsigned int) - p;
0413     read = 0;
0414
0415     while (p < sizeof(unsigned int) && count > 0) {
0416         if (put_user(*((char *)(&sample_step)+p), buf))
0417             return -EFAULT;
0418         buf++; p++; count--; read++;
0419     }
0420     pnt = (char *)prof_buffer + p - sizeof(atomic_t);
0421     if (copy_to_user(buf, (void *)pnt, count))
0422         return -EFAULT;
0423     read += count;
0424     *ppos += read;
0425     return read;
0426 }
0427
0428 /* default is to not implement this call */
0429 int __weak setup_profiling_timer(unsigned mult)
0430 {
0431     return -EINVAL;
0432 }
0433
0434 /*
0435  * Writing to /proc/profile resets the counters
0436  *
0437  * Writing a 'profiling multiplier' value into it also re-sets the profiling
0438  * interrupt frequency, on architectures that support this.
0439  */
0440 static ssize_t write_profile(struct file *file, const char __user *buf,
0441                  size_t count, loff_t *ppos)
0442 {
0443 #ifdef CONFIG_SMP
0444     if (count == sizeof(int)) {
0445         unsigned int multiplier;
0446
0447         if (copy_from_user(&multiplier, buf, sizeof(int)))
0448             return -EFAULT;
0449
0450         if (setup_profiling_timer(multiplier))
0451             return -EINVAL;
0452     }
0453 #endif
0454     profile_discard_flip_buffers();
0455     memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
0456     return count;
0457 }
0458
0459 static const struct proc_ops profile_proc_ops = {
0460     .proc_read  = read_profile,
0461     .proc_write = write_profile,
0462     .proc_lseek = default_llseek,
0463 };
0464
0465 int __ref create_proc_profile(void)
0466 {
0467     struct proc_dir_entry *entry;
0468 #ifdef CONFIG_SMP
0469     enum cpuhp_state online_state;
0470 #endif
0471
0472     int err = 0;
0473
0474     if (!prof_on)
0475         return 0;
0476 #ifdef CONFIG_SMP
0477     err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
0478                 profile_prepare_cpu, profile_dead_cpu);
0479     if (err)
0480         return err;
0481
0482     err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE",
0483                 profile_online_cpu, NULL);
0484     if (err < 0)
0485         goto err_state_prep;
0486     online_state = err;
0487     err = 0;
0488 #endif
0489     entry = proc_create("profile", S_IWUSR | S_IRUGO,
0490                 NULL, &profile_proc_ops);
0491     if (!entry)
0492         goto err_state_onl;
0493     proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
0494
0495     return err;
0496 err_state_onl:
0497 #ifdef CONFIG_SMP
0498     cpuhp_remove_state(online_state);
0499 err_state_prep:
0500     cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
0501 #endif
0502     return err;
0503 }
0504 subsys_initcall(create_proc_profile);
0505 #endif /* CONFIG_PROC_FS */