/*
 *  linux/kernel/profile.c
 *  Simple profiling. Manages a direct-mapped profile hit count buffer,
 *  with configurable resolution, support for restricting the cpus on
 *  which profiling is done, and switching between cpu time and
 *  schedule() calls via kernel command line parameters passed at boot.
 *
 *  Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
 *  Red Hat, July 2004
 *  Consolidation of architecture support code for profiling,
 *  Nadia Yvette Chambers, Oracle, July 2004
 *  Amortized hit count accounting via per-cpu open-addressed hashtables
 *  to resolve timer interrupt livelocks, Nadia Yvette Chambers,
 *  Oracle, 2004
 */

#include <linux/export.h>
#include <linux/profile.h>
#include <linux/bootmem.h>
#include <linux/notifier.h>
#include <linux/mm.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <asm/sections.h>
#include <asm/irq_regs.h>
#include <asm/ptrace.h>

struct profile_hit {
    u32 pc, hits;
};
#define PROFILE_GRPSHIFT    3
#define PROFILE_GRPSZ       (1 << PROFILE_GRPSHIFT)
#define NR_PROFILE_HIT      (PAGE_SIZE/sizeof(struct profile_hit))
#define NR_PROFILE_GRP      (NR_PROFILE_HIT/PROFILE_GRPSZ)

static atomic_t *prof_buffer;
static unsigned long prof_len, prof_shift;

int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);

static cpumask_var_t prof_cpu_mask;
#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
static DEFINE_PER_CPU(int, cpu_profile_flip);
static DEFINE_MUTEX(profile_flip_mutex);
#endif /* CONFIG_SMP */

int profile_setup(char *str)
{
    static const char schedstr[] = "schedule";
    static const char sleepstr[] = "sleep";
    static const char kvmstr[] = "kvm";
    int par;

    if (!strncmp(str, sleepstr, strlen(sleepstr))) {
#ifdef CONFIG_SCHEDSTATS
        force_schedstat_enabled();
        prof_on = SLEEP_PROFILING;
        if (str[strlen(sleepstr)] == ',')
            str += strlen(sleepstr) + 1;
        if (get_option(&str, &par))
            prof_shift = par;
        pr_info("kernel sleep profiling enabled (shift: %ld)\n",
            prof_shift);
#else
        pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
#endif /* CONFIG_SCHEDSTATS */
    } else if (!strncmp(str, schedstr, strlen(schedstr))) {
        prof_on = SCHED_PROFILING;
        if (str[strlen(schedstr)] == ',')
            str += strlen(schedstr) + 1;
        if (get_option(&str, &par))
            prof_shift = par;
        pr_info("kernel schedule profiling enabled (shift: %ld)\n",
            prof_shift);
    } else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
        prof_on = KVM_PROFILING;
        if (str[strlen(kvmstr)] == ',')
            str += strlen(kvmstr) + 1;
        if (get_option(&str, &par))
            prof_shift = par;
        pr_info("kernel KVM profiling enabled (shift: %ld)\n",
            prof_shift);
    } else if (get_option(&str, &par)) {
        prof_shift = par;
        prof_on = CPU_PROFILING;
        pr_info("kernel profiling enabled (shift: %ld)\n",
            prof_shift);
    }
    return 1;
}
__setup("profile=", profile_setup);
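
/*
 * For example (the concrete values here are chosen purely for
 * illustration), booting with "profile=2" enables plain cpu-time
 * profiling with a shift of 2, while "profile=schedule,5",
 * "profile=sleep,3" (CONFIG_SCHEDSTATS only) and "profile=kvm,4"
 * select the schedule(), sleep and KVM variants with their
 * respective shifts.
 */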


int __ref profile_init(void)
{
    int buffer_bytes;
    if (!prof_on)
        return 0;

    /* only text is profiled */
    prof_len = (_etext - _stext) >> prof_shift;
    buffer_bytes = prof_len*sizeof(atomic_t);

    if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
        return -ENOMEM;

    cpumask_copy(prof_cpu_mask, cpu_possible_mask);

    prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
    if (prof_buffer)
        return 0;

    prof_buffer = alloc_pages_exact(buffer_bytes,
                    GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
    if (prof_buffer)
        return 0;

    prof_buffer = vzalloc(buffer_bytes);
    if (prof_buffer)
        return 0;

    free_cpumask_var(prof_cpu_mask);
    return -ENOMEM;
}

/* Profile event notifications */

static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
static BLOCKING_NOTIFIER_HEAD(munmap_notifier);

void profile_task_exit(struct task_struct *task)
{
    blocking_notifier_call_chain(&task_exit_notifier, 0, task);
}

int profile_handoff_task(struct task_struct *task)
{
    int ret;
    ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
    return (ret == NOTIFY_OK) ? 1 : 0;
}

void profile_munmap(unsigned long addr)
{
    blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
}

int task_handoff_register(struct notifier_block *n)
{
    return atomic_notifier_chain_register(&task_free_notifier, n);
}
EXPORT_SYMBOL_GPL(task_handoff_register);

int task_handoff_unregister(struct notifier_block *n)
{
    return atomic_notifier_chain_unregister(&task_free_notifier, n);
}
EXPORT_SYMBOL_GPL(task_handoff_unregister);

int profile_event_register(enum profile_type type, struct notifier_block *n)
{
    int err = -EINVAL;

    switch (type) {
    case PROFILE_TASK_EXIT:
        err = blocking_notifier_chain_register(
                &task_exit_notifier, n);
        break;
    case PROFILE_MUNMAP:
        err = blocking_notifier_chain_register(
                &munmap_notifier, n);
        break;
    }

    return err;
}
EXPORT_SYMBOL_GPL(profile_event_register);

int profile_event_unregister(enum profile_type type, struct notifier_block *n)
{
    int err = -EINVAL;

    switch (type) {
    case PROFILE_TASK_EXIT:
        err = blocking_notifier_chain_unregister(
                &task_exit_notifier, n);
        break;
    case PROFILE_MUNMAP:
        err = blocking_notifier_chain_unregister(
                &munmap_notifier, n);
        break;
    }

    return err;
}
EXPORT_SYMBOL_GPL(profile_event_unregister);

#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
/*
 * Each cpu has a pair of open-addressed hashtables for pending
 * profile hits. read_profile() IPI's all cpus to request them
 * to flip buffers and flushes their contents to prof_buffer itself.
 * Flip requests are serialized by the profile_flip_mutex. The sole
 * purpose of having a second hashtable is to avoid the cacheline
 * contention that would otherwise occur during the flushes of
 * pending profile hits required for the accuracy of reported profile
 * hits, and which would thereby resurrect the interrupt livelock
 * issue.
 *
 * The open-addressed hashtables are indexed by profile buffer slot;
 * each entry holds the number of hits pending for that profile
 * buffer slot on a given cpu. When a hashtable overflows, all pending
 * hits are accounted to their corresponding profile buffer slots with
 * atomic_add() and the hashtable is emptied. Since many pending hits
 * may be accumulated in a single hashtable entry, this amortizes a
 * number of atomic profile buffer increments likely to be far larger
 * than the number of entries in the hashtable, particularly given
 * that the number of distinct profile buffer positions to which hits
 * are accounted during short intervals (e.g. several seconds) is
 * usually very small. Exclusion from buffer flipping is provided by
 * interrupt disablement (note that for SCHED_PROFILING or
 * SLEEP_PROFILING profile_hit() may be called from process context).
 *
 * The hash function is meant to be lightweight as opposed to strong,
 * and was vaguely inspired by ppc64 firmware-supported inverted
 * pagetable hash functions; it uses a full hashtable of finite
 * collision chains, however, not just pairs of them.
 *
 * -- nyc
 */
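/*
 * A worked example of the probing below, assuming PAGE_SIZE == 4096
 * purely for illustration (so NR_PROFILE_HIT == 512 entries arranged
 * as 64 groups of PROFILE_GRPSZ == 8): a scaled pc of 291 starts at
 * group offset primary = (291 & 63) << 3 = 280 and steps by
 * secondary = (~(291 << 1) & 63) << 3 = 456 entries, visiting group
 * offsets 280, 224, 168, 112, ... Because pc << 1 is even, its
 * complement is odd, so the stride measured in groups is odd and
 * therefore coprime with the power-of-two group count: every group
 * is probed once before the walk wraps back to the primary group.
 */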
static void __profile_flip_buffers(void *unused)
{
    int cpu = smp_processor_id();

    per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
}

static void profile_flip_buffers(void)
{
    int i, j, cpu;

    mutex_lock(&profile_flip_mutex);
    j = per_cpu(cpu_profile_flip, get_cpu());
    put_cpu();
    on_each_cpu(__profile_flip_buffers, NULL, 1);
    for_each_online_cpu(cpu) {
        struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
        for (i = 0; i < NR_PROFILE_HIT; ++i) {
            if (!hits[i].hits) {
                if (hits[i].pc)
                    hits[i].pc = 0;
                continue;
            }
            atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
            hits[i].hits = hits[i].pc = 0;
        }
    }
    mutex_unlock(&profile_flip_mutex);
}

static void profile_discard_flip_buffers(void)
{
    int i, cpu;

    mutex_lock(&profile_flip_mutex);
    i = per_cpu(cpu_profile_flip, get_cpu());
    put_cpu();
    on_each_cpu(__profile_flip_buffers, NULL, 1);
    for_each_online_cpu(cpu) {
        struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
        memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
    }
    mutex_unlock(&profile_flip_mutex);
}

static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
    unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
    int i, j, cpu;
    struct profile_hit *hits;

    pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
    i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
    secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
    cpu = get_cpu();
    hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
    if (!hits) {
        put_cpu();
        return;
    }
    /*
     * We buffer the global profiler buffer into a per-CPU
     * queue and thus reduce the number of global (and possibly
     * NUMA-alien) accesses. The write-queue is self-coalescing:
     */
    local_irq_save(flags);
    do {
        for (j = 0; j < PROFILE_GRPSZ; ++j) {
            if (hits[i + j].pc == pc) {
                hits[i + j].hits += nr_hits;
                goto out;
            } else if (!hits[i + j].hits) {
                hits[i + j].pc = pc;
                hits[i + j].hits = nr_hits;
                goto out;
            }
        }
        i = (i + secondary) & (NR_PROFILE_HIT - 1);
    } while (i != primary);

    /*
     * Add the current hit(s) and flush the write-queue out
     * to the global buffer:
     */
    atomic_add(nr_hits, &prof_buffer[pc]);
    for (i = 0; i < NR_PROFILE_HIT; ++i) {
        atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
        hits[i].pc = hits[i].hits = 0;
    }
out:
    local_irq_restore(flags);
    put_cpu();
}

static int profile_dead_cpu(unsigned int cpu)
{
    struct page *page;
    int i;

    if (prof_cpu_mask != NULL)
        cpumask_clear_cpu(cpu, prof_cpu_mask);

    for (i = 0; i < 2; i++) {
        if (per_cpu(cpu_profile_hits, cpu)[i]) {
            page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]);
            per_cpu(cpu_profile_hits, cpu)[i] = NULL;
            __free_page(page);
        }
    }
    return 0;
}

static int profile_prepare_cpu(unsigned int cpu)
{
    int i, node = cpu_to_mem(cpu);
    struct page *page;

    per_cpu(cpu_profile_flip, cpu) = 0;

    for (i = 0; i < 2; i++) {
        if (per_cpu(cpu_profile_hits, cpu)[i])
            continue;

        page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
        if (!page) {
            profile_dead_cpu(cpu);
            return -ENOMEM;
        }
        per_cpu(cpu_profile_hits, cpu)[i] = page_address(page);

    }
    return 0;
}

static int profile_online_cpu(unsigned int cpu)
{
    if (prof_cpu_mask != NULL)
        cpumask_set_cpu(cpu, prof_cpu_mask);

    return 0;
}

#else /* !CONFIG_SMP */
#define profile_flip_buffers()      do { } while (0)
#define profile_discard_flip_buffers()  do { } while (0)

static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
    unsigned long pc;
    pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
    atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
}
#endif /* !CONFIG_SMP */

void profile_hits(int type, void *__pc, unsigned int nr_hits)
{
    if (prof_on != type || !prof_buffer)
        return;
    do_profile_hits(type, __pc, nr_hits);
}
EXPORT_SYMBOL_GPL(profile_hits);

void profile_tick(int type)
{
    struct pt_regs *regs = get_irq_regs();

    if (!user_mode(regs) && prof_cpu_mask != NULL &&
        cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
        profile_hit(type, (void *)profile_pc(regs));
}

#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>

static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
{
    seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask));
    return 0;
}

static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
{
    return single_open(file, prof_cpu_mask_proc_show, NULL);
}

static ssize_t prof_cpu_mask_proc_write(struct file *file,
    const char __user *buffer, size_t count, loff_t *pos)
{
    cpumask_var_t new_value;
    int err;

    if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
        return -ENOMEM;

    err = cpumask_parse_user(buffer, count, new_value);
    if (!err) {
        cpumask_copy(prof_cpu_mask, new_value);
        err = count;
    }
    free_cpumask_var(new_value);
    return err;
}

static const struct file_operations prof_cpu_mask_proc_fops = {
    .open       = prof_cpu_mask_proc_open,
    .read       = seq_read,
    .llseek     = seq_lseek,
    .release    = single_release,
    .write      = prof_cpu_mask_proc_write,
};

void create_prof_cpu_mask(void)
{
    /* create /proc/irq/prof_cpu_mask */
    proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_fops);
}

/*
 * This function accesses profiling information. The returned data is
 * binary: the sampling step and the actual contents of the profile
 * buffer. Use of the program readprofile is recommended in order to
 * get meaningful info out of these data.
 */
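/*
 * A rough sketch of the layout as seen from userspace, assuming
 * sizeof(unsigned int) == sizeof(atomic_t) == 4 as on common
 * configurations (readprofile(8) is the canonical consumer):
 *
 *   bytes 0..3         sample step in bytes, i.e. 1 << prof_shift
 *   bytes 4*n..4*n+3   hit counter n, covering the text range
 *                      [_stext + (n - 1) * step, _stext + n * step),
 *                      for n = 1..prof_len
 *
 * so the first word read is the step and each subsequent word is one
 * hit counter.
 */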
static ssize_t
read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
    unsigned long p = *ppos;
    ssize_t read;
    char *pnt;
    unsigned int sample_step = 1 << prof_shift;

    profile_flip_buffers();
    if (p >= (prof_len+1)*sizeof(unsigned int))
        return 0;
    if (count > (prof_len+1)*sizeof(unsigned int) - p)
        count = (prof_len+1)*sizeof(unsigned int) - p;
    read = 0;

    while (p < sizeof(unsigned int) && count > 0) {
        if (put_user(*((char *)(&sample_step)+p), buf))
            return -EFAULT;
        buf++; p++; count--; read++;
    }
    pnt = (char *)prof_buffer + p - sizeof(atomic_t);
    if (copy_to_user(buf, (void *)pnt, count))
        return -EFAULT;
    read += count;
    *ppos += read;
    return read;
}

/*
 * Writing to /proc/profile resets the counters
 *
 * Writing a 'profiling multiplier' value into it also re-sets the profiling
 * interrupt frequency, on architectures that support this.
 */
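/*
 * Usage sketch (not part of the interface definition): "readprofile
 * -r", or any write that is not exactly sizeof(int) bytes such as
 * "echo > /proc/profile", only clears the counters, while writing a
 * binary int of exactly sizeof(int) bytes additionally passes that
 * value to setup_profiling_timer() on SMP kernels that provide it.
 */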
static ssize_t write_profile(struct file *file, const char __user *buf,
                 size_t count, loff_t *ppos)
{
#ifdef CONFIG_SMP
    extern int setup_profiling_timer(unsigned int multiplier);

    if (count == sizeof(int)) {
        unsigned int multiplier;

        if (copy_from_user(&multiplier, buf, sizeof(int)))
            return -EFAULT;

        if (setup_profiling_timer(multiplier))
            return -EINVAL;
    }
#endif
    profile_discard_flip_buffers();
    memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
    return count;
}

static const struct file_operations proc_profile_operations = {
    .read       = read_profile,
    .write      = write_profile,
    .llseek     = default_llseek,
};

int __ref create_proc_profile(void)
{
    struct proc_dir_entry *entry;
#ifdef CONFIG_SMP
    enum cpuhp_state online_state;
#endif

    int err = 0;

    if (!prof_on)
        return 0;
#ifdef CONFIG_SMP
    err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
                profile_prepare_cpu, profile_dead_cpu);
    if (err)
        return err;

    err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE",
                profile_online_cpu, NULL);
    if (err < 0)
        goto err_state_prep;
    online_state = err;
    err = 0;
#endif
    entry = proc_create("profile", S_IWUSR | S_IRUGO,
                NULL, &proc_profile_operations);
    if (!entry)
        goto err_state_onl;
    proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));

    return err;
err_state_onl:
#ifdef CONFIG_SMP
    cpuhp_remove_state(online_state);
err_state_prep:
    cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
#endif
    return err;
}
subsys_initcall(create_proc_profile);
#endif /* CONFIG_PROC_FS */