// SPDX-License-Identifier: GPL-2.0
/*
 * trace_hwlat.c - A simple Hardware Latency detector.
 *
 * Use this tracer to detect large system latencies induced by the behavior of
 * certain underlying system hardware or firmware, independent of Linux itself.
 * The code was developed originally to detect the presence of SMIs on Intel
 * and AMD systems, although there is no dependency upon x86 herein.
 *
 * The classical example usage of this tracer is in detecting the presence of
 * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a
 * somewhat special form of hardware interrupt spawned from earlier CPU debug
 * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge
 * LPC (or other device) to generate a special interrupt under certain
 * circumstances, for example, upon expiration of a special SMI timer device,
 * due to certain external thermal readings, on certain I/O address accesses,
 * and other situations. An SMI hits a special CPU pin, triggers a special
 * SMI mode (complete with special memory map), and the OS is unaware.
 *
 * Although certain hardware-induced latencies are necessary (for example,
 * a modern system often requires an SMI handler for correct thermal control
 * and remote management), they can wreak havoc upon any OS-level performance
 * guarantees toward low-latency, especially when the OS is not even made
 * aware of the presence of these interrupts. For this reason, we need a
 * somewhat brute force mechanism to detect these interrupts. In this case,
 * we do it by hogging all of the CPU(s) for configurable timer intervals,
 * sampling the built-in CPU timer, looking for discontiguous readings.
 *
 * WARNING: This implementation necessarily introduces latencies. Therefore,
 *          you should NEVER use this tracer while running in a production
 *          environment requiring any kind of low-latency performance
 *          guarantee(s).
 *
 * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
 * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com>
 *
 * Includes useful feedback from Clark Williams <williams@redhat.com>
 *
 */
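
/*
 * Illustrative quick-start (a sketch, assuming the usual /sys/kernel/tracing
 * tracefs mount point). The hwlat_detector/window, width and mode files are
 * created by init_tracefs() below; tracing_thresh is the generic tracing
 * threshold that this tracer uses as its reporting threshold:
 *
 *   cd /sys/kernel/tracing
 *   echo hwlat   > current_tracer           # select this tracer
 *   echo 100     > tracing_thresh           # report gaps above 100 usecs
 *   echo 1000000 > hwlat_detector/window    # total sampling window (usecs)
 *   echo 500000  > hwlat_detector/width     # busy portion of window (usecs)
 *   echo per-cpu > hwlat_detector/mode      # or: none, round-robin
 *   cat trace                               # dump any recorded samples
 */
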
#include <linux/kthread.h>
#include <linux/tracefs.h>
#include <linux/uaccess.h>
#include <linux/cpumask.h>
#include <linux/delay.h>
#include <linux/sched/clock.h>
#include "trace.h"

static struct trace_array   *hwlat_trace;

#define U64STR_SIZE     22          /* 20 digits max */

#define BANNER          "hwlat_detector: "
#define DEFAULT_SAMPLE_WINDOW   1000000         /* 1s */
#define DEFAULT_SAMPLE_WIDTH    500000          /* 0.5s */
#define DEFAULT_LAT_THRESHOLD   10          /* 10us */

static struct dentry *hwlat_sample_width;   /* sample width us */
static struct dentry *hwlat_sample_window;  /* sample window us */
static struct dentry *hwlat_thread_mode;    /* hwlat thread mode */

enum {
    MODE_NONE = 0,
    MODE_ROUND_ROBIN,
    MODE_PER_CPU,
    MODE_MAX
};
static char *thread_mode_str[] = { "none", "round-robin", "per-cpu" };

/* Save the previous tracing_thresh value */
static unsigned long save_tracing_thresh;

/* runtime kthread data */
struct hwlat_kthread_data {
    struct task_struct  *kthread;
    /* NMI timestamp counters */
    u64         nmi_ts_start;
    u64         nmi_total_ts;
    int         nmi_count;
    int         nmi_cpu;
};

static struct hwlat_kthread_data hwlat_single_cpu_data;
static DEFINE_PER_CPU(struct hwlat_kthread_data, hwlat_per_cpu_data);

/* Tells NMIs to call back to the hwlat tracer to record timestamps */
bool trace_hwlat_callback_enabled;

/* If the user changed threshold, remember it */
static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC;

/* Individual latency samples are stored here when detected. */
struct hwlat_sample {
    u64         seqnum;     /* unique sequence */
    u64         duration;   /* delta */
    u64         outer_duration; /* delta (outer loop) */
    u64         nmi_total_ts;   /* Total time spent in NMIs */
    struct timespec64   timestamp;  /* wall time */
    int         nmi_count;  /* # NMIs during this sample */
    int         count;      /* # of iterations over thresh */
};

/* keep the global state somewhere. */
static struct hwlat_data {

    struct mutex lock;      /* protect changes */

    u64 count;          /* total since reset */

    u64 sample_window;      /* total sampling window (on+off) */
    u64 sample_width;       /* active sampling portion of window */

    int thread_mode;        /* thread mode */

} hwlat_data = {
    .sample_window      = DEFAULT_SAMPLE_WINDOW,
    .sample_width       = DEFAULT_SAMPLE_WIDTH,
    .thread_mode        = MODE_ROUND_ROBIN
};

static struct hwlat_kthread_data *get_cpu_data(void)
{
    if (hwlat_data.thread_mode == MODE_PER_CPU)
        return this_cpu_ptr(&hwlat_per_cpu_data);
    else
        return &hwlat_single_cpu_data;
}

static bool hwlat_busy;

static void trace_hwlat_sample(struct hwlat_sample *sample)
{
    struct trace_array *tr = hwlat_trace;
    struct trace_event_call *call = &event_hwlat;
    struct trace_buffer *buffer = tr->array_buffer.buffer;
    struct ring_buffer_event *event;
    struct hwlat_entry *entry;

    event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry),
                      tracing_gen_ctx());
    if (!event)
        return;
    entry   = ring_buffer_event_data(event);
    entry->seqnum           = sample->seqnum;
    entry->duration         = sample->duration;
    entry->outer_duration       = sample->outer_duration;
    entry->timestamp        = sample->timestamp;
    entry->nmi_total_ts     = sample->nmi_total_ts;
    entry->nmi_count        = sample->nmi_count;
    entry->count            = sample->count;

    if (!call_filter_check_discard(call, entry, buffer, event))
        trace_buffer_unlock_commit_nostack(buffer, event);
}

/* Macros to encapsulate the time capturing infrastructure */
#define time_type   u64
#define time_get()  trace_clock_local()
#define time_to_us(x)   div_u64(x, 1000)
#define time_sub(a, b)  ((a) - (b))
#define init_time(a, b) (a = b)
#define time_u64(a) a

void trace_hwlat_callback(bool enter)
{
    struct hwlat_kthread_data *kdata = get_cpu_data();

    if (!kdata->kthread)
        return;

    /*
     * Currently trace_clock_local() calls sched_clock() and the
     * generic version is not NMI safe.
     */
    if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) {
        if (enter)
            kdata->nmi_ts_start = time_get();
        else
            kdata->nmi_total_ts += time_get() - kdata->nmi_ts_start;
    }

    if (enter)
        kdata->nmi_count++;
}

/*
 * hwlat_err - report a hwlat error.
 */
#define hwlat_err(msg) ({                           \
    struct trace_array *tr = hwlat_trace;                   \
                                        \
    trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, msg);    \
})

/**
 * get_sample - sample the CPU TSC and look for likely hardware latencies
 *
 * Used to repeatedly capture the CPU TSC (or similar), looking for potential
 * hardware-induced latency. Called with interrupts disabled and with
 * hwlat_data.lock held.
 */
static int get_sample(void)
{
    struct hwlat_kthread_data *kdata = get_cpu_data();
    struct trace_array *tr = hwlat_trace;
    struct hwlat_sample s;
    time_type start, t1, t2, last_t2;
    s64 diff, outer_diff, total, last_total = 0;
    u64 sample = 0;
    u64 thresh = tracing_thresh;
    u64 outer_sample = 0;
    int ret = -1;
    unsigned int count = 0;

    do_div(thresh, NSEC_PER_USEC); /* convert to usecs; do_div() modifies thresh in place */

    kdata->nmi_total_ts = 0;
    kdata->nmi_count = 0;
    /* Make sure NMIs see this first */
    barrier();

    trace_hwlat_callback_enabled = true;

    init_time(last_t2, 0);
    start = time_get(); /* start timestamp */
    outer_diff = 0;

    do {

        t1 = time_get();    /* we'll look for a discontinuity */
        t2 = time_get();

        if (time_u64(last_t2)) {
            /* Check the delta from outer loop (t2 to next t1) */
            outer_diff = time_to_us(time_sub(t1, last_t2));
            /* This shouldn't happen */
            if (outer_diff < 0) {
                hwlat_err(BANNER "time running backwards\n");
                goto out;
            }
            if (outer_diff > outer_sample)
                outer_sample = outer_diff;
        }
        last_t2 = t2;

        total = time_to_us(time_sub(t2, start)); /* sample width */

        /* Check for possible overflows */
        if (total < last_total) {
            hwlat_err("Time total overflowed\n");
            break;
        }
        last_total = total;

        /* This checks the inner loop (t1 to t2) */
        diff = time_to_us(time_sub(t2, t1));     /* current diff */

        if (diff > thresh || outer_diff > thresh) {
            if (!count)
                ktime_get_real_ts64(&s.timestamp);
            count++;
        }

        /* This shouldn't happen */
        if (diff < 0) {
            hwlat_err(BANNER "time running backwards\n");
            goto out;
        }

        if (diff > sample)
            sample = diff; /* only want highest value */

    } while (total <= hwlat_data.sample_width);

    barrier(); /* finish the above in the view for NMIs */
    trace_hwlat_callback_enabled = false;
    barrier(); /* Make sure nmi_total_ts is no longer updated */

    ret = 0;

    /* If we exceed the threshold value, we have found a hardware latency */
    if (sample > thresh || outer_sample > thresh) {
        u64 latency;

        ret = 1;

        /* We read in microseconds */
        if (kdata->nmi_total_ts)
            do_div(kdata->nmi_total_ts, NSEC_PER_USEC);

        hwlat_data.count++;
        s.seqnum = hwlat_data.count;
        s.duration = sample;
        s.outer_duration = outer_sample;
        s.nmi_total_ts = kdata->nmi_total_ts;
        s.nmi_count = kdata->nmi_count;
        s.count = count;
        trace_hwlat_sample(&s);

        latency = max(sample, outer_sample);

        /* Keep a running maximum ever recorded hardware latency */
        if (latency > tr->max_latency) {
            tr->max_latency = latency;
            latency_fsnotify(tr);
        }
    }

out:
    return ret;
}
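
/*
 * Illustrative timeline for the sampling loop in get_sample() above:
 *
 *   start    t1 t2        t1 t2        t1 t2
 *     |-------|--|---------|--|---------|--|-----> trace_clock_local()
 *             \__/\________/
 *            inner   outer
 *           (t2-t1)  (last_t2 -> next t1)
 *
 * An inner or outer gap larger than the threshold means something outside
 * the kernel's control (e.g. an SMI) stole the CPU between two back-to-back
 * clock reads; the largest gaps seen during the window are reported as one
 * hwlat sample.
 */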

static struct cpumask save_cpumask;

static void move_to_next_cpu(void)
{
    struct cpumask *current_mask = &save_cpumask;
    struct trace_array *tr = hwlat_trace;
    int next_cpu;

    /*
     * If for some reason the user modifies the CPU affinity
     * of this thread, then stop migrating for the duration
     * of the current test.
     */
    if (!cpumask_equal(current_mask, current->cpus_ptr))
        goto change_mode;

    cpus_read_lock();
    cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
    next_cpu = cpumask_next(raw_smp_processor_id(), current_mask);
    cpus_read_unlock();

    if (next_cpu >= nr_cpu_ids)
        next_cpu = cpumask_first(current_mask);

    if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
        goto change_mode;

    cpumask_clear(current_mask);
    cpumask_set_cpu(next_cpu, current_mask);

    sched_setaffinity(0, current_mask);
    return;

 change_mode:
    hwlat_data.thread_mode = MODE_NONE;
    pr_info(BANNER "cpumask changed while in round-robin mode, switching to mode none\n");
}

/*
 * kthread_fn - The CPU time sampling/hardware latency detection kernel thread
 *
 * Used to periodically sample the CPU TSC via a call to get_sample. We
 * disable interrupts, which does (intentionally) introduce latency since we
 * need to ensure nothing else might be running (and thus preempting).
 * Obviously this should never be used in production environments.
 *
 * Executes one loop iteration on each CPU listed in the tracing_cpumask
 * tracefs file.
 */
static int kthread_fn(void *data)
{
    u64 interval;

    while (!kthread_should_stop()) {

        if (hwlat_data.thread_mode == MODE_ROUND_ROBIN)
            move_to_next_cpu();

        local_irq_disable();
        get_sample();
        local_irq_enable();

        mutex_lock(&hwlat_data.lock);
        interval = hwlat_data.sample_window - hwlat_data.sample_width;
        mutex_unlock(&hwlat_data.lock);

        do_div(interval, USEC_PER_MSEC); /* modifies interval value */

        /* Always sleep for at least 1ms */
        if (interval < 1)
            interval = 1;

        if (msleep_interruptible(interval))
            break;
    }

    return 0;
}

/*
 * stop_single_kthread - Inform the hardware latency sampling/detector kthread to stop
 *
 * This kicks the running hardware latency sampling/detector kernel thread and
 * tells it to stop sampling now. Use this on unload and at system shutdown.
 */
static void stop_single_kthread(void)
{
    struct hwlat_kthread_data *kdata = get_cpu_data();
    struct task_struct *kthread;

    cpus_read_lock();
    kthread = kdata->kthread;

    if (!kthread)
        goto out_put_cpus;

    kthread_stop(kthread);
    kdata->kthread = NULL;

out_put_cpus:
    cpus_read_unlock();
}

/*
 * start_single_kthread - Kick off the hardware latency sampling/detector kthread
 *
 * This starts the kernel thread that will sit and sample the CPU timestamp
 * counter (TSC or similar) and look for potential hardware latencies.
 */
static int start_single_kthread(struct trace_array *tr)
{
    struct hwlat_kthread_data *kdata = get_cpu_data();
    struct cpumask *current_mask = &save_cpumask;
    struct task_struct *kthread;
    int next_cpu;

    cpus_read_lock();
    if (kdata->kthread)
        goto out_put_cpus;

    kthread = kthread_create(kthread_fn, NULL, "hwlatd");
    if (IS_ERR(kthread)) {
        pr_err(BANNER "could not start sampling thread\n");
        cpus_read_unlock();
        return -ENOMEM;
    }

    /* Just pick the first CPU on first iteration */
    cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);

    if (hwlat_data.thread_mode == MODE_ROUND_ROBIN) {
        next_cpu = cpumask_first(current_mask);
        cpumask_clear(current_mask);
        cpumask_set_cpu(next_cpu, current_mask);
    }

    sched_setaffinity(kthread->pid, current_mask);

    kdata->kthread = kthread;
    wake_up_process(kthread);

out_put_cpus:
    cpus_read_unlock();
    return 0;
}

/*
 * stop_cpu_kthread - Stop a hwlat cpu kthread
 */
static void stop_cpu_kthread(unsigned int cpu)
{
    struct task_struct *kthread;

    kthread = per_cpu(hwlat_per_cpu_data, cpu).kthread;
    if (kthread)
        kthread_stop(kthread);
    per_cpu(hwlat_per_cpu_data, cpu).kthread = NULL;
}

/*
 * stop_per_cpu_kthreads - Inform the hardware latency sampling/detector kthreads to stop
 *
 * This kicks the running hardware latency sampling/detector kernel threads and
 * tells them to stop sampling now. Use this on unload and at system shutdown.
 */
static void stop_per_cpu_kthreads(void)
{
    unsigned int cpu;

    cpus_read_lock();
    for_each_online_cpu(cpu)
        stop_cpu_kthread(cpu);
    cpus_read_unlock();
}

/*
 * start_cpu_kthread - Start a hwlat cpu kthread
 */
static int start_cpu_kthread(unsigned int cpu)
{
    struct task_struct *kthread;

    kthread = kthread_run_on_cpu(kthread_fn, NULL, cpu, "hwlatd/%u");
    if (IS_ERR(kthread)) {
        pr_err(BANNER "could not start sampling thread\n");
        return -ENOMEM;
    }

    per_cpu(hwlat_per_cpu_data, cpu).kthread = kthread;

    return 0;
}

#ifdef CONFIG_HOTPLUG_CPU
static void hwlat_hotplug_workfn(struct work_struct *dummy)
{
    struct trace_array *tr = hwlat_trace;
    unsigned int cpu = smp_processor_id();

    mutex_lock(&trace_types_lock);
    mutex_lock(&hwlat_data.lock);
    cpus_read_lock();

    if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU)
        goto out_unlock;

    if (!cpumask_test_cpu(cpu, tr->tracing_cpumask))
        goto out_unlock;

    start_cpu_kthread(cpu);

out_unlock:
    cpus_read_unlock();
    mutex_unlock(&hwlat_data.lock);
    mutex_unlock(&trace_types_lock);
}

static DECLARE_WORK(hwlat_hotplug_work, hwlat_hotplug_workfn);

/*
 * hwlat_cpu_init - CPU hotplug online callback function
 */
static int hwlat_cpu_init(unsigned int cpu)
{
    schedule_work_on(cpu, &hwlat_hotplug_work);
    return 0;
}

/*
 * hwlat_cpu_die - CPU hotplug offline callback function
 */
static int hwlat_cpu_die(unsigned int cpu)
{
    stop_cpu_kthread(cpu);
    return 0;
}

static void hwlat_init_hotplug_support(void)
{
    int ret;

    ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "trace/hwlat:online",
                hwlat_cpu_init, hwlat_cpu_die);
    if (ret < 0)
        pr_warn(BANNER "Failed to initialize cpu hotplug support\n");

    return;
}
#else /* CONFIG_HOTPLUG_CPU */
static void hwlat_init_hotplug_support(void)
{
    return;
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * start_per_cpu_kthreads - Kick off the hardware latency sampling/detector kthreads
 *
 * This starts the kernel threads that will sit on potentially all cpus and
 * sample the CPU timestamp counter (TSC or similar) and look for potential
 * hardware latencies.
 */
static int start_per_cpu_kthreads(struct trace_array *tr)
{
    struct cpumask *current_mask = &save_cpumask;
    unsigned int cpu;
    int retval;

    cpus_read_lock();
    /*
     * Run only on CPUs in which hwlat is allowed to run.
     */
    cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);

    for_each_online_cpu(cpu)
        per_cpu(hwlat_per_cpu_data, cpu).kthread = NULL;

    for_each_cpu(cpu, current_mask) {
        retval = start_cpu_kthread(cpu);
        if (retval)
            goto out_error;
    }
    cpus_read_unlock();

    return 0;

out_error:
    cpus_read_unlock();
    stop_per_cpu_kthreads();
    return retval;
}

static void *s_mode_start(struct seq_file *s, loff_t *pos)
{
    int mode = *pos;

    mutex_lock(&hwlat_data.lock);

    if (mode >= MODE_MAX)
        return NULL;

    return pos;
}

static void *s_mode_next(struct seq_file *s, void *v, loff_t *pos)
{
    int mode = ++(*pos);

    if (mode >= MODE_MAX)
        return NULL;

    return pos;
}

static int s_mode_show(struct seq_file *s, void *v)
{
    loff_t *pos = v;
    int mode = *pos;

    if (mode == hwlat_data.thread_mode)
        seq_printf(s, "[%s]", thread_mode_str[mode]);
    else
        seq_printf(s, "%s", thread_mode_str[mode]);

    if (mode != MODE_MAX)
        seq_puts(s, " ");

    return 0;
}

static void s_mode_stop(struct seq_file *s, void *v)
{
    seq_puts(s, "\n");
    mutex_unlock(&hwlat_data.lock);
}

static const struct seq_operations thread_mode_seq_ops = {
    .start      = s_mode_start,
    .next       = s_mode_next,
    .show       = s_mode_show,
    .stop       = s_mode_stop
};

static int hwlat_mode_open(struct inode *inode, struct file *file)
{
    return seq_open(file, &thread_mode_seq_ops);
}

static void hwlat_tracer_start(struct trace_array *tr);
static void hwlat_tracer_stop(struct trace_array *tr);

/**
 * hwlat_mode_write - Write function for "mode" entry
 * @filp: The active open file structure
 * @ubuf: The user buffer that contains the value to write
 * @cnt: The maximum number of bytes to write to "file"
 * @ppos: The current position in @file
 *
 * This function provides a write implementation for the "mode" interface
 * to the hardware latency detector. hwlatd has different operation modes.
 * The "none" mode sets the allowed cpumask for a single hwlatd thread at
 * startup and lets the scheduler handle migration. The default mode is
 * "round-robin", in which a single hwlatd thread runs, migrating among the
 * allowed CPUs in a round-robin fashion. The "per-cpu" mode creates one
 * hwlatd thread per allowed CPU.
 */
static ssize_t hwlat_mode_write(struct file *filp, const char __user *ubuf,
                 size_t cnt, loff_t *ppos)
{
    struct trace_array *tr = hwlat_trace;
    const char *mode;
    char buf[64];
    int ret, i;

    if (cnt >= sizeof(buf))
        return -EINVAL;

    if (copy_from_user(buf, ubuf, cnt))
        return -EFAULT;

    buf[cnt] = 0;

    mode = strstrip(buf);

    ret = -EINVAL;

    /*
     * trace_types_lock is taken to avoid concurrency on start/stop
     * and hwlat_busy.
     */
    mutex_lock(&trace_types_lock);
    if (hwlat_busy)
        hwlat_tracer_stop(tr);

    mutex_lock(&hwlat_data.lock);

    for (i = 0; i < MODE_MAX; i++) {
        if (strcmp(mode, thread_mode_str[i]) == 0) {
            hwlat_data.thread_mode = i;
            ret = cnt;
        }
    }

    mutex_unlock(&hwlat_data.lock);

    if (hwlat_busy)
        hwlat_tracer_start(tr);
    mutex_unlock(&trace_types_lock);

    *ppos += cnt;

    return ret;
}
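
/*
 * Example of driving the mode file above (a sketch, assuming tracefs is
 * mounted at /sys/kernel/tracing):
 *
 *   echo per-cpu > /sys/kernel/tracing/hwlat_detector/mode
 *   cat /sys/kernel/tracing/hwlat_detector/mode
 *   none round-robin [per-cpu]
 *
 * A string matching none of the entries in thread_mode_str[] leaves the
 * mode unchanged and the write returns -EINVAL.
 */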

/*
 * The width parameter is read/write using the generic trace_min_max_param
 * method. The *val is protected by the hwlat_data lock and is upper
 * bounded by the window parameter.
 */
static struct trace_min_max_param hwlat_width = {
    .lock       = &hwlat_data.lock,
    .val        = &hwlat_data.sample_width,
    .max        = &hwlat_data.sample_window,
    .min        = NULL,
};

/*
 * The window parameter is read/write using the generic trace_min_max_param
 * method. The *val is protected by the hwlat_data lock and is lower
 * bounded by the width parameter.
 */
static struct trace_min_max_param hwlat_window = {
    .lock       = &hwlat_data.lock,
    .val        = &hwlat_data.sample_window,
    .max        = NULL,
    .min        = &hwlat_data.sample_width,
};
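
/*
 * Example of the resulting width <= window invariant (a sketch, assuming the
 * usual /sys/kernel/tracing mount point). With the defaults above, the
 * detector busy-samples for 0.5s of every 1s window:
 *
 *   echo 1000000 > /sys/kernel/tracing/hwlat_detector/window
 *   echo  500000 > /sys/kernel/tracing/hwlat_detector/width
 *
 * Writes that would push width above window, or window below width, are
 * rejected by the generic trace_min_max_param handler.
 */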

static const struct file_operations thread_mode_fops = {
    .open       = hwlat_mode_open,
    .read       = seq_read,
    .llseek     = seq_lseek,
    .release    = seq_release,
    .write      = hwlat_mode_write
};

/**
 * init_tracefs - A function to initialize the tracefs interface files
 *
 * This function creates entries in tracefs for "hwlat_detector".
 * It creates the hwlat_detector directory in the tracing directory,
 * and within that directory are the width, window and mode files used
 * to change and view those values.
 */
static int init_tracefs(void)
{
    int ret;
    struct dentry *top_dir;

    ret = tracing_init_dentry();
    if (ret)
        return -ENOMEM;

    top_dir = tracefs_create_dir("hwlat_detector", NULL);
    if (!top_dir)
        return -ENOMEM;

    hwlat_sample_window = tracefs_create_file("window", TRACE_MODE_WRITE,
                          top_dir,
                          &hwlat_window,
                          &trace_min_max_fops);
    if (!hwlat_sample_window)
        goto err;

    hwlat_sample_width = tracefs_create_file("width", TRACE_MODE_WRITE,
                         top_dir,
                         &hwlat_width,
                         &trace_min_max_fops);
    if (!hwlat_sample_width)
        goto err;

    hwlat_thread_mode = trace_create_file("mode", TRACE_MODE_WRITE,
                          top_dir,
                          NULL,
                          &thread_mode_fops);
    if (!hwlat_thread_mode)
        goto err;

    return 0;

 err:
    tracefs_remove(top_dir);
    return -ENOMEM;
}

static void hwlat_tracer_start(struct trace_array *tr)
{
    int err;

    if (hwlat_data.thread_mode == MODE_PER_CPU)
        err = start_per_cpu_kthreads(tr);
    else
        err = start_single_kthread(tr);
    if (err)
        pr_err(BANNER "Cannot start hwlat kthread\n");
}

static void hwlat_tracer_stop(struct trace_array *tr)
{
    if (hwlat_data.thread_mode == MODE_PER_CPU)
        stop_per_cpu_kthreads();
    else
        stop_single_kthread();
}

static int hwlat_tracer_init(struct trace_array *tr)
{
    /* Only allow one instance to enable this */
    if (hwlat_busy)
        return -EBUSY;

    hwlat_trace = tr;

    hwlat_data.count = 0;
    tr->max_latency = 0;
    save_tracing_thresh = tracing_thresh;

    /* tracing_thresh is in nsecs, we speak in usecs */
    if (!tracing_thresh)
        tracing_thresh = last_tracing_thresh;

    if (tracer_tracing_is_on(tr))
        hwlat_tracer_start(tr);

    hwlat_busy = true;

    return 0;
}

static void hwlat_tracer_reset(struct trace_array *tr)
{
    hwlat_tracer_stop(tr);

    /* the tracing threshold is static between runs */
    last_tracing_thresh = tracing_thresh;

    tracing_thresh = save_tracing_thresh;
    hwlat_busy = false;
}

static struct tracer hwlat_tracer __read_mostly =
{
    .name       = "hwlat",
    .init       = hwlat_tracer_init,
    .reset      = hwlat_tracer_reset,
    .start      = hwlat_tracer_start,
    .stop       = hwlat_tracer_stop,
    .allow_instances = true,
};

__init static int init_hwlat_tracer(void)
{
    int ret;

    mutex_init(&hwlat_data.lock);

    ret = register_tracer(&hwlat_tracer);
    if (ret)
        return ret;

    hwlat_init_hotplug_support();

    init_tracefs();

    return 0;
}
late_initcall(init_hwlat_tracer);