// SPDX-License-Identifier: GPL-2.0
/*
 * trace_hwlat.c - A simple Hardware Latency detector.
 *
 * Use this tracer to detect large system latencies induced by the behavior
 * of certain underlying hardware or firmware, independent of Linux itself.
 * A sampling kernel thread spins with interrupts disabled for the "width"
 * portion of every sampling "window", looking for gaps in execution that
 * can only be explained by time stolen from the CPU (SMIs and the like).
 * Gaps larger than tracing_thresh are recorded as hwlat trace events.
 */

#include <linux/kthread.h>
#include <linux/tracefs.h>
#include <linux/uaccess.h>
#include <linux/cpumask.h>
#include <linux/delay.h>
#include <linux/sched/clock.h>
#include "trace.h"

static struct trace_array *hwlat_trace;

#define U64STR_SIZE		22			/* 20 digits max */

#define BANNER			"hwlat_detector: "
#define DEFAULT_SAMPLE_WINDOW	1000000			/* 1s */
#define DEFAULT_SAMPLE_WIDTH	500000			/* 0.5s */
#define DEFAULT_LAT_THRESHOLD	10			/* 10us */

static struct dentry *hwlat_sample_width;	/* sample width us */
static struct dentry *hwlat_sample_window;	/* sample window us */
static struct dentry *hwlat_thread_mode;	/* hwlat thread mode */

/* sampling thread mode */
enum {
	MODE_NONE = 0,
	MODE_ROUND_ROBIN,
	MODE_PER_CPU,
	MODE_MAX
};
static char *thread_mode_str[] = { "none", "round-robin", "per-cpu" };

/* Save the previous tracing_thresh value */
static unsigned long save_tracing_thresh;

/* runtime kthread data */
struct hwlat_kthread_data {
	struct task_struct	*kthread;
	/* NMI timestamp counters */
	u64			nmi_ts_start;
	u64			nmi_total_ts;
	int			nmi_count;
	int			nmi_cpu;
};

static struct hwlat_kthread_data hwlat_single_cpu_data;
static DEFINE_PER_CPU(struct hwlat_kthread_data, hwlat_per_cpu_data);

/* Tells NMIs to call back to the hwlat tracer to record timestamps */
bool trace_hwlat_callback_enabled;

/* Threshold (in ns) used when tracing_thresh is unset; preserved across runs */
static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC;

/* Individual latency samples are stored here when detected. */
struct hwlat_sample {
	u64			seqnum;		/* unique sequence */
	u64			duration;	/* delta */
	u64			outer_duration;	/* delta (outer loop) */
	u64			nmi_total_ts;	/* Total time spent in NMIs */
	struct timespec64	timestamp;	/* wall time */
	int			nmi_count;	/* # NMIs during this sample */
	int			count;		/* # of iterations over thresh */
};

/* keep the global state somewhere. */
static struct hwlat_data {

	struct mutex lock;		/* protect changes */

	u64	count;			/* total since reset */

	u64	sample_window;		/* total sampling window (on+off), usecs */
	u64	sample_width;		/* active sampling portion of window, usecs */

	int	thread_mode;		/* thread mode */

} hwlat_data = {
	.sample_window		= DEFAULT_SAMPLE_WINDOW,
	.sample_width		= DEFAULT_SAMPLE_WIDTH,
	.thread_mode		= MODE_ROUND_ROBIN
};
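
/*
 * Worked example of the two knobs above, using the default values: with
 * sample_window = 1000000us and sample_width = 500000us, the sampling
 * thread busy-loops with interrupts disabled for roughly 0.5s out of
 * every 1s window, then sleeps for window - width = 0.5s (see
 * kthread_fn() below). The figures are illustrative only; both values
 * are tunable at runtime via the tracefs "window" and "width" files
 * created in init_tracefs().
 */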

static struct hwlat_kthread_data *get_cpu_data(void)
{
	if (hwlat_data.thread_mode == MODE_PER_CPU)
		return this_cpu_ptr(&hwlat_per_cpu_data);
	else
		return &hwlat_single_cpu_data;
}

/* True while the hwlat tracer is active (serialized by trace_types_lock) */
static bool hwlat_busy;

/* Record one hwlat_sample in the trace ring buffer of the hwlat instance */
static void trace_hwlat_sample(struct hwlat_sample *sample)
{
	struct trace_array *tr = hwlat_trace;
	struct trace_event_call *call = &event_hwlat;
	struct trace_buffer *buffer = tr->array_buffer.buffer;
	struct ring_buffer_event *event;
	struct hwlat_entry *entry;

	event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry),
					  tracing_gen_ctx());
	if (!event)
		return;
	entry = ring_buffer_event_data(event);
	entry->seqnum = sample->seqnum;
	entry->duration = sample->duration;
	entry->outer_duration = sample->outer_duration;
	entry->timestamp = sample->timestamp;
	entry->nmi_total_ts = sample->nmi_total_ts;
	entry->nmi_count = sample->nmi_count;
	entry->count = sample->count;

	if (!call_filter_check_discard(call, entry, buffer, event))
		trace_buffer_unlock_commit_nostack(buffer, event);
}

/* Macros to encapsulate the time capturing infrastructure */
#define time_type	u64
#define time_get()	trace_clock_local()
#define time_to_us(x)	div_u64(x, 1000)
#define time_sub(a, b)	((a) - (b))
#define init_time(a, b)	(a = b)
#define time_u64(a)	a

void trace_hwlat_callback(bool enter)
{
	struct hwlat_kthread_data *kdata = get_cpu_data();

	if (!kdata->kthread)
		return;

	/*
	 * Currently trace_clock_local() calls sched_clock() and the
	 * generic version is not NMI safe.
	 */
	if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) {
		if (enter)
			kdata->nmi_ts_start = time_get();
		else
			kdata->nmi_total_ts += time_get() - kdata->nmi_ts_start;
	}

	if (enter)
		kdata->nmi_count++;
}

/* Report an error into the hwlat instance's trace buffer */
#define hwlat_err(msg) ({							\
	struct trace_array *tr = hwlat_trace;					\
										\
	trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, msg);	\
})

/**
 * get_sample - sample the CPU TSC and look for likely hardware latencies
 *
 * Used to repeatedly capture the CPU TSC (or similar), looking for potential
 * hardware-induced latency. Called with interrupts disabled.
 */
static int get_sample(void)
{
	struct hwlat_kthread_data *kdata = get_cpu_data();
	struct trace_array *tr = hwlat_trace;
	struct hwlat_sample s;
	time_type start, t1, t2, last_t2;
	s64 diff, outer_diff, total, last_total = 0;
	u64 sample = 0;
	u64 thresh = tracing_thresh;
	u64 outer_sample = 0;
	int ret = -1;
	unsigned int count = 0;

	do_div(thresh, NSEC_PER_USEC);	/* convert from ns to usecs in place */

	kdata->nmi_total_ts = 0;
	kdata->nmi_count = 0;
	/* Make sure NMIs see the cleared counters first */
	barrier();

	trace_hwlat_callback_enabled = true;

	init_time(last_t2, 0);
	start = time_get(); /* start timestamp */
	outer_diff = 0;

	do {

		t1 = time_get();	/* we'll look for a discontinuity */
		t2 = time_get();

		if (time_u64(last_t2)) {
			/* Check the delta from the outer loop (t2 to next t1) */
			outer_diff = time_to_us(time_sub(t1, last_t2));
			/* This shouldn't happen */
			if (outer_diff < 0) {
				hwlat_err(BANNER "time running backwards\n");
				goto out;
			}
			if (outer_diff > outer_sample)
				outer_sample = outer_diff;
		}
		last_t2 = t2;

		total = time_to_us(time_sub(t2, start)); /* elapsed sample time */

		/* Check for possible overflows */
		if (total < last_total) {
			hwlat_err("Time total overflowed\n");
			break;
		}
		last_total = total;

		/* This checks the inner loop (t1 to t2) */
		diff = time_to_us(time_sub(t2, t1));

		/* Remember when the first threshold crossing happened */
		if (diff > thresh || outer_diff > thresh) {
			if (!count)
				ktime_get_real_ts64(&s.timestamp);
			count++;
		}

		/* This shouldn't happen */
		if (diff < 0) {
			hwlat_err(BANNER "time running backwards\n");
			goto out;
		}

		if (diff > sample)
			sample = diff; /* only want highest value */

	} while (total <= hwlat_data.sample_width);

	barrier(); /* finish the above in the view for NMIs */
	trace_hwlat_callback_enabled = false;
	barrier(); /* Make sure nmi_total_ts is no longer updated */

	ret = 0;

	/* If we exceed the threshold value, we have found a hardware latency */
	if (sample > thresh || outer_sample > thresh) {
		u64 latency;

		ret = 1;

		/* We read in microseconds */
		if (kdata->nmi_total_ts)
			do_div(kdata->nmi_total_ts, NSEC_PER_USEC);

		hwlat_data.count++;
		s.seqnum = hwlat_data.count;
		s.duration = sample;
		s.outer_duration = outer_sample;
		s.nmi_total_ts = kdata->nmi_total_ts;
		s.nmi_count = kdata->nmi_count;
		s.count = count;
		trace_hwlat_sample(&s);

		latency = max(sample, outer_sample);

		/* Keep a running maximum ever recorded hardware latency */
		if (latency > tr->max_latency) {
			tr->max_latency = latency;
			latency_fsnotify(tr);
		}
	}

out:
	return ret;
}

static struct cpumask save_cpumask;

static void move_to_next_cpu(void)
{
	struct cpumask *current_mask = &save_cpumask;
	struct trace_array *tr = hwlat_trace;
	int next_cpu;

	/*
	 * If the thread's affinity no longer matches the mask saved here,
	 * the user changed it outside of the tracer. Respect that choice
	 * by giving up round-robin and switching to mode none.
	 */
	if (!cpumask_equal(current_mask, current->cpus_ptr))
		goto change_mode;

	cpus_read_lock();
	cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
	next_cpu = cpumask_next(raw_smp_processor_id(), current_mask);
	cpus_read_unlock();

	if (next_cpu >= nr_cpu_ids)
		next_cpu = cpumask_first(current_mask);

	if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
		goto change_mode;

	cpumask_clear(current_mask);
	cpumask_set_cpu(next_cpu, current_mask);

	sched_setaffinity(0, current_mask);
	return;

change_mode:
	hwlat_data.thread_mode = MODE_NONE;
	pr_info(BANNER "cpumask changed while in round-robin mode, switching to mode none\n");
}

/*
 * kthread_fn - The CPU time sampling/hardware latency detection kernel thread
 *
 * Used to periodically sample the CPU TSC via a call to get_sample. We
 * disable interrupts, which does (intentionally) introduce latency since we
 * need to ensure nothing else might be running (and thus preempting).
 * Obviously this should never be used in production environments.
 *
 * In round-robin mode, each iteration also migrates the thread to the next
 * CPU allowed by the tracing cpumask.
 */
static int kthread_fn(void *data)
{
	u64 interval;

	while (!kthread_should_stop()) {

		if (hwlat_data.thread_mode == MODE_ROUND_ROBIN)
			move_to_next_cpu();

		local_irq_disable();
		get_sample();
		local_irq_enable();

		mutex_lock(&hwlat_data.lock);
		interval = hwlat_data.sample_window - hwlat_data.sample_width;
		mutex_unlock(&hwlat_data.lock);

		do_div(interval, USEC_PER_MSEC); /* convert from usecs to msecs in place */

		/* Sleep for at least 1ms between samples */
		if (interval < 1)
			interval = 1;

		if (msleep_interruptible(interval))
			break;
	}

	return 0;
}

/*
 * stop_single_kthread - Stop the hardware latency sampling/detector kthread
 *
 * This kicks the running hardware latency sampling/detector kernel thread and
 * tells it to stop sampling now. Use this on unload and at system shutdown.
 */
static void stop_single_kthread(void)
{
	struct hwlat_kthread_data *kdata = get_cpu_data();
	struct task_struct *kthread;

	cpus_read_lock();
	kthread = kdata->kthread;

	if (!kthread)
		goto out_put_cpus;

	kthread_stop(kthread);
	kdata->kthread = NULL;

out_put_cpus:
	cpus_read_unlock();
}

/*
 * start_single_kthread - Kick off the hardware latency sampling/detector kthread
 *
 * This starts the kernel thread that will sit and sample the CPU timestamp
 * counter (TSC or similar) and look for potential hardware latencies.
 */
static int start_single_kthread(struct trace_array *tr)
{
	struct hwlat_kthread_data *kdata = get_cpu_data();
	struct cpumask *current_mask = &save_cpumask;
	struct task_struct *kthread;
	int next_cpu;

	cpus_read_lock();
	if (kdata->kthread)
		goto out_put_cpus;

	kthread = kthread_create(kthread_fn, NULL, "hwlatd");
	if (IS_ERR(kthread)) {
		pr_err(BANNER "could not start sampling thread\n");
		cpus_read_unlock();
		return -ENOMEM;
	}

	/* Allow only the CPUs hwlat may run on */
	cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);

	/* In round-robin mode, start on the first allowed CPU */
	if (hwlat_data.thread_mode == MODE_ROUND_ROBIN) {
		next_cpu = cpumask_first(current_mask);
		cpumask_clear(current_mask);
		cpumask_set_cpu(next_cpu, current_mask);
	}

	sched_setaffinity(kthread->pid, current_mask);

	kdata->kthread = kthread;
	wake_up_process(kthread);

out_put_cpus:
	cpus_read_unlock();
	return 0;
}

/*
 * stop_cpu_kthread - Stop a hwlat cpu kthread
 */
static void stop_cpu_kthread(unsigned int cpu)
{
	struct task_struct *kthread;

	kthread = per_cpu(hwlat_per_cpu_data, cpu).kthread;
	if (kthread)
		kthread_stop(kthread);
	per_cpu(hwlat_per_cpu_data, cpu).kthread = NULL;
}

/*
 * stop_per_cpu_kthreads - Stop the hardware latency sampling/detector kthreads
 *
 * This kicks the running hardware latency sampling/detector kernel threads and
 * tells them to stop sampling now. Use this on unload and at system shutdown.
 */
static void stop_per_cpu_kthreads(void)
{
	unsigned int cpu;

	cpus_read_lock();
	for_each_online_cpu(cpu)
		stop_cpu_kthread(cpu);
	cpus_read_unlock();
}

/*
 * start_cpu_kthread - Start a hwlat cpu kthread
 */
static int start_cpu_kthread(unsigned int cpu)
{
	struct task_struct *kthread;

	kthread = kthread_run_on_cpu(kthread_fn, NULL, cpu, "hwlatd/%u");
	if (IS_ERR(kthread)) {
		pr_err(BANNER "could not start sampling thread\n");
		return -ENOMEM;
	}

	per_cpu(hwlat_per_cpu_data, cpu).kthread = kthread;

	return 0;
}

#ifdef CONFIG_HOTPLUG_CPU
/*
 * hwlat_hotplug_workfn - Start the per-cpu kthread on a CPU that just came
 * online, if the tracer is busy, in per-cpu mode, and the CPU is allowed by
 * the tracing cpumask. Runs via schedule_work_on(), so smp_processor_id()
 * is the CPU being onlined.
 */
static void hwlat_hotplug_workfn(struct work_struct *dummy)
{
	struct trace_array *tr = hwlat_trace;
	unsigned int cpu = smp_processor_id();

	mutex_lock(&trace_types_lock);
	mutex_lock(&hwlat_data.lock);
	cpus_read_lock();

	if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU)
		goto out_unlock;

	if (!cpumask_test_cpu(cpu, tr->tracing_cpumask))
		goto out_unlock;

	start_cpu_kthread(cpu);

out_unlock:
	cpus_read_unlock();
	mutex_unlock(&hwlat_data.lock);
	mutex_unlock(&trace_types_lock);
}

static DECLARE_WORK(hwlat_hotplug_work, hwlat_hotplug_workfn);

/*
 * hwlat_cpu_init - CPU hotplug online callback function
 */
static int hwlat_cpu_init(unsigned int cpu)
{
	schedule_work_on(cpu, &hwlat_hotplug_work);
	return 0;
}

/*
 * hwlat_cpu_die - CPU hotplug offline callback function
 */
static int hwlat_cpu_die(unsigned int cpu)
{
	stop_cpu_kthread(cpu);
	return 0;
}

static void hwlat_init_hotplug_support(void)
{
	int ret;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "trace/hwlat:online",
				hwlat_cpu_init, hwlat_cpu_die);
	if (ret < 0)
		pr_warn(BANNER "Failed to init CPU hotplug support\n");

	return;
}
#else /* CONFIG_HOTPLUG_CPU */
static void hwlat_init_hotplug_support(void)
{
	return;
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * start_per_cpu_kthreads - Kick off the hardware latency sampling/detector kthreads
 *
 * This starts the kernel threads that will sit on potentially all cpus and
 * sample the CPU timestamp counter (TSC or similar) and look for potential
 * hardware latencies.
 */
static int start_per_cpu_kthreads(struct trace_array *tr)
{
	struct cpumask *current_mask = &save_cpumask;
	unsigned int cpu;
	int retval;

	cpus_read_lock();
	/*
	 * Run only on CPUs in which hwlat is allowed to run.
	 */
	cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);

	for_each_online_cpu(cpu)
		per_cpu(hwlat_per_cpu_data, cpu).kthread = NULL;

	for_each_cpu(cpu, current_mask) {
		retval = start_cpu_kthread(cpu);
		if (retval)
			goto out_error;
	}
	cpus_read_unlock();

	return 0;

out_error:
	cpus_read_unlock();
	stop_per_cpu_kthreads();
	return retval;
}

static void *s_mode_start(struct seq_file *s, loff_t *pos)
{
	int mode = *pos;

	mutex_lock(&hwlat_data.lock);

	if (mode >= MODE_MAX)
		return NULL;

	return pos;
}

static void *s_mode_next(struct seq_file *s, void *v, loff_t *pos)
{
	int mode = ++(*pos);

	if (mode >= MODE_MAX)
		return NULL;

	return pos;
}

static int s_mode_show(struct seq_file *s, void *v)
{
	loff_t *pos = v;
	int mode = *pos;

	if (mode == hwlat_data.thread_mode)
		seq_printf(s, "[%s]", thread_mode_str[mode]);
	else
		seq_printf(s, "%s", thread_mode_str[mode]);

	if (mode < MODE_MAX - 1) /* no trailing space after the last mode */
		seq_puts(s, " ");

	return 0;
}

static void s_mode_stop(struct seq_file *s, void *v)
{
	seq_puts(s, "\n");
	mutex_unlock(&hwlat_data.lock);
}

static const struct seq_operations thread_mode_seq_ops = {
	.start = s_mode_start,
	.next = s_mode_next,
	.show = s_mode_show,
	.stop = s_mode_stop
};

static int hwlat_mode_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &thread_mode_seq_ops);
};

static void hwlat_tracer_start(struct trace_array *tr);
static void hwlat_tracer_stop(struct trace_array *tr);

/**
 * hwlat_mode_write - Write function for "mode" entry
 * @filp: The active open file structure
 * @ubuf: The user buffer that contains the value to write
 * @cnt: The maximum number of bytes to write to "file"
 * @ppos: The current position in @file
 *
 * This function provides a write implementation for the "mode" interface
 * to the hardware latency detector. hwlatd has different operation modes.
 * The "none" mode sets the allowed cpumask for a single hwlatd thread at
 * startup and lets the scheduler handle any migration. The default mode is
 * "round-robin", in which a single hwlatd thread runs, migrating among the
 * allowed CPUs in a round-robin fashion. The "per-cpu" mode creates one
 * hwlatd thread per allowed CPU.
 */
static ssize_t hwlat_mode_write(struct file *filp, const char __user *ubuf,
				size_t cnt, loff_t *ppos)
{
	struct trace_array *tr = hwlat_trace;
	const char *mode;
	char buf[64];
	int ret, i;

	if (cnt >= sizeof(buf))
		return -EINVAL;

	if (copy_from_user(buf, ubuf, cnt))
		return -EFAULT;

	buf[cnt] = 0;

	mode = strstrip(buf);

	ret = -EINVAL;

	/*
	 * trace_types_lock is taken to avoid concurrency on start/stop
	 * and hwlat_busy.
	 */
	mutex_lock(&trace_types_lock);
	if (hwlat_busy)
		hwlat_tracer_stop(tr);

	mutex_lock(&hwlat_data.lock);

	for (i = 0; i < MODE_MAX; i++) {
		if (strcmp(mode, thread_mode_str[i]) == 0) {
			hwlat_data.thread_mode = i;
			ret = cnt;
		}
	}

	mutex_unlock(&hwlat_data.lock);

	if (hwlat_busy)
		hwlat_tracer_start(tr);
	mutex_unlock(&trace_types_lock);

	/* mode_write always consumes all the data */
	*ppos += cnt;

	return ret;
}

/*
 * The width parameter is read/write using the generic trace_min_max_param
 * method. The *val is protected by the hwlat_data lock and is upper
 * bounded by the sample_window.
 */
static struct trace_min_max_param hwlat_width = {
	.lock = &hwlat_data.lock,
	.val = &hwlat_data.sample_width,
	.max = &hwlat_data.sample_window,
	.min = NULL,
};

/*
 * The window parameter is read/write using the generic trace_min_max_param
 * method. The *val is protected by the hwlat_data lock and is lower
 * bounded by the sample_width.
 */
static struct trace_min_max_param hwlat_window = {
	.lock = &hwlat_data.lock,
	.val = &hwlat_data.sample_window,
	.max = NULL,
	.min = &hwlat_data.sample_width,
};

static const struct file_operations thread_mode_fops = {
	.open = hwlat_mode_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
	.write = hwlat_mode_write
};

/**
 * init_tracefs - A function to initialize the tracefs interface files
 *
 * This function creates entries in tracefs for "hwlat_detector".
 * It creates the hwlat_detector directory in the tracing directory,
 * and within that directory are the window, width and mode files to
 * change and view those values.
 */
static int init_tracefs(void)
{
	int ret;
	struct dentry *top_dir;

	ret = tracing_init_dentry();
	if (ret)
		return -ENOMEM;

	top_dir = tracefs_create_dir("hwlat_detector", NULL);
	if (!top_dir)
		return -ENOMEM;

	hwlat_sample_window = tracefs_create_file("window", TRACE_MODE_WRITE,
						  top_dir,
						  &hwlat_window,
						  &trace_min_max_fops);
	if (!hwlat_sample_window)
		goto err;

	hwlat_sample_width = tracefs_create_file("width", TRACE_MODE_WRITE,
						 top_dir,
						 &hwlat_width,
						 &trace_min_max_fops);
	if (!hwlat_sample_width)
		goto err;

	hwlat_thread_mode = trace_create_file("mode", TRACE_MODE_WRITE,
					      top_dir,
					      NULL,
					      &thread_mode_fops);
	if (!hwlat_thread_mode)
		goto err;

	return 0;

err:
	tracefs_remove(top_dir);
	return -ENOMEM;
}

static void hwlat_tracer_start(struct trace_array *tr)
{
	int err;

	if (hwlat_data.thread_mode == MODE_PER_CPU)
		err = start_per_cpu_kthreads(tr);
	else
		err = start_single_kthread(tr);
	if (err)
		pr_err(BANNER "Cannot start hwlat kthread\n");
}

static void hwlat_tracer_stop(struct trace_array *tr)
{
	if (hwlat_data.thread_mode == MODE_PER_CPU)
		stop_per_cpu_kthreads();
	else
		stop_single_kthread();
}

static int hwlat_tracer_init(struct trace_array *tr)
{
	/* Only allow one instance to enable this */
	if (hwlat_busy)
		return -EBUSY;

	hwlat_trace = tr;

	hwlat_data.count = 0;
	tr->max_latency = 0;
	save_tracing_thresh = tracing_thresh;

	/* If tracing_thresh isn't set, default to the last used value */
	if (!tracing_thresh)
		tracing_thresh = last_tracing_thresh;

	if (tracer_tracing_is_on(tr))
		hwlat_tracer_start(tr);

	hwlat_busy = true;

	return 0;
}

static void hwlat_tracer_reset(struct trace_array *tr)
{
	hwlat_tracer_stop(tr);

	/* the tracing threshold is static between runs */
	last_tracing_thresh = tracing_thresh;

	tracing_thresh = save_tracing_thresh;
	hwlat_busy = false;
}

static struct tracer hwlat_tracer __read_mostly =
{
	.name = "hwlat",
	.init = hwlat_tracer_init,
	.reset = hwlat_tracer_reset,
	.start = hwlat_tracer_start,
	.stop = hwlat_tracer_stop,
	.allow_instances = true,
};

__init static int init_hwlat_tracer(void)
{
	int ret;

	mutex_init(&hwlat_data.lock);

	ret = register_tracer(&hwlat_tracer);
	if (ret)
		return ret;

	hwlat_init_hotplug_support();

	init_tracefs();

	return 0;
}
late_initcall(init_hwlat_tracer);
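
/*
 * Illustrative usage sketch for the interfaces defined above (paths assume
 * tracefs is mounted at /sys/kernel/tracing; exact locations depend on the
 * system):
 *
 *	# select the "hwlat" tracer registered by init_hwlat_tracer()
 *	echo hwlat > /sys/kernel/tracing/current_tracer
 *	# optional tuning via the files created in init_tracefs()
 *	echo per-cpu > /sys/kernel/tracing/hwlat_detector/mode
 *	echo 1000000 > /sys/kernel/tracing/hwlat_detector/window
 *	echo 500000  > /sys/kernel/tracing/hwlat_detector/width
 *	cat /sys/kernel/tracing/trace
 *
 * Samples whose inner or outer duration exceeds the threshold derived from
 * tracing_thresh show up as hwlat events in the trace output.
 */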