0011 #include <linux/fs.h>
0012 #include <linux/mm.h>
0013 #include <linux/cpu.h>
0014 #include <linux/smp.h>
0015 #include <linux/idr.h>
0016 #include <linux/file.h>
0017 #include <linux/poll.h>
0018 #include <linux/slab.h>
0019 #include <linux/hash.h>
0020 #include <linux/tick.h>
0021 #include <linux/sysfs.h>
0022 #include <linux/dcache.h>
0023 #include <linux/percpu.h>
0024 #include <linux/ptrace.h>
0025 #include <linux/reboot.h>
0026 #include <linux/vmstat.h>
0027 #include <linux/device.h>
0028 #include <linux/export.h>
0029 #include <linux/vmalloc.h>
0030 #include <linux/hardirq.h>
0031 #include <linux/hugetlb.h>
0032 #include <linux/rculist.h>
0033 #include <linux/uaccess.h>
0034 #include <linux/syscalls.h>
0035 #include <linux/anon_inodes.h>
0036 #include <linux/kernel_stat.h>
0037 #include <linux/cgroup.h>
0038 #include <linux/perf_event.h>
0039 #include <linux/trace_events.h>
0040 #include <linux/hw_breakpoint.h>
0041 #include <linux/mm_types.h>
0042 #include <linux/module.h>
0043 #include <linux/mman.h>
0044 #include <linux/compat.h>
0045 #include <linux/bpf.h>
0046 #include <linux/filter.h>
0047 #include <linux/namei.h>
0048 #include <linux/parser.h>
0049 #include <linux/sched/clock.h>
0050 #include <linux/sched/mm.h>
0051 #include <linux/proc_ns.h>
0052 #include <linux/mount.h>
0053 #include <linux/min_heap.h>
0054 #include <linux/highmem.h>
0055 #include <linux/pgtable.h>
0056 #include <linux/buildid.h>
0057
0058 #include "internal.h"
0059
0060 #include <asm/irq_regs.h>
0061
0062 typedef int (*remote_function_f)(void *);
0063
0064 struct remote_function_call {
0065 struct task_struct *p;
0066 remote_function_f func;
0067 void *info;
0068 int ret;
0069 };
0070
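/*
 * remote_function - invoked on the target CPU via smp_call_function_single().
 * If a task was specified, bail out when it has migrated off this CPU
 * (leaving tfc->ret untouched so the caller can retry), or report -ESRCH when
 * it is no longer the current task; otherwise run tfc->func(tfc->info).
 */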
0071 static void remote_function(void *data)
0072 {
0073 struct remote_function_call *tfc = data;
0074 struct task_struct *p = tfc->p;
0075
0076 if (p) {
0077
0078 if (task_cpu(p) != smp_processor_id())
0079 return;
0080
0081
0082
0083
0084
0085
0086 tfc->ret = -ESRCH;
0087 if (p != current)
0088 return;
0089 }
0090
0091 tfc->ret = tfc->func(tfc->info);
0092 }
0093
0106
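/*
 * task_function_call - call a function on the CPU on which a task runs
 * @p:    the task to evaluate
 * @func: the function to be called
 * @info: the function call argument
 *
 * Runs @func on the CPU where @p is currently running, retrying whenever the
 * IPI races with the task migrating (remote_function() then leaves the
 * initial -EAGAIN in data.ret).
 *
 * Returns @func's return value, or a negative error (e.g. -ESRCH when @p was
 * no longer current on that CPU).
 */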
0107 static int
0108 task_function_call(struct task_struct *p, remote_function_f func, void *info)
0109 {
0110 struct remote_function_call data = {
0111 .p = p,
0112 .func = func,
0113 .info = info,
0114 .ret = -EAGAIN,
0115 };
0116 int ret;
0117
0118 for (;;) {
0119 ret = smp_call_function_single(task_cpu(p), remote_function,
0120 &data, 1);
0121 if (!ret)
0122 ret = data.ret;
0123
0124 if (ret != -EAGAIN)
0125 break;
0126
0127 cond_resched();
0128 }
0129
0130 return ret;
0131 }
0132
0133
0134
0135
0136
0137
0138
0139
0140
0141
0142
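/*
 * cpu_function_call - call a function on a given CPU
 * @cpu:  target CPU for the call
 * @func: the function to be called
 * @info: the function call argument
 *
 * Runs @func on @cpu via remote_function(); returns its return value, or
 * -ENXIO when the cross-call did not run (e.g. the CPU is offline).
 */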
0143 static int cpu_function_call(int cpu, remote_function_f func, void *info)
0144 {
0145 struct remote_function_call data = {
0146 .p = NULL,
0147 .func = func,
0148 .info = info,
0149 .ret = -ENXIO,
0150 };
0151
0152 smp_call_function_single(cpu, remote_function, &data, 1);
0153
0154 return data.ret;
0155 }
0156
0157 static inline struct perf_cpu_context *
0158 __get_cpu_context(struct perf_event_context *ctx)
0159 {
0160 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
0161 }
0162
0163 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
0164 struct perf_event_context *ctx)
0165 {
0166 raw_spin_lock(&cpuctx->ctx.lock);
0167 if (ctx)
0168 raw_spin_lock(&ctx->lock);
0169 }
0170
0171 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
0172 struct perf_event_context *ctx)
0173 {
0174 if (ctx)
0175 raw_spin_unlock(&ctx->lock);
0176 raw_spin_unlock(&cpuctx->ctx.lock);
0177 }
0178
0179 #define TASK_TOMBSTONE ((void *)-1L)
0180
0181 static bool is_kernel_event(struct perf_event *event)
0182 {
0183 return READ_ONCE(event->owner) == TASK_TOMBSTONE;
0184 }
0185
0204
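/*
 * event_function - IPI target used by event_function_call()
 *
 * Runs with IRQs disabled on the CPU that owns @event->ctx.  With the cpuctx
 * and (for task events) the task context locked, it re-checks that the
 * context is still current on this CPU -- returning -ESRCH so the caller can
 * retry if the task was scheduled out -- and only then invokes efs->func().
 */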
0205 typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
0206 struct perf_event_context *, void *);
0207
0208 struct event_function_struct {
0209 struct perf_event *event;
0210 event_f func;
0211 void *data;
0212 };
0213
0214 static int event_function(void *info)
0215 {
0216 struct event_function_struct *efs = info;
0217 struct perf_event *event = efs->event;
0218 struct perf_event_context *ctx = event->ctx;
0219 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
0220 struct perf_event_context *task_ctx = cpuctx->task_ctx;
0221 int ret = 0;
0222
0223 lockdep_assert_irqs_disabled();
0224
0225 perf_ctx_lock(cpuctx, task_ctx);
0226
0227
0228
0229
0230 if (ctx->task) {
0231 if (ctx->task != current) {
0232 ret = -ESRCH;
0233 goto unlock;
0234 }
0235
0236
0237
0238
0239
0240
0241
0242
0243 WARN_ON_ONCE(!ctx->is_active);
0244
0245
0246
0247
0248 WARN_ON_ONCE(task_ctx != ctx);
0249 } else {
0250 WARN_ON_ONCE(&cpuctx->ctx != ctx);
0251 }
0252
0253 efs->func(event, cpuctx, ctx, efs->data);
0254 unlock:
0255 perf_ctx_unlock(cpuctx, task_ctx);
0256
0257 return ret;
0258 }
0259
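/*
 * event_function_call - remotely execute @func for @event
 *
 * For CPU events this is a plain cpu_function_call().  For task events the
 * IPI can race with the task scheduling out; in that case fall back to taking
 * ctx->lock and, if the context is inactive, run @func locally with a NULL
 * cpuctx.  A TASK_TOMBSTONE task means the context is going away and there is
 * nothing to do.
 */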
0260 static void event_function_call(struct perf_event *event, event_f func, void *data)
0261 {
0262 struct perf_event_context *ctx = event->ctx;
0263 struct task_struct *task = READ_ONCE(ctx->task);
0264 struct event_function_struct efs = {
0265 .event = event,
0266 .func = func,
0267 .data = data,
0268 };
0269
0270 if (!event->parent) {
0271
0272
0273
0274
0275
0276 lockdep_assert_held(&ctx->mutex);
0277 }
0278
0279 if (!task) {
0280 cpu_function_call(event->cpu, event_function, &efs);
0281 return;
0282 }
0283
0284 if (task == TASK_TOMBSTONE)
0285 return;
0286
0287 again:
0288 if (!task_function_call(task, event_function, &efs))
0289 return;
0290
0291 raw_spin_lock_irq(&ctx->lock);
0292
0293
0294
0295
0296 task = ctx->task;
0297 if (task == TASK_TOMBSTONE) {
0298 raw_spin_unlock_irq(&ctx->lock);
0299 return;
0300 }
0301 if (ctx->is_active) {
0302 raw_spin_unlock_irq(&ctx->lock);
0303 goto again;
0304 }
0305 func(event, NULL, ctx, data);
0306 raw_spin_unlock_irq(&ctx->lock);
0307 }
0308
0309
0310
0311
0312
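/*
 * event_function_local - like event_function_call(), but assumes IRQs are
 * already disabled and we are on the right CPU, so @func can be invoked
 * directly under the ctx locks.
 */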
0313 static void event_function_local(struct perf_event *event, event_f func, void *data)
0314 {
0315 struct perf_event_context *ctx = event->ctx;
0316 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
0317 struct task_struct *task = READ_ONCE(ctx->task);
0318 struct perf_event_context *task_ctx = NULL;
0319
0320 lockdep_assert_irqs_disabled();
0321
0322 if (task) {
0323 if (task == TASK_TOMBSTONE)
0324 return;
0325
0326 task_ctx = ctx;
0327 }
0328
0329 perf_ctx_lock(cpuctx, task_ctx);
0330
0331 task = ctx->task;
0332 if (task == TASK_TOMBSTONE)
0333 goto unlock;
0334
0335 if (task) {
0336
0337
0338
0339
0340
0341 if (ctx->is_active) {
0342 if (WARN_ON_ONCE(task != current))
0343 goto unlock;
0344
0345 if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
0346 goto unlock;
0347 }
0348 } else {
0349 WARN_ON_ONCE(&cpuctx->ctx != ctx);
0350 }
0351
0352 func(event, cpuctx, ctx, data);
0353 unlock:
0354 perf_ctx_unlock(cpuctx, task_ctx);
0355 }
0356
0357 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
0358 PERF_FLAG_FD_OUTPUT |\
0359 PERF_FLAG_PID_CGROUP |\
0360 PERF_FLAG_FD_CLOEXEC)
0361
0362
0363
0364
0365 #define PERF_SAMPLE_BRANCH_PERM_PLM \
0366 (PERF_SAMPLE_BRANCH_KERNEL |\
0367 PERF_SAMPLE_BRANCH_HV)
0368
0369 enum event_type_t {
0370 EVENT_FLEXIBLE = 0x1,
0371 EVENT_PINNED = 0x2,
0372 EVENT_TIME = 0x4,
0373
0374 EVENT_CPU = 0x8,
0375 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
0376 };
0377
0378
0379
0380
0381
0382
0383 static void perf_sched_delayed(struct work_struct *work);
0384 DEFINE_STATIC_KEY_FALSE(perf_sched_events);
0385 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
0386 static DEFINE_MUTEX(perf_sched_mutex);
0387 static atomic_t perf_sched_count;
0388
0389 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
0390 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
0391 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
0392
0393 static atomic_t nr_mmap_events __read_mostly;
0394 static atomic_t nr_comm_events __read_mostly;
0395 static atomic_t nr_namespaces_events __read_mostly;
0396 static atomic_t nr_task_events __read_mostly;
0397 static atomic_t nr_freq_events __read_mostly;
0398 static atomic_t nr_switch_events __read_mostly;
0399 static atomic_t nr_ksymbol_events __read_mostly;
0400 static atomic_t nr_bpf_events __read_mostly;
0401 static atomic_t nr_cgroup_events __read_mostly;
0402 static atomic_t nr_text_poke_events __read_mostly;
0403 static atomic_t nr_build_id_events __read_mostly;
0404
0405 static LIST_HEAD(pmus);
0406 static DEFINE_MUTEX(pmus_lock);
0407 static struct srcu_struct pmus_srcu;
0408 static cpumask_var_t perf_online_mask;
0409 static struct kmem_cache *perf_event_cache;
0410
0411
0412
0413
0414
0415
0416
0417
0418 int sysctl_perf_event_paranoid __read_mostly = 2;
0419
0420
0421 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);
0422
0423
0424
0425
0426 #define DEFAULT_MAX_SAMPLE_RATE 100000
0427 #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
0428 #define DEFAULT_CPU_TIME_MAX_PERCENT 25
0429
0430 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
0431
0432 static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
0433 static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
0434
0435 static int perf_sample_allowed_ns __read_mostly =
0436 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
0437
0438 static void update_perf_cpu_limits(void)
0439 {
0440 u64 tmp = perf_sample_period_ns;
0441
0442 tmp *= sysctl_perf_cpu_time_max_percent;
0443 tmp = div_u64(tmp, 100);
0444 if (!tmp)
0445 tmp = 1;
0446
0447 WRITE_ONCE(perf_sample_allowed_ns, tmp);
0448 }
0449
0450 static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
0451
0452 int perf_proc_update_handler(struct ctl_table *table, int write,
0453 void *buffer, size_t *lenp, loff_t *ppos)
0454 {
0455 int ret;
0456 int perf_cpu = sysctl_perf_cpu_time_max_percent;
0457
0458
0459
0460 if (write && (perf_cpu == 100 || perf_cpu == 0))
0461 return -EINVAL;
0462
0463 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
0464 if (ret || !write)
0465 return ret;
0466
0467 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
0468 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
0469 update_perf_cpu_limits();
0470
0471 return 0;
0472 }
0473
0474 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
0475
0476 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
0477 void *buffer, size_t *lenp, loff_t *ppos)
0478 {
0479 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
0480
0481 if (ret || !write)
0482 return ret;
0483
0484 if (sysctl_perf_cpu_time_max_percent == 100 ||
0485 sysctl_perf_cpu_time_max_percent == 0) {
0486 printk(KERN_WARNING
0487 "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
0488 WRITE_ONCE(perf_sample_allowed_ns, 0);
0489 } else {
0490 update_perf_cpu_limits();
0491 }
0492
0493 return 0;
0494 }
0495
0496
0497
0498
0499
0500
0501
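/*
 * Perf samples run in very hot paths (often NMI context).  Track a decaying
 * average of how long they take and, when it exceeds the allowed budget,
 * lower the sample rate so the machine keeps making forward progress; see
 * perf_sample_event_took() below.
 */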
0502 #define NR_ACCUMULATED_SAMPLES 128
0503 static DEFINE_PER_CPU(u64, running_sample_length);
0504
0505 static u64 __report_avg;
0506 static u64 __report_allowed;
0507
0508 static void perf_duration_warn(struct irq_work *w)
0509 {
0510 printk_ratelimited(KERN_INFO
0511 "perf: interrupt took too long (%lld > %lld), lowering "
0512 "kernel.perf_event_max_sample_rate to %d\n",
0513 __report_avg, __report_allowed,
0514 sysctl_perf_event_sample_rate);
0515 }
0516
0517 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
0518
0519 void perf_sample_event_took(u64 sample_len_ns)
0520 {
0521 u64 max_len = READ_ONCE(perf_sample_allowed_ns);
0522 u64 running_len;
0523 u64 avg_len;
0524 u32 max;
0525
0526 if (max_len == 0)
0527 return;
0528
0529
0530 running_len = __this_cpu_read(running_sample_length);
0531 running_len -= running_len/NR_ACCUMULATED_SAMPLES;
0532 running_len += sample_len_ns;
0533 __this_cpu_write(running_sample_length, running_len);
0534
0535
0536
0537
0538
0539
0540 avg_len = running_len/NR_ACCUMULATED_SAMPLES;
0541 if (avg_len <= max_len)
0542 return;
0543
0544 __report_avg = avg_len;
0545 __report_allowed = max_len;
0546
0547
0548
0549
0550 avg_len += avg_len / 4;
0551 max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
0552 if (avg_len < max)
0553 max /= (u32)avg_len;
0554 else
0555 max = 1;
0556
0557 WRITE_ONCE(perf_sample_allowed_ns, avg_len);
0558 WRITE_ONCE(max_samples_per_tick, max);
0559
0560 sysctl_perf_event_sample_rate = max * HZ;
0561 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
0562
0563 if (!irq_work_queue(&perf_duration_work)) {
0564 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
0565 "kernel.perf_event_max_sample_rate to %d\n",
0566 __report_avg, __report_allowed,
0567 sysctl_perf_event_sample_rate);
0568 }
0569 }
0570
0571 static atomic64_t perf_event_id;
0572
0573 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
0574 enum event_type_t event_type);
0575
0576 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
0577 enum event_type_t event_type);
0578
0579 static void update_context_time(struct perf_event_context *ctx);
0580 static u64 perf_event_time(struct perf_event *event);
0581
0582 void __weak perf_event_print_debug(void) { }
0583
0584 static inline u64 perf_clock(void)
0585 {
0586 return local_clock();
0587 }
0588
0589 static inline u64 perf_event_clock(struct perf_event *event)
0590 {
0591 return event->clock();
0592 }
0593
0615
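/*
 * State based event timekeeping.
 *
 * An event's enabled and running times accrue the delta between "now" and
 * event->tstamp whenever its (effective) state changes:
 *
 *   >= INACTIVE: accrues total_time_enabled
 *   >= ACTIVE:   also accrues total_time_running
 *
 * The effective state is capped by the group leader: once the leader drops to
 * OFF (or below), the whole group stops accruing time.
 */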
0616 static __always_inline enum perf_event_state
0617 __perf_effective_state(struct perf_event *event)
0618 {
0619 struct perf_event *leader = event->group_leader;
0620
0621 if (leader->state <= PERF_EVENT_STATE_OFF)
0622 return leader->state;
0623
0624 return event->state;
0625 }
0626
0627 static __always_inline void
0628 __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
0629 {
0630 enum perf_event_state state = __perf_effective_state(event);
0631 u64 delta = now - event->tstamp;
0632
0633 *enabled = event->total_time_enabled;
0634 if (state >= PERF_EVENT_STATE_INACTIVE)
0635 *enabled += delta;
0636
0637 *running = event->total_time_running;
0638 if (state >= PERF_EVENT_STATE_ACTIVE)
0639 *running += delta;
0640 }
0641
0642 static void perf_event_update_time(struct perf_event *event)
0643 {
0644 u64 now = perf_event_time(event);
0645
0646 __perf_update_times(event, now, &event->total_time_enabled,
0647 &event->total_time_running);
0648 event->tstamp = now;
0649 }
0650
0651 static void perf_event_update_sibling_time(struct perf_event *leader)
0652 {
0653 struct perf_event *sibling;
0654
0655 for_each_sibling_event(sibling, leader)
0656 perf_event_update_time(sibling);
0657 }
0658
0659 static void
0660 perf_event_set_state(struct perf_event *event, enum perf_event_state state)
0661 {
0662 if (event->state == state)
0663 return;
0664
0665 perf_event_update_time(event);
0666
0667
0668
0669
0670 if ((event->state < 0) ^ (state < 0))
0671 perf_event_update_sibling_time(event);
0672
0673 WRITE_ONCE(event->state, state);
0674 }
0675
0676
0677
0678
0679
0680 #define __store_release(ptr, val) \
0681 do { \
0682 barrier(); \
0683 WRITE_ONCE(*(ptr), (val)); \
0684 } while (0)
0685
0686 #define __load_acquire(ptr) \
0687 ({ \
0688 __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \
0689 barrier(); \
0690 ___p; \
0691 })
0692
0693 #ifdef CONFIG_CGROUP_PERF
0694
0695 static inline bool
0696 perf_cgroup_match(struct perf_event *event)
0697 {
0698 struct perf_event_context *ctx = event->ctx;
0699 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
0700
0701
0702 if (!event->cgrp)
0703 return true;
0704
0705
0706 if (!cpuctx->cgrp)
0707 return false;
0708
0709
0710
0711
0712
0713
0714
0715 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
0716 event->cgrp->css.cgroup);
0717 }
0718
0719 static inline void perf_detach_cgroup(struct perf_event *event)
0720 {
0721 css_put(&event->cgrp->css);
0722 event->cgrp = NULL;
0723 }
0724
0725 static inline int is_cgroup_event(struct perf_event *event)
0726 {
0727 return event->cgrp != NULL;
0728 }
0729
0730 static inline u64 perf_cgroup_event_time(struct perf_event *event)
0731 {
0732 struct perf_cgroup_info *t;
0733
0734 t = per_cpu_ptr(event->cgrp->info, event->cpu);
0735 return t->time;
0736 }
0737
0738 static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
0739 {
0740 struct perf_cgroup_info *t;
0741
0742 t = per_cpu_ptr(event->cgrp->info, event->cpu);
0743 if (!__load_acquire(&t->active))
0744 return t->time;
0745 now += READ_ONCE(t->timeoffset);
0746 return now;
0747 }
0748
0749 static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
0750 {
0751 if (adv)
0752 info->time += now - info->timestamp;
0753 info->timestamp = now;
0754
0755
0756
0757 WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
0758 }
0759
0760 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
0761 {
0762 struct perf_cgroup *cgrp = cpuctx->cgrp;
0763 struct cgroup_subsys_state *css;
0764 struct perf_cgroup_info *info;
0765
0766 if (cgrp) {
0767 u64 now = perf_clock();
0768
0769 for (css = &cgrp->css; css; css = css->parent) {
0770 cgrp = container_of(css, struct perf_cgroup, css);
0771 info = this_cpu_ptr(cgrp->info);
0772
0773 __update_cgrp_time(info, now, true);
0774 if (final)
0775 __store_release(&info->active, 0);
0776 }
0777 }
0778 }
0779
0780 static inline void update_cgrp_time_from_event(struct perf_event *event)
0781 {
0782 struct perf_cgroup_info *info;
0783
0784
0785
0786
0787
0788 if (!is_cgroup_event(event))
0789 return;
0790
0791 info = this_cpu_ptr(event->cgrp->info);
0792
0793
0794
0795 if (info->active)
0796 __update_cgrp_time(info, perf_clock(), true);
0797 }
0798
0799 static inline void
0800 perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
0801 {
0802 struct perf_event_context *ctx = &cpuctx->ctx;
0803 struct perf_cgroup *cgrp = cpuctx->cgrp;
0804 struct perf_cgroup_info *info;
0805 struct cgroup_subsys_state *css;
0806
0807
0808
0809
0810
0811
0812 if (!cgrp)
0813 return;
0814
0815 WARN_ON_ONCE(!ctx->nr_cgroups);
0816
0817 for (css = &cgrp->css; css; css = css->parent) {
0818 cgrp = container_of(css, struct perf_cgroup, css);
0819 info = this_cpu_ptr(cgrp->info);
0820 __update_cgrp_time(info, ctx->timestamp, false);
0821 __store_release(&info->active, 1);
0822 }
0823 }
0824
0825 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
0826
0827
0828
0829
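/*
 * perf_cgroup_switch - reschedule cgroup events when the current task's
 * cgroup changes.  With IRQs disabled, walk this CPU's cgrp_cpuctx_list and,
 * for every context whose cached cgroup differs from the task's, schedule all
 * events out, update cpuctx->cgrp and schedule them back in.
 */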
0830 static void perf_cgroup_switch(struct task_struct *task)
0831 {
0832 struct perf_cgroup *cgrp;
0833 struct perf_cpu_context *cpuctx, *tmp;
0834 struct list_head *list;
0835 unsigned long flags;
0836
0837
0838
0839
0840
0841 local_irq_save(flags);
0842
0843 cgrp = perf_cgroup_from_task(task, NULL);
0844
0845 list = this_cpu_ptr(&cgrp_cpuctx_list);
0846 list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
0847 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
0848 if (READ_ONCE(cpuctx->cgrp) == cgrp)
0849 continue;
0850
0851 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
0852 perf_pmu_disable(cpuctx->ctx.pmu);
0853
0854 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
0855
0856
0857
0858
0859
0860 cpuctx->cgrp = cgrp;
0861
0862
0863
0864
0865
0866 cpu_ctx_sched_in(cpuctx, EVENT_ALL);
0867
0868 perf_pmu_enable(cpuctx->ctx.pmu);
0869 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
0870 }
0871
0872 local_irq_restore(flags);
0873 }
0874
0875 static int perf_cgroup_ensure_storage(struct perf_event *event,
0876 struct cgroup_subsys_state *css)
0877 {
0878 struct perf_cpu_context *cpuctx;
0879 struct perf_event **storage;
0880 int cpu, heap_size, ret = 0;
0881
0882
0883
0884
0885
0886 for (heap_size = 1; css; css = css->parent)
0887 heap_size++;
0888
0889 for_each_possible_cpu(cpu) {
0890 cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
0891 if (heap_size <= cpuctx->heap_size)
0892 continue;
0893
0894 storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
0895 GFP_KERNEL, cpu_to_node(cpu));
0896 if (!storage) {
0897 ret = -ENOMEM;
0898 break;
0899 }
0900
0901 raw_spin_lock_irq(&cpuctx->ctx.lock);
0902 if (cpuctx->heap_size < heap_size) {
0903 swap(cpuctx->heap, storage);
0904 if (storage == cpuctx->heap_default)
0905 storage = NULL;
0906 cpuctx->heap_size = heap_size;
0907 }
0908 raw_spin_unlock_irq(&cpuctx->ctx.lock);
0909
0910 kfree(storage);
0911 }
0912
0913 return ret;
0914 }
0915
0916 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
0917 struct perf_event_attr *attr,
0918 struct perf_event *group_leader)
0919 {
0920 struct perf_cgroup *cgrp;
0921 struct cgroup_subsys_state *css;
0922 struct fd f = fdget(fd);
0923 int ret = 0;
0924
0925 if (!f.file)
0926 return -EBADF;
0927
0928 css = css_tryget_online_from_dir(f.file->f_path.dentry,
0929 &perf_event_cgrp_subsys);
0930 if (IS_ERR(css)) {
0931 ret = PTR_ERR(css);
0932 goto out;
0933 }
0934
0935 ret = perf_cgroup_ensure_storage(event, css);
0936 if (ret)
0937 goto out;
0938
0939 cgrp = container_of(css, struct perf_cgroup, css);
0940 event->cgrp = cgrp;
0941
0942
0943
0944
0945
0946
0947 if (group_leader && group_leader->cgrp != cgrp) {
0948 perf_detach_cgroup(event);
0949 ret = -EINVAL;
0950 }
0951 out:
0952 fdput(f);
0953 return ret;
0954 }
0955
0956 static inline void
0957 perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
0958 {
0959 struct perf_cpu_context *cpuctx;
0960
0961 if (!is_cgroup_event(event))
0962 return;
0963
0964
0965
0966
0967
0968 cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
0969
0970 if (ctx->nr_cgroups++)
0971 return;
0972
0973 cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
0974 list_add(&cpuctx->cgrp_cpuctx_entry,
0975 per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
0976 }
0977
0978 static inline void
0979 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
0980 {
0981 struct perf_cpu_context *cpuctx;
0982
0983 if (!is_cgroup_event(event))
0984 return;
0985
0986
0987
0988
0989
0990 cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
0991
0992 if (--ctx->nr_cgroups)
0993 return;
0994
0995 cpuctx->cgrp = NULL;
0996 list_del(&cpuctx->cgrp_cpuctx_entry);
0997 }
0998
0999 #else
1000
1001 static inline bool
1002 perf_cgroup_match(struct perf_event *event)
1003 {
1004 return true;
1005 }
1006
1007 static inline void perf_detach_cgroup(struct perf_event *event)
1008 {}
1009
1010 static inline int is_cgroup_event(struct perf_event *event)
1011 {
1012 return 0;
1013 }
1014
1015 static inline void update_cgrp_time_from_event(struct perf_event *event)
1016 {
1017 }
1018
1019 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
1020 bool final)
1021 {
1022 }
1023
1024 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1025 struct perf_event_attr *attr,
1026 struct perf_event *group_leader)
1027 {
1028 return -EINVAL;
1029 }
1030
1031 static inline void
1032 perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
1033 {
1034 }
1035
1036 static inline u64 perf_cgroup_event_time(struct perf_event *event)
1037 {
1038 return 0;
1039 }
1040
1041 static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
1042 {
1043 return 0;
1044 }
1045
1046 static inline void
1047 perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
1048 {
1049 }
1050
1051 static inline void
1052 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1053 {
1054 }
1055
1056 static void perf_cgroup_switch(struct task_struct *task)
1057 {
1058 }
1059 #endif
1060
1061
1062
1063
1064
1065 #define PERF_CPU_HRTIMER (1000 / HZ)
1066
1067
1068
1069 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1070 {
1071 struct perf_cpu_context *cpuctx;
1072 bool rotations;
1073
1074 lockdep_assert_irqs_disabled();
1075
1076 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1077 rotations = perf_rotate_context(cpuctx);
1078
1079 raw_spin_lock(&cpuctx->hrtimer_lock);
1080 if (rotations)
1081 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1082 else
1083 cpuctx->hrtimer_active = 0;
1084 raw_spin_unlock(&cpuctx->hrtimer_lock);
1085
1086 return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1087 }
1088
1089 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1090 {
1091 struct hrtimer *timer = &cpuctx->hrtimer;
1092 struct pmu *pmu = cpuctx->ctx.pmu;
1093 u64 interval;
1094
1095
1096 if (pmu->task_ctx_nr == perf_sw_context)
1097 return;
1098
1099
1100
1101
1102
1103 interval = pmu->hrtimer_interval_ms;
1104 if (interval < 1)
1105 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1106
1107 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1108
1109 raw_spin_lock_init(&cpuctx->hrtimer_lock);
1110 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
1111 timer->function = perf_mux_hrtimer_handler;
1112 }
1113
1114 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1115 {
1116 struct hrtimer *timer = &cpuctx->hrtimer;
1117 struct pmu *pmu = cpuctx->ctx.pmu;
1118 unsigned long flags;
1119
1120
1121 if (pmu->task_ctx_nr == perf_sw_context)
1122 return 0;
1123
1124 raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1125 if (!cpuctx->hrtimer_active) {
1126 cpuctx->hrtimer_active = 1;
1127 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1128 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
1129 }
1130 raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1131
1132 return 0;
1133 }
1134
1135 void perf_pmu_disable(struct pmu *pmu)
1136 {
1137 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1138 if (!(*count)++)
1139 pmu->pmu_disable(pmu);
1140 }
1141
1142 void perf_pmu_enable(struct pmu *pmu)
1143 {
1144 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1145 if (!--(*count))
1146 pmu->pmu_enable(pmu);
1147 }
1148
1149 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1150
1151
1152
1153
1154
1155
1156
1157 static void perf_event_ctx_activate(struct perf_event_context *ctx)
1158 {
1159 struct list_head *head = this_cpu_ptr(&active_ctx_list);
1160
1161 lockdep_assert_irqs_disabled();
1162
1163 WARN_ON(!list_empty(&ctx->active_ctx_list));
1164
1165 list_add(&ctx->active_ctx_list, head);
1166 }
1167
1168 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1169 {
1170 lockdep_assert_irqs_disabled();
1171
1172 WARN_ON(list_empty(&ctx->active_ctx_list));
1173
1174 list_del_init(&ctx->active_ctx_list);
1175 }
1176
1177 static void get_ctx(struct perf_event_context *ctx)
1178 {
1179 refcount_inc(&ctx->refcount);
1180 }
1181
1182 static void *alloc_task_ctx_data(struct pmu *pmu)
1183 {
1184 if (pmu->task_ctx_cache)
1185 return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
1186
1187 return NULL;
1188 }
1189
1190 static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
1191 {
1192 if (pmu->task_ctx_cache && task_ctx_data)
1193 kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
1194 }
1195
1196 static void free_ctx(struct rcu_head *head)
1197 {
1198 struct perf_event_context *ctx;
1199
1200 ctx = container_of(head, struct perf_event_context, rcu_head);
1201 free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
1202 kfree(ctx);
1203 }
1204
1205 static void put_ctx(struct perf_event_context *ctx)
1206 {
1207 if (refcount_dec_and_test(&ctx->refcount)) {
1208 if (ctx->parent_ctx)
1209 put_ctx(ctx->parent_ctx);
1210 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1211 put_task_struct(ctx->task);
1212 call_rcu(&ctx->rcu_head, free_ctx);
1213 }
1214 }
1215
1281
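/*
 * Because perf_event::ctx can change (sys_perf_event_open()'s move_group,
 * perf_pmu_migrate_context()), locking an event's context takes some care:
 * grab a reference on the ctx we observe, take ctx->mutex, and then verify
 * that event->ctx still points at that ctx, retrying from scratch otherwise.
 *
 * The @nesting argument lets callers take a parent and a child context's
 * mutex at the same time without upsetting lockdep.
 */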
1282 static struct perf_event_context *
1283 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1284 {
1285 struct perf_event_context *ctx;
1286
1287 again:
1288 rcu_read_lock();
1289 ctx = READ_ONCE(event->ctx);
1290 if (!refcount_inc_not_zero(&ctx->refcount)) {
1291 rcu_read_unlock();
1292 goto again;
1293 }
1294 rcu_read_unlock();
1295
1296 mutex_lock_nested(&ctx->mutex, nesting);
1297 if (event->ctx != ctx) {
1298 mutex_unlock(&ctx->mutex);
1299 put_ctx(ctx);
1300 goto again;
1301 }
1302
1303 return ctx;
1304 }
1305
1306 static inline struct perf_event_context *
1307 perf_event_ctx_lock(struct perf_event *event)
1308 {
1309 return perf_event_ctx_lock_nested(event, 0);
1310 }
1311
1312 static void perf_event_ctx_unlock(struct perf_event *event,
1313 struct perf_event_context *ctx)
1314 {
1315 mutex_unlock(&ctx->mutex);
1316 put_ctx(ctx);
1317 }
1318
1319
1320
1321
1322
1323
1324 static __must_check struct perf_event_context *
1325 unclone_ctx(struct perf_event_context *ctx)
1326 {
1327 struct perf_event_context *parent_ctx = ctx->parent_ctx;
1328
1329 lockdep_assert_held(&ctx->lock);
1330
1331 if (parent_ctx)
1332 ctx->parent_ctx = NULL;
1333 ctx->generation++;
1334
1335 return parent_ctx;
1336 }
1337
1338 static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1339 enum pid_type type)
1340 {
1341 u32 nr;
1342
1343
1344
1345 if (event->parent)
1346 event = event->parent;
1347
1348 nr = __task_pid_nr_ns(p, type, event->ns);
1349
1350 if (!nr && !pid_alive(p))
1351 nr = -1;
1352 return nr;
1353 }
1354
1355 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1356 {
1357 return perf_event_pid_type(event, p, PIDTYPE_TGID);
1358 }
1359
1360 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1361 {
1362 return perf_event_pid_type(event, p, PIDTYPE_PID);
1363 }
1364
1365
1366
1367
1368
1369 static u64 primary_event_id(struct perf_event *event)
1370 {
1371 u64 id = event->id;
1372
1373 if (event->parent)
1374 id = event->parent->id;
1375
1376 return id;
1377 }
1378
1379
1380
1381
1382
1383
1384
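/*
 * Find and lock the task context for (@task, @ctxn).
 *
 * Until ctx->lock is held the context may be swapped to another task, so
 * re-check task->perf_event_ctxp[ctxn] after locking and retry on a mismatch.
 * On success the context is returned with an extra reference, IRQs disabled
 * and ctx->lock held (the caller saves the flags).
 */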
1385 static struct perf_event_context *
1386 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1387 {
1388 struct perf_event_context *ctx;
1389
1390 retry:
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400 local_irq_save(*flags);
1401 rcu_read_lock();
1402 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1403 if (ctx) {
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414 raw_spin_lock(&ctx->lock);
1415 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1416 raw_spin_unlock(&ctx->lock);
1417 rcu_read_unlock();
1418 local_irq_restore(*flags);
1419 goto retry;
1420 }
1421
1422 if (ctx->task == TASK_TOMBSTONE ||
1423 !refcount_inc_not_zero(&ctx->refcount)) {
1424 raw_spin_unlock(&ctx->lock);
1425 ctx = NULL;
1426 } else {
1427 WARN_ON_ONCE(ctx->task != task);
1428 }
1429 }
1430 rcu_read_unlock();
1431 if (!ctx)
1432 local_irq_restore(*flags);
1433 return ctx;
1434 }
1435
1436
1437
1438
1439
1440
1441 static struct perf_event_context *
1442 perf_pin_task_context(struct task_struct *task, int ctxn)
1443 {
1444 struct perf_event_context *ctx;
1445 unsigned long flags;
1446
1447 ctx = perf_lock_task_context(task, ctxn, &flags);
1448 if (ctx) {
1449 ++ctx->pin_count;
1450 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1451 }
1452 return ctx;
1453 }
1454
1455 static void perf_unpin_context(struct perf_event_context *ctx)
1456 {
1457 unsigned long flags;
1458
1459 raw_spin_lock_irqsave(&ctx->lock, flags);
1460 --ctx->pin_count;
1461 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1462 }
1463
1464
1465
1466
1467 static void __update_context_time(struct perf_event_context *ctx, bool adv)
1468 {
1469 u64 now = perf_clock();
1470
1471 if (adv)
1472 ctx->time += now - ctx->timestamp;
1473 ctx->timestamp = now;
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484 WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
1485 }
1486
1487 static void update_context_time(struct perf_event_context *ctx)
1488 {
1489 __update_context_time(ctx, true);
1490 }
1491
1492 static u64 perf_event_time(struct perf_event *event)
1493 {
1494 struct perf_event_context *ctx = event->ctx;
1495
1496 if (unlikely(!ctx))
1497 return 0;
1498
1499 if (is_cgroup_event(event))
1500 return perf_cgroup_event_time(event);
1501
1502 return ctx->time;
1503 }
1504
1505 static u64 perf_event_time_now(struct perf_event *event, u64 now)
1506 {
1507 struct perf_event_context *ctx = event->ctx;
1508
1509 if (unlikely(!ctx))
1510 return 0;
1511
1512 if (is_cgroup_event(event))
1513 return perf_cgroup_event_time_now(event, now);
1514
1515 if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
1516 return ctx->time;
1517
1518 now += READ_ONCE(ctx->timeoffset);
1519 return now;
1520 }
1521
1522 static enum event_type_t get_event_type(struct perf_event *event)
1523 {
1524 struct perf_event_context *ctx = event->ctx;
1525 enum event_type_t event_type;
1526
1527 lockdep_assert_held(&ctx->lock);
1528
1529
1530
1531
1532
1533 if (event->group_leader != event)
1534 event = event->group_leader;
1535
1536 event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1537 if (!ctx->task)
1538 event_type |= EVENT_CPU;
1539
1540 return event_type;
1541 }
1542
1543
1544
1545
1546 static void init_event_group(struct perf_event *event)
1547 {
1548 RB_CLEAR_NODE(&event->group_node);
1549 event->group_index = 0;
1550 }
1551
1552
1553
1554
1555
1556 static struct perf_event_groups *
1557 get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1558 {
1559 if (event->attr.pinned)
1560 return &ctx->pinned_groups;
1561 else
1562 return &ctx->flexible_groups;
1563 }
1564
1565
1566
1567
1568 static void perf_event_groups_init(struct perf_event_groups *groups)
1569 {
1570 groups->tree = RB_ROOT;
1571 groups->index = 0;
1572 }
1573
1574 static inline struct cgroup *event_cgroup(const struct perf_event *event)
1575 {
1576 struct cgroup *cgroup = NULL;
1577
1578 #ifdef CONFIG_CGROUP_PERF
1579 if (event->cgrp)
1580 cgroup = event->cgrp->css.cgroup;
1581 #endif
1582
1583 return cgroup;
1584 }
1585
1586
1587
1588
1589
1590
1591
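/*
 * Compare function for event groups: orders events by CPU, then (with
 * CONFIG_CGROUP_PERF) by cgroup id, and finally by group_index, so that
 * insertion order breaks ties within the RB-tree.
 */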
1592 static __always_inline int
1593 perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
1594 const u64 left_group_index, const struct perf_event *right)
1595 {
1596 if (left_cpu < right->cpu)
1597 return -1;
1598 if (left_cpu > right->cpu)
1599 return 1;
1600
1601 #ifdef CONFIG_CGROUP_PERF
1602 {
1603 const struct cgroup *right_cgroup = event_cgroup(right);
1604
1605 if (left_cgroup != right_cgroup) {
1606 if (!left_cgroup) {
1607
1608
1609
1610
1611 return -1;
1612 }
1613 if (!right_cgroup) {
1614
1615
1616
1617
1618 return 1;
1619 }
1620
1621 if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
1622 return -1;
1623
1624 return 1;
1625 }
1626 }
1627 #endif
1628
1629 if (left_group_index < right->group_index)
1630 return -1;
1631 if (left_group_index > right->group_index)
1632 return 1;
1633
1634 return 0;
1635 }
1636
1637 #define __node_2_pe(node) \
1638 rb_entry((node), struct perf_event, group_node)
1639
1640 static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
1641 {
1642 struct perf_event *e = __node_2_pe(a);
1643 return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
1644 __node_2_pe(b)) < 0;
1645 }
1646
1647 struct __group_key {
1648 int cpu;
1649 struct cgroup *cgroup;
1650 };
1651
1652 static inline int __group_cmp(const void *key, const struct rb_node *node)
1653 {
1654 const struct __group_key *a = key;
1655 const struct perf_event *b = __node_2_pe(node);
1656
1657
1658 return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
1659 }
1660
1661
1662
1663
1664
1665
1666 static void
1667 perf_event_groups_insert(struct perf_event_groups *groups,
1668 struct perf_event *event)
1669 {
1670 event->group_index = ++groups->index;
1671
1672 rb_add(&event->group_node, &groups->tree, __group_less);
1673 }
1674
1675
1676
1677
1678 static void
1679 add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1680 {
1681 struct perf_event_groups *groups;
1682
1683 groups = get_event_groups(event, ctx);
1684 perf_event_groups_insert(groups, event);
1685 }
1686
1687
1688
1689
1690 static void
1691 perf_event_groups_delete(struct perf_event_groups *groups,
1692 struct perf_event *event)
1693 {
1694 WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1695 RB_EMPTY_ROOT(&groups->tree));
1696
1697 rb_erase(&event->group_node, &groups->tree);
1698 init_event_group(event);
1699 }
1700
1701
1702
1703
1704 static void
1705 del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1706 {
1707 struct perf_event_groups *groups;
1708
1709 groups = get_event_groups(event, ctx);
1710 perf_event_groups_delete(groups, event);
1711 }
1712
1713
1714
1715
1716 static struct perf_event *
1717 perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1718 struct cgroup *cgrp)
1719 {
1720 struct __group_key key = {
1721 .cpu = cpu,
1722 .cgroup = cgrp,
1723 };
1724 struct rb_node *node;
1725
1726 node = rb_find_first(&key, &groups->tree, __group_cmp);
1727 if (node)
1728 return __node_2_pe(node);
1729
1730 return NULL;
1731 }
1732
1733
1734
1735
1736 static struct perf_event *
1737 perf_event_groups_next(struct perf_event *event)
1738 {
1739 struct __group_key key = {
1740 .cpu = event->cpu,
1741 .cgroup = event_cgroup(event),
1742 };
1743 struct rb_node *next;
1744
1745 next = rb_next_match(&key, &event->group_node, __group_cmp);
1746 if (next)
1747 return __node_2_pe(next);
1748
1749 return NULL;
1750 }
1751
1752
1753
1754
1755 #define perf_event_groups_for_each(event, groups) \
1756 for (event = rb_entry_safe(rb_first(&((groups)->tree)), \
1757 typeof(*event), group_node); event; \
1758 event = rb_entry_safe(rb_next(&event->group_node), \
1759 typeof(*event), group_node))
1760
1761
1762
1763
1764
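/*
 * Add an event to the context's event list; for group leaders also insert it
 * into the pinned/flexible group tree.  Must be called with ctx->lock held.
 */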
1765 static void
1766 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1767 {
1768 lockdep_assert_held(&ctx->lock);
1769
1770 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1771 event->attach_state |= PERF_ATTACH_CONTEXT;
1772
1773 event->tstamp = perf_event_time(event);
1774
1775
1776
1777
1778
1779
1780 if (event->group_leader == event) {
1781 event->group_caps = event->event_caps;
1782 add_event_to_groups(event, ctx);
1783 }
1784
1785 list_add_rcu(&event->event_entry, &ctx->event_list);
1786 ctx->nr_events++;
1787 if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
1788 ctx->nr_user++;
1789 if (event->attr.inherit_stat)
1790 ctx->nr_stat++;
1791
1792 if (event->state > PERF_EVENT_STATE_OFF)
1793 perf_cgroup_event_enable(event, ctx);
1794
1795 ctx->generation++;
1796 }
1797
1798
1799
1800
1801 static inline void perf_event__state_init(struct perf_event *event)
1802 {
1803 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1804 PERF_EVENT_STATE_INACTIVE;
1805 }
1806
1807 static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1808 {
1809 int entry = sizeof(u64);
1810 int size = 0;
1811 int nr = 1;
1812
1813 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1814 size += sizeof(u64);
1815
1816 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1817 size += sizeof(u64);
1818
1819 if (event->attr.read_format & PERF_FORMAT_ID)
1820 entry += sizeof(u64);
1821
1822 if (event->attr.read_format & PERF_FORMAT_LOST)
1823 entry += sizeof(u64);
1824
1825 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1826 nr += nr_siblings;
1827 size += sizeof(u64);
1828 }
1829
1830 size += entry * nr;
1831 event->read_size = size;
1832 }
1833
1834 static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1835 {
1836 struct perf_sample_data *data;
1837 u16 size = 0;
1838
1839 if (sample_type & PERF_SAMPLE_IP)
1840 size += sizeof(data->ip);
1841
1842 if (sample_type & PERF_SAMPLE_ADDR)
1843 size += sizeof(data->addr);
1844
1845 if (sample_type & PERF_SAMPLE_PERIOD)
1846 size += sizeof(data->period);
1847
1848 if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
1849 size += sizeof(data->weight.full);
1850
1851 if (sample_type & PERF_SAMPLE_READ)
1852 size += event->read_size;
1853
1854 if (sample_type & PERF_SAMPLE_DATA_SRC)
1855 size += sizeof(data->data_src.val);
1856
1857 if (sample_type & PERF_SAMPLE_TRANSACTION)
1858 size += sizeof(data->txn);
1859
1860 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1861 size += sizeof(data->phys_addr);
1862
1863 if (sample_type & PERF_SAMPLE_CGROUP)
1864 size += sizeof(data->cgroup);
1865
1866 if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
1867 size += sizeof(data->data_page_size);
1868
1869 if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
1870 size += sizeof(data->code_page_size);
1871
1872 event->header_size = size;
1873 }
1874
1875
1876
1877
1878
1879 static void perf_event__header_size(struct perf_event *event)
1880 {
1881 __perf_event_read_size(event,
1882 event->group_leader->nr_siblings);
1883 __perf_event_header_size(event, event->attr.sample_type);
1884 }
1885
1886 static void perf_event__id_header_size(struct perf_event *event)
1887 {
1888 struct perf_sample_data *data;
1889 u64 sample_type = event->attr.sample_type;
1890 u16 size = 0;
1891
1892 if (sample_type & PERF_SAMPLE_TID)
1893 size += sizeof(data->tid_entry);
1894
1895 if (sample_type & PERF_SAMPLE_TIME)
1896 size += sizeof(data->time);
1897
1898 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1899 size += sizeof(data->id);
1900
1901 if (sample_type & PERF_SAMPLE_ID)
1902 size += sizeof(data->id);
1903
1904 if (sample_type & PERF_SAMPLE_STREAM_ID)
1905 size += sizeof(data->stream_id);
1906
1907 if (sample_type & PERF_SAMPLE_CPU)
1908 size += sizeof(data->cpu_entry);
1909
1910 event->id_header_size = size;
1911 }
1912
1913 static bool perf_event_validate_size(struct perf_event *event)
1914 {
1915
1916
1917
1918
1919 __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1920 __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1921 perf_event__id_header_size(event);
1922
1923
1924
1925
1926
1927 if (event->read_size + event->header_size +
1928 event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1929 return false;
1930
1931 return true;
1932 }
1933
1934 static void perf_group_attach(struct perf_event *event)
1935 {
1936 struct perf_event *group_leader = event->group_leader, *pos;
1937
1938 lockdep_assert_held(&event->ctx->lock);
1939
1940
1941
1942
1943 if (event->attach_state & PERF_ATTACH_GROUP)
1944 return;
1945
1946 event->attach_state |= PERF_ATTACH_GROUP;
1947
1948 if (group_leader == event)
1949 return;
1950
1951 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1952
1953 group_leader->group_caps &= event->event_caps;
1954
1955 list_add_tail(&event->sibling_list, &group_leader->sibling_list);
1956 group_leader->nr_siblings++;
1957
1958 perf_event__header_size(group_leader);
1959
1960 for_each_sibling_event(pos, group_leader)
1961 perf_event__header_size(pos);
1962 }
1963
1964
1965
1966
1967
1968 static void
1969 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1970 {
1971 WARN_ON_ONCE(event->ctx != ctx);
1972 lockdep_assert_held(&ctx->lock);
1973
1974
1975
1976
1977 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1978 return;
1979
1980 event->attach_state &= ~PERF_ATTACH_CONTEXT;
1981
1982 ctx->nr_events--;
1983 if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
1984 ctx->nr_user--;
1985 if (event->attr.inherit_stat)
1986 ctx->nr_stat--;
1987
1988 list_del_rcu(&event->event_entry);
1989
1990 if (event->group_leader == event)
1991 del_event_from_groups(event, ctx);
1992
1993
1994
1995
1996
1997
1998
1999
2000 if (event->state > PERF_EVENT_STATE_OFF) {
2001 perf_cgroup_event_disable(event, ctx);
2002 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2003 }
2004
2005 ctx->generation++;
2006 }
2007
2008 static int
2009 perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2010 {
2011 if (!has_aux(aux_event))
2012 return 0;
2013
2014 if (!event->pmu->aux_output_match)
2015 return 0;
2016
2017 return event->pmu->aux_output_match(aux_event);
2018 }
2019
2020 static void put_event(struct perf_event *event);
2021 static void event_sched_out(struct perf_event *event,
2022 struct perf_cpu_context *cpuctx,
2023 struct perf_event_context *ctx);
2024
2025 static void perf_put_aux_event(struct perf_event *event)
2026 {
2027 struct perf_event_context *ctx = event->ctx;
2028 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2029 struct perf_event *iter;
2030
2031
2032
2033
2034 if (event->aux_event) {
2035 iter = event->aux_event;
2036 event->aux_event = NULL;
2037 put_event(iter);
2038 return;
2039 }
2040
2041
2042
2043
2044
2045 for_each_sibling_event(iter, event->group_leader) {
2046 if (iter->aux_event != event)
2047 continue;
2048
2049 iter->aux_event = NULL;
2050 put_event(event);
2051
2052
2053
2054
2055
2056
2057 event_sched_out(iter, cpuctx, ctx);
2058 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2059 }
2060 }
2061
2062 static bool perf_need_aux_event(struct perf_event *event)
2063 {
2064 return !!event->attr.aux_output || !!event->attr.aux_sample_size;
2065 }
2066
2067 static int perf_get_aux_event(struct perf_event *event,
2068 struct perf_event *group_leader)
2069 {
2070
2071
2072
2073
2074
2075
2076 if (!group_leader)
2077 return 0;
2078
2079
2080
2081
2082 if (event->attr.aux_output && event->attr.aux_sample_size)
2083 return 0;
2084
2085 if (event->attr.aux_output &&
2086 !perf_aux_output_match(event, group_leader))
2087 return 0;
2088
2089 if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2090 return 0;
2091
2092 if (!atomic_long_inc_not_zero(&group_leader->refcount))
2093 return 0;
2094
2095
2096
2097
2098
2099
2100
2101 event->aux_event = group_leader;
2102
2103 return 1;
2104 }
2105
2106 static inline struct list_head *get_event_list(struct perf_event *event)
2107 {
2108 struct perf_event_context *ctx = event->ctx;
2109 return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
2110 }
2111
2112
2113
2114
2115
2116
2117
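/*
 * Events with PERF_EV_CAP_SIBLING must be part of a group and cannot exist on
 * their own; when they lose their group they are scheduled out and moved to
 * the ERROR state (see the PERF_EV_CAP_SIBLING handling in
 * perf_group_detach()).
 */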
2118 static inline void perf_remove_sibling_event(struct perf_event *event)
2119 {
2120 struct perf_event_context *ctx = event->ctx;
2121 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2122
2123 event_sched_out(event, cpuctx, ctx);
2124 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2125 }
2126
2127 static void perf_group_detach(struct perf_event *event)
2128 {
2129 struct perf_event *leader = event->group_leader;
2130 struct perf_event *sibling, *tmp;
2131 struct perf_event_context *ctx = event->ctx;
2132
2133 lockdep_assert_held(&ctx->lock);
2134
2135
2136
2137
2138 if (!(event->attach_state & PERF_ATTACH_GROUP))
2139 return;
2140
2141 event->attach_state &= ~PERF_ATTACH_GROUP;
2142
2143 perf_put_aux_event(event);
2144
2145
2146
2147
2148 if (leader != event) {
2149 list_del_init(&event->sibling_list);
2150 event->group_leader->nr_siblings--;
2151 goto out;
2152 }
2153
2154
2155
2156
2157
2158
2159 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
2160
2161 if (sibling->event_caps & PERF_EV_CAP_SIBLING)
2162 perf_remove_sibling_event(sibling);
2163
2164 sibling->group_leader = sibling;
2165 list_del_init(&sibling->sibling_list);
2166
2167
2168 sibling->group_caps = event->group_caps;
2169
2170 if (!RB_EMPTY_NODE(&event->group_node)) {
2171 add_event_to_groups(sibling, event->ctx);
2172
2173 if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2174 list_add_tail(&sibling->active_list, get_event_list(sibling));
2175 }
2176
2177 WARN_ON_ONCE(sibling->ctx != event->ctx);
2178 }
2179
2180 out:
2181 for_each_sibling_event(tmp, leader)
2182 perf_event__header_size(tmp);
2183
2184 perf_event__header_size(leader);
2185 }
2186
2187 static void sync_child_event(struct perf_event *child_event);
2188
2189 static void perf_child_detach(struct perf_event *event)
2190 {
2191 struct perf_event *parent_event = event->parent;
2192
2193 if (!(event->attach_state & PERF_ATTACH_CHILD))
2194 return;
2195
2196 event->attach_state &= ~PERF_ATTACH_CHILD;
2197
2198 if (WARN_ON_ONCE(!parent_event))
2199 return;
2200
2201 lockdep_assert_held(&parent_event->child_mutex);
2202
2203 sync_child_event(event);
2204 list_del_init(&event->child_list);
2205 }
2206
2207 static bool is_orphaned_event(struct perf_event *event)
2208 {
2209 return event->state == PERF_EVENT_STATE_DEAD;
2210 }
2211
2212 static inline int __pmu_filter_match(struct perf_event *event)
2213 {
2214 struct pmu *pmu = event->pmu;
2215 return pmu->filter_match ? pmu->filter_match(event) : 1;
2216 }
2217
2218
2219
2220
2221
2222
2223
2224 static inline int pmu_filter_match(struct perf_event *event)
2225 {
2226 struct perf_event *sibling;
2227
2228 if (!__pmu_filter_match(event))
2229 return 0;
2230
2231 for_each_sibling_event(sibling, event) {
2232 if (!__pmu_filter_match(sibling))
2233 return 0;
2234 }
2235
2236 return 1;
2237 }
2238
2239 static inline int
2240 event_filter_match(struct perf_event *event)
2241 {
2242 return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
2243 perf_cgroup_match(event) && pmu_filter_match(event);
2244 }
2245
2246 static void
2247 event_sched_out(struct perf_event *event,
2248 struct perf_cpu_context *cpuctx,
2249 struct perf_event_context *ctx)
2250 {
2251 enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
2252
2253 WARN_ON_ONCE(event->ctx != ctx);
2254 lockdep_assert_held(&ctx->lock);
2255
2256 if (event->state != PERF_EVENT_STATE_ACTIVE)
2257 return;
2258
2259
2260
2261
2262
2263
2264 list_del_init(&event->active_list);
2265
2266 perf_pmu_disable(event->pmu);
2267
2268 event->pmu->del(event, 0);
2269 event->oncpu = -1;
2270
2271 if (READ_ONCE(event->pending_disable) >= 0) {
2272 WRITE_ONCE(event->pending_disable, -1);
2273 perf_cgroup_event_disable(event, ctx);
2274 state = PERF_EVENT_STATE_OFF;
2275 }
2276 perf_event_set_state(event, state);
2277
2278 if (!is_software_event(event))
2279 cpuctx->active_oncpu--;
2280 if (!--ctx->nr_active)
2281 perf_event_ctx_deactivate(ctx);
2282 if (event->attr.freq && event->attr.sample_freq)
2283 ctx->nr_freq--;
2284 if (event->attr.exclusive || !cpuctx->active_oncpu)
2285 cpuctx->exclusive = 0;
2286
2287 perf_pmu_enable(event->pmu);
2288 }
2289
2290 static void
2291 group_sched_out(struct perf_event *group_event,
2292 struct perf_cpu_context *cpuctx,
2293 struct perf_event_context *ctx)
2294 {
2295 struct perf_event *event;
2296
2297 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2298 return;
2299
2300 perf_pmu_disable(ctx->pmu);
2301
2302 event_sched_out(group_event, cpuctx, ctx);
2303
2304
2305
2306
2307 for_each_sibling_event(event, group_event)
2308 event_sched_out(event, cpuctx, ctx);
2309
2310 perf_pmu_enable(ctx->pmu);
2311 }
2312
2313 #define DETACH_GROUP 0x01UL
2314 #define DETACH_CHILD 0x02UL
2315
2316
2317
2318
2319
2320
2321
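/*
 * Cross-CPU call to remove a performance event: schedule the event out on the
 * hardware level, detach it from its group and/or parent when requested via
 * DETACH_GROUP/DETACH_CHILD, and take it off the context's lists.  If this
 * was the last event, the (task) context is deactivated as well.
 */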
2322 static void
2323 __perf_remove_from_context(struct perf_event *event,
2324 struct perf_cpu_context *cpuctx,
2325 struct perf_event_context *ctx,
2326 void *info)
2327 {
2328 unsigned long flags = (unsigned long)info;
2329
2330 if (ctx->is_active & EVENT_TIME) {
2331 update_context_time(ctx);
2332 update_cgrp_time_from_cpuctx(cpuctx, false);
2333 }
2334
2335 event_sched_out(event, cpuctx, ctx);
2336 if (flags & DETACH_GROUP)
2337 perf_group_detach(event);
2338 if (flags & DETACH_CHILD)
2339 perf_child_detach(event);
2340 list_del_event(event, ctx);
2341
2342 if (!ctx->nr_events && ctx->is_active) {
2343 if (ctx == &cpuctx->ctx)
2344 update_cgrp_time_from_cpuctx(cpuctx, true);
2345
2346 ctx->is_active = 0;
2347 ctx->rotate_necessary = 0;
2348 if (ctx->task) {
2349 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2350 cpuctx->task_ctx = NULL;
2351 }
2352 }
2353 }
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365 static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2366 {
2367 struct perf_event_context *ctx = event->ctx;
2368
2369 lockdep_assert_held(&ctx->mutex);
2370
2371
2372
2373
2374
2375
2376 raw_spin_lock_irq(&ctx->lock);
2377
2378
2379
2380
2381 if (!ctx->is_active && !is_cgroup_event(event)) {
2382 __perf_remove_from_context(event, __get_cpu_context(ctx),
2383 ctx, (void *)flags);
2384 raw_spin_unlock_irq(&ctx->lock);
2385 return;
2386 }
2387 raw_spin_unlock_irq(&ctx->lock);
2388
2389 event_function_call(event, __perf_remove_from_context, (void *)flags);
2390 }
2391
2392
2393
2394
2395 static void __perf_event_disable(struct perf_event *event,
2396 struct perf_cpu_context *cpuctx,
2397 struct perf_event_context *ctx,
2398 void *info)
2399 {
2400 if (event->state < PERF_EVENT_STATE_INACTIVE)
2401 return;
2402
2403 if (ctx->is_active & EVENT_TIME) {
2404 update_context_time(ctx);
2405 update_cgrp_time_from_event(event);
2406 }
2407
2408 if (event == event->group_leader)
2409 group_sched_out(event, cpuctx, ctx);
2410 else
2411 event_sched_out(event, cpuctx, ctx);
2412
2413 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2414 perf_cgroup_event_disable(event, ctx);
2415 }
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431 static void _perf_event_disable(struct perf_event *event)
2432 {
2433 struct perf_event_context *ctx = event->ctx;
2434
2435 raw_spin_lock_irq(&ctx->lock);
2436 if (event->state <= PERF_EVENT_STATE_OFF) {
2437 raw_spin_unlock_irq(&ctx->lock);
2438 return;
2439 }
2440 raw_spin_unlock_irq(&ctx->lock);
2441
2442 event_function_call(event, __perf_event_disable, NULL);
2443 }
2444
2445 void perf_event_disable_local(struct perf_event *event)
2446 {
2447 event_function_local(event, __perf_event_disable, NULL);
2448 }
2449
2450
2451
2452
2453
2454 void perf_event_disable(struct perf_event *event)
2455 {
2456 struct perf_event_context *ctx;
2457
2458 ctx = perf_event_ctx_lock(event);
2459 _perf_event_disable(event);
2460 perf_event_ctx_unlock(event, ctx);
2461 }
2462 EXPORT_SYMBOL_GPL(perf_event_disable);
2463
2464 void perf_event_disable_inatomic(struct perf_event *event)
2465 {
2466 WRITE_ONCE(event->pending_disable, smp_processor_id());
2467
2468 irq_work_queue(&event->pending);
2469 }
2470
2471 #define MAX_INTERRUPTS (~0ULL)
2472
2473 static void perf_log_throttle(struct perf_event *event, int enable);
2474 static void perf_log_itrace_start(struct perf_event *event);
2475
2476 static int
2477 event_sched_in(struct perf_event *event,
2478 struct perf_cpu_context *cpuctx,
2479 struct perf_event_context *ctx)
2480 {
2481 int ret = 0;
2482
2483 WARN_ON_ONCE(event->ctx != ctx);
2484
2485 lockdep_assert_held(&ctx->lock);
2486
2487 if (event->state <= PERF_EVENT_STATE_OFF)
2488 return 0;
2489
2490 WRITE_ONCE(event->oncpu, smp_processor_id());
2491
2492
2493
2494
2495
2496 smp_wmb();
2497 perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2498
2499
2500
2501
2502
2503
2504 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2505 perf_log_throttle(event, 1);
2506 event->hw.interrupts = 0;
2507 }
2508
2509 perf_pmu_disable(event->pmu);
2510
2511 perf_log_itrace_start(event);
2512
2513 if (event->pmu->add(event, PERF_EF_START)) {
2514 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2515 event->oncpu = -1;
2516 ret = -EAGAIN;
2517 goto out;
2518 }
2519
2520 if (!is_software_event(event))
2521 cpuctx->active_oncpu++;
2522 if (!ctx->nr_active++)
2523 perf_event_ctx_activate(ctx);
2524 if (event->attr.freq && event->attr.sample_freq)
2525 ctx->nr_freq++;
2526
2527 if (event->attr.exclusive)
2528 cpuctx->exclusive = 1;
2529
2530 out:
2531 perf_pmu_enable(event->pmu);
2532
2533 return ret;
2534 }
2535
2536 static int
2537 group_sched_in(struct perf_event *group_event,
2538 struct perf_cpu_context *cpuctx,
2539 struct perf_event_context *ctx)
2540 {
2541 struct perf_event *event, *partial_group = NULL;
2542 struct pmu *pmu = ctx->pmu;
2543
2544 if (group_event->state == PERF_EVENT_STATE_OFF)
2545 return 0;
2546
2547 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2548
2549 if (event_sched_in(group_event, cpuctx, ctx))
2550 goto error;
2551
2552
2553
2554
2555 for_each_sibling_event(event, group_event) {
2556 if (event_sched_in(event, cpuctx, ctx)) {
2557 partial_group = event;
2558 goto group_error;
2559 }
2560 }
2561
2562 if (!pmu->commit_txn(pmu))
2563 return 0;
2564
2565 group_error:
2566
2567
2568
2569
2570
2571 for_each_sibling_event(event, group_event) {
2572 if (event == partial_group)
2573 break;
2574
2575 event_sched_out(event, cpuctx, ctx);
2576 }
2577 event_sched_out(group_event, cpuctx, ctx);
2578
2579 error:
2580 pmu->cancel_txn(pmu);
2581 return -EAGAIN;
2582 }
2583
2584
2585
2586
2587 static int group_can_go_on(struct perf_event *event,
2588 struct perf_cpu_context *cpuctx,
2589 int can_add_hw)
2590 {
2591 /*
2592  * Groups consisting entirely of software events can always go on.
2593  */
2594 if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2595 return 1;
2596 /*
2597  * If an exclusive group is already on, no other hardware
2598  * events can go on.
2599  */
2600 if (cpuctx->exclusive)
2601 return 0;
2602 /*
2603  * If this group is exclusive and there are already
2604  * events on the CPU, it can't go on.
2605  */
2606 if (event->attr.exclusive && !list_empty(get_event_list(event)))
2607 return 0;
2608 /*
2609  * Otherwise, try to add it if all previous groups were able
2610  * to go on.
2611  */
2612 return can_add_hw;
2613 }
2614
2615 static void add_event_to_ctx(struct perf_event *event,
2616 struct perf_event_context *ctx)
2617 {
2618 list_add_event(event, ctx);
2619 perf_group_attach(event);
2620 }
2621
2622 static void ctx_sched_out(struct perf_event_context *ctx,
2623 struct perf_cpu_context *cpuctx,
2624 enum event_type_t event_type);
2625 static void
2626 ctx_sched_in(struct perf_event_context *ctx,
2627 struct perf_cpu_context *cpuctx,
2628 enum event_type_t event_type);
2629
2630 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2631 struct perf_event_context *ctx,
2632 enum event_type_t event_type)
2633 {
2634 if (!cpuctx->task_ctx)
2635 return;
2636
2637 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2638 return;
2639
2640 ctx_sched_out(ctx, cpuctx, event_type);
2641 }
2642
2643 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2644 struct perf_event_context *ctx)
2645 {
2646 cpu_ctx_sched_in(cpuctx, EVENT_PINNED);
2647 if (ctx)
2648 ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
2649 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
2650 if (ctx)
2651 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
2652 }
2653
2654 /*
2655  * We want to maintain the following priority of scheduling:
2656  *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
2657  *  - task pinned (EVENT_PINNED)
2658  *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2659  *  - task flexible (EVENT_FLEXIBLE).
2660  *
2661  * In order to avoid unscheduling and scheduling back in everything every
2662  * time an event is added, only do it for the groups of equal priority and
2663  * below.
2664  *
2665  * This can be called after a batch operation on task events, in which case
2666  * event_type is a bit mask of the types of events involved. For CPU events,
2667  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2668  */
2669 static void ctx_resched(struct perf_cpu_context *cpuctx,
2670 struct perf_event_context *task_ctx,
2671 enum event_type_t event_type)
2672 {
2673 enum event_type_t ctx_event_type;
2674 bool cpu_event = !!(event_type & EVENT_CPU);
2675
2676 /*
2677  * If pinned groups are involved, flexible groups also need to
2678  * be scheduled out.
2679  */
2680 if (event_type & EVENT_PINNED)
2681 event_type |= EVENT_FLEXIBLE;
2682
2683 ctx_event_type = event_type & EVENT_ALL;
2684
2685 perf_pmu_disable(cpuctx->ctx.pmu);
2686 if (task_ctx)
2687 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2688
2689 /*
2690  * Decide which cpu ctx groups to schedule out based on the types
2691  * of events that caused rescheduling:
2692  *  - EVENT_CPU: schedule out corresponding groups;
2693  *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2694  *  - otherwise, do nothing more.
2695  */
2696 if (cpu_event)
2697 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2698 else if (ctx_event_type & EVENT_PINNED)
2699 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2700
2701 perf_event_sched_in(cpuctx, task_ctx);
2702 perf_pmu_enable(cpuctx->ctx.pmu);
2703 }
2704
2705 void perf_pmu_resched(struct pmu *pmu)
2706 {
2707 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2708 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2709
2710 perf_ctx_lock(cpuctx, task_ctx);
2711 ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2712 perf_ctx_unlock(cpuctx, task_ctx);
2713 }
2714
2715 /*
2716  * Cross CPU call to install and enable a performance event.
2717  *
2718  * Very similar to remote_function() + event_function() but cannot assume that
2719  * things like ctx->is_active and cpuctx->task_ctx are set.
2720  */
2721 static int __perf_install_in_context(void *info)
2722 {
2723 struct perf_event *event = info;
2724 struct perf_event_context *ctx = event->ctx;
2725 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2726 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2727 bool reprogram = true;
2728 int ret = 0;
2729
2730 raw_spin_lock(&cpuctx->ctx.lock);
2731 if (ctx->task) {
2732 raw_spin_lock(&ctx->lock);
2733 task_ctx = ctx;
2734
2735 reprogram = (ctx->task == current);
2736
2737 /*
2738  * If the task is running, it must be running on this CPU,
2739  * otherwise we cannot reprogram things.
2740  *
2741  * If it's not running, we don't care, ctx->lock will
2742  * serialize against it becoming runnable.
2743  */
2744 if (task_curr(ctx->task) && !reprogram) {
2745 ret = -ESRCH;
2746 goto unlock;
2747 }
2748
2749 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2750 } else if (task_ctx) {
2751 raw_spin_lock(&task_ctx->lock);
2752 }
2753
2754 #ifdef CONFIG_CGROUP_PERF
2755 if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
2756 /*
2757  * If the current cgroup doesn't match the event's
2758  * cgroup, we should not try to schedule it.
2759  */
2760 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2761 reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2762 event->cgrp->css.cgroup);
2763 }
2764 #endif
2765
2766 if (reprogram) {
2767 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2768 add_event_to_ctx(event, ctx);
2769 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2770 } else {
2771 add_event_to_ctx(event, ctx);
2772 }
2773
2774 unlock:
2775 perf_ctx_unlock(cpuctx, task_ctx);
2776
2777 return ret;
2778 }
2779
2780 static bool exclusive_event_installable(struct perf_event *event,
2781 struct perf_event_context *ctx);
2782
2783 /*
2784  * Attach a performance event to a context.
2785  *
2786  * Very similar to event_function_call, see comment there.
2787  */
2788 static void
2789 perf_install_in_context(struct perf_event_context *ctx,
2790 struct perf_event *event,
2791 int cpu)
2792 {
2793 struct task_struct *task = READ_ONCE(ctx->task);
2794
2795 lockdep_assert_held(&ctx->mutex);
2796
2797 WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
2798
2799 if (event->cpu != -1)
2800 event->cpu = cpu;
2801
2802
2803
2804
2805
2806 smp_store_release(&event->ctx, ctx);
2807
2808 /*
2809  * perf_event_attr::disabled events will not run and can be initialized
2810  * without IPI. Except when this is the first event for the context, in
2811  * that case we need the magic of the IPI to set ctx->is_active.
2812  * Similarly, cgroup events for the context also need the IPI to
2813  * manipulate the cgrp_cpuctx_list.
2814  *
2815  * The IOC_ENABLE that is sure to follow the creation of a disabled
2816  * event will issue the IPI and reprogram the hardware.
2817  */
2818 if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
2819 ctx->nr_events && !is_cgroup_event(event)) {
2820 raw_spin_lock_irq(&ctx->lock);
2821 if (ctx->task == TASK_TOMBSTONE) {
2822 raw_spin_unlock_irq(&ctx->lock);
2823 return;
2824 }
2825 add_event_to_ctx(event, ctx);
2826 raw_spin_unlock_irq(&ctx->lock);
2827 return;
2828 }
2829
2830 if (!task) {
2831 cpu_function_call(cpu, __perf_install_in_context, event);
2832 return;
2833 }
2834
2835 /*
2836  * Should not happen, we validate the ctx is still alive before calling.
2837  */
2838 if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2839 return;
2840
2841 /*
2842  * Installing events is tricky because we cannot rely on ctx->is_active
2843  * to be set in case this is the nr_events 0 -> 1 transition.
2844  *
2845  * Instead we use task_curr() to tell whether the task is running; but
2846  * since that is read without holding the runqueue lock, the result can
2847  * be stale by the time we act on it.
2848  *
2849  * If we observe the task running, we simply retry the IPI below until
2850  * it actually hits the task, so a false positive is harmless.
2851  *
2852  * If we do not observe the task running, we rely on ctx->lock: once a
2853  * remote context switch of this task reaches
2854  * perf_event_context_sched_in() it must take ctx->lock and will then
2855  * find the event we add under that lock below.
2856  *
2857  * For that to work, the remote context switch must have observed our
2858  * earlier task->perf_event_ctxp[] store; the task_function_call() IPI
2859  * together with the barrier below provides exactly that: if the IPI
2860  * fails to hit the task, any future context switch of the task must
2861  * see the perf_event_ctxp[] store.
2862  */
2863
2864
2865 /*
2866  * This smp_mb() orders the task->perf_event_ctxp[] store against the
2867  * task_cpu() load in task_function_call(), so that if the IPI does not
2868  * find the task running, a later context switch of that task must
2869  * observe the store.
2870  */
2871 smp_mb();
2872 again:
2873 if (!task_function_call(task, __perf_install_in_context, event))
2874 return;
2875
2876 raw_spin_lock_irq(&ctx->lock);
2877 task = ctx->task;
2878 if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2879 /*
2880  * Cannot happen because we already checked above (which also
2881  * cannot happen), and we hold ctx->mutex, which serializes us
2882  * against perf_event_exit_task_context().
2883  */
2884 raw_spin_unlock_irq(&ctx->lock);
2885 return;
2886 }
2887
2888
2889
2890
2891 if (task_curr(task)) {
2892 raw_spin_unlock_irq(&ctx->lock);
2893 goto again;
2894 }
2895 add_event_to_ctx(event, ctx);
2896 raw_spin_unlock_irq(&ctx->lock);
2897 }
2898
2899 /*
2900  * Cross CPU call to enable a performance event.
2901  */
2902 static void __perf_event_enable(struct perf_event *event,
2903 struct perf_cpu_context *cpuctx,
2904 struct perf_event_context *ctx,
2905 void *info)
2906 {
2907 struct perf_event *leader = event->group_leader;
2908 struct perf_event_context *task_ctx;
2909
2910 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2911 event->state <= PERF_EVENT_STATE_ERROR)
2912 return;
2913
2914 if (ctx->is_active)
2915 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2916
2917 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2918 perf_cgroup_event_enable(event, ctx);
2919
2920 if (!ctx->is_active)
2921 return;
2922
2923 if (!event_filter_match(event)) {
2924 ctx_sched_in(ctx, cpuctx, EVENT_TIME);
2925 return;
2926 }
2927
2928 /*
2929  * If the event is in a group and isn't the group leader,
2930  * then don't put it on unless the group is on.
2931  */
2932 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2933 ctx_sched_in(ctx, cpuctx, EVENT_TIME);
2934 return;
2935 }
2936
2937 task_ctx = cpuctx->task_ctx;
2938 if (ctx->task)
2939 WARN_ON_ONCE(task_ctx != ctx);
2940
2941 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2942 }
2943
2944 /*
2945  * Enable an event.
2946  *
2947  * If event->ctx is a cloned context, callers must make sure that
2948  * every task struct that event->ctx->task could possibly point to
2949  * remains valid.  This condition is satisfied when called through
2950  * perf_event_for_each_child or perf_event_for_each, as described
2951  * for perf_event_disable().
2952  */
2953 static void _perf_event_enable(struct perf_event *event)
2954 {
2955 struct perf_event_context *ctx = event->ctx;
2956
2957 raw_spin_lock_irq(&ctx->lock);
2958 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2959 event->state < PERF_EVENT_STATE_ERROR) {
2960 out:
2961 raw_spin_unlock_irq(&ctx->lock);
2962 return;
2963 }
2964
2965 /*
2966  * If the event is in error state, clear that first.
2967  *
2968  * That way, if we see the event in error state below, we know that it
2969  * has gone back into error state, as distinct from the task having
2970  * been scheduled away before the cross-call arrived.
2971  */
2972 if (event->state == PERF_EVENT_STATE_ERROR) {
2973 /*
2974  * Detached SIBLING events cannot leave ERROR state.
2975  */
2976 if (event->event_caps & PERF_EV_CAP_SIBLING &&
2977 event->group_leader == event)
2978 goto out;
2979
2980 event->state = PERF_EVENT_STATE_OFF;
2981 }
2982 raw_spin_unlock_irq(&ctx->lock);
2983
2984 event_function_call(event, __perf_event_enable, NULL);
2985 }
2986
2987 /*
2988  * See perf_event_disable();
2989  */
2990 void perf_event_enable(struct perf_event *event)
2991 {
2992 struct perf_event_context *ctx;
2993
2994 ctx = perf_event_ctx_lock(event);
2995 _perf_event_enable(event);
2996 perf_event_ctx_unlock(event, ctx);
2997 }
2998 EXPORT_SYMBOL_GPL(perf_event_enable);
2999
3000 struct stop_event_data {
3001 struct perf_event *event;
3002 unsigned int restart;
3003 };
3004
3005 static int __perf_event_stop(void *info)
3006 {
3007 struct stop_event_data *sd = info;
3008 struct perf_event *event = sd->event;
3009
3010
3011 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3012 return 0;
3013
3014
3015 smp_rmb();
3016
3017 /*
3018  * There is a window with interrupts enabled before we get here,
3019  * so we need to check again lest we try to stop another CPU's event.
3020  */
3021 if (READ_ONCE(event->oncpu) != smp_processor_id())
3022 return -EAGAIN;
3023
3024 event->pmu->stop(event, PERF_EF_UPDATE);
3025
3026 /*
3027  * May race with the actual stop (through perf_pmu_output_stop()),
3028  * but it is only used for events with AUX ring buffer, and such
3029  * events will refuse to restart because of rb::aux_mmap_count==0,
3030  * see comments in perf_aux_output_begin().
3031  *
3032  * Since this is happening on an event-local CPU, no trace is lost
3033  * while restarting.
3034  */
3035 if (sd->restart)
3036 event->pmu->start(event, 0);
3037
3038 return 0;
3039 }
3040
3041 static int perf_event_stop(struct perf_event *event, int restart)
3042 {
3043 struct stop_event_data sd = {
3044 .event = event,
3045 .restart = restart,
3046 };
3047 int ret = 0;
3048
3049 do {
3050 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3051 return 0;
3052
3053
3054 smp_rmb();
3055
3056
3057
3058
3059
3060
3061 ret = cpu_function_call(READ_ONCE(event->oncpu),
3062 __perf_event_stop, &sd);
3063 } while (ret == -EAGAIN);
3064
3065 return ret;
3066 }
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090 void perf_event_addr_filters_sync(struct perf_event *event)
3091 {
3092 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
3093
3094 if (!has_addr_filter(event))
3095 return;
3096
3097 raw_spin_lock(&ifh->lock);
3098 if (event->addr_filters_gen != event->hw.addr_filters_gen) {
3099 event->pmu->addr_filters_sync(event);
3100 event->hw.addr_filters_gen = event->addr_filters_gen;
3101 }
3102 raw_spin_unlock(&ifh->lock);
3103 }
3104 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
3105
3106 static int _perf_event_refresh(struct perf_event *event, int refresh)
3107 {
3108 /*
3109  * not supported on inherited events
3110  */
3111 if (event->attr.inherit || !is_sampling_event(event))
3112 return -EINVAL;
3113
3114 atomic_add(refresh, &event->event_limit);
3115 _perf_event_enable(event);
3116
3117 return 0;
3118 }
3119
3120 /*
3121  * See perf_event_disable()
3122  */
3123 int perf_event_refresh(struct perf_event *event, int refresh)
3124 {
3125 struct perf_event_context *ctx;
3126 int ret;
3127
3128 ctx = perf_event_ctx_lock(event);
3129 ret = _perf_event_refresh(event, refresh);
3130 perf_event_ctx_unlock(event, ctx);
3131
3132 return ret;
3133 }
3134 EXPORT_SYMBOL_GPL(perf_event_refresh);
3135
3136 static int perf_event_modify_breakpoint(struct perf_event *bp,
3137 struct perf_event_attr *attr)
3138 {
3139 int err;
3140
3141 _perf_event_disable(bp);
3142
3143 err = modify_user_hw_breakpoint_check(bp, attr, true);
3144
3145 if (!bp->attr.disabled)
3146 _perf_event_enable(bp);
3147
3148 return err;
3149 }
3150
3151
3152
3153
3154 static void perf_event_modify_copy_attr(struct perf_event_attr *to,
3155 const struct perf_event_attr *from)
3156 {
3157 to->sig_data = from->sig_data;
3158 }
3159
3160 static int perf_event_modify_attr(struct perf_event *event,
3161 struct perf_event_attr *attr)
3162 {
3163 int (*func)(struct perf_event *, struct perf_event_attr *);
3164 struct perf_event *child;
3165 int err;
3166
3167 if (event->attr.type != attr->type)
3168 return -EINVAL;
3169
3170 switch (event->attr.type) {
3171 case PERF_TYPE_BREAKPOINT:
3172 func = perf_event_modify_breakpoint;
3173 break;
3174 default:
3175
3176 return -EOPNOTSUPP;
3177 }
3178
3179 WARN_ON_ONCE(event->ctx->parent_ctx);
3180
3181 mutex_lock(&event->child_mutex);
3182
3183
3184
3185
3186
3187 perf_event_modify_copy_attr(&event->attr, attr);
3188 err = func(event, attr);
3189 if (err)
3190 goto out;
3191 list_for_each_entry(child, &event->child_list, child_list) {
3192 perf_event_modify_copy_attr(&child->attr, attr);
3193 err = func(child, attr);
3194 if (err)
3195 goto out;
3196 }
3197 out:
3198 mutex_unlock(&event->child_mutex);
3199 return err;
3200 }
3201
3202 static void ctx_sched_out(struct perf_event_context *ctx,
3203 struct perf_cpu_context *cpuctx,
3204 enum event_type_t event_type)
3205 {
3206 struct perf_event *event, *tmp;
3207 int is_active = ctx->is_active;
3208
3209 lockdep_assert_held(&ctx->lock);
3210
3211 if (likely(!ctx->nr_events)) {
3212
3213
3214
3215 WARN_ON_ONCE(ctx->is_active);
3216 if (ctx->task)
3217 WARN_ON_ONCE(cpuctx->task_ctx);
3218 return;
3219 }
3220
3221 /*
3222  * Always update time if it was set; not only when it changes.
3223  * Otherwise we can 'forget' to update time for any but the last
3224  * context we sched out. For example:
3225  *
3226  *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
3227  *   ctx_sched_out(.event_type = EVENT_PINNED)
3228  *
3229  * would only update time for the pinned events.
3230  */
3231 if (is_active & EVENT_TIME) {
3232 /* update (and stop) ctx time */
3233 update_context_time(ctx);
3234 update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
3235
3236
3237
3238
3239 barrier();
3240 }
3241
3242 ctx->is_active &= ~event_type;
3243 if (!(ctx->is_active & EVENT_ALL))
3244 ctx->is_active = 0;
3245
3246 if (ctx->task) {
3247 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3248 if (!ctx->is_active)
3249 cpuctx->task_ctx = NULL;
3250 }
3251
3252 is_active ^= ctx->is_active;
3253
3254 if (!ctx->nr_active || !(is_active & EVENT_ALL))
3255 return;
3256
3257 perf_pmu_disable(ctx->pmu);
3258 if (is_active & EVENT_PINNED) {
3259 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
3260 group_sched_out(event, cpuctx, ctx);
3261 }
3262
3263 if (is_active & EVENT_FLEXIBLE) {
3264 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
3265 group_sched_out(event, cpuctx, ctx);
3266
3267 /*
3268  * Since we cleared EVENT_FLEXIBLE, also clear
3269  * rotate_necessary; it will be reset by
3270  * ctx_flexible_sched_in() when needed.
3271  */
3272 ctx->rotate_necessary = 0;
3273 }
3274 perf_pmu_enable(ctx->pmu);
3275 }
3276
3277 /*
3278  * Test whether two contexts are equivalent, i.e. whether they have both been
3279  * cloned from the same version of the same context.
3280  *
3281  * Equivalence is measured using a generation number in the context that is
3282  * incremented on each modification to it; see unclone_ctx(), list_add_event()
3283  * and list_del_event().
3284  */
3285 static int context_equiv(struct perf_event_context *ctx1,
3286 struct perf_event_context *ctx2)
3287 {
3288 lockdep_assert_held(&ctx1->lock);
3289 lockdep_assert_held(&ctx2->lock);
3290
3291
3292 if (ctx1->pin_count || ctx2->pin_count)
3293 return 0;
3294
3295
3296 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
3297 return 1;
3298
3299
3300 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3301 return 1;
3302
3303
3304
3305
3306
3307 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3308 ctx1->parent_gen == ctx2->parent_gen)
3309 return 1;
3310
3311
3312 return 0;
3313 }
3314
3315 static void __perf_event_sync_stat(struct perf_event *event,
3316 struct perf_event *next_event)
3317 {
3318 u64 value;
3319
3320 if (!event->attr.inherit_stat)
3321 return;
3322
3323 /*
3324  * Update the event value, we cannot use perf_event_read()
3325  * because we're in the middle of a context switch and have IRQs
3326  * disabled, which upsets smp_call_function_single(), however
3327  * we know the event must be on the current CPU, therefore we
3328  * don't need to use it.
3329  */
3330 if (event->state == PERF_EVENT_STATE_ACTIVE)
3331 event->pmu->read(event);
3332
3333 perf_event_update_time(event);
3334
3335 /*
3336  * In order to keep per-task stats reliable we need to flip the event
3337  * values when we flip the contexts.
3338  */
3339 value = local64_read(&next_event->count);
3340 value = local64_xchg(&event->count, value);
3341 local64_set(&next_event->count, value);
3342
3343 swap(event->total_time_enabled, next_event->total_time_enabled);
3344 swap(event->total_time_running, next_event->total_time_running);
3345
3346
3347
3348
3349 perf_event_update_userpage(event);
3350 perf_event_update_userpage(next_event);
3351 }
3352
3353 static void perf_event_sync_stat(struct perf_event_context *ctx,
3354 struct perf_event_context *next_ctx)
3355 {
3356 struct perf_event *event, *next_event;
3357
3358 if (!ctx->nr_stat)
3359 return;
3360
3361 update_context_time(ctx);
3362
3363 event = list_first_entry(&ctx->event_list,
3364 struct perf_event, event_entry);
3365
3366 next_event = list_first_entry(&next_ctx->event_list,
3367 struct perf_event, event_entry);
3368
3369 while (&event->event_entry != &ctx->event_list &&
3370 &next_event->event_entry != &next_ctx->event_list) {
3371
3372 __perf_event_sync_stat(event, next_event);
3373
3374 event = list_next_entry(event, event_entry);
3375 next_event = list_next_entry(next_event, event_entry);
3376 }
3377 }
3378
3379 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
3380 struct task_struct *next)
3381 {
3382 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
3383 struct perf_event_context *next_ctx;
3384 struct perf_event_context *parent, *next_parent;
3385 struct perf_cpu_context *cpuctx;
3386 int do_switch = 1;
3387 struct pmu *pmu;
3388
3389 if (likely(!ctx))
3390 return;
3391
3392 pmu = ctx->pmu;
3393 cpuctx = __get_cpu_context(ctx);
3394 if (!cpuctx->task_ctx)
3395 return;
3396
3397 rcu_read_lock();
3398 next_ctx = next->perf_event_ctxp[ctxn];
3399 if (!next_ctx)
3400 goto unlock;
3401
3402 parent = rcu_dereference(ctx->parent_ctx);
3403 next_parent = rcu_dereference(next_ctx->parent_ctx);
3404
3405
3406 if (!parent && !next_parent)
3407 goto unlock;
3408
3409 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3410 /*
3411  * Looks like the two contexts are clones, so we might be
3412  * able to optimize the context switch.  We lock both
3413  * contexts and check that they are clones under the
3414  * lock (including re-checking that neither context has
3415  * been uncloned in the meantime).  It doesn't matter which
3416  * order we take the locks because no other cpu could
3417  * be trying to lock both of these tasks.
3418  */
3419 raw_spin_lock(&ctx->lock);
3420 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3421 if (context_equiv(ctx, next_ctx)) {
3422
3423 WRITE_ONCE(ctx->task, next);
3424 WRITE_ONCE(next_ctx->task, task);
3425
3426 perf_pmu_disable(pmu);
3427
3428 if (cpuctx->sched_cb_usage && pmu->sched_task)
3429 pmu->sched_task(ctx, false);
3430
3431 /*
3432  * PMU specific parts of task perf context can require
3433  * additional synchronization. As an example of such
3434  * synchronization see implementation details of Intel
3435  * LBR call stack data profiling.
3436  */
3437 if (pmu->swap_task_ctx)
3438 pmu->swap_task_ctx(ctx, next_ctx);
3439 else
3440 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3441
3442 perf_pmu_enable(pmu);
3443
3444
3445
3446
3447
3448
3449
3450
3451 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3452 RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3453
3454 do_switch = 0;
3455
3456 perf_event_sync_stat(ctx, next_ctx);
3457 }
3458 raw_spin_unlock(&next_ctx->lock);
3459 raw_spin_unlock(&ctx->lock);
3460 }
3461 unlock:
3462 rcu_read_unlock();
3463
3464 if (do_switch) {
3465 raw_spin_lock(&ctx->lock);
3466 perf_pmu_disable(pmu);
3467
3468 if (cpuctx->sched_cb_usage && pmu->sched_task)
3469 pmu->sched_task(ctx, false);
3470 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3471
3472 perf_pmu_enable(pmu);
3473 raw_spin_unlock(&ctx->lock);
3474 }
3475 }
3476
3477 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3478
3479 void perf_sched_cb_dec(struct pmu *pmu)
3480 {
3481 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3482
3483 this_cpu_dec(perf_sched_cb_usages);
3484
3485 if (!--cpuctx->sched_cb_usage)
3486 list_del(&cpuctx->sched_cb_entry);
3487 }
3488
3489
3490 void perf_sched_cb_inc(struct pmu *pmu)
3491 {
3492 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3493
3494 if (!cpuctx->sched_cb_usage++)
3495 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3496
3497 this_cpu_inc(perf_sched_cb_usages);
3498 }
3499
3500 /*
3501  * This function provides the context switch callback to the lower code
3502  * layer. It is invoked ONLY when the context switch callback is enabled.
3503  *
3504  * This callback is relevant even to per-cpu events; for example multi event
3505  * PEBS requires this to provide PID/TID information. This requires we flush
3506  * all queued PEBS records before we context switch to a new task.
3507  */
3508 static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
3509 {
3510 struct pmu *pmu;
3511
3512 pmu = cpuctx->ctx.pmu;
3513
3514 if (WARN_ON_ONCE(!pmu->sched_task))
3515 return;
3516
3517 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3518 perf_pmu_disable(pmu);
3519
3520 pmu->sched_task(cpuctx->task_ctx, sched_in);
3521
3522 perf_pmu_enable(pmu);
3523 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3524 }
3525
3526 static void perf_pmu_sched_task(struct task_struct *prev,
3527 struct task_struct *next,
3528 bool sched_in)
3529 {
3530 struct perf_cpu_context *cpuctx;
3531
3532 if (prev == next)
3533 return;
3534
3535 list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3536 /* will be handled in perf_event_context_sched_in/out */
3537 if (cpuctx->task_ctx)
3538 continue;
3539
3540 __perf_pmu_sched_task(cpuctx, sched_in);
3541 }
3542 }
3543
3544 static void perf_event_switch(struct task_struct *task,
3545 struct task_struct *next_prev, bool sched_in);
3546
3547 #define for_each_task_context_nr(ctxn) \
3548 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3549
3550 /*
3551  * Called from the scheduler to remove the events of the current task,
3552  * with interrupts disabled.
3553  *
3554  * We stop each event and update the event value in event->count.
3555  *
3556  * This does not protect us against NMI, but disable()
3557  * sets the disabled bit in the control field of the event _before_
3558  * we remove the event from the context, and therefore
3559  * disable() can be called safely.
3560  */
3561 void __perf_event_task_sched_out(struct task_struct *task,
3562 struct task_struct *next)
3563 {
3564 int ctxn;
3565
3566 if (__this_cpu_read(perf_sched_cb_usages))
3567 perf_pmu_sched_task(task, next, false);
3568
3569 if (atomic_read(&nr_switch_events))
3570 perf_event_switch(task, next, false);
3571
3572 for_each_task_context_nr(ctxn)
3573 perf_event_context_sched_out(task, ctxn, next);
3574
3575 /*
3576  * If cgroup events exist on this CPU, then we need
3577  * to check whether we have to switch out PMU state;
3578  * cgroup events are system-wide mode only.
3579  */
3580 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3581 perf_cgroup_switch(next);
3582 }
3583
3584 /*
3585  * Called with IRQs disabled
3586  */
3587 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3588 enum event_type_t event_type)
3589 {
3590 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3591 }
3592
3593 static bool perf_less_group_idx(const void *l, const void *r)
3594 {
3595 const struct perf_event *le = *(const struct perf_event **)l;
3596 const struct perf_event *re = *(const struct perf_event **)r;
3597
3598 return le->group_index < re->group_index;
3599 }
3600
3601 static void swap_ptr(void *l, void *r)
3602 {
3603 void **lp = l, **rp = r;
3604
3605 swap(*lp, *rp);
3606 }
3607
3608 static const struct min_heap_callbacks perf_min_heap = {
3609 .elem_size = sizeof(struct perf_event *),
3610 .less = perf_less_group_idx,
3611 .swp = swap_ptr,
3612 };
3613
3614 static void __heap_add(struct min_heap *heap, struct perf_event *event)
3615 {
3616 struct perf_event **itrs = heap->data;
3617
3618 if (event) {
3619 itrs[heap->nr] = event;
3620 heap->nr++;
3621 }
3622 }
3623
3624 static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
3625 struct perf_event_groups *groups, int cpu,
3626 int (*func)(struct perf_event *, void *),
3627 void *data)
3628 {
3629 #ifdef CONFIG_CGROUP_PERF
3630 struct cgroup_subsys_state *css = NULL;
3631 #endif
3632
3633 struct perf_event *itrs[2];
3634 struct min_heap event_heap;
3635 struct perf_event **evt;
3636 int ret;
3637
3638 if (cpuctx) {
3639 event_heap = (struct min_heap){
3640 .data = cpuctx->heap,
3641 .nr = 0,
3642 .size = cpuctx->heap_size,
3643 };
3644
3645 lockdep_assert_held(&cpuctx->ctx.lock);
3646
3647 #ifdef CONFIG_CGROUP_PERF
3648 if (cpuctx->cgrp)
3649 css = &cpuctx->cgrp->css;
3650 #endif
3651 } else {
3652 event_heap = (struct min_heap){
3653 .data = itrs,
3654 .nr = 0,
3655 .size = ARRAY_SIZE(itrs),
3656 };
3657
3658 __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
3659 }
3660 evt = event_heap.data;
3661
3662 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
3663
3664 #ifdef CONFIG_CGROUP_PERF
3665 for (; css; css = css->parent)
3666 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
3667 #endif
3668
3669 min_heapify_all(&event_heap, &perf_min_heap);
3670
3671 while (event_heap.nr) {
3672 ret = func(*evt, data);
3673 if (ret)
3674 return ret;
3675
3676 *evt = perf_event_groups_next(*evt);
3677 if (*evt)
3678 min_heapify(&event_heap, 0, &perf_min_heap);
3679 else
3680 min_heap_pop(&event_heap, &perf_min_heap);
3681 }
3682
3683 return 0;
3684 }
3685
3686 /*
3687  * Because the userpage is strictly per-event (there is no concept of context,
3688  * so there cannot be a context indirection), every userpage must be updated
3689  * when context time starts.
3690  *
3691  * IOW, we must not miss EVENT_TIME edges.
3692  */
3693 static inline bool event_update_userpage(struct perf_event *event)
3694 {
3695 if (likely(!atomic_read(&event->mmap_count)))
3696 return false;
3697
3698 perf_event_update_time(event);
3699 perf_event_update_userpage(event);
3700
3701 return true;
3702 }
3703
3704 static inline void group_update_userpage(struct perf_event *group_event)
3705 {
3706 struct perf_event *event;
3707
3708 if (!event_update_userpage(group_event))
3709 return;
3710
3711 for_each_sibling_event(event, group_event)
3712 event_update_userpage(event);
3713 }
3714
3715 static int merge_sched_in(struct perf_event *event, void *data)
3716 {
3717 struct perf_event_context *ctx = event->ctx;
3718 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3719 int *can_add_hw = data;
3720
3721 if (event->state <= PERF_EVENT_STATE_OFF)
3722 return 0;
3723
3724 if (!event_filter_match(event))
3725 return 0;
3726
3727 if (group_can_go_on(event, cpuctx, *can_add_hw)) {
3728 if (!group_sched_in(event, cpuctx, ctx))
3729 list_add_tail(&event->active_list, get_event_list(event));
3730 }
3731
3732 if (event->state == PERF_EVENT_STATE_INACTIVE) {
3733 *can_add_hw = 0;
3734 if (event->attr.pinned) {
3735 perf_cgroup_event_disable(event, ctx);
3736 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3737 } else {
3738 ctx->rotate_necessary = 1;
3739 perf_mux_hrtimer_restart(cpuctx);
3740 group_update_userpage(event);
3741 }
3742 }
3743
3744 return 0;
3745 }
3746
3747 static void
3748 ctx_pinned_sched_in(struct perf_event_context *ctx,
3749 struct perf_cpu_context *cpuctx)
3750 {
3751 int can_add_hw = 1;
3752
3753 if (ctx != &cpuctx->ctx)
3754 cpuctx = NULL;
3755
3756 visit_groups_merge(cpuctx, &ctx->pinned_groups,
3757 smp_processor_id(),
3758 merge_sched_in, &can_add_hw);
3759 }
3760
3761 static void
3762 ctx_flexible_sched_in(struct perf_event_context *ctx,
3763 struct perf_cpu_context *cpuctx)
3764 {
3765 int can_add_hw = 1;
3766
3767 if (ctx != &cpuctx->ctx)
3768 cpuctx = NULL;
3769
3770 visit_groups_merge(cpuctx, &ctx->flexible_groups,
3771 smp_processor_id(),
3772 merge_sched_in, &can_add_hw);
3773 }
3774
3775 static void
3776 ctx_sched_in(struct perf_event_context *ctx,
3777 struct perf_cpu_context *cpuctx,
3778 enum event_type_t event_type)
3779 {
3780 int is_active = ctx->is_active;
3781
3782 lockdep_assert_held(&ctx->lock);
3783
3784 if (likely(!ctx->nr_events))
3785 return;
3786
3787 if (is_active ^ EVENT_TIME) {
3788 /* start ctx time */
3789 __update_context_time(ctx, false);
3790 perf_cgroup_set_timestamp(cpuctx);
3791
3792
3793
3794
3795 barrier();
3796 }
3797
3798 ctx->is_active |= (event_type | EVENT_TIME);
3799 if (ctx->task) {
3800 if (!is_active)
3801 cpuctx->task_ctx = ctx;
3802 else
3803 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3804 }
3805
3806 is_active ^= ctx->is_active;
3807
3808 /*
3809  * First go through the list and put on any pinned groups
3810  * in order to give them the best chance of going on.
3811  */
3812 if (is_active & EVENT_PINNED)
3813 ctx_pinned_sched_in(ctx, cpuctx);
3814
3815 /* Then walk through the lower prio flexible groups */
3816 if (is_active & EVENT_FLEXIBLE)
3817 ctx_flexible_sched_in(ctx, cpuctx);
3818 }
3819
3820 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3821 enum event_type_t event_type)
3822 {
3823 struct perf_event_context *ctx = &cpuctx->ctx;
3824
3825 ctx_sched_in(ctx, cpuctx, event_type);
3826 }
3827
3828 static void perf_event_context_sched_in(struct perf_event_context *ctx,
3829 struct task_struct *task)
3830 {
3831 struct perf_cpu_context *cpuctx;
3832 struct pmu *pmu;
3833
3834 cpuctx = __get_cpu_context(ctx);
3835
3836
3837
3838
3839
3840 pmu = ctx->pmu = cpuctx->ctx.pmu;
3841
3842 if (cpuctx->task_ctx == ctx) {
3843 if (cpuctx->sched_cb_usage)
3844 __perf_pmu_sched_task(cpuctx, true);
3845 return;
3846 }
3847
3848 perf_ctx_lock(cpuctx, ctx);
3849
3850
3851
3852
3853 if (!ctx->nr_events)
3854 goto unlock;
3855
3856 perf_pmu_disable(pmu);
3857
3858 /*
3859  * We want to keep the following priority order:
3860  * cpu pinned (that don't need to move), task pinned,
3861  * cpu flexible, task flexible.
3862  * However, if the task's ctx is not carrying any pinned
3863  * events, there is no need to flip the cpuctx's events around.
3864  */
3865 if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3866 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3867 perf_event_sched_in(cpuctx, ctx);
3868
3869 if (cpuctx->sched_cb_usage && pmu->sched_task)
3870 pmu->sched_task(cpuctx->task_ctx, true);
3871
3872 perf_pmu_enable(pmu);
3873
3874 unlock:
3875 perf_ctx_unlock(cpuctx, ctx);
3876 }
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889 void __perf_event_task_sched_in(struct task_struct *prev,
3890 struct task_struct *task)
3891 {
3892 struct perf_event_context *ctx;
3893 int ctxn;
3894
3895 for_each_task_context_nr(ctxn) {
3896 ctx = task->perf_event_ctxp[ctxn];
3897 if (likely(!ctx))
3898 continue;
3899
3900 perf_event_context_sched_in(ctx, task);
3901 }
3902
3903 if (atomic_read(&nr_switch_events))
3904 perf_event_switch(task, prev, true);
3905
3906 if (__this_cpu_read(perf_sched_cb_usages))
3907 perf_pmu_sched_task(prev, task, true);
3908 }
3909
3910 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3911 {
3912 u64 frequency = event->attr.sample_freq;
3913 u64 sec = NSEC_PER_SEC;
3914 u64 divisor, dividend;
3915
3916 int count_fls, nsec_fls, frequency_fls, sec_fls;
3917
3918 count_fls = fls64(count);
3919 nsec_fls = fls64(nsec);
3920 frequency_fls = fls64(frequency);
3921 sec_fls = 30;
3922
3923
3924 /*
3925  * We have @count events in @nsec ns and want to hit @frequency ==
3926  * attr.sample_freq samples per second, so the target period becomes:
3927  *
3928  *              @count * NSEC_PER_SEC
3929  *    period = -----------------------
3930  *               @nsec * @frequency
3931  *
3932  * Both products can overflow 64 bits, so the REDUCE_FLS() loops below
3933  * shift the operands down (based on their fls() bit widths) until the
3934  * products fit, trading a little precision for a safe 64-bit division.
3935  */
3936
3937 #define REDUCE_FLS(a, b) \
3938 do { \
3939 if (a##_fls > b##_fls) { \
3940 a >>= 1; \
3941 a##_fls--; \
3942 } else { \
3943 b >>= 1; \
3944 b##_fls--; \
3945 } \
3946 } while (0)
3947
3948 /*
3949  * Reduce both fractions until at least one of the two products fits in
3950  * 64 bits; each step halves whichever operand of a pair is wider.
3951  */
3952 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3953 REDUCE_FLS(nsec, frequency);
3954 REDUCE_FLS(sec, count);
3955 }
3956
3957 if (count_fls + sec_fls > 64) {
3958 divisor = nsec * frequency;
3959
3960 while (count_fls + sec_fls > 64) {
3961 REDUCE_FLS(count, sec);
3962 divisor >>= 1;
3963 }
3964
3965 dividend = count * sec;
3966 } else {
3967 dividend = count * sec;
3968
3969 while (nsec_fls + frequency_fls > 64) {
3970 REDUCE_FLS(nsec, frequency);
3971 dividend >>= 1;
3972 }
3973
3974 divisor = nsec * frequency;
3975 }
3976
3977 if (!divisor)
3978 return dividend;
3979
3980 return div64_u64(dividend, divisor);
3981 }
3982
3983 static DEFINE_PER_CPU(int, perf_throttled_count);
3984 static DEFINE_PER_CPU(u64, perf_throttled_seq);
3985
3986 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3987 {
3988 struct hw_perf_event *hwc = &event->hw;
3989 s64 period, sample_period;
3990 s64 delta;
3991
3992 period = perf_calculate_period(event, nsec, count);
3993
3994 delta = (s64)(period - hwc->sample_period);
3995 delta = (delta + 7) / 8;
3996
3997 sample_period = hwc->sample_period + delta;
3998
3999 if (!sample_period)
4000 sample_period = 1;
4001
4002 hwc->sample_period = sample_period;
4003
4004 if (local64_read(&hwc->period_left) > 8*sample_period) {
4005 if (disable)
4006 event->pmu->stop(event, PERF_EF_UPDATE);
4007
4008 local64_set(&hwc->period_left, 0);
4009
4010 if (disable)
4011 event->pmu->start(event, PERF_EF_RELOAD);
4012 }
4013 }
4014
4015 /*
4016  * Combine freq adjustment with unthrottling to avoid two passes over the
4017  * events. At the same time, make sure that having freq events does not
4018  * change the rate of unthrottling, as that would introduce bias.
4019  */
4020 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
4021 int needs_unthr)
4022 {
4023 struct perf_event *event;
4024 struct hw_perf_event *hwc;
4025 u64 now, period = TICK_NSEC;
4026 s64 delta;
4027
4028 /*
4029  * only need to iterate over all events iff:
4030  * - context has events in frequency mode (needs freq adjust)
4031  * - there are events to unthrottle on this cpu
4032  */
4033 if (!(ctx->nr_freq || needs_unthr))
4034 return;
4035
4036 raw_spin_lock(&ctx->lock);
4037 perf_pmu_disable(ctx->pmu);
4038
4039 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4040 if (event->state != PERF_EVENT_STATE_ACTIVE)
4041 continue;
4042
4043 if (!event_filter_match(event))
4044 continue;
4045
4046 perf_pmu_disable(event->pmu);
4047
4048 hwc = &event->hw;
4049
4050 if (hwc->interrupts == MAX_INTERRUPTS) {
4051 hwc->interrupts = 0;
4052 perf_log_throttle(event, 1);
4053 event->pmu->start(event, 0);
4054 }
4055
4056 if (!event->attr.freq || !event->attr.sample_freq)
4057 goto next;
4058
4059
4060
4061
4062 event->pmu->stop(event, PERF_EF_UPDATE);
4063
4064 now = local64_read(&event->count);
4065 delta = now - hwc->freq_count_stamp;
4066 hwc->freq_count_stamp = now;
4067
4068 /*
4069  * restart the event:
4070  * reload only if the value has changed;
4071  * we have stopped the event above, so tell that
4072  * to perf_adjust_period() to avoid stopping it
4073  * twice.
4074  */
4075 if (delta > 0)
4076 perf_adjust_period(event, period, delta, false);
4077
4078 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
4079 next:
4080 perf_pmu_enable(event->pmu);
4081 }
4082
4083 perf_pmu_enable(ctx->pmu);
4084 raw_spin_unlock(&ctx->lock);
4085 }
4086
4087 /*
4088  * Move @event to the tail of the @ctx's eligible events.
4089  */
4090 static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
4091 {
4092
4093
4094
4095
4096 if (ctx->rotate_disable)
4097 return;
4098
4099 perf_event_groups_delete(&ctx->flexible_groups, event);
4100 perf_event_groups_insert(&ctx->flexible_groups, event);
4101 }
4102
4103
4104 static inline struct perf_event *
4105 ctx_event_to_rotate(struct perf_event_context *ctx)
4106 {
4107 struct perf_event *event;
4108
4109
4110 event = list_first_entry_or_null(&ctx->flexible_active,
4111 struct perf_event, active_list);
4112
4113
4114 if (!event) {
4115 event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
4116 typeof(*event), group_node);
4117 }
4118
4119 /*
4120  * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
4121  * finds there are unschedulable events, it will set it again.
4122  */
4123 ctx->rotate_necessary = 0;
4124
4125 return event;
4126 }
4127
4128 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
4129 {
4130 struct perf_event *cpu_event = NULL, *task_event = NULL;
4131 struct perf_event_context *task_ctx = NULL;
4132 int cpu_rotate, task_rotate;
4133
4134
4135
4136
4137
4138
4139 cpu_rotate = cpuctx->ctx.rotate_necessary;
4140 task_ctx = cpuctx->task_ctx;
4141 task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
4142
4143 if (!(cpu_rotate || task_rotate))
4144 return false;
4145
4146 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
4147 perf_pmu_disable(cpuctx->ctx.pmu);
4148
4149 if (task_rotate)
4150 task_event = ctx_event_to_rotate(task_ctx);
4151 if (cpu_rotate)
4152 cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
4153
4154
4155
4156
4157
4158 if (task_event || (task_ctx && cpu_event))
4159 ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
4160 if (cpu_event)
4161 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
4162
4163 if (task_event)
4164 rotate_ctx(task_ctx, task_event);
4165 if (cpu_event)
4166 rotate_ctx(&cpuctx->ctx, cpu_event);
4167
4168 perf_event_sched_in(cpuctx, task_ctx);
4169
4170 perf_pmu_enable(cpuctx->ctx.pmu);
4171 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
4172
4173 return true;
4174 }
4175
4176 void perf_event_task_tick(void)
4177 {
4178 struct list_head *head = this_cpu_ptr(&active_ctx_list);
4179 struct perf_event_context *ctx, *tmp;
4180 int throttled;
4181
4182 lockdep_assert_irqs_disabled();
4183
4184 __this_cpu_inc(perf_throttled_seq);
4185 throttled = __this_cpu_xchg(perf_throttled_count, 0);
4186 tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
4187
4188 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
4189 perf_adjust_freq_unthr_context(ctx, throttled);
4190 }
4191
4192 static int event_enable_on_exec(struct perf_event *event,
4193 struct perf_event_context *ctx)
4194 {
4195 if (!event->attr.enable_on_exec)
4196 return 0;
4197
4198 event->attr.enable_on_exec = 0;
4199 if (event->state >= PERF_EVENT_STATE_INACTIVE)
4200 return 0;
4201
4202 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
4203
4204 return 1;
4205 }
4206
4207 /*
4208  * Enable all of a task's events that have been marked enable-on-exec.
4209  * This expects task == current.
4210  */
4211 static void perf_event_enable_on_exec(int ctxn)
4212 {
4213 struct perf_event_context *ctx, *clone_ctx = NULL;
4214 enum event_type_t event_type = 0;
4215 struct perf_cpu_context *cpuctx;
4216 struct perf_event *event;
4217 unsigned long flags;
4218 int enabled = 0;
4219
4220 local_irq_save(flags);
4221 ctx = current->perf_event_ctxp[ctxn];
4222 if (!ctx || !ctx->nr_events)
4223 goto out;
4224
4225 cpuctx = __get_cpu_context(ctx);
4226 perf_ctx_lock(cpuctx, ctx);
4227 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
4228 list_for_each_entry(event, &ctx->event_list, event_entry) {
4229 enabled |= event_enable_on_exec(event, ctx);
4230 event_type |= get_event_type(event);
4231 }
4232
4233
4234
4235
4236 if (enabled) {
4237 clone_ctx = unclone_ctx(ctx);
4238 ctx_resched(cpuctx, ctx, event_type);
4239 } else {
4240 ctx_sched_in(ctx, cpuctx, EVENT_TIME);
4241 }
4242 perf_ctx_unlock(cpuctx, ctx);
4243
4244 out:
4245 local_irq_restore(flags);
4246
4247 if (clone_ctx)
4248 put_ctx(clone_ctx);
4249 }
4250
4251 static void perf_remove_from_owner(struct perf_event *event);
4252 static void perf_event_exit_event(struct perf_event *event,
4253 struct perf_event_context *ctx);
4254
4255
4256 /*
4257  * Remove all events from the current task that have been marked remove-on-exec.
4258  */
4259 static void perf_event_remove_on_exec(int ctxn)
4260 {
4261 struct perf_event_context *ctx, *clone_ctx = NULL;
4262 struct perf_event *event, *next;
4263 unsigned long flags;
4264 bool modified = false;
4265
4266 ctx = perf_pin_task_context(current, ctxn);
4267 if (!ctx)
4268 return;
4269
4270 mutex_lock(&ctx->mutex);
4271
4272 if (WARN_ON_ONCE(ctx->task != current))
4273 goto unlock;
4274
4275 list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
4276 if (!event->attr.remove_on_exec)
4277 continue;
4278
4279 if (!is_kernel_event(event))
4280 perf_remove_from_owner(event);
4281
4282 modified = true;
4283
4284 perf_event_exit_event(event, ctx);
4285 }
4286
4287 raw_spin_lock_irqsave(&ctx->lock, flags);
4288 if (modified)
4289 clone_ctx = unclone_ctx(ctx);
4290 --ctx->pin_count;
4291 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4292
4293 unlock:
4294 mutex_unlock(&ctx->mutex);
4295
4296 put_ctx(ctx);
4297 if (clone_ctx)
4298 put_ctx(clone_ctx);
4299 }
4300
4301 struct perf_read_data {
4302 struct perf_event *event;
4303 bool group;
4304 int ret;
4305 };
4306
4307 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
4308 {
4309 u16 local_pkg, event_pkg;
4310
4311 if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
4312 int local_cpu = smp_processor_id();
4313
4314 event_pkg = topology_physical_package_id(event_cpu);
4315 local_pkg = topology_physical_package_id(local_cpu);
4316
4317 if (event_pkg == local_pkg)
4318 return local_cpu;
4319 }
4320
4321 return event_cpu;
4322 }
4323
4324 /*
4325  * Cross CPU call to read the hardware event
4326  */
4327 static void __perf_event_read(void *info)
4328 {
4329 struct perf_read_data *data = info;
4330 struct perf_event *sub, *event = data->event;
4331 struct perf_event_context *ctx = event->ctx;
4332 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
4333 struct pmu *pmu = event->pmu;
4334
4335 /*
4336  * If this is a task context, we need to check whether it is
4337  * the current task context of this cpu.  If not it has been
4338  * scheduled out before the smp call arrived.  In that case
4339  * event->count would have been updated to a recent sample
4340  * when the event was scheduled out.
4341  */
4342 if (ctx->task && cpuctx->task_ctx != ctx)
4343 return;
4344
4345 raw_spin_lock(&ctx->lock);
4346 if (ctx->is_active & EVENT_TIME) {
4347 update_context_time(ctx);
4348 update_cgrp_time_from_event(event);
4349 }
4350
4351 perf_event_update_time(event);
4352 if (data->group)
4353 perf_event_update_sibling_time(event);
4354
4355 if (event->state != PERF_EVENT_STATE_ACTIVE)
4356 goto unlock;
4357
4358 if (!data->group) {
4359 pmu->read(event);
4360 data->ret = 0;
4361 goto unlock;
4362 }
4363
4364 pmu->start_txn(pmu, PERF_PMU_TXN_READ);
4365
4366 pmu->read(event);
4367
4368 for_each_sibling_event(sub, event) {
4369 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
4370
4371
4372
4373
4374 sub->pmu->read(sub);
4375 }
4376 }
4377
4378 data->ret = pmu->commit_txn(pmu);
4379
4380 unlock:
4381 raw_spin_unlock(&ctx->lock);
4382 }
4383
4384 static inline u64 perf_event_count(struct perf_event *event)
4385 {
4386 return local64_read(&event->count) + atomic64_read(&event->child_count);
4387 }
4388
4389 static void calc_timer_values(struct perf_event *event,
4390 u64 *now,
4391 u64 *enabled,
4392 u64 *running)
4393 {
4394 u64 ctx_time;
4395
4396 *now = perf_clock();
4397 ctx_time = perf_event_time_now(event, *now);
4398 __perf_update_times(event, ctx_time, enabled, running);
4399 }
4400
4401 /*
4402  * NMI-safe method to read a local event, that is an event that is:
4403  *   - either for the current task, or for this CPU;
4404  *   - does not have inherit set, for inherited task events
4405  *     will not be local and we cannot read them atomically;
4406  *   - if pinned, must currently be running on this CPU.
4407  * The checks below enforce exactly these conditions.
4408  */
4409 int perf_event_read_local(struct perf_event *event, u64 *value,
4410 u64 *enabled, u64 *running)
4411 {
4412 unsigned long flags;
4413 int ret = 0;
4414
4415
4416
4417
4418
4419 local_irq_save(flags);
4420
4421
4422
4423
4424
4425 if (event->attr.inherit) {
4426 ret = -EOPNOTSUPP;
4427 goto out;
4428 }
4429
4430
4431 if ((event->attach_state & PERF_ATTACH_TASK) &&
4432 event->hw.target != current) {
4433 ret = -EINVAL;
4434 goto out;
4435 }
4436
4437
4438 if (!(event->attach_state & PERF_ATTACH_TASK) &&
4439 event->cpu != smp_processor_id()) {
4440 ret = -EINVAL;
4441 goto out;
4442 }
4443
4444
4445 if (event->attr.pinned && event->oncpu != smp_processor_id()) {
4446 ret = -EBUSY;
4447 goto out;
4448 }
4449
4450 /*
4451  * If the event is currently on this CPU, it's either a per-task event,
4452  * or local to this CPU. Furthermore it means it's ACTIVE (otherwise
4453  * oncpu == -1).
4454  */
4455 if (event->oncpu == smp_processor_id())
4456 event->pmu->read(event);
4457
4458 *value = local64_read(&event->count);
4459 if (enabled || running) {
4460 u64 __enabled, __running, __now;
4461
4462 calc_timer_values(event, &__now, &__enabled, &__running);
4463 if (enabled)
4464 *enabled = __enabled;
4465 if (running)
4466 *running = __running;
4467 }
4468 out:
4469 local_irq_restore(flags);
4470
4471 return ret;
4472 }
4473
4474 static int perf_event_read(struct perf_event *event, bool group)
4475 {
4476 enum perf_event_state state = READ_ONCE(event->state);
4477 int event_cpu, ret = 0;
4478
4479
4480
4481
4482
4483 again:
4484 if (state == PERF_EVENT_STATE_ACTIVE) {
4485 struct perf_read_data data;
4486
4487 /*
4488  * Orders the ->state and ->oncpu loads such that if we see
4489  * ACTIVE we must also see the right ->oncpu.
4490  *
4491  * Matches the smp_wmb() from event_sched_in().
4492  */
4493 smp_rmb();
4494
4495 event_cpu = READ_ONCE(event->oncpu);
4496 if ((unsigned)event_cpu >= nr_cpu_ids)
4497 return 0;
4498
4499 data = (struct perf_read_data){
4500 .event = event,
4501 .group = group,
4502 .ret = 0,
4503 };
4504
4505 preempt_disable();
4506 event_cpu = __perf_event_read_cpu(event, event_cpu);
4507
4508 /*
4509  * Purposely ignore the smp_call_function_single() return
4510  * value.
4511  *
4512  * If event_cpu isn't a valid CPU it means the event got
4513  * scheduled out and that will have updated the event count.
4514  *
4515  * Therefore, either way, we'll have an up-to-date event count
4516  * after this.
4517  */
4518 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4519 preempt_enable();
4520 ret = data.ret;
4521
4522 } else if (state == PERF_EVENT_STATE_INACTIVE) {
4523 struct perf_event_context *ctx = event->ctx;
4524 unsigned long flags;
4525
4526 raw_spin_lock_irqsave(&ctx->lock, flags);
4527 state = event->state;
4528 if (state != PERF_EVENT_STATE_INACTIVE) {
4529 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4530 goto again;
4531 }
4532
4533
4534
4535
4536
4537 if (ctx->is_active & EVENT_TIME) {
4538 update_context_time(ctx);
4539 update_cgrp_time_from_event(event);
4540 }
4541
4542 perf_event_update_time(event);
4543 if (group)
4544 perf_event_update_sibling_time(event);
4545 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4546 }
4547
4548 return ret;
4549 }
4550
4551 /*
4552  * Initialize the perf_event context in a task_struct:
4553  */
4554 static void __perf_event_init_context(struct perf_event_context *ctx)
4555 {
4556 raw_spin_lock_init(&ctx->lock);
4557 mutex_init(&ctx->mutex);
4558 INIT_LIST_HEAD(&ctx->active_ctx_list);
4559 perf_event_groups_init(&ctx->pinned_groups);
4560 perf_event_groups_init(&ctx->flexible_groups);
4561 INIT_LIST_HEAD(&ctx->event_list);
4562 INIT_LIST_HEAD(&ctx->pinned_active);
4563 INIT_LIST_HEAD(&ctx->flexible_active);
4564 refcount_set(&ctx->refcount, 1);
4565 }
4566
4567 static struct perf_event_context *
4568 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4569 {
4570 struct perf_event_context *ctx;
4571
4572 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4573 if (!ctx)
4574 return NULL;
4575
4576 __perf_event_init_context(ctx);
4577 if (task)
4578 ctx->task = get_task_struct(task);
4579 ctx->pmu = pmu;
4580
4581 return ctx;
4582 }
4583
4584 static struct task_struct *
4585 find_lively_task_by_vpid(pid_t vpid)
4586 {
4587 struct task_struct *task;
4588
4589 rcu_read_lock();
4590 if (!vpid)
4591 task = current;
4592 else
4593 task = find_task_by_vpid(vpid);
4594 if (task)
4595 get_task_struct(task);
4596 rcu_read_unlock();
4597
4598 if (!task)
4599 return ERR_PTR(-ESRCH);
4600
4601 return task;
4602 }
4603
4604 /*
4605  * Returns a matching context with refcount and pincount.
4606  */
4607 static struct perf_event_context *
4608 find_get_context(struct pmu *pmu, struct task_struct *task,
4609 struct perf_event *event)
4610 {
4611 struct perf_event_context *ctx, *clone_ctx = NULL;
4612 struct perf_cpu_context *cpuctx;
4613 void *task_ctx_data = NULL;
4614 unsigned long flags;
4615 int ctxn, err;
4616 int cpu = event->cpu;
4617
4618 if (!task) {
4619
4620 err = perf_allow_cpu(&event->attr);
4621 if (err)
4622 return ERR_PTR(err);
4623
4624 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
4625 ctx = &cpuctx->ctx;
4626 get_ctx(ctx);
4627 raw_spin_lock_irqsave(&ctx->lock, flags);
4628 ++ctx->pin_count;
4629 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4630
4631 return ctx;
4632 }
4633
4634 err = -EINVAL;
4635 ctxn = pmu->task_ctx_nr;
4636 if (ctxn < 0)
4637 goto errout;
4638
4639 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4640 task_ctx_data = alloc_task_ctx_data(pmu);
4641 if (!task_ctx_data) {
4642 err = -ENOMEM;
4643 goto errout;
4644 }
4645 }
4646
4647 retry:
4648 ctx = perf_lock_task_context(task, ctxn, &flags);
4649 if (ctx) {
4650 clone_ctx = unclone_ctx(ctx);
4651 ++ctx->pin_count;
4652
4653 if (task_ctx_data && !ctx->task_ctx_data) {
4654 ctx->task_ctx_data = task_ctx_data;
4655 task_ctx_data = NULL;
4656 }
4657 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4658
4659 if (clone_ctx)
4660 put_ctx(clone_ctx);
4661 } else {
4662 ctx = alloc_perf_context(pmu, task);
4663 err = -ENOMEM;
4664 if (!ctx)
4665 goto errout;
4666
4667 if (task_ctx_data) {
4668 ctx->task_ctx_data = task_ctx_data;
4669 task_ctx_data = NULL;
4670 }
4671
4672 err = 0;
4673 mutex_lock(&task->perf_event_mutex);
4674
4675
4676
4677
4678 if (task->flags & PF_EXITING)
4679 err = -ESRCH;
4680 else if (task->perf_event_ctxp[ctxn])
4681 err = -EAGAIN;
4682 else {
4683 get_ctx(ctx);
4684 ++ctx->pin_count;
4685 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
4686 }
4687 mutex_unlock(&task->perf_event_mutex);
4688
4689 if (unlikely(err)) {
4690 put_ctx(ctx);
4691
4692 if (err == -EAGAIN)
4693 goto retry;
4694 goto errout;
4695 }
4696 }
4697
4698 free_task_ctx_data(pmu, task_ctx_data);
4699 return ctx;
4700
4701 errout:
4702 free_task_ctx_data(pmu, task_ctx_data);
4703 return ERR_PTR(err);
4704 }
4705
4706 static void perf_event_free_filter(struct perf_event *event);
4707
4708 static void free_event_rcu(struct rcu_head *head)
4709 {
4710 struct perf_event *event;
4711
4712 event = container_of(head, struct perf_event, rcu_head);
4713 if (event->ns)
4714 put_pid_ns(event->ns);
4715 perf_event_free_filter(event);
4716 kmem_cache_free(perf_event_cache, event);
4717 }
4718
4719 static void ring_buffer_attach(struct perf_event *event,
4720 struct perf_buffer *rb);
4721
4722 static void detach_sb_event(struct perf_event *event)
4723 {
4724 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4725
4726 raw_spin_lock(&pel->lock);
4727 list_del_rcu(&event->sb_list);
4728 raw_spin_unlock(&pel->lock);
4729 }
4730
4731 static bool is_sb_event(struct perf_event *event)
4732 {
4733 struct perf_event_attr *attr = &event->attr;
4734
4735 if (event->parent)
4736 return false;
4737
4738 if (event->attach_state & PERF_ATTACH_TASK)
4739 return false;
4740
4741 if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4742 attr->comm || attr->comm_exec ||
4743 attr->task || attr->ksymbol ||
4744 attr->context_switch || attr->text_poke ||
4745 attr->bpf_event)
4746 return true;
4747 return false;
4748 }
4749
4750 static void unaccount_pmu_sb_event(struct perf_event *event)
4751 {
4752 if (is_sb_event(event))
4753 detach_sb_event(event);
4754 }
4755
4756 static void unaccount_event_cpu(struct perf_event *event, int cpu)
4757 {
4758 if (event->parent)
4759 return;
4760
4761 if (is_cgroup_event(event))
4762 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4763 }
4764
4765 #ifdef CONFIG_NO_HZ_FULL
4766 static DEFINE_SPINLOCK(nr_freq_lock);
4767 #endif
4768
4769 static void unaccount_freq_event_nohz(void)
4770 {
4771 #ifdef CONFIG_NO_HZ_FULL
4772 spin_lock(&nr_freq_lock);
4773 if (atomic_dec_and_test(&nr_freq_events))
4774 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4775 spin_unlock(&nr_freq_lock);
4776 #endif
4777 }
4778
4779 static void unaccount_freq_event(void)
4780 {
4781 if (tick_nohz_full_enabled())
4782 unaccount_freq_event_nohz();
4783 else
4784 atomic_dec(&nr_freq_events);
4785 }
4786
4787 static void unaccount_event(struct perf_event *event)
4788 {
4789 bool dec = false;
4790
4791 if (event->parent)
4792 return;
4793
4794 if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
4795 dec = true;
4796 if (event->attr.mmap || event->attr.mmap_data)
4797 atomic_dec(&nr_mmap_events);
4798 if (event->attr.build_id)
4799 atomic_dec(&nr_build_id_events);
4800 if (event->attr.comm)
4801 atomic_dec(&nr_comm_events);
4802 if (event->attr.namespaces)
4803 atomic_dec(&nr_namespaces_events);
4804 if (event->attr.cgroup)
4805 atomic_dec(&nr_cgroup_events);
4806 if (event->attr.task)
4807 atomic_dec(&nr_task_events);
4808 if (event->attr.freq)
4809 unaccount_freq_event();
4810 if (event->attr.context_switch) {
4811 dec = true;
4812 atomic_dec(&nr_switch_events);
4813 }
4814 if (is_cgroup_event(event))
4815 dec = true;
4816 if (has_branch_stack(event))
4817 dec = true;
4818 if (event->attr.ksymbol)
4819 atomic_dec(&nr_ksymbol_events);
4820 if (event->attr.bpf_event)
4821 atomic_dec(&nr_bpf_events);
4822 if (event->attr.text_poke)
4823 atomic_dec(&nr_text_poke_events);
4824
4825 if (dec) {
4826 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4827 schedule_delayed_work(&perf_sched_work, HZ);
4828 }
4829
4830 unaccount_event_cpu(event, event->cpu);
4831
4832 unaccount_pmu_sb_event(event);
4833 }
4834
4835 static void perf_sched_delayed(struct work_struct *work)
4836 {
4837 mutex_lock(&perf_sched_mutex);
4838 if (atomic_dec_and_test(&perf_sched_count))
4839 static_branch_disable(&perf_sched_events);
4840 mutex_unlock(&perf_sched_mutex);
4841 }
4842
4843 /*
4844  * The following implement mutual exclusion of events on "exclusive" pmus
4845  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
4846  * at a time, so we disallow creating events that might conflict, namely:
4847  *
4848  *  1) cpu-wide events in the presence of per-task events,
4849  *  2) per-task events in the presence of cpu-wide events,
4850  *  3) two matching events on the same perf_event_context.
4851  *
4852  * The former two cases are handled in the allocation path (perf_event_alloc(),
4853  * _free_event()), the latter -- before the first perf_install_in_context().
4854  */
4855 static int exclusive_event_init(struct perf_event *event)
4856 {
4857 struct pmu *pmu = event->pmu;
4858
4859 if (!is_exclusive_pmu(pmu))
4860 return 0;
4861
4862 /*
4863  * Prevent co-existence of per-task and cpu-wide events on the
4864  * same exclusive pmu.
4865  *
4866  * Negative pmu::exclusive_cnt means there are cpu-wide
4867  * events on this "exclusive" pmu, positive means there are
4868  * per-task events.
4869  *
4870  * Since this is called in the perf_event_alloc() path, event::ctx
4871  * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
4872  * to mean "per-task event", because unlike other attach states it
4873  * never gets cleared.
4874  */
4875 if (event->attach_state & PERF_ATTACH_TASK) {
4876 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4877 return -EBUSY;
4878 } else {
4879 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4880 return -EBUSY;
4881 }
4882
4883 return 0;
4884 }
4885
4886 static void exclusive_event_destroy(struct perf_event *event)
4887 {
4888 struct pmu *pmu = event->pmu;
4889
4890 if (!is_exclusive_pmu(pmu))
4891 return;
4892
4893 /* see comment in exclusive_event_init() */
4894 if (event->attach_state & PERF_ATTACH_TASK)
4895 atomic_dec(&pmu->exclusive_cnt);
4896 else
4897 atomic_inc(&pmu->exclusive_cnt);
4898 }
4899
4900 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4901 {
4902 if ((e1->pmu == e2->pmu) &&
4903 (e1->cpu == e2->cpu ||
4904 e1->cpu == -1 ||
4905 e2->cpu == -1))
4906 return true;
4907 return false;
4908 }
4909
4910 static bool exclusive_event_installable(struct perf_event *event,
4911 struct perf_event_context *ctx)
4912 {
4913 struct perf_event *iter_event;
4914 struct pmu *pmu = event->pmu;
4915
4916 lockdep_assert_held(&ctx->mutex);
4917
4918 if (!is_exclusive_pmu(pmu))
4919 return true;
4920
4921 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4922 if (exclusive_event_match(iter_event, event))
4923 return false;
4924 }
4925
4926 return true;
4927 }
4928
4929 static void perf_addr_filters_splice(struct perf_event *event,
4930 struct list_head *head);
4931
4932 static void _free_event(struct perf_event *event)
4933 {
4934 irq_work_sync(&event->pending);
4935
4936 unaccount_event(event);
4937
4938 security_perf_event_free(event);
4939
4940 if (event->rb) {
4941 /*
4942  * Can happen when we close an event with re-directed output.
4943  *
4944  * Since we have a 0 refcount, perf_mmap_close() will skip
4945  * over us; possibly making our ring_buffer_put() the last.
4946  */
4947 mutex_lock(&event->mmap_mutex);
4948 ring_buffer_attach(event, NULL);
4949 mutex_unlock(&event->mmap_mutex);
4950 }
4951
4952 if (is_cgroup_event(event))
4953 perf_detach_cgroup(event);
4954
4955 if (!event->parent) {
4956 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4957 put_callchain_buffers();
4958 }
4959
4960 perf_event_free_bpf_prog(event);
4961 perf_addr_filters_splice(event, NULL);
4962 kfree(event->addr_filter_ranges);
4963
4964 if (event->destroy)
4965 event->destroy(event);
4966
4967
4968
4969
4970
4971 if (event->hw.target)
4972 put_task_struct(event->hw.target);
4973
4974
4975
4976
4977
4978 if (event->ctx)
4979 put_ctx(event->ctx);
4980
4981 exclusive_event_destroy(event);
4982 module_put(event->pmu->module);
4983
4984 call_rcu(&event->rcu_head, free_event_rcu);
4985 }
4986
4987 /*
4988  * Used to free events which have a known refcount of 1, such as in error paths
4989  * where the event isn't exposed yet and inherited events.
4990  */
4991 static void free_event(struct perf_event *event)
4992 {
4993 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4994 "unexpected event refcount: %ld; ptr=%p\n",
4995 atomic_long_read(&event->refcount), event)) {
4996 /* leak to avoid use-after-free */
4997 return;
4998 }
4999
5000 _free_event(event);
5001 }
5002
5003 /*
5004  * Remove user event from the owner task.
5005  */
5006 static void perf_remove_from_owner(struct perf_event *event)
5007 {
5008 struct task_struct *owner;
5009
5010 rcu_read_lock();
5011
5012
5013
5014
5015
5016
5017 owner = READ_ONCE(event->owner);
5018 if (owner) {
5019
5020
5021
5022
5023
5024 get_task_struct(owner);
5025 }
5026 rcu_read_unlock();
5027
5028 if (owner) {
5029 /*
5030  * If we're here through perf_event_exit_task() we're already
5031  * holding ctx->mutex, which would be an inversion wrt. the
5032  * normal lock order.
5033  *
5034  * However we can safely take this lock because it's the child
5035  * ctx->mutex.
5036  */
5037 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
5038
5039
5040
5041
5042
5043
5044
5045 if (event->owner) {
5046 list_del_init(&event->owner_entry);
5047 smp_store_release(&event->owner, NULL);
5048 }
5049 mutex_unlock(&owner->perf_event_mutex);
5050 put_task_struct(owner);
5051 }
5052 }
5053
5054 static void put_event(struct perf_event *event)
5055 {
5056 if (!atomic_long_dec_and_test(&event->refcount))
5057 return;
5058
5059 _free_event(event);
5060 }
5061
5062 /*
5063  * Kill an event dead; while event:refcount will preserve the event
5064  * object, it will not preserve its functionality. Once the last 'user'
5065  * gives up the object, we'll destroy the thing.
5066  */
5067 int perf_event_release_kernel(struct perf_event *event)
5068 {
5069 struct perf_event_context *ctx = event->ctx;
5070 struct perf_event *child, *tmp;
5071 LIST_HEAD(free_list);
5072
5073
5074
5075
5076
5077 if (!ctx) {
5078 WARN_ON_ONCE(event->attach_state &
5079 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
5080 goto no_ctx;
5081 }
5082
5083 if (!is_kernel_event(event))
5084 perf_remove_from_owner(event);
5085
5086 ctx = perf_event_ctx_lock(event);
5087 WARN_ON_ONCE(ctx->parent_ctx);
5088 perf_remove_from_context(event, DETACH_GROUP);
5089
5090 raw_spin_lock_irq(&ctx->lock);
5091 /*
5092  * Mark this event as STATE_DEAD, there is no external reference to it
5093  * anymore.
5094  *
5095  * Anybody acquiring event->child_mutex after the below loop _must_
5096  * also see this, most importantly inherit_event() which will avoid
5097  * placing more children on the list.
5098  *
5099  * Thus this guarantees that we will in fact observe and kill _ALL_
5100  * child events.
5101  */
5102 event->state = PERF_EVENT_STATE_DEAD;
5103 raw_spin_unlock_irq(&ctx->lock);
5104
5105 perf_event_ctx_unlock(event, ctx);
5106
5107 again:
5108 mutex_lock(&event->child_mutex);
5109 list_for_each_entry(child, &event->child_list, child_list) {
5110
5111
5112
5113
5114
5115 ctx = READ_ONCE(child->ctx);
5116 /*
5117  * Since child_mutex nests inside ctx::mutex, we must jump
5118  * through hoops. We start by grabbing a reference on the ctx.
5119  *
5120  * Since the event cannot get freed while we hold the
5121  * child_mutex, the context must also exist and have a !0
5122  * reference count.
5123  */
5124 get_ctx(ctx);
5125
5126
5127
5128
5129
5130
5131 mutex_unlock(&event->child_mutex);
5132 mutex_lock(&ctx->mutex);
5133 mutex_lock(&event->child_mutex);
5134
5135
5136
5137
5138
5139
5140 tmp = list_first_entry_or_null(&event->child_list,
5141 struct perf_event, child_list);
5142 if (tmp == child) {
5143 perf_remove_from_context(child, DETACH_GROUP);
5144 list_move(&child->child_list, &free_list);
5145
5146
5147
5148
5149 put_event(event);
5150 }
5151
5152 mutex_unlock(&event->child_mutex);
5153 mutex_unlock(&ctx->mutex);
5154 put_ctx(ctx);
5155 goto again;
5156 }
5157 mutex_unlock(&event->child_mutex);
5158
5159 list_for_each_entry_safe(child, tmp, &free_list, child_list) {
5160 void *var = &child->ctx->refcount;
5161
5162 list_del(&child->child_list);
5163 free_event(child);
5164
5165
5166
5167
5168
5169 smp_mb();
5170 wake_up_var(var);
5171 }
5172
5173 no_ctx:
5174 put_event(event);
5175 return 0;
5176 }
5177 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
5178
5179 /*
5180  * Called when the last reference to the file is gone.
5181  */
5182 static int perf_release(struct inode *inode, struct file *file)
5183 {
5184 perf_event_release_kernel(file->private_data);
5185 return 0;
5186 }
5187
5188 static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5189 {
5190 struct perf_event *child;
5191 u64 total = 0;
5192
5193 *enabled = 0;
5194 *running = 0;
5195
5196 mutex_lock(&event->child_mutex);
5197
5198 (void)perf_event_read(event, false);
5199 total += perf_event_count(event);
5200
5201 *enabled += event->total_time_enabled +
5202 atomic64_read(&event->child_total_time_enabled);
5203 *running += event->total_time_running +
5204 atomic64_read(&event->child_total_time_running);
5205
5206 list_for_each_entry(child, &event->child_list, child_list) {
5207 (void)perf_event_read(child, false);
5208 total += perf_event_count(child);
5209 *enabled += child->total_time_enabled;
5210 *running += child->total_time_running;
5211 }
5212 mutex_unlock(&event->child_mutex);
5213
5214 return total;
5215 }
5216
5217 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5218 {
5219 struct perf_event_context *ctx;
5220 u64 count;
5221
5222 ctx = perf_event_ctx_lock(event);
5223 count = __perf_event_read_value(event, enabled, running);
5224 perf_event_ctx_unlock(event, ctx);
5225
5226 return count;
5227 }
5228 EXPORT_SYMBOL_GPL(perf_event_read_value);
5229
5230 static int __perf_read_group_add(struct perf_event *leader,
5231 u64 read_format, u64 *values)
5232 {
5233 struct perf_event_context *ctx = leader->ctx;
5234 struct perf_event *sub;
5235 unsigned long flags;
5236 int n = 1;
5237 int ret;
5238
5239 ret = perf_event_read(leader, true);
5240 if (ret)
5241 return ret;
5242
5243 raw_spin_lock_irqsave(&ctx->lock, flags);
5244
5245
5246
5247
5248
5249
5250 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5251 values[n++] += leader->total_time_enabled +
5252 atomic64_read(&leader->child_total_time_enabled);
5253 }
5254
5255 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5256 values[n++] += leader->total_time_running +
5257 atomic64_read(&leader->child_total_time_running);
5258 }
5259
5260
5261
5262
5263 values[n++] += perf_event_count(leader);
5264 if (read_format & PERF_FORMAT_ID)
5265 values[n++] = primary_event_id(leader);
5266 if (read_format & PERF_FORMAT_LOST)
5267 values[n++] = atomic64_read(&leader->lost_samples);
5268
5269 for_each_sibling_event(sub, leader) {
5270 values[n++] += perf_event_count(sub);
5271 if (read_format & PERF_FORMAT_ID)
5272 values[n++] = primary_event_id(sub);
5273 if (read_format & PERF_FORMAT_LOST)
5274 values[n++] = atomic64_read(&sub->lost_samples);
5275 }
5276
5277 raw_spin_unlock_irqrestore(&ctx->lock, flags);
5278 return 0;
5279 }
5280
5281 static int perf_read_group(struct perf_event *event,
5282 u64 read_format, char __user *buf)
5283 {
5284 struct perf_event *leader = event->group_leader, *child;
5285 struct perf_event_context *ctx = leader->ctx;
5286 int ret;
5287 u64 *values;
5288
5289 lockdep_assert_held(&ctx->mutex);
5290
5291 values = kzalloc(event->read_size, GFP_KERNEL);
5292 if (!values)
5293 return -ENOMEM;
5294
5295 values[0] = 1 + leader->nr_siblings;
5296
5297 /*
5298 * By locking the child_mutex of the leader we effectively
5299 * lock the child lists of all siblings as well.
5300 */
5301 mutex_lock(&leader->child_mutex);
5302
5303 ret = __perf_read_group_add(leader, read_format, values);
5304 if (ret)
5305 goto unlock;
5306
5307 list_for_each_entry(child, &leader->child_list, child_list) {
5308 ret = __perf_read_group_add(child, read_format, values);
5309 if (ret)
5310 goto unlock;
5311 }
5312
5313 mutex_unlock(&leader->child_mutex);
5314
5315 ret = event->read_size;
5316 if (copy_to_user(buf, values, event->read_size))
5317 ret = -EFAULT;
5318 goto out;
5319
5320 unlock:
5321 mutex_unlock(&leader->child_mutex);
5322 out:
5323 kfree(values);
5324 return ret;
5325 }
5326
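/*
 * For a non-group read, the buffer written below is, in order:
 * value, [time_enabled], [time_running], [id], [lost]; the optional fields
 * are present only when the corresponding PERF_FORMAT_* bit is set.
 */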
5327 static int perf_read_one(struct perf_event *event,
5328 u64 read_format, char __user *buf)
5329 {
5330 u64 enabled, running;
5331 u64 values[5];
5332 int n = 0;
5333
5334 values[n++] = __perf_event_read_value(event, &enabled, &running);
5335 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5336 values[n++] = enabled;
5337 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5338 values[n++] = running;
5339 if (read_format & PERF_FORMAT_ID)
5340 values[n++] = primary_event_id(event);
5341 if (read_format & PERF_FORMAT_LOST)
5342 values[n++] = atomic64_read(&event->lost_samples);
5343
5344 if (copy_to_user(buf, values, n * sizeof(u64)))
5345 return -EFAULT;
5346
5347 return n * sizeof(u64);
5348 }
5349
5350 static bool is_event_hup(struct perf_event *event)
5351 {
5352 bool no_children;
5353
5354 if (event->state > PERF_EVENT_STATE_EXIT)
5355 return false;
5356
5357 mutex_lock(&event->child_mutex);
5358 no_children = list_empty(&event->child_list);
5359 mutex_unlock(&event->child_mutex);
5360 return no_children;
5361 }
5362
5363 /*
5364 * Read the performance event - simple non-blocking version for now.
5365 */
5366 static ssize_t
5367 __perf_read(struct perf_event *event, char __user *buf, size_t count)
5368 {
5369 u64 read_format = event->attr.read_format;
5370 int ret;
5371
5372 /*
5373 * Return end-of-file for a read on an event that is in
5374 * error state (i.e. because it was pinned but it couldn't be
5375 * scheduled on to the CPU at some point).
5376 */
5377 if (event->state == PERF_EVENT_STATE_ERROR)
5378 return 0;
5379
5380 if (count < event->read_size)
5381 return -ENOSPC;
5382
5383 WARN_ON_ONCE(event->ctx->parent_ctx);
5384 if (read_format & PERF_FORMAT_GROUP)
5385 ret = perf_read_group(event, read_format, buf);
5386 else
5387 ret = perf_read_one(event, read_format, buf);
5388
5389 return ret;
5390 }
5391
5392 static ssize_t
5393 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
5394 {
5395 struct perf_event *event = file->private_data;
5396 struct perf_event_context *ctx;
5397 int ret;
5398
5399 ret = security_perf_event_read(event);
5400 if (ret)
5401 return ret;
5402
5403 ctx = perf_event_ctx_lock(event);
5404 ret = __perf_read(event, buf, count);
5405 perf_event_ctx_unlock(event, ctx);
5406
5407 return ret;
5408 }
5409
5410 static __poll_t perf_poll(struct file *file, poll_table *wait)
5411 {
5412 struct perf_event *event = file->private_data;
5413 struct perf_buffer *rb;
5414 __poll_t events = EPOLLHUP;
5415
5416 poll_wait(file, &event->waitq, wait);
5417
5418 if (is_event_hup(event))
5419 return events;
5420
5421 /*
5422 * Pin the event->rb by taking event->mmap_mutex; otherwise
5423 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
5424 */
5425 mutex_lock(&event->mmap_mutex);
5426 rb = event->rb;
5427 if (rb)
5428 events = atomic_xchg(&rb->poll, 0);
5429 mutex_unlock(&event->mmap_mutex);
5430 return events;
5431 }
5432
5433 static void _perf_event_reset(struct perf_event *event)
5434 {
5435 (void)perf_event_read(event, false);
5436 local64_set(&event->count, 0);
5437 perf_event_update_userpage(event);
5438 }
5439
5440 /* Assume it's not an event with inherit set. */
5441 u64 perf_event_pause(struct perf_event *event, bool reset)
5442 {
5443 struct perf_event_context *ctx;
5444 u64 count;
5445
5446 ctx = perf_event_ctx_lock(event);
5447 WARN_ON_ONCE(event->attr.inherit);
5448 _perf_event_disable(event);
5449 count = local64_read(&event->count);
5450 if (reset)
5451 local64_set(&event->count, 0);
5452 perf_event_ctx_unlock(event, ctx);
5453
5454 return count;
5455 }
5456 EXPORT_SYMBOL_GPL(perf_event_pause);
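/*
 * Usage sketch (hypothetical in-kernel caller, not part of this file):
 * take a "read and restart" snapshot of a counter the caller owns:
 *
 *	u64 delta = perf_event_pause(event, true);	// stop, read, zero
 *	...
 *	perf_event_enable(event);			// resume counting
 */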
5457
5458 /*
5459 * Holding the top-level event's child_mutex means that any
5460 * descendant process that has inherited this event will block
5461 * in perf_event_exit_event() if it goes to exit, thus satisfying the
5462 * task existence requirements of perf_event_enable/disable.
5463 */
5464 static void perf_event_for_each_child(struct perf_event *event,
5465 void (*func)(struct perf_event *))
5466 {
5467 struct perf_event *child;
5468
5469 WARN_ON_ONCE(event->ctx->parent_ctx);
5470
5471 mutex_lock(&event->child_mutex);
5472 func(event);
5473 list_for_each_entry(child, &event->child_list, child_list)
5474 func(child);
5475 mutex_unlock(&event->child_mutex);
5476 }
5477
5478 static void perf_event_for_each(struct perf_event *event,
5479 void (*func)(struct perf_event *))
5480 {
5481 struct perf_event_context *ctx = event->ctx;
5482 struct perf_event *sibling;
5483
5484 lockdep_assert_held(&ctx->mutex);
5485
5486 event = event->group_leader;
5487
5488 perf_event_for_each_child(event, func);
5489 for_each_sibling_event(sibling, event)
5490 perf_event_for_each_child(sibling, func);
5491 }
5492
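/*
 * Update the sample period (or frequency) of an event. This runs on the
 * event's CPU via event_function_call(); an active event is stopped (and
 * unthrottled if needed), its pending period_left is cleared, and it is
 * restarted so the new period takes effect immediately.
 */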
5493 static void __perf_event_period(struct perf_event *event,
5494 struct perf_cpu_context *cpuctx,
5495 struct perf_event_context *ctx,
5496 void *info)
5497 {
5498 u64 value = *((u64 *)info);
5499 bool active;
5500
5501 if (event->attr.freq) {
5502 event->attr.sample_freq = value;
5503 } else {
5504 event->attr.sample_period = value;
5505 event->hw.sample_period = value;
5506 }
5507
5508 active = (event->state == PERF_EVENT_STATE_ACTIVE);
5509 if (active) {
5510 perf_pmu_disable(ctx->pmu);
5511 /*
5512 * We could be throttled; unthrottle now to avoid the tick
5513 * trying to unthrottle while we already re-started the event.
5514 */
5515 if (event->hw.interrupts == MAX_INTERRUPTS) {
5516 event->hw.interrupts = 0;
5517 perf_log_throttle(event, 1);
5518 }
5519 event->pmu->stop(event, PERF_EF_UPDATE);
5520 }
5521
5522 local64_set(&event->hw.period_left, 0);
5523
5524 if (active) {
5525 event->pmu->start(event, PERF_EF_RELOAD);
5526 perf_pmu_enable(ctx->pmu);
5527 }
5528 }
5529
5530 static int perf_event_check_period(struct perf_event *event, u64 value)
5531 {
5532 return event->pmu->check_period(event, value);
5533 }
5534
5535 static int _perf_event_period(struct perf_event *event, u64 value)
5536 {
5537 if (!is_sampling_event(event))
5538 return -EINVAL;
5539
5540 if (!value)
5541 return -EINVAL;
5542
5543 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
5544 return -EINVAL;
5545
5546 if (perf_event_check_period(event, value))
5547 return -EINVAL;
5548
5549 if (!event->attr.freq && (value & (1ULL << 63)))
5550 return -EINVAL;
5551
5552 event_function_call(event, __perf_event_period, &value);
5553
5554 return 0;
5555 }
5556
5557 int perf_event_period(struct perf_event *event, u64 value)
5558 {
5559 struct perf_event_context *ctx;
5560 int ret;
5561
5562 ctx = perf_event_ctx_lock(event);
5563 ret = _perf_event_period(event, value);
5564 perf_event_ctx_unlock(event, ctx);
5565
5566 return ret;
5567 }
5568 EXPORT_SYMBOL_GPL(perf_event_period);
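/*
 * perf_event_period() is the in-kernel counterpart of the
 * PERF_EVENT_IOC_PERIOD ioctl handled in _perf_ioctl() below.
 */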
5569
5570 static const struct file_operations perf_fops;
5571
5572 static inline int perf_fget_light(int fd, struct fd *p)
5573 {
5574 struct fd f = fdget(fd);
5575 if (!f.file)
5576 return -EBADF;
5577
5578 if (f.file->f_op != &perf_fops) {
5579 fdput(f);
5580 return -EBADF;
5581 }
5582 *p = f;
5583 return 0;
5584 }
5585
5586 static int perf_event_set_output(struct perf_event *event,
5587 struct perf_event *output_event);
5588 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5589 static int perf_copy_attr(struct perf_event_attr __user *uattr,
5590 struct perf_event_attr *attr);
5591
5592 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5593 {
5594 void (*func)(struct perf_event *);
5595 u32 flags = arg;
5596
5597 switch (cmd) {
5598 case PERF_EVENT_IOC_ENABLE:
5599 func = _perf_event_enable;
5600 break;
5601 case PERF_EVENT_IOC_DISABLE:
5602 func = _perf_event_disable;
5603 break;
5604 case PERF_EVENT_IOC_RESET:
5605 func = _perf_event_reset;
5606 break;
5607
5608 case PERF_EVENT_IOC_REFRESH:
5609 return _perf_event_refresh(event, arg);
5610
5611 case PERF_EVENT_IOC_PERIOD:
5612 {
5613 u64 value;
5614
5615 if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
5616 return -EFAULT;
5617
5618 return _perf_event_period(event, value);
5619 }
5620 case PERF_EVENT_IOC_ID:
5621 {
5622 u64 id = primary_event_id(event);
5623
5624 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5625 return -EFAULT;
5626 return 0;
5627 }
5628
5629 case PERF_EVENT_IOC_SET_OUTPUT:
5630 {
5631 int ret;
5632 if (arg != -1) {
5633 struct perf_event *output_event;
5634 struct fd output;
5635 ret = perf_fget_light(arg, &output);
5636 if (ret)
5637 return ret;
5638 output_event = output.file->private_data;
5639 ret = perf_event_set_output(event, output_event);
5640 fdput(output);
5641 } else {
5642 ret = perf_event_set_output(event, NULL);
5643 }
5644 return ret;
5645 }
5646
5647 case PERF_EVENT_IOC_SET_FILTER:
5648 return perf_event_set_filter(event, (void __user *)arg);
5649
5650 case PERF_EVENT_IOC_SET_BPF:
5651 {
5652 struct bpf_prog *prog;
5653 int err;
5654
5655 prog = bpf_prog_get(arg);
5656 if (IS_ERR(prog))
5657 return PTR_ERR(prog);
5658
5659 err = perf_event_set_bpf_prog(event, prog, 0);
5660 if (err) {
5661 bpf_prog_put(prog);
5662 return err;
5663 }
5664
5665 return 0;
5666 }
5667
5668 case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5669 struct perf_buffer *rb;
5670
5671 rcu_read_lock();
5672 rb = rcu_dereference(event->rb);
5673 if (!rb || !rb->nr_pages) {
5674 rcu_read_unlock();
5675 return -EINVAL;
5676 }
5677 rb_toggle_paused(rb, !!arg);
5678 rcu_read_unlock();
5679 return 0;
5680 }
5681
5682 case PERF_EVENT_IOC_QUERY_BPF:
5683 return perf_event_query_prog_array(event, (void __user *)arg);
5684
5685 case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5686 struct perf_event_attr new_attr;
5687 int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5688 &new_attr);
5689
5690 if (err)
5691 return err;
5692
5693 return perf_event_modify_attr(event, &new_attr);
5694 }
5695 default:
5696 return -ENOTTY;
5697 }
5698
5699 if (flags & PERF_IOC_FLAG_GROUP)
5700 perf_event_for_each(event, func);
5701 else
5702 perf_event_for_each_child(event, func);
5703
5704 return 0;
5705 }
5706
5707 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5708 {
5709 struct perf_event *event = file->private_data;
5710 struct perf_event_context *ctx;
5711 long ret;
5712
5713
5714 ret = security_perf_event_write(event);
5715 if (ret)
5716 return ret;
5717
5718 ctx = perf_event_ctx_lock(event);
5719 ret = _perf_ioctl(event, cmd, arg);
5720 perf_event_ctx_unlock(event, ctx);
5721
5722 return ret;
5723 }
5724
5725 #ifdef CONFIG_COMPAT
5726 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
5727 unsigned long arg)
5728 {
5729 switch (_IOC_NR(cmd)) {
5730 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
5731 case _IOC_NR(PERF_EVENT_IOC_ID):
5732 case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
5733 case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
5734 /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case). */
5735 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
5736 cmd &= ~IOCSIZE_MASK;
5737 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
5738 }
5739 break;
5740 }
5741 return perf_ioctl(file, cmd, arg);
5742 }
5743 #else
5744 # define perf_compat_ioctl NULL
5745 #endif
5746
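/*
 * Enable all events owned by the current task; called from the
 * prctl(PR_TASK_PERF_EVENTS_ENABLE) path. perf_event_task_disable() below
 * is the PR_TASK_PERF_EVENTS_DISABLE counterpart.
 */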
5747 int perf_event_task_enable(void)
5748 {
5749 struct perf_event_context *ctx;
5750 struct perf_event *event;
5751
5752 mutex_lock(&current->perf_event_mutex);
5753 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5754 ctx = perf_event_ctx_lock(event);
5755 perf_event_for_each_child(event, _perf_event_enable);
5756 perf_event_ctx_unlock(event, ctx);
5757 }
5758 mutex_unlock(&current->perf_event_mutex);
5759
5760 return 0;
5761 }
5762
5763 int perf_event_task_disable(void)
5764 {
5765 struct perf_event_context *ctx;
5766 struct perf_event *event;
5767
5768 mutex_lock(&current->perf_event_mutex);
5769 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5770 ctx = perf_event_ctx_lock(event);
5771 perf_event_for_each_child(event, _perf_event_disable);
5772 perf_event_ctx_unlock(event, ctx);
5773 }
5774 mutex_unlock(&current->perf_event_mutex);
5775
5776 return 0;
5777 }
5778
5779 static int perf_event_index(struct perf_event *event)
5780 {
5781 if (event->hw.state & PERF_HES_STOPPED)
5782 return 0;
5783
5784 if (event->state != PERF_EVENT_STATE_ACTIVE)
5785 return 0;
5786
5787 return event->pmu->event_idx(event);
5788 }
5789
5790 static void perf_event_init_userpage(struct perf_event *event)
5791 {
5792 struct perf_event_mmap_page *userpg;
5793 struct perf_buffer *rb;
5794
5795 rcu_read_lock();
5796 rb = rcu_dereference(event->rb);
5797 if (!rb)
5798 goto unlock;
5799
5800 userpg = rb->user_page;
5801
5802 /* Allow new userspace to detect that bit 0 is deprecated. */
5803 userpg->cap_bit0_is_deprecated = 1;
5804 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
5805 userpg->data_offset = PAGE_SIZE;
5806 userpg->data_size = perf_data_size(rb);
5807
5808 unlock:
5809 rcu_read_unlock();
5810 }
5811
5812 void __weak arch_perf_update_userpage(
5813 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
5814 {
5815 }
5816
5817 /*
5818 * Callers need to ensure there can be no nesting of this function,
5819 * otherwise the seqlock logic goes bad. We cannot serialize this
5820 * because the arch code calls this from NMI context.
5821 */
5822 void perf_event_update_userpage(struct perf_event *event)
5823 {
5824 struct perf_event_mmap_page *userpg;
5825 struct perf_buffer *rb;
5826 u64 enabled, running, now;
5827
5828 rcu_read_lock();
5829 rb = rcu_dereference(event->rb);
5830 if (!rb)
5831 goto unlock;
5832
5833 /*
5834 * Compute total_time_enabled and total_time_running
5835 * based on the snapshot values taken when the event
5836 * was last scheduled in.
5837 *
5838 * We cannot simply call update_context_time()
5839 * because of locking issues: we can be called
5840 * from NMI context.
5841 */
5842 calc_timer_values(event, &now, &enabled, &running);
5843
5844 userpg = rb->user_page;
5845 /*
5846 * Disable preemption to guarantee consistent time stamps are
5847 * stored to the user page.
5848 */
5849 preempt_disable();
5850 ++userpg->lock;
5851 barrier();
5852 userpg->index = perf_event_index(event);
5853 userpg->offset = perf_event_count(event);
5854 if (userpg->index)
5855 userpg->offset -= local64_read(&event->hw.prev_count);
5856
5857 userpg->time_enabled = enabled +
5858 atomic64_read(&event->child_total_time_enabled);
5859
5860 userpg->time_running = running +
5861 atomic64_read(&event->child_total_time_running);
5862
5863 arch_perf_update_userpage(event, userpg, now);
5864
5865 barrier();
5866 ++userpg->lock;
5867 preempt_enable();
5868 unlock:
5869 rcu_read_unlock();
5870 }
5871 EXPORT_SYMBOL_GPL(perf_event_update_userpage);
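/*
 * Userspace reads the fields published above with a seqlock-style retry
 * loop (sketch, assuming 'pc' points at the mmap'ed
 * struct perf_event_mmap_page):
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		idx    = pc->index;
 *		offset = pc->offset;
 *		...
 *		barrier();
 *	} while (pc->lock != seq || (seq & 1));
 *
 * An odd 'lock' value means an update was in progress and the snapshot
 * must be retried.
 */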
5872
5873 static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
5874 {
5875 struct perf_event *event = vmf->vma->vm_file->private_data;
5876 struct perf_buffer *rb;
5877 vm_fault_t ret = VM_FAULT_SIGBUS;
5878
5879 if (vmf->flags & FAULT_FLAG_MKWRITE) {
5880 if (vmf->pgoff == 0)
5881 ret = 0;
5882 return ret;
5883 }
5884
5885 rcu_read_lock();
5886 rb = rcu_dereference(event->rb);
5887 if (!rb)
5888 goto unlock;
5889
5890 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5891 goto unlock;
5892
5893 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5894 if (!vmf->page)
5895 goto unlock;
5896
5897 get_page(vmf->page);
5898 vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5899 vmf->page->index = vmf->pgoff;
5900
5901 ret = 0;
5902 unlock:
5903 rcu_read_unlock();
5904
5905 return ret;
5906 }
5907
5908 static void ring_buffer_attach(struct perf_event *event,
5909 struct perf_buffer *rb)
5910 {
5911 struct perf_buffer *old_rb = NULL;
5912 unsigned long flags;
5913
5914 WARN_ON_ONCE(event->parent);
5915
5916 if (event->rb) {
5917
5918
5919
5920
5921 WARN_ON_ONCE(event->rcu_pending);
5922
5923 old_rb = event->rb;
5924 spin_lock_irqsave(&old_rb->event_lock, flags);
5925 list_del_rcu(&event->rb_entry);
5926 spin_unlock_irqrestore(&old_rb->event_lock, flags);
5927
5928 event->rcu_batches = get_state_synchronize_rcu();
5929 event->rcu_pending = 1;
5930 }
5931
5932 if (rb) {
5933 if (event->rcu_pending) {
5934 cond_synchronize_rcu(event->rcu_batches);
5935 event->rcu_pending = 0;
5936 }
5937
5938 spin_lock_irqsave(&rb->event_lock, flags);
5939 list_add_rcu(&event->rb_entry, &rb->event_list);
5940 spin_unlock_irqrestore(&rb->event_lock, flags);
5941 }
5942
5943 /*
5944 * Avoid racing with perf_mmap_close(AUX): stop the event
5945 * before swizzling the event::rb pointer; if it's getting
5946 * unmapped, its aux_mmap_count will be 0 and it won't
5947 * restart. See the comment in __perf_pmu_output_stop().
5948 *
5949 * Data will inevitably be lost when set_output is done in
5950 * mid-air, but then again, whoever does it like this is
5951 * not in for the data anyway.
5952 */
5953 if (has_aux(event))
5954 perf_event_stop(event, 0);
5955
5956 rcu_assign_pointer(event->rb, rb);
5957
5958 if (old_rb) {
5959 ring_buffer_put(old_rb);
5960
5961
5962
5963
5964
5965 wake_up_all(&event->waitq);
5966 }
5967 }
5968
5969 static void ring_buffer_wakeup(struct perf_event *event)
5970 {
5971 struct perf_buffer *rb;
5972
5973 if (event->parent)
5974 event = event->parent;
5975
5976 rcu_read_lock();
5977 rb = rcu_dereference(event->rb);
5978 if (rb) {
5979 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5980 wake_up_all(&event->waitq);
5981 }
5982 rcu_read_unlock();
5983 }
5984
5985 struct perf_buffer *ring_buffer_get(struct perf_event *event)
5986 {
5987 struct perf_buffer *rb;
5988
5989 if (event->parent)
5990 event = event->parent;
5991
5992 rcu_read_lock();
5993 rb = rcu_dereference(event->rb);
5994 if (rb) {
5995 if (!refcount_inc_not_zero(&rb->refcount))
5996 rb = NULL;
5997 }
5998 rcu_read_unlock();
5999
6000 return rb;
6001 }
6002
6003 void ring_buffer_put(struct perf_buffer *rb)
6004 {
6005 if (!refcount_dec_and_test(&rb->refcount))
6006 return;
6007
6008 WARN_ON_ONCE(!list_empty(&rb->event_list));
6009
6010 call_rcu(&rb->rcu_head, rb_free_rcu);
6011 }
6012
6013 static void perf_mmap_open(struct vm_area_struct *vma)
6014 {
6015 struct perf_event *event = vma->vm_file->private_data;
6016
6017 atomic_inc(&event->mmap_count);
6018 atomic_inc(&event->rb->mmap_count);
6019
6020 if (vma->vm_pgoff)
6021 atomic_inc(&event->rb->aux_mmap_count);
6022
6023 if (event->pmu->event_mapped)
6024 event->pmu->event_mapped(event, vma->vm_mm);
6025 }
6026
6027 static void perf_pmu_output_stop(struct perf_event *event);
6028
6029 /*
6030 * A buffer can be mmap()ed multiple times; either directly through the
6031 * same event, or through other events by use of perf_event_set_output().
6032 *
6033 * In order to undo the VM accounting done by perf_mmap() we need to
6034 * destroy the buffer here, where we still have a VM context. This means
6035 * we need to detach all events redirecting to us.
6036 */
6037 static void perf_mmap_close(struct vm_area_struct *vma)
6038 {
6039 struct perf_event *event = vma->vm_file->private_data;
6040 struct perf_buffer *rb = ring_buffer_get(event);
6041 struct user_struct *mmap_user = rb->mmap_user;
6042 int mmap_locked = rb->mmap_locked;
6043 unsigned long size = perf_data_size(rb);
6044 bool detach_rest = false;
6045
6046 if (event->pmu->event_unmapped)
6047 event->pmu->event_unmapped(event, vma->vm_mm);
6048
6049
6050
6051
6052
6053
6054 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
6055 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
6056
6057
6058
6059
6060
6061
6062 perf_pmu_output_stop(event);
6063
6064
6065 atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
6066 atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
6067
6068
6069 rb_free_aux(rb);
6070 WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
6071
6072 mutex_unlock(&event->mmap_mutex);
6073 }
6074
6075 if (atomic_dec_and_test(&rb->mmap_count))
6076 detach_rest = true;
6077
6078 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
6079 goto out_put;
6080
6081 ring_buffer_attach(event, NULL);
6082 mutex_unlock(&event->mmap_mutex);
6083
6084
6085 if (!detach_rest)
6086 goto out_put;
6087
6088
6089
6090
6091
6092
6093 again:
6094 rcu_read_lock();
6095 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
6096 if (!atomic_long_inc_not_zero(&event->refcount)) {
6097
6098
6099
6100
6101 continue;
6102 }
6103 rcu_read_unlock();
6104
6105 mutex_lock(&event->mmap_mutex);
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116 if (event->rb == rb)
6117 ring_buffer_attach(event, NULL);
6118
6119 mutex_unlock(&event->mmap_mutex);
6120 put_event(event);
6121
6122
6123
6124
6125
6126 goto again;
6127 }
6128 rcu_read_unlock();
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139 atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
6140 &mmap_user->locked_vm);
6141 atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
6142 free_uid(mmap_user);
6143
6144 out_put:
6145 ring_buffer_put(rb);
6146 }
6147
6148 static const struct vm_operations_struct perf_mmap_vmops = {
6149 .open = perf_mmap_open,
6150 .close = perf_mmap_close,
6151 .fault = perf_mmap_fault,
6152 .page_mkwrite = perf_mmap_fault,
6153 };
6154
6155 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
6156 {
6157 struct perf_event *event = file->private_data;
6158 unsigned long user_locked, user_lock_limit;
6159 struct user_struct *user = current_user();
6160 struct perf_buffer *rb = NULL;
6161 unsigned long locked, lock_limit;
6162 unsigned long vma_size;
6163 unsigned long nr_pages;
6164 long user_extra = 0, extra = 0;
6165 int ret = 0, flags = 0;
6166
6167 /*
6168 * Don't allow mmap() of inherited per-task counters. This would
6169 * create a performance issue due to all children writing to the
6170 * same rb.
6171 */
6172 if (event->cpu == -1 && event->attr.inherit)
6173 return -EINVAL;
6174
6175 if (!(vma->vm_flags & VM_SHARED))
6176 return -EINVAL;
6177
6178 ret = security_perf_event_read(event);
6179 if (ret)
6180 return ret;
6181
6182 vma_size = vma->vm_end - vma->vm_start;
6183
6184 if (vma->vm_pgoff == 0) {
6185 nr_pages = (vma_size / PAGE_SIZE) - 1;
6186 } else {
6187 /*
6188 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
6189 * mapped, all subsequent mappings should have the same size
6190 * and offset. Must be above the normal perf buffer.
6191 */
6192 u64 aux_offset, aux_size;
6193
6194 if (!event->rb)
6195 return -EINVAL;
6196
6197 nr_pages = vma_size / PAGE_SIZE;
6198
6199 mutex_lock(&event->mmap_mutex);
6200 ret = -EINVAL;
6201
6202 rb = event->rb;
6203 if (!rb)
6204 goto aux_unlock;
6205
6206 aux_offset = READ_ONCE(rb->user_page->aux_offset);
6207 aux_size = READ_ONCE(rb->user_page->aux_size);
6208
6209 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
6210 goto aux_unlock;
6211
6212 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
6213 goto aux_unlock;
6214
6215
6216 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
6217 goto aux_unlock;
6218
6219 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
6220 goto aux_unlock;
6221
6222
6223 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
6224 goto aux_unlock;
6225
6226 if (!is_power_of_2(nr_pages))
6227 goto aux_unlock;
6228
6229 if (!atomic_inc_not_zero(&rb->mmap_count))
6230 goto aux_unlock;
6231
6232 if (rb_has_aux(rb)) {
6233 atomic_inc(&rb->aux_mmap_count);
6234 ret = 0;
6235 goto unlock;
6236 }
6237
6238 atomic_set(&rb->aux_mmap_count, 1);
6239 user_extra = nr_pages;
6240
6241 goto accounting;
6242 }
6243
6244 /*
6245 * If we have rb pages ensure they're a power-of-two number, so we
6246 * can do bitmasks instead of modulo.
6247 */
6248 if (nr_pages != 0 && !is_power_of_2(nr_pages))
6249 return -EINVAL;
6250
6251 if (vma_size != PAGE_SIZE * (1 + nr_pages))
6252 return -EINVAL;
6253
6254 WARN_ON_ONCE(event->ctx->parent_ctx);
6255 again:
6256 mutex_lock(&event->mmap_mutex);
6257 if (event->rb) {
6258 if (data_page_nr(event->rb) != nr_pages) {
6259 ret = -EINVAL;
6260 goto unlock;
6261 }
6262
6263 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
6264
6265
6266
6267
6268 ring_buffer_attach(event, NULL);
6269 mutex_unlock(&event->mmap_mutex);
6270 goto again;
6271 }
6272
6273 goto unlock;
6274 }
6275
6276 user_extra = nr_pages + 1;
6277
6278 accounting:
6279 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
6280
6281
6282
6283
6284 user_lock_limit *= num_online_cpus();
6285
6286 user_locked = atomic_long_read(&user->locked_vm);
6287
6288
6289
6290
6291
6292 if (user_locked > user_lock_limit)
6293 user_locked = user_lock_limit;
6294 user_locked += user_extra;
6295
6296 if (user_locked > user_lock_limit) {
6297
6298
6299
6300
6301 extra = user_locked - user_lock_limit;
6302 user_extra -= extra;
6303 }
6304
6305 lock_limit = rlimit(RLIMIT_MEMLOCK);
6306 lock_limit >>= PAGE_SHIFT;
6307 locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
6308
6309 if ((locked > lock_limit) && perf_is_paranoid() &&
6310 !capable(CAP_IPC_LOCK)) {
6311 ret = -EPERM;
6312 goto unlock;
6313 }
6314
6315 WARN_ON(!rb && event->rb);
6316
6317 if (vma->vm_flags & VM_WRITE)
6318 flags |= RING_BUFFER_WRITABLE;
6319
6320 if (!rb) {
6321 rb = rb_alloc(nr_pages,
6322 event->attr.watermark ? event->attr.wakeup_watermark : 0,
6323 event->cpu, flags);
6324
6325 if (!rb) {
6326 ret = -ENOMEM;
6327 goto unlock;
6328 }
6329
6330 atomic_set(&rb->mmap_count, 1);
6331 rb->mmap_user = get_current_user();
6332 rb->mmap_locked = extra;
6333
6334 ring_buffer_attach(event, rb);
6335
6336 perf_event_update_time(event);
6337 perf_event_init_userpage(event);
6338 perf_event_update_userpage(event);
6339 } else {
6340 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
6341 event->attr.aux_watermark, flags);
6342 if (!ret)
6343 rb->aux_mmap_locked = extra;
6344 }
6345
6346 unlock:
6347 if (!ret) {
6348 atomic_long_add(user_extra, &user->locked_vm);
6349 atomic64_add(extra, &vma->vm_mm->pinned_vm);
6350
6351 atomic_inc(&event->mmap_count);
6352 } else if (rb) {
6353 atomic_dec(&rb->mmap_count);
6354 }
6355 aux_unlock:
6356 mutex_unlock(&event->mmap_mutex);
6357
6358 /*
6359 * Since pinned accounting is per vm we cannot allow fork() to copy our
6360 * vma.
6361 */
6362 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
6363 vma->vm_ops = &perf_mmap_vmops;
6364
6365 if (event->pmu->event_mapped)
6366 event->pmu->event_mapped(event, vma->vm_mm);
6367
6368 return ret;
6369 }
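/*
 * Mapping-size sketch (hypothetical userspace caller, not part of this
 * file): the data area is one control page plus a power-of-two number of
 * data pages, mapped shared at offset 0:
 *
 *	size_t pages = 1 + (1UL << n);
 *	void *base = mmap(NULL, pages * sysconf(_SC_PAGESIZE),
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, perf_fd, 0);
 *
 * An AUX area, if used, is mapped at the aux_offset/aux_size the caller
 * previously wrote into the user page.
 */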
6370
6371 static int perf_fasync(int fd, struct file *filp, int on)
6372 {
6373 struct inode *inode = file_inode(filp);
6374 struct perf_event *event = filp->private_data;
6375 int retval;
6376
6377 inode_lock(inode);
6378 retval = fasync_helper(fd, filp, on, &event->fasync);
6379 inode_unlock(inode);
6380
6381 if (retval < 0)
6382 return retval;
6383
6384 return 0;
6385 }
6386
6387 static const struct file_operations perf_fops = {
6388 .llseek = no_llseek,
6389 .release = perf_release,
6390 .read = perf_read,
6391 .poll = perf_poll,
6392 .unlocked_ioctl = perf_ioctl,
6393 .compat_ioctl = perf_compat_ioctl,
6394 .mmap = perf_mmap,
6395 .fasync = perf_fasync,
6396 };
6397
6398
6399
6400
6401
6402
6403
6404
6405 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
6406 {
6407 /* only the parent has fasync state */
6408 if (event->parent)
6409 event = event->parent;
6410 return &event->fasync;
6411 }
6412
6413 void perf_event_wakeup(struct perf_event *event)
6414 {
6415 ring_buffer_wakeup(event);
6416
6417 if (event->pending_kill) {
6418 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
6419 event->pending_kill = 0;
6420 }
6421 }
6422
6423 static void perf_sigtrap(struct perf_event *event)
6424 {
6425 /*
6426 * We'd expect this to only occur if the irq_work is delayed and either
6427 * ctx->task or current has changed in the meantime. This can be the
6428 * case on architectures that do not implement arch_irq_work_raise().
6429 */
6430 if (WARN_ON_ONCE(event->ctx->task != current))
6431 return;
6432
6433 /*
6434 * The irq_work can race with the task exiting; don't signal in that case.
6435 */
6436 if (current->flags & PF_EXITING)
6437 return;
6438
6439 send_sig_perf((void __user *)event->pending_addr,
6440 event->attr.type, event->attr.sig_data);
6441 }
6442
6443 static void perf_pending_event_disable(struct perf_event *event)
6444 {
6445 int cpu = READ_ONCE(event->pending_disable);
6446
6447 if (cpu < 0)
6448 return;
6449
6450 if (cpu == smp_processor_id()) {
6451 WRITE_ONCE(event->pending_disable, -1);
6452
6453 if (event->attr.sigtrap) {
6454 perf_sigtrap(event);
6455 atomic_set_release(&event->event_limit, 1);
6456 return;
6457 }
6458
6459 perf_event_disable_local(event);
6460 return;
6461 }
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483 irq_work_queue_on(&event->pending, cpu);
6484 }
6485
6486 static void perf_pending_event(struct irq_work *entry)
6487 {
6488 struct perf_event *event = container_of(entry, struct perf_event, pending);
6489 int rctx;
6490
6491 rctx = perf_swevent_get_recursion_context();
6492
6493
6494
6495
6496
6497 perf_pending_event_disable(event);
6498
6499 if (event->pending_wakeup) {
6500 event->pending_wakeup = 0;
6501 perf_event_wakeup(event);
6502 }
6503
6504 if (rctx >= 0)
6505 perf_swevent_put_recursion_context(rctx);
6506 }
6507
6508 #ifdef CONFIG_GUEST_PERF_EVENTS
6509 struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
6510
6511 DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
6512 DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
6513 DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);
6514
6515 void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6516 {
6517 if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
6518 return;
6519
6520 rcu_assign_pointer(perf_guest_cbs, cbs);
6521 static_call_update(__perf_guest_state, cbs->state);
6522 static_call_update(__perf_guest_get_ip, cbs->get_ip);
6523
6524
6525 if (cbs->handle_intel_pt_intr)
6526 static_call_update(__perf_guest_handle_intel_pt_intr,
6527 cbs->handle_intel_pt_intr);
6528 }
6529 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
6530
6531 void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6532 {
6533 if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
6534 return;
6535
6536 rcu_assign_pointer(perf_guest_cbs, NULL);
6537 static_call_update(__perf_guest_state, (void *)&__static_call_return0);
6538 static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
6539 static_call_update(__perf_guest_handle_intel_pt_intr,
6540 (void *)&__static_call_return0);
6541 synchronize_rcu();
6542 }
6543 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
6544 #endif
6545
6546 static void
6547 perf_output_sample_regs(struct perf_output_handle *handle,
6548 struct pt_regs *regs, u64 mask)
6549 {
6550 int bit;
6551 DECLARE_BITMAP(_mask, 64);
6552
6553 bitmap_from_u64(_mask, mask);
6554 for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
6555 u64 val;
6556
6557 val = perf_reg_value(regs, bit);
6558 perf_output_put(handle, val);
6559 }
6560 }
6561
6562 static void perf_sample_regs_user(struct perf_regs *regs_user,
6563 struct pt_regs *regs)
6564 {
6565 if (user_mode(regs)) {
6566 regs_user->abi = perf_reg_abi(current);
6567 regs_user->regs = regs;
6568 } else if (!(current->flags & PF_KTHREAD)) {
6569 perf_get_regs_user(regs_user, regs);
6570 } else {
6571 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
6572 regs_user->regs = NULL;
6573 }
6574 }
6575
6576 static void perf_sample_regs_intr(struct perf_regs *regs_intr,
6577 struct pt_regs *regs)
6578 {
6579 regs_intr->regs = regs;
6580 regs_intr->abi = perf_reg_abi(current);
6581 }
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591 static u64 perf_ustack_task_size(struct pt_regs *regs)
6592 {
6593 unsigned long addr = perf_user_stack_pointer(regs);
6594
6595 if (!addr || addr >= TASK_SIZE)
6596 return 0;
6597
6598 return TASK_SIZE - addr;
6599 }
6600
6601 static u16
6602 perf_sample_ustack_size(u16 stack_size, u16 header_size,
6603 struct pt_regs *regs)
6604 {
6605 u64 task_size;
6606
6607
6608 if (!regs)
6609 return 0;
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
6622 stack_size = min(stack_size, (u16) task_size);
6623
6624
6625 header_size += 2 * sizeof(u64);
6626
6627
6628 if ((u16) (header_size + stack_size) < header_size) {
6629
6630
6631
6632
6633 stack_size = USHRT_MAX - header_size - sizeof(u64);
6634 stack_size = round_up(stack_size, sizeof(u64));
6635 }
6636
6637 return stack_size;
6638 }
6639
6640 static void
6641 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
6642 struct pt_regs *regs)
6643 {
6644
6645 if (!regs) {
6646 u64 size = 0;
6647 perf_output_put(handle, size);
6648 } else {
6649 unsigned long sp;
6650 unsigned int rem;
6651 u64 dyn_size;
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665 perf_output_put(handle, dump_size);
6666
6667
6668 sp = perf_user_stack_pointer(regs);
6669 rem = __output_copy_user(handle, (void *) sp, dump_size);
6670 dyn_size = dump_size - rem;
6671
6672 perf_output_skip(handle, rem);
6673
6674
6675 perf_output_put(handle, dyn_size);
6676 }
6677 }
6678
6679 static unsigned long perf_prepare_sample_aux(struct perf_event *event,
6680 struct perf_sample_data *data,
6681 size_t size)
6682 {
6683 struct perf_event *sampler = event->aux_event;
6684 struct perf_buffer *rb;
6685
6686 data->aux_size = 0;
6687
6688 if (!sampler)
6689 goto out;
6690
6691 if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
6692 goto out;
6693
6694 if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
6695 goto out;
6696
6697 rb = ring_buffer_get(sampler);
6698 if (!rb)
6699 goto out;
6700
6701
6702
6703
6704
6705 if (READ_ONCE(rb->aux_in_sampling)) {
6706 data->aux_size = 0;
6707 } else {
6708 size = min_t(size_t, size, perf_aux_size(rb));
6709 data->aux_size = ALIGN(size, sizeof(u64));
6710 }
6711 ring_buffer_put(rb);
6712
6713 out:
6714 return data->aux_size;
6715 }
6716
6717 static long perf_pmu_snapshot_aux(struct perf_buffer *rb,
6718 struct perf_event *event,
6719 struct perf_output_handle *handle,
6720 unsigned long size)
6721 {
6722 unsigned long flags;
6723 long ret;
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734 local_irq_save(flags);
6735
6736
6737
6738
6739 WRITE_ONCE(rb->aux_in_sampling, 1);
6740 barrier();
6741
6742 ret = event->pmu->snapshot_aux(event, handle, size);
6743
6744 barrier();
6745 WRITE_ONCE(rb->aux_in_sampling, 0);
6746 local_irq_restore(flags);
6747
6748 return ret;
6749 }
6750
6751 static void perf_aux_sample_output(struct perf_event *event,
6752 struct perf_output_handle *handle,
6753 struct perf_sample_data *data)
6754 {
6755 struct perf_event *sampler = event->aux_event;
6756 struct perf_buffer *rb;
6757 unsigned long pad;
6758 long size;
6759
6760 if (WARN_ON_ONCE(!sampler || !data->aux_size))
6761 return;
6762
6763 rb = ring_buffer_get(sampler);
6764 if (!rb)
6765 return;
6766
6767 size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
6768
6769
6770
6771
6772
6773
6774
6775 if (WARN_ON_ONCE(size < 0))
6776 goto out_put;
6777
6778
6779
6780
6781
6782 pad = data->aux_size - size;
6783 if (WARN_ON_ONCE(pad >= sizeof(u64)))
6784 pad = 8;
6785
6786 if (pad) {
6787 u64 zero = 0;
6788 perf_output_copy(handle, &zero, pad);
6789 }
6790
6791 out_put:
6792 ring_buffer_put(rb);
6793 }
6794
6795 static void __perf_event_header__init_id(struct perf_event_header *header,
6796 struct perf_sample_data *data,
6797 struct perf_event *event)
6798 {
6799 u64 sample_type = event->attr.sample_type;
6800
6801 data->type = sample_type;
6802 header->size += event->id_header_size;
6803
6804 if (sample_type & PERF_SAMPLE_TID) {
6805
6806 data->tid_entry.pid = perf_event_pid(event, current);
6807 data->tid_entry.tid = perf_event_tid(event, current);
6808 }
6809
6810 if (sample_type & PERF_SAMPLE_TIME)
6811 data->time = perf_event_clock(event);
6812
6813 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
6814 data->id = primary_event_id(event);
6815
6816 if (sample_type & PERF_SAMPLE_STREAM_ID)
6817 data->stream_id = event->id;
6818
6819 if (sample_type & PERF_SAMPLE_CPU) {
6820 data->cpu_entry.cpu = raw_smp_processor_id();
6821 data->cpu_entry.reserved = 0;
6822 }
6823 }
6824
6825 void perf_event_header__init_id(struct perf_event_header *header,
6826 struct perf_sample_data *data,
6827 struct perf_event *event)
6828 {
6829 if (event->attr.sample_id_all)
6830 __perf_event_header__init_id(header, data, event);
6831 }
6832
6833 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
6834 struct perf_sample_data *data)
6835 {
6836 u64 sample_type = data->type;
6837
6838 if (sample_type & PERF_SAMPLE_TID)
6839 perf_output_put(handle, data->tid_entry);
6840
6841 if (sample_type & PERF_SAMPLE_TIME)
6842 perf_output_put(handle, data->time);
6843
6844 if (sample_type & PERF_SAMPLE_ID)
6845 perf_output_put(handle, data->id);
6846
6847 if (sample_type & PERF_SAMPLE_STREAM_ID)
6848 perf_output_put(handle, data->stream_id);
6849
6850 if (sample_type & PERF_SAMPLE_CPU)
6851 perf_output_put(handle, data->cpu_entry);
6852
6853 if (sample_type & PERF_SAMPLE_IDENTIFIER)
6854 perf_output_put(handle, data->id);
6855 }
6856
6857 void perf_event__output_id_sample(struct perf_event *event,
6858 struct perf_output_handle *handle,
6859 struct perf_sample_data *sample)
6860 {
6861 if (event->attr.sample_id_all)
6862 __perf_event__output_id_sample(handle, sample);
6863 }
6864
6865 static void perf_output_read_one(struct perf_output_handle *handle,
6866 struct perf_event *event,
6867 u64 enabled, u64 running)
6868 {
6869 u64 read_format = event->attr.read_format;
6870 u64 values[5];
6871 int n = 0;
6872
6873 values[n++] = perf_event_count(event);
6874 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
6875 values[n++] = enabled +
6876 atomic64_read(&event->child_total_time_enabled);
6877 }
6878 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
6879 values[n++] = running +
6880 atomic64_read(&event->child_total_time_running);
6881 }
6882 if (read_format & PERF_FORMAT_ID)
6883 values[n++] = primary_event_id(event);
6884 if (read_format & PERF_FORMAT_LOST)
6885 values[n++] = atomic64_read(&event->lost_samples);
6886
6887 __output_copy(handle, values, n * sizeof(u64));
6888 }
6889
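/*
 * PERF_FORMAT_GROUP layout written below: nr, [time_enabled],
 * [time_running], then one { value, [id], [lost] } tuple for the leader
 * followed by one such tuple per sibling.
 */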
6890 static void perf_output_read_group(struct perf_output_handle *handle,
6891 struct perf_event *event,
6892 u64 enabled, u64 running)
6893 {
6894 struct perf_event *leader = event->group_leader, *sub;
6895 u64 read_format = event->attr.read_format;
6896 unsigned long flags;
6897 u64 values[6];
6898 int n = 0;
6899
6900
6901
6902
6903
6904 local_irq_save(flags);
6905
6906 values[n++] = 1 + leader->nr_siblings;
6907
6908 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
6909 values[n++] = enabled;
6910
6911 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
6912 values[n++] = running;
6913
6914 if ((leader != event) &&
6915 (leader->state == PERF_EVENT_STATE_ACTIVE))
6916 leader->pmu->read(leader);
6917
6918 values[n++] = perf_event_count(leader);
6919 if (read_format & PERF_FORMAT_ID)
6920 values[n++] = primary_event_id(leader);
6921 if (read_format & PERF_FORMAT_LOST)
6922 values[n++] = atomic64_read(&leader->lost_samples);
6923
6924 __output_copy(handle, values, n * sizeof(u64));
6925
6926 for_each_sibling_event(sub, leader) {
6927 n = 0;
6928
6929 if ((sub != event) &&
6930 (sub->state == PERF_EVENT_STATE_ACTIVE))
6931 sub->pmu->read(sub);
6932
6933 values[n++] = perf_event_count(sub);
6934 if (read_format & PERF_FORMAT_ID)
6935 values[n++] = primary_event_id(sub);
6936 if (read_format & PERF_FORMAT_LOST)
6937 values[n++] = atomic64_read(&sub->lost_samples);
6938
6939 __output_copy(handle, values, n * sizeof(u64));
6940 }
6941
6942 local_irq_restore(flags);
6943 }
6944
6945 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
6946 PERF_FORMAT_TOTAL_TIME_RUNNING)
6947
6948
6949
6950
6951
6952
6953
6954
6955 static void perf_output_read(struct perf_output_handle *handle,
6956 struct perf_event *event)
6957 {
6958 u64 enabled = 0, running = 0, now;
6959 u64 read_format = event->attr.read_format;
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970 if (read_format & PERF_FORMAT_TOTAL_TIMES)
6971 calc_timer_values(event, &now, &enabled, &running);
6972
6973 if (event->attr.read_format & PERF_FORMAT_GROUP)
6974 perf_output_read_group(handle, event, enabled, running);
6975 else
6976 perf_output_read_one(handle, event, enabled, running);
6977 }
6978
6979 static inline bool perf_sample_save_hw_index(struct perf_event *event)
6980 {
6981 return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
6982 }
6983
6984 void perf_output_sample(struct perf_output_handle *handle,
6985 struct perf_event_header *header,
6986 struct perf_sample_data *data,
6987 struct perf_event *event)
6988 {
6989 u64 sample_type = data->type;
6990
6991 perf_output_put(handle, *header);
6992
6993 if (sample_type & PERF_SAMPLE_IDENTIFIER)
6994 perf_output_put(handle, data->id);
6995
6996 if (sample_type & PERF_SAMPLE_IP)
6997 perf_output_put(handle, data->ip);
6998
6999 if (sample_type & PERF_SAMPLE_TID)
7000 perf_output_put(handle, data->tid_entry);
7001
7002 if (sample_type & PERF_SAMPLE_TIME)
7003 perf_output_put(handle, data->time);
7004
7005 if (sample_type & PERF_SAMPLE_ADDR)
7006 perf_output_put(handle, data->addr);
7007
7008 if (sample_type & PERF_SAMPLE_ID)
7009 perf_output_put(handle, data->id);
7010
7011 if (sample_type & PERF_SAMPLE_STREAM_ID)
7012 perf_output_put(handle, data->stream_id);
7013
7014 if (sample_type & PERF_SAMPLE_CPU)
7015 perf_output_put(handle, data->cpu_entry);
7016
7017 if (sample_type & PERF_SAMPLE_PERIOD)
7018 perf_output_put(handle, data->period);
7019
7020 if (sample_type & PERF_SAMPLE_READ)
7021 perf_output_read(handle, event);
7022
7023 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
7024 int size = 1;
7025
7026 size += data->callchain->nr;
7027 size *= sizeof(u64);
7028 __output_copy(handle, data->callchain, size);
7029 }
7030
7031 if (sample_type & PERF_SAMPLE_RAW) {
7032 struct perf_raw_record *raw = data->raw;
7033
7034 if (raw) {
7035 struct perf_raw_frag *frag = &raw->frag;
7036
7037 perf_output_put(handle, raw->size);
7038 do {
7039 if (frag->copy) {
7040 __output_custom(handle, frag->copy,
7041 frag->data, frag->size);
7042 } else {
7043 __output_copy(handle, frag->data,
7044 frag->size);
7045 }
7046 if (perf_raw_frag_last(frag))
7047 break;
7048 frag = frag->next;
7049 } while (1);
7050 if (frag->pad)
7051 __output_skip(handle, NULL, frag->pad);
7052 } else {
7053 struct {
7054 u32 size;
7055 u32 data;
7056 } raw = {
7057 .size = sizeof(u32),
7058 .data = 0,
7059 };
7060 perf_output_put(handle, raw);
7061 }
7062 }
7063
7064 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
7065 if (data->br_stack) {
7066 size_t size;
7067
7068 size = data->br_stack->nr
7069 * sizeof(struct perf_branch_entry);
7070
7071 perf_output_put(handle, data->br_stack->nr);
7072 if (perf_sample_save_hw_index(event))
7073 perf_output_put(handle, data->br_stack->hw_idx);
7074 perf_output_copy(handle, data->br_stack->entries, size);
7075 } else {
7076
7077
7078
7079 u64 nr = 0;
7080 perf_output_put(handle, nr);
7081 }
7082 }
7083
7084 if (sample_type & PERF_SAMPLE_REGS_USER) {
7085 u64 abi = data->regs_user.abi;
7086
7087
7088
7089
7090
7091 perf_output_put(handle, abi);
7092
7093 if (abi) {
7094 u64 mask = event->attr.sample_regs_user;
7095 perf_output_sample_regs(handle,
7096 data->regs_user.regs,
7097 mask);
7098 }
7099 }
7100
7101 if (sample_type & PERF_SAMPLE_STACK_USER) {
7102 perf_output_sample_ustack(handle,
7103 data->stack_user_size,
7104 data->regs_user.regs);
7105 }
7106
7107 if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
7108 perf_output_put(handle, data->weight.full);
7109
7110 if (sample_type & PERF_SAMPLE_DATA_SRC)
7111 perf_output_put(handle, data->data_src.val);
7112
7113 if (sample_type & PERF_SAMPLE_TRANSACTION)
7114 perf_output_put(handle, data->txn);
7115
7116 if (sample_type & PERF_SAMPLE_REGS_INTR) {
7117 u64 abi = data->regs_intr.abi;
7118
7119
7120
7121
7122 perf_output_put(handle, abi);
7123
7124 if (abi) {
7125 u64 mask = event->attr.sample_regs_intr;
7126
7127 perf_output_sample_regs(handle,
7128 data->regs_intr.regs,
7129 mask);
7130 }
7131 }
7132
7133 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
7134 perf_output_put(handle, data->phys_addr);
7135
7136 if (sample_type & PERF_SAMPLE_CGROUP)
7137 perf_output_put(handle, data->cgroup);
7138
7139 if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
7140 perf_output_put(handle, data->data_page_size);
7141
7142 if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
7143 perf_output_put(handle, data->code_page_size);
7144
7145 if (sample_type & PERF_SAMPLE_AUX) {
7146 perf_output_put(handle, data->aux_size);
7147
7148 if (data->aux_size)
7149 perf_aux_sample_output(event, handle, data);
7150 }
7151
7152 if (!event->attr.watermark) {
7153 int wakeup_events = event->attr.wakeup_events;
7154
7155 if (wakeup_events) {
7156 struct perf_buffer *rb = handle->rb;
7157 int events = local_inc_return(&rb->events);
7158
7159 if (events >= wakeup_events) {
7160 local_sub(wakeup_events, &rb->events);
7161 local_inc(&rb->wakeup);
7162 }
7163 }
7164 }
7165 }
7166
7167 static u64 perf_virt_to_phys(u64 virt)
7168 {
7169 u64 phys_addr = 0;
7170
7171 if (!virt)
7172 return 0;
7173
7174 if (virt >= TASK_SIZE) {
7175
7176 if (virt_addr_valid((void *)(uintptr_t)virt) &&
7177 !(virt >= VMALLOC_START && virt < VMALLOC_END))
7178 phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
7179 } else {
7180
7181
7182
7183
7184
7185
7186
7187 if (current->mm != NULL) {
7188 struct page *p;
7189
7190 pagefault_disable();
7191 if (get_user_page_fast_only(virt, 0, &p)) {
7192 phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
7193 put_page(p);
7194 }
7195 pagefault_enable();
7196 }
7197 }
7198
7199 return phys_addr;
7200 }
7201
7202 /*
7203 * Return the page table (mapping) size of a given virtual address.
7204 */
7205 static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
7206 {
7207 u64 size = 0;
7208
7209 #ifdef CONFIG_HAVE_FAST_GUP
7210 pgd_t *pgdp, pgd;
7211 p4d_t *p4dp, p4d;
7212 pud_t *pudp, pud;
7213 pmd_t *pmdp, pmd;
7214 pte_t *ptep, pte;
7215
7216 pgdp = pgd_offset(mm, addr);
7217 pgd = READ_ONCE(*pgdp);
7218 if (pgd_none(pgd))
7219 return 0;
7220
7221 if (pgd_leaf(pgd))
7222 return pgd_leaf_size(pgd);
7223
7224 p4dp = p4d_offset_lockless(pgdp, pgd, addr);
7225 p4d = READ_ONCE(*p4dp);
7226 if (!p4d_present(p4d))
7227 return 0;
7228
7229 if (p4d_leaf(p4d))
7230 return p4d_leaf_size(p4d);
7231
7232 pudp = pud_offset_lockless(p4dp, p4d, addr);
7233 pud = READ_ONCE(*pudp);
7234 if (!pud_present(pud))
7235 return 0;
7236
7237 if (pud_leaf(pud))
7238 return pud_leaf_size(pud);
7239
7240 pmdp = pmd_offset_lockless(pudp, pud, addr);
7241 pmd = READ_ONCE(*pmdp);
7242 if (!pmd_present(pmd))
7243 return 0;
7244
7245 if (pmd_leaf(pmd))
7246 return pmd_leaf_size(pmd);
7247
7248 ptep = pte_offset_map(&pmd, addr);
7249 pte = ptep_get_lockless(ptep);
7250 if (pte_present(pte))
7251 size = pte_leaf_size(pte);
7252 pte_unmap(ptep);
7253 #endif
7254
7255 return size;
7256 }
7257
7258 static u64 perf_get_page_size(unsigned long addr)
7259 {
7260 struct mm_struct *mm;
7261 unsigned long flags;
7262 u64 size;
7263
7264 if (!addr)
7265 return 0;
7266
7267
7268
7269
7270
7271 local_irq_save(flags);
7272
7273 mm = current->mm;
7274 if (!mm) {
7275
7276
7277
7278
7279 mm = &init_mm;
7280 }
7281
7282 size = perf_get_pgtable_size(mm, addr);
7283
7284 local_irq_restore(flags);
7285
7286 return size;
7287 }
7288
7289 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
7290
7291 struct perf_callchain_entry *
7292 perf_callchain(struct perf_event *event, struct pt_regs *regs)
7293 {
7294 bool kernel = !event->attr.exclude_callchain_kernel;
7295 bool user = !event->attr.exclude_callchain_user;
7296
7297 bool crosstask = event->ctx->task && event->ctx->task != current;
7298 const u32 max_stack = event->attr.sample_max_stack;
7299 struct perf_callchain_entry *callchain;
7300
7301 if (!kernel && !user)
7302 return &__empty_callchain;
7303
7304 callchain = get_perf_callchain(regs, 0, kernel, user,
7305 max_stack, crosstask, true);
7306 return callchain ?: &__empty_callchain;
7307 }
7308
7309 void perf_prepare_sample(struct perf_event_header *header,
7310 struct perf_sample_data *data,
7311 struct perf_event *event,
7312 struct pt_regs *regs)
7313 {
7314 u64 sample_type = event->attr.sample_type;
7315
7316 header->type = PERF_RECORD_SAMPLE;
7317 header->size = sizeof(*header) + event->header_size;
7318
7319 header->misc = 0;
7320 header->misc |= perf_misc_flags(regs);
7321
7322 __perf_event_header__init_id(header, data, event);
7323
7324 if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
7325 data->ip = perf_instruction_pointer(regs);
7326
7327 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
7328 int size = 1;
7329
7330 if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
7331 data->callchain = perf_callchain(event, regs);
7332
7333 size += data->callchain->nr;
7334
7335 header->size += size * sizeof(u64);
7336 }
7337
7338 if (sample_type & PERF_SAMPLE_RAW) {
7339 struct perf_raw_record *raw = data->raw;
7340 int size;
7341
7342 if (raw) {
7343 struct perf_raw_frag *frag = &raw->frag;
7344 u32 sum = 0;
7345
7346 do {
7347 sum += frag->size;
7348 if (perf_raw_frag_last(frag))
7349 break;
7350 frag = frag->next;
7351 } while (1);
7352
7353 size = round_up(sum + sizeof(u32), sizeof(u64));
7354 raw->size = size - sizeof(u32);
7355 frag->pad = raw->size - sum;
7356 } else {
7357 size = sizeof(u64);
7358 }
7359
7360 header->size += size;
7361 }
7362
7363 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
7364 int size = sizeof(u64);
7365 if (data->br_stack) {
7366 if (perf_sample_save_hw_index(event))
7367 size += sizeof(u64);
7368
7369 size += data->br_stack->nr
7370 * sizeof(struct perf_branch_entry);
7371 }
7372 header->size += size;
7373 }
7374
7375 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
7376 perf_sample_regs_user(&data->regs_user, regs);
7377
7378 if (sample_type & PERF_SAMPLE_REGS_USER) {
7379
7380 int size = sizeof(u64);
7381
7382 if (data->regs_user.regs) {
7383 u64 mask = event->attr.sample_regs_user;
7384 size += hweight64(mask) * sizeof(u64);
7385 }
7386
7387 header->size += size;
7388 }
7389
7390 if (sample_type & PERF_SAMPLE_STACK_USER) {
7391
7392
7393
7394
7395
7396
7397 u16 stack_size = event->attr.sample_stack_user;
7398 u16 size = sizeof(u64);
7399
7400 stack_size = perf_sample_ustack_size(stack_size, header->size,
7401 data->regs_user.regs);
7402
7403
7404
7405
7406
7407
7408 if (stack_size)
7409 size += sizeof(u64) + stack_size;
7410
7411 data->stack_user_size = stack_size;
7412 header->size += size;
7413 }
7414
7415 if (sample_type & PERF_SAMPLE_REGS_INTR) {
7416
7417 int size = sizeof(u64);
7418
7419 perf_sample_regs_intr(&data->regs_intr, regs);
7420
7421 if (data->regs_intr.regs) {
7422 u64 mask = event->attr.sample_regs_intr;
7423
7424 size += hweight64(mask) * sizeof(u64);
7425 }
7426
7427 header->size += size;
7428 }
7429
7430 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
7431 data->phys_addr = perf_virt_to_phys(data->addr);
7432
7433 #ifdef CONFIG_CGROUP_PERF
7434 if (sample_type & PERF_SAMPLE_CGROUP) {
7435 struct cgroup *cgrp;
7436
7437
7438 cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
7439 data->cgroup = cgroup_id(cgrp);
7440 }
7441 #endif
7442
7443
7444
7445
7446
7447
7448 if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
7449 data->data_page_size = perf_get_page_size(data->addr);
7450
7451 if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
7452 data->code_page_size = perf_get_page_size(data->ip);
7453
7454 if (sample_type & PERF_SAMPLE_AUX) {
7455 u64 size;
7456
7457 header->size += sizeof(u64);
7458
7459
7460
7461
7462
7463
7464
7465 size = min_t(size_t, U16_MAX - header->size,
7466 event->attr.aux_sample_size);
7467 size = rounddown(size, 8);
7468 size = perf_prepare_sample_aux(event, data, size);
7469
7470 WARN_ON_ONCE(size + header->size > U16_MAX);
7471 header->size += size;
7472 }
7473
7474
7475
7476
7477
7478
7479
7480
7481 WARN_ON_ONCE(header->size & 7);
7482 }
7483
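/*
 * perf_prepare_sample() above computes the final record size and snapshots
 * the volatile sample data; __perf_event_output() below reserves that many
 * bytes in the ring buffer and has perf_output_sample() copy them out.
 */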
7484 static __always_inline int
7485 __perf_event_output(struct perf_event *event,
7486 struct perf_sample_data *data,
7487 struct pt_regs *regs,
7488 int (*output_begin)(struct perf_output_handle *,
7489 struct perf_sample_data *,
7490 struct perf_event *,
7491 unsigned int))
7492 {
7493 struct perf_output_handle handle;
7494 struct perf_event_header header;
7495 int err;
7496
7497
7498 rcu_read_lock();
7499
7500 perf_prepare_sample(&header, data, event, regs);
7501
7502 err = output_begin(&handle, data, event, header.size);
7503 if (err)
7504 goto exit;
7505
7506 perf_output_sample(&handle, &header, data, event);
7507
7508 perf_output_end(&handle);
7509
7510 exit:
7511 rcu_read_unlock();
7512 return err;
7513 }
7514
7515 void
7516 perf_event_output_forward(struct perf_event *event,
7517 struct perf_sample_data *data,
7518 struct pt_regs *regs)
7519 {
7520 __perf_event_output(event, data, regs, perf_output_begin_forward);
7521 }
7522
7523 void
7524 perf_event_output_backward(struct perf_event *event,
7525 struct perf_sample_data *data,
7526 struct pt_regs *regs)
7527 {
7528 __perf_event_output(event, data, regs, perf_output_begin_backward);
7529 }
7530
7531 int
7532 perf_event_output(struct perf_event *event,
7533 struct perf_sample_data *data,
7534 struct pt_regs *regs)
7535 {
7536 return __perf_event_output(event, data, regs, perf_output_begin);
7537 }
7538
7539
7540
7541
7542
7543 struct perf_read_event {
7544 struct perf_event_header header;
7545
7546 u32 pid;
7547 u32 tid;
7548 };
7549
7550 static void
7551 perf_event_read_event(struct perf_event *event,
7552 struct task_struct *task)
7553 {
7554 struct perf_output_handle handle;
7555 struct perf_sample_data sample;
7556 struct perf_read_event read_event = {
7557 .header = {
7558 .type = PERF_RECORD_READ,
7559 .misc = 0,
7560 .size = sizeof(read_event) + event->read_size,
7561 },
7562 .pid = perf_event_pid(event, task),
7563 .tid = perf_event_tid(event, task),
7564 };
7565 int ret;
7566
7567 perf_event_header__init_id(&read_event.header, &sample, event);
7568 ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
7569 if (ret)
7570 return;
7571
7572 perf_output_put(&handle, read_event);
7573 perf_output_read(&handle, event);
7574 perf_event__output_id_sample(event, &handle, &sample);
7575
7576 perf_output_end(&handle);
7577 }
7578
7579 typedef void (perf_iterate_f)(struct perf_event *event, void *data);
7580
7581 static void
7582 perf_iterate_ctx(struct perf_event_context *ctx,
7583 perf_iterate_f output,
7584 void *data, bool all)
7585 {
7586 struct perf_event *event;
7587
7588 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
7589 if (!all) {
7590 if (event->state < PERF_EVENT_STATE_INACTIVE)
7591 continue;
7592 if (!event_filter_match(event))
7593 continue;
7594 }
7595
7596 output(event, data);
7597 }
7598 }
7599
7600 static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
7601 {
7602 struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
7603 struct perf_event *event;
7604
7605 list_for_each_entry_rcu(event, &pel->list, sb_list) {
7606
7607
7608
7609
7610
7611 if (!smp_load_acquire(&event->ctx))
7612 continue;
7613
7614 if (event->state < PERF_EVENT_STATE_INACTIVE)
7615 continue;
7616 if (!event_filter_match(event))
7617 continue;
7618 output(event, data);
7619 }
7620 }
7621
7622
7623 /*
7624 * Iterate all events that need to receive side-band events.
7625 *
7626 * New callers must make sure account_pmu_sb_event() covers their event.
7627 */
7628 static void
7629 perf_iterate_sb(perf_iterate_f output, void *data,
7630 struct perf_event_context *task_ctx)
7631 {
7632 struct perf_event_context *ctx;
7633 int ctxn;
7634
7635 rcu_read_lock();
7636 preempt_disable();
7637
7638
7639
7640
7641
7642
7643 if (task_ctx) {
7644 perf_iterate_ctx(task_ctx, output, data, false);
7645 goto done;
7646 }
7647
7648 perf_iterate_sb_cpu(output, data);
7649
7650 for_each_task_context_nr(ctxn) {
7651 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7652 if (ctx)
7653 perf_iterate_ctx(ctx, output, data, false);
7654 }
7655 done:
7656 preempt_enable();
7657 rcu_read_unlock();
7658 }
7659
7660
7661
7662
7663
7664 static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
7665 {
7666 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
7667 struct perf_addr_filter *filter;
7668 unsigned int restart = 0, count = 0;
7669 unsigned long flags;
7670
7671 if (!has_addr_filter(event))
7672 return;
7673
7674 raw_spin_lock_irqsave(&ifh->lock, flags);
7675 list_for_each_entry(filter, &ifh->list, entry) {
7676 if (filter->path.dentry) {
7677 event->addr_filter_ranges[count].start = 0;
7678 event->addr_filter_ranges[count].size = 0;
7679 restart++;
7680 }
7681
7682 count++;
7683 }
7684
7685 if (restart)
7686 event->addr_filters_gen++;
7687 raw_spin_unlock_irqrestore(&ifh->lock, flags);
7688
7689 if (restart)
7690 perf_event_stop(event, 1);
7691 }
7692
7693 void perf_event_exec(void)
7694 {
7695 struct perf_event_context *ctx;
7696 int ctxn;
7697
7698 for_each_task_context_nr(ctxn) {
7699 perf_event_enable_on_exec(ctxn);
7700 perf_event_remove_on_exec(ctxn);
7701
7702 rcu_read_lock();
7703 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7704 if (ctx) {
7705 perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
7706 NULL, true);
7707 }
7708 rcu_read_unlock();
7709 }
7710 }
7711
7712 struct remote_output {
7713 struct perf_buffer *rb;
7714 int err;
7715 };
7716
7717 static void __perf_event_output_stop(struct perf_event *event, void *data)
7718 {
7719 struct perf_event *parent = event->parent;
7720 struct remote_output *ro = data;
7721 struct perf_buffer *rb = ro->rb;
7722 struct stop_event_data sd = {
7723 .event = event,
7724 };
7725
7726 if (!has_aux(event))
7727 return;
7728
7729 if (!parent)
7730 parent = event;
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742 if (rcu_dereference(parent->rb) == rb)
7743 ro->err = __perf_event_stop(&sd);
7744 }
7745
7746 static int __perf_pmu_output_stop(void *info)
7747 {
7748 struct perf_event *event = info;
7749 struct pmu *pmu = event->ctx->pmu;
7750 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
7751 struct remote_output ro = {
7752 .rb = event->rb,
7753 };
7754
7755 rcu_read_lock();
7756 perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
7757 if (cpuctx->task_ctx)
7758 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
7759 &ro, false);
7760 rcu_read_unlock();
7761
7762 return ro.err;
7763 }
7764
7765 static void perf_pmu_output_stop(struct perf_event *event)
7766 {
7767 struct perf_event *iter;
7768 int err, cpu;
7769
7770 restart:
7771 rcu_read_lock();
7772 list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
7773
7774
7775
7776
7777
7778
7779 cpu = iter->cpu;
7780 if (cpu == -1)
7781 cpu = READ_ONCE(iter->oncpu);
7782
7783 if (cpu == -1)
7784 continue;
7785
7786 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
7787 if (err == -EAGAIN) {
7788 rcu_read_unlock();
7789 goto restart;
7790 }
7791 }
7792 rcu_read_unlock();
7793 }
7794
7795
7796 /*
7797  * task tracking -- fork/exit
7798  *
7799  * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
7800  */
7801 struct perf_task_event {
7802 struct task_struct *task;
7803 struct perf_event_context *task_ctx;
7804
7805 struct {
7806 struct perf_event_header header;
7807
7808 u32 pid;
7809 u32 ppid;
7810 u32 tid;
7811 u32 ptid;
7812 u64 time;
7813 } event_id;
7814 };
7815
7816 static int perf_event_task_match(struct perf_event *event)
7817 {
7818 return event->attr.comm || event->attr.mmap ||
7819 event->attr.mmap2 || event->attr.mmap_data ||
7820 event->attr.task;
7821 }
7822
7823 static void perf_event_task_output(struct perf_event *event,
7824 void *data)
7825 {
7826 struct perf_task_event *task_event = data;
7827 struct perf_output_handle handle;
7828 struct perf_sample_data sample;
7829 struct task_struct *task = task_event->task;
7830 int ret, size = task_event->event_id.header.size;
7831
7832 if (!perf_event_task_match(event))
7833 return;
7834
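/*
 * perf_event_header__init_id() below grows header.size by this event's
 * sample_id layout; the original size saved above is restored at 'out'
 * so the same event_id can be reused for the next iterated event.
 */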
7835 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
7836
7837 ret = perf_output_begin(&handle, &sample, event,
7838 task_event->event_id.header.size);
7839 if (ret)
7840 goto out;
7841
7842 task_event->event_id.pid = perf_event_pid(event, task);
7843 task_event->event_id.tid = perf_event_tid(event, task);
7844
7845 if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
7846 task_event->event_id.ppid = perf_event_pid(event,
7847 task->real_parent);
7848 task_event->event_id.ptid = perf_event_pid(event,
7849 task->real_parent);
7850 } else {
7851 task_event->event_id.ppid = perf_event_pid(event, current);
7852 task_event->event_id.ptid = perf_event_tid(event, current);
7853 }
7854
7855 task_event->event_id.time = perf_event_clock(event);
7856
7857 perf_output_put(&handle, task_event->event_id);
7858
7859 perf_event__output_id_sample(event, &handle, &sample);
7860
7861 perf_output_end(&handle);
7862 out:
7863 task_event->event_id.header.size = size;
7864 }
7865
7866 static void perf_event_task(struct task_struct *task,
7867 struct perf_event_context *task_ctx,
7868 int new)
7869 {
7870 struct perf_task_event task_event;
7871
7872 if (!atomic_read(&nr_comm_events) &&
7873 !atomic_read(&nr_mmap_events) &&
7874 !atomic_read(&nr_task_events))
7875 return;
7876
7877 task_event = (struct perf_task_event){
7878 .task = task,
7879 .task_ctx = task_ctx,
7880 .event_id = {
7881 .header = {
7882 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
7883 .misc = 0,
7884 .size = sizeof(task_event.event_id),
7885 },
7886 /*
7887  * .pid, .ppid, .tid, .ptid and .time are filled in per matching
7888  * event by perf_event_task_output(), since pid/tid namespaces
7889  * can differ between events.
7890  */
7891 },
7892 };
7893
7894 perf_iterate_sb(perf_event_task_output,
7895 &task_event,
7896 task_ctx);
7897 }
7898
7899 void perf_event_fork(struct task_struct *task)
7900 {
7901 perf_event_task(task, NULL, 1);
7902 perf_event_namespaces(task);
7903 }
7904
7905
7906 /*
7907  * comm tracking
7908  */
7909 struct perf_comm_event {
7910 struct task_struct *task;
7911 char *comm;
7912 int comm_size;
7913
7914 struct {
7915 struct perf_event_header header;
7916
7917 u32 pid;
7918 u32 tid;
7919 } event_id;
7920 };
7921
7922 static int perf_event_comm_match(struct perf_event *event)
7923 {
7924 return event->attr.comm;
7925 }
7926
7927 static void perf_event_comm_output(struct perf_event *event,
7928 void *data)
7929 {
7930 struct perf_comm_event *comm_event = data;
7931 struct perf_output_handle handle;
7932 struct perf_sample_data sample;
7933 int size = comm_event->event_id.header.size;
7934 int ret;
7935
7936 if (!perf_event_comm_match(event))
7937 return;
7938
7939 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
7940 ret = perf_output_begin(&handle, &sample, event,
7941 comm_event->event_id.header.size);
7942
7943 if (ret)
7944 goto out;
7945
7946 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
7947 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
7948
7949 perf_output_put(&handle, comm_event->event_id);
7950 __output_copy(&handle, comm_event->comm,
7951 comm_event->comm_size);
7952
7953 perf_event__output_id_sample(event, &handle, &sample);
7954
7955 perf_output_end(&handle);
7956 out:
7957 comm_event->event_id.header.size = size;
7958 }
7959
7960 static void perf_event_comm_event(struct perf_comm_event *comm_event)
7961 {
7962 char comm[TASK_COMM_LEN];
7963 unsigned int size;
7964
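/*
 * Snapshot the task comm and pad its length to a multiple of
 * sizeof(u64); the ring buffer is written in 8-byte units.
 */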
7965 memset(comm, 0, sizeof(comm));
7966 strlcpy(comm, comm_event->task->comm, sizeof(comm));
7967 size = ALIGN(strlen(comm)+1, sizeof(u64));
7968
7969 comm_event->comm = comm;
7970 comm_event->comm_size = size;
7971
7972 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
7973
7974 perf_iterate_sb(perf_event_comm_output,
7975 comm_event,
7976 NULL);
7977 }
7978
7979 void perf_event_comm(struct task_struct *task, bool exec)
7980 {
7981 struct perf_comm_event comm_event;
7982
7983 if (!atomic_read(&nr_comm_events))
7984 return;
7985
7986 comm_event = (struct perf_comm_event){
7987 .task = task,
7988
7989
7990 .event_id = {
7991 .header = {
7992 .type = PERF_RECORD_COMM,
7993 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
7994
7995 },
7996
7997
7998 },
7999 };
8000
8001 perf_event_comm_event(&comm_event);
8002 }
8003
8004
8005 /*
8006  * namespaces tracking
8007  */
8008 struct perf_namespaces_event {
8009 struct task_struct *task;
8010
8011 struct {
8012 struct perf_event_header header;
8013
8014 u32 pid;
8015 u32 tid;
8016 u64 nr_namespaces;
8017 struct perf_ns_link_info link_info[NR_NAMESPACES];
8018 } event_id;
8019 };
8020
8021 static int perf_event_namespaces_match(struct perf_event *event)
8022 {
8023 return event->attr.namespaces;
8024 }
8025
8026 static void perf_event_namespaces_output(struct perf_event *event,
8027 void *data)
8028 {
8029 struct perf_namespaces_event *namespaces_event = data;
8030 struct perf_output_handle handle;
8031 struct perf_sample_data sample;
8032 u16 header_size = namespaces_event->event_id.header.size;
8033 int ret;
8034
8035 if (!perf_event_namespaces_match(event))
8036 return;
8037
8038 perf_event_header__init_id(&namespaces_event->event_id.header,
8039 &sample, event);
8040 ret = perf_output_begin(&handle, &sample, event,
8041 namespaces_event->event_id.header.size);
8042 if (ret)
8043 goto out;
8044
8045 namespaces_event->event_id.pid = perf_event_pid(event,
8046 namespaces_event->task);
8047 namespaces_event->event_id.tid = perf_event_tid(event,
8048 namespaces_event->task);
8049
8050 perf_output_put(&handle, namespaces_event->event_id);
8051
8052 perf_event__output_id_sample(event, &handle, &sample);
8053
8054 perf_output_end(&handle);
8055 out:
8056 namespaces_event->event_id.header.size = header_size;
8057 }
8058
8059 static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
8060 struct task_struct *task,
8061 const struct proc_ns_operations *ns_ops)
8062 {
8063 struct path ns_path;
8064 struct inode *ns_inode;
8065 int error;
8066
8067 error = ns_get_path(&ns_path, task, ns_ops);
8068 if (!error) {
8069 ns_inode = ns_path.dentry->d_inode;
8070 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
8071 ns_link_info->ino = ns_inode->i_ino;
8072 path_put(&ns_path);
8073 }
8074 }
8075
8076 void perf_event_namespaces(struct task_struct *task)
8077 {
8078 struct perf_namespaces_event namespaces_event;
8079 struct perf_ns_link_info *ns_link_info;
8080
8081 if (!atomic_read(&nr_namespaces_events))
8082 return;
8083
8084 namespaces_event = (struct perf_namespaces_event){
8085 .task = task,
8086 .event_id = {
8087 .header = {
8088 .type = PERF_RECORD_NAMESPACES,
8089 .misc = 0,
8090 .size = sizeof(namespaces_event.event_id),
8091 },
8092
8093
8094 .nr_namespaces = NR_NAMESPACES,
8095
8096 },
8097 };
8098
8099 ns_link_info = namespaces_event.event_id.link_info;
8100
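/*
 * Fill one (dev, inode) pair per namespace type the kernel is built
 * with; entries for unconfigured namespace types stay zeroed, while
 * nr_namespaces above always advertises NR_NAMESPACES slots.
 */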
8101 perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
8102 task, &mntns_operations);
8103
8104 #ifdef CONFIG_USER_NS
8105 perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
8106 task, &userns_operations);
8107 #endif
8108 #ifdef CONFIG_NET_NS
8109 perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
8110 task, &netns_operations);
8111 #endif
8112 #ifdef CONFIG_UTS_NS
8113 perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
8114 task, &utsns_operations);
8115 #endif
8116 #ifdef CONFIG_IPC_NS
8117 perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
8118 task, &ipcns_operations);
8119 #endif
8120 #ifdef CONFIG_PID_NS
8121 perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
8122 task, &pidns_operations);
8123 #endif
8124 #ifdef CONFIG_CGROUPS
8125 perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
8126 task, &cgroupns_operations);
8127 #endif
8128
8129 perf_iterate_sb(perf_event_namespaces_output,
8130 &namespaces_event,
8131 NULL);
8132 }
8133
8134 /*
8135  * cgroup tracking
8136  */
8137 #ifdef CONFIG_CGROUP_PERF
8138
8139 struct perf_cgroup_event {
8140 char *path;
8141 int path_size;
8142 struct {
8143 struct perf_event_header header;
8144 u64 id;
8145 char path[];
8146 } event_id;
8147 };
8148
8149 static int perf_event_cgroup_match(struct perf_event *event)
8150 {
8151 return event->attr.cgroup;
8152 }
8153
8154 static void perf_event_cgroup_output(struct perf_event *event, void *data)
8155 {
8156 struct perf_cgroup_event *cgroup_event = data;
8157 struct perf_output_handle handle;
8158 struct perf_sample_data sample;
8159 u16 header_size = cgroup_event->event_id.header.size;
8160 int ret;
8161
8162 if (!perf_event_cgroup_match(event))
8163 return;
8164
8165 perf_event_header__init_id(&cgroup_event->event_id.header,
8166 &sample, event);
8167 ret = perf_output_begin(&handle, &sample, event,
8168 cgroup_event->event_id.header.size);
8169 if (ret)
8170 goto out;
8171
8172 perf_output_put(&handle, cgroup_event->event_id);
8173 __output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
8174
8175 perf_event__output_id_sample(event, &handle, &sample);
8176
8177 perf_output_end(&handle);
8178 out:
8179 cgroup_event->event_id.header.size = header_size;
8180 }
8181
8182 static void perf_event_cgroup(struct cgroup *cgrp)
8183 {
8184 struct perf_cgroup_event cgroup_event;
8185 char path_enomem[16] = "//enomem";
8186 char *pathname;
8187 size_t size;
8188
8189 if (!atomic_read(&nr_cgroup_events))
8190 return;
8191
8192 cgroup_event = (struct perf_cgroup_event){
8193 .event_id = {
8194 .header = {
8195 .type = PERF_RECORD_CGROUP,
8196 .misc = 0,
8197 .size = sizeof(cgroup_event.event_id),
8198 },
8199 .id = cgroup_id(cgrp),
8200 },
8201 };
8202
8203 pathname = kmalloc(PATH_MAX, GFP_KERNEL);
8204 if (pathname == NULL) {
8205 cgroup_event.path = path_enomem;
8206 } else {
8207 /* leave room for the u64 alignment padding added below */
8208 cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
8209 cgroup_event.path = pathname;
8210 }
8211
8212 /*
8213  * Since our buffer works in 8 byte units we need to align our string
8214  * size to a multiple of 8. However, we must guarantee the tail end is
8215  * zero'd out to avoid leaking random bits to userspace.
8216  */
8217 size = strlen(cgroup_event.path) + 1;
8218 while (!IS_ALIGNED(size, sizeof(u64)))
8219 cgroup_event.path[size++] = '\0';
8220
8221 cgroup_event.event_id.header.size += size;
8222 cgroup_event.path_size = size;
8223
8224 perf_iterate_sb(perf_event_cgroup_output,
8225 &cgroup_event,
8226 NULL);
8227
8228 kfree(pathname);
8229 }
8230
8231 #endif
8232
8233
8234 /*
8235  * mmap tracking
8236  */
8237 struct perf_mmap_event {
8238 struct vm_area_struct *vma;
8239
8240 const char *file_name;
8241 int file_size;
8242 int maj, min;
8243 u64 ino;
8244 u64 ino_generation;
8245 u32 prot, flags;
8246 u8 build_id[BUILD_ID_SIZE_MAX];
8247 u32 build_id_size;
8248
8249 struct {
8250 struct perf_event_header header;
8251
8252 u32 pid;
8253 u32 tid;
8254 u64 start;
8255 u64 len;
8256 u64 pgoff;
8257 } event_id;
8258 };
8259
8260 static int perf_event_mmap_match(struct perf_event *event,
8261 void *data)
8262 {
8263 struct perf_mmap_event *mmap_event = data;
8264 struct vm_area_struct *vma = mmap_event->vma;
8265 int executable = vma->vm_flags & VM_EXEC;
8266
8267 return (!executable && event->attr.mmap_data) ||
8268 (executable && (event->attr.mmap || event->attr.mmap2));
8269 }
8270
8271 static void perf_event_mmap_output(struct perf_event *event,
8272 void *data)
8273 {
8274 struct perf_mmap_event *mmap_event = data;
8275 struct perf_output_handle handle;
8276 struct perf_sample_data sample;
8277 int size = mmap_event->event_id.header.size;
8278 u32 type = mmap_event->event_id.header.type;
8279 bool use_build_id;
8280 int ret;
8281
8282 if (!perf_event_mmap_match(event, data))
8283 return;
8284
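/*
 * PERF_RECORD_MMAP2 carries maj/min/ino/ino_generation/prot/flags
 * (or a build ID) in addition to the basic MMAP fields, so grow the
 * header size accordingly before emitting the record.
 */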
8285 if (event->attr.mmap2) {
8286 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
8287 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
8288 mmap_event->event_id.header.size += sizeof(mmap_event->min);
8289 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
8290 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
8291 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
8292 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
8293 }
8294
8295 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
8296 ret = perf_output_begin(&handle, &sample, event,
8297 mmap_event->event_id.header.size);
8298 if (ret)
8299 goto out;
8300
8301 mmap_event->event_id.pid = perf_event_pid(event, current);
8302 mmap_event->event_id.tid = perf_event_tid(event, current);
8303
8304 use_build_id = event->attr.build_id && mmap_event->build_id_size;
8305
8306 if (event->attr.mmap2 && use_build_id)
8307 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;
8308
8309 perf_output_put(&handle, mmap_event->event_id);
8310
8311 if (event->attr.mmap2) {
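/*
 * A build ID is emitted as a 4-byte length (only the first byte is
 * used) plus up to BUILD_ID_SIZE_MAX (20) bytes of ID, i.e. the same
 * 24 bytes otherwise occupied by maj/min/ino/ino_generation.
 */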
8312 if (use_build_id) {
8313 u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };
8314
8315 __output_copy(&handle, size, 4);
8316 __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
8317 } else {
8318 perf_output_put(&handle, mmap_event->maj);
8319 perf_output_put(&handle, mmap_event->min);
8320 perf_output_put(&handle, mmap_event->ino);
8321 perf_output_put(&handle, mmap_event->ino_generation);
8322 }
8323 perf_output_put(&handle, mmap_event->prot);
8324 perf_output_put(&handle, mmap_event->flags);
8325 }
8326
8327 __output_copy(&handle, mmap_event->file_name,
8328 mmap_event->file_size);
8329
8330 perf_event__output_id_sample(event, &handle, &sample);
8331
8332 perf_output_end(&handle);
8333 out:
8334 mmap_event->event_id.header.size = size;
8335 mmap_event->event_id.header.type = type;
8336 }
8337
8338 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
8339 {
8340 struct vm_area_struct *vma = mmap_event->vma;
8341 struct file *file = vma->vm_file;
8342 int maj = 0, min = 0;
8343 u64 ino = 0, gen = 0;
8344 u32 prot = 0, flags = 0;
8345 unsigned int size;
8346 char tmp[16];
8347 char *buf = NULL;
8348 char *name;
8349
8350 if (vma->vm_flags & VM_READ)
8351 prot |= PROT_READ;
8352 if (vma->vm_flags & VM_WRITE)
8353 prot |= PROT_WRITE;
8354 if (vma->vm_flags & VM_EXEC)
8355 prot |= PROT_EXEC;
8356
8357 if (vma->vm_flags & VM_MAYSHARE)
8358 flags = MAP_SHARED;
8359 else
8360 flags = MAP_PRIVATE;
8361
8362 if (vma->vm_flags & VM_LOCKED)
8363 flags |= MAP_LOCKED;
8364 if (is_vm_hugetlb_page(vma))
8365 flags |= MAP_HUGETLB;
8366
8367 if (file) {
8368 struct inode *inode;
8369 dev_t dev;
8370
8371 buf = kmalloc(PATH_MAX, GFP_KERNEL);
8372 if (!buf) {
8373 name = "//enomem";
8374 goto cpy_name;
8375 }
8376
8377 /*
8378  * file_path() works from the end of the buffer backwards, so leave
8379  * room for the zero bytes needed by the u64 alignment done later.
8380  */
8381 name = file_path(file, buf, PATH_MAX - sizeof(u64));
8382 if (IS_ERR(name)) {
8383 name = "//toolong";
8384 goto cpy_name;
8385 }
8386 inode = file_inode(vma->vm_file);
8387 dev = inode->i_sb->s_dev;
8388 ino = inode->i_ino;
8389 gen = inode->i_generation;
8390 maj = MAJOR(dev);
8391 min = MINOR(dev);
8392
8393 goto got_name;
8394 } else {
8395 if (vma->vm_ops && vma->vm_ops->name) {
8396 name = (char *) vma->vm_ops->name(vma);
8397 if (name)
8398 goto cpy_name;
8399 }
8400
8401 name = (char *)arch_vma_name(vma);
8402 if (name)
8403 goto cpy_name;
8404
8405 if (vma->vm_start <= vma->vm_mm->start_brk &&
8406 vma->vm_end >= vma->vm_mm->brk) {
8407 name = "[heap]";
8408 goto cpy_name;
8409 }
8410 if (vma->vm_start <= vma->vm_mm->start_stack &&
8411 vma->vm_end >= vma->vm_mm->start_stack) {
8412 name = "[stack]";
8413 goto cpy_name;
8414 }
8415
8416 name = "//anon";
8417 goto cpy_name;
8418 }
8419
8420 cpy_name:
8421 strlcpy(tmp, name, sizeof(tmp));
8422 name = tmp;
8423 got_name:
8424 /*
8425  * Since our buffer works in 8 byte units we need to align our string
8426  * size to a multiple of 8. However, we must guarantee the tail end is
8427  * zero'd out to avoid leaking random bits to userspace.
8428  */
8429 size = strlen(name)+1;
8430 while (!IS_ALIGNED(size, sizeof(u64)))
8431 name[size++] = '\0';
8432
8433 mmap_event->file_name = name;
8434 mmap_event->file_size = size;
8435 mmap_event->maj = maj;
8436 mmap_event->min = min;
8437 mmap_event->ino = ino;
8438 mmap_event->ino_generation = gen;
8439 mmap_event->prot = prot;
8440 mmap_event->flags = flags;
8441
8442 if (!(vma->vm_flags & VM_EXEC))
8443 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
8444
8445 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
8446
8447 if (atomic_read(&nr_build_id_events))
8448 build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
8449
8450 perf_iterate_sb(perf_event_mmap_output,
8451 mmap_event,
8452 NULL);
8453
8454 kfree(buf);
8455 }
8456
8457 /*
8458  * Check whether inode and address range match filter criteria.
8459  */
8460 static bool perf_addr_filter_match(struct perf_addr_filter *filter,
8461 struct file *file, unsigned long offset,
8462 unsigned long size)
8463 {
8464 /* d_inode(NULL) won't be equal to any mapped user-space file */
8465 if (!filter->path.dentry)
8466 return false;
8467
8468 if (d_inode(filter->path.dentry) != file_inode(file))
8469 return false;
8470
8471 if (filter->offset > offset + size)
8472 return false;
8473
8474 if (filter->offset + filter->size < offset)
8475 return false;
8476
8477 return true;
8478 }
8479
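/*
 * Map a file-based filter (an offset/size range within an object) onto
 * the virtual address range this vma exposes. For example, a filter at
 * file offset 0x3000 of size 0x1000, with the object mapped at
 * vm_start == 0x400000 and vm_pgoff == 0, yields [0x403000, 0x404000).
 */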
8480 static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
8481 struct vm_area_struct *vma,
8482 struct perf_addr_filter_range *fr)
8483 {
8484 unsigned long vma_size = vma->vm_end - vma->vm_start;
8485 unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
8486 struct file *file = vma->vm_file;
8487
8488 if (!perf_addr_filter_match(filter, file, off, vma_size))
8489 return false;
8490
8491 if (filter->offset < off) {
8492 fr->start = vma->vm_start;
8493 fr->size = min(vma_size, filter->size - (off - filter->offset));
8494 } else {
8495 fr->start = vma->vm_start + filter->offset - off;
8496 fr->size = min(vma->vm_end - fr->start, filter->size);
8497 }
8498
8499 return true;
8500 }
8501
8502 static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
8503 {
8504 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8505 struct vm_area_struct *vma = data;
8506 struct perf_addr_filter *filter;
8507 unsigned int restart = 0, count = 0;
8508 unsigned long flags;
8509
8510 if (!has_addr_filter(event))
8511 return;
8512
8513 if (!vma->vm_file)
8514 return;
8515
8516 raw_spin_lock_irqsave(&ifh->lock, flags);
8517 list_for_each_entry(filter, &ifh->list, entry) {
8518 if (perf_addr_filter_vma_adjust(filter, vma,
8519 &event->addr_filter_ranges[count]))
8520 restart++;
8521
8522 count++;
8523 }
8524
8525 if (restart)
8526 event->addr_filters_gen++;
8527 raw_spin_unlock_irqrestore(&ifh->lock, flags);
8528
8529 if (restart)
8530 perf_event_stop(event, 1);
8531 }
8532
8533 /*
8534  * Adjust all task's events' filters to the new vma
8535  */
8536 static void perf_addr_filters_adjust(struct vm_area_struct *vma)
8537 {
8538 struct perf_event_context *ctx;
8539 int ctxn;
8540
8541 /*
8542  * Data tracing isn't supported yet and as such there is no need
8543  * to keep track of anything that isn't related to executable code:
8544  */
8545 if (!(vma->vm_flags & VM_EXEC))
8546 return;
8547
8548 rcu_read_lock();
8549 for_each_task_context_nr(ctxn) {
8550 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
8551 if (!ctx)
8552 continue;
8553
8554 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
8555 }
8556 rcu_read_unlock();
8557 }
8558
8559 void perf_event_mmap(struct vm_area_struct *vma)
8560 {
8561 struct perf_mmap_event mmap_event;
8562
8563 if (!atomic_read(&nr_mmap_events))
8564 return;
8565
8566 mmap_event = (struct perf_mmap_event){
8567 .vma = vma,
8568
8569
8570 .event_id = {
8571 .header = {
8572 .type = PERF_RECORD_MMAP,
8573 .misc = PERF_RECORD_MISC_USER,
8574
8575 },
8576
8577
8578 .start = vma->vm_start,
8579 .len = vma->vm_end - vma->vm_start,
8580 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
8581 },
8582
8583
8584
8585
8586
8587
8588 };
8589
8590 perf_addr_filters_adjust(vma);
8591 perf_event_mmap_event(&mmap_event);
8592 }
8593
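/*
 * Emit a PERF_RECORD_AUX record announcing that [head, head + size) of
 * the AUX area contains new data, together with flags such as
 * truncation or overwrite mode.
 */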
8594 void perf_event_aux_event(struct perf_event *event, unsigned long head,
8595 unsigned long size, u64 flags)
8596 {
8597 struct perf_output_handle handle;
8598 struct perf_sample_data sample;
8599 struct perf_aux_event {
8600 struct perf_event_header header;
8601 u64 offset;
8602 u64 size;
8603 u64 flags;
8604 } rec = {
8605 .header = {
8606 .type = PERF_RECORD_AUX,
8607 .misc = 0,
8608 .size = sizeof(rec),
8609 },
8610 .offset = head,
8611 .size = size,
8612 .flags = flags,
8613 };
8614 int ret;
8615
8616 perf_event_header__init_id(&rec.header, &sample, event);
8617 ret = perf_output_begin(&handle, &sample, event, rec.header.size);
8618
8619 if (ret)
8620 return;
8621
8622 perf_output_put(&handle, rec);
8623 perf_event__output_id_sample(event, &handle, &sample);
8624
8625 perf_output_end(&handle);
8626 }
8627
8628 /*
8629  * Lost/dropped samples logging
8630  */
8631 void perf_log_lost_samples(struct perf_event *event, u64 lost)
8632 {
8633 struct perf_output_handle handle;
8634 struct perf_sample_data sample;
8635 int ret;
8636
8637 struct {
8638 struct perf_event_header header;
8639 u64 lost;
8640 } lost_samples_event = {
8641 .header = {
8642 .type = PERF_RECORD_LOST_SAMPLES,
8643 .misc = 0,
8644 .size = sizeof(lost_samples_event),
8645 },
8646 .lost = lost,
8647 };
8648
8649 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
8650
8651 ret = perf_output_begin(&handle, &sample, event,
8652 lost_samples_event.header.size);
8653 if (ret)
8654 return;
8655
8656 perf_output_put(&handle, lost_samples_event);
8657 perf_event__output_id_sample(event, &handle, &sample);
8658 perf_output_end(&handle);
8659 }
8660
8661
8662 /*
8663  * context_switch tracking
8664  */
8665 struct perf_switch_event {
8666 struct task_struct *task;
8667 struct task_struct *next_prev;
8668
8669 struct {
8670 struct perf_event_header header;
8671 u32 next_prev_pid;
8672 u32 next_prev_tid;
8673 } event_id;
8674 };
8675
8676 static int perf_event_switch_match(struct perf_event *event)
8677 {
8678 return event->attr.context_switch;
8679 }
8680
8681 static void perf_event_switch_output(struct perf_event *event, void *data)
8682 {
8683 struct perf_switch_event *se = data;
8684 struct perf_output_handle handle;
8685 struct perf_sample_data sample;
8686 int ret;
8687
8688 if (!perf_event_switch_match(event))
8689 return;
8690
8691 /* Only CPU-wide events are allowed to see next/prev pid/tid */
8692 if (event->ctx->task) {
8693 se->event_id.header.type = PERF_RECORD_SWITCH;
8694 se->event_id.header.size = sizeof(se->event_id.header);
8695 } else {
8696 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
8697 se->event_id.header.size = sizeof(se->event_id);
8698 se->event_id.next_prev_pid =
8699 perf_event_pid(event, se->next_prev);
8700 se->event_id.next_prev_tid =
8701 perf_event_tid(event, se->next_prev);
8702 }
8703
8704 perf_event_header__init_id(&se->event_id.header, &sample, event);
8705
8706 ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
8707 if (ret)
8708 return;
8709
8710 if (event->ctx->task)
8711 perf_output_put(&handle, se->event_id.header);
8712 else
8713 perf_output_put(&handle, se->event_id);
8714
8715 perf_event__output_id_sample(event, &handle, &sample);
8716
8717 perf_output_end(&handle);
8718 }
8719
8720 static void perf_event_switch(struct task_struct *task,
8721 struct task_struct *next_prev, bool sched_in)
8722 {
8723 struct perf_switch_event switch_event;
8724
8725
8726 /* N.B. caller checks nr_switch_events != 0 */
8727 switch_event = (struct perf_switch_event){
8728 .task = task,
8729 .next_prev = next_prev,
8730 .event_id = {
8731 .header = {
8732
8733 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
8734
8735 },
8736
8737
8738 },
8739 };
8740
8741 if (!sched_in && task->on_rq) {
8742 switch_event.event_id.header.misc |=
8743 PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
8744 }
8745
8746 perf_iterate_sb(perf_event_switch_output, &switch_event, NULL);
8747 }
8748
8749
8750 /*
8751  * IRQ throttle logging
8752  */
8753 static void perf_log_throttle(struct perf_event *event, int enable)
8754 {
8755 struct perf_output_handle handle;
8756 struct perf_sample_data sample;
8757 int ret;
8758
8759 struct {
8760 struct perf_event_header header;
8761 u64 time;
8762 u64 id;
8763 u64 stream_id;
8764 } throttle_event = {
8765 .header = {
8766 .type = PERF_RECORD_THROTTLE,
8767 .misc = 0,
8768 .size = sizeof(throttle_event),
8769 },
8770 .time = perf_event_clock(event),
8771 .id = primary_event_id(event),
8772 .stream_id = event->id,
8773 };
8774
8775 if (enable)
8776 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
8777
8778 perf_event_header__init_id(&throttle_event.header, &sample, event);
8779
8780 ret = perf_output_begin(&handle, &sample, event,
8781 throttle_event.header.size);
8782 if (ret)
8783 return;
8784
8785 perf_output_put(&handle, throttle_event);
8786 perf_event__output_id_sample(event, &handle, &sample);
8787 perf_output_end(&handle);
8788 }
8789
8790
8791 /*
8792  * ksymbol register/unregister tracking
8793  */
8794 struct perf_ksymbol_event {
8795 const char *name;
8796 int name_len;
8797 struct {
8798 struct perf_event_header header;
8799 u64 addr;
8800 u32 len;
8801 u16 ksym_type;
8802 u16 flags;
8803 } event_id;
8804 };
8805
8806 static int perf_event_ksymbol_match(struct perf_event *event)
8807 {
8808 return event->attr.ksymbol;
8809 }
8810
8811 static void perf_event_ksymbol_output(struct perf_event *event, void *data)
8812 {
8813 struct perf_ksymbol_event *ksymbol_event = data;
8814 struct perf_output_handle handle;
8815 struct perf_sample_data sample;
8816 int ret;
8817
8818 if (!perf_event_ksymbol_match(event))
8819 return;
8820
8821 perf_event_header__init_id(&ksymbol_event->event_id.header,
8822 &sample, event);
8823 ret = perf_output_begin(&handle, &sample, event,
8824 ksymbol_event->event_id.header.size);
8825 if (ret)
8826 return;
8827
8828 perf_output_put(&handle, ksymbol_event->event_id);
8829 __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
8830 perf_event__output_id_sample(event, &handle, &sample);
8831
8832 perf_output_end(&handle);
8833 }
8834
8835 void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
8836 const char *sym)
8837 {
8838 struct perf_ksymbol_event ksymbol_event;
8839 char name[KSYM_NAME_LEN];
8840 u16 flags = 0;
8841 int name_len;
8842
8843 if (!atomic_read(&nr_ksymbol_events))
8844 return;
8845
8846 if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
8847 ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
8848 goto err;
8849
8850 strlcpy(name, sym, KSYM_NAME_LEN);
8851 name_len = strlen(name) + 1;
8852 while (!IS_ALIGNED(name_len, sizeof(u64)))
8853 name[name_len++] = '\0';
8854 BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
8855
8856 if (unregister)
8857 flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
8858
8859 ksymbol_event = (struct perf_ksymbol_event){
8860 .name = name,
8861 .name_len = name_len,
8862 .event_id = {
8863 .header = {
8864 .type = PERF_RECORD_KSYMBOL,
8865 .size = sizeof(ksymbol_event.event_id) +
8866 name_len,
8867 },
8868 .addr = addr,
8869 .len = len,
8870 .ksym_type = ksym_type,
8871 .flags = flags,
8872 },
8873 };
8874
8875 perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
8876 return;
8877 err:
8878 WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
8879 }
8880
8881
8882 /*
8883  * bpf program load/unload tracking
8884  */
8885 struct perf_bpf_event {
8886 struct bpf_prog *prog;
8887 struct {
8888 struct perf_event_header header;
8889 u16 type;
8890 u16 flags;
8891 u32 id;
8892 u8 tag[BPF_TAG_SIZE];
8893 } event_id;
8894 };
8895
8896 static int perf_event_bpf_match(struct perf_event *event)
8897 {
8898 return event->attr.bpf_event;
8899 }
8900
8901 static void perf_event_bpf_output(struct perf_event *event, void *data)
8902 {
8903 struct perf_bpf_event *bpf_event = data;
8904 struct perf_output_handle handle;
8905 struct perf_sample_data sample;
8906 int ret;
8907
8908 if (!perf_event_bpf_match(event))
8909 return;
8910
8911 perf_event_header__init_id(&bpf_event->event_id.header,
8912 &sample, event);
8913 ret = perf_output_begin(&handle, &sample, event,
8914 bpf_event->event_id.header.size);
8915 if (ret)
8916 return;
8917
8918 perf_output_put(&handle, bpf_event->event_id);
8919 perf_event__output_id_sample(event, &handle, &sample);
8920
8921 perf_output_end(&handle);
8922 }
8923
8924 static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
8925 enum perf_bpf_event_type type)
8926 {
8927 bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
8928 int i;
8929
8930 if (prog->aux->func_cnt == 0) {
8931 perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
8932 (u64)(unsigned long)prog->bpf_func,
8933 prog->jited_len, unregister,
8934 prog->aux->ksym.name);
8935 } else {
8936 for (i = 0; i < prog->aux->func_cnt; i++) {
8937 struct bpf_prog *subprog = prog->aux->func[i];
8938
8939 perf_event_ksymbol(
8940 PERF_RECORD_KSYMBOL_TYPE_BPF,
8941 (u64)(unsigned long)subprog->bpf_func,
8942 subprog->jited_len, unregister,
8943 prog->aux->ksym.name);
8944 }
8945 }
8946 }
8947
8948 void perf_event_bpf_event(struct bpf_prog *prog,
8949 enum perf_bpf_event_type type,
8950 u16 flags)
8951 {
8952 struct perf_bpf_event bpf_event;
8953
8954 if (type <= PERF_BPF_EVENT_UNKNOWN ||
8955 type >= PERF_BPF_EVENT_MAX)
8956 return;
8957
8958 switch (type) {
8959 case PERF_BPF_EVENT_PROG_LOAD:
8960 case PERF_BPF_EVENT_PROG_UNLOAD:
8961 if (atomic_read(&nr_ksymbol_events))
8962 perf_event_bpf_emit_ksymbols(prog, type);
8963 break;
8964 default:
8965 break;
8966 }
8967
8968 if (!atomic_read(&nr_bpf_events))
8969 return;
8970
8971 bpf_event = (struct perf_bpf_event){
8972 .prog = prog,
8973 .event_id = {
8974 .header = {
8975 .type = PERF_RECORD_BPF_EVENT,
8976 .size = sizeof(bpf_event.event_id),
8977 },
8978 .type = type,
8979 .flags = flags,
8980 .id = prog->aux->id,
8981 },
8982 };
8983
8984 BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
8985
8986 memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
8987 perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
8988 }
8989
8990 struct perf_text_poke_event {
8991 const void *old_bytes;
8992 const void *new_bytes;
8993 size_t pad;
8994 u16 old_len;
8995 u16 new_len;
8996
8997 struct {
8998 struct perf_event_header header;
8999
9000 u64 addr;
9001 } event_id;
9002 };
9003
9004 static int perf_event_text_poke_match(struct perf_event *event)
9005 {
9006 return event->attr.text_poke;
9007 }
9008
9009 static void perf_event_text_poke_output(struct perf_event *event, void *data)
9010 {
9011 struct perf_text_poke_event *text_poke_event = data;
9012 struct perf_output_handle handle;
9013 struct perf_sample_data sample;
9014 u64 padding = 0;
9015 int ret;
9016
9017 if (!perf_event_text_poke_match(event))
9018 return;
9019
9020 perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
9021
9022 ret = perf_output_begin(&handle, &sample, event,
9023 text_poke_event->event_id.header.size);
9024 if (ret)
9025 return;
9026
9027 perf_output_put(&handle, text_poke_event->event_id);
9028 perf_output_put(&handle, text_poke_event->old_len);
9029 perf_output_put(&handle, text_poke_event->new_len);
9030
9031 __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
9032 __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
9033
9034 if (text_poke_event->pad)
9035 __output_copy(&handle, &padding, text_poke_event->pad);
9036
9037 perf_event__output_id_sample(event, &handle, &sample);
9038
9039 perf_output_end(&handle);
9040 }
9041
9042 void perf_event_text_poke(const void *addr, const void *old_bytes,
9043 size_t old_len, const void *new_bytes, size_t new_len)
9044 {
9045 struct perf_text_poke_event text_poke_event;
9046 size_t tot, pad;
9047
9048 if (!atomic_read(&nr_text_poke_events))
9049 return;
9050
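/*
 * Variable part of PERF_RECORD_TEXT_POKE: old_len, new_len, the old
 * bytes, the new bytes, then zero padding up to the next u64 boundary
 * (see perf_event_text_poke_output()).
 */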
9051 tot = sizeof(text_poke_event.old_len) + old_len;
9052 tot += sizeof(text_poke_event.new_len) + new_len;
9053 pad = ALIGN(tot, sizeof(u64)) - tot;
9054
9055 text_poke_event = (struct perf_text_poke_event){
9056 .old_bytes = old_bytes,
9057 .new_bytes = new_bytes,
9058 .pad = pad,
9059 .old_len = old_len,
9060 .new_len = new_len,
9061 .event_id = {
9062 .header = {
9063 .type = PERF_RECORD_TEXT_POKE,
9064 .misc = PERF_RECORD_MISC_KERNEL,
9065 .size = sizeof(text_poke_event.event_id) + tot + pad,
9066 },
9067 .addr = (unsigned long)addr,
9068 },
9069 };
9070
9071 perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
9072 }
9073
9074 void perf_event_itrace_started(struct perf_event *event)
9075 {
9076 event->attach_state |= PERF_ATTACH_ITRACE;
9077 }
9078
9079 static void perf_log_itrace_start(struct perf_event *event)
9080 {
9081 struct perf_output_handle handle;
9082 struct perf_sample_data sample;
9083 struct perf_aux_event {
9084 struct perf_event_header header;
9085 u32 pid;
9086 u32 tid;
9087 } rec;
9088 int ret;
9089
9090 if (event->parent)
9091 event = event->parent;
9092
9093 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
9094 event->attach_state & PERF_ATTACH_ITRACE)
9095 return;
9096
9097 rec.header.type = PERF_RECORD_ITRACE_START;
9098 rec.header.misc = 0;
9099 rec.header.size = sizeof(rec);
9100 rec.pid = perf_event_pid(event, current);
9101 rec.tid = perf_event_tid(event, current);
9102
9103 perf_event_header__init_id(&rec.header, &sample, event);
9104 ret = perf_output_begin(&handle, &sample, event, rec.header.size);
9105
9106 if (ret)
9107 return;
9108
9109 perf_output_put(&handle, rec);
9110 perf_event__output_id_sample(event, &handle, &sample);
9111
9112 perf_output_end(&handle);
9113 }
9114
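/*
 * Emit PERF_RECORD_AUX_OUTPUT_HW_ID, associating a PMU-assigned
 * hardware trace identifier with this event so that decoders can match
 * AUX data back to the event that produced it.
 */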
9115 void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
9116 {
9117 struct perf_output_handle handle;
9118 struct perf_sample_data sample;
9119 struct perf_aux_event {
9120 struct perf_event_header header;
9121 u64 hw_id;
9122 } rec;
9123 int ret;
9124
9125 if (event->parent)
9126 event = event->parent;
9127
9128 rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID;
9129 rec.header.misc = 0;
9130 rec.header.size = sizeof(rec);
9131 rec.hw_id = hw_id;
9132
9133 perf_event_header__init_id(&rec.header, &sample, event);
9134 ret = perf_output_begin(&handle, &sample, event, rec.header.size);
9135
9136 if (ret)
9137 return;
9138
9139 perf_output_put(&handle, rec);
9140 perf_event__output_id_sample(event, &handle, &sample);
9141
9142 perf_output_end(&handle);
9143 }
9144
9145 static int
9146 __perf_event_account_interrupt(struct perf_event *event, int throttle)
9147 {
9148 struct hw_perf_event *hwc = &event->hw;
9149 int ret = 0;
9150 u64 seq;
9151
9152 seq = __this_cpu_read(perf_throttled_seq);
9153 if (seq != hwc->interrupts_seq) {
9154 hwc->interrupts_seq = seq;
9155 hwc->interrupts = 1;
9156 } else {
9157 hwc->interrupts++;
9158 if (unlikely(throttle
9159 && hwc->interrupts >= max_samples_per_tick)) {
9160 __this_cpu_inc(perf_throttled_count);
9161 tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
9162 hwc->interrupts = MAX_INTERRUPTS;
9163 perf_log_throttle(event, 0);
9164 ret = 1;
9165 }
9166 }
9167
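/*
 * For freq-based sampling, re-estimate the sampling period from the
 * interrupt rate observed since the last adjustment, but only if the
 * interval is sane (shorter than ~2 ticks).
 */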
9168 if (event->attr.freq) {
9169 u64 now = perf_clock();
9170 s64 delta = now - hwc->freq_time_stamp;
9171
9172 hwc->freq_time_stamp = now;
9173
9174 if (delta > 0 && delta < 2*TICK_NSEC)
9175 perf_adjust_period(event, delta, hwc->last_period, true);
9176 }
9177
9178 return ret;
9179 }
9180
9181 int perf_event_account_interrupt(struct perf_event *event)
9182 {
9183 return __perf_event_account_interrupt(event, 1);
9184 }
9185
9186
9187 /*
9188  * Generic event overflow handling, sampling.
9189  */
9190 static int __perf_event_overflow(struct perf_event *event,
9191 int throttle, struct perf_sample_data *data,
9192 struct pt_regs *regs)
9193 {
9194 int events = atomic_read(&event->event_limit);
9195 int ret = 0;
9196
9197 /*
9198  * Non-sampling counters might still use the PMI to fold short
9199  * hardware counters, ignore those.
9200  */
9201 if (unlikely(!is_sampling_event(event)))
9202 return 0;
9203
9204 ret = __perf_event_account_interrupt(event, throttle);
9205
9206 /*
9207  * Note: event_limit might not quite work as expected on
9208  * inherited events.
9209  */
9210
9211 event->pending_kill = POLL_IN;
9212 if (events && atomic_dec_and_test(&event->event_limit)) {
9213 ret = 1;
9214 event->pending_kill = POLL_HUP;
9215 event->pending_addr = data->addr;
9216
9217 perf_event_disable_inatomic(event);
9218 }
9219
9220 READ_ONCE(event->overflow_handler)(event, data, regs);
9221
9222 if (*perf_event_fasync(event) && event->pending_kill) {
9223 event->pending_wakeup = 1;
9224 irq_work_queue(&event->pending);
9225 }
9226
9227 return ret;
9228 }
9229
9230 int perf_event_overflow(struct perf_event *event,
9231 struct perf_sample_data *data,
9232 struct pt_regs *regs)
9233 {
9234 return __perf_event_overflow(event, 1, data, regs);
9235 }
9236
9237
9238 /*
9239  * Generic software event infrastructure
9240  */
9241 struct swevent_htable {
9242 struct swevent_hlist *swevent_hlist;
9243 struct mutex hlist_mutex;
9244 int hlist_refcount;
9245
9246 /* Recursion avoidance in each context */
9247 int recursion[PERF_NR_CONTEXTS];
9248 };
9249
9250 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
9251
9252
9253 /*
9254  * We directly increment event->count and keep a second value in
9255  * event->hw.period_left to count intervals. This value is kept
9256  * in the range [-sample_period, 0] so that we can use the sign
9257  * as a trigger.
9258  */
9259 u64 perf_swevent_set_period(struct perf_event *event)
9260 {
9261 struct hw_perf_event *hwc = &event->hw;
9262 u64 period = hwc->last_period;
9263 u64 nr, offset;
9264 s64 old, val;
9265
9266 hwc->last_period = hwc->sample_period;
9267
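/*
 * Example: with period == 100 and period_left previously pushed up to
 * +30 by perf_swevent_event(), the loop below reports
 * nr = (100 + 30) / 100 = 1 pending overflow and rewinds period_left
 * to -70.
 */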
9268 again:
9269 old = val = local64_read(&hwc->period_left);
9270 if (val < 0)
9271 return 0;
9272
9273 nr = div64_u64(period + val, period);
9274 offset = nr * period;
9275 val -= offset;
9276 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
9277 goto again;
9278
9279 return nr;
9280 }
9281
9282 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
9283 struct perf_sample_data *data,
9284 struct pt_regs *regs)
9285 {
9286 struct hw_perf_event *hwc = &event->hw;
9287 int throttle = 0;
9288
9289 if (!overflow)
9290 overflow = perf_swevent_set_period(event);
9291
9292 if (hwc->interrupts == MAX_INTERRUPTS)
9293 return;
9294
9295 for (; overflow; overflow--) {
9296 if (__perf_event_overflow(event, throttle,
9297 data, regs)) {
9298 /*
9299  * We inhibit the overflow from happening when
9300  * hwc->interrupts == MAX_INTERRUPTS.
9301  */
9302 break;
9303 }
9304 throttle = 1;
9305 }
9306 }
9307
9308 static void perf_swevent_event(struct perf_event *event, u64 nr,
9309 struct perf_sample_data *data,
9310 struct pt_regs *regs)
9311 {
9312 struct hw_perf_event *hwc = &event->hw;
9313
9314 local64_add(nr, &event->count);
9315
9316 if (!regs)
9317 return;
9318
9319 if (!is_sampling_event(event))
9320 return;
9321
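/*
 * Three cases: if the caller supplies the period directly (nr is
 * reported as the period), or the sample period is 1, take a sample
 * immediately; otherwise accumulate nr into period_left (kept
 * negative) and only overflow once it crosses zero.
 */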
9322 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
9323 data->period = nr;
9324 return perf_swevent_overflow(event, 1, data, regs);
9325 } else
9326 data->period = event->hw.last_period;
9327
9328 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
9329 return perf_swevent_overflow(event, 1, data, regs);
9330
9331 if (local64_add_negative(nr, &hwc->period_left))
9332 return;
9333
9334 perf_swevent_overflow(event, 0, data, regs);
9335 }
9336
9337 static int perf_exclude_event(struct perf_event *event,
9338 struct pt_regs *regs)
9339 {
9340 if (event->hw.state & PERF_HES_STOPPED)
9341 return 1;
9342
9343 if (regs) {
9344 if (event->attr.exclude_user && user_mode(regs))
9345 return 1;
9346
9347 if (event->attr.exclude_kernel && !user_mode(regs))
9348 return 1;
9349 }
9350
9351 return 0;
9352 }
9353
9354 static int perf_swevent_match(struct perf_event *event,
9355 enum perf_type_id type,
9356 u32 event_id,
9357 struct perf_sample_data *data,
9358 struct pt_regs *regs)
9359 {
9360 if (event->attr.type != type)
9361 return 0;
9362
9363 if (event->attr.config != event_id)
9364 return 0;
9365
9366 if (perf_exclude_event(event, regs))
9367 return 0;
9368
9369 return 1;
9370 }
9371
9372 static inline u64 swevent_hash(u64 type, u32 event_id)
9373 {
9374 u64 val = event_id | (type << 32);
9375
9376 return hash_64(val, SWEVENT_HLIST_BITS);
9377 }
9378
9379 static inline struct hlist_head *
9380 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
9381 {
9382 u64 hash = swevent_hash(type, event_id);
9383
9384 return &hlist->heads[hash];
9385 }
9386
9387 /* For the read side: events when they trigger */
9388 static inline struct hlist_head *
9389 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
9390 {
9391 struct swevent_hlist *hlist;
9392
9393 hlist = rcu_dereference(swhash->swevent_hlist);
9394 if (!hlist)
9395 return NULL;
9396
9397 return __find_swevent_head(hlist, type, event_id);
9398 }
9399
9400 /* For the event head insertion and removal in the hlist */
9401 static inline struct hlist_head *
9402 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
9403 {
9404 struct swevent_hlist *hlist;
9405 u32 event_id = event->attr.config;
9406 u64 type = event->attr.type;
9407
9408 /*
9409  * Event scheduling is always serialized against hlist allocation
9410  * and release, which makes the protected version suitable here.
9411  * The context lock guarantees that.
9412  */
9413 hlist = rcu_dereference_protected(swhash->swevent_hlist,
9414 lockdep_is_held(&event->ctx->lock));
9415 if (!hlist)
9416 return NULL;
9417
9418 return __find_swevent_head(hlist, type, event_id);
9419 }
9420
9421 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
9422 u64 nr,
9423 struct perf_sample_data *data,
9424 struct pt_regs *regs)
9425 {
9426 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9427 struct perf_event *event;
9428 struct hlist_head *head;
9429
9430 rcu_read_lock();
9431 head = find_swevent_head_rcu(swhash, type, event_id);
9432 if (!head)
9433 goto end;
9434
9435 hlist_for_each_entry_rcu(event, head, hlist_entry) {
9436 if (perf_swevent_match(event, type, event_id, data, regs))
9437 perf_swevent_event(event, nr, data, regs);
9438 }
9439 end:
9440 rcu_read_unlock();
9441 }
9442
9443 DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
9444
9445 int perf_swevent_get_recursion_context(void)
9446 {
9447 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9448
9449 return get_recursion_context(swhash->recursion);
9450 }
9451 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
9452
9453 void perf_swevent_put_recursion_context(int rctx)
9454 {
9455 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9456
9457 put_recursion_context(swhash->recursion, rctx);
9458 }
9459
9460 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
9461 {
9462 struct perf_sample_data data;
9463
9464 if (WARN_ON_ONCE(!regs))
9465 return;
9466
9467 perf_sample_data_init(&data, addr, 0);
9468 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
9469 }
9470
9471 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
9472 {
9473 int rctx;
9474
9475 preempt_disable_notrace();
9476 rctx = perf_swevent_get_recursion_context();
9477 if (unlikely(rctx < 0))
9478 goto fail;
9479
9480 ___perf_sw_event(event_id, nr, regs, addr);
9481
9482 perf_swevent_put_recursion_context(rctx);
9483 fail:
9484 preempt_enable_notrace();
9485 }
9486
9487 static void perf_swevent_read(struct perf_event *event)
9488 {
9489 }
9490
9491 static int perf_swevent_add(struct perf_event *event, int flags)
9492 {
9493 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9494 struct hw_perf_event *hwc = &event->hw;
9495 struct hlist_head *head;
9496
9497 if (is_sampling_event(event)) {
9498 hwc->last_period = hwc->sample_period;
9499 perf_swevent_set_period(event);
9500 }
9501
9502 hwc->state = !(flags & PERF_EF_START);
9503
9504 head = find_swevent_head(swhash, event);
9505 if (WARN_ON_ONCE(!head))
9506 return -EINVAL;
9507
9508 hlist_add_head_rcu(&event->hlist_entry, head);
9509 perf_event_update_userpage(event);
9510
9511 return 0;
9512 }
9513
9514 static void perf_swevent_del(struct perf_event *event, int flags)
9515 {
9516 hlist_del_rcu(&event->hlist_entry);
9517 }
9518
9519 static void perf_swevent_start(struct perf_event *event, int flags)
9520 {
9521 event->hw.state = 0;
9522 }
9523
9524 static void perf_swevent_stop(struct perf_event *event, int flags)
9525 {
9526 event->hw.state = PERF_HES_STOPPED;
9527 }
9528
9529 /* Deref the hlist from the update side */
9530 static inline struct swevent_hlist *
9531 swevent_hlist_deref(struct swevent_htable *swhash)
9532 {
9533 return rcu_dereference_protected(swhash->swevent_hlist,
9534 lockdep_is_held(&swhash->hlist_mutex));
9535 }
9536
9537 static void swevent_hlist_release(struct swevent_htable *swhash)
9538 {
9539 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
9540
9541 if (!hlist)
9542 return;
9543
9544 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
9545 kfree_rcu(hlist, rcu_head);
9546 }
9547
9548 static void swevent_hlist_put_cpu(int cpu)
9549 {
9550 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9551
9552 mutex_lock(&swhash->hlist_mutex);
9553
9554 if (!--swhash->hlist_refcount)
9555 swevent_hlist_release(swhash);
9556
9557 mutex_unlock(&swhash->hlist_mutex);
9558 }
9559
9560 static void swevent_hlist_put(void)
9561 {
9562 int cpu;
9563
9564 for_each_possible_cpu(cpu)
9565 swevent_hlist_put_cpu(cpu);
9566 }
9567
9568 static int swevent_hlist_get_cpu(int cpu)
9569 {
9570 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9571 int err = 0;
9572
9573 mutex_lock(&swhash->hlist_mutex);
9574 if (!swevent_hlist_deref(swhash) &&
9575 cpumask_test_cpu(cpu, perf_online_mask)) {
9576 struct swevent_hlist *hlist;
9577
9578 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
9579 if (!hlist) {
9580 err = -ENOMEM;
9581 goto exit;
9582 }
9583 rcu_assign_pointer(swhash->swevent_hlist, hlist);
9584 }
9585 swhash->hlist_refcount++;
9586 exit:
9587 mutex_unlock(&swhash->hlist_mutex);
9588
9589 return err;
9590 }
9591
9592 static int swevent_hlist_get(void)
9593 {
9594 int err, cpu, failed_cpu;
9595
9596 mutex_lock(&pmus_lock);
9597 for_each_possible_cpu(cpu) {
9598 err = swevent_hlist_get_cpu(cpu);
9599 if (err) {
9600 failed_cpu = cpu;
9601 goto fail;
9602 }
9603 }
9604 mutex_unlock(&pmus_lock);
9605 return 0;
9606 fail:
9607 for_each_possible_cpu(cpu) {
9608 if (cpu == failed_cpu)
9609 break;
9610 swevent_hlist_put_cpu(cpu);
9611 }
9612 mutex_unlock(&pmus_lock);
9613 return err;
9614 }
9615
9616 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
9617
9618 static void sw_perf_event_destroy(struct perf_event *event)
9619 {
9620 u64 event_id = event->attr.config;
9621
9622 WARN_ON(event->parent);
9623
9624 static_key_slow_dec(&perf_swevent_enabled[event_id]);
9625 swevent_hlist_put();
9626 }
9627
9628 static int perf_swevent_init(struct perf_event *event)
9629 {
9630 u64 event_id = event->attr.config;
9631
9632 if (event->attr.type != PERF_TYPE_SOFTWARE)
9633 return -ENOENT;
9634
9635 /*
9636  * no branch sampling for software events
9637  */
9638 if (has_branch_stack(event))
9639 return -EOPNOTSUPP;
9640
9641 switch (event_id) {
9642 case PERF_COUNT_SW_CPU_CLOCK:
9643 case PERF_COUNT_SW_TASK_CLOCK:
9644 return -ENOENT;
9645
9646 default:
9647 break;
9648 }
9649
9650 if (event_id >= PERF_COUNT_SW_MAX)
9651 return -ENOENT;
9652
9653 if (!event->parent) {
9654 int err;
9655
9656 err = swevent_hlist_get();
9657 if (err)
9658 return err;
9659
9660 static_key_slow_inc(&perf_swevent_enabled[event_id]);
9661 event->destroy = sw_perf_event_destroy;
9662 }
9663
9664 return 0;
9665 }
9666
9667 static struct pmu perf_swevent = {
9668 .task_ctx_nr = perf_sw_context,
9669
9670 .capabilities = PERF_PMU_CAP_NO_NMI,
9671
9672 .event_init = perf_swevent_init,
9673 .add = perf_swevent_add,
9674 .del = perf_swevent_del,
9675 .start = perf_swevent_start,
9676 .stop = perf_swevent_stop,
9677 .read = perf_swevent_read,
9678 };
9679
9680 #ifdef CONFIG_EVENT_TRACING
9681
9682 static int perf_tp_filter_match(struct perf_event *event,
9683 struct perf_sample_data *data)
9684 {
9685 void *record = data->raw->frag.data;
9686
9687 /* only top level events have filters set */
9688 if (event->parent)
9689 event = event->parent;
9690
9691 if (likely(!event->filter) || filter_match_preds(event->filter, record))
9692 return 1;
9693 return 0;
9694 }
9695
9696 static int perf_tp_event_match(struct perf_event *event,
9697 struct perf_sample_data *data,
9698 struct pt_regs *regs)
9699 {
9700 if (event->hw.state & PERF_HES_STOPPED)
9701 return 0;
9702
9703
9704 /* With exclude_kernel, only user-space tracepoints (uprobes) are traced */
9705 if (event->attr.exclude_kernel && !user_mode(regs))
9706 return 0;
9707
9708 if (!perf_tp_filter_match(event, data))
9709 return 0;
9710
9711 return 1;
9712 }
9713
9714 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
9715 struct trace_event_call *call, u64 count,
9716 struct pt_regs *regs, struct hlist_head *head,
9717 struct task_struct *task)
9718 {
9719 if (bpf_prog_array_valid(call)) {
9720 *(struct pt_regs **)raw_data = regs;
9721 if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
9722 perf_swevent_put_recursion_context(rctx);
9723 return;
9724 }
9725 }
9726 perf_tp_event(call->event.type, count, raw_data, size, regs, head,
9727 rctx, task);
9728 }
9729 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
9730
9731 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
9732 struct pt_regs *regs, struct hlist_head *head, int rctx,
9733 struct task_struct *task)
9734 {
9735 struct perf_sample_data data;
9736 struct perf_event *event;
9737
9738 struct perf_raw_record raw = {
9739 .frag = {
9740 .size = entry_size,
9741 .data = record,
9742 },
9743 };
9744
9745 perf_sample_data_init(&data, 0, 0);
9746 data.raw = &raw;
9747
9748 perf_trace_buf_update(record, event_type);
9749
9750 hlist_for_each_entry_rcu(event, head, hlist_entry) {
9751 if (perf_tp_event_match(event, &data, regs))
9752 perf_swevent_event(event, count, &data, regs);
9753 }
9754
9755 /*
9756  * If we got specified a target task, also iterate its context and
9757  * deliver this event there too.
9758  */
9759 if (task && task != current) {
9760 struct perf_event_context *ctx;
9761 struct trace_entry *entry = record;
9762
9763 rcu_read_lock();
9764 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
9765 if (!ctx)
9766 goto unlock;
9767
9768 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
9769 if (event->cpu != smp_processor_id())
9770 continue;
9771 if (event->attr.type != PERF_TYPE_TRACEPOINT)
9772 continue;
9773 if (event->attr.config != entry->type)
9774 continue;
9775 /* Cannot deliver a synchronous signal to another task. */
9776 if (event->attr.sigtrap)
9777 continue;
9778 if (perf_tp_event_match(event, &data, regs))
9779 perf_swevent_event(event, count, &data, regs);
9780 }
9781 unlock:
9782 rcu_read_unlock();
9783 }
9784
9785 perf_swevent_put_recursion_context(rctx);
9786 }
9787 EXPORT_SYMBOL_GPL(perf_tp_event);
9788
9789 static void tp_perf_event_destroy(struct perf_event *event)
9790 {
9791 perf_trace_destroy(event);
9792 }
9793
9794 static int perf_tp_event_init(struct perf_event *event)
9795 {
9796 int err;
9797
9798 if (event->attr.type != PERF_TYPE_TRACEPOINT)
9799 return -ENOENT;
9800
9801 /*
9802  * no branch sampling for tracepoint events
9803  */
9804 if (has_branch_stack(event))
9805 return -EOPNOTSUPP;
9806
9807 err = perf_trace_init(event);
9808 if (err)
9809 return err;
9810
9811 event->destroy = tp_perf_event_destroy;
9812
9813 return 0;
9814 }
9815
9816 static struct pmu perf_tracepoint = {
9817 .task_ctx_nr = perf_sw_context,
9818
9819 .event_init = perf_tp_event_init,
9820 .add = perf_trace_add,
9821 .del = perf_trace_del,
9822 .start = perf_swevent_start,
9823 .stop = perf_swevent_stop,
9824 .read = perf_swevent_read,
9825 };
9826
9827 #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
9828
9829 /*
9830  * Flags in event->attr.config, used by the dynamic kprobe and uprobe
9831  * PMUs. They must match the PMU_FORMAT_ATTR() definitions below.
9832  *
9833  * PERF_PROBE_CONFIG_IS_RETPROBE	if set, create a kretprobe/uretprobe
9834  *					instead of a kprobe/uprobe
9835  *
9836  * The upper PERF_UPROBE_REF_CTR_OFFSET_BITS bits of config hold the
9837  * file offset of a USDT reference counter (semaphore); the kernel
9838  * increments/decrements that counter to enable/disable the probed
9839  * location.
9840  */
9841
9842 enum perf_probe_config {
9843 PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,
9844 PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
9845 PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
9846 };
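/*
 * Example (sketch): a uretprobe with a USDT reference counter at file
 * offset 0x1234 would set
 *	attr.config = ((u64)0x1234 << PERF_UPROBE_REF_CTR_OFFSET_SHIFT)
 *		    | PERF_PROBE_CONFIG_IS_RETPROBE;
 * before perf_event_open(), matching the format attributes below.
 */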
9847
9848 PMU_FORMAT_ATTR(retprobe, "config:0");
9849 #endif
9850
9851 #ifdef CONFIG_KPROBE_EVENTS
9852 static struct attribute *kprobe_attrs[] = {
9853 &format_attr_retprobe.attr,
9854 NULL,
9855 };
9856
9857 static struct attribute_group kprobe_format_group = {
9858 .name = "format",
9859 .attrs = kprobe_attrs,
9860 };
9861
9862 static const struct attribute_group *kprobe_attr_groups[] = {
9863 &kprobe_format_group,
9864 NULL,
9865 };
9866
9867 static int perf_kprobe_event_init(struct perf_event *event);
9868 static struct pmu perf_kprobe = {
9869 .task_ctx_nr = perf_sw_context,
9870 .event_init = perf_kprobe_event_init,
9871 .add = perf_trace_add,
9872 .del = perf_trace_del,
9873 .start = perf_swevent_start,
9874 .stop = perf_swevent_stop,
9875 .read = perf_swevent_read,
9876 .attr_groups = kprobe_attr_groups,
9877 };
9878
9879 static int perf_kprobe_event_init(struct perf_event *event)
9880 {
9881 int err;
9882 bool is_retprobe;
9883
9884 if (event->attr.type != perf_kprobe.type)
9885 return -ENOENT;
9886
9887 if (!perfmon_capable())
9888 return -EACCES;
9889
9890 /*
9891  * no branch sampling for probing events
9892  */
9893 if (has_branch_stack(event))
9894 return -EOPNOTSUPP;
9895
9896 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
9897 err = perf_kprobe_init(event, is_retprobe);
9898 if (err)
9899 return err;
9900
9901 event->destroy = perf_kprobe_destroy;
9902
9903 return 0;
9904 }
9905 #endif
9906
9907 #ifdef CONFIG_UPROBE_EVENTS
9908 PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
9909
9910 static struct attribute *uprobe_attrs[] = {
9911 &format_attr_retprobe.attr,
9912 &format_attr_ref_ctr_offset.attr,
9913 NULL,
9914 };
9915
9916 static struct attribute_group uprobe_format_group = {
9917 .name = "format",
9918 .attrs = uprobe_attrs,
9919 };
9920
9921 static const struct attribute_group *uprobe_attr_groups[] = {
9922 &uprobe_format_group,
9923 NULL,
9924 };
9925
9926 static int perf_uprobe_event_init(struct perf_event *event);
9927 static struct pmu perf_uprobe = {
9928 .task_ctx_nr = perf_sw_context,
9929 .event_init = perf_uprobe_event_init,
9930 .add = perf_trace_add,
9931 .del = perf_trace_del,
9932 .start = perf_swevent_start,
9933 .stop = perf_swevent_stop,
9934 .read = perf_swevent_read,
9935 .attr_groups = uprobe_attr_groups,
9936 };
9937
9938 static int perf_uprobe_event_init(struct perf_event *event)
9939 {
9940 int err;
9941 unsigned long ref_ctr_offset;
9942 bool is_retprobe;
9943
9944 if (event->attr.type != perf_uprobe.type)
9945 return -ENOENT;
9946
9947 if (!perfmon_capable())
9948 return -EACCES;
9949
9950 /*
9951  * no branch sampling for probing events
9952  */
9953 if (has_branch_stack(event))
9954 return -EOPNOTSUPP;
9955
9956 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
9957 ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
9958 err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
9959 if (err)
9960 return err;
9961
9962 event->destroy = perf_uprobe_destroy;
9963
9964 return 0;
9965 }
9966 #endif
9967
9968 static inline void perf_tp_register(void)
9969 {
9970 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
9971 #ifdef CONFIG_KPROBE_EVENTS
9972 perf_pmu_register(&perf_kprobe, "kprobe", -1);
9973 #endif
9974 #ifdef CONFIG_UPROBE_EVENTS
9975 perf_pmu_register(&perf_uprobe, "uprobe", -1);
9976 #endif
9977 }
9978
9979 static void perf_event_free_filter(struct perf_event *event)
9980 {
9981 ftrace_profile_free_filter(event);
9982 }
9983
9984 #ifdef CONFIG_BPF_SYSCALL
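/*
 * Overflow handler used while a BPF program is attached to the event:
 * run the program on each sample and only fall through to the original
 * overflow handler when the program returns non-zero.
 */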
9985 static void bpf_overflow_handler(struct perf_event *event,
9986 struct perf_sample_data *data,
9987 struct pt_regs *regs)
9988 {
9989 struct bpf_perf_event_data_kern ctx = {
9990 .data = data,
9991 .event = event,
9992 };
9993 struct bpf_prog *prog;
9994 int ret = 0;
9995
9996 ctx.regs = perf_arch_bpf_user_pt_regs(regs);
9997 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
9998 goto out;
9999 rcu_read_lock();
10000 prog = READ_ONCE(event->prog);
10001 if (prog)
10002 ret = bpf_prog_run(prog, &ctx);
10003 rcu_read_unlock();
10004 out:
10005 __this_cpu_dec(bpf_prog_active);
10006 if (!ret)
10007 return;
10008
10009 event->orig_overflow_handler(event, data, regs);
10010 }
10011
10012 static int perf_event_set_bpf_handler(struct perf_event *event,
10013 struct bpf_prog *prog,
10014 u64 bpf_cookie)
10015 {
10016 if (event->overflow_handler_context)
10017 /* hw breakpoint or kernel counter */
10018 return -EINVAL;
10019
10020 if (event->prog)
10021 return -EEXIST;
10022
10023 if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
10024 return -EINVAL;
10025
10026 if (event->attr.precise_ip &&
10027 prog->call_get_stack &&
10028 (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
10029 event->attr.exclude_callchain_kernel ||
10030 event->attr.exclude_callchain_user)) {
10031 /*
10032  * On perf_events with precise_ip, calling bpf_get_stack()
10033  * may trigger unwinder warnings and occasional crashes.
10034  * bpf_get_[stack|stackid] works around this issue by using
10035  * the callchain attached to perf_sample_data. If the event
10036  * does not have a full (kernel and user) callchain attached
10037  * to its perf_sample_data, do not allow attaching a BPF
10038  * program that calls bpf_get_[stack|stackid].
10039  */
10040 return -EPROTO;
10041 }
10042
10043 event->prog = prog;
10044 event->bpf_cookie = bpf_cookie;
10045 event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
10046 WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
10047 return 0;
10048 }
10049
10050 static void perf_event_free_bpf_handler(struct perf_event *event)
10051 {
10052 struct bpf_prog *prog = event->prog;
10053
10054 if (!prog)
10055 return;
10056
10057 WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
10058 event->prog = NULL;
10059 bpf_prog_put(prog);
10060 }
10061 #else
10062 static int perf_event_set_bpf_handler(struct perf_event *event,
10063 struct bpf_prog *prog,
10064 u64 bpf_cookie)
10065 {
10066 return -EOPNOTSUPP;
10067 }
10068 static void perf_event_free_bpf_handler(struct perf_event *event)
10069 {
10070 }
10071 #endif
10072
10073 /*
10074  * Returns true if the event is a tracepoint, or a kprobe/uprobe
10075  * created with perf_event_open().
10076  */
10077 static inline bool perf_event_is_tracing(struct perf_event *event)
10078 {
10079 if (event->pmu == &perf_tracepoint)
10080 return true;
10081 #ifdef CONFIG_KPROBE_EVENTS
10082 if (event->pmu == &perf_kprobe)
10083 return true;
10084 #endif
10085 #ifdef CONFIG_UPROBE_EVENTS
10086 if (event->pmu == &perf_uprobe)
10087 return true;
10088 #endif
10089 return false;
10090 }
10091
10092 int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
10093 u64 bpf_cookie)
10094 {
10095 bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
10096
10097 if (!perf_event_is_tracing(event))
10098 return perf_event_set_bpf_handler(event, prog, bpf_cookie);
10099
10100 is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
10101 is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
10102 is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
10103 is_syscall_tp = is_syscall_trace_event(event->tp_event);
10104 if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp)
10105 /* bpf programs can only be attached to u/kprobe or tracepoint */
10106 return -EINVAL;
10107
10108 if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) ||
10109 (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
10110 (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
10111 return -EINVAL;
10112
10113 if (prog->type == BPF_PROG_TYPE_KPROBE && prog->aux->sleepable && !is_uprobe)
10114 /* only uprobe programs are able to be sleepable */
10115 return -EINVAL;
10116
10117 /* Kprobe override only works for kprobes, not uprobes. */
10118 if (prog->kprobe_override && !is_kprobe)
10119 return -EINVAL;
10120
10121 if (is_tracepoint || is_syscall_tp) {
10122 int off = trace_event_get_offsets(event->tp_event);
10123
10124 if (prog->aux->max_ctx_offset > off)
10125 return -EACCES;
10126 }
10127
10128 return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
10129 }
10130
10131 void perf_event_free_bpf_prog(struct perf_event *event)
10132 {
10133 if (!perf_event_is_tracing(event)) {
10134 perf_event_free_bpf_handler(event);
10135 return;
10136 }
10137 perf_event_detach_bpf_prog(event);
10138 }
10139
10140 #else
10141
10142 static inline void perf_tp_register(void)
10143 {
10144 }
10145
10146 static void perf_event_free_filter(struct perf_event *event)
10147 {
10148 }
10149
10150 int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
10151 u64 bpf_cookie)
10152 {
10153 return -ENOENT;
10154 }
10155
10156 void perf_event_free_bpf_prog(struct perf_event *event)
10157 {
10158 }
10159 #endif
10160
10161 #ifdef CONFIG_HAVE_HW_BREAKPOINT
10162 void perf_bp_event(struct perf_event *bp, void *data)
10163 {
10164 struct perf_sample_data sample;
10165 struct pt_regs *regs = data;
10166
10167 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
10168
10169 if (!bp->hw.state && !perf_exclude_event(bp, regs))
10170 perf_swevent_event(bp, 1, &sample, regs);
10171 }
10172 #endif
10173
10174
10175
10176
10177 static struct perf_addr_filter *
10178 perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
10179 {
10180 int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
10181 struct perf_addr_filter *filter;
10182
10183 filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
10184 if (!filter)
10185 return NULL;
10186
10187 INIT_LIST_HEAD(&filter->entry);
10188 list_add_tail(&filter->entry, filters);
10189
10190 return filter;
10191 }
10192
10193 static void free_filters_list(struct list_head *filters)
10194 {
10195 struct perf_addr_filter *filter, *iter;
10196
10197 list_for_each_entry_safe(filter, iter, filters, entry) {
10198 path_put(&filter->path);
10199 list_del(&filter->entry);
10200 kfree(filter);
10201 }
10202 }
10203
10204
10205
10206
10207 static void perf_addr_filters_splice(struct perf_event *event,
10208 struct list_head *head)
10209 {
10210 unsigned long flags;
10211 LIST_HEAD(list);
10212
10213 if (!has_addr_filter(event))
10214 return;
10215
10216
10217 if (event->parent)
10218 return;
10219
10220 raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
10221
10222 list_splice_init(&event->addr_filters.list, &list);
10223 if (head)
10224 list_splice(head, &event->addr_filters.list);
10225
10226 raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
10227
10228 free_filters_list(&list);
10229 }
10230
10231 /*
10232  * Scan through mm's vmas and see if one of them matches the
10233  * @filter; if so, adjust filter's address range.
10234  * Called with mm::mmap_lock down for reading.
10235  */
10236 static void perf_addr_filter_apply(struct perf_addr_filter *filter,
10237 struct mm_struct *mm,
10238 struct perf_addr_filter_range *fr)
10239 {
10240 struct vm_area_struct *vma;
10241
10242 for (vma = mm->mmap; vma; vma = vma->vm_next) {
10243 if (!vma->vm_file)
10244 continue;
10245
10246 if (perf_addr_filter_vma_adjust(filter, vma, fr))
10247 return;
10248 }
10249 }
10250
10251 /*
10252  * Update event's address range filters based on the
10253  * task's existing mappings, if any.
10254  */
10255 static void perf_event_addr_filters_apply(struct perf_event *event)
10256 {
10257 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
10258 struct task_struct *task = READ_ONCE(event->ctx->task);
10259 struct perf_addr_filter *filter;
10260 struct mm_struct *mm = NULL;
10261 unsigned int count = 0;
10262 unsigned long flags;
10263
10264
10265
10266
10267
10268 if (task == TASK_TOMBSTONE)
10269 return;
10270
10271 if (ifh->nr_file_filters) {
10272 mm = get_task_mm(task);
10273 if (!mm)
10274 goto restart;
10275
10276 mmap_read_lock(mm);
10277 }
10278
10279 raw_spin_lock_irqsave(&ifh->lock, flags);
10280 list_for_each_entry(filter, &ifh->list, entry) {
10281 if (filter->path.dentry) {
10282
10283
10284
10285
10286 event->addr_filter_ranges[count].start = 0;
10287 event->addr_filter_ranges[count].size = 0;
10288
10289 perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
10290 } else {
10291 event->addr_filter_ranges[count].start = filter->offset;
10292 event->addr_filter_ranges[count].size = filter->size;
10293 }
10294
10295 count++;
10296 }
10297
10298 event->addr_filters_gen++;
10299 raw_spin_unlock_irqrestore(&ifh->lock, flags);
10300
10301 if (ifh->nr_file_filters) {
10302 mmap_read_unlock(mm);
10303
10304 mmput(mm);
10305 }
10306
10307 restart:
10308 perf_event_stop(event, 1);
10309 }
10310
10311 /*
10312  * Address range filtering: limiting the data to certain
10313  * instruction address ranges. Filters are ioctl()ed to us from
10314  * userspace as ascii strings.
10315  *
10316  * Filter string format:
10317  *
10318  * ACTION RANGE_SPEC
10319  * where ACTION is one of the
10320  *  * "filter": limit the trace to this region
10321  *  * "start": start tracing from this address
10322  *  * "stop": stop tracing at this address/region;
10323  * RANGE_SPEC is
10324  *  * for kernel addresses: <start address>[/<size>]
10325  *  * for object files:     <start address>[/<size>]@</path/to/object/file>
10326  *
10327  * if <size> is not specified or is zero, the range is treated as
10328  * a single address; not valid for ACTION=="filter".
10329  */
10330 enum {
10331 IF_ACT_NONE = -1,
10332 IF_ACT_FILTER,
10333 IF_ACT_START,
10334 IF_ACT_STOP,
10335 IF_SRC_FILE,
10336 IF_SRC_KERNEL,
10337 IF_SRC_FILEADDR,
10338 IF_SRC_KERNELADDR,
10339 };
10340
10341 enum {
10342 IF_STATE_ACTION = 0,
10343 IF_STATE_SOURCE,
10344 IF_STATE_END,
10345 };
10346
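/*
 * The filter parser below is a small state machine: IF_STATE_ACTION
 * expects one of the action keywords, IF_STATE_SOURCE expects an
 * address range (optionally with a size and an object file path), and
 * IF_STATE_END means a complete filter has been parsed and the next
 * token starts a new one.
 */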
10347 static const match_table_t if_tokens = {
10348 { IF_ACT_FILTER, "filter" },
10349 { IF_ACT_START, "start" },
10350 { IF_ACT_STOP, "stop" },
10351 { IF_SRC_FILE, "%u/%u@%s" },
10352 { IF_SRC_KERNEL, "%u/%u" },
10353 { IF_SRC_FILEADDR, "%u@%s" },
10354 { IF_SRC_KERNELADDR, "%u" },
10355 { IF_ACT_NONE, NULL },
10356 };
10357
10358
10359
10360
10361 static int
10362 perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
10363 struct list_head *filters)
10364 {
10365 struct perf_addr_filter *filter = NULL;
10366 char *start, *orig, *filename = NULL;
10367 substring_t args[MAX_OPT_ARGS];
10368 int state = IF_STATE_ACTION, token;
10369 unsigned int kernel = 0;
10370 int ret = -EINVAL;
10371
10372 orig = fstr = kstrdup(fstr, GFP_KERNEL);
10373 if (!fstr)
10374 return -ENOMEM;
10375
10376 while ((start = strsep(&fstr, " ,\n")) != NULL) {
10377 static const enum perf_addr_filter_action_t actions[] = {
10378 [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
10379 [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START,
10380 [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP,
10381 };
10382 ret = -EINVAL;
10383
10384 if (!*start)
10385 continue;
10386
10387
10388 if (state == IF_STATE_ACTION) {
10389 filter = perf_addr_filter_new(event, filters);
10390 if (!filter)
10391 goto fail;
10392 }
10393
10394 token = match_token(start, if_tokens, args);
10395 switch (token) {
10396 case IF_ACT_FILTER:
10397 case IF_ACT_START:
10398 case IF_ACT_STOP:
10399 if (state != IF_STATE_ACTION)
10400 goto fail;
10401
10402 filter->action = actions[token];
10403 state = IF_STATE_SOURCE;
10404 break;
10405
10406 case IF_SRC_KERNELADDR:
10407 case IF_SRC_KERNEL:
10408 kernel = 1;
10409 fallthrough;
10410
10411 case IF_SRC_FILEADDR:
10412 case IF_SRC_FILE:
10413 if (state != IF_STATE_SOURCE)
10414 goto fail;
10415
10416 *args[0].to = 0;
10417 ret = kstrtoul(args[0].from, 0, &filter->offset);
10418 if (ret)
10419 goto fail;
10420
10421 if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
10422 *args[1].to = 0;
10423 ret = kstrtoul(args[1].from, 0, &filter->size);
10424 if (ret)
10425 goto fail;
10426 }
10427
10428 if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
10429 int fpos = token == IF_SRC_FILE ? 2 : 1;
10430
10431 kfree(filename);
10432 filename = match_strdup(&args[fpos]);
10433 if (!filename) {
10434 ret = -ENOMEM;
10435 goto fail;
10436 }
10437 }
10438
10439 state = IF_STATE_END;
10440 break;
10441
10442 default:
10443 goto fail;
10444 }
10445
10446
10447
10448
10449
10450
10451 if (state == IF_STATE_END) {
10452 ret = -EINVAL;
10453
10454
10455
10456
10457
10458 if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
10459 !filter->size)
10460 goto fail;
10461
10462 if (!kernel) {
10463 if (!filename)
10464 goto fail;
10465
10466 /*
10467  * For now, only file-based filters are supported for
10468  * per-task events; doing so for CPU-wide events requires
10469  * additional context switching trickery, since the same
10470  * object code will be mapped at different virtual
10471  * addresses in different processes.
10472  */
10473
10474 ret = -EOPNOTSUPP;
10475 if (!event->ctx->task)
10476 goto fail;
10477
10478
10479 ret = kern_path(filename, LOOKUP_FOLLOW,
10480 &filter->path);
10481 if (ret)
10482 goto fail;
10483
10484 ret = -EINVAL;
10485 if (!filter->path.dentry ||
10486 !S_ISREG(d_inode(filter->path.dentry)
10487 ->i_mode))
10488 goto fail;
10489
10490 event->addr_filters.nr_file_filters++;
10491 }
10492
10493
10494 kfree(filename);
10495 filename = NULL;
10496 state = IF_STATE_ACTION;
10497 filter = NULL;
10498 kernel = 0;
10499 }
10500 }
10501
10502 if (state != IF_STATE_ACTION)
10503 goto fail;
10504
10505 kfree(filename);
10506 kfree(orig);
10507
10508 return 0;
10509
10510 fail:
10511 kfree(filename);
10512 free_filters_list(filters);
10513 kfree(orig);
10514
10515 return ret;
10516 }
10517
10518 static int
10519 perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
10520 {
10521 LIST_HEAD(filters);
10522 int ret;
10523
10524 /*
10525  * Since this is called in the perf_ioctl() path, we're already
10526  * holding ctx::mutex.
10527  */
10528 lockdep_assert_held(&event->ctx->mutex);
10529
10530 if (WARN_ON_ONCE(event->parent))
10531 return -EINVAL;
10532
10533 ret = perf_event_parse_addr_filter(event, filter_str, &filters);
10534 if (ret)
10535 goto fail_clear_files;
10536
10537 ret = event->pmu->addr_filters_validate(&filters);
10538 if (ret)
10539 goto fail_free_filters;
10540
10541
10542 perf_addr_filters_splice(event, &filters);
10543
10544
10545 perf_event_for_each_child(event, perf_event_addr_filters_apply);
10546
10547 return ret;
10548
10549 fail_free_filters:
10550 free_filters_list(&filters);
10551
10552 fail_clear_files:
10553 event->addr_filters.nr_file_filters = 0;
10554
10555 return ret;
10556 }
10557
10558 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
10559 {
10560 int ret = -EINVAL;
10561 char *filter_str;
10562
10563 filter_str = strndup_user(arg, PAGE_SIZE);
10564 if (IS_ERR(filter_str))
10565 return PTR_ERR(filter_str);
10566
10567 #ifdef CONFIG_EVENT_TRACING
10568 if (perf_event_is_tracing(event)) {
10569 struct perf_event_context *ctx = event->ctx;
10570
10571 /*
10572  * The ftrace filter code takes locks that nest outside of
10573  * ctx::mutex, so temporarily drop ctx::mutex around the
10574  * ftrace_profile_set_filter() call. perf_event_ctx_lock()
10575  * already holds a reference on ctx, so the context cannot
10576  * disappear while the mutex is dropped.
10577  *
10578  * The event may get moved to a different context in the
10579  * meantime, but that does not affect the tracepoint filter
10580  * state being set here.
10581  */
10582 mutex_unlock(&ctx->mutex);
10583 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
10584 mutex_lock(&ctx->mutex);
10585 } else
10586 #endif
10587 if (has_addr_filter(event))
10588 ret = perf_event_set_addr_filter(event, filter_str);
10589
10590 kfree(filter_str);
10591 return ret;
10592 }
10593
10594
10595 /*
10596  * hrtimer based swevent callback
10597  */
10598 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
10599 {
10600 enum hrtimer_restart ret = HRTIMER_RESTART;
10601 struct perf_sample_data data;
10602 struct pt_regs *regs;
10603 struct perf_event *event;
10604 u64 period;
10605
10606 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
10607
10608 if (event->state != PERF_EVENT_STATE_ACTIVE)
10609 return HRTIMER_NORESTART;
10610
10611 event->pmu->read(event);
10612
10613 perf_sample_data_init(&data, 0, event->hw.last_period);
10614 regs = get_irq_regs();
10615
10616 if (regs && !perf_exclude_event(event, regs)) {
10617 if (!(event->attr.exclude_idle && is_idle_task(current)))
10618 if (__perf_event_overflow(event, 1, &data, regs))
10619 ret = HRTIMER_NORESTART;
10620 }
10621
10622 period = max_t(u64, 10000, event->hw.sample_period);
10623 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
10624
10625 return ret;
10626 }
10627
10628 static void perf_swevent_start_hrtimer(struct perf_event *event)
10629 {
10630 struct hw_perf_event *hwc = &event->hw;
10631 s64 period;
10632
10633 if (!is_sampling_event(event))
10634 return;
10635
10636 period = local64_read(&hwc->period_left);
10637 if (period) {
10638 if (period < 0)
10639 period = 10000;
10640
10641 local64_set(&hwc->period_left, 0);
10642 } else {
10643 period = max_t(u64, 10000, hwc->sample_period);
10644 }
10645 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
10646 HRTIMER_MODE_REL_PINNED_HARD);
10647 }
10648
10649 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
10650 {
10651 struct hw_perf_event *hwc = &event->hw;
10652
10653 if (is_sampling_event(event)) {
10654 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
10655 local64_set(&hwc->period_left, ktime_to_ns(remaining));
10656
10657 hrtimer_cancel(&hwc->hrtimer);
10658 }
10659 }
10660
10661 static void perf_swevent_init_hrtimer(struct perf_event *event)
10662 {
10663 struct hw_perf_event *hwc = &event->hw;
10664
10665 if (!is_sampling_event(event))
10666 return;
10667
10668 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
10669 hwc->hrtimer.function = perf_swevent_hrtimer;
10670
10671 /*
10672  * Since hrtimers have a fixed rate, we can do a static freq->period
10673  * mapping and avoid the whole period adjust feedback stuff.
10674  */
10675 if (event->attr.freq) {
10676 long freq = event->attr.sample_freq;
10677
10678 event->attr.sample_period = NSEC_PER_SEC / freq;
10679 hwc->sample_period = event->attr.sample_period;
10680 local64_set(&hwc->period_left, hwc->sample_period);
10681 hwc->last_period = hwc->sample_period;
10682 event->attr.freq = 0;
10683 }
10684 }
10685
10686
10687 /*
10688  * Software event: cpu wall time clock
10689  */
10690 static void cpu_clock_event_update(struct perf_event *event)
10691 {
10692 s64 prev;
10693 u64 now;
10694
10695 now = local_clock();
10696 prev = local64_xchg(&event->hw.prev_count, now);
10697 local64_add(now - prev, &event->count);
10698 }
10699
10700 static void cpu_clock_event_start(struct perf_event *event, int flags)
10701 {
10702 local64_set(&event->hw.prev_count, local_clock());
10703 perf_swevent_start_hrtimer(event);
10704 }
10705
10706 static void cpu_clock_event_stop(struct perf_event *event, int flags)
10707 {
10708 perf_swevent_cancel_hrtimer(event);
10709 cpu_clock_event_update(event);
10710 }
10711
10712 static int cpu_clock_event_add(struct perf_event *event, int flags)
10713 {
10714 if (flags & PERF_EF_START)
10715 cpu_clock_event_start(event, flags);
10716 perf_event_update_userpage(event);
10717
10718 return 0;
10719 }
10720
10721 static void cpu_clock_event_del(struct perf_event *event, int flags)
10722 {
10723 cpu_clock_event_stop(event, flags);
10724 }
10725
10726 static void cpu_clock_event_read(struct perf_event *event)
10727 {
10728 cpu_clock_event_update(event);
10729 }
10730
10731 static int cpu_clock_event_init(struct perf_event *event)
10732 {
10733 if (event->attr.type != PERF_TYPE_SOFTWARE)
10734 return -ENOENT;
10735
10736 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
10737 return -ENOENT;
10738
10739 /*
10740  * no branch sampling for software events
10741  */
10742 if (has_branch_stack(event))
10743 return -EOPNOTSUPP;
10744
10745 perf_swevent_init_hrtimer(event);
10746
10747 return 0;
10748 }
10749
10750 static struct pmu perf_cpu_clock = {
10751 .task_ctx_nr = perf_sw_context,
10752
10753 .capabilities = PERF_PMU_CAP_NO_NMI,
10754
10755 .event_init = cpu_clock_event_init,
10756 .add = cpu_clock_event_add,
10757 .del = cpu_clock_event_del,
10758 .start = cpu_clock_event_start,
10759 .stop = cpu_clock_event_stop,
10760 .read = cpu_clock_event_read,
10761 };
10762
10763
10764 /*
10765  * Software event: task time clock
10766  */
10767 static void task_clock_event_update(struct perf_event *event, u64 now)
10768 {
10769 u64 prev;
10770 s64 delta;
10771
10772 prev = local64_xchg(&event->hw.prev_count, now);
10773 delta = now - prev;
10774 local64_add(delta, &event->count);
10775 }
10776
10777 static void task_clock_event_start(struct perf_event *event, int flags)
10778 {
10779 local64_set(&event->hw.prev_count, event->ctx->time);
10780 perf_swevent_start_hrtimer(event);
10781 }
10782
10783 static void task_clock_event_stop(struct perf_event *event, int flags)
10784 {
10785 perf_swevent_cancel_hrtimer(event);
10786 task_clock_event_update(event, event->ctx->time);
10787 }
10788
10789 static int task_clock_event_add(struct perf_event *event, int flags)
10790 {
10791 if (flags & PERF_EF_START)
10792 task_clock_event_start(event, flags);
10793 perf_event_update_userpage(event);
10794
10795 return 0;
10796 }
10797
10798 static void task_clock_event_del(struct perf_event *event, int flags)
10799 {
10800 task_clock_event_stop(event, PERF_EF_UPDATE);
10801 }
10802
10803 static void task_clock_event_read(struct perf_event *event)
10804 {
10805 u64 now = perf_clock();
10806 u64 delta = now - event->ctx->timestamp;
10807 u64 time = event->ctx->time + delta;
10808
10809 task_clock_event_update(event, time);
10810 }
10811
10812 static int task_clock_event_init(struct perf_event *event)
10813 {
10814 if (event->attr.type != PERF_TYPE_SOFTWARE)
10815 return -ENOENT;
10816
10817 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
10818 return -ENOENT;
10819
10820 /*
10821  * no branch sampling for software events
10822  */
10823 if (has_branch_stack(event))
10824 return -EOPNOTSUPP;
10825
10826 perf_swevent_init_hrtimer(event);
10827
10828 return 0;
10829 }
10830
10831 static struct pmu perf_task_clock = {
10832 .task_ctx_nr = perf_sw_context,
10833
10834 .capabilities = PERF_PMU_CAP_NO_NMI,
10835
10836 .event_init = task_clock_event_init,
10837 .add = task_clock_event_add,
10838 .del = task_clock_event_del,
10839 .start = task_clock_event_start,
10840 .stop = task_clock_event_stop,
10841 .read = task_clock_event_read,
10842 };
10843
10844 static void perf_pmu_nop_void(struct pmu *pmu)
10845 {
10846 }
10847
10848 static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
10849 {
10850 }
10851
10852 static int perf_pmu_nop_int(struct pmu *pmu)
10853 {
10854 return 0;
10855 }
10856
10857 static int perf_event_nop_int(struct perf_event *event, u64 value)
10858 {
10859 return 0;
10860 }
10861
10862 static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
10863
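/*
 * Default transaction helpers, used when a PMU provides
 * pmu_enable()/pmu_disable() but no ->start_txn(): a PERF_PMU_TXN_ADD
 * transaction simply brackets the ->add() calls with a single
 * disable/enable pair; all other transaction types are no-ops.
 */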
10864 static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
10865 {
10866 __this_cpu_write(nop_txn_flags, flags);
10867
10868 if (flags & ~PERF_PMU_TXN_ADD)
10869 return;
10870
10871 perf_pmu_disable(pmu);
10872 }
10873
10874 static int perf_pmu_commit_txn(struct pmu *pmu)
10875 {
10876 unsigned int flags = __this_cpu_read(nop_txn_flags);
10877
10878 __this_cpu_write(nop_txn_flags, 0);
10879
10880 if (flags & ~PERF_PMU_TXN_ADD)
10881 return 0;
10882
10883 perf_pmu_enable(pmu);
10884 return 0;
10885 }
10886
10887 static void perf_pmu_cancel_txn(struct pmu *pmu)
10888 {
10889 unsigned int flags = __this_cpu_read(nop_txn_flags);
10890
10891 __this_cpu_write(nop_txn_flags, 0);
10892
10893 if (flags & ~PERF_PMU_TXN_ADD)
10894 return;
10895
10896 perf_pmu_enable(pmu);
10897 }
10898
10899 static int perf_event_idx_default(struct perf_event *event)
10900 {
10901 return 0;
10902 }
10903
10904 /*
10905  * Ensures all contexts with the same task_ctx_nr have the same
10906  * pmu_cpu_context too.
10907  */
10908 static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
10909 {
10910 struct pmu *pmu;
10911
10912 if (ctxn < 0)
10913 return NULL;
10914
10915 list_for_each_entry(pmu, &pmus, entry) {
10916 if (pmu->task_ctx_nr == ctxn)
10917 return pmu->pmu_cpu_context;
10918 }
10919
10920 return NULL;
10921 }
10922
10923 static void free_pmu_context(struct pmu *pmu)
10924 {
10925 /*
10926  * Static contexts such as perf_sw_context have a global lifetime
10927  * and may be shared between different PMUs. Avoid freeing them
10928  * when part of a PMU is still in use.
10929  */
10930 if (pmu->task_ctx_nr > perf_invalid_context)
10931 return;
10932
10933 free_percpu(pmu->pmu_cpu_context);
10934 }
10935
10936
10937
10938
10939 static ssize_t nr_addr_filters_show(struct device *dev,
10940 struct device_attribute *attr,
10941 char *page)
10942 {
10943 struct pmu *pmu = dev_get_drvdata(dev);
10944
10945 return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
10946 }
10947 DEVICE_ATTR_RO(nr_addr_filters);
10948
10949 static struct idr pmu_idr;
10950
10951 static ssize_t
10952 type_show(struct device *dev, struct device_attribute *attr, char *page)
10953 {
10954 struct pmu *pmu = dev_get_drvdata(dev);
10955
10956 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
10957 }
10958 static DEVICE_ATTR_RO(type);
10959
10960 static ssize_t
10961 perf_event_mux_interval_ms_show(struct device *dev,
10962 struct device_attribute *attr,
10963 char *page)
10964 {
10965 struct pmu *pmu = dev_get_drvdata(dev);
10966
10967 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
10968 }
10969
10970 static DEFINE_MUTEX(mux_interval_mutex);
10971
10972 static ssize_t
10973 perf_event_mux_interval_ms_store(struct device *dev,
10974 struct device_attribute *attr,
10975 const char *buf, size_t count)
10976 {
10977 struct pmu *pmu = dev_get_drvdata(dev);
10978 int timer, cpu, ret;
10979
10980 ret = kstrtoint(buf, 0, &timer);
10981 if (ret)
10982 return ret;
10983
10984 if (timer < 1)
10985 return -EINVAL;
10986
10987 /* same value, nothing changed */
10988 if (timer == pmu->hrtimer_interval_ms)
10989 return count;
10990
10991 mutex_lock(&mux_interval_mutex);
10992 pmu->hrtimer_interval_ms = timer;
10993
10994
10995 cpus_read_lock();
10996 for_each_online_cpu(cpu) {
10997 struct perf_cpu_context *cpuctx;
10998 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
10999 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
11000
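/*
 * Restart the multiplexing hrtimer on the target CPU itself so the
 * new interval takes effect immediately.
 */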
11001 cpu_function_call(cpu,
11002 (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
11003 }
11004 cpus_read_unlock();
11005 mutex_unlock(&mux_interval_mutex);
11006
11007 return count;
11008 }
11009 static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
11010
11011 static struct attribute *pmu_dev_attrs[] = {
11012 &dev_attr_type.attr,
11013 &dev_attr_perf_event_mux_interval_ms.attr,
11014 NULL,
11015 };
11016 ATTRIBUTE_GROUPS(pmu_dev);
11017
11018 static int pmu_bus_running;
11019 static struct bus_type pmu_bus = {
11020 .name = "event_source",
11021 .dev_groups = pmu_dev_groups,
11022 };
11023
11024 static void pmu_dev_release(struct device *dev)
11025 {
11026 kfree(dev);
11027 }
11028
11029 static int pmu_dev_alloc(struct pmu *pmu)
11030 {
11031 int ret = -ENOMEM;
11032
11033 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
11034 if (!pmu->dev)
11035 goto out;
11036
11037 pmu->dev->groups = pmu->attr_groups;
11038 device_initialize(pmu->dev);
11039 ret = dev_set_name(pmu->dev, "%s", pmu->name);
11040 if (ret)
11041 goto free_dev;
11042
11043 dev_set_drvdata(pmu->dev, pmu);
11044 pmu->dev->bus = &pmu_bus;
11045 pmu->dev->release = pmu_dev_release;
11046 ret = device_add(pmu->dev);
11047 if (ret)
11048 goto free_dev;
11049
11050
11051 if (pmu->nr_addr_filters)
11052 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
11053
11054 if (ret)
11055 goto del_dev;
11056
11057 if (pmu->attr_update)
11058 ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
11059
11060 if (ret)
11061 goto del_dev;
11062
11063 out:
11064 return ret;
11065
11066 del_dev:
11067 device_del(pmu->dev);
11068
11069 free_dev:
11070 put_device(pmu->dev);
11071 goto out;
11072 }
11073
11074 static struct lock_class_key cpuctx_mutex;
11075 static struct lock_class_key cpuctx_lock;
11076
11077 int perf_pmu_register(struct pmu *pmu, const char *name, int type)
11078 {
11079 int cpu, ret, max = PERF_TYPE_MAX;
11080
11081 mutex_lock(&pmus_lock);
11082 ret = -ENOMEM;
11083 pmu->pmu_disable_count = alloc_percpu(int);
11084 if (!pmu->pmu_disable_count)
11085 goto unlock;
11086
11087 pmu->type = -1;
11088 if (!name)
11089 goto skip_type;
11090 pmu->name = name;
11091
11092 if (type != PERF_TYPE_SOFTWARE) {
11093 if (type >= 0)
11094 max = type;
11095
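/*
 * If the caller asked for a specific type, start the IDR search at
 * that id (the WARN_ON below fires if it was already taken);
 * otherwise hand out a dynamic id at or above PERF_TYPE_MAX so it
 * cannot collide with the fixed types.
 */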
11096 ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
11097 if (ret < 0)
11098 goto free_pdc;
11099
11100 WARN_ON(type >= 0 && ret != type);
11101
11102 type = ret;
11103 }
11104 pmu->type = type;
11105
11106 if (pmu_bus_running) {
11107 ret = pmu_dev_alloc(pmu);
11108 if (ret)
11109 goto free_idr;
11110 }
11111
11112 skip_type:
11113 if (pmu->task_ctx_nr == perf_hw_context) {
11114 static int hw_context_taken = 0;
11115
11116
11117
11118
11119
11120
11121 if (WARN_ON_ONCE(hw_context_taken &&
11122 !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
11123 pmu->task_ctx_nr = perf_invalid_context;
11124
11125 hw_context_taken = 1;
11126 }
11127
11128 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
11129 if (pmu->pmu_cpu_context)
11130 goto got_cpu_context;
11131
11132 ret = -ENOMEM;
11133 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
11134 if (!pmu->pmu_cpu_context)
11135 goto free_dev;
11136
11137 for_each_possible_cpu(cpu) {
11138 struct perf_cpu_context *cpuctx;
11139
11140 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11141 __perf_event_init_context(&cpuctx->ctx);
11142 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
11143 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
11144 cpuctx->ctx.pmu = pmu;
11145 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
11146
11147 __perf_mux_hrtimer_init(cpuctx, cpu);
11148
11149 cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
11150 cpuctx->heap = cpuctx->heap_default;
11151 }
11152
11153 got_cpu_context:
11154 if (!pmu->start_txn) {
11155 if (pmu->pmu_enable) {
11156
11157
11158
11159
11160
11161 pmu->start_txn = perf_pmu_start_txn;
11162 pmu->commit_txn = perf_pmu_commit_txn;
11163 pmu->cancel_txn = perf_pmu_cancel_txn;
11164 } else {
11165 pmu->start_txn = perf_pmu_nop_txn;
11166 pmu->commit_txn = perf_pmu_nop_int;
11167 pmu->cancel_txn = perf_pmu_nop_void;
11168 }
11169 }
11170
11171 if (!pmu->pmu_enable) {
11172 pmu->pmu_enable = perf_pmu_nop_void;
11173 pmu->pmu_disable = perf_pmu_nop_void;
11174 }
11175
11176 if (!pmu->check_period)
11177 pmu->check_period = perf_event_nop_int;
11178
11179 if (!pmu->event_idx)
11180 pmu->event_idx = perf_event_idx_default;
11181
11182 /*
11183  * Ensure the TYPE_SOFTWARE PMUs are at the head of the list, since
11184  * they cannot be in the IDR; this keeps the linear search in
11185  * perf_init_event() fast for software events.
11186  */
11187 if (type == PERF_TYPE_SOFTWARE || !name)
11188 list_add_rcu(&pmu->entry, &pmus);
11189 else
11190 list_add_tail_rcu(&pmu->entry, &pmus);
11191
11192 atomic_set(&pmu->exclusive_cnt, 0);
11193 ret = 0;
11194 unlock:
11195 mutex_unlock(&pmus_lock);
11196
11197 return ret;
11198
11199 free_dev:
11200 device_del(pmu->dev);
11201 put_device(pmu->dev);
11202
11203 free_idr:
11204 if (pmu->type != PERF_TYPE_SOFTWARE)
11205 idr_remove(&pmu_idr, pmu->type);
11206
11207 free_pdc:
11208 free_percpu(pmu->pmu_disable_count);
11209 goto unlock;
11210 }
11211 EXPORT_SYMBOL_GPL(perf_pmu_register);
11212
11213 void perf_pmu_unregister(struct pmu *pmu)
11214 {
11215 mutex_lock(&pmus_lock);
11216 list_del_rcu(&pmu->entry);
11217
11218 /*
11219  * We dereference the pmu list under both SRCU and regular RCU, so
11220  * synchronize against both of those.
11221  */
11222 synchronize_srcu(&pmus_srcu);
11223 synchronize_rcu();
11224
11225 free_percpu(pmu->pmu_disable_count);
11226 if (pmu->type != PERF_TYPE_SOFTWARE)
11227 idr_remove(&pmu_idr, pmu->type);
11228 if (pmu_bus_running) {
11229 if (pmu->nr_addr_filters)
11230 device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
11231 device_del(pmu->dev);
11232 put_device(pmu->dev);
11233 }
11234 free_pmu_context(pmu);
11235 mutex_unlock(&pmus_lock);
11236 }
11237 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
11238
11239 static inline bool has_extended_regs(struct perf_event *event)
11240 {
11241 return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
11242 (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
11243 }
11244
11245 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
11246 {
11247 struct perf_event_context *ctx = NULL;
11248 int ret;
11249
11250 if (!try_module_get(pmu->module))
11251 return -ENODEV;
11252
11253 /*
11254  * A number of pmu->event_init() implementations iterate the
11255  * sibling_list to, for example, validate if the group fits on the
11256  * PMU. Therefore, if this is a sibling event, acquire the group
11257  * leader's ctx::mutex to protect the sibling_list.
11258  */
11259 if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
11260
11261
11262
11263
11264 ctx = perf_event_ctx_lock_nested(event->group_leader,
11265 SINGLE_DEPTH_NESTING);
11266 BUG_ON(!ctx);
11267 }
11268
11269 event->pmu = pmu;
11270 ret = pmu->event_init(event);
11271
11272 if (ctx)
11273 perf_event_ctx_unlock(event->group_leader, ctx);
11274
11275 if (!ret) {
11276 if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
11277 has_extended_regs(event))
11278 ret = -EOPNOTSUPP;
11279
11280 if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
11281 event_has_any_exclude_flag(event))
11282 ret = -EINVAL;
11283
11284 if (ret && event->destroy)
11285 event->destroy(event);
11286 }
11287
11288 if (ret)
11289 module_put(pmu->module);
11290
11291 return ret;
11292 }
11293
11294 static struct pmu *perf_init_event(struct perf_event *event)
11295 {
11296 bool extended_type = false;
11297 int idx, type, ret;
11298 struct pmu *pmu;
11299
11300 idx = srcu_read_lock(&pmus_srcu);
11301
11302
11303 if (event->parent && event->parent->pmu) {
11304 pmu = event->parent->pmu;
11305 ret = perf_try_init_event(pmu, event);
11306 if (!ret)
11307 goto unlock;
11308 }
11309
11310 /*
11311  * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
11312  * are often aliases for PERF_TYPE_RAW.
11313  */
11314 type = event->attr.type;
11315 if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) {
11316 type = event->attr.config >> PERF_PMU_TYPE_SHIFT;
11317 if (!type) {
11318 type = PERF_TYPE_RAW;
11319 } else {
11320 extended_type = true;
11321 event->attr.config &= PERF_HW_EVENT_MASK;
11322 }
11323 }
11324
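/*
 * Fast path: look the type up in pmu_idr. If that yields nothing,
 * fall back to the linear walk over the pmus list further down;
 * software PMUs sit at the head of that list, keeping the walk cheap.
 */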
11325 again:
11326 rcu_read_lock();
11327 pmu = idr_find(&pmu_idr, type);
11328 rcu_read_unlock();
11329 if (pmu) {
11330 if (event->attr.type != type && type != PERF_TYPE_RAW &&
11331 !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
11332 goto fail;
11333
11334 ret = perf_try_init_event(pmu, event);
11335 if (ret == -ENOENT && event->attr.type != type && !extended_type) {
11336 type = event->attr.type;
11337 goto again;
11338 }
11339
11340 if (ret)
11341 pmu = ERR_PTR(ret);
11342
11343 goto unlock;
11344 }
11345
11346 list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
11347 ret = perf_try_init_event(pmu, event);
11348 if (!ret)
11349 goto unlock;
11350
11351 if (ret != -ENOENT) {
11352 pmu = ERR_PTR(ret);
11353 goto unlock;
11354 }
11355 }
11356 fail:
11357 pmu = ERR_PTR(-ENOENT);
11358 unlock:
11359 srcu_read_unlock(&pmus_srcu, idx);
11360
11361 return pmu;
11362 }
11363
11364 static void attach_sb_event(struct perf_event *event)
11365 {
11366 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
11367
11368 raw_spin_lock(&pel->lock);
11369 list_add_rcu(&event->sb_list, &pel->list);
11370 raw_spin_unlock(&pel->lock);
11371 }
11372
11373 /*
11374  * We keep a list of all !task (and therefore per-cpu) events
11375  * that need to receive side-band records.
11376  *
11377  * This avoids having to scan all the various PMU per-cpu contexts
11378  * looking for them.
11379  */
11380 static void account_pmu_sb_event(struct perf_event *event)
11381 {
11382 if (is_sb_event(event))
11383 attach_sb_event(event);
11384 }
11385
11386 static void account_event_cpu(struct perf_event *event, int cpu)
11387 {
11388 if (event->parent)
11389 return;
11390
11391 if (is_cgroup_event(event))
11392 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
11393 }
11394
11395
11396 static void account_freq_event_nohz(void)
11397 {
11398 #ifdef CONFIG_NO_HZ_FULL
11399
11400 spin_lock(&nr_freq_lock);
11401 if (atomic_inc_return(&nr_freq_events) == 1)
11402 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
11403 spin_unlock(&nr_freq_lock);
11404 #endif
11405 }
11406
11407 static void account_freq_event(void)
11408 {
11409 if (tick_nohz_full_enabled())
11410 account_freq_event_nohz();
11411 else
11412 atomic_inc(&nr_freq_events);
11413 }
11414
11415
11416 static void account_event(struct perf_event *event)
11417 {
11418 bool inc = false;
11419
11420 if (event->parent)
11421 return;
11422
11423 if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
11424 inc = true;
11425 if (event->attr.mmap || event->attr.mmap_data)
11426 atomic_inc(&nr_mmap_events);
11427 if (event->attr.build_id)
11428 atomic_inc(&nr_build_id_events);
11429 if (event->attr.comm)
11430 atomic_inc(&nr_comm_events);
11431 if (event->attr.namespaces)
11432 atomic_inc(&nr_namespaces_events);
11433 if (event->attr.cgroup)
11434 atomic_inc(&nr_cgroup_events);
11435 if (event->attr.task)
11436 atomic_inc(&nr_task_events);
11437 if (event->attr.freq)
11438 account_freq_event();
11439 if (event->attr.context_switch) {
11440 atomic_inc(&nr_switch_events);
11441 inc = true;
11442 }
11443 if (has_branch_stack(event))
11444 inc = true;
11445 if (is_cgroup_event(event))
11446 inc = true;
11447 if (event->attr.ksymbol)
11448 atomic_inc(&nr_ksymbol_events);
11449 if (event->attr.bpf_event)
11450 atomic_inc(&nr_bpf_events);
11451 if (event->attr.text_poke)
11452 atomic_inc(&nr_text_poke_events);
11453
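/*
 * Events that hook into the context-switch path (task and cgroup
 * events, branch stack sampling, context_switch records) also need
 * the perf_sched_events static key enabled below.
 */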
11454 if (inc) {
11455
11456
11457
11458
11459
11460 if (atomic_inc_not_zero(&perf_sched_count))
11461 goto enabled;
11462
11463 mutex_lock(&perf_sched_mutex);
11464 if (!atomic_read(&perf_sched_count)) {
11465 static_branch_enable(&perf_sched_events);
11466
11467
11468
11469
11470
11471 synchronize_rcu();
11472 }
11473
11474
11475
11476
11477 atomic_inc(&perf_sched_count);
11478 mutex_unlock(&perf_sched_mutex);
11479 }
11480 enabled:
11481
11482 account_event_cpu(event, event->cpu);
11483
11484 account_pmu_sb_event(event);
11485 }
11486
11487
11488
11489
11490 static struct perf_event *
11491 perf_event_alloc(struct perf_event_attr *attr, int cpu,
11492 struct task_struct *task,
11493 struct perf_event *group_leader,
11494 struct perf_event *parent_event,
11495 perf_overflow_handler_t overflow_handler,
11496 void *context, int cgroup_fd)
11497 {
11498 struct pmu *pmu;
11499 struct perf_event *event;
11500 struct hw_perf_event *hwc;
11501 long err = -EINVAL;
11502 int node;
11503
11504 if ((unsigned)cpu >= nr_cpu_ids) {
11505 if (!task || cpu != -1)
11506 return ERR_PTR(-EINVAL);
11507 }
11508 if (attr->sigtrap && !task) {
11509
11510 return ERR_PTR(-EINVAL);
11511 }
11512
11513 node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
11514 event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO,
11515 node);
11516 if (!event)
11517 return ERR_PTR(-ENOMEM);
11518
11519
11520
11521
11522
11523 if (!group_leader)
11524 group_leader = event;
11525
11526 mutex_init(&event->child_mutex);
11527 INIT_LIST_HEAD(&event->child_list);
11528
11529 INIT_LIST_HEAD(&event->event_entry);
11530 INIT_LIST_HEAD(&event->sibling_list);
11531 INIT_LIST_HEAD(&event->active_list);
11532 init_event_group(event);
11533 INIT_LIST_HEAD(&event->rb_entry);
11534 INIT_LIST_HEAD(&event->active_entry);
11535 INIT_LIST_HEAD(&event->addr_filters.list);
11536 INIT_HLIST_NODE(&event->hlist_entry);
11537
11538
11539 init_waitqueue_head(&event->waitq);
11540 event->pending_disable = -1;
11541 init_irq_work(&event->pending, perf_pending_event);
11542
11543 mutex_init(&event->mmap_mutex);
11544 raw_spin_lock_init(&event->addr_filters.lock);
11545
11546 atomic_long_set(&event->refcount, 1);
11547 event->cpu = cpu;
11548 event->attr = *attr;
11549 event->group_leader = group_leader;
11550 event->pmu = NULL;
11551 event->oncpu = -1;
11552
11553 event->parent = parent_event;
11554
11555 event->ns = get_pid_ns(task_active_pid_ns(current));
11556 event->id = atomic64_inc_return(&perf_event_id);
11557
11558 event->state = PERF_EVENT_STATE_INACTIVE;
11559
11560 if (parent_event)
11561 event->event_caps = parent_event->event_caps;
11562
11563 if (event->attr.sigtrap)
11564 atomic_set(&event->event_limit, 1);
11565
11566 if (task) {
11567 event->attach_state = PERF_ATTACH_TASK;
11568
11569
11570
11571
11572
11573 event->hw.target = get_task_struct(task);
11574 }
11575
11576 event->clock = &local_clock;
11577 if (parent_event)
11578 event->clock = parent_event->clock;
11579
11580 if (!overflow_handler && parent_event) {
11581 overflow_handler = parent_event->overflow_handler;
11582 context = parent_event->overflow_handler_context;
11583 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
11584 if (overflow_handler == bpf_overflow_handler) {
11585 struct bpf_prog *prog = parent_event->prog;
11586
11587 bpf_prog_inc(prog);
11588 event->prog = prog;
11589 event->orig_overflow_handler =
11590 parent_event->orig_overflow_handler;
11591 }
11592 #endif
11593 }
11594
11595 if (overflow_handler) {
11596 event->overflow_handler = overflow_handler;
11597 event->overflow_handler_context = context;
11598 } else if (is_write_backward(event)) {
11599 event->overflow_handler = perf_event_output_backward;
11600 event->overflow_handler_context = NULL;
11601 } else {
11602 event->overflow_handler = perf_event_output_forward;
11603 event->overflow_handler_context = NULL;
11604 }
11605
11606 perf_event__state_init(event);
11607
11608 pmu = NULL;
11609
11610 hwc = &event->hw;
11611 hwc->sample_period = attr->sample_period;
11612 if (attr->freq && attr->sample_freq)
11613 hwc->sample_period = 1;
11614 hwc->last_period = hwc->sample_period;
11615
11616 local64_set(&hwc->period_left, hwc->sample_period);
11617
11618 /*
11619  * We currently do not support PERF_SAMPLE_READ on inherited
11620  * events. See perf_output_read().
11621  */
11622 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
11623 goto err_ns;
11624
11625 if (!has_branch_stack(event))
11626 event->attr.branch_sample_type = 0;
11627
11628 pmu = perf_init_event(event);
11629 if (IS_ERR(pmu)) {
11630 err = PTR_ERR(pmu);
11631 goto err_ns;
11632 }
11633
11634
11635
11636
11637
11638 if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
11639 err = -EINVAL;
11640 goto err_pmu;
11641 }
11642
11643 if (event->attr.aux_output &&
11644 !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
11645 err = -EOPNOTSUPP;
11646 goto err_pmu;
11647 }
11648
11649 if (cgroup_fd != -1) {
11650 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
11651 if (err)
11652 goto err_pmu;
11653 }
11654
11655 err = exclusive_event_init(event);
11656 if (err)
11657 goto err_pmu;
11658
11659 if (has_addr_filter(event)) {
11660 event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
11661 sizeof(struct perf_addr_filter_range),
11662 GFP_KERNEL);
11663 if (!event->addr_filter_ranges) {
11664 err = -ENOMEM;
11665 goto err_per_task;
11666 }
11667
11668
11669
11670
11671
11672 if (event->parent) {
11673 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
11674
11675 raw_spin_lock_irq(&ifh->lock);
11676 memcpy(event->addr_filter_ranges,
11677 event->parent->addr_filter_ranges,
11678 pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
11679 raw_spin_unlock_irq(&ifh->lock);
11680 }
11681
11682
11683 event->addr_filters_gen = 1;
11684 }
11685
11686 if (!event->parent) {
11687 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
11688 err = get_callchain_buffers(attr->sample_max_stack);
11689 if (err)
11690 goto err_addr_filters;
11691 }
11692 }
11693
11694 err = security_perf_event_alloc(event);
11695 if (err)
11696 goto err_callchain_buffer;
11697
11698
11699 account_event(event);
11700
11701 return event;
11702
11703 err_callchain_buffer:
11704 if (!event->parent) {
11705 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
11706 put_callchain_buffers();
11707 }
11708 err_addr_filters:
11709 kfree(event->addr_filter_ranges);
11710
11711 err_per_task:
11712 exclusive_event_destroy(event);
11713
11714 err_pmu:
11715 if (is_cgroup_event(event))
11716 perf_detach_cgroup(event);
11717 if (event->destroy)
11718 event->destroy(event);
11719 module_put(pmu->module);
11720 err_ns:
11721 if (event->ns)
11722 put_pid_ns(event->ns);
11723 if (event->hw.target)
11724 put_task_struct(event->hw.target);
11725 kmem_cache_free(perf_event_cache, event);
11726
11727 return ERR_PTR(err);
11728 }
11729
11730 static int perf_copy_attr(struct perf_event_attr __user *uattr,
11731 struct perf_event_attr *attr)
11732 {
11733 u32 size;
11734 int ret;
11735
11736
11737 memset(attr, 0, sizeof(*attr));
11738
11739 ret = get_user(size, &uattr->size);
11740 if (ret)
11741 return ret;
11742
11743
11744 if (!size)
11745 size = PERF_ATTR_SIZE_VER0;
11746 if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
11747 goto err_size;
11748
11749 ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
11750 if (ret) {
11751 if (ret == -E2BIG)
11752 goto err_size;
11753 return ret;
11754 }
11755
11756 attr->size = size;
11757
11758 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
11759 return -EINVAL;
11760
11761 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
11762 return -EINVAL;
11763
11764 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
11765 return -EINVAL;
11766
11767 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
11768 u64 mask = attr->branch_sample_type;
11769
11770
11771 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
11772 return -EINVAL;
11773
11774
11775 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
11776 return -EINVAL;
11777
11778
11779 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
11780
11781
11782 if (!attr->exclude_kernel)
11783 mask |= PERF_SAMPLE_BRANCH_KERNEL;
11784
11785 if (!attr->exclude_user)
11786 mask |= PERF_SAMPLE_BRANCH_USER;
11787
11788 if (!attr->exclude_hv)
11789 mask |= PERF_SAMPLE_BRANCH_HV;
11790
11791
11792
11793 attr->branch_sample_type = mask;
11794 }
11795
11796 if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
11797 ret = perf_allow_kernel(attr);
11798 if (ret)
11799 return ret;
11800 }
11801 }
11802
11803 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
11804 ret = perf_reg_validate(attr->sample_regs_user);
11805 if (ret)
11806 return ret;
11807 }
11808
11809 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
11810 if (!arch_perf_have_user_stack_dump())
11811 return -ENOSYS;
11812
11813 /*
11814  * We have __u32 type for the size, but so far
11815  * we can only use __u16 as maximum due to the
11816  * __u16 sample size limit.
11817  */
11818 if (attr->sample_stack_user >= USHRT_MAX)
11819 return -EINVAL;
11820 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
11821 return -EINVAL;
11822 }
11823
11824 if (!attr->sample_max_stack)
11825 attr->sample_max_stack = sysctl_perf_event_max_stack;
11826
11827 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
11828 ret = perf_reg_validate(attr->sample_regs_intr);
11829
11830 #ifndef CONFIG_CGROUP_PERF
11831 if (attr->sample_type & PERF_SAMPLE_CGROUP)
11832 return -EINVAL;
11833 #endif
11834 if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
11835 (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
11836 return -EINVAL;
11837
11838 if (!attr->inherit && attr->inherit_thread)
11839 return -EINVAL;
11840
11841 if (attr->remove_on_exec && attr->enable_on_exec)
11842 return -EINVAL;
11843
11844 if (attr->sigtrap && !attr->remove_on_exec)
11845 return -EINVAL;
11846
11847 out:
11848 return ret;
11849
11850 err_size:
11851 put_user(sizeof(*attr), &uattr->size);
11852 ret = -E2BIG;
11853 goto out;
11854 }
11855
11856 static void mutex_lock_double(struct mutex *a, struct mutex *b)
11857 {
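/* Always lock the lower-addressed mutex first for a stable lock order. */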
11858 if (b < a)
11859 swap(a, b);
11860
11861 mutex_lock(a);
11862 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
11863 }
11864
11865 static int
11866 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
11867 {
11868 struct perf_buffer *rb = NULL;
11869 int ret = -EINVAL;
11870
11871 if (!output_event) {
11872 mutex_lock(&event->mmap_mutex);
11873 goto set;
11874 }
11875
11876
11877 if (event == output_event)
11878 goto out;
11879
11880
11881
11882
11883 if (output_event->cpu != event->cpu)
11884 goto out;
11885
11886
11887
11888
11889 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
11890 goto out;
11891
11892
11893
11894
11895 if (output_event->clock != event->clock)
11896 goto out;
11897
11898
11899
11900
11901
11902 if (is_write_backward(output_event) != is_write_backward(event))
11903 goto out;
11904
11905
11906
11907
11908 if (has_aux(event) && has_aux(output_event) &&
11909 event->pmu != output_event->pmu)
11910 goto out;
11911
11912
11913
11914
11915
11916
11917
11918
11919 mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
11920 set:
11921
11922 if (atomic_read(&event->mmap_count))
11923 goto unlock;
11924
11925 if (output_event) {
11926
11927 rb = ring_buffer_get(output_event);
11928 if (!rb)
11929 goto unlock;
11930
11931
11932 if (!atomic_read(&rb->mmap_count)) {
11933 ring_buffer_put(rb);
11934 goto unlock;
11935 }
11936 }
11937
11938 ring_buffer_attach(event, rb);
11939
11940 ret = 0;
11941 unlock:
11942 mutex_unlock(&event->mmap_mutex);
11943 if (output_event)
11944 mutex_unlock(&output_event->mmap_mutex);
11945
11946 out:
11947 return ret;
11948 }
11949
11950 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
11951 {
11952 bool nmi_safe = false;
11953
11954 switch (clk_id) {
11955 case CLOCK_MONOTONIC:
11956 event->clock = &ktime_get_mono_fast_ns;
11957 nmi_safe = true;
11958 break;
11959
11960 case CLOCK_MONOTONIC_RAW:
11961 event->clock = &ktime_get_raw_fast_ns;
11962 nmi_safe = true;
11963 break;
11964
11965 case CLOCK_REALTIME:
11966 event->clock = &ktime_get_real_ns;
11967 break;
11968
11969 case CLOCK_BOOTTIME:
11970 event->clock = &ktime_get_boottime_ns;
11971 break;
11972
11973 case CLOCK_TAI:
11974 event->clock = &ktime_get_clocktai_ns;
11975 break;
11976
11977 default:
11978 return -EINVAL;
11979 }
11980
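/*
 * Clocks that are not NMI-safe may only be used with PMUs that never
 * sample from NMI context (PERF_PMU_CAP_NO_NMI).
 */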
11981 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
11982 return -EINVAL;
11983
11984 return 0;
11985 }
11986
11987 /*
11988  * Variation on perf_event_ctx_lock_nested(), except we take two
11989  * context mutexes.
11990  */
11991 static struct perf_event_context *
11992 __perf_event_ctx_lock_double(struct perf_event *group_leader,
11993 struct perf_event_context *ctx)
11994 {
11995 struct perf_event_context *gctx;
11996
11997 again:
11998 rcu_read_lock();
11999 gctx = READ_ONCE(group_leader->ctx);
12000 if (!refcount_inc_not_zero(&gctx->refcount)) {
12001 rcu_read_unlock();
12002 goto again;
12003 }
12004 rcu_read_unlock();
12005
12006 mutex_lock_double(&gctx->mutex, &ctx->mutex);
12007
12008 if (group_leader->ctx != gctx) {
12009 mutex_unlock(&ctx->mutex);
12010 mutex_unlock(&gctx->mutex);
12011 put_ctx(gctx);
12012 goto again;
12013 }
12014
12015 return gctx;
12016 }
12017
12018 static bool
12019 perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
12020 {
12021 unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
12022 bool is_capable = perfmon_capable();
12023
12024 if (attr->sigtrap) {
12025
12026
12027
12028
12029 rcu_read_lock();
12030 is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
12031 rcu_read_unlock();
12032
12033
12034
12035
12036
12037
12038 ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
12039 }
12040
12041
12042
12043
12044
12045
12046 return is_capable || ptrace_may_access(task, ptrace_mode);
12047 }
12048
12049 /**
12050  * sys_perf_event_open - open a performance event, associate it to a task/cpu
12051  *
12052  * @attr_uptr:	event_id type attributes for monitoring/sampling
12053  * @pid:		target pid
12054  * @cpu:		target cpu
12055  * @group_fd:		group leader event fd
12056  * @flags:		perf event open flags
12057  */
12058 SYSCALL_DEFINE5(perf_event_open,
12059 struct perf_event_attr __user *, attr_uptr,
12060 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
12061 {
12062 struct perf_event *group_leader = NULL, *output_event = NULL;
12063 struct perf_event *event, *sibling;
12064 struct perf_event_attr attr;
12065 struct perf_event_context *ctx, *gctx;
12066 struct file *event_file = NULL;
12067 struct fd group = {NULL, 0};
12068 struct task_struct *task = NULL;
12069 struct pmu *pmu;
12070 int event_fd;
12071 int move_group = 0;
12072 int err;
12073 int f_flags = O_RDWR;
12074 int cgroup_fd = -1;
12075
12076
12077 if (flags & ~PERF_FLAG_ALL)
12078 return -EINVAL;
12079
12080
12081 err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
12082 if (err)
12083 return err;
12084
12085 err = perf_copy_attr(attr_uptr, &attr);
12086 if (err)
12087 return err;
12088
12089 if (!attr.exclude_kernel) {
12090 err = perf_allow_kernel(&attr);
12091 if (err)
12092 return err;
12093 }
12094
12095 if (attr.namespaces) {
12096 if (!perfmon_capable())
12097 return -EACCES;
12098 }
12099
12100 if (attr.freq) {
12101 if (attr.sample_freq > sysctl_perf_event_sample_rate)
12102 return -EINVAL;
12103 } else {
12104 if (attr.sample_period & (1ULL << 63))
12105 return -EINVAL;
12106 }
12107
12108
12109 if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
12110 err = perf_allow_kernel(&attr);
12111 if (err)
12112 return err;
12113 }
12114
12115
12116 if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
12117 err = security_locked_down(LOCKDOWN_PERF);
12118 if (err)
12119 return err;
12120 }
12121
12122 /*
12123  * In cgroup mode, the pid argument is used to pass the fd
12124  * opened to the cgroup directory in cgroupfs. The cpu argument
12125  * designates the cpu on which to monitor threads from that
12126  * cgroup.
12127  */
12128 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
12129 return -EINVAL;
12130
12131 if (flags & PERF_FLAG_FD_CLOEXEC)
12132 f_flags |= O_CLOEXEC;
12133
12134 event_fd = get_unused_fd_flags(f_flags);
12135 if (event_fd < 0)
12136 return event_fd;
12137
12138 if (group_fd != -1) {
12139 err = perf_fget_light(group_fd, &group);
12140 if (err)
12141 goto err_fd;
12142 group_leader = group.file->private_data;
12143 if (flags & PERF_FLAG_FD_OUTPUT)
12144 output_event = group_leader;
12145 if (flags & PERF_FLAG_FD_NO_GROUP)
12146 group_leader = NULL;
12147 }
12148
12149 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
12150 task = find_lively_task_by_vpid(pid);
12151 if (IS_ERR(task)) {
12152 err = PTR_ERR(task);
12153 goto err_group_fd;
12154 }
12155 }
12156
12157 if (task && group_leader &&
12158 group_leader->attr.inherit != attr.inherit) {
12159 err = -EINVAL;
12160 goto err_task;
12161 }
12162
12163 if (flags & PERF_FLAG_PID_CGROUP)
12164 cgroup_fd = pid;
12165
12166 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
12167 NULL, NULL, cgroup_fd);
12168 if (IS_ERR(event)) {
12169 err = PTR_ERR(event);
12170 goto err_task;
12171 }
12172
12173 if (is_sampling_event(event)) {
12174 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
12175 err = -EOPNOTSUPP;
12176 goto err_alloc;
12177 }
12178 }
12179
12180
12181
12182
12183
12184 pmu = event->pmu;
12185
12186 if (attr.use_clockid) {
12187 err = perf_event_set_clock(event, attr.clockid);
12188 if (err)
12189 goto err_alloc;
12190 }
12191
12192 if (pmu->task_ctx_nr == perf_sw_context)
12193 event->event_caps |= PERF_EV_CAP_SOFTWARE;
12194
12195 if (group_leader) {
12196 if (is_software_event(event) &&
12197 !in_software_context(group_leader)) {
12198
12199
12200
12201
12202
12203
12204
12205
12206 pmu = group_leader->ctx->pmu;
12207 } else if (!is_software_event(event) &&
12208 is_software_event(group_leader) &&
12209 (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
12210
12211
12212
12213
12214
12215 move_group = 1;
12216 }
12217 }
12218
12219
12220
12221
12222 ctx = find_get_context(pmu, task, event);
12223 if (IS_ERR(ctx)) {
12224 err = PTR_ERR(ctx);
12225 goto err_alloc;
12226 }
12227
12228
12229
12230
12231 if (group_leader) {
12232 err = -EINVAL;
12233
12234
12235
12236
12237
12238 if (group_leader->group_leader != group_leader)
12239 goto err_context;
12240
12241
12242 if (group_leader->clock != event->clock)
12243 goto err_context;
12244
12245
12246
12247
12248
12249
12250 if (group_leader->cpu != event->cpu)
12251 goto err_context;
12252
12253
12254
12255
12256
12257 if (group_leader->ctx->task != ctx->task)
12258 goto err_context;
12259
12260
12261
12262
12263
12264
12265
12266
12267
12268 if (!move_group && group_leader->ctx != ctx)
12269 goto err_context;
12270
12271
12272
12273
12274 if (attr.exclusive || attr.pinned)
12275 goto err_context;
12276 }
12277
12278 if (output_event) {
12279 err = perf_event_set_output(event, output_event);
12280 if (err)
12281 goto err_context;
12282 }
12283
12284 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
12285 f_flags);
12286 if (IS_ERR(event_file)) {
12287 err = PTR_ERR(event_file);
12288 event_file = NULL;
12289 goto err_context;
12290 }
12291
12292 if (task) {
12293 err = down_read_interruptible(&task->signal->exec_update_lock);
12294 if (err)
12295 goto err_file;
12296
12297
12298
12299
12300
12301
12302
12303 err = -EACCES;
12304 if (!perf_check_permission(&attr, task))
12305 goto err_cred;
12306 }
12307
12308 if (move_group) {
12309 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
12310
12311 if (gctx->task == TASK_TOMBSTONE) {
12312 err = -ESRCH;
12313 goto err_locked;
12314 }
12315
12316
12317
12318
12319
12320 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
12321
12322
12323
12324
12325
12326 if (gctx != ctx) {
12327 err = -EINVAL;
12328 goto err_locked;
12329 } else {
12330 perf_event_ctx_unlock(group_leader, gctx);
12331 move_group = 0;
12332 goto not_move_group;
12333 }
12334 }
12335
12336
12337
12338
12339 err = -EBUSY;
12340 if (!exclusive_event_installable(group_leader, ctx))
12341 goto err_locked;
12342
12343 for_each_sibling_event(sibling, group_leader) {
12344 if (!exclusive_event_installable(sibling, ctx))
12345 goto err_locked;
12346 }
12347 } else {
12348 mutex_lock(&ctx->mutex);
12349
12350
12351
12352
12353
12354 if (group_leader && group_leader->ctx != ctx) {
12355 err = -EINVAL;
12356 goto err_locked;
12357 }
12358 }
12359 not_move_group:
12360
12361 if (ctx->task == TASK_TOMBSTONE) {
12362 err = -ESRCH;
12363 goto err_locked;
12364 }
12365
12366 if (!perf_event_validate_size(event)) {
12367 err = -E2BIG;
12368 goto err_locked;
12369 }
12370
12371 if (!task) {
12372 /*
12373  * Check if the @cpu we're creating an event for is online.
12374  *
12375  * We use the perf_cpu_context::ctx::mutex to serialize against
12376  * the hotplug notifiers. See perf_event_{init,exit}_cpu().
12377  */
12378 struct perf_cpu_context *cpuctx =
12379 container_of(ctx, struct perf_cpu_context, ctx);
12380
12381 if (!cpuctx->online) {
12382 err = -ENODEV;
12383 goto err_locked;
12384 }
12385 }
12386
12387 if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
12388 err = -EINVAL;
12389 goto err_locked;
12390 }
12391
12392
12393
12394
12395
12396 if (!exclusive_event_installable(event, ctx)) {
12397 err = -EBUSY;
12398 goto err_locked;
12399 }
12400
12401 WARN_ON_ONCE(ctx->parent_ctx);
12402
12403 /*
12404  * This is the point of no return; we cannot fail hereafter. This is
12405  * where we start modifying current state.
12406  */
12407
12408 if (move_group) {
12409
12410
12411
12412
12413 perf_remove_from_context(group_leader, 0);
12414 put_ctx(gctx);
12415
12416 for_each_sibling_event(sibling, group_leader) {
12417 perf_remove_from_context(sibling, 0);
12418 put_ctx(gctx);
12419 }
12420
12421
12422
12423
12424
12425 synchronize_rcu();
12426
12427 /*
12428  * Install the group siblings before the group leader.
12429  *
12430  * Because a group leader will try and install the entire group
12431  * (through the sibling list, which is still intact), we can
12432  * end up with siblings installed in the wrong context.
12433  *
12434  * By installing siblings first we NO-OP because they're not
12435  * reachable through the group lists.
12436  */
12437 for_each_sibling_event(sibling, group_leader) {
12438 perf_event__state_init(sibling);
12439 perf_install_in_context(ctx, sibling, sibling->cpu);
12440 get_ctx(ctx);
12441 }
12442
12443
12444
12445
12446
12447
12448 perf_event__state_init(group_leader);
12449 perf_install_in_context(ctx, group_leader, group_leader->cpu);
12450 get_ctx(ctx);
12451 }
12452
12453
12454
12455
12456
12457
12458
12459 perf_event__header_size(event);
12460 perf_event__id_header_size(event);
12461
12462 event->owner = current;
12463
12464 perf_install_in_context(ctx, event, event->cpu);
12465 perf_unpin_context(ctx);
12466
12467 if (move_group)
12468 perf_event_ctx_unlock(group_leader, gctx);
12469 mutex_unlock(&ctx->mutex);
12470
12471 if (task) {
12472 up_read(&task->signal->exec_update_lock);
12473 put_task_struct(task);
12474 }
12475
12476 mutex_lock(&current->perf_event_mutex);
12477 list_add_tail(&event->owner_entry, &current->perf_event_list);
12478 mutex_unlock(&current->perf_event_mutex);
12479
12480
12481
12482
12483
12484
12485
12486 fdput(group);
12487 fd_install(event_fd, event_file);
12488 return event_fd;
12489
12490 err_locked:
12491 if (move_group)
12492 perf_event_ctx_unlock(group_leader, gctx);
12493 mutex_unlock(&ctx->mutex);
12494 err_cred:
12495 if (task)
12496 up_read(&task->signal->exec_update_lock);
12497 err_file:
12498 fput(event_file);
12499 err_context:
12500 perf_unpin_context(ctx);
12501 put_ctx(ctx);
12502 err_alloc:
12503
12504
12505
12506
12507 if (!event_file)
12508 free_event(event);
12509 err_task:
12510 if (task)
12511 put_task_struct(task);
12512 err_group_fd:
12513 fdput(group);
12514 err_fd:
12515 put_unused_fd(event_fd);
12516 return err;
12517 }
12518
12519 /**
12520  * perf_event_create_kernel_counter
12521  *
12522  * @attr: attributes of the counter to create
12523  * @cpu: cpu in which the counter is bound
12524  * @task: task to profile (NULL for percpu)
12525  * @overflow_handler: callback to trigger when we hit the event
12526  * @context: context data could be used in overflow_handler callback
12527  */
12528 struct perf_event *
12529 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
12530 struct task_struct *task,
12531 perf_overflow_handler_t overflow_handler,
12532 void *context)
12533 {
12534 struct perf_event_context *ctx;
12535 struct perf_event *event;
12536 int err;
12537
12538
12539
12540
12541
12542 if (attr->aux_output)
12543 return ERR_PTR(-EINVAL);
12544
12545 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
12546 overflow_handler, context, -1);
12547 if (IS_ERR(event)) {
12548 err = PTR_ERR(event);
12549 goto err;
12550 }
12551
12552
12553 event->owner = TASK_TOMBSTONE;
12554
12555
12556
12557
12558 ctx = find_get_context(event->pmu, task, event);
12559 if (IS_ERR(ctx)) {
12560 err = PTR_ERR(ctx);
12561 goto err_free;
12562 }
12563
12564 WARN_ON_ONCE(ctx->parent_ctx);
12565 mutex_lock(&ctx->mutex);
12566 if (ctx->task == TASK_TOMBSTONE) {
12567 err = -ESRCH;
12568 goto err_unlock;
12569 }
12570
12571 if (!task) {
12572
12573
12574
12575
12576
12577
12578 struct perf_cpu_context *cpuctx =
12579 container_of(ctx, struct perf_cpu_context, ctx);
12580 if (!cpuctx->online) {
12581 err = -ENODEV;
12582 goto err_unlock;
12583 }
12584 }
12585
12586 if (!exclusive_event_installable(event, ctx)) {
12587 err = -EBUSY;
12588 goto err_unlock;
12589 }
12590
12591 perf_install_in_context(ctx, event, event->cpu);
12592 perf_unpin_context(ctx);
12593 mutex_unlock(&ctx->mutex);
12594
12595 return event;
12596
12597 err_unlock:
12598 mutex_unlock(&ctx->mutex);
12599 perf_unpin_context(ctx);
12600 put_ctx(ctx);
12601 err_free:
12602 free_event(event);
12603 err:
12604 return ERR_PTR(err);
12605 }
12606 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
12607
12608 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
12609 {
12610 struct perf_event_context *src_ctx;
12611 struct perf_event_context *dst_ctx;
12612 struct perf_event *event, *tmp;
12613 LIST_HEAD(events);
12614
12615 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
12616 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
12617
12618 /*
12619  * See perf_event_ctx_lock() for comments on the details
12620  * of swizzling perf_event::ctx.
12621  */
12622 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
12623 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
12624 event_entry) {
12625 perf_remove_from_context(event, 0);
12626 unaccount_event_cpu(event, src_cpu);
12627 put_ctx(src_ctx);
12628 list_add(&event->migrate_entry, &events);
12629 }
12630
12631 /*
12632  * Wait for the events to quiesce before re-instating them.
12633  */
12634 synchronize_rcu();
12635
12636 /*
12637  * Re-instate the events in two passes.
12638  *
12639  * Skip over group leaders and only install siblings on this first
12640  * pass; siblings will not get enabled without a leader, however a
12641  * leader will enable its siblings, even if those are still on the old
12642  * context.
12643  */
12644 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
12645 if (event->group_leader == event)
12646 continue;
12647
12648 list_del(&event->migrate_entry);
12649 if (event->state >= PERF_EVENT_STATE_OFF)
12650 event->state = PERF_EVENT_STATE_INACTIVE;
12651 account_event_cpu(event, dst_cpu);
12652 perf_install_in_context(dst_ctx, event, dst_cpu);
12653 get_ctx(dst_ctx);
12654 }
12655
12656 /*
12657  * Once all the siblings are set up properly, install the group leaders
12658  * to make it go.
12659  */
12660 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
12661 list_del(&event->migrate_entry);
12662 if (event->state >= PERF_EVENT_STATE_OFF)
12663 event->state = PERF_EVENT_STATE_INACTIVE;
12664 account_event_cpu(event, dst_cpu);
12665 perf_install_in_context(dst_ctx, event, dst_cpu);
12666 get_ctx(dst_ctx);
12667 }
12668 mutex_unlock(&dst_ctx->mutex);
12669 mutex_unlock(&src_ctx->mutex);
12670 }
12671 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
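/*
 * Illustrative sketch (not part of this file): system/uncore-style PMU
 * drivers typically call perf_pmu_migrate_context() from a CPU hotplug
 * offline callback to move their per-CPU context to a surviving CPU. The
 * names my_pmu and my_pmu_offline_cpu are hypothetical; cpumask_any_but()
 * and nr_cpu_ids are the regular cpumask API.
 *
 *	static int my_pmu_offline_cpu(unsigned int cpu)
 *	{
 *		unsigned int target = cpumask_any_but(cpu_online_mask, cpu);
 *
 *		if (target < nr_cpu_ids)
 *			perf_pmu_migrate_context(&my_pmu, cpu, target);
 *		return 0;
 *	}
 */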
12672
12673 static void sync_child_event(struct perf_event *child_event)
12674 {
12675 struct perf_event *parent_event = child_event->parent;
12676 u64 child_val;
12677
12678 if (child_event->attr.inherit_stat) {
12679 struct task_struct *task = child_event->ctx->task;
12680
12681 if (task && task != TASK_TOMBSTONE)
12682 perf_event_read_event(child_event, task);
12683 }
12684
12685 child_val = perf_event_count(child_event);
12686
12687 /*
12688  * Add back the child's count to the parent's count:
12689  */
12690 atomic64_add(child_val, &parent_event->child_count);
12691 atomic64_add(child_event->total_time_enabled,
12692 &parent_event->child_total_time_enabled);
12693 atomic64_add(child_event->total_time_running,
12694 &parent_event->child_total_time_running);
12695 }
12696
12697 static void
12698 perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
12699 {
12700 struct perf_event *parent_event = event->parent;
12701 unsigned long detach_flags = 0;
12702
12703 if (parent_event) {
12704 /*
12705  * Do not destroy the 'original' grouping; because of the
12706  * context switch optimization the original events could've
12707  * ended up in a random child task.
12708  *
12709  * If we were to destroy the original group, all group related
12710  * operations would cease to function properly after this
12711  * random child dies.
12712  *
12713  * Do destroy all inherited groups, we don't care about those
12714  * and being thorough is better.
12715  */
12716 detach_flags = DETACH_GROUP | DETACH_CHILD;
12717 mutex_lock(&parent_event->child_mutex);
12718 }
12719
12720 perf_remove_from_context(event, detach_flags);
12721
12722 raw_spin_lock_irq(&ctx->lock);
12723 if (event->state > PERF_EVENT_STATE_EXIT)
12724 perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
12725 raw_spin_unlock_irq(&ctx->lock);
12726
12727 /*
12728  * Child events can be freed.
12729  */
12730 if (parent_event) {
12731 mutex_unlock(&parent_event->child_mutex);
12732 /*
12733  * Kick perf_poll() for is_event_hup():
12734  */
12735 perf_event_wakeup(parent_event);
12736 free_event(event);
12737 put_event(parent_event);
12738 return;
12739 }
12740
12741 /*
12742  * Parent events are governed by their filedesc, retain them.
12743  */
12744 perf_event_wakeup(event);
12745 }
12746
12747 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
12748 {
12749 struct perf_event_context *child_ctx, *clone_ctx = NULL;
12750 struct perf_event *child_event, *next;
12751
12752 WARN_ON_ONCE(child != current);
12753
12754 child_ctx = perf_pin_task_context(child, ctxn);
12755 if (!child_ctx)
12756 return;
12757
12758 /*
12759  * In order to reduce the amount of trickiness in ctx tear-down, we hold
12760  * ctx::mutex over the entire thing. This serializes against almost
12761  * everything that wants to access the ctx.
12762  *
12763  * The exception is sys_perf_event_open() /
12764  * perf_event_create_kernel_counter() which does find_get_context()
12765  * without ctx::mutex (it cannot because of the move_group double
12766  * mutex lock).
12767  */
12768 mutex_lock(&child_ctx->mutex);
12769
12770 /*
12771  * In a single ctx::lock section, de-schedule the events and detach the
12772  * context from the task such that we cannot ever get it scheduled back
12773  * in.
12774  */
12775 raw_spin_lock_irq(&child_ctx->lock);
12776 task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
12777
12778 /*
12779  * Now that the context is inactive, destroy the task <-> ctx relation
12780  * and mark the context dead.
12781  */
12782 RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
12783 put_ctx(child_ctx);
12784 WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
12785 put_task_struct(current);
12786
12787 clone_ctx = unclone_ctx(child_ctx);
12788 raw_spin_unlock_irq(&child_ctx->lock);
12789
12790 if (clone_ctx)
12791 put_ctx(clone_ctx);
12792
12793 /*
12794  * Report the task dead after unscheduling the events so that we
12795  * won't get any samples after PERF_RECORD_EXIT. We can however still
12796  * get a few PERF_RECORD_READ events.
12797  */
12798 perf_event_task(child, child_ctx, 0);
12799
12800 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
12801 perf_event_exit_event(child_event, child_ctx);
12802
12803 mutex_unlock(&child_ctx->mutex);
12804
12805 put_ctx(child_ctx);
12806 }
12807
12808 /*
12809  * When a child task exits, feed back event values to parent events.
12810  *
12811  * Can be called with exec_update_lock held when called from
12812  * setup_new_exec().
12813  */
12814 void perf_event_exit_task(struct task_struct *child)
12815 {
12816 struct perf_event *event, *tmp;
12817 int ctxn;
12818
12819 mutex_lock(&child->perf_event_mutex);
12820 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
12821 owner_entry) {
12822 list_del_init(&event->owner_entry);
12823
12824 /*
12825  * Ensure the list deletion is visible before we clear
12826  * the owner; this closes a race against perf_release() where
12827  * we need to serialize on the owner->perf_event_mutex.
12828  */
12829 smp_store_release(&event->owner, NULL);
12830 }
12831 mutex_unlock(&child->perf_event_mutex);
12832
12833 for_each_task_context_nr(ctxn)
12834 perf_event_exit_task_context(child, ctxn);
12835
12836 /*
12837  * The perf_event_exit_task_context() calls above invoke perf_event_task()
12838  * with the child's task contexts, which generates EXIT events for those
12839  * contexts and sets child->perf_event_ctxp[] to NULL. At this point we
12840  * still need to send EXIT events to the cpu contexts.
12841  */
12842 perf_event_task(child, NULL, 0);
12843 }
12844
12845 static void perf_free_event(struct perf_event *event,
12846 struct perf_event_context *ctx)
12847 {
12848 struct perf_event *parent = event->parent;
12849
12850 if (WARN_ON_ONCE(!parent))
12851 return;
12852
12853 mutex_lock(&parent->child_mutex);
12854 list_del_init(&event->child_list);
12855 mutex_unlock(&parent->child_mutex);
12856
12857 put_event(parent);
12858
12859 raw_spin_lock_irq(&ctx->lock);
12860 perf_group_detach(event);
12861 list_del_event(event, ctx);
12862 raw_spin_unlock_irq(&ctx->lock);
12863 free_event(event);
12864 }
12865
12866 /*
12867  * Free a context as created by inheritance by perf_event_init_task() below,
12868  * used by fork() in case of failure.
12869  *
12870  * Even though the task has never lived, the context and events have been
12871  * exposed through the child_list, so we must take care tearing it all down.
12872  */
12873 void perf_event_free_task(struct task_struct *task)
12874 {
12875 struct perf_event_context *ctx;
12876 struct perf_event *event, *tmp;
12877 int ctxn;
12878
12879 for_each_task_context_nr(ctxn) {
12880 ctx = task->perf_event_ctxp[ctxn];
12881 if (!ctx)
12882 continue;
12883
12884 mutex_lock(&ctx->mutex);
12885 raw_spin_lock_irq(&ctx->lock);
12886 /*
12887  * Destroy the task <-> ctx relation and mark the context dead.
12888  *
12889  * This is important because even though the task hasn't been
12890  * exposed yet the context has been (through child_list).
12891  */
12892 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
12893 WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
12894 put_task_struct(task);
12895 raw_spin_unlock_irq(&ctx->lock);
12896
12897 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
12898 perf_free_event(event, ctx);
12899
12900 mutex_unlock(&ctx->mutex);
12901
12902 /*
12903  * perf_event_release_kernel() could've stolen some of our
12904  * child events and still have them on its free_list. In that
12905  * case we must wait for these events to have been freed (in
12906  * particular all their references to this task must've been
12907  * dropped).
12908  *
12909  * Without this, copy_process() will unconditionally free this
12910  * task (irrespective of its reference count) and
12911  * _free_event()'s put_task_struct(event->hw.target) will be a
12912  * use-after-free.
12913  *
12914  * Wait for all events to drop their context reference.
12915  */
12916 wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
12917 put_ctx(ctx);
12918 }
12919 }
12920
12921 void perf_event_delayed_put(struct task_struct *task)
12922 {
12923 int ctxn;
12924
12925 for_each_task_context_nr(ctxn)
12926 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
12927 }
12928
12929 struct file *perf_event_get(unsigned int fd)
12930 {
12931 struct file *file = fget(fd);
12932 if (!file)
12933 return ERR_PTR(-EBADF);
12934
12935 if (file->f_op != &perf_fops) {
12936 fput(file);
12937 return ERR_PTR(-EBADF);
12938 }
12939
12940 return file;
12941 }
12942
12943 const struct perf_event *perf_get_event(struct file *file)
12944 {
12945 if (file->f_op != &perf_fops)
12946 return ERR_PTR(-EINVAL);
12947
12948 return file->private_data;
12949 }
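/*
 * Illustrative sketch (not part of this file): in-kernel users of the two
 * helpers above resolve a perf event fd with perf_event_get(), borrow the
 * event through perf_get_event(), and drop the file reference with fput()
 * when done. The fd variable stands for whatever descriptor the caller was
 * handed.
 *
 *	struct file *file = perf_event_get(fd);
 *	const struct perf_event *event;
 *
 *	if (IS_ERR(file))
 *		return PTR_ERR(file);
 *
 *	event = perf_get_event(file);
 *	if (IS_ERR(event)) {
 *		fput(file);
 *		return PTR_ERR(event);
 *	}
 *
 *	// ... inspect event->attr etc. while holding the file reference ...
 *	fput(file);
 */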
12950
12951 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
12952 {
12953 if (!event)
12954 return ERR_PTR(-EINVAL);
12955
12956 return &event->attr;
12957 }
12958
12959 /*
12960  * Inherit an event from parent task to child task.
12961  *
12962  * Returns:
12963  *  - valid pointer on success
12964  *  - NULL for orphaned events
12965  *  - IS_ERR() on error
12966  */
12967 static struct perf_event *
12968 inherit_event(struct perf_event *parent_event,
12969 struct task_struct *parent,
12970 struct perf_event_context *parent_ctx,
12971 struct task_struct *child,
12972 struct perf_event *group_leader,
12973 struct perf_event_context *child_ctx)
12974 {
12975 enum perf_event_state parent_state = parent_event->state;
12976 struct perf_event *child_event;
12977 unsigned long flags;
12978
12979 /*
12980  * Instead of creating recursive hierarchies of events,
12981  * we link inherited events back to the original parent,
12982  * which has a filp for sure, which we use as the reference
12983  * count:
12984  */
12985 if (parent_event->parent)
12986 parent_event = parent_event->parent;
12987
12988 child_event = perf_event_alloc(&parent_event->attr,
12989 parent_event->cpu,
12990 child,
12991 group_leader, parent_event,
12992 NULL, NULL, -1);
12993 if (IS_ERR(child_event))
12994 return child_event;
12995
12996
12997 if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
12998 !child_ctx->task_ctx_data) {
12999 struct pmu *pmu = child_event->pmu;
13000
13001 child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
13002 if (!child_ctx->task_ctx_data) {
13003 free_event(child_event);
13004 return ERR_PTR(-ENOMEM);
13005 }
13006 }
13007
13008 /*
13009  * is_orphaned_event() and list_add_tail(&parent_event->child_list)
13010  * must be under the same lock in order to serialize against
13011  * perf_event_release_kernel(), such that either we must observe
13012  * PERF_ATTACH_CHILD or the event got removed from the child list.
13013  */
13014 mutex_lock(&parent_event->child_mutex);
13015 if (is_orphaned_event(parent_event) ||
13016 !atomic_long_inc_not_zero(&parent_event->refcount)) {
13017 mutex_unlock(&parent_event->child_mutex);
13018 /* task_ctx_data is freed with child_ctx. */
13019 free_event(child_event);
13020 return NULL;
13021 }
13022
13023 get_ctx(child_ctx);
13024
13025 /*
13026  * Make the child state follow the state of the parent event,
13027  * not its attr.disabled bit. We hold the parent's mutex,
13028  * so we won't race with perf_event_{en, dis}able_family.
13029  */
13030 if (parent_state >= PERF_EVENT_STATE_INACTIVE)
13031 child_event->state = PERF_EVENT_STATE_INACTIVE;
13032 else
13033 child_event->state = PERF_EVENT_STATE_OFF;
13034
13035 if (parent_event->attr.freq) {
13036 u64 sample_period = parent_event->hw.sample_period;
13037 struct hw_perf_event *hwc = &child_event->hw;
13038
13039 hwc->sample_period = sample_period;
13040 hwc->last_period = sample_period;
13041
13042 local64_set(&hwc->period_left, sample_period);
13043 }
13044
13045 child_event->ctx = child_ctx;
13046 child_event->overflow_handler = parent_event->overflow_handler;
13047 child_event->overflow_handler_context
13048 = parent_event->overflow_handler_context;
13049
13050
13051
13052
13053 perf_event__header_size(child_event);
13054 perf_event__id_header_size(child_event);
13055
13056
13057
13058
13059 raw_spin_lock_irqsave(&child_ctx->lock, flags);
13060 add_event_to_ctx(child_event, child_ctx);
13061 child_event->attach_state |= PERF_ATTACH_CHILD;
13062 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
13063
13064
13065
13066
13067 list_add_tail(&child_event->child_list, &parent_event->child_list);
13068 mutex_unlock(&parent_event->child_mutex);
13069
13070 return child_event;
13071 }
13072
13073 /*
13074  * Inherits an event group.
13075  *
13076  * This will quietly suppress orphaned events; !inherit_event() is not an error.
13077  * This matches with perf_event_release_kernel() removing all child events.
13078  *
13079  * Returns:
13080  *  - 0 on success
13081  *  - <0 on error
13082  */
13083 static int inherit_group(struct perf_event *parent_event,
13084 struct task_struct *parent,
13085 struct perf_event_context *parent_ctx,
13086 struct task_struct *child,
13087 struct perf_event_context *child_ctx)
13088 {
13089 struct perf_event *leader;
13090 struct perf_event *sub;
13091 struct perf_event *child_ctr;
13092
13093 leader = inherit_event(parent_event, parent, parent_ctx,
13094 child, NULL, child_ctx);
13095 if (IS_ERR(leader))
13096 return PTR_ERR(leader);
13097
13098 /*
13099  * Inherit the remaining group siblings under the new child leader and
13100  * re-establish any AUX event pairing for the inherited copies.
13101  */
13102 for_each_sibling_event(sub, parent_event) {
13103 child_ctr = inherit_event(sub, parent, parent_ctx,
13104 child, leader, child_ctx);
13105 if (IS_ERR(child_ctr))
13106 return PTR_ERR(child_ctr);
13107
13108 if (sub->aux_event == parent_event && child_ctr &&
13109 !perf_get_aux_event(child_ctr, leader))
13110 return -EINVAL;
13111 }
13112 return 0;
13113 }
13114
13115 /*
13116  * Creates the child task context and tries to inherit the event-group.
13117  *
13118  * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
13119  * inherited_all set when we 'fail' to inherit an orphaned event; this is
13120  * consistent with perf_event_release_kernel() removing all child events.
13121  *
13122  * Returns:
13123  *  - 0 on success
13124  *  - <0 on error
13125  */
13126 static int
13127 inherit_task_group(struct perf_event *event, struct task_struct *parent,
13128 struct perf_event_context *parent_ctx,
13129 struct task_struct *child, int ctxn,
13130 u64 clone_flags, int *inherited_all)
13131 {
13132 int ret;
13133 struct perf_event_context *child_ctx;
13134
13135 if (!event->attr.inherit ||
13136 (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
13137 /* Do not inherit if sigtrap and signal handlers were cleared. */
13138 (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) {
13139 *inherited_all = 0;
13140 return 0;
13141 }
13142
13143 child_ctx = child->perf_event_ctxp[ctxn];
13144 if (!child_ctx) {
13145 /*
13146  * This is executed from the parent task context, so
13147  * inherit events that have been marked for cloning.
13148  * First allocate and initialize a context for the
13149  * child.
13150  */
13151 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
13152 if (!child_ctx)
13153 return -ENOMEM;
13154
13155 child->perf_event_ctxp[ctxn] = child_ctx;
13156 }
13157
13158 ret = inherit_group(event, parent, parent_ctx,
13159 child, child_ctx);
13160
13161 if (ret)
13162 *inherited_all = 0;
13163
13164 return ret;
13165 }
13166
13167 /*
13168  * Initialize one perf_event context in task_struct:
13169  */
13170 static int perf_event_init_context(struct task_struct *child, int ctxn,
13171 u64 clone_flags)
13172 {
13173 struct perf_event_context *child_ctx, *parent_ctx;
13174 struct perf_event_context *cloned_ctx;
13175 struct perf_event *event;
13176 struct task_struct *parent = current;
13177 int inherited_all = 1;
13178 unsigned long flags;
13179 int ret = 0;
13180
13181 if (likely(!parent->perf_event_ctxp[ctxn]))
13182 return 0;
13183
13184 /*
13185  * If the parent's context is a clone, pin it so it won't get
13186  * swapped under us.
13187  */
13188 parent_ctx = perf_pin_task_context(parent, ctxn);
13189 if (!parent_ctx)
13190 return 0;
13191
13192 /*
13193  * No need to check if parent_ctx != NULL here; since we saw
13194  * it non-NULL earlier, the only reason for it to become NULL
13195  * is if we exit, and since we're currently in the middle of
13196  * a fork we can't be exiting at the same time.
13197  */
13198
13199 /*
13200  * Lock the parent list. No need to lock the child - not PID
13201  * hashed yet and not running, so nobody can access it.
13202  */
13203 mutex_lock(&parent_ctx->mutex);
13204
13205 /*
13206  * We don't have to disable NMIs - we are only looking at
13207  * the list, not manipulating it:
13208  */
13209 perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
13210 ret = inherit_task_group(event, parent, parent_ctx,
13211 child, ctxn, clone_flags,
13212 &inherited_all);
13213 if (ret)
13214 goto out_unlock;
13215 }
13216
13217 /*
13218  * We can't hold ctx->lock when iterating the ->flexible_groups list due
13219  * to allocations, but we need to prevent rotation because
13220  * rotate_ctx() will change the list from interrupt context.
13221  */
13222 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
13223 parent_ctx->rotate_disable = 1;
13224 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
13225
13226 perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
13227 ret = inherit_task_group(event, parent, parent_ctx,
13228 child, ctxn, clone_flags,
13229 &inherited_all);
13230 if (ret)
13231 goto out_unlock;
13232 }
13233
13234 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
13235 parent_ctx->rotate_disable = 0;
13236
13237 child_ctx = child->perf_event_ctxp[ctxn];
13238
13239 if (child_ctx && inherited_all) {
13240 /*
13241  * Mark the child context as a clone of the parent
13242  * context, or of whatever the parent is a clone of.
13243  *
13244  * Note that if the parent is a clone, the holding of
13245  * parent_ctx->lock keeps it from being uncloned.
13246  */
13247 cloned_ctx = parent_ctx->parent_ctx;
13248 if (cloned_ctx) {
13249 child_ctx->parent_ctx = cloned_ctx;
13250 child_ctx->parent_gen = parent_ctx->parent_gen;
13251 } else {
13252 child_ctx->parent_ctx = parent_ctx;
13253 child_ctx->parent_gen = parent_ctx->generation;
13254 }
13255 get_ctx(child_ctx->parent_ctx);
13256 }
13257
13258 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
13259 out_unlock:
13260 mutex_unlock(&parent_ctx->mutex);
13261
13262 perf_unpin_context(parent_ctx);
13263 put_ctx(parent_ctx);
13264
13265 return ret;
13266 }
13267
13268 /*
13269  * Initialize the perf_event contexts in task_struct; called from fork():
13270  */
13271 int perf_event_init_task(struct task_struct *child, u64 clone_flags)
13272 {
13273 int ctxn, ret;
13274
13275 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
13276 mutex_init(&child->perf_event_mutex);
13277 INIT_LIST_HEAD(&child->perf_event_list);
13278
13279 for_each_task_context_nr(ctxn) {
13280 ret = perf_event_init_context(child, ctxn, clone_flags);
13281 if (ret) {
13282 perf_event_free_task(child);
13283 return ret;
13284 }
13285 }
13286
13287 return 0;
13288 }
13289
13290 static void __init perf_event_init_all_cpus(void)
13291 {
13292 struct swevent_htable *swhash;
13293 int cpu;
13294
13295 zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
13296
13297 for_each_possible_cpu(cpu) {
13298 swhash = &per_cpu(swevent_htable, cpu);
13299 mutex_init(&swhash->hlist_mutex);
13300 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
13301
13302 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
13303 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
13304
13305 #ifdef CONFIG_CGROUP_PERF
13306 INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
13307 #endif
13308 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
13309 }
13310 }
13311
13312 static void perf_swevent_init_cpu(unsigned int cpu)
13313 {
13314 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
13315
13316 mutex_lock(&swhash->hlist_mutex);
13317 if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
13318 struct swevent_hlist *hlist;
13319
13320 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
13321 WARN_ON(!hlist);
13322 rcu_assign_pointer(swhash->swevent_hlist, hlist);
13323 }
13324 mutex_unlock(&swhash->hlist_mutex);
13325 }
13326
13327 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
13328 static void __perf_event_exit_context(void *__info)
13329 {
13330 struct perf_event_context *ctx = __info;
13331 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
13332 struct perf_event *event;
13333
13334 raw_spin_lock(&ctx->lock);
13335 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
13336 list_for_each_entry(event, &ctx->event_list, event_entry)
13337 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
13338 raw_spin_unlock(&ctx->lock);
13339 }
13340
13341 static void perf_event_exit_cpu_context(int cpu)
13342 {
13343 struct perf_cpu_context *cpuctx;
13344 struct perf_event_context *ctx;
13345 struct pmu *pmu;
13346
13347 mutex_lock(&pmus_lock);
13348 list_for_each_entry(pmu, &pmus, entry) {
13349 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
13350 ctx = &cpuctx->ctx;
13351
13352 mutex_lock(&ctx->mutex);
13353 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
13354 cpuctx->online = 0;
13355 mutex_unlock(&ctx->mutex);
13356 }
13357 cpumask_clear_cpu(cpu, perf_online_mask);
13358 mutex_unlock(&pmus_lock);
13359 }
13360 #else
13361
13362 static void perf_event_exit_cpu_context(int cpu) { }
13363
13364 #endif
13365
13366 int perf_event_init_cpu(unsigned int cpu)
13367 {
13368 struct perf_cpu_context *cpuctx;
13369 struct perf_event_context *ctx;
13370 struct pmu *pmu;
13371
13372 perf_swevent_init_cpu(cpu);
13373
13374 mutex_lock(&pmus_lock);
13375 cpumask_set_cpu(cpu, perf_online_mask);
13376 list_for_each_entry(pmu, &pmus, entry) {
13377 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
13378 ctx = &cpuctx->ctx;
13379
13380 mutex_lock(&ctx->mutex);
13381 cpuctx->online = 1;
13382 mutex_unlock(&ctx->mutex);
13383 }
13384 mutex_unlock(&pmus_lock);
13385
13386 return 0;
13387 }
13388
13389 int perf_event_exit_cpu(unsigned int cpu)
13390 {
13391 perf_event_exit_cpu_context(cpu);
13392 return 0;
13393 }
13394
13395 static int
13396 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
13397 {
13398 int cpu;
13399
13400 for_each_online_cpu(cpu)
13401 perf_event_exit_cpu(cpu);
13402
13403 return NOTIFY_OK;
13404 }
13405
13406 /*
13407  * Run the perf reboot notifier at the very last possible moment so that
13408  * the generic watchdog code runs as long as possible.
13409  */
13410 static struct notifier_block perf_reboot_notifier = {
13411 .notifier_call = perf_reboot,
13412 .priority = INT_MIN,
13413 };
13414
13415 void __init perf_event_init(void)
13416 {
13417 int ret;
13418
13419 idr_init(&pmu_idr);
13420
13421 perf_event_init_all_cpus();
13422 init_srcu_struct(&pmus_srcu);
13423 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
13424 perf_pmu_register(&perf_cpu_clock, NULL, -1);
13425 perf_pmu_register(&perf_task_clock, NULL, -1);
13426 perf_tp_register();
13427 perf_event_init_cpu(smp_processor_id());
13428 register_reboot_notifier(&perf_reboot_notifier);
13429
13430 ret = init_hw_breakpoint();
13431 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
13432
13433 perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC);
13434
13435 /*
13436  * Build-time assertion that we keep the data_head at the intended
13437  * location. IOW, validation we got the __reserved[] size right.
13438  */
13439 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
13440 != 1024);
13441 }
13442
13443 ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
13444 char *page)
13445 {
13446 struct perf_pmu_events_attr *pmu_attr =
13447 container_of(attr, struct perf_pmu_events_attr, attr);
13448
13449 if (pmu_attr->event_str)
13450 return sprintf(page, "%s\n", pmu_attr->event_str);
13451
13452 return 0;
13453 }
13454 EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
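/*
 * Illustrative sketch (not part of this file): PMU drivers typically hook
 * this show routine up by declaring struct perf_pmu_events_attr entries,
 * e.g. via the PMU_EVENT_ATTR_STRING() helper. The event name "cycles", the
 * variable my_evattr_cycles and the string "event=0x3c" are made-up example
 * values.
 *
 *	PMU_EVENT_ATTR_STRING(cycles, my_evattr_cycles, "event=0x3c");
 *
 *	static struct attribute *my_events_attrs[] = {
 *		&my_evattr_cycles.attr.attr,
 *		NULL,
 *	};
 */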
13455
13456 static int __init perf_event_sysfs_init(void)
13457 {
13458 struct pmu *pmu;
13459 int ret;
13460
13461 mutex_lock(&pmus_lock);
13462
13463 ret = bus_register(&pmu_bus);
13464 if (ret)
13465 goto unlock;
13466
13467 list_for_each_entry(pmu, &pmus, entry) {
13468 if (!pmu->name || pmu->type < 0)
13469 continue;
13470
13471 ret = pmu_dev_alloc(pmu);
13472 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
13473 }
13474 pmu_bus_running = 1;
13475 ret = 0;
13476
13477 unlock:
13478 mutex_unlock(&pmus_lock);
13479
13480 return ret;
13481 }
13482 device_initcall(perf_event_sysfs_init);
13483
13484 #ifdef CONFIG_CGROUP_PERF
13485 static struct cgroup_subsys_state *
13486 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
13487 {
13488 struct perf_cgroup *jc;
13489
13490 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
13491 if (!jc)
13492 return ERR_PTR(-ENOMEM);
13493
13494 jc->info = alloc_percpu(struct perf_cgroup_info);
13495 if (!jc->info) {
13496 kfree(jc);
13497 return ERR_PTR(-ENOMEM);
13498 }
13499
13500 return &jc->css;
13501 }
13502
13503 static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
13504 {
13505 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
13506
13507 free_percpu(jc->info);
13508 kfree(jc);
13509 }
13510
13511 static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
13512 {
13513 perf_event_cgroup(css->cgroup);
13514 return 0;
13515 }
13516
13517 static int __perf_cgroup_move(void *info)
13518 {
13519 struct task_struct *task = info;
13520 rcu_read_lock();
13521 perf_cgroup_switch(task);
13522 rcu_read_unlock();
13523 return 0;
13524 }
13525
13526 static void perf_cgroup_attach(struct cgroup_taskset *tset)
13527 {
13528 struct task_struct *task;
13529 struct cgroup_subsys_state *css;
13530
13531 cgroup_taskset_for_each(task, css, tset)
13532 task_function_call(task, __perf_cgroup_move, task);
13533 }
13534
13535 struct cgroup_subsys perf_event_cgrp_subsys = {
13536 .css_alloc = perf_cgroup_css_alloc,
13537 .css_free = perf_cgroup_css_free,
13538 .css_online = perf_cgroup_css_online,
13539 .attach = perf_cgroup_attach,
13540 /*
13541  * Implicitly enable on the default hierarchy so that perf events can
13542  * always be filtered by cgroup2 path as long as the perf_event
13543  * controller is not mounted on a legacy hierarchy.
13544  */
13545 .implicit_on_dfl = true,
13546 .threaded = true,
13547 };
13548 #endif
13549
13550 DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);