Back to home page

LXR

 
 

    


0001 /*
0002  * Performance events core code:
0003  *
0004  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
0005  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
0006  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
0007  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
0008  *
0009  * For licensing details see kernel-base/COPYING
0010  */
0011 
0012 #include <linux/fs.h>
0013 #include <linux/mm.h>
0014 #include <linux/cpu.h>
0015 #include <linux/smp.h>
0016 #include <linux/idr.h>
0017 #include <linux/file.h>
0018 #include <linux/poll.h>
0019 #include <linux/slab.h>
0020 #include <linux/hash.h>
0021 #include <linux/tick.h>
0022 #include <linux/sysfs.h>
0023 #include <linux/dcache.h>
0024 #include <linux/percpu.h>
0025 #include <linux/ptrace.h>
0026 #include <linux/reboot.h>
0027 #include <linux/vmstat.h>
0028 #include <linux/device.h>
0029 #include <linux/export.h>
0030 #include <linux/vmalloc.h>
0031 #include <linux/hardirq.h>
0032 #include <linux/rculist.h>
0033 #include <linux/uaccess.h>
0034 #include <linux/syscalls.h>
0035 #include <linux/anon_inodes.h>
0036 #include <linux/kernel_stat.h>
0037 #include <linux/cgroup.h>
0038 #include <linux/perf_event.h>
0039 #include <linux/trace_events.h>
0040 #include <linux/hw_breakpoint.h>
0041 #include <linux/mm_types.h>
0042 #include <linux/module.h>
0043 #include <linux/mman.h>
0044 #include <linux/compat.h>
0045 #include <linux/bpf.h>
0046 #include <linux/filter.h>
0047 #include <linux/namei.h>
0048 #include <linux/parser.h>
0049 
0050 #include "internal.h"
0051 
0052 #include <asm/irq_regs.h>
0053 
0054 typedef int (*remote_function_f)(void *);
0055 
0056 struct remote_function_call {
0057     struct task_struct  *p;
0058     remote_function_f   func;
0059     void            *info;
0060     int         ret;
0061 };
0062 
0063 static void remote_function(void *data)
0064 {
0065     struct remote_function_call *tfc = data;
0066     struct task_struct *p = tfc->p;
0067 
0068     if (p) {
0069         /* -EAGAIN */
0070         if (task_cpu(p) != smp_processor_id())
0071             return;
0072 
0073         /*
0074          * Now that we're on right CPU with IRQs disabled, we can test
0075          * if we hit the right task without races.
0076          */
0077 
0078         tfc->ret = -ESRCH; /* No such (running) process */
0079         if (p != current)
0080             return;
0081     }
0082 
0083     tfc->ret = tfc->func(tfc->info);
0084 }
0085 
0086 /**
0087  * task_function_call - call a function on the cpu on which a task runs
0088  * @p:      the task to evaluate
0089  * @func:   the function to be called
0090  * @info:   the function call argument
0091  *
0092  * Calls the function @func when the task is currently running. This might
0093  * be on the current CPU, which just calls the function directly
0094  *
0095  * returns: @func return value, or
0096  *      -ESRCH  - when the process isn't running
0097  *      -EAGAIN - when the process moved away
0098  */
0099 static int
0100 task_function_call(struct task_struct *p, remote_function_f func, void *info)
0101 {
0102     struct remote_function_call data = {
0103         .p  = p,
0104         .func   = func,
0105         .info   = info,
0106         .ret    = -EAGAIN,
0107     };
0108     int ret;
0109 
0110     do {
0111         ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
0112         if (!ret)
0113             ret = data.ret;
0114     } while (ret == -EAGAIN);
0115 
0116     return ret;
0117 }
0118 
0119 /**
0120  * cpu_function_call - call a function on the cpu
0121  * @func:   the function to be called
0122  * @info:   the function call argument
0123  *
0124  * Calls the function @func on the remote cpu.
0125  *
0126  * returns: @func return value or -ENXIO when the cpu is offline
0127  */
0128 static int cpu_function_call(int cpu, remote_function_f func, void *info)
0129 {
0130     struct remote_function_call data = {
0131         .p  = NULL,
0132         .func   = func,
0133         .info   = info,
0134         .ret    = -ENXIO, /* No such CPU */
0135     };
0136 
0137     smp_call_function_single(cpu, remote_function, &data, 1);
0138 
0139     return data.ret;
0140 }
0141 
0142 static inline struct perf_cpu_context *
0143 __get_cpu_context(struct perf_event_context *ctx)
0144 {
0145     return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
0146 }
0147 
0148 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
0149               struct perf_event_context *ctx)
0150 {
0151     raw_spin_lock(&cpuctx->ctx.lock);
0152     if (ctx)
0153         raw_spin_lock(&ctx->lock);
0154 }
0155 
0156 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
0157                 struct perf_event_context *ctx)
0158 {
0159     if (ctx)
0160         raw_spin_unlock(&ctx->lock);
0161     raw_spin_unlock(&cpuctx->ctx.lock);
0162 }
0163 
0164 #define TASK_TOMBSTONE ((void *)-1L)
0165 
0166 static bool is_kernel_event(struct perf_event *event)
0167 {
0168     return READ_ONCE(event->owner) == TASK_TOMBSTONE;
0169 }
0170 
0171 /*
0172  * On task ctx scheduling...
0173  *
0174  * When !ctx->nr_events a task context will not be scheduled. This means
0175  * we can disable the scheduler hooks (for performance) without leaving
0176  * pending task ctx state.
0177  *
0178  * This however results in two special cases:
0179  *
0180  *  - removing the last event from a task ctx; this is relatively straight
0181  *    forward and is done in __perf_remove_from_context.
0182  *
0183  *  - adding the first event to a task ctx; this is tricky because we cannot
0184  *    rely on ctx->is_active and therefore cannot use event_function_call().
0185  *    See perf_install_in_context().
0186  *
0187  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
0188  */
0189 
0190 typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
0191             struct perf_event_context *, void *);
0192 
0193 struct event_function_struct {
0194     struct perf_event *event;
0195     event_f func;
0196     void *data;
0197 };
0198 
0199 static int event_function(void *info)
0200 {
0201     struct event_function_struct *efs = info;
0202     struct perf_event *event = efs->event;
0203     struct perf_event_context *ctx = event->ctx;
0204     struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
0205     struct perf_event_context *task_ctx = cpuctx->task_ctx;
0206     int ret = 0;
0207 
0208     WARN_ON_ONCE(!irqs_disabled());
0209 
0210     perf_ctx_lock(cpuctx, task_ctx);
0211     /*
0212      * Since we do the IPI call without holding ctx->lock things can have
0213      * changed, double check we hit the task we set out to hit.
0214      */
0215     if (ctx->task) {
0216         if (ctx->task != current) {
0217             ret = -ESRCH;
0218             goto unlock;
0219         }
0220 
0221         /*
0222          * We only use event_function_call() on established contexts,
0223          * and event_function() is only ever called when active (or
0224          * rather, we'll have bailed in task_function_call() or the
0225          * above ctx->task != current test), therefore we must have
0226          * ctx->is_active here.
0227          */
0228         WARN_ON_ONCE(!ctx->is_active);
0229         /*
0230          * And since we have ctx->is_active, cpuctx->task_ctx must
0231          * match.
0232          */
0233         WARN_ON_ONCE(task_ctx != ctx);
0234     } else {
0235         WARN_ON_ONCE(&cpuctx->ctx != ctx);
0236     }
0237 
0238     efs->func(event, cpuctx, ctx, efs->data);
0239 unlock:
0240     perf_ctx_unlock(cpuctx, task_ctx);
0241 
0242     return ret;
0243 }
0244 
0245 static void event_function_call(struct perf_event *event, event_f func, void *data)
0246 {
0247     struct perf_event_context *ctx = event->ctx;
0248     struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
0249     struct event_function_struct efs = {
0250         .event = event,
0251         .func = func,
0252         .data = data,
0253     };
0254 
0255     if (!event->parent) {
0256         /*
0257          * If this is a !child event, we must hold ctx::mutex to
0258          * stabilize the the event->ctx relation. See
0259          * perf_event_ctx_lock().
0260          */
0261         lockdep_assert_held(&ctx->mutex);
0262     }
0263 
0264     if (!task) {
0265         cpu_function_call(event->cpu, event_function, &efs);
0266         return;
0267     }
0268 
0269     if (task == TASK_TOMBSTONE)
0270         return;
0271 
0272 again:
0273     if (!task_function_call(task, event_function, &efs))
0274         return;
0275 
0276     raw_spin_lock_irq(&ctx->lock);
0277     /*
0278      * Reload the task pointer, it might have been changed by
0279      * a concurrent perf_event_context_sched_out().
0280      */
0281     task = ctx->task;
0282     if (task == TASK_TOMBSTONE) {
0283         raw_spin_unlock_irq(&ctx->lock);
0284         return;
0285     }
0286     if (ctx->is_active) {
0287         raw_spin_unlock_irq(&ctx->lock);
0288         goto again;
0289     }
0290     func(event, NULL, ctx, data);
0291     raw_spin_unlock_irq(&ctx->lock);
0292 }
0293 
0294 /*
0295  * Similar to event_function_call() + event_function(), but hard assumes IRQs
0296  * are already disabled and we're on the right CPU.
0297  */
0298 static void event_function_local(struct perf_event *event, event_f func, void *data)
0299 {
0300     struct perf_event_context *ctx = event->ctx;
0301     struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
0302     struct task_struct *task = READ_ONCE(ctx->task);
0303     struct perf_event_context *task_ctx = NULL;
0304 
0305     WARN_ON_ONCE(!irqs_disabled());
0306 
0307     if (task) {
0308         if (task == TASK_TOMBSTONE)
0309             return;
0310 
0311         task_ctx = ctx;
0312     }
0313 
0314     perf_ctx_lock(cpuctx, task_ctx);
0315 
0316     task = ctx->task;
0317     if (task == TASK_TOMBSTONE)
0318         goto unlock;
0319 
0320     if (task) {
0321         /*
0322          * We must be either inactive or active and the right task,
0323          * otherwise we're screwed, since we cannot IPI to somewhere
0324          * else.
0325          */
0326         if (ctx->is_active) {
0327             if (WARN_ON_ONCE(task != current))
0328                 goto unlock;
0329 
0330             if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
0331                 goto unlock;
0332         }
0333     } else {
0334         WARN_ON_ONCE(&cpuctx->ctx != ctx);
0335     }
0336 
0337     func(event, cpuctx, ctx, data);
0338 unlock:
0339     perf_ctx_unlock(cpuctx, task_ctx);
0340 }
0341 
0342 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
0343                PERF_FLAG_FD_OUTPUT  |\
0344                PERF_FLAG_PID_CGROUP |\
0345                PERF_FLAG_FD_CLOEXEC)
0346 
0347 /*
0348  * branch priv levels that need permission checks
0349  */
0350 #define PERF_SAMPLE_BRANCH_PERM_PLM \
0351     (PERF_SAMPLE_BRANCH_KERNEL |\
0352      PERF_SAMPLE_BRANCH_HV)
0353 
0354 enum event_type_t {
0355     EVENT_FLEXIBLE = 0x1,
0356     EVENT_PINNED = 0x2,
0357     EVENT_TIME = 0x4,
0358     EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
0359 };
0360 
0361 /*
0362  * perf_sched_events : >0 events exist
0363  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
0364  */
0365 
0366 static void perf_sched_delayed(struct work_struct *work);
0367 DEFINE_STATIC_KEY_FALSE(perf_sched_events);
0368 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
0369 static DEFINE_MUTEX(perf_sched_mutex);
0370 static atomic_t perf_sched_count;
0371 
0372 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
0373 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
0374 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
0375 
0376 static atomic_t nr_mmap_events __read_mostly;
0377 static atomic_t nr_comm_events __read_mostly;
0378 static atomic_t nr_task_events __read_mostly;
0379 static atomic_t nr_freq_events __read_mostly;
0380 static atomic_t nr_switch_events __read_mostly;
0381 
0382 static LIST_HEAD(pmus);
0383 static DEFINE_MUTEX(pmus_lock);
0384 static struct srcu_struct pmus_srcu;
0385 
0386 /*
0387  * perf event paranoia level:
0388  *  -1 - not paranoid at all
0389  *   0 - disallow raw tracepoint access for unpriv
0390  *   1 - disallow cpu events for unpriv
0391  *   2 - disallow kernel profiling for unpriv
0392  */
0393 int sysctl_perf_event_paranoid __read_mostly = 2;
0394 
0395 /* Minimum for 512 kiB + 1 user control page */
0396 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
0397 
0398 /*
0399  * max perf event sample rate
0400  */
0401 #define DEFAULT_MAX_SAMPLE_RATE     100000
0402 #define DEFAULT_SAMPLE_PERIOD_NS    (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
0403 #define DEFAULT_CPU_TIME_MAX_PERCENT    25
0404 
0405 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
0406 
0407 static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
0408 static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
0409 
0410 static int perf_sample_allowed_ns __read_mostly =
0411     DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
0412 
0413 static void update_perf_cpu_limits(void)
0414 {
0415     u64 tmp = perf_sample_period_ns;
0416 
0417     tmp *= sysctl_perf_cpu_time_max_percent;
0418     tmp = div_u64(tmp, 100);
0419     if (!tmp)
0420         tmp = 1;
0421 
0422     WRITE_ONCE(perf_sample_allowed_ns, tmp);
0423 }
0424 
0425 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
0426 
0427 int perf_proc_update_handler(struct ctl_table *table, int write,
0428         void __user *buffer, size_t *lenp,
0429         loff_t *ppos)
0430 {
0431     int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
0432 
0433     if (ret || !write)
0434         return ret;
0435 
0436     /*
0437      * If throttling is disabled don't allow the write:
0438      */
0439     if (sysctl_perf_cpu_time_max_percent == 100 ||
0440         sysctl_perf_cpu_time_max_percent == 0)
0441         return -EINVAL;
0442 
0443     max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
0444     perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
0445     update_perf_cpu_limits();
0446 
0447     return 0;
0448 }
0449 
0450 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
0451 
0452 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
0453                 void __user *buffer, size_t *lenp,
0454                 loff_t *ppos)
0455 {
0456     int ret = proc_dointvec(table, write, buffer, lenp, ppos);
0457 
0458     if (ret || !write)
0459         return ret;
0460 
0461     if (sysctl_perf_cpu_time_max_percent == 100 ||
0462         sysctl_perf_cpu_time_max_percent == 0) {
0463         printk(KERN_WARNING
0464                "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
0465         WRITE_ONCE(perf_sample_allowed_ns, 0);
0466     } else {
0467         update_perf_cpu_limits();
0468     }
0469 
0470     return 0;
0471 }
0472 
0473 /*
0474  * perf samples are done in some very critical code paths (NMIs).
0475  * If they take too much CPU time, the system can lock up and not
0476  * get any real work done.  This will drop the sample rate when
0477  * we detect that events are taking too long.
0478  */
0479 #define NR_ACCUMULATED_SAMPLES 128
0480 static DEFINE_PER_CPU(u64, running_sample_length);
0481 
0482 static u64 __report_avg;
0483 static u64 __report_allowed;
0484 
0485 static void perf_duration_warn(struct irq_work *w)
0486 {
0487     printk_ratelimited(KERN_INFO
0488         "perf: interrupt took too long (%lld > %lld), lowering "
0489         "kernel.perf_event_max_sample_rate to %d\n",
0490         __report_avg, __report_allowed,
0491         sysctl_perf_event_sample_rate);
0492 }
0493 
0494 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
0495 
0496 void perf_sample_event_took(u64 sample_len_ns)
0497 {
0498     u64 max_len = READ_ONCE(perf_sample_allowed_ns);
0499     u64 running_len;
0500     u64 avg_len;
0501     u32 max;
0502 
0503     if (max_len == 0)
0504         return;
0505 
0506     /* Decay the counter by 1 average sample. */
0507     running_len = __this_cpu_read(running_sample_length);
0508     running_len -= running_len/NR_ACCUMULATED_SAMPLES;
0509     running_len += sample_len_ns;
0510     __this_cpu_write(running_sample_length, running_len);
0511 
0512     /*
0513      * Note: this will be biased artifically low until we have
0514      * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
0515      * from having to maintain a count.
0516      */
0517     avg_len = running_len/NR_ACCUMULATED_SAMPLES;
0518     if (avg_len <= max_len)
0519         return;
0520 
0521     __report_avg = avg_len;
0522     __report_allowed = max_len;
0523 
0524     /*
0525      * Compute a throttle threshold 25% below the current duration.
0526      */
0527     avg_len += avg_len / 4;
0528     max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
0529     if (avg_len < max)
0530         max /= (u32)avg_len;
0531     else
0532         max = 1;
0533 
0534     WRITE_ONCE(perf_sample_allowed_ns, avg_len);
0535     WRITE_ONCE(max_samples_per_tick, max);
0536 
0537     sysctl_perf_event_sample_rate = max * HZ;
0538     perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
0539 
0540     if (!irq_work_queue(&perf_duration_work)) {
0541         early_printk("perf: interrupt took too long (%lld > %lld), lowering "
0542                  "kernel.perf_event_max_sample_rate to %d\n",
0543                  __report_avg, __report_allowed,
0544                  sysctl_perf_event_sample_rate);
0545     }
0546 }
0547 
0548 static atomic64_t perf_event_id;
0549 
0550 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
0551                   enum event_type_t event_type);
0552 
0553 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
0554                  enum event_type_t event_type,
0555                  struct task_struct *task);
0556 
0557 static void update_context_time(struct perf_event_context *ctx);
0558 static u64 perf_event_time(struct perf_event *event);
0559 
0560 void __weak perf_event_print_debug(void)    { }
0561 
0562 extern __weak const char *perf_pmu_name(void)
0563 {
0564     return "pmu";
0565 }
0566 
0567 static inline u64 perf_clock(void)
0568 {
0569     return local_clock();
0570 }
0571 
0572 static inline u64 perf_event_clock(struct perf_event *event)
0573 {
0574     return event->clock();
0575 }
0576 
0577 #ifdef CONFIG_CGROUP_PERF
0578 
0579 static inline bool
0580 perf_cgroup_match(struct perf_event *event)
0581 {
0582     struct perf_event_context *ctx = event->ctx;
0583     struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
0584 
0585     /* @event doesn't care about cgroup */
0586     if (!event->cgrp)
0587         return true;
0588 
0589     /* wants specific cgroup scope but @cpuctx isn't associated with any */
0590     if (!cpuctx->cgrp)
0591         return false;
0592 
0593     /*
0594      * Cgroup scoping is recursive.  An event enabled for a cgroup is
0595      * also enabled for all its descendant cgroups.  If @cpuctx's
0596      * cgroup is a descendant of @event's (the test covers identity
0597      * case), it's a match.
0598      */
0599     return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
0600                     event->cgrp->css.cgroup);
0601 }
0602 
0603 static inline void perf_detach_cgroup(struct perf_event *event)
0604 {
0605     css_put(&event->cgrp->css);
0606     event->cgrp = NULL;
0607 }
0608 
0609 static inline int is_cgroup_event(struct perf_event *event)
0610 {
0611     return event->cgrp != NULL;
0612 }
0613 
0614 static inline u64 perf_cgroup_event_time(struct perf_event *event)
0615 {
0616     struct perf_cgroup_info *t;
0617 
0618     t = per_cpu_ptr(event->cgrp->info, event->cpu);
0619     return t->time;
0620 }
0621 
0622 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
0623 {
0624     struct perf_cgroup_info *info;
0625     u64 now;
0626 
0627     now = perf_clock();
0628 
0629     info = this_cpu_ptr(cgrp->info);
0630 
0631     info->time += now - info->timestamp;
0632     info->timestamp = now;
0633 }
0634 
0635 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
0636 {
0637     struct perf_cgroup *cgrp_out = cpuctx->cgrp;
0638     if (cgrp_out)
0639         __update_cgrp_time(cgrp_out);
0640 }
0641 
0642 static inline void update_cgrp_time_from_event(struct perf_event *event)
0643 {
0644     struct perf_cgroup *cgrp;
0645 
0646     /*
0647      * ensure we access cgroup data only when needed and
0648      * when we know the cgroup is pinned (css_get)
0649      */
0650     if (!is_cgroup_event(event))
0651         return;
0652 
0653     cgrp = perf_cgroup_from_task(current, event->ctx);
0654     /*
0655      * Do not update time when cgroup is not active
0656      */
0657     if (cgrp == event->cgrp)
0658         __update_cgrp_time(event->cgrp);
0659 }
0660 
0661 static inline void
0662 perf_cgroup_set_timestamp(struct task_struct *task,
0663               struct perf_event_context *ctx)
0664 {
0665     struct perf_cgroup *cgrp;
0666     struct perf_cgroup_info *info;
0667 
0668     /*
0669      * ctx->lock held by caller
0670      * ensure we do not access cgroup data
0671      * unless we have the cgroup pinned (css_get)
0672      */
0673     if (!task || !ctx->nr_cgroups)
0674         return;
0675 
0676     cgrp = perf_cgroup_from_task(task, ctx);
0677     info = this_cpu_ptr(cgrp->info);
0678     info->timestamp = ctx->timestamp;
0679 }
0680 
0681 #define PERF_CGROUP_SWOUT   0x1 /* cgroup switch out every event */
0682 #define PERF_CGROUP_SWIN    0x2 /* cgroup switch in events based on task */
0683 
0684 /*
0685  * reschedule events based on the cgroup constraint of task.
0686  *
0687  * mode SWOUT : schedule out everything
0688  * mode SWIN : schedule in based on cgroup for next
0689  */
0690 static void perf_cgroup_switch(struct task_struct *task, int mode)
0691 {
0692     struct perf_cpu_context *cpuctx;
0693     struct pmu *pmu;
0694     unsigned long flags;
0695 
0696     /*
0697      * disable interrupts to avoid geting nr_cgroup
0698      * changes via __perf_event_disable(). Also
0699      * avoids preemption.
0700      */
0701     local_irq_save(flags);
0702 
0703     /*
0704      * we reschedule only in the presence of cgroup
0705      * constrained events.
0706      */
0707 
0708     list_for_each_entry_rcu(pmu, &pmus, entry) {
0709         cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
0710         if (cpuctx->unique_pmu != pmu)
0711             continue; /* ensure we process each cpuctx once */
0712 
0713         /*
0714          * perf_cgroup_events says at least one
0715          * context on this CPU has cgroup events.
0716          *
0717          * ctx->nr_cgroups reports the number of cgroup
0718          * events for a context.
0719          */
0720         if (cpuctx->ctx.nr_cgroups > 0) {
0721             perf_ctx_lock(cpuctx, cpuctx->task_ctx);
0722             perf_pmu_disable(cpuctx->ctx.pmu);
0723 
0724             if (mode & PERF_CGROUP_SWOUT) {
0725                 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
0726                 /*
0727                  * must not be done before ctxswout due
0728                  * to event_filter_match() in event_sched_out()
0729                  */
0730                 cpuctx->cgrp = NULL;
0731             }
0732 
0733             if (mode & PERF_CGROUP_SWIN) {
0734                 WARN_ON_ONCE(cpuctx->cgrp);
0735                 /*
0736                  * set cgrp before ctxsw in to allow
0737                  * event_filter_match() to not have to pass
0738                  * task around
0739                  * we pass the cpuctx->ctx to perf_cgroup_from_task()
0740                  * because cgorup events are only per-cpu
0741                  */
0742                 cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
0743                 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
0744             }
0745             perf_pmu_enable(cpuctx->ctx.pmu);
0746             perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
0747         }
0748     }
0749 
0750     local_irq_restore(flags);
0751 }
0752 
0753 static inline void perf_cgroup_sched_out(struct task_struct *task,
0754                      struct task_struct *next)
0755 {
0756     struct perf_cgroup *cgrp1;
0757     struct perf_cgroup *cgrp2 = NULL;
0758 
0759     rcu_read_lock();
0760     /*
0761      * we come here when we know perf_cgroup_events > 0
0762      * we do not need to pass the ctx here because we know
0763      * we are holding the rcu lock
0764      */
0765     cgrp1 = perf_cgroup_from_task(task, NULL);
0766     cgrp2 = perf_cgroup_from_task(next, NULL);
0767 
0768     /*
0769      * only schedule out current cgroup events if we know
0770      * that we are switching to a different cgroup. Otherwise,
0771      * do no touch the cgroup events.
0772      */
0773     if (cgrp1 != cgrp2)
0774         perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
0775 
0776     rcu_read_unlock();
0777 }
0778 
0779 static inline void perf_cgroup_sched_in(struct task_struct *prev,
0780                     struct task_struct *task)
0781 {
0782     struct perf_cgroup *cgrp1;
0783     struct perf_cgroup *cgrp2 = NULL;
0784 
0785     rcu_read_lock();
0786     /*
0787      * we come here when we know perf_cgroup_events > 0
0788      * we do not need to pass the ctx here because we know
0789      * we are holding the rcu lock
0790      */
0791     cgrp1 = perf_cgroup_from_task(task, NULL);
0792     cgrp2 = perf_cgroup_from_task(prev, NULL);
0793 
0794     /*
0795      * only need to schedule in cgroup events if we are changing
0796      * cgroup during ctxsw. Cgroup events were not scheduled
0797      * out of ctxsw out if that was not the case.
0798      */
0799     if (cgrp1 != cgrp2)
0800         perf_cgroup_switch(task, PERF_CGROUP_SWIN);
0801 
0802     rcu_read_unlock();
0803 }
0804 
0805 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
0806                       struct perf_event_attr *attr,
0807                       struct perf_event *group_leader)
0808 {
0809     struct perf_cgroup *cgrp;
0810     struct cgroup_subsys_state *css;
0811     struct fd f = fdget(fd);
0812     int ret = 0;
0813 
0814     if (!f.file)
0815         return -EBADF;
0816 
0817     css = css_tryget_online_from_dir(f.file->f_path.dentry,
0818                      &perf_event_cgrp_subsys);
0819     if (IS_ERR(css)) {
0820         ret = PTR_ERR(css);
0821         goto out;
0822     }
0823 
0824     cgrp = container_of(css, struct perf_cgroup, css);
0825     event->cgrp = cgrp;
0826 
0827     /*
0828      * all events in a group must monitor
0829      * the same cgroup because a task belongs
0830      * to only one perf cgroup at a time
0831      */
0832     if (group_leader && group_leader->cgrp != cgrp) {
0833         perf_detach_cgroup(event);
0834         ret = -EINVAL;
0835     }
0836 out:
0837     fdput(f);
0838     return ret;
0839 }
0840 
0841 static inline void
0842 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
0843 {
0844     struct perf_cgroup_info *t;
0845     t = per_cpu_ptr(event->cgrp->info, event->cpu);
0846     event->shadow_ctx_time = now - t->timestamp;
0847 }
0848 
0849 static inline void
0850 perf_cgroup_defer_enabled(struct perf_event *event)
0851 {
0852     /*
0853      * when the current task's perf cgroup does not match
0854      * the event's, we need to remember to call the
0855      * perf_mark_enable() function the first time a task with
0856      * a matching perf cgroup is scheduled in.
0857      */
0858     if (is_cgroup_event(event) && !perf_cgroup_match(event))
0859         event->cgrp_defer_enabled = 1;
0860 }
0861 
0862 static inline void
0863 perf_cgroup_mark_enabled(struct perf_event *event,
0864              struct perf_event_context *ctx)
0865 {
0866     struct perf_event *sub;
0867     u64 tstamp = perf_event_time(event);
0868 
0869     if (!event->cgrp_defer_enabled)
0870         return;
0871 
0872     event->cgrp_defer_enabled = 0;
0873 
0874     event->tstamp_enabled = tstamp - event->total_time_enabled;
0875     list_for_each_entry(sub, &event->sibling_list, group_entry) {
0876         if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
0877             sub->tstamp_enabled = tstamp - sub->total_time_enabled;
0878             sub->cgrp_defer_enabled = 0;
0879         }
0880     }
0881 }
0882 
0883 /*
0884  * Update cpuctx->cgrp so that it is set when first cgroup event is added and
0885  * cleared when last cgroup event is removed.
0886  */
0887 static inline void
0888 list_update_cgroup_event(struct perf_event *event,
0889              struct perf_event_context *ctx, bool add)
0890 {
0891     struct perf_cpu_context *cpuctx;
0892 
0893     if (!is_cgroup_event(event))
0894         return;
0895 
0896     if (add && ctx->nr_cgroups++)
0897         return;
0898     else if (!add && --ctx->nr_cgroups)
0899         return;
0900     /*
0901      * Because cgroup events are always per-cpu events,
0902      * this will always be called from the right CPU.
0903      */
0904     cpuctx = __get_cpu_context(ctx);
0905 
0906     /*
0907      * cpuctx->cgrp is NULL until a cgroup event is sched in or
0908      * ctx->nr_cgroup == 0 .
0909      */
0910     if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
0911         cpuctx->cgrp = event->cgrp;
0912     else if (!add)
0913         cpuctx->cgrp = NULL;
0914 }
0915 
0916 #else /* !CONFIG_CGROUP_PERF */
0917 
0918 static inline bool
0919 perf_cgroup_match(struct perf_event *event)
0920 {
0921     return true;
0922 }
0923 
0924 static inline void perf_detach_cgroup(struct perf_event *event)
0925 {}
0926 
0927 static inline int is_cgroup_event(struct perf_event *event)
0928 {
0929     return 0;
0930 }
0931 
0932 static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
0933 {
0934     return 0;
0935 }
0936 
0937 static inline void update_cgrp_time_from_event(struct perf_event *event)
0938 {
0939 }
0940 
0941 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
0942 {
0943 }
0944 
0945 static inline void perf_cgroup_sched_out(struct task_struct *task,
0946                      struct task_struct *next)
0947 {
0948 }
0949 
0950 static inline void perf_cgroup_sched_in(struct task_struct *prev,
0951                     struct task_struct *task)
0952 {
0953 }
0954 
0955 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
0956                       struct perf_event_attr *attr,
0957                       struct perf_event *group_leader)
0958 {
0959     return -EINVAL;
0960 }
0961 
0962 static inline void
0963 perf_cgroup_set_timestamp(struct task_struct *task,
0964               struct perf_event_context *ctx)
0965 {
0966 }
0967 
0968 void
0969 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
0970 {
0971 }
0972 
0973 static inline void
0974 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
0975 {
0976 }
0977 
0978 static inline u64 perf_cgroup_event_time(struct perf_event *event)
0979 {
0980     return 0;
0981 }
0982 
0983 static inline void
0984 perf_cgroup_defer_enabled(struct perf_event *event)
0985 {
0986 }
0987 
0988 static inline void
0989 perf_cgroup_mark_enabled(struct perf_event *event,
0990              struct perf_event_context *ctx)
0991 {
0992 }
0993 
0994 static inline void
0995 list_update_cgroup_event(struct perf_event *event,
0996              struct perf_event_context *ctx, bool add)
0997 {
0998 }
0999 
1000 #endif
1001 
1002 /*
1003  * set default to be dependent on timer tick just
1004  * like original code
1005  */
1006 #define PERF_CPU_HRTIMER (1000 / HZ)
1007 /*
1008  * function must be called with interrupts disbled
1009  */
1010 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1011 {
1012     struct perf_cpu_context *cpuctx;
1013     int rotations = 0;
1014 
1015     WARN_ON(!irqs_disabled());
1016 
1017     cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1018     rotations = perf_rotate_context(cpuctx);
1019 
1020     raw_spin_lock(&cpuctx->hrtimer_lock);
1021     if (rotations)
1022         hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1023     else
1024         cpuctx->hrtimer_active = 0;
1025     raw_spin_unlock(&cpuctx->hrtimer_lock);
1026 
1027     return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1028 }
1029 
1030 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1031 {
1032     struct hrtimer *timer = &cpuctx->hrtimer;
1033     struct pmu *pmu = cpuctx->ctx.pmu;
1034     u64 interval;
1035 
1036     /* no multiplexing needed for SW PMU */
1037     if (pmu->task_ctx_nr == perf_sw_context)
1038         return;
1039 
1040     /*
1041      * check default is sane, if not set then force to
1042      * default interval (1/tick)
1043      */
1044     interval = pmu->hrtimer_interval_ms;
1045     if (interval < 1)
1046         interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1047 
1048     cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1049 
1050     raw_spin_lock_init(&cpuctx->hrtimer_lock);
1051     hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1052     timer->function = perf_mux_hrtimer_handler;
1053 }
1054 
1055 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1056 {
1057     struct hrtimer *timer = &cpuctx->hrtimer;
1058     struct pmu *pmu = cpuctx->ctx.pmu;
1059     unsigned long flags;
1060 
1061     /* not for SW PMU */
1062     if (pmu->task_ctx_nr == perf_sw_context)
1063         return 0;
1064 
1065     raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1066     if (!cpuctx->hrtimer_active) {
1067         cpuctx->hrtimer_active = 1;
1068         hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1069         hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1070     }
1071     raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1072 
1073     return 0;
1074 }
1075 
1076 void perf_pmu_disable(struct pmu *pmu)
1077 {
1078     int *count = this_cpu_ptr(pmu->pmu_disable_count);
1079     if (!(*count)++)
1080         pmu->pmu_disable(pmu);
1081 }
1082 
1083 void perf_pmu_enable(struct pmu *pmu)
1084 {
1085     int *count = this_cpu_ptr(pmu->pmu_disable_count);
1086     if (!--(*count))
1087         pmu->pmu_enable(pmu);
1088 }
1089 
1090 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1091 
1092 /*
1093  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
1094  * perf_event_task_tick() are fully serialized because they're strictly cpu
1095  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
1096  * disabled, while perf_event_task_tick is called from IRQ context.
1097  */
1098 static void perf_event_ctx_activate(struct perf_event_context *ctx)
1099 {
1100     struct list_head *head = this_cpu_ptr(&active_ctx_list);
1101 
1102     WARN_ON(!irqs_disabled());
1103 
1104     WARN_ON(!list_empty(&ctx->active_ctx_list));
1105 
1106     list_add(&ctx->active_ctx_list, head);
1107 }
1108 
1109 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1110 {
1111     WARN_ON(!irqs_disabled());
1112 
1113     WARN_ON(list_empty(&ctx->active_ctx_list));
1114 
1115     list_del_init(&ctx->active_ctx_list);
1116 }
1117 
1118 static void get_ctx(struct perf_event_context *ctx)
1119 {
1120     WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
1121 }
1122 
1123 static void free_ctx(struct rcu_head *head)
1124 {
1125     struct perf_event_context *ctx;
1126 
1127     ctx = container_of(head, struct perf_event_context, rcu_head);
1128     kfree(ctx->task_ctx_data);
1129     kfree(ctx);
1130 }
1131 
1132 static void put_ctx(struct perf_event_context *ctx)
1133 {
1134     if (atomic_dec_and_test(&ctx->refcount)) {
1135         if (ctx->parent_ctx)
1136             put_ctx(ctx->parent_ctx);
1137         if (ctx->task && ctx->task != TASK_TOMBSTONE)
1138             put_task_struct(ctx->task);
1139         call_rcu(&ctx->rcu_head, free_ctx);
1140     }
1141 }
1142 
1143 /*
1144  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1145  * perf_pmu_migrate_context() we need some magic.
1146  *
1147  * Those places that change perf_event::ctx will hold both
1148  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1149  *
1150  * Lock ordering is by mutex address. There are two other sites where
1151  * perf_event_context::mutex nests and those are:
1152  *
1153  *  - perf_event_exit_task_context()    [ child , 0 ]
1154  *      perf_event_exit_event()
1155  *        put_event()           [ parent, 1 ]
1156  *
1157  *  - perf_event_init_context()     [ parent, 0 ]
1158  *      inherit_task_group()
1159  *        inherit_group()
1160  *          inherit_event()
1161  *            perf_event_alloc()
1162  *              perf_init_event()
1163  *                perf_try_init_event() [ child , 1 ]
1164  *
1165  * While it appears there is an obvious deadlock here -- the parent and child
1166  * nesting levels are inverted between the two. This is in fact safe because
1167  * life-time rules separate them. That is an exiting task cannot fork, and a
1168  * spawning task cannot (yet) exit.
1169  *
1170  * But remember that that these are parent<->child context relations, and
1171  * migration does not affect children, therefore these two orderings should not
1172  * interact.
1173  *
1174  * The change in perf_event::ctx does not affect children (as claimed above)
1175  * because the sys_perf_event_open() case will install a new event and break
1176  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1177  * concerned with cpuctx and that doesn't have children.
1178  *
1179  * The places that change perf_event::ctx will issue:
1180  *
1181  *   perf_remove_from_context();
1182  *   synchronize_rcu();
1183  *   perf_install_in_context();
1184  *
1185  * to affect the change. The remove_from_context() + synchronize_rcu() should
1186  * quiesce the event, after which we can install it in the new location. This
1187  * means that only external vectors (perf_fops, prctl) can perturb the event
1188  * while in transit. Therefore all such accessors should also acquire
1189  * perf_event_context::mutex to serialize against this.
1190  *
1191  * However; because event->ctx can change while we're waiting to acquire
1192  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1193  * function.
1194  *
1195  * Lock order:
1196  *    cred_guard_mutex
1197  *  task_struct::perf_event_mutex
1198  *    perf_event_context::mutex
1199  *      perf_event::child_mutex;
1200  *        perf_event_context::lock
1201  *      perf_event::mmap_mutex
1202  *      mmap_sem
1203  */
1204 static struct perf_event_context *
1205 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1206 {
1207     struct perf_event_context *ctx;
1208 
1209 again:
1210     rcu_read_lock();
1211     ctx = ACCESS_ONCE(event->ctx);
1212     if (!atomic_inc_not_zero(&ctx->refcount)) {
1213         rcu_read_unlock();
1214         goto again;
1215     }
1216     rcu_read_unlock();
1217 
1218     mutex_lock_nested(&ctx->mutex, nesting);
1219     if (event->ctx != ctx) {
1220         mutex_unlock(&ctx->mutex);
1221         put_ctx(ctx);
1222         goto again;
1223     }
1224 
1225     return ctx;
1226 }
1227 
1228 static inline struct perf_event_context *
1229 perf_event_ctx_lock(struct perf_event *event)
1230 {
1231     return perf_event_ctx_lock_nested(event, 0);
1232 }
1233 
1234 static void perf_event_ctx_unlock(struct perf_event *event,
1235                   struct perf_event_context *ctx)
1236 {
1237     mutex_unlock(&ctx->mutex);
1238     put_ctx(ctx);
1239 }
1240 
1241 /*
1242  * This must be done under the ctx->lock, such as to serialize against
1243  * context_equiv(), therefore we cannot call put_ctx() since that might end up
1244  * calling scheduler related locks and ctx->lock nests inside those.
1245  */
1246 static __must_check struct perf_event_context *
1247 unclone_ctx(struct perf_event_context *ctx)
1248 {
1249     struct perf_event_context *parent_ctx = ctx->parent_ctx;
1250 
1251     lockdep_assert_held(&ctx->lock);
1252 
1253     if (parent_ctx)
1254         ctx->parent_ctx = NULL;
1255     ctx->generation++;
1256 
1257     return parent_ctx;
1258 }
1259 
1260 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1261 {
1262     /*
1263      * only top level events have the pid namespace they were created in
1264      */
1265     if (event->parent)
1266         event = event->parent;
1267 
1268     return task_tgid_nr_ns(p, event->ns);
1269 }
1270 
1271 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1272 {
1273     /*
1274      * only top level events have the pid namespace they were created in
1275      */
1276     if (event->parent)
1277         event = event->parent;
1278 
1279     return task_pid_nr_ns(p, event->ns);
1280 }
1281 
1282 /*
1283  * If we inherit events we want to return the parent event id
1284  * to userspace.
1285  */
1286 static u64 primary_event_id(struct perf_event *event)
1287 {
1288     u64 id = event->id;
1289 
1290     if (event->parent)
1291         id = event->parent->id;
1292 
1293     return id;
1294 }
1295 
1296 /*
1297  * Get the perf_event_context for a task and lock it.
1298  *
1299  * This has to cope with with the fact that until it is locked,
1300  * the context could get moved to another task.
1301  */
1302 static struct perf_event_context *
1303 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1304 {
1305     struct perf_event_context *ctx;
1306 
1307 retry:
1308     /*
1309      * One of the few rules of preemptible RCU is that one cannot do
1310      * rcu_read_unlock() while holding a scheduler (or nested) lock when
1311      * part of the read side critical section was irqs-enabled -- see
1312      * rcu_read_unlock_special().
1313      *
1314      * Since ctx->lock nests under rq->lock we must ensure the entire read
1315      * side critical section has interrupts disabled.
1316      */
1317     local_irq_save(*flags);
1318     rcu_read_lock();
1319     ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1320     if (ctx) {
1321         /*
1322          * If this context is a clone of another, it might
1323          * get swapped for another underneath us by
1324          * perf_event_task_sched_out, though the
1325          * rcu_read_lock() protects us from any context
1326          * getting freed.  Lock the context and check if it
1327          * got swapped before we could get the lock, and retry
1328          * if so.  If we locked the right context, then it
1329          * can't get swapped on us any more.
1330          */
1331         raw_spin_lock(&ctx->lock);
1332         if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1333             raw_spin_unlock(&ctx->lock);
1334             rcu_read_unlock();
1335             local_irq_restore(*flags);
1336             goto retry;
1337         }
1338 
1339         if (ctx->task == TASK_TOMBSTONE ||
1340             !atomic_inc_not_zero(&ctx->refcount)) {
1341             raw_spin_unlock(&ctx->lock);
1342             ctx = NULL;
1343         } else {
1344             WARN_ON_ONCE(ctx->task != task);
1345         }
1346     }
1347     rcu_read_unlock();
1348     if (!ctx)
1349         local_irq_restore(*flags);
1350     return ctx;
1351 }
1352 
1353 /*
1354  * Get the context for a task and increment its pin_count so it
1355  * can't get swapped to another task.  This also increments its
1356  * reference count so that the context can't get freed.
1357  */
1358 static struct perf_event_context *
1359 perf_pin_task_context(struct task_struct *task, int ctxn)
1360 {
1361     struct perf_event_context *ctx;
1362     unsigned long flags;
1363 
1364     ctx = perf_lock_task_context(task, ctxn, &flags);
1365     if (ctx) {
1366         ++ctx->pin_count;
1367         raw_spin_unlock_irqrestore(&ctx->lock, flags);
1368     }
1369     return ctx;
1370 }
1371 
1372 static void perf_unpin_context(struct perf_event_context *ctx)
1373 {
1374     unsigned long flags;
1375 
1376     raw_spin_lock_irqsave(&ctx->lock, flags);
1377     --ctx->pin_count;
1378     raw_spin_unlock_irqrestore(&ctx->lock, flags);
1379 }
1380 
1381 /*
1382  * Update the record of the current time in a context.
1383  */
1384 static void update_context_time(struct perf_event_context *ctx)
1385 {
1386     u64 now = perf_clock();
1387 
1388     ctx->time += now - ctx->timestamp;
1389     ctx->timestamp = now;
1390 }
1391 
1392 static u64 perf_event_time(struct perf_event *event)
1393 {
1394     struct perf_event_context *ctx = event->ctx;
1395 
1396     if (is_cgroup_event(event))
1397         return perf_cgroup_event_time(event);
1398 
1399     return ctx ? ctx->time : 0;
1400 }
1401 
1402 /*
1403  * Update the total_time_enabled and total_time_running fields for a event.
1404  */
1405 static void update_event_times(struct perf_event *event)
1406 {
1407     struct perf_event_context *ctx = event->ctx;
1408     u64 run_end;
1409 
1410     lockdep_assert_held(&ctx->lock);
1411 
1412     if (event->state < PERF_EVENT_STATE_INACTIVE ||
1413         event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1414         return;
1415 
1416     /*
1417      * in cgroup mode, time_enabled represents
1418      * the time the event was enabled AND active
1419      * tasks were in the monitored cgroup. This is
1420      * independent of the activity of the context as
1421      * there may be a mix of cgroup and non-cgroup events.
1422      *
1423      * That is why we treat cgroup events differently
1424      * here.
1425      */
1426     if (is_cgroup_event(event))
1427         run_end = perf_cgroup_event_time(event);
1428     else if (ctx->is_active)
1429         run_end = ctx->time;
1430     else
1431         run_end = event->tstamp_stopped;
1432 
1433     event->total_time_enabled = run_end - event->tstamp_enabled;
1434 
1435     if (event->state == PERF_EVENT_STATE_INACTIVE)
1436         run_end = event->tstamp_stopped;
1437     else
1438         run_end = perf_event_time(event);
1439 
1440     event->total_time_running = run_end - event->tstamp_running;
1441 
1442 }
1443 
1444 /*
1445  * Update total_time_enabled and total_time_running for all events in a group.
1446  */
1447 static void update_group_times(struct perf_event *leader)
1448 {
1449     struct perf_event *event;
1450 
1451     update_event_times(leader);
1452     list_for_each_entry(event, &leader->sibling_list, group_entry)
1453         update_event_times(event);
1454 }
1455 
1456 static struct list_head *
1457 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1458 {
1459     if (event->attr.pinned)
1460         return &ctx->pinned_groups;
1461     else
1462         return &ctx->flexible_groups;
1463 }
1464 
1465 /*
1466  * Add a event from the lists for its context.
1467  * Must be called with ctx->mutex and ctx->lock held.
1468  */
1469 static void
1470 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1471 {
1472     lockdep_assert_held(&ctx->lock);
1473 
1474     WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1475     event->attach_state |= PERF_ATTACH_CONTEXT;
1476 
1477     /*
1478      * If we're a stand alone event or group leader, we go to the context
1479      * list, group events are kept attached to the group so that
1480      * perf_group_detach can, at all times, locate all siblings.
1481      */
1482     if (event->group_leader == event) {
1483         struct list_head *list;
1484 
1485         event->group_caps = event->event_caps;
1486 
1487         list = ctx_group_list(event, ctx);
1488         list_add_tail(&event->group_entry, list);
1489     }
1490 
1491     list_update_cgroup_event(event, ctx, true);
1492 
1493     list_add_rcu(&event->event_entry, &ctx->event_list);
1494     ctx->nr_events++;
1495     if (event->attr.inherit_stat)
1496         ctx->nr_stat++;
1497 
1498     ctx->generation++;
1499 }
1500 
1501 /*
1502  * Initialize event state based on the perf_event_attr::disabled.
1503  */
1504 static inline void perf_event__state_init(struct perf_event *event)
1505 {
1506     event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1507                           PERF_EVENT_STATE_INACTIVE;
1508 }
1509 
1510 static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1511 {
1512     int entry = sizeof(u64); /* value */
1513     int size = 0;
1514     int nr = 1;
1515 
1516     if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1517         size += sizeof(u64);
1518 
1519     if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1520         size += sizeof(u64);
1521 
1522     if (event->attr.read_format & PERF_FORMAT_ID)
1523         entry += sizeof(u64);
1524 
1525     if (event->attr.read_format & PERF_FORMAT_GROUP) {
1526         nr += nr_siblings;
1527         size += sizeof(u64);
1528     }
1529 
1530     size += entry * nr;
1531     event->read_size = size;
1532 }
1533 
1534 static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1535 {
1536     struct perf_sample_data *data;
1537     u16 size = 0;
1538 
1539     if (sample_type & PERF_SAMPLE_IP)
1540         size += sizeof(data->ip);
1541 
1542     if (sample_type & PERF_SAMPLE_ADDR)
1543         size += sizeof(data->addr);
1544 
1545     if (sample_type & PERF_SAMPLE_PERIOD)
1546         size += sizeof(data->period);
1547 
1548     if (sample_type & PERF_SAMPLE_WEIGHT)
1549         size += sizeof(data->weight);
1550 
1551     if (sample_type & PERF_SAMPLE_READ)
1552         size += event->read_size;
1553 
1554     if (sample_type & PERF_SAMPLE_DATA_SRC)
1555         size += sizeof(data->data_src.val);
1556 
1557     if (sample_type & PERF_SAMPLE_TRANSACTION)
1558         size += sizeof(data->txn);
1559 
1560     event->header_size = size;
1561 }
1562 
1563 /*
1564  * Called at perf_event creation and when events are attached/detached from a
1565  * group.
1566  */
1567 static void perf_event__header_size(struct perf_event *event)
1568 {
1569     __perf_event_read_size(event,
1570                    event->group_leader->nr_siblings);
1571     __perf_event_header_size(event, event->attr.sample_type);
1572 }
1573 
1574 static void perf_event__id_header_size(struct perf_event *event)
1575 {
1576     struct perf_sample_data *data;
1577     u64 sample_type = event->attr.sample_type;
1578     u16 size = 0;
1579 
1580     if (sample_type & PERF_SAMPLE_TID)
1581         size += sizeof(data->tid_entry);
1582 
1583     if (sample_type & PERF_SAMPLE_TIME)
1584         size += sizeof(data->time);
1585 
1586     if (sample_type & PERF_SAMPLE_IDENTIFIER)
1587         size += sizeof(data->id);
1588 
1589     if (sample_type & PERF_SAMPLE_ID)
1590         size += sizeof(data->id);
1591 
1592     if (sample_type & PERF_SAMPLE_STREAM_ID)
1593         size += sizeof(data->stream_id);
1594 
1595     if (sample_type & PERF_SAMPLE_CPU)
1596         size += sizeof(data->cpu_entry);
1597 
1598     event->id_header_size = size;
1599 }
1600 
1601 static bool perf_event_validate_size(struct perf_event *event)
1602 {
1603     /*
1604      * The values computed here will be over-written when we actually
1605      * attach the event.
1606      */
1607     __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1608     __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1609     perf_event__id_header_size(event);
1610 
1611     /*
1612      * Sum the lot; should not exceed the 64k limit we have on records.
1613      * Conservative limit to allow for callchains and other variable fields.
1614      */
1615     if (event->read_size + event->header_size +
1616         event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1617         return false;
1618 
1619     return true;
1620 }
1621 
1622 static void perf_group_attach(struct perf_event *event)
1623 {
1624     struct perf_event *group_leader = event->group_leader, *pos;
1625 
1626     lockdep_assert_held(&event->ctx->lock);
1627 
1628     /*
1629      * We can have double attach due to group movement in perf_event_open.
1630      */
1631     if (event->attach_state & PERF_ATTACH_GROUP)
1632         return;
1633 
1634     event->attach_state |= PERF_ATTACH_GROUP;
1635 
1636     if (group_leader == event)
1637         return;
1638 
1639     WARN_ON_ONCE(group_leader->ctx != event->ctx);
1640 
1641     group_leader->group_caps &= event->event_caps;
1642 
1643     list_add_tail(&event->group_entry, &group_leader->sibling_list);
1644     group_leader->nr_siblings++;
1645 
1646     perf_event__header_size(group_leader);
1647 
1648     list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1649         perf_event__header_size(pos);
1650 }
1651 
1652 /*
1653  * Remove a event from the lists for its context.
1654  * Must be called with ctx->mutex and ctx->lock held.
1655  */
1656 static void
1657 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1658 {
1659     WARN_ON_ONCE(event->ctx != ctx);
1660     lockdep_assert_held(&ctx->lock);
1661 
1662     /*
1663      * We can have double detach due to exit/hot-unplug + close.
1664      */
1665     if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1666         return;
1667 
1668     event->attach_state &= ~PERF_ATTACH_CONTEXT;
1669 
1670     list_update_cgroup_event(event, ctx, false);
1671 
1672     ctx->nr_events--;
1673     if (event->attr.inherit_stat)
1674         ctx->nr_stat--;
1675 
1676     list_del_rcu(&event->event_entry);
1677 
1678     if (event->group_leader == event)
1679         list_del_init(&event->group_entry);
1680 
1681     update_group_times(event);
1682 
1683     /*
1684      * If event was in error state, then keep it
1685      * that way, otherwise bogus counts will be
1686      * returned on read(). The only way to get out
1687      * of error state is by explicit re-enabling
1688      * of the event
1689      */
1690     if (event->state > PERF_EVENT_STATE_OFF)
1691         event->state = PERF_EVENT_STATE_OFF;
1692 
1693     ctx->generation++;
1694 }
1695 
1696 static void perf_group_detach(struct perf_event *event)
1697 {
1698     struct perf_event *sibling, *tmp;
1699     struct list_head *list = NULL;
1700 
1701     lockdep_assert_held(&event->ctx->lock);
1702 
1703     /*
1704      * We can have double detach due to exit/hot-unplug + close.
1705      */
1706     if (!(event->attach_state & PERF_ATTACH_GROUP))
1707         return;
1708 
1709     event->attach_state &= ~PERF_ATTACH_GROUP;
1710 
1711     /*
1712      * If this is a sibling, remove it from its group.
1713      */
1714     if (event->group_leader != event) {
1715         list_del_init(&event->group_entry);
1716         event->group_leader->nr_siblings--;
1717         goto out;
1718     }
1719 
1720     if (!list_empty(&event->group_entry))
1721         list = &event->group_entry;
1722 
1723     /*
1724      * If this was a group event with sibling events then
1725      * upgrade the siblings to singleton events by adding them
1726      * to whatever list we are on.
1727      */
1728     list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1729         if (list)
1730             list_move_tail(&sibling->group_entry, list);
1731         sibling->group_leader = sibling;
1732 
1733         /* Inherit group flags from the previous leader */
1734         sibling->group_caps = event->group_caps;
1735 
1736         WARN_ON_ONCE(sibling->ctx != event->ctx);
1737     }
1738 
1739 out:
1740     perf_event__header_size(event->group_leader);
1741 
1742     list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1743         perf_event__header_size(tmp);
1744 }
1745 
1746 static bool is_orphaned_event(struct perf_event *event)
1747 {
1748     return event->state == PERF_EVENT_STATE_DEAD;
1749 }
1750 
1751 static inline int __pmu_filter_match(struct perf_event *event)
1752 {
1753     struct pmu *pmu = event->pmu;
1754     return pmu->filter_match ? pmu->filter_match(event) : 1;
1755 }
1756 
1757 /*
1758  * Check whether we should attempt to schedule an event group based on
1759  * PMU-specific filtering. An event group can consist of HW and SW events,
1760  * potentially with a SW leader, so we must check all the filters, to
1761  * determine whether a group is schedulable:
1762  */
1763 static inline int pmu_filter_match(struct perf_event *event)
1764 {
1765     struct perf_event *child;
1766 
1767     if (!__pmu_filter_match(event))
1768         return 0;
1769 
1770     list_for_each_entry(child, &event->sibling_list, group_entry) {
1771         if (!__pmu_filter_match(child))
1772             return 0;
1773     }
1774 
1775     return 1;
1776 }
1777 
1778 static inline int
1779 event_filter_match(struct perf_event *event)
1780 {
1781     return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
1782            perf_cgroup_match(event) && pmu_filter_match(event);
1783 }
1784 
1785 static void
1786 event_sched_out(struct perf_event *event,
1787           struct perf_cpu_context *cpuctx,
1788           struct perf_event_context *ctx)
1789 {
1790     u64 tstamp = perf_event_time(event);
1791     u64 delta;
1792 
1793     WARN_ON_ONCE(event->ctx != ctx);
1794     lockdep_assert_held(&ctx->lock);
1795 
1796     /*
1797      * An event which could not be activated because of
1798      * filter mismatch still needs to have its timings
1799      * maintained, otherwise bogus information is return
1800      * via read() for time_enabled, time_running:
1801      */
1802     if (event->state == PERF_EVENT_STATE_INACTIVE &&
1803         !event_filter_match(event)) {
1804         delta = tstamp - event->tstamp_stopped;
1805         event->tstamp_running += delta;
1806         event->tstamp_stopped = tstamp;
1807     }
1808 
1809     if (event->state != PERF_EVENT_STATE_ACTIVE)
1810         return;
1811 
1812     perf_pmu_disable(event->pmu);
1813 
1814     event->tstamp_stopped = tstamp;
1815     event->pmu->del(event, 0);
1816     event->oncpu = -1;
1817     event->state = PERF_EVENT_STATE_INACTIVE;
1818     if (event->pending_disable) {
1819         event->pending_disable = 0;
1820         event->state = PERF_EVENT_STATE_OFF;
1821     }
1822 
1823     if (!is_software_event(event))
1824         cpuctx->active_oncpu--;
1825     if (!--ctx->nr_active)
1826         perf_event_ctx_deactivate(ctx);
1827     if (event->attr.freq && event->attr.sample_freq)
1828         ctx->nr_freq--;
1829     if (event->attr.exclusive || !cpuctx->active_oncpu)
1830         cpuctx->exclusive = 0;
1831 
1832     perf_pmu_enable(event->pmu);
1833 }
1834 
1835 static void
1836 group_sched_out(struct perf_event *group_event,
1837         struct perf_cpu_context *cpuctx,
1838         struct perf_event_context *ctx)
1839 {
1840     struct perf_event *event;
1841     int state = group_event->state;
1842 
1843     perf_pmu_disable(ctx->pmu);
1844 
1845     event_sched_out(group_event, cpuctx, ctx);
1846 
1847     /*
1848      * Schedule out siblings (if any):
1849      */
1850     list_for_each_entry(event, &group_event->sibling_list, group_entry)
1851         event_sched_out(event, cpuctx, ctx);
1852 
1853     perf_pmu_enable(ctx->pmu);
1854 
1855     if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1856         cpuctx->exclusive = 0;
1857 }
1858 
1859 #define DETACH_GROUP    0x01UL
1860 
1861 /*
1862  * Cross CPU call to remove a performance event
1863  *
1864  * We disable the event on the hardware level first. After that we
1865  * remove it from the context list.
1866  */
1867 static void
1868 __perf_remove_from_context(struct perf_event *event,
1869                struct perf_cpu_context *cpuctx,
1870                struct perf_event_context *ctx,
1871                void *info)
1872 {
1873     unsigned long flags = (unsigned long)info;
1874 
1875     event_sched_out(event, cpuctx, ctx);
1876     if (flags & DETACH_GROUP)
1877         perf_group_detach(event);
1878     list_del_event(event, ctx);
1879 
1880     if (!ctx->nr_events && ctx->is_active) {
1881         ctx->is_active = 0;
1882         if (ctx->task) {
1883             WARN_ON_ONCE(cpuctx->task_ctx != ctx);
1884             cpuctx->task_ctx = NULL;
1885         }
1886     }
1887 }
1888 
1889 /*
1890  * Remove the event from a task's (or a CPU's) list of events.
1891  *
1892  * If event->ctx is a cloned context, callers must make sure that
1893  * every task struct that event->ctx->task could possibly point to
1894  * remains valid.  This is OK when called from perf_release since
1895  * that only calls us on the top-level context, which can't be a clone.
1896  * When called from perf_event_exit_task, it's OK because the
1897  * context has been detached from its task.
1898  */
1899 static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
1900 {
1901     struct perf_event_context *ctx = event->ctx;
1902 
1903     lockdep_assert_held(&ctx->mutex);
1904 
1905     event_function_call(event, __perf_remove_from_context, (void *)flags);
1906 
1907     /*
1908      * The above event_function_call() can NO-OP when it hits
1909      * TASK_TOMBSTONE. In that case we must already have been detached
1910      * from the context (by perf_event_exit_event()) but the grouping
1911      * might still be in-tact.
1912      */
1913     WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1914     if ((flags & DETACH_GROUP) &&
1915         (event->attach_state & PERF_ATTACH_GROUP)) {
1916         /*
1917          * Since in that case we cannot possibly be scheduled, simply
1918          * detach now.
1919          */
1920         raw_spin_lock_irq(&ctx->lock);
1921         perf_group_detach(event);
1922         raw_spin_unlock_irq(&ctx->lock);
1923     }
1924 }
1925 
1926 /*
1927  * Cross CPU call to disable a performance event
1928  */
1929 static void __perf_event_disable(struct perf_event *event,
1930                  struct perf_cpu_context *cpuctx,
1931                  struct perf_event_context *ctx,
1932                  void *info)
1933 {
1934     if (event->state < PERF_EVENT_STATE_INACTIVE)
1935         return;
1936 
1937     update_context_time(ctx);
1938     update_cgrp_time_from_event(event);
1939     update_group_times(event);
1940     if (event == event->group_leader)
1941         group_sched_out(event, cpuctx, ctx);
1942     else
1943         event_sched_out(event, cpuctx, ctx);
1944     event->state = PERF_EVENT_STATE_OFF;
1945 }
1946 
1947 /*
1948  * Disable a event.
1949  *
1950  * If event->ctx is a cloned context, callers must make sure that
1951  * every task struct that event->ctx->task could possibly point to
1952  * remains valid.  This condition is satisifed when called through
1953  * perf_event_for_each_child or perf_event_for_each because they
1954  * hold the top-level event's child_mutex, so any descendant that
1955  * goes to exit will block in perf_event_exit_event().
1956  *
1957  * When called from perf_pending_event it's OK because event->ctx
1958  * is the current context on this CPU and preemption is disabled,
1959  * hence we can't get into perf_event_task_sched_out for this context.
1960  */
1961 static void _perf_event_disable(struct perf_event *event)
1962 {
1963     struct perf_event_context *ctx = event->ctx;
1964 
1965     raw_spin_lock_irq(&ctx->lock);
1966     if (event->state <= PERF_EVENT_STATE_OFF) {
1967         raw_spin_unlock_irq(&ctx->lock);
1968         return;
1969     }
1970     raw_spin_unlock_irq(&ctx->lock);
1971 
1972     event_function_call(event, __perf_event_disable, NULL);
1973 }
1974 
1975 void perf_event_disable_local(struct perf_event *event)
1976 {
1977     event_function_local(event, __perf_event_disable, NULL);
1978 }
1979 
1980 /*
1981  * Strictly speaking kernel users cannot create groups and therefore this
1982  * interface does not need the perf_event_ctx_lock() magic.
1983  */
1984 void perf_event_disable(struct perf_event *event)
1985 {
1986     struct perf_event_context *ctx;
1987 
1988     ctx = perf_event_ctx_lock(event);
1989     _perf_event_disable(event);
1990     perf_event_ctx_unlock(event, ctx);
1991 }
1992 EXPORT_SYMBOL_GPL(perf_event_disable);
1993 
1994 void perf_event_disable_inatomic(struct perf_event *event)
1995 {
1996     event->pending_disable = 1;
1997     irq_work_queue(&event->pending);
1998 }
1999 
2000 static void perf_set_shadow_time(struct perf_event *event,
2001                  struct perf_event_context *ctx,
2002                  u64 tstamp)
2003 {
2004     /*
2005      * use the correct time source for the time snapshot
2006      *
2007      * We could get by without this by leveraging the
2008      * fact that to get to this function, the caller
2009      * has most likely already called update_context_time()
2010      * and update_cgrp_time_xx() and thus both timestamp
2011      * are identical (or very close). Given that tstamp is,
2012      * already adjusted for cgroup, we could say that:
2013      *    tstamp - ctx->timestamp
2014      * is equivalent to
2015      *    tstamp - cgrp->timestamp.
2016      *
2017      * Then, in perf_output_read(), the calculation would
2018      * work with no changes because:
2019      * - event is guaranteed scheduled in
2020      * - no scheduled out in between
2021      * - thus the timestamp would be the same
2022      *
2023      * But this is a bit hairy.
2024      *
2025      * So instead, we have an explicit cgroup call to remain
2026      * within the time time source all along. We believe it
2027      * is cleaner and simpler to understand.
2028      */
2029     if (is_cgroup_event(event))
2030         perf_cgroup_set_shadow_time(event, tstamp);
2031     else
2032         event->shadow_ctx_time = tstamp - ctx->timestamp;
2033 }
2034 
2035 #define MAX_INTERRUPTS (~0ULL)
2036 
2037 static void perf_log_throttle(struct perf_event *event, int enable);
2038 static void perf_log_itrace_start(struct perf_event *event);
2039 
2040 static int
2041 event_sched_in(struct perf_event *event,
2042          struct perf_cpu_context *cpuctx,
2043          struct perf_event_context *ctx)
2044 {
2045     u64 tstamp = perf_event_time(event);
2046     int ret = 0;
2047 
2048     lockdep_assert_held(&ctx->lock);
2049 
2050     if (event->state <= PERF_EVENT_STATE_OFF)
2051         return 0;
2052 
2053     WRITE_ONCE(event->oncpu, smp_processor_id());
2054     /*
2055      * Order event::oncpu write to happen before the ACTIVE state
2056      * is visible.
2057      */
2058     smp_wmb();
2059     WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
2060 
2061     /*
2062      * Unthrottle events, since we scheduled we might have missed several
2063      * ticks already, also for a heavily scheduling task there is little
2064      * guarantee it'll get a tick in a timely manner.
2065      */
2066     if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2067         perf_log_throttle(event, 1);
2068         event->hw.interrupts = 0;
2069     }
2070 
2071     /*
2072      * The new state must be visible before we turn it on in the hardware:
2073      */
2074     smp_wmb();
2075 
2076     perf_pmu_disable(event->pmu);
2077 
2078     perf_set_shadow_time(event, ctx, tstamp);
2079 
2080     perf_log_itrace_start(event);
2081 
2082     if (event->pmu->add(event, PERF_EF_START)) {
2083         event->state = PERF_EVENT_STATE_INACTIVE;
2084         event->oncpu = -1;
2085         ret = -EAGAIN;
2086         goto out;
2087     }
2088 
2089     event->tstamp_running += tstamp - event->tstamp_stopped;
2090 
2091     if (!is_software_event(event))
2092         cpuctx->active_oncpu++;
2093     if (!ctx->nr_active++)
2094         perf_event_ctx_activate(ctx);
2095     if (event->attr.freq && event->attr.sample_freq)
2096         ctx->nr_freq++;
2097 
2098     if (event->attr.exclusive)
2099         cpuctx->exclusive = 1;
2100 
2101 out:
2102     perf_pmu_enable(event->pmu);
2103 
2104     return ret;
2105 }
2106 
2107 static int
2108 group_sched_in(struct perf_event *group_event,
2109            struct perf_cpu_context *cpuctx,
2110            struct perf_event_context *ctx)
2111 {
2112     struct perf_event *event, *partial_group = NULL;
2113     struct pmu *pmu = ctx->pmu;
2114     u64 now = ctx->time;
2115     bool simulate = false;
2116 
2117     if (group_event->state == PERF_EVENT_STATE_OFF)
2118         return 0;
2119 
2120     pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2121 
2122     if (event_sched_in(group_event, cpuctx, ctx)) {
2123         pmu->cancel_txn(pmu);
2124         perf_mux_hrtimer_restart(cpuctx);
2125         return -EAGAIN;
2126     }
2127 
2128     /*
2129      * Schedule in siblings as one group (if any):
2130      */
2131     list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2132         if (event_sched_in(event, cpuctx, ctx)) {
2133             partial_group = event;
2134             goto group_error;
2135         }
2136     }
2137 
2138     if (!pmu->commit_txn(pmu))
2139         return 0;
2140 
2141 group_error:
2142     /*
2143      * Groups can be scheduled in as one unit only, so undo any
2144      * partial group before returning:
2145      * The events up to the failed event are scheduled out normally,
2146      * tstamp_stopped will be updated.
2147      *
2148      * The failed events and the remaining siblings need to have
2149      * their timings updated as if they had gone thru event_sched_in()
2150      * and event_sched_out(). This is required to get consistent timings
2151      * across the group. This also takes care of the case where the group
2152      * could never be scheduled by ensuring tstamp_stopped is set to mark
2153      * the time the event was actually stopped, such that time delta
2154      * calculation in update_event_times() is correct.
2155      */
2156     list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2157         if (event == partial_group)
2158             simulate = true;
2159 
2160         if (simulate) {
2161             event->tstamp_running += now - event->tstamp_stopped;
2162             event->tstamp_stopped = now;
2163         } else {
2164             event_sched_out(event, cpuctx, ctx);
2165         }
2166     }
2167     event_sched_out(group_event, cpuctx, ctx);
2168 
2169     pmu->cancel_txn(pmu);
2170 
2171     perf_mux_hrtimer_restart(cpuctx);
2172 
2173     return -EAGAIN;
2174 }
2175 
2176 /*
2177  * Work out whether we can put this event group on the CPU now.
2178  */
2179 static int group_can_go_on(struct perf_event *event,
2180                struct perf_cpu_context *cpuctx,
2181                int can_add_hw)
2182 {
2183     /*
2184      * Groups consisting entirely of software events can always go on.
2185      */
2186     if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2187         return 1;
2188     /*
2189      * If an exclusive group is already on, no other hardware
2190      * events can go on.
2191      */
2192     if (cpuctx->exclusive)
2193         return 0;
2194     /*
2195      * If this group is exclusive and there are already
2196      * events on the CPU, it can't go on.
2197      */
2198     if (event->attr.exclusive && cpuctx->active_oncpu)
2199         return 0;
2200     /*
2201      * Otherwise, try to add it if all previous groups were able
2202      * to go on.
2203      */
2204     return can_add_hw;
2205 }
2206 
2207 static void add_event_to_ctx(struct perf_event *event,
2208                    struct perf_event_context *ctx)
2209 {
2210     u64 tstamp = perf_event_time(event);
2211 
2212     list_add_event(event, ctx);
2213     perf_group_attach(event);
2214     event->tstamp_enabled = tstamp;
2215     event->tstamp_running = tstamp;
2216     event->tstamp_stopped = tstamp;
2217 }
2218 
2219 static void ctx_sched_out(struct perf_event_context *ctx,
2220               struct perf_cpu_context *cpuctx,
2221               enum event_type_t event_type);
2222 static void
2223 ctx_sched_in(struct perf_event_context *ctx,
2224          struct perf_cpu_context *cpuctx,
2225          enum event_type_t event_type,
2226          struct task_struct *task);
2227 
2228 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2229                    struct perf_event_context *ctx)
2230 {
2231     if (!cpuctx->task_ctx)
2232         return;
2233 
2234     if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2235         return;
2236 
2237     ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2238 }
2239 
2240 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2241                 struct perf_event_context *ctx,
2242                 struct task_struct *task)
2243 {
2244     cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2245     if (ctx)
2246         ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2247     cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2248     if (ctx)
2249         ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2250 }
2251 
2252 static void ctx_resched(struct perf_cpu_context *cpuctx,
2253             struct perf_event_context *task_ctx)
2254 {
2255     perf_pmu_disable(cpuctx->ctx.pmu);
2256     if (task_ctx)
2257         task_ctx_sched_out(cpuctx, task_ctx);
2258     cpu_ctx_sched_out(cpuctx, EVENT_ALL);
2259     perf_event_sched_in(cpuctx, task_ctx, current);
2260     perf_pmu_enable(cpuctx->ctx.pmu);
2261 }
2262 
2263 /*
2264  * Cross CPU call to install and enable a performance event
2265  *
2266  * Very similar to remote_function() + event_function() but cannot assume that
2267  * things like ctx->is_active and cpuctx->task_ctx are set.
2268  */
2269 static int  __perf_install_in_context(void *info)
2270 {
2271     struct perf_event *event = info;
2272     struct perf_event_context *ctx = event->ctx;
2273     struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2274     struct perf_event_context *task_ctx = cpuctx->task_ctx;
2275     bool reprogram = true;
2276     int ret = 0;
2277 
2278     raw_spin_lock(&cpuctx->ctx.lock);
2279     if (ctx->task) {
2280         raw_spin_lock(&ctx->lock);
2281         task_ctx = ctx;
2282 
2283         reprogram = (ctx->task == current);
2284 
2285         /*
2286          * If the task is running, it must be running on this CPU,
2287          * otherwise we cannot reprogram things.
2288          *
2289          * If its not running, we don't care, ctx->lock will
2290          * serialize against it becoming runnable.
2291          */
2292         if (task_curr(ctx->task) && !reprogram) {
2293             ret = -ESRCH;
2294             goto unlock;
2295         }
2296 
2297         WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2298     } else if (task_ctx) {
2299         raw_spin_lock(&task_ctx->lock);
2300     }
2301 
2302     if (reprogram) {
2303         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2304         add_event_to_ctx(event, ctx);
2305         ctx_resched(cpuctx, task_ctx);
2306     } else {
2307         add_event_to_ctx(event, ctx);
2308     }
2309 
2310 unlock:
2311     perf_ctx_unlock(cpuctx, task_ctx);
2312 
2313     return ret;
2314 }
2315 
2316 /*
2317  * Attach a performance event to a context.
2318  *
2319  * Very similar to event_function_call, see comment there.
2320  */
2321 static void
2322 perf_install_in_context(struct perf_event_context *ctx,
2323             struct perf_event *event,
2324             int cpu)
2325 {
2326     struct task_struct *task = READ_ONCE(ctx->task);
2327 
2328     lockdep_assert_held(&ctx->mutex);
2329 
2330     if (event->cpu != -1)
2331         event->cpu = cpu;
2332 
2333     /*
2334      * Ensures that if we can observe event->ctx, both the event and ctx
2335      * will be 'complete'. See perf_iterate_sb_cpu().
2336      */
2337     smp_store_release(&event->ctx, ctx);
2338 
2339     if (!task) {
2340         cpu_function_call(cpu, __perf_install_in_context, event);
2341         return;
2342     }
2343 
2344     /*
2345      * Should not happen, we validate the ctx is still alive before calling.
2346      */
2347     if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2348         return;
2349 
2350     /*
2351      * Installing events is tricky because we cannot rely on ctx->is_active
2352      * to be set in case this is the nr_events 0 -> 1 transition.
2353      *
2354      * Instead we use task_curr(), which tells us if the task is running.
2355      * However, since we use task_curr() outside of rq::lock, we can race
2356      * against the actual state. This means the result can be wrong.
2357      *
2358      * If we get a false positive, we retry, this is harmless.
2359      *
2360      * If we get a false negative, things are complicated. If we are after
2361      * perf_event_context_sched_in() ctx::lock will serialize us, and the
2362      * value must be correct. If we're before, it doesn't matter since
2363      * perf_event_context_sched_in() will program the counter.
2364      *
2365      * However, this hinges on the remote context switch having observed
2366      * our task->perf_event_ctxp[] store, such that it will in fact take
2367      * ctx::lock in perf_event_context_sched_in().
2368      *
2369      * We do this by task_function_call(), if the IPI fails to hit the task
2370      * we know any future context switch of task must see the
2371      * perf_event_ctpx[] store.
2372      */
2373 
2374     /*
2375      * This smp_mb() orders the task->perf_event_ctxp[] store with the
2376      * task_cpu() load, such that if the IPI then does not find the task
2377      * running, a future context switch of that task must observe the
2378      * store.
2379      */
2380     smp_mb();
2381 again:
2382     if (!task_function_call(task, __perf_install_in_context, event))
2383         return;
2384 
2385     raw_spin_lock_irq(&ctx->lock);
2386     task = ctx->task;
2387     if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2388         /*
2389          * Cannot happen because we already checked above (which also
2390          * cannot happen), and we hold ctx->mutex, which serializes us
2391          * against perf_event_exit_task_context().
2392          */
2393         raw_spin_unlock_irq(&ctx->lock);
2394         return;
2395     }
2396     /*
2397      * If the task is not running, ctx->lock will avoid it becoming so,
2398      * thus we can safely install the event.
2399      */
2400     if (task_curr(task)) {
2401         raw_spin_unlock_irq(&ctx->lock);
2402         goto again;
2403     }
2404     add_event_to_ctx(event, ctx);
2405     raw_spin_unlock_irq(&ctx->lock);
2406 }
2407 
2408 /*
2409  * Put a event into inactive state and update time fields.
2410  * Enabling the leader of a group effectively enables all
2411  * the group members that aren't explicitly disabled, so we
2412  * have to update their ->tstamp_enabled also.
2413  * Note: this works for group members as well as group leaders
2414  * since the non-leader members' sibling_lists will be empty.
2415  */
2416 static void __perf_event_mark_enabled(struct perf_event *event)
2417 {
2418     struct perf_event *sub;
2419     u64 tstamp = perf_event_time(event);
2420 
2421     event->state = PERF_EVENT_STATE_INACTIVE;
2422     event->tstamp_enabled = tstamp - event->total_time_enabled;
2423     list_for_each_entry(sub, &event->sibling_list, group_entry) {
2424         if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2425             sub->tstamp_enabled = tstamp - sub->total_time_enabled;
2426     }
2427 }
2428 
2429 /*
2430  * Cross CPU call to enable a performance event
2431  */
2432 static void __perf_event_enable(struct perf_event *event,
2433                 struct perf_cpu_context *cpuctx,
2434                 struct perf_event_context *ctx,
2435                 void *info)
2436 {
2437     struct perf_event *leader = event->group_leader;
2438     struct perf_event_context *task_ctx;
2439 
2440     if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2441         event->state <= PERF_EVENT_STATE_ERROR)
2442         return;
2443 
2444     if (ctx->is_active)
2445         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2446 
2447     __perf_event_mark_enabled(event);
2448 
2449     if (!ctx->is_active)
2450         return;
2451 
2452     if (!event_filter_match(event)) {
2453         if (is_cgroup_event(event))
2454             perf_cgroup_defer_enabled(event);
2455         ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2456         return;
2457     }
2458 
2459     /*
2460      * If the event is in a group and isn't the group leader,
2461      * then don't put it on unless the group is on.
2462      */
2463     if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2464         ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2465         return;
2466     }
2467 
2468     task_ctx = cpuctx->task_ctx;
2469     if (ctx->task)
2470         WARN_ON_ONCE(task_ctx != ctx);
2471 
2472     ctx_resched(cpuctx, task_ctx);
2473 }
2474 
2475 /*
2476  * Enable a event.
2477  *
2478  * If event->ctx is a cloned context, callers must make sure that
2479  * every task struct that event->ctx->task could possibly point to
2480  * remains valid.  This condition is satisfied when called through
2481  * perf_event_for_each_child or perf_event_for_each as described
2482  * for perf_event_disable.
2483  */
2484 static void _perf_event_enable(struct perf_event *event)
2485 {
2486     struct perf_event_context *ctx = event->ctx;
2487 
2488     raw_spin_lock_irq(&ctx->lock);
2489     if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2490         event->state <  PERF_EVENT_STATE_ERROR) {
2491         raw_spin_unlock_irq(&ctx->lock);
2492         return;
2493     }
2494 
2495     /*
2496      * If the event is in error state, clear that first.
2497      *
2498      * That way, if we see the event in error state below, we know that it
2499      * has gone back into error state, as distinct from the task having
2500      * been scheduled away before the cross-call arrived.
2501      */
2502     if (event->state == PERF_EVENT_STATE_ERROR)
2503         event->state = PERF_EVENT_STATE_OFF;
2504     raw_spin_unlock_irq(&ctx->lock);
2505 
2506     event_function_call(event, __perf_event_enable, NULL);
2507 }
2508 
2509 /*
2510  * See perf_event_disable();
2511  */
2512 void perf_event_enable(struct perf_event *event)
2513 {
2514     struct perf_event_context *ctx;
2515 
2516     ctx = perf_event_ctx_lock(event);
2517     _perf_event_enable(event);
2518     perf_event_ctx_unlock(event, ctx);
2519 }
2520 EXPORT_SYMBOL_GPL(perf_event_enable);
2521 
2522 struct stop_event_data {
2523     struct perf_event   *event;
2524     unsigned int        restart;
2525 };
2526 
2527 static int __perf_event_stop(void *info)
2528 {
2529     struct stop_event_data *sd = info;
2530     struct perf_event *event = sd->event;
2531 
2532     /* if it's already INACTIVE, do nothing */
2533     if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2534         return 0;
2535 
2536     /* matches smp_wmb() in event_sched_in() */
2537     smp_rmb();
2538 
2539     /*
2540      * There is a window with interrupts enabled before we get here,
2541      * so we need to check again lest we try to stop another CPU's event.
2542      */
2543     if (READ_ONCE(event->oncpu) != smp_processor_id())
2544         return -EAGAIN;
2545 
2546     event->pmu->stop(event, PERF_EF_UPDATE);
2547 
2548     /*
2549      * May race with the actual stop (through perf_pmu_output_stop()),
2550      * but it is only used for events with AUX ring buffer, and such
2551      * events will refuse to restart because of rb::aux_mmap_count==0,
2552      * see comments in perf_aux_output_begin().
2553      *
2554      * Since this is happening on a event-local CPU, no trace is lost
2555      * while restarting.
2556      */
2557     if (sd->restart)
2558         event->pmu->start(event, 0);
2559 
2560     return 0;
2561 }
2562 
2563 static int perf_event_stop(struct perf_event *event, int restart)
2564 {
2565     struct stop_event_data sd = {
2566         .event      = event,
2567         .restart    = restart,
2568     };
2569     int ret = 0;
2570 
2571     do {
2572         if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2573             return 0;
2574 
2575         /* matches smp_wmb() in event_sched_in() */
2576         smp_rmb();
2577 
2578         /*
2579          * We only want to restart ACTIVE events, so if the event goes
2580          * inactive here (event->oncpu==-1), there's nothing more to do;
2581          * fall through with ret==-ENXIO.
2582          */
2583         ret = cpu_function_call(READ_ONCE(event->oncpu),
2584                     __perf_event_stop, &sd);
2585     } while (ret == -EAGAIN);
2586 
2587     return ret;
2588 }
2589 
2590 /*
2591  * In order to contain the amount of racy and tricky in the address filter
2592  * configuration management, it is a two part process:
2593  *
2594  * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
2595  *      we update the addresses of corresponding vmas in
2596  *  event::addr_filters_offs array and bump the event::addr_filters_gen;
2597  * (p2) when an event is scheduled in (pmu::add), it calls
2598  *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
2599  *      if the generation has changed since the previous call.
2600  *
2601  * If (p1) happens while the event is active, we restart it to force (p2).
2602  *
2603  * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
2604  *     pre-existing mappings, called once when new filters arrive via SET_FILTER
2605  *     ioctl;
2606  * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
2607  *     registered mapping, called for every new mmap(), with mm::mmap_sem down
2608  *     for reading;
2609  * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
2610  *     of exec.
2611  */
2612 void perf_event_addr_filters_sync(struct perf_event *event)
2613 {
2614     struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
2615 
2616     if (!has_addr_filter(event))
2617         return;
2618 
2619     raw_spin_lock(&ifh->lock);
2620     if (event->addr_filters_gen != event->hw.addr_filters_gen) {
2621         event->pmu->addr_filters_sync(event);
2622         event->hw.addr_filters_gen = event->addr_filters_gen;
2623     }
2624     raw_spin_unlock(&ifh->lock);
2625 }
2626 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
2627 
2628 static int _perf_event_refresh(struct perf_event *event, int refresh)
2629 {
2630     /*
2631      * not supported on inherited events
2632      */
2633     if (event->attr.inherit || !is_sampling_event(event))
2634         return -EINVAL;
2635 
2636     atomic_add(refresh, &event->event_limit);
2637     _perf_event_enable(event);
2638 
2639     return 0;
2640 }
2641 
2642 /*
2643  * See perf_event_disable()
2644  */
2645 int perf_event_refresh(struct perf_event *event, int refresh)
2646 {
2647     struct perf_event_context *ctx;
2648     int ret;
2649 
2650     ctx = perf_event_ctx_lock(event);
2651     ret = _perf_event_refresh(event, refresh);
2652     perf_event_ctx_unlock(event, ctx);
2653 
2654     return ret;
2655 }
2656 EXPORT_SYMBOL_GPL(perf_event_refresh);
2657 
2658 static void ctx_sched_out(struct perf_event_context *ctx,
2659               struct perf_cpu_context *cpuctx,
2660               enum event_type_t event_type)
2661 {
2662     int is_active = ctx->is_active;
2663     struct perf_event *event;
2664 
2665     lockdep_assert_held(&ctx->lock);
2666 
2667     if (likely(!ctx->nr_events)) {
2668         /*
2669          * See __perf_remove_from_context().
2670          */
2671         WARN_ON_ONCE(ctx->is_active);
2672         if (ctx->task)
2673             WARN_ON_ONCE(cpuctx->task_ctx);
2674         return;
2675     }
2676 
2677     ctx->is_active &= ~event_type;
2678     if (!(ctx->is_active & EVENT_ALL))
2679         ctx->is_active = 0;
2680 
2681     if (ctx->task) {
2682         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2683         if (!ctx->is_active)
2684             cpuctx->task_ctx = NULL;
2685     }
2686 
2687     /*
2688      * Always update time if it was set; not only when it changes.
2689      * Otherwise we can 'forget' to update time for any but the last
2690      * context we sched out. For example:
2691      *
2692      *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
2693      *   ctx_sched_out(.event_type = EVENT_PINNED)
2694      *
2695      * would only update time for the pinned events.
2696      */
2697     if (is_active & EVENT_TIME) {
2698         /* update (and stop) ctx time */
2699         update_context_time(ctx);
2700         update_cgrp_time_from_cpuctx(cpuctx);
2701     }
2702 
2703     is_active ^= ctx->is_active; /* changed bits */
2704 
2705     if (!ctx->nr_active || !(is_active & EVENT_ALL))
2706         return;
2707 
2708     perf_pmu_disable(ctx->pmu);
2709     if (is_active & EVENT_PINNED) {
2710         list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2711             group_sched_out(event, cpuctx, ctx);
2712     }
2713 
2714     if (is_active & EVENT_FLEXIBLE) {
2715         list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2716             group_sched_out(event, cpuctx, ctx);
2717     }
2718     perf_pmu_enable(ctx->pmu);
2719 }
2720 
2721 /*
2722  * Test whether two contexts are equivalent, i.e. whether they have both been
2723  * cloned from the same version of the same context.
2724  *
2725  * Equivalence is measured using a generation number in the context that is
2726  * incremented on each modification to it; see unclone_ctx(), list_add_event()
2727  * and list_del_event().
2728  */
2729 static int context_equiv(struct perf_event_context *ctx1,
2730              struct perf_event_context *ctx2)
2731 {
2732     lockdep_assert_held(&ctx1->lock);
2733     lockdep_assert_held(&ctx2->lock);
2734 
2735     /* Pinning disables the swap optimization */
2736     if (ctx1->pin_count || ctx2->pin_count)
2737         return 0;
2738 
2739     /* If ctx1 is the parent of ctx2 */
2740     if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2741         return 1;
2742 
2743     /* If ctx2 is the parent of ctx1 */
2744     if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2745         return 1;
2746 
2747     /*
2748      * If ctx1 and ctx2 have the same parent; we flatten the parent
2749      * hierarchy, see perf_event_init_context().
2750      */
2751     if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2752             ctx1->parent_gen == ctx2->parent_gen)
2753         return 1;
2754 
2755     /* Unmatched */
2756     return 0;
2757 }
2758 
2759 static void __perf_event_sync_stat(struct perf_event *event,
2760                      struct perf_event *next_event)
2761 {
2762     u64 value;
2763 
2764     if (!event->attr.inherit_stat)
2765         return;
2766 
2767     /*
2768      * Update the event value, we cannot use perf_event_read()
2769      * because we're in the middle of a context switch and have IRQs
2770      * disabled, which upsets smp_call_function_single(), however
2771      * we know the event must be on the current CPU, therefore we
2772      * don't need to use it.
2773      */
2774     switch (event->state) {
2775     case PERF_EVENT_STATE_ACTIVE:
2776         event->pmu->read(event);
2777         /* fall-through */
2778 
2779     case PERF_EVENT_STATE_INACTIVE:
2780         update_event_times(event);
2781         break;
2782 
2783     default:
2784         break;
2785     }
2786 
2787     /*
2788      * In order to keep per-task stats reliable we need to flip the event
2789      * values when we flip the contexts.
2790      */
2791     value = local64_read(&next_event->count);
2792     value = local64_xchg(&event->count, value);
2793     local64_set(&next_event->count, value);
2794 
2795     swap(event->total_time_enabled, next_event->total_time_enabled);
2796     swap(event->total_time_running, next_event->total_time_running);
2797 
2798     /*
2799      * Since we swizzled the values, update the user visible data too.
2800      */
2801     perf_event_update_userpage(event);
2802     perf_event_update_userpage(next_event);
2803 }
2804 
2805 static void perf_event_sync_stat(struct perf_event_context *ctx,
2806                    struct perf_event_context *next_ctx)
2807 {
2808     struct perf_event *event, *next_event;
2809 
2810     if (!ctx->nr_stat)
2811         return;
2812 
2813     update_context_time(ctx);
2814 
2815     event = list_first_entry(&ctx->event_list,
2816                    struct perf_event, event_entry);
2817 
2818     next_event = list_first_entry(&next_ctx->event_list,
2819                     struct perf_event, event_entry);
2820 
2821     while (&event->event_entry != &ctx->event_list &&
2822            &next_event->event_entry != &next_ctx->event_list) {
2823 
2824         __perf_event_sync_stat(event, next_event);
2825 
2826         event = list_next_entry(event, event_entry);
2827         next_event = list_next_entry(next_event, event_entry);
2828     }
2829 }
2830 
2831 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2832                      struct task_struct *next)
2833 {
2834     struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2835     struct perf_event_context *next_ctx;
2836     struct perf_event_context *parent, *next_parent;
2837     struct perf_cpu_context *cpuctx;
2838     int do_switch = 1;
2839 
2840     if (likely(!ctx))
2841         return;
2842 
2843     cpuctx = __get_cpu_context(ctx);
2844     if (!cpuctx->task_ctx)
2845         return;
2846 
2847     rcu_read_lock();
2848     next_ctx = next->perf_event_ctxp[ctxn];
2849     if (!next_ctx)
2850         goto unlock;
2851 
2852     parent = rcu_dereference(ctx->parent_ctx);
2853     next_parent = rcu_dereference(next_ctx->parent_ctx);
2854 
2855     /* If neither context have a parent context; they cannot be clones. */
2856     if (!parent && !next_parent)
2857         goto unlock;
2858 
2859     if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2860         /*
2861          * Looks like the two contexts are clones, so we might be
2862          * able to optimize the context switch.  We lock both
2863          * contexts and check that they are clones under the
2864          * lock (including re-checking that neither has been
2865          * uncloned in the meantime).  It doesn't matter which
2866          * order we take the locks because no other cpu could
2867          * be trying to lock both of these tasks.
2868          */
2869         raw_spin_lock(&ctx->lock);
2870         raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2871         if (context_equiv(ctx, next_ctx)) {
2872             WRITE_ONCE(ctx->task, next);
2873             WRITE_ONCE(next_ctx->task, task);
2874 
2875             swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2876 
2877             /*
2878              * RCU_INIT_POINTER here is safe because we've not
2879              * modified the ctx and the above modification of
2880              * ctx->task and ctx->task_ctx_data are immaterial
2881              * since those values are always verified under
2882              * ctx->lock which we're now holding.
2883              */
2884             RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
2885             RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
2886 
2887             do_switch = 0;
2888 
2889             perf_event_sync_stat(ctx, next_ctx);
2890         }
2891         raw_spin_unlock(&next_ctx->lock);
2892         raw_spin_unlock(&ctx->lock);
2893     }
2894 unlock:
2895     rcu_read_unlock();
2896 
2897     if (do_switch) {
2898         raw_spin_lock(&ctx->lock);
2899         task_ctx_sched_out(cpuctx, ctx);
2900         raw_spin_unlock(&ctx->lock);
2901     }
2902 }
2903 
2904 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
2905 
2906 void perf_sched_cb_dec(struct pmu *pmu)
2907 {
2908     struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2909 
2910     this_cpu_dec(perf_sched_cb_usages);
2911 
2912     if (!--cpuctx->sched_cb_usage)
2913         list_del(&cpuctx->sched_cb_entry);
2914 }
2915 
2916 
2917 void perf_sched_cb_inc(struct pmu *pmu)
2918 {
2919     struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2920 
2921     if (!cpuctx->sched_cb_usage++)
2922         list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
2923 
2924     this_cpu_inc(perf_sched_cb_usages);
2925 }
2926 
2927 /*
2928  * This function provides the context switch callback to the lower code
2929  * layer. It is invoked ONLY when the context switch callback is enabled.
2930  *
2931  * This callback is relevant even to per-cpu events; for example multi event
2932  * PEBS requires this to provide PID/TID information. This requires we flush
2933  * all queued PEBS records before we context switch to a new task.
2934  */
2935 static void perf_pmu_sched_task(struct task_struct *prev,
2936                 struct task_struct *next,
2937                 bool sched_in)
2938 {
2939     struct perf_cpu_context *cpuctx;
2940     struct pmu *pmu;
2941 
2942     if (prev == next)
2943         return;
2944 
2945     list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
2946         pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
2947 
2948         if (WARN_ON_ONCE(!pmu->sched_task))
2949             continue;
2950 
2951         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2952         perf_pmu_disable(pmu);
2953 
2954         pmu->sched_task(cpuctx->task_ctx, sched_in);
2955 
2956         perf_pmu_enable(pmu);
2957         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2958     }
2959 }
2960 
2961 static void perf_event_switch(struct task_struct *task,
2962                   struct task_struct *next_prev, bool sched_in);
2963 
2964 #define for_each_task_context_nr(ctxn)                  \
2965     for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2966 
2967 /*
2968  * Called from scheduler to remove the events of the current task,
2969  * with interrupts disabled.
2970  *
2971  * We stop each event and update the event value in event->count.
2972  *
2973  * This does not protect us against NMI, but disable()
2974  * sets the disabled bit in the control field of event _before_
2975  * accessing the event control register. If a NMI hits, then it will
2976  * not restart the event.
2977  */
2978 void __perf_event_task_sched_out(struct task_struct *task,
2979                  struct task_struct *next)
2980 {
2981     int ctxn;
2982 
2983     if (__this_cpu_read(perf_sched_cb_usages))
2984         perf_pmu_sched_task(task, next, false);
2985 
2986     if (atomic_read(&nr_switch_events))
2987         perf_event_switch(task, next, false);
2988 
2989     for_each_task_context_nr(ctxn)
2990         perf_event_context_sched_out(task, ctxn, next);
2991 
2992     /*
2993      * if cgroup events exist on this CPU, then we need
2994      * to check if we have to switch out PMU state.
2995      * cgroup event are system-wide mode only
2996      */
2997     if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2998         perf_cgroup_sched_out(task, next);
2999 }
3000 
3001 /*
3002  * Called with IRQs disabled
3003  */
3004 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3005                   enum event_type_t event_type)
3006 {
3007     ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3008 }
3009 
3010 static void
3011 ctx_pinned_sched_in(struct perf_event_context *ctx,
3012             struct perf_cpu_context *cpuctx)
3013 {
3014     struct perf_event *event;
3015 
3016     list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
3017         if (event->state <= PERF_EVENT_STATE_OFF)
3018             continue;
3019         if (!event_filter_match(event))
3020             continue;
3021 
3022         /* may need to reset tstamp_enabled */
3023         if (is_cgroup_event(event))
3024             perf_cgroup_mark_enabled(event, ctx);
3025 
3026         if (group_can_go_on(event, cpuctx, 1))
3027             group_sched_in(event, cpuctx, ctx);
3028 
3029         /*
3030          * If this pinned group hasn't been scheduled,
3031          * put it in error state.
3032          */
3033         if (event->state == PERF_EVENT_STATE_INACTIVE) {
3034             update_group_times(event);
3035             event->state = PERF_EVENT_STATE_ERROR;
3036         }
3037     }
3038 }
3039 
3040 static void
3041 ctx_flexible_sched_in(struct perf_event_context *ctx,
3042               struct perf_cpu_context *cpuctx)
3043 {
3044     struct perf_event *event;
3045     int can_add_hw = 1;
3046 
3047     list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
3048         /* Ignore events in OFF or ERROR state */
3049         if (event->state <= PERF_EVENT_STATE_OFF)
3050             continue;
3051         /*
3052          * Listen to the 'cpu' scheduling filter constraint
3053          * of events:
3054          */
3055         if (!event_filter_match(event))
3056             continue;
3057 
3058         /* may need to reset tstamp_enabled */
3059         if (is_cgroup_event(event))
3060             perf_cgroup_mark_enabled(event, ctx);
3061 
3062         if (group_can_go_on(event, cpuctx, can_add_hw)) {
3063             if (group_sched_in(event, cpuctx, ctx))
3064                 can_add_hw = 0;
3065         }
3066     }
3067 }
3068 
3069 static void
3070 ctx_sched_in(struct perf_event_context *ctx,
3071          struct perf_cpu_context *cpuctx,
3072          enum event_type_t event_type,
3073          struct task_struct *task)
3074 {
3075     int is_active = ctx->is_active;
3076     u64 now;
3077 
3078     lockdep_assert_held(&ctx->lock);
3079 
3080     if (likely(!ctx->nr_events))
3081         return;
3082 
3083     ctx->is_active |= (event_type | EVENT_TIME);
3084     if (ctx->task) {
3085         if (!is_active)
3086             cpuctx->task_ctx = ctx;
3087         else
3088             WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3089     }
3090 
3091     is_active ^= ctx->is_active; /* changed bits */
3092 
3093     if (is_active & EVENT_TIME) {
3094         /* start ctx time */
3095         now = perf_clock();
3096         ctx->timestamp = now;
3097         perf_cgroup_set_timestamp(task, ctx);
3098     }
3099 
3100     /*
3101      * First go through the list and put on any pinned groups
3102      * in order to give them the best chance of going on.
3103      */
3104     if (is_active & EVENT_PINNED)
3105         ctx_pinned_sched_in(ctx, cpuctx);
3106 
3107     /* Then walk through the lower prio flexible groups */
3108     if (is_active & EVENT_FLEXIBLE)
3109         ctx_flexible_sched_in(ctx, cpuctx);
3110 }
3111 
3112 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3113                  enum event_type_t event_type,
3114                  struct task_struct *task)
3115 {
3116     struct perf_event_context *ctx = &cpuctx->ctx;
3117 
3118     ctx_sched_in(ctx, cpuctx, event_type, task);
3119 }
3120 
3121 static void perf_event_context_sched_in(struct perf_event_context *ctx,
3122                     struct task_struct *task)
3123 {
3124     struct perf_cpu_context *cpuctx;
3125 
3126     cpuctx = __get_cpu_context(ctx);
3127     if (cpuctx->task_ctx == ctx)
3128         return;
3129 
3130     perf_ctx_lock(cpuctx, ctx);
3131     perf_pmu_disable(ctx->pmu);
3132     /*
3133      * We want to keep the following priority order:
3134      * cpu pinned (that don't need to move), task pinned,
3135      * cpu flexible, task flexible.
3136      */
3137     cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3138     perf_event_sched_in(cpuctx, ctx, task);
3139     perf_pmu_enable(ctx->pmu);
3140     perf_ctx_unlock(cpuctx, ctx);
3141 }
3142 
3143 /*
3144  * Called from scheduler to add the events of the current task
3145  * with interrupts disabled.
3146  *
3147  * We restore the event value and then enable it.
3148  *
3149  * This does not protect us against NMI, but enable()
3150  * sets the enabled bit in the control field of event _before_
3151  * accessing the event control register. If a NMI hits, then it will
3152  * keep the event running.
3153  */
3154 void __perf_event_task_sched_in(struct task_struct *prev,
3155                 struct task_struct *task)
3156 {
3157     struct perf_event_context *ctx;
3158     int ctxn;
3159 
3160     /*
3161      * If cgroup events exist on this CPU, then we need to check if we have
3162      * to switch in PMU state; cgroup event are system-wide mode only.
3163      *
3164      * Since cgroup events are CPU events, we must schedule these in before
3165      * we schedule in the task events.
3166      */
3167     if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3168         perf_cgroup_sched_in(prev, task);
3169 
3170     for_each_task_context_nr(ctxn) {
3171         ctx = task->perf_event_ctxp[ctxn];
3172         if (likely(!ctx))
3173             continue;
3174 
3175         perf_event_context_sched_in(ctx, task);
3176     }
3177 
3178     if (atomic_read(&nr_switch_events))
3179         perf_event_switch(task, prev, true);
3180 
3181     if (__this_cpu_read(perf_sched_cb_usages))
3182         perf_pmu_sched_task(prev, task, true);
3183 }
3184 
3185 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3186 {
3187     u64 frequency = event->attr.sample_freq;
3188     u64 sec = NSEC_PER_SEC;
3189     u64 divisor, dividend;
3190 
3191     int count_fls, nsec_fls, frequency_fls, sec_fls;
3192 
3193     count_fls = fls64(count);
3194     nsec_fls = fls64(nsec);
3195     frequency_fls = fls64(frequency);
3196     sec_fls = 30;
3197 
3198     /*
3199      * We got @count in @nsec, with a target of sample_freq HZ
3200      * the target period becomes:
3201      *
3202      *             @count * 10^9
3203      * period = -------------------
3204      *          @nsec * sample_freq
3205      *
3206      */
3207 
3208     /*
3209      * Reduce accuracy by one bit such that @a and @b converge
3210      * to a similar magnitude.
3211      */
3212 #define REDUCE_FLS(a, b)        \
3213 do {                    \
3214     if (a##_fls > b##_fls) {    \
3215         a >>= 1;        \
3216         a##_fls--;      \
3217     } else {            \
3218         b >>= 1;        \
3219         b##_fls--;      \
3220     }               \
3221 } while (0)
3222 
3223     /*
3224      * Reduce accuracy until either term fits in a u64, then proceed with
3225      * the other, so that finally we can do a u64/u64 division.
3226      */
3227     while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3228         REDUCE_FLS(nsec, frequency);
3229         REDUCE_FLS(sec, count);
3230     }
3231 
3232     if (count_fls + sec_fls > 64) {
3233         divisor = nsec * frequency;
3234 
3235         while (count_fls + sec_fls > 64) {
3236             REDUCE_FLS(count, sec);
3237             divisor >>= 1;
3238         }
3239 
3240         dividend = count * sec;
3241     } else {
3242         dividend = count * sec;
3243 
3244         while (nsec_fls + frequency_fls > 64) {
3245             REDUCE_FLS(nsec, frequency);
3246             dividend >>= 1;
3247         }
3248 
3249         divisor = nsec * frequency;
3250     }
3251 
3252     if (!divisor)
3253         return dividend;
3254 
3255     return div64_u64(dividend, divisor);
3256 }
3257 
3258 static DEFINE_PER_CPU(int, perf_throttled_count);
3259 static DEFINE_PER_CPU(u64, perf_throttled_seq);
3260 
3261 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3262 {
3263     struct hw_perf_event *hwc = &event->hw;
3264     s64 period, sample_period;
3265     s64 delta;
3266 
3267     period = perf_calculate_period(event, nsec, count);
3268 
3269     delta = (s64)(period - hwc->sample_period);
3270     delta = (delta + 7) / 8; /* low pass filter */
3271 
3272     sample_period = hwc->sample_period + delta;
3273 
3274     if (!sample_period)
3275         sample_period = 1;
3276 
3277     hwc->sample_period = sample_period;
3278 
3279     if (local64_read(&hwc->period_left) > 8*sample_period) {
3280         if (disable)
3281             event->pmu->stop(event, PERF_EF_UPDATE);
3282 
3283         local64_set(&hwc->period_left, 0);
3284 
3285         if (disable)
3286             event->pmu->start(event, PERF_EF_RELOAD);
3287     }
3288 }
3289 
3290 /*
3291  * combine freq adjustment with unthrottling to avoid two passes over the
3292  * events. At the same time, make sure, having freq events does not change
3293  * the rate of unthrottling as that would introduce bias.
3294  */
3295 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3296                        int needs_unthr)
3297 {
3298     struct perf_event *event;
3299     struct hw_perf_event *hwc;
3300     u64 now, period = TICK_NSEC;
3301     s64 delta;
3302 
3303     /*
3304      * only need to iterate over all events iff:
3305      * - context have events in frequency mode (needs freq adjust)
3306      * - there are events to unthrottle on this cpu
3307      */
3308     if (!(ctx->nr_freq || needs_unthr))
3309         return;
3310 
3311     raw_spin_lock(&ctx->lock);
3312     perf_pmu_disable(ctx->pmu);
3313 
3314     list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3315         if (event->state != PERF_EVENT_STATE_ACTIVE)
3316             continue;
3317 
3318         if (!event_filter_match(event))
3319             continue;
3320 
3321         perf_pmu_disable(event->pmu);
3322 
3323         hwc = &event->hw;
3324 
3325         if (hwc->interrupts == MAX_INTERRUPTS) {
3326             hwc->interrupts = 0;
3327             perf_log_throttle(event, 1);
3328             event->pmu->start(event, 0);
3329         }
3330 
3331         if (!event->attr.freq || !event->attr.sample_freq)
3332             goto next;
3333 
3334         /*
3335          * stop the event and update event->count
3336          */
3337         event->pmu->stop(event, PERF_EF_UPDATE);
3338 
3339         now = local64_read(&event->count);
3340         delta = now - hwc->freq_count_stamp;
3341         hwc->freq_count_stamp = now;
3342 
3343         /*
3344          * restart the event
3345          * reload only if value has changed
3346          * we have stopped the event so tell that
3347          * to perf_adjust_period() to avoid stopping it
3348          * twice.
3349          */
3350         if (delta > 0)
3351             perf_adjust_period(event, period, delta, false);
3352 
3353         event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3354     next:
3355         perf_pmu_enable(event->pmu);
3356     }
3357 
3358     perf_pmu_enable(ctx->pmu);
3359     raw_spin_unlock(&ctx->lock);
3360 }
3361 
3362 /*
3363  * Round-robin a context's events:
3364  */
3365 static void rotate_ctx(struct perf_event_context *ctx)
3366 {
3367     /*
3368      * Rotate the first entry last of non-pinned groups. Rotation might be
3369      * disabled by the inheritance code.
3370      */
3371     if (!ctx->rotate_disable)
3372         list_rotate_left(&ctx->flexible_groups);
3373 }
3374 
3375 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
3376 {
3377     struct perf_event_context *ctx = NULL;
3378     int rotate = 0;
3379 
3380     if (cpuctx->ctx.nr_events) {
3381         if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3382             rotate = 1;
3383     }
3384 
3385     ctx = cpuctx->task_ctx;
3386     if (ctx && ctx->nr_events) {
3387         if (ctx->nr_events != ctx->nr_active)
3388             rotate = 1;
3389     }
3390 
3391     if (!rotate)
3392         goto done;
3393 
3394     perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3395     perf_pmu_disable(cpuctx->ctx.pmu);
3396 
3397     cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3398     if (ctx)
3399         ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3400 
3401     rotate_ctx(&cpuctx->ctx);
3402     if (ctx)
3403         rotate_ctx(ctx);
3404 
3405     perf_event_sched_in(cpuctx, ctx, current);
3406 
3407     perf_pmu_enable(cpuctx->ctx.pmu);
3408     perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3409 done:
3410 
3411     return rotate;
3412 }
3413 
3414 void perf_event_task_tick(void)
3415 {
3416     struct list_head *head = this_cpu_ptr(&active_ctx_list);
3417     struct perf_event_context *ctx, *tmp;
3418     int throttled;
3419 
3420     WARN_ON(!irqs_disabled());
3421 
3422     __this_cpu_inc(perf_throttled_seq);
3423     throttled = __this_cpu_xchg(perf_throttled_count, 0);
3424     tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
3425 
3426     list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3427         perf_adjust_freq_unthr_context(ctx, throttled);
3428 }
3429 
3430 static int event_enable_on_exec(struct perf_event *event,
3431                 struct perf_event_context *ctx)
3432 {
3433     if (!event->attr.enable_on_exec)
3434         return 0;
3435 
3436     event->attr.enable_on_exec = 0;
3437     if (event->state >= PERF_EVENT_STATE_INACTIVE)
3438         return 0;
3439 
3440     __perf_event_mark_enabled(event);
3441 
3442     return 1;
3443 }
3444 
3445 /*
3446  * Enable all of a task's events that have been marked enable-on-exec.
3447  * This expects task == current.
3448  */
3449 static void perf_event_enable_on_exec(int ctxn)
3450 {
3451     struct perf_event_context *ctx, *clone_ctx = NULL;
3452     struct perf_cpu_context *cpuctx;
3453     struct perf_event *event;
3454     unsigned long flags;
3455     int enabled = 0;
3456 
3457     local_irq_save(flags);
3458     ctx = current->perf_event_ctxp[ctxn];
3459     if (!ctx || !ctx->nr_events)
3460         goto out;
3461 
3462     cpuctx = __get_cpu_context(ctx);
3463     perf_ctx_lock(cpuctx, ctx);
3464     ctx_sched_out(ctx, cpuctx, EVENT_TIME);
3465     list_for_each_entry(event, &ctx->event_list, event_entry)
3466         enabled |= event_enable_on_exec(event, ctx);
3467 
3468     /*
3469      * Unclone and reschedule this context if we enabled any event.
3470      */
3471     if (enabled) {
3472         clone_ctx = unclone_ctx(ctx);
3473         ctx_resched(cpuctx, ctx);
3474     }
3475     perf_ctx_unlock(cpuctx, ctx);
3476 
3477 out:
3478     local_irq_restore(flags);
3479 
3480     if (clone_ctx)
3481         put_ctx(clone_ctx);
3482 }
3483 
3484 struct perf_read_data {
3485     struct perf_event *event;
3486     bool group;
3487     int ret;
3488 };
3489 
3490 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
3491 {
3492     u16 local_pkg, event_pkg;
3493 
3494     if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
3495         int local_cpu = smp_processor_id();
3496 
3497         event_pkg = topology_physical_package_id(event_cpu);
3498         local_pkg = topology_physical_package_id(local_cpu);
3499 
3500         if (event_pkg == local_pkg)
3501             return local_cpu;
3502     }
3503 
3504     return event_cpu;
3505 }
3506 
3507 /*
3508  * Cross CPU call to read the hardware event
3509  */
3510 static void __perf_event_read(void *info)
3511 {
3512     struct perf_read_data *data = info;
3513     struct perf_event *sub, *event = data->event;
3514     struct perf_event_context *ctx = event->ctx;
3515     struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3516     struct pmu *pmu = event->pmu;
3517 
3518     /*
3519      * If this is a task context, we need to check whether it is
3520      * the current task context of this cpu.  If not it has been
3521      * scheduled out before the smp call arrived.  In that case
3522      * event->count would have been updated to a recent sample
3523      * when the event was scheduled out.
3524      */
3525     if (ctx->task && cpuctx->task_ctx != ctx)
3526         return;
3527 
3528     raw_spin_lock(&ctx->lock);
3529     if (ctx->is_active) {
3530         update_context_time(ctx);
3531         update_cgrp_time_from_event(event);
3532     }
3533 
3534     update_event_times(event);
3535     if (event->state != PERF_EVENT_STATE_ACTIVE)
3536         goto unlock;
3537 
3538     if (!data->group) {
3539         pmu->read(event);
3540         data->ret = 0;
3541         goto unlock;
3542     }
3543 
3544     pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3545 
3546     pmu->read(event);
3547 
3548     list_for_each_entry(sub, &event->sibling_list, group_entry) {
3549         update_event_times(sub);
3550         if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3551             /*
3552              * Use sibling's PMU rather than @event's since
3553              * sibling could be on different (eg: software) PMU.
3554              */
3555             sub->pmu->read(sub);
3556         }
3557     }
3558 
3559     data->ret = pmu->commit_txn(pmu);
3560 
3561 unlock:
3562     raw_spin_unlock(&ctx->lock);
3563 }
3564 
3565 static inline u64 perf_event_count(struct perf_event *event)
3566 {
3567     if (event->pmu->count)
3568         return event->pmu->count(event);
3569 
3570     return __perf_event_count(event);
3571 }
3572 
3573 /*
3574  * NMI-safe method to read a local event, that is an event that
3575  * is:
3576  *   - either for the current task, or for this CPU
3577  *   - does not have inherit set, for inherited task events
3578  *     will not be local and we cannot read them atomically
3579  *   - must not have a pmu::count method
3580  */
3581 u64 perf_event_read_local(struct perf_event *event)
3582 {
3583     unsigned long flags;
3584     u64 val;
3585 
3586     /*
3587      * Disabling interrupts avoids all counter scheduling (context
3588      * switches, timer based rotation and IPIs).
3589      */
3590     local_irq_save(flags);
3591 
3592     /* If this is a per-task event, it must be for current */
3593     WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
3594              event->hw.target != current);
3595 
3596     /* If this is a per-CPU event, it must be for this CPU */
3597     WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
3598              event->cpu != smp_processor_id());
3599 
3600     /*
3601      * It must not be an event with inherit set, we cannot read
3602      * all child counters from atomic context.
3603      */
3604     WARN_ON_ONCE(event->attr.inherit);
3605 
3606     /*
3607      * It must not have a pmu::count method, those are not
3608      * NMI safe.
3609      */
3610     WARN_ON_ONCE(event->pmu->count);
3611 
3612     /*
3613      * If the event is currently on this CPU, its either a per-task event,
3614      * or local to this CPU. Furthermore it means its ACTIVE (otherwise
3615      * oncpu == -1).
3616      */
3617     if (event->oncpu == smp_processor_id())
3618         event->pmu->read(event);
3619 
3620     val = local64_read(&event->count);
3621     local_irq_restore(flags);
3622 
3623     return val;
3624 }
3625 
3626 static int perf_event_read(struct perf_event *event, bool group)
3627 {
3628     int event_cpu, ret = 0;
3629 
3630     /*
3631      * If event is enabled and currently active on a CPU, update the
3632      * value in the event structure:
3633      */
3634     if (event->state == PERF_EVENT_STATE_ACTIVE) {
3635         struct perf_read_data data = {
3636             .event = event,
3637             .group = group,
3638             .ret = 0,
3639         };
3640 
3641         event_cpu = READ_ONCE(event->oncpu);
3642         if ((unsigned)event_cpu >= nr_cpu_ids)
3643             return 0;
3644 
3645         preempt_disable();
3646         event_cpu = __perf_event_read_cpu(event, event_cpu);
3647 
3648         /*
3649          * Purposely ignore the smp_call_function_single() return
3650          * value.
3651          *
3652          * If event_cpu isn't a valid CPU it means the event got
3653          * scheduled out and that will have updated the event count.
3654          *
3655          * Therefore, either way, we'll have an up-to-date event count
3656          * after this.
3657          */
3658         (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
3659         preempt_enable();
3660         ret = data.ret;
3661     } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3662         struct perf_event_context *ctx = event->ctx;
3663         unsigned long flags;
3664 
3665         raw_spin_lock_irqsave(&ctx->lock, flags);
3666         /*
3667          * may read while context is not active
3668          * (e.g., thread is blocked), in that case
3669          * we cannot update context time
3670          */
3671         if (ctx->is_active) {
3672             update_context_time(ctx);
3673             update_cgrp_time_from_event(event);
3674         }
3675         if (group)
3676             update_group_times(event);
3677         else
3678             update_event_times(event);
3679         raw_spin_unlock_irqrestore(&ctx->lock, flags);
3680     }
3681 
3682     return ret;
3683 }
3684 
3685 /*
3686  * Initialize the perf_event context in a task_struct:
3687  */
3688 static void __perf_event_init_context(struct perf_event_context *ctx)
3689 {
3690     raw_spin_lock_init(&ctx->lock);
3691     mutex_init(&ctx->mutex);
3692     INIT_LIST_HEAD(&ctx->active_ctx_list);
3693     INIT_LIST_HEAD(&ctx->pinned_groups);
3694     INIT_LIST_HEAD(&ctx->flexible_groups);
3695     INIT_LIST_HEAD(&ctx->event_list);
3696     atomic_set(&ctx->refcount, 1);
3697 }
3698 
3699 static struct perf_event_context *
3700 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3701 {
3702     struct perf_event_context *ctx;
3703 
3704     ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3705     if (!ctx)
3706         return NULL;
3707 
3708     __perf_event_init_context(ctx);
3709     if (task) {
3710         ctx->task = task;
3711         get_task_struct(task);
3712     }
3713     ctx->pmu = pmu;
3714 
3715     return ctx;
3716 }
3717 
3718 static struct task_struct *
3719 find_lively_task_by_vpid(pid_t vpid)
3720 {
3721     struct task_struct *task;
3722 
3723     rcu_read_lock();
3724     if (!vpid)
3725         task = current;
3726     else
3727         task = find_task_by_vpid(vpid);
3728     if (task)
3729         get_task_struct(task);
3730     rcu_read_unlock();
3731 
3732     if (!task)
3733         return ERR_PTR(-ESRCH);
3734 
3735     return task;
3736 }
3737 
3738 /*
3739  * Returns a matching context with refcount and pincount.
3740  */
3741 static struct perf_event_context *
3742 find_get_context(struct pmu *pmu, struct task_struct *task,
3743         struct perf_event *event)
3744 {
3745     struct perf_event_context *ctx, *clone_ctx = NULL;
3746     struct perf_cpu_context *cpuctx;
3747     void *task_ctx_data = NULL;
3748     unsigned long flags;
3749     int ctxn, err;
3750     int cpu = event->cpu;
3751 
3752     if (!task) {
3753         /* Must be root to operate on a CPU event: */
3754         if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3755             return ERR_PTR(-EACCES);
3756 
3757         /*
3758          * We could be clever and allow to attach a event to an
3759          * offline CPU and activate it when the CPU comes up, but
3760          * that's for later.
3761          */
3762         if (!cpu_online(cpu))
3763             return ERR_PTR(-ENODEV);
3764 
3765         cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3766         ctx = &cpuctx->ctx;
3767         get_ctx(ctx);
3768         ++ctx->pin_count;
3769 
3770         return ctx;
3771     }
3772 
3773     err = -EINVAL;
3774     ctxn = pmu->task_ctx_nr;
3775     if (ctxn < 0)
3776         goto errout;
3777 
3778     if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3779         task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3780         if (!task_ctx_data) {
3781             err = -ENOMEM;
3782             goto errout;
3783         }
3784     }
3785 
3786 retry:
3787     ctx = perf_lock_task_context(task, ctxn, &flags);
3788     if (ctx) {
3789         clone_ctx = unclone_ctx(ctx);
3790         ++ctx->pin_count;
3791 
3792         if (task_ctx_data && !ctx->task_ctx_data) {
3793             ctx->task_ctx_data = task_ctx_data;
3794             task_ctx_data = NULL;
3795         }
3796         raw_spin_unlock_irqrestore(&ctx->lock, flags);
3797 
3798         if (clone_ctx)
3799             put_ctx(clone_ctx);
3800     } else {
3801         ctx = alloc_perf_context(pmu, task);
3802         err = -ENOMEM;
3803         if (!ctx)
3804             goto errout;
3805 
3806         if (task_ctx_data) {
3807             ctx->task_ctx_data = task_ctx_data;
3808             task_ctx_data = NULL;
3809         }
3810 
3811         err = 0;
3812         mutex_lock(&task->perf_event_mutex);
3813         /*
3814          * If it has already passed perf_event_exit_task().
3815          * we must see PF_EXITING, it takes this mutex too.
3816          */
3817         if (task->flags & PF_EXITING)
3818             err = -ESRCH;
3819         else if (task->perf_event_ctxp[ctxn])
3820             err = -EAGAIN;
3821         else {
3822             get_ctx(ctx);
3823             ++ctx->pin_count;
3824             rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3825         }
3826         mutex_unlock(&task->perf_event_mutex);
3827 
3828         if (unlikely(err)) {
3829             put_ctx(ctx);
3830 
3831             if (err == -EAGAIN)
3832                 goto retry;
3833             goto errout;
3834         }
3835     }
3836 
3837     kfree(task_ctx_data);
3838     return ctx;
3839 
3840 errout:
3841     kfree(task_ctx_data);
3842     return ERR_PTR(err);
3843 }
3844 
3845 static void perf_event_free_filter(struct perf_event *event);
3846 static void perf_event_free_bpf_prog(struct perf_event *event);
3847 
3848 static void free_event_rcu(struct rcu_head *head)
3849 {
3850     struct perf_event *event;
3851 
3852     event = container_of(head, struct perf_event, rcu_head);
3853     if (event->ns)
3854         put_pid_ns(event->ns);
3855     perf_event_free_filter(event);
3856     kfree(event);
3857 }
3858 
3859 static void ring_buffer_attach(struct perf_event *event,
3860                    struct ring_buffer *rb);
3861 
3862 static void detach_sb_event(struct perf_event *event)
3863 {
3864     struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
3865 
3866     raw_spin_lock(&pel->lock);
3867     list_del_rcu(&event->sb_list);
3868     raw_spin_unlock(&pel->lock);
3869 }
3870 
3871 static bool is_sb_event(struct perf_event *event)
3872 {
3873     struct perf_event_attr *attr = &event->attr;
3874 
3875     if (event->parent)
3876         return false;
3877 
3878     if (event->attach_state & PERF_ATTACH_TASK)
3879         return false;
3880 
3881     if (attr->mmap || attr->mmap_data || attr->mmap2 ||
3882         attr->comm || attr->comm_exec ||
3883         attr->task ||
3884         attr->context_switch)
3885         return true;
3886     return false;
3887 }
3888 
3889 static void unaccount_pmu_sb_event(struct perf_event *event)
3890 {
3891     if (is_sb_event(event))
3892         detach_sb_event(event);
3893 }
3894 
3895 static void unaccount_event_cpu(struct perf_event *event, int cpu)
3896 {
3897     if (event->parent)
3898         return;
3899 
3900     if (is_cgroup_event(event))
3901         atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3902 }
3903 
3904 #ifdef CONFIG_NO_HZ_FULL
3905 static DEFINE_SPINLOCK(nr_freq_lock);
3906 #endif
3907 
3908 static void unaccount_freq_event_nohz(void)
3909 {
3910 #ifdef CONFIG_NO_HZ_FULL
3911     spin_lock(&nr_freq_lock);
3912     if (atomic_dec_and_test(&nr_freq_events))
3913         tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
3914     spin_unlock(&nr_freq_lock);
3915 #endif
3916 }
3917 
3918 static void unaccount_freq_event(void)
3919 {
3920     if (tick_nohz_full_enabled())
3921         unaccount_freq_event_nohz();
3922     else
3923         atomic_dec(&nr_freq_events);
3924 }
3925 
3926 static void unaccount_event(struct perf_event *event)
3927 {
3928     bool dec = false;
3929 
3930     if (event->parent)
3931         return;
3932 
3933     if (event->attach_state & PERF_ATTACH_TASK)
3934         dec = true;
3935     if (event->attr.mmap || event->attr.mmap_data)
3936         atomic_dec(&nr_mmap_events);
3937     if (event->attr.comm)
3938         atomic_dec(&nr_comm_events);
3939     if (event->attr.task)
3940         atomic_dec(&nr_task_events);
3941     if (event->attr.freq)
3942         unaccount_freq_event();
3943     if (event->attr.context_switch) {
3944         dec = true;
3945         atomic_dec(&nr_switch_events);
3946     }
3947     if (is_cgroup_event(event))
3948         dec = true;
3949     if (has_branch_stack(event))
3950         dec = true;
3951 
3952     if (dec) {
3953         if (!atomic_add_unless(&perf_sched_count, -1, 1))
3954             schedule_delayed_work(&perf_sched_work, HZ);
3955     }
3956 
3957     unaccount_event_cpu(event, event->cpu);
3958 
3959     unaccount_pmu_sb_event(event);
3960 }
3961 
3962 static void perf_sched_delayed(struct work_struct *work)
3963 {
3964     mutex_lock(&perf_sched_mutex);
3965     if (atomic_dec_and_test(&perf_sched_count))
3966         static_branch_disable(&perf_sched_events);
3967     mutex_unlock(&perf_sched_mutex);
3968 }
3969 
3970 /*
3971  * The following implement mutual exclusion of events on "exclusive" pmus
3972  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
3973  * at a time, so we disallow creating events that might conflict, namely:
3974  *
3975  *  1) cpu-wide events in the presence of per-task events,
3976  *  2) per-task events in the presence of cpu-wide events,
3977  *  3) two matching events on the same context.
3978  *
3979  * The former two cases are handled in the allocation path (perf_event_alloc(),
3980  * _free_event()), the latter -- before the first perf_install_in_context().
3981  */
3982 static int exclusive_event_init(struct perf_event *event)
3983 {
3984     struct pmu *pmu = event->pmu;
3985 
3986     if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3987         return 0;
3988 
3989     /*
3990      * Prevent co-existence of per-task and cpu-wide events on the
3991      * same exclusive pmu.
3992      *
3993      * Negative pmu::exclusive_cnt means there are cpu-wide
3994      * events on this "exclusive" pmu, positive means there are
3995      * per-task events.
3996      *
3997      * Since this is called in perf_event_alloc() path, event::ctx
3998      * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
3999      * to mean "per-task event", because unlike other attach states it
4000      * never gets cleared.
4001      */
4002     if (event->attach_state & PERF_ATTACH_TASK) {
4003         if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4004             return -EBUSY;
4005     } else {
4006         if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4007             return -EBUSY;
4008     }
4009 
4010     return 0;
4011 }
4012 
4013 static void exclusive_event_destroy(struct perf_event *event)
4014 {
4015     struct pmu *pmu = event->pmu;
4016 
4017     if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4018         return;
4019 
4020     /* see comment in exclusive_event_init() */
4021     if (event->attach_state & PERF_ATTACH_TASK)
4022         atomic_dec(&pmu->exclusive_cnt);
4023     else
4024         atomic_inc(&pmu->exclusive_cnt);
4025 }
4026 
4027 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4028 {
4029     if ((e1->pmu == e2->pmu) &&
4030         (e1->cpu == e2->cpu ||
4031          e1->cpu == -1 ||
4032          e2->cpu == -1))
4033         return true;
4034     return false;
4035 }
4036 
4037 /* Called under the same ctx::mutex as perf_install_in_context() */
4038 static bool exclusive_event_installable(struct perf_event *event,
4039                     struct perf_event_context *ctx)
4040 {
4041     struct perf_event *iter_event;
4042     struct pmu *pmu = event->pmu;
4043 
4044     if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4045         return true;
4046 
4047     list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4048         if (exclusive_event_match(iter_event, event))
4049             return false;
4050     }
4051 
4052     return true;
4053 }
4054 
4055 static void perf_addr_filters_splice(struct perf_event *event,
4056                        struct list_head *head);
4057 
4058 static void _free_event(struct perf_event *event)
4059 {
4060     irq_work_sync(&event->pending);
4061 
4062     unaccount_event(event);
4063 
4064     if (event->rb) {
4065         /*
4066          * Can happen when we close an event with re-directed output.
4067          *
4068          * Since we have a 0 refcount, perf_mmap_close() will skip
4069          * over us; possibly making our ring_buffer_put() the last.
4070          */
4071         mutex_lock(&event->mmap_mutex);
4072         ring_buffer_attach(event, NULL);
4073         mutex_unlock(&event->mmap_mutex);
4074     }
4075 
4076     if (is_cgroup_event(event))
4077         perf_detach_cgroup(event);
4078 
4079     if (!event->parent) {
4080         if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4081             put_callchain_buffers();
4082     }
4083 
4084     perf_event_free_bpf_prog(event);
4085     perf_addr_filters_splice(event, NULL);
4086     kfree(event->addr_filters_offs);
4087 
4088     if (event->destroy)
4089         event->destroy(event);
4090 
4091     if (event->ctx)
4092         put_ctx(event->ctx);
4093 
4094     exclusive_event_destroy(event);
4095     module_put(event->pmu->module);
4096 
4097     call_rcu(&event->rcu_head, free_event_rcu);
4098 }
4099 
4100 /*
4101  * Used to free events which have a known refcount of 1, such as in error paths
4102  * where the event isn't exposed yet and inherited events.
4103  */
4104 static void free_event(struct perf_event *event)
4105 {
4106     if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4107                 "unexpected event refcount: %ld; ptr=%p\n",
4108                 atomic_long_read(&event->refcount), event)) {
4109         /* leak to avoid use-after-free */
4110         return;
4111     }
4112 
4113     _free_event(event);
4114 }
4115 
4116 /*
4117  * Remove user event from the owner task.
4118  */
4119 static void perf_remove_from_owner(struct perf_event *event)
4120 {
4121     struct task_struct *owner;
4122 
4123     rcu_read_lock();
4124     /*
4125      * Matches the smp_store_release() in perf_event_exit_task(). If we
4126      * observe !owner it means the list deletion is complete and we can
4127      * indeed free this event, otherwise we need to serialize on
4128      * owner->perf_event_mutex.
4129      */
4130     owner = lockless_dereference(event->owner);
4131     if (owner) {
4132         /*
4133          * Since delayed_put_task_struct() also drops the last
4134          * task reference we can safely take a new reference
4135          * while holding the rcu_read_lock().
4136          */
4137         get_task_struct(owner);
4138     }
4139     rcu_read_unlock();
4140 
4141     if (owner) {
4142         /*
4143          * If we're here through perf_event_exit_task() we're already
4144          * holding ctx->mutex which would be an inversion wrt. the
4145          * normal lock order.
4146          *
4147          * However we can safely take this lock because its the child
4148          * ctx->mutex.
4149          */
4150         mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4151 
4152         /*
4153          * We have to re-check the event->owner field, if it is cleared
4154          * we raced with perf_event_exit_task(), acquiring the mutex
4155          * ensured they're done, and we can proceed with freeing the
4156          * event.
4157          */
4158         if (event->owner) {
4159             list_del_init(&event->owner_entry);
4160             smp_store_release(&event->owner, NULL);
4161         }
4162         mutex_unlock(&owner->perf_event_mutex);
4163         put_task_struct(owner);
4164     }
4165 }
4166 
4167 static void put_event(struct perf_event *event)
4168 {
4169     if (!atomic_long_dec_and_test(&event->refcount))
4170         return;
4171 
4172     _free_event(event);
4173 }
4174 
4175 /*
4176  * Kill an event dead; while event:refcount will preserve the event
4177  * object, it will not preserve its functionality. Once the last 'user'
4178  * gives up the object, we'll destroy the thing.
4179  */
4180 int perf_event_release_kernel(struct perf_event *event)
4181 {
4182     struct perf_event_context *ctx = event->ctx;
4183     struct perf_event *child, *tmp;
4184 
4185     /*
4186      * If we got here through err_file: fput(event_file); we will not have
4187      * attached to a context yet.
4188      */
4189     if (!ctx) {
4190         WARN_ON_ONCE(event->attach_state &
4191                 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4192         goto no_ctx;
4193     }
4194 
4195     if (!is_kernel_event(event))
4196         perf_remove_from_owner(event);
4197 
4198     ctx = perf_event_ctx_lock(event);
4199     WARN_ON_ONCE(ctx->parent_ctx);
4200     perf_remove_from_context(event, DETACH_GROUP);
4201 
4202     raw_spin_lock_irq(&ctx->lock);
4203     /*
4204      * Mark this even as STATE_DEAD, there is no external reference to it
4205      * anymore.
4206      *
4207      * Anybody acquiring event->child_mutex after the below loop _must_
4208      * also see this, most importantly inherit_event() which will avoid
4209      * placing more children on the list.
4210      *
4211      * Thus this guarantees that we will in fact observe and kill _ALL_
4212      * child events.
4213      */
4214     event->state = PERF_EVENT_STATE_DEAD;
4215     raw_spin_unlock_irq(&ctx->lock);
4216 
4217     perf_event_ctx_unlock(event, ctx);
4218 
4219 again:
4220     mutex_lock(&event->child_mutex);
4221     list_for_each_entry(child, &event->child_list, child_list) {
4222 
4223         /*
4224          * Cannot change, child events are not migrated, see the
4225          * comment with perf_event_ctx_lock_nested().
4226          */
4227         ctx = lockless_dereference(child->ctx);
4228         /*
4229          * Since child_mutex nests inside ctx::mutex, we must jump
4230          * through hoops. We start by grabbing a reference on the ctx.
4231          *
4232          * Since the event cannot get freed while we hold the
4233          * child_mutex, the context must also exist and have a !0
4234          * reference count.
4235          */
4236         get_ctx(ctx);
4237 
4238         /*
4239          * Now that we have a ctx ref, we can drop child_mutex, and
4240          * acquire ctx::mutex without fear of it going away. Then we
4241          * can re-acquire child_mutex.
4242          */
4243         mutex_unlock(&event->child_mutex);
4244         mutex_lock(&ctx->mutex);
4245         mutex_lock(&event->child_mutex);
4246 
4247         /*
4248          * Now that we hold ctx::mutex and child_mutex, revalidate our
4249          * state, if child is still the first entry, it didn't get freed
4250          * and we can continue doing so.
4251          */
4252         tmp = list_first_entry_or_null(&event->child_list,
4253                            struct perf_event, child_list);
4254         if (tmp == child) {
4255             perf_remove_from_context(child, DETACH_GROUP);
4256             list_del(&child->child_list);
4257             free_event(child);
4258             /*
4259              * This matches the refcount bump in inherit_event();
4260              * this can't be the last reference.
4261              */
4262             put_event(event);
4263         }
4264 
4265         mutex_unlock(&event->child_mutex);
4266         mutex_unlock(&ctx->mutex);
4267         put_ctx(ctx);
4268         goto again;
4269     }
4270     mutex_unlock(&event->child_mutex);
4271 
4272 no_ctx:
4273     put_event(event); /* Must be the 'last' reference */
4274     return 0;
4275 }
4276 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
4277 
4278 /*
4279  * Called when the last reference to the file is gone.
4280  */
4281 static int perf_release(struct inode *inode, struct file *file)
4282 {
4283     perf_event_release_kernel(file->private_data);
4284     return 0;
4285 }
4286 
4287 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4288 {
4289     struct perf_event *child;
4290     u64 total = 0;
4291 
4292     *enabled = 0;
4293     *running = 0;
4294 
4295     mutex_lock(&event->child_mutex);
4296 
4297     (void)perf_event_read(event, false);
4298     total += perf_event_count(event);
4299 
4300     *enabled += event->total_time_enabled +
4301             atomic64_read(&event->child_total_time_enabled);
4302     *running += event->total_time_running +
4303             atomic64_read(&event->child_total_time_running);
4304 
4305     list_for_each_entry(child, &event->child_list, child_list) {
4306         (void)perf_event_read(child, false);
4307         total += perf_event_count(child);
4308         *enabled += child->total_time_enabled;
4309         *running += child->total_time_running;
4310     }
4311     mutex_unlock(&event->child_mutex);
4312 
4313     return total;
4314 }
4315 EXPORT_SYMBOL_GPL(perf_event_read_value);
4316 
4317 static int __perf_read_group_add(struct perf_event *leader,
4318                     u64 read_format, u64 *values)
4319 {
4320     struct perf_event *sub;
4321     int n = 1; /* skip @nr */
4322     int ret;
4323 
4324     ret = perf_event_read(leader, true);
4325     if (ret)
4326         return ret;
4327 
4328     /*
4329      * Since we co-schedule groups, {enabled,running} times of siblings
4330      * will be identical to those of the leader, so we only publish one
4331      * set.
4332      */
4333     if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4334         values[n++] += leader->total_time_enabled +
4335             atomic64_read(&leader->child_total_time_enabled);
4336     }
4337 
4338     if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4339         values[n++] += leader->total_time_running +
4340             atomic64_read(&leader->child_total_time_running);
4341     }
4342 
4343     /*
4344      * Write {count,id} tuples for every sibling.
4345      */
4346     values[n++] += perf_event_count(leader);
4347     if (read_format & PERF_FORMAT_ID)
4348         values[n++] = primary_event_id(leader);
4349 
4350     list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4351         values[n++] += perf_event_count(sub);
4352         if (read_format & PERF_FORMAT_ID)
4353             values[n++] = primary_event_id(sub);
4354     }
4355 
4356     return 0;
4357 }
4358 
4359 static int perf_read_group(struct perf_event *event,
4360                    u64 read_format, char __user *buf)
4361 {
4362     struct perf_event *leader = event->group_leader, *child;
4363     struct perf_event_context *ctx = leader->ctx;
4364     int ret;
4365     u64 *values;
4366 
4367     lockdep_assert_held(&ctx->mutex);
4368 
4369     values = kzalloc(event->read_size, GFP_KERNEL);
4370     if (!values)
4371         return -ENOMEM;
4372 
4373     values[0] = 1 + leader->nr_siblings;
4374 
4375     /*
4376      * By locking the child_mutex of the leader we effectively
4377      * lock the child list of all siblings.. XXX explain how.
4378      */
4379     mutex_lock(&leader->child_mutex);
4380 
4381     ret = __perf_read_group_add(leader, read_format, values);
4382     if (ret)
4383         goto unlock;
4384 
4385     list_for_each_entry(child, &leader->child_list, child_list) {
4386         ret = __perf_read_group_add(child, read_format, values);
4387         if (ret)
4388             goto unlock;
4389     }
4390 
4391     mutex_unlock(&leader->child_mutex);
4392 
4393     ret = event->read_size;
4394     if (copy_to_user(buf, values, event->read_size))
4395         ret = -EFAULT;
4396     goto out;
4397 
4398 unlock:
4399     mutex_unlock(&leader->child_mutex);
4400 out:
4401     kfree(values);
4402     return ret;
4403 }
4404 
4405 static int perf_read_one(struct perf_event *event,
4406                  u64 read_format, char __user *buf)
4407 {
4408     u64 enabled, running;
4409     u64 values[4];
4410     int n = 0;
4411 
4412     values[n++] = perf_event_read_value(event, &enabled, &running);
4413     if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4414         values[n++] = enabled;
4415     if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4416         values[n++] = running;
4417     if (read_format & PERF_FORMAT_ID)
4418         values[n++] = primary_event_id(event);
4419 
4420     if (copy_to_user(buf, values, n * sizeof(u64)))
4421         return -EFAULT;
4422 
4423     return n * sizeof(u64);
4424 }
4425 
4426 static bool is_event_hup(struct perf_event *event)
4427 {
4428     bool no_children;
4429 
4430     if (event->state > PERF_EVENT_STATE_EXIT)
4431         return false;
4432 
4433     mutex_lock(&event->child_mutex);
4434     no_children = list_empty(&event->child_list);
4435     mutex_unlock(&event->child_mutex);
4436     return no_children;
4437 }
4438 
4439 /*
4440  * Read the performance event - simple non blocking version for now
4441  */
4442 static ssize_t
4443 __perf_read(struct perf_event *event, char __user *buf, size_t count)
4444 {
4445     u64 read_format = event->attr.read_format;
4446     int ret;
4447 
4448     /*
4449      * Return end-of-file for a read on a event that is in
4450      * error state (i.e. because it was pinned but it couldn't be
4451      * scheduled on to the CPU at some point).
4452      */
4453     if (event->state == PERF_EVENT_STATE_ERROR)
4454         return 0;
4455 
4456     if (count < event->read_size)
4457         return -ENOSPC;
4458 
4459     WARN_ON_ONCE(event->ctx->parent_ctx);
4460     if (read_format & PERF_FORMAT_GROUP)
4461         ret = perf_read_group(event, read_format, buf);
4462     else
4463         ret = perf_read_one(event, read_format, buf);
4464 
4465     return ret;
4466 }
4467 
4468 static ssize_t
4469 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4470 {
4471     struct perf_event *event = file->private_data;
4472     struct perf_event_context *ctx;
4473     int ret;
4474 
4475     ctx = perf_event_ctx_lock(event);
4476     ret = __perf_read(event, buf, count);
4477     perf_event_ctx_unlock(event, ctx);
4478 
4479     return ret;
4480 }
4481 
4482 static unsigned int perf_poll(struct file *file, poll_table *wait)
4483 {
4484     struct perf_event *event = file->private_data;
4485     struct ring_buffer *rb;
4486     unsigned int events = POLLHUP;
4487 
4488     poll_wait(file, &event->waitq, wait);
4489 
4490     if (is_event_hup(event))
4491         return events;
4492 
4493     /*
4494      * Pin the event->rb by taking event->mmap_mutex; otherwise
4495      * perf_event_set_output() can swizzle our rb and make us miss wakeups.
4496      */
4497     mutex_lock(&event->mmap_mutex);
4498     rb = event->rb;
4499     if (rb)
4500         events = atomic_xchg(&rb->poll, 0);
4501     mutex_unlock(&event->mmap_mutex);
4502     return events;
4503 }
4504 
4505 static void _perf_event_reset(struct perf_event *event)
4506 {
4507     (void)perf_event_read(event, false);
4508     local64_set(&event->count, 0);
4509     perf_event_update_userpage(event);
4510 }
4511 
4512 /*
4513  * Holding the top-level event's child_mutex means that any
4514  * descendant process that has inherited this event will block
4515  * in perf_event_exit_event() if it goes to exit, thus satisfying the
4516  * task existence requirements of perf_event_enable/disable.
4517  */
4518 static void perf_event_for_each_child(struct perf_event *event,
4519                     void (*func)(struct perf_event *))
4520 {
4521     struct perf_event *child;
4522 
4523     WARN_ON_ONCE(event->ctx->parent_ctx);
4524 
4525     mutex_lock(&event->child_mutex);
4526     func(event);
4527     list_for_each_entry(child, &event->child_list, child_list)
4528         func(child);
4529     mutex_unlock(&event->child_mutex);
4530 }
4531 
4532 static void perf_event_for_each(struct perf_event *event,
4533                   void (*func)(struct perf_event *))
4534 {
4535     struct perf_event_context *ctx = event->ctx;
4536     struct perf_event *sibling;
4537 
4538     lockdep_assert_held(&ctx->mutex);
4539 
4540     event = event->group_leader;
4541 
4542     perf_event_for_each_child(event, func);
4543     list_for_each_entry(sibling, &event->sibling_list, group_entry)
4544         perf_event_for_each_child(sibling, func);
4545 }
4546 
4547 static void __perf_event_period(struct perf_event *event,
4548                 struct perf_cpu_context *cpuctx,
4549                 struct perf_event_context *ctx,
4550                 void *info)
4551 {
4552     u64 value = *((u64 *)info);
4553     bool active;
4554 
4555     if (event->attr.freq) {
4556         event->attr.sample_freq = value;
4557     } else {
4558         event->attr.sample_period = value;
4559         event->hw.sample_period = value;
4560     }
4561 
4562     active = (event->state == PERF_EVENT_STATE_ACTIVE);
4563     if (active) {
4564         perf_pmu_disable(ctx->pmu);
4565         /*
4566          * We could be throttled; unthrottle now to avoid the tick
4567          * trying to unthrottle while we already re-started the event.
4568          */
4569         if (event->hw.interrupts == MAX_INTERRUPTS) {
4570             event->hw.interrupts = 0;
4571             perf_log_throttle(event, 1);
4572         }
4573         event->pmu->stop(event, PERF_EF_UPDATE);
4574     }
4575 
4576     local64_set(&event->hw.period_left, 0);
4577 
4578     if (active) {
4579         event->pmu->start(event, PERF_EF_RELOAD);
4580         perf_pmu_enable(ctx->pmu);
4581     }
4582 }
4583 
4584 static int perf_event_period(struct perf_event *event, u64 __user *arg)
4585 {
4586     u64 value;
4587 
4588     if (!is_sampling_event(event))
4589         return -EINVAL;
4590 
4591     if (copy_from_user(&value, arg, sizeof(value)))
4592         return -EFAULT;
4593 
4594     if (!value)
4595         return -EINVAL;
4596 
4597     if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4598         return -EINVAL;
4599 
4600     event_function_call(event, __perf_event_period, &value);
4601 
4602     return 0;
4603 }
4604 
4605 static const struct file_operations perf_fops;
4606 
4607 static inline int perf_fget_light(int fd, struct fd *p)
4608 {
4609     struct fd f = fdget(fd);
4610     if (!f.file)
4611         return -EBADF;
4612 
4613     if (f.file->f_op != &perf_fops) {
4614         fdput(f);
4615         return -EBADF;
4616     }
4617     *p = f;
4618     return 0;
4619 }
4620 
4621 static int perf_event_set_output(struct perf_event *event,
4622                  struct perf_event *output_event);
4623 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4624 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
4625 
4626 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
4627 {
4628     void (*func)(struct perf_event *);
4629     u32 flags = arg;
4630 
4631     switch (cmd) {
4632     case PERF_EVENT_IOC_ENABLE:
4633         func = _perf_event_enable;
4634         break;
4635     case PERF_EVENT_IOC_DISABLE:
4636         func = _perf_event_disable;
4637         break;
4638     case PERF_EVENT_IOC_RESET:
4639         func = _perf_event_reset;
4640         break;
4641 
4642     case PERF_EVENT_IOC_REFRESH:
4643         return _perf_event_refresh(event, arg);
4644 
4645     case PERF_EVENT_IOC_PERIOD:
4646         return perf_event_period(event, (u64 __user *)arg);
4647 
4648     case PERF_EVENT_IOC_ID:
4649     {
4650         u64 id = primary_event_id(event);
4651 
4652         if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4653             return -EFAULT;
4654         return 0;
4655     }
4656 
4657     case PERF_EVENT_IOC_SET_OUTPUT:
4658     {
4659         int ret;
4660         if (arg != -1) {
4661             struct perf_event *output_event;
4662             struct fd output;
4663             ret = perf_fget_light(arg, &output);
4664             if (ret)
4665                 return ret;
4666             output_event = output.file->private_data;
4667             ret = perf_event_set_output(event, output_event);
4668             fdput(output);
4669         } else {
4670             ret = perf_event_set_output(event, NULL);
4671         }
4672         return ret;
4673     }
4674 
4675     case PERF_EVENT_IOC_SET_FILTER:
4676         return perf_event_set_filter(event, (void __user *)arg);
4677 
4678     case PERF_EVENT_IOC_SET_BPF:
4679         return perf_event_set_bpf_prog(event, arg);
4680 
4681     case PERF_EVENT_IOC_PAUSE_OUTPUT: {
4682         struct ring_buffer *rb;
4683 
4684         rcu_read_lock();
4685         rb = rcu_dereference(event->rb);
4686         if (!rb || !rb->nr_pages) {
4687             rcu_read_unlock();
4688             return -EINVAL;
4689         }
4690         rb_toggle_paused(rb, !!arg);
4691         rcu_read_unlock();
4692         return 0;
4693     }
4694     default:
4695         return -ENOTTY;
4696     }
4697 
4698     if (flags & PERF_IOC_FLAG_GROUP)
4699         perf_event_for_each(event, func);
4700     else
4701         perf_event_for_each_child(event, func);
4702 
4703     return 0;
4704 }
4705 
4706 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4707 {
4708     struct perf_event *event = file->private_data;
4709     struct perf_event_context *ctx;
4710     long ret;
4711 
4712     ctx = perf_event_ctx_lock(event);
4713     ret = _perf_ioctl(event, cmd, arg);
4714     perf_event_ctx_unlock(event, ctx);
4715 
4716     return ret;
4717 }
4718 
4719 #ifdef CONFIG_COMPAT
4720 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4721                 unsigned long arg)
4722 {
4723     switch (_IOC_NR(cmd)) {
4724     case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4725     case _IOC_NR(PERF_EVENT_IOC_ID):
4726         /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
4727         if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4728             cmd &= ~IOCSIZE_MASK;
4729             cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4730         }
4731         break;
4732     }
4733     return perf_ioctl(file, cmd, arg);
4734 }
4735 #else
4736 # define perf_compat_ioctl NULL
4737 #endif
4738 
4739 int perf_event_task_enable(void)
4740 {
4741     struct perf_event_context *ctx;
4742     struct perf_event *event;
4743 
4744     mutex_lock(&current->perf_event_mutex);
4745     list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4746         ctx = perf_event_ctx_lock(event);
4747         perf_event_for_each_child(event, _perf_event_enable);
4748         perf_event_ctx_unlock(event, ctx);
4749     }
4750     mutex_unlock(&current->perf_event_mutex);
4751 
4752     return 0;
4753 }
4754 
4755 int perf_event_task_disable(void)
4756 {
4757     struct perf_event_context *ctx;
4758     struct perf_event *event;
4759 
4760     mutex_lock(&current->perf_event_mutex);
4761     list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4762         ctx = perf_event_ctx_lock(event);
4763         perf_event_for_each_child(event, _perf_event_disable);
4764         perf_event_ctx_unlock(event, ctx);
4765     }
4766     mutex_unlock(&current->perf_event_mutex);
4767 
4768     return 0;
4769 }
4770 
4771 static int perf_event_index(struct perf_event *event)
4772 {
4773     if (event->hw.state & PERF_HES_STOPPED)
4774         return 0;
4775 
4776     if (event->state != PERF_EVENT_STATE_ACTIVE)
4777         return 0;
4778 
4779     return event->pmu->event_idx(event);
4780 }
4781 
4782 static void calc_timer_values(struct perf_event *event,
4783                 u64 *now,
4784                 u64 *enabled,
4785                 u64 *running)
4786 {
4787     u64 ctx_time;
4788 
4789     *now = perf_clock();
4790     ctx_time = event->shadow_ctx_time + *now;
4791     *enabled = ctx_time - event->tstamp_enabled;
4792     *running = ctx_time - event->tstamp_running;
4793 }
4794 
4795 static void perf_event_init_userpage(struct perf_event *event)
4796 {
4797     struct perf_event_mmap_page *userpg;
4798     struct ring_buffer *rb;
4799 
4800     rcu_read_lock();
4801     rb = rcu_dereference(event->rb);
4802     if (!rb)
4803         goto unlock;
4804 
4805     userpg = rb->user_page;
4806 
4807     /* Allow new userspace to detect that bit 0 is deprecated */
4808     userpg->cap_bit0_is_deprecated = 1;
4809     userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4810     userpg->data_offset = PAGE_SIZE;
4811     userpg->data_size = perf_data_size(rb);
4812 
4813 unlock:
4814     rcu_read_unlock();
4815 }
4816 
4817 void __weak arch_perf_update_userpage(
4818     struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
4819 {
4820 }
4821 
4822 /*
4823  * Callers need to ensure there can be no nesting of this function, otherwise
4824  * the seqlock logic goes bad. We can not serialize this because the arch
4825  * code calls this from NMI context.
4826  */
4827 void perf_event_update_userpage(struct perf_event *event)
4828 {
4829     struct perf_event_mmap_page *userpg;
4830     struct ring_buffer *rb;
4831     u64 enabled, running, now;
4832 
4833     rcu_read_lock();
4834     rb = rcu_dereference(event->rb);
4835     if (!rb)
4836         goto unlock;
4837 
4838     /*
4839      * compute total_time_enabled, total_time_running
4840      * based on snapshot values taken when the event
4841      * was last scheduled in.
4842      *
4843      * we cannot simply called update_context_time()
4844      * because of locking issue as we can be called in
4845      * NMI context
4846      */
4847     calc_timer_values(event, &now, &enabled, &running);
4848 
4849     userpg = rb->user_page;
4850     /*
4851      * Disable preemption so as to not let the corresponding user-space
4852      * spin too long if we get preempted.
4853      */
4854     preempt_disable();
4855     ++userpg->lock;
4856     barrier();
4857     userpg->index = perf_event_index(event);
4858     userpg->offset = perf_event_count(event);
4859     if (userpg->index)
4860         userpg->offset -= local64_read(&event->hw.prev_count);
4861 
4862     userpg->time_enabled = enabled +
4863             atomic64_read(&event->child_total_time_enabled);
4864 
4865     userpg->time_running = running +
4866             atomic64_read(&event->child_total_time_running);
4867 
4868     arch_perf_update_userpage(event, userpg, now);
4869 
4870     barrier();
4871     ++userpg->lock;
4872     preempt_enable();
4873 unlock:
4874     rcu_read_unlock();
4875 }
4876 
4877 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4878 {
4879     struct perf_event *event = vma->vm_file->private_data;
4880     struct ring_buffer *rb;
4881     int ret = VM_FAULT_SIGBUS;
4882 
4883     if (vmf->flags & FAULT_FLAG_MKWRITE) {
4884         if (vmf->pgoff == 0)
4885             ret = 0;
4886         return ret;
4887     }
4888 
4889     rcu_read_lock();
4890     rb = rcu_dereference(event->rb);
4891     if (!rb)
4892         goto unlock;
4893 
4894     if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4895         goto unlock;
4896 
4897     vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
4898     if (!vmf->page)
4899         goto unlock;
4900 
4901     get_page(vmf->page);
4902     vmf->page->mapping = vma->vm_file->f_mapping;
4903     vmf->page->index   = vmf->pgoff;
4904 
4905     ret = 0;
4906 unlock:
4907     rcu_read_unlock();
4908 
4909     return ret;
4910 }
4911 
4912 static void ring_buffer_attach(struct perf_event *event,
4913                    struct ring_buffer *rb)
4914 {
4915     struct ring_buffer *old_rb = NULL;
4916     unsigned long flags;
4917 
4918     if (event->rb) {
4919         /*
4920          * Should be impossible, we set this when removing
4921          * event->rb_entry and wait/clear when adding event->rb_entry.
4922          */
4923         WARN_ON_ONCE(event->rcu_pending);
4924 
4925         old_rb = event->rb;
4926         spin_lock_irqsave(&old_rb->event_lock, flags);
4927         list_del_rcu(&event->rb_entry);
4928         spin_unlock_irqrestore(&old_rb->event_lock, flags);
4929 
4930         event->rcu_batches = get_state_synchronize_rcu();
4931         event->rcu_pending = 1;
4932     }
4933 
4934     if (rb) {
4935         if (event->rcu_pending) {
4936             cond_synchronize_rcu(event->rcu_batches);
4937             event->rcu_pending = 0;
4938         }
4939 
4940         spin_lock_irqsave(&rb->event_lock, flags);
4941         list_add_rcu(&event->rb_entry, &rb->event_list);
4942         spin_unlock_irqrestore(&rb->event_lock, flags);
4943     }
4944 
4945     /*
4946      * Avoid racing with perf_mmap_close(AUX): stop the event
4947      * before swizzling the event::rb pointer; if it's getting
4948      * unmapped, its aux_mmap_count will be 0 and it won't
4949      * restart. See the comment in __perf_pmu_output_stop().
4950      *
4951      * Data will inevitably be lost when set_output is done in
4952      * mid-air, but then again, whoever does it like this is
4953      * not in for the data anyway.
4954      */
4955     if (has_aux(event))
4956         perf_event_stop(event, 0);
4957 
4958     rcu_assign_pointer(event->rb, rb);
4959 
4960     if (old_rb) {
4961         ring_buffer_put(old_rb);
4962         /*
4963          * Since we detached before setting the new rb, so that we
4964          * could attach the new rb, we could have missed a wakeup.
4965          * Provide it now.
4966          */
4967         wake_up_all(&event->waitq);
4968     }
4969 }
4970 
4971 static void ring_buffer_wakeup(struct perf_event *event)
4972 {
4973     struct ring_buffer *rb;
4974 
4975     rcu_read_lock();
4976     rb = rcu_dereference(event->rb);
4977     if (rb) {
4978         list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4979             wake_up_all(&event->waitq);
4980     }
4981     rcu_read_unlock();
4982 }
4983 
4984 struct ring_buffer *ring_buffer_get(struct perf_event *event)
4985 {
4986     struct ring_buffer *rb;
4987 
4988     rcu_read_lock();
4989     rb = rcu_dereference(event->rb);
4990     if (rb) {
4991         if (!atomic_inc_not_zero(&rb->refcount))
4992             rb = NULL;
4993     }
4994     rcu_read_unlock();
4995 
4996     return rb;
4997 }
4998 
4999 void ring_buffer_put(struct ring_buffer *rb)
5000 {
5001     if (!atomic_dec_and_test(&rb->refcount))
5002         return;
5003 
5004     WARN_ON_ONCE(!list_empty(&rb->event_list));
5005 
5006     call_rcu(&rb->rcu_head, rb_free_rcu);
5007 }
5008 
5009 static void perf_mmap_open(struct vm_area_struct *vma)
5010 {
5011     struct perf_event *event = vma->vm_file->private_data;
5012 
5013     atomic_inc(&event->mmap_count);
5014     atomic_inc(&event->rb->mmap_count);
5015 
5016     if (vma->vm_pgoff)
5017         atomic_inc(&event->rb->aux_mmap_count);
5018 
5019     if (event->pmu->event_mapped)
5020         event->pmu->event_mapped(event);
5021 }
5022 
5023 static void perf_pmu_output_stop(struct perf_event *event);
5024 
5025 /*
5026  * A buffer can be mmap()ed multiple times; either directly through the same
5027  * event, or through other events by use of perf_event_set_output().
5028  *
5029  * In order to undo the VM accounting done by perf_mmap() we need to destroy
5030  * the buffer here, where we still have a VM context. This means we need
5031  * to detach all events redirecting to us.
5032  */
5033 static void perf_mmap_close(struct vm_area_struct *vma)
5034 {
5035     struct perf_event *event = vma->vm_file->private_data;
5036 
5037     struct ring_buffer *rb = ring_buffer_get(event);
5038     struct user_struct *mmap_user = rb->mmap_user;
5039     int mmap_locked = rb->mmap_locked;
5040     unsigned long size = perf_data_size(rb);
5041 
5042     if (event->pmu->event_unmapped)
5043         event->pmu->event_unmapped(event);
5044 
5045     /*
5046      * rb->aux_mmap_count will always drop before rb->mmap_count and
5047      * event->mmap_count, so it is ok to use event->mmap_mutex to
5048      * serialize with perf_mmap here.
5049      */
5050     if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5051         atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
5052         /*
5053          * Stop all AUX events that are writing to this buffer,
5054          * so that we can free its AUX pages and corresponding PMU
5055          * data. Note that after rb::aux_mmap_count dropped to zero,
5056          * they won't start any more (see perf_aux_output_begin()).
5057          */
5058         perf_pmu_output_stop(event);
5059 
5060         /* now it's safe to free the pages */
5061         atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5062         vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
5063 
5064         /* this has to be the last one */
5065         rb_free_aux(rb);
5066         WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
5067 
5068         mutex_unlock(&event->mmap_mutex);
5069     }
5070 
5071     atomic_dec(&rb->mmap_count);
5072 
5073     if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5074         goto out_put;
5075 
5076     ring_buffer_attach(event, NULL);
5077     mutex_unlock(&event->mmap_mutex);
5078 
5079     /* If there's still other mmap()s of this buffer, we're done. */
5080     if (atomic_read(&rb->mmap_count))
5081         goto out_put;
5082 
5083     /*
5084      * No other mmap()s, detach from all other events that might redirect
5085      * into the now unreachable buffer. Somewhat complicated by the
5086      * fact that rb::event_lock otherwise nests inside mmap_mutex.
5087      */
5088 again:
5089     rcu_read_lock();
5090     list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5091         if (!atomic_long_inc_not_zero(&event->refcount)) {
5092             /*
5093              * This event is en-route to free_event() which will
5094              * detach it and remove it from the list.
5095              */
5096             continue;
5097         }
5098         rcu_read_unlock();
5099 
5100         mutex_lock(&event->mmap_mutex);
5101         /*
5102          * Check we didn't race with perf_event_set_output() which can
5103          * swizzle the rb from under us while we were waiting to
5104          * acquire mmap_mutex.
5105          *
5106          * If we find a different rb; ignore this event, a next
5107          * iteration will no longer find it on the list. We have to
5108          * still restart the iteration to make sure we're not now
5109          * iterating the wrong list.
5110          */
5111         if (event->rb == rb)
5112             ring_buffer_attach(event, NULL);
5113 
5114         mutex_unlock(&event->mmap_mutex);
5115         put_event(event);
5116 
5117         /*
5118          * Restart the iteration; either we're on the wrong list or
5119          * destroyed its integrity by doing a deletion.
5120          */
5121         goto again;
5122     }
5123     rcu_read_unlock();
5124 
5125     /*
5126      * It could be there's still a few 0-ref events on the list; they'll
5127      * get cleaned up by free_event() -- they'll also still have their
5128      * ref on the rb and will free it whenever they are done with it.
5129      *
5130      * Aside from that, this buffer is 'fully' detached and unmapped,
5131      * undo the VM accounting.
5132      */
5133 
5134     atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
5135     vma->vm_mm->pinned_vm -= mmap_locked;
5136     free_uid(mmap_user);
5137 
5138 out_put:
5139     ring_buffer_put(rb); /* could be last */
5140 }
5141 
5142 static const struct vm_operations_struct perf_mmap_vmops = {
5143     .open       = perf_mmap_open,
5144     .close      = perf_mmap_close, /* non mergable */
5145     .fault      = perf_mmap_fault,
5146     .page_mkwrite   = perf_mmap_fault,
5147 };
5148 
5149 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
5150 {
5151     struct perf_event *event = file->private_data;
5152     unsigned long user_locked, user_lock_limit;
5153     struct user_struct *user = current_user();
5154     unsigned long locked, lock_limit;
5155     struct ring_buffer *rb = NULL;
5156     unsigned long vma_size;
5157     unsigned long nr_pages;
5158     long user_extra = 0, extra = 0;
5159     int ret = 0, flags = 0;
5160 
5161     /*
5162      * Don't allow mmap() of inherited per-task counters. This would
5163      * create a performance issue due to all children writing to the
5164      * same rb.
5165      */
5166     if (event->cpu == -1 && event->attr.inherit)
5167         return -EINVAL;
5168 
5169     if (!(vma->vm_flags & VM_SHARED))
5170         return -EINVAL;
5171 
5172     vma_size = vma->vm_end - vma->vm_start;
5173 
5174     if (vma->vm_pgoff == 0) {
5175         nr_pages = (vma_size / PAGE_SIZE) - 1;
5176     } else {
5177         /*
5178          * AUX area mapping: if rb->aux_nr_pages != 0, it's already
5179          * mapped, all subsequent mappings should have the same size
5180          * and offset. Must be above the normal perf buffer.
5181          */
5182         u64 aux_offset, aux_size;
5183 
5184         if (!event->rb)
5185             return -EINVAL;
5186 
5187         nr_pages = vma_size / PAGE_SIZE;
5188 
5189         mutex_lock(&event->mmap_mutex);
5190         ret = -EINVAL;
5191 
5192         rb = event->rb;
5193         if (!rb)
5194             goto aux_unlock;
5195 
5196         aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
5197         aux_size = ACCESS_ONCE(rb->user_page->aux_size);
5198 
5199         if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
5200             goto aux_unlock;
5201 
5202         if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
5203             goto aux_unlock;
5204 
5205         /* already mapped with a different offset */
5206         if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
5207             goto aux_unlock;
5208 
5209         if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
5210             goto aux_unlock;
5211 
5212         /* already mapped with a different size */
5213         if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
5214             goto aux_unlock;
5215 
5216         if (!is_power_of_2(nr_pages))
5217             goto aux_unlock;
5218 
5219         if (!atomic_inc_not_zero(&rb->mmap_count))
5220             goto aux_unlock;
5221 
5222         if (rb_has_aux(rb)) {
5223             atomic_inc(&rb->aux_mmap_count);
5224             ret = 0;
5225             goto unlock;
5226         }
5227 
5228         atomic_set(&rb->aux_mmap_count, 1);
5229         user_extra = nr_pages;
5230 
5231         goto accounting;
5232     }
5233 
5234     /*
5235      * If we have rb pages ensure they're a power-of-two number, so we
5236      * can do bitmasks instead of modulo.
5237      */
5238     if (nr_pages != 0 && !is_power_of_2(nr_pages))
5239         return -EINVAL;
5240 
5241     if (vma_size != PAGE_SIZE * (1 + nr_pages))
5242         return -EINVAL;
5243 
5244     WARN_ON_ONCE(event->ctx->parent_ctx);
5245 again:
5246     mutex_lock(&event->mmap_mutex);
5247     if (event->rb) {
5248         if (event->rb->nr_pages != nr_pages) {
5249             ret = -EINVAL;
5250             goto unlock;
5251         }
5252 
5253         if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
5254             /*
5255              * Raced against perf_mmap_close() through
5256              * perf_event_set_output(). Try again, hope for better
5257              * luck.
5258              */
5259             mutex_unlock(&event->mmap_mutex);
5260             goto again;
5261         }
5262 
5263         goto unlock;
5264     }
5265 
5266     user_extra = nr_pages + 1;
5267 
5268 accounting:
5269     user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
5270 
5271     /*
5272      * Increase the limit linearly with more CPUs:
5273      */
5274     user_lock_limit *= num_online_cpus();
5275 
5276     user_locked = atomic_long_read(&user->locked_vm) + user_extra;
5277 
5278     if (user_locked > user_lock_limit)
5279         extra = user_locked - user_lock_limit;
5280 
5281     lock_limit = rlimit(RLIMIT_MEMLOCK);
5282     lock_limit >>= PAGE_SHIFT;
5283     locked = vma->vm_mm->pinned_vm + extra;
5284 
5285     if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
5286         !capable(CAP_IPC_LOCK)) {
5287         ret = -EPERM;
5288         goto unlock;
5289     }
5290 
5291     WARN_ON(!rb && event->rb);
5292 
5293     if (vma->vm_flags & VM_WRITE)
5294         flags |= RING_BUFFER_WRITABLE;
5295 
5296     if (!rb) {
5297         rb = rb_alloc(nr_pages,
5298                   event->attr.watermark ? event->attr.wakeup_watermark : 0,
5299                   event->cpu, flags);
5300 
5301         if (!rb) {
5302             ret = -ENOMEM;
5303             goto unlock;
5304         }
5305 
5306         atomic_set(&rb->mmap_count, 1);
5307         rb->mmap_user = get_current_user();
5308         rb->mmap_locked = extra;
5309 
5310         ring_buffer_attach(event, rb);
5311 
5312         perf_event_init_userpage(event);
5313         perf_event_update_userpage(event);
5314     } else {
5315         ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
5316                    event->attr.aux_watermark, flags);
5317         if (!ret)
5318             rb->aux_mmap_locked = extra;
5319     }
5320 
5321 unlock:
5322     if (!ret) {
5323         atomic_long_add(user_extra, &user->locked_vm);
5324         vma->vm_mm->pinned_vm += extra;
5325 
5326         atomic_inc(&event->mmap_count);
5327     } else if (rb) {
5328         atomic_dec(&rb->mmap_count);
5329     }
5330 aux_unlock:
5331     mutex_unlock(&event->mmap_mutex);
5332 
5333     /*
5334      * Since pinned accounting is per vm we cannot allow fork() to copy our
5335      * vma.
5336      */
5337     vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
5338     vma->vm_ops = &perf_mmap_vmops;
5339 
5340     if (event->pmu->event_mapped)
5341         event->pmu->event_mapped(event);
5342 
5343     return ret;
5344 }
5345 
5346 static int perf_fasync(int fd, struct file *filp, int on)
5347 {
5348     struct inode *inode = file_inode(filp);
5349     struct perf_event *event = filp->private_data;
5350     int retval;
5351 
5352     inode_lock(inode);
5353     retval = fasync_helper(fd, filp, on, &event->fasync);
5354     inode_unlock(inode);
5355 
5356     if (retval < 0)
5357         return retval;
5358 
5359     return 0;
5360 }
5361 
5362 static const struct file_operations perf_fops = {
5363     .llseek         = no_llseek,
5364     .release        = perf_release,
5365     .read           = perf_read,
5366     .poll           = perf_poll,
5367     .unlocked_ioctl     = perf_ioctl,
5368     .compat_ioctl       = perf_compat_ioctl,
5369     .mmap           = perf_mmap,
5370     .fasync         = perf_fasync,
5371 };
5372 
5373 /*
5374  * Perf event wakeup
5375  *
5376  * If there's data, ensure we set the poll() state and publish everything
5377  * to user-space before waking everybody up.
5378  */
5379 
5380 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
5381 {
5382     /* only the parent has fasync state */
5383     if (event->parent)
5384         event = event->parent;
5385     return &event->fasync;
5386 }
5387 
5388 void perf_event_wakeup(struct perf_event *event)
5389 {
5390     ring_buffer_wakeup(event);
5391 
5392     if (event->pending_kill) {
5393         kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
5394         event->pending_kill = 0;
5395     }
5396 }
5397 
5398 static void perf_pending_event(struct irq_work *entry)
5399 {
5400     struct perf_event *event = container_of(entry,
5401             struct perf_event, pending);
5402     int rctx;
5403 
5404     rctx = perf_swevent_get_recursion_context();
5405     /*
5406      * If we 'fail' here, that's OK, it means recursion is already disabled
5407      * and we won't recurse 'further'.
5408      */
5409 
5410     if (event->pending_disable) {
5411         event->pending_disable = 0;
5412         perf_event_disable_local(event);
5413     }
5414 
5415     if (event->pending_wakeup) {
5416         event->pending_wakeup = 0;
5417         perf_event_wakeup(event);
5418     }
5419 
5420     if (rctx >= 0)
5421         perf_swevent_put_recursion_context(rctx);
5422 }
5423 
5424 /*
5425  * We assume there is only KVM supporting the callbacks.
5426  * Later on, we might change it to a list if there is
5427  * another virtualization implementation supporting the callbacks.
5428  */
5429 struct perf_guest_info_callbacks *perf_guest_cbs;
5430 
5431 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5432 {
5433     perf_guest_cbs = cbs;
5434     return 0;
5435 }
5436 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5437 
5438 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5439 {
5440     perf_guest_cbs = NULL;
5441     return 0;
5442 }
5443 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5444 
5445 static void
5446 perf_output_sample_regs(struct perf_output_handle *handle,
5447             struct pt_regs *regs, u64 mask)
5448 {
5449     int bit;
5450     DECLARE_BITMAP(_mask, 64);
5451 
5452     bitmap_from_u64(_mask, mask);
5453     for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
5454         u64 val;
5455 
5456         val = perf_reg_value(regs, bit);
5457         perf_output_put(handle, val);
5458     }
5459 }
5460 
5461 static void perf_sample_regs_user(struct perf_regs *regs_user,
5462                   struct pt_regs *regs,
5463                   struct pt_regs *regs_user_copy)
5464 {
5465     if (user_mode(regs)) {
5466         regs_user->abi = perf_reg_abi(current);
5467         regs_user->regs = regs;
5468     } else if (current->mm) {
5469         perf_get_regs_user(regs_user, regs, regs_user_copy);
5470     } else {
5471         regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5472         regs_user->regs = NULL;
5473     }
5474 }
5475 
5476 static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5477                   struct pt_regs *regs)
5478 {
5479     regs_intr->regs = regs;
5480     regs_intr->abi  = perf_reg_abi(current);
5481 }
5482 
5483 
5484 /*
5485  * Get remaining task size from user stack pointer.
5486  *
5487  * It'd be better to take stack vma map and limit this more
5488  * precisly, but there's no way to get it safely under interrupt,
5489  * so using TASK_SIZE as limit.
5490  */
5491 static u64 perf_ustack_task_size(struct pt_regs *regs)
5492 {
5493     unsigned long addr = perf_user_stack_pointer(regs);
5494 
5495     if (!addr || addr >= TASK_SIZE)
5496         return 0;
5497 
5498     return TASK_SIZE - addr;
5499 }
5500 
5501 static u16
5502 perf_sample_ustack_size(u16 stack_size, u16 header_size,
5503             struct pt_regs *regs)
5504 {
5505     u64 task_size;
5506 
5507     /* No regs, no stack pointer, no dump. */
5508     if (!regs)
5509         return 0;
5510 
5511     /*
5512      * Check if we fit in with the requested stack size into the:
5513      * - TASK_SIZE
5514      *   If we don't, we limit the size to the TASK_SIZE.
5515      *
5516      * - remaining sample size
5517      *   If we don't, we customize the stack size to
5518      *   fit in to the remaining sample size.
5519      */
5520 
5521     task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
5522     stack_size = min(stack_size, (u16) task_size);
5523 
5524     /* Current header size plus static size and dynamic size. */
5525     header_size += 2 * sizeof(u64);
5526 
5527     /* Do we fit in with the current stack dump size? */
5528     if ((u16) (header_size + stack_size) < header_size) {
5529         /*
5530          * If we overflow the maximum size for the sample,
5531          * we customize the stack dump size to fit in.
5532          */
5533         stack_size = USHRT_MAX - header_size - sizeof(u64);
5534         stack_size = round_up(stack_size, sizeof(u64));
5535     }
5536 
5537     return stack_size;
5538 }
5539 
5540 static void
5541 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
5542               struct pt_regs *regs)
5543 {
5544     /* Case of a kernel thread, nothing to dump */
5545     if (!regs) {
5546         u64 size = 0;
5547         perf_output_put(handle, size);
5548     } else {
5549         unsigned long sp;
5550         unsigned int rem;
5551         u64 dyn_size;
5552 
5553         /*
5554          * We dump:
5555          * static size
5556          *   - the size requested by user or the best one we can fit
5557          *     in to the sample max size
5558          * data
5559          *   - user stack dump data
5560          * dynamic size
5561          *   - the actual dumped size
5562          */
5563 
5564         /* Static size. */
5565         perf_output_put(handle, dump_size);
5566 
5567         /* Data. */
5568         sp = perf_user_stack_pointer(regs);
5569         rem = __output_copy_user(handle, (void *) sp, dump_size);
5570         dyn_size = dump_size - rem;
5571 
5572         perf_output_skip(handle, rem);
5573 
5574         /* Dynamic size. */
5575         perf_output_put(handle, dyn_size);
5576     }
5577 }
5578 
5579 static void __perf_event_header__init_id(struct perf_event_header *header,
5580                      struct perf_sample_data *data,
5581                      struct perf_event *event)
5582 {
5583     u64 sample_type = event->attr.sample_type;
5584 
5585     data->type = sample_type;
5586     header->size += event->id_header_size;
5587 
5588     if (sample_type & PERF_SAMPLE_TID) {
5589         /* namespace issues */
5590         data->tid_entry.pid = perf_event_pid(event, current);
5591         data->tid_entry.tid = perf_event_tid(event, current);
5592     }
5593 
5594     if (sample_type & PERF_SAMPLE_TIME)
5595         data->time = perf_event_clock(event);
5596 
5597     if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
5598         data->id = primary_event_id(event);
5599 
5600     if (sample_type & PERF_SAMPLE_STREAM_ID)
5601         data->stream_id = event->id;
5602 
5603     if (sample_type & PERF_SAMPLE_CPU) {
5604         data->cpu_entry.cpu  = raw_smp_processor_id();
5605         data->cpu_entry.reserved = 0;
5606     }
5607 }
5608 
5609 void perf_event_header__init_id(struct perf_event_header *header,
5610                 struct perf_sample_data *data,
5611                 struct perf_event *event)
5612 {
5613     if (event->attr.sample_id_all)
5614         __perf_event_header__init_id(header, data, event);
5615 }
5616 
5617 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
5618                        struct perf_sample_data *data)
5619 {
5620     u64 sample_type = data->type;
5621 
5622     if (sample_type & PERF_SAMPLE_TID)
5623         perf_output_put(handle, data->tid_entry);
5624 
5625     if (sample_type & PERF_SAMPLE_TIME)
5626         perf_output_put(handle, data->time);
5627 
5628     if (sample_type & PERF_SAMPLE_ID)
5629         perf_output_put(handle, data->id);
5630 
5631     if (sample_type & PERF_SAMPLE_STREAM_ID)
5632         perf_output_put(handle, data->stream_id);
5633 
5634     if (sample_type & PERF_SAMPLE_CPU)
5635         perf_output_put(handle, data->cpu_entry);
5636 
5637     if (sample_type & PERF_SAMPLE_IDENTIFIER)
5638         perf_output_put(handle, data->id);
5639 }
5640 
5641 void perf_event__output_id_sample(struct perf_event *event,
5642                   struct perf_output_handle *handle,
5643                   struct perf_sample_data *sample)
5644 {
5645     if (event->attr.sample_id_all)
5646         __perf_event__output_id_sample(handle, sample);
5647 }
5648 
5649 static void perf_output_read_one(struct perf_output_handle *handle,
5650                  struct perf_event *event,
5651                  u64 enabled, u64 running)
5652 {
5653     u64 read_format = event->attr.read_format;
5654     u64 values[4];
5655     int n = 0;
5656 
5657     values[n++] = perf_event_count(event);
5658     if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5659         values[n++] = enabled +
5660             atomic64_read(&event->child_total_time_enabled);
5661     }
5662     if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5663         values[n++] = running +
5664             atomic64_read(&event->child_total_time_running);
5665     }
5666     if (read_format & PERF_FORMAT_ID)
5667         values[n++] = primary_event_id(event);
5668 
5669     __output_copy(handle, values, n * sizeof(u64));
5670 }
5671 
5672 /*
5673  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
5674  */
5675 static void perf_output_read_group(struct perf_output_handle *handle,
5676                 struct perf_event *event,
5677                 u64 enabled, u64 running)
5678 {
5679     struct perf_event *leader = event->group_leader, *sub;
5680     u64 read_format = event->attr.read_format;
5681     u64 values[5];
5682     int n = 0;
5683 
5684     values[n++] = 1 + leader->nr_siblings;
5685 
5686     if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5687         values[n++] = enabled;
5688 
5689     if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5690         values[n++] = running;
5691 
5692     if (leader != event)
5693         leader->pmu->read(leader);
5694 
5695     values[n++] = perf_event_count(leader);
5696     if (read_format & PERF_FORMAT_ID)
5697         values[n++] = primary_event_id(leader);
5698 
5699     __output_copy(handle, values, n * sizeof(u64));
5700 
5701     list_for_each_entry(sub, &leader->sibling_list, group_entry) {
5702         n = 0;
5703 
5704         if ((sub != event) &&
5705             (sub->state == PERF_EVENT_STATE_ACTIVE))
5706             sub->pmu->read(sub);
5707 
5708         values[n++] = perf_event_count(sub);
5709         if (read_format & PERF_FORMAT_ID)
5710             values[n++] = primary_event_id(sub);
5711 
5712         __output_copy(handle, values, n * sizeof(u64));
5713     }
5714 }
5715 
5716 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5717                  PERF_FORMAT_TOTAL_TIME_RUNNING)
5718 
5719 static void perf_output_read(struct perf_output_handle *handle,
5720                  struct perf_event *event)
5721 {
5722     u64 enabled = 0, running = 0, now;
5723     u64 read_format = event->attr.read_format;
5724 
5725     /*
5726      * compute total_time_enabled, total_time_running
5727      * based on snapshot values taken when the event
5728      * was last scheduled in.
5729      *
5730      * we cannot simply called update_context_time()
5731      * because of locking issue as we are called in
5732      * NMI context
5733      */
5734     if (read_format & PERF_FORMAT_TOTAL_TIMES)
5735         calc_timer_values(event, &now, &enabled, &running);
5736 
5737     if (event->attr.read_format & PERF_FORMAT_GROUP)
5738         perf_output_read_group(handle, event, enabled, running);
5739     else
5740         perf_output_read_one(handle, event, enabled, running);
5741 }
5742 
5743 void perf_output_sample(struct perf_output_handle *handle,
5744             struct perf_event_header *header,
5745             struct perf_sample_data *data,
5746             struct perf_event *event)
5747 {
5748     u64 sample_type = data->type;
5749 
5750     perf_output_put(handle, *header);
5751 
5752     if (sample_type & PERF_SAMPLE_IDENTIFIER)
5753         perf_output_put(handle, data->id);
5754 
5755     if (sample_type & PERF_SAMPLE_IP)
5756         perf_output_put(handle, data->ip);
5757 
5758     if (sample_type & PERF_SAMPLE_TID)
5759         perf_output_put(handle, data->tid_entry);
5760 
5761     if (sample_type & PERF_SAMPLE_TIME)
5762         perf_output_put(handle, data->time);
5763 
5764     if (sample_type & PERF_SAMPLE_ADDR)
5765         perf_output_put(handle, data->addr);
5766 
5767     if (sample_type & PERF_SAMPLE_ID)
5768         perf_output_put(handle, data->id);
5769 
5770     if (sample_type & PERF_SAMPLE_STREAM_ID)
5771         perf_output_put(handle, data->stream_id);
5772 
5773     if (sample_type & PERF_SAMPLE_CPU)
5774         perf_output_put(handle, data->cpu_entry);
5775 
5776     if (sample_type & PERF_SAMPLE_PERIOD)
5777         perf_output_put(handle, data->period);
5778 
5779     if (sample_type & PERF_SAMPLE_READ)
5780         perf_output_read(handle, event);
5781 
5782     if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5783         if (data->callchain) {
5784             int size = 1;
5785 
5786             if (data->callchain)
5787                 size += data->callchain->nr;
5788 
5789             size *= sizeof(u64);
5790 
5791             __output_copy(handle, data->callchain, size);
5792         } else {
5793             u64 nr = 0;
5794             perf_output_put(handle, nr);
5795         }
5796     }
5797 
5798     if (sample_type & PERF_SAMPLE_RAW) {
5799         struct perf_raw_record *raw = data->raw;
5800 
5801         if (raw) {
5802             struct perf_raw_frag *frag = &raw->frag;
5803 
5804             perf_output_put(handle, raw->size);
5805             do {
5806                 if (frag->copy) {
5807                     __output_custom(handle, frag->copy,
5808                             frag->data, frag->size);
5809                 } else {
5810                     __output_copy(handle, frag->data,
5811                               frag->size);
5812                 }
5813                 if (perf_raw_frag_last(frag))
5814                     break;
5815                 frag = frag->next;
5816             } while (1);
5817             if (frag->pad)
5818                 __output_skip(handle, NULL, frag->pad);
5819         } else {
5820             struct {
5821                 u32 size;
5822                 u32 data;
5823             } raw = {
5824                 .size = sizeof(u32),
5825                 .data = 0,
5826             };
5827             perf_output_put(handle, raw);
5828         }
5829     }
5830 
5831     if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5832         if (data->br_stack) {
5833             size_t size;
5834 
5835             size = data->br_stack->nr
5836                  * sizeof(struct perf_branch_entry);
5837 
5838             perf_output_put(handle, data->br_stack->nr);
5839             perf_output_copy(handle, data->br_stack->entries, size);
5840         } else {
5841             /*
5842              * we always store at least the value of nr
5843              */
5844             u64 nr = 0;
5845             perf_output_put(handle, nr);
5846         }
5847     }
5848 
5849     if (sample_type & PERF_SAMPLE_REGS_USER) {
5850         u64 abi = data->regs_user.abi;
5851 
5852         /*
5853          * If there are no regs to dump, notice it through
5854          * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5855          */
5856         perf_output_put(handle, abi);
5857 
5858         if (abi) {
5859             u64 mask = event->attr.sample_regs_user;
5860             perf_output_sample_regs(handle,
5861                         data->regs_user.regs,
5862                         mask);
5863         }
5864     }
5865 
5866     if (sample_type & PERF_SAMPLE_STACK_USER) {
5867         perf_output_sample_ustack(handle,
5868                       data->stack_user_size,
5869                       data->regs_user.regs);
5870     }
5871 
5872     if (sample_type & PERF_SAMPLE_WEIGHT)
5873         perf_output_put(handle, data->weight);
5874 
5875     if (sample_type & PERF_SAMPLE_DATA_SRC)
5876         perf_output_put(handle, data->data_src.val);
5877 
5878     if (sample_type & PERF_SAMPLE_TRANSACTION)
5879         perf_output_put(handle, data->txn);
5880 
5881     if (sample_type & PERF_SAMPLE_REGS_INTR) {
5882         u64 abi = data->regs_intr.abi;
5883         /*
5884          * If there are no regs to dump, notice it through
5885          * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5886          */
5887         perf_output_put(handle, abi);
5888 
5889         if (abi) {
5890             u64 mask = event->attr.sample_regs_intr;
5891 
5892             perf_output_sample_regs(handle,
5893                         data->regs_intr.regs,
5894                         mask);
5895         }
5896     }
5897 
5898     if (!event->attr.watermark) {
5899         int wakeup_events = event->attr.wakeup_events;
5900 
5901         if (wakeup_events) {
5902             struct ring_buffer *rb = handle->rb;
5903             int events = local_inc_return(&rb->events);
5904 
5905             if (events >= wakeup_events) {
5906                 local_sub(wakeup_events, &rb->events);
5907                 local_inc(&rb->wakeup);
5908             }
5909         }
5910     }
5911 }
5912 
5913 void perf_prepare_sample(struct perf_event_header *header,
5914              struct perf_sample_data *data,
5915              struct perf_event *event,
5916              struct pt_regs *regs)
5917 {
5918     u64 sample_type = event->attr.sample_type;
5919 
5920     header->type = PERF_RECORD_SAMPLE;
5921     header->size = sizeof(*header) + event->header_size;
5922 
5923     header->misc = 0;
5924     header->misc |= perf_misc_flags(regs);
5925 
5926     __perf_event_header__init_id(header, data, event);
5927 
5928     if (sample_type & PERF_SAMPLE_IP)
5929         data->ip = perf_instruction_pointer(regs);
5930 
5931     if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5932         int size = 1;
5933 
5934         data->callchain = perf_callchain(event, regs);
5935 
5936         if (data->callchain)
5937             size += data->callchain->nr;
5938 
5939         header->size += size * sizeof(u64);
5940     }
5941 
5942     if (sample_type & PERF_SAMPLE_RAW) {
5943         struct perf_raw_record *raw = data->raw;
5944         int size;
5945 
5946         if (raw) {
5947             struct perf_raw_frag *frag = &raw->frag;
5948             u32 sum = 0;
5949 
5950             do {
5951                 sum += frag->size;
5952                 if (perf_raw_frag_last(frag))
5953                     break;
5954                 frag = frag->next;
5955             } while (1);
5956 
5957             size = round_up(sum + sizeof(u32), sizeof(u64));
5958             raw->size = size - sizeof(u32);
5959             frag->pad = raw->size - sum;
5960         } else {
5961             size = sizeof(u64);
5962         }
5963 
5964         header->size += size;
5965     }
5966 
5967     if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5968         int size = sizeof(u64); /* nr */
5969         if (data->br_stack) {
5970             size += data->br_stack->nr
5971                   * sizeof(struct perf_branch_entry);
5972         }
5973         header->size += size;
5974     }
5975 
5976     if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
5977         perf_sample_regs_user(&data->regs_user, regs,
5978                       &data->regs_user_copy);
5979 
5980     if (sample_type & PERF_SAMPLE_REGS_USER) {
5981         /* regs dump ABI info */
5982         int size = sizeof(u64);
5983 
5984         if (data->regs_user.regs) {
5985             u64 mask = event->attr.sample_regs_user;
5986             size += hweight64(mask) * sizeof(u64);
5987         }
5988 
5989         header->size += size;
5990     }
5991 
5992     if (sample_type & PERF_SAMPLE_STACK_USER) {
5993         /*
5994          * Either we need PERF_SAMPLE_STACK_USER bit to be allways
5995          * processed as the last one or have additional check added
5996          * in case new sample type is added, because we could eat
5997          * up the rest of the sample size.
5998          */
5999         u16 stack_size = event->attr.sample_stack_user;
6000         u16 size = sizeof(u64);
6001 
6002         stack_size = perf_sample_ustack_size(stack_size, header->size,
6003                              data->regs_user.regs);
6004 
6005         /*
6006          * If there is something to dump, add space for the dump
6007          * itself and for the field that tells the dynamic size,
6008          * which is how many have been actually dumped.
6009          */
6010         if (stack_size)
6011             size += sizeof(u64) + stack_size;
6012 
6013         data->stack_user_size = stack_size;
6014         header->size += size;
6015     }
6016 
6017     if (sample_type & PERF_SAMPLE_REGS_INTR) {
6018         /* regs dump ABI info */
6019         int size = sizeof(u64);
6020 
6021         perf_sample_regs_intr(&data->regs_intr, regs);
6022 
6023         if (data->regs_intr.regs) {
6024             u64 mask = event->attr.sample_regs_intr;
6025 
6026             size += hweight64(mask) * sizeof(u64);
6027         }
6028 
6029         header->size += size;
6030     }
6031 }
6032 
6033 static void __always_inline
6034 __perf_event_output(struct perf_event *event,
6035             struct perf_sample_data *data,
6036             struct pt_regs *regs,
6037             int (*output_begin)(struct perf_output_handle *,
6038                     struct perf_event *,
6039                     unsigned int))
6040 {
6041     struct perf_output_handle handle;
6042     struct perf_event_header header;
6043 
6044     /* protect the callchain buffers */
6045     rcu_read_lock();
6046 
6047     perf_prepare_sample(&header, data, event, regs);
6048 
6049     if (output_begin(&handle, event, header.size))
6050         goto exit;
6051 
6052     perf_output_sample(&handle, &header, data, event);
6053 
6054     perf_output_end(&handle);
6055 
6056 exit:
6057     rcu_read_unlock();
6058 }
6059 
6060 void
6061 perf_event_output_forward(struct perf_event *event,
6062              struct perf_sample_data *data,
6063              struct pt_regs *regs)
6064 {
6065     __perf_event_output(event, data, regs, perf_output_begin_forward);
6066 }
6067 
6068 void
6069 perf_event_output_backward(struct perf_event *event,
6070                struct perf_sample_data *data,
6071                struct pt_regs *regs)
6072 {
6073     __perf_event_output(event, data, regs, perf_output_begin_backward);
6074 }
6075 
6076 void
6077 perf_event_output(struct perf_event *event,
6078           struct perf_sample_data *data,
6079           struct pt_regs *regs)
6080 {
6081     __perf_event_output(event, data, regs, perf_output_begin);
6082 }
6083 
6084 /*
6085  * read event_id
6086  */
6087 
6088 struct perf_read_event {
6089     struct perf_event_header    header;
6090 
6091     u32             pid;
6092     u32             tid;
6093 };
6094 
6095 static void
6096 perf_event_read_event(struct perf_event *event,
6097             struct task_struct *task)
6098 {
6099     struct perf_output_handle handle;
6100     struct perf_sample_data sample;
6101     struct perf_read_event read_event = {
6102         .header = {
6103             .type = PERF_RECORD_READ,
6104             .misc = 0,
6105             .size = sizeof(read_event) + event->read_size,
6106         },
6107         .pid = perf_event_pid(event, task),
6108         .tid = perf_event_tid(event, task),
6109     };
6110     int ret;
6111 
6112     perf_event_header__init_id(&read_event.header, &sample, event);
6113     ret = perf_output_begin(&handle, event, read_event.header.size);
6114     if (ret)
6115         return;
6116 
6117     perf_output_put(&handle, read_event);
6118     perf_output_read(&handle, event);
6119     perf_event__output_id_sample(event, &handle, &sample);
6120 
6121     perf_output_end(&handle);
6122 }
6123 
6124 typedef void (perf_iterate_f)(struct perf_event *event, void *data);
6125 
6126 static void
6127 perf_iterate_ctx(struct perf_event_context *ctx,
6128            perf_iterate_f output,
6129            void *data, bool all)
6130 {
6131     struct perf_event *event;
6132 
6133     list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6134         if (!all) {
6135             if (event->state < PERF_EVENT_STATE_INACTIVE)
6136                 continue;
6137             if (!event_filter_match(event))
6138                 continue;
6139         }
6140 
6141         output(event, data);
6142     }
6143 }
6144 
6145 static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
6146 {
6147     struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
6148     struct perf_event *event;
6149 
6150     list_for_each_entry_rcu(event, &pel->list, sb_list) {
6151         /*
6152          * Skip events that are not fully formed yet; ensure that
6153          * if we observe event->ctx, both event and ctx will be
6154          * complete enough. See perf_install_in_context().
6155          */
6156         if (!smp_load_acquire(&event->ctx))
6157             continue;
6158 
6159         if (event->state < PERF_EVENT_STATE_INACTIVE)
6160             continue;
6161         if (!event_filter_match(event))
6162             continue;
6163         output(event, data);
6164     }
6165 }
6166 
6167 /*
6168  * Iterate all events that need to receive side-band events.
6169  *
6170  * For new callers; ensure that account_pmu_sb_event() includes
6171  * your event, otherwise it might not get delivered.
6172  */
6173 static void
6174 perf_iterate_sb(perf_iterate_f output, void *data,
6175            struct perf_event_context *task_ctx)
6176 {
6177     struct perf_event_context *ctx;
6178     int ctxn;
6179 
6180     rcu_read_lock();
6181     preempt_disable();
6182 
6183     /*
6184      * If we have task_ctx != NULL we only notify the task context itself.
6185      * The task_ctx is set only for EXIT events before releasing task
6186      * context.
6187      */
6188     if (task_ctx) {
6189         perf_iterate_ctx(task_ctx, output, data, false);
6190         goto done;
6191     }
6192 
6193     perf_iterate_sb_cpu(output, data);
6194 
6195     for_each_task_context_nr(ctxn) {
6196         ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
6197         if (ctx)
6198             perf_iterate_ctx(ctx, output, data, false);
6199     }
6200 done:
6201     preempt_enable();
6202     rcu_read_unlock();
6203 }
6204 
6205 /*
6206  * Clear all file-based filters at exec, they'll have to be
6207  * re-instated when/if these objects are mmapped again.
6208  */
6209 static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
6210 {
6211     struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
6212     struct perf_addr_filter *filter;
6213     unsigned int restart = 0, count = 0;
6214     unsigned long flags;
6215 
6216     if (!has_addr_filter(event))
6217         return;
6218 
6219     raw_spin_lock_irqsave(&ifh->lock, flags);
6220     list_for_each_entry(filter, &ifh->list, entry) {
6221         if (filter->inode) {
6222             event->addr_filters_offs[count] = 0;
6223             restart++;
6224         }
6225 
6226         count++;
6227     }
6228 
6229     if (restart)
6230         event->addr_filters_gen++;
6231     raw_spin_unlock_irqrestore(&ifh->lock, flags);
6232 
6233     if (restart)
6234         perf_event_stop(event, 1);
6235 }
6236 
6237 void perf_event_exec(void)
6238 {
6239     struct perf_event_context *ctx;
6240     int ctxn;
6241 
6242     rcu_read_lock();
6243     for_each_task_context_nr(ctxn) {
6244         ctx = current->perf_event_ctxp[ctxn];
6245         if (!ctx)
6246             continue;
6247 
6248         perf_event_enable_on_exec(ctxn);
6249 
6250         perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
6251                    true);
6252     }
6253     rcu_read_unlock();
6254 }
6255 
6256 struct remote_output {
6257     struct ring_buffer  *rb;
6258     int         err;
6259 };
6260 
6261 static void __perf_event_output_stop(struct perf_event *event, void *data)
6262 {
6263     struct perf_event *parent = event->parent;
6264     struct remote_output *ro = data;
6265     struct ring_buffer *rb = ro->rb;
6266     struct stop_event_data sd = {
6267         .event  = event,
6268     };
6269 
6270     if (!has_aux(event))
6271         return;
6272 
6273     if (!parent)
6274         parent = event;
6275 
6276     /*
6277      * In case of inheritance, it will be the parent that links to the
6278      * ring-buffer, but it will be the child that's actually using it.
6279      *
6280      * We are using event::rb to determine if the event should be stopped,
6281      * however this may race with ring_buffer_attach() (through set_output),
6282      * which will make us skip the event that actually needs to be stopped.
6283      * So ring_buffer_attach() has to stop an aux event before re-assigning
6284      * its rb pointer.
6285      */
6286     if (rcu_dereference(parent->rb) == rb)
6287         ro->err = __perf_event_stop(&sd);
6288 }
6289 
6290 static int __perf_pmu_output_stop(void *info)
6291 {
6292     struct perf_event *event = info;
6293     struct pmu *pmu = event->pmu;
6294     struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
6295     struct remote_output ro = {
6296         .rb = event->rb,
6297     };
6298 
6299     rcu_read_lock();
6300     perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
6301     if (cpuctx->task_ctx)
6302         perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
6303                    &ro, false);
6304     rcu_read_unlock();
6305 
6306     return ro.err;
6307 }
6308 
6309 static void perf_pmu_output_stop(struct perf_event *event)
6310 {
6311     struct perf_event *iter;
6312     int err, cpu;
6313 
6314 restart:
6315     rcu_read_lock();
6316     list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
6317         /*
6318          * For per-CPU events, we need to make sure that neither they
6319          * nor their children are running; for cpu==-1 events it's
6320          * sufficient to stop the event itself if it's active, since
6321          * it can't have children.
6322          */
6323         cpu = iter->cpu;
6324         if (cpu == -1)
6325             cpu = READ_ONCE(iter->oncpu);
6326 
6327         if (cpu == -1)
6328             continue;
6329 
6330         err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
6331         if (err == -EAGAIN) {
6332             rcu_read_unlock();
6333             goto restart;
6334         }
6335     }
6336     rcu_read_unlock();
6337 }
6338 
6339 /*
6340  * task tracking -- fork/exit
6341  *
6342  * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
6343  */
6344 
6345 struct perf_task_event {
6346     struct task_struct      *task;
6347     struct perf_event_context   *task_ctx;
6348 
6349     struct {
6350         struct perf_event_header    header;
6351 
6352         u32             pid;
6353         u32             ppid;
6354         u32             tid;
6355         u32             ptid;
6356         u64             time;
6357     } event_id;
6358 };
6359 
6360 static int perf_event_task_match(struct perf_event *event)
6361 {
6362     return event->attr.comm  || event->attr.mmap ||
6363            event->attr.mmap2 || event->attr.mmap_data ||
6364            event->attr.task;
6365 }
6366 
6367 static void perf_event_task_output(struct perf_event *event,
6368                    void *data)
6369 {
6370     struct perf_task_event *task_event = data;
6371     struct perf_output_handle handle;
6372     struct perf_sample_data sample;
6373     struct task_struct *task = task_event->task;
6374     int ret, size = task_event->event_id.header.size;
6375 
6376     if (!perf_event_task_match(event))
6377         return;
6378 
6379     perf_event_header__init_id(&task_event->event_id.header, &sample, event);
6380 
6381     ret = perf_output_begin(&handle, event,
6382                 task_event->event_id.header.size);
6383     if (ret)
6384         goto out;
6385 
6386     task_event->event_id.pid = perf_event_pid(event, task);
6387     task_event->event_id.ppid = perf_event_pid(event, current);
6388 
6389     task_event->event_id.tid = perf_event_tid(event, task);
6390     task_event->event_id.ptid = perf_event_tid(event, current);
6391 
6392     task_event->event_id.time = perf_event_clock(event);
6393 
6394     perf_output_put(&handle, task_event->event_id);
6395 
6396     perf_event__output_id_sample(event, &handle, &sample);
6397 
6398     perf_output_end(&handle);
6399 out:
6400     task_event->event_id.header.size = size;
6401 }
6402 
6403 static void perf_event_task(struct task_struct *task,
6404                   struct perf_event_context *task_ctx,
6405                   int new)
6406 {
6407     struct perf_task_event task_event;
6408 
6409     if (!atomic_read(&nr_comm_events) &&
6410         !atomic_read(&nr_mmap_events) &&
6411         !atomic_read(&nr_task_events))
6412         return;
6413 
6414     task_event = (struct perf_task_event){
6415         .task     = task,
6416         .task_ctx = task_ctx,
6417         .event_id    = {
6418             .header = {
6419                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
6420                 .misc = 0,
6421                 .size = sizeof(task_event.event_id),
6422             },
6423             /* .pid  */
6424             /* .ppid */
6425             /* .tid  */
6426             /* .ptid */
6427             /* .time */
6428         },
6429     };
6430 
6431     perf_iterate_sb(perf_event_task_output,
6432                &task_event,
6433                task_ctx);
6434 }
6435 
6436 void perf_event_fork(struct task_struct *task)
6437 {
6438     perf_event_task(task, NULL, 1);
6439 }
6440 
6441 /*
6442  * comm tracking
6443  */
6444 
6445 struct perf_comm_event {
6446     struct task_struct  *task;
6447     char            *comm;
6448     int         comm_size;
6449 
6450     struct {
6451         struct perf_event_header    header;
6452 
6453         u32             pid;
6454         u32             tid;
6455     } event_id;
6456 };
6457 
6458 static int perf_event_comm_match(struct perf_event *event)
6459 {
6460     return event->attr.comm;
6461 }
6462 
6463 static void perf_event_comm_output(struct perf_event *event,
6464                    void *data)
6465 {
6466     struct perf_comm_event *comm_event = data;
6467     struct perf_output_handle handle;
6468     struct perf_sample_data sample;
6469     int size = comm_event->event_id.header.size;
6470     int ret;
6471 
6472     if (!perf_event_comm_match(event))
6473         return;
6474 
6475     perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
6476     ret = perf_output_begin(&handle, event,
6477                 comm_event->event_id.header.size);
6478 
6479     if (ret)
6480         goto out;
6481 
6482     comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
6483     comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
6484 
6485     perf_output_put(&handle, comm_event->event_id);
6486     __output_copy(&handle, comm_event->comm,
6487                    comm_event->comm_size);
6488 
6489     perf_event__output_id_sample(event, &handle, &sample);
6490 
6491     perf_output_end(&handle);
6492 out:
6493     comm_event->event_id.header.size = size;
6494 }
6495 
6496 static void perf_event_comm_event(struct perf_comm_event *comm_event)
6497 {
6498     char comm[TASK_COMM_LEN];
6499     unsigned int size;
6500 
6501     memset(comm, 0, sizeof(comm));
6502     strlcpy(comm, comm_event->task->comm, sizeof(comm));
6503     size = ALIGN(strlen(comm)+1, sizeof(u64));
6504 
6505     comm_event->comm = comm;
6506     comm_event->comm_size = size;
6507 
6508     comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
6509 
6510     perf_iterate_sb(perf_event_comm_output,
6511                comm_event,
6512                NULL);
6513 }
6514 
6515 void perf_event_comm(struct task_struct *task, bool exec)
6516 {
6517     struct perf_comm_event comm_event;
6518 
6519     if (!atomic_read(&nr_comm_events))
6520         return;
6521 
6522     comm_event = (struct perf_comm_event){
6523         .task   = task,
6524         /* .comm      */
6525         /* .comm_size */
6526         .event_id  = {
6527             .header = {
6528                 .type = PERF_RECORD_COMM,
6529                 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
6530                 /* .size */
6531             },
6532             /* .pid */
6533             /* .tid */
6534         },
6535     };
6536 
6537     perf_event_comm_event(&comm_event);
6538 }
6539 
6540 /*
6541  * mmap tracking
6542  */
6543 
6544 struct perf_mmap_event {
6545     struct vm_area_struct   *vma;
6546 
6547     const char      *file_name;
6548     int         file_size;
6549     int         maj, min;
6550     u64         ino;
6551     u64         ino_generation;
6552     u32         prot, flags;
6553 
6554     struct {
6555         struct perf_event_header    header;
6556 
6557         u32             pid;
6558         u32             tid;
6559         u64             start;
6560         u64             len;
6561         u64             pgoff;
6562     } event_id;
6563 };
6564 
6565 static int perf_event_mmap_match(struct perf_event *event,
6566                  void *data)
6567 {
6568     struct perf_mmap_event *mmap_event = data;
6569     struct vm_area_struct *vma = mmap_event->vma;
6570     int executable = vma->vm_flags & VM_EXEC;
6571 
6572     return (!executable && event->attr.mmap_data) ||
6573            (executable && (event->attr.mmap || event->attr.mmap2));
6574 }
6575 
6576 static void perf_event_mmap_output(struct perf_event *event,
6577                    void *data)
6578 {
6579     struct perf_mmap_event *mmap_event = data;
6580     struct perf_output_handle handle;
6581     struct perf_sample_data sample;
6582     int size = mmap_event->event_id.header.size;
6583     int ret;
6584 
6585     if (!perf_event_mmap_match(event, data))
6586         return;
6587 
6588     if (event->attr.mmap2) {
6589         mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
6590         mmap_event->event_id.header.size += sizeof(mmap_event->maj);
6591         mmap_event->event_id.header.size += sizeof(mmap_event->min);
6592         mmap_event->event_id.header.size += sizeof(mmap_event->ino);
6593         mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
6594         mmap_event->event_id.header.size += sizeof(mmap_event->prot);
6595         mmap_event->event_id.header.size += sizeof(mmap_event->flags);
6596     }
6597 
6598     perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
6599     ret = perf_output_begin(&handle, event,
6600                 mmap_event->event_id.header.size);
6601     if (ret)
6602         goto out;
6603 
6604     mmap_event->event_id.pid = perf_event_pid(event, current);
6605     mmap_event->event_id.tid = perf_event_tid(event, cu