// SPDX-License-Identifier: GPL-2.0
/*
 * trace event based perf event profiling/tracing
 *
 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra
 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
 */

#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/security.h>
#include "trace.h"
#include "trace_probe.h"

static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];

/*
 * Force it to be aligned to unsigned long to avoid misaligned access
 * surprises.
 */
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
    perf_trace_t;

/* Count the events in use (per event id, not per instance) */
static int total_ref_count;

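/*
 * Check whether @p_event may be attached to @tp_event: run the event's
 * own ->perf_perm() hook, skip further checks for inherited (child)
 * events, restrict the ftrace function trace event to callers that pass
 * perf_allow_tracepoint() (and forbid user space callchains/stack dumps
 * for its sampling events), and require perf_allow_tracepoint() for raw
 * sample data unless the event is task-bound and marked CAP_ANY.
 */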
static int perf_trace_event_perm(struct trace_event_call *tp_event,
                                 struct perf_event *p_event)
{
    int ret;

    if (tp_event->perf_perm) {
        ret = tp_event->perf_perm(tp_event, p_event);
        if (ret)
            return ret;
    }

    /*
     * We checked and allowed creation of the parent,
     * so allow children without checking.
     */
    if (p_event->parent)
        return 0;

    /*
     * It's ok to check current process (owner) permissions in here,
     * because the code below is called only via the perf_event_open
     * syscall.
     */

    /* The ftrace function trace is allowed only for root. */
    if (ftrace_event_is_function(tp_event)) {
        ret = perf_allow_tracepoint(&p_event->attr);
        if (ret)
            return ret;

        if (!is_sampling_event(p_event))
            return 0;

        /*
         * We don't allow user space callchains for the function trace
         * event, due to issues with page faults while tracing the page
         * fault handler, and its overall trickiness.
         */
        if (!p_event->attr.exclude_callchain_user)
            return -EINVAL;

        /*
         * Same reason to disable user stack dumps as for user space
         * callchains above.
         */
        if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
            return -EINVAL;
    }

    /* No tracing, just counting, so no obvious leak */
    if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
        return 0;

    /* Some events are ok to be traced by non-root users... */
    if (p_event->attach_state == PERF_ATTACH_TASK) {
        if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
            return 0;
    }

    /*
     * ...otherwise raw tracepoint data can be a severe data leak,
     * so only allow root to have these.
     */
    ret = perf_allow_tracepoint(&p_event->attr);
    if (ret)
        return ret;

    return 0;
}

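/*
 * Take a perf reference on @tp_event. The first perf user of the event
 * allocates the per-CPU hlist that will hold its perf events and, if no
 * other perf trace event is active (total_ref_count == 0), the shared
 * per-context raw sample buffers, then invokes the class ->reg() op with
 * TRACE_REG_PERF_REGISTER. All allocations are unwound on failure.
 */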
static int perf_trace_event_reg(struct trace_event_call *tp_event,
                                struct perf_event *p_event)
{
    struct hlist_head __percpu *list;
    int ret = -ENOMEM;
    int cpu;

    p_event->tp_event = tp_event;
    if (tp_event->perf_refcount++ > 0)
        return 0;

    list = alloc_percpu(struct hlist_head);
    if (!list)
        goto fail;

    for_each_possible_cpu(cpu)
        INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));

    tp_event->perf_events = list;

    if (!total_ref_count) {
        char __percpu *buf;
        int i;

        for (i = 0; i < PERF_NR_CONTEXTS; i++) {
            buf = (char __percpu *)alloc_percpu(perf_trace_t);
            if (!buf)
                goto fail;

            perf_trace_buf[i] = buf;
        }
    }

    ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
    if (ret)
        goto fail;

    total_ref_count++;
    return 0;

fail:
    if (!total_ref_count) {
        int i;

        for (i = 0; i < PERF_NR_CONTEXTS; i++) {
            free_percpu(perf_trace_buf[i]);
            perf_trace_buf[i] = NULL;
        }
    }

    if (!--tp_event->perf_refcount) {
        free_percpu(tp_event->perf_events);
        tp_event->perf_events = NULL;
    }

    return ret;
}

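/*
 * Drop the perf reference taken in perf_trace_event_reg(). The last user
 * unregisters the class callback, waits for in-flight tracepoint handlers
 * to finish via tracepoint_synchronize_unregister(), then frees the
 * per-CPU hlist and, once total_ref_count reaches zero, the shared
 * per-context buffers.
 */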
static void perf_trace_event_unreg(struct perf_event *p_event)
{
    struct trace_event_call *tp_event = p_event->tp_event;
    int i;

    if (--tp_event->perf_refcount > 0)
        return;

    tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);

    /*
     * Ensure our callback won't be called anymore. The buffers
     * will be freed after that.
     */
    tracepoint_synchronize_unregister();

    free_percpu(tp_event->perf_events);
    tp_event->perf_events = NULL;

    if (!--total_ref_count) {
        for (i = 0; i < PERF_NR_CONTEXTS; i++) {
            free_percpu(perf_trace_buf[i]);
            perf_trace_buf[i] = NULL;
        }
    }
}

static int perf_trace_event_open(struct perf_event *p_event)
{
    struct trace_event_call *tp_event = p_event->tp_event;
    return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
}

static void perf_trace_event_close(struct perf_event *p_event)
{
    struct trace_event_call *tp_event = p_event->tp_event;
    tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
}

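/*
 * Full setup of one perf event on @tp_event: permission check, then
 * registration, then TRACE_REG_PERF_OPEN. Registration is unwound if the
 * open step fails.
 */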
static int perf_trace_event_init(struct trace_event_call *tp_event,
                                 struct perf_event *p_event)
{
    int ret;

    ret = perf_trace_event_perm(tp_event, p_event);
    if (ret)
        return ret;

    ret = perf_trace_event_reg(tp_event, p_event);
    if (ret)
        return ret;

    ret = perf_trace_event_open(p_event);
    if (ret) {
        perf_trace_event_unreg(p_event);
        return ret;
    }

    return 0;
}

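/*
 * Bind a new perf event to the trace event whose id matches
 * p_event->attr.config. The lookup walks ftrace_events under event_mutex,
 * takes a reference on the matching trace_event_call and initializes the
 * perf side; the reference is dropped again if initialization fails.
 */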
int perf_trace_init(struct perf_event *p_event)
{
    struct trace_event_call *tp_event;
    u64 event_id = p_event->attr.config;
    int ret = -EINVAL;

    mutex_lock(&event_mutex);
    list_for_each_entry(tp_event, &ftrace_events, list) {
        if (tp_event->event.type == event_id &&
            tp_event->class && tp_event->class->reg &&
            trace_event_try_get_ref(tp_event)) {
            ret = perf_trace_event_init(tp_event, p_event);
            if (ret)
                trace_event_put_ref(tp_event);
            break;
        }
    }
    mutex_unlock(&event_mutex);

    return ret;
}

void perf_trace_destroy(struct perf_event *p_event)
{
    mutex_lock(&event_mutex);
    perf_trace_event_close(p_event);
    perf_trace_event_unreg(p_event);
    trace_event_put_ref(p_event->tp_event);
    mutex_unlock(&event_mutex);
}

#ifdef CONFIG_KPROBE_EVENTS
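/*
 * Create a local (perf-only) kprobe trace event from attr.kprobe_func or
 * attr.kprobe_addr plus attr.probe_offset, and bind @p_event to it. The
 * symbol name is copied from user space; the local event is destroyed
 * again if perf_trace_event_init() fails.
 */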
int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
{
    int ret;
    char *func = NULL;
    struct trace_event_call *tp_event;

    if (p_event->attr.kprobe_func) {
        func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL);
        if (!func)
            return -ENOMEM;
        ret = strncpy_from_user(
            func, u64_to_user_ptr(p_event->attr.kprobe_func),
            KSYM_NAME_LEN);
        if (ret == KSYM_NAME_LEN)
            ret = -E2BIG;
        if (ret < 0)
            goto out;

        if (func[0] == '\0') {
            kfree(func);
            func = NULL;
        }
    }

    tp_event = create_local_trace_kprobe(
        func, (void *)(unsigned long)(p_event->attr.kprobe_addr),
        p_event->attr.probe_offset, is_retprobe);
    if (IS_ERR(tp_event)) {
        ret = PTR_ERR(tp_event);
        goto out;
    }

    mutex_lock(&event_mutex);
    ret = perf_trace_event_init(tp_event, p_event);
    if (ret)
        destroy_local_trace_kprobe(tp_event);
    mutex_unlock(&event_mutex);
out:
    kfree(func);
    return ret;
}

void perf_kprobe_destroy(struct perf_event *p_event)
{
    mutex_lock(&event_mutex);
    perf_trace_event_close(p_event);
    perf_trace_event_unreg(p_event);
    trace_event_put_ref(p_event->tp_event);
    mutex_unlock(&event_mutex);

    destroy_local_trace_kprobe(p_event->tp_event);
}
#endif /* CONFIG_KPROBE_EVENTS */

#ifdef CONFIG_UPROBE_EVENTS
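/*
 * Uprobe counterpart of perf_kprobe_init(): copy attr.uprobe_path from
 * user space, create a local trace_uprobe at attr.probe_offset (with an
 * optional reference counter offset), and bind @p_event to it under
 * event_mutex.
 */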
int perf_uprobe_init(struct perf_event *p_event,
                     unsigned long ref_ctr_offset, bool is_retprobe)
{
    int ret;
    char *path = NULL;
    struct trace_event_call *tp_event;

    if (!p_event->attr.uprobe_path)
        return -EINVAL;

    path = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path),
                        PATH_MAX);
    if (IS_ERR(path)) {
        ret = PTR_ERR(path);
        return (ret == -EINVAL) ? -E2BIG : ret;
    }
    if (path[0] == '\0') {
        ret = -EINVAL;
        goto out;
    }

    tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset,
                                         ref_ctr_offset, is_retprobe);
    if (IS_ERR(tp_event)) {
        ret = PTR_ERR(tp_event);
        goto out;
    }

    /*
     * A local trace_uprobe needs to hold event_mutex to call
     * uprobe_buffer_enable() and uprobe_buffer_disable().
     * event_mutex is not required for local trace_kprobes.
     */
    mutex_lock(&event_mutex);
    ret = perf_trace_event_init(tp_event, p_event);
    if (ret)
        destroy_local_trace_uprobe(tp_event);
    mutex_unlock(&event_mutex);
out:
    kfree(path);
    return ret;
}

void perf_uprobe_destroy(struct perf_event *p_event)
{
    mutex_lock(&event_mutex);
    perf_trace_event_close(p_event);
    perf_trace_event_unreg(p_event);
    trace_event_put_ref(p_event->tp_event);
    mutex_unlock(&event_mutex);
    destroy_local_trace_uprobe(p_event->tp_event);
}
#endif /* CONFIG_UPROBE_EVENTS */

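/*
 * Schedule @p_event in on this CPU. Unless the class ->reg() op handles
 * TRACE_REG_PERF_ADD itself (returns nonzero), the event is added to this
 * CPU's perf_events hlist so the tracepoint callback can find it.
 */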
int perf_trace_add(struct perf_event *p_event, int flags)
{
    struct trace_event_call *tp_event = p_event->tp_event;

    if (!(flags & PERF_EF_START))
        p_event->hw.state = PERF_HES_STOPPED;

    /*
     * If TRACE_REG_PERF_ADD returns false, no custom action was performed
     * and we need to take the default action of enqueueing our event on
     * the right per-cpu hlist.
     */
    if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
        struct hlist_head __percpu *pcpu_list;
        struct hlist_head *list;

        pcpu_list = tp_event->perf_events;
        if (WARN_ON_ONCE(!pcpu_list))
            return -EINVAL;

        list = this_cpu_ptr(pcpu_list);
        hlist_add_head_rcu(&p_event->hlist_entry, list);
    }

    return 0;
}

void perf_trace_del(struct perf_event *p_event, int flags)
{
    struct trace_event_call *tp_event = p_event->tp_event;

    /*
     * If TRACE_REG_PERF_DEL returns false, no custom action was performed
     * and we need to take the default action of dequeueing our event from
     * the right per-cpu hlist.
     */
    if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
        hlist_del_rcu(&p_event->hlist_entry);
}

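/*
 * Grab a recursion context and return the matching per-CPU scratch buffer
 * for building a raw sample of @size bytes (at most PERF_MAX_TRACE_SIZE).
 * The context index is stored in *rctxp for the later submit, and *regs,
 * if non-NULL, is pointed at the pt_regs slot reserved for that context.
 * Returns NULL if the recursion check fails or the size is too large.
 */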
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
{
    char *raw_data;
    int rctx;

    BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));

    if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
                  "perf buffer not large enough, wanted %d, have %d",
                  size, PERF_MAX_TRACE_SIZE))
        return NULL;

    *rctxp = rctx = perf_swevent_get_recursion_context();
    if (rctx < 0)
        return NULL;

    if (regs)
        *regs = this_cpu_ptr(&__perf_regs[rctx]);
    raw_data = this_cpu_ptr(perf_trace_buf[rctx]);

    /* zero the dead alignment bytes so we don't leak stack data to user space */
    memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
    return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_alloc);
NOKPROBE_SYMBOL(perf_trace_buf_alloc);

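/*
 * Fill in the common trace_entry header (event type plus the current
 * tracing context flags) for a record built in the perf trace buffer.
 */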
void perf_trace_buf_update(void *record, u16 type)
{
    struct trace_entry *entry = record;

    tracing_generic_entry_update(entry, type, tracing_gen_ctx());
}
NOKPROBE_SYMBOL(perf_trace_buf_update);

#ifdef CONFIG_FUNCTION_TRACER
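/*
 * ftrace_ops callback backing the perf function trace event: build a
 * TRACE_FN sample (ip, parent_ip) in the perf trace buffer and submit it,
 * but only on the CPU that perf recorded in ops->private via
 * TRACE_REG_PERF_ADD.
 */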
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
                          struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
    struct ftrace_entry *entry;
    struct perf_event *event;
    struct hlist_head head;
    struct pt_regs regs;
    int rctx;
    int bit;

    if (!rcu_is_watching())
        return;

    bit = ftrace_test_recursion_trylock(ip, parent_ip);
    if (bit < 0)
        return;

    if ((unsigned long)ops->private != smp_processor_id())
        goto out;

    event = container_of(ops, struct perf_event, ftrace_ops);

    /*
     * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
     * the perf code does is hlist_for_each_entry_rcu(), so we can
     * get away with simply setting the @head.first pointer in order
     * to create a singular list.
     */
    head.first = &event->hlist_entry;

#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
                    sizeof(u64)) - sizeof(u32))

    BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);

    memset(&regs, 0, sizeof(regs));
    perf_fetch_caller_regs(&regs);

    entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
    if (!entry)
        goto out;

    entry->ip = ip;
    entry->parent_ip = parent_ip;
    perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
                          1, &regs, &head, NULL);

out:
    ftrace_test_recursion_unlock(bit);
#undef ENTRY_SIZE
}

static int perf_ftrace_function_register(struct perf_event *event)
{
    struct ftrace_ops *ops = &event->ftrace_ops;

    ops->func    = perf_ftrace_function_call;
    ops->private = (void *)(unsigned long)nr_cpu_ids;

    return register_ftrace_function(ops);
}

static int perf_ftrace_function_unregister(struct perf_event *event)
{
    struct ftrace_ops *ops = &event->ftrace_ops;
    int ret = unregister_ftrace_function(ops);
    ftrace_free_filter(ops);
    return ret;
}

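/*
 * ->reg() implementation for the ftrace function trace event. OPEN/CLOSE
 * register and unregister the per-event ftrace_ops, while ADD/DEL stash
 * the owning CPU in ops->private (nr_cpu_ids meaning "no CPU") and return
 * 1 so the caller skips the default per-CPU hlist handling.
 */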
int perf_ftrace_event_register(struct trace_event_call *call,
                               enum trace_reg type, void *data)
{
    struct perf_event *event = data;

    switch (type) {
    case TRACE_REG_REGISTER:
    case TRACE_REG_UNREGISTER:
        break;
    case TRACE_REG_PERF_REGISTER:
    case TRACE_REG_PERF_UNREGISTER:
        return 0;
    case TRACE_REG_PERF_OPEN:
        return perf_ftrace_function_register(data);
    case TRACE_REG_PERF_CLOSE:
        return perf_ftrace_function_unregister(data);
    case TRACE_REG_PERF_ADD:
        event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
        return 1;
    case TRACE_REG_PERF_DEL:
        event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
        return 1;
    }

    return -EINVAL;
}
#endif /* CONFIG_FUNCTION_TRACER */