0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Performance events ring-buffer code:
0004  *
0005  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
0006  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
0007  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
0008  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
0009  */
0010 
0011 #include <linux/perf_event.h>
0012 #include <linux/vmalloc.h>
0013 #include <linux/slab.h>
0014 #include <linux/circ_buf.h>
0015 #include <linux/poll.h>
0016 #include <linux/nospec.h>
0017 
0018 #include "internal.h"
0019 
0020 static void perf_output_wakeup(struct perf_output_handle *handle)
0021 {
0022     atomic_set(&handle->rb->poll, EPOLLIN);
0023 
0024     handle->event->pending_wakeup = 1;
0025     irq_work_queue(&handle->event->pending);
0026 }
0027 
0028 /*
0029  * We need to ensure a later event_id doesn't publish a head when a former
0030  * event isn't done writing. However since we need to deal with NMIs we
0031  * cannot fully serialize things.
0032  *
0033  * We only publish the head (and generate a wakeup) when the outer-most
0034  * event completes.
0035  */
0036 static void perf_output_get_handle(struct perf_output_handle *handle)
0037 {
0038     struct perf_buffer *rb = handle->rb;
0039 
0040     preempt_disable();
0041 
0042     /*
0043      * Avoid an explicit LOAD/STORE such that architectures with memops
0044      * can use them.
0045      */
0046     (*(volatile unsigned int *)&rb->nest)++;
0047     handle->wakeup = local_read(&rb->wakeup);
0048 }
0049 
0050 static void perf_output_put_handle(struct perf_output_handle *handle)
0051 {
0052     struct perf_buffer *rb = handle->rb;
0053     unsigned long head;
0054     unsigned int nest;
0055 
0056     /*
0057      * If this isn't the outermost nesting, we don't have to update
0058      * @rb->user_page->data_head.
0059      */
0060     nest = READ_ONCE(rb->nest);
0061     if (nest > 1) {
0062         WRITE_ONCE(rb->nest, nest - 1);
0063         goto out;
0064     }
0065 
0066 again:
0067     /*
0068      * In order to avoid publishing a head value that goes backwards,
0069      * we must ensure the load of @rb->head happens after we've
0070      * incremented @rb->nest.
0071      *
0072      * Otherwise we can observe a @rb->head value before one published
0073      * by an IRQ/NMI happening between the load and the increment.
0074      */
0075     barrier();
0076     head = local_read(&rb->head);
0077 
0078     /*
0079      * IRQ/NMI can happen here and advance @rb->head, causing our
0080      * load above to be stale.
0081      */
0082 
0083     /*
0084      * Since the mmap() consumer (userspace) can run on a different CPU:
0085      *
0086      *   kernel                             user
0087      *
0088      *   if (LOAD ->data_tail) {            LOAD ->data_head
0089      *                      (A)             smp_rmb()       (C)
0090      *      STORE $data                     LOAD $data
0091      *      smp_wmb()       (B)             smp_mb()        (D)
0092      *      STORE ->data_head               STORE ->data_tail
0093      *   }
0094      *
0095      * Where A pairs with D, and B pairs with C.
0096      *
0097      * In our case (A) is a control dependency that separates the load of
0098      * the ->data_tail and the stores of $data. In case ->data_tail
0099      * indicates there is no room in the buffer to store $data, we do not store it.
0100      *
0101      * D needs to be a full barrier since it separates the data READ
0102      * from the tail WRITE.
0103      *
0104      * For B a WMB is sufficient since it separates two WRITEs, and for C
0105      * an RMB is sufficient since it separates two READs.
0106      *
0107      * See perf_output_begin().
0108      */
0109     smp_wmb(); /* B, matches C */
0110     WRITE_ONCE(rb->user_page->data_head, head);
0111 
0112     /*
0113      * We must publish the head before decrementing the nest count,
0114      * otherwise an IRQ/NMI can publish a more recent head value and our
0115      * write will (temporarily) publish a stale value.
0116      */
0117     barrier();
0118     WRITE_ONCE(rb->nest, 0);
0119 
0120     /*
0121      * Ensure we decrement @rb->nest before we validate the @rb->head.
0122      * Otherwise we cannot be sure we caught the 'last' nested update.
0123      */
0124     barrier();
0125     if (unlikely(head != local_read(&rb->head))) {
0126         WRITE_ONCE(rb->nest, 1);
0127         goto again;
0128     }
0129 
0130     if (handle->wakeup != local_read(&rb->wakeup))
0131         perf_output_wakeup(handle);
0132 
0133 out:
0134     preempt_enable();
0135 }
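
/*
 * Illustration (not part of the upstream file): the userspace half of the
 * A/B/C/D pairing described above, as a minimal sketch.  It assumes the
 * documented perf_event_mmap_page ABI and the GCC/Clang __atomic builtins.
 * The acquire load is (C), pairing with smp_wmb() (B) above; the release
 * store is (D), pairing with the control dependency (A) in
 * __perf_output_begin(); a release store suffices here because it orders
 * the preceding data loads before the tail update.  Records that wrap past
 * the end of the data area are ignored for brevity.
 */
static inline void example_consume_ring(struct perf_event_mmap_page *pc,
					void *data)
{
	__u64 head, tail;

	head = __atomic_load_n(&pc->data_head, __ATOMIC_ACQUIRE);	/* C */
	tail = pc->data_tail;

	while (tail != head) {
		struct perf_event_header *hdr;

		/* the data area size is a power of two, mask for the offset */
		hdr = data + (tail & (pc->data_size - 1));
		/* ... consume one record of hdr->size bytes ... */
		tail += hdr->size;
	}

	__atomic_store_n(&pc->data_tail, tail, __ATOMIC_RELEASE);	/* D */
}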
0136 
0137 static __always_inline bool
0138 ring_buffer_has_space(unsigned long head, unsigned long tail,
0139               unsigned long data_size, unsigned int size,
0140               bool backward)
0141 {
0142     if (!backward)
0143         return CIRC_SPACE(head, tail, data_size) >= size;
0144     else
0145         return CIRC_SPACE(tail, head, data_size) >= size;
0146 }
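
/*
 * Worked example (not part of the upstream file): @head and @tail are
 * free-running byte counters and perf_data_size() is a power of two, so
 * CIRC_SPACE(head, tail, size) == size - (head - tail) - 1.  With a
 * 4096-byte data area, head == 4000 and tail == 100, 3900 bytes are in
 * use and CIRC_SPACE() == (100 - 4001) & 4095 == 195: a 195-byte record
 * still fits, a 196-byte one does not.  The backward case simply swaps
 * the roles of head and tail.
 */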
0147 
0148 static __always_inline int
0149 __perf_output_begin(struct perf_output_handle *handle,
0150             struct perf_sample_data *data,
0151             struct perf_event *event, unsigned int size,
0152             bool backward)
0153 {
0154     struct perf_buffer *rb;
0155     unsigned long tail, offset, head;
0156     int have_lost, page_shift;
0157     struct {
0158         struct perf_event_header header;
0159         u64          id;
0160         u64          lost;
0161     } lost_event;
0162 
0163     rcu_read_lock();
0164     /*
0165      * For inherited events we send all the output towards the parent.
0166      */
0167     if (event->parent)
0168         event = event->parent;
0169 
0170     rb = rcu_dereference(event->rb);
0171     if (unlikely(!rb))
0172         goto out;
0173 
0174     if (unlikely(rb->paused)) {
0175         if (rb->nr_pages) {
0176             local_inc(&rb->lost);
0177             atomic64_inc(&event->lost_samples);
0178         }
0179         goto out;
0180     }
0181 
0182     handle->rb    = rb;
0183     handle->event = event;
0184 
0185     have_lost = local_read(&rb->lost);
0186     if (unlikely(have_lost)) {
0187         size += sizeof(lost_event);
0188         if (event->attr.sample_id_all)
0189             size += event->id_header_size;
0190     }
0191 
0192     perf_output_get_handle(handle);
0193 
0194     do {
0195         tail = READ_ONCE(rb->user_page->data_tail);
0196         offset = head = local_read(&rb->head);
0197         if (!rb->overwrite) {
0198             if (unlikely(!ring_buffer_has_space(head, tail,
0199                                 perf_data_size(rb),
0200                                 size, backward)))
0201                 goto fail;
0202         }
0203 
0204         /*
0205      * The above forms a control dependency barrier separating the
0206      * @tail load above from the data stores below, since the @tail
0207      * load is required to compute the branch to the fail label below.
0208          *
0209          * A, matches D; the full memory barrier userspace SHOULD issue
0210          * after reading the data and before storing the new tail
0211          * position.
0212          *
0213          * See perf_output_put_handle().
0214          */
0215 
0216         if (!backward)
0217             head += size;
0218         else
0219             head -= size;
0220     } while (local_cmpxchg(&rb->head, offset, head) != offset);
0221 
0222     if (backward) {
0223         offset = head;
0224         head = (u64)(-head);
0225     }
0226 
0227     /*
0228      * We rely on the implied barrier() by local_cmpxchg() to ensure
0229      * none of the data stores below can be lifted up by the compiler.
0230      */
0231 
0232     if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
0233         local_add(rb->watermark, &rb->wakeup);
0234 
0235     page_shift = PAGE_SHIFT + page_order(rb);
0236 
0237     handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
0238     offset &= (1UL << page_shift) - 1;
0239     handle->addr = rb->data_pages[handle->page] + offset;
0240     handle->size = (1UL << page_shift) - offset;
0241 
0242     if (unlikely(have_lost)) {
0243         lost_event.header.size = sizeof(lost_event);
0244         lost_event.header.type = PERF_RECORD_LOST;
0245         lost_event.header.misc = 0;
0246         lost_event.id          = event->id;
0247         lost_event.lost        = local_xchg(&rb->lost, 0);
0248 
0249         /* XXX mostly redundant; @data is already fully initialized */
0250         perf_event_header__init_id(&lost_event.header, data, event);
0251         perf_output_put(handle, lost_event);
0252         perf_event__output_id_sample(event, handle, data);
0253     }
0254 
0255     return 0;
0256 
0257 fail:
0258     local_inc(&rb->lost);
0259     atomic64_inc(&event->lost_samples);
0260     perf_output_put_handle(handle);
0261 out:
0262     rcu_read_unlock();
0263 
0264     return -ENOSPC;
0265 }
0266 
0267 int perf_output_begin_forward(struct perf_output_handle *handle,
0268                   struct perf_sample_data *data,
0269                   struct perf_event *event, unsigned int size)
0270 {
0271     return __perf_output_begin(handle, data, event, size, false);
0272 }
0273 
0274 int perf_output_begin_backward(struct perf_output_handle *handle,
0275                    struct perf_sample_data *data,
0276                    struct perf_event *event, unsigned int size)
0277 {
0278     return __perf_output_begin(handle, data, event, size, true);
0279 }
0280 
0281 int perf_output_begin(struct perf_output_handle *handle,
0282               struct perf_sample_data *data,
0283               struct perf_event *event, unsigned int size)
0284 {
0285 
0286     return __perf_output_begin(handle, data, event, size,
0287                    unlikely(is_write_backward(event)));
0288 }
0289 
0290 unsigned int perf_output_copy(struct perf_output_handle *handle,
0291               const void *buf, unsigned int len)
0292 {
0293     return __output_copy(handle, buf, len);
0294 }
0295 
0296 unsigned int perf_output_skip(struct perf_output_handle *handle,
0297                   unsigned int len)
0298 {
0299     return __output_skip(handle, NULL, len);
0300 }
0301 
0302 void perf_output_end(struct perf_output_handle *handle)
0303 {
0304     perf_output_put_handle(handle);
0305     rcu_read_unlock();
0306 }
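
/*
 * Illustration (not part of the upstream file): the usual in-kernel
 * producer sequence built on the helpers above, in the same shape as the
 * side-band record emitters in events/core.c.  The record layout and the
 * PERF_RECORD_MAX type are placeholders for the example only.
 */
static void example_emit_record(struct perf_event *event)
{
	struct perf_output_handle handle;
	struct perf_sample_data sample;
	struct {
		struct perf_event_header	header;
		u64				value;
	} rec;

	rec.header.type = PERF_RECORD_MAX;	/* placeholder record type */
	rec.header.misc = 0;
	rec.header.size = sizeof(rec);
	rec.value = 0;

	/* Grow header.size by the sample_id trailer, if requested. */
	perf_event_header__init_id(&rec.header, &sample, event);

	if (perf_output_begin(&handle, &sample, event, rec.header.size))
		return;					/* no buffer or no room */

	perf_output_put(&handle, rec);			/* copy the record */
	perf_event__output_id_sample(event, &handle, &sample);
	perf_output_end(&handle);			/* publish data_head */
}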
0307 
0308 static void
0309 ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
0310 {
0311     long max_size = perf_data_size(rb);
0312 
0313     if (watermark)
0314         rb->watermark = min(max_size, watermark);
0315 
0316     if (!rb->watermark)
0317         rb->watermark = max_size / 2;
0318 
0319     if (flags & RING_BUFFER_WRITABLE)
0320         rb->overwrite = 0;
0321     else
0322         rb->overwrite = 1;
0323 
0324     refcount_set(&rb->refcount, 1);
0325 
0326     INIT_LIST_HEAD(&rb->event_list);
0327     spin_lock_init(&rb->event_lock);
0328 
0329     /*
0330      * perf_output_begin() only checks rb->paused, therefore
0331      * rb->paused must be true if we have no pages for output.
0332      */
0333     if (!rb->nr_pages)
0334         rb->paused = 1;
0335 }
0336 
0337 void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
0338 {
0339     /*
0340      * OVERWRITE is determined by perf_aux_output_end() and can't
0341      * be passed in directly.
0342      */
0343     if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
0344         return;
0345 
0346     handle->aux_flags |= flags;
0347 }
0348 EXPORT_SYMBOL_GPL(perf_aux_output_flag);
0349 
0350 /*
0351  * This is called before hardware starts writing to the AUX area to
0352  * obtain an output handle and make sure there's room in the buffer.
0353  * When the capture completes, call perf_aux_output_end() to commit
0354  * the recorded data to the buffer.
0355  *
0356  * The ordering is similar to that of perf_output_{begin,end}, with
0357  * the exception of (B), which should be taken care of by the pmu
0358  * driver, since ordering rules will differ depending on hardware.
0359  *
0360  * Call this from pmu::start(); see the comment in perf_aux_output_end()
0361  * about its use in pmu callbacks. Both can also be called from the PMI
0362  * handler if needed.
0363  */
0364 void *perf_aux_output_begin(struct perf_output_handle *handle,
0365                 struct perf_event *event)
0366 {
0367     struct perf_event *output_event = event;
0368     unsigned long aux_head, aux_tail;
0369     struct perf_buffer *rb;
0370     unsigned int nest;
0371 
0372     if (output_event->parent)
0373         output_event = output_event->parent;
0374 
0375     /*
0376      * Since this will typically be open across pmu::add/pmu::del, we
0377      * grab ring_buffer's refcount instead of holding rcu read lock
0378      * to make sure it doesn't disappear under us.
0379      */
0380     rb = ring_buffer_get(output_event);
0381     if (!rb)
0382         return NULL;
0383 
0384     if (!rb_has_aux(rb))
0385         goto err;
0386 
0387     /*
0388      * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(),
0389      * about to get freed, so we leave immediately.
0390      *
0391      * Checking rb::aux_mmap_count and rb::refcount has to be done in
0392      * the same order, see perf_mmap_close. Otherwise we end up freeing
0393      * aux pages in this path, which is a bug, because this path may run in_atomic().
0394      */
0395     if (!atomic_read(&rb->aux_mmap_count))
0396         goto err;
0397 
0398     if (!refcount_inc_not_zero(&rb->aux_refcount))
0399         goto err;
0400 
0401     nest = READ_ONCE(rb->aux_nest);
0402     /*
0403      * Nesting is not supported for the AUX area; make sure nested
0404      * writers are caught early.
0405      */
0406     if (WARN_ON_ONCE(nest))
0407         goto err_put;
0408 
0409     WRITE_ONCE(rb->aux_nest, nest + 1);
0410 
0411     aux_head = rb->aux_head;
0412 
0413     handle->rb = rb;
0414     handle->event = event;
0415     handle->head = aux_head;
0416     handle->size = 0;
0417     handle->aux_flags = 0;
0418 
0419     /*
0420      * In overwrite mode, AUX data stores do not depend on aux_tail,
0421      * therefore (A) control dependency barrier does not exist. The
0422      * (B) <-> (C) ordering is still observed by the pmu driver.
0423      */
0424     if (!rb->aux_overwrite) {
0425         aux_tail = READ_ONCE(rb->user_page->aux_tail);
0426         handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
0427         if (aux_head - aux_tail < perf_aux_size(rb))
0428             handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
0429 
0430         /*
0431          * handle->size computation depends on aux_tail load; this forms a
0432          * control dependency barrier separating aux_tail load from aux data
0433          * store that will be enabled on successful return
0434          */
0435         if (!handle->size) { /* A, matches D */
0436             event->pending_disable = smp_processor_id();
0437             perf_output_wakeup(handle);
0438             WRITE_ONCE(rb->aux_nest, 0);
0439             goto err_put;
0440         }
0441     }
0442 
0443     return handle->rb->aux_priv;
0444 
0445 err_put:
0446     /* can't be last */
0447     rb_free_aux(rb);
0448 
0449 err:
0450     ring_buffer_put(rb);
0451     handle->event = NULL;
0452 
0453     return NULL;
0454 }
0455 EXPORT_SYMBOL_GPL(perf_aux_output_begin);
0456 
0457 static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb)
0458 {
0459     if (rb->aux_overwrite)
0460         return false;
0461 
0462     if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
0463         rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
0464         return true;
0465     }
0466 
0467     return false;
0468 }
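
/*
 * Worked example (not part of the upstream file): with aux_watermark ==
 * 0x10000 (64 KiB) and aux_wakeup last rounded down to 0x20000, advancing
 * aux_head to 0x34000 gives 0x34000 - 0x20000 == 0x14000 >= 0x10000, so a
 * wakeup is due and aux_wakeup becomes rounddown(0x34000, 0x10000) ==
 * 0x30000; the next wakeup fires once aux_head reaches 0x40000.
 */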
0469 
0470 /*
0471  * Commit the data written by hardware into the ring buffer by adjusting
0472  * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
0473  * pmu driver's responsibility to observe ordering rules of the hardware,
0474  * so that all the data is externally visible before this is called.
0475  *
0476  * Note: this has to be called from pmu::stop() callback, as the assumption
0477  * of the AUX buffer management code is that after pmu::stop(), the AUX
0478  * transaction must be stopped and therefore drop the AUX reference count.
0479  */
0480 void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
0481 {
0482     bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
0483     struct perf_buffer *rb = handle->rb;
0484     unsigned long aux_head;
0485 
0486     /* in overwrite mode, driver provides aux_head via handle */
0487     if (rb->aux_overwrite) {
0488         handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
0489 
0490         aux_head = handle->head;
0491         rb->aux_head = aux_head;
0492     } else {
0493         handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
0494 
0495         aux_head = rb->aux_head;
0496         rb->aux_head += size;
0497     }
0498 
0499     /*
0500      * Only send RECORD_AUX if we have something useful to communicate
0501      *
0502      * Note: OVERWRITE records by themselves are not considered
0503      * useful, as they don't communicate any *new* information
0504      * beyond the short-lived offset, which becomes history at
0505      * the next event sched-in.
0506      * Userspace that needs to copy out AUX data in overwrite
0507      * mode should know to use user_page::aux_head for the actual
0508      * offset. So, from now on we don't output AUX records that
0509      * have *only* the OVERWRITE flag set.
0510      */
0511     if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE))
0512         perf_event_aux_event(handle->event, aux_head, size,
0513                      handle->aux_flags);
0514 
0515     WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
0516     if (rb_need_aux_wakeup(rb))
0517         wakeup = true;
0518 
0519     if (wakeup) {
0520         if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
0521             handle->event->pending_disable = smp_processor_id();
0522         perf_output_wakeup(handle);
0523     }
0524 
0525     handle->event = NULL;
0526 
0527     WRITE_ONCE(rb->aux_nest, 0);
0528     /* can't be last */
0529     rb_free_aux(rb);
0530     ring_buffer_put(rb);
0531 }
0532 EXPORT_SYMBOL_GPL(perf_aux_output_end);
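
/*
 * Illustration (not part of the upstream file): how a PMU driver typically
 * drives the AUX handle from its pmu::start()/pmu::stop() callbacks.  The
 * per-cpu handle and the example_hw_enable()/example_hw_disable() helpers
 * are made up for the sketch; real drivers such as intel_pt follow the
 * same shape.
 */
static DEFINE_PER_CPU(struct perf_output_handle, example_handle);

static void example_pmu_start(struct perf_event *event, int flags)
{
	struct perf_output_handle *handle = this_cpu_ptr(&example_handle);
	void *buf;

	buf = perf_aux_output_begin(handle, event);
	if (!buf)
		return;			/* no AUX buffer, or no room in it */

	/*
	 * Program the (hypothetical) hardware to write at handle->head,
	 * for at most handle->size bytes, into the pages behind buf.
	 */
	example_hw_enable(buf, handle->head, handle->size);
}

static void example_pmu_stop(struct perf_event *event, int flags)
{
	struct perf_output_handle *handle = this_cpu_ptr(&example_handle);
	unsigned long written;

	/* Stop the (hypothetical) hardware; it reports how much it wrote. */
	written = example_hw_disable();

	/* The driver is responsible for ordering (B); then commit. */
	perf_aux_output_end(handle, written);
}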
0533 
0534 /*
0535  * Skip over a given number of bytes in the AUX buffer, due to, for example,
0536  * hardware's alignment constraints.
0537  */
0538 int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
0539 {
0540     struct perf_buffer *rb = handle->rb;
0541 
0542     if (size > handle->size)
0543         return -ENOSPC;
0544 
0545     rb->aux_head += size;
0546 
0547     WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
0548     if (rb_need_aux_wakeup(rb)) {
0549         perf_output_wakeup(handle);
0550         handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
0551     }
0552 
0553     handle->head = rb->aux_head;
0554     handle->size -= size;
0555 
0556     return 0;
0557 }
0558 EXPORT_SYMBOL_GPL(perf_aux_output_skip);
0559 
0560 void *perf_get_aux(struct perf_output_handle *handle)
0561 {
0562     /* this is only valid between perf_aux_output_begin and *_end */
0563     if (!handle->event)
0564         return NULL;
0565 
0566     return handle->rb->aux_priv;
0567 }
0568 EXPORT_SYMBOL_GPL(perf_get_aux);
0569 
0570 /*
0571  * Copy out AUX data from an AUX handle.
0572  */
0573 long perf_output_copy_aux(struct perf_output_handle *aux_handle,
0574               struct perf_output_handle *handle,
0575               unsigned long from, unsigned long to)
0576 {
0577     struct perf_buffer *rb = aux_handle->rb;
0578     unsigned long tocopy, remainder, len = 0;
0579     void *addr;
0580 
0581     from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
0582     to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
0583 
0584     do {
0585         tocopy = PAGE_SIZE - offset_in_page(from);
0586         if (to > from)
0587             tocopy = min(tocopy, to - from);
0588         if (!tocopy)
0589             break;
0590 
0591         addr = rb->aux_pages[from >> PAGE_SHIFT];
0592         addr += offset_in_page(from);
0593 
0594         remainder = perf_output_copy(handle, addr, tocopy);
0595         if (remainder)
0596             return -EFAULT;
0597 
0598         len += tocopy;
0599         from += tocopy;
0600         from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
0601     } while (to != from);
0602 
0603     return len;
0604 }
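
/*
 * Worked example (not part of the upstream file): with an 8 KiB (two page)
 * AUX area, copying from == 4090 to to == 100 wraps once: the loop copies
 * 6 bytes up to the end of page 0, then the whole of page 1 (4096 bytes),
 * then wraps @from back to 0 and copies the final 100 bytes, returning
 * 6 + 4096 + 100 == 4202.
 */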
0605 
0606 #define PERF_AUX_GFP    (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
0607 
0608 static struct page *rb_alloc_aux_page(int node, int order)
0609 {
0610     struct page *page;
0611 
0612     if (order > MAX_ORDER)
0613         order = MAX_ORDER;
0614 
0615     do {
0616         page = alloc_pages_node(node, PERF_AUX_GFP, order);
0617     } while (!page && order--);
0618 
0619     if (page && order) {
0620         /*
0621          * Communicate the allocation size to the driver:
0622          * if we managed to secure a high-order allocation,
0623          * set its first page's private to this order;
0624          * !PagePrivate(page) means it's just a normal page.
0625          */
0626         split_page(page, order);
0627         SetPagePrivate(page);
0628         set_page_private(page, order);
0629     }
0630 
0631     return page;
0632 }
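
/*
 * Illustration (not part of the upstream file): how a consumer of the AUX
 * page array recovers the high-order chunks encoded above.  Only the first
 * page of a chunk carries PagePrivate(), and page_private() gives the
 * chunk's order; the remaining pages of the chunk are plain single pages.
 * This is the walk rb_alloc_aux() below and PERF_PMU_CAP_AUX_NO_SG drivers
 * rely on.
 */
static void example_walk_aux_chunks(struct perf_buffer *rb)
{
	int idx = 0;

	while (idx < rb->aux_nr_pages) {
		struct page *page = virt_to_page(rb->aux_pages[idx]);
		int order = PagePrivate(page) ? page_private(page) : 0;

		/* chunk starts at rb->aux_pages[idx], spans 1 << order pages */
		idx += 1 << order;
	}
}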
0633 
0634 static void rb_free_aux_page(struct perf_buffer *rb, int idx)
0635 {
0636     struct page *page = virt_to_page(rb->aux_pages[idx]);
0637 
0638     ClearPagePrivate(page);
0639     page->mapping = NULL;
0640     __free_page(page);
0641 }
0642 
0643 static void __rb_free_aux(struct perf_buffer *rb)
0644 {
0645     int pg;
0646 
0647     /*
0648      * Should never happen, the last reference should be dropped from
0649      * perf_mmap_close() path, which first stops aux transactions (which
0650      * in turn are the atomic holders of aux_refcount) and then does the
0651      * last rb_free_aux().
0652      */
0653     WARN_ON_ONCE(in_atomic());
0654 
0655     if (rb->aux_priv) {
0656         rb->free_aux(rb->aux_priv);
0657         rb->free_aux = NULL;
0658         rb->aux_priv = NULL;
0659     }
0660 
0661     if (rb->aux_nr_pages) {
0662         for (pg = 0; pg < rb->aux_nr_pages; pg++)
0663             rb_free_aux_page(rb, pg);
0664 
0665         kfree(rb->aux_pages);
0666         rb->aux_nr_pages = 0;
0667     }
0668 }
0669 
0670 int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
0671          pgoff_t pgoff, int nr_pages, long watermark, int flags)
0672 {
0673     bool overwrite = !(flags & RING_BUFFER_WRITABLE);
0674     int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
0675     int ret = -ENOMEM, max_order;
0676 
0677     if (!has_aux(event))
0678         return -EOPNOTSUPP;
0679 
0680     if (!overwrite) {
0681         /*
0682          * Watermark defaults to half the buffer, and so does the
0683          * max_order, to aid PMU drivers in double buffering.
0684          */
0685         if (!watermark)
0686             watermark = nr_pages << (PAGE_SHIFT - 1);
0687 
0688         /*
0689          * Use aux_watermark as the basis for chunking to
0690          * help PMU drivers honor the watermark.
0691          */
0692         max_order = get_order(watermark);
0693     } else {
0694         /*
0695          * We need to start with the max_order that fits in nr_pages,
0696          * not the other way around, hence ilog2() and not get_order.
0697          */
0698         max_order = ilog2(nr_pages);
0699         watermark = 0;
0700     }
0701 
0702     rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
0703                      node);
0704     if (!rb->aux_pages)
0705         return -ENOMEM;
0706 
0707     rb->free_aux = event->pmu->free_aux;
0708     for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
0709         struct page *page;
0710         int last, order;
0711 
0712         order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
0713         page = rb_alloc_aux_page(node, order);
0714         if (!page)
0715             goto out;
0716 
0717         for (last = rb->aux_nr_pages + (1 << page_private(page));
0718              last > rb->aux_nr_pages; rb->aux_nr_pages++)
0719             rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
0720     }
0721 
0722     /*
0723      * In overwrite mode, PMUs that don't support SG may not handle more
0724      * than one contiguous allocation, since they rely on PMI to do double
0725      * buffering. In this case, the entire buffer has to be one contiguous
0726      * chunk.
0727      */
0728     if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) &&
0729         overwrite) {
0730         struct page *page = virt_to_page(rb->aux_pages[0]);
0731 
0732         if (page_private(page) != max_order)
0733             goto out;
0734     }
0735 
0736     rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages,
0737                          overwrite);
0738     if (!rb->aux_priv)
0739         goto out;
0740 
0741     ret = 0;
0742 
0743     /*
0744      * aux_pages (and pmu driver's private data, aux_priv) will be
0745      * referenced in both producer's and consumer's contexts, thus
0746      * we keep a refcount here to make sure either of the two can
0747      * reference them safely.
0748      */
0749     refcount_set(&rb->aux_refcount, 1);
0750 
0751     rb->aux_overwrite = overwrite;
0752     rb->aux_watermark = watermark;
0753 
0754 out:
0755     if (!ret)
0756         rb->aux_pgoff = pgoff;
0757     else
0758         __rb_free_aux(rb);
0759 
0760     return ret;
0761 }
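
/*
 * Worked example (not part of the upstream file), assuming 4 KiB pages:
 * a 64-page (256 KiB) non-overwrite AUX buffer with no explicit watermark
 * gets watermark = 64 << 11 = 128 KiB (half the buffer) and max_order =
 * get_order(128 KiB) = 5, so pages are allocated in chunks of at most 32.
 * The same buffer in overwrite mode gets max_order = ilog2(64) = 6, i.e.
 * one attempt at a single 256 KiB chunk, and watermark = 0.
 */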
0762 
0763 void rb_free_aux(struct perf_buffer *rb)
0764 {
0765     if (refcount_dec_and_test(&rb->aux_refcount))
0766         __rb_free_aux(rb);
0767 }
0768 
0769 #ifndef CONFIG_PERF_USE_VMALLOC
0770 
0771 /*
0772  * Back perf_mmap() with regular GFP_KERNEL-0 pages.
0773  */
0774 
0775 static struct page *
0776 __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
0777 {
0778     if (pgoff > rb->nr_pages)
0779         return NULL;
0780 
0781     if (pgoff == 0)
0782         return virt_to_page(rb->user_page);
0783 
0784     return virt_to_page(rb->data_pages[pgoff - 1]);
0785 }
0786 
0787 static void *perf_mmap_alloc_page(int cpu)
0788 {
0789     struct page *page;
0790     int node;
0791 
0792     node = (cpu == -1) ? cpu : cpu_to_node(cpu);
0793     page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
0794     if (!page)
0795         return NULL;
0796 
0797     return page_address(page);
0798 }
0799 
0800 static void perf_mmap_free_page(void *addr)
0801 {
0802     struct page *page = virt_to_page(addr);
0803 
0804     page->mapping = NULL;
0805     __free_page(page);
0806 }
0807 
0808 struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
0809 {
0810     struct perf_buffer *rb;
0811     unsigned long size;
0812     int i, node;
0813 
0814     size = sizeof(struct perf_buffer);
0815     size += nr_pages * sizeof(void *);
0816 
0817     if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
0818         goto fail;
0819 
0820     node = (cpu == -1) ? cpu : cpu_to_node(cpu);
0821     rb = kzalloc_node(size, GFP_KERNEL, node);
0822     if (!rb)
0823         goto fail;
0824 
0825     rb->user_page = perf_mmap_alloc_page(cpu);
0826     if (!rb->user_page)
0827         goto fail_user_page;
0828 
0829     for (i = 0; i < nr_pages; i++) {
0830         rb->data_pages[i] = perf_mmap_alloc_page(cpu);
0831         if (!rb->data_pages[i])
0832             goto fail_data_pages;
0833     }
0834 
0835     rb->nr_pages = nr_pages;
0836 
0837     ring_buffer_init(rb, watermark, flags);
0838 
0839     return rb;
0840 
0841 fail_data_pages:
0842     for (i--; i >= 0; i--)
0843         perf_mmap_free_page(rb->data_pages[i]);
0844 
0845     perf_mmap_free_page(rb->user_page);
0846 
0847 fail_user_page:
0848     kfree(rb);
0849 
0850 fail:
0851     return NULL;
0852 }
0853 
0854 void rb_free(struct perf_buffer *rb)
0855 {
0856     int i;
0857 
0858     perf_mmap_free_page(rb->user_page);
0859     for (i = 0; i < rb->nr_pages; i++)
0860         perf_mmap_free_page(rb->data_pages[i]);
0861     kfree(rb);
0862 }
0863 
0864 #else
0865 static struct page *
0866 __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
0867 {
0868     /* The '>' counts in the user page. */
0869     if (pgoff > data_page_nr(rb))
0870         return NULL;
0871 
0872     return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
0873 }
0874 
0875 static void perf_mmap_unmark_page(void *addr)
0876 {
0877     struct page *page = vmalloc_to_page(addr);
0878 
0879     page->mapping = NULL;
0880 }
0881 
0882 static void rb_free_work(struct work_struct *work)
0883 {
0884     struct perf_buffer *rb;
0885     void *base;
0886     int i, nr;
0887 
0888     rb = container_of(work, struct perf_buffer, work);
0889     nr = data_page_nr(rb);
0890 
0891     base = rb->user_page;
0892     /* The '<=' counts in the user page. */
0893     for (i = 0; i <= nr; i++)
0894         perf_mmap_unmark_page(base + (i * PAGE_SIZE));
0895 
0896     vfree(base);
0897     kfree(rb);
0898 }
0899 
0900 void rb_free(struct perf_buffer *rb)
0901 {
0902     schedule_work(&rb->work);
0903 }
0904 
0905 struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
0906 {
0907     struct perf_buffer *rb;
0908     unsigned long size;
0909     void *all_buf;
0910     int node;
0911 
0912     size = sizeof(struct perf_buffer);
0913     size += sizeof(void *);
0914 
0915     node = (cpu == -1) ? cpu : cpu_to_node(cpu);
0916     rb = kzalloc_node(size, GFP_KERNEL, node);
0917     if (!rb)
0918         goto fail;
0919 
0920     INIT_WORK(&rb->work, rb_free_work);
0921 
0922     all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
0923     if (!all_buf)
0924         goto fail_all_buf;
0925 
0926     rb->user_page = all_buf;
0927     rb->data_pages[0] = all_buf + PAGE_SIZE;
0928     if (nr_pages) {
0929         rb->nr_pages = 1;
0930         rb->page_order = ilog2(nr_pages);
0931     }
0932 
0933     ring_buffer_init(rb, watermark, flags);
0934 
0935     return rb;
0936 
0937 fail_all_buf:
0938     kfree(rb);
0939 
0940 fail:
0941     return NULL;
0942 }
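
/*
 * Illustration (not part of the upstream file): in the vmalloc case the
 * whole data area is one virtually contiguous allocation, modelled as a
 * single "data page" of order ilog2(nr_pages): rb->nr_pages stays 1,
 * data_pages[0] points just past the user page, and perf_data_size()
 * still works out to nr_pages << PAGE_SHIFT because page_order(rb)
 * supplies the missing factor.  E.g. nr_pages == 8 gives page_order == 3
 * and perf_data_size() == 1 << (PAGE_SHIFT + 3), i.e. 8 pages.
 */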
0943 
0944 #endif
0945 
0946 struct page *
0947 perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
0948 {
0949     if (rb->aux_nr_pages) {
0950         /* above AUX space */
0951         if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
0952             return NULL;
0953 
0954         /* AUX space */
0955         if (pgoff >= rb->aux_pgoff) {
0956             int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages);
0957             return virt_to_page(rb->aux_pages[aux_pgoff]);
0958         }
0959     }
0960 
0961     return __perf_mmap_to_page(rb, pgoff);
0962 }
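
/*
 * Illustration (not part of the upstream file): the page offsets resolved
 * above, for a buffer with nr_pages data pages and an AUX area mapped at
 * aux_pgoff:
 *
 *   pgoff 0                            -> rb->user_page (control page)
 *   pgoff 1 .. nr_pages                -> data page (pgoff - 1)
 *   pgoff aux_pgoff ..
 *         aux_pgoff + aux_nr_pages - 1 -> rb->aux_pages[pgoff - aux_pgoff]
 */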