0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Intel(R) Processor Trace PMU driver for perf
0004  * Copyright (c) 2013-2014, Intel Corporation.
0005  *
0006  * Intel PT is specified in the Intel Architecture Instruction Set Extensions
0007  * Programming Reference:
0008  * http://software.intel.com/en-us/intel-isa-extensions
0009  */
0010 
0011 #undef DEBUG
0012 
0013 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
0014 
0015 #include <linux/types.h>
0016 #include <linux/bits.h>
0017 #include <linux/limits.h>
0018 #include <linux/slab.h>
0019 #include <linux/device.h>
0020 
0021 #include <asm/perf_event.h>
0022 #include <asm/insn.h>
0023 #include <asm/io.h>
0024 #include <asm/intel_pt.h>
0025 #include <asm/intel-family.h>
0026 
0027 #include "../perf_event.h"
0028 #include "pt.h"
0029 
0030 static DEFINE_PER_CPU(struct pt, pt_ctx);
0031 
0032 static struct pt_pmu pt_pmu;
0033 
0034 /*
0035  * Capabilities of Intel PT hardware, such as number of address bits or
0036  * supported output schemes, are cached and exported to userspace as "caps"
0037  * attribute group of pt pmu device
0038  * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
0039  * relevant bits together with intel_pt traces.
0040  *
0041  * These are necessary both for trace decoding (payloads_lip, contains address
0042  * width encoded in IP-related packets) and for event configuration (bitmasks with
0043  * permitted values for certain bit fields).
0044  */
0045 #define PT_CAP(_n, _l, _r, _m)                      \
0046     [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,    \
0047                 .reg = _r, .mask = _m }
0048 
0049 static struct pt_cap_desc {
0050     const char  *name;
0051     u32     leaf;
0052     u8      reg;
0053     u32     mask;
0054 } pt_caps[] = {
0055     PT_CAP(max_subleaf,     0, CPUID_EAX, 0xffffffff),
0056     PT_CAP(cr3_filtering,       0, CPUID_EBX, BIT(0)),
0057     PT_CAP(psb_cyc,         0, CPUID_EBX, BIT(1)),
0058     PT_CAP(ip_filtering,        0, CPUID_EBX, BIT(2)),
0059     PT_CAP(mtc,         0, CPUID_EBX, BIT(3)),
0060     PT_CAP(ptwrite,         0, CPUID_EBX, BIT(4)),
0061     PT_CAP(power_event_trace,   0, CPUID_EBX, BIT(5)),
0062     PT_CAP(event_trace,     0, CPUID_EBX, BIT(7)),
0063     PT_CAP(tnt_disable,     0, CPUID_EBX, BIT(8)),
0064     PT_CAP(topa_output,     0, CPUID_ECX, BIT(0)),
0065     PT_CAP(topa_multiple_entries,   0, CPUID_ECX, BIT(1)),
0066     PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)),
0067     PT_CAP(output_subsys,       0, CPUID_ECX, BIT(3)),
0068     PT_CAP(payloads_lip,        0, CPUID_ECX, BIT(31)),
0069     PT_CAP(num_address_ranges,  1, CPUID_EAX, 0x7),
0070     PT_CAP(mtc_periods,     1, CPUID_EAX, 0xffff0000),
0071     PT_CAP(cycle_thresholds,    1, CPUID_EBX, 0xffff),
0072     PT_CAP(psb_periods,     1, CPUID_EBX, 0xffff0000),
0073 };
0074 
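/**
 * intel_pt_validate_cap() - pull one capability field out of a raw caps array
 * @caps:	CPUID leaf 0x14 output, PT_CPUID_LEAVES x PT_CPUID_REGS_NUM
 *		32-bit registers, laid out as cached by pt_pmu_hw_init().
 * @capability:	index into pt_caps[].
 *
 * The field is masked and shifted down to bit 0, so single-bit capabilities
 * read as 0/1 and multi-bit ones (e.g. mtc_periods) read as a raw bitmask.
 */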
0075 u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability)
0076 {
0077     struct pt_cap_desc *cd = &pt_caps[capability];
0078     u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
0079     unsigned int shift = __ffs(cd->mask);
0080 
0081     return (c & cd->mask) >> shift;
0082 }
0083 EXPORT_SYMBOL_GPL(intel_pt_validate_cap);
0084 
0085 u32 intel_pt_validate_hw_cap(enum pt_capabilities cap)
0086 {
0087     return intel_pt_validate_cap(pt_pmu.caps, cap);
0088 }
0089 EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap);
0090 
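/**
 * pt_cap_show() - sysfs ->show handler for the "caps" attribute group
 * @cdev:	PT PMU device.
 * @attr:	Capability attribute; ->var holds the pt_capabilities index.
 * @buf:	Output buffer.
 *
 * Prints the cached CPUID field in hex. Illustrative only (values differ
 * between CPUs), userspace would see something like:
 *
 *   $ cat /sys/bus/event_source/devices/intel_pt/caps/num_address_ranges
 *   2
 */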
0091 static ssize_t pt_cap_show(struct device *cdev,
0092                struct device_attribute *attr,
0093                char *buf)
0094 {
0095     struct dev_ext_attribute *ea =
0096         container_of(attr, struct dev_ext_attribute, attr);
0097     enum pt_capabilities cap = (long)ea->var;
0098 
0099     return snprintf(buf, PAGE_SIZE, "%x\n", intel_pt_validate_hw_cap(cap));
0100 }
0101 
0102 static struct attribute_group pt_cap_group __ro_after_init = {
0103     .name   = "caps",
0104 };
0105 
0106 PMU_FORMAT_ATTR(pt,     "config:0"  );
0107 PMU_FORMAT_ATTR(cyc,        "config:1"  );
0108 PMU_FORMAT_ATTR(pwr_evt,    "config:4"  );
0109 PMU_FORMAT_ATTR(fup_on_ptw, "config:5"  );
0110 PMU_FORMAT_ATTR(mtc,        "config:9"  );
0111 PMU_FORMAT_ATTR(tsc,        "config:10" );
0112 PMU_FORMAT_ATTR(noretcomp,  "config:11" );
0113 PMU_FORMAT_ATTR(ptw,        "config:12" );
0114 PMU_FORMAT_ATTR(branch,     "config:13" );
0115 PMU_FORMAT_ATTR(event,      "config:31" );
0116 PMU_FORMAT_ATTR(notnt,      "config:55" );
0117 PMU_FORMAT_ATTR(mtc_period, "config:14-17"  );
0118 PMU_FORMAT_ATTR(cyc_thresh, "config:19-22"  );
0119 PMU_FORMAT_ATTR(psb_period, "config:24-27"  );
0120 
0121 static struct attribute *pt_formats_attr[] = {
0122     &format_attr_pt.attr,
0123     &format_attr_cyc.attr,
0124     &format_attr_pwr_evt.attr,
0125     &format_attr_event.attr,
0126     &format_attr_notnt.attr,
0127     &format_attr_fup_on_ptw.attr,
0128     &format_attr_mtc.attr,
0129     &format_attr_tsc.attr,
0130     &format_attr_noretcomp.attr,
0131     &format_attr_ptw.attr,
0132     &format_attr_branch.attr,
0133     &format_attr_mtc_period.attr,
0134     &format_attr_cyc_thresh.attr,
0135     &format_attr_psb_period.attr,
0136     NULL,
0137 };
0138 
0139 static struct attribute_group pt_format_group = {
0140     .name   = "format",
0141     .attrs  = pt_formats_attr,
0142 };
0143 
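/*
 * Timing attributes live directly under the PMU device
 * (/sys/bus/event_source/devices/intel_pt/) and are used by trace decoders
 * to reconstruct timing: id 0 is max_nonturbo_ratio, id 1 is the TSC:ART
 * ratio printed as "numerator:denominator" (e.g. a hypothetical "168:2").
 */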
0144 static ssize_t
0145 pt_timing_attr_show(struct device *dev, struct device_attribute *attr,
0146             char *page)
0147 {
0148     struct perf_pmu_events_attr *pmu_attr =
0149         container_of(attr, struct perf_pmu_events_attr, attr);
0150 
0151     switch (pmu_attr->id) {
0152     case 0:
0153         return sprintf(page, "%lu\n", pt_pmu.max_nonturbo_ratio);
0154     case 1:
0155         return sprintf(page, "%u:%u\n",
0156                    pt_pmu.tsc_art_num,
0157                    pt_pmu.tsc_art_den);
0158     default:
0159         break;
0160     }
0161 
0162     return -EINVAL;
0163 }
0164 
0165 PMU_EVENT_ATTR(max_nonturbo_ratio, timing_attr_max_nonturbo_ratio, 0,
0166            pt_timing_attr_show);
0167 PMU_EVENT_ATTR(tsc_art_ratio, timing_attr_tsc_art_ratio, 1,
0168            pt_timing_attr_show);
0169 
0170 static struct attribute *pt_timing_attr[] = {
0171     &timing_attr_max_nonturbo_ratio.attr.attr,
0172     &timing_attr_tsc_art_ratio.attr.attr,
0173     NULL,
0174 };
0175 
0176 static struct attribute_group pt_timing_group = {
0177     .attrs  = pt_timing_attr,
0178 };
0179 
0180 static const struct attribute_group *pt_attr_groups[] = {
0181     &pt_cap_group,
0182     &pt_format_group,
0183     &pt_timing_group,
0184     NULL,
0185 };
0186 
0187 static int __init pt_pmu_hw_init(void)
0188 {
0189     struct dev_ext_attribute *de_attrs;
0190     struct attribute **attrs;
0191     size_t size;
0192     u64 reg;
0193     int ret;
0194     long i;
0195 
0196     rdmsrl(MSR_PLATFORM_INFO, reg);
0197     pt_pmu.max_nonturbo_ratio = (reg & 0xff00) >> 8;
0198 
0199     /*
0200      * If available, read in the TSC to core crystal clock ratio;
0201      * otherwise, a zero numerator stands for "not enumerated",
0202      * as per the SDM.
0203      */
0204     if (boot_cpu_data.cpuid_level >= CPUID_TSC_LEAF) {
0205         u32 eax, ebx, ecx, edx;
0206 
0207         cpuid(CPUID_TSC_LEAF, &eax, &ebx, &ecx, &edx);
0208 
0209         pt_pmu.tsc_art_num = ebx;
0210         pt_pmu.tsc_art_den = eax;
0211     }
0212 
0213     /* model-specific quirks */
0214     switch (boot_cpu_data.x86_model) {
0215     case INTEL_FAM6_BROADWELL:
0216     case INTEL_FAM6_BROADWELL_D:
0217     case INTEL_FAM6_BROADWELL_G:
0218     case INTEL_FAM6_BROADWELL_X:
0219         /* not setting BRANCH_EN will #GP, erratum BDM106 */
0220         pt_pmu.branch_en_always_on = true;
0221         break;
0222     default:
0223         break;
0224     }
0225 
0226     if (boot_cpu_has(X86_FEATURE_VMX)) {
0227         /*
0228          * Intel SDM, 36.5 "Tracing post-VMXON" says that
0229          * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace
0230          * post-VMXON.
0231          */
0232         rdmsrl(MSR_IA32_VMX_MISC, reg);
0233         if (reg & BIT(14))
0234             pt_pmu.vmx = true;
0235     }
0236 
0237     for (i = 0; i < PT_CPUID_LEAVES; i++) {
0238         cpuid_count(20, i,
0239                 &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM],
0240                 &pt_pmu.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM],
0241                 &pt_pmu.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM],
0242                 &pt_pmu.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM]);
0243     }
0244 
0245     ret = -ENOMEM;
0246     size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
0247     attrs = kzalloc(size, GFP_KERNEL);
0248     if (!attrs)
0249         goto fail;
0250 
0251     size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
0252     de_attrs = kzalloc(size, GFP_KERNEL);
0253     if (!de_attrs)
0254         goto fail;
0255 
0256     for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
0257         struct dev_ext_attribute *de_attr = de_attrs + i;
0258 
0259         de_attr->attr.attr.name = pt_caps[i].name;
0260 
0261         sysfs_attr_init(&de_attr->attr.attr);
0262 
0263         de_attr->attr.attr.mode     = S_IRUGO;
0264         de_attr->attr.show      = pt_cap_show;
0265         de_attr->var            = (void *)i;
0266 
0267         attrs[i] = &de_attr->attr.attr;
0268     }
0269 
0270     pt_cap_group.attrs = attrs;
0271 
0272     return 0;
0273 
0274 fail:
0275     kfree(attrs);
0276 
0277     return ret;
0278 }
0279 
0280 #define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC | \
0281               RTIT_CTL_CYC_THRESH   | \
0282               RTIT_CTL_PSB_FREQ)
0283 
0284 #define RTIT_CTL_MTC    (RTIT_CTL_MTC_EN    | \
0285              RTIT_CTL_MTC_RANGE)
0286 
0287 #define RTIT_CTL_PTW    (RTIT_CTL_PTW_EN    | \
0288              RTIT_CTL_FUP_ON_PTW)
0289 
0290 /*
0291  * Bit 0 (TraceEn) in the attr.config is meaningless as the
0292  * corresponding bit in the RTIT_CTL can only be controlled
0293  * by the driver; therefore, repurpose it to mean: pass
0294  * through the bit that was previously assumed to be always
0295  * on for PT, thereby allowing the user to *not* set it if
0296  * they so wish. See also pt_event_valid() and pt_config().
0297  */
0298 #define RTIT_CTL_PASSTHROUGH RTIT_CTL_TRACEEN
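
/*
 * Illustrative only: with the "pt" (config:0) and "branch" (config:13)
 * format attributes above, something like "perf record -e intel_pt/pt,branch/"
 * passes BRANCH_EN through explicitly, whereas leaving bit 0 clear keeps the
 * legacy behavior of the driver setting BRANCH_EN unconditionally.
 */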
0299 
0300 #define PT_CONFIG_MASK (RTIT_CTL_TRACEEN    | \
0301             RTIT_CTL_TSC_EN     | \
0302             RTIT_CTL_DISRETC    | \
0303             RTIT_CTL_BRANCH_EN  | \
0304             RTIT_CTL_CYC_PSB    | \
0305             RTIT_CTL_MTC        | \
0306             RTIT_CTL_PWR_EVT_EN | \
0307             RTIT_CTL_EVENT_EN   | \
0308             RTIT_CTL_NOTNT      | \
0309             RTIT_CTL_FUP_ON_PTW | \
0310             RTIT_CTL_PTW_EN)
0311 
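/*
 * pt_event_valid() - check an event's attr.config against hardware caps:
 * reject bits outside PT_CONFIG_MASK, PSB/CYC/MTC period values that CPUID
 * does not list as supported, feature bits the CPU lacks, and BRANCH_EN
 * combinations that contradict the passthrough rules described above.
 */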
0312 static bool pt_event_valid(struct perf_event *event)
0313 {
0314     u64 config = event->attr.config;
0315     u64 allowed, requested;
0316 
0317     if ((config & PT_CONFIG_MASK) != config)
0318         return false;
0319 
0320     if (config & RTIT_CTL_CYC_PSB) {
0321         if (!intel_pt_validate_hw_cap(PT_CAP_psb_cyc))
0322             return false;
0323 
0324         allowed = intel_pt_validate_hw_cap(PT_CAP_psb_periods);
0325         requested = (config & RTIT_CTL_PSB_FREQ) >>
0326             RTIT_CTL_PSB_FREQ_OFFSET;
0327         if (requested && (!(allowed & BIT(requested))))
0328             return false;
0329 
0330         allowed = intel_pt_validate_hw_cap(PT_CAP_cycle_thresholds);
0331         requested = (config & RTIT_CTL_CYC_THRESH) >>
0332             RTIT_CTL_CYC_THRESH_OFFSET;
0333         if (requested && (!(allowed & BIT(requested))))
0334             return false;
0335     }
0336 
0337     if (config & RTIT_CTL_MTC) {
0338         /*
0339          * In the unlikely case that CPUID lists valid mtc periods,
0340          * but not the mtc capability, drop out here.
0341          *
0342          * Spec says that setting mtc period bits while mtc bit in
0343          * CPUID is 0 will #GP, so better safe than sorry.
0344          */
0345         if (!intel_pt_validate_hw_cap(PT_CAP_mtc))
0346             return false;
0347 
0348         allowed = intel_pt_validate_hw_cap(PT_CAP_mtc_periods);
0349         if (!allowed)
0350             return false;
0351 
0352         requested = (config & RTIT_CTL_MTC_RANGE) >>
0353             RTIT_CTL_MTC_RANGE_OFFSET;
0354 
0355         if (!(allowed & BIT(requested)))
0356             return false;
0357     }
0358 
0359     if (config & RTIT_CTL_PWR_EVT_EN &&
0360         !intel_pt_validate_hw_cap(PT_CAP_power_event_trace))
0361         return false;
0362 
0363     if (config & RTIT_CTL_EVENT_EN &&
0364         !intel_pt_validate_hw_cap(PT_CAP_event_trace))
0365         return false;
0366 
0367     if (config & RTIT_CTL_NOTNT &&
0368         !intel_pt_validate_hw_cap(PT_CAP_tnt_disable))
0369         return false;
0370 
0371     if (config & RTIT_CTL_PTW) {
0372         if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite))
0373             return false;
0374 
0375         /* FUPonPTW without PTW doesn't make sense */
0376         if ((config & RTIT_CTL_FUP_ON_PTW) &&
0377             !(config & RTIT_CTL_PTW_EN))
0378             return false;
0379     }
0380 
0381     /*
0382      * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config
0383      * clears the assumption that BranchEn must always be enabled,
0384      * as was the case with the first implementation of PT.
0385      * If this bit is not set, the legacy behavior is preserved
0386      * for compatibility with the older userspace.
0387      *
0388      * Re-using bit 0 for this purpose is fine because it is never
0389      * directly set by the user; previous attempts at setting it in
0390      * the attr.config resulted in -EINVAL.
0391      */
0392     if (config & RTIT_CTL_PASSTHROUGH) {
0393         /*
0394          * Disallow not setting BRANCH_EN where BRANCH_EN is
0395          * always required.
0396          */
0397         if (pt_pmu.branch_en_always_on &&
0398             !(config & RTIT_CTL_BRANCH_EN))
0399             return false;
0400     } else {
0401         /*
0402          * Disallow BRANCH_EN without the PASSTHROUGH.
0403          */
0404         if (config & RTIT_CTL_BRANCH_EN)
0405             return false;
0406     }
0407 
0408     return true;
0409 }
0410 
0411 /*
0412  * PT configuration helpers
0413  * These all are cpu affine and operate on a local PT
0414  */
0415 
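/*
 * pt_config_start() - set TraceEn in the event's config and, unless VMX is
 * currently on for this CPU, write it to MSR_IA32_RTIT_CTL; with VMX on,
 * the MSR write is skipped and the AUX record is flagged PARTIAL instead.
 */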
0416 static void pt_config_start(struct perf_event *event)
0417 {
0418     struct pt *pt = this_cpu_ptr(&pt_ctx);
0419     u64 ctl = event->hw.config;
0420 
0421     ctl |= RTIT_CTL_TRACEEN;
0422     if (READ_ONCE(pt->vmx_on))
0423         perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL);
0424     else
0425         wrmsrl(MSR_IA32_RTIT_CTL, ctl);
0426 
0427     WRITE_ONCE(event->hw.config, ctl);
0428 }
0429 
0430 /* Address ranges and their corresponding msr configuration registers */
0431 static const struct pt_address_range {
0432     unsigned long   msr_a;
0433     unsigned long   msr_b;
0434     unsigned int    reg_off;
0435 } pt_address_ranges[] = {
0436     {
0437         .msr_a   = MSR_IA32_RTIT_ADDR0_A,
0438         .msr_b   = MSR_IA32_RTIT_ADDR0_B,
0439         .reg_off = RTIT_CTL_ADDR0_OFFSET,
0440     },
0441     {
0442         .msr_a   = MSR_IA32_RTIT_ADDR1_A,
0443         .msr_b   = MSR_IA32_RTIT_ADDR1_B,
0444         .reg_off = RTIT_CTL_ADDR1_OFFSET,
0445     },
0446     {
0447         .msr_a   = MSR_IA32_RTIT_ADDR2_A,
0448         .msr_b   = MSR_IA32_RTIT_ADDR2_B,
0449         .reg_off = RTIT_CTL_ADDR2_OFFSET,
0450     },
0451     {
0452         .msr_a   = MSR_IA32_RTIT_ADDR3_A,
0453         .msr_b   = MSR_IA32_RTIT_ADDR3_B,
0454         .reg_off = RTIT_CTL_ADDR3_OFFSET,
0455     }
0456 };
0457 
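/*
 * pt_config_filters() - program the address range filter MSRs for @event
 * and return the ADDRn_CFG bits to be OR-ed into RTIT_CTL. MSR writes are
 * skipped when the per-cpu cached values already match.
 */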
0458 static u64 pt_config_filters(struct perf_event *event)
0459 {
0460     struct pt_filters *filters = event->hw.addr_filters;
0461     struct pt *pt = this_cpu_ptr(&pt_ctx);
0462     unsigned int range = 0;
0463     u64 rtit_ctl = 0;
0464 
0465     if (!filters)
0466         return 0;
0467 
0468     perf_event_addr_filters_sync(event);
0469 
0470     for (range = 0; range < filters->nr_filters; range++) {
0471         struct pt_filter *filter = &filters->filter[range];
0472 
0473         /*
0474          * Note, if the range has zero start/end addresses due
0475          * to its dynamic object not being loaded yet, we just
0476          * go ahead and program zeroed range, which will simply
0477          * produce no data. Note^2: if executable code at 0x0
0478          * is a concern, we can set up an "invalid" configuration
0479          * such as msr_b < msr_a.
0480          */
0481 
0482         /* avoid redundant msr writes */
0483         if (pt->filters.filter[range].msr_a != filter->msr_a) {
0484             wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a);
0485             pt->filters.filter[range].msr_a = filter->msr_a;
0486         }
0487 
0488         if (pt->filters.filter[range].msr_b != filter->msr_b) {
0489             wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b);
0490             pt->filters.filter[range].msr_b = filter->msr_b;
0491         }
0492 
0493         rtit_ctl |= (u64)filter->config << pt_address_ranges[range].reg_off;
0494     }
0495 
0496     return rtit_ctl;
0497 }
0498 
0499 static void pt_config(struct perf_event *event)
0500 {
0501     struct pt *pt = this_cpu_ptr(&pt_ctx);
0502     struct pt_buffer *buf = perf_get_aux(&pt->handle);
0503     u64 reg;
0504 
0505     /* First round: clear STATUS, in particular the PSB byte counter. */
0506     if (!event->hw.config) {
0507         perf_event_itrace_started(event);
0508         wrmsrl(MSR_IA32_RTIT_STATUS, 0);
0509     }
0510 
0511     reg = pt_config_filters(event);
0512     reg |= RTIT_CTL_TRACEEN;
0513     if (!buf->single)
0514         reg |= RTIT_CTL_TOPA;
0515 
0516     /*
0517      * Previously, we had BRANCH_EN on by default, but now that PT has
0518      * grown features outside of branch tracing, it is useful to allow
0519      * the user to disable it. Setting bit 0 in the event's attr.config
0520      * allows BRANCH_EN to pass through instead of being always on. See
0521      * also the comment in pt_event_valid().
0522      */
0523     if (event->attr.config & BIT(0)) {
0524         reg |= event->attr.config & RTIT_CTL_BRANCH_EN;
0525     } else {
0526         reg |= RTIT_CTL_BRANCH_EN;
0527     }
0528 
0529     if (!event->attr.exclude_kernel)
0530         reg |= RTIT_CTL_OS;
0531     if (!event->attr.exclude_user)
0532         reg |= RTIT_CTL_USR;
0533 
0534     reg |= (event->attr.config & PT_CONFIG_MASK);
0535 
0536     event->hw.config = reg;
0537     pt_config_start(event);
0538 }
0539 
0540 static void pt_config_stop(struct perf_event *event)
0541 {
0542     struct pt *pt = this_cpu_ptr(&pt_ctx);
0543     u64 ctl = READ_ONCE(event->hw.config);
0544 
0545     /* may be already stopped by a PMI */
0546     if (!(ctl & RTIT_CTL_TRACEEN))
0547         return;
0548 
0549     ctl &= ~RTIT_CTL_TRACEEN;
0550     if (!READ_ONCE(pt->vmx_on))
0551         wrmsrl(MSR_IA32_RTIT_CTL, ctl);
0552 
0553     WRITE_ONCE(event->hw.config, ctl);
0554 
0555     /*
0556      * A wrmsr that disables trace generation serializes other PT
0557      * registers and causes all data packets to be written to memory,
0558      * but a fence is required for the data to become globally visible.
0559      *
0560      * The below WMB, separating data store and aux_head store matches
0561      * the consumer's RMB that separates aux_head load and data load.
0562      */
0563     wmb();
0564 }
0565 
0566 /**
0567  * struct topa - ToPA metadata
0568  * @list:   linkage to struct pt_buffer's list of tables
0569  * @offset: offset of the first entry in this table in the buffer
0570  * @size:   total size of all entries in this table
0571  * @last:   index of the last initialized entry in this table
0572  * @z_count:    how many times the first entry repeats
0573  */
0574 struct topa {
0575     struct list_head    list;
0576     u64         offset;
0577     size_t          size;
0578     int         last;
0579     unsigned int        z_count;
0580 };
0581 
0582 /*
0583  * Keep ToPA table-related metadata on the same page as the actual table,
0584  * taking up a few words from the top
0585  */
0586 
0587 #define TENTS_PER_PAGE  \
0588     ((PAGE_SIZE - sizeof(struct topa)) / sizeof(struct topa_entry))
0589 
0590 /**
0591  * struct topa_page - page-sized ToPA table with metadata at the top
0592  * @table:  actual ToPA table entries, as understood by PT hardware
0593  * @topa:   metadata
0594  */
0595 struct topa_page {
0596     struct topa_entry   table[TENTS_PER_PAGE];
0597     struct topa     topa;
0598 };
0599 
0600 static inline struct topa_page *topa_to_page(struct topa *topa)
0601 {
0602     return container_of(topa, struct topa_page, topa);
0603 }
0604 
0605 static inline struct topa_page *topa_entry_to_page(struct topa_entry *te)
0606 {
0607     return (struct topa_page *)((unsigned long)te & PAGE_MASK);
0608 }
0609 
0610 static inline phys_addr_t topa_pfn(struct topa *topa)
0611 {
0612     return PFN_DOWN(virt_to_phys(topa_to_page(topa)));
0613 }
0614 
0615 /* make -1 stand for the last table entry */
0616 #define TOPA_ENTRY(t, i)                \
0617     ((i) == -1                  \
0618         ? &topa_to_page(t)->table[(t)->last]    \
0619         : &topa_to_page(t)->table[(i)])
0620 #define TOPA_ENTRY_SIZE(t, i) (sizes(TOPA_ENTRY((t), (i))->size))
0621 #define TOPA_ENTRY_PAGES(t, i) (1 << TOPA_ENTRY((t), (i))->size)
0622 
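/*
 * pt_config_buffer() - point the hardware at the current output region.
 * The low bits of MSR_IA32_RTIT_OUTPUT_MASK hold either the (size - 1) mask
 * of the single output region or the index of the current ToPA entry; the
 * upper 32 bits hold the offset within the current region.
 */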
0623 static void pt_config_buffer(struct pt_buffer *buf)
0624 {
0625     struct pt *pt = this_cpu_ptr(&pt_ctx);
0626     u64 reg, mask;
0627     void *base;
0628 
0629     if (buf->single) {
0630         base = buf->data_pages[0];
0631         mask = (buf->nr_pages * PAGE_SIZE - 1) >> 7;
0632     } else {
0633         base = topa_to_page(buf->cur)->table;
0634         mask = (u64)buf->cur_idx;
0635     }
0636 
0637     reg = virt_to_phys(base);
0638     if (pt->output_base != reg) {
0639         pt->output_base = reg;
0640         wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, reg);
0641     }
0642 
0643     reg = 0x7f | (mask << 7) | ((u64)buf->output_off << 32);
0644     if (pt->output_mask != reg) {
0645         pt->output_mask = reg;
0646         wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
0647     }
0648 }
0649 
0650 /**
0651  * topa_alloc() - allocate page-sized ToPA table
0652  * @cpu:    CPU on which to allocate.
0653  * @gfp:    Allocation flags.
0654  *
0655  * Return:  On success, return the pointer to ToPA table page.
0656  */
0657 static struct topa *topa_alloc(int cpu, gfp_t gfp)
0658 {
0659     int node = cpu_to_node(cpu);
0660     struct topa_page *tp;
0661     struct page *p;
0662 
0663     p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
0664     if (!p)
0665         return NULL;
0666 
0667     tp = page_address(p);
0668     tp->topa.last = 0;
0669 
0670     /*
0671      * In case of single-entry ToPA, always put the self-referencing END
0672      * link as the 2nd entry in the table
0673      */
0674     if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
0675         TOPA_ENTRY(&tp->topa, 1)->base = page_to_phys(p) >> TOPA_SHIFT;
0676         TOPA_ENTRY(&tp->topa, 1)->end = 1;
0677     }
0678 
0679     return &tp->topa;
0680 }
0681 
0682 /**
0683  * topa_free() - free a page-sized ToPA table
0684  * @topa:   Table to deallocate.
0685  */
0686 static void topa_free(struct topa *topa)
0687 {
0688     free_page((unsigned long)topa);
0689 }
0690 
0691 /**
0692  * topa_insert_table() - insert a ToPA table into a buffer
0693  * @buf:     PT buffer that's being extended.
0694  * @topa:    New topa table to be inserted.
0695  *
0696  * If it's the first table in this buffer, set up buffer's pointers
0697  * accordingly; otherwise, add an END=1 link entry pointing to @topa to the
0698  * current "last" table and adjust the last table pointer to @topa.
0699  */
0700 static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
0701 {
0702     struct topa *last = buf->last;
0703 
0704     list_add_tail(&topa->list, &buf->tables);
0705 
0706     if (!buf->first) {
0707         buf->first = buf->last = buf->cur = topa;
0708         return;
0709     }
0710 
0711     topa->offset = last->offset + last->size;
0712     buf->last = topa;
0713 
0714     if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
0715         return;
0716 
0717     BUG_ON(last->last != TENTS_PER_PAGE - 1);
0718 
0719     TOPA_ENTRY(last, -1)->base = topa_pfn(topa);
0720     TOPA_ENTRY(last, -1)->end = 1;
0721 }
0722 
0723 /**
0724  * topa_table_full() - check if a ToPA table is filled up
0725  * @topa:   ToPA table.
0726  */
0727 static bool topa_table_full(struct topa *topa)
0728 {
0729     /* single-entry ToPA is a special case */
0730     if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
0731         return !!topa->last;
0732 
0733     return topa->last == TENTS_PER_PAGE - 1;
0734 }
0735 
0736 /**
0737  * topa_insert_pages() - create a list of ToPA tables
0738  * @buf:    PT buffer being initialized.
0739  * @gfp:    Allocation flags.
0740  *
0741  * This initializes a list of ToPA tables with entries from
0742  * the data_pages provided by rb_alloc_aux().
0743  *
0744  * Return:  0 on success or error code.
0745  */
0746 static int topa_insert_pages(struct pt_buffer *buf, int cpu, gfp_t gfp)
0747 {
0748     struct topa *topa = buf->last;
0749     int order = 0;
0750     struct page *p;
0751 
0752     p = virt_to_page(buf->data_pages[buf->nr_pages]);
0753     if (PagePrivate(p))
0754         order = page_private(p);
0755 
0756     if (topa_table_full(topa)) {
0757         topa = topa_alloc(cpu, gfp);
0758         if (!topa)
0759             return -ENOMEM;
0760 
0761         topa_insert_table(buf, topa);
0762     }
0763 
0764     if (topa->z_count == topa->last - 1) {
0765         if (order == TOPA_ENTRY(topa, topa->last - 1)->size)
0766             topa->z_count++;
0767     }
0768 
0769     TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
0770     TOPA_ENTRY(topa, -1)->size = order;
0771     if (!buf->snapshot &&
0772         !intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
0773         TOPA_ENTRY(topa, -1)->intr = 1;
0774         TOPA_ENTRY(topa, -1)->stop = 1;
0775     }
0776 
0777     topa->last++;
0778     topa->size += sizes(order);
0779 
0780     buf->nr_pages += 1ul << order;
0781 
0782     return 0;
0783 }
0784 
0785 /**
0786  * pt_topa_dump() - print ToPA tables and their entries
0787  * @buf:    PT buffer.
0788  */
0789 static void pt_topa_dump(struct pt_buffer *buf)
0790 {
0791     struct topa *topa;
0792 
0793     list_for_each_entry(topa, &buf->tables, list) {
0794         struct topa_page *tp = topa_to_page(topa);
0795         int i;
0796 
0797         pr_debug("# table @%p, off %llx size %zx\n", tp->table,
0798              topa->offset, topa->size);
0799         for (i = 0; i < TENTS_PER_PAGE; i++) {
0800             pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
0801                  &tp->table[i],
0802                  (unsigned long)tp->table[i].base << TOPA_SHIFT,
0803                  sizes(tp->table[i].size),
0804                  tp->table[i].end ?  'E' : ' ',
0805                  tp->table[i].intr ? 'I' : ' ',
0806                  tp->table[i].stop ? 'S' : ' ',
0807                  *(u64 *)&tp->table[i]);
0808             if ((intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) &&
0809                  tp->table[i].stop) ||
0810                 tp->table[i].end)
0811                 break;
0812             if (!i && topa->z_count)
0813                 i += topa->z_count;
0814         }
0815     }
0816 }
0817 
0818 /**
0819  * pt_buffer_advance() - advance to the next output region
0820  * @buf:    PT buffer.
0821  *
0822  * Advance the current pointers in the buffer to the next ToPA entry.
0823  */
0824 static void pt_buffer_advance(struct pt_buffer *buf)
0825 {
0826     buf->output_off = 0;
0827     buf->cur_idx++;
0828 
0829     if (buf->cur_idx == buf->cur->last) {
0830         if (buf->cur == buf->last)
0831             buf->cur = buf->first;
0832         else
0833             buf->cur = list_entry(buf->cur->list.next, struct topa,
0834                           list);
0835         buf->cur_idx = 0;
0836     }
0837 }
0838 
0839 /**
0840  * pt_update_head() - calculate current offsets and sizes
0841  * @pt:     Per-cpu pt context.
0842  *
0843  * Update buffer's current write pointer position and data size.
0844  */
0845 static void pt_update_head(struct pt *pt)
0846 {
0847     struct pt_buffer *buf = perf_get_aux(&pt->handle);
0848     u64 topa_idx, base, old;
0849 
0850     if (buf->single) {
0851         local_set(&buf->data_size, buf->output_off);
0852         return;
0853     }
0854 
0855     /* offset of the first region in this table from the beginning of buf */
0856     base = buf->cur->offset + buf->output_off;
0857 
0858     /* offset of the current output region within this table */
0859     for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
0860         base += TOPA_ENTRY_SIZE(buf->cur, topa_idx);
0861 
0862     if (buf->snapshot) {
0863         local_set(&buf->data_size, base);
0864     } else {
0865         old = (local64_xchg(&buf->head, base) &
0866                ((buf->nr_pages << PAGE_SHIFT) - 1));
0867         if (base < old)
0868             base += buf->nr_pages << PAGE_SHIFT;
0869 
0870         local_add(base - old, &buf->data_size);
0871     }
0872 }
0873 
0874 /**
0875  * pt_buffer_region() - obtain current output region's address
0876  * @buf:    PT buffer.
0877  */
0878 static void *pt_buffer_region(struct pt_buffer *buf)
0879 {
0880     return phys_to_virt(TOPA_ENTRY(buf->cur, buf->cur_idx)->base << TOPA_SHIFT);
0881 }
0882 
0883 /**
0884  * pt_buffer_region_size() - obtain current output region's size
0885  * @buf:    PT buffer.
0886  */
0887 static size_t pt_buffer_region_size(struct pt_buffer *buf)
0888 {
0889     return TOPA_ENTRY_SIZE(buf->cur, buf->cur_idx);
0890 }
0891 
0892 /**
0893  * pt_handle_status() - take care of possible status conditions
0894  * @pt:     Per-cpu pt context.
0895  */
0896 static void pt_handle_status(struct pt *pt)
0897 {
0898     struct pt_buffer *buf = perf_get_aux(&pt->handle);
0899     int advance = 0;
0900     u64 status;
0901 
0902     rdmsrl(MSR_IA32_RTIT_STATUS, status);
0903 
0904     if (status & RTIT_STATUS_ERROR) {
0905         pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
0906         pt_topa_dump(buf);
0907         status &= ~RTIT_STATUS_ERROR;
0908     }
0909 
0910     if (status & RTIT_STATUS_STOPPED) {
0911         status &= ~RTIT_STATUS_STOPPED;
0912 
0913         /*
0914          * On systems that only do single-entry ToPA, hitting STOP
0915          * means we are already losing data; need to let the decoder
0916          * know.
0917          */
0918         if (!buf->single &&
0919             (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) ||
0920              buf->output_off == pt_buffer_region_size(buf))) {
0921             perf_aux_output_flag(&pt->handle,
0922                                  PERF_AUX_FLAG_TRUNCATED);
0923             advance++;
0924         }
0925     }
0926 
0927     /*
0928      * Also, on single-entry ToPA implementations, the interrupt will come
0929      * before the output reaches its output region's boundary.
0930      */
0931     if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) &&
0932         !buf->snapshot &&
0933         pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
0934         void *head = pt_buffer_region(buf);
0935 
0936         /* everything within this margin needs to be zeroed out */
0937         memset(head + buf->output_off, 0,
0938                pt_buffer_region_size(buf) -
0939                buf->output_off);
0940         advance++;
0941     }
0942 
0943     if (advance)
0944         pt_buffer_advance(buf);
0945 
0946     wrmsrl(MSR_IA32_RTIT_STATUS, status);
0947 }
0948 
0949 /**
0950  * pt_read_offset() - translate registers into buffer pointers
0951  * @buf:    PT buffer.
0952  *
0953  * Set buffer's output pointers from MSR values.
0954  */
0955 static void pt_read_offset(struct pt_buffer *buf)
0956 {
0957     struct pt *pt = this_cpu_ptr(&pt_ctx);
0958     struct topa_page *tp;
0959 
0960     if (!buf->single) {
0961         rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, pt->output_base);
0962         tp = phys_to_virt(pt->output_base);
0963         buf->cur = &tp->topa;
0964     }
0965 
0966     rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, pt->output_mask);
0967     /* offset within current output region */
0968     buf->output_off = pt->output_mask >> 32;
0969     /* index of current output region within this table */
0970     if (!buf->single)
0971         buf->cur_idx = (pt->output_mask & 0xffffff80) >> 7;
0972 }
0973 
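/*
 * pt_topa_entry_for_page() - find the ToPA entry covering AUX page @pg.
 * Walks the table list to find the covering table, uses the z_count run of
 * equally sized entries at the start of the table as a shortcut, and falls
 * back to a linear scan over the remaining entries.
 */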
0974 static struct topa_entry *
0975 pt_topa_entry_for_page(struct pt_buffer *buf, unsigned int pg)
0976 {
0977     struct topa_page *tp;
0978     struct topa *topa;
0979     unsigned int idx, cur_pg = 0, z_pg = 0, start_idx = 0;
0980 
0981     /*
0982      * Indicates a bug in the caller.
0983      */
0984     if (WARN_ON_ONCE(pg >= buf->nr_pages))
0985         return NULL;
0986 
0987     /*
0988      * First, find the ToPA table where @pg fits. With high
0989      * order allocations, there shouldn't be many of these.
0990      */
0991     list_for_each_entry(topa, &buf->tables, list) {
0992         if (topa->offset + topa->size > pg << PAGE_SHIFT)
0993             goto found;
0994     }
0995 
0996     /*
0997      * Hitting this means we have a problem in the ToPA
0998      * allocation code.
0999      */
1000     WARN_ON_ONCE(1);
1001 
1002     return NULL;
1003 
1004 found:
1005     /*
1006      * Indicates a problem in the ToPA allocation code.
1007      */
1008     if (WARN_ON_ONCE(topa->last == -1))
1009         return NULL;
1010 
1011     tp = topa_to_page(topa);
1012     cur_pg = PFN_DOWN(topa->offset);
1013     if (topa->z_count) {
1014         z_pg = TOPA_ENTRY_PAGES(topa, 0) * (topa->z_count + 1);
1015         start_idx = topa->z_count + 1;
1016     }
1017 
1018     /*
1019      * Multiple entries at the beginning of the table have the same size,
1020      * ideally all of them; if @pg falls there, the search is done.
1021      */
1022     if (pg >= cur_pg && pg < cur_pg + z_pg) {
1023         idx = (pg - cur_pg) / TOPA_ENTRY_PAGES(topa, 0);
1024         return &tp->table[idx];
1025     }
1026 
1027     /*
1028      * Otherwise, slow path: iterate through the remaining entries.
1029      */
1030     for (idx = start_idx, cur_pg += z_pg; idx < topa->last; idx++) {
1031         if (cur_pg + TOPA_ENTRY_PAGES(topa, idx) > pg)
1032             return &tp->table[idx];
1033 
1034         cur_pg += TOPA_ENTRY_PAGES(topa, idx);
1035     }
1036 
1037     /*
1038      * Means we couldn't find a ToPA entry in the table that does match.
1039      */
1040     WARN_ON_ONCE(1);
1041 
1042     return NULL;
1043 }
1044 
1045 static struct topa_entry *
1046 pt_topa_prev_entry(struct pt_buffer *buf, struct topa_entry *te)
1047 {
1048     unsigned long table = (unsigned long)te & ~(PAGE_SIZE - 1);
1049     struct topa_page *tp;
1050     struct topa *topa;
1051 
1052     tp = (struct topa_page *)table;
1053     if (tp->table != te)
1054         return --te;
1055 
1056     topa = &tp->topa;
1057     if (topa == buf->first)
1058         topa = buf->last;
1059     else
1060         topa = list_prev_entry(topa, list);
1061 
1062     tp = topa_to_page(topa);
1063 
1064     return &tp->table[topa->last - 1];
1065 }
1066 
1067 /**
1068  * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
1069  * @buf:    PT buffer.
1070  * @handle: Current output handle.
1071  *
1072  * Place INT and STOP marks to prevent overwriting old data that the consumer
1073  * hasn't yet collected and to wake up the consumer after a certain fraction of
1074  * the buffer has filled up. Only needed and sensible for non-snapshot counters.
1075  *
1076  * This obviously relies on buf::head to figure out buffer markers, so it has
1077  * to be called after pt_buffer_reset_offsets() and before the hardware tracing
1078  * is enabled.
1079  */
1080 static int pt_buffer_reset_markers(struct pt_buffer *buf,
1081                    struct perf_output_handle *handle)
1082 
1083 {
1084     unsigned long head = local64_read(&buf->head);
1085     unsigned long idx, npages, wakeup;
1086 
1087     if (buf->single)
1088         return 0;
1089 
1090     /* can't stop in the middle of an output region */
1091     if (buf->output_off + handle->size + 1 < pt_buffer_region_size(buf)) {
1092         perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
1093         return -EINVAL;
1094     }
1095 
1096 
1097     /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
1098     if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
1099         return 0;
1100 
1101     /* clear STOP and INT from current entry */
1102     if (buf->stop_te) {
1103         buf->stop_te->stop = 0;
1104         buf->stop_te->intr = 0;
1105     }
1106 
1107     if (buf->intr_te)
1108         buf->intr_te->intr = 0;
1109 
1110     /* how many pages till the STOP marker */
1111     npages = handle->size >> PAGE_SHIFT;
1112 
1113     /* if it's on a page boundary, fill up one more page */
1114     if (!offset_in_page(head + handle->size + 1))
1115         npages++;
1116 
1117     idx = (head >> PAGE_SHIFT) + npages;
1118     idx &= buf->nr_pages - 1;
1119 
1120     if (idx != buf->stop_pos) {
1121         buf->stop_pos = idx;
1122         buf->stop_te = pt_topa_entry_for_page(buf, idx);
1123         buf->stop_te = pt_topa_prev_entry(buf, buf->stop_te);
1124     }
1125 
1126     wakeup = handle->wakeup >> PAGE_SHIFT;
1127 
1128     /* in the worst case, wake up the consumer one page before hard stop */
1129     idx = (head >> PAGE_SHIFT) + npages - 1;
1130     if (idx > wakeup)
1131         idx = wakeup;
1132 
1133     idx &= buf->nr_pages - 1;
1134     if (idx != buf->intr_pos) {
1135         buf->intr_pos = idx;
1136         buf->intr_te = pt_topa_entry_for_page(buf, idx);
1137         buf->intr_te = pt_topa_prev_entry(buf, buf->intr_te);
1138     }
1139 
1140     buf->stop_te->stop = 1;
1141     buf->stop_te->intr = 1;
1142     buf->intr_te->intr = 1;
1143 
1144     return 0;
1145 }
1146 
1147 /**
1148  * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
1149  * @buf:    PT buffer.
1150  * @head:   Write pointer (aux_head) from AUX buffer.
1151  *
1152  * Find the ToPA table and entry corresponding to given @head and set buffer's
1153  * "current" pointers accordingly. This is done after we have obtained the
1154  * current aux_head position from a successful call to perf_aux_output_begin()
1155  * to make sure the hardware is writing to the right place.
1156  *
1157  * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
1158  * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
1159  * which are used to determine INT and STOP markers' locations by a subsequent
1160  * call to pt_buffer_reset_markers().
1161  */
1162 static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
1163 {
1164     struct topa_page *cur_tp;
1165     struct topa_entry *te;
1166     int pg;
1167 
1168     if (buf->snapshot)
1169         head &= (buf->nr_pages << PAGE_SHIFT) - 1;
1170 
1171     if (!buf->single) {
1172         pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
1173         te = pt_topa_entry_for_page(buf, pg);
1174 
1175         cur_tp = topa_entry_to_page(te);
1176         buf->cur = &cur_tp->topa;
1177         buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0);
1178         buf->output_off = head & (pt_buffer_region_size(buf) - 1);
1179     } else {
1180         buf->output_off = head;
1181     }
1182 
1183     local64_set(&buf->head, head);
1184     local_set(&buf->data_size, 0);
1185 }
1186 
1187 /**
1188  * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
1189  * @buf:    PT buffer.
1190  */
1191 static void pt_buffer_fini_topa(struct pt_buffer *buf)
1192 {
1193     struct topa *topa, *iter;
1194 
1195     if (buf->single)
1196         return;
1197 
1198     list_for_each_entry_safe(topa, iter, &buf->tables, list) {
1199         /*
1200          * right now, this is in free_aux() path only, so
1201          * no need to unlink this table from the list
1202          */
1203         topa_free(topa);
1204     }
1205 }
1206 
1207 /**
1208  * pt_buffer_init_topa() - initialize ToPA table for pt buffer
1209  * @buf:    PT buffer.
1210  * @size:   Total size of all regions within this ToPA.
1211  * @gfp:    Allocation flags.
1212  */
1213 static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu,
1214                    unsigned long nr_pages, gfp_t gfp)
1215 {
1216     struct topa *topa;
1217     int err;
1218 
1219     topa = topa_alloc(cpu, gfp);
1220     if (!topa)
1221         return -ENOMEM;
1222 
1223     topa_insert_table(buf, topa);
1224 
1225     while (buf->nr_pages < nr_pages) {
1226         err = topa_insert_pages(buf, cpu, gfp);
1227         if (err) {
1228             pt_buffer_fini_topa(buf);
1229             return -ENOMEM;
1230         }
1231     }
1232 
1233     /* link last table to the first one, unless we're double buffering */
1234     if (intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
1235         TOPA_ENTRY(buf->last, -1)->base = topa_pfn(buf->first);
1236         TOPA_ENTRY(buf->last, -1)->end = 1;
1237     }
1238 
1239     pt_topa_dump(buf);
1240     return 0;
1241 }
1242 
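/*
 * pt_buffer_try_single() - use single range output instead of ToPA when the
 * event is a snapshot (no interrupts needed), the hardware supports it and
 * the whole AUX buffer is one contiguous high-order allocation.
 */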
1243 static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages)
1244 {
1245     struct page *p = virt_to_page(buf->data_pages[0]);
1246     int ret = -ENOTSUPP, order = 0;
1247 
1248     /*
1249      * We can use single range output mode
1250      * + in snapshot mode, where we don't need interrupts;
1251      * + if the hardware supports it;
1252      * + if the entire buffer is one contiguous allocation.
1253      */
1254     if (!buf->snapshot)
1255         goto out;
1256 
1257     if (!intel_pt_validate_hw_cap(PT_CAP_single_range_output))
1258         goto out;
1259 
1260     if (PagePrivate(p))
1261         order = page_private(p);
1262 
1263     if (1 << order != nr_pages)
1264         goto out;
1265 
1266     buf->single = true;
1267     buf->nr_pages = nr_pages;
1268     ret = 0;
1269 out:
1270     return ret;
1271 }
1272 
1273 /**
1274  * pt_buffer_setup_aux() - set up topa tables for a PT buffer
1275  * @cpu:    Cpu on which to allocate, -1 means current.
1276  * @pages:  Array of pointers to buffer pages passed from perf core.
1277  * @nr_pages:   Number of pages in the buffer.
1278  * @snapshot:   If this is a snapshot/overwrite counter.
1279  *
1280  * This is a pmu::setup_aux callback that sets up ToPA tables and all the
1281  * bookkeeping for an AUX buffer.
1282  *
1283  * Return:  Our private PT buffer structure.
1284  */
1285 static void *
1286 pt_buffer_setup_aux(struct perf_event *event, void **pages,
1287             int nr_pages, bool snapshot)
1288 {
1289     struct pt_buffer *buf;
1290     int node, ret, cpu = event->cpu;
1291 
1292     if (!nr_pages)
1293         return NULL;
1294 
1295     /*
1296      * Only support AUX sampling in snapshot mode, where we don't
1297      * generate NMIs.
1298      */
1299     if (event->attr.aux_sample_size && !snapshot)
1300         return NULL;
1301 
1302     if (cpu == -1)
1303         cpu = raw_smp_processor_id();
1304     node = cpu_to_node(cpu);
1305 
1306     buf = kzalloc_node(sizeof(struct pt_buffer), GFP_KERNEL, node);
1307     if (!buf)
1308         return NULL;
1309 
1310     buf->snapshot = snapshot;
1311     buf->data_pages = pages;
1312     buf->stop_pos = -1;
1313     buf->intr_pos = -1;
1314 
1315     INIT_LIST_HEAD(&buf->tables);
1316 
1317     ret = pt_buffer_try_single(buf, nr_pages);
1318     if (!ret)
1319         return buf;
1320 
1321     ret = pt_buffer_init_topa(buf, cpu, nr_pages, GFP_KERNEL);
1322     if (ret) {
1323         kfree(buf);
1324         return NULL;
1325     }
1326 
1327     return buf;
1328 }
1329 
1330 /**
1331  * pt_buffer_free_aux() - perf AUX deallocation path callback
1332  * @data:   PT buffer.
1333  */
1334 static void pt_buffer_free_aux(void *data)
1335 {
1336     struct pt_buffer *buf = data;
1337 
1338     pt_buffer_fini_topa(buf);
1339     kfree(buf);
1340 }
1341 
1342 static int pt_addr_filters_init(struct perf_event *event)
1343 {
1344     struct pt_filters *filters;
1345     int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu);
1346 
1347     if (!intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
1348         return 0;
1349 
1350     filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node);
1351     if (!filters)
1352         return -ENOMEM;
1353 
1354     if (event->parent)
1355         memcpy(filters, event->parent->hw.addr_filters,
1356                sizeof(*filters));
1357 
1358     event->hw.addr_filters = filters;
1359 
1360     return 0;
1361 }
1362 
1363 static void pt_addr_filters_fini(struct perf_event *event)
1364 {
1365     kfree(event->hw.addr_filters);
1366     event->hw.addr_filters = NULL;
1367 }
1368 
1369 #ifdef CONFIG_X86_64
1370 /* Clamp to a canonical address greater-than-or-equal-to the address given */
1371 static u64 clamp_to_ge_canonical_addr(u64 vaddr, u8 vaddr_bits)
1372 {
1373     return __is_canonical_address(vaddr, vaddr_bits) ?
1374            vaddr :
1375            -BIT_ULL(vaddr_bits - 1);
1376 }
1377 
1378 /* Clamp to a canonical address less-than-or-equal-to the address given */
1379 static u64 clamp_to_le_canonical_addr(u64 vaddr, u8 vaddr_bits)
1380 {
1381     return __is_canonical_address(vaddr, vaddr_bits) ?
1382            vaddr :
1383            BIT_ULL(vaddr_bits - 1) - 1;
1384 }
1385 #else
1386 #define clamp_to_ge_canonical_addr(x, y) (x)
1387 #define clamp_to_le_canonical_addr(x, y) (x)
1388 #endif
1389 
1390 static int pt_event_addr_filters_validate(struct list_head *filters)
1391 {
1392     struct perf_addr_filter *filter;
1393     int range = 0;
1394 
1395     list_for_each_entry(filter, filters, entry) {
1396         /*
1397          * PT doesn't support single address triggers and
1398          * 'start' filters.
1399          */
1400         if (!filter->size ||
1401             filter->action == PERF_ADDR_FILTER_ACTION_START)
1402             return -EOPNOTSUPP;
1403 
1404         if (++range > intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
1405             return -EOPNOTSUPP;
1406     }
1407 
1408     return 0;
1409 }
1410 
1411 static void pt_event_addr_filters_sync(struct perf_event *event)
1412 {
1413     struct perf_addr_filters_head *head = perf_event_addr_filters(event);
1414     unsigned long msr_a, msr_b;
1415     struct perf_addr_filter_range *fr = event->addr_filter_ranges;
1416     struct pt_filters *filters = event->hw.addr_filters;
1417     struct perf_addr_filter *filter;
1418     int range = 0;
1419 
1420     if (!filters)
1421         return;
1422 
1423     list_for_each_entry(filter, &head->list, entry) {
1424         if (filter->path.dentry && !fr[range].start) {
1425             msr_a = msr_b = 0;
1426         } else {
1427             unsigned long n = fr[range].size - 1;
1428             unsigned long a = fr[range].start;
1429             unsigned long b;
1430 
1431             if (a > ULONG_MAX - n)
1432                 b = ULONG_MAX;
1433             else
1434                 b = a + n;
1435             /*
1436              * Apply the offset. 64-bit addresses written to the
1437              * MSRs must be canonical, but the range can encompass
1438              * non-canonical addresses. Since software cannot
1439              * execute at non-canonical addresses, adjusting to
1440              * canonical addresses does not affect the result of the
1441              * address filter.
1442              */
1443             msr_a = clamp_to_ge_canonical_addr(a, boot_cpu_data.x86_virt_bits);
1444             msr_b = clamp_to_le_canonical_addr(b, boot_cpu_data.x86_virt_bits);
1445             if (msr_b < msr_a)
1446                 msr_a = msr_b = 0;
1447         }
1448 
1449         filters->filter[range].msr_a  = msr_a;
1450         filters->filter[range].msr_b  = msr_b;
1451         if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER)
1452             filters->filter[range].config = 1;
1453         else
1454             filters->filter[range].config = 2;
1455         range++;
1456     }
1457 
1458     filters->nr_filters = range;
1459 }
1460 
1461 /**
1462  * intel_pt_interrupt() - PT PMI handler
1463  */
1464 void intel_pt_interrupt(void)
1465 {
1466     struct pt *pt = this_cpu_ptr(&pt_ctx);
1467     struct pt_buffer *buf;
1468     struct perf_event *event = pt->handle.event;
1469 
1470     /*
1471      * There may be a dangling PT bit in the interrupt status register
1472      * after PT has been disabled by pt_event_stop(). Make sure we don't
1473      * do anything (particularly, re-enable) for this event here.
1474      */
1475     if (!READ_ONCE(pt->handle_nmi))
1476         return;
1477 
1478     if (!event)
1479         return;
1480 
1481     pt_config_stop(event);
1482 
1483     buf = perf_get_aux(&pt->handle);
1484     if (!buf)
1485         return;
1486 
1487     pt_read_offset(buf);
1488 
1489     pt_handle_status(pt);
1490 
1491     pt_update_head(pt);
1492 
1493     perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
1494 
1495     if (!event->hw.state) {
1496         int ret;
1497 
1498         buf = perf_aux_output_begin(&pt->handle, event);
1499         if (!buf) {
1500             event->hw.state = PERF_HES_STOPPED;
1501             return;
1502         }
1503 
1504         pt_buffer_reset_offsets(buf, pt->handle.head);
1505         /* snapshot counters don't use PMI, so it's safe */
1506         ret = pt_buffer_reset_markers(buf, &pt->handle);
1507         if (ret) {
1508             perf_aux_output_end(&pt->handle, 0);
1509             return;
1510         }
1511 
1512         pt_config_buffer(buf);
1513         pt_config_start(event);
1514     }
1515 }
1516 
1517 void intel_pt_handle_vmx(int on)
1518 {
1519     struct pt *pt = this_cpu_ptr(&pt_ctx);
1520     struct perf_event *event;
1521     unsigned long flags;
1522 
1523     /* PT plays nice with VMX, do nothing */
1524     if (pt_pmu.vmx)
1525         return;
1526 
1527     /*
1528      * VMXON will clear RTIT_CTL.TraceEn; we need to make
1529      * sure to not try to set it while VMX is on. Disable
1530      * interrupts to avoid racing with pmu callbacks;
1531      * concurrent PMI should be handled fine.
1532      */
1533     local_irq_save(flags);
1534     WRITE_ONCE(pt->vmx_on, on);
1535 
1536     /*
1537      * If an AUX transaction is in progress, it will contain
1538      * gap(s), so flag it PARTIAL to inform the user.
1539      */
1540     event = pt->handle.event;
1541     if (event)
1542         perf_aux_output_flag(&pt->handle,
1543                              PERF_AUX_FLAG_PARTIAL);
1544 
1545     /* Turn PTs back on */
1546     if (!on && event)
1547         wrmsrl(MSR_IA32_RTIT_CTL, event->hw.config);
1548 
1549     local_irq_restore(flags);
1550 }
1551 EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
1552 
1553 /*
1554  * PMU callbacks
1555  */
1556 
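/*
 * pt_event_start() - pmu::start callback: claim the AUX buffer, recompute
 * the write offsets and (for non-snapshot events) the STOP/INT markers,
 * then program the output MSRs and enable tracing.
 */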
1557 static void pt_event_start(struct perf_event *event, int mode)
1558 {
1559     struct hw_perf_event *hwc = &event->hw;
1560     struct pt *pt = this_cpu_ptr(&pt_ctx);
1561     struct pt_buffer *buf;
1562 
1563     buf = perf_aux_output_begin(&pt->handle, event);
1564     if (!buf)
1565         goto fail_stop;
1566 
1567     pt_buffer_reset_offsets(buf, pt->handle.head);
1568     if (!buf->snapshot) {
1569         if (pt_buffer_reset_markers(buf, &pt->handle))
1570             goto fail_end_stop;
1571     }
1572 
1573     WRITE_ONCE(pt->handle_nmi, 1);
1574     hwc->state = 0;
1575 
1576     pt_config_buffer(buf);
1577     pt_config(event);
1578 
1579     return;
1580 
1581 fail_end_stop:
1582     perf_aux_output_end(&pt->handle, 0);
1583 fail_stop:
1584     hwc->state = PERF_HES_STOPPED;
1585 }
1586 
1587 static void pt_event_stop(struct perf_event *event, int mode)
1588 {
1589     struct pt *pt = this_cpu_ptr(&pt_ctx);
1590 
1591     /*
1592      * Protect against the PMI racing with disabling wrmsr,
1593      * see comment in intel_pt_interrupt().
1594      */
1595     WRITE_ONCE(pt->handle_nmi, 0);
1596 
1597     pt_config_stop(event);
1598 
1599     if (event->hw.state == PERF_HES_STOPPED)
1600         return;
1601 
1602     event->hw.state = PERF_HES_STOPPED;
1603 
1604     if (mode & PERF_EF_UPDATE) {
1605         struct pt_buffer *buf = perf_get_aux(&pt->handle);
1606 
1607         if (!buf)
1608             return;
1609 
1610         if (WARN_ON_ONCE(pt->handle.event != event))
1611             return;
1612 
1613         pt_read_offset(buf);
1614 
1615         pt_handle_status(pt);
1616 
1617         pt_update_head(pt);
1618 
1619         if (buf->snapshot)
1620             pt->handle.head =
1621                 local_xchg(&buf->data_size,
1622                        buf->nr_pages << PAGE_SHIFT);
1623         perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
1624     }
1625 }
1626 
1627 static long pt_event_snapshot_aux(struct perf_event *event,
1628                   struct perf_output_handle *handle,
1629                   unsigned long size)
1630 {
1631     struct pt *pt = this_cpu_ptr(&pt_ctx);
1632     struct pt_buffer *buf = perf_get_aux(&pt->handle);
1633     unsigned long from = 0, to;
1634     long ret;
1635 
1636     if (WARN_ON_ONCE(!buf))
1637         return 0;
1638 
1639     /*
1640      * Sampling is only allowed on snapshot events;
1641      * see pt_buffer_setup_aux().
1642      */
1643     if (WARN_ON_ONCE(!buf->snapshot))
1644         return 0;
1645 
1646     /*
1647      * Here, handle_nmi tells us if the tracing is on
1648      */
1649     if (READ_ONCE(pt->handle_nmi))
1650         pt_config_stop(event);
1651 
1652     pt_read_offset(buf);
1653     pt_update_head(pt);
1654 
1655     to = local_read(&buf->data_size);
1656     if (to < size)
1657         from = buf->nr_pages << PAGE_SHIFT;
1658     from += to - size;
1659 
1660     ret = perf_output_copy_aux(&pt->handle, handle, from, to);
1661 
1662     /*
1663      * If the tracing was on when we turned up, restart it.
1664      * Compiler barrier not needed as we couldn't have been
1665      * preempted by anything that touches pt->handle_nmi.
1666      */
1667     if (pt->handle_nmi)
1668         pt_config_start(event);
1669 
1670     return ret;
1671 }
1672 
1673 static void pt_event_del(struct perf_event *event, int mode)
1674 {
1675     pt_event_stop(event, PERF_EF_UPDATE);
1676 }
1677 
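/*
 * pt_event_add() - pmu::add callback. PT is exclusive per CPU: if another
 * event already owns pt->handle, return -EBUSY; otherwise start tracing
 * immediately when PERF_EF_START is set.
 */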
1678 static int pt_event_add(struct perf_event *event, int mode)
1679 {
1680     struct pt *pt = this_cpu_ptr(&pt_ctx);
1681     struct hw_perf_event *hwc = &event->hw;
1682     int ret = -EBUSY;
1683 
1684     if (pt->handle.event)
1685         goto fail;
1686 
1687     if (mode & PERF_EF_START) {
1688         pt_event_start(event, 0);
1689         ret = -EINVAL;
1690         if (hwc->state == PERF_HES_STOPPED)
1691             goto fail;
1692     } else {
1693         hwc->state = PERF_HES_STOPPED;
1694     }
1695 
1696     ret = 0;
1697 fail:
1698 
1699     return ret;
1700 }
1701 
1702 static void pt_event_read(struct perf_event *event)
1703 {
1704 }
1705 
1706 static void pt_event_destroy(struct perf_event *event)
1707 {
1708     pt_addr_filters_fini(event);
1709     x86_del_exclusive(x86_lbr_exclusive_pt);
1710 }
1711 
1712 static int pt_event_init(struct perf_event *event)
1713 {
1714     if (event->attr.type != pt_pmu.pmu.type)
1715         return -ENOENT;
1716 
1717     if (!pt_event_valid(event))
1718         return -EINVAL;
1719 
1720     if (x86_add_exclusive(x86_lbr_exclusive_pt))
1721         return -EBUSY;
1722 
1723     if (pt_addr_filters_init(event)) {
1724         x86_del_exclusive(x86_lbr_exclusive_pt);
1725         return -ENOMEM;
1726     }
1727 
1728     event->destroy = pt_event_destroy;
1729 
1730     return 0;
1731 }
1732 
1733 void cpu_emergency_stop_pt(void)
1734 {
1735     struct pt *pt = this_cpu_ptr(&pt_ctx);
1736 
1737     if (pt->handle.event)
1738         pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
1739 }
1740 
1741 int is_intel_pt_event(struct perf_event *event)
1742 {
1743     return event->pmu == &pt_pmu.pmu;
1744 }
1745 
1746 static __init int pt_init(void)
1747 {
1748     int ret, cpu, prior_warn = 0;
1749 
1750     BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
1751 
1752     if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
1753         return -ENODEV;
1754 
1755     cpus_read_lock();
1756     for_each_online_cpu(cpu) {
1757         u64 ctl;
1758 
1759         ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
1760         if (!ret && (ctl & RTIT_CTL_TRACEEN))
1761             prior_warn++;
1762     }
1763     cpus_read_unlock();
1764 
1765     if (prior_warn) {
1766         x86_add_exclusive(x86_lbr_exclusive_pt);
1767         pr_warn("PT is enabled at boot time, doing nothing\n");
1768 
1769         return -EBUSY;
1770     }
1771 
1772     ret = pt_pmu_hw_init();
1773     if (ret)
1774         return ret;
1775 
1776     if (!intel_pt_validate_hw_cap(PT_CAP_topa_output)) {
1777         pr_warn("ToPA output is not supported on this CPU\n");
1778         return -ENODEV;
1779     }
1780 
1781     if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
1782         pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG;
1783 
1784     pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
1785     pt_pmu.pmu.attr_groups       = pt_attr_groups;
1786     pt_pmu.pmu.task_ctx_nr       = perf_sw_context;
1787     pt_pmu.pmu.event_init        = pt_event_init;
1788     pt_pmu.pmu.add           = pt_event_add;
1789     pt_pmu.pmu.del           = pt_event_del;
1790     pt_pmu.pmu.start         = pt_event_start;
1791     pt_pmu.pmu.stop          = pt_event_stop;
1792     pt_pmu.pmu.snapshot_aux      = pt_event_snapshot_aux;
1793     pt_pmu.pmu.read          = pt_event_read;
1794     pt_pmu.pmu.setup_aux         = pt_buffer_setup_aux;
1795     pt_pmu.pmu.free_aux      = pt_buffer_free_aux;
1796     pt_pmu.pmu.addr_filters_sync     = pt_event_addr_filters_sync;
1797     pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate;
1798     pt_pmu.pmu.nr_addr_filters       =
1799         intel_pt_validate_hw_cap(PT_CAP_num_address_ranges);
1800 
1801     ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
1802 
1803     return ret;
1804 }
1805 arch_initcall(pt_init);