0001 /*
0002  * Performance events - AMD IBS
0003  *
0004  *  Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
0005  *
0006  *  For licencing details see kernel-base/COPYING
0007  */
0008 
0009 #include <linux/perf_event.h>
0010 #include <linux/init.h>
0011 #include <linux/export.h>
0012 #include <linux/pci.h>
0013 #include <linux/ptrace.h>
0014 #include <linux/syscore_ops.h>
0015 #include <linux/sched/clock.h>
0016 
0017 #include <asm/apic.h>
0018 
0019 #include "../perf_event.h"
0020 
0021 static u32 ibs_caps;
0022 
0023 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
0024 
0025 #include <linux/kprobes.h>
0026 #include <linux/hardirq.h>
0027 
0028 #include <asm/nmi.h>
0029 #include <asm/amd-ibs.h>
0030 
0031 #define IBS_FETCH_CONFIG_MASK   (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
0032 #define IBS_OP_CONFIG_MASK  IBS_OP_MAX_CNT
0033 
0034 
0035 /*
0036  * IBS states:
0037  *
0038  * ENABLED; tracks the pmu::add(), pmu::del() state; when set, the counter is
0039  * taken and any further add()s must fail.
0040  *
0041  * STARTED/STOPPING/STOPPED; deal with pmu::start(), pmu::stop() state but are
0042  * complicated by the fact that the IBS hardware can send late NMIs (ie. after
0043  * we've cleared the EN bit).
0044  *
0045  * In order to consume these late NMIs we have the STOPPED state; any NMI that
0046  * happens after we've cleared the EN state will clear this bit and report the
0047  * NMI handled (this is fundamentally racy in the face of multiple NMI sources;
0048  * someone else can consume our STOPPED bit and our NMI will go unhandled).
0049  *
0050  * And since we cannot set/clear this separate bit together with the EN bit,
0051  * there are races; if we cleared STARTED early, an NMI could land in
0052  * between clearing STARTED and clearing the EN bit (in fact multiple NMIs
0053  * could happen if the period is small enough), and consume our STOPPED bit
0054  * and trigger streams of unhandled NMIs.
0055  *
0056  * If, however, we clear STARTED late, an NMI can hit between clearing the
0057  * EN bit and clearing STARTED, still see STARTED set and process the event.
0058  * If this event has the VALID bit clear, we bail properly, but this
0059  * is not a given. With VALID set we can end up calling pmu::stop() again
0060  * (the throttle logic) and trigger the WARNs in there.
0061  *
0062  * So what we do is set STOPPING before clearing EN to avoid the pmu::stop()
0063  * nesting, and clear STARTED late, so that we have a well defined state over
0064  * the clearing of the EN bit.
0065  *
0066  * XXX: we could probably be using !atomic bitops for all this.
0067  */
0068 
0069 enum ibs_states {
0070     IBS_ENABLED = 0,
0071     IBS_STARTED = 1,
0072     IBS_STOPPING    = 2,
0073     IBS_STOPPED = 3,
0074 
0075     IBS_MAX_STATES,
0076 };
0077 
0078 struct cpu_perf_ibs {
0079     struct perf_event   *event;
0080     unsigned long       state[BITS_TO_LONGS(IBS_MAX_STATES)];
0081 };
0082 
0083 struct perf_ibs {
0084     struct pmu          pmu;
0085     unsigned int            msr;
0086     u64             config_mask;
0087     u64             cnt_mask;
0088     u64             enable_mask;
0089     u64             valid_mask;
0090     u64             max_period;
0091     unsigned long           offset_mask[1];
0092     int             offset_max;
0093     unsigned int            fetch_count_reset_broken : 1;
0094     unsigned int            fetch_ignore_if_zero_rip : 1;
0095     struct cpu_perf_ibs __percpu    *pcpu;
0096 
0097     u64             (*get_count)(u64 config);
0098 };
0099 
0100 static int
0101 perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
0102 {
0103     s64 left = local64_read(&hwc->period_left);
0104     s64 period = hwc->sample_period;
0105     int overflow = 0;
0106 
0107     /*
0108      * If we are way outside a reasonable range then just skip forward:
0109      */
0110     if (unlikely(left <= -period)) {
0111         left = period;
0112         local64_set(&hwc->period_left, left);
0113         hwc->last_period = period;
0114         overflow = 1;
0115     }
0116 
0117     if (unlikely(left < (s64)min)) {
0118         left += period;
0119         local64_set(&hwc->period_left, left);
0120         hwc->last_period = period;
0121         overflow = 1;
0122     }
0123 
0124     /*
0125      * If the hw period that triggers the sw overflow is too short
0126      * we might re-enter the irq handler too soon. This biases the results.
0127      * Thus we shorten the next-to-last period and set the last
0128      * period to the max period.
0129      */
0130     if (left > max) {
0131         left -= max;
0132         if (left > max)
0133             left = max;
0134         else if (left < min)
0135             left = min;
0136     }
0137 
0138     *hw_period = (u64)left;
0139 
0140     return overflow;
0141 }
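
/*
 * Worked example for perf_event_set_period() above, with made-up numbers:
 * assume min = 0x10 and max = 1000000. If period_left is 1000005, then
 * left > max, so left -= max leaves 5, which is below min and gets raised
 * to 0x10. We therefore program a short 0x10 period now (the next-to-last
 * period), and the remaining ~max is programmed on the following round as
 * the last, full-length period.
 */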
0142 
0143 static int
0144 perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
0145 {
0146     struct hw_perf_event *hwc = &event->hw;
0147     int shift = 64 - width;
0148     u64 prev_raw_count;
0149     u64 delta;
0150 
0151     /*
0152      * Careful: an NMI might modify the previous event value.
0153      *
0154      * Our tactic to handle this is to first atomically read and
0155      * exchange a new raw count - then add that new-prev delta
0156      * count to the generic event atomically:
0157      */
0158     prev_raw_count = local64_read(&hwc->prev_count);
0159     if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
0160                     new_raw_count) != prev_raw_count)
0161         return 0;
0162 
0163     /*
0164      * Now we have the new raw value and have updated the prev
0165      * timestamp already. We can now calculate the elapsed delta
0166      * (event-)time and add that to the generic event.
0167      *
0168      * Careful, not all hw sign-extends above the physical width
0169      * of the count.
0170      */
0171     delta = (new_raw_count << shift) - (prev_raw_count << shift);
0172     delta >>= shift;
0173 
0174     local64_add(delta, &event->count);
0175     local64_sub(delta, &hwc->period_left);
0176 
0177     return 1;
0178 }
0179 
0180 static struct perf_ibs perf_ibs_fetch;
0181 static struct perf_ibs perf_ibs_op;
0182 
0183 static struct perf_ibs *get_ibs_pmu(int type)
0184 {
0185     if (perf_ibs_fetch.pmu.type == type)
0186         return &perf_ibs_fetch;
0187     if (perf_ibs_op.pmu.type == type)
0188         return &perf_ibs_op;
0189     return NULL;
0190 }
0191 
0192 /*
0193  * Use IBS for precise event sampling:
0194  *
0195  *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
0196  *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p
0197  *  perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
0198  *
0199  * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
0200  * MSRC001_1033) is used to select either cycle or micro-ops counting
0201  * mode.
0202  *
0203  * The rip of IBS samples has skid 0. Thus, IBS supports precise
0204  * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
0205  * rip is invalid when IBS was not able to record the rip correctly.
0206  * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
0207  *
0208  */
0209 static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
0210 {
0211     switch (event->attr.precise_ip) {
0212     case 0:
0213         return -ENOENT;
0214     case 1:
0215     case 2:
0216         break;
0217     default:
0218         return -EOPNOTSUPP;
0219     }
0220 
0221     switch (event->attr.type) {
0222     case PERF_TYPE_HARDWARE:
0223         switch (event->attr.config) {
0224         case PERF_COUNT_HW_CPU_CYCLES:
0225             *config = 0;
0226             return 0;
0227         }
0228         break;
0229     case PERF_TYPE_RAW:
0230         switch (event->attr.config) {
0231         case 0x0076:
0232             *config = 0;
0233             return 0;
0234         case 0x00C1:
0235             *config = IBS_OP_CNT_CTL;
0236             return 0;
0237         }
0238         break;
0239     default:
0240         return -ENOENT;
0241     }
0242 
0243     return -EOPNOTSUPP;
0244 }
0245 
0246 static int perf_ibs_init(struct perf_event *event)
0247 {
0248     struct hw_perf_event *hwc = &event->hw;
0249     struct perf_ibs *perf_ibs;
0250     u64 max_cnt, config;
0251     int ret;
0252 
0253     perf_ibs = get_ibs_pmu(event->attr.type);
0254     if (perf_ibs) {
0255         config = event->attr.config;
0256     } else {
0257         perf_ibs = &perf_ibs_op;
0258         ret = perf_ibs_precise_event(event, &config);
0259         if (ret)
0260             return ret;
0261     }
0262 
0263     if (event->pmu != &perf_ibs->pmu)
0264         return -ENOENT;
0265 
0266     if (config & ~perf_ibs->config_mask)
0267         return -EINVAL;
0268 
0269     if (hwc->sample_period) {
0270         if (config & perf_ibs->cnt_mask)
0271             /* raw max_cnt may not be set */
0272             return -EINVAL;
0273         if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
0274             /*
0275              * The lower 4 bits cannot be set in the IBS max cnt,
0276              * but allow it in case we adjust the
0277              * sample period to set a frequency.
0278              */
0279             return -EINVAL;
0280         hwc->sample_period &= ~0x0FULL;
0281         if (!hwc->sample_period)
0282             hwc->sample_period = 0x10;
0283     } else {
0284         max_cnt = config & perf_ibs->cnt_mask;
0285         config &= ~perf_ibs->cnt_mask;
0286         event->attr.sample_period = max_cnt << 4;
0287         hwc->sample_period = event->attr.sample_period;
0288     }
0289 
0290     if (!hwc->sample_period)
0291         return -EINVAL;
0292 
0293     /*
0294      * If we modify hwc->sample_period, we also need to update
0295      * hwc->last_period and hwc->period_left.
0296      */
0297     hwc->last_period = hwc->sample_period;
0298     local64_set(&hwc->period_left, hwc->sample_period);
0299 
0300     hwc->config_base = perf_ibs->msr;
0301     hwc->config = config;
0302 
0303     /*
0304      * The rip recorded by IbsOpRip will not be consistent with the rsp and
0305      * rbp recorded as part of the interrupt regs. Thus we need to use the
0306      * rip from the interrupt regs while unwinding the call stack. Setting
0307      * the _EARLY flag makes sure we unwind the call stack before the perf
0308      * sample rip is set to IbsOpRip.
0309      */
0310     if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
0311         event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY;
0312 
0313     return 0;
0314 }
0315 
0316 static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
0317                    struct hw_perf_event *hwc, u64 *period)
0318 {
0319     int overflow;
0320 
0321     /* ignore lower 4 bits in min count: */
0322     overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
0323     local64_set(&hwc->prev_count, 0);
0324 
0325     return overflow;
0326 }
0327 
0328 static u64 get_ibs_fetch_count(u64 config)
0329 {
0330     union ibs_fetch_ctl fetch_ctl = (union ibs_fetch_ctl)config;
0331 
0332     return fetch_ctl.fetch_cnt << 4;
0333 }
0334 
0335 static u64 get_ibs_op_count(u64 config)
0336 {
0337     union ibs_op_ctl op_ctl = (union ibs_op_ctl)config;
0338     u64 count = 0;
0339 
0340     /*
0341      * If the internal 27-bit counter rolled over, the count is MaxCnt
0342      * and the lower 7 bits of CurCnt are randomized.
0343      * Otherwise CurCnt has the full 27-bit current counter value.
0344      */
0345     if (op_ctl.op_val) {
0346         count = op_ctl.opmaxcnt << 4;
0347         if (ibs_caps & IBS_CAPS_OPCNTEXT)
0348             count += op_ctl.opmaxcnt_ext << 20;
0349     } else if (ibs_caps & IBS_CAPS_RDWROPCNT) {
0350         count = op_ctl.opcurcnt;
0351     }
0352 
0353     return count;
0354 }
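
/*
 * Example for get_ibs_op_count() above, with made-up field values: if
 * op_val is set with opmaxcnt = 0x123 and, with IBS_CAPS_OPCNTEXT,
 * opmaxcnt_ext = 0x2, the returned count is
 * (0x123 << 4) + (0x2 << 20) = 0x201230.
 */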
0355 
0356 static void
0357 perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
0358               u64 *config)
0359 {
0360     u64 count = perf_ibs->get_count(*config);
0361 
0362     /*
0363      * Set width to 64 since we do not overflow on max width but
0364      * instead on max count. In perf_ibs_set_period() we clear
0365      * prev count manually on overflow.
0366      */
0367     while (!perf_event_try_update(event, count, 64)) {
0368         rdmsrl(event->hw.config_base, *config);
0369         count = perf_ibs->get_count(*config);
0370     }
0371 }
0372 
0373 static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
0374                      struct hw_perf_event *hwc, u64 config)
0375 {
0376     u64 tmp = hwc->config | config;
0377 
0378     if (perf_ibs->fetch_count_reset_broken)
0379         wrmsrl(hwc->config_base, tmp & ~perf_ibs->enable_mask);
0380 
0381     wrmsrl(hwc->config_base, tmp | perf_ibs->enable_mask);
0382 }
0383 
0384 /*
0385  * Erratum #420 Instruction-Based Sampling Engine May Generate
0386  * Interrupt that Cannot Be Cleared:
0387  *
0388  * Must clear counter mask first, then clear the enable bit. See
0389  * Revision Guide for AMD Family 10h Processors, Publication #41322.
0390  */
0391 static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
0392                       struct hw_perf_event *hwc, u64 config)
0393 {
0394     config &= ~perf_ibs->cnt_mask;
0395     if (boot_cpu_data.x86 == 0x10)
0396         wrmsrl(hwc->config_base, config);
0397     config &= ~perf_ibs->enable_mask;
0398     wrmsrl(hwc->config_base, config);
0399 }
0400 
0401 /*
0402  * We cannot restore the ibs pmu state, so we always need to update
0403  * the event while stopping it and then reset the state when starting
0404  * again. Thus, we ignore the PERF_EF_RELOAD and PERF_EF_UPDATE flags in
0405  * perf_ibs_start()/perf_ibs_stop() and instead always do it.
0406  */
0407 static void perf_ibs_start(struct perf_event *event, int flags)
0408 {
0409     struct hw_perf_event *hwc = &event->hw;
0410     struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
0411     struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
0412     u64 period, config = 0;
0413 
0414     if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
0415         return;
0416 
0417     WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
0418     hwc->state = 0;
0419 
0420     perf_ibs_set_period(perf_ibs, hwc, &period);
0421     if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) {
0422         config |= period & IBS_OP_MAX_CNT_EXT_MASK;
0423         period &= ~IBS_OP_MAX_CNT_EXT_MASK;
0424     }
0425     config |= period >> 4;
0426 
0427     /*
0428      * Set STARTED before enabling the hardware, such that a subsequent NMI
0429      * must observe it.
0430      */
0431     set_bit(IBS_STARTED,    pcpu->state);
0432     clear_bit(IBS_STOPPING, pcpu->state);
0433     perf_ibs_enable_event(perf_ibs, hwc, config);
0434 
0435     perf_event_update_userpage(event);
0436 }
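
/*
 * Illustration of the period packing in perf_ibs_start() above, with a
 * made-up value: a (16-aligned) period of 0x8000 yields config |= 0x800,
 * i.e. the hardware max-count field counts in units of 16. With
 * IBS_CAPS_OPCNTEXT, the extended count bits selected by
 * IBS_OP_MAX_CNT_EXT_MASK are carried over into the config unshifted.
 */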
0437 
0438 static void perf_ibs_stop(struct perf_event *event, int flags)
0439 {
0440     struct hw_perf_event *hwc = &event->hw;
0441     struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
0442     struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
0443     u64 config;
0444     int stopping;
0445 
0446     if (test_and_set_bit(IBS_STOPPING, pcpu->state))
0447         return;
0448 
0449     stopping = test_bit(IBS_STARTED, pcpu->state);
0450 
0451     if (!stopping && (hwc->state & PERF_HES_UPTODATE))
0452         return;
0453 
0454     rdmsrl(hwc->config_base, config);
0455 
0456     if (stopping) {
0457         /*
0458          * Set STOPPED before disabling the hardware, such that it
0459          * must be visible to NMIs the moment we clear the EN bit,
0460          * at which point we can generate an !VALID sample which
0461          * we need to consume.
0462          */
0463         set_bit(IBS_STOPPED, pcpu->state);
0464         perf_ibs_disable_event(perf_ibs, hwc, config);
0465         /*
0466          * Clear STARTED after disabling the hardware; if it were
0467          * cleared before, an NMI hitting after the clear but before
0468          * the EN bit is cleared might think it a spurious NMI and not
0469          * handle it.
0470          *
0471          * Clearing it after, however, creates the problem of the NMI
0472          * handler seeing STARTED but not having a valid sample.
0473          */
0474         clear_bit(IBS_STARTED, pcpu->state);
0475         WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
0476         hwc->state |= PERF_HES_STOPPED;
0477     }
0478 
0479     if (hwc->state & PERF_HES_UPTODATE)
0480         return;
0481 
0482     /*
0483      * Clear the valid bit so we do not count rollovers on update;
0484      * rollovers are only accounted for in the irq handler.
0485      */
0486     config &= ~perf_ibs->valid_mask;
0487 
0488     perf_ibs_event_update(perf_ibs, event, &config);
0489     hwc->state |= PERF_HES_UPTODATE;
0490 }
0491 
0492 static int perf_ibs_add(struct perf_event *event, int flags)
0493 {
0494     struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
0495     struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
0496 
0497     if (test_and_set_bit(IBS_ENABLED, pcpu->state))
0498         return -ENOSPC;
0499 
0500     event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
0501 
0502     pcpu->event = event;
0503 
0504     if (flags & PERF_EF_START)
0505         perf_ibs_start(event, PERF_EF_RELOAD);
0506 
0507     return 0;
0508 }
0509 
0510 static void perf_ibs_del(struct perf_event *event, int flags)
0511 {
0512     struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
0513     struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
0514 
0515     if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
0516         return;
0517 
0518     perf_ibs_stop(event, PERF_EF_UPDATE);
0519 
0520     pcpu->event = NULL;
0521 
0522     perf_event_update_userpage(event);
0523 }
0524 
0525 static void perf_ibs_read(struct perf_event *event) { }
0526 
0527 /*
0528  * We need to initialize with an empty group if all attributes in the
0529  * group are dynamic.
0530  */
0531 static struct attribute *attrs_empty[] = {
0532     NULL,
0533 };
0534 
0535 static struct attribute_group empty_format_group = {
0536     .name = "format",
0537     .attrs = attrs_empty,
0538 };
0539 
0540 static struct attribute_group empty_caps_group = {
0541     .name = "caps",
0542     .attrs = attrs_empty,
0543 };
0544 
0545 static const struct attribute_group *empty_attr_groups[] = {
0546     &empty_format_group,
0547     &empty_caps_group,
0548     NULL,
0549 };
0550 
0551 PMU_FORMAT_ATTR(rand_en,    "config:57");
0552 PMU_FORMAT_ATTR(cnt_ctl,    "config:19");
0553 PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59");
0554 PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16");
0555 PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1");
0556 
0557 static umode_t
0558 zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i)
0559 {
0560     return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0;
0561 }
0562 
0563 static struct attribute *rand_en_attrs[] = {
0564     &format_attr_rand_en.attr,
0565     NULL,
0566 };
0567 
0568 static struct attribute *fetch_l3missonly_attrs[] = {
0569     &fetch_l3missonly.attr.attr,
0570     NULL,
0571 };
0572 
0573 static struct attribute *zen4_ibs_extensions_attrs[] = {
0574     &zen4_ibs_extensions.attr.attr,
0575     NULL,
0576 };
0577 
0578 static struct attribute_group group_rand_en = {
0579     .name = "format",
0580     .attrs = rand_en_attrs,
0581 };
0582 
0583 static struct attribute_group group_fetch_l3missonly = {
0584     .name = "format",
0585     .attrs = fetch_l3missonly_attrs,
0586     .is_visible = zen4_ibs_extensions_is_visible,
0587 };
0588 
0589 static struct attribute_group group_zen4_ibs_extensions = {
0590     .name = "caps",
0591     .attrs = zen4_ibs_extensions_attrs,
0592     .is_visible = zen4_ibs_extensions_is_visible,
0593 };
0594 
0595 static const struct attribute_group *fetch_attr_groups[] = {
0596     &group_rand_en,
0597     &empty_caps_group,
0598     NULL,
0599 };
0600 
0601 static const struct attribute_group *fetch_attr_update[] = {
0602     &group_fetch_l3missonly,
0603     &group_zen4_ibs_extensions,
0604     NULL,
0605 };
0606 
0607 static umode_t
0608 cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i)
0609 {
0610     return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0;
0611 }
0612 
0613 static struct attribute *cnt_ctl_attrs[] = {
0614     &format_attr_cnt_ctl.attr,
0615     NULL,
0616 };
0617 
0618 static struct attribute *op_l3missonly_attrs[] = {
0619     &op_l3missonly.attr.attr,
0620     NULL,
0621 };
0622 
0623 static struct attribute_group group_cnt_ctl = {
0624     .name = "format",
0625     .attrs = cnt_ctl_attrs,
0626     .is_visible = cnt_ctl_is_visible,
0627 };
0628 
0629 static struct attribute_group group_op_l3missonly = {
0630     .name = "format",
0631     .attrs = op_l3missonly_attrs,
0632     .is_visible = zen4_ibs_extensions_is_visible,
0633 };
0634 
0635 static const struct attribute_group *op_attr_update[] = {
0636     &group_cnt_ctl,
0637     &group_op_l3missonly,
0638     &group_zen4_ibs_extensions,
0639     NULL,
0640 };
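
/*
 * The format and caps attribute groups above show up in sysfs, e.g. (paths
 * and availability depend on the running kernel and on ibs_caps):
 *
 *   /sys/bus/event_source/devices/ibs_fetch/format/rand_en
 *   /sys/bus/event_source/devices/ibs_op/format/cnt_ctl
 *   /sys/bus/event_source/devices/ibs_op/caps/zen4_ibs_extensions
 *
 * which lets the perf tool parse events such as:
 *
 *   perf record -a -e ibs_op/cnt_ctl=1/ ...
 *   perf record -a -e ibs_fetch/rand_en=1,l3missonly=1/ ...
 */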
0641 
0642 static struct perf_ibs perf_ibs_fetch = {
0643     .pmu = {
0644         .task_ctx_nr    = perf_invalid_context,
0645 
0646         .event_init = perf_ibs_init,
0647         .add        = perf_ibs_add,
0648         .del        = perf_ibs_del,
0649         .start      = perf_ibs_start,
0650         .stop       = perf_ibs_stop,
0651         .read       = perf_ibs_read,
0652         .capabilities   = PERF_PMU_CAP_NO_EXCLUDE,
0653     },
0654     .msr            = MSR_AMD64_IBSFETCHCTL,
0655     .config_mask        = IBS_FETCH_CONFIG_MASK,
0656     .cnt_mask       = IBS_FETCH_MAX_CNT,
0657     .enable_mask        = IBS_FETCH_ENABLE,
0658     .valid_mask     = IBS_FETCH_VAL,
0659     .max_period     = IBS_FETCH_MAX_CNT << 4,
0660     .offset_mask        = { MSR_AMD64_IBSFETCH_REG_MASK },
0661     .offset_max     = MSR_AMD64_IBSFETCH_REG_COUNT,
0662 
0663     .get_count      = get_ibs_fetch_count,
0664 };
0665 
0666 static struct perf_ibs perf_ibs_op = {
0667     .pmu = {
0668         .task_ctx_nr    = perf_invalid_context,
0669 
0670         .event_init = perf_ibs_init,
0671         .add        = perf_ibs_add,
0672         .del        = perf_ibs_del,
0673         .start      = perf_ibs_start,
0674         .stop       = perf_ibs_stop,
0675         .read       = perf_ibs_read,
0676         .capabilities   = PERF_PMU_CAP_NO_EXCLUDE,
0677     },
0678     .msr            = MSR_AMD64_IBSOPCTL,
0679     .config_mask        = IBS_OP_CONFIG_MASK,
0680     .cnt_mask       = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT |
0681                   IBS_OP_CUR_CNT_RAND,
0682     .enable_mask        = IBS_OP_ENABLE,
0683     .valid_mask     = IBS_OP_VAL,
0684     .max_period     = IBS_OP_MAX_CNT << 4,
0685     .offset_mask        = { MSR_AMD64_IBSOP_REG_MASK },
0686     .offset_max     = MSR_AMD64_IBSOP_REG_COUNT,
0687 
0688     .get_count      = get_ibs_op_count,
0689 };
0690 
0691 static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
0692 {
0693     struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
0694     struct perf_event *event = pcpu->event;
0695     struct hw_perf_event *hwc;
0696     struct perf_sample_data data;
0697     struct perf_raw_record raw;
0698     struct pt_regs regs;
0699     struct perf_ibs_data ibs_data;
0700     int offset, size, check_rip, offset_max, throttle = 0;
0701     unsigned int msr;
0702     u64 *buf, *config, period, new_config = 0;
0703 
0704     if (!test_bit(IBS_STARTED, pcpu->state)) {
0705 fail:
0706         /*
0707          * Catch spurious interrupts after stopping IBS: After
0708          * disabling IBS there could still be incoming NMIs
0709          * with samples that even have the valid bit cleared.
0710          * Mark all these NMIs as handled.
0711          */
0712         if (test_and_clear_bit(IBS_STOPPED, pcpu->state))
0713             return 1;
0714 
0715         return 0;
0716     }
0717 
0718     if (WARN_ON_ONCE(!event))
0719         goto fail;
0720 
0721     hwc = &event->hw;
0722     msr = hwc->config_base;
0723     buf = ibs_data.regs;
0724     rdmsrl(msr, *buf);
0725     if (!(*buf++ & perf_ibs->valid_mask))
0726         goto fail;
0727 
0728     config = &ibs_data.regs[0];
0729     perf_ibs_event_update(perf_ibs, event, config);
0730     perf_sample_data_init(&data, 0, hwc->last_period);
0731     if (!perf_ibs_set_period(perf_ibs, hwc, &period))
0732         goto out;   /* no sw counter overflow */
0733 
0734     ibs_data.caps = ibs_caps;
0735     size = 1;
0736     offset = 1;
0737     check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
0738     if (event->attr.sample_type & PERF_SAMPLE_RAW)
0739         offset_max = perf_ibs->offset_max;
0740     else if (check_rip)
0741         offset_max = 3;
0742     else
0743         offset_max = 1;
0744     do {
0745         rdmsrl(msr + offset, *buf++);
0746         size++;
0747         offset = find_next_bit(perf_ibs->offset_mask,
0748                        perf_ibs->offset_max,
0749                        offset + 1);
0750     } while (offset < offset_max);
0751     /*
0752      * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately
0753      * depending on their availability.
0754      * Can't add them to offset_max as they are staggered.
0755      */
0756     if (event->attr.sample_type & PERF_SAMPLE_RAW) {
0757         if (perf_ibs == &perf_ibs_op) {
0758             if (ibs_caps & IBS_CAPS_BRNTRGT) {
0759                 rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
0760                 size++;
0761             }
0762             if (ibs_caps & IBS_CAPS_OPDATA4) {
0763                 rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++);
0764                 size++;
0765             }
0766         }
0767         if (perf_ibs == &perf_ibs_fetch && (ibs_caps & IBS_CAPS_FETCHCTLEXTD)) {
0768             rdmsrl(MSR_AMD64_ICIBSEXTDCTL, *buf++);
0769             size++;
0770         }
0771     }
0772     ibs_data.size = sizeof(u64) * size;
0773 
0774     regs = *iregs;
0775     if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
0776         regs.flags &= ~PERF_EFLAGS_EXACT;
0777     } else {
0778         /* Workaround for erratum #1197 */
0779         if (perf_ibs->fetch_ignore_if_zero_rip && !(ibs_data.regs[1]))
0780             goto out;
0781 
0782         set_linear_ip(&regs, ibs_data.regs[1]);
0783         regs.flags |= PERF_EFLAGS_EXACT;
0784     }
0785 
0786     if (event->attr.sample_type & PERF_SAMPLE_RAW) {
0787         raw = (struct perf_raw_record){
0788             .frag = {
0789                 .size = sizeof(u32) + ibs_data.size,
0790                 .data = ibs_data.data,
0791             },
0792         };
0793         data.raw = &raw;
0794     }
0795 
0796     /*
0797      * The rip recorded by IbsOpRip will not be consistent with the rsp and
0798      * rbp recorded as part of the interrupt regs. Thus we need to use the
0799      * rip from the interrupt regs while unwinding the call stack.
0800      */
0801     if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
0802         data.callchain = perf_callchain(event, iregs);
0803 
0804     throttle = perf_event_overflow(event, &data, &regs);
0805 out:
0806     if (throttle) {
0807         perf_ibs_stop(event, 0);
0808     } else {
0809         if (perf_ibs == &perf_ibs_op) {
0810             if (ibs_caps & IBS_CAPS_OPCNTEXT) {
0811                 new_config = period & IBS_OP_MAX_CNT_EXT_MASK;
0812                 period &= ~IBS_OP_MAX_CNT_EXT_MASK;
0813             }
0814             if ((ibs_caps & IBS_CAPS_RDWROPCNT) && (*config & IBS_OP_CNT_CTL))
0815                 new_config |= *config & IBS_OP_CUR_CNT_RAND;
0816         }
0817         new_config |= period >> 4;
0818 
0819         perf_ibs_enable_event(perf_ibs, hwc, new_config);
0820     }
0821 
0822     perf_event_update_userpage(event);
0823 
0824     return 1;
0825 }
0826 
0827 static int
0828 perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
0829 {
0830     u64 stamp = sched_clock();
0831     int handled = 0;
0832 
0833     handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
0834     handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
0835 
0836     if (handled)
0837         inc_irq_stat(apic_perf_irqs);
0838 
0839     perf_sample_event_took(sched_clock() - stamp);
0840 
0841     return handled;
0842 }
0843 NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
0844 
0845 static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
0846 {
0847     struct cpu_perf_ibs __percpu *pcpu;
0848     int ret;
0849 
0850     pcpu = alloc_percpu(struct cpu_perf_ibs);
0851     if (!pcpu)
0852         return -ENOMEM;
0853 
0854     perf_ibs->pcpu = pcpu;
0855 
0856     ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
0857     if (ret) {
0858         perf_ibs->pcpu = NULL;
0859         free_percpu(pcpu);
0860     }
0861 
0862     return ret;
0863 }
0864 
0865 static __init int perf_ibs_fetch_init(void)
0866 {
0867     /*
0868      * Some chips fail to reset the fetch count when it is written; instead
0869      * they need a 0-1 transition of IbsFetchEn.
0870      */
0871     if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18)
0872         perf_ibs_fetch.fetch_count_reset_broken = 1;
0873 
0874     if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10)
0875         perf_ibs_fetch.fetch_ignore_if_zero_rip = 1;
0876 
0877     if (ibs_caps & IBS_CAPS_ZEN4)
0878         perf_ibs_fetch.config_mask |= IBS_FETCH_L3MISSONLY;
0879 
0880     perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups;
0881     perf_ibs_fetch.pmu.attr_update = fetch_attr_update;
0882 
0883     return perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
0884 }
0885 
0886 static __init int perf_ibs_op_init(void)
0887 {
0888     if (ibs_caps & IBS_CAPS_OPCNT)
0889         perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
0890 
0891     if (ibs_caps & IBS_CAPS_OPCNTEXT) {
0892         perf_ibs_op.max_period  |= IBS_OP_MAX_CNT_EXT_MASK;
0893         perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK;
0894         perf_ibs_op.cnt_mask    |= IBS_OP_MAX_CNT_EXT_MASK;
0895     }
0896 
0897     if (ibs_caps & IBS_CAPS_ZEN4)
0898         perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY;
0899 
0900     perf_ibs_op.pmu.attr_groups = empty_attr_groups;
0901     perf_ibs_op.pmu.attr_update = op_attr_update;
0902 
0903     return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
0904 }
0905 
0906 static __init int perf_event_ibs_init(void)
0907 {
0908     int ret;
0909 
0910     ret = perf_ibs_fetch_init();
0911     if (ret)
0912         return ret;
0913 
0914     ret = perf_ibs_op_init();
0915     if (ret)
0916         goto err_op;
0917 
0918     ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
0919     if (ret)
0920         goto err_nmi;
0921 
0922     pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps);
0923     return 0;
0924 
0925 err_nmi:
0926     perf_pmu_unregister(&perf_ibs_op.pmu);
0927     free_percpu(perf_ibs_op.pcpu);
0928     perf_ibs_op.pcpu = NULL;
0929 err_op:
0930     perf_pmu_unregister(&perf_ibs_fetch.pmu);
0931     free_percpu(perf_ibs_fetch.pcpu);
0932     perf_ibs_fetch.pcpu = NULL;
0933 
0934     return ret;
0935 }
0936 
0937 #else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
0938 
0939 static __init int perf_event_ibs_init(void)
0940 {
0941     return 0;
0942 }
0943 
0944 #endif
0945 
0946 /* IBS - apic initialization, for perf and oprofile */
0947 
0948 static __init u32 __get_ibs_caps(void)
0949 {
0950     u32 caps;
0951     unsigned int max_level;
0952 
0953     if (!boot_cpu_has(X86_FEATURE_IBS))
0954         return 0;
0955 
0956     /* check IBS cpuid feature flags */
0957     max_level = cpuid_eax(0x80000000);
0958     if (max_level < IBS_CPUID_FEATURES)
0959         return IBS_CAPS_DEFAULT;
0960 
0961     caps = cpuid_eax(IBS_CPUID_FEATURES);
0962     if (!(caps & IBS_CAPS_AVAIL))
0963         /* cpuid flags not valid */
0964         return IBS_CAPS_DEFAULT;
0965 
0966     return caps;
0967 }
0968 
0969 u32 get_ibs_caps(void)
0970 {
0971     return ibs_caps;
0972 }
0973 
0974 EXPORT_SYMBOL(get_ibs_caps);
0975 
0976 static inline int get_eilvt(int offset)
0977 {
0978     return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
0979 }
0980 
0981 static inline int put_eilvt(int offset)
0982 {
0983     return !setup_APIC_eilvt(offset, 0, 0, 1);
0984 }
0985 
0986 /*
0987  * Check and reserve APIC extended interrupt LVT offset for IBS if available.
0988  */
0989 static inline int ibs_eilvt_valid(void)
0990 {
0991     int offset;
0992     u64 val;
0993     int valid = 0;
0994 
0995     preempt_disable();
0996 
0997     rdmsrl(MSR_AMD64_IBSCTL, val);
0998     offset = val & IBSCTL_LVT_OFFSET_MASK;
0999 
1000     if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
1001         pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
1002                smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
1003         goto out;
1004     }
1005 
1006     if (!get_eilvt(offset)) {
1007         pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
1008                smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
1009         goto out;
1010     }
1011 
1012     valid = 1;
1013 out:
1014     preempt_enable();
1015 
1016     return valid;
1017 }
1018 
1019 static int setup_ibs_ctl(int ibs_eilvt_off)
1020 {
1021     struct pci_dev *cpu_cfg;
1022     int nodes;
1023     u32 value = 0;
1024 
1025     nodes = 0;
1026     cpu_cfg = NULL;
1027     do {
1028         cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
1029                      PCI_DEVICE_ID_AMD_10H_NB_MISC,
1030                      cpu_cfg);
1031         if (!cpu_cfg)
1032             break;
1033         ++nodes;
1034         pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
1035                        | IBSCTL_LVT_OFFSET_VALID);
1036         pci_read_config_dword(cpu_cfg, IBSCTL, &value);
1037         if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
1038             pci_dev_put(cpu_cfg);
1039             pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n",
1040                  value);
1041             return -EINVAL;
1042         }
1043     } while (1);
1044 
1045     if (!nodes) {
1046         pr_debug("No CPU node configured for IBS\n");
1047         return -ENODEV;
1048     }
1049 
1050     return 0;
1051 }
1052 
1053 /*
1054  * This runs only on the current cpu. We try to find an LVT offset and
1055  * set up the local APIC. For this we must disable preemption. On
1056  * success we initialize all nodes with this offset; this then updates
1057  * the offset in the per-node IBS_CTL MSR. The per-cpu APIC setup of
1058  * the IBS interrupt vector is handled by the cpu hotplug callback
1059  * x86_pmu_amd_ibs_starting_cpu(), which uses the new offset.
1060  */
1061 static void force_ibs_eilvt_setup(void)
1062 {
1063     int offset;
1064     int ret;
1065 
1066     preempt_disable();
1067     /* find the next free available EILVT entry, skip offset 0 */
1068     for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
1069         if (get_eilvt(offset))
1070             break;
1071     }
1072     preempt_enable();
1073 
1074     if (offset == APIC_EILVT_NR_MAX) {
1075         pr_debug("No EILVT entry available\n");
1076         return;
1077     }
1078 
1079     ret = setup_ibs_ctl(offset);
1080     if (ret)
1081         goto out;
1082 
1083     if (!ibs_eilvt_valid())
1084         goto out;
1085 
1086     pr_info("LVT offset %d assigned\n", offset);
1087 
1088     return;
1089 out:
1090     preempt_disable();
1091     put_eilvt(offset);
1092     preempt_enable();
1093     return;
1094 }
1095 
1096 static void ibs_eilvt_setup(void)
1097 {
1098     /*
1099      * Force LVT offset assignment for family 10h: The offsets are
1100      * not assigned by the BIOS for this family, so the OS is
1101      * responsible for doing it. If the OS assignment fails, fall
1102      * back to the BIOS settings and try to set that up.
1103      */
1104     if (boot_cpu_data.x86 == 0x10)
1105         force_ibs_eilvt_setup();
1106 }
1107 
1108 static inline int get_ibs_lvt_offset(void)
1109 {
1110     u64 val;
1111 
1112     rdmsrl(MSR_AMD64_IBSCTL, val);
1113     if (!(val & IBSCTL_LVT_OFFSET_VALID))
1114         return -EINVAL;
1115 
1116     return val & IBSCTL_LVT_OFFSET_MASK;
1117 }
1118 
1119 static void setup_APIC_ibs(void)
1120 {
1121     int offset;
1122 
1123     offset = get_ibs_lvt_offset();
1124     if (offset < 0)
1125         goto failed;
1126 
1127     if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
1128         return;
1129 failed:
1130     pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
1131         smp_processor_id());
1132 }
1133 
1134 static void clear_APIC_ibs(void)
1135 {
1136     int offset;
1137 
1138     offset = get_ibs_lvt_offset();
1139     if (offset >= 0)
1140         setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
1141 }
1142 
1143 static int x86_pmu_amd_ibs_starting_cpu(unsigned int cpu)
1144 {
1145     setup_APIC_ibs();
1146     return 0;
1147 }
1148 
1149 #ifdef CONFIG_PM
1150 
1151 static int perf_ibs_suspend(void)
1152 {
1153     clear_APIC_ibs();
1154     return 0;
1155 }
1156 
1157 static void perf_ibs_resume(void)
1158 {
1159     ibs_eilvt_setup();
1160     setup_APIC_ibs();
1161 }
1162 
1163 static struct syscore_ops perf_ibs_syscore_ops = {
1164     .resume     = perf_ibs_resume,
1165     .suspend    = perf_ibs_suspend,
1166 };
1167 
1168 static void perf_ibs_pm_init(void)
1169 {
1170     register_syscore_ops(&perf_ibs_syscore_ops);
1171 }
1172 
1173 #else
1174 
1175 static inline void perf_ibs_pm_init(void) { }
1176 
1177 #endif
1178 
1179 static int x86_pmu_amd_ibs_dying_cpu(unsigned int cpu)
1180 {
1181     clear_APIC_ibs();
1182     return 0;
1183 }
1184 
1185 static __init int amd_ibs_init(void)
1186 {
1187     u32 caps;
1188 
1189     caps = __get_ibs_caps();
1190     if (!caps)
1191         return -ENODEV; /* ibs not supported by the cpu */
1192 
1193     ibs_eilvt_setup();
1194 
1195     if (!ibs_eilvt_valid())
1196         return -EINVAL;
1197 
1198     perf_ibs_pm_init();
1199 
1200     ibs_caps = caps;
1201     /* make ibs_caps visible to other cpus: */
1202     smp_mb();
1203     /*
1204      * x86_pmu_amd_ibs_starting_cpu will be called from core on
1205      * all online cpus.
1206      */
1207     cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
1208               "perf/x86/amd/ibs:starting",
1209               x86_pmu_amd_ibs_starting_cpu,
1210               x86_pmu_amd_ibs_dying_cpu);
1211 
1212     return perf_event_ibs_init();
1213 }
1214 
1215 /* Since we need the pci subsystem to init ibs we can't do this earlier: */
1216 device_initcall(amd_ibs_init);