Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Support Intel/AMD RAPL energy consumption counters
0004  * Copyright (C) 2013 Google, Inc., Stephane Eranian
0005  *
0006  * Intel RAPL interface is specified in the IA-32 Manual Vol3b
0007  * section 14.7.1 (September 2013)
0008  *
0009  * AMD RAPL interface for Fam17h is described in the public PPR:
0010  * https://bugzilla.kernel.org/show_bug.cgi?id=206537
0011  *
0012  * RAPL provides more controls than just reporting energy consumption
0013  * however here we only expose the 3 energy consumption free running
0014  * counters (pp0, pkg, dram).
0015  *
0016  * Each of those counters increments in a power unit defined by the
0017  * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
0018  * but it can vary.
0019  *
0020  * Counter to rapl events mappings:
0021  *
0022  *  pp0 counter: consumption of all physical cores (power plane 0)
0023  *    event: rapl_energy_cores
0024  *    perf code: 0x1
0025  *
0026  *  pkg counter: consumption of the whole processor package
0027  *    event: rapl_energy_pkg
0028  *    perf code: 0x2
0029  *
0030  * dram counter: consumption of the dram domain (servers only)
0031  *    event: rapl_energy_dram
0032  *    perf code: 0x3
0033  *
0034  * gpu counter: consumption of the builtin-gpu domain (client only)
0035  *    event: rapl_energy_gpu
0036  *    perf code: 0x4
0037  *
0038  *  psys counter: consumption of the builtin-psys domain (client only)
0039  *    event: rapl_energy_psys
0040  *    perf code: 0x5
0041  *
0042  * We manage those counters as free running (read-only). They may be
0043  * used simultaneously by other tools, such as turbostat.
0044  *
0045  * The events only support system-wide mode counting. There is no
0046  * sampling support because it does not make sense and is not
0047  * supported by the RAPL hardware.
0048  *
0049  * Because we want to avoid floating-point operations in the kernel,
0050  * the events are all reported in fixed point arithmetic (32.32).
0051  * Tools must adjust the counts to convert them to Watts using
0052  * the duration of the measurement. Tools may use a function such as
0053  * ldexp(raw_count, -32);
0054  */
0055 
0056 #define pr_fmt(fmt) "RAPL PMU: " fmt
0057 
0058 #include <linux/module.h>
0059 #include <linux/slab.h>
0060 #include <linux/perf_event.h>
0061 #include <linux/nospec.h>
0062 #include <asm/cpu_device_id.h>
0063 #include <asm/intel-family.h>
0064 #include "perf_event.h"
0065 #include "probe.h"
0066 
0067 MODULE_LICENSE("GPL");
0068 
/*
 * RAPL energy status counters
 *
 * Enum value + 1 is the perf event code passed in attr->config
 * (see the "perf code" list in the file header comment).
 */
enum perf_rapl_events {
	PERF_RAPL_PP0 = 0,		/* all cores */
	PERF_RAPL_PKG,			/* entire package */
	PERF_RAPL_RAM,			/* DRAM */
	PERF_RAPL_PP1,			/* gpu */
	PERF_RAPL_PSYS,			/* psys */

	PERF_RAPL_MAX,
	NR_RAPL_DOMAINS = PERF_RAPL_MAX,
};

/* Human-readable domain names, indexed by enum perf_rapl_events. */
static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
	"pp0-core",
	"package",
	"dram",
	"pp1-gpu",
	"psys",
};

/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK	0xFFULL
#define RAPL_CNTR_WIDTH 32

/* Declare a read-only sysfs event attribute backed by a fixed string. */
#define RAPL_EVENT_ATTR_STR(_name, v, str)					\
static struct perf_pmu_events_attr event_attr_##v = {				\
	.attr		= __ATTR(_name, 0444, perf_event_sysfs_show, NULL),	\
	.id		= 0,							\
	.event_str	= str,							\
};
0104 
/* Per-die PMU instance state. */
struct rapl_pmu {
	raw_spinlock_t		lock;		/* protects n_active and active_list */
	int			n_active;	/* number of currently counting events */
	int			cpu;		/* CPU designated to read this die's MSRs */
	struct list_head	active_list;	/* events currently counting */
	struct pmu		*pmu;		/* back pointer to rapl_pmus->pmu */
	ktime_t			timer_interval;	/* overflow-polling period */
	struct hrtimer		hrtimer;	/* periodic overflow-polling timer */
};

/* Top-level container: one rapl_pmu slot per logical die. */
struct rapl_pmus {
	struct pmu		pmu;
	unsigned int		maxdie;
	struct rapl_pmu		*pmus[];	/* indexed by logical die id */
};

/* Fixed energy-unit quirks applied on top of the power-unit MSR value. */
enum rapl_unit_quirk {
	RAPL_UNIT_QUIRK_NONE,
	RAPL_UNIT_QUIRK_INTEL_HSW,
	RAPL_UNIT_QUIRK_INTEL_SPR,
};

/* Per-CPU-model description of the available RAPL interface. */
struct rapl_model {
	struct perf_msr *rapl_msrs;	/* per-domain MSR table */
	unsigned long	events;		/* bitmask of supported PERF_RAPL_* events */
	unsigned int	msr_power_unit;	/* MSR holding the energy unit bits */
	enum rapl_unit_quirk	unit_quirk;
};

 /* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus;
static cpumask_t rapl_cpu_mask;		/* one event-collecting CPU per die */
static unsigned int rapl_cntr_mask;	/* bitmask of successfully probed domains */
static u64 rapl_timer_ms;		/* overflow-polling period in ms */
static struct perf_msr *rapl_msrs;	/* active model's MSR table */
0141 
0142 static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
0143 {
0144     unsigned int dieid = topology_logical_die_id(cpu);
0145 
0146     /*
0147      * The unsigned check also catches the '-1' return value for non
0148      * existent mappings in the topology map.
0149      */
0150     return dieid < rapl_pmus->maxdie ? rapl_pmus->pmus[dieid] : NULL;
0151 }
0152 
0153 static inline u64 rapl_read_counter(struct perf_event *event)
0154 {
0155     u64 raw;
0156     rdmsrl(event->hw.event_base, raw);
0157     return raw;
0158 }
0159 
0160 static inline u64 rapl_scale(u64 v, int cfg)
0161 {
0162     if (cfg > NR_RAPL_DOMAINS) {
0163         pr_warn("Invalid domain %d, failed to scale data\n", cfg);
0164         return v;
0165     }
0166     /*
0167      * scale delta to smallest unit (1/2^32)
0168      * users must then scale back: count * 1/(1e9*2^32) to get Joules
0169      * or use ldexp(count, -32).
0170      * Watts = Joules/Time delta
0171      */
0172     return v << (32 - rapl_hw_unit[cfg - 1]);
0173 }
0174 
/*
 * Read the event's MSR, compute the delta since the previous read,
 * scale it to 2^-32 Joule units and accumulate it into event->count.
 * Returns the new raw MSR value.
 *
 * The cmpxchg retry loop makes concurrent updates (hrtimer vs. perf
 * read) account each counter increment exactly once.
 */
static u64 rapl_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_raw_count, new_raw_count;
	s64 delta, sdelta;
	int shift = RAPL_CNTR_WIDTH;

again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdmsrl(event->hw.event_base, new_raw_count);

	/* Another updater won the race: retry with the fresh prev_count. */
	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count) {
		cpu_relax();
		goto again;
	}

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	sdelta = rapl_scale(delta, event->hw.config);

	local64_add(sdelta, &event->count);

	return new_raw_count;
}
0209 
0210 static void rapl_start_hrtimer(struct rapl_pmu *pmu)
0211 {
0212        hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
0213              HRTIMER_MODE_REL_PINNED);
0214 }
0215 
/*
 * Periodic timer callback: refresh every active event on this PMU so
 * the 32-bit free-running energy counters cannot wrap unnoticed
 * between reads. Re-arms itself while any event remains active.
 */
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
	struct perf_event *event;
	unsigned long flags;

	/* Last event was stopped meanwhile: let the timer die. */
	if (!pmu->n_active)
		return HRTIMER_NORESTART;

	raw_spin_lock_irqsave(&pmu->lock, flags);

	list_for_each_entry(event, &pmu->active_list, active_entry)
		rapl_event_update(event);

	raw_spin_unlock_irqrestore(&pmu->lock, flags);

	hrtimer_forward_now(hrtimer, pmu->timer_interval);

	return HRTIMER_RESTART;
}
0236 
0237 static void rapl_hrtimer_init(struct rapl_pmu *pmu)
0238 {
0239     struct hrtimer *hr = &pmu->hrtimer;
0240 
0241     hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
0242     hr->function = rapl_hrtimer_handle;
0243 }
0244 
/*
 * Start counting: snapshot the current MSR value as the baseline and
 * put the event on the active list. The first active event arms the
 * overflow-polling hrtimer. Caller must hold pmu->lock.
 */
static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
				   struct perf_event *event)
{
	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	event->hw.state = 0;

	list_add_tail(&event->active_entry, &pmu->active_list);

	/* Baseline for the first delta computed by rapl_event_update(). */
	local64_set(&event->hw.prev_count, rapl_read_counter(event));

	pmu->n_active++;
	if (pmu->n_active == 1)
		rapl_start_hrtimer(pmu);
}
0261 
0262 static void rapl_pmu_event_start(struct perf_event *event, int mode)
0263 {
0264     struct rapl_pmu *pmu = event->pmu_private;
0265     unsigned long flags;
0266 
0267     raw_spin_lock_irqsave(&pmu->lock, flags);
0268     __rapl_pmu_event_start(pmu, event);
0269     raw_spin_unlock_irqrestore(&pmu->lock, flags);
0270 }
0271 
/*
 * perf ->stop() callback: take the event off the active list, cancel
 * the hrtimer when the last event stops, and optionally fold any
 * remaining counter delta into event->count (PERF_EF_UPDATE).
 */
static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = event->pmu_private;
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	raw_spin_lock_irqsave(&pmu->lock, flags);

	/* mark event as deactivated and stopped */
	if (!(hwc->state & PERF_HES_STOPPED)) {
		WARN_ON_ONCE(pmu->n_active <= 0);
		pmu->n_active--;
		/* No active events left: polling the MSRs is pointless. */
		if (pmu->n_active == 0)
			hrtimer_cancel(&pmu->hrtimer);

		list_del(&event->active_entry);

		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	/* check if update of sw counter is necessary */
	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of a event
		 * that we are disabling:
		 */
		rapl_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}

	raw_spin_unlock_irqrestore(&pmu->lock, flags);
}
0305 
0306 static int rapl_pmu_event_add(struct perf_event *event, int mode)
0307 {
0308     struct rapl_pmu *pmu = event->pmu_private;
0309     struct hw_perf_event *hwc = &event->hw;
0310     unsigned long flags;
0311 
0312     raw_spin_lock_irqsave(&pmu->lock, flags);
0313 
0314     hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
0315 
0316     if (mode & PERF_EF_START)
0317         __rapl_pmu_event_start(pmu, event);
0318 
0319     raw_spin_unlock_irqrestore(&pmu->lock, flags);
0320 
0321     return 0;
0322 }
0323 
/* perf ->del() callback: stop the event and drain its final delta. */
static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}
0328 
/*
 * perf ->event_init() callback: validate attr->config, look up the
 * per-die PMU for event->cpu and cache the MSR address and config in
 * the event. Only cpu-bound, non-sampling events are accepted.
 */
static int rapl_pmu_event_init(struct perf_event *event)
{
	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
	int bit, ret = 0;
	struct rapl_pmu *pmu;

	/* only look at RAPL events */
	if (event->attr.type != rapl_pmus->pmu.type)
		return -ENOENT;

	/* check only supported bits are set */
	if (event->attr.config & ~RAPL_EVENT_MASK)
		return -EINVAL;

	if (event->cpu < 0)
		return -EINVAL;

	event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;

	/* cfg is 1-based: 0 and anything above NR_RAPL_DOMAINS is invalid */
	if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
		return -EINVAL;

	/* clamp cfg to defeat speculative out-of-bounds table access */
	cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
	bit = cfg - 1;

	/* check event supported */
	if (!(rapl_cntr_mask & (1 << bit)))
		return -EINVAL;

	/* unsupported modes and filters */
	if (event->attr.sample_period) /* no sampling */
		return -EINVAL;

	/* must be done before validate_group */
	pmu = cpu_to_rapl_pmu(event->cpu);
	if (!pmu)
		return -EINVAL;
	/* redirect the event to the die's designated reader CPU */
	event->cpu = pmu->cpu;
	event->pmu_private = pmu;
	event->hw.event_base = rapl_msrs[bit].msr;
	event->hw.config = cfg;
	event->hw.idx = bit;

	return ret;
}
0374 
/* perf ->read() callback: refresh event->count from the MSR. */
static void rapl_pmu_event_read(struct perf_event *event)
{
	rapl_event_update(event);
}
0379 
/* sysfs "cpumask" show: the CPUs currently collecting RAPL events. */
static ssize_t rapl_get_attr_cpumask(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}

static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

static struct attribute *rapl_pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
	.attrs = rapl_pmu_attrs,
};
0396 
/* Sysfs event names; the event codes match enum perf_rapl_events + 1. */
RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys,   rapl_psys, "event=0x05");

/* All domains report in Joules once scaled by rapl_scale(). */
RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit,   rapl_psys_unit, "Joules");

/*
 * we compute in 0.23 nJ increments regardless of MSR
 * (2^-32 J, the fixed-point scale applied by rapl_scale()).
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale,   rapl_psys_scale, "2.3283064365386962890625e-10");

/*
 * There are no default events, but we need to create
 * "events" group (with empty attrs) before updating
 * it with detected events.
 */
static struct attribute *attrs_empty[] = {
	NULL,
};

static struct attribute_group rapl_pmu_events_group = {
	.name = "events",
	.attrs = attrs_empty,
};

/* "format" group: the event code occupies config bits 0-7. */
PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group rapl_pmu_format_group = {
	.name = "format",
	.attrs = rapl_formats_attr,
};

static const struct attribute_group *rapl_attr_groups[] = {
	&rapl_pmu_attr_group,
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};
0449 
/*
 * Per-domain "events" attribute groups. Each group is referenced from
 * the corresponding perf_msr table slot; only the groups of domains
 * that probe successfully become visible in sysfs.
 */
static struct attribute *rapl_events_cores[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_cores_scale),
	NULL,
};

static struct attribute_group rapl_events_cores_group = {
	.name  = "events",
	.attrs = rapl_events_cores,
};

static struct attribute *rapl_events_pkg[] = {
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_pkg_scale),
	NULL,
};

static struct attribute_group rapl_events_pkg_group = {
	.name  = "events",
	.attrs = rapl_events_pkg,
};

static struct attribute *rapl_events_ram[] = {
	EVENT_PTR(rapl_ram),
	EVENT_PTR(rapl_ram_unit),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute_group rapl_events_ram_group = {
	.name  = "events",
	.attrs = rapl_events_ram,
};

static struct attribute *rapl_events_gpu[] = {
	EVENT_PTR(rapl_gpu),
	EVENT_PTR(rapl_gpu_unit),
	EVENT_PTR(rapl_gpu_scale),
	NULL,
};

static struct attribute_group rapl_events_gpu_group = {
	.name  = "events",
	.attrs = rapl_events_gpu,
};

static struct attribute *rapl_events_psys[] = {
	EVENT_PTR(rapl_psys),
	EVENT_PTR(rapl_psys_unit),
	EVENT_PTR(rapl_psys_scale),
	NULL,
};

static struct attribute_group rapl_events_psys_group = {
	.name  = "events",
	.attrs = rapl_events_psys,
};
0509 
0510 static bool test_msr(int idx, void *data)
0511 {
0512     return test_bit(idx, (unsigned long *) data);
0513 }
0514 
/* Only lower 32bits of the MSR represents the energy counter */
#define RAPL_MSR_MASK 0xFFFFFFFF

static struct perf_msr intel_rapl_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, false, RAPL_MSR_MASK },
};

/*
 * Same as intel_rapl_msrs except for the PSYS flag.
 * NOTE(review): the 'true' in the PSYS slot appears to relax the probe
 * check for that domain on SPR — confirm against struct perf_msr in
 * probe.h.
 */
static struct perf_msr intel_rapl_spr_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, true, RAPL_MSR_MASK },
};

/*
 * Force to PERF_RAPL_MAX size due to:
 * - perf_msr_probe(PERF_RAPL_MAX)
 * - want to use same event codes across both architectures
 */
static struct perf_msr amd_rapl_msrs[] = {
	[PERF_RAPL_PP0]  = { 0, &rapl_events_cores_group, 0, false, 0 },
	[PERF_RAPL_PKG]  = { MSR_AMD_PKG_ENERGY_STATUS,  &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { 0, &rapl_events_ram_group,   0, false, 0 },
	[PERF_RAPL_PP1]  = { 0, &rapl_events_gpu_group,   0, false, 0 },
	[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group,  0, false, 0 },
};
0546 
/*
 * CPU hotplug teardown: if @cpu was its die's designated reader, hand
 * that role (and the perf context) to another online CPU on the same
 * die, if one exists.
 */
static int rapl_cpu_offline(unsigned int cpu)
{
	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
	int target;

	/* Check if exiting cpu is used for collecting rapl events */
	if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
		return 0;

	pmu->cpu = -1;
	/* Find a new cpu to collect rapl events */
	target = cpumask_any_but(topology_die_cpumask(cpu), cpu);

	/* Migrate rapl events to the new target */
	if (target < nr_cpu_ids) {
		cpumask_set_cpu(target, &rapl_cpu_mask);
		pmu->cpu = target;
		perf_pmu_migrate_context(pmu->pmu, cpu, target);
	}
	return 0;
}
0568 
/*
 * CPU hotplug setup: lazily allocate the per-die rapl_pmu on first
 * use, and make @cpu the die's designated event-collecting CPU unless
 * the die already has one.
 */
static int rapl_cpu_online(unsigned int cpu)
{
	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
	int target;

	if (!pmu) {
		pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
		if (!pmu)
			return -ENOMEM;

		raw_spin_lock_init(&pmu->lock);
		INIT_LIST_HEAD(&pmu->active_list);
		pmu->pmu = &rapl_pmus->pmu;
		pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
		rapl_hrtimer_init(pmu);

		rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
	}

	/*
	 * Check if there is an online cpu in the package which collects rapl
	 * events already.
	 */
	target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
	if (target < nr_cpu_ids)
		return 0;

	cpumask_set_cpu(cpu, &rapl_cpu_mask);
	pmu->cpu = cpu;
	return 0;
}
0600 
/*
 * Read the energy unit from the model's power-unit MSR, apply fixed
 * per-model quirks and derive the overflow-polling interval.
 * Returns 0 on success, -1 if the MSR cannot be read (e.g. when
 * running virtualized without the MSR exposed).
 */
static int rapl_check_hw_unit(struct rapl_model *rm)
{
	u64 msr_rapl_power_unit_bits;
	int i;

	/* protect rdmsrl() to handle virtualization */
	if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
		return -1;
	/* Energy Status Units live in bits 12:8 of the power-unit MSR. */
	for (i = 0; i < NR_RAPL_DOMAINS; i++)
		rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;

	switch (rm->unit_quirk) {
	/*
	 * DRAM domain on HSW server and KNL has fixed energy unit which can be
	 * different than the unit from power unit MSR. See
	 * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
	 * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
	 */
	case RAPL_UNIT_QUIRK_INTEL_HSW:
		rapl_hw_unit[PERF_RAPL_RAM] = 16;
		break;
	/*
	 * SPR shares the same DRAM domain energy unit as HSW, plus it
	 * also has a fixed energy unit for Psys domain.
	 */
	case RAPL_UNIT_QUIRK_INTEL_SPR:
		rapl_hw_unit[PERF_RAPL_RAM] = 16;
		rapl_hw_unit[PERF_RAPL_PSYS] = 0;
		break;
	default:
		break;
	}


	/*
	 * Calculate the timer rate:
	 * Use reference of 200W for scaling the timeout to avoid counter
	 * overflows. 200W = 200 Joules/sec
	 * Divide interval by 2 to avoid lockstep (2 * 100)
	 * if hw unit is 32, then we use 2 ms 1/200/2
	 */
	rapl_timer_ms = 2;
	if (rapl_hw_unit[0] < 32) {
		rapl_timer_ms = (1000 / (2 * 100));
		rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
	}
	return 0;
}
0649 
0650 static void __init rapl_advertise(void)
0651 {
0652     int i;
0653 
0654     pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
0655         hweight32(rapl_cntr_mask), rapl_timer_ms);
0656 
0657     for (i = 0; i < NR_RAPL_DOMAINS; i++) {
0658         if (rapl_cntr_mask & (1 << i)) {
0659             pr_info("hw unit of domain %s 2^-%d Joules\n",
0660                 rapl_domain_names[i], rapl_hw_unit[i]);
0661         }
0662     }
0663 }
0664 
0665 static void cleanup_rapl_pmus(void)
0666 {
0667     int i;
0668 
0669     for (i = 0; i < rapl_pmus->maxdie; i++)
0670         kfree(rapl_pmus->pmus[i]);
0671     kfree(rapl_pmus);
0672 }
0673 
/*
 * Groups handed to perf core as attr_update: visibility of each
 * domain's events group follows the probe result.
 */
static const struct attribute_group *rapl_attr_update[] = {
	&rapl_events_cores_group,
	&rapl_events_pkg_group,
	&rapl_events_ram_group,
	&rapl_events_gpu_group,
	&rapl_events_psys_group,
	NULL,
};
0682 
/*
 * Allocate the rapl_pmus container (one slot per possible logical die)
 * and fill in the struct pmu callbacks. The per-die rapl_pmu instances
 * themselves are allocated lazily in rapl_cpu_online().
 */
static int __init init_rapl_pmus(void)
{
	int maxdie = topology_max_packages() * topology_max_die_per_package();
	size_t size;

	size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
	rapl_pmus = kzalloc(size, GFP_KERNEL);
	if (!rapl_pmus)
		return -ENOMEM;

	rapl_pmus->maxdie		= maxdie;
	rapl_pmus->pmu.attr_groups	= rapl_attr_groups;
	rapl_pmus->pmu.attr_update	= rapl_attr_update;
	rapl_pmus->pmu.task_ctx_nr	= perf_invalid_context;
	rapl_pmus->pmu.event_init	= rapl_pmu_event_init;
	rapl_pmus->pmu.add		= rapl_pmu_event_add;
	rapl_pmus->pmu.del		= rapl_pmu_event_del;
	rapl_pmus->pmu.start		= rapl_pmu_event_start;
	rapl_pmus->pmu.stop		= rapl_pmu_event_stop;
	rapl_pmus->pmu.read		= rapl_pmu_event_read;
	rapl_pmus->pmu.module		= THIS_MODULE;
	rapl_pmus->pmu.capabilities	= PERF_PMU_CAP_NO_EXCLUDE;
	return 0;
}
0707 
/*
 * Per-generation RAPL descriptions: which domains exist, which MSR
 * holds the energy units, any fixed-unit quirk, and which MSR table
 * to use.
 */
static struct rapl_model model_snb = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_PP1),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_snbep = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_hsw = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_hsx = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_knl = {
	.events		= BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_skl = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1) |
			  BIT(PERF_RAPL_PSYS),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_spr = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PSYS),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_SPR,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_spr_msrs,
};

static struct rapl_model model_amd_hygon = {
	.events		= BIT(PERF_RAPL_PKG),
	.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
	.rapl_msrs      = amd_rapl_msrs,
};
0775 
/*
 * CPU match table: AMD/Hygon parts are matched by the RAPL feature
 * flag, Intel parts by family-6 model number.
 */
static const struct x86_cpu_id rapl_model_match[] __initconst = {
	X86_MATCH_FEATURE(X86_FEATURE_RAPL,		&model_amd_hygon),
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE,		&model_snb),
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X,	&model_snbep),
	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE,		&model_snb),
	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X,		&model_snbep),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&model_knl),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&model_knl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L,	&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,	&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,	&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&model_spr),
	{},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
0813 
/*
 * Module init: match the CPU model, probe the available RAPL MSRs,
 * read the energy units, allocate the per-die PMU container, install
 * the CPU hotplug callbacks and finally register the "power" PMU.
 */
static int __init rapl_pmu_init(void)
{
	const struct x86_cpu_id *id;
	struct rapl_model *rm;
	int ret;

	id = x86_match_cpu(rapl_model_match);
	if (!id)
		return -ENODEV;

	rm = (struct rapl_model *) id->driver_data;

	rapl_msrs = rm->rapl_msrs;

	/* Build the counter bitmask from the MSRs that actually probe. */
	rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
					false, (void *) &rm->events);

	ret = rapl_check_hw_unit(rm);
	if (ret)
		return ret;

	ret = init_rapl_pmus();
	if (ret)
		return ret;

	/*
	 * Install callbacks. Core will call them for each online cpu.
	 */
	ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
				"perf/x86/rapl:online",
				rapl_cpu_online, rapl_cpu_offline);
	if (ret)
		goto out;

	ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
	if (ret)
		goto out1;

	rapl_advertise();
	return 0;

out1:
	cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
out:
	pr_warn("Initialization failed (%d), disabled\n", ret);
	cleanup_rapl_pmus();
	return ret;
}
module_init(rapl_pmu_init);
0863 
/*
 * Module exit: tear down the hotplug state without invoking the
 * offline callbacks, unregister the PMU and free all per-die state.
 */
static void __exit intel_rapl_exit(void)
{
	cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
	perf_pmu_unregister(&rapl_pmus->pmu);
	cleanup_rapl_pmus();
}
module_exit(intel_rapl_exit);