Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Common code for Intel Running Average Power Limit (RAPL) support.
0004  * Copyright (c) 2019, Intel Corporation.
0005  */
0006 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
0007 
0008 #include <linux/kernel.h>
0009 #include <linux/module.h>
0010 #include <linux/list.h>
0011 #include <linux/types.h>
0012 #include <linux/device.h>
0013 #include <linux/slab.h>
0014 #include <linux/log2.h>
0015 #include <linux/bitmap.h>
0016 #include <linux/delay.h>
0017 #include <linux/sysfs.h>
0018 #include <linux/cpu.h>
0019 #include <linux/powercap.h>
0020 #include <linux/suspend.h>
0021 #include <linux/intel_rapl.h>
0022 #include <linux/processor.h>
0023 #include <linux/platform_device.h>
0024 
0025 #include <asm/iosf_mbi.h>
0026 #include <asm/cpu_device_id.h>
0027 #include <asm/intel-family.h>
0028 
/* bitmasks for RAPL MSRs, used by primitive access functions */
/* Energy counter field: the low 32 bits of the status register */
#define ENERGY_STATUS_MASK      0xffffffff

/* Power limit 1: limit value in bits 14:0, enable in bit 15, clamp in bit 16 */
#define POWER_LIMIT1_MASK       0x7FFF
#define POWER_LIMIT1_ENABLE     BIT(15)
#define POWER_LIMIT1_CLAMP      BIT(16)

/* Power limit 2: limit value in bits 46:32, enable in bit 47, clamp in bit 48 */
#define POWER_LIMIT2_MASK       (0x7FFFULL<<32)
#define POWER_LIMIT2_ENABLE     BIT_ULL(47)
#define POWER_LIMIT2_CLAMP      BIT_ULL(48)
/*
 * Lock bits. The FW_LOCK primitive defaults to bit 31; rapl_read_data_raw()
 * switches to bit 63 for domains that report two power limits.
 */
#define POWER_HIGH_LOCK         BIT_ULL(63)
#define POWER_LOW_LOCK          BIT(31)

/* Power limit 4 value field (lives in RAPL_DOMAIN_REG_PL4) */
#define POWER_LIMIT4_MASK       0x1FFF

/* 7-bit time window fields for power limit 1 and power limit 2 */
#define TIME_WINDOW1_MASK       (0x7FULL<<17)
#define TIME_WINDOW2_MASK       (0x7FULL<<49)

/* Field positions in the unit register decoded by rapl_check_unit_*() */
#define POWER_UNIT_OFFSET   0
#define POWER_UNIT_MASK     0x0F

#define ENERGY_UNIT_OFFSET  0x08
#define ENERGY_UNIT_MASK    0x1F00

#define TIME_UNIT_OFFSET    0x10
#define TIME_UNIT_MASK      0xF0000

/* Fields of the RAPL_DOMAIN_REG_INFO register */
#define POWER_INFO_MAX_MASK     (0x7fffULL<<32)
#define POWER_INFO_MIN_MASK     (0x7fffULL<<16)
#define POWER_INFO_MAX_TIME_WIN_MASK     (0x3fULL<<48)
#define POWER_INFO_THERMAL_SPEC_MASK     0x7fff

/* Fields of RAPL_DOMAIN_REG_PERF and RAPL_DOMAIN_REG_POLICY respectively */
#define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
#define PP_POLICY_MASK         0x1F

/*
 * SPR has different layout for Psys Domain PowerLimit registers.
 * There are 17 bits of PL1 and PL2 instead of 15 bits.
 * The Enable bits and TimeWindow bits are also shifted as a result.
 */
#define PSYS_POWER_LIMIT1_MASK       0x1FFFF
#define PSYS_POWER_LIMIT1_ENABLE     BIT(17)

#define PSYS_POWER_LIMIT2_MASK       (0x1FFFFULL<<32)
#define PSYS_POWER_LIMIT2_ENABLE     BIT_ULL(49)

#define PSYS_TIME_WINDOW1_MASK       (0x7FULL<<19)
#define PSYS_TIME_WINDOW2_MASK       (0x7FULL<<51)

/* Non HW constants */
/* Values for rapl_primitive_info.flag */
#define RAPL_PRIMITIVE_DERIVED       BIT(1) /* not from raw data */
#define RAPL_PRIMITIVE_DUMMY         BIT(2)

#define TIME_WINDOW_MAX_MSEC 40000
#define TIME_WINDOW_MIN_MSEC 250
#define ENERGY_UNIT_SCALE    1000   /* scale from driver unit to powercap unit */
/*
 * Unit class of a primitive. rapl_unit_xlate() uses it to pick the
 * conversion applied between raw register values and reported values.
 */
enum unit_type {
    ARBITRARY_UNIT,     /* no translation */
    POWER_UNIT,         /* scaled by the package power unit */
    ENERGY_UNIT,        /* scaled by the domain or package energy unit */
    TIME_UNIT,          /* converted by rapl_defaults->compute_time_window() */
};
0091 
/* per domain data, some are optional */
#define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)

/* Bit flags kept in rapl_domain.state */
#define DOMAIN_STATE_INACTIVE           BIT(0)
#define DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
/* set_domain_enable() and set_power_limit() return -EACCES when this is set */
#define DOMAIN_STATE_BIOS_LOCKED        BIT(2)

/* Constraint names that rapl_init_domains() assigns to rpl[0], rpl[1], rpl[2] */
static const char pl1_name[] = "long_term";
static const char pl2_name[] = "short_term";
static const char pl4_name[] = "peak_power";

/* Map a powercap_zone embedded in a rapl_domain back to that rapl_domain */
#define power_zone_to_rapl_domain(_zone) \
    container_of(_zone, struct rapl_domain, power_zone)
0105 
/* Per-platform hooks and quirks, reached through the rapl_defaults pointer */
struct rapl_defaults {
    /* IOSF MBI register offset used by set_floor_freq_atom(); 0 = invalid */
    u8 floor_freq_reg_addr;
    /* Reads the unit register and fills the package unit fields */
    int (*check_unit)(struct rapl_package *rp, int cpu);
    /* Optional; called by set_domain_enable() after PL1_ENABLE is written */
    void (*set_floor_freq)(struct rapl_domain *rd, bool mode);
    /* Used by rapl_unit_xlate() for every TIME_UNIT primitive */
    u64 (*compute_time_window)(struct rapl_package *rp, u64 val,
                    bool to_raw);
    /* Non-zero values override the package energy unit for that domain */
    unsigned int dram_domain_energy_unit;
    unsigned int psys_domain_energy_unit;
    /* Makes prim_fixups() remap platform-domain primitives to PSYS_* ones */
    bool spr_psys_bits;
};
static struct rapl_defaults *rapl_defaults;
0117 
/* Sideband MBI registers */
#define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2)
#define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf)

/*
 * Flag in rapl_package.power_limit_irq: set once the PLN interrupt enable
 * state has been captured by power_limit_irq_save_cpu().
 */
#define PACKAGE_PLN_INT_SAVED   BIT(0)
#define MAX_PRIM_NAME (32)
0124 
/* per domain data. used to describe individual knobs such that access function
 * can be consolidated into one instead of many inline functions.
 */
struct rapl_primitive_info {
    const char *name;               /* NULL marks the table terminator */
    u64 mask;                       /* bits of the register that hold the field */
    int shift;                      /* right shift applied after reading */
    enum rapl_domain_reg_id id;     /* index into rapl_domain.regs[] */
    enum unit_type unit;            /* conversion used by rapl_unit_xlate() */
    u32 flag;                       /* RAPL_PRIMITIVE_* flags */
};

/*
 * Build one rapl_primitive_info initializer; the primitive identifier @p is
 * stringified to provide the name.
 */
#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \
        .name = #p,         \
        .mask = m,          \
        .shift = s,         \
        .id = i,            \
        .unit = u,          \
        .flag = f           \
    }
0145 
/* Forward declarations for helpers used before their definitions below */
static void rapl_init_domains(struct rapl_package *rp);
static int rapl_read_data_raw(struct rapl_domain *rd,
                  enum rapl_primitives prim,
                  bool xlate, u64 *data);
static int rapl_write_data_raw(struct rapl_domain *rd,
                   enum rapl_primitives prim,
                   unsigned long long value);
static u64 rapl_unit_xlate(struct rapl_domain *rd,
               enum unit_type type, u64 value, int to_raw);
static void package_power_limit_irq_save(struct rapl_package *rp);

static LIST_HEAD(rapl_packages);    /* guarded by CPU hotplug lock */
0158 
/*
 * Zone names, indexed by enum rapl_domain_type; rapl_init_domains() copies
 * the entry for each detected domain into rapl_domain.name.
 */
static const char *const rapl_domain_names[] = {
    "package",
    "core",
    "uncore",
    "dram",
    "psys",
};
0166 
0167 static int get_energy_counter(struct powercap_zone *power_zone,
0168                   u64 *energy_raw)
0169 {
0170     struct rapl_domain *rd;
0171     u64 energy_now;
0172 
0173     /* prevent CPU hotplug, make sure the RAPL domain does not go
0174      * away while reading the counter.
0175      */
0176     cpus_read_lock();
0177     rd = power_zone_to_rapl_domain(power_zone);
0178 
0179     if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) {
0180         *energy_raw = energy_now;
0181         cpus_read_unlock();
0182 
0183         return 0;
0184     }
0185     cpus_read_unlock();
0186 
0187     return -EIO;
0188 }
0189 
0190 static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy)
0191 {
0192     struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev);
0193 
0194     *energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0);
0195     return 0;
0196 }
0197 
0198 static int release_zone(struct powercap_zone *power_zone)
0199 {
0200     struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
0201     struct rapl_package *rp = rd->rp;
0202 
0203     /* package zone is the last zone of a package, we can free
0204      * memory here since all children has been unregistered.
0205      */
0206     if (rd->id == RAPL_DOMAIN_PACKAGE) {
0207         kfree(rd);
0208         rp->domains = NULL;
0209     }
0210 
0211     return 0;
0212 
0213 }
0214 
0215 static int find_nr_power_limit(struct rapl_domain *rd)
0216 {
0217     int i, nr_pl = 0;
0218 
0219     for (i = 0; i < NR_POWER_LIMITS; i++) {
0220         if (rd->rpl[i].name)
0221             nr_pl++;
0222     }
0223 
0224     return nr_pl;
0225 }
0226 
0227 static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
0228 {
0229     struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
0230 
0231     if (rd->state & DOMAIN_STATE_BIOS_LOCKED)
0232         return -EACCES;
0233 
0234     cpus_read_lock();
0235     rapl_write_data_raw(rd, PL1_ENABLE, mode);
0236     if (rapl_defaults->set_floor_freq)
0237         rapl_defaults->set_floor_freq(rd, mode);
0238     cpus_read_unlock();
0239 
0240     return 0;
0241 }
0242 
0243 static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
0244 {
0245     struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
0246     u64 val;
0247 
0248     if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
0249         *mode = false;
0250         return 0;
0251     }
0252     cpus_read_lock();
0253     if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) {
0254         cpus_read_unlock();
0255         return -EIO;
0256     }
0257     *mode = val;
0258     cpus_read_unlock();
0259 
0260     return 0;
0261 }
0262 
0263 /* per RAPL domain ops, in the order of rapl_domain_type */
0264 static const struct powercap_zone_ops zone_ops[] = {
0265     /* RAPL_DOMAIN_PACKAGE */
0266     {
0267      .get_energy_uj = get_energy_counter,
0268      .get_max_energy_range_uj = get_max_energy_counter,
0269      .release = release_zone,
0270      .set_enable = set_domain_enable,
0271      .get_enable = get_domain_enable,
0272      },
0273     /* RAPL_DOMAIN_PP0 */
0274     {
0275      .get_energy_uj = get_energy_counter,
0276      .get_max_energy_range_uj = get_max_energy_counter,
0277      .release = release_zone,
0278      .set_enable = set_domain_enable,
0279      .get_enable = get_domain_enable,
0280      },
0281     /* RAPL_DOMAIN_PP1 */
0282     {
0283      .get_energy_uj = get_energy_counter,
0284      .get_max_energy_range_uj = get_max_energy_counter,
0285      .release = release_zone,
0286      .set_enable = set_domain_enable,
0287      .get_enable = get_domain_enable,
0288      },
0289     /* RAPL_DOMAIN_DRAM */
0290     {
0291      .get_energy_uj = get_energy_counter,
0292      .get_max_energy_range_uj = get_max_energy_counter,
0293      .release = release_zone,
0294      .set_enable = set_domain_enable,
0295      .get_enable = get_domain_enable,
0296      },
0297     /* RAPL_DOMAIN_PLATFORM */
0298     {
0299      .get_energy_uj = get_energy_counter,
0300      .get_max_energy_range_uj = get_max_energy_counter,
0301      .release = release_zone,
0302      .set_enable = set_domain_enable,
0303      .get_enable = get_domain_enable,
0304      },
0305 };
0306 
0307 /*
0308  * Constraint index used by powercap can be different than power limit (PL)
0309  * index in that some  PLs maybe missing due to non-existent MSRs. So we
0310  * need to convert here by finding the valid PLs only (name populated).
0311  */
0312 static int contraint_to_pl(struct rapl_domain *rd, int cid)
0313 {
0314     int i, j;
0315 
0316     for (i = 0, j = 0; i < NR_POWER_LIMITS; i++) {
0317         if ((rd->rpl[i].name) && j++ == cid) {
0318             pr_debug("%s: index %d\n", __func__, i);
0319             return i;
0320         }
0321     }
0322     pr_err("Cannot find matching power limit for constraint %d\n", cid);
0323 
0324     return -EINVAL;
0325 }
0326 
0327 static int set_power_limit(struct powercap_zone *power_zone, int cid,
0328                u64 power_limit)
0329 {
0330     struct rapl_domain *rd;
0331     struct rapl_package *rp;
0332     int ret = 0;
0333     int id;
0334 
0335     cpus_read_lock();
0336     rd = power_zone_to_rapl_domain(power_zone);
0337     id = contraint_to_pl(rd, cid);
0338     if (id < 0) {
0339         ret = id;
0340         goto set_exit;
0341     }
0342 
0343     rp = rd->rp;
0344 
0345     if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
0346         dev_warn(&power_zone->dev,
0347              "%s locked by BIOS, monitoring only\n", rd->name);
0348         ret = -EACCES;
0349         goto set_exit;
0350     }
0351 
0352     switch (rd->rpl[id].prim_id) {
0353     case PL1_ENABLE:
0354         rapl_write_data_raw(rd, POWER_LIMIT1, power_limit);
0355         break;
0356     case PL2_ENABLE:
0357         rapl_write_data_raw(rd, POWER_LIMIT2, power_limit);
0358         break;
0359     case PL4_ENABLE:
0360         rapl_write_data_raw(rd, POWER_LIMIT4, power_limit);
0361         break;
0362     default:
0363         ret = -EINVAL;
0364     }
0365     if (!ret)
0366         package_power_limit_irq_save(rp);
0367 set_exit:
0368     cpus_read_unlock();
0369     return ret;
0370 }
0371 
0372 static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
0373                    u64 *data)
0374 {
0375     struct rapl_domain *rd;
0376     u64 val;
0377     int prim;
0378     int ret = 0;
0379     int id;
0380 
0381     cpus_read_lock();
0382     rd = power_zone_to_rapl_domain(power_zone);
0383     id = contraint_to_pl(rd, cid);
0384     if (id < 0) {
0385         ret = id;
0386         goto get_exit;
0387     }
0388 
0389     switch (rd->rpl[id].prim_id) {
0390     case PL1_ENABLE:
0391         prim = POWER_LIMIT1;
0392         break;
0393     case PL2_ENABLE:
0394         prim = POWER_LIMIT2;
0395         break;
0396     case PL4_ENABLE:
0397         prim = POWER_LIMIT4;
0398         break;
0399     default:
0400         cpus_read_unlock();
0401         return -EINVAL;
0402     }
0403     if (rapl_read_data_raw(rd, prim, true, &val))
0404         ret = -EIO;
0405     else
0406         *data = val;
0407 
0408 get_exit:
0409     cpus_read_unlock();
0410 
0411     return ret;
0412 }
0413 
0414 static int set_time_window(struct powercap_zone *power_zone, int cid,
0415                u64 window)
0416 {
0417     struct rapl_domain *rd;
0418     int ret = 0;
0419     int id;
0420 
0421     cpus_read_lock();
0422     rd = power_zone_to_rapl_domain(power_zone);
0423     id = contraint_to_pl(rd, cid);
0424     if (id < 0) {
0425         ret = id;
0426         goto set_time_exit;
0427     }
0428 
0429     switch (rd->rpl[id].prim_id) {
0430     case PL1_ENABLE:
0431         rapl_write_data_raw(rd, TIME_WINDOW1, window);
0432         break;
0433     case PL2_ENABLE:
0434         rapl_write_data_raw(rd, TIME_WINDOW2, window);
0435         break;
0436     default:
0437         ret = -EINVAL;
0438     }
0439 
0440 set_time_exit:
0441     cpus_read_unlock();
0442     return ret;
0443 }
0444 
0445 static int get_time_window(struct powercap_zone *power_zone, int cid,
0446                u64 *data)
0447 {
0448     struct rapl_domain *rd;
0449     u64 val;
0450     int ret = 0;
0451     int id;
0452 
0453     cpus_read_lock();
0454     rd = power_zone_to_rapl_domain(power_zone);
0455     id = contraint_to_pl(rd, cid);
0456     if (id < 0) {
0457         ret = id;
0458         goto get_time_exit;
0459     }
0460 
0461     switch (rd->rpl[id].prim_id) {
0462     case PL1_ENABLE:
0463         ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val);
0464         break;
0465     case PL2_ENABLE:
0466         ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val);
0467         break;
0468     case PL4_ENABLE:
0469         /*
0470          * Time window parameter is not applicable for PL4 entry
0471          * so assigining '0' as default value.
0472          */
0473         val = 0;
0474         break;
0475     default:
0476         cpus_read_unlock();
0477         return -EINVAL;
0478     }
0479     if (!ret)
0480         *data = val;
0481 
0482 get_time_exit:
0483     cpus_read_unlock();
0484 
0485     return ret;
0486 }
0487 
0488 static const char *get_constraint_name(struct powercap_zone *power_zone,
0489                        int cid)
0490 {
0491     struct rapl_domain *rd;
0492     int id;
0493 
0494     rd = power_zone_to_rapl_domain(power_zone);
0495     id = contraint_to_pl(rd, cid);
0496     if (id >= 0)
0497         return rd->rpl[id].name;
0498 
0499     return NULL;
0500 }
0501 
0502 static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data)
0503 {
0504     struct rapl_domain *rd;
0505     u64 val;
0506     int prim;
0507     int ret = 0;
0508 
0509     cpus_read_lock();
0510     rd = power_zone_to_rapl_domain(power_zone);
0511     switch (rd->rpl[id].prim_id) {
0512     case PL1_ENABLE:
0513         prim = THERMAL_SPEC_POWER;
0514         break;
0515     case PL2_ENABLE:
0516         prim = MAX_POWER;
0517         break;
0518     case PL4_ENABLE:
0519         prim = MAX_POWER;
0520         break;
0521     default:
0522         cpus_read_unlock();
0523         return -EINVAL;
0524     }
0525     if (rapl_read_data_raw(rd, prim, true, &val))
0526         ret = -EIO;
0527     else
0528         *data = val;
0529 
0530     /* As a generalization rule, PL4 would be around two times PL2. */
0531     if (rd->rpl[id].prim_id == PL4_ENABLE)
0532         *data = *data * 2;
0533 
0534     cpus_read_unlock();
0535 
0536     return ret;
0537 }
0538 
0539 static const struct powercap_zone_constraint_ops constraint_ops = {
0540     .set_power_limit_uw = set_power_limit,
0541     .get_power_limit_uw = get_current_power_limit,
0542     .set_time_window_us = set_time_window,
0543     .get_time_window_us = get_time_window,
0544     .get_max_power_uw = get_max_power,
0545     .get_name = get_constraint_name,
0546 };
0547 
0548 /* called after domain detection and package level data are set */
0549 static void rapl_init_domains(struct rapl_package *rp)
0550 {
0551     enum rapl_domain_type i;
0552     enum rapl_domain_reg_id j;
0553     struct rapl_domain *rd = rp->domains;
0554 
0555     for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
0556         unsigned int mask = rp->domain_map & (1 << i);
0557 
0558         if (!mask)
0559             continue;
0560 
0561         rd->rp = rp;
0562 
0563         if (i == RAPL_DOMAIN_PLATFORM && rp->id > 0) {
0564             snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "psys-%d",
0565                 topology_physical_package_id(rp->lead_cpu));
0566         } else
0567             snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "%s",
0568                 rapl_domain_names[i]);
0569 
0570         rd->id = i;
0571         rd->rpl[0].prim_id = PL1_ENABLE;
0572         rd->rpl[0].name = pl1_name;
0573 
0574         /*
0575          * The PL2 power domain is applicable for limits two
0576          * and limits three
0577          */
0578         if (rp->priv->limits[i] >= 2) {
0579             rd->rpl[1].prim_id = PL2_ENABLE;
0580             rd->rpl[1].name = pl2_name;
0581         }
0582 
0583         /* Enable PL4 domain if the total power limits are three */
0584         if (rp->priv->limits[i] == 3) {
0585             rd->rpl[2].prim_id = PL4_ENABLE;
0586             rd->rpl[2].name = pl4_name;
0587         }
0588 
0589         for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++)
0590             rd->regs[j] = rp->priv->regs[i][j];
0591 
0592         switch (i) {
0593         case RAPL_DOMAIN_DRAM:
0594             rd->domain_energy_unit =
0595                 rapl_defaults->dram_domain_energy_unit;
0596             if (rd->domain_energy_unit)
0597                 pr_info("DRAM domain energy unit %dpj\n",
0598                     rd->domain_energy_unit);
0599             break;
0600         case RAPL_DOMAIN_PLATFORM:
0601             rd->domain_energy_unit =
0602                 rapl_defaults->psys_domain_energy_unit;
0603             if (rd->domain_energy_unit)
0604                 pr_info("Platform domain energy unit %dpj\n",
0605                     rd->domain_energy_unit);
0606             break;
0607         default:
0608             break;
0609         }
0610         rd++;
0611     }
0612 }
0613 
0614 static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
0615                u64 value, int to_raw)
0616 {
0617     u64 units = 1;
0618     struct rapl_package *rp = rd->rp;
0619     u64 scale = 1;
0620 
0621     switch (type) {
0622     case POWER_UNIT:
0623         units = rp->power_unit;
0624         break;
0625     case ENERGY_UNIT:
0626         scale = ENERGY_UNIT_SCALE;
0627         /* per domain unit takes precedence */
0628         if (rd->domain_energy_unit)
0629             units = rd->domain_energy_unit;
0630         else
0631             units = rp->energy_unit;
0632         break;
0633     case TIME_UNIT:
0634         return rapl_defaults->compute_time_window(rp, value, to_raw);
0635     case ARBITRARY_UNIT:
0636     default:
0637         return value;
0638     }
0639 
0640     if (to_raw)
0641         return div64_u64(value, units) * scale;
0642 
0643     value *= units;
0644 
0645     return div64_u64(value, scale);
0646 }
0647 
/*
 * in the order of enum rapl_primitives
 *
 * The access functions index this table directly with the primitive, so
 * entries must stay in enum order. The final entry has a NULL name, which
 * rapl_read_data_raw() rejects with -EINVAL.
 */
static struct rapl_primitive_info rpi[] = {
    /* primitive (name), mask, shift, register id, unit type, flags */
    PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0,
                RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0),
    PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0,
                RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
    PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
                RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
    PRIMITIVE_INFO_INIT(POWER_LIMIT4, POWER_LIMIT4_MASK, 0,
                RAPL_DOMAIN_REG_PL4, POWER_UNIT, 0),
    PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31,
                RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
    PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
                RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
    PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16,
                RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
    PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47,
                RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
    PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48,
                RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
    PRIMITIVE_INFO_INIT(PL4_ENABLE, POWER_LIMIT4_MASK, 0,
                RAPL_DOMAIN_REG_PL4, ARBITRARY_UNIT, 0),
    PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17,
                RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
    PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49,
                RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
    PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK,
                0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
    PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32,
                RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
    PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16,
                RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
    PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48,
                RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0),
    PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0,
                RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
    PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
                RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0),
    /* SPR platform-domain layout, selected by prim_fixups() */
    PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT1, PSYS_POWER_LIMIT1_MASK, 0,
                RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
    PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT2, PSYS_POWER_LIMIT2_MASK, 32,
                RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
    PRIMITIVE_INFO_INIT(PSYS_PL1_ENABLE, PSYS_POWER_LIMIT1_ENABLE, 17,
                RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
    PRIMITIVE_INFO_INIT(PSYS_PL2_ENABLE, PSYS_POWER_LIMIT2_ENABLE, 49,
                RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
    PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW1_MASK, 19,
                RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
    PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW2, PSYS_TIME_WINDOW2_MASK, 51,
                RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
    /* non-hardware */
    PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
                RAPL_PRIMITIVE_DERIVED),
    {NULL, 0, 0, 0},
};
0704 
0705 static enum rapl_primitives
0706 prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim)
0707 {
0708     if (!rapl_defaults->spr_psys_bits)
0709         return prim;
0710 
0711     if (rd->id != RAPL_DOMAIN_PLATFORM)
0712         return prim;
0713 
0714     switch (prim) {
0715     case POWER_LIMIT1:
0716         return PSYS_POWER_LIMIT1;
0717     case POWER_LIMIT2:
0718         return PSYS_POWER_LIMIT2;
0719     case PL1_ENABLE:
0720         return PSYS_PL1_ENABLE;
0721     case PL2_ENABLE:
0722         return PSYS_PL2_ENABLE;
0723     case TIME_WINDOW1:
0724         return PSYS_TIME_WINDOW1;
0725     case TIME_WINDOW2:
0726         return PSYS_TIME_WINDOW2;
0727     default:
0728         return prim;
0729     }
0730 }
0731 
0732 /* Read primitive data based on its related struct rapl_primitive_info.
0733  * if xlate flag is set, return translated data based on data units, i.e.
0734  * time, energy, and power.
0735  * RAPL MSRs are non-architectual and are laid out not consistently across
0736  * domains. Here we use primitive info to allow writing consolidated access
0737  * functions.
0738  * For a given primitive, it is processed by MSR mask and shift. Unit conversion
0739  * is pre-assigned based on RAPL unit MSRs read at init time.
0740  * 63-------------------------- 31--------------------------- 0
0741  * |                           xxxxx (mask)                   |
0742  * |                                |<- shift ----------------|
0743  * 63-------------------------- 31--------------------------- 0
0744  */
0745 static int rapl_read_data_raw(struct rapl_domain *rd,
0746                   enum rapl_primitives prim, bool xlate, u64 *data)
0747 {
0748     u64 value;
0749     enum rapl_primitives prim_fixed = prim_fixups(rd, prim);
0750     struct rapl_primitive_info *rp = &rpi[prim_fixed];
0751     struct reg_action ra;
0752     int cpu;
0753 
0754     if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY)
0755         return -EINVAL;
0756 
0757     ra.reg = rd->regs[rp->id];
0758     if (!ra.reg)
0759         return -EINVAL;
0760 
0761     cpu = rd->rp->lead_cpu;
0762 
0763     /* domain with 2 limits has different bit */
0764     if (prim == FW_LOCK && rd->rp->priv->limits[rd->id] == 2) {
0765         rp->mask = POWER_HIGH_LOCK;
0766         rp->shift = 63;
0767     }
0768     /* non-hardware data are collected by the polling thread */
0769     if (rp->flag & RAPL_PRIMITIVE_DERIVED) {
0770         *data = rd->rdd.primitives[prim];
0771         return 0;
0772     }
0773 
0774     ra.mask = rp->mask;
0775 
0776     if (rd->rp->priv->read_raw(cpu, &ra)) {
0777         pr_debug("failed to read reg 0x%llx on cpu %d\n", ra.reg, cpu);
0778         return -EIO;
0779     }
0780 
0781     value = ra.value >> rp->shift;
0782 
0783     if (xlate)
0784         *data = rapl_unit_xlate(rd, rp->unit, value, 0);
0785     else
0786         *data = value;
0787 
0788     return 0;
0789 }
0790 
0791 /* Similar use of primitive info in the read counterpart */
0792 static int rapl_write_data_raw(struct rapl_domain *rd,
0793                    enum rapl_primitives prim,
0794                    unsigned long long value)
0795 {
0796     enum rapl_primitives prim_fixed = prim_fixups(rd, prim);
0797     struct rapl_primitive_info *rp = &rpi[prim_fixed];
0798     int cpu;
0799     u64 bits;
0800     struct reg_action ra;
0801     int ret;
0802 
0803     cpu = rd->rp->lead_cpu;
0804     bits = rapl_unit_xlate(rd, rp->unit, value, 1);
0805     bits <<= rp->shift;
0806     bits &= rp->mask;
0807 
0808     memset(&ra, 0, sizeof(ra));
0809 
0810     ra.reg = rd->regs[rp->id];
0811     ra.mask = rp->mask;
0812     ra.value = bits;
0813 
0814     ret = rd->rp->priv->write_raw(cpu, &ra);
0815 
0816     return ret;
0817 }
0818 
0819 /*
0820  * Raw RAPL data stored in MSRs are in certain scales. We need to
0821  * convert them into standard units based on the units reported in
0822  * the RAPL unit MSRs. This is specific to CPUs as the method to
0823  * calculate units differ on different CPUs.
0824  * We convert the units to below format based on CPUs.
0825  * i.e.
0826  * energy unit: picoJoules  : Represented in picoJoules by default
0827  * power unit : microWatts  : Represented in milliWatts by default
0828  * time unit  : microseconds: Represented in seconds by default
0829  */
0830 static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
0831 {
0832     struct reg_action ra;
0833     u32 value;
0834 
0835     ra.reg = rp->priv->reg_unit;
0836     ra.mask = ~0;
0837     if (rp->priv->read_raw(cpu, &ra)) {
0838         pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
0839                rp->priv->reg_unit, cpu);
0840         return -ENODEV;
0841     }
0842 
0843     value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
0844     rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value);
0845 
0846     value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
0847     rp->power_unit = 1000000 / (1 << value);
0848 
0849     value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
0850     rp->time_unit = 1000000 / (1 << value);
0851 
0852     pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n",
0853          rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
0854 
0855     return 0;
0856 }
0857 
0858 static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
0859 {
0860     struct reg_action ra;
0861     u32 value;
0862 
0863     ra.reg = rp->priv->reg_unit;
0864     ra.mask = ~0;
0865     if (rp->priv->read_raw(cpu, &ra)) {
0866         pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
0867                rp->priv->reg_unit, cpu);
0868         return -ENODEV;
0869     }
0870 
0871     value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
0872     rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value;
0873 
0874     value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
0875     rp->power_unit = (1 << value) * 1000;
0876 
0877     value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
0878     rp->time_unit = 1000000 / (1 << value);
0879 
0880     pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n",
0881          rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
0882 
0883     return 0;
0884 }
0885 
0886 static void power_limit_irq_save_cpu(void *info)
0887 {
0888     u32 l, h = 0;
0889     struct rapl_package *rp = (struct rapl_package *)info;
0890 
0891     /* save the state of PLN irq mask bit before disabling it */
0892     rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
0893     if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) {
0894         rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE;
0895         rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED;
0896     }
0897     l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
0898     wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
0899 }
0900 
0901 /* REVISIT:
0902  * When package power limit is set artificially low by RAPL, LVT
0903  * thermal interrupt for package power limit should be ignored
0904  * since we are not really exceeding the real limit. The intention
0905  * is to avoid excessive interrupts while we are trying to save power.
0906  * A useful feature might be routing the package_power_limit interrupt
0907  * to userspace via eventfd. once we have a usecase, this is simple
0908  * to do by adding an atomic notifier.
0909  */
0910 
0911 static void package_power_limit_irq_save(struct rapl_package *rp)
0912 {
0913     if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
0914         return;
0915 
0916     smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1);
0917 }
0918 
0919 /*
0920  * Restore per package power limit interrupt enable state. Called from cpu
0921  * hotplug code on package removal.
0922  */
0923 static void package_power_limit_irq_restore(struct rapl_package *rp)
0924 {
0925     u32 l, h;
0926 
0927     if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
0928         return;
0929 
0930     /* irq enable state not saved, nothing to restore */
0931     if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
0932         return;
0933 
0934     rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
0935 
0936     if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE)
0937         l |= PACKAGE_THERM_INT_PLN_ENABLE;
0938     else
0939         l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
0940 
0941     wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
0942 }
0943 
0944 static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
0945 {
0946     int nr_powerlimit = find_nr_power_limit(rd);
0947 
0948     /* always enable clamp such that p-state can go below OS requested
0949      * range. power capping priority over guranteed frequency.
0950      */
0951     rapl_write_data_raw(rd, PL1_CLAMP, mode);
0952 
0953     /* some domains have pl2 */
0954     if (nr_powerlimit > 1) {
0955         rapl_write_data_raw(rd, PL2_ENABLE, mode);
0956         rapl_write_data_raw(rd, PL2_CLAMP, mode);
0957     }
0958 }
0959 
0960 static void set_floor_freq_atom(struct rapl_domain *rd, bool enable)
0961 {
0962     static u32 power_ctrl_orig_val;
0963     u32 mdata;
0964 
0965     if (!rapl_defaults->floor_freq_reg_addr) {
0966         pr_err("Invalid floor frequency config register\n");
0967         return;
0968     }
0969 
0970     if (!power_ctrl_orig_val)
0971         iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ,
0972                   rapl_defaults->floor_freq_reg_addr,
0973                   &power_ctrl_orig_val);
0974     mdata = power_ctrl_orig_val;
0975     if (enable) {
0976         mdata &= ~(0x7f << 8);
0977         mdata |= 1 << 8;
0978     }
0979     iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE,
0980                rapl_defaults->floor_freq_reg_addr, mdata);
0981 }
0982 
0983 static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value,
0984                      bool to_raw)
0985 {
0986     u64 f, y;       /* fraction and exp. used for time unit */
0987 
0988     /*
0989      * Special processing based on 2^Y*(1+F/4), refer
0990      * to Intel Software Developer's manual Vol.3B: CH 14.9.3.
0991      */
0992     if (!to_raw) {
0993         f = (value & 0x60) >> 5;
0994         y = value & 0x1f;
0995         value = (1 << y) * (4 + f) * rp->time_unit / 4;
0996     } else {
0997         do_div(value, rp->time_unit);
0998         y = ilog2(value);
0999         f = div64_u64(4 * (value - (1 << y)), 1 << y);
1000         value = (y & 0x1f) | ((f & 0x3) << 5);
1001     }
1002     return value;
1003 }
1004 
1005 static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value,
1006                      bool to_raw)
1007 {
1008     /*
1009      * Atom time unit encoding is straight forward val * time_unit,
1010      * where time_unit is default to 1 sec. Never 0.
1011      */
1012     if (!to_raw)
1013         return (value) ? value * rp->time_unit : rp->time_unit;
1014 
1015     value = div64_u64(value, rp->time_unit);
1016 
1017     return value;
1018 }
1019 
1020 static const struct rapl_defaults rapl_defaults_core = {
1021     .floor_freq_reg_addr = 0,
1022     .check_unit = rapl_check_unit_core,
1023     .set_floor_freq = set_floor_freq_default,
1024     .compute_time_window = rapl_compute_time_window_core,
1025 };
1026 
1027 static const struct rapl_defaults rapl_defaults_hsw_server = {
1028     .check_unit = rapl_check_unit_core,
1029     .set_floor_freq = set_floor_freq_default,
1030     .compute_time_window = rapl_compute_time_window_core,
1031     .dram_domain_energy_unit = 15300,
1032 };
1033 
1034 static const struct rapl_defaults rapl_defaults_spr_server = {
1035     .check_unit = rapl_check_unit_core,
1036     .set_floor_freq = set_floor_freq_default,
1037     .compute_time_window = rapl_compute_time_window_core,
1038     .dram_domain_energy_unit = 15300,
1039     .psys_domain_energy_unit = 1000000000,
1040     .spr_psys_bits = true,
1041 };
1042 
1043 static const struct rapl_defaults rapl_defaults_byt = {
1044     .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT,
1045     .check_unit = rapl_check_unit_atom,
1046     .set_floor_freq = set_floor_freq_atom,
1047     .compute_time_window = rapl_compute_time_window_atom,
1048 };
1049 
1050 static const struct rapl_defaults rapl_defaults_tng = {
1051     .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG,
1052     .check_unit = rapl_check_unit_atom,
1053     .set_floor_freq = set_floor_freq_atom,
1054     .compute_time_window = rapl_compute_time_window_atom,
1055 };
1056 
1057 static const struct rapl_defaults rapl_defaults_ann = {
1058     .floor_freq_reg_addr = 0,
1059     .check_unit = rapl_check_unit_atom,
1060     .set_floor_freq = NULL,
1061     .compute_time_window = rapl_compute_time_window_atom,
1062 };
1063 
1064 static const struct rapl_defaults rapl_defaults_cht = {
1065     .floor_freq_reg_addr = 0,
1066     .check_unit = rapl_check_unit_atom,
1067     .set_floor_freq = NULL,
1068     .compute_time_window = rapl_compute_time_window_atom,
1069 };
1070 
1071 static const struct rapl_defaults rapl_defaults_amd = {
1072     .check_unit = rapl_check_unit_core,
1073 };
1074 
1075 static const struct x86_cpu_id rapl_ids[] __initconst = {
1076     X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE,     &rapl_defaults_core),
1077     X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X,   &rapl_defaults_core),
1078 
1079     X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE,       &rapl_defaults_core),
1080     X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X,     &rapl_defaults_core),
1081 
1082     X86_MATCH_INTEL_FAM6_MODEL(HASWELL,     &rapl_defaults_core),
1083     X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L,       &rapl_defaults_core),
1084     X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G,       &rapl_defaults_core),
1085     X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X,       &rapl_defaults_hsw_server),
1086 
1087     X86_MATCH_INTEL_FAM6_MODEL(BROADWELL,       &rapl_defaults_core),
1088     X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G,     &rapl_defaults_core),
1089     X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D,     &rapl_defaults_core),
1090     X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X,     &rapl_defaults_hsw_server),
1091 
1092     X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE,     &rapl_defaults_core),
1093     X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L,       &rapl_defaults_core),
1094     X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,       &rapl_defaults_hsw_server),
1095     X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L,      &rapl_defaults_core),
1096     X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,        &rapl_defaults_core),
1097     X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L,    &rapl_defaults_core),
1098     X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,       &rapl_defaults_core),
1099     X86_MATCH_INTEL_FAM6_MODEL(ICELAKE,     &rapl_defaults_core),
1100     X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI,    &rapl_defaults_core),
1101     X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,       &rapl_defaults_hsw_server),
1102     X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,       &rapl_defaults_hsw_server),
1103     X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,     &rapl_defaults_core),
1104     X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,       &rapl_defaults_core),
1105     X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,     &rapl_defaults_core),
1106     X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,       &rapl_defaults_core),
1107     X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE,      &rapl_defaults_core),
1108     X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,       &rapl_defaults_core),
1109     X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,     &rapl_defaults_core),
1110     X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N,     &rapl_defaults_core),
1111     X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE,      &rapl_defaults_core),
1112     X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,        &rapl_defaults_core),
1113     X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,    &rapl_defaults_spr_server),
1114     X86_MATCH_INTEL_FAM6_MODEL(LAKEFIELD,       &rapl_defaults_core),
1115 
1116     X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &rapl_defaults_byt),
1117     X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT,    &rapl_defaults_cht),
1118     X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &rapl_defaults_tng),
1119     X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID,    &rapl_defaults_ann),
1120     X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,   &rapl_defaults_core),
1121     X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,  &rapl_defaults_core),
1122     X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &rapl_defaults_core),
1123     X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT,    &rapl_defaults_core),
1124     X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,  &rapl_defaults_core),
1125     X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L,  &rapl_defaults_core),
1126 
1127     X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,    &rapl_defaults_hsw_server),
1128     X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,    &rapl_defaults_hsw_server),
1129 
1130     X86_MATCH_VENDOR_FAM(AMD, 0x17, &rapl_defaults_amd),
1131     X86_MATCH_VENDOR_FAM(AMD, 0x19, &rapl_defaults_amd),
1132     X86_MATCH_VENDOR_FAM(HYGON, 0x18, &rapl_defaults_amd),
1133     {}
1134 };
1135 MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
1136 
1137 /* Read once for all raw primitive data for domains */
1138 static void rapl_update_domain_data(struct rapl_package *rp)
1139 {
1140     int dmn, prim;
1141     u64 val;
1142 
1143     for (dmn = 0; dmn < rp->nr_domains; dmn++) {
1144         pr_debug("update %s domain %s data\n", rp->name,
1145              rp->domains[dmn].name);
1146         /* exclude non-raw primitives */
1147         for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
1148             if (!rapl_read_data_raw(&rp->domains[dmn], prim,
1149                         rpi[prim].unit, &val))
1150                 rp->domains[dmn].rdd.primitives[prim] = val;
1151         }
1152     }
1153 
1154 }
1155 
1156 static int rapl_package_register_powercap(struct rapl_package *rp)
1157 {
1158     struct rapl_domain *rd;
1159     struct powercap_zone *power_zone = NULL;
1160     int nr_pl, ret;
1161 
1162     /* Update the domain data of the new package */
1163     rapl_update_domain_data(rp);
1164 
1165     /* first we register package domain as the parent zone */
1166     for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1167         if (rd->id == RAPL_DOMAIN_PACKAGE) {
1168             nr_pl = find_nr_power_limit(rd);
1169             pr_debug("register package domain %s\n", rp->name);
1170             power_zone = powercap_register_zone(&rd->power_zone,
1171                         rp->priv->control_type, rp->name,
1172                         NULL, &zone_ops[rd->id], nr_pl,
1173                         &constraint_ops);
1174             if (IS_ERR(power_zone)) {
1175                 pr_debug("failed to register power zone %s\n",
1176                      rp->name);
1177                 return PTR_ERR(power_zone);
1178             }
1179             /* track parent zone in per package/socket data */
1180             rp->power_zone = power_zone;
1181             /* done, only one package domain per socket */
1182             break;
1183         }
1184     }
1185     if (!power_zone) {
1186         pr_err("no package domain found, unknown topology!\n");
1187         return -ENODEV;
1188     }
1189     /* now register domains as children of the socket/package */
1190     for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1191         struct powercap_zone *parent = rp->power_zone;
1192 
1193         if (rd->id == RAPL_DOMAIN_PACKAGE)
1194             continue;
1195         if (rd->id == RAPL_DOMAIN_PLATFORM)
1196             parent = NULL;
1197         /* number of power limits per domain varies */
1198         nr_pl = find_nr_power_limit(rd);
1199         power_zone = powercap_register_zone(&rd->power_zone,
1200                             rp->priv->control_type,
1201                             rd->name, parent,
1202                             &zone_ops[rd->id], nr_pl,
1203                             &constraint_ops);
1204 
1205         if (IS_ERR(power_zone)) {
1206             pr_debug("failed to register power_zone, %s:%s\n",
1207                  rp->name, rd->name);
1208             ret = PTR_ERR(power_zone);
1209             goto err_cleanup;
1210         }
1211     }
1212     return 0;
1213 
1214 err_cleanup:
1215     /*
1216      * Clean up previously initialized domains within the package if we
1217      * failed after the first domain setup.
1218      */
1219     while (--rd >= rp->domains) {
1220         pr_debug("unregister %s domain %s\n", rp->name, rd->name);
1221         powercap_unregister_zone(rp->priv->control_type,
1222                      &rd->power_zone);
1223     }
1224 
1225     return ret;
1226 }
1227 
1228 static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
1229 {
1230     struct reg_action ra;
1231 
1232     switch (domain) {
1233     case RAPL_DOMAIN_PACKAGE:
1234     case RAPL_DOMAIN_PP0:
1235     case RAPL_DOMAIN_PP1:
1236     case RAPL_DOMAIN_DRAM:
1237     case RAPL_DOMAIN_PLATFORM:
1238         ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
1239         break;
1240     default:
1241         pr_err("invalid domain id %d\n", domain);
1242         return -EINVAL;
1243     }
1244     /* make sure domain counters are available and contains non-zero
1245      * values, otherwise skip it.
1246      */
1247 
1248     ra.mask = ENERGY_STATUS_MASK;
1249     if (rp->priv->read_raw(cpu, &ra) || !ra.value)
1250         return -ENODEV;
1251 
1252     return 0;
1253 }
1254 
1255 /*
1256  * Check if power limits are available. Two cases when they are not available:
1257  * 1. Locked by BIOS, in this case we still provide read-only access so that
1258  *    users can see what limit is set by the BIOS.
1259  * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
1260  *    exist at all. In this case, we do not show the constraints in powercap.
1261  *
1262  * Called after domains are detected and initialized.
1263  */
1264 static void rapl_detect_powerlimit(struct rapl_domain *rd)
1265 {
1266     u64 val64;
1267     int i;
1268 
1269     /* check if the domain is locked by BIOS, ignore if MSR doesn't exist */
1270     if (!rapl_read_data_raw(rd, FW_LOCK, false, &val64)) {
1271         if (val64) {
1272             pr_info("RAPL %s domain %s locked by BIOS\n",
1273                 rd->rp->name, rd->name);
1274             rd->state |= DOMAIN_STATE_BIOS_LOCKED;
1275         }
1276     }
1277     /* check if power limit MSR exists, otherwise domain is monitoring only */
1278     for (i = 0; i < NR_POWER_LIMITS; i++) {
1279         int prim = rd->rpl[i].prim_id;
1280 
1281         if (rapl_read_data_raw(rd, prim, false, &val64))
1282             rd->rpl[i].name = NULL;
1283     }
1284 }
1285 
1286 /* Detect active and valid domains for the given CPU, caller must
1287  * ensure the CPU belongs to the targeted package and CPU hotlug is disabled.
1288  */
1289 static int rapl_detect_domains(struct rapl_package *rp, int cpu)
1290 {
1291     struct rapl_domain *rd;
1292     int i;
1293 
1294     for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
1295         /* use physical package id to read counters */
1296         if (!rapl_check_domain(cpu, i, rp)) {
1297             rp->domain_map |= 1 << i;
1298             pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
1299         }
1300     }
1301     rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
1302     if (!rp->nr_domains) {
1303         pr_debug("no valid rapl domains found in %s\n", rp->name);
1304         return -ENODEV;
1305     }
1306     pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name);
1307 
1308     rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
1309                   GFP_KERNEL);
1310     if (!rp->domains)
1311         return -ENOMEM;
1312 
1313     rapl_init_domains(rp);
1314 
1315     for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++)
1316         rapl_detect_powerlimit(rd);
1317 
1318     return 0;
1319 }
1320 
1321 /* called from CPU hotplug notifier, hotplug lock held */
1322 void rapl_remove_package(struct rapl_package *rp)
1323 {
1324     struct rapl_domain *rd, *rd_package = NULL;
1325 
1326     package_power_limit_irq_restore(rp);
1327 
1328     for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1329         rapl_write_data_raw(rd, PL1_ENABLE, 0);
1330         rapl_write_data_raw(rd, PL1_CLAMP, 0);
1331         if (find_nr_power_limit(rd) > 1) {
1332             rapl_write_data_raw(rd, PL2_ENABLE, 0);
1333             rapl_write_data_raw(rd, PL2_CLAMP, 0);
1334             rapl_write_data_raw(rd, PL4_ENABLE, 0);
1335         }
1336         if (rd->id == RAPL_DOMAIN_PACKAGE) {
1337             rd_package = rd;
1338             continue;
1339         }
1340         pr_debug("remove package, undo power limit on %s: %s\n",
1341              rp->name, rd->name);
1342         powercap_unregister_zone(rp->priv->control_type,
1343                      &rd->power_zone);
1344     }
1345     /* do parent zone last */
1346     powercap_unregister_zone(rp->priv->control_type,
1347                  &rd_package->power_zone);
1348     list_del(&rp->plist);
1349     kfree(rp);
1350 }
1351 EXPORT_SYMBOL_GPL(rapl_remove_package);
1352 
1353 /* caller to ensure CPU hotplug lock is held */
1354 struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv)
1355 {
1356     int id = topology_logical_die_id(cpu);
1357     struct rapl_package *rp;
1358 
1359     list_for_each_entry(rp, &rapl_packages, plist) {
1360         if (rp->id == id
1361             && rp->priv->control_type == priv->control_type)
1362             return rp;
1363     }
1364 
1365     return NULL;
1366 }
1367 EXPORT_SYMBOL_GPL(rapl_find_package_domain);
1368 
1369 /* called from CPU hotplug notifier, hotplug lock held */
1370 struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv)
1371 {
1372     int id = topology_logical_die_id(cpu);
1373     struct rapl_package *rp;
1374     int ret;
1375 
1376     if (!rapl_defaults)
1377         return ERR_PTR(-ENODEV);
1378 
1379     rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
1380     if (!rp)
1381         return ERR_PTR(-ENOMEM);
1382 
1383     /* add the new package to the list */
1384     rp->id = id;
1385     rp->lead_cpu = cpu;
1386     rp->priv = priv;
1387 
1388     if (topology_max_die_per_package() > 1)
1389         snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH,
1390              "package-%d-die-%d",
1391              topology_physical_package_id(cpu), topology_die_id(cpu));
1392     else
1393         snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d",
1394              topology_physical_package_id(cpu));
1395 
1396     /* check if the package contains valid domains */
1397     if (rapl_detect_domains(rp, cpu) || rapl_defaults->check_unit(rp, cpu)) {
1398         ret = -ENODEV;
1399         goto err_free_package;
1400     }
1401     ret = rapl_package_register_powercap(rp);
1402     if (!ret) {
1403         INIT_LIST_HEAD(&rp->plist);
1404         list_add(&rp->plist, &rapl_packages);
1405         return rp;
1406     }
1407 
1408 err_free_package:
1409     kfree(rp->domains);
1410     kfree(rp);
1411     return ERR_PTR(ret);
1412 }
1413 EXPORT_SYMBOL_GPL(rapl_add_package);
1414 
1415 static void power_limit_state_save(void)
1416 {
1417     struct rapl_package *rp;
1418     struct rapl_domain *rd;
1419     int nr_pl, ret, i;
1420 
1421     cpus_read_lock();
1422     list_for_each_entry(rp, &rapl_packages, plist) {
1423         if (!rp->power_zone)
1424             continue;
1425         rd = power_zone_to_rapl_domain(rp->power_zone);
1426         nr_pl = find_nr_power_limit(rd);
1427         for (i = 0; i < nr_pl; i++) {
1428             switch (rd->rpl[i].prim_id) {
1429             case PL1_ENABLE:
1430                 ret = rapl_read_data_raw(rd,
1431                          POWER_LIMIT1, true,
1432                          &rd->rpl[i].last_power_limit);
1433                 if (ret)
1434                     rd->rpl[i].last_power_limit = 0;
1435                 break;
1436             case PL2_ENABLE:
1437                 ret = rapl_read_data_raw(rd,
1438                          POWER_LIMIT2, true,
1439                          &rd->rpl[i].last_power_limit);
1440                 if (ret)
1441                     rd->rpl[i].last_power_limit = 0;
1442                 break;
1443             case PL4_ENABLE:
1444                 ret = rapl_read_data_raw(rd,
1445                          POWER_LIMIT4, true,
1446                          &rd->rpl[i].last_power_limit);
1447                 if (ret)
1448                     rd->rpl[i].last_power_limit = 0;
1449                 break;
1450             }
1451         }
1452     }
1453     cpus_read_unlock();
1454 }
1455 
1456 static void power_limit_state_restore(void)
1457 {
1458     struct rapl_package *rp;
1459     struct rapl_domain *rd;
1460     int nr_pl, i;
1461 
1462     cpus_read_lock();
1463     list_for_each_entry(rp, &rapl_packages, plist) {
1464         if (!rp->power_zone)
1465             continue;
1466         rd = power_zone_to_rapl_domain(rp->power_zone);
1467         nr_pl = find_nr_power_limit(rd);
1468         for (i = 0; i < nr_pl; i++) {
1469             switch (rd->rpl[i].prim_id) {
1470             case PL1_ENABLE:
1471                 if (rd->rpl[i].last_power_limit)
1472                     rapl_write_data_raw(rd, POWER_LIMIT1,
1473                         rd->rpl[i].last_power_limit);
1474                 break;
1475             case PL2_ENABLE:
1476                 if (rd->rpl[i].last_power_limit)
1477                     rapl_write_data_raw(rd, POWER_LIMIT2,
1478                         rd->rpl[i].last_power_limit);
1479                 break;
1480             case PL4_ENABLE:
1481                 if (rd->rpl[i].last_power_limit)
1482                     rapl_write_data_raw(rd, POWER_LIMIT4,
1483                         rd->rpl[i].last_power_limit);
1484                 break;
1485             }
1486         }
1487     }
1488     cpus_read_unlock();
1489 }
1490 
1491 static int rapl_pm_callback(struct notifier_block *nb,
1492                 unsigned long mode, void *_unused)
1493 {
1494     switch (mode) {
1495     case PM_SUSPEND_PREPARE:
1496         power_limit_state_save();
1497         break;
1498     case PM_POST_SUSPEND:
1499         power_limit_state_restore();
1500         break;
1501     }
1502     return NOTIFY_OK;
1503 }
1504 
1505 static struct notifier_block rapl_pm_notifier = {
1506     .notifier_call = rapl_pm_callback,
1507 };
1508 
1509 static struct platform_device *rapl_msr_platdev;
1510 
1511 static int __init rapl_init(void)
1512 {
1513     const struct x86_cpu_id *id;
1514     int ret;
1515 
1516     id = x86_match_cpu(rapl_ids);
1517     if (!id) {
1518         pr_err("driver does not support CPU family %d model %d\n",
1519                boot_cpu_data.x86, boot_cpu_data.x86_model);
1520 
1521         return -ENODEV;
1522     }
1523 
1524     rapl_defaults = (struct rapl_defaults *)id->driver_data;
1525 
1526     ret = register_pm_notifier(&rapl_pm_notifier);
1527     if (ret)
1528         return ret;
1529 
1530     rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0);
1531     if (!rapl_msr_platdev) {
1532         ret = -ENOMEM;
1533         goto end;
1534     }
1535 
1536     ret = platform_device_add(rapl_msr_platdev);
1537     if (ret)
1538         platform_device_put(rapl_msr_platdev);
1539 
1540 end:
1541     if (ret)
1542         unregister_pm_notifier(&rapl_pm_notifier);
1543 
1544     return ret;
1545 }
1546 
1547 static void __exit rapl_exit(void)
1548 {
1549     platform_device_unregister(rapl_msr_platdev);
1550     unregister_pm_notifier(&rapl_pm_notifier);
1551 }
1552 
1553 fs_initcall(rapl_init);
1554 module_exit(rapl_exit);
1555 
1556 MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code");
1557 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
1558 MODULE_LICENSE("GPL v2");