// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * POWERNV cpufreq driver for the IBM POWER processors
 *
 * (C) Copyright IBM 2014
 *
 * Author: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
 */

#define pr_fmt(fmt) "powernv-cpufreq: " fmt

#include <linux/kernel.h>
#include <linux/sysfs.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/cpufreq.h>
#include <linux/smp.h>
#include <linux/of.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/hashtable.h>
#include <trace/events/power.h>

#include <asm/cputhreads.h>
#include <asm/firmware.h>
#include <asm/reg.h>
#include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
#include <asm/opal.h>
#include <linux/timer.h>

#define POWERNV_MAX_PSTATES_ORDER  8
#define POWERNV_MAX_PSTATES (1UL << (POWERNV_MAX_PSTATES_ORDER))
#define PMSR_PSAFE_ENABLE   (1UL << 30)
#define PMSR_SPR_EM_DISABLE (1UL << 31)
#define MAX_PSTATE_SHIFT    32
#define LPSTATE_SHIFT       48
#define GPSTATE_SHIFT       56
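/*
 * The shifts above pick out 8-bit pstate fields from a 64-bit power
 * management SPR value: the max pstate at bits 32..39, the local pstate
 * at bits 48..55 and the global pstate at bits 56..63 (see
 * extract_pstate() below).
 */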
#define MAX_NR_CHIPS        32

#define MAX_RAMP_DOWN_TIME              5120
/*
 * On an idle system we want the global pstate to ramp down from the max
 * value to the min over a span of ~5 seconds. We also want the ramp-down
 * to start slowly and then speed up later on.
 *
 * This gives the percentage ramp-down for the time elapsed in
 * milliseconds:
 * ramp_down_percent = ((ms * ms) >> 18)
 *          ~= 3.8 * (sec * sec)
 *
 * At 0 ms    ramp_down_percent = 0
 * At 5120 ms ramp_down_percent = 100
 */
#define ramp_down_percent(time)     ((time * time) >> 18)
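/*
 * For example, at 2000 ms: (2000 * 2000) >> 18 = 4000000 / 262144 ~= 15
 * percent; at 5120 ms: (5120 * 5120) >> 18 = 26214400 / 262144 = 100
 * percent, which is why MAX_RAMP_DOWN_TIME is 5120.
 */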

/* Interval after which the timer is queued to bring down global pstate */
#define GPSTATE_TIMER_INTERVAL              2000

/**
 * struct global_pstate_info - Per-policy data structure to maintain history
 *              of global pstates
 * @highest_lpstate_idx:    The local pstate index from which we are
 *              ramping down
 * @elapsed_time:       Time in ms spent in ramping down from
 *              highest_lpstate_idx
 * @last_sampled_time:      Time from boot in ms when global pstates were
 *              last set
 * @last_lpstate_idx:       Last set value of the local pstate, as a
 *              cpufreq table index
 * @last_gpstate_idx:       Last set value of the global pstate, as a
 *              cpufreq table index
 * @timer:          Used for ramping down if the cpu goes idle for
 *              a long time with the global pstate held high
 * @gpstate_lock:       A spinlock to maintain synchronization between
 *              routines called by the timer handler and the
 *              governor's target_index calls
 * @policy:         Associated CPUFreq policy
 */
struct global_pstate_info {
    int highest_lpstate_idx;
    unsigned int elapsed_time;
    unsigned int last_sampled_time;
    int last_lpstate_idx;
    int last_gpstate_idx;
    spinlock_t gpstate_lock;
    struct timer_list timer;
    struct cpufreq_policy *policy;
};

static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
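/*
 * The extra (+1) slot is for the CPUFREQ_TABLE_END marker filled in by
 * init_powernv_pstates() below.
 */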

static DEFINE_HASHTABLE(pstate_revmap, POWERNV_MAX_PSTATES_ORDER);
/**
 * struct pstate_idx_revmap_data: Entry in the hashmap pstate_revmap
 *                indexed by a function of pstate id.
 *
 * @pstate_id: pstate id for this entry.
 *
 * @cpufreq_table_idx: Index into the powernv_freqs
 *             cpufreq_frequency_table for frequency
 *             corresponding to pstate_id.
 *
 * @hentry: hlist_node that hooks this entry into the pstate_revmap
 *      hashtable
 */
struct pstate_idx_revmap_data {
    u8 pstate_id;
    unsigned int cpufreq_table_idx;
    struct hlist_node hentry;
};
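/*
 * Note that since pstate ids are stored as u8 and POWERNV_MAX_PSTATES is
 * 1 << 8 = 256, the hash key (pstate_id % POWERNV_MAX_PSTATES) used by
 * pstate_to_idx() below reduces to the pstate id itself.
 */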

static bool rebooting, throttled, occ_reset;

static const char * const throttle_reason[] = {
    "No throttling",
    "Power Cap",
    "Processor Over Temperature",
    "Power Supply Failure",
    "Over Current",
    "OCC Reset"
};

enum throttle_reason_type {
    NO_THROTTLE = 0,
    POWERCAP,
    CPU_OVERTEMP,
    POWER_SUPPLY_FAILURE,
    OVERCURRENT,
    OCC_RESET_THROTTLE,
    OCC_MAX_REASON
};

static struct chip {
    unsigned int id;
    bool throttled;
    bool restore;
    u8 throttle_reason;
    cpumask_t mask;
    struct work_struct throttle;
    int throttle_turbo;
    int throttle_sub_turbo;
    int reason[OCC_MAX_REASON];
} *chips;

static int nr_chips;
static DEFINE_PER_CPU(struct chip *, chip_info);

/*
 * Note:
 * The set of pstates consists of contiguous integers.
 * powernv_pstate_info stores the frequency-table indices of the
 * max, min and nominal frequencies. It also stores the number of
 * available frequencies.
 *
 * powernv_pstate_info.nominal indicates the index of the highest
 * non-turbo frequency.
 */
static struct powernv_pstate_info {
    unsigned int min;
    unsigned int max;
    unsigned int nominal;
    unsigned int nr_pstates;
    bool wof_enabled;
} powernv_pstate_info;

static inline u8 extract_pstate(u64 pmsr_val, unsigned int shift)
{
    return ((pmsr_val >> shift) & 0xFF);
}

#define extract_local_pstate(x) extract_pstate(x, LPSTATE_SHIFT)
#define extract_global_pstate(x) extract_pstate(x, GPSTATE_SHIFT)
#define extract_max_pstate(x)  extract_pstate(x, MAX_PSTATE_SHIFT)

/* Use the following functions for conversions between pstate_id and index */

/*
 * idx_to_pstate : Returns the pstate id corresponding to the
 *         frequency in the cpufreq frequency table
 *         powernv_freqs indexed by @i.
 *
 *         If @i is out of bounds, this will return the pstate
 *         corresponding to the nominal frequency.
 */
static inline u8 idx_to_pstate(unsigned int i)
{
    if (unlikely(i >= powernv_pstate_info.nr_pstates)) {
        pr_warn_once("idx_to_pstate: index %u is out of bounds\n", i);
        return powernv_freqs[powernv_pstate_info.nominal].driver_data;
    }

    return powernv_freqs[i].driver_data;
}

/*
 * pstate_to_idx : Returns the index in the cpufreq frequency table
 *         powernv_freqs for the frequency whose corresponding
 *         pstate id is @pstate.
 *
 *         If no frequency corresponding to @pstate is found,
 *         this will return the index of the nominal
 *         frequency.
 */
static unsigned int pstate_to_idx(u8 pstate)
{
    unsigned int key = pstate % POWERNV_MAX_PSTATES;
    struct pstate_idx_revmap_data *revmap_data;

    hash_for_each_possible(pstate_revmap, revmap_data, hentry, key) {
        if (revmap_data->pstate_id == pstate)
            return revmap_data->cpufreq_table_idx;
    }

    pr_warn_once("pstate_to_idx: pstate 0x%x not found\n", pstate);
    return powernv_pstate_info.nominal;
}

static inline void reset_gpstates(struct cpufreq_policy *policy)
{
    struct global_pstate_info *gpstates = policy->driver_data;

    gpstates->highest_lpstate_idx = 0;
    gpstates->elapsed_time = 0;
    gpstates->last_sampled_time = 0;
    gpstates->last_lpstate_idx = 0;
    gpstates->last_gpstate_idx = 0;
}

/*
 * Initialize the freq table based on data obtained
 * from the firmware passed via device-tree
 */
static int init_powernv_pstates(void)
{
    struct device_node *power_mgt;
    int i, nr_pstates = 0;
    const __be32 *pstate_ids, *pstate_freqs;
    u32 len_ids, len_freqs;
    u32 pstate_min, pstate_max, pstate_nominal;
    u32 pstate_turbo, pstate_ultra_turbo;
    int rc = -ENODEV;

    power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
    if (!power_mgt) {
        pr_warn("power-mgt node not found\n");
        return -ENODEV;
    }

    if (of_property_read_u32(power_mgt, "ibm,pstate-min", &pstate_min)) {
        pr_warn("ibm,pstate-min node not found\n");
        goto out;
    }

    if (of_property_read_u32(power_mgt, "ibm,pstate-max", &pstate_max)) {
        pr_warn("ibm,pstate-max node not found\n");
        goto out;
    }

    if (of_property_read_u32(power_mgt, "ibm,pstate-nominal",
                 &pstate_nominal)) {
        pr_warn("ibm,pstate-nominal not found\n");
        goto out;
    }

    if (of_property_read_u32(power_mgt, "ibm,pstate-ultra-turbo",
                 &pstate_ultra_turbo)) {
        powernv_pstate_info.wof_enabled = false;
        goto next;
    }

    if (of_property_read_u32(power_mgt, "ibm,pstate-turbo",
                 &pstate_turbo)) {
        powernv_pstate_info.wof_enabled = false;
        goto next;
    }

    if (pstate_turbo == pstate_ultra_turbo)
        powernv_pstate_info.wof_enabled = false;
    else
        powernv_pstate_info.wof_enabled = true;

next:
    pr_info("cpufreq pstate min 0x%x nominal 0x%x max 0x%x\n", pstate_min,
        pstate_nominal, pstate_max);
    pr_info("Workload Optimized Frequency is %s in the platform\n",
        (powernv_pstate_info.wof_enabled) ? "enabled" : "disabled");

    pstate_ids = of_get_property(power_mgt, "ibm,pstate-ids", &len_ids);
    if (!pstate_ids) {
        pr_warn("ibm,pstate-ids not found\n");
        goto out;
    }

    pstate_freqs = of_get_property(power_mgt, "ibm,pstate-frequencies-mhz",
                      &len_freqs);
    if (!pstate_freqs) {
        pr_warn("ibm,pstate-frequencies-mhz not found\n");
        goto out;
    }

    if (len_ids != len_freqs) {
        pr_warn("Entries in ibm,pstate-ids and ibm,pstate-frequencies-mhz do not match\n");
    }

    nr_pstates = min(len_ids, len_freqs) / sizeof(u32);
    if (!nr_pstates) {
        pr_warn("No PStates found\n");
        goto out;
    }

    powernv_pstate_info.nr_pstates = nr_pstates;
    pr_debug("NR PStates %d\n", nr_pstates);

    for (i = 0; i < nr_pstates; i++) {
        u32 id = be32_to_cpu(pstate_ids[i]);
        u32 freq = be32_to_cpu(pstate_freqs[i]);
        struct pstate_idx_revmap_data *revmap_data;
        unsigned int key;

        pr_debug("PState id %d freq %d MHz\n", id, freq);
        powernv_freqs[i].frequency = freq * 1000; /* kHz */
        powernv_freqs[i].driver_data = id & 0xFF;

        revmap_data = kmalloc(sizeof(*revmap_data), GFP_KERNEL);
        if (!revmap_data) {
            rc = -ENOMEM;
            goto out;
        }

        revmap_data->pstate_id = id & 0xFF;
        revmap_data->cpufreq_table_idx = i;
        key = (revmap_data->pstate_id) % POWERNV_MAX_PSTATES;
        hash_add(pstate_revmap, &revmap_data->hentry, key);

        if (id == pstate_max)
            powernv_pstate_info.max = i;
        if (id == pstate_nominal)
            powernv_pstate_info.nominal = i;
        if (id == pstate_min)
            powernv_pstate_info.min = i;

        if (powernv_pstate_info.wof_enabled && id == pstate_turbo) {
            int j;

            for (j = i - 1; j >= (int)powernv_pstate_info.max; j--)
                powernv_freqs[j].flags = CPUFREQ_BOOST_FREQ;
        }
    }

    /* End of list marker entry */
    powernv_freqs[i].frequency = CPUFREQ_TABLE_END;

    of_node_put(power_mgt);
    return 0;
out:
    of_node_put(power_mgt);
    return rc;
}

/* Returns the CPU frequency corresponding to the pstate_id. */
static unsigned int pstate_id_to_freq(u8 pstate_id)
{
    int i;

    i = pstate_to_idx(pstate_id);
    if (i >= powernv_pstate_info.nr_pstates || i < 0) {
        pr_warn("PState id 0x%x outside of PState table, reporting nominal id 0x%x instead\n",
            pstate_id, idx_to_pstate(powernv_pstate_info.nominal));
        i = powernv_pstate_info.nominal;
    }

    return powernv_freqs[i].frequency;
}

/*
 * cpuinfo_nominal_freq_show - Show the nominal CPU frequency as indicated by
 * the firmware
 */
static ssize_t cpuinfo_nominal_freq_show(struct cpufreq_policy *policy,
                    char *buf)
{
    return sprintf(buf, "%u\n",
        powernv_freqs[powernv_pstate_info.nominal].frequency);
}

static struct freq_attr cpufreq_freq_attr_cpuinfo_nominal_freq =
    __ATTR_RO(cpuinfo_nominal_freq);

#define SCALING_BOOST_FREQS_ATTR_INDEX      2

static struct freq_attr *powernv_cpu_freq_attr[] = {
    &cpufreq_freq_attr_scaling_available_freqs,
    &cpufreq_freq_attr_cpuinfo_nominal_freq,
    &cpufreq_freq_attr_scaling_boost_freqs,
    NULL,
};

#define throttle_attr(name, member)                 \
static ssize_t name##_show(struct cpufreq_policy *policy, char *buf)    \
{                                   \
    struct chip *chip = per_cpu(chip_info, policy->cpu);        \
                                    \
    return sprintf(buf, "%u\n", chip->member);          \
}                                   \
                                    \
static struct freq_attr throttle_attr_##name = __ATTR_RO(name)      \

throttle_attr(unthrottle, reason[NO_THROTTLE]);
throttle_attr(powercap, reason[POWERCAP]);
throttle_attr(overtemp, reason[CPU_OVERTEMP]);
throttle_attr(supply_fault, reason[POWER_SUPPLY_FAILURE]);
throttle_attr(overcurrent, reason[OVERCURRENT]);
throttle_attr(occ_reset, reason[OCC_RESET_THROTTLE]);
throttle_attr(turbo_stat, throttle_turbo);
throttle_attr(sub_turbo_stat, throttle_sub_turbo);
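/*
 * As an illustration of the macro above, throttle_attr(powercap,
 * reason[POWERCAP]) expands to a powercap_show() routine that prints
 * chip->reason[POWERCAP] and to a read-only freq_attr named
 * throttle_attr_powercap.
 */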

static struct attribute *throttle_attrs[] = {
    &throttle_attr_unthrottle.attr,
    &throttle_attr_powercap.attr,
    &throttle_attr_overtemp.attr,
    &throttle_attr_supply_fault.attr,
    &throttle_attr_overcurrent.attr,
    &throttle_attr_occ_reset.attr,
    &throttle_attr_turbo_stat.attr,
    &throttle_attr_sub_turbo_stat.attr,
    NULL,
};

static const struct attribute_group throttle_attr_grp = {
    .name   = "throttle_stats",
    .attrs  = throttle_attrs,
};

/* Helper routines */

/* Access helpers to power mgt SPR */

static inline unsigned long get_pmspr(unsigned long sprn)
{
    switch (sprn) {
    case SPRN_PMCR:
        return mfspr(SPRN_PMCR);

    case SPRN_PMICR:
        return mfspr(SPRN_PMICR);

    case SPRN_PMSR:
        return mfspr(SPRN_PMSR);
    }
    BUG();
}

static inline void set_pmspr(unsigned long sprn, unsigned long val)
{
    switch (sprn) {
    case SPRN_PMCR:
        mtspr(SPRN_PMCR, val);
        return;

    case SPRN_PMICR:
        mtspr(SPRN_PMICR, val);
        return;
    }
    BUG();
}

/*
 * Use objects of this type to query/update
 * pstates on a remote CPU via smp_call_function.
 */
struct powernv_smp_call_data {
    unsigned int freq;
    u8 pstate_id;
    u8 gpstate_id;
};

/*
 * powernv_read_cpu_freq: Reads the current frequency on this CPU.
 *
 * Called via smp_call_function.
 *
 * Note: The caller of the smp_call_function should pass an argument of
 * the type 'struct powernv_smp_call_data *' along with this function.
 *
 * The current frequency on this CPU will be returned via
 * ((struct powernv_smp_call_data *)arg)->freq;
 */
static void powernv_read_cpu_freq(void *arg)
{
    unsigned long pmspr_val;
    struct powernv_smp_call_data *freq_data = arg;

    pmspr_val = get_pmspr(SPRN_PMSR);
    freq_data->pstate_id = extract_local_pstate(pmspr_val);
    freq_data->freq = pstate_id_to_freq(freq_data->pstate_id);

    pr_debug("cpu %d pmsr %016lX pstate_id 0x%x frequency %d kHz\n",
         raw_smp_processor_id(), pmspr_val, freq_data->pstate_id,
         freq_data->freq);
}

/*
 * powernv_cpufreq_get: Returns the CPU frequency as reported by the
 * firmware for CPU 'cpu'. This value is reported through the sysfs
 * file cpuinfo_cur_freq.
 */
static unsigned int powernv_cpufreq_get(unsigned int cpu)
{
    struct powernv_smp_call_data freq_data;

    smp_call_function_any(cpu_sibling_mask(cpu), powernv_read_cpu_freq,
            &freq_data, 1);

    return freq_data.freq;
}

/*
 * set_pstate: Sets the pstate on this CPU.
 *
 * This is called via an smp_call_function.
 *
 * The caller must ensure that freq_data is of the type
 * (struct powernv_smp_call_data *) and the pstate_id which needs to be set
 * on this CPU should be present in freq_data->pstate_id.
 */
static void set_pstate(void *data)
{
    unsigned long val;
    struct powernv_smp_call_data *freq_data = data;
    unsigned long pstate_ul = freq_data->pstate_id;
    unsigned long gpstate_ul = freq_data->gpstate_id;

    val = get_pmspr(SPRN_PMCR);
    val = val & 0x0000FFFFFFFFFFFFULL;

    pstate_ul = pstate_ul & 0xFF;
    gpstate_ul = gpstate_ul & 0xFF;

    /* Set both global(bits 56..63) and local(bits 48..55) PStates */
    val = val | (gpstate_ul << 56) | (pstate_ul << 48);

    pr_debug("Setting cpu %d pmcr to %016lX\n",
            raw_smp_processor_id(), val);
    set_pmspr(SPRN_PMCR, val);
}
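/*
 * For instance, with a (hypothetical) local pstate id of 0x05 and a
 * global pstate id of 0x03, the sequence above clears the top 16 bits
 * of the previous PMCR value and ORs in 0x0305000000000000ULL.
 */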

/*
 * get_nominal_index: Returns the index corresponding to the nominal
 * pstate in the cpufreq table
 */
static inline unsigned int get_nominal_index(void)
{
    return powernv_pstate_info.nominal;
}

static void powernv_cpufreq_throttle_check(void *data)
{
    struct chip *chip;
    unsigned int cpu = smp_processor_id();
    unsigned long pmsr;
    u8 pmsr_pmax;
    unsigned int pmsr_pmax_idx;

    pmsr = get_pmspr(SPRN_PMSR);
    chip = this_cpu_read(chip_info);

    /* Check for Pmax Capping */
    pmsr_pmax = extract_max_pstate(pmsr);
    pmsr_pmax_idx = pstate_to_idx(pmsr_pmax);
    if (pmsr_pmax_idx != powernv_pstate_info.max) {
        if (chip->throttled)
            goto next;
        chip->throttled = true;
        if (pmsr_pmax_idx > powernv_pstate_info.nominal) {
            pr_warn_once("CPU %d on Chip %u has Pmax(0x%x) reduced below that of nominal frequency(0x%x)\n",
                     cpu, chip->id, pmsr_pmax,
                     idx_to_pstate(powernv_pstate_info.nominal));
            chip->throttle_sub_turbo++;
        } else {
            chip->throttle_turbo++;
        }
        trace_powernv_throttle(chip->id,
                      throttle_reason[chip->throttle_reason],
                      pmsr_pmax);
    } else if (chip->throttled) {
        chip->throttled = false;
        trace_powernv_throttle(chip->id,
                      throttle_reason[chip->throttle_reason],
                      pmsr_pmax);
    }

    /* Check if Psafe_mode_active is set in PMSR. */
next:
    if (pmsr & PMSR_PSAFE_ENABLE) {
        throttled = true;
        pr_info("Pstate set to safe frequency\n");
    }

    /* Check if SPR_EM_DISABLE is set in PMSR */
    if (pmsr & PMSR_SPR_EM_DISABLE) {
        throttled = true;
        pr_info("Frequency Control disabled from OS\n");
    }

    if (throttled) {
        pr_info("PMSR = %16lx\n", pmsr);
        pr_warn("CPU Frequency could be throttled\n");
    }
}

/**
 * calc_global_pstate - Calculate global pstate
 * @elapsed_time:       Elapsed time in milliseconds
 * @highest_lpstate_idx:    pstate index from which we are ramping down
 * @local_pstate_idx:       New local pstate
 *
 * Finds the appropriate global pstate based on the pstate index from which
 * we are ramping down and the time elapsed while ramping down. It follows a
 * quadratic equation which ensures that the global pstate reaches pmin
 * within 5 seconds.
 */
static inline int calc_global_pstate(unsigned int elapsed_time,
                     int highest_lpstate_idx,
                     int local_pstate_idx)
{
    int index_diff;

    /*
     * ramp_down_percent gives the percentage of the ramp-down we expect
     * to have covered so far. The difference between highest_lpstate_idx
     * and powernv_pstate_info.min gives the absolute number of pstates we
     * will eventually drop by the end of the 5 seconds; scale it by that
     * percentage to get the number of pstates to drop now.
     */
    index_diff = ((int)ramp_down_percent(elapsed_time) *
            (powernv_pstate_info.min - highest_lpstate_idx)) / 100;

    /* Ensure that the global pstate is >= the local pstate */
    if (highest_lpstate_idx + index_diff >= local_pstate_idx)
        return local_pstate_idx;
    else
        return highest_lpstate_idx + index_diff;
}
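/*
 * Worked example with hypothetical indices: if highest_lpstate_idx is 0
 * and powernv_pstate_info.min is 40, then 2000 ms into the ramp-down
 * ramp_down_percent() is 15, so index_diff = (15 * 40) / 100 = 6 and the
 * global pstate has moved 6 table entries towards the minimum (unless
 * the local pstate index is already lower than that).
 */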

static inline void queue_gpstate_timer(struct global_pstate_info *gpstates)
{
    unsigned int timer_interval;

    /*
     * Set up the timer to fire after GPSTATE_TIMER_INTERVAL ms, but if
     * that would take the total ramp-down time past MAX_RAMP_DOWN_TIME
     * ms, set the timer such that it fires exactly when
     * MAX_RAMP_DOWN_TIME ms of ramp-down time have elapsed.
     */
    if ((gpstates->elapsed_time + GPSTATE_TIMER_INTERVAL)
         > MAX_RAMP_DOWN_TIME)
        timer_interval = MAX_RAMP_DOWN_TIME - gpstates->elapsed_time;
    else
        timer_interval = GPSTATE_TIMER_INTERVAL;

    mod_timer(&gpstates->timer, jiffies + msecs_to_jiffies(timer_interval));
}
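/*
 * E.g. if 4000 ms of ramp-down time have already elapsed, the next timer
 * fires after 5120 - 4000 = 1120 ms rather than the usual 2000 ms.
 */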

/**
 * gpstate_timer_handler
 *
 * @t: Timer context used to fetch global pstate info struct
 *
 * This handler brings down the global pstate closer to the local pstate
 * according to a quadratic equation. Queues a new timer if the global
 * pstate is still not equal to the local pstate.
 */
static void gpstate_timer_handler(struct timer_list *t)
{
    struct global_pstate_info *gpstates = from_timer(gpstates, t, timer);
    struct cpufreq_policy *policy = gpstates->policy;
    int gpstate_idx, lpstate_idx;
    unsigned long val;
    unsigned int time_diff = jiffies_to_msecs(jiffies)
                    - gpstates->last_sampled_time;
    struct powernv_smp_call_data freq_data;

    if (!spin_trylock(&gpstates->gpstate_lock))
        return;
    /*
     * If the timer has migrated to a different cpu then bring
     * it back to one of the policy->cpus
     */
    if (!cpumask_test_cpu(raw_smp_processor_id(), policy->cpus)) {
        gpstates->timer.expires = jiffies + msecs_to_jiffies(1);
        add_timer_on(&gpstates->timer, cpumask_first(policy->cpus));
        spin_unlock(&gpstates->gpstate_lock);
        return;
    }

    /*
     * If PMCR was last updated using fast_switch then
     * gpstates->last_lpstate_idx may hold a stale value.
     * Hence, read from PMCR to get correct data.
     */
    val = get_pmspr(SPRN_PMCR);
    freq_data.gpstate_id = extract_global_pstate(val);
    freq_data.pstate_id = extract_local_pstate(val);
    if (freq_data.gpstate_id == freq_data.pstate_id) {
        reset_gpstates(policy);
        spin_unlock(&gpstates->gpstate_lock);
        return;
    }

    gpstates->last_sampled_time += time_diff;
    gpstates->elapsed_time += time_diff;

    if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
        gpstate_idx = pstate_to_idx(freq_data.pstate_id);
        lpstate_idx = gpstate_idx;
        reset_gpstates(policy);
        gpstates->highest_lpstate_idx = gpstate_idx;
    } else {
        lpstate_idx = pstate_to_idx(freq_data.pstate_id);
        gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
                         gpstates->highest_lpstate_idx,
                         lpstate_idx);
    }
    freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
    gpstates->last_gpstate_idx = gpstate_idx;
    gpstates->last_lpstate_idx = lpstate_idx;
    /*
     * If the local pstate is equal to the global pstate, the ramp-down
     * is over, so the timer need not be queued.
     */
    if (gpstate_idx != gpstates->last_lpstate_idx)
        queue_gpstate_timer(gpstates);

    set_pstate(&freq_data);
    spin_unlock(&gpstates->gpstate_lock);
}

/*
 * powernv_cpufreq_target_index: Sets the frequency corresponding to
 * the cpufreq table entry indexed by new_index on the cpus in the
 * mask policy->cpus
 */
static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
                    unsigned int new_index)
{
    struct powernv_smp_call_data freq_data;
    unsigned int cur_msec, gpstate_idx;
    struct global_pstate_info *gpstates = policy->driver_data;

    if (unlikely(rebooting) && new_index != get_nominal_index())
        return 0;

    if (!throttled) {
        /*
         * We don't want to be preempted while checking
         * if the CPU frequency has been throttled.
         */
        preempt_disable();
        powernv_cpufreq_throttle_check(NULL);
        preempt_enable();
    }

    cur_msec = jiffies_to_msecs(get_jiffies_64());

    freq_data.pstate_id = idx_to_pstate(new_index);
    if (!gpstates) {
        freq_data.gpstate_id = freq_data.pstate_id;
        goto no_gpstate;
    }

    spin_lock(&gpstates->gpstate_lock);

    if (!gpstates->last_sampled_time) {
        gpstate_idx = new_index;
        gpstates->highest_lpstate_idx = new_index;
        goto gpstates_done;
    }

    if (gpstates->last_gpstate_idx < new_index) {
        gpstates->elapsed_time += cur_msec -
                         gpstates->last_sampled_time;

        /*
         * If it has been ramping down for more than MAX_RAMP_DOWN_TIME,
         * reset all global pstate related data and set it equal to the
         * local pstate to start fresh.
         */
        if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
            reset_gpstates(policy);
            gpstates->highest_lpstate_idx = new_index;
            gpstate_idx = new_index;
        } else {
            /* Elapsed time is less than 5 seconds; continue to ramp down */
            gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
                             gpstates->highest_lpstate_idx,
                             new_index);
        }
    } else {
        reset_gpstates(policy);
        gpstates->highest_lpstate_idx = new_index;
        gpstate_idx = new_index;
    }

    /*
     * If the local pstate is equal to the global pstate, the ramp-down
     * is over, so the timer need not be queued.
     */
    if (gpstate_idx != new_index)
        queue_gpstate_timer(gpstates);
    else
        del_timer_sync(&gpstates->timer);

gpstates_done:
    freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
    gpstates->last_sampled_time = cur_msec;
    gpstates->last_gpstate_idx = gpstate_idx;
    gpstates->last_lpstate_idx = new_index;

    spin_unlock(&gpstates->gpstate_lock);

no_gpstate:
    /*
     * Use smp_call_function to send an IPI and execute the
     * mtspr on the target CPU. We could avoid the IPI if the
     * current CPU is within policy->cpus (i.e. on the same core).
     */
    smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
    return 0;
}

static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
{
    int base, i;
    struct kernfs_node *kn;
    struct global_pstate_info *gpstates;

    base = cpu_first_thread_sibling(policy->cpu);

    for (i = 0; i < threads_per_core; i++)
        cpumask_set_cpu(base + i, policy->cpus);

    kn = kernfs_find_and_get(policy->kobj.sd, throttle_attr_grp.name);
    if (!kn) {
        int ret;

        ret = sysfs_create_group(&policy->kobj, &throttle_attr_grp);
        if (ret) {
            pr_info("Failed to create throttle stats directory for cpu %d\n",
                policy->cpu);
            return ret;
        }
    } else {
        kernfs_put(kn);
    }

    policy->freq_table = powernv_freqs;
    policy->fast_switch_possible = true;

    if (pvr_version_is(PVR_POWER9))
        return 0;

    /* Initialise Gpstate ramp-down timer only on POWER8 */
    gpstates = kzalloc(sizeof(*gpstates), GFP_KERNEL);
    if (!gpstates)
        return -ENOMEM;

    policy->driver_data = gpstates;

    /* initialize timer */
    gpstates->policy = policy;
    timer_setup(&gpstates->timer, gpstate_timer_handler,
            TIMER_PINNED | TIMER_DEFERRABLE);
    gpstates->timer.expires = jiffies +
                msecs_to_jiffies(GPSTATE_TIMER_INTERVAL);
    spin_lock_init(&gpstates->gpstate_lock);

    return 0;
}

static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
{
    struct powernv_smp_call_data freq_data;
    struct global_pstate_info *gpstates = policy->driver_data;

    freq_data.pstate_id = idx_to_pstate(powernv_pstate_info.min);
    freq_data.gpstate_id = idx_to_pstate(powernv_pstate_info.min);
    smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1);
    if (gpstates)
        del_timer_sync(&gpstates->timer);

    kfree(policy->driver_data);

    return 0;
}

static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb,
                unsigned long action, void *unused)
{
    int cpu;
    struct cpufreq_policy *cpu_policy;

    rebooting = true;
    for_each_online_cpu(cpu) {
        cpu_policy = cpufreq_cpu_get(cpu);
        if (!cpu_policy)
            continue;
        powernv_cpufreq_target_index(cpu_policy, get_nominal_index());
        cpufreq_cpu_put(cpu_policy);
    }

    return NOTIFY_DONE;
}

static struct notifier_block powernv_cpufreq_reboot_nb = {
    .notifier_call = powernv_cpufreq_reboot_notifier,
};

static void powernv_cpufreq_work_fn(struct work_struct *work)
{
    struct chip *chip = container_of(work, struct chip, throttle);
    struct cpufreq_policy *policy;
    unsigned int cpu;
    cpumask_t mask;

    cpus_read_lock();
    cpumask_and(&mask, &chip->mask, cpu_online_mask);
    smp_call_function_any(&mask,
                  powernv_cpufreq_throttle_check, NULL, 0);

    if (!chip->restore)
        goto out;

    chip->restore = false;
    for_each_cpu(cpu, &mask) {
        int index;

        policy = cpufreq_cpu_get(cpu);
        if (!policy)
            continue;
        index = cpufreq_table_find_index_c(policy, policy->cur, false);
        powernv_cpufreq_target_index(policy, index);
        cpumask_andnot(&mask, &mask, policy->cpus);
        cpufreq_cpu_put(policy);
    }
out:
    cpus_read_unlock();
}

static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
                   unsigned long msg_type, void *_msg)
{
    struct opal_msg *msg = _msg;
    struct opal_occ_msg omsg;
    int i;

    if (msg_type != OPAL_MSG_OCC)
        return 0;

    omsg.type = be64_to_cpu(msg->params[0]);

    switch (omsg.type) {
    case OCC_RESET:
        occ_reset = true;
        pr_info("OCC (On Chip Controller - enforces hard thermal/power limits) Resetting\n");
        /*
         * powernv_cpufreq_throttle_check() is called from the
         * target() callback, which can detect the throttle state
         * for governors like ondemand.
         * But static governors will not call target() often, so
         * report the throttling here.
         */
        if (!throttled) {
            throttled = true;
            pr_warn("CPU frequency is throttled for duration\n");
        }

        break;
    case OCC_LOAD:
        pr_info("OCC Loading, CPU frequency is throttled until OCC is started\n");
        break;
    case OCC_THROTTLE:
        omsg.chip = be64_to_cpu(msg->params[1]);
        omsg.throttle_status = be64_to_cpu(msg->params[2]);

        if (occ_reset) {
            occ_reset = false;
            throttled = false;
            pr_info("OCC Active, CPU frequency is no longer throttled\n");

            for (i = 0; i < nr_chips; i++) {
                chips[i].restore = true;
                schedule_work(&chips[i].throttle);
            }

            return 0;
        }

        for (i = 0; i < nr_chips; i++)
            if (chips[i].id == omsg.chip)
                break;

        if (omsg.throttle_status >= 0 &&
            omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) {
            chips[i].throttle_reason = omsg.throttle_status;
            chips[i].reason[omsg.throttle_status]++;
        }

        if (!omsg.throttle_status)
            chips[i].restore = true;

        schedule_work(&chips[i].throttle);
    }
    return 0;
}

static struct notifier_block powernv_cpufreq_opal_nb = {
    .notifier_call  = powernv_cpufreq_occ_msg,
    .next       = NULL,
    .priority   = 0,
};

static unsigned int powernv_fast_switch(struct cpufreq_policy *policy,
                    unsigned int target_freq)
{
    int index;
    struct powernv_smp_call_data freq_data;

    index = cpufreq_table_find_index_dl(policy, target_freq, false);
    freq_data.pstate_id = powernv_freqs[index].driver_data;
    freq_data.gpstate_id = powernv_freqs[index].driver_data;
    set_pstate(&freq_data);

    return powernv_freqs[index].frequency;
}
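/*
 * Note that fast_switch sets the local and global pstates to the same
 * value without touching the gpstate ramp-down bookkeeping, which is why
 * gpstate_timer_handler() re-reads PMCR rather than trusting
 * gpstates->last_lpstate_idx.
 */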

static struct cpufreq_driver powernv_cpufreq_driver = {
    .name       = "powernv-cpufreq",
    .flags      = CPUFREQ_CONST_LOOPS,
    .init       = powernv_cpufreq_cpu_init,
    .exit       = powernv_cpufreq_cpu_exit,
    .verify     = cpufreq_generic_frequency_table_verify,
    .target_index   = powernv_cpufreq_target_index,
    .fast_switch    = powernv_fast_switch,
    .get        = powernv_cpufreq_get,
    .attr       = powernv_cpu_freq_attr,
};

static int init_chip_info(void)
{
    unsigned int *chip;
    unsigned int cpu, i;
    unsigned int prev_chip_id = UINT_MAX;
    cpumask_t *chip_cpu_mask;
    int ret = 0;

    chip = kcalloc(num_possible_cpus(), sizeof(*chip), GFP_KERNEL);
    if (!chip)
        return -ENOMEM;

    /* Allocate a chip cpu mask large enough to fit mask for all chips */
    chip_cpu_mask = kcalloc(MAX_NR_CHIPS, sizeof(cpumask_t), GFP_KERNEL);
    if (!chip_cpu_mask) {
        ret = -ENOMEM;
        goto free_and_return;
    }

    for_each_possible_cpu(cpu) {
        unsigned int id = cpu_to_chip_id(cpu);

        if (prev_chip_id != id) {
            prev_chip_id = id;
            chip[nr_chips++] = id;
        }
        cpumask_set_cpu(cpu, &chip_cpu_mask[nr_chips-1]);
    }

    chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL);
    if (!chips) {
        ret = -ENOMEM;
        goto out_free_chip_cpu_mask;
    }

    for (i = 0; i < nr_chips; i++) {
        chips[i].id = chip[i];
        cpumask_copy(&chips[i].mask, &chip_cpu_mask[i]);
        INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
        for_each_cpu(cpu, &chips[i].mask)
            per_cpu(chip_info, cpu) = &chips[i];
    }

out_free_chip_cpu_mask:
    kfree(chip_cpu_mask);
free_and_return:
    kfree(chip);
    return ret;
}

static inline void clean_chip_info(void)
{
    int i;

    /* flush any pending work items */
    if (chips)
        for (i = 0; i < nr_chips; i++)
            cancel_work_sync(&chips[i].throttle);
    kfree(chips);
}

static inline void unregister_all_notifiers(void)
{
    opal_message_notifier_unregister(OPAL_MSG_OCC,
                     &powernv_cpufreq_opal_nb);
    unregister_reboot_notifier(&powernv_cpufreq_reboot_nb);
}

static int __init powernv_cpufreq_init(void)
{
    int rc = 0;

    /* Don't probe on pseries (guest) platforms */
    if (!firmware_has_feature(FW_FEATURE_OPAL))
        return -ENODEV;

    /* Discover pstates from device tree and init */
    rc = init_powernv_pstates();
    if (rc)
        goto out;

    /* Populate chip info */
    rc = init_chip_info();
    if (rc)
        goto out;

    if (powernv_pstate_info.wof_enabled)
        powernv_cpufreq_driver.boost_enabled = true;
    else
        powernv_cpu_freq_attr[SCALING_BOOST_FREQS_ATTR_INDEX] = NULL;

    rc = cpufreq_register_driver(&powernv_cpufreq_driver);
    if (rc) {
        pr_info("Failed to register the cpufreq driver (%d)\n", rc);
        goto cleanup;
    }

    if (powernv_pstate_info.wof_enabled)
        cpufreq_enable_boost_support();

    register_reboot_notifier(&powernv_cpufreq_reboot_nb);
    opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb);

    return 0;
cleanup:
    clean_chip_info();
out:
    pr_info("Platform driver disabled. System does not support PState control\n");
    return rc;
}
module_init(powernv_cpufreq_init);

static void __exit powernv_cpufreq_exit(void)
{
    cpufreq_unregister_driver(&powernv_cpufreq_driver);
    unregister_all_notifiers();
    clean_chip_info();
}
module_exit(powernv_cpufreq_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>");