Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  *  cpuidle-pseries - idle state cpuidle driver.
0004  *  Adapted from drivers/idle/intel_idle.c and
0005  *  drivers/acpi/processor_idle.c
0006  *
0007  */
0008 
0009 #include <linux/kernel.h>
0010 #include <linux/module.h>
0011 #include <linux/init.h>
0012 #include <linux/moduleparam.h>
0013 #include <linux/cpuidle.h>
0014 #include <linux/cpu.h>
0015 #include <linux/notifier.h>
0016 
0017 #include <asm/paca.h>
0018 #include <asm/reg.h>
0019 #include <asm/machdep.h>
0020 #include <asm/firmware.h>
0021 #include <asm/runlatch.h>
0022 #include <asm/idle.h>
0023 #include <asm/plpar_wrappers.h>
0024 #include <asm/rtas.h>
0025 
/*
 * cpuidle driver descriptor for pSeries partitions. The states[] array
 * is populated at boot by pseries_cpuidle_driver_init() from whichever
 * table pseries_idle_probe() selects.
 */
static struct cpuidle_driver pseries_idle_driver = {
	.name             = "pseries_idle",
	.owner            = THIS_MODULE,
};
0030 
/* Number of entries in cpuidle_state_table, set by pseries_idle_probe(). */
static int max_idle_state __read_mostly;
/* Points at either shared_states[] or dedicated_states[]. */
static struct cpuidle_state *cpuidle_state_table __read_mostly;
/* Snooze polling budget, in timebase ticks; consumed by snooze_loop(). */
static u64 snooze_timeout __read_mostly;
/* Set when a deeper state exists, enabling the snooze timeout. */
static bool snooze_timeout_en __read_mostly;
0035 
/*
 * snooze_loop - shallowest idle state: busy-poll need_resched() at low
 * SMT priority so sibling threads get more of the core, exiting once
 * the snooze timeout budget expires (when enabled).
 *
 * Entered with interrupts disabled by the cpuidle core; returns @index,
 * the state actually entered.
 */
static int snooze_loop(struct cpuidle_device *dev,
			struct cpuidle_driver *drv,
			int index)
{
	u64 snooze_exit_time;

	/* Polling flag lets the scheduler avoid sending a resched IPI. */
	set_thread_flag(TIF_POLLING_NRFLAG);

	pseries_idle_prolog();
	local_irq_enable();
	snooze_exit_time = get_tb() + snooze_timeout;

	while (!need_resched()) {
		/* Drop SMT thread priority while spinning. */
		HMT_low();
		HMT_very_low();
		if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) {
			/*
			 * Task has not woken up but we are exiting the polling
			 * loop anyway. Require a barrier after polling is
			 * cleared to order subsequent test of need_resched().
			 */
			clear_thread_flag(TIF_POLLING_NRFLAG);
			smp_mb();
			break;
		}
	}

	/* Restore normal SMT priority before leaving the idle state. */
	HMT_medium();
	clear_thread_flag(TIF_POLLING_NRFLAG);

	local_irq_disable();

	pseries_idle_epilog();

	return index;
}
0072 
/*
 * check_and_cede_processor - cede the virtual processor to the
 * hypervisor via H_CEDE, but only if prep_irq_for_idle() confirms no
 * interrupt already arrived while we were soft-disabled (in which case
 * ceding would delay its delivery).
 */
static void check_and_cede_processor(void)
{
	/*
	 * Ensure our interrupt state is properly tracked,
	 * also checks if no interrupt has occurred while we
	 * were soft-disabled
	 */
	if (prep_irq_for_idle()) {
		cede_processor();
#ifdef CONFIG_TRACE_IRQFLAGS
		/* Ensure that H_CEDE returns with IRQs on */
		if (WARN_ON(!(mfmsr() & MSR_EE)))
			__hard_irq_enable();
#endif
	}
}
0089 
/*
 * XCEDE: Extended CEDE states discovered through the
 *        "ibm,get-system-parameter" RTAS call with the token
 *        CEDE_LATENCY_TOKEN
 */
0095 
/*
 * Section 7.3.16 System Parameters Option of PAPR version 2.8.1 has a
 * table with all the parameters to ibm,get-system-parameter.
 * CEDE_LATENCY_TOKEN corresponds to the token value for Cede Latency
 * Settings Information.
 */
0102 #define CEDE_LATENCY_TOKEN  45
0103 
0104 /*
0105  * If the platform supports the cede latency settings information system
0106  * parameter it must provide the following information in the NULL terminated
0107  * parameter string:
0108  *
 * a. The first byte is the length "N" of each cede latency setting record minus
 *    one (zero indicates a length of 1 byte).
 *
 * b. For each supported cede latency setting a cede latency setting record
 *    consisting of the first "N" bytes as per the following table.
0114  *
0115  *    -----------------------------
0116  *    | Field           | Field   |
0117  *    | Name            | Length  |
0118  *    -----------------------------
0119  *    | Cede Latency    | 1 Byte  |
0120  *    | Specifier Value |         |
0121  *    -----------------------------
0122  *    | Maximum wakeup  |         |
0123  *    | latency in      | 8 Bytes |
0124  *    | tb-ticks        |         |
0125  *    -----------------------------
0126  *    | Responsive to   |         |
0127  *    | external        | 1 Byte  |
0128  *    | interrupts      |         |
0129  *    -----------------------------
0130  *
0131  * This version has cede latency record size = 10.
0132  *
0133  * The structure xcede_latency_payload represents a) and b) with
0134  * xcede_latency_record representing the table in b).
0135  *
 * xcede_latency_parameter is what gets returned by the
 * ibm,get-system-parameter RTAS call when made with
 * CEDE_LATENCY_TOKEN.
0139  *
0140  * These structures are only used to represent the data obtained by the RTAS
0141  * call. The data is in big-endian.
0142  */
/* One cede latency setting record as laid out by firmware (big-endian). */
struct xcede_latency_record {
	u8  hint;		/* Cede Latency Specifier Value */
	__be64  latency_ticks;	/* maximum wakeup latency in tb-ticks */
	u8  wake_on_irqs;	/* responsive to external interrupts */
} __packed;

// Make space for 16 records, which "should be enough".
struct xcede_latency_payload {
	u8     record_size;	/* record length minus one (see header comment) */
	struct xcede_latency_record records[16];
} __packed;

/* Whole buffer filled in by the RTAS call for CEDE_LATENCY_TOKEN. */
struct xcede_latency_parameter {
	__be16  payload_size;	/* bytes of payload, including the NUL */
	struct xcede_latency_payload payload;
	u8 null_char;		/* terminating NUL of the parameter string */
} __packed;

/* Number of records decoded from payload, set by parse_cede_parameters(). */
static unsigned int nr_xcede_records;
/* RTAS result buffer; __initdata because it is only needed during boot. */
static struct xcede_latency_parameter xcede_latency_parameter __initdata;
0163 
0164 static int __init parse_cede_parameters(void)
0165 {
0166     struct xcede_latency_payload *payload;
0167     u32 total_xcede_records_size;
0168     u8 xcede_record_size;
0169     u16 payload_size;
0170     int ret, i;
0171 
0172     ret = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
0173             NULL, CEDE_LATENCY_TOKEN, __pa(&xcede_latency_parameter),
0174             sizeof(xcede_latency_parameter));
0175     if (ret) {
0176         pr_err("xcede: Error parsing CEDE_LATENCY_TOKEN\n");
0177         return ret;
0178     }
0179 
0180     payload_size = be16_to_cpu(xcede_latency_parameter.payload_size);
0181     payload = &xcede_latency_parameter.payload;
0182 
0183     xcede_record_size = payload->record_size + 1;
0184 
0185     if (xcede_record_size != sizeof(struct xcede_latency_record)) {
0186         pr_err("xcede: Expected record-size %lu. Observed size %u.\n",
0187                sizeof(struct xcede_latency_record), xcede_record_size);
0188         return -EINVAL;
0189     }
0190 
0191     pr_info("xcede: xcede_record_size = %d\n", xcede_record_size);
0192 
0193     /*
0194      * Since the payload_size includes the last NULL byte and the
0195      * xcede_record_size, the remaining bytes correspond to array of all
0196      * cede_latency settings.
0197      */
0198     total_xcede_records_size = payload_size - 2;
0199     nr_xcede_records = total_xcede_records_size / xcede_record_size;
0200 
0201     for (i = 0; i < nr_xcede_records; i++) {
0202         struct xcede_latency_record *record = &payload->records[i];
0203         u64 latency_ticks = be64_to_cpu(record->latency_ticks);
0204         u8 wake_on_irqs = record->wake_on_irqs;
0205         u8 hint = record->hint;
0206 
0207         pr_info("xcede: Record %d : hint = %u, latency = 0x%llx tb ticks, Wake-on-irq = %u\n",
0208             i, hint, latency_ticks, wake_on_irqs);
0209     }
0210 
0211     return 0;
0212 }
0213 
#define NR_DEDICATED_STATES 2 /* snooze, CEDE */
/*
 * Per-state cede latency specifier value handed to the hypervisor via
 * the lppaca in dedicated_cede_loop(); indexed by cpuidle state index.
 * Entries are zero-initialized, i.e. plain CEDE(0).
 */
static u8 cede_latency_hint[NR_DEDICATED_STATES];
0216 
/*
 * dedicated_cede_loop - idle state for dedicated-processor partitions.
 * Marks this vCPU as donating its cycles, programs the per-state cede
 * latency hint, and cedes to the hypervisor; previous hint and donate
 * state are restored on wakeup. Returns @index.
 */
static int dedicated_cede_loop(struct cpuidle_device *dev,
				struct cpuidle_driver *drv,
				int index)
{
	u8 old_latency_hint;

	pseries_idle_prolog();
	/* Allow the hypervisor to give our cycles to other partitions. */
	get_lppaca()->donate_dedicated_cpu = 1;
	old_latency_hint = get_lppaca()->cede_latency_hint;
	get_lppaca()->cede_latency_hint = cede_latency_hint[index];

	/* Back to normal SMT priority before ceding. */
	HMT_medium();
	check_and_cede_processor();

	/* H_CEDE returns with external interrupts enabled; mask again. */
	local_irq_disable();
	get_lppaca()->donate_dedicated_cpu = 0;
	get_lppaca()->cede_latency_hint = old_latency_hint;

	pseries_idle_epilog();

	return index;
}
0239 
/*
 * shared_cede_loop - idle state for shared-processor partitions: simply
 * cede the virtual processor back to the hypervisor. Returns @index.
 */
static int shared_cede_loop(struct cpuidle_device *dev,
			struct cpuidle_driver *drv,
			int index)
{

	pseries_idle_prolog();

	/*
	 * Yield the processor to the hypervisor.  We return if
	 * an external interrupt occurs (which are driven prior
	 * to returning here) or if a prod occurs from another
	 * processor. When returning here, external interrupts
	 * are enabled.
	 */
	check_and_cede_processor();

	local_irq_disable();
	pseries_idle_epilog();

	return index;
}
0261 
/*
 * States for dedicated partition case.
 * exit_latency/target_residency are in microseconds (cpuidle convention);
 * the CEDE entry may be overwritten by fixup_cede0_latency().
 */
static struct cpuidle_state dedicated_states[NR_DEDICATED_STATES] = {
	{ /* Snooze */
		.name = "snooze",
		.desc = "snooze",
		.exit_latency = 0,
		.target_residency = 0,
		.enter = &snooze_loop },
	{ /* CEDE */
		.name = "CEDE",
		.desc = "CEDE",
		.exit_latency = 10,
		.target_residency = 100,
		.enter = &dedicated_cede_loop },
};
0279 
/*
 * States for shared partition case.
 * exit_latency/target_residency are in microseconds (cpuidle convention).
 */
static struct cpuidle_state shared_states[] = {
	{ /* Snooze */
		.name = "snooze",
		.desc = "snooze",
		.exit_latency = 0,
		.target_residency = 0,
		.enter = &snooze_loop },
	{ /* Shared Cede */
		.name = "Shared Cede",
		.desc = "Shared Cede",
		.exit_latency = 10,
		.target_residency = 100,
		.enter = &shared_cede_loop },
};
0297 
0298 static int pseries_cpuidle_cpu_online(unsigned int cpu)
0299 {
0300     struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
0301 
0302     if (dev && cpuidle_get_driver()) {
0303         cpuidle_pause_and_lock();
0304         cpuidle_enable_device(dev);
0305         cpuidle_resume_and_unlock();
0306     }
0307     return 0;
0308 }
0309 
0310 static int pseries_cpuidle_cpu_dead(unsigned int cpu)
0311 {
0312     struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
0313 
0314     if (dev && cpuidle_get_driver()) {
0315         cpuidle_pause_and_lock();
0316         cpuidle_disable_device(dev);
0317         cpuidle_resume_and_unlock();
0318     }
0319     return 0;
0320 }
0321 
0322 /*
0323  * pseries_cpuidle_driver_init()
0324  */
0325 static int pseries_cpuidle_driver_init(void)
0326 {
0327     int idle_state;
0328     struct cpuidle_driver *drv = &pseries_idle_driver;
0329 
0330     drv->state_count = 0;
0331 
0332     for (idle_state = 0; idle_state < max_idle_state; ++idle_state) {
0333         /* Is the state not enabled? */
0334         if (cpuidle_state_table[idle_state].enter == NULL)
0335             continue;
0336 
0337         drv->states[drv->state_count] = /* structure copy */
0338             cpuidle_state_table[idle_state];
0339 
0340         drv->state_count += 1;
0341     }
0342 
0343     return 0;
0344 }
0345 
0346 static void __init fixup_cede0_latency(void)
0347 {
0348     struct xcede_latency_payload *payload;
0349     u64 min_xcede_latency_us = UINT_MAX;
0350     int i;
0351 
0352     if (parse_cede_parameters())
0353         return;
0354 
0355     pr_info("cpuidle: Skipping the %d Extended CEDE idle states\n",
0356         nr_xcede_records);
0357 
0358     payload = &xcede_latency_parameter.payload;
0359 
0360     /*
0361      * The CEDE idle state maps to CEDE(0). While the hypervisor
0362      * does not advertise CEDE(0) exit latency values, it does
0363      * advertise the latency values of the extended CEDE states.
0364      * We use the lowest advertised exit latency value as a proxy
0365      * for the exit latency of CEDE(0).
0366      */
0367     for (i = 0; i < nr_xcede_records; i++) {
0368         struct xcede_latency_record *record = &payload->records[i];
0369         u8 hint = record->hint;
0370         u64 latency_tb = be64_to_cpu(record->latency_ticks);
0371         u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC);
0372 
0373         /*
0374          * We expect the exit latency of an extended CEDE
0375          * state to be non-zero, it to since it takes at least
0376          * a few nanoseconds to wakeup the idle CPU and
0377          * dispatch the virtual processor into the Linux
0378          * Guest.
0379          *
0380          * So we consider only non-zero value for performing
0381          * the fixup of CEDE(0) latency.
0382          */
0383         if (latency_us == 0) {
0384             pr_warn("cpuidle: Skipping xcede record %d [hint=%d]. Exit latency = 0us\n",
0385                 i, hint);
0386             continue;
0387         }
0388 
0389         if (latency_us < min_xcede_latency_us)
0390             min_xcede_latency_us = latency_us;
0391     }
0392 
0393     if (min_xcede_latency_us != UINT_MAX) {
0394         dedicated_states[1].exit_latency = min_xcede_latency_us;
0395         dedicated_states[1].target_residency = 10 * (min_xcede_latency_us);
0396         pr_info("cpuidle: Fixed up CEDE exit latency to %llu us\n",
0397             min_xcede_latency_us);
0398     }
0399 
0400 }
0401 
0402 /*
0403  * pseries_idle_probe()
0404  * Choose state table for shared versus dedicated partition
0405  */
0406 static int __init pseries_idle_probe(void)
0407 {
0408 
0409     if (cpuidle_disable != IDLE_NO_OVERRIDE)
0410         return -ENODEV;
0411 
0412     if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
0413         /*
0414          * Use local_paca instead of get_lppaca() since
0415          * preemption is not disabled, and it is not required in
0416          * fact, since lppaca_ptr does not need to be the value
0417          * associated to the current CPU, it can be from any CPU.
0418          */
0419         if (lppaca_shared_proc(local_paca->lppaca_ptr)) {
0420             cpuidle_state_table = shared_states;
0421             max_idle_state = ARRAY_SIZE(shared_states);
0422         } else {
0423             /*
0424              * Use firmware provided latency values
0425              * starting with POWER10 platforms. In the
0426              * case that we are running on a POWER10
0427              * platform but in an earlier compat mode, we
0428              * can still use the firmware provided values.
0429              *
0430              * However, on platforms prior to POWER10, we
0431              * cannot rely on the accuracy of the firmware
0432              * provided latency values. On such platforms,
0433              * go with the conservative default estimate
0434              * of 10us.
0435              */
0436             if (cpu_has_feature(CPU_FTR_ARCH_31) || pvr_version_is(PVR_POWER10))
0437                 fixup_cede0_latency();
0438             cpuidle_state_table = dedicated_states;
0439             max_idle_state = NR_DEDICATED_STATES;
0440         }
0441     } else
0442         return -ENODEV;
0443 
0444     if (max_idle_state > 1) {
0445         snooze_timeout_en = true;
0446         snooze_timeout = cpuidle_state_table[1].target_residency *
0447                  tb_ticks_per_usec;
0448     }
0449     return 0;
0450 }
0451 
/*
 * pseries_processor_idle_init - driver entry point: select the state
 * table, register the cpuidle driver, and hook CPU hotplug so per-CPU
 * cpuidle devices are enabled/disabled as CPUs come and go.
 */
static int __init pseries_processor_idle_init(void)
{
	int retval;

	retval = pseries_idle_probe();
	if (retval)
		return retval;

	pseries_cpuidle_driver_init();
	retval = cpuidle_register(&pseries_idle_driver, NULL);
	if (retval) {
		printk(KERN_DEBUG "Registration of pseries driver failed.\n");
		return retval;
	}

	/* Enable the per-CPU device as each CPU comes online... */
	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					   "cpuidle/pseries:online",
					   pseries_cpuidle_cpu_online, NULL);
	WARN_ON(retval < 0);
	/* ...and disable it once a CPU is dead. */
	retval = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_DEAD,
					   "cpuidle/pseries:DEAD", NULL,
					   pseries_cpuidle_cpu_dead);
	WARN_ON(retval < 0);
	printk(KERN_DEBUG "pseries_idle_driver registered\n");
	return 0;
}

device_initcall(pseries_processor_idle_init);