Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /* Copyright(c) 2022 Intel Corporation. */
0003 
0004 #include <linux/cpu.h>
0005 #include <linux/delay.h>
0006 #include <linux/fs.h>
0007 #include <linux/nmi.h>
0008 #include <linux/slab.h>
0009 #include <linux/stop_machine.h>
0010 
0011 #include "ifs.h"
0012 
0013 /*
0014  * Note all code and data in this file is protected by
0015  * ifs_sem. On HT systems all threads on a core will
0016  * execute together, but only the first thread on the
0017  * core will update results of the test.
0018  */
0019 
0020 #define CREATE_TRACE_POINTS
0021 #include <trace/events/intel_ifs.h>
0022 
/* Upper bound on consecutive attempts at the same chunk before giving up */
#define MAX_IFS_RETRIES  5

/*
 * How long, in TSC cycles, one logical CPU waits inside
 * WRMSR(ACTIVATE_SCAN) for the other logical CPU of its core.
 */
#define IFS_THREAD_WAIT 100000
0031 
/*
 * Values taken by the error_code field of the scan status.
 * Each value is also the index of its description in scan_test_status[].
 */
enum ifs_status_err_code {
	/* No error reported */
	IFS_NO_ERROR = 0,
	/* The sibling thread did not join the scan */
	IFS_OTHER_THREAD_COULD_NOT_JOIN = 1,
	/* An interrupt arrived before the threads were coordinated */
	IFS_INTERRUPTED_BEFORE_RENDEZVOUS = 2,
	/* Scan aborted because of a power management condition */
	IFS_POWER_MGMT_INADEQUATE_FOR_SCAN = 3,
	/* The requested chunk range contains non-valid chunks */
	IFS_INVALID_CHUNK_RANGE = 4,
	/* The threads of the core passed different arguments */
	IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS = 5,
	/* The core cannot perform a scan at the moment */
	IFS_CORE_NOT_CAPABLE_CURRENTLY = 6,
	/* Code 0x7 has no assigned meaning */
	IFS_UNASSIGNED_ERROR_CODE = 7,
	/* Too many logical processors are scanning concurrently */
	IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT = 8,
	/* An interrupt prevented the scan from running */
	IFS_INTERRUPTED_DURING_EXECUTION = 9,
};
0044 
0045 static const char * const scan_test_status[] = {
0046     [IFS_NO_ERROR] = "SCAN no error",
0047     [IFS_OTHER_THREAD_COULD_NOT_JOIN] = "Other thread could not join.",
0048     [IFS_INTERRUPTED_BEFORE_RENDEZVOUS] = "Interrupt occurred prior to SCAN coordination.",
0049     [IFS_POWER_MGMT_INADEQUATE_FOR_SCAN] =
0050     "Core Abort SCAN Response due to power management condition.",
0051     [IFS_INVALID_CHUNK_RANGE] = "Non valid chunks in the range",
0052     [IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS] = "Mismatch in arguments between threads T0/T1.",
0053     [IFS_CORE_NOT_CAPABLE_CURRENTLY] = "Core not capable of performing SCAN currently",
0054     [IFS_UNASSIGNED_ERROR_CODE] = "Unassigned error code 0x7",
0055     [IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT] =
0056     "Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently",
0057     [IFS_INTERRUPTED_DURING_EXECUTION] = "Interrupt occurred prior to SCAN start",
0058 };
0059 
0060 static void message_not_tested(struct device *dev, int cpu, union ifs_status status)
0061 {
0062     if (status.error_code < ARRAY_SIZE(scan_test_status)) {
0063         dev_info(dev, "CPU(s) %*pbl: SCAN operation did not start. %s\n",
0064              cpumask_pr_args(cpu_smt_mask(cpu)),
0065              scan_test_status[status.error_code]);
0066     } else if (status.error_code == IFS_SW_TIMEOUT) {
0067         dev_info(dev, "CPU(s) %*pbl: software timeout during scan\n",
0068              cpumask_pr_args(cpu_smt_mask(cpu)));
0069     } else if (status.error_code == IFS_SW_PARTIAL_COMPLETION) {
0070         dev_info(dev, "CPU(s) %*pbl: %s\n",
0071              cpumask_pr_args(cpu_smt_mask(cpu)),
0072              "Not all scan chunks were executed. Maximum forward progress retries exceeded");
0073     } else {
0074         dev_info(dev, "CPU(s) %*pbl: SCAN unknown status %llx\n",
0075              cpumask_pr_args(cpu_smt_mask(cpu)), status.data);
0076     }
0077 }
0078 
0079 static void message_fail(struct device *dev, int cpu, union ifs_status status)
0080 {
0081     /*
0082      * control_error is set when the microcode runs into a problem
0083      * loading the image from the reserved BIOS memory, or it has
0084      * been corrupted. Reloading the image may fix this issue.
0085      */
0086     if (status.control_error) {
0087         dev_err(dev, "CPU(s) %*pbl: could not execute from loaded scan image\n",
0088             cpumask_pr_args(cpu_smt_mask(cpu)));
0089     }
0090 
0091     /*
0092      * signature_error is set when the output from the scan chains does not
0093      * match the expected signature. This might be a transient problem (e.g.
0094      * due to a bit flip from an alpha particle or neutron). If the problem
0095      * repeats on a subsequent test, then it indicates an actual problem in
0096      * the core being tested.
0097      */
0098     if (status.signature_error) {
0099         dev_err(dev, "CPU(s) %*pbl: test signature incorrect.\n",
0100             cpumask_pr_args(cpu_smt_mask(cpu)));
0101     }
0102 }
0103 
0104 static bool can_restart(union ifs_status status)
0105 {
0106     enum ifs_status_err_code err_code = status.error_code;
0107 
0108     /* Signature for chunk is bad, or scan test failed */
0109     if (status.signature_error || status.control_error)
0110         return false;
0111 
0112     switch (err_code) {
0113     case IFS_NO_ERROR:
0114     case IFS_OTHER_THREAD_COULD_NOT_JOIN:
0115     case IFS_INTERRUPTED_BEFORE_RENDEZVOUS:
0116     case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN:
0117     case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT:
0118     case IFS_INTERRUPTED_DURING_EXECUTION:
0119         return true;
0120     case IFS_INVALID_CHUNK_RANGE:
0121     case IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS:
0122     case IFS_CORE_NOT_CAPABLE_CURRENTLY:
0123     case IFS_UNASSIGNED_ERROR_CODE:
0124         break;
0125     }
0126     return false;
0127 }
0128 
0129 /*
0130  * Execute the scan. Called "simultaneously" on all threads of a core
0131  * at high priority using the stop_cpus mechanism.
0132  */
0133 static int doscan(void *data)
0134 {
0135     int cpu = smp_processor_id();
0136     u64 *msrs = data;
0137     int first;
0138 
0139     /* Only the first logical CPU on a core reports result */
0140     first = cpumask_first(cpu_smt_mask(cpu));
0141 
0142     /*
0143      * This WRMSR will wait for other HT threads to also write
0144      * to this MSR (at most for activate.delay cycles). Then it
0145      * starts scan of each requested chunk. The core scan happens
0146      * during the "execution" of the WRMSR. This instruction can
0147      * take up to 200 milliseconds (in the case where all chunks
0148      * are processed in a single pass) before it retires.
0149      */
0150     wrmsrl(MSR_ACTIVATE_SCAN, msrs[0]);
0151 
0152     if (cpu == first) {
0153         /* Pass back the result of the scan */
0154         rdmsrl(MSR_SCAN_STATUS, msrs[1]);
0155     }
0156 
0157     return 0;
0158 }
0159 
0160 /*
0161  * Use stop_core_cpuslocked() to synchronize writing to MSR_ACTIVATE_SCAN
0162  * on all threads of the core to be tested. Loop if necessary to complete
0163  * run of all chunks. Include some defensive tests to make sure forward
0164  * progress is made, and that the whole test completes in a reasonable time.
0165  */
0166 static void ifs_test_core(int cpu, struct device *dev)
0167 {
0168     union ifs_scan activate;
0169     union ifs_status status;
0170     unsigned long timeout;
0171     struct ifs_data *ifsd;
0172     u64 msrvals[2];
0173     int retries;
0174 
0175     ifsd = ifs_get_data(dev);
0176 
0177     activate.rsvd = 0;
0178     activate.delay = IFS_THREAD_WAIT;
0179     activate.sigmce = 0;
0180     activate.start = 0;
0181     activate.stop = ifsd->valid_chunks - 1;
0182 
0183     timeout = jiffies + HZ / 2;
0184     retries = MAX_IFS_RETRIES;
0185 
0186     while (activate.start <= activate.stop) {
0187         if (time_after(jiffies, timeout)) {
0188             status.error_code = IFS_SW_TIMEOUT;
0189             break;
0190         }
0191 
0192         msrvals[0] = activate.data;
0193         stop_core_cpuslocked(cpu, doscan, msrvals);
0194 
0195         status.data = msrvals[1];
0196 
0197         trace_ifs_status(cpu, activate, status);
0198 
0199         /* Some cases can be retried, give up for others */
0200         if (!can_restart(status))
0201             break;
0202 
0203         if (status.chunk_num == activate.start) {
0204             /* Check for forward progress */
0205             if (--retries == 0) {
0206                 if (status.error_code == IFS_NO_ERROR)
0207                     status.error_code = IFS_SW_PARTIAL_COMPLETION;
0208                 break;
0209             }
0210         } else {
0211             retries = MAX_IFS_RETRIES;
0212             activate.start = status.chunk_num;
0213         }
0214     }
0215 
0216     /* Update status for this core */
0217     ifsd->scan_details = status.data;
0218 
0219     if (status.control_error || status.signature_error) {
0220         ifsd->status = SCAN_TEST_FAIL;
0221         message_fail(dev, cpu, status);
0222     } else if (status.error_code) {
0223         ifsd->status = SCAN_NOT_TESTED;
0224         message_not_tested(dev, cpu, status);
0225     } else {
0226         ifsd->status = SCAN_TEST_PASS;
0227     }
0228 }
0229 
/*
 * Entry point for testing one core.
 *
 * @cpu: logical CPU identifying the core to test
 * @dev: IFS device passed through to ifs_test_core()
 *
 * Holds the CPU hotplug read lock for the duration so that no CPU can be
 * taken offline while the scan is running.
 *
 * Returns 0 once the test has been run (the outcome itself is recorded by
 * ifs_test_core()), or -EINVAL if @cpu is offline.
 */
int do_core_test(int cpu, struct device *dev)
{
	int ret = -EINVAL;

	/* Prevent CPUs from being taken offline during the scan test */
	cpus_read_lock();

	if (cpu_online(cpu)) {
		ifs_test_core(cpu, dev);
		ret = 0;
	} else {
		dev_info(dev, "cannot test on the offline cpu %d\n", cpu);
	}

	cpus_read_unlock();
	return ret;
}