0001 // SPDX-License-Identifier: GPL-2.0-only
0002 #define _GNU_SOURCE /* for program_invocation_short_name */
0003 #include <errno.h>
0004 #include <fcntl.h>
0005 #include <pthread.h>
0006 #include <sched.h>
0007 #include <stdio.h>
0008 #include <stdlib.h>
0009 #include <string.h>
0010 #include <signal.h>
0011 #include <syscall.h>
0012 #include <sys/ioctl.h>
0013 #include <sys/sysinfo.h>
0014 #include <asm/barrier.h>
0015 #include <linux/atomic.h>
0016 #include <linux/rseq.h>
0017 #include <linux/unistd.h>
0018 
0019 #include "kvm_util.h"
0020 #include "processor.h"
0021 #include "test_util.h"
0022 
0023 #include "../rseq/rseq.c"
0024 
0025 /*
0026  * Any bug related to task migration is likely to be timing-dependent; perform
0027  * a large number of migrations to reduce the odds of a false negative.
0028  */
0029 #define NR_TASK_MIGRATIONS 100000
0030 
0031 static pthread_t migration_thread;
0032 static cpu_set_t possible_mask;
0033 static int min_cpu, max_cpu;
0034 static bool done;
0035 
0036 static atomic_t seq_cnt;
0037 
/*
 * Guest entrypoint: spin forever, exiting to the host on every iteration
 * via GUEST_SYNC so the test can repeatedly re-enter the guest while the
 * migration worker bounces the task between CPUs.
 */
static void guest_code(void)
{
	while (1)
		GUEST_SYNC(0);
}
0043 
0044 /*
0045  * We have to perform direct system call for getcpu() because it's
0046  * not available until glic 2.29.
0047  */
/*
 * Query the current CPU via a direct getcpu() system call; the glibc
 * wrapper isn't available until glibc 2.29.
 *
 * @cpu: out-param, set to the CPU the calling task is currently running on.
 *       The node and (legacy) tcache arguments are unused, hence NULL.
 */
static void sys_getcpu(unsigned *cpu)
{
	int r;

	r = syscall(__NR_getcpu, cpu, NULL, NULL);
	TEST_ASSERT(!r, "getcpu failed, errno = %d (%s)", errno, strerror(errno));
}
0055 
0056 static int next_cpu(int cpu)
0057 {
0058     /*
0059      * Advance to the next CPU, skipping those that weren't in the original
0060      * affinity set.  Sadly, there is no CPU_SET_FOR_EACH, and cpu_set_t's
0061      * data storage is considered as opaque.  Note, if this task is pinned
0062      * to a small set of discontigous CPUs, e.g. 2 and 1023, this loop will
0063      * burn a lot cycles and the test will take longer than normal to
0064      * complete.
0065      */
0066     do {
0067         cpu++;
0068         if (cpu > max_cpu) {
0069             cpu = min_cpu;
0070             TEST_ASSERT(CPU_ISSET(cpu, &possible_mask),
0071                     "Min CPU = %d must always be usable", cpu);
0072             break;
0073         }
0074     } while (!CPU_ISSET(cpu, &possible_mask));
0075 
0076     return cpu;
0077 }
0078 
/*
 * Worker thread that forces NR_TASK_MIGRATIONS migrations of the task
 * identified by @__rseq_tid by repeatedly pinning it to a single CPU and
 * then moving the pin to the next usable CPU.  seq_cnt is bumped around
 * each affinity change, seqlock-style, so the reader (main) can detect
 * both in-progress and completed migrations.
 */
static void *migration_worker(void *__rseq_tid)
{
	/* TID of the task to migrate, smuggled through the void* arg. */
	pid_t rseq_tid = (pid_t)(unsigned long)__rseq_tid;
	cpu_set_t allowed_mask;
	int r, i, cpu;

	CPU_ZERO(&allowed_mask);

	for (i = 0, cpu = min_cpu; i < NR_TASK_MIGRATIONS; i++, cpu = next_cpu(cpu)) {
		CPU_SET(cpu, &allowed_mask);

		/*
		 * Bump the sequence count twice to allow the reader to detect
		 * that a migration may have occurred in between rseq and sched
		 * CPU ID reads.  An odd sequence count indicates a migration
		 * is in-progress, while a completely different count indicates
		 * a migration occurred since the count was last read.
		 */
		atomic_inc(&seq_cnt);

		/*
		 * Ensure the odd count is visible while getcpu() isn't
		 * stable, i.e. while changing affinity is in-progress.
		 * Pairs with the smp_rmb()s on the read side in main().
		 */
		smp_wmb();
		r = sched_setaffinity(rseq_tid, sizeof(allowed_mask), &allowed_mask);
		TEST_ASSERT(!r, "sched_setaffinity failed, errno = %d (%s)",
			    errno, strerror(errno));
		smp_wmb();
		atomic_inc(&seq_cnt);

		/* Drop the old CPU so the mask always holds exactly one CPU. */
		CPU_CLR(cpu, &allowed_mask);

		/*
		 * Wait 1-10us before proceeding to the next iteration and more
		 * specifically, before bumping seq_cnt again.  A delay is
		 * needed on three fronts:
		 *
		 *  1. To allow sched_setaffinity() to prompt migration before
		 *     ioctl(KVM_RUN) enters the guest so that TIF_NOTIFY_RESUME
		 *     (or TIF_NEED_RESCHED, which indirectly leads to handling
		 *     NOTIFY_RESUME) is handled in KVM context.
		 *
		 *     If NOTIFY_RESUME/NEED_RESCHED is set after KVM enters
		 *     the guest, the guest will trigger a IO/MMIO exit all the
		 *     way to userspace and the TIF flags will be handled by
		 *     the generic "exit to userspace" logic, not by KVM.  The
		 *     exit to userspace is necessary to give the test a chance
		 *     to check the rseq CPU ID (see #2).
		 *
		 *     Alternatively, guest_code() could include an instruction
		 *     to trigger an exit that is handled by KVM, but any such
		 *     exit requires architecture specific code.
		 *
		 *  2. To let ioctl(KVM_RUN) make its way back to the test
		 *     before the next round of migration.  The test's check on
		 *     the rseq CPU ID must wait for migration to complete in
		 *     order to avoid false positive, thus any kernel rseq bug
		 *     will be missed if the next migration starts before the
		 *     check completes.
		 *
		 *  3. To ensure the read-side makes efficient forward progress,
		 *     e.g. if getcpu() involves a syscall. Stalling the read-side
		 *     means the test will spend more time waiting for getcpu()
		 *     to stabilize and less time trying to hit the timing-dependent
		 *     bug.
		 *
		 * Because any bug in this area is likely to be timing-dependent,
		 * run with a range of delays at 1us intervals from 1us to 10us
		 * as a best effort to avoid tuning the test to the point where
		 * it can hit _only_ the original bug and not detect future
		 * regressions.
		 *
		 * The original bug can reproduce with a delay up to ~500us on
		 * x86-64, but starts to require more iterations to reproduce
		 * as the delay creeps above ~10us, and the average runtime of
		 * each iteration obviously increases as well.  Cap the delay
		 * at 10us to keep test runtime reasonable while minimizing
		 * potential coverage loss.
		 *
		 * The lower bound for reproducing the bug is likely below 1us,
		 * e.g. failures occur on x86-64 with nanosleep(0), but at that
		 * point the overhead of the syscall likely dominates the delay.
		 * Use usleep() for simplicity and to avoid unnecessary kernel
		 * dependencies.
		 */
		usleep((i % 10) + 1);
	}
	/* Plain store; the reader only needs to eventually observe "done". */
	done = true;
	return NULL;
}
0170 
0171 static void calc_min_max_cpu(void)
0172 {
0173     int i, cnt, nproc;
0174 
0175     TEST_REQUIRE(CPU_COUNT(&possible_mask) >= 2);
0176 
0177     /*
0178      * CPU_SET doesn't provide a FOR_EACH helper, get the min/max CPU that
0179      * this task is affined to in order to reduce the time spent querying
0180      * unusable CPUs, e.g. if this task is pinned to a small percentage of
0181      * total CPUs.
0182      */
0183     nproc = get_nprocs_conf();
0184     min_cpu = -1;
0185     max_cpu = -1;
0186     cnt = 0;
0187 
0188     for (i = 0; i < nproc; i++) {
0189         if (!CPU_ISSET(i, &possible_mask))
0190             continue;
0191         if (min_cpu == -1)
0192             min_cpu = i;
0193         max_cpu = i;
0194         cnt++;
0195     }
0196 
0197     __TEST_REQUIRE(cnt >= 2,
0198                "Only one usable CPU, task migration not possible");
0199 }
0200 
/*
 * Test that rseq's CPU ID stays consistent with the scheduler's CPU ID
 * across KVM_RUN, while a worker thread aggressively migrates this task.
 */
int main(int argc, char *argv[])
{
	int r, i, snapshot;
	struct kvm_vm *vm;
	struct kvm_vcpu *vcpu;
	u32 cpu, rseq_cpu;

	/* Tell stdout not to buffer its content */
	setbuf(stdout, NULL);

	/* Capture the original affinity; it bounds which CPUs are usable. */
	r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
	TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno,
		    strerror(errno));

	calc_min_max_cpu();

	/* Register this thread with rseq so rseq_current_cpu_raw() works. */
	r = rseq_register_current_thread();
	TEST_ASSERT(!r, "rseq_register_current_thread failed, errno = %d (%s)",
		    errno, strerror(errno));

	/*
	 * Create and run a dummy VM that immediately exits to userspace via
	 * GUEST_SYNC, while concurrently migrating the process by setting its
	 * CPU affinity.
	 */
	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
	ucall_init(vm, NULL);

	/* Pass this thread's TID so the worker migrates *this* task. */
	pthread_create(&migration_thread, NULL, migration_worker,
		       (void *)(unsigned long)syscall(SYS_gettid));

	for (i = 0; !done; i++) {
		vcpu_run(vcpu);
		TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC,
			    "Guest failed?");

		/*
		 * Verify rseq's CPU matches sched's CPU.  Ensure migration
		 * doesn't occur between getcpu() and reading the rseq cpu_id
		 * by rereading both if the sequence count changes, or if the
		 * count is odd (migration in-progress).  This is the read
		 * side of the seqlock-style protocol in migration_worker().
		 */
		do {
			/*
			 * Drop bit 0 to force a mismatch if the count is odd,
			 * i.e. if a migration is in-progress.
			 */
			snapshot = atomic_read(&seq_cnt) & ~1;

			/*
			 * Ensure calling getcpu() and reading rseq.cpu_id complete
			 * in a single "no migration" window, i.e. are not reordered
			 * across the seq_cnt reads.
			 */
			smp_rmb();
			sys_getcpu(&cpu);
			rseq_cpu = rseq_current_cpu_raw();
			smp_rmb();
		} while (snapshot != atomic_read(&seq_cnt));

		TEST_ASSERT(rseq_cpu == cpu,
			    "rseq CPU = %d, sched CPU = %d\n", rseq_cpu, cpu);
	}

	/*
	 * Sanity check that the test was able to enter the guest a reasonable
	 * number of times, e.g. didn't get stalled too often/long waiting for
	 * getcpu() to stabilize.  A 2:1 migration:KVM_RUN ratio is a fairly
	 * conservative ratio on x86-64, which can do _more_ KVM_RUNs than
	 * migrations given the 1us+ delay in the migration task.
	 */
	TEST_ASSERT(i > (NR_TASK_MIGRATIONS / 2),
		    "Only performed %d KVM_RUNs, task stalled too much?\n", i);

	pthread_join(migration_thread, NULL);

	kvm_vm_free(vm);

	rseq_unregister_current_thread();

	return 0;
}